kordoc 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -4
- package/dist/{chunk-C3XHIIJZ.js → chunk-KZMWHK72.js} +73 -48
- package/dist/cli.js +12 -3
- package/dist/index.cjs +73 -48
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +73 -48
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +21 -6
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -147,13 +147,18 @@ import type { IRBlock, IRTable, IRCell, CellContext } from "kordoc"
|
|
|
147
147
|
|
|
148
148
|
## Security
|
|
149
149
|
|
|
150
|
-
v0.2.
|
|
150
|
+
v0.2.2 security hardening (cumulative since v0.2.1):
|
|
151
151
|
|
|
152
152
|
- **ZIP bomb protection** — 100MB decompression limit, 500 entry cap
|
|
153
|
-
- **XXE prevention** —
|
|
154
|
-
- **Decompression bomb guard** — `maxOutputLength` on HWP5 zlib streams
|
|
153
|
+
- **XXE/Billion Laughs prevention** — Internal DTD subsets fully stripped from HWPX XML
|
|
154
|
+
- **Decompression bomb guard** — `maxOutputLength` on HWP5 zlib streams, cumulative 100MB limit across sections
|
|
155
|
+
- **colSpan/rowSpan clamping** — Crafted merge values clamped to grid bounds (MAX_COLS=200, MAX_ROWS=10,000)
|
|
156
|
+
- **Broken ZIP path traversal guard** — `..` and absolute path entries rejected, filename length capped
|
|
155
157
|
- **MCP path restriction** — Only `.hwp`, `.hwpx`, `.pdf` extensions allowed
|
|
156
|
-
- **
|
|
158
|
+
- **File size limit** — 500MB max in MCP server and CLI
|
|
159
|
+
- **PDF resource cleanup** — `doc.destroy()` prevents WASM memory leaks
|
|
160
|
+
- **Table memory guard** — Sparse Set-based allocation in Pass 1, 10,000 row cap
|
|
161
|
+
- **HWP5 section limit** — Max 100 sections to prevent infinite loop on corrupted files
|
|
157
162
|
|
|
158
163
|
## How It Works
|
|
159
164
|
|
|
@@ -1,17 +1,20 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
// src/detect.ts
|
|
4
|
+
function magicBytes(buffer) {
|
|
5
|
+
return new Uint8Array(buffer, 0, Math.min(4, buffer.byteLength));
|
|
6
|
+
}
|
|
4
7
|
function isHwpxFile(buffer) {
|
|
5
|
-
const
|
|
6
|
-
return
|
|
8
|
+
const b = magicBytes(buffer);
|
|
9
|
+
return b[0] === 80 && b[1] === 75 && b[2] === 3 && b[3] === 4;
|
|
7
10
|
}
|
|
8
11
|
function isOldHwpFile(buffer) {
|
|
9
|
-
const
|
|
10
|
-
return
|
|
12
|
+
const b = magicBytes(buffer);
|
|
13
|
+
return b[0] === 208 && b[1] === 207 && b[2] === 17 && b[3] === 224;
|
|
11
14
|
}
|
|
12
15
|
function isPdfFile(buffer) {
|
|
13
|
-
const
|
|
14
|
-
return
|
|
16
|
+
const b = magicBytes(buffer);
|
|
17
|
+
return b[0] === 37 && b[1] === 80 && b[2] === 68 && b[3] === 70;
|
|
15
18
|
}
|
|
16
19
|
function detectFormat(buffer) {
|
|
17
20
|
if (isHwpxFile(buffer)) return "hwpx";
|
|
@@ -21,7 +24,7 @@ function detectFormat(buffer) {
|
|
|
21
24
|
}
|
|
22
25
|
|
|
23
26
|
// src/utils.ts
|
|
24
|
-
var VERSION = true ? "0.2.1" : "0.0.0-dev";
|
|
27
|
+
var VERSION = true ? "0.2.2" : "0.0.0-dev";
|
|
25
28
|
function toArrayBuffer(buf) {
|
|
26
29
|
return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
|
|
27
30
|
}
|
|
@@ -37,22 +40,23 @@ var MAX_ROWS = 1e4;
|
|
|
37
40
|
function buildTable(rows) {
|
|
38
41
|
if (rows.length > MAX_ROWS) rows = rows.slice(0, MAX_ROWS);
|
|
39
42
|
const numRows = rows.length;
|
|
40
|
-
const tempOccupied =
|
|
43
|
+
const tempOccupied = /* @__PURE__ */ new Set();
|
|
41
44
|
let maxCols = 0;
|
|
42
45
|
for (let rowIdx = 0; rowIdx < numRows; rowIdx++) {
|
|
43
46
|
let colIdx = 0;
|
|
44
47
|
for (const cell of rows[rowIdx]) {
|
|
45
|
-
while (colIdx < MAX_COLS && tempOccupied
|
|
48
|
+
while (colIdx < MAX_COLS && tempOccupied.has(rowIdx * MAX_COLS + colIdx)) colIdx++;
|
|
46
49
|
if (colIdx >= MAX_COLS) break;
|
|
47
50
|
for (let r = rowIdx; r < Math.min(rowIdx + cell.rowSpan, numRows); r++) {
|
|
48
51
|
for (let c = colIdx; c < Math.min(colIdx + cell.colSpan, MAX_COLS); c++) {
|
|
49
|
-
tempOccupied
|
|
52
|
+
tempOccupied.add(r * MAX_COLS + c);
|
|
50
53
|
}
|
|
51
54
|
}
|
|
52
55
|
colIdx += cell.colSpan;
|
|
53
56
|
if (colIdx > maxCols) maxCols = colIdx;
|
|
54
57
|
}
|
|
55
58
|
}
|
|
59
|
+
tempOccupied.clear();
|
|
56
60
|
if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
|
|
57
61
|
const grid = Array.from(
|
|
58
62
|
{ length: numRows },
|
|
@@ -162,8 +166,11 @@ function tableToMarkdown(table) {
|
|
|
162
166
|
// src/hwpx/parser.ts
|
|
163
167
|
var MAX_DECOMPRESS_SIZE = 100 * 1024 * 1024;
|
|
164
168
|
var MAX_ZIP_ENTRIES = 500;
|
|
169
|
+
function clampSpan(val, max) {
|
|
170
|
+
return Math.max(1, Math.min(val, max));
|
|
171
|
+
}
|
|
165
172
|
function stripDtd(xml) {
|
|
166
|
-
return xml.replace(/<!DOCTYPE[
|
|
173
|
+
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
167
174
|
}
|
|
168
175
|
async function parseHwpxDocument(buffer) {
|
|
169
176
|
let zip;
|
|
@@ -200,6 +207,10 @@ function extractFromBrokenZip(buffer) {
|
|
|
200
207
|
const compSize = view.getUint32(pos + 18, true);
|
|
201
208
|
const nameLen = view.getUint16(pos + 26, true);
|
|
202
209
|
const extraLen = view.getUint16(pos + 28, true);
|
|
210
|
+
if (nameLen > 1024 || extraLen > 65535) {
|
|
211
|
+
pos += 30 + nameLen + extraLen;
|
|
212
|
+
continue;
|
|
213
|
+
}
|
|
203
214
|
const fileStart = pos + 30 + nameLen + extraLen;
|
|
204
215
|
if (fileStart + compSize > data.length) break;
|
|
205
216
|
if (compSize === 0 && method !== 0) {
|
|
@@ -208,6 +219,10 @@ function extractFromBrokenZip(buffer) {
|
|
|
208
219
|
}
|
|
209
220
|
const nameBytes = data.slice(pos + 30, pos + 30 + nameLen);
|
|
210
221
|
const name = new TextDecoder().decode(nameBytes);
|
|
222
|
+
if (name.includes("..") || name.startsWith("/")) {
|
|
223
|
+
pos = fileStart + compSize;
|
|
224
|
+
continue;
|
|
225
|
+
}
|
|
211
226
|
const fileData = data.slice(fileStart, fileStart + compSize);
|
|
212
227
|
pos = fileStart + compSize;
|
|
213
228
|
if (!name.toLowerCase().includes("section") || !name.endsWith(".xml")) continue;
|
|
@@ -331,8 +346,8 @@ function walkSection(node, blocks, tableCtx, tableStack) {
|
|
|
331
346
|
if (tableCtx?.cell) {
|
|
332
347
|
const cs = parseInt(el.getAttribute("colSpan") || "1", 10);
|
|
333
348
|
const rs = parseInt(el.getAttribute("rowSpan") || "1", 10);
|
|
334
|
-
|
|
335
|
-
|
|
349
|
+
tableCtx.cell.colSpan = clampSpan(cs, MAX_COLS);
|
|
350
|
+
tableCtx.cell.rowSpan = clampSpan(rs, MAX_ROWS);
|
|
336
351
|
}
|
|
337
352
|
break;
|
|
338
353
|
case "p": {
|
|
@@ -498,6 +513,8 @@ function extractText(data) {
|
|
|
498
513
|
import { createRequire } from "module";
|
|
499
514
|
var require2 = createRequire(import.meta.url);
|
|
500
515
|
var CFB = require2("cfb");
|
|
516
|
+
var MAX_SECTIONS = 100;
|
|
517
|
+
var MAX_TOTAL_DECOMPRESS = 100 * 1024 * 1024;
|
|
501
518
|
function parseHwp5Document(buffer) {
|
|
502
519
|
const cfb = CFB.parse(buffer);
|
|
503
520
|
const headerEntry = CFB.find(cfb, "/FileHeader");
|
|
@@ -510,8 +527,11 @@ function parseHwp5Document(buffer) {
|
|
|
510
527
|
const sections = findSections(cfb);
|
|
511
528
|
if (sections.length === 0) throw new Error("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
512
529
|
const blocks = [];
|
|
530
|
+
let totalDecompressed = 0;
|
|
513
531
|
for (const sectionData of sections) {
|
|
514
532
|
const data = compressed ? decompressStream(Buffer.from(sectionData)) : Buffer.from(sectionData);
|
|
533
|
+
totalDecompressed += data.length;
|
|
534
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new Error("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
515
535
|
const records = readRecords(data);
|
|
516
536
|
blocks.push(...parseSection(records));
|
|
517
537
|
}
|
|
@@ -519,7 +539,7 @@ function parseHwp5Document(buffer) {
|
|
|
519
539
|
}
|
|
520
540
|
function findSections(cfb) {
|
|
521
541
|
const sections = [];
|
|
522
|
-
for (let i = 0; ; i++) {
|
|
542
|
+
for (let i = 0; i < MAX_SECTIONS; i++) {
|
|
523
543
|
const entry = CFB.find(cfb, `/BodyText/Section${i}`);
|
|
524
544
|
if (!entry?.content) break;
|
|
525
545
|
sections.push({ idx: i, content: Buffer.from(entry.content) });
|
|
@@ -618,8 +638,8 @@ function parseCellBlock(records, startIdx, tableLevel) {
|
|
|
618
638
|
if (rec.data.length >= 14) {
|
|
619
639
|
const cs = rec.data.readUInt16LE(10);
|
|
620
640
|
const rs = rec.data.readUInt16LE(12);
|
|
621
|
-
if (cs > 0) colSpan = cs;
|
|
622
|
-
if (rs > 0) rowSpan = rs;
|
|
641
|
+
if (cs > 0) colSpan = Math.min(cs, MAX_COLS);
|
|
642
|
+
if (rs > 0) rowSpan = Math.min(rs, MAX_ROWS);
|
|
623
643
|
}
|
|
624
644
|
let i = startIdx + 1;
|
|
625
645
|
while (i < records.length) {
|
|
@@ -692,40 +712,45 @@ async function parsePdfDocument(buffer) {
|
|
|
692
712
|
disableFontFace: true,
|
|
693
713
|
isEvalSupported: false
|
|
694
714
|
}).promise;
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
if (
|
|
724
|
-
|
|
715
|
+
try {
|
|
716
|
+
const pageCount = doc.numPages;
|
|
717
|
+
if (pageCount === 0) {
|
|
718
|
+
return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4." };
|
|
719
|
+
}
|
|
720
|
+
const pageTexts = [];
|
|
721
|
+
let totalChars = 0;
|
|
722
|
+
for (let i = 1; i <= pageCount; i++) {
|
|
723
|
+
const page = await doc.getPage(i);
|
|
724
|
+
const textContent = await page.getTextContent();
|
|
725
|
+
const lines = groupTextItemsByLine(textContent.items);
|
|
726
|
+
const pageText = lines.join("\n");
|
|
727
|
+
totalChars += pageText.replace(/\s/g, "").length;
|
|
728
|
+
pageTexts.push(pageText);
|
|
729
|
+
}
|
|
730
|
+
const avgCharsPerPage = totalChars / pageCount;
|
|
731
|
+
if (avgCharsPerPage < 10) {
|
|
732
|
+
return {
|
|
733
|
+
success: false,
|
|
734
|
+
fileType: "pdf",
|
|
735
|
+
pageCount,
|
|
736
|
+
isImageBased: true,
|
|
737
|
+
error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF\uB85C \uCD94\uC815\uB429\uB2C8\uB2E4 (${pageCount}\uD398\uC774\uC9C0, \uCD94\uCD9C \uD14D\uC2A4\uD2B8 ${totalChars}\uC790).`
|
|
738
|
+
};
|
|
739
|
+
}
|
|
740
|
+
let markdown = "";
|
|
741
|
+
for (let i = 0; i < pageTexts.length; i++) {
|
|
742
|
+
const cleaned = cleanPdfText(pageTexts[i]);
|
|
743
|
+
if (cleaned.trim()) {
|
|
744
|
+
if (i > 0 && markdown) markdown += "\n\n";
|
|
745
|
+
markdown += cleaned;
|
|
746
|
+
}
|
|
725
747
|
}
|
|
748
|
+
markdown = reconstructTables(markdown);
|
|
749
|
+
return { success: true, fileType: "pdf", markdown, pageCount, isImageBased: false };
|
|
750
|
+
} finally {
|
|
751
|
+
await doc.destroy().catch(() => {
|
|
752
|
+
});
|
|
726
753
|
}
|
|
727
|
-
markdown = reconstructTables(markdown);
|
|
728
|
-
return { success: true, fileType: "pdf", markdown, pageCount, isImageBased: false };
|
|
729
754
|
}
|
|
730
755
|
function groupTextItemsByLine(items) {
|
|
731
756
|
if (items.length === 0) return [];
|
package/dist/cli.js
CHANGED
|
@@ -4,10 +4,10 @@ import {
|
|
|
4
4
|
detectFormat,
|
|
5
5
|
parse,
|
|
6
6
|
toArrayBuffer
|
|
7
|
-
} from "./chunk-C3XHIIJZ.js";
|
|
7
|
+
} from "./chunk-KZMWHK72.js";
|
|
8
8
|
|
|
9
9
|
// src/cli.ts
|
|
10
|
-
import { readFileSync, writeFileSync, mkdirSync } from "fs";
|
|
10
|
+
import { readFileSync, writeFileSync, mkdirSync, statSync } from "fs";
|
|
11
11
|
import { basename, resolve } from "path";
|
|
12
12
|
import { Command } from "commander";
|
|
13
13
|
var program = new Command();
|
|
@@ -16,6 +16,14 @@ program.name("kordoc").description("\uBAA8\uB450 \uD30C\uC2F1\uD574\uBC84\uB9AC\
|
|
|
16
16
|
const absPath = resolve(filePath);
|
|
17
17
|
const fileName = basename(absPath);
|
|
18
18
|
try {
|
|
19
|
+
const fileSize = statSync(absPath).size;
|
|
20
|
+
if (fileSize > 500 * 1024 * 1024) {
|
|
21
|
+
process.stderr.write(`
|
|
22
|
+
[kordoc] SKIP: ${fileName} \u2014 \uD30C\uC77C\uC774 \uB108\uBB34 \uD07D\uB2C8\uB2E4 (${(fileSize / 1024 / 1024).toFixed(1)}MB)
|
|
23
|
+
`);
|
|
24
|
+
process.exitCode = 1;
|
|
25
|
+
continue;
|
|
26
|
+
}
|
|
19
27
|
const buffer = readFileSync(absPath);
|
|
20
28
|
const arrayBuffer = toArrayBuffer(buffer);
|
|
21
29
|
const format = detectFormat(arrayBuffer);
|
|
@@ -40,7 +48,8 @@ program.name("kordoc").description("\uBAA8\uB450 \uD30C\uC2F1\uD574\uBC84\uB9AC\
|
|
|
40
48
|
`);
|
|
41
49
|
} else if (opts.outDir) {
|
|
42
50
|
mkdirSync(opts.outDir, { recursive: true });
|
|
43
|
-
const
|
|
51
|
+
const outExt = opts.format === "json" ? ".json" : ".md";
|
|
52
|
+
const outPath = resolve(opts.outDir, fileName.replace(/\.[^.]+$/, outExt));
|
|
44
53
|
writeFileSync(outPath, output, "utf-8");
|
|
45
54
|
if (!opts.silent) process.stderr.write(` \u2192 ${outPath}
|
|
46
55
|
`);
|
package/dist/index.cjs
CHANGED
|
@@ -46,17 +46,20 @@ __export(index_exports, {
|
|
|
46
46
|
module.exports = __toCommonJS(index_exports);
|
|
47
47
|
|
|
48
48
|
// src/detect.ts
|
|
49
|
+
function magicBytes(buffer) {
|
|
50
|
+
return new Uint8Array(buffer, 0, Math.min(4, buffer.byteLength));
|
|
51
|
+
}
|
|
49
52
|
function isHwpxFile(buffer) {
|
|
50
|
-
const
|
|
51
|
-
return
|
|
53
|
+
const b = magicBytes(buffer);
|
|
54
|
+
return b[0] === 80 && b[1] === 75 && b[2] === 3 && b[3] === 4;
|
|
52
55
|
}
|
|
53
56
|
function isOldHwpFile(buffer) {
|
|
54
|
-
const
|
|
55
|
-
return
|
|
57
|
+
const b = magicBytes(buffer);
|
|
58
|
+
return b[0] === 208 && b[1] === 207 && b[2] === 17 && b[3] === 224;
|
|
56
59
|
}
|
|
57
60
|
function isPdfFile(buffer) {
|
|
58
|
-
const
|
|
59
|
-
return
|
|
61
|
+
const b = magicBytes(buffer);
|
|
62
|
+
return b[0] === 37 && b[1] === 80 && b[2] === 68 && b[3] === 70;
|
|
60
63
|
}
|
|
61
64
|
function detectFormat(buffer) {
|
|
62
65
|
if (isHwpxFile(buffer)) return "hwpx";
|
|
@@ -76,22 +79,23 @@ var MAX_ROWS = 1e4;
|
|
|
76
79
|
function buildTable(rows) {
|
|
77
80
|
if (rows.length > MAX_ROWS) rows = rows.slice(0, MAX_ROWS);
|
|
78
81
|
const numRows = rows.length;
|
|
79
|
-
const tempOccupied =
|
|
82
|
+
const tempOccupied = /* @__PURE__ */ new Set();
|
|
80
83
|
let maxCols = 0;
|
|
81
84
|
for (let rowIdx = 0; rowIdx < numRows; rowIdx++) {
|
|
82
85
|
let colIdx = 0;
|
|
83
86
|
for (const cell of rows[rowIdx]) {
|
|
84
|
-
while (colIdx < MAX_COLS && tempOccupied
|
|
87
|
+
while (colIdx < MAX_COLS && tempOccupied.has(rowIdx * MAX_COLS + colIdx)) colIdx++;
|
|
85
88
|
if (colIdx >= MAX_COLS) break;
|
|
86
89
|
for (let r = rowIdx; r < Math.min(rowIdx + cell.rowSpan, numRows); r++) {
|
|
87
90
|
for (let c = colIdx; c < Math.min(colIdx + cell.colSpan, MAX_COLS); c++) {
|
|
88
|
-
tempOccupied
|
|
91
|
+
tempOccupied.add(r * MAX_COLS + c);
|
|
89
92
|
}
|
|
90
93
|
}
|
|
91
94
|
colIdx += cell.colSpan;
|
|
92
95
|
if (colIdx > maxCols) maxCols = colIdx;
|
|
93
96
|
}
|
|
94
97
|
}
|
|
98
|
+
tempOccupied.clear();
|
|
95
99
|
if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
|
|
96
100
|
const grid = Array.from(
|
|
97
101
|
{ length: numRows },
|
|
@@ -201,8 +205,11 @@ function tableToMarkdown(table) {
|
|
|
201
205
|
// src/hwpx/parser.ts
|
|
202
206
|
var MAX_DECOMPRESS_SIZE = 100 * 1024 * 1024;
|
|
203
207
|
var MAX_ZIP_ENTRIES = 500;
|
|
208
|
+
function clampSpan(val, max) {
|
|
209
|
+
return Math.max(1, Math.min(val, max));
|
|
210
|
+
}
|
|
204
211
|
function stripDtd(xml) {
|
|
205
|
-
return xml.replace(/<!DOCTYPE[
|
|
212
|
+
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
206
213
|
}
|
|
207
214
|
async function parseHwpxDocument(buffer) {
|
|
208
215
|
let zip;
|
|
@@ -239,6 +246,10 @@ function extractFromBrokenZip(buffer) {
|
|
|
239
246
|
const compSize = view.getUint32(pos + 18, true);
|
|
240
247
|
const nameLen = view.getUint16(pos + 26, true);
|
|
241
248
|
const extraLen = view.getUint16(pos + 28, true);
|
|
249
|
+
if (nameLen > 1024 || extraLen > 65535) {
|
|
250
|
+
pos += 30 + nameLen + extraLen;
|
|
251
|
+
continue;
|
|
252
|
+
}
|
|
242
253
|
const fileStart = pos + 30 + nameLen + extraLen;
|
|
243
254
|
if (fileStart + compSize > data.length) break;
|
|
244
255
|
if (compSize === 0 && method !== 0) {
|
|
@@ -247,6 +258,10 @@ function extractFromBrokenZip(buffer) {
|
|
|
247
258
|
}
|
|
248
259
|
const nameBytes = data.slice(pos + 30, pos + 30 + nameLen);
|
|
249
260
|
const name = new TextDecoder().decode(nameBytes);
|
|
261
|
+
if (name.includes("..") || name.startsWith("/")) {
|
|
262
|
+
pos = fileStart + compSize;
|
|
263
|
+
continue;
|
|
264
|
+
}
|
|
250
265
|
const fileData = data.slice(fileStart, fileStart + compSize);
|
|
251
266
|
pos = fileStart + compSize;
|
|
252
267
|
if (!name.toLowerCase().includes("section") || !name.endsWith(".xml")) continue;
|
|
@@ -370,8 +385,8 @@ function walkSection(node, blocks, tableCtx, tableStack) {
|
|
|
370
385
|
if (tableCtx?.cell) {
|
|
371
386
|
const cs = parseInt(el.getAttribute("colSpan") || "1", 10);
|
|
372
387
|
const rs = parseInt(el.getAttribute("rowSpan") || "1", 10);
|
|
373
|
-
|
|
374
|
-
|
|
388
|
+
tableCtx.cell.colSpan = clampSpan(cs, MAX_COLS);
|
|
389
|
+
tableCtx.cell.rowSpan = clampSpan(rs, MAX_ROWS);
|
|
375
390
|
}
|
|
376
391
|
break;
|
|
377
392
|
case "p": {
|
|
@@ -538,6 +553,8 @@ var import_module = require("module");
|
|
|
538
553
|
var import_meta = {};
|
|
539
554
|
var require2 = (0, import_module.createRequire)(import_meta.url);
|
|
540
555
|
var CFB = require2("cfb");
|
|
556
|
+
var MAX_SECTIONS = 100;
|
|
557
|
+
var MAX_TOTAL_DECOMPRESS = 100 * 1024 * 1024;
|
|
541
558
|
function parseHwp5Document(buffer) {
|
|
542
559
|
const cfb = CFB.parse(buffer);
|
|
543
560
|
const headerEntry = CFB.find(cfb, "/FileHeader");
|
|
@@ -550,8 +567,11 @@ function parseHwp5Document(buffer) {
|
|
|
550
567
|
const sections = findSections(cfb);
|
|
551
568
|
if (sections.length === 0) throw new Error("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
552
569
|
const blocks = [];
|
|
570
|
+
let totalDecompressed = 0;
|
|
553
571
|
for (const sectionData of sections) {
|
|
554
572
|
const data = compressed ? decompressStream(Buffer.from(sectionData)) : Buffer.from(sectionData);
|
|
573
|
+
totalDecompressed += data.length;
|
|
574
|
+
if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new Error("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
|
|
555
575
|
const records = readRecords(data);
|
|
556
576
|
blocks.push(...parseSection(records));
|
|
557
577
|
}
|
|
@@ -559,7 +579,7 @@ function parseHwp5Document(buffer) {
|
|
|
559
579
|
}
|
|
560
580
|
function findSections(cfb) {
|
|
561
581
|
const sections = [];
|
|
562
|
-
for (let i = 0; ; i++) {
|
|
582
|
+
for (let i = 0; i < MAX_SECTIONS; i++) {
|
|
563
583
|
const entry = CFB.find(cfb, `/BodyText/Section${i}`);
|
|
564
584
|
if (!entry?.content) break;
|
|
565
585
|
sections.push({ idx: i, content: Buffer.from(entry.content) });
|
|
@@ -658,8 +678,8 @@ function parseCellBlock(records, startIdx, tableLevel) {
|
|
|
658
678
|
if (rec.data.length >= 14) {
|
|
659
679
|
const cs = rec.data.readUInt16LE(10);
|
|
660
680
|
const rs = rec.data.readUInt16LE(12);
|
|
661
|
-
if (cs > 0) colSpan = cs;
|
|
662
|
-
if (rs > 0) rowSpan = rs;
|
|
681
|
+
if (cs > 0) colSpan = Math.min(cs, MAX_COLS);
|
|
682
|
+
if (rs > 0) rowSpan = Math.min(rs, MAX_ROWS);
|
|
663
683
|
}
|
|
664
684
|
let i = startIdx + 1;
|
|
665
685
|
while (i < records.length) {
|
|
@@ -733,40 +753,45 @@ async function parsePdfDocument(buffer) {
|
|
|
733
753
|
disableFontFace: true,
|
|
734
754
|
isEvalSupported: false
|
|
735
755
|
}).promise;
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
if (
|
|
765
|
-
|
|
756
|
+
try {
|
|
757
|
+
const pageCount = doc.numPages;
|
|
758
|
+
if (pageCount === 0) {
|
|
759
|
+
return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4." };
|
|
760
|
+
}
|
|
761
|
+
const pageTexts = [];
|
|
762
|
+
let totalChars = 0;
|
|
763
|
+
for (let i = 1; i <= pageCount; i++) {
|
|
764
|
+
const page = await doc.getPage(i);
|
|
765
|
+
const textContent = await page.getTextContent();
|
|
766
|
+
const lines = groupTextItemsByLine(textContent.items);
|
|
767
|
+
const pageText = lines.join("\n");
|
|
768
|
+
totalChars += pageText.replace(/\s/g, "").length;
|
|
769
|
+
pageTexts.push(pageText);
|
|
770
|
+
}
|
|
771
|
+
const avgCharsPerPage = totalChars / pageCount;
|
|
772
|
+
if (avgCharsPerPage < 10) {
|
|
773
|
+
return {
|
|
774
|
+
success: false,
|
|
775
|
+
fileType: "pdf",
|
|
776
|
+
pageCount,
|
|
777
|
+
isImageBased: true,
|
|
778
|
+
error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF\uB85C \uCD94\uC815\uB429\uB2C8\uB2E4 (${pageCount}\uD398\uC774\uC9C0, \uCD94\uCD9C \uD14D\uC2A4\uD2B8 ${totalChars}\uC790).`
|
|
779
|
+
};
|
|
780
|
+
}
|
|
781
|
+
let markdown = "";
|
|
782
|
+
for (let i = 0; i < pageTexts.length; i++) {
|
|
783
|
+
const cleaned = cleanPdfText(pageTexts[i]);
|
|
784
|
+
if (cleaned.trim()) {
|
|
785
|
+
if (i > 0 && markdown) markdown += "\n\n";
|
|
786
|
+
markdown += cleaned;
|
|
787
|
+
}
|
|
766
788
|
}
|
|
789
|
+
markdown = reconstructTables(markdown);
|
|
790
|
+
return { success: true, fileType: "pdf", markdown, pageCount, isImageBased: false };
|
|
791
|
+
} finally {
|
|
792
|
+
await doc.destroy().catch(() => {
|
|
793
|
+
});
|
|
767
794
|
}
|
|
768
|
-
markdown = reconstructTables(markdown);
|
|
769
|
-
return { success: true, fileType: "pdf", markdown, pageCount, isImageBased: false };
|
|
770
795
|
}
|
|
771
796
|
function groupTextItemsByLine(items) {
|
|
772
797
|
if (items.length === 0) return [];
|
|
@@ -842,7 +867,7 @@ function formatAsMarkdownTable(rows) {
|
|
|
842
867
|
}
|
|
843
868
|
|
|
844
869
|
// src/utils.ts
|
|
845
|
-
var VERSION = true ? "0.2.1" : "0.0.0-dev";
|
|
870
|
+
var VERSION = true ? "0.2.2" : "0.0.0-dev";
|
|
846
871
|
|
|
847
872
|
// src/index.ts
|
|
848
873
|
async function parse(buffer) {
|