kordoc 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -147,13 +147,18 @@ import type { IRBlock, IRTable, IRCell, CellContext } from "kordoc"
147
147
 
148
148
  ## Security
149
149
 
150
- v0.2.1 includes the following security hardening:
150
+ v0.2.2 security hardening (cumulative since v0.2.1):
151
151
 
152
152
  - **ZIP bomb protection** — 100MB decompression limit, 500 entry cap
153
- - **XXE prevention** — DOCTYPE declarations stripped from HWPX XML
154
- - **Decompression bomb guard** — `maxOutputLength` on HWP5 zlib streams
153
+ - **XXE/Billion Laughs prevention** — Internal DTD subsets fully stripped from HWPX XML
154
+ - **Decompression bomb guard** — `maxOutputLength` on HWP5 zlib streams, cumulative 100MB limit across sections
155
+ - **colSpan/rowSpan clamping** — Crafted merge values clamped to grid bounds (MAX_COLS=200, MAX_ROWS=10,000)
156
+ - **Broken ZIP path traversal guard** — `..` and absolute path entries rejected, filename length capped
155
157
  - **MCP path restriction** — Only `.hwp`, `.hwpx`, `.pdf` extensions allowed
156
- - **Table memory guard** — 10,000 row cap on table builder
158
+ - **File size limit** — 500MB max in MCP server and CLI
159
+ - **PDF resource cleanup** — `doc.destroy()` prevents WASM memory leaks
160
+ - **Table memory guard** — Sparse Set-based allocation in Pass 1, 10,000 row cap
161
+ - **HWP5 section limit** — Max 100 sections to prevent infinite loop on corrupted files
157
162
 
158
163
  ## How It Works
159
164
 
@@ -1,17 +1,20 @@
1
1
  #!/usr/bin/env node
2
2
 
3
3
  // src/detect.ts
4
+ function magicBytes(buffer) {
5
+ return new Uint8Array(buffer, 0, Math.min(4, buffer.byteLength));
6
+ }
4
7
  function isHwpxFile(buffer) {
5
- const bytes = new Uint8Array(buffer.slice(0, 4));
6
- return bytes[0] === 80 && bytes[1] === 75 && bytes[2] === 3 && bytes[3] === 4;
8
+ const b = magicBytes(buffer);
9
+ return b[0] === 80 && b[1] === 75 && b[2] === 3 && b[3] === 4;
7
10
  }
8
11
  function isOldHwpFile(buffer) {
9
- const bytes = new Uint8Array(buffer.slice(0, 4));
10
- return bytes[0] === 208 && bytes[1] === 207 && bytes[2] === 17 && bytes[3] === 224;
12
+ const b = magicBytes(buffer);
13
+ return b[0] === 208 && b[1] === 207 && b[2] === 17 && b[3] === 224;
11
14
  }
12
15
  function isPdfFile(buffer) {
13
- const bytes = new Uint8Array(buffer.slice(0, 4));
14
- return bytes[0] === 37 && bytes[1] === 80 && bytes[2] === 68 && bytes[3] === 70;
16
+ const b = magicBytes(buffer);
17
+ return b[0] === 37 && b[1] === 80 && b[2] === 68 && b[3] === 70;
15
18
  }
16
19
  function detectFormat(buffer) {
17
20
  if (isHwpxFile(buffer)) return "hwpx";
@@ -21,7 +24,7 @@ function detectFormat(buffer) {
21
24
  }
22
25
 
23
26
  // src/utils.ts
24
- var VERSION = true ? "0.2.1" : "0.0.0-dev";
27
+ var VERSION = true ? "0.2.2" : "0.0.0-dev";
25
28
  function toArrayBuffer(buf) {
26
29
  return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
27
30
  }
@@ -37,22 +40,23 @@ var MAX_ROWS = 1e4;
37
40
  function buildTable(rows) {
38
41
  if (rows.length > MAX_ROWS) rows = rows.slice(0, MAX_ROWS);
39
42
  const numRows = rows.length;
40
- const tempOccupied = Array.from({ length: numRows }, () => Array(MAX_COLS).fill(false));
43
+ const tempOccupied = /* @__PURE__ */ new Set();
41
44
  let maxCols = 0;
42
45
  for (let rowIdx = 0; rowIdx < numRows; rowIdx++) {
43
46
  let colIdx = 0;
44
47
  for (const cell of rows[rowIdx]) {
45
- while (colIdx < MAX_COLS && tempOccupied[rowIdx][colIdx]) colIdx++;
48
+ while (colIdx < MAX_COLS && tempOccupied.has(rowIdx * MAX_COLS + colIdx)) colIdx++;
46
49
  if (colIdx >= MAX_COLS) break;
47
50
  for (let r = rowIdx; r < Math.min(rowIdx + cell.rowSpan, numRows); r++) {
48
51
  for (let c = colIdx; c < Math.min(colIdx + cell.colSpan, MAX_COLS); c++) {
49
- tempOccupied[r][c] = true;
52
+ tempOccupied.add(r * MAX_COLS + c);
50
53
  }
51
54
  }
52
55
  colIdx += cell.colSpan;
53
56
  if (colIdx > maxCols) maxCols = colIdx;
54
57
  }
55
58
  }
59
+ tempOccupied.clear();
56
60
  if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
57
61
  const grid = Array.from(
58
62
  { length: numRows },
@@ -162,8 +166,11 @@ function tableToMarkdown(table) {
162
166
  // src/hwpx/parser.ts
163
167
  var MAX_DECOMPRESS_SIZE = 100 * 1024 * 1024;
164
168
  var MAX_ZIP_ENTRIES = 500;
169
+ function clampSpan(val, max) {
170
+ return Math.max(1, Math.min(val, max));
171
+ }
165
172
  function stripDtd(xml) {
166
- return xml.replace(/<!DOCTYPE[^>]*>/gi, "");
173
+ return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
167
174
  }
168
175
  async function parseHwpxDocument(buffer) {
169
176
  let zip;
@@ -200,6 +207,10 @@ function extractFromBrokenZip(buffer) {
200
207
  const compSize = view.getUint32(pos + 18, true);
201
208
  const nameLen = view.getUint16(pos + 26, true);
202
209
  const extraLen = view.getUint16(pos + 28, true);
210
+ if (nameLen > 1024 || extraLen > 65535) {
211
+ pos += 30 + nameLen + extraLen;
212
+ continue;
213
+ }
203
214
  const fileStart = pos + 30 + nameLen + extraLen;
204
215
  if (fileStart + compSize > data.length) break;
205
216
  if (compSize === 0 && method !== 0) {
@@ -208,6 +219,10 @@ function extractFromBrokenZip(buffer) {
208
219
  }
209
220
  const nameBytes = data.slice(pos + 30, pos + 30 + nameLen);
210
221
  const name = new TextDecoder().decode(nameBytes);
222
+ if (name.includes("..") || name.startsWith("/")) {
223
+ pos = fileStart + compSize;
224
+ continue;
225
+ }
211
226
  const fileData = data.slice(fileStart, fileStart + compSize);
212
227
  pos = fileStart + compSize;
213
228
  if (!name.toLowerCase().includes("section") || !name.endsWith(".xml")) continue;
@@ -331,8 +346,8 @@ function walkSection(node, blocks, tableCtx, tableStack) {
331
346
  if (tableCtx?.cell) {
332
347
  const cs = parseInt(el.getAttribute("colSpan") || "1", 10);
333
348
  const rs = parseInt(el.getAttribute("rowSpan") || "1", 10);
334
- if (cs > 0) tableCtx.cell.colSpan = cs;
335
- if (rs > 0) tableCtx.cell.rowSpan = rs;
349
+ tableCtx.cell.colSpan = clampSpan(cs, MAX_COLS);
350
+ tableCtx.cell.rowSpan = clampSpan(rs, MAX_ROWS);
336
351
  }
337
352
  break;
338
353
  case "p": {
@@ -498,6 +513,8 @@ function extractText(data) {
498
513
  import { createRequire } from "module";
499
514
  var require2 = createRequire(import.meta.url);
500
515
  var CFB = require2("cfb");
516
+ var MAX_SECTIONS = 100;
517
+ var MAX_TOTAL_DECOMPRESS = 100 * 1024 * 1024;
501
518
  function parseHwp5Document(buffer) {
502
519
  const cfb = CFB.parse(buffer);
503
520
  const headerEntry = CFB.find(cfb, "/FileHeader");
@@ -510,8 +527,11 @@ function parseHwp5Document(buffer) {
510
527
  const sections = findSections(cfb);
511
528
  if (sections.length === 0) throw new Error("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
512
529
  const blocks = [];
530
+ let totalDecompressed = 0;
513
531
  for (const sectionData of sections) {
514
532
  const data = compressed ? decompressStream(Buffer.from(sectionData)) : Buffer.from(sectionData);
533
+ totalDecompressed += data.length;
534
+ if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new Error("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
515
535
  const records = readRecords(data);
516
536
  blocks.push(...parseSection(records));
517
537
  }
@@ -519,7 +539,7 @@ function parseHwp5Document(buffer) {
519
539
  }
520
540
  function findSections(cfb) {
521
541
  const sections = [];
522
- for (let i = 0; ; i++) {
542
+ for (let i = 0; i < MAX_SECTIONS; i++) {
523
543
  const entry = CFB.find(cfb, `/BodyText/Section${i}`);
524
544
  if (!entry?.content) break;
525
545
  sections.push({ idx: i, content: Buffer.from(entry.content) });
@@ -618,8 +638,8 @@ function parseCellBlock(records, startIdx, tableLevel) {
618
638
  if (rec.data.length >= 14) {
619
639
  const cs = rec.data.readUInt16LE(10);
620
640
  const rs = rec.data.readUInt16LE(12);
621
- if (cs > 0) colSpan = cs;
622
- if (rs > 0) rowSpan = rs;
641
+ if (cs > 0) colSpan = Math.min(cs, MAX_COLS);
642
+ if (rs > 0) rowSpan = Math.min(rs, MAX_ROWS);
623
643
  }
624
644
  let i = startIdx + 1;
625
645
  while (i < records.length) {
@@ -692,40 +712,45 @@ async function parsePdfDocument(buffer) {
692
712
  disableFontFace: true,
693
713
  isEvalSupported: false
694
714
  }).promise;
695
- const pageCount = doc.numPages;
696
- if (pageCount === 0) {
697
- return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4." };
698
- }
699
- const pageTexts = [];
700
- let totalChars = 0;
701
- for (let i = 1; i <= pageCount; i++) {
702
- const page = await doc.getPage(i);
703
- const textContent = await page.getTextContent();
704
- const lines = groupTextItemsByLine(textContent.items);
705
- const pageText = lines.join("\n");
706
- totalChars += pageText.replace(/\s/g, "").length;
707
- pageTexts.push(pageText);
708
- }
709
- const avgCharsPerPage = totalChars / pageCount;
710
- if (avgCharsPerPage < 10) {
711
- return {
712
- success: false,
713
- fileType: "pdf",
714
- pageCount,
715
- isImageBased: true,
716
- error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF\uB85C \uCD94\uC815\uB429\uB2C8\uB2E4 (${pageCount}\uD398\uC774\uC9C0, \uCD94\uCD9C \uD14D\uC2A4\uD2B8 ${totalChars}\uC790).`
717
- };
718
- }
719
- let markdown = "";
720
- for (let i = 0; i < pageTexts.length; i++) {
721
- const cleaned = cleanPdfText(pageTexts[i]);
722
- if (cleaned.trim()) {
723
- if (i > 0 && markdown) markdown += "\n\n";
724
- markdown += cleaned;
715
+ try {
716
+ const pageCount = doc.numPages;
717
+ if (pageCount === 0) {
718
+ return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4." };
719
+ }
720
+ const pageTexts = [];
721
+ let totalChars = 0;
722
+ for (let i = 1; i <= pageCount; i++) {
723
+ const page = await doc.getPage(i);
724
+ const textContent = await page.getTextContent();
725
+ const lines = groupTextItemsByLine(textContent.items);
726
+ const pageText = lines.join("\n");
727
+ totalChars += pageText.replace(/\s/g, "").length;
728
+ pageTexts.push(pageText);
729
+ }
730
+ const avgCharsPerPage = totalChars / pageCount;
731
+ if (avgCharsPerPage < 10) {
732
+ return {
733
+ success: false,
734
+ fileType: "pdf",
735
+ pageCount,
736
+ isImageBased: true,
737
+ error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF\uB85C \uCD94\uC815\uB429\uB2C8\uB2E4 (${pageCount}\uD398\uC774\uC9C0, \uCD94\uCD9C \uD14D\uC2A4\uD2B8 ${totalChars}\uC790).`
738
+ };
739
+ }
740
+ let markdown = "";
741
+ for (let i = 0; i < pageTexts.length; i++) {
742
+ const cleaned = cleanPdfText(pageTexts[i]);
743
+ if (cleaned.trim()) {
744
+ if (i > 0 && markdown) markdown += "\n\n";
745
+ markdown += cleaned;
746
+ }
725
747
  }
748
+ markdown = reconstructTables(markdown);
749
+ return { success: true, fileType: "pdf", markdown, pageCount, isImageBased: false };
750
+ } finally {
751
+ await doc.destroy().catch(() => {
752
+ });
726
753
  }
727
- markdown = reconstructTables(markdown);
728
- return { success: true, fileType: "pdf", markdown, pageCount, isImageBased: false };
729
754
  }
730
755
  function groupTextItemsByLine(items) {
731
756
  if (items.length === 0) return [];
package/dist/cli.js CHANGED
@@ -4,10 +4,10 @@ import {
4
4
  detectFormat,
5
5
  parse,
6
6
  toArrayBuffer
7
- } from "./chunk-C3XHIIJZ.js";
7
+ } from "./chunk-KZMWHK72.js";
8
8
 
9
9
  // src/cli.ts
10
- import { readFileSync, writeFileSync, mkdirSync } from "fs";
10
+ import { readFileSync, writeFileSync, mkdirSync, statSync } from "fs";
11
11
  import { basename, resolve } from "path";
12
12
  import { Command } from "commander";
13
13
  var program = new Command();
@@ -16,6 +16,14 @@ program.name("kordoc").description("\uBAA8\uB450 \uD30C\uC2F1\uD574\uBC84\uB9AC\
16
16
  const absPath = resolve(filePath);
17
17
  const fileName = basename(absPath);
18
18
  try {
19
+ const fileSize = statSync(absPath).size;
20
+ if (fileSize > 500 * 1024 * 1024) {
21
+ process.stderr.write(`
22
+ [kordoc] SKIP: ${fileName} \u2014 \uD30C\uC77C\uC774 \uB108\uBB34 \uD07D\uB2C8\uB2E4 (${(fileSize / 1024 / 1024).toFixed(1)}MB)
23
+ `);
24
+ process.exitCode = 1;
25
+ continue;
26
+ }
19
27
  const buffer = readFileSync(absPath);
20
28
  const arrayBuffer = toArrayBuffer(buffer);
21
29
  const format = detectFormat(arrayBuffer);
@@ -40,7 +48,8 @@ program.name("kordoc").description("\uBAA8\uB450 \uD30C\uC2F1\uD574\uBC84\uB9AC\
40
48
  `);
41
49
  } else if (opts.outDir) {
42
50
  mkdirSync(opts.outDir, { recursive: true });
43
- const outPath = resolve(opts.outDir, fileName.replace(/\.[^.]+$/, ".md"));
51
+ const outExt = opts.format === "json" ? ".json" : ".md";
52
+ const outPath = resolve(opts.outDir, fileName.replace(/\.[^.]+$/, outExt));
44
53
  writeFileSync(outPath, output, "utf-8");
45
54
  if (!opts.silent) process.stderr.write(` \u2192 ${outPath}
46
55
  `);
package/dist/index.cjs CHANGED
@@ -46,17 +46,20 @@ __export(index_exports, {
46
46
  module.exports = __toCommonJS(index_exports);
47
47
 
48
48
  // src/detect.ts
49
+ function magicBytes(buffer) {
50
+ return new Uint8Array(buffer, 0, Math.min(4, buffer.byteLength));
51
+ }
49
52
  function isHwpxFile(buffer) {
50
- const bytes = new Uint8Array(buffer.slice(0, 4));
51
- return bytes[0] === 80 && bytes[1] === 75 && bytes[2] === 3 && bytes[3] === 4;
53
+ const b = magicBytes(buffer);
54
+ return b[0] === 80 && b[1] === 75 && b[2] === 3 && b[3] === 4;
52
55
  }
53
56
  function isOldHwpFile(buffer) {
54
- const bytes = new Uint8Array(buffer.slice(0, 4));
55
- return bytes[0] === 208 && bytes[1] === 207 && bytes[2] === 17 && bytes[3] === 224;
57
+ const b = magicBytes(buffer);
58
+ return b[0] === 208 && b[1] === 207 && b[2] === 17 && b[3] === 224;
56
59
  }
57
60
  function isPdfFile(buffer) {
58
- const bytes = new Uint8Array(buffer.slice(0, 4));
59
- return bytes[0] === 37 && bytes[1] === 80 && bytes[2] === 68 && bytes[3] === 70;
61
+ const b = magicBytes(buffer);
62
+ return b[0] === 37 && b[1] === 80 && b[2] === 68 && b[3] === 70;
60
63
  }
61
64
  function detectFormat(buffer) {
62
65
  if (isHwpxFile(buffer)) return "hwpx";
@@ -76,22 +79,23 @@ var MAX_ROWS = 1e4;
76
79
  function buildTable(rows) {
77
80
  if (rows.length > MAX_ROWS) rows = rows.slice(0, MAX_ROWS);
78
81
  const numRows = rows.length;
79
- const tempOccupied = Array.from({ length: numRows }, () => Array(MAX_COLS).fill(false));
82
+ const tempOccupied = /* @__PURE__ */ new Set();
80
83
  let maxCols = 0;
81
84
  for (let rowIdx = 0; rowIdx < numRows; rowIdx++) {
82
85
  let colIdx = 0;
83
86
  for (const cell of rows[rowIdx]) {
84
- while (colIdx < MAX_COLS && tempOccupied[rowIdx][colIdx]) colIdx++;
87
+ while (colIdx < MAX_COLS && tempOccupied.has(rowIdx * MAX_COLS + colIdx)) colIdx++;
85
88
  if (colIdx >= MAX_COLS) break;
86
89
  for (let r = rowIdx; r < Math.min(rowIdx + cell.rowSpan, numRows); r++) {
87
90
  for (let c = colIdx; c < Math.min(colIdx + cell.colSpan, MAX_COLS); c++) {
88
- tempOccupied[r][c] = true;
91
+ tempOccupied.add(r * MAX_COLS + c);
89
92
  }
90
93
  }
91
94
  colIdx += cell.colSpan;
92
95
  if (colIdx > maxCols) maxCols = colIdx;
93
96
  }
94
97
  }
98
+ tempOccupied.clear();
95
99
  if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
96
100
  const grid = Array.from(
97
101
  { length: numRows },
@@ -201,8 +205,11 @@ function tableToMarkdown(table) {
201
205
  // src/hwpx/parser.ts
202
206
  var MAX_DECOMPRESS_SIZE = 100 * 1024 * 1024;
203
207
  var MAX_ZIP_ENTRIES = 500;
208
+ function clampSpan(val, max) {
209
+ return Math.max(1, Math.min(val, max));
210
+ }
204
211
  function stripDtd(xml) {
205
- return xml.replace(/<!DOCTYPE[^>]*>/gi, "");
212
+ return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
206
213
  }
207
214
  async function parseHwpxDocument(buffer) {
208
215
  let zip;
@@ -239,6 +246,10 @@ function extractFromBrokenZip(buffer) {
239
246
  const compSize = view.getUint32(pos + 18, true);
240
247
  const nameLen = view.getUint16(pos + 26, true);
241
248
  const extraLen = view.getUint16(pos + 28, true);
249
+ if (nameLen > 1024 || extraLen > 65535) {
250
+ pos += 30 + nameLen + extraLen;
251
+ continue;
252
+ }
242
253
  const fileStart = pos + 30 + nameLen + extraLen;
243
254
  if (fileStart + compSize > data.length) break;
244
255
  if (compSize === 0 && method !== 0) {
@@ -247,6 +258,10 @@ function extractFromBrokenZip(buffer) {
247
258
  }
248
259
  const nameBytes = data.slice(pos + 30, pos + 30 + nameLen);
249
260
  const name = new TextDecoder().decode(nameBytes);
261
+ if (name.includes("..") || name.startsWith("/")) {
262
+ pos = fileStart + compSize;
263
+ continue;
264
+ }
250
265
  const fileData = data.slice(fileStart, fileStart + compSize);
251
266
  pos = fileStart + compSize;
252
267
  if (!name.toLowerCase().includes("section") || !name.endsWith(".xml")) continue;
@@ -370,8 +385,8 @@ function walkSection(node, blocks, tableCtx, tableStack) {
370
385
  if (tableCtx?.cell) {
371
386
  const cs = parseInt(el.getAttribute("colSpan") || "1", 10);
372
387
  const rs = parseInt(el.getAttribute("rowSpan") || "1", 10);
373
- if (cs > 0) tableCtx.cell.colSpan = cs;
374
- if (rs > 0) tableCtx.cell.rowSpan = rs;
388
+ tableCtx.cell.colSpan = clampSpan(cs, MAX_COLS);
389
+ tableCtx.cell.rowSpan = clampSpan(rs, MAX_ROWS);
375
390
  }
376
391
  break;
377
392
  case "p": {
@@ -538,6 +553,8 @@ var import_module = require("module");
538
553
  var import_meta = {};
539
554
  var require2 = (0, import_module.createRequire)(import_meta.url);
540
555
  var CFB = require2("cfb");
556
+ var MAX_SECTIONS = 100;
557
+ var MAX_TOTAL_DECOMPRESS = 100 * 1024 * 1024;
541
558
  function parseHwp5Document(buffer) {
542
559
  const cfb = CFB.parse(buffer);
543
560
  const headerEntry = CFB.find(cfb, "/FileHeader");
@@ -550,8 +567,11 @@ function parseHwp5Document(buffer) {
550
567
  const sections = findSections(cfb);
551
568
  if (sections.length === 0) throw new Error("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
552
569
  const blocks = [];
570
+ let totalDecompressed = 0;
553
571
  for (const sectionData of sections) {
554
572
  const data = compressed ? decompressStream(Buffer.from(sectionData)) : Buffer.from(sectionData);
573
+ totalDecompressed += data.length;
574
+ if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new Error("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
555
575
  const records = readRecords(data);
556
576
  blocks.push(...parseSection(records));
557
577
  }
@@ -559,7 +579,7 @@ function parseHwp5Document(buffer) {
559
579
  }
560
580
  function findSections(cfb) {
561
581
  const sections = [];
562
- for (let i = 0; ; i++) {
582
+ for (let i = 0; i < MAX_SECTIONS; i++) {
563
583
  const entry = CFB.find(cfb, `/BodyText/Section${i}`);
564
584
  if (!entry?.content) break;
565
585
  sections.push({ idx: i, content: Buffer.from(entry.content) });
@@ -658,8 +678,8 @@ function parseCellBlock(records, startIdx, tableLevel) {
658
678
  if (rec.data.length >= 14) {
659
679
  const cs = rec.data.readUInt16LE(10);
660
680
  const rs = rec.data.readUInt16LE(12);
661
- if (cs > 0) colSpan = cs;
662
- if (rs > 0) rowSpan = rs;
681
+ if (cs > 0) colSpan = Math.min(cs, MAX_COLS);
682
+ if (rs > 0) rowSpan = Math.min(rs, MAX_ROWS);
663
683
  }
664
684
  let i = startIdx + 1;
665
685
  while (i < records.length) {
@@ -733,40 +753,45 @@ async function parsePdfDocument(buffer) {
733
753
  disableFontFace: true,
734
754
  isEvalSupported: false
735
755
  }).promise;
736
- const pageCount = doc.numPages;
737
- if (pageCount === 0) {
738
- return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4." };
739
- }
740
- const pageTexts = [];
741
- let totalChars = 0;
742
- for (let i = 1; i <= pageCount; i++) {
743
- const page = await doc.getPage(i);
744
- const textContent = await page.getTextContent();
745
- const lines = groupTextItemsByLine(textContent.items);
746
- const pageText = lines.join("\n");
747
- totalChars += pageText.replace(/\s/g, "").length;
748
- pageTexts.push(pageText);
749
- }
750
- const avgCharsPerPage = totalChars / pageCount;
751
- if (avgCharsPerPage < 10) {
752
- return {
753
- success: false,
754
- fileType: "pdf",
755
- pageCount,
756
- isImageBased: true,
757
- error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF\uB85C \uCD94\uC815\uB429\uB2C8\uB2E4 (${pageCount}\uD398\uC774\uC9C0, \uCD94\uCD9C \uD14D\uC2A4\uD2B8 ${totalChars}\uC790).`
758
- };
759
- }
760
- let markdown = "";
761
- for (let i = 0; i < pageTexts.length; i++) {
762
- const cleaned = cleanPdfText(pageTexts[i]);
763
- if (cleaned.trim()) {
764
- if (i > 0 && markdown) markdown += "\n\n";
765
- markdown += cleaned;
756
+ try {
757
+ const pageCount = doc.numPages;
758
+ if (pageCount === 0) {
759
+ return { success: false, fileType: "pdf", pageCount: 0, error: "PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4." };
760
+ }
761
+ const pageTexts = [];
762
+ let totalChars = 0;
763
+ for (let i = 1; i <= pageCount; i++) {
764
+ const page = await doc.getPage(i);
765
+ const textContent = await page.getTextContent();
766
+ const lines = groupTextItemsByLine(textContent.items);
767
+ const pageText = lines.join("\n");
768
+ totalChars += pageText.replace(/\s/g, "").length;
769
+ pageTexts.push(pageText);
770
+ }
771
+ const avgCharsPerPage = totalChars / pageCount;
772
+ if (avgCharsPerPage < 10) {
773
+ return {
774
+ success: false,
775
+ fileType: "pdf",
776
+ pageCount,
777
+ isImageBased: true,
778
+ error: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF\uB85C \uCD94\uC815\uB429\uB2C8\uB2E4 (${pageCount}\uD398\uC774\uC9C0, \uCD94\uCD9C \uD14D\uC2A4\uD2B8 ${totalChars}\uC790).`
779
+ };
780
+ }
781
+ let markdown = "";
782
+ for (let i = 0; i < pageTexts.length; i++) {
783
+ const cleaned = cleanPdfText(pageTexts[i]);
784
+ if (cleaned.trim()) {
785
+ if (i > 0 && markdown) markdown += "\n\n";
786
+ markdown += cleaned;
787
+ }
766
788
  }
789
+ markdown = reconstructTables(markdown);
790
+ return { success: true, fileType: "pdf", markdown, pageCount, isImageBased: false };
791
+ } finally {
792
+ await doc.destroy().catch(() => {
793
+ });
767
794
  }
768
- markdown = reconstructTables(markdown);
769
- return { success: true, fileType: "pdf", markdown, pageCount, isImageBased: false };
770
795
  }
771
796
  function groupTextItemsByLine(items) {
772
797
  if (items.length === 0) return [];
@@ -842,7 +867,7 @@ function formatAsMarkdownTable(rows) {
842
867
  }
843
868
 
844
869
  // src/utils.ts
845
- var VERSION = true ? "0.2.1" : "0.0.0-dev";
870
+ var VERSION = true ? "0.2.2" : "0.0.0-dev";
846
871
 
847
872
  // src/index.ts
848
873
  async function parse(buffer) {