@clazic/kordoc 2.3.1 → 2.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/dist/{batch-provider-PCT4I4LK.js → batch-provider-PNDCSGQW.js} +3 -4
  2. package/dist/batch-provider-PNDCSGQW.js.map +1 -0
  3. package/dist/chunk-4PP34NVQ.js +121 -0
  4. package/dist/chunk-4PP34NVQ.js.map +1 -0
  5. package/dist/{tesseract-provider-WCVJWBUT.js → chunk-7FMKAV4P.js} +4 -4
  6. package/dist/{tesseract-provider-WCVJWBUT.js.map → chunk-7FMKAV4P.js.map} +1 -1
  7. package/dist/chunk-JOGAFNIL.js +153 -0
  8. package/dist/chunk-JOGAFNIL.js.map +1 -0
  9. package/dist/{chunk-W5KUC23B.js → chunk-NU3KFVVZ.js} +2 -2
  10. package/dist/{chunk-ZOEUKD77.js → chunk-UDFKY7CH.js} +204 -49
  11. package/dist/chunk-UDFKY7CH.js.map +1 -0
  12. package/dist/cli.js +8 -5
  13. package/dist/cli.js.map +1 -1
  14. package/dist/index.cjs +230 -72
  15. package/dist/index.cjs.map +1 -1
  16. package/dist/index.d.cts +11 -6
  17. package/dist/index.d.ts +11 -6
  18. package/dist/index.js +230 -72
  19. package/dist/index.js.map +1 -1
  20. package/dist/mcp.js +5 -2
  21. package/dist/mcp.js.map +1 -1
  22. package/dist/{provider-WYHC4NHI.js → provider-HE727F7Z.js} +19 -131
  23. package/dist/provider-HE727F7Z.js.map +1 -0
  24. package/dist/resolve-UOAOPQ4H.js +111 -0
  25. package/dist/resolve-UOAOPQ4H.js.map +1 -0
  26. package/dist/tesseract-provider-MNMZPSGF.js +11 -0
  27. package/dist/{utils-HSF5HI5T.js → utils-STJT6CFC.js} +2 -2
  28. package/dist/utils-STJT6CFC.js.map +1 -0
  29. package/dist/{watch-R2JHXDGF.js → watch-PRQGLOW3.js} +6 -3
  30. package/dist/{watch-R2JHXDGF.js.map → watch-PRQGLOW3.js.map} +1 -1
  31. package/package.json +8 -8
  32. package/dist/batch-provider-PCT4I4LK.js.map +0 -1
  33. package/dist/chunk-ZOEUKD77.js.map +0 -1
  34. package/dist/provider-WYHC4NHI.js.map +0 -1
  35. package/dist/resolve-4FSAQF2S.js +0 -247
  36. package/dist/resolve-4FSAQF2S.js.map +0 -1
  37. /package/dist/{chunk-W5KUC23B.js.map → chunk-NU3KFVVZ.js.map} +0 -0
  38. /package/dist/{utils-HSF5HI5T.js.map → tesseract-provider-MNMZPSGF.js.map} +0 -0
package/dist/index.cjs CHANGED
@@ -1993,8 +1993,8 @@ function getTesseractFallbackMessage() {
1993
1993
  "\uC124\uCE58\uB41C AI CLI\uAC00 \uC5C6\uC5B4 \uB0B4\uC7A5 tesseract.js\uB85C OCR\uC744 \uC218\uD589\uD569\uB2C8\uB2E4.",
1994
1994
  "\uB354 \uB098\uC740 \uD488\uC9C8(\uD14C\uC774\uBE14/\uD5E4\uB529 \uAD6C\uC870 \uBCF4\uC874)\uC744 \uC704\uD574 AI CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4:",
1995
1995
  "",
1996
- " [\uAD8C\uC7A5] Gemini CLI: https://ai.google.dev/gemini-api/docs/cli",
1997
- " Codex CLI: npm install -g @openai/codex",
1996
+ " [\uAD8C\uC7A5] Codex CLI: npm install -g @openai/codex",
1997
+ " Gemini CLI: https://ai.google.dev/gemini-api/docs/cli",
1998
1998
  " Claude CLI: npm install -g @anthropic-ai/claude-code",
1999
1999
  " Ollama: brew install ollama (+ ollama pull gemma4:27b)"
2000
2000
  ].join("\n");
@@ -2004,7 +2004,7 @@ var init_auto_detect = __esm({
2004
2004
  "src/ocr/auto-detect.ts"() {
2005
2005
  "use strict";
2006
2006
  import_child_process = require("child_process");
2007
- CLI_PRIORITY = ["gemini", "codex", "claude", "ollama"];
2007
+ CLI_PRIORITY = ["codex", "gemini", "claude", "ollama"];
2008
2008
  }
2009
2009
  });
2010
2010
 
@@ -2043,7 +2043,7 @@ function callCli(mode, imagePath) {
2043
2043
  const args = buildCliArgs(mode, imagePath);
2044
2044
  const result = (0, import_child_process2.spawnSync)(mode, args, {
2045
2045
  encoding: "utf-8",
2046
- timeout: 18e4,
2046
+ timeout: 6e5,
2047
2047
  maxBuffer: 10 * 1024 * 1024,
2048
2048
  // claude: /tmp에서 실행하여 프로젝트 CLAUDE.md의 규칙 간섭 방지
2049
2049
  ...mode === "claude" ? { cwd: (0, import_os.tmpdir)() } : {}
@@ -2137,7 +2137,7 @@ async function callOllamaApi(imagePath) {
2137
2137
  return data.message?.content || "";
2138
2138
  }
2139
2139
  function stripCodeFence(text) {
2140
- const match = text.match(/^```(?:markdown|md)?\s*\n([\s\S]*?)\n```\s*$/m);
2140
+ const match = text.match(/^```(?:markdown|md)?\s*([\s\S]*?)```\s*$/m);
2141
2141
  return match ? match[1].trim() : text;
2142
2142
  }
2143
2143
  var import_child_process2, import_fs, import_path, import_os, OCR_PROMPT, _tempDir;
@@ -2148,7 +2148,15 @@ var init_cli_provider = __esm({
2148
2148
  import_fs = require("fs");
2149
2149
  import_path = require("path");
2150
2150
  import_os = require("os");
2151
- OCR_PROMPT = "\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD14C\uC774\uBE14\uC744 \uCD94\uCD9C\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uC138\uC694.\n\uADDC\uCE59:\n- \uD14C\uC774\uBE14\uC740 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)\n- \uBCD1\uD569\uB41C \uC140\uC740 \uD574\uB2F9 \uC704\uCE58\uC5D0 \uB0B4\uC6A9 \uAE30\uC7AC\n- \uD5E4\uB529\uC740 \uAE00\uC790 \uD06C\uAE30\uC5D0 \uB530\uB77C ## ~ ###### \uC0AC\uC6A9\n- \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9\n- \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC\n- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0\n- ```\uB85C \uAC10\uC2F8\uC9C0 \uB9D0\uACE0 \uC21C\uC218 Markdown\uB9CC \uCD9C\uB825";
2151
+ OCR_PROMPT = `\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD14C\uC774\uBE14\uC744 \uCD94\uCD9C\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uC138\uC694.
2152
+ \uADDC\uCE59:
2153
+ - \uD14C\uC774\uBE14\uC740 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)
2154
+ - \uBCD1\uD569\uB41C \uC140\uC740 \uD574\uB2F9 \uC704\uCE58\uC5D0 \uB0B4\uC6A9 \uAE30\uC7AC
2155
+ - \uD5E4\uB529\uC740 \uAE00\uC790 \uD06C\uAE30\uC5D0 \uB530\uB77C ## ~ ###### \uC0AC\uC6A9
2156
+ - \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9
2157
+ - \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC
2158
+ - \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0
2159
+ - \`\`\`\uB85C \uAC10\uC2F8\uC9C0 \uB9D0\uACE0 \uC21C\uC218 Markdown\uB9CC \uCD9C\uB825`;
2152
2160
  _tempDir = null;
2153
2161
  }
2154
2162
  });
@@ -2314,9 +2322,8 @@ async function callBatchCli(mode, imagePaths) {
2314
2322
  ${fileRefs}`;
2315
2323
  let args;
2316
2324
  if (mode === "gemini") {
2317
- args = ["--prompt", prompt, "--yolo"];
2318
- const model = process.env.KORDOC_GEMINI_MODEL;
2319
- if (model) args.push("--model", model);
2325
+ const model = process.env.KORDOC_GEMINI_MODEL ?? "gemini-2.5-flash";
2326
+ args = ["--prompt", prompt, "--yolo", "--model", model];
2320
2327
  } else {
2321
2328
  args = ["--print", prompt];
2322
2329
  const model = process.env.KORDOC_CLAUDE_MODEL;
@@ -2415,7 +2422,7 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
2415
2422
  return createCliOcrProvider(mode);
2416
2423
  }
2417
2424
  const detected = detectAvailableOcr();
2418
- if (detected !== "gemini") {
2425
+ if (detected !== "codex") {
2419
2426
  if (detected === "tesseract") {
2420
2427
  warnings?.push({
2421
2428
  message: getTesseractFallbackMessage(),
@@ -2423,7 +2430,7 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
2423
2430
  });
2424
2431
  } else {
2425
2432
  warnings?.push({
2426
- message: `OCR: '${detected}' \uC0AC\uC6A9 \uC911 (gemini CLI\uAC00 \uC5C6\uC5B4 fallback). \uB354 \uB098\uC740 \uD488\uC9C8\uC744 \uC704\uD574 gemini CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4.`,
2433
+ message: `OCR: '${detected}' \uC0AC\uC6A9 \uC911 (codex CLI\uAC00 \uC5C6\uC5B4 fallback). \uB354 \uB098\uC740 \uD488\uC9C8\uC744 \uC704\uD574 codex CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4.`,
2427
2434
  code: "OCR_CLI_FALLBACK"
2428
2435
  });
2429
2436
  }
@@ -2664,22 +2671,22 @@ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warn
2664
2671
  if (pageFilter && !pageFilter.has(i)) continue;
2665
2672
  pageNumbers.push(i);
2666
2673
  }
2667
- const pageImages = [];
2668
- for (const pageNum of pageNumbers) {
2669
- const page = await doc.getPage(pageNum);
2670
- const image = await renderPageToPng(page);
2671
- pageImages.push({ image, pageNum });
2672
- }
2673
- const batches = [];
2674
- for (let i = 0; i < pageImages.length; i += provider.batchSize) {
2675
- batches.push(pageImages.slice(i, i + provider.batchSize));
2674
+ const pageBatches = [];
2675
+ for (let i = 0; i < pageNumbers.length; i += provider.batchSize) {
2676
+ pageBatches.push(pageNumbers.slice(i, i + provider.batchSize));
2676
2677
  }
2677
2678
  let processed = 0;
2678
- const batchTasks = batches.map((batch, batchIdx) => async () => {
2679
+ const batchTasks = pageBatches.map((batchPageNums, batchIdx) => async () => {
2679
2680
  const pageBlocks = [];
2680
2681
  try {
2681
- const results = await provider.processBatch(batch);
2682
- for (const { pageNum } of batch) {
2682
+ const batchImages = [];
2683
+ for (const pageNum of batchPageNums) {
2684
+ const page = await doc.getPage(pageNum);
2685
+ const image = await renderPageToPng(page);
2686
+ batchImages.push({ image, pageNum });
2687
+ }
2688
+ const results = await provider.processBatch(batchImages);
2689
+ for (const { pageNum } of batchImages) {
2683
2690
  const result = results.get(pageNum);
2684
2691
  pageBlocks.push({
2685
2692
  pageNum,
@@ -2687,16 +2694,16 @@ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warn
2687
2694
  });
2688
2695
  }
2689
2696
  } catch (err) {
2690
- const range = `${batch[0].pageNum}-${batch[batch.length - 1].pageNum}`;
2697
+ const range = `${batchPageNums[0]}-${batchPageNums[batchPageNums.length - 1]}`;
2691
2698
  warnings?.push({
2692
2699
  message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
2693
2700
  code: "OCR_PAGE_FAILED"
2694
2701
  });
2695
- for (const { pageNum } of batch) {
2702
+ for (const pageNum of batchPageNums) {
2696
2703
  pageBlocks.push({ pageNum, blocks: [] });
2697
2704
  }
2698
2705
  }
2699
- processed += batch.length;
2706
+ processed += batchPageNums.length;
2700
2707
  onProgress?.(processed, pageNumbers.length);
2701
2708
  return { batchIdx, pageBlocks };
2702
2709
  });
@@ -2772,24 +2779,29 @@ function isPdfFile(buffer) {
2772
2779
  const b = magicBytes(buffer);
2773
2780
  return b[0] === 37 && b[1] === 80 && b[2] === 68 && b[3] === 70;
2774
2781
  }
2782
+ function isPngFile(buffer) {
2783
+ const b = magicBytes(buffer);
2784
+ return b[0] === 137 && b[1] === 80 && b[2] === 78 && b[3] === 71;
2785
+ }
2775
2786
  function detectFormat(buffer) {
2776
2787
  if (buffer.byteLength < 4) return "unknown";
2777
2788
  if (isZipFile(buffer)) return "hwpx";
2778
2789
  if (isOldHwpFile(buffer)) return "hwp";
2779
2790
  if (isPdfFile(buffer)) return "pdf";
2791
+ if (isPngFile(buffer)) return "image";
2780
2792
  return "unknown";
2781
2793
  }
2782
2794
  async function detectZipFormat(buffer) {
2783
2795
  try {
2784
2796
  const zip = await import_jszip.default.loadAsync(buffer);
2785
- if (zip.file("xl/workbook.xml")) return "xlsx";
2786
- if (zip.file("word/document.xml")) return "docx";
2787
- if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return "hwpx";
2797
+ if (zip.file("xl/workbook.xml")) return { format: "xlsx", zip };
2798
+ if (zip.file("word/document.xml")) return { format: "docx", zip };
2799
+ if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return { format: "hwpx", zip };
2788
2800
  const hasSection = Object.keys(zip.files).some((f) => f.startsWith("Contents/"));
2789
- if (hasSection) return "hwpx";
2790
- return "unknown";
2801
+ if (hasSection) return { format: "hwpx", zip };
2802
+ return { format: "unknown", zip: null };
2791
2803
  } catch {
2792
- return "unknown";
2804
+ return { format: "unknown", zip: null };
2793
2805
  }
2794
2806
  }
2795
2807
 
@@ -2798,7 +2810,7 @@ var import_jszip2 = __toESM(require("jszip"), 1);
2798
2810
  var import_xmldom = require("@xmldom/xmldom");
2799
2811
 
2800
2812
  // src/utils.ts
2801
- var VERSION = true ? "2.3.1" : "0.0.0-dev";
2813
+ var VERSION = true ? "2.3.3" : "0.0.0-dev";
2802
2814
  function toArrayBuffer(buf) {
2803
2815
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
2804
2816
  return buf.buffer;
@@ -2958,12 +2970,16 @@ function buildTableDirect(rows, numRows) {
2958
2970
  return trimAndReturn(grid, numRows, maxCols);
2959
2971
  }
2960
2972
  function trimAndReturn(grid, numRows, maxCols) {
2961
- let effectiveCols = maxCols;
2962
- while (effectiveCols > 0) {
2963
- const colEmpty = grid.every((row) => !row[effectiveCols - 1]?.text?.trim());
2964
- if (!colEmpty) break;
2965
- effectiveCols--;
2973
+ let effectiveCols = 0;
2974
+ for (const row of grid) {
2975
+ for (let c = row.length - 1; c >= effectiveCols; c--) {
2976
+ if (row[c]?.text?.trim()) {
2977
+ effectiveCols = c + 1;
2978
+ break;
2979
+ }
2980
+ }
2966
2981
  }
2982
+ if (effectiveCols === 0) effectiveCols = maxCols;
2967
2983
  if (effectiveCols < maxCols && effectiveCols > 0) {
2968
2984
  const trimmed = grid.map((row) => row.slice(0, effectiveCols));
2969
2985
  return { rows: numRows, cols: effectiveCols, cells: trimmed, hasHeader: numRows > 1 };
@@ -3220,11 +3236,11 @@ function parseStyleElements(doc, map) {
3220
3236
  function stripDtd(xml) {
3221
3237
  return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
3222
3238
  }
3223
- async function parseHwpxDocument(buffer, options) {
3239
+ async function parseHwpxDocument(buffer, options, existingZip) {
3224
3240
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
3225
3241
  let zip;
3226
3242
  try {
3227
- zip = await import_jszip2.default.loadAsync(buffer);
3243
+ zip = existingZip ?? await import_jszip2.default.loadAsync(buffer);
3228
3244
  } catch {
3229
3245
  return await extractFromBrokenZip(buffer);
3230
3246
  }
@@ -6236,8 +6252,15 @@ var import_pdf2 = require("pdfjs-dist/legacy/build/pdf.mjs");
6236
6252
  import_pdf2.GlobalWorkerOptions.workerSrc = "";
6237
6253
  var MAX_PAGES = 5e3;
6238
6254
  var MAX_TOTAL_TEXT = 500 * 1024 * 1024;
6239
- var PDF_LOAD_TIMEOUT_MS = 3e4;
6255
+ function calcPdfTimeout(bufferSize) {
6256
+ const base = 3e4;
6257
+ const perMb = 500;
6258
+ const mb = bufferSize / (1024 * 1024);
6259
+ return Math.min(base + Math.ceil(mb * perMb), 3e5);
6260
+ }
6240
6261
  async function loadPdfWithTimeout(buffer) {
6262
+ const timeoutMs = calcPdfTimeout(buffer.byteLength);
6263
+ const timeoutSec = Math.round(timeoutMs / 1e3);
6241
6264
  const loadingTask = (0, import_pdf2.getDocument)({
6242
6265
  data: new Uint8Array(buffer),
6243
6266
  useSystemFonts: true,
@@ -6251,8 +6274,8 @@ async function loadPdfWithTimeout(buffer) {
6251
6274
  new Promise((_, reject) => {
6252
6275
  timer = setTimeout(() => {
6253
6276
  loadingTask.destroy();
6254
- reject(new KordocError("PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (30\uCD08 \uCD08\uACFC)"));
6255
- }, PDF_LOAD_TIMEOUT_MS);
6277
+ reject(new KordocError(`PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (${timeoutSec}\uCD08 \uCD08\uACFC)`));
6278
+ }, timeoutMs);
6256
6279
  })
6257
6280
  ]);
6258
6281
  } finally {
@@ -6273,11 +6296,15 @@ async function parsePdfDocument(buffer, options) {
6273
6296
  const effectivePageCount = Math.min(pageCount, MAX_PAGES);
6274
6297
  const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
6275
6298
  const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
6276
- const allFontSizes = [];
6299
+ const fontSizeFreq = /* @__PURE__ */ new Map();
6277
6300
  const pageHeights = /* @__PURE__ */ new Map();
6278
- let parsedPages = 0;
6301
+ const targetPageNums = [];
6279
6302
  for (let i = 1; i <= effectivePageCount; i++) {
6280
6303
  if (pageFilter && !pageFilter.has(i)) continue;
6304
+ targetPageNums.push(i);
6305
+ }
6306
+ let parsedPages = 0;
6307
+ const parseSinglePage = async (i) => {
6281
6308
  try {
6282
6309
  const page = await doc.getPage(i);
6283
6310
  const tc = await page.getTextContent();
@@ -6290,7 +6317,10 @@ async function parsePdfDocument(buffer, options) {
6290
6317
  warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
6291
6318
  }
6292
6319
  for (const item of visible) {
6293
- if (item.fontSize > 0) allFontSizes.push(item.fontSize);
6320
+ if (item.fontSize > 0) {
6321
+ const rounded = Math.round(item.fontSize * 10) / 10;
6322
+ fontSizeFreq.set(rounded, (fontSizeFreq.get(rounded) || 0) + 1);
6323
+ }
6294
6324
  }
6295
6325
  const opList = await page.getOperatorList();
6296
6326
  const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
@@ -6307,12 +6337,34 @@ async function parsePdfDocument(buffer, options) {
6307
6337
  if (pageErr instanceof KordocError) throw pageErr;
6308
6338
  warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
6309
6339
  }
6340
+ };
6341
+ const SAMPLE_SIZE = Math.min(10, targetPageNums.length);
6342
+ const sampledIndices = /* @__PURE__ */ new Set();
6343
+ if (targetPageNums.length <= SAMPLE_SIZE) {
6344
+ for (let i = 0; i < targetPageNums.length; i++) sampledIndices.add(i);
6345
+ } else {
6346
+ for (let i = 0; i < SAMPLE_SIZE; i++) {
6347
+ const idx = Math.round(i * (targetPageNums.length - 1) / (SAMPLE_SIZE - 1));
6348
+ sampledIndices.add(idx);
6349
+ }
6350
+ }
6351
+ for (const si of sampledIndices) {
6352
+ await parseSinglePage(targetPageNums[si]);
6353
+ }
6354
+ const sampleParsed = parsedPages || sampledIndices.size;
6355
+ const isImageBased = totalChars / Math.max(sampleParsed, 1) < 10;
6356
+ if (!isImageBased) {
6357
+ for (let si = 0; si < targetPageNums.length; si++) {
6358
+ if (!sampledIndices.has(si)) {
6359
+ await parseSinglePage(targetPageNums[si]);
6360
+ }
6361
+ }
6310
6362
  }
6311
6363
  const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
6312
- if (totalChars / Math.max(parsedPageCount, 1) < 10) {
6364
+ if (isImageBased) {
6313
6365
  let ocrProvider = options?.ocr ?? null;
6314
- const ocrMode = options?.ocrMode;
6315
- if (!ocrProvider && ocrMode && ocrMode !== "off") {
6366
+ const ocrMode = options?.ocrMode ?? "auto";
6367
+ if (!ocrProvider && ocrMode !== "off") {
6316
6368
  try {
6317
6369
  const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
6318
6370
  const concurrency = options?.ocrConcurrency ?? 1;
@@ -6364,7 +6416,7 @@ async function parsePdfDocument(buffer, options) {
6364
6416
  blocks.splice(removed[ri], 1);
6365
6417
  }
6366
6418
  }
6367
- const medianFontSize = computeMedianFontSize(allFontSizes);
6419
+ const medianFontSize = computeMedianFromFreq(fontSizeFreq);
6368
6420
  if (medianFontSize > 0) {
6369
6421
  detectHeadings(blocks, medianFontSize);
6370
6422
  }
@@ -6417,11 +6469,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
6417
6469
  }
6418
6470
  return { visible, hiddenCount };
6419
6471
  }
6420
- function computeMedianFontSize(sizes) {
6421
- if (sizes.length === 0) return 0;
6422
- const sorted = [...sizes].sort((a, b) => a - b);
6423
- const mid = Math.floor(sorted.length / 2);
6424
- return sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid];
6472
+ function computeMedianFromFreq(freq) {
6473
+ if (freq.size === 0) return 0;
6474
+ const entries = [...freq.entries()].sort((a, b) => a[0] - b[0]);
6475
+ let total = 0;
6476
+ for (const [, count] of entries) total += count;
6477
+ const mid = total / 2;
6478
+ let cumulative = 0;
6479
+ for (const [size, count] of entries) {
6480
+ cumulative += count;
6481
+ if (cumulative >= mid) return size;
6482
+ }
6483
+ return 0;
6425
6484
  }
6426
6485
  function detectHeadings(blocks, medianFontSize) {
6427
6486
  for (const block of blocks) {
@@ -7224,6 +7283,7 @@ var MAX_SHEETS = 100;
7224
7283
  var MAX_DECOMPRESS_SIZE3 = 500 * 1024 * 1024;
7225
7284
  var MAX_ROWS2 = 1e4;
7226
7285
  var MAX_COLS2 = 200;
7286
+ var MAX_TOTAL_CELLS = 2e6;
7227
7287
  function cleanNumericValue(raw) {
7228
7288
  if (!/^-?\d+\.\d+$/.test(raw)) return raw;
7229
7289
  const num = parseFloat(raw);
@@ -7407,9 +7467,9 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
7407
7467
  }
7408
7468
  return blocks;
7409
7469
  }
7410
- async function parseXlsxDocument(buffer, options) {
7470
+ async function parseXlsxDocument(buffer, options, existingZip) {
7411
7471
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
7412
- const zip = await import_jszip3.default.loadAsync(buffer);
7472
+ const zip = existingZip ?? await import_jszip3.default.loadAsync(buffer);
7413
7473
  const warnings = [];
7414
7474
  const workbookFile = zip.file("xl/workbook.xml");
7415
7475
  if (!workbookFile) {
@@ -7436,6 +7496,7 @@ async function parseXlsxDocument(buffer, options) {
7436
7496
  }
7437
7497
  const blocks = [];
7438
7498
  const processedSheets = Math.min(sheets.length, MAX_SHEETS);
7499
+ let totalCells = 0;
7439
7500
  for (let i = 0; i < processedSheets; i++) {
7440
7501
  if (pageFilter && !pageFilter.has(i + 1)) continue;
7441
7502
  const sheet = sheets[i];
@@ -7462,6 +7523,11 @@ async function parseXlsxDocument(buffer, options) {
7462
7523
  try {
7463
7524
  const sheetXml = await sheetFile.async("text");
7464
7525
  const { grid, merges, maxRow, maxCol } = parseWorksheet(sheetXml, sharedStrings);
7526
+ totalCells += maxRow * maxCol;
7527
+ if (totalCells > MAX_TOTAL_CELLS) {
7528
+ warnings.push({ message: `\uCD1D \uC140 \uC218 \uC81C\uD55C \uCD08\uACFC (${totalCells.toLocaleString()}\uC140), \uC774\uD6C4 \uC2DC\uD2B8 \uC0DD\uB7B5`, code: "PARTIAL_PARSE" });
7529
+ break;
7530
+ }
7465
7531
  const sheetBlocks = sheetToBlocks(sheet.name, grid, merges, maxRow, maxCol, i);
7466
7532
  blocks.push(...sheetBlocks);
7467
7533
  } catch (err) {
@@ -7545,10 +7611,35 @@ function getAttr(el, localName) {
7545
7611
  function parseXml2(text) {
7546
7612
  return new import_xmldom3.DOMParser().parseFromString(text, "text/xml");
7547
7613
  }
7614
+ function buildElementIndex(root) {
7615
+ const index = /* @__PURE__ */ new Map();
7616
+ const walk = (node) => {
7617
+ const children = node.childNodes;
7618
+ for (let i = 0; i < children.length; i++) {
7619
+ const child = children[i];
7620
+ if (child.nodeType === 1) {
7621
+ const el = child;
7622
+ const name = el.localName ?? "";
7623
+ if (name) {
7624
+ let list = index.get(name);
7625
+ if (!list) {
7626
+ list = [];
7627
+ index.set(name, list);
7628
+ }
7629
+ list.push(el);
7630
+ }
7631
+ walk(el);
7632
+ }
7633
+ }
7634
+ };
7635
+ walk(root);
7636
+ return index;
7637
+ }
7548
7638
  function parseStyles(xml) {
7549
7639
  const doc = parseXml2(xml);
7550
7640
  const styles = /* @__PURE__ */ new Map();
7551
- const styleElements = findElements(doc, "style");
7641
+ const idx = buildElementIndex(doc);
7642
+ const styleElements = idx.get("style") ?? [];
7552
7643
  for (const el of styleElements) {
7553
7644
  const styleId = getAttr(el, "styleId");
7554
7645
  if (!styleId) continue;
@@ -7576,7 +7667,8 @@ function parseStyles(xml) {
7576
7667
  function parseNumbering(xml) {
7577
7668
  const doc = parseXml2(xml);
7578
7669
  const abstractNums = /* @__PURE__ */ new Map();
7579
- const abstractElements = findElements(doc, "abstractNum");
7670
+ const idx = buildElementIndex(doc);
7671
+ const abstractElements = idx.get("abstractNum") ?? [];
7580
7672
  for (const el of abstractElements) {
7581
7673
  const abstractNumId = getAttr(el, "abstractNumId");
7582
7674
  if (!abstractNumId) continue;
@@ -7591,7 +7683,7 @@ function parseNumbering(xml) {
7591
7683
  abstractNums.set(abstractNumId, levels);
7592
7684
  }
7593
7685
  const nums = /* @__PURE__ */ new Map();
7594
- const numElements = findElements(doc, "num");
7686
+ const numElements = idx.get("num") ?? [];
7595
7687
  for (const el of numElements) {
7596
7688
  const numId = getAttr(el, "numId");
7597
7689
  if (!numId) continue;
@@ -7835,9 +7927,9 @@ async function extractImages(zip, rels, doc) {
7835
7927
  }
7836
7928
  return { blocks, images };
7837
7929
  }
7838
- async function parseDocxDocument(buffer, options) {
7930
+ async function parseDocxDocument(buffer, options, existingZip) {
7839
7931
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
7840
- const zip = await import_jszip4.default.loadAsync(buffer);
7932
+ const zip = existingZip ?? await import_jszip4.default.loadAsync(buffer);
7841
7933
  const warnings = [];
7842
7934
  const docFile = zip.file("word/document.xml");
7843
7935
  if (!docFile) {
@@ -7927,6 +8019,11 @@ async function parseDocxDocument(buffer, options) {
7927
8019
  };
7928
8020
  }
7929
8021
 
8022
+ // src/index.ts
8023
+ init_cli_provider();
8024
+ init_tesseract_provider();
8025
+ init_markdown_to_blocks();
8026
+
7930
8027
  // src/diff/text-diff.ts
7931
8028
  function similarity(a, b) {
7932
8029
  if (a === b) return 1;
@@ -10443,25 +10540,86 @@ async function parse2(input, options) {
10443
10540
  if (!buffer || buffer.byteLength === 0) {
10444
10541
  return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.", code: "EMPTY_INPUT" };
10445
10542
  }
10543
+ const MAX_FILE_SIZE = 500 * 1024 * 1024;
10544
+ if (buffer.byteLength > MAX_FILE_SIZE) {
10545
+ return { success: false, fileType: "unknown", error: `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`, code: "FILE_TOO_LARGE" };
10546
+ }
10446
10547
  const format = detectFormat(buffer);
10447
10548
  switch (format) {
10448
10549
  case "hwpx": {
10449
- const zipFormat = await detectZipFormat(buffer);
10450
- if (zipFormat === "xlsx") return parseXlsx(buffer, options);
10451
- if (zipFormat === "docx") return parseDocx(buffer, options);
10452
- return parseHwpx(buffer, options);
10550
+ const { format: zipFormat, zip } = await detectZipFormat(buffer);
10551
+ if (zipFormat === "xlsx") return parseXlsx(buffer, options, zip ?? void 0);
10552
+ if (zipFormat === "docx") return parseDocx(buffer, options, zip ?? void 0);
10553
+ return parseHwpx(buffer, options, zip ?? void 0);
10453
10554
  }
10454
10555
  case "hwp":
10455
10556
  return parseHwp(buffer, options);
10456
10557
  case "pdf":
10457
10558
  return parsePdf(buffer, options);
10559
+ case "image":
10560
+ return parseImage(buffer, options);
10458
10561
  default:
10459
10562
  return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
10460
10563
  }
10461
10564
  }
10462
- async function parseHwpx(buffer, options) {
10565
+ async function parseImage(buffer, options) {
10566
+ const ocrMode = options?.ocrMode || "auto";
10567
+ if (ocrMode === "off") {
10568
+ return { success: false, fileType: "image", error: "OCR\uC774 \uBE44\uD65C\uC131\uD654\uB418\uC5B4 \uC774\uBBF8\uC9C0 \uD30C\uC77C\uC744 \uCC98\uB9AC\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "IMAGE_BASED_PDF" };
10569
+ }
10570
+ let ocrProvider;
10571
+ let actualOcrMode = "auto";
10572
+ try {
10573
+ if (ocrMode === "gemini" || ocrMode === "claude" || ocrMode === "codex" || ocrMode === "ollama") {
10574
+ ocrProvider = createCliOcrProvider(ocrMode);
10575
+ actualOcrMode = ocrMode;
10576
+ } else if (ocrMode === "tesseract") {
10577
+ ocrProvider = await createTesseractProvider();
10578
+ actualOcrMode = ocrMode;
10579
+ } else if (ocrMode === "auto") {
10580
+ const modesToTry = ["gemini", "claude", "codex", "ollama"];
10581
+ for (const mode of modesToTry) {
10582
+ try {
10583
+ ocrProvider = createCliOcrProvider(mode);
10584
+ actualOcrMode = mode;
10585
+ break;
10586
+ } catch (e) {
10587
+ console.warn(`[kordoc] OCR auto-detection: ${mode} CLI not available or failed. Trying next.`, e);
10588
+ }
10589
+ }
10590
+ if (!ocrProvider) {
10591
+ ocrProvider = await createTesseractProvider();
10592
+ actualOcrMode = "tesseract";
10593
+ }
10594
+ }
10595
+ if (!ocrProvider) {
10596
+ return { success: false, fileType: "image", error: "\uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR \uD504\uB85C\uBC14\uC774\uB354\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "PARSE_ERROR" };
10597
+ }
10598
+ const imageUint8Array = new Uint8Array(buffer);
10599
+ const ocrResult = await ocrProvider(imageUint8Array, 1, "image/png");
10600
+ if (ocrProvider.terminate) {
10601
+ await ocrProvider.terminate();
10602
+ }
10603
+ const markdown = typeof ocrResult === "string" ? ocrResult : ocrResult.markdown;
10604
+ const blocks = markdownToBlocks(markdown, 1);
10605
+ return {
10606
+ success: true,
10607
+ fileType: "image",
10608
+ markdown,
10609
+ blocks,
10610
+ isImageBased: true,
10611
+ warnings: [{ message: `OCR \uCC98\uB9AC\uB428 (${actualOcrMode})`, code: "OCR_FALLBACK" }]
10612
+ };
10613
+ } catch (err) {
10614
+ if (ocrProvider && ocrProvider.terminate) {
10615
+ await ocrProvider.terminate();
10616
+ }
10617
+ return { success: false, fileType: "image", error: err instanceof Error ? err.message : "\uC774\uBBF8\uC9C0 OCR \uC2E4\uD328", code: classifyError(err) };
10618
+ }
10619
+ }
10620
+ async function parseHwpx(buffer, options, zip) {
10463
10621
  try {
10464
- const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options);
10622
+ const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options, zip);
10465
10623
  return { success: true, fileType: "hwpx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
10466
10624
  } catch (err) {
10467
10625
  return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
@@ -10484,17 +10642,17 @@ async function parsePdf(buffer, options) {
10484
10642
  return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err), isImageBased };
10485
10643
  }
10486
10644
  }
10487
- async function parseXlsx(buffer, options) {
10645
+ async function parseXlsx(buffer, options, zip) {
10488
10646
  try {
10489
- const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options);
10647
+ const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options, zip);
10490
10648
  return { success: true, fileType: "xlsx", markdown, blocks, metadata, warnings };
10491
10649
  } catch (err) {
10492
10650
  return { success: false, fileType: "xlsx", error: err instanceof Error ? err.message : "XLSX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
10493
10651
  }
10494
10652
  }
10495
- async function parseDocx(buffer, options) {
10653
+ async function parseDocx(buffer, options, zip) {
10496
10654
  try {
10497
- const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options);
10655
+ const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options, zip);
10498
10656
  return { success: true, fileType: "docx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
10499
10657
  } catch (err) {
10500
10658
  return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };