@clazic/kordoc 2.3.1 → 2.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/dist/{batch-provider-PCT4I4LK.js → batch-provider-PNDCSGQW.js} +3 -4
  2. package/dist/batch-provider-PNDCSGQW.js.map +1 -0
  3. package/dist/chunk-4PP34NVQ.js +121 -0
  4. package/dist/chunk-4PP34NVQ.js.map +1 -0
  5. package/dist/{tesseract-provider-WCVJWBUT.js → chunk-7FMKAV4P.js} +4 -4
  6. package/dist/{tesseract-provider-WCVJWBUT.js.map → chunk-7FMKAV4P.js.map} +1 -1
  7. package/dist/chunk-JOGAFNIL.js +153 -0
  8. package/dist/chunk-JOGAFNIL.js.map +1 -0
  9. package/dist/{chunk-W5KUC23B.js → chunk-NU3KFVVZ.js} +2 -2
  10. package/dist/{chunk-ZOEUKD77.js → chunk-UDFKY7CH.js} +204 -49
  11. package/dist/chunk-UDFKY7CH.js.map +1 -0
  12. package/dist/cli.js +8 -5
  13. package/dist/cli.js.map +1 -1
  14. package/dist/index.cjs +230 -72
  15. package/dist/index.cjs.map +1 -1
  16. package/dist/index.d.cts +11 -6
  17. package/dist/index.d.ts +11 -6
  18. package/dist/index.js +230 -72
  19. package/dist/index.js.map +1 -1
  20. package/dist/mcp.js +5 -2
  21. package/dist/mcp.js.map +1 -1
  22. package/dist/{provider-WYHC4NHI.js → provider-HE727F7Z.js} +19 -131
  23. package/dist/provider-HE727F7Z.js.map +1 -0
  24. package/dist/resolve-UOAOPQ4H.js +111 -0
  25. package/dist/resolve-UOAOPQ4H.js.map +1 -0
  26. package/dist/tesseract-provider-MNMZPSGF.js +11 -0
  27. package/dist/{utils-HSF5HI5T.js → utils-STJT6CFC.js} +2 -2
  28. package/dist/utils-STJT6CFC.js.map +1 -0
  29. package/dist/{watch-R2JHXDGF.js → watch-PRQGLOW3.js} +6 -3
  30. package/dist/{watch-R2JHXDGF.js.map → watch-PRQGLOW3.js.map} +1 -1
  31. package/package.json +8 -8
  32. package/dist/batch-provider-PCT4I4LK.js.map +0 -1
  33. package/dist/chunk-ZOEUKD77.js.map +0 -1
  34. package/dist/provider-WYHC4NHI.js.map +0 -1
  35. package/dist/resolve-4FSAQF2S.js +0 -247
  36. package/dist/resolve-4FSAQF2S.js.map +0 -1
  37. /package/dist/{chunk-W5KUC23B.js.map → chunk-NU3KFVVZ.js.map} +0 -0
  38. /package/dist/{utils-HSF5HI5T.js.map → tesseract-provider-MNMZPSGF.js.map} +0 -0
package/dist/index.js CHANGED
@@ -1998,8 +1998,8 @@ function getTesseractFallbackMessage() {
1998
1998
  "\uC124\uCE58\uB41C AI CLI\uAC00 \uC5C6\uC5B4 \uB0B4\uC7A5 tesseract.js\uB85C OCR\uC744 \uC218\uD589\uD569\uB2C8\uB2E4.",
1999
1999
  "\uB354 \uB098\uC740 \uD488\uC9C8(\uD14C\uC774\uBE14/\uD5E4\uB529 \uAD6C\uC870 \uBCF4\uC874)\uC744 \uC704\uD574 AI CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4:",
2000
2000
  "",
2001
- " [\uAD8C\uC7A5] Gemini CLI: https://ai.google.dev/gemini-api/docs/cli",
2002
- " Codex CLI: npm install -g @openai/codex",
2001
+ " [\uAD8C\uC7A5] Codex CLI: npm install -g @openai/codex",
2002
+ " Gemini CLI: https://ai.google.dev/gemini-api/docs/cli",
2003
2003
  " Claude CLI: npm install -g @anthropic-ai/claude-code",
2004
2004
  " Ollama: brew install ollama (+ ollama pull gemma4:27b)"
2005
2005
  ].join("\n");
@@ -2008,7 +2008,7 @@ var CLI_PRIORITY;
2008
2008
  var init_auto_detect = __esm({
2009
2009
  "src/ocr/auto-detect.ts"() {
2010
2010
  "use strict";
2011
- CLI_PRIORITY = ["gemini", "codex", "claude", "ollama"];
2011
+ CLI_PRIORITY = ["codex", "gemini", "claude", "ollama"];
2012
2012
  }
2013
2013
  });
2014
2014
 
@@ -2051,7 +2051,7 @@ function callCli(mode, imagePath) {
2051
2051
  const args = buildCliArgs(mode, imagePath);
2052
2052
  const result = spawnSync(mode, args, {
2053
2053
  encoding: "utf-8",
2054
- timeout: 18e4,
2054
+ timeout: 6e5,
2055
2055
  maxBuffer: 10 * 1024 * 1024,
2056
2056
  // claude: /tmp에서 실행하여 프로젝트 CLAUDE.md의 규칙 간섭 방지
2057
2057
  ...mode === "claude" ? { cwd: tmpdir() } : {}
@@ -2145,14 +2145,22 @@ async function callOllamaApi(imagePath) {
2145
2145
  return data.message?.content || "";
2146
2146
  }
2147
2147
  function stripCodeFence(text) {
2148
- const match = text.match(/^```(?:markdown|md)?\s*\n([\s\S]*?)\n```\s*$/m);
2148
+ const match = text.match(/^```(?:markdown|md)?\s*([\s\S]*?)```\s*$/m);
2149
2149
  return match ? match[1].trim() : text;
2150
2150
  }
2151
2151
  var OCR_PROMPT, _tempDir;
2152
2152
  var init_cli_provider = __esm({
2153
2153
  "src/ocr/cli-provider.ts"() {
2154
2154
  "use strict";
2155
- OCR_PROMPT = "\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD14C\uC774\uBE14\uC744 \uCD94\uCD9C\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uC138\uC694.\n\uADDC\uCE59:\n- \uD14C\uC774\uBE14\uC740 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)\n- \uBCD1\uD569\uB41C \uC140\uC740 \uD574\uB2F9 \uC704\uCE58\uC5D0 \uB0B4\uC6A9 \uAE30\uC7AC\n- \uD5E4\uB529\uC740 \uAE00\uC790 \uD06C\uAE30\uC5D0 \uB530\uB77C ## ~ ###### \uC0AC\uC6A9\n- \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9\n- \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC\n- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0\n- ```\uB85C \uAC10\uC2F8\uC9C0 \uB9D0\uACE0 \uC21C\uC218 Markdown\uB9CC \uCD9C\uB825";
2155
+ OCR_PROMPT = `\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD14C\uC774\uBE14\uC744 \uCD94\uCD9C\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uC138\uC694.
2156
+ \uADDC\uCE59:
2157
+ - \uD14C\uC774\uBE14\uC740 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)
2158
+ - \uBCD1\uD569\uB41C \uC140\uC740 \uD574\uB2F9 \uC704\uCE58\uC5D0 \uB0B4\uC6A9 \uAE30\uC7AC
2159
+ - \uD5E4\uB529\uC740 \uAE00\uC790 \uD06C\uAE30\uC5D0 \uB530\uB77C ## ~ ###### \uC0AC\uC6A9
2160
+ - \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9
2161
+ - \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC
2162
+ - \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0
2163
+ - \`\`\`\uB85C \uAC10\uC2F8\uC9C0 \uB9D0\uACE0 \uC21C\uC218 Markdown\uB9CC \uCD9C\uB825`;
2156
2164
  _tempDir = null;
2157
2165
  }
2158
2166
  });
@@ -2321,9 +2329,8 @@ async function callBatchCli(mode, imagePaths) {
2321
2329
  ${fileRefs}`;
2322
2330
  let args;
2323
2331
  if (mode === "gemini") {
2324
- args = ["--prompt", prompt, "--yolo"];
2325
- const model = process.env.KORDOC_GEMINI_MODEL;
2326
- if (model) args.push("--model", model);
2332
+ const model = process.env.KORDOC_GEMINI_MODEL ?? "gemini-2.5-flash";
2333
+ args = ["--prompt", prompt, "--yolo", "--model", model];
2327
2334
  } else {
2328
2335
  args = ["--print", prompt];
2329
2336
  const model = process.env.KORDOC_CLAUDE_MODEL;
@@ -2418,7 +2425,7 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
2418
2425
  return createCliOcrProvider(mode);
2419
2426
  }
2420
2427
  const detected = detectAvailableOcr();
2421
- if (detected !== "gemini") {
2428
+ if (detected !== "codex") {
2422
2429
  if (detected === "tesseract") {
2423
2430
  warnings?.push({
2424
2431
  message: getTesseractFallbackMessage(),
@@ -2426,7 +2433,7 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
2426
2433
  });
2427
2434
  } else {
2428
2435
  warnings?.push({
2429
- message: `OCR: '${detected}' \uC0AC\uC6A9 \uC911 (gemini CLI\uAC00 \uC5C6\uC5B4 fallback). \uB354 \uB098\uC740 \uD488\uC9C8\uC744 \uC704\uD574 gemini CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4.`,
2436
+ message: `OCR: '${detected}' \uC0AC\uC6A9 \uC911 (codex CLI\uAC00 \uC5C6\uC5B4 fallback). \uB354 \uB098\uC740 \uD488\uC9C8\uC744 \uC704\uD574 codex CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4.`,
2430
2437
  code: "OCR_CLI_FALLBACK"
2431
2438
  });
2432
2439
  }
@@ -2667,22 +2674,22 @@ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warn
2667
2674
  if (pageFilter && !pageFilter.has(i)) continue;
2668
2675
  pageNumbers.push(i);
2669
2676
  }
2670
- const pageImages = [];
2671
- for (const pageNum of pageNumbers) {
2672
- const page = await doc.getPage(pageNum);
2673
- const image = await renderPageToPng(page);
2674
- pageImages.push({ image, pageNum });
2675
- }
2676
- const batches = [];
2677
- for (let i = 0; i < pageImages.length; i += provider.batchSize) {
2678
- batches.push(pageImages.slice(i, i + provider.batchSize));
2677
+ const pageBatches = [];
2678
+ for (let i = 0; i < pageNumbers.length; i += provider.batchSize) {
2679
+ pageBatches.push(pageNumbers.slice(i, i + provider.batchSize));
2679
2680
  }
2680
2681
  let processed = 0;
2681
- const batchTasks = batches.map((batch, batchIdx) => async () => {
2682
+ const batchTasks = pageBatches.map((batchPageNums, batchIdx) => async () => {
2682
2683
  const pageBlocks = [];
2683
2684
  try {
2684
- const results = await provider.processBatch(batch);
2685
- for (const { pageNum } of batch) {
2685
+ const batchImages = [];
2686
+ for (const pageNum of batchPageNums) {
2687
+ const page = await doc.getPage(pageNum);
2688
+ const image = await renderPageToPng(page);
2689
+ batchImages.push({ image, pageNum });
2690
+ }
2691
+ const results = await provider.processBatch(batchImages);
2692
+ for (const { pageNum } of batchImages) {
2686
2693
  const result = results.get(pageNum);
2687
2694
  pageBlocks.push({
2688
2695
  pageNum,
@@ -2690,16 +2697,16 @@ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warn
2690
2697
  });
2691
2698
  }
2692
2699
  } catch (err) {
2693
- const range = `${batch[0].pageNum}-${batch[batch.length - 1].pageNum}`;
2700
+ const range = `${batchPageNums[0]}-${batchPageNums[batchPageNums.length - 1]}`;
2694
2701
  warnings?.push({
2695
2702
  message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
2696
2703
  code: "OCR_PAGE_FAILED"
2697
2704
  });
2698
- for (const { pageNum } of batch) {
2705
+ for (const pageNum of batchPageNums) {
2699
2706
  pageBlocks.push({ pageNum, blocks: [] });
2700
2707
  }
2701
2708
  }
2702
- processed += batch.length;
2709
+ processed += batchPageNums.length;
2703
2710
  onProgress?.(processed, pageNumbers.length);
2704
2711
  return { batchIdx, pageBlocks };
2705
2712
  });
@@ -2752,24 +2759,29 @@ function isPdfFile(buffer) {
2752
2759
  const b = magicBytes(buffer);
2753
2760
  return b[0] === 37 && b[1] === 80 && b[2] === 68 && b[3] === 70;
2754
2761
  }
2762
+ function isPngFile(buffer) {
2763
+ const b = magicBytes(buffer);
2764
+ return b[0] === 137 && b[1] === 80 && b[2] === 78 && b[3] === 71;
2765
+ }
2755
2766
  function detectFormat(buffer) {
2756
2767
  if (buffer.byteLength < 4) return "unknown";
2757
2768
  if (isZipFile(buffer)) return "hwpx";
2758
2769
  if (isOldHwpFile(buffer)) return "hwp";
2759
2770
  if (isPdfFile(buffer)) return "pdf";
2771
+ if (isPngFile(buffer)) return "image";
2760
2772
  return "unknown";
2761
2773
  }
2762
2774
  async function detectZipFormat(buffer) {
2763
2775
  try {
2764
2776
  const zip = await JSZip.loadAsync(buffer);
2765
- if (zip.file("xl/workbook.xml")) return "xlsx";
2766
- if (zip.file("word/document.xml")) return "docx";
2767
- if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return "hwpx";
2777
+ if (zip.file("xl/workbook.xml")) return { format: "xlsx", zip };
2778
+ if (zip.file("word/document.xml")) return { format: "docx", zip };
2779
+ if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return { format: "hwpx", zip };
2768
2780
  const hasSection = Object.keys(zip.files).some((f) => f.startsWith("Contents/"));
2769
- if (hasSection) return "hwpx";
2770
- return "unknown";
2781
+ if (hasSection) return { format: "hwpx", zip };
2782
+ return { format: "unknown", zip: null };
2771
2783
  } catch {
2772
- return "unknown";
2784
+ return { format: "unknown", zip: null };
2773
2785
  }
2774
2786
  }
2775
2787
 
@@ -2778,7 +2790,7 @@ import JSZip2 from "jszip";
2778
2790
  import { DOMParser } from "@xmldom/xmldom";
2779
2791
 
2780
2792
  // src/utils.ts
2781
- var VERSION = true ? "2.3.1" : "0.0.0-dev";
2793
+ var VERSION = true ? "2.3.3" : "0.0.0-dev";
2782
2794
  function toArrayBuffer(buf) {
2783
2795
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
2784
2796
  return buf.buffer;
@@ -2938,12 +2950,16 @@ function buildTableDirect(rows, numRows) {
2938
2950
  return trimAndReturn(grid, numRows, maxCols);
2939
2951
  }
2940
2952
  function trimAndReturn(grid, numRows, maxCols) {
2941
- let effectiveCols = maxCols;
2942
- while (effectiveCols > 0) {
2943
- const colEmpty = grid.every((row) => !row[effectiveCols - 1]?.text?.trim());
2944
- if (!colEmpty) break;
2945
- effectiveCols--;
2953
+ let effectiveCols = 0;
2954
+ for (const row of grid) {
2955
+ for (let c = row.length - 1; c >= effectiveCols; c--) {
2956
+ if (row[c]?.text?.trim()) {
2957
+ effectiveCols = c + 1;
2958
+ break;
2959
+ }
2960
+ }
2946
2961
  }
2962
+ if (effectiveCols === 0) effectiveCols = maxCols;
2947
2963
  if (effectiveCols < maxCols && effectiveCols > 0) {
2948
2964
  const trimmed = grid.map((row) => row.slice(0, effectiveCols));
2949
2965
  return { rows: numRows, cols: effectiveCols, cells: trimmed, hasHeader: numRows > 1 };
@@ -3200,11 +3216,11 @@ function parseStyleElements(doc, map) {
3200
3216
  function stripDtd(xml) {
3201
3217
  return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
3202
3218
  }
3203
- async function parseHwpxDocument(buffer, options) {
3219
+ async function parseHwpxDocument(buffer, options, existingZip) {
3204
3220
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
3205
3221
  let zip;
3206
3222
  try {
3207
- zip = await JSZip2.loadAsync(buffer);
3223
+ zip = existingZip ?? await JSZip2.loadAsync(buffer);
3208
3224
  } catch {
3209
3225
  return await extractFromBrokenZip(buffer);
3210
3226
  }
@@ -6216,8 +6232,15 @@ import { getDocument, GlobalWorkerOptions } from "pdfjs-dist/legacy/build/pdf.mj
6216
6232
  GlobalWorkerOptions.workerSrc = "";
6217
6233
  var MAX_PAGES = 5e3;
6218
6234
  var MAX_TOTAL_TEXT = 500 * 1024 * 1024;
6219
- var PDF_LOAD_TIMEOUT_MS = 3e4;
6235
+ function calcPdfTimeout(bufferSize) {
6236
+ const base = 3e4;
6237
+ const perMb = 500;
6238
+ const mb = bufferSize / (1024 * 1024);
6239
+ return Math.min(base + Math.ceil(mb * perMb), 3e5);
6240
+ }
6220
6241
  async function loadPdfWithTimeout(buffer) {
6242
+ const timeoutMs = calcPdfTimeout(buffer.byteLength);
6243
+ const timeoutSec = Math.round(timeoutMs / 1e3);
6221
6244
  const loadingTask = getDocument({
6222
6245
  data: new Uint8Array(buffer),
6223
6246
  useSystemFonts: true,
@@ -6231,8 +6254,8 @@ async function loadPdfWithTimeout(buffer) {
6231
6254
  new Promise((_, reject) => {
6232
6255
  timer = setTimeout(() => {
6233
6256
  loadingTask.destroy();
6234
- reject(new KordocError("PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (30\uCD08 \uCD08\uACFC)"));
6235
- }, PDF_LOAD_TIMEOUT_MS);
6257
+ reject(new KordocError(`PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (${timeoutSec}\uCD08 \uCD08\uACFC)`));
6258
+ }, timeoutMs);
6236
6259
  })
6237
6260
  ]);
6238
6261
  } finally {
@@ -6253,11 +6276,15 @@ async function parsePdfDocument(buffer, options) {
6253
6276
  const effectivePageCount = Math.min(pageCount, MAX_PAGES);
6254
6277
  const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
6255
6278
  const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
6256
- const allFontSizes = [];
6279
+ const fontSizeFreq = /* @__PURE__ */ new Map();
6257
6280
  const pageHeights = /* @__PURE__ */ new Map();
6258
- let parsedPages = 0;
6281
+ const targetPageNums = [];
6259
6282
  for (let i = 1; i <= effectivePageCount; i++) {
6260
6283
  if (pageFilter && !pageFilter.has(i)) continue;
6284
+ targetPageNums.push(i);
6285
+ }
6286
+ let parsedPages = 0;
6287
+ const parseSinglePage = async (i) => {
6261
6288
  try {
6262
6289
  const page = await doc.getPage(i);
6263
6290
  const tc = await page.getTextContent();
@@ -6270,7 +6297,10 @@ async function parsePdfDocument(buffer, options) {
6270
6297
  warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
6271
6298
  }
6272
6299
  for (const item of visible) {
6273
- if (item.fontSize > 0) allFontSizes.push(item.fontSize);
6300
+ if (item.fontSize > 0) {
6301
+ const rounded = Math.round(item.fontSize * 10) / 10;
6302
+ fontSizeFreq.set(rounded, (fontSizeFreq.get(rounded) || 0) + 1);
6303
+ }
6274
6304
  }
6275
6305
  const opList = await page.getOperatorList();
6276
6306
  const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
@@ -6287,12 +6317,34 @@ async function parsePdfDocument(buffer, options) {
6287
6317
  if (pageErr instanceof KordocError) throw pageErr;
6288
6318
  warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
6289
6319
  }
6320
+ };
6321
+ const SAMPLE_SIZE = Math.min(10, targetPageNums.length);
6322
+ const sampledIndices = /* @__PURE__ */ new Set();
6323
+ if (targetPageNums.length <= SAMPLE_SIZE) {
6324
+ for (let i = 0; i < targetPageNums.length; i++) sampledIndices.add(i);
6325
+ } else {
6326
+ for (let i = 0; i < SAMPLE_SIZE; i++) {
6327
+ const idx = Math.round(i * (targetPageNums.length - 1) / (SAMPLE_SIZE - 1));
6328
+ sampledIndices.add(idx);
6329
+ }
6330
+ }
6331
+ for (const si of sampledIndices) {
6332
+ await parseSinglePage(targetPageNums[si]);
6333
+ }
6334
+ const sampleParsed = parsedPages || sampledIndices.size;
6335
+ const isImageBased = totalChars / Math.max(sampleParsed, 1) < 10;
6336
+ if (!isImageBased) {
6337
+ for (let si = 0; si < targetPageNums.length; si++) {
6338
+ if (!sampledIndices.has(si)) {
6339
+ await parseSinglePage(targetPageNums[si]);
6340
+ }
6341
+ }
6290
6342
  }
6291
6343
  const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
6292
- if (totalChars / Math.max(parsedPageCount, 1) < 10) {
6344
+ if (isImageBased) {
6293
6345
  let ocrProvider = options?.ocr ?? null;
6294
- const ocrMode = options?.ocrMode;
6295
- if (!ocrProvider && ocrMode && ocrMode !== "off") {
6346
+ const ocrMode = options?.ocrMode ?? "auto";
6347
+ if (!ocrProvider && ocrMode !== "off") {
6296
6348
  try {
6297
6349
  const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
6298
6350
  const concurrency = options?.ocrConcurrency ?? 1;
@@ -6344,7 +6396,7 @@ async function parsePdfDocument(buffer, options) {
6344
6396
  blocks.splice(removed[ri], 1);
6345
6397
  }
6346
6398
  }
6347
- const medianFontSize = computeMedianFontSize(allFontSizes);
6399
+ const medianFontSize = computeMedianFromFreq(fontSizeFreq);
6348
6400
  if (medianFontSize > 0) {
6349
6401
  detectHeadings(blocks, medianFontSize);
6350
6402
  }
@@ -6397,11 +6449,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
6397
6449
  }
6398
6450
  return { visible, hiddenCount };
6399
6451
  }
6400
- function computeMedianFontSize(sizes) {
6401
- if (sizes.length === 0) return 0;
6402
- const sorted = [...sizes].sort((a, b) => a - b);
6403
- const mid = Math.floor(sorted.length / 2);
6404
- return sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid];
6452
+ function computeMedianFromFreq(freq) {
6453
+ if (freq.size === 0) return 0;
6454
+ const entries = [...freq.entries()].sort((a, b) => a[0] - b[0]);
6455
+ let total = 0;
6456
+ for (const [, count] of entries) total += count;
6457
+ const mid = total / 2;
6458
+ let cumulative = 0;
6459
+ for (const [size, count] of entries) {
6460
+ cumulative += count;
6461
+ if (cumulative >= mid) return size;
6462
+ }
6463
+ return 0;
6405
6464
  }
6406
6465
  function detectHeadings(blocks, medianFontSize) {
6407
6466
  for (const block of blocks) {
@@ -7204,6 +7263,7 @@ var MAX_SHEETS = 100;
7204
7263
  var MAX_DECOMPRESS_SIZE3 = 500 * 1024 * 1024;
7205
7264
  var MAX_ROWS2 = 1e4;
7206
7265
  var MAX_COLS2 = 200;
7266
+ var MAX_TOTAL_CELLS = 2e6;
7207
7267
  function cleanNumericValue(raw) {
7208
7268
  if (!/^-?\d+\.\d+$/.test(raw)) return raw;
7209
7269
  const num = parseFloat(raw);
@@ -7387,9 +7447,9 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
7387
7447
  }
7388
7448
  return blocks;
7389
7449
  }
7390
- async function parseXlsxDocument(buffer, options) {
7450
+ async function parseXlsxDocument(buffer, options, existingZip) {
7391
7451
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
7392
- const zip = await JSZip3.loadAsync(buffer);
7452
+ const zip = existingZip ?? await JSZip3.loadAsync(buffer);
7393
7453
  const warnings = [];
7394
7454
  const workbookFile = zip.file("xl/workbook.xml");
7395
7455
  if (!workbookFile) {
@@ -7416,6 +7476,7 @@ async function parseXlsxDocument(buffer, options) {
7416
7476
  }
7417
7477
  const blocks = [];
7418
7478
  const processedSheets = Math.min(sheets.length, MAX_SHEETS);
7479
+ let totalCells = 0;
7419
7480
  for (let i = 0; i < processedSheets; i++) {
7420
7481
  if (pageFilter && !pageFilter.has(i + 1)) continue;
7421
7482
  const sheet = sheets[i];
@@ -7442,6 +7503,11 @@ async function parseXlsxDocument(buffer, options) {
7442
7503
  try {
7443
7504
  const sheetXml = await sheetFile.async("text");
7444
7505
  const { grid, merges, maxRow, maxCol } = parseWorksheet(sheetXml, sharedStrings);
7506
+ totalCells += maxRow * maxCol;
7507
+ if (totalCells > MAX_TOTAL_CELLS) {
7508
+ warnings.push({ message: `\uCD1D \uC140 \uC218 \uC81C\uD55C \uCD08\uACFC (${totalCells.toLocaleString()}\uC140), \uC774\uD6C4 \uC2DC\uD2B8 \uC0DD\uB7B5`, code: "PARTIAL_PARSE" });
7509
+ break;
7510
+ }
7445
7511
  const sheetBlocks = sheetToBlocks(sheet.name, grid, merges, maxRow, maxCol, i);
7446
7512
  blocks.push(...sheetBlocks);
7447
7513
  } catch (err) {
@@ -7525,10 +7591,35 @@ function getAttr(el, localName) {
7525
7591
  function parseXml2(text) {
7526
7592
  return new DOMParser3().parseFromString(text, "text/xml");
7527
7593
  }
7594
+ function buildElementIndex(root) {
7595
+ const index = /* @__PURE__ */ new Map();
7596
+ const walk = (node) => {
7597
+ const children = node.childNodes;
7598
+ for (let i = 0; i < children.length; i++) {
7599
+ const child = children[i];
7600
+ if (child.nodeType === 1) {
7601
+ const el = child;
7602
+ const name = el.localName ?? "";
7603
+ if (name) {
7604
+ let list = index.get(name);
7605
+ if (!list) {
7606
+ list = [];
7607
+ index.set(name, list);
7608
+ }
7609
+ list.push(el);
7610
+ }
7611
+ walk(el);
7612
+ }
7613
+ }
7614
+ };
7615
+ walk(root);
7616
+ return index;
7617
+ }
7528
7618
  function parseStyles(xml) {
7529
7619
  const doc = parseXml2(xml);
7530
7620
  const styles = /* @__PURE__ */ new Map();
7531
- const styleElements = findElements(doc, "style");
7621
+ const idx = buildElementIndex(doc);
7622
+ const styleElements = idx.get("style") ?? [];
7532
7623
  for (const el of styleElements) {
7533
7624
  const styleId = getAttr(el, "styleId");
7534
7625
  if (!styleId) continue;
@@ -7556,7 +7647,8 @@ function parseStyles(xml) {
7556
7647
  function parseNumbering(xml) {
7557
7648
  const doc = parseXml2(xml);
7558
7649
  const abstractNums = /* @__PURE__ */ new Map();
7559
- const abstractElements = findElements(doc, "abstractNum");
7650
+ const idx = buildElementIndex(doc);
7651
+ const abstractElements = idx.get("abstractNum") ?? [];
7560
7652
  for (const el of abstractElements) {
7561
7653
  const abstractNumId = getAttr(el, "abstractNumId");
7562
7654
  if (!abstractNumId) continue;
@@ -7571,7 +7663,7 @@ function parseNumbering(xml) {
7571
7663
  abstractNums.set(abstractNumId, levels);
7572
7664
  }
7573
7665
  const nums = /* @__PURE__ */ new Map();
7574
- const numElements = findElements(doc, "num");
7666
+ const numElements = idx.get("num") ?? [];
7575
7667
  for (const el of numElements) {
7576
7668
  const numId = getAttr(el, "numId");
7577
7669
  if (!numId) continue;
@@ -7815,9 +7907,9 @@ async function extractImages(zip, rels, doc) {
7815
7907
  }
7816
7908
  return { blocks, images };
7817
7909
  }
7818
- async function parseDocxDocument(buffer, options) {
7910
+ async function parseDocxDocument(buffer, options, existingZip) {
7819
7911
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
7820
- const zip = await JSZip4.loadAsync(buffer);
7912
+ const zip = existingZip ?? await JSZip4.loadAsync(buffer);
7821
7913
  const warnings = [];
7822
7914
  const docFile = zip.file("word/document.xml");
7823
7915
  if (!docFile) {
@@ -7907,6 +7999,11 @@ async function parseDocxDocument(buffer, options) {
7907
7999
  };
7908
8000
  }
7909
8001
 
8002
+ // src/index.ts
8003
+ init_cli_provider();
8004
+ init_tesseract_provider();
8005
+ init_markdown_to_blocks();
8006
+
7910
8007
  // src/diff/text-diff.ts
7911
8008
  function similarity(a, b) {
7912
8009
  if (a === b) return 1;
@@ -10423,25 +10520,86 @@ async function parse2(input, options) {
10423
10520
  if (!buffer || buffer.byteLength === 0) {
10424
10521
  return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.", code: "EMPTY_INPUT" };
10425
10522
  }
10523
+ const MAX_FILE_SIZE = 500 * 1024 * 1024;
10524
+ if (buffer.byteLength > MAX_FILE_SIZE) {
10525
+ return { success: false, fileType: "unknown", error: `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`, code: "FILE_TOO_LARGE" };
10526
+ }
10426
10527
  const format = detectFormat(buffer);
10427
10528
  switch (format) {
10428
10529
  case "hwpx": {
10429
- const zipFormat = await detectZipFormat(buffer);
10430
- if (zipFormat === "xlsx") return parseXlsx(buffer, options);
10431
- if (zipFormat === "docx") return parseDocx(buffer, options);
10432
- return parseHwpx(buffer, options);
10530
+ const { format: zipFormat, zip } = await detectZipFormat(buffer);
10531
+ if (zipFormat === "xlsx") return parseXlsx(buffer, options, zip ?? void 0);
10532
+ if (zipFormat === "docx") return parseDocx(buffer, options, zip ?? void 0);
10533
+ return parseHwpx(buffer, options, zip ?? void 0);
10433
10534
  }
10434
10535
  case "hwp":
10435
10536
  return parseHwp(buffer, options);
10436
10537
  case "pdf":
10437
10538
  return parsePdf(buffer, options);
10539
+ case "image":
10540
+ return parseImage(buffer, options);
10438
10541
  default:
10439
10542
  return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
10440
10543
  }
10441
10544
  }
10442
- async function parseHwpx(buffer, options) {
10545
+ async function parseImage(buffer, options) {
10546
+ const ocrMode = options?.ocrMode || "auto";
10547
+ if (ocrMode === "off") {
10548
+ return { success: false, fileType: "image", error: "OCR\uC774 \uBE44\uD65C\uC131\uD654\uB418\uC5B4 \uC774\uBBF8\uC9C0 \uD30C\uC77C\uC744 \uCC98\uB9AC\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "IMAGE_BASED_PDF" };
10549
+ }
10550
+ let ocrProvider;
10551
+ let actualOcrMode = "auto";
10552
+ try {
10553
+ if (ocrMode === "gemini" || ocrMode === "claude" || ocrMode === "codex" || ocrMode === "ollama") {
10554
+ ocrProvider = createCliOcrProvider(ocrMode);
10555
+ actualOcrMode = ocrMode;
10556
+ } else if (ocrMode === "tesseract") {
10557
+ ocrProvider = await createTesseractProvider();
10558
+ actualOcrMode = ocrMode;
10559
+ } else if (ocrMode === "auto") {
10560
+ const modesToTry = ["gemini", "claude", "codex", "ollama"];
10561
+ for (const mode of modesToTry) {
10562
+ try {
10563
+ ocrProvider = createCliOcrProvider(mode);
10564
+ actualOcrMode = mode;
10565
+ break;
10566
+ } catch (e) {
10567
+ console.warn(`[kordoc] OCR auto-detection: ${mode} CLI not available or failed. Trying next.`, e);
10568
+ }
10569
+ }
10570
+ if (!ocrProvider) {
10571
+ ocrProvider = await createTesseractProvider();
10572
+ actualOcrMode = "tesseract";
10573
+ }
10574
+ }
10575
+ if (!ocrProvider) {
10576
+ return { success: false, fileType: "image", error: "\uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR \uD504\uB85C\uBC14\uC774\uB354\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "PARSE_ERROR" };
10577
+ }
10578
+ const imageUint8Array = new Uint8Array(buffer);
10579
+ const ocrResult = await ocrProvider(imageUint8Array, 1, "image/png");
10580
+ if (ocrProvider.terminate) {
10581
+ await ocrProvider.terminate();
10582
+ }
10583
+ const markdown = typeof ocrResult === "string" ? ocrResult : ocrResult.markdown;
10584
+ const blocks = markdownToBlocks(markdown, 1);
10585
+ return {
10586
+ success: true,
10587
+ fileType: "image",
10588
+ markdown,
10589
+ blocks,
10590
+ isImageBased: true,
10591
+ warnings: [{ message: `OCR \uCC98\uB9AC\uB428 (${actualOcrMode})`, code: "OCR_FALLBACK" }]
10592
+ };
10593
+ } catch (err) {
10594
+ if (ocrProvider && ocrProvider.terminate) {
10595
+ await ocrProvider.terminate();
10596
+ }
10597
+ return { success: false, fileType: "image", error: err instanceof Error ? err.message : "\uC774\uBBF8\uC9C0 OCR \uC2E4\uD328", code: classifyError(err) };
10598
+ }
10599
+ }
10600
+ async function parseHwpx(buffer, options, zip) {
10443
10601
  try {
10444
- const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options);
10602
+ const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options, zip);
10445
10603
  return { success: true, fileType: "hwpx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
10446
10604
  } catch (err) {
10447
10605
  return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
@@ -10464,17 +10622,17 @@ async function parsePdf(buffer, options) {
10464
10622
  return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err), isImageBased };
10465
10623
  }
10466
10624
  }
10467
- async function parseXlsx(buffer, options) {
10625
+ async function parseXlsx(buffer, options, zip) {
10468
10626
  try {
10469
- const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options);
10627
+ const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options, zip);
10470
10628
  return { success: true, fileType: "xlsx", markdown, blocks, metadata, warnings };
10471
10629
  } catch (err) {
10472
10630
  return { success: false, fileType: "xlsx", error: err instanceof Error ? err.message : "XLSX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
10473
10631
  }
10474
10632
  }
10475
- async function parseDocx(buffer, options) {
10633
+ async function parseDocx(buffer, options, zip) {
10476
10634
  try {
10477
- const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options);
10635
+ const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options, zip);
10478
10636
  return { success: true, fileType: "docx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
10479
10637
  } catch (err) {
10480
10638
  return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };