@clazic/kordoc 2.3.0 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/dist/{batch-provider-FUCIIS4M.js → batch-provider-PNDCSGQW.js} +59 -30
  2. package/dist/batch-provider-PNDCSGQW.js.map +1 -0
  3. package/dist/{chunk-2ZGLFZCN.js → chunk-2GFJFTKS.js} +193 -49
  4. package/dist/chunk-2GFJFTKS.js.map +1 -0
  5. package/dist/chunk-4PP34NVQ.js +121 -0
  6. package/dist/chunk-4PP34NVQ.js.map +1 -0
  7. package/dist/{tesseract-provider-WCVJWBUT.js → chunk-7FMKAV4P.js} +4 -4
  8. package/dist/{tesseract-provider-WCVJWBUT.js.map → chunk-7FMKAV4P.js.map} +1 -1
  9. package/dist/chunk-JOGAFNIL.js +153 -0
  10. package/dist/chunk-JOGAFNIL.js.map +1 -0
  11. package/dist/{chunk-WWILSVMJ.js → chunk-STIKJGEA.js} +2 -2
  12. package/dist/cli.js +10 -5
  13. package/dist/cli.js.map +1 -1
  14. package/dist/index.cjs +291 -103
  15. package/dist/index.cjs.map +1 -1
  16. package/dist/index.d.cts +11 -6
  17. package/dist/index.d.ts +11 -6
  18. package/dist/index.js +292 -104
  19. package/dist/index.js.map +1 -1
  20. package/dist/mcp.js +5 -2
  21. package/dist/mcp.js.map +1 -1
  22. package/dist/{provider-OBY3XFSZ.js → provider-HE727F7Z.js} +38 -139
  23. package/dist/provider-HE727F7Z.js.map +1 -0
  24. package/dist/resolve-QA3VACUP.js +111 -0
  25. package/dist/resolve-QA3VACUP.js.map +1 -0
  26. package/dist/tesseract-provider-MNMZPSGF.js +11 -0
  27. package/dist/{utils-QAK24RJS.js → utils-FFUQJTTI.js} +2 -2
  28. package/dist/utils-FFUQJTTI.js.map +1 -0
  29. package/dist/{watch-MPHX3QIH.js → watch-2O32L6IF.js} +6 -3
  30. package/dist/{watch-MPHX3QIH.js.map → watch-2O32L6IF.js.map} +1 -1
  31. package/package.json +1 -1
  32. package/dist/batch-provider-FUCIIS4M.js.map +0 -1
  33. package/dist/chunk-2ZGLFZCN.js.map +0 -1
  34. package/dist/provider-OBY3XFSZ.js.map +0 -1
  35. package/dist/resolve-LBFYRHJI.js +0 -247
  36. package/dist/resolve-LBFYRHJI.js.map +0 -1
  37. package/dist/{chunk-WWILSVMJ.js.map → chunk-STIKJGEA.js.map} +0 -0
  38. package/dist/{utils-QAK24RJS.js.map → tesseract-provider-MNMZPSGF.js.map} +0 -0
package/dist/index.cjs CHANGED
@@ -1993,8 +1993,8 @@ function getTesseractFallbackMessage() {
1993
1993
  "\uC124\uCE58\uB41C AI CLI\uAC00 \uC5C6\uC5B4 \uB0B4\uC7A5 tesseract.js\uB85C OCR\uC744 \uC218\uD589\uD569\uB2C8\uB2E4.",
1994
1994
  "\uB354 \uB098\uC740 \uD488\uC9C8(\uD14C\uC774\uBE14/\uD5E4\uB529 \uAD6C\uC870 \uBCF4\uC874)\uC744 \uC704\uD574 AI CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4:",
1995
1995
  "",
1996
- " [\uAD8C\uC7A5] Gemini CLI: https://ai.google.dev/gemini-api/docs/cli",
1997
- " Codex CLI: npm install -g @openai/codex",
1996
+ " [\uAD8C\uC7A5] Codex CLI: npm install -g @openai/codex",
1997
+ " Gemini CLI: https://ai.google.dev/gemini-api/docs/cli",
1998
1998
  " Claude CLI: npm install -g @anthropic-ai/claude-code",
1999
1999
  " Ollama: brew install ollama (+ ollama pull gemma4:27b)"
2000
2000
  ].join("\n");
@@ -2004,7 +2004,7 @@ var init_auto_detect = __esm({
2004
2004
  "src/ocr/auto-detect.ts"() {
2005
2005
  "use strict";
2006
2006
  import_child_process = require("child_process");
2007
- CLI_PRIORITY = ["gemini", "codex", "claude", "ollama"];
2007
+ CLI_PRIORITY = ["codex", "gemini", "claude", "ollama"];
2008
2008
  }
2009
2009
  });
2010
2010
 
@@ -2043,7 +2043,7 @@ function callCli(mode, imagePath) {
2043
2043
  const args = buildCliArgs(mode, imagePath);
2044
2044
  const result = (0, import_child_process2.spawnSync)(mode, args, {
2045
2045
  encoding: "utf-8",
2046
- timeout: 18e4,
2046
+ timeout: 6e5,
2047
2047
  maxBuffer: 10 * 1024 * 1024,
2048
2048
  // claude: /tmp에서 실행하여 프로젝트 CLAUDE.md의 규칙 간섭 방지
2049
2049
  ...mode === "claude" ? { cwd: (0, import_os.tmpdir)() } : {}
@@ -2137,7 +2137,7 @@ async function callOllamaApi(imagePath) {
2137
2137
  return data.message?.content || "";
2138
2138
  }
2139
2139
  function stripCodeFence(text) {
2140
- const match = text.match(/^```(?:markdown|md)?\s*\n([\s\S]*?)\n```\s*$/m);
2140
+ const match = text.match(/^```(?:markdown|md)?\s*([\s\S]*?)```\s*$/m);
2141
2141
  return match ? match[1].trim() : text;
2142
2142
  }
2143
2143
  var import_child_process2, import_fs, import_path, import_os, OCR_PROMPT, _tempDir;
@@ -2148,7 +2148,15 @@ var init_cli_provider = __esm({
2148
2148
  import_fs = require("fs");
2149
2149
  import_path = require("path");
2150
2150
  import_os = require("os");
2151
- OCR_PROMPT = "\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD14C\uC774\uBE14\uC744 \uCD94\uCD9C\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uC138\uC694.\n\uADDC\uCE59:\n- \uD14C\uC774\uBE14\uC740 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)\n- \uBCD1\uD569\uB41C \uC140\uC740 \uD574\uB2F9 \uC704\uCE58\uC5D0 \uB0B4\uC6A9 \uAE30\uC7AC\n- \uD5E4\uB529\uC740 \uAE00\uC790 \uD06C\uAE30\uC5D0 \uB530\uB77C ## ~ ###### \uC0AC\uC6A9\n- \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9\n- \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC\n- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0\n- ```\uB85C \uAC10\uC2F8\uC9C0 \uB9D0\uACE0 \uC21C\uC218 Markdown\uB9CC \uCD9C\uB825";
2151
+ OCR_PROMPT = `\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD14C\uC774\uBE14\uC744 \uCD94\uCD9C\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uC138\uC694.
2152
+ \uADDC\uCE59:
2153
+ - \uD14C\uC774\uBE14\uC740 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)
2154
+ - \uBCD1\uD569\uB41C \uC140\uC740 \uD574\uB2F9 \uC704\uCE58\uC5D0 \uB0B4\uC6A9 \uAE30\uC7AC
2155
+ - \uD5E4\uB529\uC740 \uAE00\uC790 \uD06C\uAE30\uC5D0 \uB530\uB77C ## ~ ###### \uC0AC\uC6A9
2156
+ - \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9
2157
+ - \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC
2158
+ - \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0
2159
+ - \`\`\`\uB85C \uAC10\uC2F8\uC9C0 \uB9D0\uACE0 \uC21C\uC218 Markdown\uB9CC \uCD9C\uB825`;
2152
2160
  _tempDir = null;
2153
2161
  }
2154
2162
  });
@@ -2242,9 +2250,9 @@ function createBatchCliProvider(mode, batchSize) {
2242
2250
  }
2243
2251
  let output;
2244
2252
  if (mode === "codex") {
2245
- output = callBatchCodexCli(tempFiles);
2253
+ output = await callBatchCodexCli(tempFiles);
2246
2254
  } else {
2247
- output = callBatchCli(mode, tempFiles);
2255
+ output = await callBatchCli(mode, tempFiles);
2248
2256
  }
2249
2257
  const cleaned = stripCodeFence2(output.trim());
2250
2258
  const parts = cleaned.split(/<!--\s*PAGE_BREAK\s*-->/).map((p) => p.trim()).filter((p) => p.length > 0);
@@ -2266,40 +2274,74 @@ function createBatchCliProvider(mode, batchSize) {
2266
2274
  }
2267
2275
  };
2268
2276
  }
2269
- function callBatchCli(mode, imagePaths) {
2277
+ function spawnAsync(cmd, args, opts) {
2278
+ return new Promise((resolve, reject) => {
2279
+ const child = (0, import_child_process3.spawn)(cmd, args, {
2280
+ cwd: opts.cwd,
2281
+ env: process.env,
2282
+ stdio: ["pipe", "pipe", "pipe"]
2283
+ });
2284
+ let stdout = "";
2285
+ let stderr = "";
2286
+ let killed = false;
2287
+ child.stdout.setEncoding("utf-8");
2288
+ child.stderr.setEncoding("utf-8");
2289
+ child.stdout.on("data", (d) => {
2290
+ stdout += d;
2291
+ });
2292
+ child.stderr.on("data", (d) => {
2293
+ stderr += d;
2294
+ });
2295
+ const timer = setTimeout(() => {
2296
+ killed = true;
2297
+ child.kill("SIGTERM");
2298
+ }, opts.timeoutMs);
2299
+ if (opts.stdin !== void 0) {
2300
+ child.stdin.end(opts.stdin);
2301
+ } else {
2302
+ child.stdin.end();
2303
+ }
2304
+ child.on("close", (code) => {
2305
+ clearTimeout(timer);
2306
+ if (killed) {
2307
+ reject(new Error(`\uD0C0\uC784\uC544\uC6C3 (${Math.round(opts.timeoutMs / 1e3)}\uCD08)`));
2308
+ } else {
2309
+ resolve({ stdout, stderr, exitCode: code ?? 1 });
2310
+ }
2311
+ });
2312
+ child.on("error", (err) => {
2313
+ clearTimeout(timer);
2314
+ reject(err);
2315
+ });
2316
+ });
2317
+ }
2318
+ async function callBatchCli(mode, imagePaths) {
2270
2319
  const fileRefs = imagePaths.map((p) => `@${p}`).join("\n");
2271
2320
  const prompt = `${BATCH_OCR_PROMPT}
2272
2321
 
2273
2322
  ${fileRefs}`;
2274
2323
  let args;
2275
2324
  if (mode === "gemini") {
2276
- args = ["--prompt", prompt, "--yolo"];
2277
- const model = process.env.KORDOC_GEMINI_MODEL;
2278
- if (model) args.push("--model", model);
2325
+ const model = process.env.KORDOC_GEMINI_MODEL ?? "gemini-2.5-flash";
2326
+ args = ["--prompt", prompt, "--yolo", "--model", model];
2279
2327
  } else {
2280
2328
  args = ["--print", prompt];
2281
2329
  const model = process.env.KORDOC_CLAUDE_MODEL;
2282
2330
  if (model) args.push("--model", model);
2283
2331
  }
2284
2332
  const timeoutMs = 6e4 + imagePaths.length * 2e4;
2285
- const result = (0, import_child_process3.spawnSync)(mode, args, {
2286
- encoding: "utf-8",
2287
- timeout: timeoutMs,
2288
- maxBuffer: 50 * 1024 * 1024,
2289
- // 50MB (large batch output)
2333
+ const result = await spawnAsync(mode, args, {
2334
+ timeoutMs,
2290
2335
  ...mode === "claude" ? { cwd: (0, import_os2.tmpdir)() } : {}
2291
2336
  });
2292
- if (result.error) {
2293
- throw new Error(`${mode} \uBC30\uCE58 OCR \uC2E4\uD328: ${result.error.message}`);
2294
- }
2295
- if (result.status !== 0) {
2296
- const errMsg = result.stderr?.trim() || `exit code ${result.status}`;
2337
+ if (result.exitCode !== 0) {
2338
+ const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`;
2297
2339
  throw new Error(`${mode} \uBC30\uCE58 OCR \uC2E4\uD328: ${errMsg}`);
2298
2340
  }
2299
2341
  return result.stdout || "";
2300
2342
  }
2301
- function callBatchCodexCli(imagePaths) {
2302
- const outPath = (0, import_path2.join)((0, import_os2.tmpdir)(), `kordoc-codex-batch-${Date.now()}.txt`);
2343
+ async function callBatchCodexCli(imagePaths) {
2344
+ const outPath = (0, import_path2.join)((0, import_os2.tmpdir)(), `kordoc-codex-batch-${Date.now()}-${Math.random().toString(36).slice(2)}.txt`);
2303
2345
  try {
2304
2346
  const args = ["exec", BATCH_OCR_PROMPT];
2305
2347
  for (const p of imagePaths) {
@@ -2309,17 +2351,12 @@ function callBatchCodexCli(imagePaths) {
2309
2351
  const model = process.env.KORDOC_CODEX_MODEL;
2310
2352
  if (model) args.push("--model", model);
2311
2353
  const timeoutMs = 6e4 + imagePaths.length * 2e4;
2312
- const result = (0, import_child_process3.spawnSync)("codex", args, {
2313
- encoding: "utf-8",
2314
- timeout: timeoutMs,
2315
- maxBuffer: 50 * 1024 * 1024,
2316
- input: ""
2354
+ const result = await spawnAsync("codex", args, {
2355
+ timeoutMs,
2356
+ stdin: ""
2317
2357
  });
2318
- if (result.error) {
2319
- throw new Error(`codex \uBC30\uCE58 OCR \uC2E4\uD328: ${result.error.message}`);
2320
- }
2321
- if (result.status !== 0) {
2322
- const errMsg = result.stderr?.trim() || `exit code ${result.status}`;
2358
+ if (result.exitCode !== 0) {
2359
+ const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`;
2323
2360
  throw new Error(`codex \uBC30\uCE58 OCR \uC2E4\uD328: ${errMsg}`);
2324
2361
  }
2325
2362
  try {
@@ -2581,7 +2618,7 @@ function isBatchProvider(p) {
2581
2618
  async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1, onProgress) {
2582
2619
  const blocks = [];
2583
2620
  if (isBatchProvider(provider)) {
2584
- return ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, onProgress);
2621
+ return ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, concurrency, onProgress);
2585
2622
  }
2586
2623
  if (concurrency <= 1) {
2587
2624
  for (let i = 1; i <= effectivePageCount; i++) {
@@ -2628,43 +2665,54 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings,
2628
2665
  }
2629
2666
  return blocks;
2630
2667
  }
2631
- async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, onProgress) {
2632
- const blocks = [];
2668
+ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1, onProgress) {
2633
2669
  const pageNumbers = [];
2634
2670
  for (let i = 1; i <= effectivePageCount; i++) {
2635
2671
  if (pageFilter && !pageFilter.has(i)) continue;
2636
2672
  pageNumbers.push(i);
2637
2673
  }
2638
- const pageImages = [];
2639
- for (const pageNum of pageNumbers) {
2640
- const page = await doc.getPage(pageNum);
2641
- const image = await renderPageToPng(page);
2642
- pageImages.push({ image, pageNum });
2643
- }
2644
- const batches = [];
2645
- for (let i = 0; i < pageImages.length; i += provider.batchSize) {
2646
- batches.push(pageImages.slice(i, i + provider.batchSize));
2674
+ const pageBatches = [];
2675
+ for (let i = 0; i < pageNumbers.length; i += provider.batchSize) {
2676
+ pageBatches.push(pageNumbers.slice(i, i + provider.batchSize));
2647
2677
  }
2648
2678
  let processed = 0;
2649
- for (const batch of batches) {
2679
+ const batchTasks = pageBatches.map((batchPageNums, batchIdx) => async () => {
2680
+ const pageBlocks = [];
2650
2681
  try {
2651
- const results = await provider.processBatch(batch);
2652
- for (const { pageNum } of batch) {
2682
+ const batchImages = [];
2683
+ for (const pageNum of batchPageNums) {
2684
+ const page = await doc.getPage(pageNum);
2685
+ const image = await renderPageToPng(page);
2686
+ batchImages.push({ image, pageNum });
2687
+ }
2688
+ const results = await provider.processBatch(batchImages);
2689
+ for (const { pageNum } of batchImages) {
2653
2690
  const result = results.get(pageNum);
2654
- if (result) {
2655
- for (const b of ocrResultToBlocks(result, pageNum)) blocks.push(b);
2656
- }
2657
- processed++;
2658
- onProgress?.(processed, pageNumbers.length);
2691
+ pageBlocks.push({
2692
+ pageNum,
2693
+ blocks: result ? ocrResultToBlocks(result, pageNum) : []
2694
+ });
2659
2695
  }
2660
2696
  } catch (err) {
2661
- const range = `${batch[0].pageNum}-${batch[batch.length - 1].pageNum}`;
2697
+ const range = `${batchPageNums[0]}-${batchPageNums[batchPageNums.length - 1]}`;
2662
2698
  warnings?.push({
2663
2699
  message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
2664
2700
  code: "OCR_PAGE_FAILED"
2665
2701
  });
2666
- processed += batch.length;
2667
- onProgress?.(processed, pageNumbers.length);
2702
+ for (const pageNum of batchPageNums) {
2703
+ pageBlocks.push({ pageNum, blocks: [] });
2704
+ }
2705
+ }
2706
+ processed += batchPageNums.length;
2707
+ onProgress?.(processed, pageNumbers.length);
2708
+ return { batchIdx, pageBlocks };
2709
+ });
2710
+ const effectiveConcurrency = Math.max(1, concurrency);
2711
+ const batchResults = await runWithConcurrency(batchTasks, effectiveConcurrency);
2712
+ const blocks = [];
2713
+ for (const result of batchResults) {
2714
+ for (const { blocks: pageBlks } of result.pageBlocks) {
2715
+ for (const b of pageBlks) blocks.push(b);
2668
2716
  }
2669
2717
  }
2670
2718
  return blocks;
@@ -2731,24 +2779,29 @@ function isPdfFile(buffer) {
2731
2779
  const b = magicBytes(buffer);
2732
2780
  return b[0] === 37 && b[1] === 80 && b[2] === 68 && b[3] === 70;
2733
2781
  }
2782
+ function isPngFile(buffer) {
2783
+ const b = magicBytes(buffer);
2784
+ return b[0] === 137 && b[1] === 80 && b[2] === 78 && b[3] === 71;
2785
+ }
2734
2786
  function detectFormat(buffer) {
2735
2787
  if (buffer.byteLength < 4) return "unknown";
2736
2788
  if (isZipFile(buffer)) return "hwpx";
2737
2789
  if (isOldHwpFile(buffer)) return "hwp";
2738
2790
  if (isPdfFile(buffer)) return "pdf";
2791
+ if (isPngFile(buffer)) return "image";
2739
2792
  return "unknown";
2740
2793
  }
2741
2794
  async function detectZipFormat(buffer) {
2742
2795
  try {
2743
2796
  const zip = await import_jszip.default.loadAsync(buffer);
2744
- if (zip.file("xl/workbook.xml")) return "xlsx";
2745
- if (zip.file("word/document.xml")) return "docx";
2746
- if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return "hwpx";
2797
+ if (zip.file("xl/workbook.xml")) return { format: "xlsx", zip };
2798
+ if (zip.file("word/document.xml")) return { format: "docx", zip };
2799
+ if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return { format: "hwpx", zip };
2747
2800
  const hasSection = Object.keys(zip.files).some((f) => f.startsWith("Contents/"));
2748
- if (hasSection) return "hwpx";
2749
- return "unknown";
2801
+ if (hasSection) return { format: "hwpx", zip };
2802
+ return { format: "unknown", zip: null };
2750
2803
  } catch {
2751
- return "unknown";
2804
+ return { format: "unknown", zip: null };
2752
2805
  }
2753
2806
  }
2754
2807
 
@@ -2757,7 +2810,7 @@ var import_jszip2 = __toESM(require("jszip"), 1);
2757
2810
  var import_xmldom = require("@xmldom/xmldom");
2758
2811
 
2759
2812
  // src/utils.ts
2760
- var VERSION = true ? "2.3.0" : "0.0.0-dev";
2813
+ var VERSION = true ? "2.3.2" : "0.0.0-dev";
2761
2814
  function toArrayBuffer(buf) {
2762
2815
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
2763
2816
  return buf.buffer;
@@ -2917,12 +2970,16 @@ function buildTableDirect(rows, numRows) {
2917
2970
  return trimAndReturn(grid, numRows, maxCols);
2918
2971
  }
2919
2972
  function trimAndReturn(grid, numRows, maxCols) {
2920
- let effectiveCols = maxCols;
2921
- while (effectiveCols > 0) {
2922
- const colEmpty = grid.every((row) => !row[effectiveCols - 1]?.text?.trim());
2923
- if (!colEmpty) break;
2924
- effectiveCols--;
2973
+ let effectiveCols = 0;
2974
+ for (const row of grid) {
2975
+ for (let c = row.length - 1; c >= effectiveCols; c--) {
2976
+ if (row[c]?.text?.trim()) {
2977
+ effectiveCols = c + 1;
2978
+ break;
2979
+ }
2980
+ }
2925
2981
  }
2982
+ if (effectiveCols === 0) effectiveCols = maxCols;
2926
2983
  if (effectiveCols < maxCols && effectiveCols > 0) {
2927
2984
  const trimmed = grid.map((row) => row.slice(0, effectiveCols));
2928
2985
  return { rows: numRows, cols: effectiveCols, cells: trimmed, hasHeader: numRows > 1 };
@@ -3179,11 +3236,11 @@ function parseStyleElements(doc, map) {
3179
3236
  function stripDtd(xml) {
3180
3237
  return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
3181
3238
  }
3182
- async function parseHwpxDocument(buffer, options) {
3239
+ async function parseHwpxDocument(buffer, options, existingZip) {
3183
3240
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
3184
3241
  let zip;
3185
3242
  try {
3186
- zip = await import_jszip2.default.loadAsync(buffer);
3243
+ zip = existingZip ?? await import_jszip2.default.loadAsync(buffer);
3187
3244
  } catch {
3188
3245
  return await extractFromBrokenZip(buffer);
3189
3246
  }
@@ -6195,8 +6252,15 @@ var import_pdf2 = require("pdfjs-dist/legacy/build/pdf.mjs");
6195
6252
  import_pdf2.GlobalWorkerOptions.workerSrc = "";
6196
6253
  var MAX_PAGES = 5e3;
6197
6254
  var MAX_TOTAL_TEXT = 500 * 1024 * 1024;
6198
- var PDF_LOAD_TIMEOUT_MS = 3e4;
6255
+ function calcPdfTimeout(bufferSize) {
6256
+ const base = 3e4;
6257
+ const perMb = 500;
6258
+ const mb = bufferSize / (1024 * 1024);
6259
+ return Math.min(base + Math.ceil(mb * perMb), 3e5);
6260
+ }
6199
6261
  async function loadPdfWithTimeout(buffer) {
6262
+ const timeoutMs = calcPdfTimeout(buffer.byteLength);
6263
+ const timeoutSec = Math.round(timeoutMs / 1e3);
6200
6264
  const loadingTask = (0, import_pdf2.getDocument)({
6201
6265
  data: new Uint8Array(buffer),
6202
6266
  useSystemFonts: true,
@@ -6210,8 +6274,8 @@ async function loadPdfWithTimeout(buffer) {
6210
6274
  new Promise((_, reject) => {
6211
6275
  timer = setTimeout(() => {
6212
6276
  loadingTask.destroy();
6213
- reject(new KordocError("PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (30\uCD08 \uCD08\uACFC)"));
6214
- }, PDF_LOAD_TIMEOUT_MS);
6277
+ reject(new KordocError(`PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (${timeoutSec}\uCD08 \uCD08\uACFC)`));
6278
+ }, timeoutMs);
6215
6279
  })
6216
6280
  ]);
6217
6281
  } finally {
@@ -6232,11 +6296,15 @@ async function parsePdfDocument(buffer, options) {
6232
6296
  const effectivePageCount = Math.min(pageCount, MAX_PAGES);
6233
6297
  const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
6234
6298
  const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
6235
- const allFontSizes = [];
6299
+ const fontSizeFreq = /* @__PURE__ */ new Map();
6236
6300
  const pageHeights = /* @__PURE__ */ new Map();
6237
- let parsedPages = 0;
6301
+ const targetPageNums = [];
6238
6302
  for (let i = 1; i <= effectivePageCount; i++) {
6239
6303
  if (pageFilter && !pageFilter.has(i)) continue;
6304
+ targetPageNums.push(i);
6305
+ }
6306
+ let parsedPages = 0;
6307
+ const parseSinglePage = async (i) => {
6240
6308
  try {
6241
6309
  const page = await doc.getPage(i);
6242
6310
  const tc = await page.getTextContent();
@@ -6249,7 +6317,10 @@ async function parsePdfDocument(buffer, options) {
6249
6317
  warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
6250
6318
  }
6251
6319
  for (const item of visible) {
6252
- if (item.fontSize > 0) allFontSizes.push(item.fontSize);
6320
+ if (item.fontSize > 0) {
6321
+ const rounded = Math.round(item.fontSize * 10) / 10;
6322
+ fontSizeFreq.set(rounded, (fontSizeFreq.get(rounded) || 0) + 1);
6323
+ }
6253
6324
  }
6254
6325
  const opList = await page.getOperatorList();
6255
6326
  const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
@@ -6266,12 +6337,23 @@ async function parsePdfDocument(buffer, options) {
6266
6337
  if (pageErr instanceof KordocError) throw pageErr;
6267
6338
  warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
6268
6339
  }
6340
+ };
6341
+ const sampleCount = Math.min(5, targetPageNums.length);
6342
+ for (let si = 0; si < sampleCount; si++) {
6343
+ await parseSinglePage(targetPageNums[si]);
6344
+ }
6345
+ const sampleParsed = parsedPages || sampleCount;
6346
+ const isImageBased = totalChars / Math.max(sampleParsed, 1) < 10;
6347
+ if (!isImageBased) {
6348
+ for (let si = sampleCount; si < targetPageNums.length; si++) {
6349
+ await parseSinglePage(targetPageNums[si]);
6350
+ }
6269
6351
  }
6270
6352
  const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
6271
- if (totalChars / Math.max(parsedPageCount, 1) < 10) {
6353
+ if (isImageBased) {
6272
6354
  let ocrProvider = options?.ocr ?? null;
6273
- const ocrMode = options?.ocrMode;
6274
- if (!ocrProvider && ocrMode && ocrMode !== "off") {
6355
+ const ocrMode = options?.ocrMode ?? "auto";
6356
+ if (!ocrProvider && ocrMode !== "off") {
6275
6357
  try {
6276
6358
  const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
6277
6359
  const concurrency = options?.ocrConcurrency ?? 1;
@@ -6323,7 +6405,7 @@ async function parsePdfDocument(buffer, options) {
6323
6405
  blocks.splice(removed[ri], 1);
6324
6406
  }
6325
6407
  }
6326
- const medianFontSize = computeMedianFontSize(allFontSizes);
6408
+ const medianFontSize = computeMedianFromFreq(fontSizeFreq);
6327
6409
  if (medianFontSize > 0) {
6328
6410
  detectHeadings(blocks, medianFontSize);
6329
6411
  }
@@ -6376,11 +6458,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
6376
6458
  }
6377
6459
  return { visible, hiddenCount };
6378
6460
  }
6379
- function computeMedianFontSize(sizes) {
6380
- if (sizes.length === 0) return 0;
6381
- const sorted = [...sizes].sort((a, b) => a - b);
6382
- const mid = Math.floor(sorted.length / 2);
6383
- return sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid];
6461
+ function computeMedianFromFreq(freq) {
6462
+ if (freq.size === 0) return 0;
6463
+ const entries = [...freq.entries()].sort((a, b) => a[0] - b[0]);
6464
+ let total = 0;
6465
+ for (const [, count] of entries) total += count;
6466
+ const mid = total / 2;
6467
+ let cumulative = 0;
6468
+ for (const [size, count] of entries) {
6469
+ cumulative += count;
6470
+ if (cumulative >= mid) return size;
6471
+ }
6472
+ return 0;
6384
6473
  }
6385
6474
  function detectHeadings(blocks, medianFontSize) {
6386
6475
  for (const block of blocks) {
@@ -7183,6 +7272,7 @@ var MAX_SHEETS = 100;
7183
7272
  var MAX_DECOMPRESS_SIZE3 = 500 * 1024 * 1024;
7184
7273
  var MAX_ROWS2 = 1e4;
7185
7274
  var MAX_COLS2 = 200;
7275
+ var MAX_TOTAL_CELLS = 2e6;
7186
7276
  function cleanNumericValue(raw) {
7187
7277
  if (!/^-?\d+\.\d+$/.test(raw)) return raw;
7188
7278
  const num = parseFloat(raw);
@@ -7366,9 +7456,9 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
7366
7456
  }
7367
7457
  return blocks;
7368
7458
  }
7369
- async function parseXlsxDocument(buffer, options) {
7459
+ async function parseXlsxDocument(buffer, options, existingZip) {
7370
7460
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
7371
- const zip = await import_jszip3.default.loadAsync(buffer);
7461
+ const zip = existingZip ?? await import_jszip3.default.loadAsync(buffer);
7372
7462
  const warnings = [];
7373
7463
  const workbookFile = zip.file("xl/workbook.xml");
7374
7464
  if (!workbookFile) {
@@ -7395,6 +7485,7 @@ async function parseXlsxDocument(buffer, options) {
7395
7485
  }
7396
7486
  const blocks = [];
7397
7487
  const processedSheets = Math.min(sheets.length, MAX_SHEETS);
7488
+ let totalCells = 0;
7398
7489
  for (let i = 0; i < processedSheets; i++) {
7399
7490
  if (pageFilter && !pageFilter.has(i + 1)) continue;
7400
7491
  const sheet = sheets[i];
@@ -7421,6 +7512,11 @@ async function parseXlsxDocument(buffer, options) {
7421
7512
  try {
7422
7513
  const sheetXml = await sheetFile.async("text");
7423
7514
  const { grid, merges, maxRow, maxCol } = parseWorksheet(sheetXml, sharedStrings);
7515
+ totalCells += maxRow * maxCol;
7516
+ if (totalCells > MAX_TOTAL_CELLS) {
7517
+ warnings.push({ message: `\uCD1D \uC140 \uC218 \uC81C\uD55C \uCD08\uACFC (${totalCells.toLocaleString()}\uC140), \uC774\uD6C4 \uC2DC\uD2B8 \uC0DD\uB7B5`, code: "PARTIAL_PARSE" });
7518
+ break;
7519
+ }
7424
7520
  const sheetBlocks = sheetToBlocks(sheet.name, grid, merges, maxRow, maxCol, i);
7425
7521
  blocks.push(...sheetBlocks);
7426
7522
  } catch (err) {
@@ -7504,10 +7600,35 @@ function getAttr(el, localName) {
7504
7600
  function parseXml2(text) {
7505
7601
  return new import_xmldom3.DOMParser().parseFromString(text, "text/xml");
7506
7602
  }
7603
+ function buildElementIndex(root) {
7604
+ const index = /* @__PURE__ */ new Map();
7605
+ const walk = (node) => {
7606
+ const children = node.childNodes;
7607
+ for (let i = 0; i < children.length; i++) {
7608
+ const child = children[i];
7609
+ if (child.nodeType === 1) {
7610
+ const el = child;
7611
+ const name = el.localName ?? "";
7612
+ if (name) {
7613
+ let list = index.get(name);
7614
+ if (!list) {
7615
+ list = [];
7616
+ index.set(name, list);
7617
+ }
7618
+ list.push(el);
7619
+ }
7620
+ walk(el);
7621
+ }
7622
+ }
7623
+ };
7624
+ walk(root);
7625
+ return index;
7626
+ }
7507
7627
  function parseStyles(xml) {
7508
7628
  const doc = parseXml2(xml);
7509
7629
  const styles = /* @__PURE__ */ new Map();
7510
- const styleElements = findElements(doc, "style");
7630
+ const idx = buildElementIndex(doc);
7631
+ const styleElements = idx.get("style") ?? [];
7511
7632
  for (const el of styleElements) {
7512
7633
  const styleId = getAttr(el, "styleId");
7513
7634
  if (!styleId) continue;
@@ -7535,7 +7656,8 @@ function parseStyles(xml) {
7535
7656
  function parseNumbering(xml) {
7536
7657
  const doc = parseXml2(xml);
7537
7658
  const abstractNums = /* @__PURE__ */ new Map();
7538
- const abstractElements = findElements(doc, "abstractNum");
7659
+ const idx = buildElementIndex(doc);
7660
+ const abstractElements = idx.get("abstractNum") ?? [];
7539
7661
  for (const el of abstractElements) {
7540
7662
  const abstractNumId = getAttr(el, "abstractNumId");
7541
7663
  if (!abstractNumId) continue;
@@ -7550,7 +7672,7 @@ function parseNumbering(xml) {
7550
7672
  abstractNums.set(abstractNumId, levels);
7551
7673
  }
7552
7674
  const nums = /* @__PURE__ */ new Map();
7553
- const numElements = findElements(doc, "num");
7675
+ const numElements = idx.get("num") ?? [];
7554
7676
  for (const el of numElements) {
7555
7677
  const numId = getAttr(el, "numId");
7556
7678
  if (!numId) continue;
@@ -7794,9 +7916,9 @@ async function extractImages(zip, rels, doc) {
7794
7916
  }
7795
7917
  return { blocks, images };
7796
7918
  }
7797
- async function parseDocxDocument(buffer, options) {
7919
+ async function parseDocxDocument(buffer, options, existingZip) {
7798
7920
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
7799
- const zip = await import_jszip4.default.loadAsync(buffer);
7921
+ const zip = existingZip ?? await import_jszip4.default.loadAsync(buffer);
7800
7922
  const warnings = [];
7801
7923
  const docFile = zip.file("word/document.xml");
7802
7924
  if (!docFile) {
@@ -7886,6 +8008,11 @@ async function parseDocxDocument(buffer, options) {
7886
8008
  };
7887
8009
  }
7888
8010
 
8011
+ // src/index.ts
8012
+ init_cli_provider();
8013
+ init_tesseract_provider();
8014
+ init_markdown_to_blocks();
8015
+
7889
8016
  // src/diff/text-diff.ts
7890
8017
  function similarity(a, b) {
7891
8018
  if (a === b) return 1;
@@ -10402,25 +10529,86 @@ async function parse2(input, options) {
10402
10529
  if (!buffer || buffer.byteLength === 0) {
10403
10530
  return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.", code: "EMPTY_INPUT" };
10404
10531
  }
10532
+ const MAX_FILE_SIZE = 500 * 1024 * 1024;
10533
+ if (buffer.byteLength > MAX_FILE_SIZE) {
10534
+ return { success: false, fileType: "unknown", error: `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`, code: "FILE_TOO_LARGE" };
10535
+ }
10405
10536
  const format = detectFormat(buffer);
10406
10537
  switch (format) {
10407
10538
  case "hwpx": {
10408
- const zipFormat = await detectZipFormat(buffer);
10409
- if (zipFormat === "xlsx") return parseXlsx(buffer, options);
10410
- if (zipFormat === "docx") return parseDocx(buffer, options);
10411
- return parseHwpx(buffer, options);
10539
+ const { format: zipFormat, zip } = await detectZipFormat(buffer);
10540
+ if (zipFormat === "xlsx") return parseXlsx(buffer, options, zip ?? void 0);
10541
+ if (zipFormat === "docx") return parseDocx(buffer, options, zip ?? void 0);
10542
+ return parseHwpx(buffer, options, zip ?? void 0);
10412
10543
  }
10413
10544
  case "hwp":
10414
10545
  return parseHwp(buffer, options);
10415
10546
  case "pdf":
10416
10547
  return parsePdf(buffer, options);
10548
+ case "image":
10549
+ return parseImage(buffer, options);
10417
10550
  default:
10418
10551
  return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
10419
10552
  }
10420
10553
  }
10421
- async function parseHwpx(buffer, options) {
10554
/**
 * Runs OCR over a standalone image and wraps the outcome in a parse result.
 *
 * Provider selection follows `options.ocrMode` (defaults to "auto"):
 *  - "off": returns a failure result immediately; no provider is created.
 *  - "gemini" | "claude" | "codex" | "ollama": CLI provider for that mode.
 *  - "tesseract": Tesseract provider.
 *  - "auto": the first CLI provider (in the order above) whose construction
 *    does not throw, otherwise Tesseract.
 *  - anything else: no provider is selected and a failure result is returned.
 *
 * Errors thrown while creating the provider, running OCR or converting the
 * markdown into blocks are reported as `{ success: false, ... }` rather than
 * thrown. The provider's optional `terminate()` hook is invoked exactly once,
 * and a failure inside it is only logged so it cannot replace the result.
 *
 * @param {*} buffer Image bytes; passed through `new Uint8Array(buffer)`.
 * @param {{ ocrMode?: string }} [options] Parse options; only `ocrMode` is read here.
 * @returns {Promise<object>} Result object with `success` and `fileType: "image"`.
 */
async function parseImage(buffer, options) {
  const ocrMode = options?.ocrMode || "auto";
  if (ocrMode === "off") {
    return { success: false, fileType: "image", error: "OCR\uC774 \uBE44\uD65C\uC131\uD654\uB418\uC5B4 \uC774\uBBF8\uC9C0 \uD30C\uC77C\uC744 \uCC98\uB9AC\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "IMAGE_BASED_PDF" };
  }
  // Single list shared by the explicit-mode check and the auto-detection order.
  const cliModes = ["gemini", "claude", "codex", "ollama"];
  let ocrProvider;
  let actualOcrMode = "auto";
  try {
    if (cliModes.includes(ocrMode)) {
      ocrProvider = createCliOcrProvider(ocrMode);
      actualOcrMode = ocrMode;
    } else if (ocrMode === "tesseract") {
      ocrProvider = await createTesseractProvider();
      actualOcrMode = ocrMode;
    } else if (ocrMode === "auto") {
      for (const mode of cliModes) {
        try {
          ocrProvider = createCliOcrProvider(mode);
          actualOcrMode = mode;
          break;
        } catch (e) {
          console.warn(`[kordoc] OCR auto-detection: ${mode} CLI not available or failed. Trying next.`, e);
        }
      }
      if (!ocrProvider) {
        // No CLI provider could be constructed; fall back to Tesseract.
        ocrProvider = await createTesseractProvider();
        actualOcrMode = "tesseract";
      }
    }
    if (!ocrProvider) {
      // Reached when ocrMode is none of the recognised values.
      return { success: false, fileType: "image", error: "\uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR \uD504\uB85C\uBC14\uC774\uB354\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "PARSE_ERROR" };
    }
    const imageUint8Array = new Uint8Array(buffer);
    // NOTE(review): the MIME type is always "image/png" regardless of the
    // actual image format — confirm providers tolerate this for JPEG etc.
    const ocrResult = await ocrProvider(imageUint8Array, 1, "image/png");
    // Providers may return either a plain markdown string or an object with `markdown`.
    const markdown = typeof ocrResult === "string" ? ocrResult : ocrResult.markdown;
    const blocks = markdownToBlocks(markdown, 1);
    return {
      success: true,
      fileType: "image",
      markdown,
      blocks,
      isImageBased: true,
      warnings: [{ message: `OCR \uCC98\uB9AC\uB428 (${actualOcrMode})`, code: "OCR_FALLBACK" }]
    };
  } catch (err) {
    return { success: false, fileType: "image", error: err instanceof Error ? err.message : "\uC774\uBBF8\uC9C0 OCR \uC2E4\uD328", code: classifyError(err) };
  } finally {
    // Release the provider once on every path. Cleanup is best-effort: a
    // failing terminate() must not turn the result into a rejection.
    if (ocrProvider && ocrProvider.terminate) {
      try {
        await ocrProvider.terminate();
      } catch (cleanupErr) {
        console.warn("[kordoc] OCR provider cleanup failed.", cleanupErr);
      }
    }
  }
}
10609
+ async function parseHwpx(buffer, options, zip) {
10422
10610
  try {
10423
- const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options);
10611
+ const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options, zip);
10424
10612
  return { success: true, fileType: "hwpx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
10425
10613
  } catch (err) {
10426
10614
  return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
@@ -10443,17 +10631,17 @@ async function parsePdf(buffer, options) {
10443
10631
  return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err), isImageBased };
10444
10632
  }
10445
10633
  }
10446
- async function parseXlsx(buffer, options) {
10634
+ async function parseXlsx(buffer, options, zip) {
10447
10635
  try {
10448
- const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options);
10636
+ const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options, zip);
10449
10637
  return { success: true, fileType: "xlsx", markdown, blocks, metadata, warnings };
10450
10638
  } catch (err) {
10451
10639
  return { success: false, fileType: "xlsx", error: err instanceof Error ? err.message : "XLSX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
10452
10640
  }
10453
10641
  }
10454
- async function parseDocx(buffer, options) {
10642
+ async function parseDocx(buffer, options, zip) {
10455
10643
  try {
10456
- const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options);
10644
+ const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options, zip);
10457
10645
  return { success: true, fileType: "docx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
10458
10646
  } catch (err) {
10459
10647
  return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };