@clazic/kordoc 2.3.0 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/dist/{batch-provider-FUCIIS4M.js → batch-provider-PNDCSGQW.js} +59 -30
  2. package/dist/batch-provider-PNDCSGQW.js.map +1 -0
  3. package/dist/{chunk-2ZGLFZCN.js → chunk-2GFJFTKS.js} +193 -49
  4. package/dist/chunk-2GFJFTKS.js.map +1 -0
  5. package/dist/chunk-4PP34NVQ.js +121 -0
  6. package/dist/chunk-4PP34NVQ.js.map +1 -0
  7. package/dist/{tesseract-provider-WCVJWBUT.js → chunk-7FMKAV4P.js} +4 -4
  8. package/dist/{tesseract-provider-WCVJWBUT.js.map → chunk-7FMKAV4P.js.map} +1 -1
  9. package/dist/chunk-JOGAFNIL.js +153 -0
  10. package/dist/chunk-JOGAFNIL.js.map +1 -0
  11. package/dist/{chunk-WWILSVMJ.js → chunk-STIKJGEA.js} +2 -2
  12. package/dist/cli.js +10 -5
  13. package/dist/cli.js.map +1 -1
  14. package/dist/index.cjs +291 -103
  15. package/dist/index.cjs.map +1 -1
  16. package/dist/index.d.cts +11 -6
  17. package/dist/index.d.ts +11 -6
  18. package/dist/index.js +292 -104
  19. package/dist/index.js.map +1 -1
  20. package/dist/mcp.js +5 -2
  21. package/dist/mcp.js.map +1 -1
  22. package/dist/{provider-OBY3XFSZ.js → provider-HE727F7Z.js} +38 -139
  23. package/dist/provider-HE727F7Z.js.map +1 -0
  24. package/dist/resolve-QA3VACUP.js +111 -0
  25. package/dist/resolve-QA3VACUP.js.map +1 -0
  26. package/dist/tesseract-provider-MNMZPSGF.js +11 -0
  27. package/dist/{utils-QAK24RJS.js → utils-FFUQJTTI.js} +2 -2
  28. package/dist/utils-FFUQJTTI.js.map +1 -0
  29. package/dist/{watch-MPHX3QIH.js → watch-2O32L6IF.js} +6 -3
  30. package/dist/{watch-MPHX3QIH.js.map → watch-2O32L6IF.js.map} +1 -1
  31. package/package.json +1 -1
  32. package/dist/batch-provider-FUCIIS4M.js.map +0 -1
  33. package/dist/chunk-2ZGLFZCN.js.map +0 -1
  34. package/dist/provider-OBY3XFSZ.js.map +0 -1
  35. package/dist/resolve-LBFYRHJI.js +0 -247
  36. package/dist/resolve-LBFYRHJI.js.map +0 -1
  37. package/dist/{chunk-WWILSVMJ.js.map → chunk-STIKJGEA.js.map} +0 -0
  38. package/dist/{utils-QAK24RJS.js.map → tesseract-provider-MNMZPSGF.js.map} +0 -0
package/dist/index.js CHANGED
@@ -1998,8 +1998,8 @@ function getTesseractFallbackMessage() {
1998
1998
  "\uC124\uCE58\uB41C AI CLI\uAC00 \uC5C6\uC5B4 \uB0B4\uC7A5 tesseract.js\uB85C OCR\uC744 \uC218\uD589\uD569\uB2C8\uB2E4.",
1999
1999
  "\uB354 \uB098\uC740 \uD488\uC9C8(\uD14C\uC774\uBE14/\uD5E4\uB529 \uAD6C\uC870 \uBCF4\uC874)\uC744 \uC704\uD574 AI CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4:",
2000
2000
  "",
2001
- " [\uAD8C\uC7A5] Gemini CLI: https://ai.google.dev/gemini-api/docs/cli",
2002
- " Codex CLI: npm install -g @openai/codex",
2001
+ " [\uAD8C\uC7A5] Codex CLI: npm install -g @openai/codex",
2002
+ " Gemini CLI: https://ai.google.dev/gemini-api/docs/cli",
2003
2003
  " Claude CLI: npm install -g @anthropic-ai/claude-code",
2004
2004
  " Ollama: brew install ollama (+ ollama pull gemma4:27b)"
2005
2005
  ].join("\n");
@@ -2008,7 +2008,7 @@ var CLI_PRIORITY;
2008
2008
  var init_auto_detect = __esm({
2009
2009
  "src/ocr/auto-detect.ts"() {
2010
2010
  "use strict";
2011
- CLI_PRIORITY = ["gemini", "codex", "claude", "ollama"];
2011
+ CLI_PRIORITY = ["codex", "gemini", "claude", "ollama"];
2012
2012
  }
2013
2013
  });
2014
2014
 
@@ -2051,7 +2051,7 @@ function callCli(mode, imagePath) {
2051
2051
  const args = buildCliArgs(mode, imagePath);
2052
2052
  const result = spawnSync(mode, args, {
2053
2053
  encoding: "utf-8",
2054
- timeout: 18e4,
2054
+ timeout: 6e5,
2055
2055
  maxBuffer: 10 * 1024 * 1024,
2056
2056
  // claude: /tmp에서 실행하여 프로젝트 CLAUDE.md의 규칙 간섭 방지
2057
2057
  ...mode === "claude" ? { cwd: tmpdir() } : {}
@@ -2145,14 +2145,22 @@ async function callOllamaApi(imagePath) {
2145
2145
  return data.message?.content || "";
2146
2146
  }
2147
2147
  function stripCodeFence(text) {
2148
- const match = text.match(/^```(?:markdown|md)?\s*\n([\s\S]*?)\n```\s*$/m);
2148
+ const match = text.match(/^```(?:markdown|md)?\s*([\s\S]*?)```\s*$/m);
2149
2149
  return match ? match[1].trim() : text;
2150
2150
  }
2151
2151
  var OCR_PROMPT, _tempDir;
2152
2152
  var init_cli_provider = __esm({
2153
2153
  "src/ocr/cli-provider.ts"() {
2154
2154
  "use strict";
2155
- OCR_PROMPT = "\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD14C\uC774\uBE14\uC744 \uCD94\uCD9C\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uC138\uC694.\n\uADDC\uCE59:\n- \uD14C\uC774\uBE14\uC740 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)\n- \uBCD1\uD569\uB41C \uC140\uC740 \uD574\uB2F9 \uC704\uCE58\uC5D0 \uB0B4\uC6A9 \uAE30\uC7AC\n- \uD5E4\uB529\uC740 \uAE00\uC790 \uD06C\uAE30\uC5D0 \uB530\uB77C ## ~ ###### \uC0AC\uC6A9\n- \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9\n- \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC\n- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0\n- ```\uB85C \uAC10\uC2F8\uC9C0 \uB9D0\uACE0 \uC21C\uC218 Markdown\uB9CC \uCD9C\uB825";
2155
+ OCR_PROMPT = `\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD14C\uC774\uBE14\uC744 \uCD94\uCD9C\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uC138\uC694.
2156
+ \uADDC\uCE59:
2157
+ - \uD14C\uC774\uBE14\uC740 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)
2158
+ - \uBCD1\uD569\uB41C \uC140\uC740 \uD574\uB2F9 \uC704\uCE58\uC5D0 \uB0B4\uC6A9 \uAE30\uC7AC
2159
+ - \uD5E4\uB529\uC740 \uAE00\uC790 \uD06C\uAE30\uC5D0 \uB530\uB77C ## ~ ###### \uC0AC\uC6A9
2160
+ - \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9
2161
+ - \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC
2162
+ - \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0
2163
+ - \`\`\`\uB85C \uAC10\uC2F8\uC9C0 \uB9D0\uACE0 \uC21C\uC218 Markdown\uB9CC \uCD9C\uB825`;
2156
2164
  _tempDir = null;
2157
2165
  }
2158
2166
  });
@@ -2222,7 +2230,7 @@ __export(batch_provider_exports, {
2222
2230
  DEFAULT_BATCH_SIZES: () => DEFAULT_BATCH_SIZES,
2223
2231
  createBatchCliProvider: () => createBatchCliProvider
2224
2232
  });
2225
- import { spawnSync as spawnSync2 } from "child_process";
2233
+ import { spawn } from "child_process";
2226
2234
  import { writeFileSync as writeFileSync2, readFileSync as readFileSync2, unlinkSync as unlinkSync2, mkdirSync as mkdirSync2 } from "fs";
2227
2235
  import { join as join2 } from "path";
2228
2236
  import { tmpdir as tmpdir2 } from "os";
@@ -2249,9 +2257,9 @@ function createBatchCliProvider(mode, batchSize) {
2249
2257
  }
2250
2258
  let output;
2251
2259
  if (mode === "codex") {
2252
- output = callBatchCodexCli(tempFiles);
2260
+ output = await callBatchCodexCli(tempFiles);
2253
2261
  } else {
2254
- output = callBatchCli(mode, tempFiles);
2262
+ output = await callBatchCli(mode, tempFiles);
2255
2263
  }
2256
2264
  const cleaned = stripCodeFence2(output.trim());
2257
2265
  const parts = cleaned.split(/<!--\s*PAGE_BREAK\s*-->/).map((p) => p.trim()).filter((p) => p.length > 0);
@@ -2273,40 +2281,74 @@ function createBatchCliProvider(mode, batchSize) {
2273
2281
  }
2274
2282
  };
2275
2283
  }
2276
- function callBatchCli(mode, imagePaths) {
2284
+ function spawnAsync(cmd, args, opts) {
2285
+ return new Promise((resolve, reject) => {
2286
+ const child = spawn(cmd, args, {
2287
+ cwd: opts.cwd,
2288
+ env: process.env,
2289
+ stdio: ["pipe", "pipe", "pipe"]
2290
+ });
2291
+ let stdout = "";
2292
+ let stderr = "";
2293
+ let killed = false;
2294
+ child.stdout.setEncoding("utf-8");
2295
+ child.stderr.setEncoding("utf-8");
2296
+ child.stdout.on("data", (d) => {
2297
+ stdout += d;
2298
+ });
2299
+ child.stderr.on("data", (d) => {
2300
+ stderr += d;
2301
+ });
2302
+ const timer = setTimeout(() => {
2303
+ killed = true;
2304
+ child.kill("SIGTERM");
2305
+ }, opts.timeoutMs);
2306
+ if (opts.stdin !== void 0) {
2307
+ child.stdin.end(opts.stdin);
2308
+ } else {
2309
+ child.stdin.end();
2310
+ }
2311
+ child.on("close", (code) => {
2312
+ clearTimeout(timer);
2313
+ if (killed) {
2314
+ reject(new Error(`\uD0C0\uC784\uC544\uC6C3 (${Math.round(opts.timeoutMs / 1e3)}\uCD08)`));
2315
+ } else {
2316
+ resolve({ stdout, stderr, exitCode: code ?? 1 });
2317
+ }
2318
+ });
2319
+ child.on("error", (err) => {
2320
+ clearTimeout(timer);
2321
+ reject(err);
2322
+ });
2323
+ });
2324
+ }
2325
+ async function callBatchCli(mode, imagePaths) {
2277
2326
  const fileRefs = imagePaths.map((p) => `@${p}`).join("\n");
2278
2327
  const prompt = `${BATCH_OCR_PROMPT}
2279
2328
 
2280
2329
  ${fileRefs}`;
2281
2330
  let args;
2282
2331
  if (mode === "gemini") {
2283
- args = ["--prompt", prompt, "--yolo"];
2284
- const model = process.env.KORDOC_GEMINI_MODEL;
2285
- if (model) args.push("--model", model);
2332
+ const model = process.env.KORDOC_GEMINI_MODEL ?? "gemini-2.5-flash";
2333
+ args = ["--prompt", prompt, "--yolo", "--model", model];
2286
2334
  } else {
2287
2335
  args = ["--print", prompt];
2288
2336
  const model = process.env.KORDOC_CLAUDE_MODEL;
2289
2337
  if (model) args.push("--model", model);
2290
2338
  }
2291
2339
  const timeoutMs = 6e4 + imagePaths.length * 2e4;
2292
- const result = spawnSync2(mode, args, {
2293
- encoding: "utf-8",
2294
- timeout: timeoutMs,
2295
- maxBuffer: 50 * 1024 * 1024,
2296
- // 50MB (large batch output)
2340
+ const result = await spawnAsync(mode, args, {
2341
+ timeoutMs,
2297
2342
  ...mode === "claude" ? { cwd: tmpdir2() } : {}
2298
2343
  });
2299
- if (result.error) {
2300
- throw new Error(`${mode} \uBC30\uCE58 OCR \uC2E4\uD328: ${result.error.message}`);
2301
- }
2302
- if (result.status !== 0) {
2303
- const errMsg = result.stderr?.trim() || `exit code ${result.status}`;
2344
+ if (result.exitCode !== 0) {
2345
+ const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`;
2304
2346
  throw new Error(`${mode} \uBC30\uCE58 OCR \uC2E4\uD328: ${errMsg}`);
2305
2347
  }
2306
2348
  return result.stdout || "";
2307
2349
  }
2308
- function callBatchCodexCli(imagePaths) {
2309
- const outPath = join2(tmpdir2(), `kordoc-codex-batch-${Date.now()}.txt`);
2350
+ async function callBatchCodexCli(imagePaths) {
2351
+ const outPath = join2(tmpdir2(), `kordoc-codex-batch-${Date.now()}-${Math.random().toString(36).slice(2)}.txt`);
2310
2352
  try {
2311
2353
  const args = ["exec", BATCH_OCR_PROMPT];
2312
2354
  for (const p of imagePaths) {
@@ -2316,17 +2358,12 @@ function callBatchCodexCli(imagePaths) {
2316
2358
  const model = process.env.KORDOC_CODEX_MODEL;
2317
2359
  if (model) args.push("--model", model);
2318
2360
  const timeoutMs = 6e4 + imagePaths.length * 2e4;
2319
- const result = spawnSync2("codex", args, {
2320
- encoding: "utf-8",
2321
- timeout: timeoutMs,
2322
- maxBuffer: 50 * 1024 * 1024,
2323
- input: ""
2361
+ const result = await spawnAsync("codex", args, {
2362
+ timeoutMs,
2363
+ stdin: ""
2324
2364
  });
2325
- if (result.error) {
2326
- throw new Error(`codex \uBC30\uCE58 OCR \uC2E4\uD328: ${result.error.message}`);
2327
- }
2328
- if (result.status !== 0) {
2329
- const errMsg = result.stderr?.trim() || `exit code ${result.status}`;
2365
+ if (result.exitCode !== 0) {
2366
+ const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`;
2330
2367
  throw new Error(`codex \uBC30\uCE58 OCR \uC2E4\uD328: ${errMsg}`);
2331
2368
  }
2332
2369
  try {
@@ -2584,7 +2621,7 @@ function isBatchProvider(p) {
2584
2621
  async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1, onProgress) {
2585
2622
  const blocks = [];
2586
2623
  if (isBatchProvider(provider)) {
2587
- return ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, onProgress);
2624
+ return ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, concurrency, onProgress);
2588
2625
  }
2589
2626
  if (concurrency <= 1) {
2590
2627
  for (let i = 1; i <= effectivePageCount; i++) {
@@ -2631,43 +2668,54 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings,
2631
2668
  }
2632
2669
  return blocks;
2633
2670
  }
2634
- async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, onProgress) {
2635
- const blocks = [];
2671
+ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1, onProgress) {
2636
2672
  const pageNumbers = [];
2637
2673
  for (let i = 1; i <= effectivePageCount; i++) {
2638
2674
  if (pageFilter && !pageFilter.has(i)) continue;
2639
2675
  pageNumbers.push(i);
2640
2676
  }
2641
- const pageImages = [];
2642
- for (const pageNum of pageNumbers) {
2643
- const page = await doc.getPage(pageNum);
2644
- const image = await renderPageToPng(page);
2645
- pageImages.push({ image, pageNum });
2646
- }
2647
- const batches = [];
2648
- for (let i = 0; i < pageImages.length; i += provider.batchSize) {
2649
- batches.push(pageImages.slice(i, i + provider.batchSize));
2677
+ const pageBatches = [];
2678
+ for (let i = 0; i < pageNumbers.length; i += provider.batchSize) {
2679
+ pageBatches.push(pageNumbers.slice(i, i + provider.batchSize));
2650
2680
  }
2651
2681
  let processed = 0;
2652
- for (const batch of batches) {
2682
+ const batchTasks = pageBatches.map((batchPageNums, batchIdx) => async () => {
2683
+ const pageBlocks = [];
2653
2684
  try {
2654
- const results = await provider.processBatch(batch);
2655
- for (const { pageNum } of batch) {
2685
+ const batchImages = [];
2686
+ for (const pageNum of batchPageNums) {
2687
+ const page = await doc.getPage(pageNum);
2688
+ const image = await renderPageToPng(page);
2689
+ batchImages.push({ image, pageNum });
2690
+ }
2691
+ const results = await provider.processBatch(batchImages);
2692
+ for (const { pageNum } of batchImages) {
2656
2693
  const result = results.get(pageNum);
2657
- if (result) {
2658
- for (const b of ocrResultToBlocks(result, pageNum)) blocks.push(b);
2659
- }
2660
- processed++;
2661
- onProgress?.(processed, pageNumbers.length);
2694
+ pageBlocks.push({
2695
+ pageNum,
2696
+ blocks: result ? ocrResultToBlocks(result, pageNum) : []
2697
+ });
2662
2698
  }
2663
2699
  } catch (err) {
2664
- const range = `${batch[0].pageNum}-${batch[batch.length - 1].pageNum}`;
2700
+ const range = `${batchPageNums[0]}-${batchPageNums[batchPageNums.length - 1]}`;
2665
2701
  warnings?.push({
2666
2702
  message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
2667
2703
  code: "OCR_PAGE_FAILED"
2668
2704
  });
2669
- processed += batch.length;
2670
- onProgress?.(processed, pageNumbers.length);
2705
+ for (const pageNum of batchPageNums) {
2706
+ pageBlocks.push({ pageNum, blocks: [] });
2707
+ }
2708
+ }
2709
+ processed += batchPageNums.length;
2710
+ onProgress?.(processed, pageNumbers.length);
2711
+ return { batchIdx, pageBlocks };
2712
+ });
2713
+ const effectiveConcurrency = Math.max(1, concurrency);
2714
+ const batchResults = await runWithConcurrency(batchTasks, effectiveConcurrency);
2715
+ const blocks = [];
2716
+ for (const result of batchResults) {
2717
+ for (const { blocks: pageBlks } of result.pageBlocks) {
2718
+ for (const b of pageBlks) blocks.push(b);
2671
2719
  }
2672
2720
  }
2673
2721
  return blocks;
@@ -2711,24 +2759,29 @@ function isPdfFile(buffer) {
2711
2759
  const b = magicBytes(buffer);
2712
2760
  return b[0] === 37 && b[1] === 80 && b[2] === 68 && b[3] === 70;
2713
2761
  }
2762
+ function isPngFile(buffer) {
2763
+ const b = magicBytes(buffer);
2764
+ return b[0] === 137 && b[1] === 80 && b[2] === 78 && b[3] === 71;
2765
+ }
2714
2766
  function detectFormat(buffer) {
2715
2767
  if (buffer.byteLength < 4) return "unknown";
2716
2768
  if (isZipFile(buffer)) return "hwpx";
2717
2769
  if (isOldHwpFile(buffer)) return "hwp";
2718
2770
  if (isPdfFile(buffer)) return "pdf";
2771
+ if (isPngFile(buffer)) return "image";
2719
2772
  return "unknown";
2720
2773
  }
2721
2774
  async function detectZipFormat(buffer) {
2722
2775
  try {
2723
2776
  const zip = await JSZip.loadAsync(buffer);
2724
- if (zip.file("xl/workbook.xml")) return "xlsx";
2725
- if (zip.file("word/document.xml")) return "docx";
2726
- if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return "hwpx";
2777
+ if (zip.file("xl/workbook.xml")) return { format: "xlsx", zip };
2778
+ if (zip.file("word/document.xml")) return { format: "docx", zip };
2779
+ if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return { format: "hwpx", zip };
2727
2780
  const hasSection = Object.keys(zip.files).some((f) => f.startsWith("Contents/"));
2728
- if (hasSection) return "hwpx";
2729
- return "unknown";
2781
+ if (hasSection) return { format: "hwpx", zip };
2782
+ return { format: "unknown", zip: null };
2730
2783
  } catch {
2731
- return "unknown";
2784
+ return { format: "unknown", zip: null };
2732
2785
  }
2733
2786
  }
2734
2787
 
@@ -2737,7 +2790,7 @@ import JSZip2 from "jszip";
2737
2790
  import { DOMParser } from "@xmldom/xmldom";
2738
2791
 
2739
2792
  // src/utils.ts
2740
- var VERSION = true ? "2.3.0" : "0.0.0-dev";
2793
+ var VERSION = true ? "2.3.2" : "0.0.0-dev";
2741
2794
  function toArrayBuffer(buf) {
2742
2795
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
2743
2796
  return buf.buffer;
@@ -2897,12 +2950,16 @@ function buildTableDirect(rows, numRows) {
2897
2950
  return trimAndReturn(grid, numRows, maxCols);
2898
2951
  }
2899
2952
  function trimAndReturn(grid, numRows, maxCols) {
2900
- let effectiveCols = maxCols;
2901
- while (effectiveCols > 0) {
2902
- const colEmpty = grid.every((row) => !row[effectiveCols - 1]?.text?.trim());
2903
- if (!colEmpty) break;
2904
- effectiveCols--;
2953
+ let effectiveCols = 0;
2954
+ for (const row of grid) {
2955
+ for (let c = row.length - 1; c >= effectiveCols; c--) {
2956
+ if (row[c]?.text?.trim()) {
2957
+ effectiveCols = c + 1;
2958
+ break;
2959
+ }
2960
+ }
2905
2961
  }
2962
+ if (effectiveCols === 0) effectiveCols = maxCols;
2906
2963
  if (effectiveCols < maxCols && effectiveCols > 0) {
2907
2964
  const trimmed = grid.map((row) => row.slice(0, effectiveCols));
2908
2965
  return { rows: numRows, cols: effectiveCols, cells: trimmed, hasHeader: numRows > 1 };
@@ -3159,11 +3216,11 @@ function parseStyleElements(doc, map) {
3159
3216
  function stripDtd(xml) {
3160
3217
  return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
3161
3218
  }
3162
- async function parseHwpxDocument(buffer, options) {
3219
+ async function parseHwpxDocument(buffer, options, existingZip) {
3163
3220
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
3164
3221
  let zip;
3165
3222
  try {
3166
- zip = await JSZip2.loadAsync(buffer);
3223
+ zip = existingZip ?? await JSZip2.loadAsync(buffer);
3167
3224
  } catch {
3168
3225
  return await extractFromBrokenZip(buffer);
3169
3226
  }
@@ -6175,8 +6232,15 @@ import { getDocument, GlobalWorkerOptions } from "pdfjs-dist/legacy/build/pdf.mj
6175
6232
  GlobalWorkerOptions.workerSrc = "";
6176
6233
  var MAX_PAGES = 5e3;
6177
6234
  var MAX_TOTAL_TEXT = 500 * 1024 * 1024;
6178
- var PDF_LOAD_TIMEOUT_MS = 3e4;
6235
+ function calcPdfTimeout(bufferSize) {
6236
+ const base = 3e4;
6237
+ const perMb = 500;
6238
+ const mb = bufferSize / (1024 * 1024);
6239
+ return Math.min(base + Math.ceil(mb * perMb), 3e5);
6240
+ }
6179
6241
  async function loadPdfWithTimeout(buffer) {
6242
+ const timeoutMs = calcPdfTimeout(buffer.byteLength);
6243
+ const timeoutSec = Math.round(timeoutMs / 1e3);
6180
6244
  const loadingTask = getDocument({
6181
6245
  data: new Uint8Array(buffer),
6182
6246
  useSystemFonts: true,
@@ -6190,8 +6254,8 @@ async function loadPdfWithTimeout(buffer) {
6190
6254
  new Promise((_, reject) => {
6191
6255
  timer = setTimeout(() => {
6192
6256
  loadingTask.destroy();
6193
- reject(new KordocError("PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (30\uCD08 \uCD08\uACFC)"));
6194
- }, PDF_LOAD_TIMEOUT_MS);
6257
+ reject(new KordocError(`PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (${timeoutSec}\uCD08 \uCD08\uACFC)`));
6258
+ }, timeoutMs);
6195
6259
  })
6196
6260
  ]);
6197
6261
  } finally {
@@ -6212,11 +6276,15 @@ async function parsePdfDocument(buffer, options) {
6212
6276
  const effectivePageCount = Math.min(pageCount, MAX_PAGES);
6213
6277
  const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
6214
6278
  const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
6215
- const allFontSizes = [];
6279
+ const fontSizeFreq = /* @__PURE__ */ new Map();
6216
6280
  const pageHeights = /* @__PURE__ */ new Map();
6217
- let parsedPages = 0;
6281
+ const targetPageNums = [];
6218
6282
  for (let i = 1; i <= effectivePageCount; i++) {
6219
6283
  if (pageFilter && !pageFilter.has(i)) continue;
6284
+ targetPageNums.push(i);
6285
+ }
6286
+ let parsedPages = 0;
6287
+ const parseSinglePage = async (i) => {
6220
6288
  try {
6221
6289
  const page = await doc.getPage(i);
6222
6290
  const tc = await page.getTextContent();
@@ -6229,7 +6297,10 @@ async function parsePdfDocument(buffer, options) {
6229
6297
  warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
6230
6298
  }
6231
6299
  for (const item of visible) {
6232
- if (item.fontSize > 0) allFontSizes.push(item.fontSize);
6300
+ if (item.fontSize > 0) {
6301
+ const rounded = Math.round(item.fontSize * 10) / 10;
6302
+ fontSizeFreq.set(rounded, (fontSizeFreq.get(rounded) || 0) + 1);
6303
+ }
6233
6304
  }
6234
6305
  const opList = await page.getOperatorList();
6235
6306
  const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
@@ -6246,12 +6317,23 @@ async function parsePdfDocument(buffer, options) {
6246
6317
  if (pageErr instanceof KordocError) throw pageErr;
6247
6318
  warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
6248
6319
  }
6320
+ };
6321
+ const sampleCount = Math.min(5, targetPageNums.length);
6322
+ for (let si = 0; si < sampleCount; si++) {
6323
+ await parseSinglePage(targetPageNums[si]);
6324
+ }
6325
+ const sampleParsed = parsedPages || sampleCount;
6326
+ const isImageBased = totalChars / Math.max(sampleParsed, 1) < 10;
6327
+ if (!isImageBased) {
6328
+ for (let si = sampleCount; si < targetPageNums.length; si++) {
6329
+ await parseSinglePage(targetPageNums[si]);
6330
+ }
6249
6331
  }
6250
6332
  const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
6251
- if (totalChars / Math.max(parsedPageCount, 1) < 10) {
6333
+ if (isImageBased) {
6252
6334
  let ocrProvider = options?.ocr ?? null;
6253
- const ocrMode = options?.ocrMode;
6254
- if (!ocrProvider && ocrMode && ocrMode !== "off") {
6335
+ const ocrMode = options?.ocrMode ?? "auto";
6336
+ if (!ocrProvider && ocrMode !== "off") {
6255
6337
  try {
6256
6338
  const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
6257
6339
  const concurrency = options?.ocrConcurrency ?? 1;
@@ -6303,7 +6385,7 @@ async function parsePdfDocument(buffer, options) {
6303
6385
  blocks.splice(removed[ri], 1);
6304
6386
  }
6305
6387
  }
6306
- const medianFontSize = computeMedianFontSize(allFontSizes);
6388
+ const medianFontSize = computeMedianFromFreq(fontSizeFreq);
6307
6389
  if (medianFontSize > 0) {
6308
6390
  detectHeadings(blocks, medianFontSize);
6309
6391
  }
@@ -6356,11 +6438,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
6356
6438
  }
6357
6439
  return { visible, hiddenCount };
6358
6440
  }
6359
- function computeMedianFontSize(sizes) {
6360
- if (sizes.length === 0) return 0;
6361
- const sorted = [...sizes].sort((a, b) => a - b);
6362
- const mid = Math.floor(sorted.length / 2);
6363
- return sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid];
6441
+ function computeMedianFromFreq(freq) {
6442
+ if (freq.size === 0) return 0;
6443
+ const entries = [...freq.entries()].sort((a, b) => a[0] - b[0]);
6444
+ let total = 0;
6445
+ for (const [, count] of entries) total += count;
6446
+ const mid = total / 2;
6447
+ let cumulative = 0;
6448
+ for (const [size, count] of entries) {
6449
+ cumulative += count;
6450
+ if (cumulative >= mid) return size;
6451
+ }
6452
+ return 0;
6364
6453
  }
6365
6454
  function detectHeadings(blocks, medianFontSize) {
6366
6455
  for (const block of blocks) {
@@ -7163,6 +7252,7 @@ var MAX_SHEETS = 100;
7163
7252
  var MAX_DECOMPRESS_SIZE3 = 500 * 1024 * 1024;
7164
7253
  var MAX_ROWS2 = 1e4;
7165
7254
  var MAX_COLS2 = 200;
7255
+ var MAX_TOTAL_CELLS = 2e6;
7166
7256
  function cleanNumericValue(raw) {
7167
7257
  if (!/^-?\d+\.\d+$/.test(raw)) return raw;
7168
7258
  const num = parseFloat(raw);
@@ -7346,9 +7436,9 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
7346
7436
  }
7347
7437
  return blocks;
7348
7438
  }
7349
- async function parseXlsxDocument(buffer, options) {
7439
+ async function parseXlsxDocument(buffer, options, existingZip) {
7350
7440
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
7351
- const zip = await JSZip3.loadAsync(buffer);
7441
+ const zip = existingZip ?? await JSZip3.loadAsync(buffer);
7352
7442
  const warnings = [];
7353
7443
  const workbookFile = zip.file("xl/workbook.xml");
7354
7444
  if (!workbookFile) {
@@ -7375,6 +7465,7 @@ async function parseXlsxDocument(buffer, options) {
7375
7465
  }
7376
7466
  const blocks = [];
7377
7467
  const processedSheets = Math.min(sheets.length, MAX_SHEETS);
7468
+ let totalCells = 0;
7378
7469
  for (let i = 0; i < processedSheets; i++) {
7379
7470
  if (pageFilter && !pageFilter.has(i + 1)) continue;
7380
7471
  const sheet = sheets[i];
@@ -7401,6 +7492,11 @@ async function parseXlsxDocument(buffer, options) {
7401
7492
  try {
7402
7493
  const sheetXml = await sheetFile.async("text");
7403
7494
  const { grid, merges, maxRow, maxCol } = parseWorksheet(sheetXml, sharedStrings);
7495
+ totalCells += maxRow * maxCol;
7496
+ if (totalCells > MAX_TOTAL_CELLS) {
7497
+ warnings.push({ message: `\uCD1D \uC140 \uC218 \uC81C\uD55C \uCD08\uACFC (${totalCells.toLocaleString()}\uC140), \uC774\uD6C4 \uC2DC\uD2B8 \uC0DD\uB7B5`, code: "PARTIAL_PARSE" });
7498
+ break;
7499
+ }
7404
7500
  const sheetBlocks = sheetToBlocks(sheet.name, grid, merges, maxRow, maxCol, i);
7405
7501
  blocks.push(...sheetBlocks);
7406
7502
  } catch (err) {
@@ -7484,10 +7580,35 @@ function getAttr(el, localName) {
7484
7580
  function parseXml2(text) {
7485
7581
  return new DOMParser3().parseFromString(text, "text/xml");
7486
7582
  }
7583
+ function buildElementIndex(root) {
7584
+ const index = /* @__PURE__ */ new Map();
7585
+ const walk = (node) => {
7586
+ const children = node.childNodes;
7587
+ for (let i = 0; i < children.length; i++) {
7588
+ const child = children[i];
7589
+ if (child.nodeType === 1) {
7590
+ const el = child;
7591
+ const name = el.localName ?? "";
7592
+ if (name) {
7593
+ let list = index.get(name);
7594
+ if (!list) {
7595
+ list = [];
7596
+ index.set(name, list);
7597
+ }
7598
+ list.push(el);
7599
+ }
7600
+ walk(el);
7601
+ }
7602
+ }
7603
+ };
7604
+ walk(root);
7605
+ return index;
7606
+ }
7487
7607
  function parseStyles(xml) {
7488
7608
  const doc = parseXml2(xml);
7489
7609
  const styles = /* @__PURE__ */ new Map();
7490
- const styleElements = findElements(doc, "style");
7610
+ const idx = buildElementIndex(doc);
7611
+ const styleElements = idx.get("style") ?? [];
7491
7612
  for (const el of styleElements) {
7492
7613
  const styleId = getAttr(el, "styleId");
7493
7614
  if (!styleId) continue;
@@ -7515,7 +7636,8 @@ function parseStyles(xml) {
7515
7636
  function parseNumbering(xml) {
7516
7637
  const doc = parseXml2(xml);
7517
7638
  const abstractNums = /* @__PURE__ */ new Map();
7518
- const abstractElements = findElements(doc, "abstractNum");
7639
+ const idx = buildElementIndex(doc);
7640
+ const abstractElements = idx.get("abstractNum") ?? [];
7519
7641
  for (const el of abstractElements) {
7520
7642
  const abstractNumId = getAttr(el, "abstractNumId");
7521
7643
  if (!abstractNumId) continue;
@@ -7530,7 +7652,7 @@ function parseNumbering(xml) {
7530
7652
  abstractNums.set(abstractNumId, levels);
7531
7653
  }
7532
7654
  const nums = /* @__PURE__ */ new Map();
7533
- const numElements = findElements(doc, "num");
7655
+ const numElements = idx.get("num") ?? [];
7534
7656
  for (const el of numElements) {
7535
7657
  const numId = getAttr(el, "numId");
7536
7658
  if (!numId) continue;
@@ -7774,9 +7896,9 @@ async function extractImages(zip, rels, doc) {
7774
7896
  }
7775
7897
  return { blocks, images };
7776
7898
  }
7777
- async function parseDocxDocument(buffer, options) {
7899
+ async function parseDocxDocument(buffer, options, existingZip) {
7778
7900
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
7779
- const zip = await JSZip4.loadAsync(buffer);
7901
+ const zip = existingZip ?? await JSZip4.loadAsync(buffer);
7780
7902
  const warnings = [];
7781
7903
  const docFile = zip.file("word/document.xml");
7782
7904
  if (!docFile) {
@@ -7866,6 +7988,11 @@ async function parseDocxDocument(buffer, options) {
7866
7988
  };
7867
7989
  }
7868
7990
 
7991
+ // src/index.ts
7992
+ init_cli_provider();
7993
+ init_tesseract_provider();
7994
+ init_markdown_to_blocks();
7995
+
7869
7996
  // src/diff/text-diff.ts
7870
7997
  function similarity(a, b) {
7871
7998
  if (a === b) return 1;
@@ -10382,25 +10509,86 @@ async function parse2(input, options) {
10382
10509
  if (!buffer || buffer.byteLength === 0) {
10383
10510
  return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.", code: "EMPTY_INPUT" };
10384
10511
  }
10512
+ const MAX_FILE_SIZE = 500 * 1024 * 1024;
10513
+ if (buffer.byteLength > MAX_FILE_SIZE) {
10514
+ return { success: false, fileType: "unknown", error: `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`, code: "FILE_TOO_LARGE" };
10515
+ }
10385
10516
  const format = detectFormat(buffer);
10386
10517
  switch (format) {
10387
10518
  case "hwpx": {
10388
- const zipFormat = await detectZipFormat(buffer);
10389
- if (zipFormat === "xlsx") return parseXlsx(buffer, options);
10390
- if (zipFormat === "docx") return parseDocx(buffer, options);
10391
- return parseHwpx(buffer, options);
10519
+ const { format: zipFormat, zip } = await detectZipFormat(buffer);
10520
+ if (zipFormat === "xlsx") return parseXlsx(buffer, options, zip ?? void 0);
10521
+ if (zipFormat === "docx") return parseDocx(buffer, options, zip ?? void 0);
10522
+ return parseHwpx(buffer, options, zip ?? void 0);
10392
10523
  }
10393
10524
  case "hwp":
10394
10525
  return parseHwp(buffer, options);
10395
10526
  case "pdf":
10396
10527
  return parsePdf(buffer, options);
10528
+ case "image":
10529
+ return parseImage(buffer, options);
10397
10530
  default:
10398
10531
  return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
10399
10532
  }
10400
10533
  }
10401
- async function parseHwpx(buffer, options) {
10534
/**
 * Chooses the OCR provider used for a stand-alone image.
 *
 * - "gemini" | "claude" | "codex" | "ollama": `createCliOcrProvider(ocrMode)`.
 * - "tesseract": `await createTesseractProvider()`.
 * - "auto": tries each CLI provider in the order above (a throwing factory is
 *   logged with `console.warn` and skipped), then falls back to tesseract.
 * - anything else: no provider.
 *
 * @param {string} ocrMode - Requested OCR mode.
 * @returns {Promise<{provider: Function|undefined, mode: string}>} The provider
 *   (or `undefined`) and the mode that was actually selected.
 */
async function selectImageOcrProvider(ocrMode) {
  const cliModes = ["gemini", "claude", "codex", "ollama"];
  if (cliModes.includes(ocrMode)) {
    return { provider: createCliOcrProvider(ocrMode), mode: ocrMode };
  }
  if (ocrMode === "tesseract") {
    return { provider: await createTesseractProvider(), mode: ocrMode };
  }
  if (ocrMode !== "auto") {
    return { provider: void 0, mode: "auto" };
  }
  for (const mode of cliModes) {
    try {
      const provider = createCliOcrProvider(mode);
      if (provider) {
        return { provider, mode };
      }
    } catch (e) {
      console.warn(`[kordoc] OCR auto-detection: ${mode} CLI not available or failed. Trying next.`, e);
    }
  }
  return { provider: await createTesseractProvider(), mode: "tesseract" };
}

/**
 * Runs OCR over a stand-alone image buffer and converts the recognised
 * markdown into blocks.
 *
 * Never rejects: every failure is reported as a `{ success: false, ... }`
 * result. The provider's optional `terminate()` is invoked exactly once, in a
 * `finally`, and a failing `terminate()` is logged instead of replacing the
 * parse result.
 *
 * NOTE(review): the MIME type handed to the provider is always "image/png",
 * whatever the bytes actually are — confirm the providers do not rely on it.
 * NOTE(review): the "off" branch reuses the "IMAGE_BASED_PDF" code for a plain
 * image; kept unchanged because callers may match on it.
 *
 * @param {ArrayBuffer|ArrayLike<number>} buffer - Raw image bytes.
 * @param {{ocrMode?: string}} [options] - `ocrMode` defaults to "auto".
 * @returns {Promise<object>} Parse result with `fileType: "image"`.
 */
async function parseImage(buffer, options) {
  const ocrMode = options?.ocrMode || "auto";
  if (ocrMode === "off") {
    return { success: false, fileType: "image", error: "OCR\uC774 \uBE44\uD65C\uC131\uD654\uB418\uC5B4 \uC774\uBBF8\uC9C0 \uD30C\uC77C\uC744 \uCC98\uB9AC\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "IMAGE_BASED_PDF" };
  }
  let ocrProvider;
  try {
    const selection = await selectImageOcrProvider(ocrMode);
    ocrProvider = selection.provider;
    const actualOcrMode = selection.mode;
    if (!ocrProvider) {
      return { success: false, fileType: "image", error: "\uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR \uD504\uB85C\uBC14\uC774\uB354\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "PARSE_ERROR" };
    }
    const imageUint8Array = new Uint8Array(buffer);
    const ocrResult = await ocrProvider(imageUint8Array, 1, "image/png");
    // Providers may answer with a bare markdown string or an object carrying `markdown`.
    const markdown = typeof ocrResult === "string" ? ocrResult : ocrResult.markdown;
    const blocks = markdownToBlocks(markdown, 1);
    return {
      success: true,
      fileType: "image",
      markdown,
      blocks,
      isImageBased: true,
      warnings: [{ message: `OCR \uCC98\uB9AC\uB428 (${actualOcrMode})`, code: "OCR_FALLBACK" }]
    };
  } catch (err) {
    return { success: false, fileType: "image", error: err instanceof Error ? err.message : "\uC774\uBBF8\uC9C0 OCR \uC2E4\uD328", code: classifyError(err) };
  } finally {
    // Single cleanup point: runs after success and failure alike, so the
    // provider is never terminated twice and never leaked.
    if (typeof ocrProvider?.terminate === "function") {
      try {
        await ocrProvider.terminate();
      } catch (e) {
        console.warn("[kordoc] OCR provider terminate() failed.", e);
      }
    }
  }
}
10589
+ async function parseHwpx(buffer, options, zip) {
10402
10590
  try {
10403
- const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options);
10591
+ const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options, zip);
10404
10592
  return { success: true, fileType: "hwpx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
10405
10593
  } catch (err) {
10406
10594
  return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
@@ -10423,17 +10611,17 @@ async function parsePdf(buffer, options) {
10423
10611
  return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err), isImageBased };
10424
10612
  }
10425
10613
  }
10426
/**
 * parseXlsx — result-object wrapper around parseXlsxDocument.
 *
 * On success it returns { success: true, fileType: "xlsx", markdown, blocks,
 * metadata, warnings }. Any thrown error is turned into a
 * { success: false, fileType: "xlsx", error, code } result, where `error` is
 * err.message for Error instances (otherwise the fallback string) and `code`
 * comes from classifyError(err).
 *
 * Change shown in this hunk: a third parameter `zip` is added and forwarded
 * unchanged to parseXlsxDocument. parse2 passes the `zip` returned by
 * detectZipFormat here.
 * NOTE(review): presumably parseXlsxDocument reuses that archive instead of
 * loading the buffer again — its body is not visible here, confirm.
 */
- async function parseXlsx(buffer, options) {
10614
+ async function parseXlsx(buffer, options, zip) {
10427
10615
  try {
10428
- const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options);
10616
+ const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options, zip);
10429
10617
  return { success: true, fileType: "xlsx", markdown, blocks, metadata, warnings };
10430
10618
  } catch (err) {
10431
10619
  // Failures are reported through the result object rather than re-thrown.
  return { success: false, fileType: "xlsx", error: err instanceof Error ? err.message : "XLSX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
10432
10620
  }
10433
10621
  }
10434
- async function parseDocx(buffer, options) {
10622
+ async function parseDocx(buffer, options, zip) {
10435
10623
  try {
10436
- const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options);
10624
+ const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options, zip);
10437
10625
  return { success: true, fileType: "docx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
10438
10626
  } catch (err) {
10439
10627
  return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };