@clazic/kordoc 2.4.18 → 2.4.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/dist/{auto-detect-2YGFYQCN.js → auto-detect-CBYICI6B.js} +4 -4
  2. package/dist/{chunk-T7EBS5XP.js → chunk-463YQ2WL.js} +8 -18
  3. package/dist/chunk-463YQ2WL.js.map +1 -0
  4. package/dist/{chunk-7NOZFYH6.js → chunk-CLK4PNZ7.js} +7 -8
  5. package/dist/chunk-CLK4PNZ7.js.map +1 -0
  6. package/dist/{chunk-W2KDIKDF.js → chunk-MZN7PLTZ.js} +2 -2
  7. package/dist/{chunk-34WIGIQC.js → chunk-Y4WFKJ5P.js} +1 -1
  8. package/dist/chunk-Y4WFKJ5P.js.map +1 -0
  9. package/dist/cli.js +9 -13
  10. package/dist/cli.js.map +1 -1
  11. package/dist/index.cjs +16 -105
  12. package/dist/index.cjs.map +1 -1
  13. package/dist/index.d.cts +5 -6
  14. package/dist/index.d.ts +5 -6
  15. package/dist/index.js +16 -104
  16. package/dist/index.js.map +1 -1
  17. package/dist/mcp.js +5 -6
  18. package/dist/mcp.js.map +1 -1
  19. package/dist/{resolve-673XFZQ6.js → resolve-XWYJYKKH.js} +15 -36
  20. package/dist/resolve-XWYJYKKH.js.map +1 -0
  21. package/dist/{utils-DHOODYKU.js → utils-YUAT7LFD.js} +2 -2
  22. package/dist/{watch-YGIU7RN7.js → watch-WEOFVVDO.js} +5 -6
  23. package/dist/{watch-YGIU7RN7.js.map → watch-WEOFVVDO.js.map} +1 -1
  24. package/package.json +1 -2
  25. package/dist/chunk-34WIGIQC.js.map +0 -1
  26. package/dist/chunk-7FMKAV4P.js +0 -56
  27. package/dist/chunk-7FMKAV4P.js.map +0 -1
  28. package/dist/chunk-7NOZFYH6.js.map +0 -1
  29. package/dist/chunk-T7EBS5XP.js.map +0 -1
  30. package/dist/resolve-673XFZQ6.js.map +0 -1
  31. package/dist/tesseract-provider-MNMZPSGF.js +0 -11
  32. package/dist/utils-DHOODYKU.js.map +0 -1
  33. /package/dist/{auto-detect-2YGFYQCN.js.map → auto-detect-CBYICI6B.js.map} +0 -0
  34. /package/dist/{chunk-W2KDIKDF.js.map → chunk-MZN7PLTZ.js.map} +0 -0
  35. /package/dist/{tesseract-provider-MNMZPSGF.js.map → utils-YUAT7LFD.js.map} +0 -0
package/dist/index.d.cts CHANGED
@@ -106,17 +106,16 @@ interface ParseOptions {
106
106
  ocr?: OcrProvider;
107
107
  /**
108
108
  * OCR 모드 (CLI 자동 탐색용).
109
- * - "auto": 설치된 CLI 자동 탐색 (gemini→claude→codex→ollama→tesseract)
110
- * - "gemini"|"claude"|"codex"|"ollama"|"tesseract": 특정 도구 강제 지정
109
+ * - "auto": 설치된 CLI 자동 탐색 (codex→gemini→claude→ollama)
110
+ * - "gemini"|"claude"|"codex"|"ollama": 특정 도구 강제 지정
111
111
  * - "off": OCR 비활성화 (이미지 기반 PDF면 에러)
112
112
  * - undefined: 라이브러리 API 기존 동작 유지 (자동 탐색 안 함)
113
113
  */
114
114
  ocrMode?: OcrMode;
115
115
  /**
116
116
  * OCR 병렬 처리 수.
117
- * - tesseract: 기본값은 CPU 코어 (병렬 처리로 속도 향상)
118
- * - CLI 제공 프로바이더(gemini/claude/codex): 기본 1 (rate limit 보호)
119
- * - 1: 순차 처리 (기존 동작)
117
+ * - CLI 제공 프로바이더(gemini/claude/codex): 기본 4 (배치 병렬 실행)
118
+ * - 1: 순차 처리
120
119
  */
121
120
  ocrConcurrency?: number;
122
121
  /** 진행률 콜백 — current: 현재 페이지/섹션, total: 전체 수 */
@@ -241,7 +240,7 @@ interface StructuredOcrResult {
241
240
  markdown: string;
242
241
  }
243
242
  /** OCR 모드 — CLI --ocr 옵션 허용값 */
244
- type OcrMode = "auto" | "gemini" | "claude" | "codex" | "ollama" | "tesseract" | "off";
243
+ type OcrMode = "auto" | "gemini" | "claude" | "codex" | "ollama" | "off";
245
244
  /** 사용자 제공 OCR 함수 — 페이지 이미지를 받아 텍스트 또는 구조화된 결과 반환 */
246
245
  type OcrProvider = (pageImage: Uint8Array, pageNumber: number, mimeType: "image/png") => Promise<string | StructuredOcrResult>;
247
246
  interface WatchOptions {
package/dist/index.d.ts CHANGED
@@ -106,17 +106,16 @@ interface ParseOptions {
106
106
  ocr?: OcrProvider;
107
107
  /**
108
108
  * OCR 모드 (CLI 자동 탐색용).
109
- * - "auto": 설치된 CLI 자동 탐색 (gemini→claude→codex→ollama→tesseract)
110
- * - "gemini"|"claude"|"codex"|"ollama"|"tesseract": 특정 도구 강제 지정
109
+ * - "auto": 설치된 CLI 자동 탐색 (codex→gemini→claude→ollama)
110
+ * - "gemini"|"claude"|"codex"|"ollama": 특정 도구 강제 지정
111
111
  * - "off": OCR 비활성화 (이미지 기반 PDF면 에러)
112
112
  * - undefined: 라이브러리 API 기존 동작 유지 (자동 탐색 안 함)
113
113
  */
114
114
  ocrMode?: OcrMode;
115
115
  /**
116
116
  * OCR 병렬 처리 수.
117
- * - tesseract: 기본값은 CPU 코어 (병렬 처리로 속도 향상)
118
- * - CLI 제공 프로바이더(gemini/claude/codex): 기본 1 (rate limit 보호)
119
- * - 1: 순차 처리 (기존 동작)
117
+ * - CLI 제공 프로바이더(gemini/claude/codex): 기본 4 (배치 병렬 실행)
118
+ * - 1: 순차 처리
120
119
  */
121
120
  ocrConcurrency?: number;
122
121
  /** 진행률 콜백 — current: 현재 페이지/섹션, total: 전체 수 */
@@ -241,7 +240,7 @@ interface StructuredOcrResult {
241
240
  markdown: string;
242
241
  }
243
242
  /** OCR 모드 — CLI --ocr 옵션 허용값 */
244
- type OcrMode = "auto" | "gemini" | "claude" | "codex" | "ollama" | "tesseract" | "off";
243
+ type OcrMode = "auto" | "gemini" | "claude" | "codex" | "ollama" | "off";
245
244
  /** 사용자 제공 OCR 함수 — 페이지 이미지를 받아 텍스트 또는 구조화된 결과 반환 */
246
245
  type OcrProvider = (pageImage: Uint8Array, pageNumber: number, mimeType: "image/png") => Promise<string | StructuredOcrResult>;
247
246
  interface WatchOptions {
package/dist/index.js CHANGED
@@ -2183,7 +2183,7 @@ var auto_detect_exports = {};
2183
2183
  __export(auto_detect_exports, {
2184
2184
  detectAvailableOcr: () => detectAvailableOcr,
2185
2185
  getAutoFallbackChain: () => getAutoFallbackChain,
2186
- getTesseractFallbackMessage: () => getTesseractFallbackMessage,
2186
+ getNoCliMessage: () => getNoCliMessage,
2187
2187
  validateOcrMode: () => validateOcrMode
2188
2188
  });
2189
2189
  import { execSync } from "child_process";
@@ -2191,7 +2191,7 @@ function detectAvailableOcr() {
2191
2191
  for (const cli of CLI_PRIORITY) {
2192
2192
  if (isCliInstalled(cli)) return cli;
2193
2193
  }
2194
- return "tesseract";
2194
+ return null;
2195
2195
  }
2196
2196
  function isCliInstalled(name) {
2197
2197
  try {
@@ -2207,11 +2207,10 @@ function getAutoFallbackChain() {
2207
2207
  for (const cli of CLI_PRIORITY) {
2208
2208
  if (isCliInstalled(cli)) chain.push(cli);
2209
2209
  }
2210
- chain.push("tesseract");
2211
2210
  return chain;
2212
2211
  }
2213
2212
  function validateOcrMode(mode) {
2214
- if (mode === "auto" || mode === "off" || mode === "tesseract") return;
2213
+ if (mode === "auto" || mode === "off") return;
2215
2214
  if (!isCliInstalled(mode)) {
2216
2215
  throw new Error(`'${mode}' CLI\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4.
2217
2216
  ${getInstallGuide(mode)}`);
@@ -2226,10 +2225,10 @@ function getInstallGuide(mode) {
2226
2225
  };
2227
2226
  return guides[mode] || `'${mode}'\uC744(\uB97C) \uC124\uCE58\uD574\uC8FC\uC138\uC694.`;
2228
2227
  }
2229
- function getTesseractFallbackMessage() {
2228
+ function getNoCliMessage() {
2230
2229
  return [
2231
- "\uC124\uCE58\uB41C AI CLI\uAC00 \uC5C6\uC5B4 \uB0B4\uC7A5 tesseract.js\uB85C OCR\uC744 \uC218\uD589\uD569\uB2C8\uB2E4.",
2232
- "\uB354 \uB098\uC740 \uD488\uC9C8(\uD14C\uC774\uBE14/\uD5E4\uB529 \uAD6C\uC870 \uBCF4\uC874)\uC744 \uC704\uD574 AI CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4:",
2230
+ "\uC124\uCE58\uB41C AI CLI\uAC00 \uC5C6\uC5B4 OCR\uC744 \uC218\uD589\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.",
2231
+ "\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF \uCC98\uB9AC\uB97C \uC704\uD574 AI CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4:",
2233
2232
  "",
2234
2233
  " [\uAD8C\uC7A5] Codex CLI: npm install -g @openai/codex",
2235
2234
  " Gemini CLI: https://ai.google.dev/gemini-api/docs/cli",
@@ -2412,65 +2411,6 @@ var init_cli_provider = __esm({
2412
2411
  }
2413
2412
  });
2414
2413
 
2415
- // src/ocr/tesseract-provider.ts
2416
- var tesseract_provider_exports = {};
2417
- __export(tesseract_provider_exports, {
2418
- createTesseractPoolProvider: () => createTesseractPoolProvider,
2419
- createTesseractProvider: () => createTesseractProvider
2420
- });
2421
- import { createWorker } from "tesseract.js";
2422
- async function createTesseractProvider() {
2423
- const worker = await createWorker("kor+eng");
2424
- let terminated = false;
2425
- const provider = async (pageImage, _pageNumber, _mimeType) => {
2426
- const { data } = await worker.recognize(pageImage);
2427
- return data.text;
2428
- };
2429
- provider.terminate = async () => {
2430
- if (!terminated) {
2431
- await worker.terminate();
2432
- terminated = true;
2433
- }
2434
- };
2435
- return provider;
2436
- }
2437
- async function createTesseractPoolProvider(concurrency) {
2438
- const workers = await Promise.all(
2439
- Array.from({ length: concurrency }, () => createWorker("kor+eng"))
2440
- );
2441
- const idle = [...workers];
2442
- const waitQueue = [];
2443
- function acquire() {
2444
- if (idle.length > 0) return Promise.resolve(idle.pop());
2445
- return new Promise((resolve4) => waitQueue.push(resolve4));
2446
- }
2447
- function release(w) {
2448
- if (waitQueue.length > 0) {
2449
- waitQueue.shift()(w);
2450
- } else {
2451
- idle.push(w);
2452
- }
2453
- }
2454
- const provider = async (pageImage, _pageNumber, _mimeType) => {
2455
- const w = await acquire();
2456
- try {
2457
- const { data } = await w.recognize(pageImage);
2458
- return data.text;
2459
- } finally {
2460
- release(w);
2461
- }
2462
- };
2463
- provider.terminate = async () => {
2464
- await Promise.all(workers.map((w) => w.terminate()));
2465
- };
2466
- return provider;
2467
- }
2468
- var init_tesseract_provider = __esm({
2469
- "src/ocr/tesseract-provider.ts"() {
2470
- "use strict";
2471
- }
2472
- });
2473
-
2474
2414
  // src/ocr/batch-provider.ts
2475
2415
  var batch_provider_exports = {};
2476
2416
  __export(batch_provider_exports, {
@@ -2679,15 +2619,6 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
2679
2619
  }
2680
2620
  if (mode !== "auto") {
2681
2621
  validateOcrMode(mode);
2682
- if (mode === "tesseract") {
2683
- const { createTesseractProvider: createTesseractProvider2, createTesseractPoolProvider: createTesseractPoolProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
2684
- if (concurrency && concurrency > 1) {
2685
- logger.log({ level: "info", event: "done", message: "Tesseract pool provider \uC120\uD0DD", meta: { concurrency } });
2686
- return createTesseractPoolProvider2(concurrency);
2687
- }
2688
- logger.log({ level: "info", event: "done", message: "Tesseract single provider \uC120\uD0DD" });
2689
- return createTesseractProvider2();
2690
- }
2691
2622
  if (mode === "gemini" || mode === "claude" || mode === "codex") {
2692
2623
  const { createBatchCliProvider: createBatchCliProvider2, DEFAULT_BATCH_SIZES: DEFAULT_BATCH_SIZES2 } = await Promise.resolve().then(() => (init_batch_provider(), batch_provider_exports));
2693
2624
  const effectiveBatch = batchSize ?? DEFAULT_BATCH_SIZES2[mode];
@@ -2703,27 +2634,16 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
2703
2634
  }
2704
2635
  const detected = detectAvailableOcr();
2705
2636
  logger.log({ level: "info", event: "progress", message: "OCR auto \uAC10\uC9C0 \uACB0\uACFC", meta: { detected } });
2706
- if (detected !== "codex") {
2707
- if (detected === "tesseract") {
2708
- warnings?.push({
2709
- message: getTesseractFallbackMessage(),
2710
- code: "OCR_CLI_FALLBACK"
2711
- });
2712
- } else {
2713
- warnings?.push({
2714
- message: `OCR: '${detected}' \uC0AC\uC6A9 \uC911 (codex CLI\uAC00 \uC5C6\uC5B4 fallback). \uB354 \uB098\uC740 \uD488\uC9C8\uC744 \uC704\uD574 codex CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4.`,
2715
- code: "OCR_CLI_FALLBACK"
2716
- });
2717
- }
2637
+ if (!detected) {
2638
+ throw new Error(
2639
+ "\uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR CLI\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4. \uB2E4\uC74C \uC911 \uD558\uB098\uB97C \uC124\uCE58\uD558\uC138\uC694:\n Codex CLI: npm install -g @openai/codex\n Claude CLI: npm install -g @anthropic-ai/claude-code\n Gemini CLI: https://ai.google.dev/gemini-api/docs/cli"
2640
+ );
2718
2641
  }
2719
- if (detected === "tesseract") {
2720
- const { createTesseractProvider: createTesseractProvider2, createTesseractPoolProvider: createTesseractPoolProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
2721
- if (concurrency && concurrency > 1) {
2722
- logger.log({ level: "info", event: "done", message: "AUTO: Tesseract pool provider \uC120\uD0DD", meta: { concurrency } });
2723
- return createTesseractPoolProvider2(concurrency);
2724
- }
2725
- logger.log({ level: "info", event: "done", message: "AUTO: Tesseract single provider \uC120\uD0DD" });
2726
- return createTesseractProvider2();
2642
+ if (detected !== "codex") {
2643
+ warnings?.push({
2644
+ message: `OCR: '${detected}' \uC0AC\uC6A9 \uC911 (codex CLI\uAC00 \uC5C6\uC5B4 fallback). \uB354 \uB098\uC740 \uD488\uC9C8\uC744 \uC704\uD574 codex CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4.`,
2645
+ code: "OCR_CLI_FALLBACK"
2646
+ });
2727
2647
  }
2728
2648
  if (detected === "gemini" || detected === "codex" || detected === "claude") {
2729
2649
  const { createBatchCliProvider: createBatchCliProvider2, DEFAULT_BATCH_SIZES: DEFAULT_BATCH_SIZES2 } = await Promise.resolve().then(() => (init_batch_provider(), batch_provider_exports));
@@ -3115,7 +3035,7 @@ import JSZip2 from "jszip";
3115
3035
  import { DOMParser } from "@xmldom/xmldom";
3116
3036
 
3117
3037
  // src/utils.ts
3118
- var VERSION = true ? "2.4.17" : "0.0.0-dev";
3038
+ var VERSION = true ? "2.4.19" : "0.0.0-dev";
3119
3039
  function toArrayBuffer(buf) {
3120
3040
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
3121
3041
  return buf.buffer;
@@ -8711,7 +8631,6 @@ async function parseDocxDocument(buffer, options, existingZip) {
8711
8631
 
8712
8632
  // src/index.ts
8713
8633
  init_cli_provider();
8714
- init_tesseract_provider();
8715
8634
  init_markdown_to_blocks();
8716
8635
  init_logger();
8717
8636
 
@@ -12070,9 +11989,6 @@ async function parseImage(buffer, options) {
12070
11989
  if (ocrMode === "gemini" || ocrMode === "claude" || ocrMode === "codex" || ocrMode === "ollama") {
12071
11990
  ocrProvider = createCliOcrProvider(ocrMode);
12072
11991
  actualOcrMode = ocrMode;
12073
- } else if (ocrMode === "tesseract") {
12074
- ocrProvider = await createTesseractProvider();
12075
- actualOcrMode = ocrMode;
12076
11992
  } else if (ocrMode === "auto") {
12077
11993
  const modesToTry = ["gemini", "claude", "codex", "ollama"];
12078
11994
  for (const mode of modesToTry) {
@@ -12084,10 +12000,6 @@ async function parseImage(buffer, options) {
12084
12000
  console.warn(`[kordoc] OCR auto-detection: ${mode} CLI not available or failed. Trying next.`, e);
12085
12001
  }
12086
12002
  }
12087
- if (!ocrProvider) {
12088
- ocrProvider = await createTesseractProvider();
12089
- actualOcrMode = "tesseract";
12090
- }
12091
12003
  }
12092
12004
  if (!ocrProvider) {
12093
12005
  return { success: false, fileType: "image", error: "\uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR \uD504\uB85C\uBC14\uC774\uB354\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "PARSE_ERROR" };