@clazic/kordoc 2.4.17 → 2.4.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/dist/{auto-detect-2YGFYQCN.js → auto-detect-CBYICI6B.js} +4 -4
  2. package/dist/{chunk-WM3XI23V.js → chunk-463YQ2WL.js} +38 -25
  3. package/dist/chunk-463YQ2WL.js.map +1 -0
  4. package/dist/{chunk-7NOZFYH6.js → chunk-CLK4PNZ7.js} +7 -8
  5. package/dist/chunk-CLK4PNZ7.js.map +1 -0
  6. package/dist/{chunk-W2KDIKDF.js → chunk-MZN7PLTZ.js} +2 -2
  7. package/dist/{chunk-34WIGIQC.js → chunk-Y4WFKJ5P.js} +1 -1
  8. package/dist/chunk-Y4WFKJ5P.js.map +1 -0
  9. package/dist/cli.js +9 -13
  10. package/dist/cli.js.map +1 -1
  11. package/dist/index.cjs +49 -191
  12. package/dist/index.cjs.map +1 -1
  13. package/dist/index.d.cts +5 -6
  14. package/dist/index.d.ts +5 -6
  15. package/dist/index.js +49 -190
  16. package/dist/index.js.map +1 -1
  17. package/dist/mcp.js +5 -6
  18. package/dist/mcp.js.map +1 -1
  19. package/dist/{resolve-673XFZQ6.js → resolve-XWYJYKKH.js} +15 -36
  20. package/dist/resolve-XWYJYKKH.js.map +1 -0
  21. package/dist/{utils-DHOODYKU.js → utils-YUAT7LFD.js} +2 -2
  22. package/dist/{watch-RM4VNOL4.js → watch-WEOFVVDO.js} +5 -6
  23. package/dist/{watch-RM4VNOL4.js.map → watch-WEOFVVDO.js.map} +1 -1
  24. package/package.json +1 -2
  25. package/dist/chunk-34WIGIQC.js.map +0 -1
  26. package/dist/chunk-7FMKAV4P.js +0 -56
  27. package/dist/chunk-7FMKAV4P.js.map +0 -1
  28. package/dist/chunk-7NOZFYH6.js.map +0 -1
  29. package/dist/chunk-WM3XI23V.js.map +0 -1
  30. package/dist/resolve-673XFZQ6.js.map +0 -1
  31. package/dist/tesseract-provider-MNMZPSGF.js +0 -11
  32. package/dist/utils-DHOODYKU.js.map +0 -1
  33. /package/dist/{auto-detect-2YGFYQCN.js.map → auto-detect-CBYICI6B.js.map} +0 -0
  34. /package/dist/{chunk-W2KDIKDF.js.map → chunk-MZN7PLTZ.js.map} +0 -0
  35. /package/dist/{tesseract-provider-MNMZPSGF.js.map → utils-YUAT7LFD.js.map} +0 -0
package/dist/index.d.cts CHANGED
@@ -106,17 +106,16 @@ interface ParseOptions {
106
106
  ocr?: OcrProvider;
107
107
  /**
108
108
  * OCR 모드 (CLI 자동 탐색용).
109
- * - "auto": 설치된 CLI 자동 탐색 (gemini→claude→codex→ollama→tesseract)
110
- * - "gemini"|"claude"|"codex"|"ollama"|"tesseract": 특정 도구 강제 지정
109
+ * - "auto": 설치된 CLI 자동 탐색 (codex→gemini→claude→ollama)
110
+ * - "gemini"|"claude"|"codex"|"ollama": 특정 도구 강제 지정
111
111
  * - "off": OCR 비활성화 (이미지 기반 PDF면 에러)
112
112
  * - undefined: 라이브러리 API 기존 동작 유지 (자동 탐색 안 함)
113
113
  */
114
114
  ocrMode?: OcrMode;
115
115
  /**
116
116
  * OCR 병렬 처리 수.
117
- * - tesseract: 기본값은 CPU 코어 (병렬 처리로 속도 향상)
118
- * - CLI 제공 프로바이더(gemini/claude/codex): 기본 1 (rate limit 보호)
119
- * - 1: 순차 처리 (기존 동작)
117
+ * - CLI 제공 프로바이더(gemini/claude/codex): 기본 4 (배치 병렬 실행)
118
+ * - 1: 순차 처리
120
119
  */
121
120
  ocrConcurrency?: number;
122
121
  /** 진행률 콜백 — current: 현재 페이지/섹션, total: 전체 수 */
@@ -241,7 +240,7 @@ interface StructuredOcrResult {
241
240
  markdown: string;
242
241
  }
243
242
  /** OCR 모드 — CLI --ocr 옵션 허용값 */
244
- type OcrMode = "auto" | "gemini" | "claude" | "codex" | "ollama" | "tesseract" | "off";
243
+ type OcrMode = "auto" | "gemini" | "claude" | "codex" | "ollama" | "off";
245
244
  /** 사용자 제공 OCR 함수 — 페이지 이미지를 받아 텍스트 또는 구조화된 결과 반환 */
246
245
  type OcrProvider = (pageImage: Uint8Array, pageNumber: number, mimeType: "image/png") => Promise<string | StructuredOcrResult>;
247
246
  interface WatchOptions {
package/dist/index.d.ts CHANGED
@@ -106,17 +106,16 @@ interface ParseOptions {
106
106
  ocr?: OcrProvider;
107
107
  /**
108
108
  * OCR 모드 (CLI 자동 탐색용).
109
- * - "auto": 설치된 CLI 자동 탐색 (gemini→claude→codex→ollama→tesseract)
110
- * - "gemini"|"claude"|"codex"|"ollama"|"tesseract": 특정 도구 강제 지정
109
+ * - "auto": 설치된 CLI 자동 탐색 (codex→gemini→claude→ollama)
110
+ * - "gemini"|"claude"|"codex"|"ollama": 특정 도구 강제 지정
111
111
  * - "off": OCR 비활성화 (이미지 기반 PDF면 에러)
112
112
  * - undefined: 라이브러리 API 기존 동작 유지 (자동 탐색 안 함)
113
113
  */
114
114
  ocrMode?: OcrMode;
115
115
  /**
116
116
  * OCR 병렬 처리 수.
117
- * - tesseract: 기본값은 CPU 코어 (병렬 처리로 속도 향상)
118
- * - CLI 제공 프로바이더(gemini/claude/codex): 기본 1 (rate limit 보호)
119
- * - 1: 순차 처리 (기존 동작)
117
+ * - CLI 제공 프로바이더(gemini/claude/codex): 기본 4 (배치 병렬 실행)
118
+ * - 1: 순차 처리
120
119
  */
121
120
  ocrConcurrency?: number;
122
121
  /** 진행률 콜백 — current: 현재 페이지/섹션, total: 전체 수 */
@@ -241,7 +240,7 @@ interface StructuredOcrResult {
241
240
  markdown: string;
242
241
  }
243
242
  /** OCR 모드 — CLI --ocr 옵션 허용값 */
244
- type OcrMode = "auto" | "gemini" | "claude" | "codex" | "ollama" | "tesseract" | "off";
243
+ type OcrMode = "auto" | "gemini" | "claude" | "codex" | "ollama" | "off";
245
244
  /** 사용자 제공 OCR 함수 — 페이지 이미지를 받아 텍스트 또는 구조화된 결과 반환 */
246
245
  type OcrProvider = (pageImage: Uint8Array, pageNumber: number, mimeType: "image/png") => Promise<string | StructuredOcrResult>;
247
246
  interface WatchOptions {
package/dist/index.js CHANGED
@@ -2183,7 +2183,7 @@ var auto_detect_exports = {};
2183
2183
  __export(auto_detect_exports, {
2184
2184
  detectAvailableOcr: () => detectAvailableOcr,
2185
2185
  getAutoFallbackChain: () => getAutoFallbackChain,
2186
- getTesseractFallbackMessage: () => getTesseractFallbackMessage,
2186
+ getNoCliMessage: () => getNoCliMessage,
2187
2187
  validateOcrMode: () => validateOcrMode
2188
2188
  });
2189
2189
  import { execSync } from "child_process";
@@ -2191,7 +2191,7 @@ function detectAvailableOcr() {
2191
2191
  for (const cli of CLI_PRIORITY) {
2192
2192
  if (isCliInstalled(cli)) return cli;
2193
2193
  }
2194
- return "tesseract";
2194
+ return null;
2195
2195
  }
2196
2196
  function isCliInstalled(name) {
2197
2197
  try {
@@ -2207,11 +2207,10 @@ function getAutoFallbackChain() {
2207
2207
  for (const cli of CLI_PRIORITY) {
2208
2208
  if (isCliInstalled(cli)) chain.push(cli);
2209
2209
  }
2210
- chain.push("tesseract");
2211
2210
  return chain;
2212
2211
  }
2213
2212
  function validateOcrMode(mode) {
2214
- if (mode === "auto" || mode === "off" || mode === "tesseract") return;
2213
+ if (mode === "auto" || mode === "off") return;
2215
2214
  if (!isCliInstalled(mode)) {
2216
2215
  throw new Error(`'${mode}' CLI\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4.
2217
2216
  ${getInstallGuide(mode)}`);
@@ -2226,10 +2225,10 @@ function getInstallGuide(mode) {
2226
2225
  };
2227
2226
  return guides[mode] || `'${mode}'\uC744(\uB97C) \uC124\uCE58\uD574\uC8FC\uC138\uC694.`;
2228
2227
  }
2229
- function getTesseractFallbackMessage() {
2228
+ function getNoCliMessage() {
2230
2229
  return [
2231
- "\uC124\uCE58\uB41C AI CLI\uAC00 \uC5C6\uC5B4 \uB0B4\uC7A5 tesseract.js\uB85C OCR\uC744 \uC218\uD589\uD569\uB2C8\uB2E4.",
2232
- "\uB354 \uB098\uC740 \uD488\uC9C8(\uD14C\uC774\uBE14/\uD5E4\uB529 \uAD6C\uC870 \uBCF4\uC874)\uC744 \uC704\uD574 AI CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4:",
2230
+ "\uC124\uCE58\uB41C AI CLI\uAC00 \uC5C6\uC5B4 OCR\uC744 \uC218\uD589\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.",
2231
+ "\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF \uCC98\uB9AC\uB97C \uC704\uD574 AI CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4:",
2233
2232
  "",
2234
2233
  " [\uAD8C\uC7A5] Codex CLI: npm install -g @openai/codex",
2235
2234
  " Gemini CLI: https://ai.google.dev/gemini-api/docs/cli",
@@ -2412,65 +2411,6 @@ var init_cli_provider = __esm({
2412
2411
  }
2413
2412
  });
2414
2413
 
2415
- // src/ocr/tesseract-provider.ts
2416
- var tesseract_provider_exports = {};
2417
- __export(tesseract_provider_exports, {
2418
- createTesseractPoolProvider: () => createTesseractPoolProvider,
2419
- createTesseractProvider: () => createTesseractProvider
2420
- });
2421
- import { createWorker } from "tesseract.js";
2422
- async function createTesseractProvider() {
2423
- const worker = await createWorker("kor+eng");
2424
- let terminated = false;
2425
- const provider = async (pageImage, _pageNumber, _mimeType) => {
2426
- const { data } = await worker.recognize(pageImage);
2427
- return data.text;
2428
- };
2429
- provider.terminate = async () => {
2430
- if (!terminated) {
2431
- await worker.terminate();
2432
- terminated = true;
2433
- }
2434
- };
2435
- return provider;
2436
- }
2437
- async function createTesseractPoolProvider(concurrency) {
2438
- const workers = await Promise.all(
2439
- Array.from({ length: concurrency }, () => createWorker("kor+eng"))
2440
- );
2441
- const idle = [...workers];
2442
- const waitQueue = [];
2443
- function acquire() {
2444
- if (idle.length > 0) return Promise.resolve(idle.pop());
2445
- return new Promise((resolve4) => waitQueue.push(resolve4));
2446
- }
2447
- function release(w) {
2448
- if (waitQueue.length > 0) {
2449
- waitQueue.shift()(w);
2450
- } else {
2451
- idle.push(w);
2452
- }
2453
- }
2454
- const provider = async (pageImage, _pageNumber, _mimeType) => {
2455
- const w = await acquire();
2456
- try {
2457
- const { data } = await w.recognize(pageImage);
2458
- return data.text;
2459
- } finally {
2460
- release(w);
2461
- }
2462
- };
2463
- provider.terminate = async () => {
2464
- await Promise.all(workers.map((w) => w.terminate()));
2465
- };
2466
- return provider;
2467
- }
2468
- var init_tesseract_provider = __esm({
2469
- "src/ocr/tesseract-provider.ts"() {
2470
- "use strict";
2471
- }
2472
- });
2473
-
2474
2414
  // src/ocr/batch-provider.ts
2475
2415
  var batch_provider_exports = {};
2476
2416
  __export(batch_provider_exports, {
@@ -2679,15 +2619,6 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
2679
2619
  }
2680
2620
  if (mode !== "auto") {
2681
2621
  validateOcrMode(mode);
2682
- if (mode === "tesseract") {
2683
- const { createTesseractProvider: createTesseractProvider2, createTesseractPoolProvider: createTesseractPoolProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
2684
- if (concurrency && concurrency > 1) {
2685
- logger.log({ level: "info", event: "done", message: "Tesseract pool provider \uC120\uD0DD", meta: { concurrency } });
2686
- return createTesseractPoolProvider2(concurrency);
2687
- }
2688
- logger.log({ level: "info", event: "done", message: "Tesseract single provider \uC120\uD0DD" });
2689
- return createTesseractProvider2();
2690
- }
2691
2622
  if (mode === "gemini" || mode === "claude" || mode === "codex") {
2692
2623
  const { createBatchCliProvider: createBatchCliProvider2, DEFAULT_BATCH_SIZES: DEFAULT_BATCH_SIZES2 } = await Promise.resolve().then(() => (init_batch_provider(), batch_provider_exports));
2693
2624
  const effectiveBatch = batchSize ?? DEFAULT_BATCH_SIZES2[mode];
@@ -2703,27 +2634,16 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
2703
2634
  }
2704
2635
  const detected = detectAvailableOcr();
2705
2636
  logger.log({ level: "info", event: "progress", message: "OCR auto \uAC10\uC9C0 \uACB0\uACFC", meta: { detected } });
2706
- if (detected !== "codex") {
2707
- if (detected === "tesseract") {
2708
- warnings?.push({
2709
- message: getTesseractFallbackMessage(),
2710
- code: "OCR_CLI_FALLBACK"
2711
- });
2712
- } else {
2713
- warnings?.push({
2714
- message: `OCR: '${detected}' \uC0AC\uC6A9 \uC911 (codex CLI\uAC00 \uC5C6\uC5B4 fallback). \uB354 \uB098\uC740 \uD488\uC9C8\uC744 \uC704\uD574 codex CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4.`,
2715
- code: "OCR_CLI_FALLBACK"
2716
- });
2717
- }
2637
+ if (!detected) {
2638
+ throw new Error(
2639
+ "\uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR CLI\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4. \uB2E4\uC74C \uC911 \uD558\uB098\uB97C \uC124\uCE58\uD558\uC138\uC694:\n Codex CLI: npm install -g @openai/codex\n Claude CLI: npm install -g @anthropic-ai/claude-code\n Gemini CLI: https://ai.google.dev/gemini-api/docs/cli"
2640
+ );
2718
2641
  }
2719
- if (detected === "tesseract") {
2720
- const { createTesseractProvider: createTesseractProvider2, createTesseractPoolProvider: createTesseractPoolProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
2721
- if (concurrency && concurrency > 1) {
2722
- logger.log({ level: "info", event: "done", message: "AUTO: Tesseract pool provider \uC120\uD0DD", meta: { concurrency } });
2723
- return createTesseractPoolProvider2(concurrency);
2724
- }
2725
- logger.log({ level: "info", event: "done", message: "AUTO: Tesseract single provider \uC120\uD0DD" });
2726
- return createTesseractProvider2();
2642
+ if (detected !== "codex") {
2643
+ warnings?.push({
2644
+ message: `OCR: '${detected}' \uC0AC\uC6A9 \uC911 (codex CLI\uAC00 \uC5C6\uC5B4 fallback). \uB354 \uB098\uC740 \uD488\uC9C8\uC744 \uC704\uD574 codex CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4.`,
2645
+ code: "OCR_CLI_FALLBACK"
2646
+ });
2727
2647
  }
2728
2648
  if (detected === "gemini" || detected === "codex" || detected === "claude") {
2729
2649
  const { createBatchCliProvider: createBatchCliProvider2, DEFAULT_BATCH_SIZES: DEFAULT_BATCH_SIZES2 } = await Promise.resolve().then(() => (init_batch_provider(), batch_provider_exports));
@@ -3115,7 +3035,7 @@ import JSZip2 from "jszip";
3115
3035
  import { DOMParser } from "@xmldom/xmldom";
3116
3036
 
3117
3037
  // src/utils.ts
3118
- var VERSION = true ? "2.4.17" : "0.0.0-dev";
3038
+ var VERSION = true ? "2.4.19" : "0.0.0-dev";
3119
3039
  function toArrayBuffer(buf) {
3120
3040
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
3121
3041
  return buf.buffer;
@@ -8711,7 +8631,6 @@ async function parseDocxDocument(buffer, options, existingZip) {
8711
8631
 
8712
8632
  // src/index.ts
8713
8633
  init_cli_provider();
8714
- init_tesseract_provider();
8715
8634
  init_markdown_to_blocks();
8716
8635
  init_logger();
8717
8636
 
@@ -11346,17 +11265,39 @@ var DEFAULT_STAGE_WEIGHTS = {
11346
11265
  render: 20,
11347
11266
  probe: 5,
11348
11267
  ocr: 45,
11349
- proofread: 10,
11268
+ proofread: 0,
11350
11269
  merge: 5
11351
11270
  };
11352
- var OCR_PROMPT2 = "Extract all text and tables from this image exactly as-is into Markdown. Do not summarize, infer, or alter the content in any way.";
11353
- var PROOFREAD_PROMPT = [
11354
- "Perform non-destructive proofreading only on the Markdown below.",
11355
- "Rules:",
11356
- "- Do not add, remove, or infer any facts",
11357
- "- Do not change numbers, units, or proper nouns",
11358
- "- Correct only typos, spacing, line breaks, and Markdown structure",
11359
- "- Output the corrected Markdown body only"
11271
+ var OCR_PROMPT2 = [
11272
+ "\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD45C\uB97C \uCD94\uCD9C\uD558\uC5EC Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uACE0, OCR \uC624\uC778\uC2DD \uC624\uB958\uB97C \uC989\uC2DC \uAD50\uC815\uD558\uC5EC \uCD5C\uC885 \uACB0\uACFC\uBB3C\uC744 \uCD9C\uB825\uD558\uC138\uC694.",
11273
+ "",
11274
+ "\uCD94\uCD9C \uADDC\uCE59:",
11275
+ "- \uD14D\uC2A4\uD2B8, \uD45C, \uC81C\uBAA9, \uB9AC\uC2A4\uD2B8\uB97C \uC6D0\uBB38 \uAD6C\uC870 \uADF8\uB300\uB85C Markdown\uC73C\uB85C \uBCC0\uD658",
11276
+ "- \uD45C\uB294 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)",
11277
+ "- \uD5E4\uB529\uC740 \uC2DC\uAC01\uC801 \uD06C\uAE30\uC5D0 \uB530\uB77C # ~ ###### \uC0AC\uC6A9",
11278
+ "- \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9",
11279
+ "- \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC",
11280
+ "- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0",
11281
+ "",
11282
+ "\uC808\uB300 \uAE08\uC9C0 \uC0AC\uD56D:",
11283
+ "- \uBB38\uC7A5\xB7\uB2E8\uB77D\xB7\uD56D\uBAA9\uC744 \uCD94\uAC00\uD558\uAC70\uB098 \uC0AD\uC81C\uD558\uC9C0 \uB9D0 \uAC83",
11284
+ "- \uC22B\uC790, \uD37C\uC13C\uD2B8, \uB0A0\uC9DC, \uB2E8\uC704, \uAE08\uC561\uC744 \uC808\uB300 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83",
11285
+ "- \uACE0\uC720\uBA85\uC0AC, \uAE30\uAD00\uBA85, \uBC95\uB839\uBA85, \uC9C0\uBA85\uC744 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83",
11286
+ "- \uD45C\uC758 \uC81C\uBAA9\uC744 \uBCC0\uACBD \uB610\uB294 \uC0AD\uC81C\uD558\uC9C0 \uB9D0 \uAC83",
11287
+ "- \uD45C\uC758 \uD589\xB7\uC5F4 \uC218, \uC140 \uB0B4\uC6A9, \uD5E4\uB354\uB97C \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83",
11288
+ "- \uC81C\uBAA9 \uC218\uC900(#, ##, ### \uB4F1)\uC744 \uC784\uC758\uB85C \uBC14\uAFB8\uC9C0 \uB9D0 \uAC83",
11289
+ "- \uC6D0\uBB38\uC5D0 \uC5C6\uB294 \uB0B4\uC6A9\uC744 \uC694\uC57D\xB7\uBCF4\uC644\xB7\uCD94\uB860\uD558\uC9C0 \uB9D0 \uAC83",
11290
+ "- ` ``` `\uB85C \uAC10\uC2F8\uAC70\uB098 \uC124\uBA85 \uD14D\uC2A4\uD2B8\uB97C \uCD94\uAC00\uD558\uC9C0 \uB9D0 \uAC83",
11291
+ "",
11292
+ "\uD5C8\uC6A9\uB418\uB294 \uAD50\uC815 \uBC94\uC704 (OCR \uC624\uC778\uC2DD \uC218\uC815):",
11293
+ "- \uBA85\uBC31\uD55C \uAE00\uC790 \uC624\uC778\uC2DD \uC218\uC815 (\uC608: '0' \u2192 'O', 'l' \u2192 '1' \uB4F1 \uB9E5\uB77D\uC0C1 \uBA85\uD655\uD55C \uACBD\uC6B0\uB9CC)",
11294
+ "- \uB2E8\uC5B4 \uC911\uAC04\uC5D0 \uC798\uBABB \uC0BD\uC785\uB41C \uACF5\uBC31 \uC81C\uAC70",
11295
+ "- \uC904\uBC14\uAFC8 \uC624\uB958\uB85C \uBD84\uB9AC\uB41C \uBB38\uC7A5 \uBCD1\uD569 (\uC758\uBBF8 \uB2E8\uC704 \uAE30\uC900)",
11296
+ "- Markdown \uBB38\uBC95 \uC624\uB958 \uC218\uC815 (\uD45C \uAD6C\uBD84\uC120 \uB204\uB77D, \uB9AC\uC2A4\uD2B8 \uB4E4\uC5EC\uC4F0\uAE30 \uB4F1)",
11297
+ "",
11298
+ "\uCD9C\uB825 \uADDC\uCE59:",
11299
+ "- \uBCC0\uD658\uB41C Markdown \uBCF8\uBB38\uB9CC \uCD9C\uB825\uD560 \uAC83 (\uC124\uBA85, \uC8FC\uC11D, \uBA54\uD0C0 \uD14D\uC2A4\uD2B8 \uC5C6\uC774)",
11300
+ "- \uD655\uC2E4\uD558\uC9C0 \uC54A\uC73C\uBA74 \uC6D0\uBB38\uC744 \uADF8\uB300\uB85C \uC720\uC9C0\uD560 \uAC83"
11360
11301
  ].join("\n");
11361
11302
  function elapsedMs(startAt) {
11362
11303
  return Math.round(performance.now() - startAt);
@@ -11367,7 +11308,6 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11367
11308
  const workspaceDir = resolve3(options.workspaceDir ?? join4(dirname3(absInput), `${stem}_ocr_workspace`));
11368
11309
  const imagesDir = join4(workspaceDir, "images");
11369
11310
  const rawDir = join4(workspaceDir, "ocr", "raw");
11370
- const proofDir = join4(workspaceDir, "ocr", "proofread");
11371
11311
  const diffDir = join4(workspaceDir, "ocr", "diff");
11372
11312
  const outputPath = resolve3(options.outputPath ?? join4(dirname3(absInput), `${stem}.md`));
11373
11313
  const reportPath = join4(workspaceDir, "run-report.json");
@@ -11387,7 +11327,6 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11387
11327
  const logger = (options.logger ?? createLoggerFromEnv()).withRun(runId).child({ component: "pipeline/unified-ocr.ts" });
11388
11328
  await mkdir(imagesDir, { recursive: true });
11389
11329
  await mkdir(rawDir, { recursive: true });
11390
- await mkdir(proofDir, { recursive: true });
11391
11330
  await mkdir(diffDir, { recursive: true });
11392
11331
  const timingsMs = {};
11393
11332
  const markStageStart = (stage, message) => emitProgress(options.onEvent, stage, 0, stageWeights, { message, type: "stage_start" });
@@ -11502,50 +11441,11 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11502
11441
  timingsMs.ocr = elapsedMs(ocrStart);
11503
11442
  markStageDone("ocr", "OCR \uC644\uB8CC");
11504
11443
  logStage("info", "ocr", "done", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { elapsedMs: timingsMs.ocr });
11505
- const proofStart = performance.now();
11506
- currentStage = "proofread";
11507
- markStageStart("proofread", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC9C4\uD589 \uC911");
11508
- logStage("info", "proofread", "start", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC2DC\uC791", { pages: rawPagePaths.length });
11509
- const proofedPaths = [];
11510
- for (let i = 0; i < rawPagePaths.length; i++) {
11511
- const rawMd = await readFile(rawPagePaths[i], "utf-8");
11512
- const prompt = `${PROOFREAD_PROMPT}
11513
-
11514
- ---
11515
- ${rawMd}
11516
- ---`;
11517
- const corrected = await ocrImageViaNim({
11518
- textOnlyPrompt: prompt,
11519
- model: selectedModel,
11520
- maxTokens: modelMaxTokens[selectedModel] ?? 8192,
11521
- baseUrl,
11522
- keyPool,
11523
- timeoutMs,
11524
- maxRetries: maxRetriesPerPage,
11525
- logger,
11526
- stage: "proofread"
11527
- });
11528
- const safeCorrected = preserveNumericIntegrity(rawMd, corrected);
11529
- const taggedCorrected = addUncertainTag(rawMd, safeCorrected);
11530
- const pagePath = join4(proofDir, `page_${String(i + 1).padStart(4, "0")}.md`);
11531
- await writeFile(pagePath, taggedCorrected, "utf-8");
11532
- await writeFile(
11533
- join4(diffDir, `page_${String(i + 1).padStart(4, "0")}.json`),
11534
- JSON.stringify(buildDiffSummary(rawMd, taggedCorrected), null, 2),
11535
- "utf-8"
11536
- );
11537
- proofedPaths.push(pagePath);
11538
- markStageProgress("proofread", Math.round((i + 1) / rawPagePaths.length * 100), i + 1, rawPagePaths.length, `\uAD50\uC815 ${i + 1}/${rawPagePaths.length}`);
11539
- logStage("debug", "proofread", "progress", "\uD398\uC774\uC9C0 \uAD50\uC815 \uC644\uB8CC", { page: i + 1, total: rawPagePaths.length });
11540
- }
11541
- timingsMs.proofread = elapsedMs(proofStart);
11542
- markStageDone("proofread", "\uAD50\uC815 \uC644\uB8CC");
11543
- logStage("info", "proofread", "done", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC644\uB8CC", { elapsedMs: timingsMs.proofread });
11544
11444
  const mergeStart = performance.now();
11545
11445
  currentStage = "merge";
11546
11446
  markStageStart("merge", "\uCD5C\uC885 Markdown \uBCD1\uD569 \uC911");
11547
- logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages: proofedPaths.length });
11548
- const merged = await mergeMarkdownPages(proofedPaths);
11447
+ logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages: rawPagePaths.length });
11448
+ const merged = await mergeMarkdownPages(rawPagePaths);
11549
11449
  await writeFile(outputPath, merged, "utf-8");
11550
11450
  timingsMs.merge = elapsedMs(mergeStart);
11551
11451
  markStageDone("merge", "\uBCD1\uD569 \uC644\uB8CC");
@@ -12004,40 +11904,6 @@ function ensureSupportedInput(path) {
12004
11904
  throw new UnifiedOcrError("UNSUPPORTED_INPUT", "convert", `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uC785\uB825 \uD3EC\uB9F7: ${ext}`);
12005
11905
  }
12006
11906
  }
12007
- function extractNumericTokens(text) {
12008
- return text.match(/\d[\d,./-]*/g) ?? [];
12009
- }
12010
- function preserveNumericIntegrity(rawText, correctedText) {
12011
- const rawTokens = extractNumericTokens(rawText);
12012
- const correctedTokens = extractNumericTokens(correctedText);
12013
- if (rawTokens.length !== correctedTokens.length) return rawText;
12014
- for (let i = 0; i < rawTokens.length; i++) {
12015
- if (rawTokens[i] !== correctedTokens[i]) return rawText;
12016
- }
12017
- return correctedText;
12018
- }
12019
- function addUncertainTag(rawText, correctedText) {
12020
- if (correctedText.includes("[\uD655\uC778\uD544\uC694:")) return correctedText;
12021
- const rawLen = rawText.trim().length;
12022
- const corrLen = correctedText.trim().length;
12023
- if (rawLen === 0 || corrLen === 0) return correctedText;
12024
- const rawLines = rawText.split("\n").filter(Boolean).length;
12025
- const corrLines = correctedText.split("\n").filter(Boolean).length;
12026
- const rawTableLines = rawText.split("\n").filter((l) => l.includes("|")).length;
12027
- const corrTableLines = correctedText.split("\n").filter((l) => l.includes("|")).length;
12028
- const suspicious = corrLen < rawLen * 0.75 || corrLines < Math.max(1, Math.floor(rawLines * 0.8)) || rawTableLines >= 2 && corrTableLines < Math.floor(rawTableLines * 0.7);
12029
- if (!suspicious) return correctedText;
12030
- return `${correctedText}
12031
-
12032
- [\uD655\uC778\uD544\uC694: \uAD50\uC815 \uACB0\uACFC\uAC00 \uCD95\uC57D\uB418\uC5C8\uC744 \uC218 \uC788\uC5B4 \uC6D0\uBB38\uACFC \uB300\uC870\uAC00 \uD544\uC694\uD569\uB2C8\uB2E4.]`;
12033
- }
12034
- function buildDiffSummary(before, after) {
12035
- return {
12036
- changed: before !== after,
12037
- beforeLength: before.length,
12038
- afterLength: after.length
12039
- };
12040
- }
12041
11907
  function normalizePipelineError(err, stage) {
12042
11908
  if (err instanceof UnifiedOcrError) return err;
12043
11909
  const message = err instanceof Error ? err.message : String(err);
@@ -12123,9 +11989,6 @@ async function parseImage(buffer, options) {
12123
11989
  if (ocrMode === "gemini" || ocrMode === "claude" || ocrMode === "codex" || ocrMode === "ollama") {
12124
11990
  ocrProvider = createCliOcrProvider(ocrMode);
12125
11991
  actualOcrMode = ocrMode;
12126
- } else if (ocrMode === "tesseract") {
12127
- ocrProvider = await createTesseractProvider();
12128
- actualOcrMode = ocrMode;
12129
11992
  } else if (ocrMode === "auto") {
12130
11993
  const modesToTry = ["gemini", "claude", "codex", "ollama"];
12131
11994
  for (const mode of modesToTry) {
@@ -12137,10 +12000,6 @@ async function parseImage(buffer, options) {
12137
12000
  console.warn(`[kordoc] OCR auto-detection: ${mode} CLI not available or failed. Trying next.`, e);
12138
12001
  }
12139
12002
  }
12140
- if (!ocrProvider) {
12141
- ocrProvider = await createTesseractProvider();
12142
- actualOcrMode = "tesseract";
12143
- }
12144
12003
  }
12145
12004
  if (!ocrProvider) {
12146
12005
  return { success: false, fileType: "image", error: "\uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR \uD504\uB85C\uBC14\uC774\uB354\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "PARSE_ERROR" };