@clazic/kordoc 2.4.17 → 2.4.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. package/dist/{auto-detect-2YGFYQCN.js → auto-detect-CBYICI6B.js} +4 -4
  2. package/dist/{chunk-WM3XI23V.js → chunk-463YQ2WL.js} +38 -25
  3. package/dist/chunk-463YQ2WL.js.map +1 -0
  4. package/dist/{chunk-7NOZFYH6.js → chunk-CLK4PNZ7.js} +7 -8
  5. package/dist/chunk-CLK4PNZ7.js.map +1 -0
  6. package/dist/{chunk-W2KDIKDF.js → chunk-MZN7PLTZ.js} +2 -2
  7. package/dist/{chunk-34WIGIQC.js → chunk-Y4WFKJ5P.js} +1 -1
  8. package/dist/chunk-Y4WFKJ5P.js.map +1 -0
  9. package/dist/cli.js +9 -13
  10. package/dist/cli.js.map +1 -1
  11. package/dist/index.cjs +49 -191
  12. package/dist/index.cjs.map +1 -1
  13. package/dist/index.d.cts +5 -6
  14. package/dist/index.d.ts +5 -6
  15. package/dist/index.js +49 -190
  16. package/dist/index.js.map +1 -1
  17. package/dist/mcp.js +5 -6
  18. package/dist/mcp.js.map +1 -1
  19. package/dist/{resolve-673XFZQ6.js → resolve-XWYJYKKH.js} +15 -36
  20. package/dist/resolve-XWYJYKKH.js.map +1 -0
  21. package/dist/{utils-DHOODYKU.js → utils-YUAT7LFD.js} +2 -2
  22. package/dist/{watch-RM4VNOL4.js → watch-WEOFVVDO.js} +5 -6
  23. package/dist/{watch-RM4VNOL4.js.map → watch-WEOFVVDO.js.map} +1 -1
  24. package/package.json +1 -2
  25. package/dist/chunk-34WIGIQC.js.map +0 -1
  26. package/dist/chunk-7FMKAV4P.js +0 -56
  27. package/dist/chunk-7FMKAV4P.js.map +0 -1
  28. package/dist/chunk-7NOZFYH6.js.map +0 -1
  29. package/dist/chunk-WM3XI23V.js.map +0 -1
  30. package/dist/resolve-673XFZQ6.js.map +0 -1
  31. package/dist/tesseract-provider-MNMZPSGF.js +0 -11
  32. package/dist/utils-DHOODYKU.js.map +0 -1
  33. /package/dist/{auto-detect-2YGFYQCN.js.map → auto-detect-CBYICI6B.js.map} +0 -0
  34. /package/dist/{chunk-W2KDIKDF.js.map → chunk-MZN7PLTZ.js.map} +0 -0
  35. /package/dist/{tesseract-provider-MNMZPSGF.js.map → utils-YUAT7LFD.js.map} +0 -0
package/dist/index.cjs CHANGED
@@ -2179,14 +2179,14 @@ var auto_detect_exports = {};
2179
2179
  __export(auto_detect_exports, {
2180
2180
  detectAvailableOcr: () => detectAvailableOcr,
2181
2181
  getAutoFallbackChain: () => getAutoFallbackChain,
2182
- getTesseractFallbackMessage: () => getTesseractFallbackMessage,
2182
+ getNoCliMessage: () => getNoCliMessage,
2183
2183
  validateOcrMode: () => validateOcrMode
2184
2184
  });
2185
2185
  function detectAvailableOcr() {
2186
2186
  for (const cli of CLI_PRIORITY) {
2187
2187
  if (isCliInstalled(cli)) return cli;
2188
2188
  }
2189
- return "tesseract";
2189
+ return null;
2190
2190
  }
2191
2191
  function isCliInstalled(name) {
2192
2192
  try {
@@ -2202,11 +2202,10 @@ function getAutoFallbackChain() {
2202
2202
  for (const cli of CLI_PRIORITY) {
2203
2203
  if (isCliInstalled(cli)) chain.push(cli);
2204
2204
  }
2205
- chain.push("tesseract");
2206
2205
  return chain;
2207
2206
  }
2208
2207
  function validateOcrMode(mode) {
2209
- if (mode === "auto" || mode === "off" || mode === "tesseract") return;
2208
+ if (mode === "auto" || mode === "off") return;
2210
2209
  if (!isCliInstalled(mode)) {
2211
2210
  throw new Error(`'${mode}' CLI\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4.
2212
2211
  ${getInstallGuide(mode)}`);
@@ -2221,10 +2220,10 @@ function getInstallGuide(mode) {
2221
2220
  };
2222
2221
  return guides[mode] || `'${mode}'\uC744(\uB97C) \uC124\uCE58\uD574\uC8FC\uC138\uC694.`;
2223
2222
  }
2224
- function getTesseractFallbackMessage() {
2223
+ function getNoCliMessage() {
2225
2224
  return [
2226
- "\uC124\uCE58\uB41C AI CLI\uAC00 \uC5C6\uC5B4 \uB0B4\uC7A5 tesseract.js\uB85C OCR\uC744 \uC218\uD589\uD569\uB2C8\uB2E4.",
2227
- "\uB354 \uB098\uC740 \uD488\uC9C8(\uD14C\uC774\uBE14/\uD5E4\uB529 \uAD6C\uC870 \uBCF4\uC874)\uC744 \uC704\uD574 AI CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4:",
2225
+ "\uC124\uCE58\uB41C AI CLI\uAC00 \uC5C6\uC5B4 OCR\uC744 \uC218\uD589\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.",
2226
+ "\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF \uCC98\uB9AC\uB97C \uC704\uD574 AI CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4:",
2228
2227
  "",
2229
2228
  " [\uAD8C\uC7A5] Codex CLI: npm install -g @openai/codex",
2230
2229
  " Gemini CLI: https://ai.google.dev/gemini-api/docs/cli",
@@ -2408,66 +2407,6 @@ var init_cli_provider = __esm({
2408
2407
  }
2409
2408
  });
2410
2409
 
2411
- // src/ocr/tesseract-provider.ts
2412
- var tesseract_provider_exports = {};
2413
- __export(tesseract_provider_exports, {
2414
- createTesseractPoolProvider: () => createTesseractPoolProvider,
2415
- createTesseractProvider: () => createTesseractProvider
2416
- });
2417
- async function createTesseractProvider() {
2418
- const worker = await (0, import_tesseract.createWorker)("kor+eng");
2419
- let terminated = false;
2420
- const provider = async (pageImage, _pageNumber, _mimeType) => {
2421
- const { data } = await worker.recognize(pageImage);
2422
- return data.text;
2423
- };
2424
- provider.terminate = async () => {
2425
- if (!terminated) {
2426
- await worker.terminate();
2427
- terminated = true;
2428
- }
2429
- };
2430
- return provider;
2431
- }
2432
- async function createTesseractPoolProvider(concurrency) {
2433
- const workers = await Promise.all(
2434
- Array.from({ length: concurrency }, () => (0, import_tesseract.createWorker)("kor+eng"))
2435
- );
2436
- const idle = [...workers];
2437
- const waitQueue = [];
2438
- function acquire() {
2439
- if (idle.length > 0) return Promise.resolve(idle.pop());
2440
- return new Promise((resolve4) => waitQueue.push(resolve4));
2441
- }
2442
- function release(w) {
2443
- if (waitQueue.length > 0) {
2444
- waitQueue.shift()(w);
2445
- } else {
2446
- idle.push(w);
2447
- }
2448
- }
2449
- const provider = async (pageImage, _pageNumber, _mimeType) => {
2450
- const w = await acquire();
2451
- try {
2452
- const { data } = await w.recognize(pageImage);
2453
- return data.text;
2454
- } finally {
2455
- release(w);
2456
- }
2457
- };
2458
- provider.terminate = async () => {
2459
- await Promise.all(workers.map((w) => w.terminate()));
2460
- };
2461
- return provider;
2462
- }
2463
- var import_tesseract;
2464
- var init_tesseract_provider = __esm({
2465
- "src/ocr/tesseract-provider.ts"() {
2466
- "use strict";
2467
- import_tesseract = require("tesseract.js");
2468
- }
2469
- });
2470
-
2471
2410
  // src/ocr/batch-provider.ts
2472
2411
  var batch_provider_exports = {};
2473
2412
  __export(batch_provider_exports, {
@@ -2676,15 +2615,6 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
2676
2615
  }
2677
2616
  if (mode !== "auto") {
2678
2617
  validateOcrMode(mode);
2679
- if (mode === "tesseract") {
2680
- const { createTesseractProvider: createTesseractProvider2, createTesseractPoolProvider: createTesseractPoolProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
2681
- if (concurrency && concurrency > 1) {
2682
- logger.log({ level: "info", event: "done", message: "Tesseract pool provider \uC120\uD0DD", meta: { concurrency } });
2683
- return createTesseractPoolProvider2(concurrency);
2684
- }
2685
- logger.log({ level: "info", event: "done", message: "Tesseract single provider \uC120\uD0DD" });
2686
- return createTesseractProvider2();
2687
- }
2688
2618
  if (mode === "gemini" || mode === "claude" || mode === "codex") {
2689
2619
  const { createBatchCliProvider: createBatchCliProvider2, DEFAULT_BATCH_SIZES: DEFAULT_BATCH_SIZES2 } = await Promise.resolve().then(() => (init_batch_provider(), batch_provider_exports));
2690
2620
  const effectiveBatch = batchSize ?? DEFAULT_BATCH_SIZES2[mode];
@@ -2700,27 +2630,16 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
2700
2630
  }
2701
2631
  const detected = detectAvailableOcr();
2702
2632
  logger.log({ level: "info", event: "progress", message: "OCR auto \uAC10\uC9C0 \uACB0\uACFC", meta: { detected } });
2703
- if (detected !== "codex") {
2704
- if (detected === "tesseract") {
2705
- warnings?.push({
2706
- message: getTesseractFallbackMessage(),
2707
- code: "OCR_CLI_FALLBACK"
2708
- });
2709
- } else {
2710
- warnings?.push({
2711
- message: `OCR: '${detected}' \uC0AC\uC6A9 \uC911 (codex CLI\uAC00 \uC5C6\uC5B4 fallback). \uB354 \uB098\uC740 \uD488\uC9C8\uC744 \uC704\uD574 codex CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4.`,
2712
- code: "OCR_CLI_FALLBACK"
2713
- });
2714
- }
2633
+ if (!detected) {
2634
+ throw new Error(
2635
+ "\uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR CLI\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4. \uB2E4\uC74C \uC911 \uD558\uB098\uB97C \uC124\uCE58\uD558\uC138\uC694:\n Codex CLI: npm install -g @openai/codex\n Claude CLI: npm install -g @anthropic-ai/claude-code\n Gemini CLI: https://ai.google.dev/gemini-api/docs/cli"
2636
+ );
2715
2637
  }
2716
- if (detected === "tesseract") {
2717
- const { createTesseractProvider: createTesseractProvider2, createTesseractPoolProvider: createTesseractPoolProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
2718
- if (concurrency && concurrency > 1) {
2719
- logger.log({ level: "info", event: "done", message: "AUTO: Tesseract pool provider \uC120\uD0DD", meta: { concurrency } });
2720
- return createTesseractPoolProvider2(concurrency);
2721
- }
2722
- logger.log({ level: "info", event: "done", message: "AUTO: Tesseract single provider \uC120\uD0DD" });
2723
- return createTesseractProvider2();
2638
+ if (detected !== "codex") {
2639
+ warnings?.push({
2640
+ message: `OCR: '${detected}' \uC0AC\uC6A9 \uC911 (codex CLI\uAC00 \uC5C6\uC5B4 fallback). \uB354 \uB098\uC740 \uD488\uC9C8\uC744 \uC704\uD574 codex CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4.`,
2641
+ code: "OCR_CLI_FALLBACK"
2642
+ });
2724
2643
  }
2725
2644
  if (detected === "gemini" || detected === "codex" || detected === "claude") {
2726
2645
  const { createBatchCliProvider: createBatchCliProvider2, DEFAULT_BATCH_SIZES: DEFAULT_BATCH_SIZES2 } = await Promise.resolve().then(() => (init_batch_provider(), batch_provider_exports));
@@ -3138,7 +3057,7 @@ var import_jszip2 = __toESM(require("jszip"), 1);
3138
3057
  var import_xmldom = require("@xmldom/xmldom");
3139
3058
 
3140
3059
  // src/utils.ts
3141
- var VERSION = true ? "2.4.17" : "0.0.0-dev";
3060
+ var VERSION = true ? "2.4.19" : "0.0.0-dev";
3142
3061
  function toArrayBuffer(buf) {
3143
3062
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
3144
3063
  return buf.buffer;
@@ -8734,7 +8653,6 @@ async function parseDocxDocument(buffer, options, existingZip) {
8734
8653
 
8735
8654
  // src/index.ts
8736
8655
  init_cli_provider();
8737
- init_tesseract_provider();
8738
8656
  init_markdown_to_blocks();
8739
8657
  init_logger();
8740
8658
 
@@ -11369,17 +11287,39 @@ var DEFAULT_STAGE_WEIGHTS = {
11369
11287
  render: 20,
11370
11288
  probe: 5,
11371
11289
  ocr: 45,
11372
- proofread: 10,
11290
+ proofread: 0,
11373
11291
  merge: 5
11374
11292
  };
11375
- var OCR_PROMPT2 = "Extract all text and tables from this image exactly as-is into Markdown. Do not summarize, infer, or alter the content in any way.";
11376
- var PROOFREAD_PROMPT = [
11377
- "Perform non-destructive proofreading only on the Markdown below.",
11378
- "Rules:",
11379
- "- Do not add, remove, or infer any facts",
11380
- "- Do not change numbers, units, or proper nouns",
11381
- "- Correct only typos, spacing, line breaks, and Markdown structure",
11382
- "- Output the corrected Markdown body only"
11293
+ var OCR_PROMPT2 = [
11294
+ "\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD45C\uB97C \uCD94\uCD9C\uD558\uC5EC Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uACE0, OCR \uC624\uC778\uC2DD \uC624\uB958\uB97C \uC989\uC2DC \uAD50\uC815\uD558\uC5EC \uCD5C\uC885 \uACB0\uACFC\uBB3C\uC744 \uCD9C\uB825\uD558\uC138\uC694.",
11295
+ "",
11296
+ "\uCD94\uCD9C \uADDC\uCE59:",
11297
+ "- \uD14D\uC2A4\uD2B8, \uD45C, \uC81C\uBAA9, \uB9AC\uC2A4\uD2B8\uB97C \uC6D0\uBB38 \uAD6C\uC870 \uADF8\uB300\uB85C Markdown\uC73C\uB85C \uBCC0\uD658",
11298
+ "- \uD45C\uB294 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)",
11299
+ "- \uD5E4\uB529\uC740 \uC2DC\uAC01\uC801 \uD06C\uAE30\uC5D0 \uB530\uB77C # ~ ###### \uC0AC\uC6A9",
11300
+ "- \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9",
11301
+ "- \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC",
11302
+ "- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0",
11303
+ "",
11304
+ "\uC808\uB300 \uAE08\uC9C0 \uC0AC\uD56D:",
11305
+ "- \uBB38\uC7A5\xB7\uB2E8\uB77D\xB7\uD56D\uBAA9\uC744 \uCD94\uAC00\uD558\uAC70\uB098 \uC0AD\uC81C\uD558\uC9C0 \uB9D0 \uAC83",
11306
+ "- \uC22B\uC790, \uD37C\uC13C\uD2B8, \uB0A0\uC9DC, \uB2E8\uC704, \uAE08\uC561\uC744 \uC808\uB300 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83",
11307
+ "- \uACE0\uC720\uBA85\uC0AC, \uAE30\uAD00\uBA85, \uBC95\uB839\uBA85, \uC9C0\uBA85\uC744 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83",
11308
+ "- \uD45C\uC758 \uC81C\uBAA9\uC744 \uBCC0\uACBD \uB610\uB294 \uC0AD\uC81C\uD558\uC9C0 \uB9D0 \uAC83",
11309
+ "- \uD45C\uC758 \uD589\xB7\uC5F4 \uC218, \uC140 \uB0B4\uC6A9, \uD5E4\uB354\uB97C \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83",
11310
+ "- \uC81C\uBAA9 \uC218\uC900(#, ##, ### \uB4F1)\uC744 \uC784\uC758\uB85C \uBC14\uAFB8\uC9C0 \uB9D0 \uAC83",
11311
+ "- \uC6D0\uBB38\uC5D0 \uC5C6\uB294 \uB0B4\uC6A9\uC744 \uC694\uC57D\xB7\uBCF4\uC644\xB7\uCD94\uB860\uD558\uC9C0 \uB9D0 \uAC83",
11312
+ "- ` ``` `\uB85C \uAC10\uC2F8\uAC70\uB098 \uC124\uBA85 \uD14D\uC2A4\uD2B8\uB97C \uCD94\uAC00\uD558\uC9C0 \uB9D0 \uAC83",
11313
+ "",
11314
+ "\uD5C8\uC6A9\uB418\uB294 \uAD50\uC815 \uBC94\uC704 (OCR \uC624\uC778\uC2DD \uC218\uC815):",
11315
+ "- \uBA85\uBC31\uD55C \uAE00\uC790 \uC624\uC778\uC2DD \uC218\uC815 (\uC608: '0' \u2192 'O', 'l' \u2192 '1' \uB4F1 \uB9E5\uB77D\uC0C1 \uBA85\uD655\uD55C \uACBD\uC6B0\uB9CC)",
11316
+ "- \uB2E8\uC5B4 \uC911\uAC04\uC5D0 \uC798\uBABB \uC0BD\uC785\uB41C \uACF5\uBC31 \uC81C\uAC70",
11317
+ "- \uC904\uBC14\uAFC8 \uC624\uB958\uB85C \uBD84\uB9AC\uB41C \uBB38\uC7A5 \uBCD1\uD569 (\uC758\uBBF8 \uB2E8\uC704 \uAE30\uC900)",
11318
+ "- Markdown \uBB38\uBC95 \uC624\uB958 \uC218\uC815 (\uD45C \uAD6C\uBD84\uC120 \uB204\uB77D, \uB9AC\uC2A4\uD2B8 \uB4E4\uC5EC\uC4F0\uAE30 \uB4F1)",
11319
+ "",
11320
+ "\uCD9C\uB825 \uADDC\uCE59:",
11321
+ "- \uBCC0\uD658\uB41C Markdown \uBCF8\uBB38\uB9CC \uCD9C\uB825\uD560 \uAC83 (\uC124\uBA85, \uC8FC\uC11D, \uBA54\uD0C0 \uD14D\uC2A4\uD2B8 \uC5C6\uC774)",
11322
+ "- \uD655\uC2E4\uD558\uC9C0 \uC54A\uC73C\uBA74 \uC6D0\uBB38\uC744 \uADF8\uB300\uB85C \uC720\uC9C0\uD560 \uAC83"
11383
11323
  ].join("\n");
11384
11324
  function elapsedMs(startAt) {
11385
11325
  return Math.round(import_node_perf_hooks.performance.now() - startAt);
@@ -11390,7 +11330,6 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11390
11330
  const workspaceDir = (0, import_path5.resolve)(options.workspaceDir ?? (0, import_path5.join)((0, import_path5.dirname)(absInput), `${stem}_ocr_workspace`));
11391
11331
  const imagesDir = (0, import_path5.join)(workspaceDir, "images");
11392
11332
  const rawDir = (0, import_path5.join)(workspaceDir, "ocr", "raw");
11393
- const proofDir = (0, import_path5.join)(workspaceDir, "ocr", "proofread");
11394
11333
  const diffDir = (0, import_path5.join)(workspaceDir, "ocr", "diff");
11395
11334
  const outputPath = (0, import_path5.resolve)(options.outputPath ?? (0, import_path5.join)((0, import_path5.dirname)(absInput), `${stem}.md`));
11396
11335
  const reportPath = (0, import_path5.join)(workspaceDir, "run-report.json");
@@ -11410,7 +11349,6 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11410
11349
  const logger = (options.logger ?? createLoggerFromEnv()).withRun(runId).child({ component: "pipeline/unified-ocr.ts" });
11411
11350
  await (0, import_promises2.mkdir)(imagesDir, { recursive: true });
11412
11351
  await (0, import_promises2.mkdir)(rawDir, { recursive: true });
11413
- await (0, import_promises2.mkdir)(proofDir, { recursive: true });
11414
11352
  await (0, import_promises2.mkdir)(diffDir, { recursive: true });
11415
11353
  const timingsMs = {};
11416
11354
  const markStageStart = (stage, message) => emitProgress(options.onEvent, stage, 0, stageWeights, { message, type: "stage_start" });
@@ -11525,50 +11463,11 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11525
11463
  timingsMs.ocr = elapsedMs(ocrStart);
11526
11464
  markStageDone("ocr", "OCR \uC644\uB8CC");
11527
11465
  logStage("info", "ocr", "done", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { elapsedMs: timingsMs.ocr });
11528
- const proofStart = import_node_perf_hooks.performance.now();
11529
- currentStage = "proofread";
11530
- markStageStart("proofread", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC9C4\uD589 \uC911");
11531
- logStage("info", "proofread", "start", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC2DC\uC791", { pages: rawPagePaths.length });
11532
- const proofedPaths = [];
11533
- for (let i = 0; i < rawPagePaths.length; i++) {
11534
- const rawMd = await (0, import_promises2.readFile)(rawPagePaths[i], "utf-8");
11535
- const prompt = `${PROOFREAD_PROMPT}
11536
-
11537
- ---
11538
- ${rawMd}
11539
- ---`;
11540
- const corrected = await ocrImageViaNim({
11541
- textOnlyPrompt: prompt,
11542
- model: selectedModel,
11543
- maxTokens: modelMaxTokens[selectedModel] ?? 8192,
11544
- baseUrl,
11545
- keyPool,
11546
- timeoutMs,
11547
- maxRetries: maxRetriesPerPage,
11548
- logger,
11549
- stage: "proofread"
11550
- });
11551
- const safeCorrected = preserveNumericIntegrity(rawMd, corrected);
11552
- const taggedCorrected = addUncertainTag(rawMd, safeCorrected);
11553
- const pagePath = (0, import_path5.join)(proofDir, `page_${String(i + 1).padStart(4, "0")}.md`);
11554
- await (0, import_promises2.writeFile)(pagePath, taggedCorrected, "utf-8");
11555
- await (0, import_promises2.writeFile)(
11556
- (0, import_path5.join)(diffDir, `page_${String(i + 1).padStart(4, "0")}.json`),
11557
- JSON.stringify(buildDiffSummary(rawMd, taggedCorrected), null, 2),
11558
- "utf-8"
11559
- );
11560
- proofedPaths.push(pagePath);
11561
- markStageProgress("proofread", Math.round((i + 1) / rawPagePaths.length * 100), i + 1, rawPagePaths.length, `\uAD50\uC815 ${i + 1}/${rawPagePaths.length}`);
11562
- logStage("debug", "proofread", "progress", "\uD398\uC774\uC9C0 \uAD50\uC815 \uC644\uB8CC", { page: i + 1, total: rawPagePaths.length });
11563
- }
11564
- timingsMs.proofread = elapsedMs(proofStart);
11565
- markStageDone("proofread", "\uAD50\uC815 \uC644\uB8CC");
11566
- logStage("info", "proofread", "done", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC644\uB8CC", { elapsedMs: timingsMs.proofread });
11567
11466
  const mergeStart = import_node_perf_hooks.performance.now();
11568
11467
  currentStage = "merge";
11569
11468
  markStageStart("merge", "\uCD5C\uC885 Markdown \uBCD1\uD569 \uC911");
11570
- logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages: proofedPaths.length });
11571
- const merged = await mergeMarkdownPages(proofedPaths);
11469
+ logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages: rawPagePaths.length });
11470
+ const merged = await mergeMarkdownPages(rawPagePaths);
11572
11471
  await (0, import_promises2.writeFile)(outputPath, merged, "utf-8");
11573
11472
  timingsMs.merge = elapsedMs(mergeStart);
11574
11473
  markStageDone("merge", "\uBCD1\uD569 \uC644\uB8CC");
@@ -12027,40 +11926,6 @@ function ensureSupportedInput(path) {
12027
11926
  throw new UnifiedOcrError("UNSUPPORTED_INPUT", "convert", `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uC785\uB825 \uD3EC\uB9F7: ${ext}`);
12028
11927
  }
12029
11928
  }
12030
- function extractNumericTokens(text) {
12031
- return text.match(/\d[\d,./-]*/g) ?? [];
12032
- }
12033
- function preserveNumericIntegrity(rawText, correctedText) {
12034
- const rawTokens = extractNumericTokens(rawText);
12035
- const correctedTokens = extractNumericTokens(correctedText);
12036
- if (rawTokens.length !== correctedTokens.length) return rawText;
12037
- for (let i = 0; i < rawTokens.length; i++) {
12038
- if (rawTokens[i] !== correctedTokens[i]) return rawText;
12039
- }
12040
- return correctedText;
12041
- }
12042
- function addUncertainTag(rawText, correctedText) {
12043
- if (correctedText.includes("[\uD655\uC778\uD544\uC694:")) return correctedText;
12044
- const rawLen = rawText.trim().length;
12045
- const corrLen = correctedText.trim().length;
12046
- if (rawLen === 0 || corrLen === 0) return correctedText;
12047
- const rawLines = rawText.split("\n").filter(Boolean).length;
12048
- const corrLines = correctedText.split("\n").filter(Boolean).length;
12049
- const rawTableLines = rawText.split("\n").filter((l) => l.includes("|")).length;
12050
- const corrTableLines = correctedText.split("\n").filter((l) => l.includes("|")).length;
12051
- const suspicious = corrLen < rawLen * 0.75 || corrLines < Math.max(1, Math.floor(rawLines * 0.8)) || rawTableLines >= 2 && corrTableLines < Math.floor(rawTableLines * 0.7);
12052
- if (!suspicious) return correctedText;
12053
- return `${correctedText}
12054
-
12055
- [\uD655\uC778\uD544\uC694: \uAD50\uC815 \uACB0\uACFC\uAC00 \uCD95\uC57D\uB418\uC5C8\uC744 \uC218 \uC788\uC5B4 \uC6D0\uBB38\uACFC \uB300\uC870\uAC00 \uD544\uC694\uD569\uB2C8\uB2E4.]`;
12056
- }
12057
- function buildDiffSummary(before, after) {
12058
- return {
12059
- changed: before !== after,
12060
- beforeLength: before.length,
12061
- afterLength: after.length
12062
- };
12063
- }
12064
11929
  function normalizePipelineError(err, stage) {
12065
11930
  if (err instanceof UnifiedOcrError) return err;
12066
11931
  const message = err instanceof Error ? err.message : String(err);
@@ -12146,9 +12011,6 @@ async function parseImage(buffer, options) {
12146
12011
  if (ocrMode === "gemini" || ocrMode === "claude" || ocrMode === "codex" || ocrMode === "ollama") {
12147
12012
  ocrProvider = createCliOcrProvider(ocrMode);
12148
12013
  actualOcrMode = ocrMode;
12149
- } else if (ocrMode === "tesseract") {
12150
- ocrProvider = await createTesseractProvider();
12151
- actualOcrMode = ocrMode;
12152
12014
  } else if (ocrMode === "auto") {
12153
12015
  const modesToTry = ["gemini", "claude", "codex", "ollama"];
12154
12016
  for (const mode of modesToTry) {
@@ -12160,10 +12022,6 @@ async function parseImage(buffer, options) {
12160
12022
  console.warn(`[kordoc] OCR auto-detection: ${mode} CLI not available or failed. Trying next.`, e);
12161
12023
  }
12162
12024
  }
12163
- if (!ocrProvider) {
12164
- ocrProvider = await createTesseractProvider();
12165
- actualOcrMode = "tesseract";
12166
- }
12167
12025
  }
12168
12026
  if (!ocrProvider) {
12169
12027
  return { success: false, fileType: "image", error: "\uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR \uD504\uB85C\uBC14\uC774\uB354\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "PARSE_ERROR" };