@clazic/kordoc 2.4.3 → 2.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,63 @@
1
+ #!/usr/bin/env node
2
+
3
+ // src/ocr/auto-detect.ts
4
+ import { execSync } from "child_process";
5
+ var CLI_PRIORITY = ["codex", "gemini", "claude", "ollama"];
6
/**
 * Pick the OCR engine to use.
 *
 * Walks CLI_PRIORITY in order and stops at the first entry for which
 * isCliInstalled reports true.
 *
 * @returns The first installed CLI name from CLI_PRIORITY, or "tesseract"
 *   when none of them is installed.
 */
function detectAvailableOcr() {
  const firstInstalled = CLI_PRIORITY.find((candidate) => isCliInstalled(candidate));
  if (firstInstalled === undefined) {
    return "tesseract";
  }
  return firstInstalled;
}
12
/**
 * Report whether a command-line tool can be found on the current PATH.
 *
 * The lookup runs `where <name>` on win32 and `which <name>` elsewhere via
 * execSync, with all stdio ignored and a 3-second timeout.
 *
 * @param name Bare executable name to look up (for example "codex").
 * @returns true when the lookup command exits successfully; false when the
 *   name is not a plain executable name, or when the lookup fails or times out.
 */
function isCliInstalled(name) {
  // execSync passes the whole command string to a shell. Anything other than a
  // plain executable name (letters, digits, ".", "_", "+", "-", not starting
  // with a dash) is rejected up front so that shell metacharacters in `name`
  // can neither run extra commands nor fake a successful lookup.
  if (typeof name !== "string" || !/^[A-Za-z0-9][A-Za-z0-9._+-]*$/.test(name)) {
    return false;
  }
  const cmd = process.platform === "win32" ? "where" : "which";
  try {
    execSync(`${cmd} ${name}`, { stdio: "ignore", timeout: 3e3 });
    return true;
  } catch {
    // Non-zero exit, spawn failure and timeout all mean "not available".
    return false;
  }
}
21
/**
 * Build the ordered list of OCR engines to try.
 *
 * Keeps every CLI_PRIORITY entry for which isCliInstalled reports true, in
 * CLI_PRIORITY order, and always appends "tesseract" as the last entry.
 *
 * @returns A new array of engine names ending with "tesseract".
 */
function getAutoFallbackChain() {
  const installed = CLI_PRIORITY.filter((candidate) => isCliInstalled(candidate));
  return [...installed, "tesseract"];
}
29
/**
 * Check that an explicitly requested OCR mode can be used.
 *
 * "auto", "off" and "tesseract" are accepted without any lookup. Any other
 * mode is treated as a CLI name and checked with isCliInstalled.
 *
 * @param mode Requested OCR mode.
 * @throws Error when the mode names a CLI that isCliInstalled does not find;
 *   the message has the install hint from getInstallGuide on its second line.
 */
function validateOcrMode(mode) {
  const needsNoCli = mode === "auto" || mode === "off" || mode === "tesseract";
  if (needsNoCli || isCliInstalled(mode)) {
    return;
  }
  const messageLines = [
    `'${mode}' CLI\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4.`,
    getInstallGuide(mode)
  ];
  throw new Error(messageLines.join("\n"));
}
36
/**
 * Return the installation hint for a CLI name.
 *
 * @param mode CLI name to look up ("gemini", "claude", "codex" or "ollama"
 *   have dedicated hints).
 * @returns The dedicated hint for a known name, otherwise a generic
 *   "please install '<mode>'" message. Always a string.
 */
function getInstallGuide(mode) {
  const guides = {
    gemini: "\uC124\uCE58: https://ai.google.dev/gemini-api/docs/cli",
    claude: "\uC124\uCE58: npm install -g @anthropic-ai/claude-code \uB610\uB294 https://claude.ai/code",
    codex: "\uC124\uCE58: npm install -g @openai/codex \uB610\uB294 https://github.com/openai/codex",
    ollama: "\uC124\uCE58: brew install ollama \uB610\uB294 https://ollama.com/download"
  };
  // Own-property check: a bare guides[mode] lookup also resolves inherited
  // members such as "constructor" or "toString" and would hand back a
  // function instead of the generic message.
  if (Object.prototype.hasOwnProperty.call(guides, mode)) {
    return guides[mode];
  }
  return `'${mode}'\uC744(\uB97C) \uC124\uCE58\uD574\uC8FC\uC138\uC694.`;
}
45
/**
 * Build the multi-line notice about falling back to tesseract.js.
 *
 * The text consists of two introductory lines, one blank line, and four
 * install hints (Codex, Gemini, Claude, Ollama), separated by "\n".
 *
 * @returns The notice as a single newline-separated string.
 */
function getTesseractFallbackMessage() {
  const notice = "\uC124\uCE58\uB41C AI CLI\uAC00 \uC5C6\uC5B4 \uB0B4\uC7A5 tesseract.js\uB85C OCR\uC744 \uC218\uD589\uD569\uB2C8\uB2E4.";
  const recommendation = "\uB354 \uB098\uC740 \uD488\uC9C8(\uD14C\uC774\uBE14/\uD5E4\uB529 \uAD6C\uC870 \uBCF4\uC874)\uC744 \uC704\uD574 AI CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4:";
  const installHints = [
    " [\uAD8C\uC7A5] Codex CLI: npm install -g @openai/codex",
    " Gemini CLI: https://ai.google.dev/gemini-api/docs/cli",
    " Claude CLI: npm install -g @anthropic-ai/claude-code",
    " Ollama: brew install ollama (+ ollama pull gemma4:27b)"
  ];
  // The empty line between the introduction and the hints comes from the "\n\n".
  return `${notice}\n${recommendation}\n\n${installHints.join("\n")}`;
}
56
+
57
+ export {
58
+ detectAvailableOcr,
59
+ getAutoFallbackChain,
60
+ validateOcrMode,
61
+ getTesseractFallbackMessage
62
+ };
63
+ //# sourceMappingURL=chunk-7NOZFYH6.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/ocr/auto-detect.ts"],"sourcesContent":["/**\n * OCR CLI 자동 탐색\n *\n * 탐색 순서: codex → gemini → claude → ollama → tesseract.js\n * CLI는 which(unix) / where(win) 명령어로 PATH 존재 확인.\n * tesseract.js는 bundled 의존성이므로 항상 사용 가능 (최후 fallback).\n */\n\nimport { execSync } from \"child_process\"\nimport type { OcrMode } from \"../types.js\"\n\n/** CLI 탐색 우선순위 */\nconst CLI_PRIORITY = [\"codex\", \"gemini\", \"claude\", \"ollama\"] as const\n\n/**\n * 시스템에 설치된 OCR 도구를 우선순위대로 탐색.\n * tesseract.js는 bundled 의존성이므로 CLI를 찾지 못해도 항상 \"tesseract\" 반환.\n * @returns 사용 가능한 OcrMode (null 반환 없음)\n */\nexport function detectAvailableOcr(): OcrMode {\n // 1. CLI 프로그램 탐색 (codex → gemini → claude → ollama)\n for (const cli of CLI_PRIORITY) {\n if (isCliInstalled(cli)) return cli\n }\n\n // 2. tesseract.js — bundled 의존성, 항상 사용 가능\n return \"tesseract\"\n}\n\n/**\n * 특정 CLI가 시스템 PATH에 있는지 확인.\n * which(unix) 또는 where(win32) 사용.\n */\nfunction isCliInstalled(name: string): boolean {\n try {\n const cmd = process.platform === \"win32\" ? 
\"where\" : \"which\"\n execSync(`${cmd} ${name}`, { stdio: \"ignore\", timeout: 3000 })\n return true\n } catch {\n return false\n }\n}\n\n/**\n * auto 모드에서 시도할 fallback 체인 반환.\n * 설치된 CLI만 포함하며, tesseract는 항상 마지막에 추가.\n */\nexport function getAutoFallbackChain(): OcrMode[] {\n const chain: OcrMode[] = []\n for (const cli of CLI_PRIORITY) {\n if (isCliInstalled(cli)) chain.push(cli)\n }\n chain.push(\"tesseract\")\n return chain\n}\n\n/**\n * 수동 지정된 OcrMode 유효성 검증.\n * --ocr gemini 등 강제 지정 시 호출.\n * @throws 해당 CLI가 설치되지 않은 경우 Error (tesseract는 항상 통과)\n */\nexport function validateOcrMode(mode: OcrMode): void {\n if (mode === \"auto\" || mode === \"off\" || mode === \"tesseract\") return\n\n if (!isCliInstalled(mode)) {\n throw new Error(`'${mode}' CLI가 설치되지 않았습니다.\\n${getInstallGuide(mode)}`)\n }\n}\n\n/** CLI별 설치 안내 메시지 */\nfunction getInstallGuide(mode: string): string {\n const guides: Record<string, string> = {\n gemini: \"설치: https://ai.google.dev/gemini-api/docs/cli\",\n claude: \"설치: npm install -g @anthropic-ai/claude-code 또는 https://claude.ai/code\",\n codex: \"설치: npm install -g @openai/codex 또는 https://github.com/openai/codex\",\n ollama: \"설치: brew install ollama 또는 https://ollama.com/download\",\n }\n return guides[mode] || `'${mode}'을(를) 설치해주세요.`\n}\n\n/**\n * AI CLI가 없어 tesseract.js로 fallback할 때 표시할 안내 메시지.\n */\nexport function getTesseractFallbackMessage(): string {\n return [\n \"설치된 AI CLI가 없어 내장 tesseract.js로 OCR을 수행합니다.\",\n \"더 나은 품질(테이블/헤딩 구조 보존)을 위해 AI CLI 설치를 권장합니다:\",\n \"\",\n \" [권장] Codex CLI: npm install -g @openai/codex\",\n \" Gemini CLI: https://ai.google.dev/gemini-api/docs/cli\",\n \" Claude CLI: npm install -g @anthropic-ai/claude-code\",\n \" Ollama: brew install ollama (+ ollama pull gemma4:27b)\",\n 
].join(\"\\n\")\n}\n"],"mappings":";;;AAQA,SAAS,gBAAgB;AAIzB,IAAM,eAAe,CAAC,SAAS,UAAU,UAAU,QAAQ;AAOpD,SAAS,qBAA8B;AAE5C,aAAW,OAAO,cAAc;AAC9B,QAAI,eAAe,GAAG,EAAG,QAAO;AAAA,EAClC;AAGA,SAAO;AACT;AAMA,SAAS,eAAe,MAAuB;AAC7C,MAAI;AACF,UAAM,MAAM,QAAQ,aAAa,UAAU,UAAU;AACrD,aAAS,GAAG,GAAG,IAAI,IAAI,IAAI,EAAE,OAAO,UAAU,SAAS,IAAK,CAAC;AAC7D,WAAO;AAAA,EACT,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAMO,SAAS,uBAAkC;AAChD,QAAM,QAAmB,CAAC;AAC1B,aAAW,OAAO,cAAc;AAC9B,QAAI,eAAe,GAAG,EAAG,OAAM,KAAK,GAAG;AAAA,EACzC;AACA,QAAM,KAAK,WAAW;AACtB,SAAO;AACT;AAOO,SAAS,gBAAgB,MAAqB;AACnD,MAAI,SAAS,UAAU,SAAS,SAAS,SAAS,YAAa;AAE/D,MAAI,CAAC,eAAe,IAAI,GAAG;AACzB,UAAM,IAAI,MAAM,IAAI,IAAI;AAAA,EAAuB,gBAAgB,IAAI,CAAC,EAAE;AAAA,EACxE;AACF;AAGA,SAAS,gBAAgB,MAAsB;AAC7C,QAAM,SAAiC;AAAA,IACrC,QAAQ;AAAA,IACR,QAAQ;AAAA,IACR,OAAQ;AAAA,IACR,QAAQ;AAAA,EACV;AACA,SAAO,OAAO,IAAI,KAAK,IAAI,IAAI;AACjC;AAKO,SAAS,8BAAsC;AACpD,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF,EAAE,KAAK,IAAI;AACb;","names":[]}
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env node
2
2
 
3
3
  // src/utils.ts
4
- var VERSION = true ? "2.4.3" : "0.0.0-dev";
4
+ var VERSION = true ? "2.4.4" : "0.0.0-dev";
5
5
  function toArrayBuffer(buf) {
6
6
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
7
7
  return buf.buffer;
@@ -90,4 +90,4 @@ export {
90
90
  sanitizeHref,
91
91
  classifyError
92
92
  };
93
- //# sourceMappingURL=chunk-HOUVJPR7.js.map
93
+ //# sourceMappingURL=chunk-KEDUF24M.js.map
package/dist/cli.js CHANGED
@@ -4,15 +4,15 @@ import {
4
4
  markdownToHwpx,
5
5
  markdownToXlsx,
6
6
  parse
7
- } from "./chunk-IAU7NTTA.js";
7
+ } from "./chunk-5AXJRBBK.js";
8
+ import "./chunk-4PP34NVQ.js";
8
9
  import {
9
10
  VERSION,
10
11
  toArrayBuffer
11
- } from "./chunk-HOUVJPR7.js";
12
+ } from "./chunk-KEDUF24M.js";
12
13
  import "./chunk-MOL7MDBG.js";
13
14
  import "./chunk-7FMKAV4P.js";
14
15
  import "./chunk-JOGAFNIL.js";
15
- import "./chunk-4PP34NVQ.js";
16
16
  import "./chunk-ZWE3DS7E.js";
17
17
 
18
18
  // src/cli.ts
@@ -137,7 +137,7 @@ async function runParse(files, opts) {
137
137
  saveImages(absPath);
138
138
  }
139
139
  } catch (err) {
140
- const { sanitizeError } = await import("./utils-PYEEPTPM.js");
140
+ const { sanitizeError } = await import("./utils-BB2CDSTB.js");
141
141
  process.stderr.write(`
142
142
  [kordoc] ERROR: ${fileName} \u2014 ${sanitizeError(err)}
143
143
  `);
@@ -221,7 +221,7 @@ program.command("convert <input>").description("\uB9C8\uD06C\uB2E4\uC6B4 \uD30C\
221
221
  `));
222
222
  }
223
223
  } catch (err) {
224
- const { sanitizeError } = await import("./utils-PYEEPTPM.js");
224
+ const { sanitizeError } = await import("./utils-BB2CDSTB.js");
225
225
  process.stderr.write(` FAIL
226
226
  `);
227
227
  process.stderr.write(` \u2192 ${sanitizeError(err)}
@@ -230,7 +230,7 @@ program.command("convert <input>").description("\uB9C8\uD06C\uB2E4\uC6B4 \uD30C\
230
230
  }
231
231
  });
232
232
  program.command("watch <dir>").description("\uB514\uB809\uD1A0\uB9AC \uAC10\uC2DC \u2014 \uC0C8 \uBB38\uC11C \uC790\uB3D9 \uBCC0\uD658").option("--webhook <url>", "\uACB0\uACFC \uC804\uC1A1 \uC6F9\uD6C5 URL").option("-d, --out-dir <dir>", "\uBCC0\uD658 \uACB0\uACFC \uCD9C\uB825 \uB514\uB809\uD1A0\uB9AC").option("-p, --pages <range>", "\uD398\uC774\uC9C0/\uC139\uC158 \uBC94\uC704").option("--format <type>", "\uCD9C\uB825 \uD615\uC2DD: markdown \uB610\uB294 json", "markdown").option("--silent", "\uC9C4\uD589 \uBA54\uC2DC\uC9C0 \uC228\uAE30\uAE30").action(async (dir, opts) => {
233
- const { watchDirectory } = await import("./watch-IQLSW2OB.js");
233
+ const { watchDirectory } = await import("./watch-6QVK32X7.js");
234
234
  await watchDirectory({
235
235
  dir,
236
236
  outDir: opts.outDir,
package/dist/index.cjs CHANGED
@@ -1957,6 +1957,13 @@ var require_cfb = __commonJS({
1957
1957
  });
1958
1958
 
1959
1959
  // src/ocr/auto-detect.ts
1960
+ var auto_detect_exports = {};
1961
+ __export(auto_detect_exports, {
1962
+ detectAvailableOcr: () => detectAvailableOcr,
1963
+ getAutoFallbackChain: () => getAutoFallbackChain,
1964
+ getTesseractFallbackMessage: () => getTesseractFallbackMessage,
1965
+ validateOcrMode: () => validateOcrMode
1966
+ });
1960
1967
  function detectAvailableOcr() {
1961
1968
  for (const cli of CLI_PRIORITY) {
1962
1969
  if (isCliInstalled(cli)) return cli;
@@ -1972,6 +1979,14 @@ function isCliInstalled(name) {
1972
1979
  return false;
1973
1980
  }
1974
1981
  }
1982
/**
 * Build the ordered list of OCR engines to try.
 *
 * Keeps every CLI_PRIORITY entry for which isCliInstalled reports true, in
 * CLI_PRIORITY order, and always appends "tesseract" as the last entry.
 *
 * @returns A new array of engine names ending with "tesseract".
 */
function getAutoFallbackChain() {
  const installed = CLI_PRIORITY.filter((candidate) => isCliInstalled(candidate));
  return [...installed, "tesseract"];
}
1975
1990
  function validateOcrMode(mode) {
1976
1991
  if (mode === "auto" || mode === "off" || mode === "tesseract") return;
1977
1992
  if (!isCliInstalled(mode)) {
@@ -2810,7 +2825,7 @@ var import_jszip2 = __toESM(require("jszip"), 1);
2810
2825
  var import_xmldom = require("@xmldom/xmldom");
2811
2826
 
2812
2827
  // src/utils.ts
2813
- var VERSION = true ? "2.4.3" : "0.0.0-dev";
2828
+ var VERSION = true ? "2.4.4" : "0.0.0-dev";
2814
2829
  function toArrayBuffer(buf) {
2815
2830
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
2816
2831
  return buf.buffer;
@@ -6362,53 +6377,69 @@ async function parsePdfDocument(buffer, options) {
6362
6377
  }
6363
6378
  const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
6364
6379
  if (isImageBased) {
6365
- let ocrProvider = options?.ocr ?? null;
6366
6380
  const ocrMode = options?.ocrMode ?? "auto";
6367
- if (!ocrProvider && ocrMode !== "off") {
6368
- try {
6369
- const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
6370
- const concurrency = options?.ocrConcurrency ?? 1;
6371
- const batchSize = options?.ocrBatchSize;
6372
- ocrProvider = await resolveOcrProvider2(ocrMode, warnings, concurrency, batchSize);
6373
- } catch (resolveErr) {
6374
- if (ocrMode !== "auto") {
6375
- throw Object.assign(
6376
- new KordocError(resolveErr instanceof Error ? resolveErr.message : "OCR \uD504\uB85C\uBC14\uC774\uB354 \uCD08\uAE30\uD654 \uC2E4\uD328"),
6377
- { isImageBased: true }
6378
- );
6379
- }
6380
- }
6381
+ const concurrency = options?.ocrConcurrency ?? 1;
6382
+ const batchSize = options?.ocrBatchSize;
6383
+ if (ocrMode === "off") {
6384
+ throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
6381
6385
  }
6382
- if (ocrProvider) {
6383
- let ocrBlocks = [];
6386
+ const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
6387
+ const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
6388
+ const tryProvider = async (provider) => {
6384
6389
  try {
6385
- const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
6386
- const concurrency = options?.ocrConcurrency ?? 1;
6387
- ocrBlocks = await ocrPages2(doc, ocrProvider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
6390
+ return await ocrPages2(doc, provider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
6388
6391
  } catch {
6392
+ return [];
6389
6393
  } finally {
6390
- const terminable = ocrProvider;
6394
+ const terminable = provider;
6391
6395
  if (typeof terminable.terminate === "function") {
6392
6396
  await terminable.terminate().catch(() => {
6393
6397
  });
6394
6398
  }
6395
6399
  }
6396
- if (ocrBlocks.length > 0) {
6397
- const ocrMarkdown = blocksToMarkdown(ocrBlocks);
6398
- return {
6399
- markdown: ocrMarkdown,
6400
- blocks: ocrBlocks,
6401
- metadata,
6402
- warnings: warnings.length > 0 ? warnings : void 0,
6403
- isImageBased: true
6404
- };
6400
+ };
6401
+ let ocrBlocks = [];
6402
+ if (options?.ocr) {
6403
+ ocrBlocks = await tryProvider(options.ocr);
6404
+ } else if (ocrMode === "auto") {
6405
+ const { getAutoFallbackChain: getAutoFallbackChain2 } = await Promise.resolve().then(() => (init_auto_detect(), auto_detect_exports));
6406
+ for (const mode of getAutoFallbackChain2()) {
6407
+ try {
6408
+ const provider = await resolveOcrProvider2(mode, warnings, concurrency, batchSize);
6409
+ const blocks2 = await tryProvider(provider);
6410
+ if (blocks2.length > 0) {
6411
+ ocrBlocks = blocks2;
6412
+ break;
6413
+ }
6414
+ warnings.push({ message: `OCR: '${mode}' \uACB0\uACFC \uC5C6\uC74C, \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC2DC\uB3C4`, code: "OCR_CLI_FALLBACK" });
6415
+ } catch {
6416
+ }
6417
+ }
6418
+ } else {
6419
+ try {
6420
+ const provider = await resolveOcrProvider2(ocrMode, warnings, concurrency, batchSize);
6421
+ ocrBlocks = await tryProvider(provider);
6422
+ } catch (resolveErr) {
6423
+ throw Object.assign(
6424
+ new KordocError(resolveErr instanceof Error ? resolveErr.message : "OCR \uD504\uB85C\uBC14\uC774\uB354 \uCD08\uAE30\uD654 \uC2E4\uD328"),
6425
+ { isImageBased: true }
6426
+ );
6405
6427
  }
6406
6428
  }
6407
- if (ocrMode === "off") {
6408
- throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
6429
+ if (ocrBlocks.length > 0) {
6430
+ const ocrMarkdown = blocksToMarkdown(ocrBlocks);
6431
+ return {
6432
+ markdown: ocrMarkdown,
6433
+ blocks: ocrBlocks,
6434
+ metadata,
6435
+ warnings: warnings.length > 0 ? warnings : void 0,
6436
+ isImageBased: true
6437
+ };
6409
6438
  }
6410
- const errMsg = ocrMode ? `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF \u2014 OCR \uC2E4\uD328 (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)` : `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`;
6411
- throw Object.assign(new KordocError(errMsg), { isImageBased: true });
6439
+ throw Object.assign(
6440
+ new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF \u2014 OCR \uC2E4\uD328 (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`),
6441
+ { isImageBased: true }
6442
+ );
6412
6443
  }
6413
6444
  if (options?.removeHeaderFooter !== false && parsedPageCount >= 3) {
6414
6445
  const removed = removeHeaderFooterBlocks(blocks, pageHeights, warnings);