@clazic/kordoc 2.4.3 → 2.4.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/auto-detect-2YGFYQCN.js +15 -0
- package/dist/chunk-7NOZFYH6.js +63 -0
- package/dist/chunk-7NOZFYH6.js.map +1 -0
- package/dist/{chunk-IAU7NTTA.js → chunk-ATB6T3SG.js} +72 -39
- package/dist/chunk-ATB6T3SG.js.map +1 -0
- package/dist/{chunk-HOUVJPR7.js → chunk-CG3DV7QG.js} +2 -2
- package/dist/cli.js +6 -6
- package/dist/index.cjs +114 -51
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +114 -51
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +3 -3
- package/dist/{provider-HE727F7Z.js → provider-7F7NEDTN.js} +32 -17
- package/dist/provider-7F7NEDTN.js.map +1 -0
- package/dist/{resolve-UOAOPQ4H.js → resolve-TZVGVOVD.js} +6 -47
- package/dist/resolve-TZVGVOVD.js.map +1 -0
- package/dist/{utils-PYEEPTPM.js → utils-LG2ALGSE.js} +2 -2
- package/dist/utils-LG2ALGSE.js.map +1 -0
- package/dist/{watch-IQLSW2OB.js → watch-Z3CENX4H.js} +4 -4
- package/package.json +1 -1
- package/dist/chunk-IAU7NTTA.js.map +0 -1
- package/dist/provider-HE727F7Z.js.map +0 -1
- package/dist/resolve-UOAOPQ4H.js.map +0 -1
- /package/dist/{utils-PYEEPTPM.js.map → auto-detect-2YGFYQCN.js.map} +0 -0
- /package/dist/{chunk-HOUVJPR7.js.map → chunk-CG3DV7QG.js.map} +0 -0
- /package/dist/{watch-IQLSW2OB.js.map → watch-Z3CENX4H.js.map} +0 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import {
|
|
3
|
+
detectAvailableOcr,
|
|
4
|
+
getAutoFallbackChain,
|
|
5
|
+
getTesseractFallbackMessage,
|
|
6
|
+
validateOcrMode
|
|
7
|
+
} from "./chunk-7NOZFYH6.js";
|
|
8
|
+
import "./chunk-ZWE3DS7E.js";
|
|
9
|
+
export {
|
|
10
|
+
detectAvailableOcr,
|
|
11
|
+
getAutoFallbackChain,
|
|
12
|
+
getTesseractFallbackMessage,
|
|
13
|
+
validateOcrMode
|
|
14
|
+
};
|
|
15
|
+
//# sourceMappingURL=auto-detect-2YGFYQCN.js.map
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
// src/ocr/auto-detect.ts
|
|
4
|
+
import { execSync } from "child_process";
|
|
5
|
+
// Names of the OCR-capable CLIs to probe, listed from most to least preferred.
var CLI_PRIORITY = [
  "codex",
  "gemini",
  "claude",
  "ollama"
];
|
|
6
|
+
/**
 * Chooses the OCR engine to use by probing the CLIs in CLI_PRIORITY order.
 *
 * @returns the first CLI name for which isCliInstalled() reports true, or
 *   "tesseract" when none of them is found (a value is always returned).
 */
function detectAvailableOcr() {
  // find() stops at the first hit, so later CLIs are not probed once one matches.
  const firstInstalled = CLI_PRIORITY.find((candidate) => isCliInstalled(candidate));
  return firstInstalled === undefined ? "tesseract" : firstInstalled;
}
|
|
12
|
+
/**
 * Reports whether an executable called `name` can be located on PATH.
 *
 * The lookup shells out to `where` on win32 and `which` elsewhere. Because the
 * command line is built by string interpolation and executed through a shell,
 * `name` is first restricted to a conservative executable-name character set.
 * Anything else (non-string, empty, whitespace, shell metacharacters, a leading
 * "-") is reported as "not installed" instead of being handed to the shell.
 *
 * @param name bare executable name, e.g. "codex"
 * @returns true when the lookup command exits successfully; false when it
 *   fails, times out (3 s), or `name` is not a plain executable name
 */
function isCliInstalled(name) {
  // Guard against shell injection: `${cmd} ${name}` below is run by a shell.
  if (typeof name !== "string" || !/^[A-Za-z0-9][A-Za-z0-9._-]*$/.test(name)) {
    return false;
  }
  try {
    const cmd = process.platform === "win32" ? "where" : "which";
    execSync(`${cmd} ${name}`, { stdio: "ignore", timeout: 3e3 });
    return true;
  } catch {
    // Non-zero exit, timeout, or a missing lookup tool all mean "not available".
    return false;
  }
}
|
|
21
|
+
/**
 * Builds the ordered list of OCR engines to try in "auto" mode.
 *
 * @returns a new array holding every CLI from CLI_PRIORITY that
 *   isCliInstalled() reports as present (priority order preserved), always
 *   followed by "tesseract" as the last entry.
 */
function getAutoFallbackChain() {
  const installedClis = CLI_PRIORITY.filter((candidate) => isCliInstalled(candidate));
  return [...installedClis, "tesseract"];
}
|
|
29
|
+
/**
 * Validates an explicitly requested OCR mode.
 *
 * "auto", "off" and "tesseract" are accepted without any lookup; every other
 * mode is treated as a CLI name and must be found by isCliInstalled().
 *
 * @param mode requested OCR mode
 * @throws Error when `mode` names a CLI that is not installed; the message
 *   carries the install hint from getInstallGuide() on its second line
 */
function validateOcrMode(mode) {
  const needsNoCli = ["auto", "off", "tesseract"].includes(mode);
  if (needsNoCli || isCliInstalled(mode)) {
    return;
  }
  throw new Error(`'${mode}' CLI\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4.\n${getInstallGuide(mode)}`);
}
|
|
36
|
+
/**
 * Returns the installation hint for a CLI-backed OCR mode.
 *
 * @param mode CLI name ("gemini", "claude", "codex", "ollama", or anything else)
 * @returns the dedicated hint for a known CLI, otherwise a generic
 *   "please install '<mode>'" message; the result is always a string
 */
function getInstallGuide(mode) {
  const guides = {
    gemini: "\uC124\uCE58: https://ai.google.dev/gemini-api/docs/cli",
    claude: "\uC124\uCE58: npm install -g @anthropic-ai/claude-code \uB610\uB294 https://claude.ai/code",
    codex: "\uC124\uCE58: npm install -g @openai/codex \uB610\uB294 https://github.com/openai/codex",
    ollama: "\uC124\uCE58: brew install ollama \uB610\uB294 https://ollama.com/download"
  };
  // Own-property check: a plain lookup would also resolve inherited keys such
  // as "constructor" or "toString" and return a function instead of a hint.
  const known = Object.prototype.hasOwnProperty.call(guides, mode) ? guides[mode] : void 0;
  return known || `'${mode}'\uC744(\uB97C) \uC124\uCE58\uD574\uC8FC\uC138\uC694.`;
}
|
|
45
|
+
/**
 * Builds the multi-line notice used when no AI CLI is installed and OCR falls
 * back to the built-in tesseract.js.
 *
 * @returns seven newline-separated lines: the fallback notice, a
 *   recommendation to install an AI CLI, a blank line, then one install hint
 *   each for Codex, Gemini, Claude and Ollama
 */
function getTesseractFallbackMessage() {
  const notice = "\uC124\uCE58\uB41C AI CLI\uAC00 \uC5C6\uC5B4 \uB0B4\uC7A5 tesseract.js\uB85C OCR\uC744 \uC218\uD589\uD569\uB2C8\uB2E4.";
  const recommendation = "\uB354 \uB098\uC740 \uD488\uC9C8(\uD14C\uC774\uBE14/\uD5E4\uB529 \uAD6C\uC870 \uBCF4\uC874)\uC744 \uC704\uD574 AI CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4:";
  const installHints = [
    " [\uAD8C\uC7A5] Codex CLI: npm install -g @openai/codex",
    " Gemini CLI: https://ai.google.dev/gemini-api/docs/cli",
    " Claude CLI: npm install -g @anthropic-ai/claude-code",
    " Ollama: brew install ollama (+ ollama pull gemma4:27b)"
  ];
  // The doubled "\n" reproduces the empty separator line before the hints.
  return `${notice}\n${recommendation}\n\n${installHints.join("\n")}`;
}
|
|
56
|
+
|
|
57
|
+
export {
|
|
58
|
+
detectAvailableOcr,
|
|
59
|
+
getAutoFallbackChain,
|
|
60
|
+
validateOcrMode,
|
|
61
|
+
getTesseractFallbackMessage
|
|
62
|
+
};
|
|
63
|
+
//# sourceMappingURL=chunk-7NOZFYH6.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/ocr/auto-detect.ts"],"sourcesContent":["/**\n * OCR CLI 자동 탐색\n *\n * 탐색 순서: codex → gemini → claude → ollama → tesseract.js\n * CLI는 which(unix) / where(win) 명령어로 PATH 존재 확인.\n * tesseract.js는 bundled 의존성이므로 항상 사용 가능 (최후 fallback).\n */\n\nimport { execSync } from \"child_process\"\nimport type { OcrMode } from \"../types.js\"\n\n/** CLI 탐색 우선순위 */\nconst CLI_PRIORITY = [\"codex\", \"gemini\", \"claude\", \"ollama\"] as const\n\n/**\n * 시스템에 설치된 OCR 도구를 우선순위대로 탐색.\n * tesseract.js는 bundled 의존성이므로 CLI를 찾지 못해도 항상 \"tesseract\" 반환.\n * @returns 사용 가능한 OcrMode (null 반환 없음)\n */\nexport function detectAvailableOcr(): OcrMode {\n // 1. CLI 프로그램 탐색 (codex → gemini → claude → ollama)\n for (const cli of CLI_PRIORITY) {\n if (isCliInstalled(cli)) return cli\n }\n\n // 2. tesseract.js — bundled 의존성, 항상 사용 가능\n return \"tesseract\"\n}\n\n/**\n * 특정 CLI가 시스템 PATH에 있는지 확인.\n * which(unix) 또는 where(win32) 사용.\n */\nfunction isCliInstalled(name: string): boolean {\n try {\n const cmd = process.platform === \"win32\" ? 
\"where\" : \"which\"\n execSync(`${cmd} ${name}`, { stdio: \"ignore\", timeout: 3000 })\n return true\n } catch {\n return false\n }\n}\n\n/**\n * auto 모드에서 시도할 fallback 체인 반환.\n * 설치된 CLI만 포함하며, tesseract는 항상 마지막에 추가.\n */\nexport function getAutoFallbackChain(): OcrMode[] {\n const chain: OcrMode[] = []\n for (const cli of CLI_PRIORITY) {\n if (isCliInstalled(cli)) chain.push(cli)\n }\n chain.push(\"tesseract\")\n return chain\n}\n\n/**\n * 수동 지정된 OcrMode 유효성 검증.\n * --ocr gemini 등 강제 지정 시 호출.\n * @throws 해당 CLI가 설치되지 않은 경우 Error (tesseract는 항상 통과)\n */\nexport function validateOcrMode(mode: OcrMode): void {\n if (mode === \"auto\" || mode === \"off\" || mode === \"tesseract\") return\n\n if (!isCliInstalled(mode)) {\n throw new Error(`'${mode}' CLI가 설치되지 않았습니다.\\n${getInstallGuide(mode)}`)\n }\n}\n\n/** CLI별 설치 안내 메시지 */\nfunction getInstallGuide(mode: string): string {\n const guides: Record<string, string> = {\n gemini: \"설치: https://ai.google.dev/gemini-api/docs/cli\",\n claude: \"설치: npm install -g @anthropic-ai/claude-code 또는 https://claude.ai/code\",\n codex: \"설치: npm install -g @openai/codex 또는 https://github.com/openai/codex\",\n ollama: \"설치: brew install ollama 또는 https://ollama.com/download\",\n }\n return guides[mode] || `'${mode}'을(를) 설치해주세요.`\n}\n\n/**\n * AI CLI가 없어 tesseract.js로 fallback할 때 표시할 안내 메시지.\n */\nexport function getTesseractFallbackMessage(): string {\n return [\n \"설치된 AI CLI가 없어 내장 tesseract.js로 OCR을 수행합니다.\",\n \"더 나은 품질(테이블/헤딩 구조 보존)을 위해 AI CLI 설치를 권장합니다:\",\n \"\",\n \" [권장] Codex CLI: npm install -g @openai/codex\",\n \" Gemini CLI: https://ai.google.dev/gemini-api/docs/cli\",\n \" Claude CLI: npm install -g @anthropic-ai/claude-code\",\n \" Ollama: brew install ollama (+ ollama pull gemma4:27b)\",\n 
].join(\"\\n\")\n}\n"],"mappings":";;;AAQA,SAAS,gBAAgB;AAIzB,IAAM,eAAe,CAAC,SAAS,UAAU,UAAU,QAAQ;AAOpD,SAAS,qBAA8B;AAE5C,aAAW,OAAO,cAAc;AAC9B,QAAI,eAAe,GAAG,EAAG,QAAO;AAAA,EAClC;AAGA,SAAO;AACT;AAMA,SAAS,eAAe,MAAuB;AAC7C,MAAI;AACF,UAAM,MAAM,QAAQ,aAAa,UAAU,UAAU;AACrD,aAAS,GAAG,GAAG,IAAI,IAAI,IAAI,EAAE,OAAO,UAAU,SAAS,IAAK,CAAC;AAC7D,WAAO;AAAA,EACT,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAMO,SAAS,uBAAkC;AAChD,QAAM,QAAmB,CAAC;AAC1B,aAAW,OAAO,cAAc;AAC9B,QAAI,eAAe,GAAG,EAAG,OAAM,KAAK,GAAG;AAAA,EACzC;AACA,QAAM,KAAK,WAAW;AACtB,SAAO;AACT;AAOO,SAAS,gBAAgB,MAAqB;AACnD,MAAI,SAAS,UAAU,SAAS,SAAS,SAAS,YAAa;AAE/D,MAAI,CAAC,eAAe,IAAI,GAAG;AACzB,UAAM,IAAI,MAAM,IAAI,IAAI;AAAA,EAAuB,gBAAgB,IAAI,CAAC,EAAE;AAAA,EACxE;AACF;AAGA,SAAS,gBAAgB,MAAsB;AAC7C,QAAM,SAAiC;AAAA,IACrC,QAAQ;AAAA,IACR,QAAQ;AAAA,IACR,OAAQ;AAAA,IACR,QAAQ;AAAA,EACV;AACA,SAAO,OAAO,IAAI,KAAK,IAAI,IAAI;AACjC;AAKO,SAAS,8BAAsC;AACpD,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF,EAAE,KAAK,IAAI;AACb;","names":[]}
|
|
@@ -1,4 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
+
import {
|
|
3
|
+
markdownToBlocks
|
|
4
|
+
} from "./chunk-4PP34NVQ.js";
|
|
2
5
|
import {
|
|
3
6
|
KordocError,
|
|
4
7
|
classifyError,
|
|
@@ -6,7 +9,7 @@ import {
|
|
|
6
9
|
precheckZipSize,
|
|
7
10
|
sanitizeHref,
|
|
8
11
|
toArrayBuffer
|
|
9
|
-
} from "./chunk-
|
|
12
|
+
} from "./chunk-CG3DV7QG.js";
|
|
10
13
|
import {
|
|
11
14
|
parsePageRange
|
|
12
15
|
} from "./chunk-MOL7MDBG.js";
|
|
@@ -16,9 +19,6 @@ import {
|
|
|
16
19
|
import {
|
|
17
20
|
createCliOcrProvider
|
|
18
21
|
} from "./chunk-JOGAFNIL.js";
|
|
19
|
-
import {
|
|
20
|
-
markdownToBlocks
|
|
21
|
-
} from "./chunk-4PP34NVQ.js";
|
|
22
22
|
import {
|
|
23
23
|
__commonJS,
|
|
24
24
|
__require,
|
|
@@ -5456,53 +5456,86 @@ async function parsePdfDocument(buffer, options) {
|
|
|
5456
5456
|
}
|
|
5457
5457
|
const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
|
|
5458
5458
|
if (isImageBased) {
|
|
5459
|
-
let ocrProvider = options?.ocr ?? null;
|
|
5460
5459
|
const ocrMode = options?.ocrMode ?? "auto";
|
|
5461
|
-
|
|
5462
|
-
|
|
5463
|
-
|
|
5464
|
-
|
|
5465
|
-
const batchSize = options?.ocrBatchSize;
|
|
5466
|
-
ocrProvider = await resolveOcrProvider(ocrMode, warnings, concurrency, batchSize);
|
|
5467
|
-
} catch (resolveErr) {
|
|
5468
|
-
if (ocrMode !== "auto") {
|
|
5469
|
-
throw Object.assign(
|
|
5470
|
-
new KordocError(resolveErr instanceof Error ? resolveErr.message : "OCR \uD504\uB85C\uBC14\uC774\uB354 \uCD08\uAE30\uD654 \uC2E4\uD328"),
|
|
5471
|
-
{ isImageBased: true }
|
|
5472
|
-
);
|
|
5473
|
-
}
|
|
5474
|
-
}
|
|
5460
|
+
const concurrency = options?.ocrConcurrency ?? 1;
|
|
5461
|
+
const batchSize = options?.ocrBatchSize;
|
|
5462
|
+
if (ocrMode === "off") {
|
|
5463
|
+
throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
|
|
5475
5464
|
}
|
|
5476
|
-
|
|
5477
|
-
|
|
5465
|
+
const { resolveOcrProvider } = await import("./resolve-TZVGVOVD.js");
|
|
5466
|
+
const { ocrPages } = await import("./provider-7F7NEDTN.js");
|
|
5467
|
+
const tryProvider = async (provider, filter) => {
|
|
5478
5468
|
try {
|
|
5479
|
-
|
|
5480
|
-
const concurrency = options?.ocrConcurrency ?? 1;
|
|
5481
|
-
ocrBlocks = await ocrPages(doc, ocrProvider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
|
|
5469
|
+
return await ocrPages(doc, provider, filter, effectivePageCount, warnings, concurrency, options?.onProgress);
|
|
5482
5470
|
} catch {
|
|
5471
|
+
return [];
|
|
5483
5472
|
} finally {
|
|
5484
|
-
const terminable =
|
|
5473
|
+
const terminable = provider;
|
|
5485
5474
|
if (typeof terminable.terminate === "function") {
|
|
5486
5475
|
await terminable.terminate().catch(() => {
|
|
5487
5476
|
});
|
|
5488
5477
|
}
|
|
5489
5478
|
}
|
|
5490
|
-
|
|
5491
|
-
|
|
5492
|
-
|
|
5493
|
-
|
|
5494
|
-
|
|
5495
|
-
|
|
5496
|
-
|
|
5497
|
-
|
|
5498
|
-
|
|
5479
|
+
};
|
|
5480
|
+
let ocrBlocks = [];
|
|
5481
|
+
if (options?.ocr) {
|
|
5482
|
+
ocrBlocks = await tryProvider(options.ocr, pageFilter);
|
|
5483
|
+
} else if (ocrMode === "auto") {
|
|
5484
|
+
const { getAutoFallbackChain } = await import("./auto-detect-2YGFYQCN.js");
|
|
5485
|
+
const pendingPages = /* @__PURE__ */ new Set();
|
|
5486
|
+
for (let i = 1; i <= effectivePageCount; i++) {
|
|
5487
|
+
if (!pageFilter || pageFilter.has(i)) pendingPages.add(i);
|
|
5488
|
+
}
|
|
5489
|
+
const allOcrBlocks = [];
|
|
5490
|
+
for (const mode of getAutoFallbackChain()) {
|
|
5491
|
+
if (pendingPages.size === 0) break;
|
|
5492
|
+
try {
|
|
5493
|
+
const modeFilter = pendingPages.size < effectivePageCount ? new Set(pendingPages) : pageFilter;
|
|
5494
|
+
const provider = await resolveOcrProvider(mode, warnings, concurrency, batchSize);
|
|
5495
|
+
const blocks2 = await tryProvider(provider, modeFilter);
|
|
5496
|
+
if (blocks2.length > 0) {
|
|
5497
|
+
for (const b of blocks2) {
|
|
5498
|
+
if (b.pageNumber !== void 0) pendingPages.delete(b.pageNumber);
|
|
5499
|
+
}
|
|
5500
|
+
for (const b of blocks2) allOcrBlocks.push(b);
|
|
5501
|
+
if (pendingPages.size > 0) {
|
|
5502
|
+
warnings.push({
|
|
5503
|
+
message: `OCR: '${mode}' \uC644\uB8CC (${pendingPages.size}\uD398\uC774\uC9C0 \uBBF8\uCC98\uB9AC \u2192 \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC7AC\uC2DC\uB3C4)`,
|
|
5504
|
+
code: "OCR_CLI_FALLBACK"
|
|
5505
|
+
});
|
|
5506
|
+
}
|
|
5507
|
+
} else {
|
|
5508
|
+
warnings.push({ message: `OCR: '${mode}' \uACB0\uACFC \uC5C6\uC74C, \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC2DC\uB3C4`, code: "OCR_CLI_FALLBACK" });
|
|
5509
|
+
}
|
|
5510
|
+
} catch {
|
|
5511
|
+
}
|
|
5512
|
+
}
|
|
5513
|
+
ocrBlocks = allOcrBlocks;
|
|
5514
|
+
} else {
|
|
5515
|
+
try {
|
|
5516
|
+
const provider = await resolveOcrProvider(ocrMode, warnings, concurrency, batchSize);
|
|
5517
|
+
ocrBlocks = await tryProvider(provider, pageFilter);
|
|
5518
|
+
} catch (resolveErr) {
|
|
5519
|
+
throw Object.assign(
|
|
5520
|
+
new KordocError(resolveErr instanceof Error ? resolveErr.message : "OCR \uD504\uB85C\uBC14\uC774\uB354 \uCD08\uAE30\uD654 \uC2E4\uD328"),
|
|
5521
|
+
{ isImageBased: true }
|
|
5522
|
+
);
|
|
5499
5523
|
}
|
|
5500
5524
|
}
|
|
5501
|
-
if (
|
|
5502
|
-
|
|
5525
|
+
if (ocrBlocks.length > 0) {
|
|
5526
|
+
const ocrMarkdown = blocksToMarkdown(ocrBlocks);
|
|
5527
|
+
return {
|
|
5528
|
+
markdown: ocrMarkdown,
|
|
5529
|
+
blocks: ocrBlocks,
|
|
5530
|
+
metadata,
|
|
5531
|
+
warnings: warnings.length > 0 ? warnings : void 0,
|
|
5532
|
+
isImageBased: true
|
|
5533
|
+
};
|
|
5503
5534
|
}
|
|
5504
|
-
|
|
5505
|
-
|
|
5535
|
+
throw Object.assign(
|
|
5536
|
+
new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF \u2014 OCR \uC2E4\uD328 (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`),
|
|
5537
|
+
{ isImageBased: true }
|
|
5538
|
+
);
|
|
5506
5539
|
}
|
|
5507
5540
|
if (options?.removeHeaderFooter !== false && parsedPageCount >= 3) {
|
|
5508
5541
|
const removed = removeHeaderFooterBlocks(blocks, pageHeights, warnings);
|
|
@@ -9779,4 +9812,4 @@ export {
|
|
|
9779
9812
|
cfb/cfb.js:
|
|
9780
9813
|
(*! crc32.js (C) 2014-present SheetJS -- http://sheetjs.com *)
|
|
9781
9814
|
*/
|
|
9782
|
-
//# sourceMappingURL=chunk-
|
|
9815
|
+
//# sourceMappingURL=chunk-ATB6T3SG.js.map
|