@clazic/kordoc 2.4.18 → 2.4.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{auto-detect-2YGFYQCN.js → auto-detect-CBYICI6B.js} +4 -4
- package/dist/{chunk-T7EBS5XP.js → chunk-463YQ2WL.js} +8 -18
- package/dist/chunk-463YQ2WL.js.map +1 -0
- package/dist/{chunk-7NOZFYH6.js → chunk-CLK4PNZ7.js} +7 -8
- package/dist/chunk-CLK4PNZ7.js.map +1 -0
- package/dist/{chunk-W2KDIKDF.js → chunk-MZN7PLTZ.js} +2 -2
- package/dist/{chunk-34WIGIQC.js → chunk-Y4WFKJ5P.js} +1 -1
- package/dist/chunk-Y4WFKJ5P.js.map +1 -0
- package/dist/cli.js +9 -13
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +16 -105
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +5 -6
- package/dist/index.d.ts +5 -6
- package/dist/index.js +16 -104
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +5 -6
- package/dist/mcp.js.map +1 -1
- package/dist/{resolve-673XFZQ6.js → resolve-XWYJYKKH.js} +15 -36
- package/dist/resolve-XWYJYKKH.js.map +1 -0
- package/dist/{utils-DHOODYKU.js → utils-YUAT7LFD.js} +2 -2
- package/dist/{watch-YGIU7RN7.js → watch-WEOFVVDO.js} +5 -6
- package/dist/{watch-YGIU7RN7.js.map → watch-WEOFVVDO.js.map} +1 -1
- package/package.json +1 -2
- package/dist/chunk-34WIGIQC.js.map +0 -1
- package/dist/chunk-7FMKAV4P.js +0 -56
- package/dist/chunk-7FMKAV4P.js.map +0 -1
- package/dist/chunk-7NOZFYH6.js.map +0 -1
- package/dist/chunk-T7EBS5XP.js.map +0 -1
- package/dist/resolve-673XFZQ6.js.map +0 -1
- package/dist/tesseract-provider-MNMZPSGF.js +0 -11
- package/dist/utils-DHOODYKU.js.map +0 -1
- /package/dist/{auto-detect-2YGFYQCN.js.map → auto-detect-CBYICI6B.js.map} +0 -0
- /package/dist/{chunk-W2KDIKDF.js.map → chunk-MZN7PLTZ.js.map} +0 -0
- /package/dist/{tesseract-provider-MNMZPSGF.js.map → utils-YUAT7LFD.js.map} +0 -0
package/dist/index.d.cts
CHANGED
|
@@ -106,17 +106,16 @@ interface ParseOptions {
|
|
|
106
106
|
ocr?: OcrProvider;
|
|
107
107
|
/**
|
|
108
108
|
* OCR 모드 (CLI 자동 탐색용).
|
|
109
|
-
* - "auto": 설치된 CLI 자동 탐색 (gemini→claude→
|
|
110
|
-
* - "gemini"|"claude"|"codex"|"ollama"
|
|
109
|
+
* - "auto": 설치된 CLI 자동 탐색 (codex→gemini→claude→ollama)
|
|
110
|
+
* - "gemini"|"claude"|"codex"|"ollama": 특정 도구 강제 지정
|
|
111
111
|
* - "off": OCR 비활성화 (이미지 기반 PDF면 에러)
|
|
112
112
|
* - undefined: 라이브러리 API 기존 동작 유지 (자동 탐색 안 함)
|
|
113
113
|
*/
|
|
114
114
|
ocrMode?: OcrMode;
|
|
115
115
|
/**
|
|
116
116
|
* OCR 병렬 처리 수.
|
|
117
|
-
* -
|
|
118
|
-
* -
|
|
119
|
-
* - 1: 순차 처리 (기존 동작)
|
|
117
|
+
* - CLI 제공 프로바이더(gemini/claude/codex): 기본 4 (배치 병렬 실행)
|
|
118
|
+
* - 1: 순차 처리
|
|
120
119
|
*/
|
|
121
120
|
ocrConcurrency?: number;
|
|
122
121
|
/** 진행률 콜백 — current: 현재 페이지/섹션, total: 전체 수 */
|
|
@@ -241,7 +240,7 @@ interface StructuredOcrResult {
|
|
|
241
240
|
markdown: string;
|
|
242
241
|
}
|
|
243
242
|
/** OCR 모드 — CLI --ocr 옵션 허용값 */
|
|
244
|
-
type OcrMode = "auto" | "gemini" | "claude" | "codex" | "ollama" | "
|
|
243
|
+
type OcrMode = "auto" | "gemini" | "claude" | "codex" | "ollama" | "off";
|
|
245
244
|
/** 사용자 제공 OCR 함수 — 페이지 이미지를 받아 텍스트 또는 구조화된 결과 반환 */
|
|
246
245
|
type OcrProvider = (pageImage: Uint8Array, pageNumber: number, mimeType: "image/png") => Promise<string | StructuredOcrResult>;
|
|
247
246
|
interface WatchOptions {
|
package/dist/index.d.ts
CHANGED
|
@@ -106,17 +106,16 @@ interface ParseOptions {
|
|
|
106
106
|
ocr?: OcrProvider;
|
|
107
107
|
/**
|
|
108
108
|
* OCR 모드 (CLI 자동 탐색용).
|
|
109
|
-
* - "auto": 설치된 CLI 자동 탐색 (gemini→claude→
|
|
110
|
-
* - "gemini"|"claude"|"codex"|"ollama"
|
|
109
|
+
* - "auto": 설치된 CLI 자동 탐색 (codex→gemini→claude→ollama)
|
|
110
|
+
* - "gemini"|"claude"|"codex"|"ollama": 특정 도구 강제 지정
|
|
111
111
|
* - "off": OCR 비활성화 (이미지 기반 PDF면 에러)
|
|
112
112
|
* - undefined: 라이브러리 API 기존 동작 유지 (자동 탐색 안 함)
|
|
113
113
|
*/
|
|
114
114
|
ocrMode?: OcrMode;
|
|
115
115
|
/**
|
|
116
116
|
* OCR 병렬 처리 수.
|
|
117
|
-
* -
|
|
118
|
-
* -
|
|
119
|
-
* - 1: 순차 처리 (기존 동작)
|
|
117
|
+
* - CLI 제공 프로바이더(gemini/claude/codex): 기본 4 (배치 병렬 실행)
|
|
118
|
+
* - 1: 순차 처리
|
|
120
119
|
*/
|
|
121
120
|
ocrConcurrency?: number;
|
|
122
121
|
/** 진행률 콜백 — current: 현재 페이지/섹션, total: 전체 수 */
|
|
@@ -241,7 +240,7 @@ interface StructuredOcrResult {
|
|
|
241
240
|
markdown: string;
|
|
242
241
|
}
|
|
243
242
|
/** OCR 모드 — CLI --ocr 옵션 허용값 */
|
|
244
|
-
type OcrMode = "auto" | "gemini" | "claude" | "codex" | "ollama" | "
|
|
243
|
+
type OcrMode = "auto" | "gemini" | "claude" | "codex" | "ollama" | "off";
|
|
245
244
|
/** 사용자 제공 OCR 함수 — 페이지 이미지를 받아 텍스트 또는 구조화된 결과 반환 */
|
|
246
245
|
type OcrProvider = (pageImage: Uint8Array, pageNumber: number, mimeType: "image/png") => Promise<string | StructuredOcrResult>;
|
|
247
246
|
interface WatchOptions {
|
package/dist/index.js
CHANGED
|
@@ -2183,7 +2183,7 @@ var auto_detect_exports = {};
|
|
|
2183
2183
|
__export(auto_detect_exports, {
|
|
2184
2184
|
detectAvailableOcr: () => detectAvailableOcr,
|
|
2185
2185
|
getAutoFallbackChain: () => getAutoFallbackChain,
|
|
2186
|
-
|
|
2186
|
+
getNoCliMessage: () => getNoCliMessage,
|
|
2187
2187
|
validateOcrMode: () => validateOcrMode
|
|
2188
2188
|
});
|
|
2189
2189
|
import { execSync } from "child_process";
|
|
@@ -2191,7 +2191,7 @@ function detectAvailableOcr() {
|
|
|
2191
2191
|
for (const cli of CLI_PRIORITY) {
|
|
2192
2192
|
if (isCliInstalled(cli)) return cli;
|
|
2193
2193
|
}
|
|
2194
|
-
return
|
|
2194
|
+
return null;
|
|
2195
2195
|
}
|
|
2196
2196
|
function isCliInstalled(name) {
|
|
2197
2197
|
try {
|
|
@@ -2207,11 +2207,10 @@ function getAutoFallbackChain() {
|
|
|
2207
2207
|
for (const cli of CLI_PRIORITY) {
|
|
2208
2208
|
if (isCliInstalled(cli)) chain.push(cli);
|
|
2209
2209
|
}
|
|
2210
|
-
chain.push("tesseract");
|
|
2211
2210
|
return chain;
|
|
2212
2211
|
}
|
|
2213
2212
|
function validateOcrMode(mode) {
|
|
2214
|
-
if (mode === "auto" || mode === "off"
|
|
2213
|
+
if (mode === "auto" || mode === "off") return;
|
|
2215
2214
|
if (!isCliInstalled(mode)) {
|
|
2216
2215
|
throw new Error(`'${mode}' CLI\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4.
|
|
2217
2216
|
${getInstallGuide(mode)}`);
|
|
@@ -2226,10 +2225,10 @@ function getInstallGuide(mode) {
|
|
|
2226
2225
|
};
|
|
2227
2226
|
return guides[mode] || `'${mode}'\uC744(\uB97C) \uC124\uCE58\uD574\uC8FC\uC138\uC694.`;
|
|
2228
2227
|
}
|
|
2229
|
-
function
|
|
2228
|
+
function getNoCliMessage() {
|
|
2230
2229
|
return [
|
|
2231
|
-
"\uC124\uCE58\uB41C AI CLI\uAC00 \uC5C6\uC5B4
|
|
2232
|
-
"\
|
|
2230
|
+
"\uC124\uCE58\uB41C AI CLI\uAC00 \uC5C6\uC5B4 OCR\uC744 \uC218\uD589\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.",
|
|
2231
|
+
"\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF \uCC98\uB9AC\uB97C \uC704\uD574 AI CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4:",
|
|
2233
2232
|
"",
|
|
2234
2233
|
" [\uAD8C\uC7A5] Codex CLI: npm install -g @openai/codex",
|
|
2235
2234
|
" Gemini CLI: https://ai.google.dev/gemini-api/docs/cli",
|
|
@@ -2412,65 +2411,6 @@ var init_cli_provider = __esm({
|
|
|
2412
2411
|
}
|
|
2413
2412
|
});
|
|
2414
2413
|
|
|
2415
|
-
// src/ocr/tesseract-provider.ts
|
|
2416
|
-
var tesseract_provider_exports = {};
|
|
2417
|
-
__export(tesseract_provider_exports, {
|
|
2418
|
-
createTesseractPoolProvider: () => createTesseractPoolProvider,
|
|
2419
|
-
createTesseractProvider: () => createTesseractProvider
|
|
2420
|
-
});
|
|
2421
|
-
import { createWorker } from "tesseract.js";
|
|
2422
|
-
async function createTesseractProvider() {
|
|
2423
|
-
const worker = await createWorker("kor+eng");
|
|
2424
|
-
let terminated = false;
|
|
2425
|
-
const provider = async (pageImage, _pageNumber, _mimeType) => {
|
|
2426
|
-
const { data } = await worker.recognize(pageImage);
|
|
2427
|
-
return data.text;
|
|
2428
|
-
};
|
|
2429
|
-
provider.terminate = async () => {
|
|
2430
|
-
if (!terminated) {
|
|
2431
|
-
await worker.terminate();
|
|
2432
|
-
terminated = true;
|
|
2433
|
-
}
|
|
2434
|
-
};
|
|
2435
|
-
return provider;
|
|
2436
|
-
}
|
|
2437
|
-
async function createTesseractPoolProvider(concurrency) {
|
|
2438
|
-
const workers = await Promise.all(
|
|
2439
|
-
Array.from({ length: concurrency }, () => createWorker("kor+eng"))
|
|
2440
|
-
);
|
|
2441
|
-
const idle = [...workers];
|
|
2442
|
-
const waitQueue = [];
|
|
2443
|
-
function acquire() {
|
|
2444
|
-
if (idle.length > 0) return Promise.resolve(idle.pop());
|
|
2445
|
-
return new Promise((resolve4) => waitQueue.push(resolve4));
|
|
2446
|
-
}
|
|
2447
|
-
function release(w) {
|
|
2448
|
-
if (waitQueue.length > 0) {
|
|
2449
|
-
waitQueue.shift()(w);
|
|
2450
|
-
} else {
|
|
2451
|
-
idle.push(w);
|
|
2452
|
-
}
|
|
2453
|
-
}
|
|
2454
|
-
const provider = async (pageImage, _pageNumber, _mimeType) => {
|
|
2455
|
-
const w = await acquire();
|
|
2456
|
-
try {
|
|
2457
|
-
const { data } = await w.recognize(pageImage);
|
|
2458
|
-
return data.text;
|
|
2459
|
-
} finally {
|
|
2460
|
-
release(w);
|
|
2461
|
-
}
|
|
2462
|
-
};
|
|
2463
|
-
provider.terminate = async () => {
|
|
2464
|
-
await Promise.all(workers.map((w) => w.terminate()));
|
|
2465
|
-
};
|
|
2466
|
-
return provider;
|
|
2467
|
-
}
|
|
2468
|
-
var init_tesseract_provider = __esm({
|
|
2469
|
-
"src/ocr/tesseract-provider.ts"() {
|
|
2470
|
-
"use strict";
|
|
2471
|
-
}
|
|
2472
|
-
});
|
|
2473
|
-
|
|
2474
2414
|
// src/ocr/batch-provider.ts
|
|
2475
2415
|
var batch_provider_exports = {};
|
|
2476
2416
|
__export(batch_provider_exports, {
|
|
@@ -2679,15 +2619,6 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
|
|
|
2679
2619
|
}
|
|
2680
2620
|
if (mode !== "auto") {
|
|
2681
2621
|
validateOcrMode(mode);
|
|
2682
|
-
if (mode === "tesseract") {
|
|
2683
|
-
const { createTesseractProvider: createTesseractProvider2, createTesseractPoolProvider: createTesseractPoolProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
|
|
2684
|
-
if (concurrency && concurrency > 1) {
|
|
2685
|
-
logger.log({ level: "info", event: "done", message: "Tesseract pool provider \uC120\uD0DD", meta: { concurrency } });
|
|
2686
|
-
return createTesseractPoolProvider2(concurrency);
|
|
2687
|
-
}
|
|
2688
|
-
logger.log({ level: "info", event: "done", message: "Tesseract single provider \uC120\uD0DD" });
|
|
2689
|
-
return createTesseractProvider2();
|
|
2690
|
-
}
|
|
2691
2622
|
if (mode === "gemini" || mode === "claude" || mode === "codex") {
|
|
2692
2623
|
const { createBatchCliProvider: createBatchCliProvider2, DEFAULT_BATCH_SIZES: DEFAULT_BATCH_SIZES2 } = await Promise.resolve().then(() => (init_batch_provider(), batch_provider_exports));
|
|
2693
2624
|
const effectiveBatch = batchSize ?? DEFAULT_BATCH_SIZES2[mode];
|
|
@@ -2703,27 +2634,16 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
|
|
|
2703
2634
|
}
|
|
2704
2635
|
const detected = detectAvailableOcr();
|
|
2705
2636
|
logger.log({ level: "info", event: "progress", message: "OCR auto \uAC10\uC9C0 \uACB0\uACFC", meta: { detected } });
|
|
2706
|
-
if (detected
|
|
2707
|
-
|
|
2708
|
-
|
|
2709
|
-
|
|
2710
|
-
code: "OCR_CLI_FALLBACK"
|
|
2711
|
-
});
|
|
2712
|
-
} else {
|
|
2713
|
-
warnings?.push({
|
|
2714
|
-
message: `OCR: '${detected}' \uC0AC\uC6A9 \uC911 (codex CLI\uAC00 \uC5C6\uC5B4 fallback). \uB354 \uB098\uC740 \uD488\uC9C8\uC744 \uC704\uD574 codex CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4.`,
|
|
2715
|
-
code: "OCR_CLI_FALLBACK"
|
|
2716
|
-
});
|
|
2717
|
-
}
|
|
2637
|
+
if (!detected) {
|
|
2638
|
+
throw new Error(
|
|
2639
|
+
"\uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR CLI\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4. \uB2E4\uC74C \uC911 \uD558\uB098\uB97C \uC124\uCE58\uD558\uC138\uC694:\n Codex CLI: npm install -g @openai/codex\n Claude CLI: npm install -g @anthropic-ai/claude-code\n Gemini CLI: https://ai.google.dev/gemini-api/docs/cli"
|
|
2640
|
+
);
|
|
2718
2641
|
}
|
|
2719
|
-
if (detected
|
|
2720
|
-
|
|
2721
|
-
|
|
2722
|
-
|
|
2723
|
-
|
|
2724
|
-
}
|
|
2725
|
-
logger.log({ level: "info", event: "done", message: "AUTO: Tesseract single provider \uC120\uD0DD" });
|
|
2726
|
-
return createTesseractProvider2();
|
|
2642
|
+
if (detected !== "codex") {
|
|
2643
|
+
warnings?.push({
|
|
2644
|
+
message: `OCR: '${detected}' \uC0AC\uC6A9 \uC911 (codex CLI\uAC00 \uC5C6\uC5B4 fallback). \uB354 \uB098\uC740 \uD488\uC9C8\uC744 \uC704\uD574 codex CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4.`,
|
|
2645
|
+
code: "OCR_CLI_FALLBACK"
|
|
2646
|
+
});
|
|
2727
2647
|
}
|
|
2728
2648
|
if (detected === "gemini" || detected === "codex" || detected === "claude") {
|
|
2729
2649
|
const { createBatchCliProvider: createBatchCliProvider2, DEFAULT_BATCH_SIZES: DEFAULT_BATCH_SIZES2 } = await Promise.resolve().then(() => (init_batch_provider(), batch_provider_exports));
|
|
@@ -3115,7 +3035,7 @@ import JSZip2 from "jszip";
|
|
|
3115
3035
|
import { DOMParser } from "@xmldom/xmldom";
|
|
3116
3036
|
|
|
3117
3037
|
// src/utils.ts
|
|
3118
|
-
var VERSION = true ? "2.4.
|
|
3038
|
+
var VERSION = true ? "2.4.19" : "0.0.0-dev";
|
|
3119
3039
|
function toArrayBuffer(buf) {
|
|
3120
3040
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
3121
3041
|
return buf.buffer;
|
|
@@ -8711,7 +8631,6 @@ async function parseDocxDocument(buffer, options, existingZip) {
|
|
|
8711
8631
|
|
|
8712
8632
|
// src/index.ts
|
|
8713
8633
|
init_cli_provider();
|
|
8714
|
-
init_tesseract_provider();
|
|
8715
8634
|
init_markdown_to_blocks();
|
|
8716
8635
|
init_logger();
|
|
8717
8636
|
|
|
@@ -12070,9 +11989,6 @@ async function parseImage(buffer, options) {
|
|
|
12070
11989
|
if (ocrMode === "gemini" || ocrMode === "claude" || ocrMode === "codex" || ocrMode === "ollama") {
|
|
12071
11990
|
ocrProvider = createCliOcrProvider(ocrMode);
|
|
12072
11991
|
actualOcrMode = ocrMode;
|
|
12073
|
-
} else if (ocrMode === "tesseract") {
|
|
12074
|
-
ocrProvider = await createTesseractProvider();
|
|
12075
|
-
actualOcrMode = ocrMode;
|
|
12076
11992
|
} else if (ocrMode === "auto") {
|
|
12077
11993
|
const modesToTry = ["gemini", "claude", "codex", "ollama"];
|
|
12078
11994
|
for (const mode of modesToTry) {
|
|
@@ -12084,10 +12000,6 @@ async function parseImage(buffer, options) {
|
|
|
12084
12000
|
console.warn(`[kordoc] OCR auto-detection: ${mode} CLI not available or failed. Trying next.`, e);
|
|
12085
12001
|
}
|
|
12086
12002
|
}
|
|
12087
|
-
if (!ocrProvider) {
|
|
12088
|
-
ocrProvider = await createTesseractProvider();
|
|
12089
|
-
actualOcrMode = "tesseract";
|
|
12090
|
-
}
|
|
12091
12003
|
}
|
|
12092
12004
|
if (!ocrProvider) {
|
|
12093
12005
|
return { success: false, fileType: "image", error: "\uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR \uD504\uB85C\uBC14\uC774\uB354\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "PARSE_ERROR" };
|