@clazic/kordoc 2.4.17 → 2.4.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{auto-detect-2YGFYQCN.js → auto-detect-CBYICI6B.js} +4 -4
- package/dist/{chunk-WM3XI23V.js → chunk-463YQ2WL.js} +38 -25
- package/dist/chunk-463YQ2WL.js.map +1 -0
- package/dist/{chunk-7NOZFYH6.js → chunk-CLK4PNZ7.js} +7 -8
- package/dist/chunk-CLK4PNZ7.js.map +1 -0
- package/dist/{chunk-W2KDIKDF.js → chunk-MZN7PLTZ.js} +2 -2
- package/dist/{chunk-34WIGIQC.js → chunk-Y4WFKJ5P.js} +1 -1
- package/dist/chunk-Y4WFKJ5P.js.map +1 -0
- package/dist/cli.js +9 -13
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +49 -191
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +5 -6
- package/dist/index.d.ts +5 -6
- package/dist/index.js +49 -190
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +5 -6
- package/dist/mcp.js.map +1 -1
- package/dist/{resolve-673XFZQ6.js → resolve-XWYJYKKH.js} +15 -36
- package/dist/resolve-XWYJYKKH.js.map +1 -0
- package/dist/{utils-DHOODYKU.js → utils-YUAT7LFD.js} +2 -2
- package/dist/{watch-RM4VNOL4.js → watch-WEOFVVDO.js} +5 -6
- package/dist/{watch-RM4VNOL4.js.map → watch-WEOFVVDO.js.map} +1 -1
- package/package.json +1 -2
- package/dist/chunk-34WIGIQC.js.map +0 -1
- package/dist/chunk-7FMKAV4P.js +0 -56
- package/dist/chunk-7FMKAV4P.js.map +0 -1
- package/dist/chunk-7NOZFYH6.js.map +0 -1
- package/dist/chunk-WM3XI23V.js.map +0 -1
- package/dist/resolve-673XFZQ6.js.map +0 -1
- package/dist/tesseract-provider-MNMZPSGF.js +0 -11
- package/dist/utils-DHOODYKU.js.map +0 -1
- /package/dist/{auto-detect-2YGFYQCN.js.map → auto-detect-CBYICI6B.js.map} +0 -0
- /package/dist/{chunk-W2KDIKDF.js.map → chunk-MZN7PLTZ.js.map} +0 -0
- /package/dist/{tesseract-provider-MNMZPSGF.js.map → utils-YUAT7LFD.js.map} +0 -0
package/dist/index.d.cts
CHANGED
|
@@ -106,17 +106,16 @@ interface ParseOptions {
|
|
|
106
106
|
ocr?: OcrProvider;
|
|
107
107
|
/**
|
|
108
108
|
* OCR 모드 (CLI 자동 탐색용).
|
|
109
|
-
* - "auto": 설치된 CLI 자동 탐색 (gemini→claude→
|
|
110
|
-
* - "gemini"|"claude"|"codex"|"ollama"
|
|
109
|
+
* - "auto": 설치된 CLI 자동 탐색 (codex→gemini→claude→ollama)
|
|
110
|
+
* - "gemini"|"claude"|"codex"|"ollama": 특정 도구 강제 지정
|
|
111
111
|
* - "off": OCR 비활성화 (이미지 기반 PDF면 에러)
|
|
112
112
|
* - undefined: 라이브러리 API 기존 동작 유지 (자동 탐색 안 함)
|
|
113
113
|
*/
|
|
114
114
|
ocrMode?: OcrMode;
|
|
115
115
|
/**
|
|
116
116
|
* OCR 병렬 처리 수.
|
|
117
|
-
* -
|
|
118
|
-
* -
|
|
119
|
-
* - 1: 순차 처리 (기존 동작)
|
|
117
|
+
* - CLI 제공 프로바이더(gemini/claude/codex): 기본 4 (배치 병렬 실행)
|
|
118
|
+
* - 1: 순차 처리
|
|
120
119
|
*/
|
|
121
120
|
ocrConcurrency?: number;
|
|
122
121
|
/** 진행률 콜백 — current: 현재 페이지/섹션, total: 전체 수 */
|
|
@@ -241,7 +240,7 @@ interface StructuredOcrResult {
|
|
|
241
240
|
markdown: string;
|
|
242
241
|
}
|
|
243
242
|
/** OCR 모드 — CLI --ocr 옵션 허용값 */
|
|
244
|
-
type OcrMode = "auto" | "gemini" | "claude" | "codex" | "ollama" | "
|
|
243
|
+
type OcrMode = "auto" | "gemini" | "claude" | "codex" | "ollama" | "off";
|
|
245
244
|
/** 사용자 제공 OCR 함수 — 페이지 이미지를 받아 텍스트 또는 구조화된 결과 반환 */
|
|
246
245
|
type OcrProvider = (pageImage: Uint8Array, pageNumber: number, mimeType: "image/png") => Promise<string | StructuredOcrResult>;
|
|
247
246
|
interface WatchOptions {
|
package/dist/index.d.ts
CHANGED
|
@@ -106,17 +106,16 @@ interface ParseOptions {
|
|
|
106
106
|
ocr?: OcrProvider;
|
|
107
107
|
/**
|
|
108
108
|
* OCR 모드 (CLI 자동 탐색용).
|
|
109
|
-
* - "auto": 설치된 CLI 자동 탐색 (gemini→claude→
|
|
110
|
-
* - "gemini"|"claude"|"codex"|"ollama"
|
|
109
|
+
* - "auto": 설치된 CLI 자동 탐색 (codex→gemini→claude→ollama)
|
|
110
|
+
* - "gemini"|"claude"|"codex"|"ollama": 특정 도구 강제 지정
|
|
111
111
|
* - "off": OCR 비활성화 (이미지 기반 PDF면 에러)
|
|
112
112
|
* - undefined: 라이브러리 API 기존 동작 유지 (자동 탐색 안 함)
|
|
113
113
|
*/
|
|
114
114
|
ocrMode?: OcrMode;
|
|
115
115
|
/**
|
|
116
116
|
* OCR 병렬 처리 수.
|
|
117
|
-
* -
|
|
118
|
-
* -
|
|
119
|
-
* - 1: 순차 처리 (기존 동작)
|
|
117
|
+
* - CLI 제공 프로바이더(gemini/claude/codex): 기본 4 (배치 병렬 실행)
|
|
118
|
+
* - 1: 순차 처리
|
|
120
119
|
*/
|
|
121
120
|
ocrConcurrency?: number;
|
|
122
121
|
/** 진행률 콜백 — current: 현재 페이지/섹션, total: 전체 수 */
|
|
@@ -241,7 +240,7 @@ interface StructuredOcrResult {
|
|
|
241
240
|
markdown: string;
|
|
242
241
|
}
|
|
243
242
|
/** OCR 모드 — CLI --ocr 옵션 허용값 */
|
|
244
|
-
type OcrMode = "auto" | "gemini" | "claude" | "codex" | "ollama" | "
|
|
243
|
+
type OcrMode = "auto" | "gemini" | "claude" | "codex" | "ollama" | "off";
|
|
245
244
|
/** 사용자 제공 OCR 함수 — 페이지 이미지를 받아 텍스트 또는 구조화된 결과 반환 */
|
|
246
245
|
type OcrProvider = (pageImage: Uint8Array, pageNumber: number, mimeType: "image/png") => Promise<string | StructuredOcrResult>;
|
|
247
246
|
interface WatchOptions {
|
package/dist/index.js
CHANGED
|
@@ -2183,7 +2183,7 @@ var auto_detect_exports = {};
|
|
|
2183
2183
|
__export(auto_detect_exports, {
|
|
2184
2184
|
detectAvailableOcr: () => detectAvailableOcr,
|
|
2185
2185
|
getAutoFallbackChain: () => getAutoFallbackChain,
|
|
2186
|
-
|
|
2186
|
+
getNoCliMessage: () => getNoCliMessage,
|
|
2187
2187
|
validateOcrMode: () => validateOcrMode
|
|
2188
2188
|
});
|
|
2189
2189
|
import { execSync } from "child_process";
|
|
@@ -2191,7 +2191,7 @@ function detectAvailableOcr() {
|
|
|
2191
2191
|
for (const cli of CLI_PRIORITY) {
|
|
2192
2192
|
if (isCliInstalled(cli)) return cli;
|
|
2193
2193
|
}
|
|
2194
|
-
return
|
|
2194
|
+
return null;
|
|
2195
2195
|
}
|
|
2196
2196
|
function isCliInstalled(name) {
|
|
2197
2197
|
try {
|
|
@@ -2207,11 +2207,10 @@ function getAutoFallbackChain() {
|
|
|
2207
2207
|
for (const cli of CLI_PRIORITY) {
|
|
2208
2208
|
if (isCliInstalled(cli)) chain.push(cli);
|
|
2209
2209
|
}
|
|
2210
|
-
chain.push("tesseract");
|
|
2211
2210
|
return chain;
|
|
2212
2211
|
}
|
|
2213
2212
|
function validateOcrMode(mode) {
|
|
2214
|
-
if (mode === "auto" || mode === "off"
|
|
2213
|
+
if (mode === "auto" || mode === "off") return;
|
|
2215
2214
|
if (!isCliInstalled(mode)) {
|
|
2216
2215
|
throw new Error(`'${mode}' CLI\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4.
|
|
2217
2216
|
${getInstallGuide(mode)}`);
|
|
@@ -2226,10 +2225,10 @@ function getInstallGuide(mode) {
|
|
|
2226
2225
|
};
|
|
2227
2226
|
return guides[mode] || `'${mode}'\uC744(\uB97C) \uC124\uCE58\uD574\uC8FC\uC138\uC694.`;
|
|
2228
2227
|
}
|
|
2229
|
-
function
|
|
2228
|
+
function getNoCliMessage() {
|
|
2230
2229
|
return [
|
|
2231
|
-
"\uC124\uCE58\uB41C AI CLI\uAC00 \uC5C6\uC5B4
|
|
2232
|
-
"\
|
|
2230
|
+
"\uC124\uCE58\uB41C AI CLI\uAC00 \uC5C6\uC5B4 OCR\uC744 \uC218\uD589\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.",
|
|
2231
|
+
"\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF \uCC98\uB9AC\uB97C \uC704\uD574 AI CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4:",
|
|
2233
2232
|
"",
|
|
2234
2233
|
" [\uAD8C\uC7A5] Codex CLI: npm install -g @openai/codex",
|
|
2235
2234
|
" Gemini CLI: https://ai.google.dev/gemini-api/docs/cli",
|
|
@@ -2412,65 +2411,6 @@ var init_cli_provider = __esm({
|
|
|
2412
2411
|
}
|
|
2413
2412
|
});
|
|
2414
2413
|
|
|
2415
|
-
// src/ocr/tesseract-provider.ts
|
|
2416
|
-
var tesseract_provider_exports = {};
|
|
2417
|
-
__export(tesseract_provider_exports, {
|
|
2418
|
-
createTesseractPoolProvider: () => createTesseractPoolProvider,
|
|
2419
|
-
createTesseractProvider: () => createTesseractProvider
|
|
2420
|
-
});
|
|
2421
|
-
import { createWorker } from "tesseract.js";
|
|
2422
|
-
async function createTesseractProvider() {
|
|
2423
|
-
const worker = await createWorker("kor+eng");
|
|
2424
|
-
let terminated = false;
|
|
2425
|
-
const provider = async (pageImage, _pageNumber, _mimeType) => {
|
|
2426
|
-
const { data } = await worker.recognize(pageImage);
|
|
2427
|
-
return data.text;
|
|
2428
|
-
};
|
|
2429
|
-
provider.terminate = async () => {
|
|
2430
|
-
if (!terminated) {
|
|
2431
|
-
await worker.terminate();
|
|
2432
|
-
terminated = true;
|
|
2433
|
-
}
|
|
2434
|
-
};
|
|
2435
|
-
return provider;
|
|
2436
|
-
}
|
|
2437
|
-
async function createTesseractPoolProvider(concurrency) {
|
|
2438
|
-
const workers = await Promise.all(
|
|
2439
|
-
Array.from({ length: concurrency }, () => createWorker("kor+eng"))
|
|
2440
|
-
);
|
|
2441
|
-
const idle = [...workers];
|
|
2442
|
-
const waitQueue = [];
|
|
2443
|
-
function acquire() {
|
|
2444
|
-
if (idle.length > 0) return Promise.resolve(idle.pop());
|
|
2445
|
-
return new Promise((resolve4) => waitQueue.push(resolve4));
|
|
2446
|
-
}
|
|
2447
|
-
function release(w) {
|
|
2448
|
-
if (waitQueue.length > 0) {
|
|
2449
|
-
waitQueue.shift()(w);
|
|
2450
|
-
} else {
|
|
2451
|
-
idle.push(w);
|
|
2452
|
-
}
|
|
2453
|
-
}
|
|
2454
|
-
const provider = async (pageImage, _pageNumber, _mimeType) => {
|
|
2455
|
-
const w = await acquire();
|
|
2456
|
-
try {
|
|
2457
|
-
const { data } = await w.recognize(pageImage);
|
|
2458
|
-
return data.text;
|
|
2459
|
-
} finally {
|
|
2460
|
-
release(w);
|
|
2461
|
-
}
|
|
2462
|
-
};
|
|
2463
|
-
provider.terminate = async () => {
|
|
2464
|
-
await Promise.all(workers.map((w) => w.terminate()));
|
|
2465
|
-
};
|
|
2466
|
-
return provider;
|
|
2467
|
-
}
|
|
2468
|
-
var init_tesseract_provider = __esm({
|
|
2469
|
-
"src/ocr/tesseract-provider.ts"() {
|
|
2470
|
-
"use strict";
|
|
2471
|
-
}
|
|
2472
|
-
});
|
|
2473
|
-
|
|
2474
2414
|
// src/ocr/batch-provider.ts
|
|
2475
2415
|
var batch_provider_exports = {};
|
|
2476
2416
|
__export(batch_provider_exports, {
|
|
@@ -2679,15 +2619,6 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
|
|
|
2679
2619
|
}
|
|
2680
2620
|
if (mode !== "auto") {
|
|
2681
2621
|
validateOcrMode(mode);
|
|
2682
|
-
if (mode === "tesseract") {
|
|
2683
|
-
const { createTesseractProvider: createTesseractProvider2, createTesseractPoolProvider: createTesseractPoolProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
|
|
2684
|
-
if (concurrency && concurrency > 1) {
|
|
2685
|
-
logger.log({ level: "info", event: "done", message: "Tesseract pool provider \uC120\uD0DD", meta: { concurrency } });
|
|
2686
|
-
return createTesseractPoolProvider2(concurrency);
|
|
2687
|
-
}
|
|
2688
|
-
logger.log({ level: "info", event: "done", message: "Tesseract single provider \uC120\uD0DD" });
|
|
2689
|
-
return createTesseractProvider2();
|
|
2690
|
-
}
|
|
2691
2622
|
if (mode === "gemini" || mode === "claude" || mode === "codex") {
|
|
2692
2623
|
const { createBatchCliProvider: createBatchCliProvider2, DEFAULT_BATCH_SIZES: DEFAULT_BATCH_SIZES2 } = await Promise.resolve().then(() => (init_batch_provider(), batch_provider_exports));
|
|
2693
2624
|
const effectiveBatch = batchSize ?? DEFAULT_BATCH_SIZES2[mode];
|
|
@@ -2703,27 +2634,16 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
|
|
|
2703
2634
|
}
|
|
2704
2635
|
const detected = detectAvailableOcr();
|
|
2705
2636
|
logger.log({ level: "info", event: "progress", message: "OCR auto \uAC10\uC9C0 \uACB0\uACFC", meta: { detected } });
|
|
2706
|
-
if (detected
|
|
2707
|
-
|
|
2708
|
-
|
|
2709
|
-
|
|
2710
|
-
code: "OCR_CLI_FALLBACK"
|
|
2711
|
-
});
|
|
2712
|
-
} else {
|
|
2713
|
-
warnings?.push({
|
|
2714
|
-
message: `OCR: '${detected}' \uC0AC\uC6A9 \uC911 (codex CLI\uAC00 \uC5C6\uC5B4 fallback). \uB354 \uB098\uC740 \uD488\uC9C8\uC744 \uC704\uD574 codex CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4.`,
|
|
2715
|
-
code: "OCR_CLI_FALLBACK"
|
|
2716
|
-
});
|
|
2717
|
-
}
|
|
2637
|
+
if (!detected) {
|
|
2638
|
+
throw new Error(
|
|
2639
|
+
"\uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR CLI\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4. \uB2E4\uC74C \uC911 \uD558\uB098\uB97C \uC124\uCE58\uD558\uC138\uC694:\n Codex CLI: npm install -g @openai/codex\n Claude CLI: npm install -g @anthropic-ai/claude-code\n Gemini CLI: https://ai.google.dev/gemini-api/docs/cli"
|
|
2640
|
+
);
|
|
2718
2641
|
}
|
|
2719
|
-
if (detected
|
|
2720
|
-
|
|
2721
|
-
|
|
2722
|
-
|
|
2723
|
-
|
|
2724
|
-
}
|
|
2725
|
-
logger.log({ level: "info", event: "done", message: "AUTO: Tesseract single provider \uC120\uD0DD" });
|
|
2726
|
-
return createTesseractProvider2();
|
|
2642
|
+
if (detected !== "codex") {
|
|
2643
|
+
warnings?.push({
|
|
2644
|
+
message: `OCR: '${detected}' \uC0AC\uC6A9 \uC911 (codex CLI\uAC00 \uC5C6\uC5B4 fallback). \uB354 \uB098\uC740 \uD488\uC9C8\uC744 \uC704\uD574 codex CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4.`,
|
|
2645
|
+
code: "OCR_CLI_FALLBACK"
|
|
2646
|
+
});
|
|
2727
2647
|
}
|
|
2728
2648
|
if (detected === "gemini" || detected === "codex" || detected === "claude") {
|
|
2729
2649
|
const { createBatchCliProvider: createBatchCliProvider2, DEFAULT_BATCH_SIZES: DEFAULT_BATCH_SIZES2 } = await Promise.resolve().then(() => (init_batch_provider(), batch_provider_exports));
|
|
@@ -3115,7 +3035,7 @@ import JSZip2 from "jszip";
|
|
|
3115
3035
|
import { DOMParser } from "@xmldom/xmldom";
|
|
3116
3036
|
|
|
3117
3037
|
// src/utils.ts
|
|
3118
|
-
var VERSION = true ? "2.4.
|
|
3038
|
+
var VERSION = true ? "2.4.19" : "0.0.0-dev";
|
|
3119
3039
|
function toArrayBuffer(buf) {
|
|
3120
3040
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
3121
3041
|
return buf.buffer;
|
|
@@ -8711,7 +8631,6 @@ async function parseDocxDocument(buffer, options, existingZip) {
|
|
|
8711
8631
|
|
|
8712
8632
|
// src/index.ts
|
|
8713
8633
|
init_cli_provider();
|
|
8714
|
-
init_tesseract_provider();
|
|
8715
8634
|
init_markdown_to_blocks();
|
|
8716
8635
|
init_logger();
|
|
8717
8636
|
|
|
@@ -11346,17 +11265,39 @@ var DEFAULT_STAGE_WEIGHTS = {
|
|
|
11346
11265
|
render: 20,
|
|
11347
11266
|
probe: 5,
|
|
11348
11267
|
ocr: 45,
|
|
11349
|
-
proofread:
|
|
11268
|
+
proofread: 0,
|
|
11350
11269
|
merge: 5
|
|
11351
11270
|
};
|
|
11352
|
-
var OCR_PROMPT2 =
|
|
11353
|
-
|
|
11354
|
-
"
|
|
11355
|
-
"
|
|
11356
|
-
"-
|
|
11357
|
-
"-
|
|
11358
|
-
"-
|
|
11359
|
-
"-
|
|
11271
|
+
var OCR_PROMPT2 = [
|
|
11272
|
+
"\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD45C\uB97C \uCD94\uCD9C\uD558\uC5EC Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uACE0, OCR \uC624\uC778\uC2DD \uC624\uB958\uB97C \uC989\uC2DC \uAD50\uC815\uD558\uC5EC \uCD5C\uC885 \uACB0\uACFC\uBB3C\uC744 \uCD9C\uB825\uD558\uC138\uC694.",
|
|
11273
|
+
"",
|
|
11274
|
+
"\uCD94\uCD9C \uADDC\uCE59:",
|
|
11275
|
+
"- \uD14D\uC2A4\uD2B8, \uD45C, \uC81C\uBAA9, \uB9AC\uC2A4\uD2B8\uB97C \uC6D0\uBB38 \uAD6C\uC870 \uADF8\uB300\uB85C Markdown\uC73C\uB85C \uBCC0\uD658",
|
|
11276
|
+
"- \uD45C\uB294 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)",
|
|
11277
|
+
"- \uD5E4\uB529\uC740 \uC2DC\uAC01\uC801 \uD06C\uAE30\uC5D0 \uB530\uB77C # ~ ###### \uC0AC\uC6A9",
|
|
11278
|
+
"- \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9",
|
|
11279
|
+
"- \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC",
|
|
11280
|
+
"- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0",
|
|
11281
|
+
"",
|
|
11282
|
+
"\uC808\uB300 \uAE08\uC9C0 \uC0AC\uD56D:",
|
|
11283
|
+
"- \uBB38\uC7A5\xB7\uB2E8\uB77D\xB7\uD56D\uBAA9\uC744 \uCD94\uAC00\uD558\uAC70\uB098 \uC0AD\uC81C\uD558\uC9C0 \uB9D0 \uAC83",
|
|
11284
|
+
"- \uC22B\uC790, \uD37C\uC13C\uD2B8, \uB0A0\uC9DC, \uB2E8\uC704, \uAE08\uC561\uC744 \uC808\uB300 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83",
|
|
11285
|
+
"- \uACE0\uC720\uBA85\uC0AC, \uAE30\uAD00\uBA85, \uBC95\uB839\uBA85, \uC9C0\uBA85\uC744 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83",
|
|
11286
|
+
"- \uD45C\uC758 \uC81C\uBAA9\uC744 \uBCC0\uACBD \uB610\uB294 \uC0AD\uC81C\uD558\uC9C0 \uB9D0 \uAC83",
|
|
11287
|
+
"- \uD45C\uC758 \uD589\xB7\uC5F4 \uC218, \uC140 \uB0B4\uC6A9, \uD5E4\uB354\uB97C \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83",
|
|
11288
|
+
"- \uC81C\uBAA9 \uC218\uC900(#, ##, ### \uB4F1)\uC744 \uC784\uC758\uB85C \uBC14\uAFB8\uC9C0 \uB9D0 \uAC83",
|
|
11289
|
+
"- \uC6D0\uBB38\uC5D0 \uC5C6\uB294 \uB0B4\uC6A9\uC744 \uC694\uC57D\xB7\uBCF4\uC644\xB7\uCD94\uB860\uD558\uC9C0 \uB9D0 \uAC83",
|
|
11290
|
+
"- ` ``` `\uB85C \uAC10\uC2F8\uAC70\uB098 \uC124\uBA85 \uD14D\uC2A4\uD2B8\uB97C \uCD94\uAC00\uD558\uC9C0 \uB9D0 \uAC83",
|
|
11291
|
+
"",
|
|
11292
|
+
"\uD5C8\uC6A9\uB418\uB294 \uAD50\uC815 \uBC94\uC704 (OCR \uC624\uC778\uC2DD \uC218\uC815):",
|
|
11293
|
+
"- \uBA85\uBC31\uD55C \uAE00\uC790 \uC624\uC778\uC2DD \uC218\uC815 (\uC608: '0' \u2192 'O', 'l' \u2192 '1' \uB4F1 \uB9E5\uB77D\uC0C1 \uBA85\uD655\uD55C \uACBD\uC6B0\uB9CC)",
|
|
11294
|
+
"- \uB2E8\uC5B4 \uC911\uAC04\uC5D0 \uC798\uBABB \uC0BD\uC785\uB41C \uACF5\uBC31 \uC81C\uAC70",
|
|
11295
|
+
"- \uC904\uBC14\uAFC8 \uC624\uB958\uB85C \uBD84\uB9AC\uB41C \uBB38\uC7A5 \uBCD1\uD569 (\uC758\uBBF8 \uB2E8\uC704 \uAE30\uC900)",
|
|
11296
|
+
"- Markdown \uBB38\uBC95 \uC624\uB958 \uC218\uC815 (\uD45C \uAD6C\uBD84\uC120 \uB204\uB77D, \uB9AC\uC2A4\uD2B8 \uB4E4\uC5EC\uC4F0\uAE30 \uB4F1)",
|
|
11297
|
+
"",
|
|
11298
|
+
"\uCD9C\uB825 \uADDC\uCE59:",
|
|
11299
|
+
"- \uBCC0\uD658\uB41C Markdown \uBCF8\uBB38\uB9CC \uCD9C\uB825\uD560 \uAC83 (\uC124\uBA85, \uC8FC\uC11D, \uBA54\uD0C0 \uD14D\uC2A4\uD2B8 \uC5C6\uC774)",
|
|
11300
|
+
"- \uD655\uC2E4\uD558\uC9C0 \uC54A\uC73C\uBA74 \uC6D0\uBB38\uC744 \uADF8\uB300\uB85C \uC720\uC9C0\uD560 \uAC83"
|
|
11360
11301
|
].join("\n");
|
|
11361
11302
|
function elapsedMs(startAt) {
|
|
11362
11303
|
return Math.round(performance.now() - startAt);
|
|
@@ -11367,7 +11308,6 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11367
11308
|
const workspaceDir = resolve3(options.workspaceDir ?? join4(dirname3(absInput), `${stem}_ocr_workspace`));
|
|
11368
11309
|
const imagesDir = join4(workspaceDir, "images");
|
|
11369
11310
|
const rawDir = join4(workspaceDir, "ocr", "raw");
|
|
11370
|
-
const proofDir = join4(workspaceDir, "ocr", "proofread");
|
|
11371
11311
|
const diffDir = join4(workspaceDir, "ocr", "diff");
|
|
11372
11312
|
const outputPath = resolve3(options.outputPath ?? join4(dirname3(absInput), `${stem}.md`));
|
|
11373
11313
|
const reportPath = join4(workspaceDir, "run-report.json");
|
|
@@ -11387,7 +11327,6 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11387
11327
|
const logger = (options.logger ?? createLoggerFromEnv()).withRun(runId).child({ component: "pipeline/unified-ocr.ts" });
|
|
11388
11328
|
await mkdir(imagesDir, { recursive: true });
|
|
11389
11329
|
await mkdir(rawDir, { recursive: true });
|
|
11390
|
-
await mkdir(proofDir, { recursive: true });
|
|
11391
11330
|
await mkdir(diffDir, { recursive: true });
|
|
11392
11331
|
const timingsMs = {};
|
|
11393
11332
|
const markStageStart = (stage, message) => emitProgress(options.onEvent, stage, 0, stageWeights, { message, type: "stage_start" });
|
|
@@ -11502,50 +11441,11 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11502
11441
|
timingsMs.ocr = elapsedMs(ocrStart);
|
|
11503
11442
|
markStageDone("ocr", "OCR \uC644\uB8CC");
|
|
11504
11443
|
logStage("info", "ocr", "done", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { elapsedMs: timingsMs.ocr });
|
|
11505
|
-
const proofStart = performance.now();
|
|
11506
|
-
currentStage = "proofread";
|
|
11507
|
-
markStageStart("proofread", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC9C4\uD589 \uC911");
|
|
11508
|
-
logStage("info", "proofread", "start", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC2DC\uC791", { pages: rawPagePaths.length });
|
|
11509
|
-
const proofedPaths = [];
|
|
11510
|
-
for (let i = 0; i < rawPagePaths.length; i++) {
|
|
11511
|
-
const rawMd = await readFile(rawPagePaths[i], "utf-8");
|
|
11512
|
-
const prompt = `${PROOFREAD_PROMPT}
|
|
11513
|
-
|
|
11514
|
-
---
|
|
11515
|
-
${rawMd}
|
|
11516
|
-
---`;
|
|
11517
|
-
const corrected = await ocrImageViaNim({
|
|
11518
|
-
textOnlyPrompt: prompt,
|
|
11519
|
-
model: selectedModel,
|
|
11520
|
-
maxTokens: modelMaxTokens[selectedModel] ?? 8192,
|
|
11521
|
-
baseUrl,
|
|
11522
|
-
keyPool,
|
|
11523
|
-
timeoutMs,
|
|
11524
|
-
maxRetries: maxRetriesPerPage,
|
|
11525
|
-
logger,
|
|
11526
|
-
stage: "proofread"
|
|
11527
|
-
});
|
|
11528
|
-
const safeCorrected = preserveNumericIntegrity(rawMd, corrected);
|
|
11529
|
-
const taggedCorrected = addUncertainTag(rawMd, safeCorrected);
|
|
11530
|
-
const pagePath = join4(proofDir, `page_${String(i + 1).padStart(4, "0")}.md`);
|
|
11531
|
-
await writeFile(pagePath, taggedCorrected, "utf-8");
|
|
11532
|
-
await writeFile(
|
|
11533
|
-
join4(diffDir, `page_${String(i + 1).padStart(4, "0")}.json`),
|
|
11534
|
-
JSON.stringify(buildDiffSummary(rawMd, taggedCorrected), null, 2),
|
|
11535
|
-
"utf-8"
|
|
11536
|
-
);
|
|
11537
|
-
proofedPaths.push(pagePath);
|
|
11538
|
-
markStageProgress("proofread", Math.round((i + 1) / rawPagePaths.length * 100), i + 1, rawPagePaths.length, `\uAD50\uC815 ${i + 1}/${rawPagePaths.length}`);
|
|
11539
|
-
logStage("debug", "proofread", "progress", "\uD398\uC774\uC9C0 \uAD50\uC815 \uC644\uB8CC", { page: i + 1, total: rawPagePaths.length });
|
|
11540
|
-
}
|
|
11541
|
-
timingsMs.proofread = elapsedMs(proofStart);
|
|
11542
|
-
markStageDone("proofread", "\uAD50\uC815 \uC644\uB8CC");
|
|
11543
|
-
logStage("info", "proofread", "done", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC644\uB8CC", { elapsedMs: timingsMs.proofread });
|
|
11544
11444
|
const mergeStart = performance.now();
|
|
11545
11445
|
currentStage = "merge";
|
|
11546
11446
|
markStageStart("merge", "\uCD5C\uC885 Markdown \uBCD1\uD569 \uC911");
|
|
11547
|
-
logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages:
|
|
11548
|
-
const merged = await mergeMarkdownPages(
|
|
11447
|
+
logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages: rawPagePaths.length });
|
|
11448
|
+
const merged = await mergeMarkdownPages(rawPagePaths);
|
|
11549
11449
|
await writeFile(outputPath, merged, "utf-8");
|
|
11550
11450
|
timingsMs.merge = elapsedMs(mergeStart);
|
|
11551
11451
|
markStageDone("merge", "\uBCD1\uD569 \uC644\uB8CC");
|
|
@@ -12004,40 +11904,6 @@ function ensureSupportedInput(path) {
|
|
|
12004
11904
|
throw new UnifiedOcrError("UNSUPPORTED_INPUT", "convert", `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uC785\uB825 \uD3EC\uB9F7: ${ext}`);
|
|
12005
11905
|
}
|
|
12006
11906
|
}
|
|
12007
|
-
function extractNumericTokens(text) {
|
|
12008
|
-
return text.match(/\d[\d,./-]*/g) ?? [];
|
|
12009
|
-
}
|
|
12010
|
-
function preserveNumericIntegrity(rawText, correctedText) {
|
|
12011
|
-
const rawTokens = extractNumericTokens(rawText);
|
|
12012
|
-
const correctedTokens = extractNumericTokens(correctedText);
|
|
12013
|
-
if (rawTokens.length !== correctedTokens.length) return rawText;
|
|
12014
|
-
for (let i = 0; i < rawTokens.length; i++) {
|
|
12015
|
-
if (rawTokens[i] !== correctedTokens[i]) return rawText;
|
|
12016
|
-
}
|
|
12017
|
-
return correctedText;
|
|
12018
|
-
}
|
|
12019
|
-
function addUncertainTag(rawText, correctedText) {
|
|
12020
|
-
if (correctedText.includes("[\uD655\uC778\uD544\uC694:")) return correctedText;
|
|
12021
|
-
const rawLen = rawText.trim().length;
|
|
12022
|
-
const corrLen = correctedText.trim().length;
|
|
12023
|
-
if (rawLen === 0 || corrLen === 0) return correctedText;
|
|
12024
|
-
const rawLines = rawText.split("\n").filter(Boolean).length;
|
|
12025
|
-
const corrLines = correctedText.split("\n").filter(Boolean).length;
|
|
12026
|
-
const rawTableLines = rawText.split("\n").filter((l) => l.includes("|")).length;
|
|
12027
|
-
const corrTableLines = correctedText.split("\n").filter((l) => l.includes("|")).length;
|
|
12028
|
-
const suspicious = corrLen < rawLen * 0.75 || corrLines < Math.max(1, Math.floor(rawLines * 0.8)) || rawTableLines >= 2 && corrTableLines < Math.floor(rawTableLines * 0.7);
|
|
12029
|
-
if (!suspicious) return correctedText;
|
|
12030
|
-
return `${correctedText}
|
|
12031
|
-
|
|
12032
|
-
[\uD655\uC778\uD544\uC694: \uAD50\uC815 \uACB0\uACFC\uAC00 \uCD95\uC57D\uB418\uC5C8\uC744 \uC218 \uC788\uC5B4 \uC6D0\uBB38\uACFC \uB300\uC870\uAC00 \uD544\uC694\uD569\uB2C8\uB2E4.]`;
|
|
12033
|
-
}
|
|
12034
|
-
function buildDiffSummary(before, after) {
|
|
12035
|
-
return {
|
|
12036
|
-
changed: before !== after,
|
|
12037
|
-
beforeLength: before.length,
|
|
12038
|
-
afterLength: after.length
|
|
12039
|
-
};
|
|
12040
|
-
}
|
|
12041
11907
|
function normalizePipelineError(err, stage) {
|
|
12042
11908
|
if (err instanceof UnifiedOcrError) return err;
|
|
12043
11909
|
const message = err instanceof Error ? err.message : String(err);
|
|
@@ -12123,9 +11989,6 @@ async function parseImage(buffer, options) {
|
|
|
12123
11989
|
if (ocrMode === "gemini" || ocrMode === "claude" || ocrMode === "codex" || ocrMode === "ollama") {
|
|
12124
11990
|
ocrProvider = createCliOcrProvider(ocrMode);
|
|
12125
11991
|
actualOcrMode = ocrMode;
|
|
12126
|
-
} else if (ocrMode === "tesseract") {
|
|
12127
|
-
ocrProvider = await createTesseractProvider();
|
|
12128
|
-
actualOcrMode = ocrMode;
|
|
12129
11992
|
} else if (ocrMode === "auto") {
|
|
12130
11993
|
const modesToTry = ["gemini", "claude", "codex", "ollama"];
|
|
12131
11994
|
for (const mode of modesToTry) {
|
|
@@ -12137,10 +12000,6 @@ async function parseImage(buffer, options) {
|
|
|
12137
12000
|
console.warn(`[kordoc] OCR auto-detection: ${mode} CLI not available or failed. Trying next.`, e);
|
|
12138
12001
|
}
|
|
12139
12002
|
}
|
|
12140
|
-
if (!ocrProvider) {
|
|
12141
|
-
ocrProvider = await createTesseractProvider();
|
|
12142
|
-
actualOcrMode = "tesseract";
|
|
12143
|
-
}
|
|
12144
12003
|
}
|
|
12145
12004
|
if (!ocrProvider) {
|
|
12146
12005
|
return { success: false, fileType: "image", error: "\uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR \uD504\uB85C\uBC14\uC774\uB354\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "PARSE_ERROR" };
|