@clazic/kordoc 2.4.17 → 2.4.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{auto-detect-2YGFYQCN.js → auto-detect-CBYICI6B.js} +4 -4
- package/dist/{chunk-WM3XI23V.js → chunk-463YQ2WL.js} +38 -25
- package/dist/chunk-463YQ2WL.js.map +1 -0
- package/dist/{chunk-7NOZFYH6.js → chunk-CLK4PNZ7.js} +7 -8
- package/dist/chunk-CLK4PNZ7.js.map +1 -0
- package/dist/{chunk-W2KDIKDF.js → chunk-MZN7PLTZ.js} +2 -2
- package/dist/{chunk-34WIGIQC.js → chunk-Y4WFKJ5P.js} +1 -1
- package/dist/chunk-Y4WFKJ5P.js.map +1 -0
- package/dist/cli.js +9 -13
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +49 -191
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +5 -6
- package/dist/index.d.ts +5 -6
- package/dist/index.js +49 -190
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +5 -6
- package/dist/mcp.js.map +1 -1
- package/dist/{resolve-673XFZQ6.js → resolve-XWYJYKKH.js} +15 -36
- package/dist/resolve-XWYJYKKH.js.map +1 -0
- package/dist/{utils-DHOODYKU.js → utils-YUAT7LFD.js} +2 -2
- package/dist/{watch-RM4VNOL4.js → watch-WEOFVVDO.js} +5 -6
- package/dist/{watch-RM4VNOL4.js.map → watch-WEOFVVDO.js.map} +1 -1
- package/package.json +1 -2
- package/dist/chunk-34WIGIQC.js.map +0 -1
- package/dist/chunk-7FMKAV4P.js +0 -56
- package/dist/chunk-7FMKAV4P.js.map +0 -1
- package/dist/chunk-7NOZFYH6.js.map +0 -1
- package/dist/chunk-WM3XI23V.js.map +0 -1
- package/dist/resolve-673XFZQ6.js.map +0 -1
- package/dist/tesseract-provider-MNMZPSGF.js +0 -11
- package/dist/utils-DHOODYKU.js.map +0 -1
- /package/dist/{auto-detect-2YGFYQCN.js.map → auto-detect-CBYICI6B.js.map} +0 -0
- /package/dist/{chunk-W2KDIKDF.js.map → chunk-MZN7PLTZ.js.map} +0 -0
- /package/dist/{tesseract-provider-MNMZPSGF.js.map → utils-YUAT7LFD.js.map} +0 -0
package/dist/index.cjs
CHANGED
|
@@ -2179,14 +2179,14 @@ var auto_detect_exports = {};
|
|
|
2179
2179
|
__export(auto_detect_exports, {
|
|
2180
2180
|
detectAvailableOcr: () => detectAvailableOcr,
|
|
2181
2181
|
getAutoFallbackChain: () => getAutoFallbackChain,
|
|
2182
|
-
|
|
2182
|
+
getNoCliMessage: () => getNoCliMessage,
|
|
2183
2183
|
validateOcrMode: () => validateOcrMode
|
|
2184
2184
|
});
|
|
2185
2185
|
function detectAvailableOcr() {
|
|
2186
2186
|
for (const cli of CLI_PRIORITY) {
|
|
2187
2187
|
if (isCliInstalled(cli)) return cli;
|
|
2188
2188
|
}
|
|
2189
|
-
return
|
|
2189
|
+
return null;
|
|
2190
2190
|
}
|
|
2191
2191
|
function isCliInstalled(name) {
|
|
2192
2192
|
try {
|
|
@@ -2202,11 +2202,10 @@ function getAutoFallbackChain() {
|
|
|
2202
2202
|
for (const cli of CLI_PRIORITY) {
|
|
2203
2203
|
if (isCliInstalled(cli)) chain.push(cli);
|
|
2204
2204
|
}
|
|
2205
|
-
chain.push("tesseract");
|
|
2206
2205
|
return chain;
|
|
2207
2206
|
}
|
|
2208
2207
|
function validateOcrMode(mode) {
|
|
2209
|
-
if (mode === "auto" || mode === "off"
|
|
2208
|
+
if (mode === "auto" || mode === "off") return;
|
|
2210
2209
|
if (!isCliInstalled(mode)) {
|
|
2211
2210
|
throw new Error(`'${mode}' CLI\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4.
|
|
2212
2211
|
${getInstallGuide(mode)}`);
|
|
@@ -2221,10 +2220,10 @@ function getInstallGuide(mode) {
|
|
|
2221
2220
|
};
|
|
2222
2221
|
return guides[mode] || `'${mode}'\uC744(\uB97C) \uC124\uCE58\uD574\uC8FC\uC138\uC694.`;
|
|
2223
2222
|
}
|
|
2224
|
-
function
|
|
2223
|
+
function getNoCliMessage() {
|
|
2225
2224
|
return [
|
|
2226
|
-
"\uC124\uCE58\uB41C AI CLI\uAC00 \uC5C6\uC5B4
|
|
2227
|
-
"\
|
|
2225
|
+
"\uC124\uCE58\uB41C AI CLI\uAC00 \uC5C6\uC5B4 OCR\uC744 \uC218\uD589\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.",
|
|
2226
|
+
"\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF \uCC98\uB9AC\uB97C \uC704\uD574 AI CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4:",
|
|
2228
2227
|
"",
|
|
2229
2228
|
" [\uAD8C\uC7A5] Codex CLI: npm install -g @openai/codex",
|
|
2230
2229
|
" Gemini CLI: https://ai.google.dev/gemini-api/docs/cli",
|
|
@@ -2408,66 +2407,6 @@ var init_cli_provider = __esm({
|
|
|
2408
2407
|
}
|
|
2409
2408
|
});
|
|
2410
2409
|
|
|
2411
|
-
// src/ocr/tesseract-provider.ts
|
|
2412
|
-
var tesseract_provider_exports = {};
|
|
2413
|
-
__export(tesseract_provider_exports, {
|
|
2414
|
-
createTesseractPoolProvider: () => createTesseractPoolProvider,
|
|
2415
|
-
createTesseractProvider: () => createTesseractProvider
|
|
2416
|
-
});
|
|
2417
|
-
async function createTesseractProvider() {
|
|
2418
|
-
const worker = await (0, import_tesseract.createWorker)("kor+eng");
|
|
2419
|
-
let terminated = false;
|
|
2420
|
-
const provider = async (pageImage, _pageNumber, _mimeType) => {
|
|
2421
|
-
const { data } = await worker.recognize(pageImage);
|
|
2422
|
-
return data.text;
|
|
2423
|
-
};
|
|
2424
|
-
provider.terminate = async () => {
|
|
2425
|
-
if (!terminated) {
|
|
2426
|
-
await worker.terminate();
|
|
2427
|
-
terminated = true;
|
|
2428
|
-
}
|
|
2429
|
-
};
|
|
2430
|
-
return provider;
|
|
2431
|
-
}
|
|
2432
|
-
async function createTesseractPoolProvider(concurrency) {
|
|
2433
|
-
const workers = await Promise.all(
|
|
2434
|
-
Array.from({ length: concurrency }, () => (0, import_tesseract.createWorker)("kor+eng"))
|
|
2435
|
-
);
|
|
2436
|
-
const idle = [...workers];
|
|
2437
|
-
const waitQueue = [];
|
|
2438
|
-
function acquire() {
|
|
2439
|
-
if (idle.length > 0) return Promise.resolve(idle.pop());
|
|
2440
|
-
return new Promise((resolve4) => waitQueue.push(resolve4));
|
|
2441
|
-
}
|
|
2442
|
-
function release(w) {
|
|
2443
|
-
if (waitQueue.length > 0) {
|
|
2444
|
-
waitQueue.shift()(w);
|
|
2445
|
-
} else {
|
|
2446
|
-
idle.push(w);
|
|
2447
|
-
}
|
|
2448
|
-
}
|
|
2449
|
-
const provider = async (pageImage, _pageNumber, _mimeType) => {
|
|
2450
|
-
const w = await acquire();
|
|
2451
|
-
try {
|
|
2452
|
-
const { data } = await w.recognize(pageImage);
|
|
2453
|
-
return data.text;
|
|
2454
|
-
} finally {
|
|
2455
|
-
release(w);
|
|
2456
|
-
}
|
|
2457
|
-
};
|
|
2458
|
-
provider.terminate = async () => {
|
|
2459
|
-
await Promise.all(workers.map((w) => w.terminate()));
|
|
2460
|
-
};
|
|
2461
|
-
return provider;
|
|
2462
|
-
}
|
|
2463
|
-
var import_tesseract;
|
|
2464
|
-
var init_tesseract_provider = __esm({
|
|
2465
|
-
"src/ocr/tesseract-provider.ts"() {
|
|
2466
|
-
"use strict";
|
|
2467
|
-
import_tesseract = require("tesseract.js");
|
|
2468
|
-
}
|
|
2469
|
-
});
|
|
2470
|
-
|
|
2471
2410
|
// src/ocr/batch-provider.ts
|
|
2472
2411
|
var batch_provider_exports = {};
|
|
2473
2412
|
__export(batch_provider_exports, {
|
|
@@ -2676,15 +2615,6 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
|
|
|
2676
2615
|
}
|
|
2677
2616
|
if (mode !== "auto") {
|
|
2678
2617
|
validateOcrMode(mode);
|
|
2679
|
-
if (mode === "tesseract") {
|
|
2680
|
-
const { createTesseractProvider: createTesseractProvider2, createTesseractPoolProvider: createTesseractPoolProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
|
|
2681
|
-
if (concurrency && concurrency > 1) {
|
|
2682
|
-
logger.log({ level: "info", event: "done", message: "Tesseract pool provider \uC120\uD0DD", meta: { concurrency } });
|
|
2683
|
-
return createTesseractPoolProvider2(concurrency);
|
|
2684
|
-
}
|
|
2685
|
-
logger.log({ level: "info", event: "done", message: "Tesseract single provider \uC120\uD0DD" });
|
|
2686
|
-
return createTesseractProvider2();
|
|
2687
|
-
}
|
|
2688
2618
|
if (mode === "gemini" || mode === "claude" || mode === "codex") {
|
|
2689
2619
|
const { createBatchCliProvider: createBatchCliProvider2, DEFAULT_BATCH_SIZES: DEFAULT_BATCH_SIZES2 } = await Promise.resolve().then(() => (init_batch_provider(), batch_provider_exports));
|
|
2690
2620
|
const effectiveBatch = batchSize ?? DEFAULT_BATCH_SIZES2[mode];
|
|
@@ -2700,27 +2630,16 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
|
|
|
2700
2630
|
}
|
|
2701
2631
|
const detected = detectAvailableOcr();
|
|
2702
2632
|
logger.log({ level: "info", event: "progress", message: "OCR auto \uAC10\uC9C0 \uACB0\uACFC", meta: { detected } });
|
|
2703
|
-
if (detected
|
|
2704
|
-
|
|
2705
|
-
|
|
2706
|
-
|
|
2707
|
-
code: "OCR_CLI_FALLBACK"
|
|
2708
|
-
});
|
|
2709
|
-
} else {
|
|
2710
|
-
warnings?.push({
|
|
2711
|
-
message: `OCR: '${detected}' \uC0AC\uC6A9 \uC911 (codex CLI\uAC00 \uC5C6\uC5B4 fallback). \uB354 \uB098\uC740 \uD488\uC9C8\uC744 \uC704\uD574 codex CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4.`,
|
|
2712
|
-
code: "OCR_CLI_FALLBACK"
|
|
2713
|
-
});
|
|
2714
|
-
}
|
|
2633
|
+
if (!detected) {
|
|
2634
|
+
throw new Error(
|
|
2635
|
+
"\uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR CLI\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4. \uB2E4\uC74C \uC911 \uD558\uB098\uB97C \uC124\uCE58\uD558\uC138\uC694:\n Codex CLI: npm install -g @openai/codex\n Claude CLI: npm install -g @anthropic-ai/claude-code\n Gemini CLI: https://ai.google.dev/gemini-api/docs/cli"
|
|
2636
|
+
);
|
|
2715
2637
|
}
|
|
2716
|
-
if (detected
|
|
2717
|
-
|
|
2718
|
-
|
|
2719
|
-
|
|
2720
|
-
|
|
2721
|
-
}
|
|
2722
|
-
logger.log({ level: "info", event: "done", message: "AUTO: Tesseract single provider \uC120\uD0DD" });
|
|
2723
|
-
return createTesseractProvider2();
|
|
2638
|
+
if (detected !== "codex") {
|
|
2639
|
+
warnings?.push({
|
|
2640
|
+
message: `OCR: '${detected}' \uC0AC\uC6A9 \uC911 (codex CLI\uAC00 \uC5C6\uC5B4 fallback). \uB354 \uB098\uC740 \uD488\uC9C8\uC744 \uC704\uD574 codex CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4.`,
|
|
2641
|
+
code: "OCR_CLI_FALLBACK"
|
|
2642
|
+
});
|
|
2724
2643
|
}
|
|
2725
2644
|
if (detected === "gemini" || detected === "codex" || detected === "claude") {
|
|
2726
2645
|
const { createBatchCliProvider: createBatchCliProvider2, DEFAULT_BATCH_SIZES: DEFAULT_BATCH_SIZES2 } = await Promise.resolve().then(() => (init_batch_provider(), batch_provider_exports));
|
|
@@ -3138,7 +3057,7 @@ var import_jszip2 = __toESM(require("jszip"), 1);
|
|
|
3138
3057
|
var import_xmldom = require("@xmldom/xmldom");
|
|
3139
3058
|
|
|
3140
3059
|
// src/utils.ts
|
|
3141
|
-
var VERSION = true ? "2.4.
|
|
3060
|
+
var VERSION = true ? "2.4.19" : "0.0.0-dev";
|
|
3142
3061
|
function toArrayBuffer(buf) {
|
|
3143
3062
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
3144
3063
|
return buf.buffer;
|
|
@@ -8734,7 +8653,6 @@ async function parseDocxDocument(buffer, options, existingZip) {
|
|
|
8734
8653
|
|
|
8735
8654
|
// src/index.ts
|
|
8736
8655
|
init_cli_provider();
|
|
8737
|
-
init_tesseract_provider();
|
|
8738
8656
|
init_markdown_to_blocks();
|
|
8739
8657
|
init_logger();
|
|
8740
8658
|
|
|
@@ -11369,17 +11287,39 @@ var DEFAULT_STAGE_WEIGHTS = {
|
|
|
11369
11287
|
render: 20,
|
|
11370
11288
|
probe: 5,
|
|
11371
11289
|
ocr: 45,
|
|
11372
|
-
proofread:
|
|
11290
|
+
proofread: 0,
|
|
11373
11291
|
merge: 5
|
|
11374
11292
|
};
|
|
11375
|
-
var OCR_PROMPT2 =
|
|
11376
|
-
|
|
11377
|
-
"
|
|
11378
|
-
"
|
|
11379
|
-
"-
|
|
11380
|
-
"-
|
|
11381
|
-
"-
|
|
11382
|
-
"-
|
|
11293
|
+
var OCR_PROMPT2 = [
|
|
11294
|
+
"\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD45C\uB97C \uCD94\uCD9C\uD558\uC5EC Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uACE0, OCR \uC624\uC778\uC2DD \uC624\uB958\uB97C \uC989\uC2DC \uAD50\uC815\uD558\uC5EC \uCD5C\uC885 \uACB0\uACFC\uBB3C\uC744 \uCD9C\uB825\uD558\uC138\uC694.",
|
|
11295
|
+
"",
|
|
11296
|
+
"\uCD94\uCD9C \uADDC\uCE59:",
|
|
11297
|
+
"- \uD14D\uC2A4\uD2B8, \uD45C, \uC81C\uBAA9, \uB9AC\uC2A4\uD2B8\uB97C \uC6D0\uBB38 \uAD6C\uC870 \uADF8\uB300\uB85C Markdown\uC73C\uB85C \uBCC0\uD658",
|
|
11298
|
+
"- \uD45C\uB294 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)",
|
|
11299
|
+
"- \uD5E4\uB529\uC740 \uC2DC\uAC01\uC801 \uD06C\uAE30\uC5D0 \uB530\uB77C # ~ ###### \uC0AC\uC6A9",
|
|
11300
|
+
"- \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9",
|
|
11301
|
+
"- \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC",
|
|
11302
|
+
"- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0",
|
|
11303
|
+
"",
|
|
11304
|
+
"\uC808\uB300 \uAE08\uC9C0 \uC0AC\uD56D:",
|
|
11305
|
+
"- \uBB38\uC7A5\xB7\uB2E8\uB77D\xB7\uD56D\uBAA9\uC744 \uCD94\uAC00\uD558\uAC70\uB098 \uC0AD\uC81C\uD558\uC9C0 \uB9D0 \uAC83",
|
|
11306
|
+
"- \uC22B\uC790, \uD37C\uC13C\uD2B8, \uB0A0\uC9DC, \uB2E8\uC704, \uAE08\uC561\uC744 \uC808\uB300 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83",
|
|
11307
|
+
"- \uACE0\uC720\uBA85\uC0AC, \uAE30\uAD00\uBA85, \uBC95\uB839\uBA85, \uC9C0\uBA85\uC744 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83",
|
|
11308
|
+
"- \uD45C\uC758 \uC81C\uBAA9\uC744 \uBCC0\uACBD \uB610\uB294 \uC0AD\uC81C\uD558\uC9C0 \uB9D0 \uAC83",
|
|
11309
|
+
"- \uD45C\uC758 \uD589\xB7\uC5F4 \uC218, \uC140 \uB0B4\uC6A9, \uD5E4\uB354\uB97C \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83",
|
|
11310
|
+
"- \uC81C\uBAA9 \uC218\uC900(#, ##, ### \uB4F1)\uC744 \uC784\uC758\uB85C \uBC14\uAFB8\uC9C0 \uB9D0 \uAC83",
|
|
11311
|
+
"- \uC6D0\uBB38\uC5D0 \uC5C6\uB294 \uB0B4\uC6A9\uC744 \uC694\uC57D\xB7\uBCF4\uC644\xB7\uCD94\uB860\uD558\uC9C0 \uB9D0 \uAC83",
|
|
11312
|
+
"- ` ``` `\uB85C \uAC10\uC2F8\uAC70\uB098 \uC124\uBA85 \uD14D\uC2A4\uD2B8\uB97C \uCD94\uAC00\uD558\uC9C0 \uB9D0 \uAC83",
|
|
11313
|
+
"",
|
|
11314
|
+
"\uD5C8\uC6A9\uB418\uB294 \uAD50\uC815 \uBC94\uC704 (OCR \uC624\uC778\uC2DD \uC218\uC815):",
|
|
11315
|
+
"- \uBA85\uBC31\uD55C \uAE00\uC790 \uC624\uC778\uC2DD \uC218\uC815 (\uC608: '0' \u2192 'O', 'l' \u2192 '1' \uB4F1 \uB9E5\uB77D\uC0C1 \uBA85\uD655\uD55C \uACBD\uC6B0\uB9CC)",
|
|
11316
|
+
"- \uB2E8\uC5B4 \uC911\uAC04\uC5D0 \uC798\uBABB \uC0BD\uC785\uB41C \uACF5\uBC31 \uC81C\uAC70",
|
|
11317
|
+
"- \uC904\uBC14\uAFC8 \uC624\uB958\uB85C \uBD84\uB9AC\uB41C \uBB38\uC7A5 \uBCD1\uD569 (\uC758\uBBF8 \uB2E8\uC704 \uAE30\uC900)",
|
|
11318
|
+
"- Markdown \uBB38\uBC95 \uC624\uB958 \uC218\uC815 (\uD45C \uAD6C\uBD84\uC120 \uB204\uB77D, \uB9AC\uC2A4\uD2B8 \uB4E4\uC5EC\uC4F0\uAE30 \uB4F1)",
|
|
11319
|
+
"",
|
|
11320
|
+
"\uCD9C\uB825 \uADDC\uCE59:",
|
|
11321
|
+
"- \uBCC0\uD658\uB41C Markdown \uBCF8\uBB38\uB9CC \uCD9C\uB825\uD560 \uAC83 (\uC124\uBA85, \uC8FC\uC11D, \uBA54\uD0C0 \uD14D\uC2A4\uD2B8 \uC5C6\uC774)",
|
|
11322
|
+
"- \uD655\uC2E4\uD558\uC9C0 \uC54A\uC73C\uBA74 \uC6D0\uBB38\uC744 \uADF8\uB300\uB85C \uC720\uC9C0\uD560 \uAC83"
|
|
11383
11323
|
].join("\n");
|
|
11384
11324
|
function elapsedMs(startAt) {
|
|
11385
11325
|
return Math.round(import_node_perf_hooks.performance.now() - startAt);
|
|
@@ -11390,7 +11330,6 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11390
11330
|
const workspaceDir = (0, import_path5.resolve)(options.workspaceDir ?? (0, import_path5.join)((0, import_path5.dirname)(absInput), `${stem}_ocr_workspace`));
|
|
11391
11331
|
const imagesDir = (0, import_path5.join)(workspaceDir, "images");
|
|
11392
11332
|
const rawDir = (0, import_path5.join)(workspaceDir, "ocr", "raw");
|
|
11393
|
-
const proofDir = (0, import_path5.join)(workspaceDir, "ocr", "proofread");
|
|
11394
11333
|
const diffDir = (0, import_path5.join)(workspaceDir, "ocr", "diff");
|
|
11395
11334
|
const outputPath = (0, import_path5.resolve)(options.outputPath ?? (0, import_path5.join)((0, import_path5.dirname)(absInput), `${stem}.md`));
|
|
11396
11335
|
const reportPath = (0, import_path5.join)(workspaceDir, "run-report.json");
|
|
@@ -11410,7 +11349,6 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11410
11349
|
const logger = (options.logger ?? createLoggerFromEnv()).withRun(runId).child({ component: "pipeline/unified-ocr.ts" });
|
|
11411
11350
|
await (0, import_promises2.mkdir)(imagesDir, { recursive: true });
|
|
11412
11351
|
await (0, import_promises2.mkdir)(rawDir, { recursive: true });
|
|
11413
|
-
await (0, import_promises2.mkdir)(proofDir, { recursive: true });
|
|
11414
11352
|
await (0, import_promises2.mkdir)(diffDir, { recursive: true });
|
|
11415
11353
|
const timingsMs = {};
|
|
11416
11354
|
const markStageStart = (stage, message) => emitProgress(options.onEvent, stage, 0, stageWeights, { message, type: "stage_start" });
|
|
@@ -11525,50 +11463,11 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11525
11463
|
timingsMs.ocr = elapsedMs(ocrStart);
|
|
11526
11464
|
markStageDone("ocr", "OCR \uC644\uB8CC");
|
|
11527
11465
|
logStage("info", "ocr", "done", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { elapsedMs: timingsMs.ocr });
|
|
11528
|
-
const proofStart = import_node_perf_hooks.performance.now();
|
|
11529
|
-
currentStage = "proofread";
|
|
11530
|
-
markStageStart("proofread", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC9C4\uD589 \uC911");
|
|
11531
|
-
logStage("info", "proofread", "start", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC2DC\uC791", { pages: rawPagePaths.length });
|
|
11532
|
-
const proofedPaths = [];
|
|
11533
|
-
for (let i = 0; i < rawPagePaths.length; i++) {
|
|
11534
|
-
const rawMd = await (0, import_promises2.readFile)(rawPagePaths[i], "utf-8");
|
|
11535
|
-
const prompt = `${PROOFREAD_PROMPT}
|
|
11536
|
-
|
|
11537
|
-
---
|
|
11538
|
-
${rawMd}
|
|
11539
|
-
---`;
|
|
11540
|
-
const corrected = await ocrImageViaNim({
|
|
11541
|
-
textOnlyPrompt: prompt,
|
|
11542
|
-
model: selectedModel,
|
|
11543
|
-
maxTokens: modelMaxTokens[selectedModel] ?? 8192,
|
|
11544
|
-
baseUrl,
|
|
11545
|
-
keyPool,
|
|
11546
|
-
timeoutMs,
|
|
11547
|
-
maxRetries: maxRetriesPerPage,
|
|
11548
|
-
logger,
|
|
11549
|
-
stage: "proofread"
|
|
11550
|
-
});
|
|
11551
|
-
const safeCorrected = preserveNumericIntegrity(rawMd, corrected);
|
|
11552
|
-
const taggedCorrected = addUncertainTag(rawMd, safeCorrected);
|
|
11553
|
-
const pagePath = (0, import_path5.join)(proofDir, `page_${String(i + 1).padStart(4, "0")}.md`);
|
|
11554
|
-
await (0, import_promises2.writeFile)(pagePath, taggedCorrected, "utf-8");
|
|
11555
|
-
await (0, import_promises2.writeFile)(
|
|
11556
|
-
(0, import_path5.join)(diffDir, `page_${String(i + 1).padStart(4, "0")}.json`),
|
|
11557
|
-
JSON.stringify(buildDiffSummary(rawMd, taggedCorrected), null, 2),
|
|
11558
|
-
"utf-8"
|
|
11559
|
-
);
|
|
11560
|
-
proofedPaths.push(pagePath);
|
|
11561
|
-
markStageProgress("proofread", Math.round((i + 1) / rawPagePaths.length * 100), i + 1, rawPagePaths.length, `\uAD50\uC815 ${i + 1}/${rawPagePaths.length}`);
|
|
11562
|
-
logStage("debug", "proofread", "progress", "\uD398\uC774\uC9C0 \uAD50\uC815 \uC644\uB8CC", { page: i + 1, total: rawPagePaths.length });
|
|
11563
|
-
}
|
|
11564
|
-
timingsMs.proofread = elapsedMs(proofStart);
|
|
11565
|
-
markStageDone("proofread", "\uAD50\uC815 \uC644\uB8CC");
|
|
11566
|
-
logStage("info", "proofread", "done", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC644\uB8CC", { elapsedMs: timingsMs.proofread });
|
|
11567
11466
|
const mergeStart = import_node_perf_hooks.performance.now();
|
|
11568
11467
|
currentStage = "merge";
|
|
11569
11468
|
markStageStart("merge", "\uCD5C\uC885 Markdown \uBCD1\uD569 \uC911");
|
|
11570
|
-
logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages:
|
|
11571
|
-
const merged = await mergeMarkdownPages(
|
|
11469
|
+
logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages: rawPagePaths.length });
|
|
11470
|
+
const merged = await mergeMarkdownPages(rawPagePaths);
|
|
11572
11471
|
await (0, import_promises2.writeFile)(outputPath, merged, "utf-8");
|
|
11573
11472
|
timingsMs.merge = elapsedMs(mergeStart);
|
|
11574
11473
|
markStageDone("merge", "\uBCD1\uD569 \uC644\uB8CC");
|
|
@@ -12027,40 +11926,6 @@ function ensureSupportedInput(path) {
|
|
|
12027
11926
|
throw new UnifiedOcrError("UNSUPPORTED_INPUT", "convert", `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uC785\uB825 \uD3EC\uB9F7: ${ext}`);
|
|
12028
11927
|
}
|
|
12029
11928
|
}
|
|
12030
|
-
function extractNumericTokens(text) {
|
|
12031
|
-
return text.match(/\d[\d,./-]*/g) ?? [];
|
|
12032
|
-
}
|
|
12033
|
-
function preserveNumericIntegrity(rawText, correctedText) {
|
|
12034
|
-
const rawTokens = extractNumericTokens(rawText);
|
|
12035
|
-
const correctedTokens = extractNumericTokens(correctedText);
|
|
12036
|
-
if (rawTokens.length !== correctedTokens.length) return rawText;
|
|
12037
|
-
for (let i = 0; i < rawTokens.length; i++) {
|
|
12038
|
-
if (rawTokens[i] !== correctedTokens[i]) return rawText;
|
|
12039
|
-
}
|
|
12040
|
-
return correctedText;
|
|
12041
|
-
}
|
|
12042
|
-
function addUncertainTag(rawText, correctedText) {
|
|
12043
|
-
if (correctedText.includes("[\uD655\uC778\uD544\uC694:")) return correctedText;
|
|
12044
|
-
const rawLen = rawText.trim().length;
|
|
12045
|
-
const corrLen = correctedText.trim().length;
|
|
12046
|
-
if (rawLen === 0 || corrLen === 0) return correctedText;
|
|
12047
|
-
const rawLines = rawText.split("\n").filter(Boolean).length;
|
|
12048
|
-
const corrLines = correctedText.split("\n").filter(Boolean).length;
|
|
12049
|
-
const rawTableLines = rawText.split("\n").filter((l) => l.includes("|")).length;
|
|
12050
|
-
const corrTableLines = correctedText.split("\n").filter((l) => l.includes("|")).length;
|
|
12051
|
-
const suspicious = corrLen < rawLen * 0.75 || corrLines < Math.max(1, Math.floor(rawLines * 0.8)) || rawTableLines >= 2 && corrTableLines < Math.floor(rawTableLines * 0.7);
|
|
12052
|
-
if (!suspicious) return correctedText;
|
|
12053
|
-
return `${correctedText}
|
|
12054
|
-
|
|
12055
|
-
[\uD655\uC778\uD544\uC694: \uAD50\uC815 \uACB0\uACFC\uAC00 \uCD95\uC57D\uB418\uC5C8\uC744 \uC218 \uC788\uC5B4 \uC6D0\uBB38\uACFC \uB300\uC870\uAC00 \uD544\uC694\uD569\uB2C8\uB2E4.]`;
|
|
12056
|
-
}
|
|
12057
|
-
function buildDiffSummary(before, after) {
|
|
12058
|
-
return {
|
|
12059
|
-
changed: before !== after,
|
|
12060
|
-
beforeLength: before.length,
|
|
12061
|
-
afterLength: after.length
|
|
12062
|
-
};
|
|
12063
|
-
}
|
|
12064
11929
|
function normalizePipelineError(err, stage) {
|
|
12065
11930
|
if (err instanceof UnifiedOcrError) return err;
|
|
12066
11931
|
const message = err instanceof Error ? err.message : String(err);
|
|
@@ -12146,9 +12011,6 @@ async function parseImage(buffer, options) {
|
|
|
12146
12011
|
if (ocrMode === "gemini" || ocrMode === "claude" || ocrMode === "codex" || ocrMode === "ollama") {
|
|
12147
12012
|
ocrProvider = createCliOcrProvider(ocrMode);
|
|
12148
12013
|
actualOcrMode = ocrMode;
|
|
12149
|
-
} else if (ocrMode === "tesseract") {
|
|
12150
|
-
ocrProvider = await createTesseractProvider();
|
|
12151
|
-
actualOcrMode = ocrMode;
|
|
12152
12014
|
} else if (ocrMode === "auto") {
|
|
12153
12015
|
const modesToTry = ["gemini", "claude", "codex", "ollama"];
|
|
12154
12016
|
for (const mode of modesToTry) {
|
|
@@ -12160,10 +12022,6 @@ async function parseImage(buffer, options) {
|
|
|
12160
12022
|
console.warn(`[kordoc] OCR auto-detection: ${mode} CLI not available or failed. Trying next.`, e);
|
|
12161
12023
|
}
|
|
12162
12024
|
}
|
|
12163
|
-
if (!ocrProvider) {
|
|
12164
|
-
ocrProvider = await createTesseractProvider();
|
|
12165
|
-
actualOcrMode = "tesseract";
|
|
12166
|
-
}
|
|
12167
12025
|
}
|
|
12168
12026
|
if (!ocrProvider) {
|
|
12169
12027
|
return { success: false, fileType: "image", error: "\uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR \uD504\uB85C\uBC14\uC774\uB354\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "PARSE_ERROR" };
|