@clazic/kordoc 2.3.0 → 2.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{batch-provider-FUCIIS4M.js → batch-provider-PCT4I4LK.js} +57 -27
- package/dist/batch-provider-PCT4I4LK.js.map +1 -0
- package/dist/{chunk-WWILSVMJ.js → chunk-W5KUC23B.js} +2 -2
- package/dist/{chunk-2ZGLFZCN.js → chunk-ZOEUKD77.js} +4 -4
- package/dist/cli.js +7 -5
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +78 -37
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +79 -38
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +2 -2
- package/dist/{provider-OBY3XFSZ.js → provider-WYHC4NHI.js} +23 -12
- package/dist/provider-WYHC4NHI.js.map +1 -0
- package/dist/{resolve-LBFYRHJI.js → resolve-4FSAQF2S.js} +3 -3
- package/dist/{utils-QAK24RJS.js → utils-HSF5HI5T.js} +2 -2
- package/dist/{watch-MPHX3QIH.js → watch-R2JHXDGF.js} +3 -3
- package/package.json +1 -1
- package/dist/batch-provider-FUCIIS4M.js.map +0 -1
- package/dist/provider-OBY3XFSZ.js.map +0 -1
- /package/dist/{chunk-WWILSVMJ.js.map → chunk-W5KUC23B.js.map} +0 -0
- /package/dist/{chunk-2ZGLFZCN.js.map → chunk-ZOEUKD77.js.map} +0 -0
- /package/dist/{resolve-LBFYRHJI.js.map → resolve-4FSAQF2S.js.map} +0 -0
- /package/dist/{utils-QAK24RJS.js.map → utils-HSF5HI5T.js.map} +0 -0
- /package/dist/{watch-MPHX3QIH.js.map → watch-R2JHXDGF.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -2222,7 +2222,7 @@ __export(batch_provider_exports, {
|
|
|
2222
2222
|
DEFAULT_BATCH_SIZES: () => DEFAULT_BATCH_SIZES,
|
|
2223
2223
|
createBatchCliProvider: () => createBatchCliProvider
|
|
2224
2224
|
});
|
|
2225
|
-
import {
|
|
2225
|
+
import { spawn } from "child_process";
|
|
2226
2226
|
import { writeFileSync as writeFileSync2, readFileSync as readFileSync2, unlinkSync as unlinkSync2, mkdirSync as mkdirSync2 } from "fs";
|
|
2227
2227
|
import { join as join2 } from "path";
|
|
2228
2228
|
import { tmpdir as tmpdir2 } from "os";
|
|
@@ -2249,9 +2249,9 @@ function createBatchCliProvider(mode, batchSize) {
|
|
|
2249
2249
|
}
|
|
2250
2250
|
let output;
|
|
2251
2251
|
if (mode === "codex") {
|
|
2252
|
-
output = callBatchCodexCli(tempFiles);
|
|
2252
|
+
output = await callBatchCodexCli(tempFiles);
|
|
2253
2253
|
} else {
|
|
2254
|
-
output = callBatchCli(mode, tempFiles);
|
|
2254
|
+
output = await callBatchCli(mode, tempFiles);
|
|
2255
2255
|
}
|
|
2256
2256
|
const cleaned = stripCodeFence2(output.trim());
|
|
2257
2257
|
const parts = cleaned.split(/<!--\s*PAGE_BREAK\s*-->/).map((p) => p.trim()).filter((p) => p.length > 0);
|
|
@@ -2273,7 +2273,48 @@ function createBatchCliProvider(mode, batchSize) {
|
|
|
2273
2273
|
}
|
|
2274
2274
|
};
|
|
2275
2275
|
}
|
|
2276
|
-
function callBatchCli(mode, imagePaths) {
|
|
2276
|
+
function spawnAsync(cmd, args, opts) {
|
|
2277
|
+
return new Promise((resolve, reject) => {
|
|
2278
|
+
const child = spawn(cmd, args, {
|
|
2279
|
+
cwd: opts.cwd,
|
|
2280
|
+
env: process.env,
|
|
2281
|
+
stdio: ["pipe", "pipe", "pipe"]
|
|
2282
|
+
});
|
|
2283
|
+
let stdout = "";
|
|
2284
|
+
let stderr = "";
|
|
2285
|
+
let killed = false;
|
|
2286
|
+
child.stdout.setEncoding("utf-8");
|
|
2287
|
+
child.stderr.setEncoding("utf-8");
|
|
2288
|
+
child.stdout.on("data", (d) => {
|
|
2289
|
+
stdout += d;
|
|
2290
|
+
});
|
|
2291
|
+
child.stderr.on("data", (d) => {
|
|
2292
|
+
stderr += d;
|
|
2293
|
+
});
|
|
2294
|
+
const timer = setTimeout(() => {
|
|
2295
|
+
killed = true;
|
|
2296
|
+
child.kill("SIGTERM");
|
|
2297
|
+
}, opts.timeoutMs);
|
|
2298
|
+
if (opts.stdin !== void 0) {
|
|
2299
|
+
child.stdin.end(opts.stdin);
|
|
2300
|
+
} else {
|
|
2301
|
+
child.stdin.end();
|
|
2302
|
+
}
|
|
2303
|
+
child.on("close", (code) => {
|
|
2304
|
+
clearTimeout(timer);
|
|
2305
|
+
if (killed) {
|
|
2306
|
+
reject(new Error(`\uD0C0\uC784\uC544\uC6C3 (${Math.round(opts.timeoutMs / 1e3)}\uCD08)`));
|
|
2307
|
+
} else {
|
|
2308
|
+
resolve({ stdout, stderr, exitCode: code ?? 1 });
|
|
2309
|
+
}
|
|
2310
|
+
});
|
|
2311
|
+
child.on("error", (err) => {
|
|
2312
|
+
clearTimeout(timer);
|
|
2313
|
+
reject(err);
|
|
2314
|
+
});
|
|
2315
|
+
});
|
|
2316
|
+
}
|
|
2317
|
+
async function callBatchCli(mode, imagePaths) {
|
|
2277
2318
|
const fileRefs = imagePaths.map((p) => `@${p}`).join("\n");
|
|
2278
2319
|
const prompt = `${BATCH_OCR_PROMPT}
|
|
2279
2320
|
|
|
@@ -2289,24 +2330,18 @@ ${fileRefs}`;
|
|
|
2289
2330
|
if (model) args.push("--model", model);
|
|
2290
2331
|
}
|
|
2291
2332
|
const timeoutMs = 6e4 + imagePaths.length * 2e4;
|
|
2292
|
-
const result =
|
|
2293
|
-
|
|
2294
|
-
timeout: timeoutMs,
|
|
2295
|
-
maxBuffer: 50 * 1024 * 1024,
|
|
2296
|
-
// 50MB (large batch output)
|
|
2333
|
+
const result = await spawnAsync(mode, args, {
|
|
2334
|
+
timeoutMs,
|
|
2297
2335
|
...mode === "claude" ? { cwd: tmpdir2() } : {}
|
|
2298
2336
|
});
|
|
2299
|
-
if (result.
|
|
2300
|
-
|
|
2301
|
-
}
|
|
2302
|
-
if (result.status !== 0) {
|
|
2303
|
-
const errMsg = result.stderr?.trim() || `exit code ${result.status}`;
|
|
2337
|
+
if (result.exitCode !== 0) {
|
|
2338
|
+
const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`;
|
|
2304
2339
|
throw new Error(`${mode} \uBC30\uCE58 OCR \uC2E4\uD328: ${errMsg}`);
|
|
2305
2340
|
}
|
|
2306
2341
|
return result.stdout || "";
|
|
2307
2342
|
}
|
|
2308
|
-
function callBatchCodexCli(imagePaths) {
|
|
2309
|
-
const outPath = join2(tmpdir2(), `kordoc-codex-batch-${Date.now()}.txt`);
|
|
2343
|
+
async function callBatchCodexCli(imagePaths) {
|
|
2344
|
+
const outPath = join2(tmpdir2(), `kordoc-codex-batch-${Date.now()}-${Math.random().toString(36).slice(2)}.txt`);
|
|
2310
2345
|
try {
|
|
2311
2346
|
const args = ["exec", BATCH_OCR_PROMPT];
|
|
2312
2347
|
for (const p of imagePaths) {
|
|
@@ -2316,17 +2351,12 @@ function callBatchCodexCli(imagePaths) {
|
|
|
2316
2351
|
const model = process.env.KORDOC_CODEX_MODEL;
|
|
2317
2352
|
if (model) args.push("--model", model);
|
|
2318
2353
|
const timeoutMs = 6e4 + imagePaths.length * 2e4;
|
|
2319
|
-
const result =
|
|
2320
|
-
|
|
2321
|
-
|
|
2322
|
-
maxBuffer: 50 * 1024 * 1024,
|
|
2323
|
-
input: ""
|
|
2354
|
+
const result = await spawnAsync("codex", args, {
|
|
2355
|
+
timeoutMs,
|
|
2356
|
+
stdin: ""
|
|
2324
2357
|
});
|
|
2325
|
-
if (result.
|
|
2326
|
-
|
|
2327
|
-
}
|
|
2328
|
-
if (result.status !== 0) {
|
|
2329
|
-
const errMsg = result.stderr?.trim() || `exit code ${result.status}`;
|
|
2358
|
+
if (result.exitCode !== 0) {
|
|
2359
|
+
const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`;
|
|
2330
2360
|
throw new Error(`codex \uBC30\uCE58 OCR \uC2E4\uD328: ${errMsg}`);
|
|
2331
2361
|
}
|
|
2332
2362
|
try {
|
|
@@ -2584,7 +2614,7 @@ function isBatchProvider(p) {
|
|
|
2584
2614
|
async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1, onProgress) {
|
|
2585
2615
|
const blocks = [];
|
|
2586
2616
|
if (isBatchProvider(provider)) {
|
|
2587
|
-
return ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, onProgress);
|
|
2617
|
+
return ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, concurrency, onProgress);
|
|
2588
2618
|
}
|
|
2589
2619
|
if (concurrency <= 1) {
|
|
2590
2620
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
@@ -2631,8 +2661,7 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings,
|
|
|
2631
2661
|
}
|
|
2632
2662
|
return blocks;
|
|
2633
2663
|
}
|
|
2634
|
-
async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, onProgress) {
|
|
2635
|
-
const blocks = [];
|
|
2664
|
+
async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1, onProgress) {
|
|
2636
2665
|
const pageNumbers = [];
|
|
2637
2666
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
2638
2667
|
if (pageFilter && !pageFilter.has(i)) continue;
|
|
@@ -2649,16 +2678,16 @@ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warn
|
|
|
2649
2678
|
batches.push(pageImages.slice(i, i + provider.batchSize));
|
|
2650
2679
|
}
|
|
2651
2680
|
let processed = 0;
|
|
2652
|
-
|
|
2681
|
+
const batchTasks = batches.map((batch, batchIdx) => async () => {
|
|
2682
|
+
const pageBlocks = [];
|
|
2653
2683
|
try {
|
|
2654
2684
|
const results = await provider.processBatch(batch);
|
|
2655
2685
|
for (const { pageNum } of batch) {
|
|
2656
2686
|
const result = results.get(pageNum);
|
|
2657
|
-
|
|
2658
|
-
|
|
2659
|
-
|
|
2660
|
-
|
|
2661
|
-
onProgress?.(processed, pageNumbers.length);
|
|
2687
|
+
pageBlocks.push({
|
|
2688
|
+
pageNum,
|
|
2689
|
+
blocks: result ? ocrResultToBlocks(result, pageNum) : []
|
|
2690
|
+
});
|
|
2662
2691
|
}
|
|
2663
2692
|
} catch (err) {
|
|
2664
2693
|
const range = `${batch[0].pageNum}-${batch[batch.length - 1].pageNum}`;
|
|
@@ -2666,8 +2695,20 @@ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warn
|
|
|
2666
2695
|
message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
|
|
2667
2696
|
code: "OCR_PAGE_FAILED"
|
|
2668
2697
|
});
|
|
2669
|
-
|
|
2670
|
-
|
|
2698
|
+
for (const { pageNum } of batch) {
|
|
2699
|
+
pageBlocks.push({ pageNum, blocks: [] });
|
|
2700
|
+
}
|
|
2701
|
+
}
|
|
2702
|
+
processed += batch.length;
|
|
2703
|
+
onProgress?.(processed, pageNumbers.length);
|
|
2704
|
+
return { batchIdx, pageBlocks };
|
|
2705
|
+
});
|
|
2706
|
+
const effectiveConcurrency = Math.max(1, concurrency);
|
|
2707
|
+
const batchResults = await runWithConcurrency(batchTasks, effectiveConcurrency);
|
|
2708
|
+
const blocks = [];
|
|
2709
|
+
for (const result of batchResults) {
|
|
2710
|
+
for (const { blocks: pageBlks } of result.pageBlocks) {
|
|
2711
|
+
for (const b of pageBlks) blocks.push(b);
|
|
2671
2712
|
}
|
|
2672
2713
|
}
|
|
2673
2714
|
return blocks;
|
|
@@ -2737,7 +2778,7 @@ import JSZip2 from "jszip";
|
|
|
2737
2778
|
import { DOMParser } from "@xmldom/xmldom";
|
|
2738
2779
|
|
|
2739
2780
|
// src/utils.ts
|
|
2740
|
-
var VERSION = true ? "2.3.0" : "0.0.0-dev";
|
|
2781
|
+
var VERSION = true ? "2.3.1" : "0.0.0-dev";
|
|
2741
2782
|
function toArrayBuffer(buf) {
|
|
2742
2783
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
2743
2784
|
return buf.buffer;
|