@clazic/kordoc 2.3.0 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -2222,7 +2222,7 @@ __export(batch_provider_exports, {
2222
2222
  DEFAULT_BATCH_SIZES: () => DEFAULT_BATCH_SIZES,
2223
2223
  createBatchCliProvider: () => createBatchCliProvider
2224
2224
  });
2225
- import { spawnSync as spawnSync2 } from "child_process";
2225
+ import { spawn } from "child_process";
2226
2226
  import { writeFileSync as writeFileSync2, readFileSync as readFileSync2, unlinkSync as unlinkSync2, mkdirSync as mkdirSync2 } from "fs";
2227
2227
  import { join as join2 } from "path";
2228
2228
  import { tmpdir as tmpdir2 } from "os";
@@ -2249,9 +2249,9 @@ function createBatchCliProvider(mode, batchSize) {
2249
2249
  }
2250
2250
  let output;
2251
2251
  if (mode === "codex") {
2252
- output = callBatchCodexCli(tempFiles);
2252
+ output = await callBatchCodexCli(tempFiles);
2253
2253
  } else {
2254
- output = callBatchCli(mode, tempFiles);
2254
+ output = await callBatchCli(mode, tempFiles);
2255
2255
  }
2256
2256
  const cleaned = stripCodeFence2(output.trim());
2257
2257
  const parts = cleaned.split(/<!--\s*PAGE_BREAK\s*-->/).map((p) => p.trim()).filter((p) => p.length > 0);
@@ -2273,7 +2273,48 @@ function createBatchCliProvider(mode, batchSize) {
2273
2273
  }
2274
2274
  };
2275
2275
  }
2276
- function callBatchCli(mode, imagePaths) {
2276
/**
 * Run a command asynchronously and collect its output.
 *
 * Non-blocking replacement for `spawnSync`: the child's stdout and stderr are
 * decoded as UTF-8 and accumulated in memory, and the returned promise settles
 * once the child's stdio streams have closed.
 *
 * @param {string} cmd - Executable to run.
 * @param {string[]} args - Arguments passed to the executable.
 * @param {object} [opts]
 * @param {string} [opts.cwd] - Working directory for the child.
 * @param {string} [opts.stdin] - Data written to the child's stdin before it
 *   is closed. When omitted, stdin is closed immediately.
 * @param {number} [opts.timeoutMs] - Kill the child after this many
 *   milliseconds. A missing or non-positive value disables the timeout.
 * @param {number} [opts.maxBuffer=52428800] - Maximum combined stdout+stderr
 *   size in bytes (same 50MB cap the former `spawnSync` calls used).
 * @param {number} [opts.killGraceMs=5000] - Delay between SIGTERM and the
 *   SIGKILL escalation when the child has to be terminated.
 * @returns {Promise<{stdout: string, stderr: string, exitCode: number}>}
 *   Resolves for any normal exit; `exitCode` is 1 when the child was ended by
 *   a signal and therefore has no exit code.
 * @throws {Error} Rejects when the process cannot be spawned, when the
 *   timeout elapses, or when the output exceeds `maxBuffer`.
 */
function spawnAsync(cmd, args, opts = {}) {
  const {
    cwd,
    stdin,
    timeoutMs,
    maxBuffer = 50 * 1024 * 1024,
    killGraceMs = 5e3
  } = opts;
  return new Promise((resolve, reject) => {
    const child = spawn(cmd, args, {
      cwd,
      env: process.env,
      stdio: ["pipe", "pipe", "pipe"]
    });
    let stdout = "";
    let stderr = "";
    let receivedBytes = 0;
    let failure = null;
    let settled = false;
    let timer;
    let killTimer;

    // Settle exactly once: "error" may be followed by "close".
    const settle = (fn, value) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      clearTimeout(killTimer);
      fn(value);
    };

    // Ask the child to stop, then force-kill it if it ignores SIGTERM so the
    // promise can never stay pending forever.
    const terminate = (reason) => {
      if (failure) return;
      failure = reason;
      child.kill("SIGTERM");
      killTimer = setTimeout(() => child.kill("SIGKILL"), killGraceMs);
    };

    const collect = (append) => (chunk) => {
      if (failure) return;
      receivedBytes += Buffer.byteLength(chunk);
      if (receivedBytes > maxBuffer) {
        terminate(new Error(`output exceeded maxBuffer (${maxBuffer} bytes)`));
        return;
      }
      append(chunk);
    };

    child.stdout.setEncoding("utf-8");
    child.stderr.setEncoding("utf-8");
    child.stdout.on("data", collect((d) => {
      stdout += d;
    }));
    child.stderr.on("data", collect((d) => {
      stderr += d;
    }));

    // setTimeout(fn, undefined) fires almost immediately, so only arm the
    // timer for a real, positive duration.
    if (Number.isFinite(timeoutMs) && timeoutMs > 0) {
      timer = setTimeout(() => {
        terminate(new Error(`\uD0C0\uC784\uC544\uC6C3 (${Math.round(timeoutMs / 1e3)}\uCD08)`));
      }, timeoutMs);
    }

    // A child that exits before reading stdin raises EPIPE on this stream;
    // without a listener that would surface as an uncaught exception. The
    // outcome is still reported through "close" / "error" below.
    child.stdin.on("error", () => {
    });
    if (stdin !== void 0) {
      child.stdin.end(stdin);
    } else {
      child.stdin.end();
    }

    child.on("close", (code) => {
      if (failure) {
        settle(reject, failure);
      } else {
        settle(resolve, { stdout, stderr, exitCode: code ?? 1 });
      }
    });
    child.on("error", (err) => {
      settle(reject, err);
    });
  });
}
2317
+ async function callBatchCli(mode, imagePaths) {
2277
2318
  const fileRefs = imagePaths.map((p) => `@${p}`).join("\n");
2278
2319
  const prompt = `${BATCH_OCR_PROMPT}
2279
2320
 
@@ -2289,24 +2330,18 @@ ${fileRefs}`;
2289
2330
  if (model) args.push("--model", model);
2290
2331
  }
2291
2332
  const timeoutMs = 6e4 + imagePaths.length * 2e4;
2292
- const result = spawnSync2(mode, args, {
2293
- encoding: "utf-8",
2294
- timeout: timeoutMs,
2295
- maxBuffer: 50 * 1024 * 1024,
2296
- // 50MB (large batch output)
2333
+ const result = await spawnAsync(mode, args, {
2334
+ timeoutMs,
2297
2335
  ...mode === "claude" ? { cwd: tmpdir2() } : {}
2298
2336
  });
2299
- if (result.error) {
2300
- throw new Error(`${mode} \uBC30\uCE58 OCR \uC2E4\uD328: ${result.error.message}`);
2301
- }
2302
- if (result.status !== 0) {
2303
- const errMsg = result.stderr?.trim() || `exit code ${result.status}`;
2337
+ if (result.exitCode !== 0) {
2338
+ const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`;
2304
2339
  throw new Error(`${mode} \uBC30\uCE58 OCR \uC2E4\uD328: ${errMsg}`);
2305
2340
  }
2306
2341
  return result.stdout || "";
2307
2342
  }
2308
- function callBatchCodexCli(imagePaths) {
2309
- const outPath = join2(tmpdir2(), `kordoc-codex-batch-${Date.now()}.txt`);
2343
+ async function callBatchCodexCli(imagePaths) {
2344
+ const outPath = join2(tmpdir2(), `kordoc-codex-batch-${Date.now()}-${Math.random().toString(36).slice(2)}.txt`);
2310
2345
  try {
2311
2346
  const args = ["exec", BATCH_OCR_PROMPT];
2312
2347
  for (const p of imagePaths) {
@@ -2316,17 +2351,12 @@ function callBatchCodexCli(imagePaths) {
2316
2351
  const model = process.env.KORDOC_CODEX_MODEL;
2317
2352
  if (model) args.push("--model", model);
2318
2353
  const timeoutMs = 6e4 + imagePaths.length * 2e4;
2319
- const result = spawnSync2("codex", args, {
2320
- encoding: "utf-8",
2321
- timeout: timeoutMs,
2322
- maxBuffer: 50 * 1024 * 1024,
2323
- input: ""
2354
+ const result = await spawnAsync("codex", args, {
2355
+ timeoutMs,
2356
+ stdin: ""
2324
2357
  });
2325
- if (result.error) {
2326
- throw new Error(`codex \uBC30\uCE58 OCR \uC2E4\uD328: ${result.error.message}`);
2327
- }
2328
- if (result.status !== 0) {
2329
- const errMsg = result.stderr?.trim() || `exit code ${result.status}`;
2358
+ if (result.exitCode !== 0) {
2359
+ const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`;
2330
2360
  throw new Error(`codex \uBC30\uCE58 OCR \uC2E4\uD328: ${errMsg}`);
2331
2361
  }
2332
2362
  try {
@@ -2584,7 +2614,7 @@ function isBatchProvider(p) {
2584
2614
  async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1, onProgress) {
2585
2615
  const blocks = [];
2586
2616
  if (isBatchProvider(provider)) {
2587
- return ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, onProgress);
2617
+ return ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, concurrency, onProgress);
2588
2618
  }
2589
2619
  if (concurrency <= 1) {
2590
2620
  for (let i = 1; i <= effectivePageCount; i++) {
@@ -2631,8 +2661,7 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings,
2631
2661
  }
2632
2662
  return blocks;
2633
2663
  }
2634
- async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, onProgress) {
2635
- const blocks = [];
2664
+ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1, onProgress) {
2636
2665
  const pageNumbers = [];
2637
2666
  for (let i = 1; i <= effectivePageCount; i++) {
2638
2667
  if (pageFilter && !pageFilter.has(i)) continue;
@@ -2649,16 +2678,16 @@ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warn
2649
2678
  batches.push(pageImages.slice(i, i + provider.batchSize));
2650
2679
  }
2651
2680
  let processed = 0;
2652
- for (const batch of batches) {
2681
+ const batchTasks = batches.map((batch, batchIdx) => async () => {
2682
+ const pageBlocks = [];
2653
2683
  try {
2654
2684
  const results = await provider.processBatch(batch);
2655
2685
  for (const { pageNum } of batch) {
2656
2686
  const result = results.get(pageNum);
2657
- if (result) {
2658
- for (const b of ocrResultToBlocks(result, pageNum)) blocks.push(b);
2659
- }
2660
- processed++;
2661
- onProgress?.(processed, pageNumbers.length);
2687
+ pageBlocks.push({
2688
+ pageNum,
2689
+ blocks: result ? ocrResultToBlocks(result, pageNum) : []
2690
+ });
2662
2691
  }
2663
2692
  } catch (err) {
2664
2693
  const range = `${batch[0].pageNum}-${batch[batch.length - 1].pageNum}`;
@@ -2666,8 +2695,20 @@ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warn
2666
2695
  message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
2667
2696
  code: "OCR_PAGE_FAILED"
2668
2697
  });
2669
- processed += batch.length;
2670
- onProgress?.(processed, pageNumbers.length);
2698
+ for (const { pageNum } of batch) {
2699
+ pageBlocks.push({ pageNum, blocks: [] });
2700
+ }
2701
+ }
2702
+ processed += batch.length;
2703
+ onProgress?.(processed, pageNumbers.length);
2704
+ return { batchIdx, pageBlocks };
2705
+ });
2706
+ const effectiveConcurrency = Math.max(1, concurrency);
2707
+ const batchResults = await runWithConcurrency(batchTasks, effectiveConcurrency);
2708
+ const blocks = [];
2709
+ for (const result of batchResults) {
2710
+ for (const { blocks: pageBlks } of result.pageBlocks) {
2711
+ for (const b of pageBlks) blocks.push(b);
2671
2712
  }
2672
2713
  }
2673
2714
  return blocks;
@@ -2737,7 +2778,7 @@ import JSZip2 from "jszip";
2737
2778
  import { DOMParser } from "@xmldom/xmldom";
2738
2779
 
2739
2780
  // src/utils.ts
2740
- var VERSION = true ? "2.3.0" : "0.0.0-dev";
2781
+ var VERSION = true ? "2.3.1" : "0.0.0-dev";
2741
2782
  function toArrayBuffer(buf) {
2742
2783
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
2743
2784
  return buf.buffer;