@clazic/kordoc 2.2.9 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -119,6 +119,8 @@ interface ParseOptions {
119
119
  ocrConcurrency?: number;
120
120
  /** 진행률 콜백 — current: 현재 페이지/섹션, total: 전체 수 */
121
121
  onProgress?: (current: number, total: number) => void;
122
+ /** OCR 배치 크기 — CLI 1회 호출당 처리할 페이지 수. gemini/claude/codex 전용. */
123
+ ocrBatchSize?: number;
122
124
  /** PDF 머리글/바닥글 자동 제거 */
123
125
  removeHeaderFooter?: boolean;
124
126
  }
package/dist/index.d.ts CHANGED
@@ -119,6 +119,8 @@ interface ParseOptions {
119
119
  ocrConcurrency?: number;
120
120
  /** 진행률 콜백 — current: 현재 페이지/섹션, total: 전체 수 */
121
121
  onProgress?: (current: number, total: number) => void;
122
+ /** OCR 배치 크기 — CLI 1회 호출당 처리할 페이지 수. gemini/claude/codex 전용. */
123
+ ocrBatchSize?: number;
122
124
  /** PDF 머리글/바닥글 자동 제거 */
123
125
  removeHeaderFooter?: boolean;
124
126
  }
package/dist/index.js CHANGED
@@ -2019,7 +2019,7 @@ import { join } from "path";
2019
2019
  import { tmpdir } from "os";
2020
2020
  function getTempDir() {
2021
2021
  if (!_tempDir) {
2022
- _tempDir = join(process.cwd(), ".kordoc-tmp");
2022
+ _tempDir = join(process.cwd(), "_kordoc_ocr_tmp");
2023
2023
  mkdirSync(_tempDir, { recursive: true });
2024
2024
  }
2025
2025
  return _tempDir;
@@ -2119,8 +2119,8 @@ function buildCliArgs(mode, imagePath) {
2119
2119
  }
2120
2120
  }
2121
2121
  async function callOllamaApi(imagePath) {
2122
- const { readFileSync: readFileSync2 } = await import("fs");
2123
- const imageBase64 = readFileSync2(imagePath).toString("base64");
2122
+ const { readFileSync: readFileSync3 } = await import("fs");
2123
+ const imageBase64 = readFileSync3(imagePath).toString("base64");
2124
2124
  const model = process.env.KORDOC_OLLAMA_MODEL || "qwen3-vl:8b";
2125
2125
  const host = process.env.KORDOC_OLLAMA_HOST || "http://localhost:11434";
2126
2126
  const timeoutMs = Number(process.env.KORDOC_OLLAMA_TIMEOUT) || 12e4;
@@ -2216,12 +2216,185 @@ var init_tesseract_provider = __esm({
2216
2216
  }
2217
2217
  });
2218
2218
 
2219
+ // src/ocr/batch-provider.ts
2220
+ var batch_provider_exports = {};
2221
+ __export(batch_provider_exports, {
2222
+ DEFAULT_BATCH_SIZES: () => DEFAULT_BATCH_SIZES,
2223
+ createBatchCliProvider: () => createBatchCliProvider
2224
+ });
2225
+ import { spawn } from "child_process";
2226
+ import { writeFileSync as writeFileSync2, readFileSync as readFileSync2, unlinkSync as unlinkSync2, mkdirSync as mkdirSync2 } from "fs";
2227
+ import { join as join2 } from "path";
2228
+ import { tmpdir as tmpdir2 } from "os";
2229
+ function getBatchTempDir() {
2230
+ if (!_batchTempDir) {
2231
+ _batchTempDir = join2(process.cwd(), "_kordoc_ocr_tmp");
2232
+ mkdirSync2(_batchTempDir, { recursive: true });
2233
+ }
2234
+ return _batchTempDir;
2235
+ }
2236
+ function createBatchCliProvider(mode, batchSize) {
2237
+ return {
2238
+ __batch: true,
2239
+ batchSize,
2240
+ async processBatch(pages) {
2241
+ const results = /* @__PURE__ */ new Map();
2242
+ const tempDir = getBatchTempDir();
2243
+ const tempFiles = [];
2244
+ try {
2245
+ for (const { image, pageNum } of pages) {
2246
+ const path = join2(tempDir, `batch-p${pageNum}.png`);
2247
+ writeFileSync2(path, image);
2248
+ tempFiles.push(path);
2249
+ }
2250
+ let output;
2251
+ if (mode === "codex") {
2252
+ output = await callBatchCodexCli(tempFiles);
2253
+ } else {
2254
+ output = await callBatchCli(mode, tempFiles);
2255
+ }
2256
+ const cleaned = stripCodeFence2(output.trim());
2257
+ const parts = cleaned.split(/<!--\s*PAGE_BREAK\s*-->/).map((p) => p.trim()).filter((p) => p.length > 0);
2258
+ for (let i = 0; i < pages.length; i++) {
2259
+ const pageNum = pages[i].pageNum;
2260
+ if (i < parts.length) {
2261
+ results.set(pageNum, { markdown: parts[i] });
2262
+ }
2263
+ }
2264
+ } finally {
2265
+ for (const f of tempFiles) {
2266
+ try {
2267
+ unlinkSync2(f);
2268
+ } catch {
2269
+ }
2270
+ }
2271
+ }
2272
+ return results;
2273
+ }
2274
+ };
2275
+ }
2276
+ function spawnAsync(cmd, args, opts) {
2277
+ return new Promise((resolve, reject) => {
2278
+ const child = spawn(cmd, args, {
2279
+ cwd: opts.cwd,
2280
+ env: process.env,
2281
+ stdio: ["pipe", "pipe", "pipe"]
2282
+ });
2283
+ let stdout = "";
2284
+ let stderr = "";
2285
+ let killed = false;
2286
+ child.stdout.setEncoding("utf-8");
2287
+ child.stderr.setEncoding("utf-8");
2288
+ child.stdout.on("data", (d) => {
2289
+ stdout += d;
2290
+ });
2291
+ child.stderr.on("data", (d) => {
2292
+ stderr += d;
2293
+ });
2294
+ const timer = setTimeout(() => {
2295
+ killed = true;
2296
+ child.kill("SIGTERM");
2297
+ }, opts.timeoutMs);
2298
+ if (opts.stdin !== void 0) {
2299
+ child.stdin.end(opts.stdin);
2300
+ } else {
2301
+ child.stdin.end();
2302
+ }
2303
+ child.on("close", (code) => {
2304
+ clearTimeout(timer);
2305
+ if (killed) {
2306
+ reject(new Error(`\uD0C0\uC784\uC544\uC6C3 (${Math.round(opts.timeoutMs / 1e3)}\uCD08)`));
2307
+ } else {
2308
+ resolve({ stdout, stderr, exitCode: code ?? 1 });
2309
+ }
2310
+ });
2311
+ child.on("error", (err) => {
2312
+ clearTimeout(timer);
2313
+ reject(err);
2314
+ });
2315
+ });
2316
+ }
2317
+ async function callBatchCli(mode, imagePaths) {
2318
+ const fileRefs = imagePaths.map((p) => `@${p}`).join("\n");
2319
+ const prompt = `${BATCH_OCR_PROMPT}
2320
+
2321
+ ${fileRefs}`;
2322
+ let args;
2323
+ if (mode === "gemini") {
2324
+ args = ["--prompt", prompt, "--yolo"];
2325
+ const model = process.env.KORDOC_GEMINI_MODEL;
2326
+ if (model) args.push("--model", model);
2327
+ } else {
2328
+ args = ["--print", prompt];
2329
+ const model = process.env.KORDOC_CLAUDE_MODEL;
2330
+ if (model) args.push("--model", model);
2331
+ }
2332
+ const timeoutMs = 6e4 + imagePaths.length * 2e4;
2333
+ const result = await spawnAsync(mode, args, {
2334
+ timeoutMs,
2335
+ ...mode === "claude" ? { cwd: tmpdir2() } : {}
2336
+ });
2337
+ if (result.exitCode !== 0) {
2338
+ const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`;
2339
+ throw new Error(`${mode} \uBC30\uCE58 OCR \uC2E4\uD328: ${errMsg}`);
2340
+ }
2341
+ return result.stdout || "";
2342
+ }
2343
+ async function callBatchCodexCli(imagePaths) {
2344
+ const outPath = join2(tmpdir2(), `kordoc-codex-batch-${Date.now()}-${Math.random().toString(36).slice(2)}.txt`);
2345
+ try {
2346
+ const args = ["exec", BATCH_OCR_PROMPT];
2347
+ for (const p of imagePaths) {
2348
+ args.push("--image", p);
2349
+ }
2350
+ args.push("--output-last-message", outPath);
2351
+ const model = process.env.KORDOC_CODEX_MODEL;
2352
+ if (model) args.push("--model", model);
2353
+ const timeoutMs = 6e4 + imagePaths.length * 2e4;
2354
+ const result = await spawnAsync("codex", args, {
2355
+ timeoutMs,
2356
+ stdin: ""
2357
+ });
2358
+ if (result.exitCode !== 0) {
2359
+ const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`;
2360
+ throw new Error(`codex \uBC30\uCE58 OCR \uC2E4\uD328: ${errMsg}`);
2361
+ }
2362
+ try {
2363
+ return readFileSync2(outPath, "utf-8");
2364
+ } catch {
2365
+ return result.stdout || "";
2366
+ }
2367
+ } finally {
2368
+ try {
2369
+ unlinkSync2(outPath);
2370
+ } catch {
2371
+ }
2372
+ }
2373
+ }
2374
+ function stripCodeFence2(text) {
2375
+ const match = text.match(/^```(?:markdown|md)?\s*\n([\s\S]*?)\n```\s*$/m);
2376
+ return match ? match[1].trim() : text;
2377
+ }
2378
+ var BATCH_OCR_PROMPT, DEFAULT_BATCH_SIZES, _batchTempDir;
2379
+ var init_batch_provider = __esm({
2380
+ "src/ocr/batch-provider.ts"() {
2381
+ "use strict";
2382
+ BATCH_OCR_PROMPT = "\uB2E4\uC74C \uBB38\uC11C \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uB4E4\uC744 OCR\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uC138\uC694.\n\n\uADDC\uCE59:\n- \uAC01 \uD398\uC774\uC9C0 \uACB0\uACFC \uC0AC\uC774\uC5D0 \uBC18\uB4DC\uC2DC \uC774 \uAD6C\uBD84\uC790\uB97C \uC0BD\uC785: <!-- PAGE_BREAK -->\n- \uD14C\uC774\uBE14\uC740 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)\n- \uBCD1\uD569\uB41C \uC140\uC740 \uD574\uB2F9 \uC704\uCE58\uC5D0 \uB0B4\uC6A9 \uAE30\uC7AC\n- \uD5E4\uB529\uC740 \uAE00\uC790 \uD06C\uAE30\uC5D0 \uB530\uB77C ## ~ ###### \uC0AC\uC6A9\n- \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9\n- \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC\n- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0\n- ```\uB85C \uAC10\uC2F8\uC9C0 \uB9D0\uACE0 \uC21C\uC218 Markdown\uB9CC \uCD9C\uB825";
2383
+ DEFAULT_BATCH_SIZES = {
2384
+ gemini: 5,
2385
+ claude: 5,
2386
+ codex: 10
2387
+ };
2388
+ _batchTempDir = null;
2389
+ }
2390
+ });
2391
+
2219
2392
  // src/ocr/resolve.ts
2220
2393
  var resolve_exports = {};
2221
2394
  __export(resolve_exports, {
2222
2395
  resolveOcrProvider: () => resolveOcrProvider
2223
2396
  });
2224
- async function resolveOcrProvider(mode, warnings, concurrency) {
2397
+ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
2225
2398
  if (mode === "off") {
2226
2399
  throw new Error("OCR\uC774 \uBE44\uD65C\uC131\uD654\uB418\uC5B4 \uC788\uC2B5\uB2C8\uB2E4 (--ocr off).");
2227
2400
  }
@@ -2234,6 +2407,14 @@ async function resolveOcrProvider(mode, warnings, concurrency) {
2234
2407
  }
2235
2408
  return createTesseractProvider2();
2236
2409
  }
2410
+ if (mode === "gemini" || mode === "claude" || mode === "codex") {
2411
+ const { createBatchCliProvider: createBatchCliProvider2, DEFAULT_BATCH_SIZES: DEFAULT_BATCH_SIZES2 } = await Promise.resolve().then(() => (init_batch_provider(), batch_provider_exports));
2412
+ const effectiveBatch = batchSize ?? DEFAULT_BATCH_SIZES2[mode];
2413
+ if (effectiveBatch > 1) {
2414
+ return createBatchCliProvider2(mode, effectiveBatch);
2415
+ }
2416
+ return createCliOcrProvider(mode);
2417
+ }
2237
2418
  return createCliOcrProvider(mode);
2238
2419
  }
2239
2420
  const detected = detectAvailableOcr();
@@ -2257,6 +2438,14 @@ async function resolveOcrProvider(mode, warnings, concurrency) {
2257
2438
  }
2258
2439
  return createTesseractProvider2();
2259
2440
  }
2441
+ if (detected === "gemini" || detected === "codex" || detected === "claude") {
2442
+ const { createBatchCliProvider: createBatchCliProvider2, DEFAULT_BATCH_SIZES: DEFAULT_BATCH_SIZES2 } = await Promise.resolve().then(() => (init_batch_provider(), batch_provider_exports));
2443
+ const effectiveBatch = batchSize ?? DEFAULT_BATCH_SIZES2[detected];
2444
+ if (effectiveBatch > 1) {
2445
+ return createBatchCliProvider2(detected, effectiveBatch);
2446
+ }
2447
+ return createCliOcrProvider(detected);
2448
+ }
2260
2449
  return createCliOcrProvider(detected);
2261
2450
  }
2262
2451
  var init_resolve = __esm({
@@ -2419,8 +2608,14 @@ function ocrResultToBlocks(result, pageNum) {
2419
2608
  }
2420
2609
  return pageBlocks;
2421
2610
  }
2422
- async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1) {
2611
+ function isBatchProvider(p) {
2612
+ return !!p && typeof p === "object" && "__batch" in p && p.__batch === true;
2613
+ }
2614
+ async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1, onProgress) {
2423
2615
  const blocks = [];
2616
+ if (isBatchProvider(provider)) {
2617
+ return ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, concurrency, onProgress);
2618
+ }
2424
2619
  if (concurrency <= 1) {
2425
2620
  for (let i = 1; i <= effectivePageCount; i++) {
2426
2621
  if (pageFilter && !pageFilter.has(i)) continue;
@@ -2466,6 +2661,58 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings,
2466
2661
  }
2467
2662
  return blocks;
2468
2663
  }
2664
+ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1, onProgress) {
2665
+ const pageNumbers = [];
2666
+ for (let i = 1; i <= effectivePageCount; i++) {
2667
+ if (pageFilter && !pageFilter.has(i)) continue;
2668
+ pageNumbers.push(i);
2669
+ }
2670
+ const pageImages = [];
2671
+ for (const pageNum of pageNumbers) {
2672
+ const page = await doc.getPage(pageNum);
2673
+ const image = await renderPageToPng(page);
2674
+ pageImages.push({ image, pageNum });
2675
+ }
2676
+ const batches = [];
2677
+ for (let i = 0; i < pageImages.length; i += provider.batchSize) {
2678
+ batches.push(pageImages.slice(i, i + provider.batchSize));
2679
+ }
2680
+ let processed = 0;
2681
+ const batchTasks = batches.map((batch, batchIdx) => async () => {
2682
+ const pageBlocks = [];
2683
+ try {
2684
+ const results = await provider.processBatch(batch);
2685
+ for (const { pageNum } of batch) {
2686
+ const result = results.get(pageNum);
2687
+ pageBlocks.push({
2688
+ pageNum,
2689
+ blocks: result ? ocrResultToBlocks(result, pageNum) : []
2690
+ });
2691
+ }
2692
+ } catch (err) {
2693
+ const range = `${batch[0].pageNum}-${batch[batch.length - 1].pageNum}`;
2694
+ warnings?.push({
2695
+ message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
2696
+ code: "OCR_PAGE_FAILED"
2697
+ });
2698
+ for (const { pageNum } of batch) {
2699
+ pageBlocks.push({ pageNum, blocks: [] });
2700
+ }
2701
+ }
2702
+ processed += batch.length;
2703
+ onProgress?.(processed, pageNumbers.length);
2704
+ return { batchIdx, pageBlocks };
2705
+ });
2706
+ const effectiveConcurrency = Math.max(1, concurrency);
2707
+ const batchResults = await runWithConcurrency(batchTasks, effectiveConcurrency);
2708
+ const blocks = [];
2709
+ for (const result of batchResults) {
2710
+ for (const { blocks: pageBlks } of result.pageBlocks) {
2711
+ for (const b of pageBlks) blocks.push(b);
2712
+ }
2713
+ }
2714
+ return blocks;
2715
+ }
2469
2716
  async function renderPageToPng(page) {
2470
2717
  const { createCanvas } = await import("@napi-rs/canvas");
2471
2718
  const scale = 2;
@@ -2531,7 +2778,7 @@ import JSZip2 from "jszip";
2531
2778
  import { DOMParser } from "@xmldom/xmldom";
2532
2779
 
2533
2780
  // src/utils.ts
2534
- var VERSION = true ? "2.2.9" : "0.0.0-dev";
2781
+ var VERSION = true ? "2.3.1" : "0.0.0-dev";
2535
2782
  function toArrayBuffer(buf) {
2536
2783
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
2537
2784
  return buf.buffer;
@@ -6049,7 +6296,8 @@ async function parsePdfDocument(buffer, options) {
6049
6296
  try {
6050
6297
  const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
6051
6298
  const concurrency = options?.ocrConcurrency ?? 1;
6052
- ocrProvider = await resolveOcrProvider2(ocrMode, warnings, concurrency);
6299
+ const batchSize = options?.ocrBatchSize;
6300
+ ocrProvider = await resolveOcrProvider2(ocrMode, warnings, concurrency, batchSize);
6053
6301
  } catch (resolveErr) {
6054
6302
  if (ocrMode !== "auto") {
6055
6303
  throw Object.assign(
@@ -6064,7 +6312,7 @@ async function parsePdfDocument(buffer, options) {
6064
6312
  try {
6065
6313
  const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
6066
6314
  const concurrency = options?.ocrConcurrency ?? 1;
6067
- ocrBlocks = await ocrPages2(doc, ocrProvider, pageFilter, effectivePageCount, warnings, concurrency);
6315
+ ocrBlocks = await ocrPages2(doc, ocrProvider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
6068
6316
  } catch {
6069
6317
  } finally {
6070
6318
  const terminable = ocrProvider;