@clazic/kordoc 2.2.9 → 2.3.0

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
package/dist/index.d.cts CHANGED
@@ -119,6 +119,8 @@ interface ParseOptions {
119
119
  ocrConcurrency?: number;
120
120
  /** 진행률 콜백 — current: 현재 페이지/섹션, total: 전체 수 */
121
121
  onProgress?: (current: number, total: number) => void;
122
+ /** OCR 배치 크기 — CLI 1회 호출당 처리할 페이지 수. gemini/claude/codex 전용. */
123
+ ocrBatchSize?: number;
122
124
  /** PDF 머리글/바닥글 자동 제거 */
123
125
  removeHeaderFooter?: boolean;
124
126
  }
package/dist/index.d.ts CHANGED
@@ -119,6 +119,8 @@ interface ParseOptions {
119
119
  ocrConcurrency?: number;
120
120
  /** 진행률 콜백 — current: 현재 페이지/섹션, total: 전체 수 */
121
121
  onProgress?: (current: number, total: number) => void;
122
+ /** OCR 배치 크기 — CLI 1회 호출당 처리할 페이지 수. gemini/claude/codex 전용. */
123
+ ocrBatchSize?: number;
122
124
  /** PDF 머리글/바닥글 자동 제거 */
123
125
  removeHeaderFooter?: boolean;
124
126
  }
package/dist/index.js CHANGED
@@ -2019,7 +2019,7 @@ import { join } from "path";
2019
2019
  import { tmpdir } from "os";
2020
2020
  function getTempDir() {
2021
2021
  if (!_tempDir) {
2022
- _tempDir = join(process.cwd(), ".kordoc-tmp");
2022
+ _tempDir = join(process.cwd(), "_kordoc_ocr_tmp");
2023
2023
  mkdirSync(_tempDir, { recursive: true });
2024
2024
  }
2025
2025
  return _tempDir;
@@ -2119,8 +2119,8 @@ function buildCliArgs(mode, imagePath) {
2119
2119
  }
2120
2120
  }
2121
2121
  async function callOllamaApi(imagePath) {
2122
- const { readFileSync: readFileSync2 } = await import("fs");
2123
- const imageBase64 = readFileSync2(imagePath).toString("base64");
2122
+ const { readFileSync: readFileSync3 } = await import("fs");
2123
+ const imageBase64 = readFileSync3(imagePath).toString("base64");
2124
2124
  const model = process.env.KORDOC_OLLAMA_MODEL || "qwen3-vl:8b";
2125
2125
  const host = process.env.KORDOC_OLLAMA_HOST || "http://localhost:11434";
2126
2126
  const timeoutMs = Number(process.env.KORDOC_OLLAMA_TIMEOUT) || 12e4;
@@ -2216,12 +2216,155 @@ var init_tesseract_provider = __esm({
2216
2216
  }
2217
2217
  });
2218
2218
 
2219
+ // src/ocr/batch-provider.ts
2220
+ var batch_provider_exports = {};
2221
+ __export(batch_provider_exports, {
2222
+ DEFAULT_BATCH_SIZES: () => DEFAULT_BATCH_SIZES,
2223
+ createBatchCliProvider: () => createBatchCliProvider
2224
+ });
2225
+ import { spawnSync as spawnSync2 } from "child_process";
2226
+ import { writeFileSync as writeFileSync2, readFileSync as readFileSync2, unlinkSync as unlinkSync2, mkdirSync as mkdirSync2 } from "fs";
2227
+ import { join as join2 } from "path";
2228
+ import { tmpdir as tmpdir2 } from "os";
2229
+ function getBatchTempDir() {
2230
+ if (!_batchTempDir) {
2231
+ _batchTempDir = join2(process.cwd(), "_kordoc_ocr_tmp");
2232
+ mkdirSync2(_batchTempDir, { recursive: true });
2233
+ }
2234
+ return _batchTempDir;
2235
+ }
2236
+ function createBatchCliProvider(mode, batchSize) {
2237
+ return {
2238
+ __batch: true,
2239
+ batchSize,
2240
+ async processBatch(pages) {
2241
+ const results = /* @__PURE__ */ new Map();
2242
+ const tempDir = getBatchTempDir();
2243
+ const tempFiles = [];
2244
+ try {
2245
+ for (const { image, pageNum } of pages) {
2246
+ const path = join2(tempDir, `batch-p${pageNum}.png`);
2247
+ writeFileSync2(path, image);
2248
+ tempFiles.push(path);
2249
+ }
2250
+ let output;
2251
+ if (mode === "codex") {
2252
+ output = callBatchCodexCli(tempFiles);
2253
+ } else {
2254
+ output = callBatchCli(mode, tempFiles);
2255
+ }
2256
+ const cleaned = stripCodeFence2(output.trim());
2257
+ const parts = cleaned.split(/<!--\s*PAGE_BREAK\s*-->/).map((p) => p.trim()).filter((p) => p.length > 0);
2258
+ for (let i = 0; i < pages.length; i++) {
2259
+ const pageNum = pages[i].pageNum;
2260
+ if (i < parts.length) {
2261
+ results.set(pageNum, { markdown: parts[i] });
2262
+ }
2263
+ }
2264
+ } finally {
2265
+ for (const f of tempFiles) {
2266
+ try {
2267
+ unlinkSync2(f);
2268
+ } catch {
2269
+ }
2270
+ }
2271
+ }
2272
+ return results;
2273
+ }
2274
+ };
2275
+ }
2276
+ function callBatchCli(mode, imagePaths) {
2277
+ const fileRefs = imagePaths.map((p) => `@${p}`).join("\n");
2278
+ const prompt = `${BATCH_OCR_PROMPT}
2279
+
2280
+ ${fileRefs}`;
2281
+ let args;
2282
+ if (mode === "gemini") {
2283
+ args = ["--prompt", prompt, "--yolo"];
2284
+ const model = process.env.KORDOC_GEMINI_MODEL;
2285
+ if (model) args.push("--model", model);
2286
+ } else {
2287
+ args = ["--print", prompt];
2288
+ const model = process.env.KORDOC_CLAUDE_MODEL;
2289
+ if (model) args.push("--model", model);
2290
+ }
2291
+ const timeoutMs = 6e4 + imagePaths.length * 2e4;
2292
+ const result = spawnSync2(mode, args, {
2293
+ encoding: "utf-8",
2294
+ timeout: timeoutMs,
2295
+ maxBuffer: 50 * 1024 * 1024,
2296
+ // 50MB (large batch output)
2297
+ ...mode === "claude" ? { cwd: tmpdir2() } : {}
2298
+ });
2299
+ if (result.error) {
2300
+ throw new Error(`${mode} \uBC30\uCE58 OCR \uC2E4\uD328: ${result.error.message}`);
2301
+ }
2302
+ if (result.status !== 0) {
2303
+ const errMsg = result.stderr?.trim() || `exit code ${result.status}`;
2304
+ throw new Error(`${mode} \uBC30\uCE58 OCR \uC2E4\uD328: ${errMsg}`);
2305
+ }
2306
+ return result.stdout || "";
2307
+ }
2308
+ function callBatchCodexCli(imagePaths) {
2309
+ const outPath = join2(tmpdir2(), `kordoc-codex-batch-${Date.now()}.txt`);
2310
+ try {
2311
+ const args = ["exec", BATCH_OCR_PROMPT];
2312
+ for (const p of imagePaths) {
2313
+ args.push("--image", p);
2314
+ }
2315
+ args.push("--output-last-message", outPath);
2316
+ const model = process.env.KORDOC_CODEX_MODEL;
2317
+ if (model) args.push("--model", model);
2318
+ const timeoutMs = 6e4 + imagePaths.length * 2e4;
2319
+ const result = spawnSync2("codex", args, {
2320
+ encoding: "utf-8",
2321
+ timeout: timeoutMs,
2322
+ maxBuffer: 50 * 1024 * 1024,
2323
+ input: ""
2324
+ });
2325
+ if (result.error) {
2326
+ throw new Error(`codex \uBC30\uCE58 OCR \uC2E4\uD328: ${result.error.message}`);
2327
+ }
2328
+ if (result.status !== 0) {
2329
+ const errMsg = result.stderr?.trim() || `exit code ${result.status}`;
2330
+ throw new Error(`codex \uBC30\uCE58 OCR \uC2E4\uD328: ${errMsg}`);
2331
+ }
2332
+ try {
2333
+ return readFileSync2(outPath, "utf-8");
2334
+ } catch {
2335
+ return result.stdout || "";
2336
+ }
2337
+ } finally {
2338
+ try {
2339
+ unlinkSync2(outPath);
2340
+ } catch {
2341
+ }
2342
+ }
2343
+ }
2344
+ function stripCodeFence2(text) {
2345
+ const match = text.match(/^```(?:markdown|md)?\s*\n([\s\S]*?)\n```\s*$/m);
2346
+ return match ? match[1].trim() : text;
2347
+ }
2348
+ var BATCH_OCR_PROMPT, DEFAULT_BATCH_SIZES, _batchTempDir;
2349
+ var init_batch_provider = __esm({
2350
+ "src/ocr/batch-provider.ts"() {
2351
+ "use strict";
2352
+ BATCH_OCR_PROMPT = "\uB2E4\uC74C \uBB38\uC11C \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uB4E4\uC744 OCR\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uC138\uC694.\n\n\uADDC\uCE59:\n- \uAC01 \uD398\uC774\uC9C0 \uACB0\uACFC \uC0AC\uC774\uC5D0 \uBC18\uB4DC\uC2DC \uC774 \uAD6C\uBD84\uC790\uB97C \uC0BD\uC785: <!-- PAGE_BREAK -->\n- \uD14C\uC774\uBE14\uC740 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)\n- \uBCD1\uD569\uB41C \uC140\uC740 \uD574\uB2F9 \uC704\uCE58\uC5D0 \uB0B4\uC6A9 \uAE30\uC7AC\n- \uD5E4\uB529\uC740 \uAE00\uC790 \uD06C\uAE30\uC5D0 \uB530\uB77C ## ~ ###### \uC0AC\uC6A9\n- \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9\n- \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC\n- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0\n- ```\uB85C \uAC10\uC2F8\uC9C0 \uB9D0\uACE0 \uC21C\uC218 Markdown\uB9CC \uCD9C\uB825";
2353
+ DEFAULT_BATCH_SIZES = {
2354
+ gemini: 5,
2355
+ claude: 5,
2356
+ codex: 10
2357
+ };
2358
+ _batchTempDir = null;
2359
+ }
2360
+ });
2361
+
2219
2362
  // src/ocr/resolve.ts
2220
2363
  var resolve_exports = {};
2221
2364
  __export(resolve_exports, {
2222
2365
  resolveOcrProvider: () => resolveOcrProvider
2223
2366
  });
2224
- async function resolveOcrProvider(mode, warnings, concurrency) {
2367
+ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
2225
2368
  if (mode === "off") {
2226
2369
  throw new Error("OCR\uC774 \uBE44\uD65C\uC131\uD654\uB418\uC5B4 \uC788\uC2B5\uB2C8\uB2E4 (--ocr off).");
2227
2370
  }
@@ -2234,6 +2377,14 @@ async function resolveOcrProvider(mode, warnings, concurrency) {
2234
2377
  }
2235
2378
  return createTesseractProvider2();
2236
2379
  }
2380
+ if (mode === "gemini" || mode === "claude" || mode === "codex") {
2381
+ const { createBatchCliProvider: createBatchCliProvider2, DEFAULT_BATCH_SIZES: DEFAULT_BATCH_SIZES2 } = await Promise.resolve().then(() => (init_batch_provider(), batch_provider_exports));
2382
+ const effectiveBatch = batchSize ?? DEFAULT_BATCH_SIZES2[mode];
2383
+ if (effectiveBatch > 1) {
2384
+ return createBatchCliProvider2(mode, effectiveBatch);
2385
+ }
2386
+ return createCliOcrProvider(mode);
2387
+ }
2237
2388
  return createCliOcrProvider(mode);
2238
2389
  }
2239
2390
  const detected = detectAvailableOcr();
@@ -2257,6 +2408,14 @@ async function resolveOcrProvider(mode, warnings, concurrency) {
2257
2408
  }
2258
2409
  return createTesseractProvider2();
2259
2410
  }
2411
+ if (detected === "gemini" || detected === "codex" || detected === "claude") {
2412
+ const { createBatchCliProvider: createBatchCliProvider2, DEFAULT_BATCH_SIZES: DEFAULT_BATCH_SIZES2 } = await Promise.resolve().then(() => (init_batch_provider(), batch_provider_exports));
2413
+ const effectiveBatch = batchSize ?? DEFAULT_BATCH_SIZES2[detected];
2414
+ if (effectiveBatch > 1) {
2415
+ return createBatchCliProvider2(detected, effectiveBatch);
2416
+ }
2417
+ return createCliOcrProvider(detected);
2418
+ }
2260
2419
  return createCliOcrProvider(detected);
2261
2420
  }
2262
2421
  var init_resolve = __esm({
@@ -2419,8 +2578,14 @@ function ocrResultToBlocks(result, pageNum) {
2419
2578
  }
2420
2579
  return pageBlocks;
2421
2580
  }
2422
- async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1) {
2581
+ function isBatchProvider(p) {
2582
+ return !!p && typeof p === "object" && "__batch" in p && p.__batch === true;
2583
+ }
2584
+ async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1, onProgress) {
2423
2585
  const blocks = [];
2586
+ if (isBatchProvider(provider)) {
2587
+ return ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, onProgress);
2588
+ }
2424
2589
  if (concurrency <= 1) {
2425
2590
  for (let i = 1; i <= effectivePageCount; i++) {
2426
2591
  if (pageFilter && !pageFilter.has(i)) continue;
@@ -2466,6 +2631,47 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings,
2466
2631
  }
2467
2632
  return blocks;
2468
2633
  }
2634
+ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, onProgress) {
2635
+ const blocks = [];
2636
+ const pageNumbers = [];
2637
+ for (let i = 1; i <= effectivePageCount; i++) {
2638
+ if (pageFilter && !pageFilter.has(i)) continue;
2639
+ pageNumbers.push(i);
2640
+ }
2641
+ const pageImages = [];
2642
+ for (const pageNum of pageNumbers) {
2643
+ const page = await doc.getPage(pageNum);
2644
+ const image = await renderPageToPng(page);
2645
+ pageImages.push({ image, pageNum });
2646
+ }
2647
+ const batches = [];
2648
+ for (let i = 0; i < pageImages.length; i += provider.batchSize) {
2649
+ batches.push(pageImages.slice(i, i + provider.batchSize));
2650
+ }
2651
+ let processed = 0;
2652
+ for (const batch of batches) {
2653
+ try {
2654
+ const results = await provider.processBatch(batch);
2655
+ for (const { pageNum } of batch) {
2656
+ const result = results.get(pageNum);
2657
+ if (result) {
2658
+ for (const b of ocrResultToBlocks(result, pageNum)) blocks.push(b);
2659
+ }
2660
+ processed++;
2661
+ onProgress?.(processed, pageNumbers.length);
2662
+ }
2663
+ } catch (err) {
2664
+ const range = `${batch[0].pageNum}-${batch[batch.length - 1].pageNum}`;
2665
+ warnings?.push({
2666
+ message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
2667
+ code: "OCR_PAGE_FAILED"
2668
+ });
2669
+ processed += batch.length;
2670
+ onProgress?.(processed, pageNumbers.length);
2671
+ }
2672
+ }
2673
+ return blocks;
2674
+ }
2469
2675
  async function renderPageToPng(page) {
2470
2676
  const { createCanvas } = await import("@napi-rs/canvas");
2471
2677
  const scale = 2;
@@ -2531,7 +2737,7 @@ import JSZip2 from "jszip";
2531
2737
  import { DOMParser } from "@xmldom/xmldom";
2532
2738
 
2533
2739
  // src/utils.ts
2534
- var VERSION = true ? "2.2.9" : "0.0.0-dev";
2740
+ var VERSION = true ? "2.3.0" : "0.0.0-dev";
2535
2741
  function toArrayBuffer(buf) {
2536
2742
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
2537
2743
  return buf.buffer;
@@ -6049,7 +6255,8 @@ async function parsePdfDocument(buffer, options) {
6049
6255
  try {
6050
6256
  const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
6051
6257
  const concurrency = options?.ocrConcurrency ?? 1;
6052
- ocrProvider = await resolveOcrProvider2(ocrMode, warnings, concurrency);
6258
+ const batchSize = options?.ocrBatchSize;
6259
+ ocrProvider = await resolveOcrProvider2(ocrMode, warnings, concurrency, batchSize);
6053
6260
  } catch (resolveErr) {
6054
6261
  if (ocrMode !== "auto") {
6055
6262
  throw Object.assign(
@@ -6064,7 +6271,7 @@ async function parsePdfDocument(buffer, options) {
6064
6271
  try {
6065
6272
  const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
6066
6273
  const concurrency = options?.ocrConcurrency ?? 1;
6067
- ocrBlocks = await ocrPages2(doc, ocrProvider, pageFilter, effectivePageCount, warnings, concurrency);
6274
+ ocrBlocks = await ocrPages2(doc, ocrProvider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
6068
6275
  } catch {
6069
6276
  } finally {
6070
6277
  const terminable = ocrProvider;