@clazic/kordoc 2.2.9 → 2.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/batch-provider-PCT4I4LK.js +169 -0
- package/dist/batch-provider-PCT4I4LK.js.map +1 -0
- package/dist/{chunk-FF5M4SDK.js → chunk-W5KUC23B.js} +2 -2
- package/dist/{chunk-OL2NDK3E.js → chunk-ZOEUKD77.js} +7 -6
- package/dist/chunk-ZOEUKD77.js.map +1 -0
- package/dist/cli.js +12 -6
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +256 -8
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +256 -8
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +2 -2
- package/dist/{provider-I3XGSVL6.js → provider-WYHC4NHI.js} +60 -2
- package/dist/provider-WYHC4NHI.js.map +1 -0
- package/dist/{resolve-UFUJEPCJ.js → resolve-4FSAQF2S.js} +19 -3
- package/dist/resolve-4FSAQF2S.js.map +1 -0
- package/dist/{utils-CU26KLDC.js → utils-HSF5HI5T.js} +2 -2
- package/dist/{watch-Z6SH4KRB.js → watch-R2JHXDGF.js} +3 -3
- package/package.json +1 -1
- package/dist/chunk-OL2NDK3E.js.map +0 -1
- package/dist/provider-I3XGSVL6.js.map +0 -1
- package/dist/resolve-UFUJEPCJ.js.map +0 -1
- /package/dist/{chunk-FF5M4SDK.js.map → chunk-W5KUC23B.js.map} +0 -0
- /package/dist/{utils-CU26KLDC.js.map → utils-HSF5HI5T.js.map} +0 -0
- /package/dist/{watch-Z6SH4KRB.js.map → watch-R2JHXDGF.js.map} +0 -0
package/dist/index.d.cts
CHANGED
|
@@ -119,6 +119,8 @@ interface ParseOptions {
|
|
|
119
119
|
ocrConcurrency?: number;
|
|
120
120
|
/** 진행률 콜백 — current: 현재 페이지/섹션, total: 전체 수 */
|
|
121
121
|
onProgress?: (current: number, total: number) => void;
|
|
122
|
+
/** OCR 배치 크기 — CLI 1회 호출당 처리할 페이지 수. gemini/claude/codex 전용. */
|
|
123
|
+
ocrBatchSize?: number;
|
|
122
124
|
/** PDF 머리글/바닥글 자동 제거 */
|
|
123
125
|
removeHeaderFooter?: boolean;
|
|
124
126
|
}
|
package/dist/index.d.ts
CHANGED
|
@@ -119,6 +119,8 @@ interface ParseOptions {
|
|
|
119
119
|
ocrConcurrency?: number;
|
|
120
120
|
/** 진행률 콜백 — current: 현재 페이지/섹션, total: 전체 수 */
|
|
121
121
|
onProgress?: (current: number, total: number) => void;
|
|
122
|
+
/** OCR 배치 크기 — CLI 1회 호출당 처리할 페이지 수. gemini/claude/codex 전용. */
|
|
123
|
+
ocrBatchSize?: number;
|
|
122
124
|
/** PDF 머리글/바닥글 자동 제거 */
|
|
123
125
|
removeHeaderFooter?: boolean;
|
|
124
126
|
}
|
package/dist/index.js
CHANGED
|
@@ -2019,7 +2019,7 @@ import { join } from "path";
|
|
|
2019
2019
|
import { tmpdir } from "os";
|
|
2020
2020
|
function getTempDir() {
|
|
2021
2021
|
if (!_tempDir) {
|
|
2022
|
-
_tempDir = join(process.cwd(), "
|
|
2022
|
+
_tempDir = join(process.cwd(), "_kordoc_ocr_tmp");
|
|
2023
2023
|
mkdirSync(_tempDir, { recursive: true });
|
|
2024
2024
|
}
|
|
2025
2025
|
return _tempDir;
|
|
@@ -2119,8 +2119,8 @@ function buildCliArgs(mode, imagePath) {
|
|
|
2119
2119
|
}
|
|
2120
2120
|
}
|
|
2121
2121
|
async function callOllamaApi(imagePath) {
|
|
2122
|
-
const { readFileSync:
|
|
2123
|
-
const imageBase64 =
|
|
2122
|
+
const { readFileSync: readFileSync3 } = await import("fs");
|
|
2123
|
+
const imageBase64 = readFileSync3(imagePath).toString("base64");
|
|
2124
2124
|
const model = process.env.KORDOC_OLLAMA_MODEL || "qwen3-vl:8b";
|
|
2125
2125
|
const host = process.env.KORDOC_OLLAMA_HOST || "http://localhost:11434";
|
|
2126
2126
|
const timeoutMs = Number(process.env.KORDOC_OLLAMA_TIMEOUT) || 12e4;
|
|
@@ -2216,12 +2216,185 @@ var init_tesseract_provider = __esm({
|
|
|
2216
2216
|
}
|
|
2217
2217
|
});
|
|
2218
2218
|
|
|
2219
|
+
// src/ocr/batch-provider.ts
|
|
2220
|
+
var batch_provider_exports = {};
|
|
2221
|
+
__export(batch_provider_exports, {
|
|
2222
|
+
DEFAULT_BATCH_SIZES: () => DEFAULT_BATCH_SIZES,
|
|
2223
|
+
createBatchCliProvider: () => createBatchCliProvider
|
|
2224
|
+
});
|
|
2225
|
+
import { spawn } from "child_process";
|
|
2226
|
+
import { writeFileSync as writeFileSync2, readFileSync as readFileSync2, unlinkSync as unlinkSync2, mkdirSync as mkdirSync2 } from "fs";
|
|
2227
|
+
import { join as join2 } from "path";
|
|
2228
|
+
import { tmpdir as tmpdir2 } from "os";
|
|
2229
|
+
/**
 * Returns the scratch directory that batch OCR writes page images into.
 *
 * The directory is `_kordoc_ocr_tmp` under the current working directory. It is
 * created (recursively) the first time this is called, and the path is cached
 * in the module-level `_batchTempDir` for later calls.
 *
 * @returns Path of the scratch directory.
 */
function getBatchTempDir() {
  if (_batchTempDir) return _batchTempDir;
  // Cache first, then create — same ordering as the lazy initialisation always had.
  _batchTempDir = join2(process.cwd(), "_kordoc_ocr_tmp");
  mkdirSync2(_batchTempDir, { recursive: true });
  return _batchTempDir;
}
|
|
2236
|
+
/**
 * Builds an OCR provider that sends several page images to a CLI tool in a
 * single invocation and splits the answer back into per-page markdown.
 *
 * @param mode CLI to drive. "codex" is routed to callBatchCodexCli; any other
 *   value is passed to callBatchCli.
 * @param batchSize Pages per CLI call; exposed unchanged as `batchSize` on the
 *   returned provider.
 * @returns Provider object marked with `__batch: true` and exposing
 *   `processBatch(pages)`.
 */
function createBatchCliProvider(mode, batchSize) {
  return {
    __batch: true,
    batchSize,
    /**
     * OCRs one batch of pages.
     *
     * @param pages Array of `{ image, pageNum }`; `image` is written to disk as-is.
     * @returns Map from pageNum to `{ markdown }`. Pages for which the CLI
     *   produced no text are absent from the map.
     */
    async processBatch(pages) {
      const results = new Map();
      const tempDir = getBatchTempDir();
      const tempFiles = [];
      // Per-call token: the scratch directory is shared, so file names that depend
      // only on the page number would be overwritten by a concurrent parse.
      const runId = `${process.pid}-${Date.now()}-${Math.random().toString(36).slice(2)}`;
      try {
        for (const { image, pageNum } of pages) {
          const path = join2(tempDir, `batch-${runId}-p${pageNum}.png`);
          writeFileSync2(path, image);
          tempFiles.push(path);
        }
        const output = mode === "codex"
          ? await callBatchCodexCli(tempFiles)
          : await callBatchCli(mode, tempFiles);
        const cleaned = stripCodeFence2(output.trim());
        const segments = cleaned.split(/<!--\s*PAGE_BREAK\s*-->/).map((p) => p.trim());
        // When the segment count matches the page count, map strictly by position so
        // a blank page cannot shift later pages onto the wrong page number. Only
        // when the counts disagree (e.g. a stray leading/trailing separator) fall
        // back to dropping empty segments.
        const parts = segments.length === pages.length
          ? segments
          : segments.filter((p) => p.length > 0);
        for (let i = 0; i < pages.length; i++) {
          const markdown = parts[i];
          if (markdown) {
            results.set(pages[i].pageNum, { markdown });
          }
        }
      } finally {
        // Best-effort cleanup: a file that is already gone is not an error.
        for (const f of tempFiles) {
          try {
            unlinkSync2(f);
          } catch {
          }
        }
      }
      return results;
    }
  };
}
|
|
2276
|
+
function spawnAsync(cmd, args, opts) {
|
|
2277
|
+
return new Promise((resolve, reject) => {
|
|
2278
|
+
const child = spawn(cmd, args, {
|
|
2279
|
+
cwd: opts.cwd,
|
|
2280
|
+
env: process.env,
|
|
2281
|
+
stdio: ["pipe", "pipe", "pipe"]
|
|
2282
|
+
});
|
|
2283
|
+
let stdout = "";
|
|
2284
|
+
let stderr = "";
|
|
2285
|
+
let killed = false;
|
|
2286
|
+
child.stdout.setEncoding("utf-8");
|
|
2287
|
+
child.stderr.setEncoding("utf-8");
|
|
2288
|
+
child.stdout.on("data", (d) => {
|
|
2289
|
+
stdout += d;
|
|
2290
|
+
});
|
|
2291
|
+
child.stderr.on("data", (d) => {
|
|
2292
|
+
stderr += d;
|
|
2293
|
+
});
|
|
2294
|
+
const timer = setTimeout(() => {
|
|
2295
|
+
killed = true;
|
|
2296
|
+
child.kill("SIGTERM");
|
|
2297
|
+
}, opts.timeoutMs);
|
|
2298
|
+
if (opts.stdin !== void 0) {
|
|
2299
|
+
child.stdin.end(opts.stdin);
|
|
2300
|
+
} else {
|
|
2301
|
+
child.stdin.end();
|
|
2302
|
+
}
|
|
2303
|
+
child.on("close", (code) => {
|
|
2304
|
+
clearTimeout(timer);
|
|
2305
|
+
if (killed) {
|
|
2306
|
+
reject(new Error(`\uD0C0\uC784\uC544\uC6C3 (${Math.round(opts.timeoutMs / 1e3)}\uCD08)`));
|
|
2307
|
+
} else {
|
|
2308
|
+
resolve({ stdout, stderr, exitCode: code ?? 1 });
|
|
2309
|
+
}
|
|
2310
|
+
});
|
|
2311
|
+
child.on("error", (err) => {
|
|
2312
|
+
clearTimeout(timer);
|
|
2313
|
+
reject(err);
|
|
2314
|
+
});
|
|
2315
|
+
});
|
|
2316
|
+
}
|
|
2317
|
+
/**
 * Sends one batch of page images to the `gemini` or `claude` CLI and returns
 * the CLI's stdout.
 *
 * The prompt is BATCH_OCR_PROMPT, a blank line, then one `@<path>` reference per
 * image. "gemini" is called with `--prompt … --yolo`; every other mode is called
 * with `--print …`. An optional model comes from KORDOC_GEMINI_MODEL or
 * KORDOC_CLAUDE_MODEL respectively.
 *
 * @param mode CLI executable name; also used as the command to spawn.
 * @param imagePaths Paths of the page images, in page order.
 * @returns The CLI's stdout, or "" when it printed nothing.
 * @throws Error when the CLI exits with a non-zero code.
 */
async function callBatchCli(mode, imagePaths) {
  const attachments = imagePaths.map((p) => `@${p}`);
  const prompt = [BATCH_OCR_PROMPT, "", attachments.join("\n")].join("\n");
  const isGemini = mode === "gemini";
  const args = isGemini ? ["--prompt", prompt, "--yolo"] : ["--print", prompt];
  const model = isGemini ? process.env.KORDOC_GEMINI_MODEL : process.env.KORDOC_CLAUDE_MODEL;
  if (model) args.push("--model", model);
  // 60 s base plus 20 s per image.
  const timeoutMs = 6e4 + imagePaths.length * 2e4;
  // Only the claude CLI is started from the OS temp directory.
  const spawnOpts = mode === "claude" ? { timeoutMs, cwd: tmpdir2() } : { timeoutMs };
  const { exitCode, stderr, stdout } = await spawnAsync(mode, args, spawnOpts);
  if (exitCode !== 0) {
    const errMsg = stderr?.trim() || `exit code ${exitCode}`;
    throw new Error(`${mode} \uBC30\uCE58 OCR \uC2E4\uD328: ${errMsg}`);
  }
  return stdout || "";
}
|
|
2343
|
+
/**
 * Sends one batch of page images to `codex exec` and returns its final message.
 *
 * Each image is passed with `--image`, and the answer is requested via
 * `--output-last-message` into a uniquely named file in the OS temp directory.
 * If that file cannot be read, the CLI's stdout is returned instead. The file
 * is removed afterwards on a best-effort basis.
 *
 * @param imagePaths Paths of the page images, in page order.
 * @returns The text codex produced, or "" when nothing was produced.
 * @throws Error when codex exits with a non-zero code.
 */
async function callBatchCodexCli(imagePaths) {
  const uniqueSuffix = `${Date.now()}-${Math.random().toString(36).slice(2)}`;
  const outPath = join2(tmpdir2(), `kordoc-codex-batch-${uniqueSuffix}.txt`);
  try {
    const imageArgs = imagePaths.flatMap((p) => ["--image", p]);
    const args = ["exec", BATCH_OCR_PROMPT, ...imageArgs, "--output-last-message", outPath];
    const model = process.env.KORDOC_CODEX_MODEL;
    if (model) args.push("--model", model);
    // 60 s base plus 20 s per image; stdin is closed with empty input.
    const { exitCode, stderr, stdout } = await spawnAsync("codex", args, {
      timeoutMs: 6e4 + imagePaths.length * 2e4,
      stdin: ""
    });
    if (exitCode !== 0) {
      const errMsg = stderr?.trim() || `exit code ${exitCode}`;
      throw new Error(`codex \uBC30\uCE58 OCR \uC2E4\uD328: ${errMsg}`);
    }
    try {
      return readFileSync2(outPath, "utf-8");
    } catch {
      // No readable output file — fall back to whatever was printed.
      return stdout || "";
    }
  } finally {
    try {
      unlinkSync2(outPath);
    } catch {
    }
  }
}
|
|
2374
|
+
/**
 * Removes a Markdown code fence that wraps the ENTIRE text.
 *
 * Accepts an opening fence of ```, ```markdown or ```md. The pattern is anchored
 * to the start and end of the whole string (no multiline flag): with per-line
 * anchors, any fenced code block embedded in the OCR output would match and the
 * rest of the document — including every other page of a batch — would be
 * discarded.
 *
 * @param text Already-trimmed CLI output.
 * @returns The fenced content, trimmed, when the whole text is one fenced block;
 *   otherwise `text` unchanged.
 */
function stripCodeFence2(text) {
  const match = text.match(/^```(?:markdown|md)?\s*\n([\s\S]*?)\n```\s*$/);
  return match ? match[1].trim() : text;
}
|
|
2378
|
+
// Module state of src/ocr/batch-provider.ts; the values are assigned when
// init_batch_provider runs.
var BATCH_OCR_PROMPT, DEFAULT_BATCH_SIZES, _batchTempDir;
var init_batch_provider = __esm({
  "src/ocr/batch-provider.ts"() {
    "use strict";
    // Cached scratch-directory path; stays null until getBatchTempDir fills it in.
    _batchTempDir = null;
    // Pages per CLI call, keyed by CLI name.
    DEFAULT_BATCH_SIZES = {
      gemini: 5,
      claude: 5,
      codex: 10
    };
    // Prompt sent with every batch (Korean). In English: "OCR the following
    // document page images and convert them to pure Markdown. Rules: always insert
    // the separator <!-- PAGE_BREAK --> between page results; use Markdown table
    // syntax for tables (| separators, including the |---|---| header rule); write
    // merged-cell content at its position; use ## to ###### for headings according
    // to font size; use - or 1. for lists; ignore images, shapes and other non-text
    // elements; keep the original reading order and structure; do not wrap the
    // output in ``` — output pure Markdown only."
    BATCH_OCR_PROMPT = "\uB2E4\uC74C \uBB38\uC11C \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uB4E4\uC744 OCR\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uC138\uC694.\n\n\uADDC\uCE59:\n- \uAC01 \uD398\uC774\uC9C0 \uACB0\uACFC \uC0AC\uC774\uC5D0 \uBC18\uB4DC\uC2DC \uC774 \uAD6C\uBD84\uC790\uB97C \uC0BD\uC785: <!-- PAGE_BREAK -->\n- \uD14C\uC774\uBE14\uC740 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)\n- \uBCD1\uD569\uB41C \uC140\uC740 \uD574\uB2F9 \uC704\uCE58\uC5D0 \uB0B4\uC6A9 \uAE30\uC7AC\n- \uD5E4\uB529\uC740 \uAE00\uC790 \uD06C\uAE30\uC5D0 \uB530\uB77C ## ~ ###### \uC0AC\uC6A9\n- \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9\n- \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC\n- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0\n- ```\uB85C \uAC10\uC2F8\uC9C0 \uB9D0\uACE0 \uC21C\uC218 Markdown\uB9CC \uCD9C\uB825";
  }
});
|
|
2391
|
+
|
|
2219
2392
|
// src/ocr/resolve.ts
|
|
2220
2393
|
var resolve_exports = {};
|
|
2221
2394
|
__export(resolve_exports, {
|
|
2222
2395
|
resolveOcrProvider: () => resolveOcrProvider
|
|
2223
2396
|
});
|
|
2224
|
-
async function resolveOcrProvider(mode, warnings, concurrency) {
|
|
2397
|
+
async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
|
|
2225
2398
|
if (mode === "off") {
|
|
2226
2399
|
throw new Error("OCR\uC774 \uBE44\uD65C\uC131\uD654\uB418\uC5B4 \uC788\uC2B5\uB2C8\uB2E4 (--ocr off).");
|
|
2227
2400
|
}
|
|
@@ -2234,6 +2407,14 @@ async function resolveOcrProvider(mode, warnings, concurrency) {
|
|
|
2234
2407
|
}
|
|
2235
2408
|
return createTesseractProvider2();
|
|
2236
2409
|
}
|
|
2410
|
+
if (mode === "gemini" || mode === "claude" || mode === "codex") {
|
|
2411
|
+
const { createBatchCliProvider: createBatchCliProvider2, DEFAULT_BATCH_SIZES: DEFAULT_BATCH_SIZES2 } = await Promise.resolve().then(() => (init_batch_provider(), batch_provider_exports));
|
|
2412
|
+
const effectiveBatch = batchSize ?? DEFAULT_BATCH_SIZES2[mode];
|
|
2413
|
+
if (effectiveBatch > 1) {
|
|
2414
|
+
return createBatchCliProvider2(mode, effectiveBatch);
|
|
2415
|
+
}
|
|
2416
|
+
return createCliOcrProvider(mode);
|
|
2417
|
+
}
|
|
2237
2418
|
return createCliOcrProvider(mode);
|
|
2238
2419
|
}
|
|
2239
2420
|
const detected = detectAvailableOcr();
|
|
@@ -2257,6 +2438,14 @@ async function resolveOcrProvider(mode, warnings, concurrency) {
|
|
|
2257
2438
|
}
|
|
2258
2439
|
return createTesseractProvider2();
|
|
2259
2440
|
}
|
|
2441
|
+
if (detected === "gemini" || detected === "codex" || detected === "claude") {
|
|
2442
|
+
const { createBatchCliProvider: createBatchCliProvider2, DEFAULT_BATCH_SIZES: DEFAULT_BATCH_SIZES2 } = await Promise.resolve().then(() => (init_batch_provider(), batch_provider_exports));
|
|
2443
|
+
const effectiveBatch = batchSize ?? DEFAULT_BATCH_SIZES2[detected];
|
|
2444
|
+
if (effectiveBatch > 1) {
|
|
2445
|
+
return createBatchCliProvider2(detected, effectiveBatch);
|
|
2446
|
+
}
|
|
2447
|
+
return createCliOcrProvider(detected);
|
|
2448
|
+
}
|
|
2260
2449
|
return createCliOcrProvider(detected);
|
|
2261
2450
|
}
|
|
2262
2451
|
var init_resolve = __esm({
|
|
@@ -2419,8 +2608,14 @@ function ocrResultToBlocks(result, pageNum) {
|
|
|
2419
2608
|
}
|
|
2420
2609
|
return pageBlocks;
|
|
2421
2610
|
}
|
|
2422
|
-
|
|
2611
|
+
/**
 * Tells whether `p` is a batch OCR provider, i.e. an object whose `__batch`
 * property is exactly `true`.
 *
 * @param p Any value.
 * @returns `true` only for objects carrying the `__batch === true` marker.
 */
function isBatchProvider(p) {
  if (!p || typeof p !== "object") return false;
  return "__batch" in p && p.__batch === true;
}
|
|
2614
|
+
async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1, onProgress) {
|
|
2423
2615
|
const blocks = [];
|
|
2616
|
+
if (isBatchProvider(provider)) {
|
|
2617
|
+
return ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, concurrency, onProgress);
|
|
2618
|
+
}
|
|
2424
2619
|
if (concurrency <= 1) {
|
|
2425
2620
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
2426
2621
|
if (pageFilter && !pageFilter.has(i)) continue;
|
|
@@ -2466,6 +2661,58 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings,
|
|
|
2466
2661
|
}
|
|
2467
2662
|
return blocks;
|
|
2468
2663
|
}
|
|
2664
|
+
/**
 * OCRs the selected pages of a document through a batch provider.
 *
 * Every selected page is rendered to PNG first, the images are grouped into
 * batches of `provider.batchSize`, and the batches are run through
 * `provider.processBatch` with at most `concurrency` in flight. A failed batch
 * adds one OCR_PAGE_FAILED warning covering its page range and contributes no
 * blocks; the other batches are unaffected.
 *
 * @param doc Document exposing `getPage(pageNum)` (1-based page numbers).
 * @param provider Batch provider with `batchSize` and `processBatch(pages)`.
 * @param pageFilter Optional set-like object; when given, only pages for which
 *   `has(pageNum)` is true are processed.
 * @param effectivePageCount Highest page number to consider.
 * @param warnings Optional array that receives `{ message, code }` entries.
 * @param concurrency Maximum number of batches run at once (values below 1 are
 *   treated as 1).
 * @param onProgress Optional callback `(processedPages, totalPages)` invoked
 *   after each batch finishes.
 * @returns Flat array of blocks from all batches.
 */
async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1, onProgress) {
  const pageNumbers = [];
  for (let i = 1; i <= effectivePageCount; i++) {
    if (pageFilter && !pageFilter.has(i)) continue;
    pageNumbers.push(i);
  }
  const pageImages = [];
  for (const pageNum of pageNumbers) {
    const page = await doc.getPage(pageNum);
    const image = await renderPageToPng(page);
    pageImages.push({ image, pageNum });
  }
  // The batch size is the loop stride below: 0 or a negative value would never
  // advance (endless loop), and a fractional value would produce overlapping
  // slices. Normalise to a whole number of at least 1.
  const requestedSize = Math.floor(Number(provider.batchSize));
  const batchSize = requestedSize >= 1 ? requestedSize : 1;
  const batches = [];
  for (let i = 0; i < pageImages.length; i += batchSize) {
    batches.push(pageImages.slice(i, i + batchSize));
  }
  let processed = 0;
  const batchTasks = batches.map((batch, batchIdx) => async () => {
    const pageBlocks = [];
    try {
      const results = await provider.processBatch(batch);
      for (const { pageNum } of batch) {
        const result = results.get(pageNum);
        pageBlocks.push({
          pageNum,
          // A page the provider returned nothing for simply contributes no blocks.
          blocks: result ? ocrResultToBlocks(result, pageNum) : []
        });
      }
    } catch (err) {
      const range = `${batch[0].pageNum}-${batch[batch.length - 1].pageNum}`;
      warnings?.push({
        message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
        code: "OCR_PAGE_FAILED"
      });
      for (const { pageNum } of batch) {
        pageBlocks.push({ pageNum, blocks: [] });
      }
    }
    processed += batch.length;
    onProgress?.(processed, pageNumbers.length);
    return { batchIdx, pageBlocks };
  });
  const effectiveConcurrency = Math.max(1, concurrency);
  const batchResults = await runWithConcurrency(batchTasks, effectiveConcurrency);
  const blocks = [];
  for (const result of batchResults) {
    for (const { blocks: pageBlks } of result.pageBlocks) {
      for (const b of pageBlks) blocks.push(b);
    }
  }
  return blocks;
}
|
|
2469
2716
|
async function renderPageToPng(page) {
|
|
2470
2717
|
const { createCanvas } = await import("@napi-rs/canvas");
|
|
2471
2718
|
const scale = 2;
|
|
@@ -2531,7 +2778,7 @@ import JSZip2 from "jszip";
|
|
|
2531
2778
|
import { DOMParser } from "@xmldom/xmldom";
|
|
2532
2779
|
|
|
2533
2780
|
// src/utils.ts
|
|
2534
|
-
var VERSION = true ? "2.
|
|
2781
|
+
var VERSION = true ? "2.3.1" : "0.0.0-dev";
|
|
2535
2782
|
function toArrayBuffer(buf) {
|
|
2536
2783
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
2537
2784
|
return buf.buffer;
|
|
@@ -6049,7 +6296,8 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6049
6296
|
try {
|
|
6050
6297
|
const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
|
|
6051
6298
|
const concurrency = options?.ocrConcurrency ?? 1;
|
|
6052
|
-
|
|
6299
|
+
const batchSize = options?.ocrBatchSize;
|
|
6300
|
+
ocrProvider = await resolveOcrProvider2(ocrMode, warnings, concurrency, batchSize);
|
|
6053
6301
|
} catch (resolveErr) {
|
|
6054
6302
|
if (ocrMode !== "auto") {
|
|
6055
6303
|
throw Object.assign(
|
|
@@ -6064,7 +6312,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6064
6312
|
try {
|
|
6065
6313
|
const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
|
|
6066
6314
|
const concurrency = options?.ocrConcurrency ?? 1;
|
|
6067
|
-
ocrBlocks = await ocrPages2(doc, ocrProvider, pageFilter, effectivePageCount, warnings, concurrency);
|
|
6315
|
+
ocrBlocks = await ocrPages2(doc, ocrProvider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
|
|
6068
6316
|
} catch {
|
|
6069
6317
|
} finally {
|
|
6070
6318
|
const terminable = ocrProvider;
|