@clazic/kordoc 2.2.9 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,169 @@
1
+ #!/usr/bin/env node
2
+ import "./chunk-ZWE3DS7E.js";
3
+
4
+ // src/ocr/batch-provider.ts
5
+ import { spawn } from "child_process";
6
+ import { writeFileSync, readFileSync, unlinkSync, mkdirSync } from "fs";
7
+ import { join } from "path";
8
+ import { tmpdir } from "os";
9
+ var BATCH_OCR_PROMPT = "\uB2E4\uC74C \uBB38\uC11C \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uB4E4\uC744 OCR\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uC138\uC694.\n\n\uADDC\uCE59:\n- \uAC01 \uD398\uC774\uC9C0 \uACB0\uACFC \uC0AC\uC774\uC5D0 \uBC18\uB4DC\uC2DC \uC774 \uAD6C\uBD84\uC790\uB97C \uC0BD\uC785: <!-- PAGE_BREAK -->\n- \uD14C\uC774\uBE14\uC740 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)\n- \uBCD1\uD569\uB41C \uC140\uC740 \uD574\uB2F9 \uC704\uCE58\uC5D0 \uB0B4\uC6A9 \uAE30\uC7AC\n- \uD5E4\uB529\uC740 \uAE00\uC790 \uD06C\uAE30\uC5D0 \uB530\uB77C ## ~ ###### \uC0AC\uC6A9\n- \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9\n- \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC\n- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0\n- ```\uB85C \uAC10\uC2F8\uC9C0 \uB9D0\uACE0 \uC21C\uC218 Markdown\uB9CC \uCD9C\uB825";
10
+ var DEFAULT_BATCH_SIZES = {
11
+ gemini: 5,
12
+ claude: 5,
13
+ codex: 10
14
+ };
15
+ var _batchTempDir = null;
16
+ function getBatchTempDir() {
17
+ if (!_batchTempDir) {
18
+ _batchTempDir = join(process.cwd(), "_kordoc_ocr_tmp");
19
+ mkdirSync(_batchTempDir, { recursive: true });
20
+ }
21
+ return _batchTempDir;
22
+ }
23
+ function createBatchCliProvider(mode, batchSize) {
24
+ return {
25
+ __batch: true,
26
+ batchSize,
27
+ async processBatch(pages) {
28
+ const results = /* @__PURE__ */ new Map();
29
+ const tempDir = getBatchTempDir();
30
+ const tempFiles = [];
31
+ try {
32
+ for (const { image, pageNum } of pages) {
33
+ const path = join(tempDir, `batch-p${pageNum}.png`);
34
+ writeFileSync(path, image);
35
+ tempFiles.push(path);
36
+ }
37
+ let output;
38
+ if (mode === "codex") {
39
+ output = await callBatchCodexCli(tempFiles);
40
+ } else {
41
+ output = await callBatchCli(mode, tempFiles);
42
+ }
43
+ const cleaned = stripCodeFence(output.trim());
44
+ const parts = cleaned.split(/<!--\s*PAGE_BREAK\s*-->/).map((p) => p.trim()).filter((p) => p.length > 0);
45
+ for (let i = 0; i < pages.length; i++) {
46
+ const pageNum = pages[i].pageNum;
47
+ if (i < parts.length) {
48
+ results.set(pageNum, { markdown: parts[i] });
49
+ }
50
+ }
51
+ } finally {
52
+ for (const f of tempFiles) {
53
+ try {
54
+ unlinkSync(f);
55
+ } catch {
56
+ }
57
+ }
58
+ }
59
+ return results;
60
+ }
61
+ };
62
+ }
63
+ function spawnAsync(cmd, args, opts) {
64
+ return new Promise((resolve, reject) => {
65
+ const child = spawn(cmd, args, {
66
+ cwd: opts.cwd,
67
+ env: process.env,
68
+ stdio: ["pipe", "pipe", "pipe"]
69
+ });
70
+ let stdout = "";
71
+ let stderr = "";
72
+ let killed = false;
73
+ child.stdout.setEncoding("utf-8");
74
+ child.stderr.setEncoding("utf-8");
75
+ child.stdout.on("data", (d) => {
76
+ stdout += d;
77
+ });
78
+ child.stderr.on("data", (d) => {
79
+ stderr += d;
80
+ });
81
+ const timer = setTimeout(() => {
82
+ killed = true;
83
+ child.kill("SIGTERM");
84
+ }, opts.timeoutMs);
85
+ if (opts.stdin !== void 0) {
86
+ child.stdin.end(opts.stdin);
87
+ } else {
88
+ child.stdin.end();
89
+ }
90
+ child.on("close", (code) => {
91
+ clearTimeout(timer);
92
+ if (killed) {
93
+ reject(new Error(`\uD0C0\uC784\uC544\uC6C3 (${Math.round(opts.timeoutMs / 1e3)}\uCD08)`));
94
+ } else {
95
+ resolve({ stdout, stderr, exitCode: code ?? 1 });
96
+ }
97
+ });
98
+ child.on("error", (err) => {
99
+ clearTimeout(timer);
100
+ reject(err);
101
+ });
102
+ });
103
+ }
104
+ async function callBatchCli(mode, imagePaths) {
105
+ const fileRefs = imagePaths.map((p) => `@${p}`).join("\n");
106
+ const prompt = `${BATCH_OCR_PROMPT}
107
+
108
+ ${fileRefs}`;
109
+ let args;
110
+ if (mode === "gemini") {
111
+ args = ["--prompt", prompt, "--yolo"];
112
+ const model = process.env.KORDOC_GEMINI_MODEL;
113
+ if (model) args.push("--model", model);
114
+ } else {
115
+ args = ["--print", prompt];
116
+ const model = process.env.KORDOC_CLAUDE_MODEL;
117
+ if (model) args.push("--model", model);
118
+ }
119
+ const timeoutMs = 6e4 + imagePaths.length * 2e4;
120
+ const result = await spawnAsync(mode, args, {
121
+ timeoutMs,
122
+ ...mode === "claude" ? { cwd: tmpdir() } : {}
123
+ });
124
+ if (result.exitCode !== 0) {
125
+ const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`;
126
+ throw new Error(`${mode} \uBC30\uCE58 OCR \uC2E4\uD328: ${errMsg}`);
127
+ }
128
+ return result.stdout || "";
129
+ }
130
+ async function callBatchCodexCli(imagePaths) {
131
+ const outPath = join(tmpdir(), `kordoc-codex-batch-${Date.now()}-${Math.random().toString(36).slice(2)}.txt`);
132
+ try {
133
+ const args = ["exec", BATCH_OCR_PROMPT];
134
+ for (const p of imagePaths) {
135
+ args.push("--image", p);
136
+ }
137
+ args.push("--output-last-message", outPath);
138
+ const model = process.env.KORDOC_CODEX_MODEL;
139
+ if (model) args.push("--model", model);
140
+ const timeoutMs = 6e4 + imagePaths.length * 2e4;
141
+ const result = await spawnAsync("codex", args, {
142
+ timeoutMs,
143
+ stdin: ""
144
+ });
145
+ if (result.exitCode !== 0) {
146
+ const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`;
147
+ throw new Error(`codex \uBC30\uCE58 OCR \uC2E4\uD328: ${errMsg}`);
148
+ }
149
+ try {
150
+ return readFileSync(outPath, "utf-8");
151
+ } catch {
152
+ return result.stdout || "";
153
+ }
154
+ } finally {
155
+ try {
156
+ unlinkSync(outPath);
157
+ } catch {
158
+ }
159
+ }
160
+ }
161
+ function stripCodeFence(text) {
162
+ const match = text.match(/^```(?:markdown|md)?\s*\n([\s\S]*?)\n```\s*$/m);
163
+ return match ? match[1].trim() : text;
164
+ }
165
+ export {
166
+ DEFAULT_BATCH_SIZES,
167
+ createBatchCliProvider
168
+ };
169
+ //# sourceMappingURL=batch-provider-PCT4I4LK.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/ocr/batch-provider.ts"],"sourcesContent":["/**\n * CLI 배치 OCR 프로바이더\n *\n * 여러 페이지 이미지를 단일 CLI 호출로 처리하여 API 호출 수를 대폭 감소.\n * gemini/claude: @file 멀티 참조, codex: --image 멀티 플래그\n *\n * 299페이지 기준:\n * - 기존: CLI 299회 호출 (~30분)\n * - 배치: CLI 3~6회 호출 (~3분)\n */\n\nimport { spawn } from \"child_process\"\nimport { writeFileSync, readFileSync, unlinkSync, mkdirSync } from \"fs\"\nimport { join } from \"path\"\nimport { tmpdir } from \"os\"\nimport type { StructuredOcrResult, BatchOcrProvider } from \"../types.js\"\n\n/** 배치 OCR 프롬프트 */\nconst BATCH_OCR_PROMPT =\n \"다음 문서 페이지 이미지들을 OCR하여 순수 Markdown으로 변환하세요.\\n\\n\" +\n \"규칙:\\n\" +\n \"- 각 페이지 결과 사이에 반드시 이 구분자를 삽입: <!-- PAGE_BREAK -->\\n\" +\n \"- 테이블은 Markdown 테이블 문법 사용 (| 구분, |---|---| 헤더 구분선 포함)\\n\" +\n \"- 병합된 셀은 해당 위치에 내용 기재\\n\" +\n \"- 헤딩은 글자 크기에 따라 ## ~ ###### 사용\\n\" +\n \"- 리스트는 - 또는 1. 사용\\n\" +\n \"- 이미지, 도형 등 비텍스트 요소는 무시\\n\" +\n \"- 원문의 읽기 순서와 구조를 유지\\n\" +\n \"- ```로 감싸지 말고 순수 Markdown만 출력\"\n\n/** 모드별 기본 배치 크기 (CLI 내부 타임아웃 + 실측 기반)\n *\n * gemini CLI: 10장 이상에서 AbortError 발생 (내부 타임아웃).\n * 5장 배치가 안정적으로 동작 확인 (35초/배치).\n * 299페이지 = 60배치 = 기존 299회 대비 80% 감소.\n */\nexport const DEFAULT_BATCH_SIZES: Record<string, number> = {\n gemini: 5,\n claude: 5,\n codex: 10,\n}\n\n/** 임시 디렉토리 — gemini CLI는 cwd 하위 + gitignore 밖만 @참조 가능 */\nlet _batchTempDir: string | null = null\nfunction getBatchTempDir(): string {\n if (!_batchTempDir) {\n _batchTempDir = join(process.cwd(), \"_kordoc_ocr_tmp\")\n mkdirSync(_batchTempDir, { recursive: true })\n }\n return _batchTempDir\n}\n\n/**\n * 배치 CLI 프로바이더 생성\n */\nexport function createBatchCliProvider(\n mode: \"gemini\" | \"claude\" | \"codex\",\n batchSize: number\n): BatchOcrProvider {\n return {\n __batch: true as const,\n batchSize,\n async processBatch(pages) {\n const results = new Map<number, StructuredOcrResult>()\n const tempDir = getBatchTempDir()\n const tempFiles: string[] = []\n\n try {\n // 1. Write all page images to temp files\n for (const { image, pageNum } of pages) {\n const path = join(tempDir, `batch-p${pageNum}.png`)\n writeFileSync(path, image)\n tempFiles.push(path)\n }\n\n // 2. Call CLI with all file references (비동기 — 병렬 배치 실행 가능)\n let output: string\n if (mode === \"codex\") {\n output = await callBatchCodexCli(tempFiles)\n } else {\n output = await callBatchCli(mode, tempFiles)\n }\n\n // 3. Parse response by PAGE_BREAK separator\n const cleaned = stripCodeFence(output.trim())\n const parts = cleaned.split(/<!--\\s*PAGE_BREAK\\s*-->/)\n .map(p => p.trim())\n .filter(p => p.length > 0)\n\n // 4. Map results to page numbers (best-effort if count mismatch)\n for (let i = 0; i < pages.length; i++) {\n const pageNum = pages[i].pageNum\n if (i < parts.length) {\n results.set(pageNum, { markdown: parts[i] })\n }\n // If fewer parts than pages, remaining pages get no result\n }\n } finally {\n // 5. Clean up temp files\n for (const f of tempFiles) {\n try { unlinkSync(f) } catch { /* ignore */ }\n }\n }\n\n return results\n },\n }\n}\n\n/**\n * 비동기 CLI 실행 헬퍼 — spawn + Promise 래핑.\n * spawnSync는 이벤트 루프를 차단하여 병렬 배치 실행 불가.\n */\nfunction spawnAsync(\n cmd: string,\n args: string[],\n opts: { timeoutMs: number; cwd?: string; stdin?: string }\n): Promise<{ stdout: string; stderr: string; exitCode: number }> {\n return new Promise((resolve, reject) => {\n const child = spawn(cmd, args, {\n cwd: opts.cwd,\n env: process.env,\n stdio: [\"pipe\", \"pipe\", \"pipe\"],\n })\n\n let stdout = \"\"\n let stderr = \"\"\n let killed = false\n\n child.stdout.setEncoding(\"utf-8\")\n child.stderr.setEncoding(\"utf-8\")\n child.stdout.on(\"data\", (d: string) => { stdout += d })\n child.stderr.on(\"data\", (d: string) => { stderr += d })\n\n const timer = setTimeout(() => {\n killed = true\n child.kill(\"SIGTERM\")\n }, opts.timeoutMs)\n\n if (opts.stdin !== undefined) {\n child.stdin.end(opts.stdin)\n } else {\n child.stdin.end()\n }\n\n child.on(\"close\", (code) => {\n clearTimeout(timer)\n if (killed) {\n reject(new Error(`타임아웃 (${Math.round(opts.timeoutMs / 1000)}초)`))\n } else {\n resolve({ stdout, stderr, exitCode: code ?? 1 })\n }\n })\n child.on(\"error\", (err) => {\n clearTimeout(timer)\n reject(err)\n })\n })\n}\n\n/** gemini/claude 배치 호출 (비동기) */\nasync function callBatchCli(mode: \"gemini\" | \"claude\", imagePaths: string[]): Promise<string> {\n const fileRefs = imagePaths.map(p => `@${p}`).join(\"\\n\")\n const prompt = `${BATCH_OCR_PROMPT}\\n\\n${fileRefs}`\n\n let args: string[]\n if (mode === \"gemini\") {\n args = [\"--prompt\", prompt, \"--yolo\"]\n const model = process.env.KORDOC_GEMINI_MODEL\n if (model) args.push(\"--model\", model)\n } else {\n args = [\"--print\", prompt]\n const model = process.env.KORDOC_CLAUDE_MODEL\n if (model) args.push(\"--model\", model)\n }\n\n const timeoutMs = 60_000 + imagePaths.length * 20_000\n const result = await spawnAsync(mode, args, {\n timeoutMs,\n ...(mode === \"claude\" ? { cwd: tmpdir() } : {}),\n })\n\n if (result.exitCode !== 0) {\n const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`\n throw new Error(`${mode} 배치 OCR 실패: ${errMsg}`)\n }\n\n return result.stdout || \"\"\n}\n\n/** codex 배치 호출 (비동기) — --image를 여러 번 지정 */\nasync function callBatchCodexCli(imagePaths: string[]): Promise<string> {\n const outPath = join(tmpdir(), `kordoc-codex-batch-${Date.now()}-${Math.random().toString(36).slice(2)}.txt`)\n try {\n const args = [\"exec\", BATCH_OCR_PROMPT]\n for (const p of imagePaths) {\n args.push(\"--image\", p)\n }\n args.push(\"--output-last-message\", outPath)\n const model = process.env.KORDOC_CODEX_MODEL\n if (model) args.push(\"--model\", model)\n\n const timeoutMs = 60_000 + imagePaths.length * 20_000\n const result = await spawnAsync(\"codex\", args, {\n timeoutMs,\n stdin: \"\",\n })\n\n if (result.exitCode !== 0) {\n const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`\n throw new Error(`codex 배치 OCR 실패: ${errMsg}`)\n }\n\n try {\n return readFileSync(outPath, \"utf-8\")\n } catch {\n return result.stdout || \"\"\n }\n } finally {\n try { unlinkSync(outPath) } catch { /* ignore */ }\n }\n}\n\n/** LLM 출력에서 코드 펜스 제거 (cli-provider.ts와 동일 로직) */\nfunction stripCodeFence(text: string): string {\n const match = text.match(/^```(?:markdown|md)?\\s*\\n([\\s\\S]*?)\\n```\\s*$/m)\n return match ? match[1].trim() : text\n}\n"],"mappings":";;;;AAWA,SAAS,aAAa;AACtB,SAAS,eAAe,cAAc,YAAY,iBAAiB;AACnE,SAAS,YAAY;AACrB,SAAS,cAAc;AAIvB,IAAM,mBACJ;AAiBK,IAAM,sBAA8C;AAAA,EACzD,QAAQ;AAAA,EACR,QAAQ;AAAA,EACR,OAAO;AACT;AAGA,IAAI,gBAA+B;AACnC,SAAS,kBAA0B;AACjC,MAAI,CAAC,eAAe;AAClB,oBAAgB,KAAK,QAAQ,IAAI,GAAG,iBAAiB;AACrD,cAAU,eAAe,EAAE,WAAW,KAAK,CAAC;AAAA,EAC9C;AACA,SAAO;AACT;AAKO,SAAS,uBACd,MACA,WACkB;AAClB,SAAO;AAAA,IACL,SAAS;AAAA,IACT;AAAA,IACA,MAAM,aAAa,OAAO;AACxB,YAAM,UAAU,oBAAI,IAAiC;AACrD,YAAM,UAAU,gBAAgB;AAChC,YAAM,YAAsB,CAAC;AAE7B,UAAI;AAEF,mBAAW,EAAE,OAAO,QAAQ,KAAK,OAAO;AACtC,gBAAM,OAAO,KAAK,SAAS,UAAU,OAAO,MAAM;AAClD,wBAAc,MAAM,KAAK;AACzB,oBAAU,KAAK,IAAI;AAAA,QACrB;AAGA,YAAI;AACJ,YAAI,SAAS,SAAS;AACpB,mBAAS,MAAM,kBAAkB,SAAS;AAAA,QAC5C,OAAO;AACL,mBAAS,MAAM,aAAa,MAAM,SAAS;AAAA,QAC7C;AAGA,cAAM,UAAU,eAAe,OAAO,KAAK,CAAC;AAC5C,cAAM,QAAQ,QAAQ,MAAM,yBAAyB,EAClD,IAAI,OAAK,EAAE,KAAK,CAAC,EACjB,OAAO,OAAK,EAAE,SAAS,CAAC;AAG3B,iBAAS,IAAI,GAAG,IAAI,MAAM,QAAQ,KAAK;AACrC,gBAAM,UAAU,MAAM,CAAC,EAAE;AACzB,cAAI,IAAI,MAAM,QAAQ;AACpB,oBAAQ,IAAI,SAAS,EAAE,UAAU,MAAM,CAAC,EAAE,CAAC;AAAA,UAC7C;AAAA,QAEF;AAAA,MACF,UAAE;AAEA,mBAAW,KAAK,WAAW;AACzB,cAAI;AAAE,uBAAW,CAAC;AAAA,UAAE,QAAQ;AAAA,UAAe;AAAA,QAC7C;AAAA,MACF;AAEA,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAMA,SAAS,WACP,KACA,MACA,MAC+D;AAC/D,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,QAAQ,MAAM,KAAK,MAAM;AAAA,MAC7B,KAAK,KAAK;AAAA,MACV,KAAK,QAAQ;AAAA,MACb,OAAO,CAAC,QAAQ,QAAQ,MAAM;AAAA,IAChC,CAAC;AAED,QAAI,SAAS;AACb,QAAI,SAAS;AACb,QAAI,SAAS;AAEb,UAAM,OAAO,YAAY,OAAO;AAChC,UAAM,OAAO,YAAY,OAAO;AAChC,UAAM,OAAO,GAAG,QAAQ,CAAC,MAAc;AAAE,gBAAU;AAAA,IAAE,CAAC;AACtD,UAAM,OAAO,GAAG,QAAQ,CAAC,MAAc;AAAE,gBAAU;AAAA,IAAE,CAAC;AAEtD,UAAM,QAAQ,WAAW,MAAM;AAC7B,eAAS;AACT,YAAM,KAAK,SAAS;AAAA,IACtB,GAAG,KAAK,SAAS;AAEjB,QAAI,KAAK,UAAU,QAAW;AAC5B,YAAM,MAAM,IAAI,KAAK,KAAK;AAAA,IAC5B,OAAO;AACL,YAAM,MAAM,IAAI;AAAA,IAClB;AAEA,UAAM,GAAG,SAAS,CAAC,SAAS;AAC1B,mBAAa,KAAK;AAClB,UAAI,QAAQ;AACV,eAAO,IAAI,MAAM,6BAAS,KAAK,MAAM,KAAK,YAAY,GAAI,CAAC,SAAI,CAAC;AAAA,MAClE,OAAO;AACL,gBAAQ,EAAE,QAAQ,QAAQ,UAAU,QAAQ,EAAE,CAAC;AAAA,MACjD;AAAA,IACF,CAAC;AACD,UAAM,GAAG,SAAS,CAAC,QAAQ;AACzB,mBAAa,KAAK;AAClB,aAAO,GAAG;AAAA,IACZ,CAAC;AAAA,EACH,CAAC;AACH;AAGA,eAAe,aAAa,MAA2B,YAAuC;AAC5F,QAAM,WAAW,WAAW,IAAI,OAAK,IAAI,CAAC,EAAE,EAAE,KAAK,IAAI;AACvD,QAAM,SAAS,GAAG,gBAAgB;AAAA;AAAA,EAAO,QAAQ;AAEjD,MAAI;AACJ,MAAI,SAAS,UAAU;AACrB,WAAO,CAAC,YAAY,QAAQ,QAAQ;AACpC,UAAM,QAAQ,QAAQ,IAAI;AAC1B,QAAI,MAAO,MAAK,KAAK,WAAW,KAAK;AAAA,EACvC,OAAO;AACL,WAAO,CAAC,WAAW,MAAM;AACzB,UAAM,QAAQ,QAAQ,IAAI;AAC1B,QAAI,MAAO,MAAK,KAAK,WAAW,KAAK;AAAA,EACvC;AAEA,QAAM,YAAY,MAAS,WAAW,SAAS;AAC/C,QAAM,SAAS,MAAM,WAAW,MAAM,MAAM;AAAA,IAC1C;AAAA,IACA,GAAI,SAAS,WAAW,EAAE,KAAK,OAAO,EAAE,IAAI,CAAC;AAAA,EAC/C,CAAC;AAED,MAAI,OAAO,aAAa,GAAG;AACzB,UAAM,SAAS,OAAO,QAAQ,KAAK,KAAK,aAAa,OAAO,QAAQ;AACpE,UAAM,IAAI,MAAM,GAAG,IAAI,mCAAe,MAAM,EAAE;AAAA,EAChD;AAEA,SAAO,OAAO,UAAU;AAC1B;AAGA,eAAe,kBAAkB,YAAuC;AACtE,QAAM,UAAU,KAAK,OAAO,GAAG,sBAAsB,KAAK,IAAI,CAAC,IAAI,KAAK,OAAO,EAAE,SAAS,EAAE,EAAE,MAAM,CAAC,CAAC,MAAM;AAC5G,MAAI;AACF,UAAM,OAAO,CAAC,QAAQ,gBAAgB;AACtC,eAAW,KAAK,YAAY;AAC1B,WAAK,KAAK,WAAW,CAAC;AAAA,IACxB;AACA,SAAK,KAAK,yBAAyB,OAAO;AAC1C,UAAM,QAAQ,QAAQ,IAAI;AAC1B,QAAI,MAAO,MAAK,KAAK,WAAW,KAAK;AAErC,UAAM,YAAY,MAAS,WAAW,SAAS;AAC/C,UAAM,SAAS,MAAM,WAAW,SAAS,MAAM;AAAA,MAC7C;AAAA,MACA,OAAO;AAAA,IACT,CAAC;AAED,QAAI,OAAO,aAAa,GAAG;AACzB,YAAM,SAAS,OAAO,QAAQ,KAAK,KAAK,aAAa,OAAO,QAAQ;AACpE,YAAM,IAAI,MAAM,wCAAoB,MAAM,EAAE;AAAA,IAC9C;AAEA,QAAI;AACF,aAAO,aAAa,SAAS,OAAO;AAAA,IACtC,QAAQ;AACN,aAAO,OAAO,UAAU;AAAA,IAC1B;AAAA,EACF,UAAE;AACA,QAAI;AAAE,iBAAW,OAAO;AAAA,IAAE,QAAQ;AAAA,IAAe;AAAA,EACnD;AACF;AAGA,SAAS,eAAe,MAAsB;AAC5C,QAAM,QAAQ,KAAK,MAAM,+CAA+C;AACxE,SAAO,QAAQ,MAAM,CAAC,EAAE,KAAK,IAAI;AACnC;","names":[]}
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env node
2
2
 
3
3
  // src/utils.ts
4
- var VERSION = true ? "2.2.9" : "0.0.0-dev";
4
+ var VERSION = true ? "2.3.1" : "0.0.0-dev";
5
5
  function toArrayBuffer(buf) {
6
6
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
7
7
  return buf.buffer;
@@ -90,4 +90,4 @@ export {
90
90
  sanitizeHref,
91
91
  classifyError
92
92
  };
93
- //# sourceMappingURL=chunk-FF5M4SDK.js.map
93
+ //# sourceMappingURL=chunk-W5KUC23B.js.map
@@ -6,7 +6,7 @@ import {
6
6
  precheckZipSize,
7
7
  sanitizeHref,
8
8
  toArrayBuffer
9
- } from "./chunk-FF5M4SDK.js";
9
+ } from "./chunk-W5KUC23B.js";
10
10
  import {
11
11
  parsePageRange
12
12
  } from "./chunk-MOL7MDBG.js";
@@ -5406,9 +5406,10 @@ async function parsePdfDocument(buffer, options) {
5406
5406
  const ocrMode = options?.ocrMode;
5407
5407
  if (!ocrProvider && ocrMode && ocrMode !== "off") {
5408
5408
  try {
5409
- const { resolveOcrProvider } = await import("./resolve-UFUJEPCJ.js");
5409
+ const { resolveOcrProvider } = await import("./resolve-4FSAQF2S.js");
5410
5410
  const concurrency = options?.ocrConcurrency ?? 1;
5411
- ocrProvider = await resolveOcrProvider(ocrMode, warnings, concurrency);
5411
+ const batchSize = options?.ocrBatchSize;
5412
+ ocrProvider = await resolveOcrProvider(ocrMode, warnings, concurrency, batchSize);
5412
5413
  } catch (resolveErr) {
5413
5414
  if (ocrMode !== "auto") {
5414
5415
  throw Object.assign(
@@ -5421,9 +5422,9 @@ async function parsePdfDocument(buffer, options) {
5421
5422
  if (ocrProvider) {
5422
5423
  let ocrBlocks = [];
5423
5424
  try {
5424
- const { ocrPages } = await import("./provider-I3XGSVL6.js");
5425
+ const { ocrPages } = await import("./provider-WYHC4NHI.js");
5425
5426
  const concurrency = options?.ocrConcurrency ?? 1;
5426
- ocrBlocks = await ocrPages(doc, ocrProvider, pageFilter, effectivePageCount, warnings, concurrency);
5427
+ ocrBlocks = await ocrPages(doc, ocrProvider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
5427
5428
  } catch {
5428
5429
  } finally {
5429
5430
  const terminable = ocrProvider;
@@ -9623,4 +9624,4 @@ export {
9623
9624
  cfb/cfb.js:
9624
9625
  (*! crc32.js (C) 2014-present SheetJS -- http://sheetjs.com *)
9625
9626
  */
9626
- //# sourceMappingURL=chunk-OL2NDK3E.js.map
9627
+ //# sourceMappingURL=chunk-ZOEUKD77.js.map