@clazic/kordoc 2.2.6 → 2.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -110,6 +110,13 @@ interface ParseOptions {
110
110
  * - undefined: 라이브러리 API 기존 동작 유지 (자동 탐색 안 함)
111
111
  */
112
112
  ocrMode?: OcrMode;
113
+ /**
114
+ * OCR 병렬 처리 수.
115
+ * - tesseract: 기본값은 CPU 코어 수 (병렬 처리로 속도 향상)
116
+ * - CLI 제공 프로바이더(gemini/claude/codex): 기본 1 (rate limit 보호)
117
+ * - 1: 순차 처리 (기존 동작)
118
+ */
119
+ ocrConcurrency?: number;
113
120
  /** 진행률 콜백 — current: 현재 페이지/섹션, total: 전체 수 */
114
121
  onProgress?: (current: number, total: number) => void;
115
122
  /** PDF 머리글/바닥글 자동 제거 */
package/dist/index.d.ts CHANGED
@@ -110,6 +110,13 @@ interface ParseOptions {
110
110
  * - undefined: 라이브러리 API 기존 동작 유지 (자동 탐색 안 함)
111
111
  */
112
112
  ocrMode?: OcrMode;
113
+ /**
114
+ * OCR 병렬 처리 수.
115
+ * - tesseract: 기본값은 CPU 코어 수 (병렬 처리로 속도 향상)
116
+ * - CLI 제공 프로바이더(gemini/claude/codex): 기본 1 (rate limit 보호)
117
+ * - 1: 순차 처리 (기존 동작)
118
+ */
119
+ ocrConcurrency?: number;
113
120
  /** 진행률 콜백 — current: 현재 페이지/섹션, total: 전체 수 */
114
121
  onProgress?: (current: number, total: number) => void;
115
122
  /** PDF 머리글/바닥글 자동 제거 */
package/dist/index.js CHANGED
@@ -2051,8 +2051,10 @@ function callCli(mode, imagePath) {
2051
2051
  const args = buildCliArgs(mode, imagePath);
2052
2052
  const result = spawnSync(mode, args, {
2053
2053
  encoding: "utf-8",
2054
- timeout: 6e4,
2055
- maxBuffer: 10 * 1024 * 1024
2054
+ timeout: 18e4,
2055
+ maxBuffer: 10 * 1024 * 1024,
2056
+ // claude: /tmp에서 실행하여 프로젝트 CLAUDE.md의 규칙 간섭 방지
2057
+ ...mode === "claude" ? { cwd: tmpdir() } : {}
2056
2058
  });
2057
2059
  if (result.error) {
2058
2060
  throw new Error(`${mode} CLI \uC2E4\uD589 \uC2E4\uD328: ${result.error.message}`);
@@ -2158,6 +2160,7 @@ var init_cli_provider = __esm({
2158
2160
  // src/ocr/tesseract-provider.ts
2159
2161
  var tesseract_provider_exports = {};
2160
2162
  __export(tesseract_provider_exports, {
2163
+ createTesseractPoolProvider: () => createTesseractPoolProvider,
2161
2164
  createTesseractProvider: () => createTesseractProvider
2162
2165
  });
2163
2166
  import { createWorker } from "tesseract.js";
@@ -2176,6 +2179,37 @@ async function createTesseractProvider() {
2176
2179
  };
2177
2180
  return provider;
2178
2181
  }
2182
+ async function createTesseractPoolProvider(concurrency) {
2183
+ const workers = await Promise.all(
2184
+ Array.from({ length: concurrency }, () => createWorker("kor+eng"))
2185
+ );
2186
+ const idle = [...workers];
2187
+ const waitQueue = [];
2188
+ function acquire() {
2189
+ if (idle.length > 0) return Promise.resolve(idle.pop());
2190
+ return new Promise((resolve) => waitQueue.push(resolve));
2191
+ }
2192
+ function release(w) {
2193
+ if (waitQueue.length > 0) {
2194
+ waitQueue.shift()(w);
2195
+ } else {
2196
+ idle.push(w);
2197
+ }
2198
+ }
2199
+ const provider = async (pageImage, _pageNumber, _mimeType) => {
2200
+ const w = await acquire();
2201
+ try {
2202
+ const { data } = await w.recognize(pageImage);
2203
+ return data.text;
2204
+ } finally {
2205
+ release(w);
2206
+ }
2207
+ };
2208
+ provider.terminate = async () => {
2209
+ await Promise.all(workers.map((w) => w.terminate()));
2210
+ };
2211
+ return provider;
2212
+ }
2179
2213
  var init_tesseract_provider = __esm({
2180
2214
  "src/ocr/tesseract-provider.ts"() {
2181
2215
  "use strict";
@@ -2187,14 +2221,17 @@ var resolve_exports = {};
2187
2221
  __export(resolve_exports, {
2188
2222
  resolveOcrProvider: () => resolveOcrProvider
2189
2223
  });
2190
- async function resolveOcrProvider(mode, warnings) {
2224
+ async function resolveOcrProvider(mode, warnings, concurrency) {
2191
2225
  if (mode === "off") {
2192
2226
  throw new Error("OCR\uC774 \uBE44\uD65C\uC131\uD654\uB418\uC5B4 \uC788\uC2B5\uB2C8\uB2E4 (--ocr off).");
2193
2227
  }
2194
2228
  if (mode !== "auto") {
2195
2229
  validateOcrMode(mode);
2196
2230
  if (mode === "tesseract") {
2197
- const { createTesseractProvider: createTesseractProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
2231
+ const { createTesseractProvider: createTesseractProvider2, createTesseractPoolProvider: createTesseractPoolProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
2232
+ if (concurrency && concurrency > 1) {
2233
+ return createTesseractPoolProvider2(concurrency);
2234
+ }
2198
2235
  return createTesseractProvider2();
2199
2236
  }
2200
2237
  return createCliOcrProvider(mode);
@@ -2214,7 +2251,10 @@ async function resolveOcrProvider(mode, warnings) {
2214
2251
  }
2215
2252
  }
2216
2253
  if (detected === "tesseract") {
2217
- const { createTesseractProvider: createTesseractProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
2254
+ const { createTesseractProvider: createTesseractProvider2, createTesseractPoolProvider: createTesseractPoolProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
2255
+ if (concurrency && concurrency > 1) {
2256
+ return createTesseractPoolProvider2(concurrency);
2257
+ }
2218
2258
  return createTesseractProvider2();
2219
2259
  }
2220
2260
  return createCliOcrProvider(detected);
@@ -2352,32 +2392,77 @@ var provider_exports = {};
2352
2392
  __export(provider_exports, {
2353
2393
  ocrPages: () => ocrPages
2354
2394
  });
2355
- async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings) {
2395
+ async function runWithConcurrency(tasks, limit) {
2396
+ const results = new Array(tasks.length);
2397
+ let nextIndex = 0;
2398
+ async function worker() {
2399
+ while (nextIndex < tasks.length) {
2400
+ const idx = nextIndex++;
2401
+ results[idx] = await tasks[idx]();
2402
+ }
2403
+ }
2404
+ await Promise.all(Array.from({ length: Math.min(limit, tasks.length) }, () => worker()));
2405
+ return results;
2406
+ }
2407
+ function ocrResultToBlocks(result, pageNum) {
2408
+ const pageBlocks = [];
2409
+ if (typeof result === "string") {
2410
+ if (result.trim()) {
2411
+ pageBlocks.push({ type: "paragraph", text: result.trim(), pageNumber: pageNum });
2412
+ }
2413
+ } else if (result && typeof result === "object" && "markdown" in result) {
2414
+ const structured = result;
2415
+ if (structured.markdown.trim()) {
2416
+ const converted = markdownToBlocks(structured.markdown, pageNum);
2417
+ for (const b of converted) pageBlocks.push(b);
2418
+ }
2419
+ }
2420
+ return pageBlocks;
2421
+ }
2422
+ async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1) {
2356
2423
  const blocks = [];
2424
+ if (concurrency <= 1) {
2425
+ for (let i = 1; i <= effectivePageCount; i++) {
2426
+ if (pageFilter && !pageFilter.has(i)) continue;
2427
+ const page = await doc.getPage(i);
2428
+ try {
2429
+ const imageData = await renderPageToPng(page);
2430
+ const result = await provider(imageData, i, "image/png");
2431
+ for (const b of ocrResultToBlocks(result, i)) blocks.push(b);
2432
+ } catch (err) {
2433
+ warnings?.push({
2434
+ page: i,
2435
+ message: `\uD398\uC774\uC9C0 ${i} OCR \uC2E4\uD328: ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
2436
+ code: "OCR_PAGE_FAILED"
2437
+ });
2438
+ }
2439
+ }
2440
+ return blocks;
2441
+ }
2442
+ const pageNumbers = [];
2357
2443
  for (let i = 1; i <= effectivePageCount; i++) {
2358
2444
  if (pageFilter && !pageFilter.has(i)) continue;
2359
- const page = await doc.getPage(i);
2445
+ pageNumbers.push(i);
2446
+ }
2447
+ const tasks = pageNumbers.map((pageNum) => async () => {
2360
2448
  try {
2449
+ const page = await doc.getPage(pageNum);
2361
2450
  const imageData = await renderPageToPng(page);
2362
- const result = await provider(imageData, i, "image/png");
2363
- if (typeof result === "string") {
2364
- if (result.trim()) {
2365
- blocks.push({ type: "paragraph", text: result.trim(), pageNumber: i });
2366
- }
2367
- } else if (result && typeof result === "object" && "markdown" in result) {
2368
- const structured = result;
2369
- if (structured.markdown.trim()) {
2370
- const pageBlocks = markdownToBlocks(structured.markdown, i);
2371
- for (const b of pageBlocks) blocks.push(b);
2372
- }
2373
- }
2451
+ const result = await provider(imageData, pageNum, "image/png");
2452
+ return { pageNum, pageBlocks: ocrResultToBlocks(result, pageNum) };
2374
2453
  } catch (err) {
2375
2454
  warnings?.push({
2376
- page: i,
2377
- message: `\uD398\uC774\uC9C0 ${i} OCR \uC2E4\uD328: ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
2455
+ page: pageNum,
2456
+ message: `\uD398\uC774\uC9C0 ${pageNum} OCR \uC2E4\uD328: ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
2378
2457
  code: "OCR_PAGE_FAILED"
2379
2458
  });
2459
+ return null;
2380
2460
  }
2461
+ });
2462
+ const taskResults = await runWithConcurrency(tasks, concurrency);
2463
+ for (const item of taskResults) {
2464
+ if (!item) continue;
2465
+ for (const b of item.pageBlocks) blocks.push(b);
2381
2466
  }
2382
2467
  return blocks;
2383
2468
  }
@@ -2446,7 +2531,7 @@ import JSZip2 from "jszip";
2446
2531
  import { DOMParser } from "@xmldom/xmldom";
2447
2532
 
2448
2533
  // src/utils.ts
2449
- var VERSION = true ? "2.2.6" : "0.0.0-dev";
2534
+ var VERSION = true ? "2.2.8" : "0.0.0-dev";
2450
2535
  function toArrayBuffer(buf) {
2451
2536
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
2452
2537
  return buf.buffer;
@@ -5963,7 +6048,8 @@ async function parsePdfDocument(buffer, options) {
5963
6048
  if (!ocrProvider && ocrMode && ocrMode !== "off") {
5964
6049
  try {
5965
6050
  const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
5966
- ocrProvider = await resolveOcrProvider2(ocrMode, warnings);
6051
+ const concurrency = options?.ocrConcurrency ?? 1;
6052
+ ocrProvider = await resolveOcrProvider2(ocrMode, warnings, concurrency);
5967
6053
  } catch (resolveErr) {
5968
6054
  if (ocrMode !== "auto") {
5969
6055
  throw Object.assign(
@@ -5976,7 +6062,8 @@ async function parsePdfDocument(buffer, options) {
5976
6062
  if (ocrProvider) {
5977
6063
  try {
5978
6064
  const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
5979
- const ocrBlocks = await ocrPages2(doc, ocrProvider, pageFilter, effectivePageCount, warnings);
6065
+ const concurrency = options?.ocrConcurrency ?? 1;
6066
+ const ocrBlocks = await ocrPages2(doc, ocrProvider, pageFilter, effectivePageCount, warnings, concurrency);
5980
6067
  if (ocrBlocks.length > 0) {
5981
6068
  const ocrMarkdown = blocksToMarkdown(ocrBlocks);
5982
6069
  return {