@clazic/kordoc 2.2.7 → 2.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -110,6 +110,13 @@ interface ParseOptions {
110
110
  * - undefined: 라이브러리 API 기존 동작 유지 (자동 탐색 안 함)
111
111
  */
112
112
  ocrMode?: OcrMode;
113
+ /**
114
+ * OCR 병렬 처리 수.
115
+ * - tesseract: 기본값은 CPU 코어 수 (병렬 처리로 속도 향상)
116
+ * - CLI 제공 프로바이더(gemini/claude/codex): 기본 1 (rate limit 보호)
117
+ * - 1: 순차 처리 (기존 동작)
118
+ */
119
+ ocrConcurrency?: number;
113
120
  /** 진행률 콜백 — current: 현재 페이지/섹션, total: 전체 수 */
114
121
  onProgress?: (current: number, total: number) => void;
115
122
  /** PDF 머리글/바닥글 자동 제거 */
package/dist/index.d.ts CHANGED
@@ -110,6 +110,13 @@ interface ParseOptions {
110
110
  * - undefined: 라이브러리 API 기존 동작 유지 (자동 탐색 안 함)
111
111
  */
112
112
  ocrMode?: OcrMode;
113
+ /**
114
+ * OCR 병렬 처리 수.
115
+ * - tesseract: 기본값은 CPU 코어 수 (병렬 처리로 속도 향상)
116
+ * - CLI 제공 프로바이더(gemini/claude/codex): 기본 1 (rate limit 보호)
117
+ * - 1: 순차 처리 (기존 동작)
118
+ */
119
+ ocrConcurrency?: number;
113
120
  /** 진행률 콜백 — current: 현재 페이지/섹션, total: 전체 수 */
114
121
  onProgress?: (current: number, total: number) => void;
115
122
  /** PDF 머리글/바닥글 자동 제거 */
package/dist/index.js CHANGED
@@ -2160,6 +2160,7 @@ var init_cli_provider = __esm({
2160
2160
  // src/ocr/tesseract-provider.ts
2161
2161
  var tesseract_provider_exports = {};
2162
2162
  __export(tesseract_provider_exports, {
2163
+ createTesseractPoolProvider: () => createTesseractPoolProvider,
2163
2164
  createTesseractProvider: () => createTesseractProvider
2164
2165
  });
2165
2166
  import { createWorker } from "tesseract.js";
@@ -2178,6 +2179,37 @@ async function createTesseractProvider() {
2178
2179
  };
2179
2180
  return provider;
2180
2181
  }
2182
+ async function createTesseractPoolProvider(concurrency) {
2183
+ const workers = await Promise.all(
2184
+ Array.from({ length: concurrency }, () => createWorker("kor+eng"))
2185
+ );
2186
+ const idle = [...workers];
2187
+ const waitQueue = [];
2188
+ function acquire() {
2189
+ if (idle.length > 0) return Promise.resolve(idle.pop());
2190
+ return new Promise((resolve) => waitQueue.push(resolve));
2191
+ }
2192
+ function release(w) {
2193
+ if (waitQueue.length > 0) {
2194
+ waitQueue.shift()(w);
2195
+ } else {
2196
+ idle.push(w);
2197
+ }
2198
+ }
2199
+ const provider = async (pageImage, _pageNumber, _mimeType) => {
2200
+ const w = await acquire();
2201
+ try {
2202
+ const { data } = await w.recognize(pageImage);
2203
+ return data.text;
2204
+ } finally {
2205
+ release(w);
2206
+ }
2207
+ };
2208
+ provider.terminate = async () => {
2209
+ await Promise.all(workers.map((w) => w.terminate()));
2210
+ };
2211
+ return provider;
2212
+ }
2181
2213
  var init_tesseract_provider = __esm({
2182
2214
  "src/ocr/tesseract-provider.ts"() {
2183
2215
  "use strict";
@@ -2189,14 +2221,17 @@ var resolve_exports = {};
2189
2221
  __export(resolve_exports, {
2190
2222
  resolveOcrProvider: () => resolveOcrProvider
2191
2223
  });
2192
- async function resolveOcrProvider(mode, warnings) {
2224
+ async function resolveOcrProvider(mode, warnings, concurrency) {
2193
2225
  if (mode === "off") {
2194
2226
  throw new Error("OCR\uC774 \uBE44\uD65C\uC131\uD654\uB418\uC5B4 \uC788\uC2B5\uB2C8\uB2E4 (--ocr off).");
2195
2227
  }
2196
2228
  if (mode !== "auto") {
2197
2229
  validateOcrMode(mode);
2198
2230
  if (mode === "tesseract") {
2199
- const { createTesseractProvider: createTesseractProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
2231
+ const { createTesseractProvider: createTesseractProvider2, createTesseractPoolProvider: createTesseractPoolProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
2232
+ if (concurrency && concurrency > 1) {
2233
+ return createTesseractPoolProvider2(concurrency);
2234
+ }
2200
2235
  return createTesseractProvider2();
2201
2236
  }
2202
2237
  return createCliOcrProvider(mode);
@@ -2216,7 +2251,10 @@ async function resolveOcrProvider(mode, warnings) {
2216
2251
  }
2217
2252
  }
2218
2253
  if (detected === "tesseract") {
2219
- const { createTesseractProvider: createTesseractProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
2254
+ const { createTesseractProvider: createTesseractProvider2, createTesseractPoolProvider: createTesseractPoolProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
2255
+ if (concurrency && concurrency > 1) {
2256
+ return createTesseractPoolProvider2(concurrency);
2257
+ }
2220
2258
  return createTesseractProvider2();
2221
2259
  }
2222
2260
  return createCliOcrProvider(detected);
@@ -2354,32 +2392,77 @@ var provider_exports = {};
2354
2392
  __export(provider_exports, {
2355
2393
  ocrPages: () => ocrPages
2356
2394
  });
2357
- async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings) {
2395
+ async function runWithConcurrency(tasks, limit) {
2396
+ const results = new Array(tasks.length);
2397
+ let nextIndex = 0;
2398
+ async function worker() {
2399
+ while (nextIndex < tasks.length) {
2400
+ const idx = nextIndex++;
2401
+ results[idx] = await tasks[idx]();
2402
+ }
2403
+ }
2404
+ await Promise.all(Array.from({ length: Math.min(limit, tasks.length) }, () => worker()));
2405
+ return results;
2406
+ }
2407
+ function ocrResultToBlocks(result, pageNum) {
2408
+ const pageBlocks = [];
2409
+ if (typeof result === "string") {
2410
+ if (result.trim()) {
2411
+ pageBlocks.push({ type: "paragraph", text: result.trim(), pageNumber: pageNum });
2412
+ }
2413
+ } else if (result && typeof result === "object" && "markdown" in result) {
2414
+ const structured = result;
2415
+ if (structured.markdown.trim()) {
2416
+ const converted = markdownToBlocks(structured.markdown, pageNum);
2417
+ for (const b of converted) pageBlocks.push(b);
2418
+ }
2419
+ }
2420
+ return pageBlocks;
2421
+ }
2422
+ async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1) {
2358
2423
  const blocks = [];
2424
+ if (concurrency <= 1) {
2425
+ for (let i = 1; i <= effectivePageCount; i++) {
2426
+ if (pageFilter && !pageFilter.has(i)) continue;
2427
+ const page = await doc.getPage(i);
2428
+ try {
2429
+ const imageData = await renderPageToPng(page);
2430
+ const result = await provider(imageData, i, "image/png");
2431
+ for (const b of ocrResultToBlocks(result, i)) blocks.push(b);
2432
+ } catch (err) {
2433
+ warnings?.push({
2434
+ page: i,
2435
+ message: `\uD398\uC774\uC9C0 ${i} OCR \uC2E4\uD328: ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
2436
+ code: "OCR_PAGE_FAILED"
2437
+ });
2438
+ }
2439
+ }
2440
+ return blocks;
2441
+ }
2442
+ const pageNumbers = [];
2359
2443
  for (let i = 1; i <= effectivePageCount; i++) {
2360
2444
  if (pageFilter && !pageFilter.has(i)) continue;
2361
- const page = await doc.getPage(i);
2445
+ pageNumbers.push(i);
2446
+ }
2447
+ const tasks = pageNumbers.map((pageNum) => async () => {
2362
2448
  try {
2449
+ const page = await doc.getPage(pageNum);
2363
2450
  const imageData = await renderPageToPng(page);
2364
- const result = await provider(imageData, i, "image/png");
2365
- if (typeof result === "string") {
2366
- if (result.trim()) {
2367
- blocks.push({ type: "paragraph", text: result.trim(), pageNumber: i });
2368
- }
2369
- } else if (result && typeof result === "object" && "markdown" in result) {
2370
- const structured = result;
2371
- if (structured.markdown.trim()) {
2372
- const pageBlocks = markdownToBlocks(structured.markdown, i);
2373
- for (const b of pageBlocks) blocks.push(b);
2374
- }
2375
- }
2451
+ const result = await provider(imageData, pageNum, "image/png");
2452
+ return { pageNum, pageBlocks: ocrResultToBlocks(result, pageNum) };
2376
2453
  } catch (err) {
2377
2454
  warnings?.push({
2378
- page: i,
2379
- message: `\uD398\uC774\uC9C0 ${i} OCR \uC2E4\uD328: ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
2455
+ page: pageNum,
2456
+ message: `\uD398\uC774\uC9C0 ${pageNum} OCR \uC2E4\uD328: ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
2380
2457
  code: "OCR_PAGE_FAILED"
2381
2458
  });
2459
+ return null;
2382
2460
  }
2461
+ });
2462
+ const taskResults = await runWithConcurrency(tasks, concurrency);
2463
+ for (const item of taskResults) {
2464
+ if (!item) continue;
2465
+ for (const b of item.pageBlocks) blocks.push(b);
2383
2466
  }
2384
2467
  return blocks;
2385
2468
  }
@@ -2448,7 +2531,7 @@ import JSZip2 from "jszip";
2448
2531
  import { DOMParser } from "@xmldom/xmldom";
2449
2532
 
2450
2533
  // src/utils.ts
2451
- var VERSION = true ? "2.2.7" : "0.0.0-dev";
2534
+ var VERSION = true ? "2.2.9" : "0.0.0-dev";
2452
2535
  function toArrayBuffer(buf) {
2453
2536
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
2454
2537
  return buf.buffer;
@@ -5965,7 +6048,8 @@ async function parsePdfDocument(buffer, options) {
5965
6048
  if (!ocrProvider && ocrMode && ocrMode !== "off") {
5966
6049
  try {
5967
6050
  const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
5968
- ocrProvider = await resolveOcrProvider2(ocrMode, warnings);
6051
+ const concurrency = options?.ocrConcurrency ?? 1;
6052
+ ocrProvider = await resolveOcrProvider2(ocrMode, warnings, concurrency);
5969
6053
  } catch (resolveErr) {
5970
6054
  if (ocrMode !== "auto") {
5971
6055
  throw Object.assign(
@@ -5976,20 +6060,28 @@ async function parsePdfDocument(buffer, options) {
5976
6060
  }
5977
6061
  }
5978
6062
  if (ocrProvider) {
6063
+ let ocrBlocks = [];
5979
6064
  try {
5980
6065
  const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
5981
- const ocrBlocks = await ocrPages2(doc, ocrProvider, pageFilter, effectivePageCount, warnings);
5982
- if (ocrBlocks.length > 0) {
5983
- const ocrMarkdown = blocksToMarkdown(ocrBlocks);
5984
- return {
5985
- markdown: ocrMarkdown,
5986
- blocks: ocrBlocks,
5987
- metadata,
5988
- warnings: warnings.length > 0 ? warnings : void 0,
5989
- isImageBased: true
5990
- };
5991
- }
6066
+ const concurrency = options?.ocrConcurrency ?? 1;
6067
+ ocrBlocks = await ocrPages2(doc, ocrProvider, pageFilter, effectivePageCount, warnings, concurrency);
5992
6068
  } catch {
6069
+ } finally {
6070
+ const terminable = ocrProvider;
6071
+ if (typeof terminable.terminate === "function") {
6072
+ await terminable.terminate().catch(() => {
6073
+ });
6074
+ }
6075
+ }
6076
+ if (ocrBlocks.length > 0) {
6077
+ const ocrMarkdown = blocksToMarkdown(ocrBlocks);
6078
+ return {
6079
+ markdown: ocrMarkdown,
6080
+ blocks: ocrBlocks,
6081
+ metadata,
6082
+ warnings: warnings.length > 0 ? warnings : void 0,
6083
+ isImageBased: true
6084
+ };
5993
6085
  }
5994
6086
  }
5995
6087
  if (ocrMode === "off") {