@clazic/kordoc 2.4.4 → 2.4.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -2696,29 +2696,44 @@ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warn
2696
2696
  let processed = 0;
2697
2697
  const batchTasks = pageBatches.map((batchPageNums, batchIdx) => async () => {
2698
2698
  const pageBlocks = [];
2699
+ const batchImages = [];
2699
2700
  try {
2700
- const batchImages = [];
2701
2701
  for (const pageNum of batchPageNums) {
2702
2702
  const page = await doc.getPage(pageNum);
2703
2703
  const image = await renderPageToPng(page);
2704
2704
  batchImages.push({ image, pageNum });
2705
2705
  }
2706
- const results = await provider.processBatch(batchImages);
2707
- for (const { pageNum } of batchImages) {
2708
- const result = results.get(pageNum);
2709
- pageBlocks.push({
2710
- pageNum,
2711
- blocks: result ? ocrResultToBlocks(result, pageNum) : []
2712
- });
2713
- }
2714
- } catch (err) {
2715
- const range = `${batchPageNums[0]}-${batchPageNums[batchPageNums.length - 1]}`;
2716
- warnings?.push({
2717
- message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
2718
- code: "OCR_PAGE_FAILED"
2719
- });
2706
+ } catch (renderErr) {
2707
+ const rendered = new Set(batchImages.map((b) => b.pageNum));
2720
2708
  for (const pageNum of batchPageNums) {
2721
- pageBlocks.push({ pageNum, blocks: [] });
2709
+ if (!rendered.has(pageNum)) pageBlocks.push({ pageNum, blocks: [] });
2710
+ }
2711
+ }
2712
+ if (batchImages.length > 0) {
2713
+ try {
2714
+ const results = await provider.processBatch(batchImages);
2715
+ for (const { pageNum } of batchImages) {
2716
+ const result = results.get(pageNum);
2717
+ pageBlocks.push({
2718
+ pageNum,
2719
+ blocks: result ? ocrResultToBlocks(result, pageNum) : []
2720
+ });
2721
+ }
2722
+ } catch (err) {
2723
+ const range = `${batchPageNums[0]}-${batchPageNums[batchPageNums.length - 1]}`;
2724
+ warnings?.push({
2725
+ message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"} \u2014 \uB2E8\uC77C \uD398\uC774\uC9C0\uB85C \uC7AC\uC2DC\uB3C4`,
2726
+ code: "OCR_PAGE_FAILED"
2727
+ });
2728
+ for (const { image, pageNum } of batchImages) {
2729
+ try {
2730
+ const singleResult = await provider.processBatch([{ image, pageNum }]);
2731
+ const r = singleResult.get(pageNum);
2732
+ pageBlocks.push({ pageNum, blocks: r ? ocrResultToBlocks(r, pageNum) : [] });
2733
+ } catch {
2734
+ pageBlocks.push({ pageNum, blocks: [] });
2735
+ }
2736
+ }
2722
2737
  }
2723
2738
  }
2724
2739
  processed += batchPageNums.length;
@@ -2805,7 +2820,7 @@ import JSZip2 from "jszip";
2805
2820
  import { DOMParser } from "@xmldom/xmldom";
2806
2821
 
2807
2822
  // src/utils.ts
2808
- var VERSION = true ? "2.4.4" : "0.0.0-dev";
2823
+ var VERSION = true ? "2.4.5" : "0.0.0-dev";
2809
2824
  function toArrayBuffer(buf) {
2810
2825
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
2811
2826
  return buf.buffer;
@@ -6365,9 +6380,9 @@ async function parsePdfDocument(buffer, options) {
6365
6380
  }
6366
6381
  const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
6367
6382
  const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
6368
- const tryProvider = async (provider) => {
6383
+ const tryProvider = async (provider, filter) => {
6369
6384
  try {
6370
- return await ocrPages2(doc, provider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
6385
+ return await ocrPages2(doc, provider, filter, effectivePageCount, warnings, concurrency, options?.onProgress);
6371
6386
  } catch {
6372
6387
  return [];
6373
6388
  } finally {
@@ -6380,25 +6395,42 @@ async function parsePdfDocument(buffer, options) {
6380
6395
  };
6381
6396
  let ocrBlocks = [];
6382
6397
  if (options?.ocr) {
6383
- ocrBlocks = await tryProvider(options.ocr);
6398
+ ocrBlocks = await tryProvider(options.ocr, pageFilter);
6384
6399
  } else if (ocrMode === "auto") {
6385
6400
  const { getAutoFallbackChain: getAutoFallbackChain2 } = await Promise.resolve().then(() => (init_auto_detect(), auto_detect_exports));
6401
+ const pendingPages = /* @__PURE__ */ new Set();
6402
+ for (let i = 1; i <= effectivePageCount; i++) {
6403
+ if (!pageFilter || pageFilter.has(i)) pendingPages.add(i);
6404
+ }
6405
+ const allOcrBlocks = [];
6386
6406
  for (const mode of getAutoFallbackChain2()) {
6407
+ if (pendingPages.size === 0) break;
6387
6408
  try {
6409
+ const modeFilter = pendingPages.size < effectivePageCount ? new Set(pendingPages) : pageFilter;
6388
6410
  const provider = await resolveOcrProvider2(mode, warnings, concurrency, batchSize);
6389
- const blocks2 = await tryProvider(provider);
6411
+ const blocks2 = await tryProvider(provider, modeFilter);
6390
6412
  if (blocks2.length > 0) {
6391
- ocrBlocks = blocks2;
6392
- break;
6413
+ for (const b of blocks2) {
6414
+ if (b.pageNumber !== void 0) pendingPages.delete(b.pageNumber);
6415
+ }
6416
+ for (const b of blocks2) allOcrBlocks.push(b);
6417
+ if (pendingPages.size > 0) {
6418
+ warnings.push({
6419
+ message: `OCR: '${mode}' \uC644\uB8CC (${pendingPages.size}\uD398\uC774\uC9C0 \uBBF8\uCC98\uB9AC \u2192 \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC7AC\uC2DC\uB3C4)`,
6420
+ code: "OCR_CLI_FALLBACK"
6421
+ });
6422
+ }
6423
+ } else {
6424
+ warnings.push({ message: `OCR: '${mode}' \uACB0\uACFC \uC5C6\uC74C, \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC2DC\uB3C4`, code: "OCR_CLI_FALLBACK" });
6393
6425
  }
6394
- warnings.push({ message: `OCR: '${mode}' \uACB0\uACFC \uC5C6\uC74C, \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC2DC\uB3C4`, code: "OCR_CLI_FALLBACK" });
6395
6426
  } catch {
6396
6427
  }
6397
6428
  }
6429
+ ocrBlocks = allOcrBlocks;
6398
6430
  } else {
6399
6431
  try {
6400
6432
  const provider = await resolveOcrProvider2(ocrMode, warnings, concurrency, batchSize);
6401
- ocrBlocks = await tryProvider(provider);
6433
+ ocrBlocks = await tryProvider(provider, pageFilter);
6402
6434
  } catch (resolveErr) {
6403
6435
  throw Object.assign(
6404
6436
  new KordocError(resolveErr instanceof Error ? resolveErr.message : "OCR \uD504\uB85C\uBC14\uC774\uB354 \uCD08\uAE30\uD654 \uC2E4\uD328"),