@clazic/kordoc 2.4.3 → 2.4.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1961,6 +1961,13 @@ var require_cfb = __commonJS({
1961
1961
  });
1962
1962
 
1963
1963
  // src/ocr/auto-detect.ts
1964
+ var auto_detect_exports = {};
1965
+ __export(auto_detect_exports, {
1966
+ detectAvailableOcr: () => detectAvailableOcr,
1967
+ getAutoFallbackChain: () => getAutoFallbackChain,
1968
+ getTesseractFallbackMessage: () => getTesseractFallbackMessage,
1969
+ validateOcrMode: () => validateOcrMode
1970
+ });
1964
1971
  import { execSync } from "child_process";
1965
1972
  function detectAvailableOcr() {
1966
1973
  for (const cli of CLI_PRIORITY) {
@@ -1977,6 +1984,14 @@ function isCliInstalled(name) {
1977
1984
  return false;
1978
1985
  }
1979
1986
  }
1987
+ function getAutoFallbackChain() {
1988
+ const chain = [];
1989
+ for (const cli of CLI_PRIORITY) {
1990
+ if (isCliInstalled(cli)) chain.push(cli);
1991
+ }
1992
+ chain.push("tesseract");
1993
+ return chain;
1994
+ }
1980
1995
  function validateOcrMode(mode) {
1981
1996
  if (mode === "auto" || mode === "off" || mode === "tesseract") return;
1982
1997
  if (!isCliInstalled(mode)) {
@@ -2681,29 +2696,44 @@ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warn
2681
2696
  let processed = 0;
2682
2697
  const batchTasks = pageBatches.map((batchPageNums, batchIdx) => async () => {
2683
2698
  const pageBlocks = [];
2699
+ const batchImages = [];
2684
2700
  try {
2685
- const batchImages = [];
2686
2701
  for (const pageNum of batchPageNums) {
2687
2702
  const page = await doc.getPage(pageNum);
2688
2703
  const image = await renderPageToPng(page);
2689
2704
  batchImages.push({ image, pageNum });
2690
2705
  }
2691
- const results = await provider.processBatch(batchImages);
2692
- for (const { pageNum } of batchImages) {
2693
- const result = results.get(pageNum);
2694
- pageBlocks.push({
2695
- pageNum,
2696
- blocks: result ? ocrResultToBlocks(result, pageNum) : []
2697
- });
2698
- }
2699
- } catch (err) {
2700
- const range = `${batchPageNums[0]}-${batchPageNums[batchPageNums.length - 1]}`;
2701
- warnings?.push({
2702
- message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
2703
- code: "OCR_PAGE_FAILED"
2704
- });
2706
+ } catch (renderErr) {
2707
+ const rendered = new Set(batchImages.map((b) => b.pageNum));
2705
2708
  for (const pageNum of batchPageNums) {
2706
- pageBlocks.push({ pageNum, blocks: [] });
2709
+ if (!rendered.has(pageNum)) pageBlocks.push({ pageNum, blocks: [] });
2710
+ }
2711
+ }
2712
+ if (batchImages.length > 0) {
2713
+ try {
2714
+ const results = await provider.processBatch(batchImages);
2715
+ for (const { pageNum } of batchImages) {
2716
+ const result = results.get(pageNum);
2717
+ pageBlocks.push({
2718
+ pageNum,
2719
+ blocks: result ? ocrResultToBlocks(result, pageNum) : []
2720
+ });
2721
+ }
2722
+ } catch (err) {
2723
+ const range = `${batchPageNums[0]}-${batchPageNums[batchPageNums.length - 1]}`;
2724
+ warnings?.push({
2725
+ message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"} \u2014 \uB2E8\uC77C \uD398\uC774\uC9C0\uB85C \uC7AC\uC2DC\uB3C4`,
2726
+ code: "OCR_PAGE_FAILED"
2727
+ });
2728
+ for (const { image, pageNum } of batchImages) {
2729
+ try {
2730
+ const singleResult = await provider.processBatch([{ image, pageNum }]);
2731
+ const r = singleResult.get(pageNum);
2732
+ pageBlocks.push({ pageNum, blocks: r ? ocrResultToBlocks(r, pageNum) : [] });
2733
+ } catch {
2734
+ pageBlocks.push({ pageNum, blocks: [] });
2735
+ }
2736
+ }
2707
2737
  }
2708
2738
  }
2709
2739
  processed += batchPageNums.length;
@@ -2790,7 +2820,7 @@ import JSZip2 from "jszip";
2790
2820
  import { DOMParser } from "@xmldom/xmldom";
2791
2821
 
2792
2822
  // src/utils.ts
2793
- var VERSION = true ? "2.4.3" : "0.0.0-dev";
2823
+ var VERSION = true ? "2.4.5" : "0.0.0-dev";
2794
2824
  function toArrayBuffer(buf) {
2795
2825
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
2796
2826
  return buf.buffer;
@@ -6342,53 +6372,86 @@ async function parsePdfDocument(buffer, options) {
6342
6372
  }
6343
6373
  const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
6344
6374
  if (isImageBased) {
6345
- let ocrProvider = options?.ocr ?? null;
6346
6375
  const ocrMode = options?.ocrMode ?? "auto";
6347
- if (!ocrProvider && ocrMode !== "off") {
6348
- try {
6349
- const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
6350
- const concurrency = options?.ocrConcurrency ?? 1;
6351
- const batchSize = options?.ocrBatchSize;
6352
- ocrProvider = await resolveOcrProvider2(ocrMode, warnings, concurrency, batchSize);
6353
- } catch (resolveErr) {
6354
- if (ocrMode !== "auto") {
6355
- throw Object.assign(
6356
- new KordocError(resolveErr instanceof Error ? resolveErr.message : "OCR \uD504\uB85C\uBC14\uC774\uB354 \uCD08\uAE30\uD654 \uC2E4\uD328"),
6357
- { isImageBased: true }
6358
- );
6359
- }
6360
- }
6376
+ const concurrency = options?.ocrConcurrency ?? 1;
6377
+ const batchSize = options?.ocrBatchSize;
6378
+ if (ocrMode === "off") {
6379
+ throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
6361
6380
  }
6362
- if (ocrProvider) {
6363
- let ocrBlocks = [];
6381
+ const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
6382
+ const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
6383
+ const tryProvider = async (provider, filter) => {
6364
6384
  try {
6365
- const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
6366
- const concurrency = options?.ocrConcurrency ?? 1;
6367
- ocrBlocks = await ocrPages2(doc, ocrProvider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
6385
+ return await ocrPages2(doc, provider, filter, effectivePageCount, warnings, concurrency, options?.onProgress);
6368
6386
  } catch {
6387
+ return [];
6369
6388
  } finally {
6370
- const terminable = ocrProvider;
6389
+ const terminable = provider;
6371
6390
  if (typeof terminable.terminate === "function") {
6372
6391
  await terminable.terminate().catch(() => {
6373
6392
  });
6374
6393
  }
6375
6394
  }
6376
- if (ocrBlocks.length > 0) {
6377
- const ocrMarkdown = blocksToMarkdown(ocrBlocks);
6378
- return {
6379
- markdown: ocrMarkdown,
6380
- blocks: ocrBlocks,
6381
- metadata,
6382
- warnings: warnings.length > 0 ? warnings : void 0,
6383
- isImageBased: true
6384
- };
6395
+ };
6396
+ let ocrBlocks = [];
6397
+ if (options?.ocr) {
6398
+ ocrBlocks = await tryProvider(options.ocr, pageFilter);
6399
+ } else if (ocrMode === "auto") {
6400
+ const { getAutoFallbackChain: getAutoFallbackChain2 } = await Promise.resolve().then(() => (init_auto_detect(), auto_detect_exports));
6401
+ const pendingPages = /* @__PURE__ */ new Set();
6402
+ for (let i = 1; i <= effectivePageCount; i++) {
6403
+ if (!pageFilter || pageFilter.has(i)) pendingPages.add(i);
6404
+ }
6405
+ const allOcrBlocks = [];
6406
+ for (const mode of getAutoFallbackChain2()) {
6407
+ if (pendingPages.size === 0) break;
6408
+ try {
6409
+ const modeFilter = pendingPages.size < effectivePageCount ? new Set(pendingPages) : pageFilter;
6410
+ const provider = await resolveOcrProvider2(mode, warnings, concurrency, batchSize);
6411
+ const blocks2 = await tryProvider(provider, modeFilter);
6412
+ if (blocks2.length > 0) {
6413
+ for (const b of blocks2) {
6414
+ if (b.pageNumber !== void 0) pendingPages.delete(b.pageNumber);
6415
+ }
6416
+ for (const b of blocks2) allOcrBlocks.push(b);
6417
+ if (pendingPages.size > 0) {
6418
+ warnings.push({
6419
+ message: `OCR: '${mode}' \uC644\uB8CC (${pendingPages.size}\uD398\uC774\uC9C0 \uBBF8\uCC98\uB9AC \u2192 \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC7AC\uC2DC\uB3C4)`,
6420
+ code: "OCR_CLI_FALLBACK"
6421
+ });
6422
+ }
6423
+ } else {
6424
+ warnings.push({ message: `OCR: '${mode}' \uACB0\uACFC \uC5C6\uC74C, \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC2DC\uB3C4`, code: "OCR_CLI_FALLBACK" });
6425
+ }
6426
+ } catch {
6427
+ }
6428
+ }
6429
+ ocrBlocks = allOcrBlocks;
6430
+ } else {
6431
+ try {
6432
+ const provider = await resolveOcrProvider2(ocrMode, warnings, concurrency, batchSize);
6433
+ ocrBlocks = await tryProvider(provider, pageFilter);
6434
+ } catch (resolveErr) {
6435
+ throw Object.assign(
6436
+ new KordocError(resolveErr instanceof Error ? resolveErr.message : "OCR \uD504\uB85C\uBC14\uC774\uB354 \uCD08\uAE30\uD654 \uC2E4\uD328"),
6437
+ { isImageBased: true }
6438
+ );
6385
6439
  }
6386
6440
  }
6387
- if (ocrMode === "off") {
6388
- throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
6441
+ if (ocrBlocks.length > 0) {
6442
+ const ocrMarkdown = blocksToMarkdown(ocrBlocks);
6443
+ return {
6444
+ markdown: ocrMarkdown,
6445
+ blocks: ocrBlocks,
6446
+ metadata,
6447
+ warnings: warnings.length > 0 ? warnings : void 0,
6448
+ isImageBased: true
6449
+ };
6389
6450
  }
6390
- const errMsg = ocrMode ? `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF \u2014 OCR \uC2E4\uD328 (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)` : `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`;
6391
- throw Object.assign(new KordocError(errMsg), { isImageBased: true });
6451
+ throw Object.assign(
6452
+ new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF \u2014 OCR \uC2E4\uD328 (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`),
6453
+ { isImageBased: true }
6454
+ );
6392
6455
  }
6393
6456
  if (options?.removeHeaderFooter !== false && parsedPageCount >= 3) {
6394
6457
  const removed = removeHeaderFooterBlocks(blocks, pageHeights, warnings);