@clazic/kordoc 2.4.3 → 2.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env node
2
+ import {
3
+ detectAvailableOcr,
4
+ getAutoFallbackChain,
5
+ getTesseractFallbackMessage,
6
+ validateOcrMode
7
+ } from "./chunk-7NOZFYH6.js";
8
+ import "./chunk-ZWE3DS7E.js";
9
+ export {
10
+ detectAvailableOcr,
11
+ getAutoFallbackChain,
12
+ getTesseractFallbackMessage,
13
+ validateOcrMode
14
+ };
15
+ //# sourceMappingURL=auto-detect-2YGFYQCN.js.map
@@ -1,4 +1,7 @@
1
1
  #!/usr/bin/env node
2
+ import {
3
+ markdownToBlocks
4
+ } from "./chunk-4PP34NVQ.js";
2
5
  import {
3
6
  KordocError,
4
7
  classifyError,
@@ -6,7 +9,7 @@ import {
6
9
  precheckZipSize,
7
10
  sanitizeHref,
8
11
  toArrayBuffer
9
- } from "./chunk-HOUVJPR7.js";
12
+ } from "./chunk-KEDUF24M.js";
10
13
  import {
11
14
  parsePageRange
12
15
  } from "./chunk-MOL7MDBG.js";
@@ -16,9 +19,6 @@ import {
16
19
  import {
17
20
  createCliOcrProvider
18
21
  } from "./chunk-JOGAFNIL.js";
19
- import {
20
- markdownToBlocks
21
- } from "./chunk-4PP34NVQ.js";
22
22
  import {
23
23
  __commonJS,
24
24
  __require,
@@ -5456,53 +5456,69 @@ async function parsePdfDocument(buffer, options) {
5456
5456
  }
5457
5457
  const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
5458
5458
  if (isImageBased) {
5459
- let ocrProvider = options?.ocr ?? null;
5460
5459
  const ocrMode = options?.ocrMode ?? "auto";
5461
- if (!ocrProvider && ocrMode !== "off") {
5462
- try {
5463
- const { resolveOcrProvider } = await import("./resolve-UOAOPQ4H.js");
5464
- const concurrency = options?.ocrConcurrency ?? 1;
5465
- const batchSize = options?.ocrBatchSize;
5466
- ocrProvider = await resolveOcrProvider(ocrMode, warnings, concurrency, batchSize);
5467
- } catch (resolveErr) {
5468
- if (ocrMode !== "auto") {
5469
- throw Object.assign(
5470
- new KordocError(resolveErr instanceof Error ? resolveErr.message : "OCR \uD504\uB85C\uBC14\uC774\uB354 \uCD08\uAE30\uD654 \uC2E4\uD328"),
5471
- { isImageBased: true }
5472
- );
5473
- }
5474
- }
5460
+ const concurrency = options?.ocrConcurrency ?? 1;
5461
+ const batchSize = options?.ocrBatchSize;
5462
+ if (ocrMode === "off") {
5463
+ throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
5475
5464
  }
5476
- if (ocrProvider) {
5477
- let ocrBlocks = [];
5465
+ const { resolveOcrProvider } = await import("./resolve-TZVGVOVD.js");
5466
+ const { ocrPages } = await import("./provider-HE727F7Z.js");
5467
+ const tryProvider = async (provider) => {
5478
5468
  try {
5479
- const { ocrPages } = await import("./provider-HE727F7Z.js");
5480
- const concurrency = options?.ocrConcurrency ?? 1;
5481
- ocrBlocks = await ocrPages(doc, ocrProvider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
5469
+ return await ocrPages(doc, provider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
5482
5470
  } catch {
5471
+ return [];
5483
5472
  } finally {
5484
- const terminable = ocrProvider;
5473
+ const terminable = provider;
5485
5474
  if (typeof terminable.terminate === "function") {
5486
5475
  await terminable.terminate().catch(() => {
5487
5476
  });
5488
5477
  }
5489
5478
  }
5490
- if (ocrBlocks.length > 0) {
5491
- const ocrMarkdown = blocksToMarkdown(ocrBlocks);
5492
- return {
5493
- markdown: ocrMarkdown,
5494
- blocks: ocrBlocks,
5495
- metadata,
5496
- warnings: warnings.length > 0 ? warnings : void 0,
5497
- isImageBased: true
5498
- };
5479
+ };
5480
+ let ocrBlocks = [];
5481
+ if (options?.ocr) {
5482
+ ocrBlocks = await tryProvider(options.ocr);
5483
+ } else if (ocrMode === "auto") {
5484
+ const { getAutoFallbackChain } = await import("./auto-detect-2YGFYQCN.js");
5485
+ for (const mode of getAutoFallbackChain()) {
5486
+ try {
5487
+ const provider = await resolveOcrProvider(mode, warnings, concurrency, batchSize);
5488
+ const blocks2 = await tryProvider(provider);
5489
+ if (blocks2.length > 0) {
5490
+ ocrBlocks = blocks2;
5491
+ break;
5492
+ }
5493
+ warnings.push({ message: `OCR: '${mode}' \uACB0\uACFC \uC5C6\uC74C, \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC2DC\uB3C4`, code: "OCR_CLI_FALLBACK" });
5494
+ } catch {
5495
+ }
5496
+ }
5497
+ } else {
5498
+ try {
5499
+ const provider = await resolveOcrProvider(ocrMode, warnings, concurrency, batchSize);
5500
+ ocrBlocks = await tryProvider(provider);
5501
+ } catch (resolveErr) {
5502
+ throw Object.assign(
5503
+ new KordocError(resolveErr instanceof Error ? resolveErr.message : "OCR \uD504\uB85C\uBC14\uC774\uB354 \uCD08\uAE30\uD654 \uC2E4\uD328"),
5504
+ { isImageBased: true }
5505
+ );
5499
5506
  }
5500
5507
  }
5501
- if (ocrMode === "off") {
5502
- throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
5508
+ if (ocrBlocks.length > 0) {
5509
+ const ocrMarkdown = blocksToMarkdown(ocrBlocks);
5510
+ return {
5511
+ markdown: ocrMarkdown,
5512
+ blocks: ocrBlocks,
5513
+ metadata,
5514
+ warnings: warnings.length > 0 ? warnings : void 0,
5515
+ isImageBased: true
5516
+ };
5503
5517
  }
5504
- const errMsg = ocrMode ? `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF \u2014 OCR \uC2E4\uD328 (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)` : `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`;
5505
- throw Object.assign(new KordocError(errMsg), { isImageBased: true });
5518
+ throw Object.assign(
5519
+ new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF \u2014 OCR \uC2E4\uD328 (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`),
5520
+ { isImageBased: true }
5521
+ );
5506
5522
  }
5507
5523
  if (options?.removeHeaderFooter !== false && parsedPageCount >= 3) {
5508
5524
  const removed = removeHeaderFooterBlocks(blocks, pageHeights, warnings);
@@ -9779,4 +9795,4 @@ export {
9779
9795
  cfb/cfb.js:
9780
9796
  (*! crc32.js (C) 2014-present SheetJS -- http://sheetjs.com *)
9781
9797
  */
9782
- //# sourceMappingURL=chunk-IAU7NTTA.js.map
9798
+ //# sourceMappingURL=chunk-5AXJRBBK.js.map