@clazic/kordoc 2.4.2 → 2.4.4

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env node
2
+ import {
3
+ detectAvailableOcr,
4
+ getAutoFallbackChain,
5
+ getTesseractFallbackMessage,
6
+ validateOcrMode
7
+ } from "./chunk-7NOZFYH6.js";
8
+ import "./chunk-ZWE3DS7E.js";
9
+ export {
10
+ detectAvailableOcr,
11
+ getAutoFallbackChain,
12
+ getTesseractFallbackMessage,
13
+ validateOcrMode
14
+ };
15
+ //# sourceMappingURL=auto-detect-2YGFYQCN.js.map
@@ -1,4 +1,7 @@
1
1
  #!/usr/bin/env node
2
+ import {
3
+ markdownToBlocks
4
+ } from "./chunk-4PP34NVQ.js";
2
5
  import {
3
6
  KordocError,
4
7
  classifyError,
@@ -6,7 +9,7 @@ import {
6
9
  precheckZipSize,
7
10
  sanitizeHref,
8
11
  toArrayBuffer
9
- } from "./chunk-CMZPKEJ7.js";
12
+ } from "./chunk-KEDUF24M.js";
10
13
  import {
11
14
  parsePageRange
12
15
  } from "./chunk-MOL7MDBG.js";
@@ -16,9 +19,6 @@ import {
16
19
  import {
17
20
  createCliOcrProvider
18
21
  } from "./chunk-JOGAFNIL.js";
19
- import {
20
- markdownToBlocks
21
- } from "./chunk-4PP34NVQ.js";
22
22
  import {
23
23
  __commonJS,
24
24
  __require,
@@ -5432,66 +5432,93 @@ async function parsePdfDocument(buffer, options) {
5432
5432
  warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
5433
5433
  }
5434
5434
  };
5435
- const sampleCount = Math.min(5, targetPageNums.length);
5436
- for (let si = 0; si < sampleCount; si++) {
5435
+ const SAMPLE_SIZE = Math.min(10, targetPageNums.length);
5436
+ const sampledIndices = /* @__PURE__ */ new Set();
5437
+ if (targetPageNums.length <= SAMPLE_SIZE) {
5438
+ for (let i = 0; i < targetPageNums.length; i++) sampledIndices.add(i);
5439
+ } else {
5440
+ for (let i = 0; i < SAMPLE_SIZE; i++) {
5441
+ const idx = Math.round(i * (targetPageNums.length - 1) / (SAMPLE_SIZE - 1));
5442
+ sampledIndices.add(idx);
5443
+ }
5444
+ }
5445
+ for (const si of sampledIndices) {
5437
5446
  await parseSinglePage(targetPageNums[si]);
5438
5447
  }
5439
- const sampleParsed = parsedPages || sampleCount;
5448
+ const sampleParsed = parsedPages || sampledIndices.size;
5440
5449
  const isImageBased = totalChars / Math.max(sampleParsed, 1) < 10;
5441
5450
  if (!isImageBased) {
5442
- for (let si = sampleCount; si < targetPageNums.length; si++) {
5443
- await parseSinglePage(targetPageNums[si]);
5451
+ for (let si = 0; si < targetPageNums.length; si++) {
5452
+ if (!sampledIndices.has(si)) {
5453
+ await parseSinglePage(targetPageNums[si]);
5454
+ }
5444
5455
  }
5445
5456
  }
5446
5457
  const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
5447
5458
  if (isImageBased) {
5448
- let ocrProvider = options?.ocr ?? null;
5449
5459
  const ocrMode = options?.ocrMode ?? "auto";
5450
- if (!ocrProvider && ocrMode !== "off") {
5451
- try {
5452
- const { resolveOcrProvider } = await import("./resolve-QA3VACUP.js");
5453
- const concurrency = options?.ocrConcurrency ?? 1;
5454
- const batchSize = options?.ocrBatchSize;
5455
- ocrProvider = await resolveOcrProvider(ocrMode, warnings, concurrency, batchSize);
5456
- } catch (resolveErr) {
5457
- if (ocrMode !== "auto") {
5458
- throw Object.assign(
5459
- new KordocError(resolveErr instanceof Error ? resolveErr.message : "OCR \uD504\uB85C\uBC14\uC774\uB354 \uCD08\uAE30\uD654 \uC2E4\uD328"),
5460
- { isImageBased: true }
5461
- );
5462
- }
5463
- }
5460
+ const concurrency = options?.ocrConcurrency ?? 1;
5461
+ const batchSize = options?.ocrBatchSize;
5462
+ if (ocrMode === "off") {
5463
+ throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
5464
5464
  }
5465
- if (ocrProvider) {
5466
- let ocrBlocks = [];
5465
+ const { resolveOcrProvider } = await import("./resolve-TZVGVOVD.js");
5466
+ const { ocrPages } = await import("./provider-HE727F7Z.js");
5467
+ const tryProvider = async (provider) => {
5467
5468
  try {
5468
- const { ocrPages } = await import("./provider-HE727F7Z.js");
5469
- const concurrency = options?.ocrConcurrency ?? 1;
5470
- ocrBlocks = await ocrPages(doc, ocrProvider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
5469
+ return await ocrPages(doc, provider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
5471
5470
  } catch {
5471
+ return [];
5472
5472
  } finally {
5473
- const terminable = ocrProvider;
5473
+ const terminable = provider;
5474
5474
  if (typeof terminable.terminate === "function") {
5475
5475
  await terminable.terminate().catch(() => {
5476
5476
  });
5477
5477
  }
5478
5478
  }
5479
- if (ocrBlocks.length > 0) {
5480
- const ocrMarkdown = blocksToMarkdown(ocrBlocks);
5481
- return {
5482
- markdown: ocrMarkdown,
5483
- blocks: ocrBlocks,
5484
- metadata,
5485
- warnings: warnings.length > 0 ? warnings : void 0,
5486
- isImageBased: true
5487
- };
5479
+ };
5480
+ let ocrBlocks = [];
5481
+ if (options?.ocr) {
5482
+ ocrBlocks = await tryProvider(options.ocr);
5483
+ } else if (ocrMode === "auto") {
5484
+ const { getAutoFallbackChain } = await import("./auto-detect-2YGFYQCN.js");
5485
+ for (const mode of getAutoFallbackChain()) {
5486
+ try {
5487
+ const provider = await resolveOcrProvider(mode, warnings, concurrency, batchSize);
5488
+ const blocks2 = await tryProvider(provider);
5489
+ if (blocks2.length > 0) {
5490
+ ocrBlocks = blocks2;
5491
+ break;
5492
+ }
5493
+ warnings.push({ message: `OCR: '${mode}' \uACB0\uACFC \uC5C6\uC74C, \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC2DC\uB3C4`, code: "OCR_CLI_FALLBACK" });
5494
+ } catch {
5495
+ }
5496
+ }
5497
+ } else {
5498
+ try {
5499
+ const provider = await resolveOcrProvider(ocrMode, warnings, concurrency, batchSize);
5500
+ ocrBlocks = await tryProvider(provider);
5501
+ } catch (resolveErr) {
5502
+ throw Object.assign(
5503
+ new KordocError(resolveErr instanceof Error ? resolveErr.message : "OCR \uD504\uB85C\uBC14\uC774\uB354 \uCD08\uAE30\uD654 \uC2E4\uD328"),
5504
+ { isImageBased: true }
5505
+ );
5488
5506
  }
5489
5507
  }
5490
- if (ocrMode === "off") {
5491
- throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
5508
+ if (ocrBlocks.length > 0) {
5509
+ const ocrMarkdown = blocksToMarkdown(ocrBlocks);
5510
+ return {
5511
+ markdown: ocrMarkdown,
5512
+ blocks: ocrBlocks,
5513
+ metadata,
5514
+ warnings: warnings.length > 0 ? warnings : void 0,
5515
+ isImageBased: true
5516
+ };
5492
5517
  }
5493
- const errMsg = ocrMode ? `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF \u2014 OCR \uC2E4\uD328 (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)` : `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`;
5494
- throw Object.assign(new KordocError(errMsg), { isImageBased: true });
5518
+ throw Object.assign(
5519
+ new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF \u2014 OCR \uC2E4\uD328 (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`),
5520
+ { isImageBased: true }
5521
+ );
5495
5522
  }
5496
5523
  if (options?.removeHeaderFooter !== false && parsedPageCount >= 3) {
5497
5524
  const removed = removeHeaderFooterBlocks(blocks, pageHeights, warnings);
@@ -9768,4 +9795,4 @@ export {
9768
9795
  cfb/cfb.js:
9769
9796
  (*! crc32.js (C) 2014-present SheetJS -- http://sheetjs.com *)
9770
9797
  */
9771
- //# sourceMappingURL=chunk-I3HO5HLQ.js.map
9798
+ //# sourceMappingURL=chunk-5AXJRBBK.js.map