@clazic/kordoc 2.4.3 → 2.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1961,6 +1961,13 @@ var require_cfb = __commonJS({
1961
1961
  });
1962
1962
 
1963
1963
  // src/ocr/auto-detect.ts
1964
+ var auto_detect_exports = {};
1965
+ __export(auto_detect_exports, {
1966
+ detectAvailableOcr: () => detectAvailableOcr,
1967
+ getAutoFallbackChain: () => getAutoFallbackChain,
1968
+ getTesseractFallbackMessage: () => getTesseractFallbackMessage,
1969
+ validateOcrMode: () => validateOcrMode
1970
+ });
1964
1971
  import { execSync } from "child_process";
1965
1972
  function detectAvailableOcr() {
1966
1973
  for (const cli of CLI_PRIORITY) {
@@ -1977,6 +1984,14 @@ function isCliInstalled(name) {
1977
1984
  return false;
1978
1985
  }
1979
1986
  }
1987
+ function getAutoFallbackChain() {
1988
+ const chain = [];
1989
+ for (const cli of CLI_PRIORITY) {
1990
+ if (isCliInstalled(cli)) chain.push(cli);
1991
+ }
1992
+ chain.push("tesseract");
1993
+ return chain;
1994
+ }
1980
1995
  function validateOcrMode(mode) {
1981
1996
  if (mode === "auto" || mode === "off" || mode === "tesseract") return;
1982
1997
  if (!isCliInstalled(mode)) {
@@ -2790,7 +2805,7 @@ import JSZip2 from "jszip";
2790
2805
  import { DOMParser } from "@xmldom/xmldom";
2791
2806
 
2792
2807
  // src/utils.ts
2793
- var VERSION = true ? "2.4.3" : "0.0.0-dev";
2808
+ var VERSION = true ? "2.4.4" : "0.0.0-dev";
2794
2809
  function toArrayBuffer(buf) {
2795
2810
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
2796
2811
  return buf.buffer;
@@ -6342,53 +6357,69 @@ async function parsePdfDocument(buffer, options) {
6342
6357
  }
6343
6358
  const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
6344
6359
  if (isImageBased) {
6345
- let ocrProvider = options?.ocr ?? null;
6346
6360
  const ocrMode = options?.ocrMode ?? "auto";
6347
- if (!ocrProvider && ocrMode !== "off") {
6348
- try {
6349
- const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
6350
- const concurrency = options?.ocrConcurrency ?? 1;
6351
- const batchSize = options?.ocrBatchSize;
6352
- ocrProvider = await resolveOcrProvider2(ocrMode, warnings, concurrency, batchSize);
6353
- } catch (resolveErr) {
6354
- if (ocrMode !== "auto") {
6355
- throw Object.assign(
6356
- new KordocError(resolveErr instanceof Error ? resolveErr.message : "OCR \uD504\uB85C\uBC14\uC774\uB354 \uCD08\uAE30\uD654 \uC2E4\uD328"),
6357
- { isImageBased: true }
6358
- );
6359
- }
6360
- }
6361
+ const concurrency = options?.ocrConcurrency ?? 1;
6362
+ const batchSize = options?.ocrBatchSize;
6363
+ if (ocrMode === "off") {
6364
+ throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
6361
6365
  }
6362
- if (ocrProvider) {
6363
- let ocrBlocks = [];
6366
+ const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
6367
+ const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
6368
+ const tryProvider = async (provider) => {
6364
6369
  try {
6365
- const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
6366
- const concurrency = options?.ocrConcurrency ?? 1;
6367
- ocrBlocks = await ocrPages2(doc, ocrProvider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
6370
+ return await ocrPages2(doc, provider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
6368
6371
  } catch {
6372
+ return [];
6369
6373
  } finally {
6370
- const terminable = ocrProvider;
6374
+ const terminable = provider;
6371
6375
  if (typeof terminable.terminate === "function") {
6372
6376
  await terminable.terminate().catch(() => {
6373
6377
  });
6374
6378
  }
6375
6379
  }
6376
- if (ocrBlocks.length > 0) {
6377
- const ocrMarkdown = blocksToMarkdown(ocrBlocks);
6378
- return {
6379
- markdown: ocrMarkdown,
6380
- blocks: ocrBlocks,
6381
- metadata,
6382
- warnings: warnings.length > 0 ? warnings : void 0,
6383
- isImageBased: true
6384
- };
6380
+ };
6381
+ let ocrBlocks = [];
6382
+ if (options?.ocr) {
6383
+ ocrBlocks = await tryProvider(options.ocr);
6384
+ } else if (ocrMode === "auto") {
6385
+ const { getAutoFallbackChain: getAutoFallbackChain2 } = await Promise.resolve().then(() => (init_auto_detect(), auto_detect_exports));
6386
+ for (const mode of getAutoFallbackChain2()) {
6387
+ try {
6388
+ const provider = await resolveOcrProvider2(mode, warnings, concurrency, batchSize);
6389
+ const blocks2 = await tryProvider(provider);
6390
+ if (blocks2.length > 0) {
6391
+ ocrBlocks = blocks2;
6392
+ break;
6393
+ }
6394
+ warnings.push({ message: `OCR: '${mode}' \uACB0\uACFC \uC5C6\uC74C, \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC2DC\uB3C4`, code: "OCR_CLI_FALLBACK" });
6395
+ } catch {
6396
+ }
6397
+ }
6398
+ } else {
6399
+ try {
6400
+ const provider = await resolveOcrProvider2(ocrMode, warnings, concurrency, batchSize);
6401
+ ocrBlocks = await tryProvider(provider);
6402
+ } catch (resolveErr) {
6403
+ throw Object.assign(
6404
+ new KordocError(resolveErr instanceof Error ? resolveErr.message : "OCR \uD504\uB85C\uBC14\uC774\uB354 \uCD08\uAE30\uD654 \uC2E4\uD328"),
6405
+ { isImageBased: true }
6406
+ );
6385
6407
  }
6386
6408
  }
6387
- if (ocrMode === "off") {
6388
- throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
6409
+ if (ocrBlocks.length > 0) {
6410
+ const ocrMarkdown = blocksToMarkdown(ocrBlocks);
6411
+ return {
6412
+ markdown: ocrMarkdown,
6413
+ blocks: ocrBlocks,
6414
+ metadata,
6415
+ warnings: warnings.length > 0 ? warnings : void 0,
6416
+ isImageBased: true
6417
+ };
6389
6418
  }
6390
- const errMsg = ocrMode ? `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF \u2014 OCR \uC2E4\uD328 (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)` : `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`;
6391
- throw Object.assign(new KordocError(errMsg), { isImageBased: true });
6419
+ throw Object.assign(
6420
+ new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF \u2014 OCR \uC2E4\uD328 (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`),
6421
+ { isImageBased: true }
6422
+ );
6392
6423
  }
6393
6424
  if (options?.removeHeaderFooter !== false && parsedPageCount >= 3) {
6394
6425
  const removed = removeHeaderFooterBlocks(blocks, pageHeights, warnings);