@clazic/kordoc 2.4.3 → 2.4.5

This diff shows the changes between two publicly available versions of the package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env node
2
2
 
3
3
  // src/utils.ts
4
- var VERSION = true ? "2.4.3" : "0.0.0-dev";
4
+ var VERSION = true ? "2.4.5" : "0.0.0-dev";
5
5
  function toArrayBuffer(buf) {
6
6
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
7
7
  return buf.buffer;
@@ -90,4 +90,4 @@ export {
90
90
  sanitizeHref,
91
91
  classifyError
92
92
  };
93
- //# sourceMappingURL=chunk-HOUVJPR7.js.map
93
+ //# sourceMappingURL=chunk-CG3DV7QG.js.map
package/dist/cli.js CHANGED
@@ -4,15 +4,15 @@ import {
4
4
  markdownToHwpx,
5
5
  markdownToXlsx,
6
6
  parse
7
- } from "./chunk-IAU7NTTA.js";
7
+ } from "./chunk-ATB6T3SG.js";
8
+ import "./chunk-4PP34NVQ.js";
8
9
  import {
9
10
  VERSION,
10
11
  toArrayBuffer
11
- } from "./chunk-HOUVJPR7.js";
12
+ } from "./chunk-CG3DV7QG.js";
12
13
  import "./chunk-MOL7MDBG.js";
13
14
  import "./chunk-7FMKAV4P.js";
14
15
  import "./chunk-JOGAFNIL.js";
15
- import "./chunk-4PP34NVQ.js";
16
16
  import "./chunk-ZWE3DS7E.js";
17
17
 
18
18
  // src/cli.ts
@@ -137,7 +137,7 @@ async function runParse(files, opts) {
137
137
  saveImages(absPath);
138
138
  }
139
139
  } catch (err) {
140
- const { sanitizeError } = await import("./utils-PYEEPTPM.js");
140
+ const { sanitizeError } = await import("./utils-LG2ALGSE.js");
141
141
  process.stderr.write(`
142
142
  [kordoc] ERROR: ${fileName} \u2014 ${sanitizeError(err)}
143
143
  `);
@@ -221,7 +221,7 @@ program.command("convert <input>").description("\uB9C8\uD06C\uB2E4\uC6B4 \uD30C\
221
221
  `));
222
222
  }
223
223
  } catch (err) {
224
- const { sanitizeError } = await import("./utils-PYEEPTPM.js");
224
+ const { sanitizeError } = await import("./utils-LG2ALGSE.js");
225
225
  process.stderr.write(` FAIL
226
226
  `);
227
227
  process.stderr.write(` \u2192 ${sanitizeError(err)}
@@ -230,7 +230,7 @@ program.command("convert <input>").description("\uB9C8\uD06C\uB2E4\uC6B4 \uD30C\
230
230
  }
231
231
  });
232
232
  program.command("watch <dir>").description("\uB514\uB809\uD1A0\uB9AC \uAC10\uC2DC \u2014 \uC0C8 \uBB38\uC11C \uC790\uB3D9 \uBCC0\uD658").option("--webhook <url>", "\uACB0\uACFC \uC804\uC1A1 \uC6F9\uD6C5 URL").option("-d, --out-dir <dir>", "\uBCC0\uD658 \uACB0\uACFC \uCD9C\uB825 \uB514\uB809\uD1A0\uB9AC").option("-p, --pages <range>", "\uD398\uC774\uC9C0/\uC139\uC158 \uBC94\uC704").option("--format <type>", "\uCD9C\uB825 \uD615\uC2DD: markdown \uB610\uB294 json", "markdown").option("--silent", "\uC9C4\uD589 \uBA54\uC2DC\uC9C0 \uC228\uAE30\uAE30").action(async (dir, opts) => {
233
- const { watchDirectory } = await import("./watch-IQLSW2OB.js");
233
+ const { watchDirectory } = await import("./watch-Z3CENX4H.js");
234
234
  await watchDirectory({
235
235
  dir,
236
236
  outDir: opts.outDir,
package/dist/index.cjs CHANGED
@@ -1957,6 +1957,13 @@ var require_cfb = __commonJS({
1957
1957
  });
1958
1958
 
1959
1959
  // src/ocr/auto-detect.ts
1960
+ var auto_detect_exports = {};
1961
+ __export(auto_detect_exports, {
1962
+ detectAvailableOcr: () => detectAvailableOcr,
1963
+ getAutoFallbackChain: () => getAutoFallbackChain,
1964
+ getTesseractFallbackMessage: () => getTesseractFallbackMessage,
1965
+ validateOcrMode: () => validateOcrMode
1966
+ });
1960
1967
  function detectAvailableOcr() {
1961
1968
  for (const cli of CLI_PRIORITY) {
1962
1969
  if (isCliInstalled(cli)) return cli;
@@ -1972,6 +1979,14 @@ function isCliInstalled(name) {
1972
1979
  return false;
1973
1980
  }
1974
1981
  }
1982
+ function getAutoFallbackChain() {
1983
+ const chain = [];
1984
+ for (const cli of CLI_PRIORITY) {
1985
+ if (isCliInstalled(cli)) chain.push(cli);
1986
+ }
1987
+ chain.push("tesseract");
1988
+ return chain;
1989
+ }
1975
1990
  function validateOcrMode(mode) {
1976
1991
  if (mode === "auto" || mode === "off" || mode === "tesseract") return;
1977
1992
  if (!isCliInstalled(mode)) {
@@ -2678,29 +2693,44 @@ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warn
2678
2693
  let processed = 0;
2679
2694
  const batchTasks = pageBatches.map((batchPageNums, batchIdx) => async () => {
2680
2695
  const pageBlocks = [];
2696
+ const batchImages = [];
2681
2697
  try {
2682
- const batchImages = [];
2683
2698
  for (const pageNum of batchPageNums) {
2684
2699
  const page = await doc.getPage(pageNum);
2685
2700
  const image = await renderPageToPng(page);
2686
2701
  batchImages.push({ image, pageNum });
2687
2702
  }
2688
- const results = await provider.processBatch(batchImages);
2689
- for (const { pageNum } of batchImages) {
2690
- const result = results.get(pageNum);
2691
- pageBlocks.push({
2692
- pageNum,
2693
- blocks: result ? ocrResultToBlocks(result, pageNum) : []
2694
- });
2695
- }
2696
- } catch (err) {
2697
- const range = `${batchPageNums[0]}-${batchPageNums[batchPageNums.length - 1]}`;
2698
- warnings?.push({
2699
- message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
2700
- code: "OCR_PAGE_FAILED"
2701
- });
2703
+ } catch (renderErr) {
2704
+ const rendered = new Set(batchImages.map((b) => b.pageNum));
2702
2705
  for (const pageNum of batchPageNums) {
2703
- pageBlocks.push({ pageNum, blocks: [] });
2706
+ if (!rendered.has(pageNum)) pageBlocks.push({ pageNum, blocks: [] });
2707
+ }
2708
+ }
2709
+ if (batchImages.length > 0) {
2710
+ try {
2711
+ const results = await provider.processBatch(batchImages);
2712
+ for (const { pageNum } of batchImages) {
2713
+ const result = results.get(pageNum);
2714
+ pageBlocks.push({
2715
+ pageNum,
2716
+ blocks: result ? ocrResultToBlocks(result, pageNum) : []
2717
+ });
2718
+ }
2719
+ } catch (err) {
2720
+ const range = `${batchPageNums[0]}-${batchPageNums[batchPageNums.length - 1]}`;
2721
+ warnings?.push({
2722
+ message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"} \u2014 \uB2E8\uC77C \uD398\uC774\uC9C0\uB85C \uC7AC\uC2DC\uB3C4`,
2723
+ code: "OCR_PAGE_FAILED"
2724
+ });
2725
+ for (const { image, pageNum } of batchImages) {
2726
+ try {
2727
+ const singleResult = await provider.processBatch([{ image, pageNum }]);
2728
+ const r = singleResult.get(pageNum);
2729
+ pageBlocks.push({ pageNum, blocks: r ? ocrResultToBlocks(r, pageNum) : [] });
2730
+ } catch {
2731
+ pageBlocks.push({ pageNum, blocks: [] });
2732
+ }
2733
+ }
2704
2734
  }
2705
2735
  }
2706
2736
  processed += batchPageNums.length;
@@ -2810,7 +2840,7 @@ var import_jszip2 = __toESM(require("jszip"), 1);
2810
2840
  var import_xmldom = require("@xmldom/xmldom");
2811
2841
 
2812
2842
  // src/utils.ts
2813
- var VERSION = true ? "2.4.3" : "0.0.0-dev";
2843
+ var VERSION = true ? "2.4.5" : "0.0.0-dev";
2814
2844
  function toArrayBuffer(buf) {
2815
2845
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
2816
2846
  return buf.buffer;
@@ -6362,53 +6392,86 @@ async function parsePdfDocument(buffer, options) {
6362
6392
  }
6363
6393
  const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
6364
6394
  if (isImageBased) {
6365
- let ocrProvider = options?.ocr ?? null;
6366
6395
  const ocrMode = options?.ocrMode ?? "auto";
6367
- if (!ocrProvider && ocrMode !== "off") {
6368
- try {
6369
- const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
6370
- const concurrency = options?.ocrConcurrency ?? 1;
6371
- const batchSize = options?.ocrBatchSize;
6372
- ocrProvider = await resolveOcrProvider2(ocrMode, warnings, concurrency, batchSize);
6373
- } catch (resolveErr) {
6374
- if (ocrMode !== "auto") {
6375
- throw Object.assign(
6376
- new KordocError(resolveErr instanceof Error ? resolveErr.message : "OCR \uD504\uB85C\uBC14\uC774\uB354 \uCD08\uAE30\uD654 \uC2E4\uD328"),
6377
- { isImageBased: true }
6378
- );
6379
- }
6380
- }
6396
+ const concurrency = options?.ocrConcurrency ?? 1;
6397
+ const batchSize = options?.ocrBatchSize;
6398
+ if (ocrMode === "off") {
6399
+ throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
6381
6400
  }
6382
- if (ocrProvider) {
6383
- let ocrBlocks = [];
6401
+ const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
6402
+ const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
6403
+ const tryProvider = async (provider, filter) => {
6384
6404
  try {
6385
- const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
6386
- const concurrency = options?.ocrConcurrency ?? 1;
6387
- ocrBlocks = await ocrPages2(doc, ocrProvider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
6405
+ return await ocrPages2(doc, provider, filter, effectivePageCount, warnings, concurrency, options?.onProgress);
6388
6406
  } catch {
6407
+ return [];
6389
6408
  } finally {
6390
- const terminable = ocrProvider;
6409
+ const terminable = provider;
6391
6410
  if (typeof terminable.terminate === "function") {
6392
6411
  await terminable.terminate().catch(() => {
6393
6412
  });
6394
6413
  }
6395
6414
  }
6396
- if (ocrBlocks.length > 0) {
6397
- const ocrMarkdown = blocksToMarkdown(ocrBlocks);
6398
- return {
6399
- markdown: ocrMarkdown,
6400
- blocks: ocrBlocks,
6401
- metadata,
6402
- warnings: warnings.length > 0 ? warnings : void 0,
6403
- isImageBased: true
6404
- };
6415
+ };
6416
+ let ocrBlocks = [];
6417
+ if (options?.ocr) {
6418
+ ocrBlocks = await tryProvider(options.ocr, pageFilter);
6419
+ } else if (ocrMode === "auto") {
6420
+ const { getAutoFallbackChain: getAutoFallbackChain2 } = await Promise.resolve().then(() => (init_auto_detect(), auto_detect_exports));
6421
+ const pendingPages = /* @__PURE__ */ new Set();
6422
+ for (let i = 1; i <= effectivePageCount; i++) {
6423
+ if (!pageFilter || pageFilter.has(i)) pendingPages.add(i);
6424
+ }
6425
+ const allOcrBlocks = [];
6426
+ for (const mode of getAutoFallbackChain2()) {
6427
+ if (pendingPages.size === 0) break;
6428
+ try {
6429
+ const modeFilter = pendingPages.size < effectivePageCount ? new Set(pendingPages) : pageFilter;
6430
+ const provider = await resolveOcrProvider2(mode, warnings, concurrency, batchSize);
6431
+ const blocks2 = await tryProvider(provider, modeFilter);
6432
+ if (blocks2.length > 0) {
6433
+ for (const b of blocks2) {
6434
+ if (b.pageNumber !== void 0) pendingPages.delete(b.pageNumber);
6435
+ }
6436
+ for (const b of blocks2) allOcrBlocks.push(b);
6437
+ if (pendingPages.size > 0) {
6438
+ warnings.push({
6439
+ message: `OCR: '${mode}' \uC644\uB8CC (${pendingPages.size}\uD398\uC774\uC9C0 \uBBF8\uCC98\uB9AC \u2192 \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC7AC\uC2DC\uB3C4)`,
6440
+ code: "OCR_CLI_FALLBACK"
6441
+ });
6442
+ }
6443
+ } else {
6444
+ warnings.push({ message: `OCR: '${mode}' \uACB0\uACFC \uC5C6\uC74C, \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC2DC\uB3C4`, code: "OCR_CLI_FALLBACK" });
6445
+ }
6446
+ } catch {
6447
+ }
6448
+ }
6449
+ ocrBlocks = allOcrBlocks;
6450
+ } else {
6451
+ try {
6452
+ const provider = await resolveOcrProvider2(ocrMode, warnings, concurrency, batchSize);
6453
+ ocrBlocks = await tryProvider(provider, pageFilter);
6454
+ } catch (resolveErr) {
6455
+ throw Object.assign(
6456
+ new KordocError(resolveErr instanceof Error ? resolveErr.message : "OCR \uD504\uB85C\uBC14\uC774\uB354 \uCD08\uAE30\uD654 \uC2E4\uD328"),
6457
+ { isImageBased: true }
6458
+ );
6405
6459
  }
6406
6460
  }
6407
- if (ocrMode === "off") {
6408
- throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
6461
+ if (ocrBlocks.length > 0) {
6462
+ const ocrMarkdown = blocksToMarkdown(ocrBlocks);
6463
+ return {
6464
+ markdown: ocrMarkdown,
6465
+ blocks: ocrBlocks,
6466
+ metadata,
6467
+ warnings: warnings.length > 0 ? warnings : void 0,
6468
+ isImageBased: true
6469
+ };
6409
6470
  }
6410
- const errMsg = ocrMode ? `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF \u2014 OCR \uC2E4\uD328 (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)` : `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`;
6411
- throw Object.assign(new KordocError(errMsg), { isImageBased: true });
6471
+ throw Object.assign(
6472
+ new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF \u2014 OCR \uC2E4\uD328 (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`),
6473
+ { isImageBased: true }
6474
+ );
6412
6475
  }
6413
6476
  if (options?.removeHeaderFooter !== false && parsedPageCount >= 3) {
6414
6477
  const removed = removeHeaderFooterBlocks(blocks, pageHeights, warnings);