@clazic/kordoc 2.4.4 → 2.4.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -4,12 +4,12 @@ import {
4
4
  markdownToHwpx,
5
5
  markdownToXlsx,
6
6
  parse
7
- } from "./chunk-5AXJRBBK.js";
7
+ } from "./chunk-L2CLLZ4S.js";
8
8
  import "./chunk-4PP34NVQ.js";
9
9
  import {
10
10
  VERSION,
11
11
  toArrayBuffer
12
- } from "./chunk-KEDUF24M.js";
12
+ } from "./chunk-A2FNPGBS.js";
13
13
  import "./chunk-MOL7MDBG.js";
14
14
  import "./chunk-7FMKAV4P.js";
15
15
  import "./chunk-JOGAFNIL.js";
@@ -137,7 +137,7 @@ async function runParse(files, opts) {
137
137
  saveImages(absPath);
138
138
  }
139
139
  } catch (err) {
140
- const { sanitizeError } = await import("./utils-BB2CDSTB.js");
140
+ const { sanitizeError } = await import("./utils-RQ4S2RVN.js");
141
141
  process.stderr.write(`
142
142
  [kordoc] ERROR: ${fileName} \u2014 ${sanitizeError(err)}
143
143
  `);
@@ -221,7 +221,7 @@ program.command("convert <input>").description("\uB9C8\uD06C\uB2E4\uC6B4 \uD30C\
221
221
  `));
222
222
  }
223
223
  } catch (err) {
224
- const { sanitizeError } = await import("./utils-BB2CDSTB.js");
224
+ const { sanitizeError } = await import("./utils-RQ4S2RVN.js");
225
225
  process.stderr.write(` FAIL
226
226
  `);
227
227
  process.stderr.write(` \u2192 ${sanitizeError(err)}
@@ -230,7 +230,7 @@ program.command("convert <input>").description("\uB9C8\uD06C\uB2E4\uC6B4 \uD30C\
230
230
  }
231
231
  });
232
232
  program.command("watch <dir>").description("\uB514\uB809\uD1A0\uB9AC \uAC10\uC2DC \u2014 \uC0C8 \uBB38\uC11C \uC790\uB3D9 \uBCC0\uD658").option("--webhook <url>", "\uACB0\uACFC \uC804\uC1A1 \uC6F9\uD6C5 URL").option("-d, --out-dir <dir>", "\uBCC0\uD658 \uACB0\uACFC \uCD9C\uB825 \uB514\uB809\uD1A0\uB9AC").option("-p, --pages <range>", "\uD398\uC774\uC9C0/\uC139\uC158 \uBC94\uC704").option("--format <type>", "\uCD9C\uB825 \uD615\uC2DD: markdown \uB610\uB294 json", "markdown").option("--silent", "\uC9C4\uD589 \uBA54\uC2DC\uC9C0 \uC228\uAE30\uAE30").action(async (dir, opts) => {
233
- const { watchDirectory } = await import("./watch-6QVK32X7.js");
233
+ const { watchDirectory } = await import("./watch-3EIG5EVL.js");
234
234
  await watchDirectory({
235
235
  dir,
236
236
  outDir: opts.outDir,
package/dist/index.cjs CHANGED
@@ -2693,29 +2693,44 @@ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warn
2693
2693
  let processed = 0;
2694
2694
  const batchTasks = pageBatches.map((batchPageNums, batchIdx) => async () => {
2695
2695
  const pageBlocks = [];
2696
+ const batchImages = [];
2696
2697
  try {
2697
- const batchImages = [];
2698
2698
  for (const pageNum of batchPageNums) {
2699
2699
  const page = await doc.getPage(pageNum);
2700
2700
  const image = await renderPageToPng(page);
2701
2701
  batchImages.push({ image, pageNum });
2702
2702
  }
2703
- const results = await provider.processBatch(batchImages);
2704
- for (const { pageNum } of batchImages) {
2705
- const result = results.get(pageNum);
2706
- pageBlocks.push({
2707
- pageNum,
2708
- blocks: result ? ocrResultToBlocks(result, pageNum) : []
2709
- });
2710
- }
2711
- } catch (err) {
2712
- const range = `${batchPageNums[0]}-${batchPageNums[batchPageNums.length - 1]}`;
2713
- warnings?.push({
2714
- message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
2715
- code: "OCR_PAGE_FAILED"
2716
- });
2703
+ } catch (renderErr) {
2704
+ const rendered = new Set(batchImages.map((b) => b.pageNum));
2717
2705
  for (const pageNum of batchPageNums) {
2718
- pageBlocks.push({ pageNum, blocks: [] });
2706
+ if (!rendered.has(pageNum)) pageBlocks.push({ pageNum, blocks: [] });
2707
+ }
2708
+ }
2709
+ if (batchImages.length > 0) {
2710
+ try {
2711
+ const results = await provider.processBatch(batchImages);
2712
+ for (const { pageNum } of batchImages) {
2713
+ const result = results.get(pageNum);
2714
+ pageBlocks.push({
2715
+ pageNum,
2716
+ blocks: result ? ocrResultToBlocks(result, pageNum) : []
2717
+ });
2718
+ }
2719
+ } catch (err) {
2720
+ const range = `${batchPageNums[0]}-${batchPageNums[batchPageNums.length - 1]}`;
2721
+ warnings?.push({
2722
+ message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"} \u2014 \uB2E8\uC77C \uD398\uC774\uC9C0\uB85C \uC7AC\uC2DC\uB3C4`,
2723
+ code: "OCR_PAGE_FAILED"
2724
+ });
2725
+ for (const { image, pageNum } of batchImages) {
2726
+ try {
2727
+ const singleResult = await provider.processBatch([{ image, pageNum }]);
2728
+ const r = singleResult.get(pageNum);
2729
+ pageBlocks.push({ pageNum, blocks: r ? ocrResultToBlocks(r, pageNum) : [] });
2730
+ } catch {
2731
+ pageBlocks.push({ pageNum, blocks: [] });
2732
+ }
2733
+ }
2719
2734
  }
2720
2735
  }
2721
2736
  processed += batchPageNums.length;
@@ -2825,7 +2840,7 @@ var import_jszip2 = __toESM(require("jszip"), 1);
2825
2840
  var import_xmldom = require("@xmldom/xmldom");
2826
2841
 
2827
2842
  // src/utils.ts
2828
- var VERSION = true ? "2.4.4" : "0.0.0-dev";
2843
+ var VERSION = true ? "2.4.6" : "0.0.0-dev";
2829
2844
  function toArrayBuffer(buf) {
2830
2845
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
2831
2846
  return buf.buffer;
@@ -6385,9 +6400,9 @@ async function parsePdfDocument(buffer, options) {
6385
6400
  }
6386
6401
  const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
6387
6402
  const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
6388
- const tryProvider = async (provider) => {
6403
+ const tryProvider = async (provider, filter) => {
6389
6404
  try {
6390
- return await ocrPages2(doc, provider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
6405
+ return await ocrPages2(doc, provider, filter, effectivePageCount, warnings, concurrency, options?.onProgress);
6391
6406
  } catch {
6392
6407
  return [];
6393
6408
  } finally {
@@ -6400,25 +6415,43 @@ async function parsePdfDocument(buffer, options) {
6400
6415
  };
6401
6416
  let ocrBlocks = [];
6402
6417
  if (options?.ocr) {
6403
- ocrBlocks = await tryProvider(options.ocr);
6418
+ ocrBlocks = await tryProvider(options.ocr, pageFilter);
6404
6419
  } else if (ocrMode === "auto") {
6405
6420
  const { getAutoFallbackChain: getAutoFallbackChain2 } = await Promise.resolve().then(() => (init_auto_detect(), auto_detect_exports));
6421
+ const pendingPages = /* @__PURE__ */ new Set();
6422
+ for (let i = 1; i <= effectivePageCount; i++) {
6423
+ if (!pageFilter || pageFilter.has(i)) pendingPages.add(i);
6424
+ }
6425
+ const allOcrBlocks = [];
6406
6426
  for (const mode of getAutoFallbackChain2()) {
6427
+ if (pendingPages.size === 0) break;
6407
6428
  try {
6429
+ const modeFilter = pendingPages.size < effectivePageCount ? new Set(pendingPages) : pageFilter;
6408
6430
  const provider = await resolveOcrProvider2(mode, warnings, concurrency, batchSize);
6409
- const blocks2 = await tryProvider(provider);
6431
+ const blocks2 = await tryProvider(provider, modeFilter);
6410
6432
  if (blocks2.length > 0) {
6411
- ocrBlocks = blocks2;
6412
- break;
6433
+ for (const b of blocks2) {
6434
+ if (b.pageNumber !== void 0) pendingPages.delete(b.pageNumber);
6435
+ }
6436
+ for (const b of blocks2) allOcrBlocks.push(b);
6437
+ if (pendingPages.size > 0) {
6438
+ warnings.push({
6439
+ message: `OCR: '${mode}' \uC644\uB8CC (${pendingPages.size}\uD398\uC774\uC9C0 \uBBF8\uCC98\uB9AC \u2192 \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC7AC\uC2DC\uB3C4)`,
6440
+ code: "OCR_CLI_FALLBACK"
6441
+ });
6442
+ }
6443
+ } else {
6444
+ warnings.push({ message: `OCR: '${mode}' \uACB0\uACFC \uC5C6\uC74C, \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC2DC\uB3C4`, code: "OCR_CLI_FALLBACK" });
6413
6445
  }
6414
- warnings.push({ message: `OCR: '${mode}' \uACB0\uACFC \uC5C6\uC74C, \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC2DC\uB3C4`, code: "OCR_CLI_FALLBACK" });
6415
6446
  } catch {
6416
6447
  }
6417
6448
  }
6449
+ allOcrBlocks.sort((a, b) => (a.pageNumber ?? 0) - (b.pageNumber ?? 0));
6450
+ ocrBlocks = allOcrBlocks;
6418
6451
  } else {
6419
6452
  try {
6420
6453
  const provider = await resolveOcrProvider2(ocrMode, warnings, concurrency, batchSize);
6421
- ocrBlocks = await tryProvider(provider);
6454
+ ocrBlocks = await tryProvider(provider, pageFilter);
6422
6455
  } catch (resolveErr) {
6423
6456
  throw Object.assign(
6424
6457
  new KordocError(resolveErr instanceof Error ? resolveErr.message : "OCR \uD504\uB85C\uBC14\uC774\uB354 \uCD08\uAE30\uD654 \uC2E4\uD328"),