@clazic/kordoc 2.4.4 → 2.4.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-5AXJRBBK.js → chunk-ATB6T3SG.js} +28 -11
- package/dist/chunk-ATB6T3SG.js.map +1 -0
- package/dist/{chunk-KEDUF24M.js → chunk-CG3DV7QG.js} +2 -2
- package/dist/cli.js +5 -5
- package/dist/index.cjs +57 -25
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +57 -25
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +2 -2
- package/dist/{provider-HE727F7Z.js → provider-7F7NEDTN.js} +32 -17
- package/dist/provider-7F7NEDTN.js.map +1 -0
- package/dist/{utils-BB2CDSTB.js → utils-LG2ALGSE.js} +2 -2
- package/dist/{watch-6QVK32X7.js → watch-Z3CENX4H.js} +3 -3
- package/package.json +1 -1
- package/dist/chunk-5AXJRBBK.js.map +0 -1
- package/dist/provider-HE727F7Z.js.map +0 -1
- /package/dist/{chunk-KEDUF24M.js.map → chunk-CG3DV7QG.js.map} +0 -0
- /package/dist/{utils-BB2CDSTB.js.map → utils-LG2ALGSE.js.map} +0 -0
- /package/dist/{watch-6QVK32X7.js.map → watch-Z3CENX4H.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -2696,29 +2696,44 @@ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warn
|
|
|
2696
2696
|
let processed = 0;
|
|
2697
2697
|
const batchTasks = pageBatches.map((batchPageNums, batchIdx) => async () => {
|
|
2698
2698
|
const pageBlocks = [];
|
|
2699
|
+
const batchImages = [];
|
|
2699
2700
|
try {
|
|
2700
|
-
const batchImages = [];
|
|
2701
2701
|
for (const pageNum of batchPageNums) {
|
|
2702
2702
|
const page = await doc.getPage(pageNum);
|
|
2703
2703
|
const image = await renderPageToPng(page);
|
|
2704
2704
|
batchImages.push({ image, pageNum });
|
|
2705
2705
|
}
|
|
2706
|
-
const results = await provider.processBatch(batchImages);
|
|
2707
|
-
for (const { pageNum } of batchImages) {
|
|
2708
|
-
const result = results.get(pageNum);
|
|
2709
|
-
pageBlocks.push({
|
|
2710
|
-
pageNum,
|
|
2711
|
-
blocks: result ? ocrResultToBlocks(result, pageNum) : []
|
|
2712
|
-
});
|
|
2713
|
-
}
|
|
2714
|
-
} catch (err) {
|
|
2715
|
-
const range = `${batchPageNums[0]}-${batchPageNums[batchPageNums.length - 1]}`;
|
|
2716
|
-
warnings?.push({
|
|
2717
|
-
message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
|
|
2718
|
-
code: "OCR_PAGE_FAILED"
|
|
2719
|
-
});
|
|
2706
|
+
} catch (renderErr) {
|
|
2707
|
+
const rendered = new Set(batchImages.map((b) => b.pageNum));
|
|
2720
2708
|
for (const pageNum of batchPageNums) {
|
|
2721
|
-
pageBlocks.push({ pageNum, blocks: [] });
|
|
2709
|
+
if (!rendered.has(pageNum)) pageBlocks.push({ pageNum, blocks: [] });
|
|
2710
|
+
}
|
|
2711
|
+
}
|
|
2712
|
+
if (batchImages.length > 0) {
|
|
2713
|
+
try {
|
|
2714
|
+
const results = await provider.processBatch(batchImages);
|
|
2715
|
+
for (const { pageNum } of batchImages) {
|
|
2716
|
+
const result = results.get(pageNum);
|
|
2717
|
+
pageBlocks.push({
|
|
2718
|
+
pageNum,
|
|
2719
|
+
blocks: result ? ocrResultToBlocks(result, pageNum) : []
|
|
2720
|
+
});
|
|
2721
|
+
}
|
|
2722
|
+
} catch (err) {
|
|
2723
|
+
const range = `${batchPageNums[0]}-${batchPageNums[batchPageNums.length - 1]}`;
|
|
2724
|
+
warnings?.push({
|
|
2725
|
+
message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"} \u2014 \uB2E8\uC77C \uD398\uC774\uC9C0\uB85C \uC7AC\uC2DC\uB3C4`,
|
|
2726
|
+
code: "OCR_PAGE_FAILED"
|
|
2727
|
+
});
|
|
2728
|
+
for (const { image, pageNum } of batchImages) {
|
|
2729
|
+
try {
|
|
2730
|
+
const singleResult = await provider.processBatch([{ image, pageNum }]);
|
|
2731
|
+
const r = singleResult.get(pageNum);
|
|
2732
|
+
pageBlocks.push({ pageNum, blocks: r ? ocrResultToBlocks(r, pageNum) : [] });
|
|
2733
|
+
} catch {
|
|
2734
|
+
pageBlocks.push({ pageNum, blocks: [] });
|
|
2735
|
+
}
|
|
2736
|
+
}
|
|
2722
2737
|
}
|
|
2723
2738
|
}
|
|
2724
2739
|
processed += batchPageNums.length;
|
|
@@ -2805,7 +2820,7 @@ import JSZip2 from "jszip";
|
|
|
2805
2820
|
import { DOMParser } from "@xmldom/xmldom";
|
|
2806
2821
|
|
|
2807
2822
|
// src/utils.ts
|
|
2808
|
-
var VERSION = true ? "2.4.4" : "0.0.0-dev";
|
|
2823
|
+
var VERSION = true ? "2.4.5" : "0.0.0-dev";
|
|
2809
2824
|
function toArrayBuffer(buf) {
|
|
2810
2825
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
2811
2826
|
return buf.buffer;
|
|
@@ -6365,9 +6380,9 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6365
6380
|
}
|
|
6366
6381
|
const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
|
|
6367
6382
|
const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
|
|
6368
|
-
const tryProvider = async (provider) => {
|
|
6383
|
+
const tryProvider = async (provider, filter) => {
|
|
6369
6384
|
try {
|
|
6370
|
-
return await ocrPages2(doc, provider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
|
|
6385
|
+
return await ocrPages2(doc, provider, filter, effectivePageCount, warnings, concurrency, options?.onProgress);
|
|
6371
6386
|
} catch {
|
|
6372
6387
|
return [];
|
|
6373
6388
|
} finally {
|
|
@@ -6380,25 +6395,42 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6380
6395
|
};
|
|
6381
6396
|
let ocrBlocks = [];
|
|
6382
6397
|
if (options?.ocr) {
|
|
6383
|
-
ocrBlocks = await tryProvider(options.ocr);
|
|
6398
|
+
ocrBlocks = await tryProvider(options.ocr, pageFilter);
|
|
6384
6399
|
} else if (ocrMode === "auto") {
|
|
6385
6400
|
const { getAutoFallbackChain: getAutoFallbackChain2 } = await Promise.resolve().then(() => (init_auto_detect(), auto_detect_exports));
|
|
6401
|
+
const pendingPages = /* @__PURE__ */ new Set();
|
|
6402
|
+
for (let i = 1; i <= effectivePageCount; i++) {
|
|
6403
|
+
if (!pageFilter || pageFilter.has(i)) pendingPages.add(i);
|
|
6404
|
+
}
|
|
6405
|
+
const allOcrBlocks = [];
|
|
6386
6406
|
for (const mode of getAutoFallbackChain2()) {
|
|
6407
|
+
if (pendingPages.size === 0) break;
|
|
6387
6408
|
try {
|
|
6409
|
+
const modeFilter = pendingPages.size < effectivePageCount ? new Set(pendingPages) : pageFilter;
|
|
6388
6410
|
const provider = await resolveOcrProvider2(mode, warnings, concurrency, batchSize);
|
|
6389
|
-
const blocks2 = await tryProvider(provider);
|
|
6411
|
+
const blocks2 = await tryProvider(provider, modeFilter);
|
|
6390
6412
|
if (blocks2.length > 0) {
|
|
6391
|
-
|
|
6392
|
-
|
|
6413
|
+
for (const b of blocks2) {
|
|
6414
|
+
if (b.pageNumber !== void 0) pendingPages.delete(b.pageNumber);
|
|
6415
|
+
}
|
|
6416
|
+
for (const b of blocks2) allOcrBlocks.push(b);
|
|
6417
|
+
if (pendingPages.size > 0) {
|
|
6418
|
+
warnings.push({
|
|
6419
|
+
message: `OCR: '${mode}' \uC644\uB8CC (${pendingPages.size}\uD398\uC774\uC9C0 \uBBF8\uCC98\uB9AC \u2192 \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC7AC\uC2DC\uB3C4)`,
|
|
6420
|
+
code: "OCR_CLI_FALLBACK"
|
|
6421
|
+
});
|
|
6422
|
+
}
|
|
6423
|
+
} else {
|
|
6424
|
+
warnings.push({ message: `OCR: '${mode}' \uACB0\uACFC \uC5C6\uC74C, \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC2DC\uB3C4`, code: "OCR_CLI_FALLBACK" });
|
|
6393
6425
|
}
|
|
6394
|
-
warnings.push({ message: `OCR: '${mode}' \uACB0\uACFC \uC5C6\uC74C, \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC2DC\uB3C4`, code: "OCR_CLI_FALLBACK" });
|
|
6395
6426
|
} catch {
|
|
6396
6427
|
}
|
|
6397
6428
|
}
|
|
6429
|
+
ocrBlocks = allOcrBlocks;
|
|
6398
6430
|
} else {
|
|
6399
6431
|
try {
|
|
6400
6432
|
const provider = await resolveOcrProvider2(ocrMode, warnings, concurrency, batchSize);
|
|
6401
|
-
ocrBlocks = await tryProvider(provider);
|
|
6433
|
+
ocrBlocks = await tryProvider(provider, pageFilter);
|
|
6402
6434
|
} catch (resolveErr) {
|
|
6403
6435
|
throw Object.assign(
|
|
6404
6436
|
new KordocError(resolveErr instanceof Error ? resolveErr.message : "OCR \uD504\uB85C\uBC14\uC774\uB354 \uCD08\uAE30\uD654 \uC2E4\uD328"),
|