@clazic/kordoc 2.4.3 → 2.4.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/auto-detect-2YGFYQCN.js +15 -0
- package/dist/chunk-7NOZFYH6.js +63 -0
- package/dist/chunk-7NOZFYH6.js.map +1 -0
- package/dist/{chunk-IAU7NTTA.js → chunk-ATB6T3SG.js} +72 -39
- package/dist/chunk-ATB6T3SG.js.map +1 -0
- package/dist/{chunk-HOUVJPR7.js → chunk-CG3DV7QG.js} +2 -2
- package/dist/cli.js +6 -6
- package/dist/index.cjs +114 -51
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +114 -51
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +3 -3
- package/dist/{provider-HE727F7Z.js → provider-7F7NEDTN.js} +32 -17
- package/dist/provider-7F7NEDTN.js.map +1 -0
- package/dist/{resolve-UOAOPQ4H.js → resolve-TZVGVOVD.js} +6 -47
- package/dist/resolve-TZVGVOVD.js.map +1 -0
- package/dist/{utils-PYEEPTPM.js → utils-LG2ALGSE.js} +2 -2
- package/dist/utils-LG2ALGSE.js.map +1 -0
- package/dist/{watch-IQLSW2OB.js → watch-Z3CENX4H.js} +4 -4
- package/package.json +1 -1
- package/dist/chunk-IAU7NTTA.js.map +0 -1
- package/dist/provider-HE727F7Z.js.map +0 -1
- package/dist/resolve-UOAOPQ4H.js.map +0 -1
- /package/dist/{utils-PYEEPTPM.js.map → auto-detect-2YGFYQCN.js.map} +0 -0
- /package/dist/{chunk-HOUVJPR7.js.map → chunk-CG3DV7QG.js.map} +0 -0
- /package/dist/{watch-IQLSW2OB.js.map → watch-Z3CENX4H.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -1961,6 +1961,13 @@ var require_cfb = __commonJS({
|
|
|
1961
1961
|
});
|
|
1962
1962
|
|
|
1963
1963
|
// src/ocr/auto-detect.ts
|
|
1964
|
+
var auto_detect_exports = {};
|
|
1965
|
+
__export(auto_detect_exports, {
|
|
1966
|
+
detectAvailableOcr: () => detectAvailableOcr,
|
|
1967
|
+
getAutoFallbackChain: () => getAutoFallbackChain,
|
|
1968
|
+
getTesseractFallbackMessage: () => getTesseractFallbackMessage,
|
|
1969
|
+
validateOcrMode: () => validateOcrMode
|
|
1970
|
+
});
|
|
1964
1971
|
import { execSync } from "child_process";
|
|
1965
1972
|
function detectAvailableOcr() {
|
|
1966
1973
|
for (const cli of CLI_PRIORITY) {
|
|
@@ -1977,6 +1984,14 @@ function isCliInstalled(name) {
|
|
|
1977
1984
|
return false;
|
|
1978
1985
|
}
|
|
1979
1986
|
}
|
|
1987
|
+
function getAutoFallbackChain() {
|
|
1988
|
+
const chain = [];
|
|
1989
|
+
for (const cli of CLI_PRIORITY) {
|
|
1990
|
+
if (isCliInstalled(cli)) chain.push(cli);
|
|
1991
|
+
}
|
|
1992
|
+
chain.push("tesseract");
|
|
1993
|
+
return chain;
|
|
1994
|
+
}
|
|
1980
1995
|
function validateOcrMode(mode) {
|
|
1981
1996
|
if (mode === "auto" || mode === "off" || mode === "tesseract") return;
|
|
1982
1997
|
if (!isCliInstalled(mode)) {
|
|
@@ -2681,29 +2696,44 @@ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warn
|
|
|
2681
2696
|
let processed = 0;
|
|
2682
2697
|
const batchTasks = pageBatches.map((batchPageNums, batchIdx) => async () => {
|
|
2683
2698
|
const pageBlocks = [];
|
|
2699
|
+
const batchImages = [];
|
|
2684
2700
|
try {
|
|
2685
|
-
const batchImages = [];
|
|
2686
2701
|
for (const pageNum of batchPageNums) {
|
|
2687
2702
|
const page = await doc.getPage(pageNum);
|
|
2688
2703
|
const image = await renderPageToPng(page);
|
|
2689
2704
|
batchImages.push({ image, pageNum });
|
|
2690
2705
|
}
|
|
2691
|
-
|
|
2692
|
-
|
|
2693
|
-
const result = results.get(pageNum);
|
|
2694
|
-
pageBlocks.push({
|
|
2695
|
-
pageNum,
|
|
2696
|
-
blocks: result ? ocrResultToBlocks(result, pageNum) : []
|
|
2697
|
-
});
|
|
2698
|
-
}
|
|
2699
|
-
} catch (err) {
|
|
2700
|
-
const range = `${batchPageNums[0]}-${batchPageNums[batchPageNums.length - 1]}`;
|
|
2701
|
-
warnings?.push({
|
|
2702
|
-
message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
|
|
2703
|
-
code: "OCR_PAGE_FAILED"
|
|
2704
|
-
});
|
|
2706
|
+
} catch (renderErr) {
|
|
2707
|
+
const rendered = new Set(batchImages.map((b) => b.pageNum));
|
|
2705
2708
|
for (const pageNum of batchPageNums) {
|
|
2706
|
-
pageBlocks.push({ pageNum, blocks: [] });
|
|
2709
|
+
if (!rendered.has(pageNum)) pageBlocks.push({ pageNum, blocks: [] });
|
|
2710
|
+
}
|
|
2711
|
+
}
|
|
2712
|
+
if (batchImages.length > 0) {
|
|
2713
|
+
try {
|
|
2714
|
+
const results = await provider.processBatch(batchImages);
|
|
2715
|
+
for (const { pageNum } of batchImages) {
|
|
2716
|
+
const result = results.get(pageNum);
|
|
2717
|
+
pageBlocks.push({
|
|
2718
|
+
pageNum,
|
|
2719
|
+
blocks: result ? ocrResultToBlocks(result, pageNum) : []
|
|
2720
|
+
});
|
|
2721
|
+
}
|
|
2722
|
+
} catch (err) {
|
|
2723
|
+
const range = `${batchPageNums[0]}-${batchPageNums[batchPageNums.length - 1]}`;
|
|
2724
|
+
warnings?.push({
|
|
2725
|
+
message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"} \u2014 \uB2E8\uC77C \uD398\uC774\uC9C0\uB85C \uC7AC\uC2DC\uB3C4`,
|
|
2726
|
+
code: "OCR_PAGE_FAILED"
|
|
2727
|
+
});
|
|
2728
|
+
for (const { image, pageNum } of batchImages) {
|
|
2729
|
+
try {
|
|
2730
|
+
const singleResult = await provider.processBatch([{ image, pageNum }]);
|
|
2731
|
+
const r = singleResult.get(pageNum);
|
|
2732
|
+
pageBlocks.push({ pageNum, blocks: r ? ocrResultToBlocks(r, pageNum) : [] });
|
|
2733
|
+
} catch {
|
|
2734
|
+
pageBlocks.push({ pageNum, blocks: [] });
|
|
2735
|
+
}
|
|
2736
|
+
}
|
|
2707
2737
|
}
|
|
2708
2738
|
}
|
|
2709
2739
|
processed += batchPageNums.length;
|
|
@@ -2790,7 +2820,7 @@ import JSZip2 from "jszip";
|
|
|
2790
2820
|
import { DOMParser } from "@xmldom/xmldom";
|
|
2791
2821
|
|
|
2792
2822
|
// src/utils.ts
|
|
2793
|
-
var VERSION = true ? "2.4.3" : "0.0.0-dev";
|
|
2823
|
+
var VERSION = true ? "2.4.5" : "0.0.0-dev";
|
|
2794
2824
|
function toArrayBuffer(buf) {
|
|
2795
2825
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
2796
2826
|
return buf.buffer;
|
|
@@ -6342,53 +6372,86 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6342
6372
|
}
|
|
6343
6373
|
const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
|
|
6344
6374
|
if (isImageBased) {
|
|
6345
|
-
let ocrProvider = options?.ocr ?? null;
|
|
6346
6375
|
const ocrMode = options?.ocrMode ?? "auto";
|
|
6347
|
-
|
|
6348
|
-
|
|
6349
|
-
|
|
6350
|
-
|
|
6351
|
-
const batchSize = options?.ocrBatchSize;
|
|
6352
|
-
ocrProvider = await resolveOcrProvider2(ocrMode, warnings, concurrency, batchSize);
|
|
6353
|
-
} catch (resolveErr) {
|
|
6354
|
-
if (ocrMode !== "auto") {
|
|
6355
|
-
throw Object.assign(
|
|
6356
|
-
new KordocError(resolveErr instanceof Error ? resolveErr.message : "OCR \uD504\uB85C\uBC14\uC774\uB354 \uCD08\uAE30\uD654 \uC2E4\uD328"),
|
|
6357
|
-
{ isImageBased: true }
|
|
6358
|
-
);
|
|
6359
|
-
}
|
|
6360
|
-
}
|
|
6376
|
+
const concurrency = options?.ocrConcurrency ?? 1;
|
|
6377
|
+
const batchSize = options?.ocrBatchSize;
|
|
6378
|
+
if (ocrMode === "off") {
|
|
6379
|
+
throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
|
|
6361
6380
|
}
|
|
6362
|
-
|
|
6363
|
-
|
|
6381
|
+
const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
|
|
6382
|
+
const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
|
|
6383
|
+
const tryProvider = async (provider, filter) => {
|
|
6364
6384
|
try {
|
|
6365
|
-
|
|
6366
|
-
const concurrency = options?.ocrConcurrency ?? 1;
|
|
6367
|
-
ocrBlocks = await ocrPages2(doc, ocrProvider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
|
|
6385
|
+
return await ocrPages2(doc, provider, filter, effectivePageCount, warnings, concurrency, options?.onProgress);
|
|
6368
6386
|
} catch {
|
|
6387
|
+
return [];
|
|
6369
6388
|
} finally {
|
|
6370
|
-
const terminable = ocrProvider;
|
|
6389
|
+
const terminable = provider;
|
|
6371
6390
|
if (typeof terminable.terminate === "function") {
|
|
6372
6391
|
await terminable.terminate().catch(() => {
|
|
6373
6392
|
});
|
|
6374
6393
|
}
|
|
6375
6394
|
}
|
|
6376
|
-
|
|
6377
|
-
|
|
6378
|
-
|
|
6379
|
-
|
|
6380
|
-
|
|
6381
|
-
|
|
6382
|
-
|
|
6383
|
-
|
|
6384
|
-
|
|
6395
|
+
};
|
|
6396
|
+
let ocrBlocks = [];
|
|
6397
|
+
if (options?.ocr) {
|
|
6398
|
+
ocrBlocks = await tryProvider(options.ocr, pageFilter);
|
|
6399
|
+
} else if (ocrMode === "auto") {
|
|
6400
|
+
const { getAutoFallbackChain: getAutoFallbackChain2 } = await Promise.resolve().then(() => (init_auto_detect(), auto_detect_exports));
|
|
6401
|
+
const pendingPages = /* @__PURE__ */ new Set();
|
|
6402
|
+
for (let i = 1; i <= effectivePageCount; i++) {
|
|
6403
|
+
if (!pageFilter || pageFilter.has(i)) pendingPages.add(i);
|
|
6404
|
+
}
|
|
6405
|
+
const allOcrBlocks = [];
|
|
6406
|
+
for (const mode of getAutoFallbackChain2()) {
|
|
6407
|
+
if (pendingPages.size === 0) break;
|
|
6408
|
+
try {
|
|
6409
|
+
const modeFilter = pendingPages.size < effectivePageCount ? new Set(pendingPages) : pageFilter;
|
|
6410
|
+
const provider = await resolveOcrProvider2(mode, warnings, concurrency, batchSize);
|
|
6411
|
+
const blocks2 = await tryProvider(provider, modeFilter);
|
|
6412
|
+
if (blocks2.length > 0) {
|
|
6413
|
+
for (const b of blocks2) {
|
|
6414
|
+
if (b.pageNumber !== void 0) pendingPages.delete(b.pageNumber);
|
|
6415
|
+
}
|
|
6416
|
+
for (const b of blocks2) allOcrBlocks.push(b);
|
|
6417
|
+
if (pendingPages.size > 0) {
|
|
6418
|
+
warnings.push({
|
|
6419
|
+
message: `OCR: '${mode}' \uC644\uB8CC (${pendingPages.size}\uD398\uC774\uC9C0 \uBBF8\uCC98\uB9AC \u2192 \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC7AC\uC2DC\uB3C4)`,
|
|
6420
|
+
code: "OCR_CLI_FALLBACK"
|
|
6421
|
+
});
|
|
6422
|
+
}
|
|
6423
|
+
} else {
|
|
6424
|
+
warnings.push({ message: `OCR: '${mode}' \uACB0\uACFC \uC5C6\uC74C, \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC2DC\uB3C4`, code: "OCR_CLI_FALLBACK" });
|
|
6425
|
+
}
|
|
6426
|
+
} catch {
|
|
6427
|
+
}
|
|
6428
|
+
}
|
|
6429
|
+
ocrBlocks = allOcrBlocks;
|
|
6430
|
+
} else {
|
|
6431
|
+
try {
|
|
6432
|
+
const provider = await resolveOcrProvider2(ocrMode, warnings, concurrency, batchSize);
|
|
6433
|
+
ocrBlocks = await tryProvider(provider, pageFilter);
|
|
6434
|
+
} catch (resolveErr) {
|
|
6435
|
+
throw Object.assign(
|
|
6436
|
+
new KordocError(resolveErr instanceof Error ? resolveErr.message : "OCR \uD504\uB85C\uBC14\uC774\uB354 \uCD08\uAE30\uD654 \uC2E4\uD328"),
|
|
6437
|
+
{ isImageBased: true }
|
|
6438
|
+
);
|
|
6385
6439
|
}
|
|
6386
6440
|
}
|
|
6387
|
-
if (
|
|
6388
|
-
|
|
6441
|
+
if (ocrBlocks.length > 0) {
|
|
6442
|
+
const ocrMarkdown = blocksToMarkdown(ocrBlocks);
|
|
6443
|
+
return {
|
|
6444
|
+
markdown: ocrMarkdown,
|
|
6445
|
+
blocks: ocrBlocks,
|
|
6446
|
+
metadata,
|
|
6447
|
+
warnings: warnings.length > 0 ? warnings : void 0,
|
|
6448
|
+
isImageBased: true
|
|
6449
|
+
};
|
|
6389
6450
|
}
|
|
6390
|
-
|
|
6391
|
-
|
|
6451
|
+
throw Object.assign(
|
|
6452
|
+
new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF \u2014 OCR \uC2E4\uD328 (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`),
|
|
6453
|
+
{ isImageBased: true }
|
|
6454
|
+
);
|
|
6392
6455
|
}
|
|
6393
6456
|
if (options?.removeHeaderFooter !== false && parsedPageCount >= 3) {
|
|
6394
6457
|
const removed = removeHeaderFooterBlocks(blocks, pageHeights, warnings);
|