@clazic/kordoc 2.4.3 → 2.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/auto-detect-2YGFYQCN.js +15 -0
- package/dist/{chunk-IAU7NTTA.js → chunk-5AXJRBBK.js} +55 -39
- package/dist/chunk-5AXJRBBK.js.map +1 -0
- package/dist/chunk-7NOZFYH6.js +63 -0
- package/dist/chunk-7NOZFYH6.js.map +1 -0
- package/dist/{chunk-HOUVJPR7.js → chunk-KEDUF24M.js} +2 -2
- package/dist/cli.js +6 -6
- package/dist/index.cjs +66 -35
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +66 -35
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +3 -3
- package/dist/{resolve-UOAOPQ4H.js → resolve-TZVGVOVD.js} +6 -47
- package/dist/resolve-TZVGVOVD.js.map +1 -0
- package/dist/{utils-PYEEPTPM.js → utils-BB2CDSTB.js} +2 -2
- package/dist/utils-BB2CDSTB.js.map +1 -0
- package/dist/{watch-IQLSW2OB.js → watch-6QVK32X7.js} +4 -4
- package/package.json +1 -1
- package/dist/chunk-IAU7NTTA.js.map +0 -1
- package/dist/resolve-UOAOPQ4H.js.map +0 -1
- /package/dist/{utils-PYEEPTPM.js.map → auto-detect-2YGFYQCN.js.map} +0 -0
- /package/dist/{chunk-HOUVJPR7.js.map → chunk-KEDUF24M.js.map} +0 -0
- /package/dist/{watch-IQLSW2OB.js.map → watch-6QVK32X7.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -1961,6 +1961,13 @@ var require_cfb = __commonJS({
|
|
|
1961
1961
|
});
|
|
1962
1962
|
|
|
1963
1963
|
// src/ocr/auto-detect.ts
|
|
1964
|
+
var auto_detect_exports = {};
|
|
1965
|
+
__export(auto_detect_exports, {
|
|
1966
|
+
detectAvailableOcr: () => detectAvailableOcr,
|
|
1967
|
+
getAutoFallbackChain: () => getAutoFallbackChain,
|
|
1968
|
+
getTesseractFallbackMessage: () => getTesseractFallbackMessage,
|
|
1969
|
+
validateOcrMode: () => validateOcrMode
|
|
1970
|
+
});
|
|
1964
1971
|
import { execSync } from "child_process";
|
|
1965
1972
|
function detectAvailableOcr() {
|
|
1966
1973
|
for (const cli of CLI_PRIORITY) {
|
|
@@ -1977,6 +1984,14 @@ function isCliInstalled(name) {
|
|
|
1977
1984
|
return false;
|
|
1978
1985
|
}
|
|
1979
1986
|
}
|
|
1987
|
+
function getAutoFallbackChain() {
|
|
1988
|
+
const chain = [];
|
|
1989
|
+
for (const cli of CLI_PRIORITY) {
|
|
1990
|
+
if (isCliInstalled(cli)) chain.push(cli);
|
|
1991
|
+
}
|
|
1992
|
+
chain.push("tesseract");
|
|
1993
|
+
return chain;
|
|
1994
|
+
}
|
|
1980
1995
|
function validateOcrMode(mode) {
|
|
1981
1996
|
if (mode === "auto" || mode === "off" || mode === "tesseract") return;
|
|
1982
1997
|
if (!isCliInstalled(mode)) {
|
|
@@ -2790,7 +2805,7 @@ import JSZip2 from "jszip";
|
|
|
2790
2805
|
import { DOMParser } from "@xmldom/xmldom";
|
|
2791
2806
|
|
|
2792
2807
|
// src/utils.ts
|
|
2793
|
-
var VERSION = true ? "2.4.3" : "0.0.0-dev";
|
|
2808
|
+
var VERSION = true ? "2.4.4" : "0.0.0-dev";
|
|
2794
2809
|
function toArrayBuffer(buf) {
|
|
2795
2810
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
2796
2811
|
return buf.buffer;
|
|
@@ -6342,53 +6357,69 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6342
6357
|
}
|
|
6343
6358
|
const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
|
|
6344
6359
|
if (isImageBased) {
|
|
6345
|
-
let ocrProvider = options?.ocr ?? null;
|
|
6346
6360
|
const ocrMode = options?.ocrMode ?? "auto";
|
|
6347
|
-
|
|
6348
|
-
|
|
6349
|
-
|
|
6350
|
-
|
|
6351
|
-
const batchSize = options?.ocrBatchSize;
|
|
6352
|
-
ocrProvider = await resolveOcrProvider2(ocrMode, warnings, concurrency, batchSize);
|
|
6353
|
-
} catch (resolveErr) {
|
|
6354
|
-
if (ocrMode !== "auto") {
|
|
6355
|
-
throw Object.assign(
|
|
6356
|
-
new KordocError(resolveErr instanceof Error ? resolveErr.message : "OCR \uD504\uB85C\uBC14\uC774\uB354 \uCD08\uAE30\uD654 \uC2E4\uD328"),
|
|
6357
|
-
{ isImageBased: true }
|
|
6358
|
-
);
|
|
6359
|
-
}
|
|
6360
|
-
}
|
|
6361
|
+
const concurrency = options?.ocrConcurrency ?? 1;
|
|
6362
|
+
const batchSize = options?.ocrBatchSize;
|
|
6363
|
+
if (ocrMode === "off") {
|
|
6364
|
+
throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
|
|
6361
6365
|
}
|
|
6362
|
-
|
|
6363
|
-
|
|
6366
|
+
const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
|
|
6367
|
+
const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
|
|
6368
|
+
const tryProvider = async (provider) => {
|
|
6364
6369
|
try {
|
|
6365
|
-
|
|
6366
|
-
const concurrency = options?.ocrConcurrency ?? 1;
|
|
6367
|
-
ocrBlocks = await ocrPages2(doc, ocrProvider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
|
|
6370
|
+
return await ocrPages2(doc, provider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
|
|
6368
6371
|
} catch {
|
|
6372
|
+
return [];
|
|
6369
6373
|
} finally {
|
|
6370
|
-
const terminable =
|
|
6374
|
+
const terminable = provider;
|
|
6371
6375
|
if (typeof terminable.terminate === "function") {
|
|
6372
6376
|
await terminable.terminate().catch(() => {
|
|
6373
6377
|
});
|
|
6374
6378
|
}
|
|
6375
6379
|
}
|
|
6376
|
-
|
|
6377
|
-
|
|
6378
|
-
|
|
6379
|
-
|
|
6380
|
-
|
|
6381
|
-
|
|
6382
|
-
|
|
6383
|
-
|
|
6384
|
-
|
|
6380
|
+
};
|
|
6381
|
+
let ocrBlocks = [];
|
|
6382
|
+
if (options?.ocr) {
|
|
6383
|
+
ocrBlocks = await tryProvider(options.ocr);
|
|
6384
|
+
} else if (ocrMode === "auto") {
|
|
6385
|
+
const { getAutoFallbackChain: getAutoFallbackChain2 } = await Promise.resolve().then(() => (init_auto_detect(), auto_detect_exports));
|
|
6386
|
+
for (const mode of getAutoFallbackChain2()) {
|
|
6387
|
+
try {
|
|
6388
|
+
const provider = await resolveOcrProvider2(mode, warnings, concurrency, batchSize);
|
|
6389
|
+
const blocks2 = await tryProvider(provider);
|
|
6390
|
+
if (blocks2.length > 0) {
|
|
6391
|
+
ocrBlocks = blocks2;
|
|
6392
|
+
break;
|
|
6393
|
+
}
|
|
6394
|
+
warnings.push({ message: `OCR: '${mode}' \uACB0\uACFC \uC5C6\uC74C, \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC2DC\uB3C4`, code: "OCR_CLI_FALLBACK" });
|
|
6395
|
+
} catch {
|
|
6396
|
+
}
|
|
6397
|
+
}
|
|
6398
|
+
} else {
|
|
6399
|
+
try {
|
|
6400
|
+
const provider = await resolveOcrProvider2(ocrMode, warnings, concurrency, batchSize);
|
|
6401
|
+
ocrBlocks = await tryProvider(provider);
|
|
6402
|
+
} catch (resolveErr) {
|
|
6403
|
+
throw Object.assign(
|
|
6404
|
+
new KordocError(resolveErr instanceof Error ? resolveErr.message : "OCR \uD504\uB85C\uBC14\uC774\uB354 \uCD08\uAE30\uD654 \uC2E4\uD328"),
|
|
6405
|
+
{ isImageBased: true }
|
|
6406
|
+
);
|
|
6385
6407
|
}
|
|
6386
6408
|
}
|
|
6387
|
-
if (
|
|
6388
|
-
|
|
6409
|
+
if (ocrBlocks.length > 0) {
|
|
6410
|
+
const ocrMarkdown = blocksToMarkdown(ocrBlocks);
|
|
6411
|
+
return {
|
|
6412
|
+
markdown: ocrMarkdown,
|
|
6413
|
+
blocks: ocrBlocks,
|
|
6414
|
+
metadata,
|
|
6415
|
+
warnings: warnings.length > 0 ? warnings : void 0,
|
|
6416
|
+
isImageBased: true
|
|
6417
|
+
};
|
|
6389
6418
|
}
|
|
6390
|
-
|
|
6391
|
-
|
|
6419
|
+
throw Object.assign(
|
|
6420
|
+
new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF \u2014 OCR \uC2E4\uD328 (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`),
|
|
6421
|
+
{ isImageBased: true }
|
|
6422
|
+
);
|
|
6392
6423
|
}
|
|
6393
6424
|
if (options?.removeHeaderFooter !== false && parsedPageCount >= 3) {
|
|
6394
6425
|
const removed = removeHeaderFooterBlocks(blocks, pageHeights, warnings);
|