@clazic/kordoc 2.4.3 → 2.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/auto-detect-2YGFYQCN.js +15 -0
- package/dist/{chunk-IAU7NTTA.js → chunk-5AXJRBBK.js} +55 -39
- package/dist/chunk-5AXJRBBK.js.map +1 -0
- package/dist/chunk-7NOZFYH6.js +63 -0
- package/dist/chunk-7NOZFYH6.js.map +1 -0
- package/dist/{chunk-HOUVJPR7.js → chunk-KEDUF24M.js} +2 -2
- package/dist/cli.js +6 -6
- package/dist/index.cjs +66 -35
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +66 -35
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +3 -3
- package/dist/{resolve-UOAOPQ4H.js → resolve-TZVGVOVD.js} +6 -47
- package/dist/resolve-TZVGVOVD.js.map +1 -0
- package/dist/{utils-PYEEPTPM.js → utils-BB2CDSTB.js} +2 -2
- package/dist/utils-BB2CDSTB.js.map +1 -0
- package/dist/{watch-IQLSW2OB.js → watch-6QVK32X7.js} +4 -4
- package/package.json +1 -1
- package/dist/chunk-IAU7NTTA.js.map +0 -1
- package/dist/resolve-UOAOPQ4H.js.map +0 -1
- /package/dist/{utils-PYEEPTPM.js.map → auto-detect-2YGFYQCN.js.map} +0 -0
- /package/dist/{chunk-HOUVJPR7.js.map → chunk-KEDUF24M.js.map} +0 -0
- /package/dist/{watch-IQLSW2OB.js.map → watch-6QVK32X7.js.map} +0 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import {
|
|
3
|
+
detectAvailableOcr,
|
|
4
|
+
getAutoFallbackChain,
|
|
5
|
+
getTesseractFallbackMessage,
|
|
6
|
+
validateOcrMode
|
|
7
|
+
} from "./chunk-7NOZFYH6.js";
|
|
8
|
+
import "./chunk-ZWE3DS7E.js";
|
|
9
|
+
export {
|
|
10
|
+
detectAvailableOcr,
|
|
11
|
+
getAutoFallbackChain,
|
|
12
|
+
getTesseractFallbackMessage,
|
|
13
|
+
validateOcrMode
|
|
14
|
+
};
|
|
15
|
+
//# sourceMappingURL=auto-detect-2YGFYQCN.js.map
|
|
@@ -1,4 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
+
import {
|
|
3
|
+
markdownToBlocks
|
|
4
|
+
} from "./chunk-4PP34NVQ.js";
|
|
2
5
|
import {
|
|
3
6
|
KordocError,
|
|
4
7
|
classifyError,
|
|
@@ -6,7 +9,7 @@ import {
|
|
|
6
9
|
precheckZipSize,
|
|
7
10
|
sanitizeHref,
|
|
8
11
|
toArrayBuffer
|
|
9
|
-
} from "./chunk-HOUVJPR7.js";
|
|
12
|
+
} from "./chunk-KEDUF24M.js";
|
|
10
13
|
import {
|
|
11
14
|
parsePageRange
|
|
12
15
|
} from "./chunk-MOL7MDBG.js";
|
|
@@ -16,9 +19,6 @@ import {
|
|
|
16
19
|
import {
|
|
17
20
|
createCliOcrProvider
|
|
18
21
|
} from "./chunk-JOGAFNIL.js";
|
|
19
|
-
import {
|
|
20
|
-
markdownToBlocks
|
|
21
|
-
} from "./chunk-4PP34NVQ.js";
|
|
22
22
|
import {
|
|
23
23
|
__commonJS,
|
|
24
24
|
__require,
|
|
@@ -5456,53 +5456,69 @@ async function parsePdfDocument(buffer, options) {
|
|
|
5456
5456
|
}
|
|
5457
5457
|
const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
|
|
5458
5458
|
if (isImageBased) {
|
|
5459
|
-
let ocrProvider = options?.ocr ?? null;
|
|
5460
5459
|
const ocrMode = options?.ocrMode ?? "auto";
|
|
5461
|
-
|
|
5462
|
-
|
|
5463
|
-
|
|
5464
|
-
|
|
5465
|
-
const batchSize = options?.ocrBatchSize;
|
|
5466
|
-
ocrProvider = await resolveOcrProvider(ocrMode, warnings, concurrency, batchSize);
|
|
5467
|
-
} catch (resolveErr) {
|
|
5468
|
-
if (ocrMode !== "auto") {
|
|
5469
|
-
throw Object.assign(
|
|
5470
|
-
new KordocError(resolveErr instanceof Error ? resolveErr.message : "OCR \uD504\uB85C\uBC14\uC774\uB354 \uCD08\uAE30\uD654 \uC2E4\uD328"),
|
|
5471
|
-
{ isImageBased: true }
|
|
5472
|
-
);
|
|
5473
|
-
}
|
|
5474
|
-
}
|
|
5460
|
+
const concurrency = options?.ocrConcurrency ?? 1;
|
|
5461
|
+
const batchSize = options?.ocrBatchSize;
|
|
5462
|
+
if (ocrMode === "off") {
|
|
5463
|
+
throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
|
|
5475
5464
|
}
|
|
5476
|
-
|
|
5477
|
-
|
|
5465
|
+
const { resolveOcrProvider } = await import("./resolve-TZVGVOVD.js");
|
|
5466
|
+
const { ocrPages } = await import("./provider-HE727F7Z.js");
|
|
5467
|
+
const tryProvider = async (provider) => {
|
|
5478
5468
|
try {
|
|
5479
|
-
|
|
5480
|
-
const concurrency = options?.ocrConcurrency ?? 1;
|
|
5481
|
-
ocrBlocks = await ocrPages(doc, ocrProvider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
|
|
5469
|
+
return await ocrPages(doc, provider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
|
|
5482
5470
|
} catch {
|
|
5471
|
+
return [];
|
|
5483
5472
|
} finally {
|
|
5484
|
-
const terminable = ocrProvider;
|
|
5473
|
+
const terminable = provider;
|
|
5485
5474
|
if (typeof terminable.terminate === "function") {
|
|
5486
5475
|
await terminable.terminate().catch(() => {
|
|
5487
5476
|
});
|
|
5488
5477
|
}
|
|
5489
5478
|
}
|
|
5490
|
-
|
|
5491
|
-
|
|
5492
|
-
|
|
5493
|
-
|
|
5494
|
-
|
|
5495
|
-
|
|
5496
|
-
|
|
5497
|
-
|
|
5498
|
-
|
|
5479
|
+
};
|
|
5480
|
+
let ocrBlocks = [];
|
|
5481
|
+
if (options?.ocr) {
|
|
5482
|
+
ocrBlocks = await tryProvider(options.ocr);
|
|
5483
|
+
} else if (ocrMode === "auto") {
|
|
5484
|
+
const { getAutoFallbackChain } = await import("./auto-detect-2YGFYQCN.js");
|
|
5485
|
+
for (const mode of getAutoFallbackChain()) {
|
|
5486
|
+
try {
|
|
5487
|
+
const provider = await resolveOcrProvider(mode, warnings, concurrency, batchSize);
|
|
5488
|
+
const blocks2 = await tryProvider(provider);
|
|
5489
|
+
if (blocks2.length > 0) {
|
|
5490
|
+
ocrBlocks = blocks2;
|
|
5491
|
+
break;
|
|
5492
|
+
}
|
|
5493
|
+
warnings.push({ message: `OCR: '${mode}' \uACB0\uACFC \uC5C6\uC74C, \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC2DC\uB3C4`, code: "OCR_CLI_FALLBACK" });
|
|
5494
|
+
} catch {
|
|
5495
|
+
}
|
|
5496
|
+
}
|
|
5497
|
+
} else {
|
|
5498
|
+
try {
|
|
5499
|
+
const provider = await resolveOcrProvider(ocrMode, warnings, concurrency, batchSize);
|
|
5500
|
+
ocrBlocks = await tryProvider(provider);
|
|
5501
|
+
} catch (resolveErr) {
|
|
5502
|
+
throw Object.assign(
|
|
5503
|
+
new KordocError(resolveErr instanceof Error ? resolveErr.message : "OCR \uD504\uB85C\uBC14\uC774\uB354 \uCD08\uAE30\uD654 \uC2E4\uD328"),
|
|
5504
|
+
{ isImageBased: true }
|
|
5505
|
+
);
|
|
5499
5506
|
}
|
|
5500
5507
|
}
|
|
5501
|
-
if (
|
|
5502
|
-
|
|
5508
|
+
if (ocrBlocks.length > 0) {
|
|
5509
|
+
const ocrMarkdown = blocksToMarkdown(ocrBlocks);
|
|
5510
|
+
return {
|
|
5511
|
+
markdown: ocrMarkdown,
|
|
5512
|
+
blocks: ocrBlocks,
|
|
5513
|
+
metadata,
|
|
5514
|
+
warnings: warnings.length > 0 ? warnings : void 0,
|
|
5515
|
+
isImageBased: true
|
|
5516
|
+
};
|
|
5503
5517
|
}
|
|
5504
|
-
|
|
5505
|
-
|
|
5518
|
+
throw Object.assign(
|
|
5519
|
+
new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF \u2014 OCR \uC2E4\uD328 (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`),
|
|
5520
|
+
{ isImageBased: true }
|
|
5521
|
+
);
|
|
5506
5522
|
}
|
|
5507
5523
|
if (options?.removeHeaderFooter !== false && parsedPageCount >= 3) {
|
|
5508
5524
|
const removed = removeHeaderFooterBlocks(blocks, pageHeights, warnings);
|
|
@@ -9779,4 +9795,4 @@ export {
|
|
|
9779
9795
|
cfb/cfb.js:
|
|
9780
9796
|
(*! crc32.js (C) 2014-present SheetJS -- http://sheetjs.com *)
|
|
9781
9797
|
*/
|
|
9782
|
-
//# sourceMappingURL=chunk-IAU7NTTA.js.map
|
|
9798
|
+
//# sourceMappingURL=chunk-5AXJRBBK.js.map
|