@clazic/kordoc 2.4.2 → 2.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/auto-detect-2YGFYQCN.js +15 -0
- package/dist/{chunk-I3HO5HLQ.js → chunk-5AXJRBBK.js} +71 -44
- package/dist/chunk-5AXJRBBK.js.map +1 -0
- package/dist/chunk-7NOZFYH6.js +63 -0
- package/dist/chunk-7NOZFYH6.js.map +1 -0
- package/dist/{chunk-CMZPKEJ7.js → chunk-KEDUF24M.js} +2 -2
- package/dist/cli.js +6 -6
- package/dist/index.cjs +84 -42
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +84 -42
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +3 -3
- package/dist/resolve-TZVGVOVD.js +70 -0
- package/dist/resolve-TZVGVOVD.js.map +1 -0
- package/dist/{utils-BRQCU3AW.js → utils-BB2CDSTB.js} +2 -2
- package/dist/utils-BB2CDSTB.js.map +1 -0
- package/dist/{watch-SWG6JGKP.js → watch-6QVK32X7.js} +4 -4
- package/package.json +1 -1
- package/dist/chunk-I3HO5HLQ.js.map +0 -1
- package/dist/resolve-QA3VACUP.js +0 -111
- package/dist/resolve-QA3VACUP.js.map +0 -1
- /package/dist/{utils-BRQCU3AW.js.map → auto-detect-2YGFYQCN.js.map} +0 -0
- /package/dist/{chunk-CMZPKEJ7.js.map → chunk-KEDUF24M.js.map} +0 -0
- /package/dist/{watch-SWG6JGKP.js.map → watch-6QVK32X7.js.map} +0 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import {
|
|
3
|
+
detectAvailableOcr,
|
|
4
|
+
getAutoFallbackChain,
|
|
5
|
+
getTesseractFallbackMessage,
|
|
6
|
+
validateOcrMode
|
|
7
|
+
} from "./chunk-7NOZFYH6.js";
|
|
8
|
+
import "./chunk-ZWE3DS7E.js";
|
|
9
|
+
export {
|
|
10
|
+
detectAvailableOcr,
|
|
11
|
+
getAutoFallbackChain,
|
|
12
|
+
getTesseractFallbackMessage,
|
|
13
|
+
validateOcrMode
|
|
14
|
+
};
|
|
15
|
+
//# sourceMappingURL=auto-detect-2YGFYQCN.js.map
|
|
@@ -1,4 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
+
import {
|
|
3
|
+
markdownToBlocks
|
|
4
|
+
} from "./chunk-4PP34NVQ.js";
|
|
2
5
|
import {
|
|
3
6
|
KordocError,
|
|
4
7
|
classifyError,
|
|
@@ -6,7 +9,7 @@ import {
|
|
|
6
9
|
precheckZipSize,
|
|
7
10
|
sanitizeHref,
|
|
8
11
|
toArrayBuffer
|
|
9
|
-
} from "./chunk-
|
|
12
|
+
} from "./chunk-KEDUF24M.js";
|
|
10
13
|
import {
|
|
11
14
|
parsePageRange
|
|
12
15
|
} from "./chunk-MOL7MDBG.js";
|
|
@@ -16,9 +19,6 @@ import {
|
|
|
16
19
|
import {
|
|
17
20
|
createCliOcrProvider
|
|
18
21
|
} from "./chunk-JOGAFNIL.js";
|
|
19
|
-
import {
|
|
20
|
-
markdownToBlocks
|
|
21
|
-
} from "./chunk-4PP34NVQ.js";
|
|
22
22
|
import {
|
|
23
23
|
__commonJS,
|
|
24
24
|
__require,
|
|
@@ -5432,66 +5432,93 @@ async function parsePdfDocument(buffer, options) {
|
|
|
5432
5432
|
warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
|
|
5433
5433
|
}
|
|
5434
5434
|
};
|
|
5435
|
-
const
|
|
5436
|
-
|
|
5435
|
+
const SAMPLE_SIZE = Math.min(10, targetPageNums.length);
|
|
5436
|
+
const sampledIndices = /* @__PURE__ */ new Set();
|
|
5437
|
+
if (targetPageNums.length <= SAMPLE_SIZE) {
|
|
5438
|
+
for (let i = 0; i < targetPageNums.length; i++) sampledIndices.add(i);
|
|
5439
|
+
} else {
|
|
5440
|
+
for (let i = 0; i < SAMPLE_SIZE; i++) {
|
|
5441
|
+
const idx = Math.round(i * (targetPageNums.length - 1) / (SAMPLE_SIZE - 1));
|
|
5442
|
+
sampledIndices.add(idx);
|
|
5443
|
+
}
|
|
5444
|
+
}
|
|
5445
|
+
for (const si of sampledIndices) {
|
|
5437
5446
|
await parseSinglePage(targetPageNums[si]);
|
|
5438
5447
|
}
|
|
5439
|
-
const sampleParsed = parsedPages ||
|
|
5448
|
+
const sampleParsed = parsedPages || sampledIndices.size;
|
|
5440
5449
|
const isImageBased = totalChars / Math.max(sampleParsed, 1) < 10;
|
|
5441
5450
|
if (!isImageBased) {
|
|
5442
|
-
for (let si =
|
|
5443
|
-
|
|
5451
|
+
for (let si = 0; si < targetPageNums.length; si++) {
|
|
5452
|
+
if (!sampledIndices.has(si)) {
|
|
5453
|
+
await parseSinglePage(targetPageNums[si]);
|
|
5454
|
+
}
|
|
5444
5455
|
}
|
|
5445
5456
|
}
|
|
5446
5457
|
const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
|
|
5447
5458
|
if (isImageBased) {
|
|
5448
|
-
let ocrProvider = options?.ocr ?? null;
|
|
5449
5459
|
const ocrMode = options?.ocrMode ?? "auto";
|
|
5450
|
-
|
|
5451
|
-
|
|
5452
|
-
|
|
5453
|
-
|
|
5454
|
-
const batchSize = options?.ocrBatchSize;
|
|
5455
|
-
ocrProvider = await resolveOcrProvider(ocrMode, warnings, concurrency, batchSize);
|
|
5456
|
-
} catch (resolveErr) {
|
|
5457
|
-
if (ocrMode !== "auto") {
|
|
5458
|
-
throw Object.assign(
|
|
5459
|
-
new KordocError(resolveErr instanceof Error ? resolveErr.message : "OCR \uD504\uB85C\uBC14\uC774\uB354 \uCD08\uAE30\uD654 \uC2E4\uD328"),
|
|
5460
|
-
{ isImageBased: true }
|
|
5461
|
-
);
|
|
5462
|
-
}
|
|
5463
|
-
}
|
|
5460
|
+
const concurrency = options?.ocrConcurrency ?? 1;
|
|
5461
|
+
const batchSize = options?.ocrBatchSize;
|
|
5462
|
+
if (ocrMode === "off") {
|
|
5463
|
+
throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
|
|
5464
5464
|
}
|
|
5465
|
-
|
|
5466
|
-
|
|
5465
|
+
const { resolveOcrProvider } = await import("./resolve-TZVGVOVD.js");
|
|
5466
|
+
const { ocrPages } = await import("./provider-HE727F7Z.js");
|
|
5467
|
+
const tryProvider = async (provider) => {
|
|
5467
5468
|
try {
|
|
5468
|
-
|
|
5469
|
-
const concurrency = options?.ocrConcurrency ?? 1;
|
|
5470
|
-
ocrBlocks = await ocrPages(doc, ocrProvider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
|
|
5469
|
+
return await ocrPages(doc, provider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
|
|
5471
5470
|
} catch {
|
|
5471
|
+
return [];
|
|
5472
5472
|
} finally {
|
|
5473
|
-
const terminable =
|
|
5473
|
+
const terminable = provider;
|
|
5474
5474
|
if (typeof terminable.terminate === "function") {
|
|
5475
5475
|
await terminable.terminate().catch(() => {
|
|
5476
5476
|
});
|
|
5477
5477
|
}
|
|
5478
5478
|
}
|
|
5479
|
-
|
|
5480
|
-
|
|
5481
|
-
|
|
5482
|
-
|
|
5483
|
-
|
|
5484
|
-
|
|
5485
|
-
|
|
5486
|
-
|
|
5487
|
-
|
|
5479
|
+
};
|
|
5480
|
+
let ocrBlocks = [];
|
|
5481
|
+
if (options?.ocr) {
|
|
5482
|
+
ocrBlocks = await tryProvider(options.ocr);
|
|
5483
|
+
} else if (ocrMode === "auto") {
|
|
5484
|
+
const { getAutoFallbackChain } = await import("./auto-detect-2YGFYQCN.js");
|
|
5485
|
+
for (const mode of getAutoFallbackChain()) {
|
|
5486
|
+
try {
|
|
5487
|
+
const provider = await resolveOcrProvider(mode, warnings, concurrency, batchSize);
|
|
5488
|
+
const blocks2 = await tryProvider(provider);
|
|
5489
|
+
if (blocks2.length > 0) {
|
|
5490
|
+
ocrBlocks = blocks2;
|
|
5491
|
+
break;
|
|
5492
|
+
}
|
|
5493
|
+
warnings.push({ message: `OCR: '${mode}' \uACB0\uACFC \uC5C6\uC74C, \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC2DC\uB3C4`, code: "OCR_CLI_FALLBACK" });
|
|
5494
|
+
} catch {
|
|
5495
|
+
}
|
|
5496
|
+
}
|
|
5497
|
+
} else {
|
|
5498
|
+
try {
|
|
5499
|
+
const provider = await resolveOcrProvider(ocrMode, warnings, concurrency, batchSize);
|
|
5500
|
+
ocrBlocks = await tryProvider(provider);
|
|
5501
|
+
} catch (resolveErr) {
|
|
5502
|
+
throw Object.assign(
|
|
5503
|
+
new KordocError(resolveErr instanceof Error ? resolveErr.message : "OCR \uD504\uB85C\uBC14\uC774\uB354 \uCD08\uAE30\uD654 \uC2E4\uD328"),
|
|
5504
|
+
{ isImageBased: true }
|
|
5505
|
+
);
|
|
5488
5506
|
}
|
|
5489
5507
|
}
|
|
5490
|
-
if (
|
|
5491
|
-
|
|
5508
|
+
if (ocrBlocks.length > 0) {
|
|
5509
|
+
const ocrMarkdown = blocksToMarkdown(ocrBlocks);
|
|
5510
|
+
return {
|
|
5511
|
+
markdown: ocrMarkdown,
|
|
5512
|
+
blocks: ocrBlocks,
|
|
5513
|
+
metadata,
|
|
5514
|
+
warnings: warnings.length > 0 ? warnings : void 0,
|
|
5515
|
+
isImageBased: true
|
|
5516
|
+
};
|
|
5492
5517
|
}
|
|
5493
|
-
|
|
5494
|
-
|
|
5518
|
+
throw Object.assign(
|
|
5519
|
+
new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF \u2014 OCR \uC2E4\uD328 (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`),
|
|
5520
|
+
{ isImageBased: true }
|
|
5521
|
+
);
|
|
5495
5522
|
}
|
|
5496
5523
|
if (options?.removeHeaderFooter !== false && parsedPageCount >= 3) {
|
|
5497
5524
|
const removed = removeHeaderFooterBlocks(blocks, pageHeights, warnings);
|
|
@@ -9768,4 +9795,4 @@ export {
|
|
|
9768
9795
|
cfb/cfb.js:
|
|
9769
9796
|
(*! crc32.js (C) 2014-present SheetJS -- http://sheetjs.com *)
|
|
9770
9797
|
*/
|
|
9771
|
-
//# sourceMappingURL=chunk-
|
|
9798
|
+
//# sourceMappingURL=chunk-5AXJRBBK.js.map
|