@clazic/kordoc 2.4.3 → 2.4.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/auto-detect-2YGFYQCN.js +15 -0
- package/dist/chunk-7NOZFYH6.js +63 -0
- package/dist/chunk-7NOZFYH6.js.map +1 -0
- package/dist/{chunk-IAU7NTTA.js → chunk-ATB6T3SG.js} +72 -39
- package/dist/chunk-ATB6T3SG.js.map +1 -0
- package/dist/{chunk-HOUVJPR7.js → chunk-CG3DV7QG.js} +2 -2
- package/dist/cli.js +6 -6
- package/dist/index.cjs +114 -51
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +114 -51
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +3 -3
- package/dist/{provider-HE727F7Z.js → provider-7F7NEDTN.js} +32 -17
- package/dist/provider-7F7NEDTN.js.map +1 -0
- package/dist/{resolve-UOAOPQ4H.js → resolve-TZVGVOVD.js} +6 -47
- package/dist/resolve-TZVGVOVD.js.map +1 -0
- package/dist/{utils-PYEEPTPM.js → utils-LG2ALGSE.js} +2 -2
- package/dist/utils-LG2ALGSE.js.map +1 -0
- package/dist/{watch-IQLSW2OB.js → watch-Z3CENX4H.js} +4 -4
- package/package.json +1 -1
- package/dist/chunk-IAU7NTTA.js.map +0 -1
- package/dist/provider-HE727F7Z.js.map +0 -1
- package/dist/resolve-UOAOPQ4H.js.map +0 -1
- /package/dist/{utils-PYEEPTPM.js.map → auto-detect-2YGFYQCN.js.map} +0 -0
- /package/dist/{chunk-HOUVJPR7.js.map → chunk-CG3DV7QG.js.map} +0 -0
- /package/dist/{watch-IQLSW2OB.js.map → watch-Z3CENX4H.js.map} +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
// src/utils.ts
|
|
4
|
-
var VERSION = true ? "2.4.3" : "0.0.0-dev";
|
|
4
|
+
var VERSION = true ? "2.4.5" : "0.0.0-dev";
|
|
5
5
|
function toArrayBuffer(buf) {
|
|
6
6
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
7
7
|
return buf.buffer;
|
|
@@ -90,4 +90,4 @@ export {
|
|
|
90
90
|
sanitizeHref,
|
|
91
91
|
classifyError
|
|
92
92
|
};
|
|
93
|
-
//# sourceMappingURL=chunk-HOUVJPR7.js.map
|
|
93
|
+
//# sourceMappingURL=chunk-CG3DV7QG.js.map
|
package/dist/cli.js
CHANGED
|
@@ -4,15 +4,15 @@ import {
|
|
|
4
4
|
markdownToHwpx,
|
|
5
5
|
markdownToXlsx,
|
|
6
6
|
parse
|
|
7
|
-
} from "./chunk-IAU7NTTA.js";
|
|
7
|
+
} from "./chunk-ATB6T3SG.js";
|
|
8
|
+
import "./chunk-4PP34NVQ.js";
|
|
8
9
|
import {
|
|
9
10
|
VERSION,
|
|
10
11
|
toArrayBuffer
|
|
11
|
-
} from "./chunk-HOUVJPR7.js";
|
|
12
|
+
} from "./chunk-CG3DV7QG.js";
|
|
12
13
|
import "./chunk-MOL7MDBG.js";
|
|
13
14
|
import "./chunk-7FMKAV4P.js";
|
|
14
15
|
import "./chunk-JOGAFNIL.js";
|
|
15
|
-
import "./chunk-4PP34NVQ.js";
|
|
16
16
|
import "./chunk-ZWE3DS7E.js";
|
|
17
17
|
|
|
18
18
|
// src/cli.ts
|
|
@@ -137,7 +137,7 @@ async function runParse(files, opts) {
|
|
|
137
137
|
saveImages(absPath);
|
|
138
138
|
}
|
|
139
139
|
} catch (err) {
|
|
140
|
-
const { sanitizeError } = await import("./utils-PYEEPTPM.js");
|
|
140
|
+
const { sanitizeError } = await import("./utils-LG2ALGSE.js");
|
|
141
141
|
process.stderr.write(`
|
|
142
142
|
[kordoc] ERROR: ${fileName} \u2014 ${sanitizeError(err)}
|
|
143
143
|
`);
|
|
@@ -221,7 +221,7 @@ program.command("convert <input>").description("\uB9C8\uD06C\uB2E4\uC6B4 \uD30C\
|
|
|
221
221
|
`));
|
|
222
222
|
}
|
|
223
223
|
} catch (err) {
|
|
224
|
-
const { sanitizeError } = await import("./utils-PYEEPTPM.js");
|
|
224
|
+
const { sanitizeError } = await import("./utils-LG2ALGSE.js");
|
|
225
225
|
process.stderr.write(` FAIL
|
|
226
226
|
`);
|
|
227
227
|
process.stderr.write(` \u2192 ${sanitizeError(err)}
|
|
@@ -230,7 +230,7 @@ program.command("convert <input>").description("\uB9C8\uD06C\uB2E4\uC6B4 \uD30C\
|
|
|
230
230
|
}
|
|
231
231
|
});
|
|
232
232
|
program.command("watch <dir>").description("\uB514\uB809\uD1A0\uB9AC \uAC10\uC2DC \u2014 \uC0C8 \uBB38\uC11C \uC790\uB3D9 \uBCC0\uD658").option("--webhook <url>", "\uACB0\uACFC \uC804\uC1A1 \uC6F9\uD6C5 URL").option("-d, --out-dir <dir>", "\uBCC0\uD658 \uACB0\uACFC \uCD9C\uB825 \uB514\uB809\uD1A0\uB9AC").option("-p, --pages <range>", "\uD398\uC774\uC9C0/\uC139\uC158 \uBC94\uC704").option("--format <type>", "\uCD9C\uB825 \uD615\uC2DD: markdown \uB610\uB294 json", "markdown").option("--silent", "\uC9C4\uD589 \uBA54\uC2DC\uC9C0 \uC228\uAE30\uAE30").action(async (dir, opts) => {
|
|
233
|
-
const { watchDirectory } = await import("./watch-IQLSW2OB.js");
|
|
233
|
+
const { watchDirectory } = await import("./watch-Z3CENX4H.js");
|
|
234
234
|
await watchDirectory({
|
|
235
235
|
dir,
|
|
236
236
|
outDir: opts.outDir,
|
package/dist/index.cjs
CHANGED
|
@@ -1957,6 +1957,13 @@ var require_cfb = __commonJS({
|
|
|
1957
1957
|
});
|
|
1958
1958
|
|
|
1959
1959
|
// src/ocr/auto-detect.ts
|
|
1960
|
+
var auto_detect_exports = {};
|
|
1961
|
+
__export(auto_detect_exports, {
|
|
1962
|
+
detectAvailableOcr: () => detectAvailableOcr,
|
|
1963
|
+
getAutoFallbackChain: () => getAutoFallbackChain,
|
|
1964
|
+
getTesseractFallbackMessage: () => getTesseractFallbackMessage,
|
|
1965
|
+
validateOcrMode: () => validateOcrMode
|
|
1966
|
+
});
|
|
1960
1967
|
function detectAvailableOcr() {
|
|
1961
1968
|
for (const cli of CLI_PRIORITY) {
|
|
1962
1969
|
if (isCliInstalled(cli)) return cli;
|
|
@@ -1972,6 +1979,14 @@ function isCliInstalled(name) {
|
|
|
1972
1979
|
return false;
|
|
1973
1980
|
}
|
|
1974
1981
|
}
|
|
1982
|
+
function getAutoFallbackChain() {
|
|
1983
|
+
const chain = [];
|
|
1984
|
+
for (const cli of CLI_PRIORITY) {
|
|
1985
|
+
if (isCliInstalled(cli)) chain.push(cli);
|
|
1986
|
+
}
|
|
1987
|
+
chain.push("tesseract");
|
|
1988
|
+
return chain;
|
|
1989
|
+
}
|
|
1975
1990
|
function validateOcrMode(mode) {
|
|
1976
1991
|
if (mode === "auto" || mode === "off" || mode === "tesseract") return;
|
|
1977
1992
|
if (!isCliInstalled(mode)) {
|
|
@@ -2678,29 +2693,44 @@ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warn
|
|
|
2678
2693
|
let processed = 0;
|
|
2679
2694
|
const batchTasks = pageBatches.map((batchPageNums, batchIdx) => async () => {
|
|
2680
2695
|
const pageBlocks = [];
|
|
2696
|
+
const batchImages = [];
|
|
2681
2697
|
try {
|
|
2682
|
-
const batchImages = [];
|
|
2683
2698
|
for (const pageNum of batchPageNums) {
|
|
2684
2699
|
const page = await doc.getPage(pageNum);
|
|
2685
2700
|
const image = await renderPageToPng(page);
|
|
2686
2701
|
batchImages.push({ image, pageNum });
|
|
2687
2702
|
}
|
|
2688
|
-
|
|
2689
|
-
|
|
2690
|
-
const result = results.get(pageNum);
|
|
2691
|
-
pageBlocks.push({
|
|
2692
|
-
pageNum,
|
|
2693
|
-
blocks: result ? ocrResultToBlocks(result, pageNum) : []
|
|
2694
|
-
});
|
|
2695
|
-
}
|
|
2696
|
-
} catch (err) {
|
|
2697
|
-
const range = `${batchPageNums[0]}-${batchPageNums[batchPageNums.length - 1]}`;
|
|
2698
|
-
warnings?.push({
|
|
2699
|
-
message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
|
|
2700
|
-
code: "OCR_PAGE_FAILED"
|
|
2701
|
-
});
|
|
2703
|
+
} catch (renderErr) {
|
|
2704
|
+
const rendered = new Set(batchImages.map((b) => b.pageNum));
|
|
2702
2705
|
for (const pageNum of batchPageNums) {
|
|
2703
|
-
pageBlocks.push({ pageNum, blocks: [] });
|
|
2706
|
+
if (!rendered.has(pageNum)) pageBlocks.push({ pageNum, blocks: [] });
|
|
2707
|
+
}
|
|
2708
|
+
}
|
|
2709
|
+
if (batchImages.length > 0) {
|
|
2710
|
+
try {
|
|
2711
|
+
const results = await provider.processBatch(batchImages);
|
|
2712
|
+
for (const { pageNum } of batchImages) {
|
|
2713
|
+
const result = results.get(pageNum);
|
|
2714
|
+
pageBlocks.push({
|
|
2715
|
+
pageNum,
|
|
2716
|
+
blocks: result ? ocrResultToBlocks(result, pageNum) : []
|
|
2717
|
+
});
|
|
2718
|
+
}
|
|
2719
|
+
} catch (err) {
|
|
2720
|
+
const range = `${batchPageNums[0]}-${batchPageNums[batchPageNums.length - 1]}`;
|
|
2721
|
+
warnings?.push({
|
|
2722
|
+
message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"} \u2014 \uB2E8\uC77C \uD398\uC774\uC9C0\uB85C \uC7AC\uC2DC\uB3C4`,
|
|
2723
|
+
code: "OCR_PAGE_FAILED"
|
|
2724
|
+
});
|
|
2725
|
+
for (const { image, pageNum } of batchImages) {
|
|
2726
|
+
try {
|
|
2727
|
+
const singleResult = await provider.processBatch([{ image, pageNum }]);
|
|
2728
|
+
const r = singleResult.get(pageNum);
|
|
2729
|
+
pageBlocks.push({ pageNum, blocks: r ? ocrResultToBlocks(r, pageNum) : [] });
|
|
2730
|
+
} catch {
|
|
2731
|
+
pageBlocks.push({ pageNum, blocks: [] });
|
|
2732
|
+
}
|
|
2733
|
+
}
|
|
2704
2734
|
}
|
|
2705
2735
|
}
|
|
2706
2736
|
processed += batchPageNums.length;
|
|
@@ -2810,7 +2840,7 @@ var import_jszip2 = __toESM(require("jszip"), 1);
|
|
|
2810
2840
|
var import_xmldom = require("@xmldom/xmldom");
|
|
2811
2841
|
|
|
2812
2842
|
// src/utils.ts
|
|
2813
|
-
var VERSION = true ? "2.4.3" : "0.0.0-dev";
|
|
2843
|
+
var VERSION = true ? "2.4.5" : "0.0.0-dev";
|
|
2814
2844
|
function toArrayBuffer(buf) {
|
|
2815
2845
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
2816
2846
|
return buf.buffer;
|
|
@@ -6362,53 +6392,86 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6362
6392
|
}
|
|
6363
6393
|
const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
|
|
6364
6394
|
if (isImageBased) {
|
|
6365
|
-
let ocrProvider = options?.ocr ?? null;
|
|
6366
6395
|
const ocrMode = options?.ocrMode ?? "auto";
|
|
6367
|
-
|
|
6368
|
-
|
|
6369
|
-
|
|
6370
|
-
|
|
6371
|
-
const batchSize = options?.ocrBatchSize;
|
|
6372
|
-
ocrProvider = await resolveOcrProvider2(ocrMode, warnings, concurrency, batchSize);
|
|
6373
|
-
} catch (resolveErr) {
|
|
6374
|
-
if (ocrMode !== "auto") {
|
|
6375
|
-
throw Object.assign(
|
|
6376
|
-
new KordocError(resolveErr instanceof Error ? resolveErr.message : "OCR \uD504\uB85C\uBC14\uC774\uB354 \uCD08\uAE30\uD654 \uC2E4\uD328"),
|
|
6377
|
-
{ isImageBased: true }
|
|
6378
|
-
);
|
|
6379
|
-
}
|
|
6380
|
-
}
|
|
6396
|
+
const concurrency = options?.ocrConcurrency ?? 1;
|
|
6397
|
+
const batchSize = options?.ocrBatchSize;
|
|
6398
|
+
if (ocrMode === "off") {
|
|
6399
|
+
throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
|
|
6381
6400
|
}
|
|
6382
|
-
|
|
6383
|
-
|
|
6401
|
+
const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
|
|
6402
|
+
const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
|
|
6403
|
+
const tryProvider = async (provider, filter) => {
|
|
6384
6404
|
try {
|
|
6385
|
-
|
|
6386
|
-
const concurrency = options?.ocrConcurrency ?? 1;
|
|
6387
|
-
ocrBlocks = await ocrPages2(doc, ocrProvider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
|
|
6405
|
+
return await ocrPages2(doc, provider, filter, effectivePageCount, warnings, concurrency, options?.onProgress);
|
|
6388
6406
|
} catch {
|
|
6407
|
+
return [];
|
|
6389
6408
|
} finally {
|
|
6390
|
-
const terminable = ocrProvider;
|
|
6409
|
+
const terminable = provider;
|
|
6391
6410
|
if (typeof terminable.terminate === "function") {
|
|
6392
6411
|
await terminable.terminate().catch(() => {
|
|
6393
6412
|
});
|
|
6394
6413
|
}
|
|
6395
6414
|
}
|
|
6396
|
-
|
|
6397
|
-
|
|
6398
|
-
|
|
6399
|
-
|
|
6400
|
-
|
|
6401
|
-
|
|
6402
|
-
|
|
6403
|
-
|
|
6404
|
-
|
|
6415
|
+
};
|
|
6416
|
+
let ocrBlocks = [];
|
|
6417
|
+
if (options?.ocr) {
|
|
6418
|
+
ocrBlocks = await tryProvider(options.ocr, pageFilter);
|
|
6419
|
+
} else if (ocrMode === "auto") {
|
|
6420
|
+
const { getAutoFallbackChain: getAutoFallbackChain2 } = await Promise.resolve().then(() => (init_auto_detect(), auto_detect_exports));
|
|
6421
|
+
const pendingPages = /* @__PURE__ */ new Set();
|
|
6422
|
+
for (let i = 1; i <= effectivePageCount; i++) {
|
|
6423
|
+
if (!pageFilter || pageFilter.has(i)) pendingPages.add(i);
|
|
6424
|
+
}
|
|
6425
|
+
const allOcrBlocks = [];
|
|
6426
|
+
for (const mode of getAutoFallbackChain2()) {
|
|
6427
|
+
if (pendingPages.size === 0) break;
|
|
6428
|
+
try {
|
|
6429
|
+
const modeFilter = pendingPages.size < effectivePageCount ? new Set(pendingPages) : pageFilter;
|
|
6430
|
+
const provider = await resolveOcrProvider2(mode, warnings, concurrency, batchSize);
|
|
6431
|
+
const blocks2 = await tryProvider(provider, modeFilter);
|
|
6432
|
+
if (blocks2.length > 0) {
|
|
6433
|
+
for (const b of blocks2) {
|
|
6434
|
+
if (b.pageNumber !== void 0) pendingPages.delete(b.pageNumber);
|
|
6435
|
+
}
|
|
6436
|
+
for (const b of blocks2) allOcrBlocks.push(b);
|
|
6437
|
+
if (pendingPages.size > 0) {
|
|
6438
|
+
warnings.push({
|
|
6439
|
+
message: `OCR: '${mode}' \uC644\uB8CC (${pendingPages.size}\uD398\uC774\uC9C0 \uBBF8\uCC98\uB9AC \u2192 \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC7AC\uC2DC\uB3C4)`,
|
|
6440
|
+
code: "OCR_CLI_FALLBACK"
|
|
6441
|
+
});
|
|
6442
|
+
}
|
|
6443
|
+
} else {
|
|
6444
|
+
warnings.push({ message: `OCR: '${mode}' \uACB0\uACFC \uC5C6\uC74C, \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC2DC\uB3C4`, code: "OCR_CLI_FALLBACK" });
|
|
6445
|
+
}
|
|
6446
|
+
} catch {
|
|
6447
|
+
}
|
|
6448
|
+
}
|
|
6449
|
+
ocrBlocks = allOcrBlocks;
|
|
6450
|
+
} else {
|
|
6451
|
+
try {
|
|
6452
|
+
const provider = await resolveOcrProvider2(ocrMode, warnings, concurrency, batchSize);
|
|
6453
|
+
ocrBlocks = await tryProvider(provider, pageFilter);
|
|
6454
|
+
} catch (resolveErr) {
|
|
6455
|
+
throw Object.assign(
|
|
6456
|
+
new KordocError(resolveErr instanceof Error ? resolveErr.message : "OCR \uD504\uB85C\uBC14\uC774\uB354 \uCD08\uAE30\uD654 \uC2E4\uD328"),
|
|
6457
|
+
{ isImageBased: true }
|
|
6458
|
+
);
|
|
6405
6459
|
}
|
|
6406
6460
|
}
|
|
6407
|
-
if (
|
|
6408
|
-
|
|
6461
|
+
if (ocrBlocks.length > 0) {
|
|
6462
|
+
const ocrMarkdown = blocksToMarkdown(ocrBlocks);
|
|
6463
|
+
return {
|
|
6464
|
+
markdown: ocrMarkdown,
|
|
6465
|
+
blocks: ocrBlocks,
|
|
6466
|
+
metadata,
|
|
6467
|
+
warnings: warnings.length > 0 ? warnings : void 0,
|
|
6468
|
+
isImageBased: true
|
|
6469
|
+
};
|
|
6409
6470
|
}
|
|
6410
|
-
|
|
6411
|
-
|
|
6471
|
+
throw Object.assign(
|
|
6472
|
+
new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF \u2014 OCR \uC2E4\uD328 (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`),
|
|
6473
|
+
{ isImageBased: true }
|
|
6474
|
+
);
|
|
6412
6475
|
}
|
|
6413
6476
|
if (options?.removeHeaderFooter !== false && parsedPageCount >= 3) {
|
|
6414
6477
|
const removed = removeHeaderFooterBlocks(blocks, pageHeights, warnings);
|