@clazic/kordoc 2.4.4 → 2.4.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-KEDUF24M.js → chunk-A2FNPGBS.js} +2 -2
- package/dist/{chunk-5AXJRBBK.js → chunk-L2CLLZ4S.js} +29 -11
- package/dist/chunk-L2CLLZ4S.js.map +1 -0
- package/dist/cli.js +5 -5
- package/dist/index.cjs +58 -25
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +58 -25
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +2 -2
- package/dist/{provider-HE727F7Z.js → provider-7F7NEDTN.js} +32 -17
- package/dist/provider-7F7NEDTN.js.map +1 -0
- package/dist/{utils-BB2CDSTB.js → utils-RQ4S2RVN.js} +2 -2
- package/dist/{watch-6QVK32X7.js → watch-3EIG5EVL.js} +3 -3
- package/package.json +1 -1
- package/dist/chunk-5AXJRBBK.js.map +0 -1
- package/dist/provider-HE727F7Z.js.map +0 -1
- /package/dist/{chunk-KEDUF24M.js.map → chunk-A2FNPGBS.js.map} +0 -0
- /package/dist/{utils-BB2CDSTB.js.map → utils-RQ4S2RVN.js.map} +0 -0
- /package/dist/{watch-6QVK32X7.js.map → watch-3EIG5EVL.js.map} +0 -0
package/dist/cli.js
CHANGED
|
@@ -4,12 +4,12 @@ import {
|
|
|
4
4
|
markdownToHwpx,
|
|
5
5
|
markdownToXlsx,
|
|
6
6
|
parse
|
|
7
|
-
} from "./chunk-5AXJRBBK.js";
|
|
7
|
+
} from "./chunk-L2CLLZ4S.js";
|
|
8
8
|
import "./chunk-4PP34NVQ.js";
|
|
9
9
|
import {
|
|
10
10
|
VERSION,
|
|
11
11
|
toArrayBuffer
|
|
12
|
-
} from "./chunk-KEDUF24M.js";
|
|
12
|
+
} from "./chunk-A2FNPGBS.js";
|
|
13
13
|
import "./chunk-MOL7MDBG.js";
|
|
14
14
|
import "./chunk-7FMKAV4P.js";
|
|
15
15
|
import "./chunk-JOGAFNIL.js";
|
|
@@ -137,7 +137,7 @@ async function runParse(files, opts) {
|
|
|
137
137
|
saveImages(absPath);
|
|
138
138
|
}
|
|
139
139
|
} catch (err) {
|
|
140
|
-
const { sanitizeError } = await import("./utils-BB2CDSTB.js");
|
|
140
|
+
const { sanitizeError } = await import("./utils-RQ4S2RVN.js");
|
|
141
141
|
process.stderr.write(`
|
|
142
142
|
[kordoc] ERROR: ${fileName} \u2014 ${sanitizeError(err)}
|
|
143
143
|
`);
|
|
@@ -221,7 +221,7 @@ program.command("convert <input>").description("\uB9C8\uD06C\uB2E4\uC6B4 \uD30C\
|
|
|
221
221
|
`));
|
|
222
222
|
}
|
|
223
223
|
} catch (err) {
|
|
224
|
-
const { sanitizeError } = await import("./utils-BB2CDSTB.js");
|
|
224
|
+
const { sanitizeError } = await import("./utils-RQ4S2RVN.js");
|
|
225
225
|
process.stderr.write(` FAIL
|
|
226
226
|
`);
|
|
227
227
|
process.stderr.write(` \u2192 ${sanitizeError(err)}
|
|
@@ -230,7 +230,7 @@ program.command("convert <input>").description("\uB9C8\uD06C\uB2E4\uC6B4 \uD30C\
|
|
|
230
230
|
}
|
|
231
231
|
});
|
|
232
232
|
program.command("watch <dir>").description("\uB514\uB809\uD1A0\uB9AC \uAC10\uC2DC \u2014 \uC0C8 \uBB38\uC11C \uC790\uB3D9 \uBCC0\uD658").option("--webhook <url>", "\uACB0\uACFC \uC804\uC1A1 \uC6F9\uD6C5 URL").option("-d, --out-dir <dir>", "\uBCC0\uD658 \uACB0\uACFC \uCD9C\uB825 \uB514\uB809\uD1A0\uB9AC").option("-p, --pages <range>", "\uD398\uC774\uC9C0/\uC139\uC158 \uBC94\uC704").option("--format <type>", "\uCD9C\uB825 \uD615\uC2DD: markdown \uB610\uB294 json", "markdown").option("--silent", "\uC9C4\uD589 \uBA54\uC2DC\uC9C0 \uC228\uAE30\uAE30").action(async (dir, opts) => {
|
|
233
|
-
const { watchDirectory } = await import("./watch-6QVK32X7.js");
|
|
233
|
+
const { watchDirectory } = await import("./watch-3EIG5EVL.js");
|
|
234
234
|
await watchDirectory({
|
|
235
235
|
dir,
|
|
236
236
|
outDir: opts.outDir,
|
package/dist/index.cjs
CHANGED
|
@@ -2693,29 +2693,44 @@ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warn
|
|
|
2693
2693
|
let processed = 0;
|
|
2694
2694
|
const batchTasks = pageBatches.map((batchPageNums, batchIdx) => async () => {
|
|
2695
2695
|
const pageBlocks = [];
|
|
2696
|
+
const batchImages = [];
|
|
2696
2697
|
try {
|
|
2697
|
-
const batchImages = [];
|
|
2698
2698
|
for (const pageNum of batchPageNums) {
|
|
2699
2699
|
const page = await doc.getPage(pageNum);
|
|
2700
2700
|
const image = await renderPageToPng(page);
|
|
2701
2701
|
batchImages.push({ image, pageNum });
|
|
2702
2702
|
}
|
|
2703
|
-
|
|
2704
|
-
|
|
2705
|
-
const result = results.get(pageNum);
|
|
2706
|
-
pageBlocks.push({
|
|
2707
|
-
pageNum,
|
|
2708
|
-
blocks: result ? ocrResultToBlocks(result, pageNum) : []
|
|
2709
|
-
});
|
|
2710
|
-
}
|
|
2711
|
-
} catch (err) {
|
|
2712
|
-
const range = `${batchPageNums[0]}-${batchPageNums[batchPageNums.length - 1]}`;
|
|
2713
|
-
warnings?.push({
|
|
2714
|
-
message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
|
|
2715
|
-
code: "OCR_PAGE_FAILED"
|
|
2716
|
-
});
|
|
2703
|
+
} catch (renderErr) {
|
|
2704
|
+
const rendered = new Set(batchImages.map((b) => b.pageNum));
|
|
2717
2705
|
for (const pageNum of batchPageNums) {
|
|
2718
|
-
pageBlocks.push({ pageNum, blocks: [] });
|
|
2706
|
+
if (!rendered.has(pageNum)) pageBlocks.push({ pageNum, blocks: [] });
|
|
2707
|
+
}
|
|
2708
|
+
}
|
|
2709
|
+
if (batchImages.length > 0) {
|
|
2710
|
+
try {
|
|
2711
|
+
const results = await provider.processBatch(batchImages);
|
|
2712
|
+
for (const { pageNum } of batchImages) {
|
|
2713
|
+
const result = results.get(pageNum);
|
|
2714
|
+
pageBlocks.push({
|
|
2715
|
+
pageNum,
|
|
2716
|
+
blocks: result ? ocrResultToBlocks(result, pageNum) : []
|
|
2717
|
+
});
|
|
2718
|
+
}
|
|
2719
|
+
} catch (err) {
|
|
2720
|
+
const range = `${batchPageNums[0]}-${batchPageNums[batchPageNums.length - 1]}`;
|
|
2721
|
+
warnings?.push({
|
|
2722
|
+
message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"} \u2014 \uB2E8\uC77C \uD398\uC774\uC9C0\uB85C \uC7AC\uC2DC\uB3C4`,
|
|
2723
|
+
code: "OCR_PAGE_FAILED"
|
|
2724
|
+
});
|
|
2725
|
+
for (const { image, pageNum } of batchImages) {
|
|
2726
|
+
try {
|
|
2727
|
+
const singleResult = await provider.processBatch([{ image, pageNum }]);
|
|
2728
|
+
const r = singleResult.get(pageNum);
|
|
2729
|
+
pageBlocks.push({ pageNum, blocks: r ? ocrResultToBlocks(r, pageNum) : [] });
|
|
2730
|
+
} catch {
|
|
2731
|
+
pageBlocks.push({ pageNum, blocks: [] });
|
|
2732
|
+
}
|
|
2733
|
+
}
|
|
2719
2734
|
}
|
|
2720
2735
|
}
|
|
2721
2736
|
processed += batchPageNums.length;
|
|
@@ -2825,7 +2840,7 @@ var import_jszip2 = __toESM(require("jszip"), 1);
|
|
|
2825
2840
|
var import_xmldom = require("@xmldom/xmldom");
|
|
2826
2841
|
|
|
2827
2842
|
// src/utils.ts
|
|
2828
|
-
var VERSION = true ? "2.4.4" : "0.0.0-dev";
|
|
2843
|
+
var VERSION = true ? "2.4.6" : "0.0.0-dev";
|
|
2829
2844
|
function toArrayBuffer(buf) {
|
|
2830
2845
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
2831
2846
|
return buf.buffer;
|
|
@@ -6385,9 +6400,9 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6385
6400
|
}
|
|
6386
6401
|
const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
|
|
6387
6402
|
const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
|
|
6388
|
-
const tryProvider = async (provider) => {
|
|
6403
|
+
const tryProvider = async (provider, filter) => {
|
|
6389
6404
|
try {
|
|
6390
|
-
return await ocrPages2(doc, provider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
|
|
6405
|
+
return await ocrPages2(doc, provider, filter, effectivePageCount, warnings, concurrency, options?.onProgress);
|
|
6391
6406
|
} catch {
|
|
6392
6407
|
return [];
|
|
6393
6408
|
} finally {
|
|
@@ -6400,25 +6415,43 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6400
6415
|
};
|
|
6401
6416
|
let ocrBlocks = [];
|
|
6402
6417
|
if (options?.ocr) {
|
|
6403
|
-
ocrBlocks = await tryProvider(options.ocr);
|
|
6418
|
+
ocrBlocks = await tryProvider(options.ocr, pageFilter);
|
|
6404
6419
|
} else if (ocrMode === "auto") {
|
|
6405
6420
|
const { getAutoFallbackChain: getAutoFallbackChain2 } = await Promise.resolve().then(() => (init_auto_detect(), auto_detect_exports));
|
|
6421
|
+
const pendingPages = /* @__PURE__ */ new Set();
|
|
6422
|
+
for (let i = 1; i <= effectivePageCount; i++) {
|
|
6423
|
+
if (!pageFilter || pageFilter.has(i)) pendingPages.add(i);
|
|
6424
|
+
}
|
|
6425
|
+
const allOcrBlocks = [];
|
|
6406
6426
|
for (const mode of getAutoFallbackChain2()) {
|
|
6427
|
+
if (pendingPages.size === 0) break;
|
|
6407
6428
|
try {
|
|
6429
|
+
const modeFilter = pendingPages.size < effectivePageCount ? new Set(pendingPages) : pageFilter;
|
|
6408
6430
|
const provider = await resolveOcrProvider2(mode, warnings, concurrency, batchSize);
|
|
6409
|
-
const blocks2 = await tryProvider(provider);
|
|
6431
|
+
const blocks2 = await tryProvider(provider, modeFilter);
|
|
6410
6432
|
if (blocks2.length > 0) {
|
|
6411
|
-
|
|
6412
|
-
|
|
6433
|
+
for (const b of blocks2) {
|
|
6434
|
+
if (b.pageNumber !== void 0) pendingPages.delete(b.pageNumber);
|
|
6435
|
+
}
|
|
6436
|
+
for (const b of blocks2) allOcrBlocks.push(b);
|
|
6437
|
+
if (pendingPages.size > 0) {
|
|
6438
|
+
warnings.push({
|
|
6439
|
+
message: `OCR: '${mode}' \uC644\uB8CC (${pendingPages.size}\uD398\uC774\uC9C0 \uBBF8\uCC98\uB9AC \u2192 \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC7AC\uC2DC\uB3C4)`,
|
|
6440
|
+
code: "OCR_CLI_FALLBACK"
|
|
6441
|
+
});
|
|
6442
|
+
}
|
|
6443
|
+
} else {
|
|
6444
|
+
warnings.push({ message: `OCR: '${mode}' \uACB0\uACFC \uC5C6\uC74C, \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC2DC\uB3C4`, code: "OCR_CLI_FALLBACK" });
|
|
6413
6445
|
}
|
|
6414
|
-
warnings.push({ message: `OCR: '${mode}' \uACB0\uACFC \uC5C6\uC74C, \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC2DC\uB3C4`, code: "OCR_CLI_FALLBACK" });
|
|
6415
6446
|
} catch {
|
|
6416
6447
|
}
|
|
6417
6448
|
}
|
|
6449
|
+
allOcrBlocks.sort((a, b) => (a.pageNumber ?? 0) - (b.pageNumber ?? 0));
|
|
6450
|
+
ocrBlocks = allOcrBlocks;
|
|
6418
6451
|
} else {
|
|
6419
6452
|
try {
|
|
6420
6453
|
const provider = await resolveOcrProvider2(ocrMode, warnings, concurrency, batchSize);
|
|
6421
|
-
ocrBlocks = await tryProvider(provider);
|
|
6454
|
+
ocrBlocks = await tryProvider(provider, pageFilter);
|
|
6422
6455
|
} catch (resolveErr) {
|
|
6423
6456
|
throw Object.assign(
|
|
6424
6457
|
new KordocError(resolveErr instanceof Error ? resolveErr.message : "OCR \uD504\uB85C\uBC14\uC774\uB354 \uCD08\uAE30\uD654 \uC2E4\uD328"),
|