@clazic/kordoc 2.4.2 → 2.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/auto-detect-2YGFYQCN.js +15 -0
- package/dist/{chunk-I3HO5HLQ.js → chunk-5AXJRBBK.js} +71 -44
- package/dist/chunk-5AXJRBBK.js.map +1 -0
- package/dist/chunk-7NOZFYH6.js +63 -0
- package/dist/chunk-7NOZFYH6.js.map +1 -0
- package/dist/{chunk-CMZPKEJ7.js → chunk-KEDUF24M.js} +2 -2
- package/dist/cli.js +6 -6
- package/dist/index.cjs +84 -42
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +84 -42
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +3 -3
- package/dist/resolve-TZVGVOVD.js +70 -0
- package/dist/resolve-TZVGVOVD.js.map +1 -0
- package/dist/{utils-BRQCU3AW.js → utils-BB2CDSTB.js} +2 -2
- package/dist/utils-BB2CDSTB.js.map +1 -0
- package/dist/{watch-SWG6JGKP.js → watch-6QVK32X7.js} +4 -4
- package/package.json +1 -1
- package/dist/chunk-I3HO5HLQ.js.map +0 -1
- package/dist/resolve-QA3VACUP.js +0 -111
- package/dist/resolve-QA3VACUP.js.map +0 -1
- /package/dist/{utils-BRQCU3AW.js.map → auto-detect-2YGFYQCN.js.map} +0 -0
- /package/dist/{chunk-CMZPKEJ7.js.map → chunk-KEDUF24M.js.map} +0 -0
- /package/dist/{watch-SWG6JGKP.js.map → watch-6QVK32X7.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -1961,6 +1961,13 @@ var require_cfb = __commonJS({
|
|
|
1961
1961
|
});
|
|
1962
1962
|
|
|
1963
1963
|
// src/ocr/auto-detect.ts
|
|
1964
|
+
var auto_detect_exports = {};
|
|
1965
|
+
__export(auto_detect_exports, {
|
|
1966
|
+
detectAvailableOcr: () => detectAvailableOcr,
|
|
1967
|
+
getAutoFallbackChain: () => getAutoFallbackChain,
|
|
1968
|
+
getTesseractFallbackMessage: () => getTesseractFallbackMessage,
|
|
1969
|
+
validateOcrMode: () => validateOcrMode
|
|
1970
|
+
});
|
|
1964
1971
|
import { execSync } from "child_process";
|
|
1965
1972
|
function detectAvailableOcr() {
|
|
1966
1973
|
for (const cli of CLI_PRIORITY) {
|
|
@@ -1977,6 +1984,14 @@ function isCliInstalled(name) {
|
|
|
1977
1984
|
return false;
|
|
1978
1985
|
}
|
|
1979
1986
|
}
|
|
1987
|
+
function getAutoFallbackChain() {
|
|
1988
|
+
const chain = [];
|
|
1989
|
+
for (const cli of CLI_PRIORITY) {
|
|
1990
|
+
if (isCliInstalled(cli)) chain.push(cli);
|
|
1991
|
+
}
|
|
1992
|
+
chain.push("tesseract");
|
|
1993
|
+
return chain;
|
|
1994
|
+
}
|
|
1980
1995
|
function validateOcrMode(mode) {
|
|
1981
1996
|
if (mode === "auto" || mode === "off" || mode === "tesseract") return;
|
|
1982
1997
|
if (!isCliInstalled(mode)) {
|
|
@@ -2425,7 +2440,7 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
|
|
|
2425
2440
|
return createCliOcrProvider(mode);
|
|
2426
2441
|
}
|
|
2427
2442
|
const detected = detectAvailableOcr();
|
|
2428
|
-
if (detected !== "
|
|
2443
|
+
if (detected !== "codex") {
|
|
2429
2444
|
if (detected === "tesseract") {
|
|
2430
2445
|
warnings?.push({
|
|
2431
2446
|
message: getTesseractFallbackMessage(),
|
|
@@ -2433,7 +2448,7 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
|
|
|
2433
2448
|
});
|
|
2434
2449
|
} else {
|
|
2435
2450
|
warnings?.push({
|
|
2436
|
-
message: `OCR: '${detected}' \uC0AC\uC6A9 \uC911 (
|
|
2451
|
+
message: `OCR: '${detected}' \uC0AC\uC6A9 \uC911 (codex CLI\uAC00 \uC5C6\uC5B4 fallback). \uB354 \uB098\uC740 \uD488\uC9C8\uC744 \uC704\uD574 codex CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4.`,
|
|
2437
2452
|
code: "OCR_CLI_FALLBACK"
|
|
2438
2453
|
});
|
|
2439
2454
|
}
|
|
@@ -2790,7 +2805,7 @@ import JSZip2 from "jszip";
|
|
|
2790
2805
|
import { DOMParser } from "@xmldom/xmldom";
|
|
2791
2806
|
|
|
2792
2807
|
// src/utils.ts
|
|
2793
|
-
var VERSION = true ? "2.4.
|
|
2808
|
+
var VERSION = true ? "2.4.4" : "0.0.0-dev";
|
|
2794
2809
|
function toArrayBuffer(buf) {
|
|
2795
2810
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
2796
2811
|
return buf.buffer;
|
|
@@ -6318,66 +6333,93 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6318
6333
|
warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
|
|
6319
6334
|
}
|
|
6320
6335
|
};
|
|
6321
|
-
const
|
|
6322
|
-
|
|
6336
|
+
const SAMPLE_SIZE = Math.min(10, targetPageNums.length);
|
|
6337
|
+
const sampledIndices = /* @__PURE__ */ new Set();
|
|
6338
|
+
if (targetPageNums.length <= SAMPLE_SIZE) {
|
|
6339
|
+
for (let i = 0; i < targetPageNums.length; i++) sampledIndices.add(i);
|
|
6340
|
+
} else {
|
|
6341
|
+
for (let i = 0; i < SAMPLE_SIZE; i++) {
|
|
6342
|
+
const idx = Math.round(i * (targetPageNums.length - 1) / (SAMPLE_SIZE - 1));
|
|
6343
|
+
sampledIndices.add(idx);
|
|
6344
|
+
}
|
|
6345
|
+
}
|
|
6346
|
+
for (const si of sampledIndices) {
|
|
6323
6347
|
await parseSinglePage(targetPageNums[si]);
|
|
6324
6348
|
}
|
|
6325
|
-
const sampleParsed = parsedPages ||
|
|
6349
|
+
const sampleParsed = parsedPages || sampledIndices.size;
|
|
6326
6350
|
const isImageBased = totalChars / Math.max(sampleParsed, 1) < 10;
|
|
6327
6351
|
if (!isImageBased) {
|
|
6328
|
-
for (let si =
|
|
6329
|
-
|
|
6352
|
+
for (let si = 0; si < targetPageNums.length; si++) {
|
|
6353
|
+
if (!sampledIndices.has(si)) {
|
|
6354
|
+
await parseSinglePage(targetPageNums[si]);
|
|
6355
|
+
}
|
|
6330
6356
|
}
|
|
6331
6357
|
}
|
|
6332
6358
|
const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
|
|
6333
6359
|
if (isImageBased) {
|
|
6334
|
-
let ocrProvider = options?.ocr ?? null;
|
|
6335
6360
|
const ocrMode = options?.ocrMode ?? "auto";
|
|
6336
|
-
|
|
6337
|
-
|
|
6338
|
-
|
|
6339
|
-
|
|
6340
|
-
const batchSize = options?.ocrBatchSize;
|
|
6341
|
-
ocrProvider = await resolveOcrProvider2(ocrMode, warnings, concurrency, batchSize);
|
|
6342
|
-
} catch (resolveErr) {
|
|
6343
|
-
if (ocrMode !== "auto") {
|
|
6344
|
-
throw Object.assign(
|
|
6345
|
-
new KordocError(resolveErr instanceof Error ? resolveErr.message : "OCR \uD504\uB85C\uBC14\uC774\uB354 \uCD08\uAE30\uD654 \uC2E4\uD328"),
|
|
6346
|
-
{ isImageBased: true }
|
|
6347
|
-
);
|
|
6348
|
-
}
|
|
6349
|
-
}
|
|
6361
|
+
const concurrency = options?.ocrConcurrency ?? 1;
|
|
6362
|
+
const batchSize = options?.ocrBatchSize;
|
|
6363
|
+
if (ocrMode === "off") {
|
|
6364
|
+
throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
|
|
6350
6365
|
}
|
|
6351
|
-
|
|
6352
|
-
|
|
6366
|
+
const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
|
|
6367
|
+
const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
|
|
6368
|
+
const tryProvider = async (provider) => {
|
|
6353
6369
|
try {
|
|
6354
|
-
|
|
6355
|
-
const concurrency = options?.ocrConcurrency ?? 1;
|
|
6356
|
-
ocrBlocks = await ocrPages2(doc, ocrProvider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
|
|
6370
|
+
return await ocrPages2(doc, provider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
|
|
6357
6371
|
} catch {
|
|
6372
|
+
return [];
|
|
6358
6373
|
} finally {
|
|
6359
|
-
const terminable =
|
|
6374
|
+
const terminable = provider;
|
|
6360
6375
|
if (typeof terminable.terminate === "function") {
|
|
6361
6376
|
await terminable.terminate().catch(() => {
|
|
6362
6377
|
});
|
|
6363
6378
|
}
|
|
6364
6379
|
}
|
|
6365
|
-
|
|
6366
|
-
|
|
6367
|
-
|
|
6368
|
-
|
|
6369
|
-
|
|
6370
|
-
|
|
6371
|
-
|
|
6372
|
-
|
|
6373
|
-
|
|
6380
|
+
};
|
|
6381
|
+
let ocrBlocks = [];
|
|
6382
|
+
if (options?.ocr) {
|
|
6383
|
+
ocrBlocks = await tryProvider(options.ocr);
|
|
6384
|
+
} else if (ocrMode === "auto") {
|
|
6385
|
+
const { getAutoFallbackChain: getAutoFallbackChain2 } = await Promise.resolve().then(() => (init_auto_detect(), auto_detect_exports));
|
|
6386
|
+
for (const mode of getAutoFallbackChain2()) {
|
|
6387
|
+
try {
|
|
6388
|
+
const provider = await resolveOcrProvider2(mode, warnings, concurrency, batchSize);
|
|
6389
|
+
const blocks2 = await tryProvider(provider);
|
|
6390
|
+
if (blocks2.length > 0) {
|
|
6391
|
+
ocrBlocks = blocks2;
|
|
6392
|
+
break;
|
|
6393
|
+
}
|
|
6394
|
+
warnings.push({ message: `OCR: '${mode}' \uACB0\uACFC \uC5C6\uC74C, \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC2DC\uB3C4`, code: "OCR_CLI_FALLBACK" });
|
|
6395
|
+
} catch {
|
|
6396
|
+
}
|
|
6397
|
+
}
|
|
6398
|
+
} else {
|
|
6399
|
+
try {
|
|
6400
|
+
const provider = await resolveOcrProvider2(ocrMode, warnings, concurrency, batchSize);
|
|
6401
|
+
ocrBlocks = await tryProvider(provider);
|
|
6402
|
+
} catch (resolveErr) {
|
|
6403
|
+
throw Object.assign(
|
|
6404
|
+
new KordocError(resolveErr instanceof Error ? resolveErr.message : "OCR \uD504\uB85C\uBC14\uC774\uB354 \uCD08\uAE30\uD654 \uC2E4\uD328"),
|
|
6405
|
+
{ isImageBased: true }
|
|
6406
|
+
);
|
|
6374
6407
|
}
|
|
6375
6408
|
}
|
|
6376
|
-
if (
|
|
6377
|
-
|
|
6409
|
+
if (ocrBlocks.length > 0) {
|
|
6410
|
+
const ocrMarkdown = blocksToMarkdown(ocrBlocks);
|
|
6411
|
+
return {
|
|
6412
|
+
markdown: ocrMarkdown,
|
|
6413
|
+
blocks: ocrBlocks,
|
|
6414
|
+
metadata,
|
|
6415
|
+
warnings: warnings.length > 0 ? warnings : void 0,
|
|
6416
|
+
isImageBased: true
|
|
6417
|
+
};
|
|
6378
6418
|
}
|
|
6379
|
-
|
|
6380
|
-
|
|
6419
|
+
throw Object.assign(
|
|
6420
|
+
new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF \u2014 OCR \uC2E4\uD328 (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`),
|
|
6421
|
+
{ isImageBased: true }
|
|
6422
|
+
);
|
|
6381
6423
|
}
|
|
6382
6424
|
if (options?.removeHeaderFooter !== false && parsedPageCount >= 3) {
|
|
6383
6425
|
const removed = removeHeaderFooterBlocks(blocks, pageHeights, warnings);
|