@clazic/kordoc 2.2.7 → 2.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-3CNYQD23.js → chunk-FF5M4SDK.js} +2 -2
- package/dist/{chunk-UFIRSH5G.js → chunk-OL2NDK3E.js} +25 -16
- package/dist/chunk-OL2NDK3E.js.map +1 -0
- package/dist/cli.js +13 -6
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +124 -32
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +7 -0
- package/dist/index.d.ts +7 -0
- package/dist/index.js +124 -32
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +6 -5
- package/dist/mcp.js.map +1 -1
- package/dist/{provider-EPHXUWRL.js → provider-I3XGSVL6.js} +62 -17
- package/dist/provider-I3XGSVL6.js.map +1 -0
- package/dist/{resolve-NYKB5P3U.js → resolve-UFUJEPCJ.js} +10 -4
- package/dist/{resolve-NYKB5P3U.js.map → resolve-UFUJEPCJ.js.map} +1 -1
- package/dist/tesseract-provider-WCVJWBUT.js +56 -0
- package/dist/tesseract-provider-WCVJWBUT.js.map +1 -0
- package/dist/{utils-3EDZ5QEH.js → utils-CU26KLDC.js} +2 -2
- package/dist/{watch-BDL7I557.js → watch-Z6SH4KRB.js} +3 -3
- package/package.json +1 -1
- package/dist/chunk-UFIRSH5G.js.map +0 -1
- package/dist/provider-EPHXUWRL.js.map +0 -1
- package/dist/tesseract-provider-UNJOI25M.js +0 -24
- package/dist/tesseract-provider-UNJOI25M.js.map +0 -1
- /package/dist/{chunk-3CNYQD23.js.map → chunk-FF5M4SDK.js.map} +0 -0
- /package/dist/{utils-3EDZ5QEH.js.map → utils-CU26KLDC.js.map} +0 -0
- /package/dist/{watch-BDL7I557.js.map → watch-Z6SH4KRB.js.map} +0 -0
package/dist/index.d.cts
CHANGED
|
@@ -110,6 +110,13 @@ interface ParseOptions {
|
|
|
110
110
|
* - undefined: 라이브러리 API 기존 동작 유지 (자동 탐색 안 함)
|
|
111
111
|
*/
|
|
112
112
|
ocrMode?: OcrMode;
|
|
113
|
+
/**
|
|
114
|
+
* OCR 병렬 처리 수.
|
|
115
|
+
* - tesseract: 기본값은 CPU 코어 수 (병렬 처리로 속도 향상)
|
|
116
|
+
* - CLI 제공 프로바이더(gemini/claude/codex): 기본 1 (rate limit 보호)
|
|
117
|
+
* - 1: 순차 처리 (기존 동작)
|
|
118
|
+
*/
|
|
119
|
+
ocrConcurrency?: number;
|
|
113
120
|
/** 진행률 콜백 — current: 현재 페이지/섹션, total: 전체 수 */
|
|
114
121
|
onProgress?: (current: number, total: number) => void;
|
|
115
122
|
/** PDF 머리글/바닥글 자동 제거 */
|
package/dist/index.d.ts
CHANGED
|
@@ -110,6 +110,13 @@ interface ParseOptions {
|
|
|
110
110
|
* - undefined: 라이브러리 API 기존 동작 유지 (자동 탐색 안 함)
|
|
111
111
|
*/
|
|
112
112
|
ocrMode?: OcrMode;
|
|
113
|
+
/**
|
|
114
|
+
* OCR 병렬 처리 수.
|
|
115
|
+
* - tesseract: 기본값은 CPU 코어 수 (병렬 처리로 속도 향상)
|
|
116
|
+
* - CLI 제공 프로바이더(gemini/claude/codex): 기본 1 (rate limit 보호)
|
|
117
|
+
* - 1: 순차 처리 (기존 동작)
|
|
118
|
+
*/
|
|
119
|
+
ocrConcurrency?: number;
|
|
113
120
|
/** 진행률 콜백 — current: 현재 페이지/섹션, total: 전체 수 */
|
|
114
121
|
onProgress?: (current: number, total: number) => void;
|
|
115
122
|
/** PDF 머리글/바닥글 자동 제거 */
|
package/dist/index.js
CHANGED
|
@@ -2160,6 +2160,7 @@ var init_cli_provider = __esm({
|
|
|
2160
2160
|
// src/ocr/tesseract-provider.ts
|
|
2161
2161
|
var tesseract_provider_exports = {};
|
|
2162
2162
|
__export(tesseract_provider_exports, {
|
|
2163
|
+
createTesseractPoolProvider: () => createTesseractPoolProvider,
|
|
2163
2164
|
createTesseractProvider: () => createTesseractProvider
|
|
2164
2165
|
});
|
|
2165
2166
|
import { createWorker } from "tesseract.js";
|
|
@@ -2178,6 +2179,37 @@ async function createTesseractProvider() {
|
|
|
2178
2179
|
};
|
|
2179
2180
|
return provider;
|
|
2180
2181
|
}
|
|
2182
|
+
/**
 * Creates an OCR provider backed by a pool of tesseract.js workers.
 *
 * Each call to the returned provider borrows one worker, runs `recognize`
 * on the page image, and hands the worker back to the pool (or directly to
 * the next waiting caller). When every worker is busy, callers queue up in
 * FIFO order.
 *
 * @param concurrency Requested number of workers. Values that are not a
 *   finite number >= 1 (0, negatives, NaN, undefined, Infinity) fall back to
 *   a single worker; fractional values are rounded down.
 * @returns An async function `(pageImage, pageNumber, mimeType) => text`
 *   with an extra `terminate()` method that shuts down every worker.
 * @throws The first worker start-up error, after the workers that did start
 *   have been terminated.
 */
async function createTesseractPoolProvider(concurrency) {
  // An empty pool would make every acquire() wait forever, so always
  // start at least one worker.
  const requested = Math.floor(Number(concurrency));
  const size = Number.isFinite(requested) && requested >= 1 ? requested : 1;

  const workers = [];
  let startFailed = false;
  let startError;
  await Promise.all(
    Array.from({ length: size }, async () => {
      try {
        workers.push(await createWorker("kor+eng"));
      } catch (err) {
        // Remember only the first failure; keep waiting for the siblings so
        // that every successfully started worker can be cleaned up below.
        if (!startFailed) {
          startFailed = true;
          startError = err;
        }
      }
    })
  );
  if (startFailed) {
    // Do not leak the workers that did start when a sibling failed.
    await Promise.all(
      workers.map(async (w) => {
        try {
          await w.terminate();
        } catch {
          // Best-effort cleanup: the start-up error is the one worth reporting.
        }
      })
    );
    throw startError;
  }

  const idle = [...workers];
  const waitQueue = [];

  // Hands out an idle worker immediately, or parks the caller until one is released.
  function acquire() {
    if (idle.length > 0) return Promise.resolve(idle.pop());
    return new Promise((resolve) => waitQueue.push(resolve));
  }

  // Passes the worker straight to the oldest waiter, or returns it to the idle list.
  function release(w) {
    if (waitQueue.length > 0) {
      waitQueue.shift()(w);
    } else {
      idle.push(w);
    }
  }

  const provider = async (pageImage, _pageNumber, _mimeType) => {
    const w = await acquire();
    try {
      const { data } = await w.recognize(pageImage);
      return data.text;
    } finally {
      // Always give the worker back, even when recognition throws.
      release(w);
    }
  };
  provider.terminate = async () => {
    await Promise.all(workers.map((w) => w.terminate()));
  };
  return provider;
}
|
|
2181
2213
|
var init_tesseract_provider = __esm({
|
|
2182
2214
|
"src/ocr/tesseract-provider.ts"() {
|
|
2183
2215
|
"use strict";
|
|
@@ -2189,14 +2221,17 @@ var resolve_exports = {};
|
|
|
2189
2221
|
__export(resolve_exports, {
|
|
2190
2222
|
resolveOcrProvider: () => resolveOcrProvider
|
|
2191
2223
|
});
|
|
2192
|
-
async function resolveOcrProvider(mode, warnings) {
|
|
2224
|
+
async function resolveOcrProvider(mode, warnings, concurrency) {
|
|
2193
2225
|
if (mode === "off") {
|
|
2194
2226
|
throw new Error("OCR\uC774 \uBE44\uD65C\uC131\uD654\uB418\uC5B4 \uC788\uC2B5\uB2C8\uB2E4 (--ocr off).");
|
|
2195
2227
|
}
|
|
2196
2228
|
if (mode !== "auto") {
|
|
2197
2229
|
validateOcrMode(mode);
|
|
2198
2230
|
if (mode === "tesseract") {
|
|
2199
|
-
const { createTesseractProvider: createTesseractProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
|
|
2231
|
+
const { createTesseractProvider: createTesseractProvider2, createTesseractPoolProvider: createTesseractPoolProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
|
|
2232
|
+
if (concurrency && concurrency > 1) {
|
|
2233
|
+
return createTesseractPoolProvider2(concurrency);
|
|
2234
|
+
}
|
|
2200
2235
|
return createTesseractProvider2();
|
|
2201
2236
|
}
|
|
2202
2237
|
return createCliOcrProvider(mode);
|
|
@@ -2216,7 +2251,10 @@ async function resolveOcrProvider(mode, warnings) {
|
|
|
2216
2251
|
}
|
|
2217
2252
|
}
|
|
2218
2253
|
if (detected === "tesseract") {
|
|
2219
|
-
const { createTesseractProvider: createTesseractProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
|
|
2254
|
+
const { createTesseractProvider: createTesseractProvider2, createTesseractPoolProvider: createTesseractPoolProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
|
|
2255
|
+
if (concurrency && concurrency > 1) {
|
|
2256
|
+
return createTesseractPoolProvider2(concurrency);
|
|
2257
|
+
}
|
|
2220
2258
|
return createTesseractProvider2();
|
|
2221
2259
|
}
|
|
2222
2260
|
return createCliOcrProvider(detected);
|
|
@@ -2354,32 +2392,77 @@ var provider_exports = {};
|
|
|
2354
2392
|
__export(provider_exports, {
|
|
2355
2393
|
ocrPages: () => ocrPages
|
|
2356
2394
|
});
|
|
2357
|
-
async function
|
|
2395
|
+
async function runWithConcurrency(tasks, limit) {
|
|
2396
|
+
const results = new Array(tasks.length);
|
|
2397
|
+
let nextIndex = 0;
|
|
2398
|
+
async function worker() {
|
|
2399
|
+
while (nextIndex < tasks.length) {
|
|
2400
|
+
const idx = nextIndex++;
|
|
2401
|
+
results[idx] = await tasks[idx]();
|
|
2402
|
+
}
|
|
2403
|
+
}
|
|
2404
|
+
await Promise.all(Array.from({ length: Math.min(limit, tasks.length) }, () => worker()));
|
|
2405
|
+
return results;
|
|
2406
|
+
}
|
|
2407
|
+
/**
 * Converts the OCR output of a single page into document blocks.
 *
 * - A string result becomes one trimmed "paragraph" block (nothing if the
 *   string is blank).
 * - An object carrying a `markdown` property is passed through
 *   `markdownToBlocks` (nothing if the markdown is blank).
 * - Anything else yields an empty array.
 *
 * @param result Value returned by the OCR provider for the page.
 * @param pageNum Page number recorded on the produced blocks.
 * @returns A new array of blocks for this page.
 */
function ocrResultToBlocks(result, pageNum) {
  if (typeof result === "string") {
    const text = result.trim();
    return text ? [{ type: "paragraph", text, pageNumber: pageNum }] : [];
  }
  const isStructured = result && typeof result === "object" && "markdown" in result;
  if (!isStructured || !result.markdown.trim()) {
    return [];
  }
  // Copy into a fresh array so the caller never shares the converter's array.
  return [...markdownToBlocks(result.markdown, pageNum)];
}
|
|
2422
|
+
/**
 * Runs OCR over the selected pages of a PDF document and returns the
 * resulting blocks in page order.
 *
 * Every per-page step (loading the page, rendering it to PNG, calling the
 * provider, converting the result) runs inside one try/catch, so a failure on
 * one page is recorded as an `OCR_PAGE_FAILED` warning and the remaining
 * pages are still processed. This holds for both sequential and parallel
 * runs.
 *
 * @param doc Document object exposing `getPage(pageNumber)`.
 * @param provider OCR provider called as `(imageData, pageNumber, "image/png")`.
 * @param pageFilter Optional set-like object; when given, only page numbers
 *   for which `has(n)` is true are processed.
 * @param effectivePageCount Highest page number to consider (pages are 1-based).
 * @param warnings Optional array that receives per-page failure warnings.
 * @param concurrency Maximum number of pages processed at once. Values that
 *   are not a number >= 1 (0, negatives, NaN) mean sequential processing;
 *   fractional values are rounded down.
 * @returns Blocks of all successfully processed pages, ordered by page number.
 */
async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1) {
  // NaN fails every comparison, so `requested >= 1` also filters it out and
  // the run falls back to sequential instead of silently doing nothing.
  const requested = Math.floor(Number(concurrency));
  const limit = requested >= 1 ? requested : 1;

  const pageNumbers = [];
  for (let pageNum = 1; pageNum <= effectivePageCount; pageNum++) {
    if (!pageFilter || pageFilter.has(pageNum)) pageNumbers.push(pageNum);
  }

  const tasks = pageNumbers.map((pageNum) => async () => {
    try {
      const page = await doc.getPage(pageNum);
      const imageData = await renderPageToPng(page);
      const result = await provider(imageData, pageNum, "image/png");
      return ocrResultToBlocks(result, pageNum);
    } catch (err) {
      warnings?.push({
        page: pageNum,
        message: `\uD398\uC774\uC9C0 ${pageNum} OCR \uC2E4\uD328: ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
        code: "OCR_PAGE_FAILED"
      });
      // A null entry marks a failed page; it is skipped when flattening.
      return null;
    }
  });

  // Results come back indexed like `tasks`, so flattening them in order
  // keeps the blocks in page order regardless of which page finished first.
  const perPage = await runWithConcurrency(tasks, limit);
  const blocks = [];
  for (const pageBlocks of perPage) {
    if (!pageBlocks) continue;
    for (const b of pageBlocks) blocks.push(b);
  }
  return blocks;
}
|
|
@@ -2448,7 +2531,7 @@ import JSZip2 from "jszip";
|
|
|
2448
2531
|
import { DOMParser } from "@xmldom/xmldom";
|
|
2449
2532
|
|
|
2450
2533
|
// src/utils.ts
|
|
2451
|
-
var VERSION = true ? "2.2.
|
|
2534
|
+
var VERSION = true ? "2.2.9" : "0.0.0-dev";
|
|
2452
2535
|
function toArrayBuffer(buf) {
|
|
2453
2536
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
2454
2537
|
return buf.buffer;
|
|
@@ -5965,7 +6048,8 @@ async function parsePdfDocument(buffer, options) {
|
|
|
5965
6048
|
if (!ocrProvider && ocrMode && ocrMode !== "off") {
|
|
5966
6049
|
try {
|
|
5967
6050
|
const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
|
|
5968
|
-
|
|
6051
|
+
const concurrency = options?.ocrConcurrency ?? 1;
|
|
6052
|
+
ocrProvider = await resolveOcrProvider2(ocrMode, warnings, concurrency);
|
|
5969
6053
|
} catch (resolveErr) {
|
|
5970
6054
|
if (ocrMode !== "auto") {
|
|
5971
6055
|
throw Object.assign(
|
|
@@ -5976,20 +6060,28 @@ async function parsePdfDocument(buffer, options) {
|
|
|
5976
6060
|
}
|
|
5977
6061
|
}
|
|
5978
6062
|
if (ocrProvider) {
|
|
6063
|
+
let ocrBlocks = [];
|
|
5979
6064
|
try {
|
|
5980
6065
|
const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
|
|
5981
|
-
const
|
|
5982
|
-
|
|
5983
|
-
const ocrMarkdown = blocksToMarkdown(ocrBlocks);
|
|
5984
|
-
return {
|
|
5985
|
-
markdown: ocrMarkdown,
|
|
5986
|
-
blocks: ocrBlocks,
|
|
5987
|
-
metadata,
|
|
5988
|
-
warnings: warnings.length > 0 ? warnings : void 0,
|
|
5989
|
-
isImageBased: true
|
|
5990
|
-
};
|
|
5991
|
-
}
|
|
6066
|
+
const concurrency = options?.ocrConcurrency ?? 1;
|
|
6067
|
+
ocrBlocks = await ocrPages2(doc, ocrProvider, pageFilter, effectivePageCount, warnings, concurrency);
|
|
5992
6068
|
} catch {
|
|
6069
|
+
} finally {
|
|
6070
|
+
const terminable = ocrProvider;
|
|
6071
|
+
if (typeof terminable.terminate === "function") {
|
|
6072
|
+
await terminable.terminate().catch(() => {
|
|
6073
|
+
});
|
|
6074
|
+
}
|
|
6075
|
+
}
|
|
6076
|
+
if (ocrBlocks.length > 0) {
|
|
6077
|
+
const ocrMarkdown = blocksToMarkdown(ocrBlocks);
|
|
6078
|
+
return {
|
|
6079
|
+
markdown: ocrMarkdown,
|
|
6080
|
+
blocks: ocrBlocks,
|
|
6081
|
+
metadata,
|
|
6082
|
+
warnings: warnings.length > 0 ? warnings : void 0,
|
|
6083
|
+
isImageBased: true
|
|
6084
|
+
};
|
|
5993
6085
|
}
|
|
5994
6086
|
}
|
|
5995
6087
|
if (ocrMode === "off") {
|