@clazic/kordoc 2.2.6 → 2.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-BU42ZFTN.js → chunk-6KLTURMA.js} +2 -2
- package/dist/{chunk-KHUTUB7G.js → chunk-FC6BQOWD.js} +8 -6
- package/dist/chunk-FC6BQOWD.js.map +1 -0
- package/dist/cli.js +10 -6
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +111 -24
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +7 -0
- package/dist/index.d.ts +7 -0
- package/dist/index.js +111 -24
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +2 -2
- package/dist/{provider-EPHXUWRL.js → provider-I3XGSVL6.js} +62 -17
- package/dist/provider-I3XGSVL6.js.map +1 -0
- package/dist/{resolve-77C5OWLO.js → resolve-UFUJEPCJ.js} +14 -6
- package/dist/resolve-UFUJEPCJ.js.map +1 -0
- package/dist/tesseract-provider-WCVJWBUT.js +56 -0
- package/dist/tesseract-provider-WCVJWBUT.js.map +1 -0
- package/dist/{utils-PGXOUDRW.js → utils-JRBHPKTC.js} +2 -2
- package/dist/{watch-VDBYOMEJ.js → watch-JANDW746.js} +3 -3
- package/package.json +1 -1
- package/dist/chunk-KHUTUB7G.js.map +0 -1
- package/dist/provider-EPHXUWRL.js.map +0 -1
- package/dist/resolve-77C5OWLO.js.map +0 -1
- package/dist/tesseract-provider-UNJOI25M.js +0 -24
- package/dist/tesseract-provider-UNJOI25M.js.map +0 -1
- /package/dist/{chunk-BU42ZFTN.js.map → chunk-6KLTURMA.js.map} +0 -0
- /package/dist/{utils-PGXOUDRW.js.map → utils-JRBHPKTC.js.map} +0 -0
- /package/dist/{watch-VDBYOMEJ.js.map → watch-JANDW746.js.map} +0 -0
package/dist/index.d.cts
CHANGED
|
@@ -110,6 +110,13 @@ interface ParseOptions {
|
|
|
110
110
|
* - undefined: 라이브러리 API 기존 동작 유지 (자동 탐색 안 함)
|
|
111
111
|
*/
|
|
112
112
|
ocrMode?: OcrMode;
|
|
113
|
+
/**
|
|
114
|
+
* OCR 병렬 처리 수.
|
|
115
|
+
* - tesseract: 기본값은 CPU 코어 수 (병렬 처리로 속도 향상)
|
|
116
|
+
* - CLI 제공 프로바이더(gemini/claude/codex): 기본 1 (rate limit 보호)
|
|
117
|
+
* - 1: 순차 처리 (기존 동작)
|
|
118
|
+
*/
|
|
119
|
+
ocrConcurrency?: number;
|
|
113
120
|
/** 진행률 콜백 — current: 현재 페이지/섹션, total: 전체 수 */
|
|
114
121
|
onProgress?: (current: number, total: number) => void;
|
|
115
122
|
/** PDF 머리글/바닥글 자동 제거 */
|
package/dist/index.d.ts
CHANGED
|
@@ -110,6 +110,13 @@ interface ParseOptions {
|
|
|
110
110
|
* - undefined: 라이브러리 API 기존 동작 유지 (자동 탐색 안 함)
|
|
111
111
|
*/
|
|
112
112
|
ocrMode?: OcrMode;
|
|
113
|
+
/**
|
|
114
|
+
* OCR 병렬 처리 수.
|
|
115
|
+
* - tesseract: 기본값은 CPU 코어 수 (병렬 처리로 속도 향상)
|
|
116
|
+
* - CLI 제공 프로바이더(gemini/claude/codex): 기본 1 (rate limit 보호)
|
|
117
|
+
* - 1: 순차 처리 (기존 동작)
|
|
118
|
+
*/
|
|
119
|
+
ocrConcurrency?: number;
|
|
113
120
|
/** 진행률 콜백 — current: 현재 페이지/섹션, total: 전체 수 */
|
|
114
121
|
onProgress?: (current: number, total: number) => void;
|
|
115
122
|
/** PDF 머리글/바닥글 자동 제거 */
|
package/dist/index.js
CHANGED
|
@@ -2051,8 +2051,10 @@ function callCli(mode, imagePath) {
|
|
|
2051
2051
|
const args = buildCliArgs(mode, imagePath);
|
|
2052
2052
|
const result = spawnSync(mode, args, {
|
|
2053
2053
|
encoding: "utf-8",
|
|
2054
|
-
timeout:
|
|
2055
|
-
maxBuffer: 10 * 1024 * 1024
|
|
2054
|
+
timeout: 18e4,
|
|
2055
|
+
maxBuffer: 10 * 1024 * 1024,
|
|
2056
|
+
// claude: /tmp에서 실행하여 프로젝트 CLAUDE.md의 규칙 간섭 방지
|
|
2057
|
+
...mode === "claude" ? { cwd: tmpdir() } : {}
|
|
2056
2058
|
});
|
|
2057
2059
|
if (result.error) {
|
|
2058
2060
|
throw new Error(`${mode} CLI \uC2E4\uD589 \uC2E4\uD328: ${result.error.message}`);
|
|
@@ -2158,6 +2160,7 @@ var init_cli_provider = __esm({
|
|
|
2158
2160
|
// src/ocr/tesseract-provider.ts
|
|
2159
2161
|
var tesseract_provider_exports = {};
|
|
2160
2162
|
__export(tesseract_provider_exports, {
|
|
2163
|
+
createTesseractPoolProvider: () => createTesseractPoolProvider,
|
|
2161
2164
|
createTesseractProvider: () => createTesseractProvider
|
|
2162
2165
|
});
|
|
2163
2166
|
import { createWorker } from "tesseract.js";
|
|
@@ -2176,6 +2179,37 @@ async function createTesseractProvider() {
|
|
|
2176
2179
|
};
|
|
2177
2180
|
return provider;
|
|
2178
2181
|
}
|
|
2182
|
+
async function createTesseractPoolProvider(concurrency) {
|
|
2183
|
+
const workers = await Promise.all(
|
|
2184
|
+
Array.from({ length: concurrency }, () => createWorker("kor+eng"))
|
|
2185
|
+
);
|
|
2186
|
+
const idle = [...workers];
|
|
2187
|
+
const waitQueue = [];
|
|
2188
|
+
function acquire() {
|
|
2189
|
+
if (idle.length > 0) return Promise.resolve(idle.pop());
|
|
2190
|
+
return new Promise((resolve) => waitQueue.push(resolve));
|
|
2191
|
+
}
|
|
2192
|
+
function release(w) {
|
|
2193
|
+
if (waitQueue.length > 0) {
|
|
2194
|
+
waitQueue.shift()(w);
|
|
2195
|
+
} else {
|
|
2196
|
+
idle.push(w);
|
|
2197
|
+
}
|
|
2198
|
+
}
|
|
2199
|
+
const provider = async (pageImage, _pageNumber, _mimeType) => {
|
|
2200
|
+
const w = await acquire();
|
|
2201
|
+
try {
|
|
2202
|
+
const { data } = await w.recognize(pageImage);
|
|
2203
|
+
return data.text;
|
|
2204
|
+
} finally {
|
|
2205
|
+
release(w);
|
|
2206
|
+
}
|
|
2207
|
+
};
|
|
2208
|
+
provider.terminate = async () => {
|
|
2209
|
+
await Promise.all(workers.map((w) => w.terminate()));
|
|
2210
|
+
};
|
|
2211
|
+
return provider;
|
|
2212
|
+
}
|
|
2179
2213
|
var init_tesseract_provider = __esm({
|
|
2180
2214
|
"src/ocr/tesseract-provider.ts"() {
|
|
2181
2215
|
"use strict";
|
|
@@ -2187,14 +2221,17 @@ var resolve_exports = {};
|
|
|
2187
2221
|
__export(resolve_exports, {
|
|
2188
2222
|
resolveOcrProvider: () => resolveOcrProvider
|
|
2189
2223
|
});
|
|
2190
|
-
async function resolveOcrProvider(mode, warnings) {
|
|
2224
|
+
async function resolveOcrProvider(mode, warnings, concurrency) {
|
|
2191
2225
|
if (mode === "off") {
|
|
2192
2226
|
throw new Error("OCR\uC774 \uBE44\uD65C\uC131\uD654\uB418\uC5B4 \uC788\uC2B5\uB2C8\uB2E4 (--ocr off).");
|
|
2193
2227
|
}
|
|
2194
2228
|
if (mode !== "auto") {
|
|
2195
2229
|
validateOcrMode(mode);
|
|
2196
2230
|
if (mode === "tesseract") {
|
|
2197
|
-
const { createTesseractProvider: createTesseractProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
|
|
2231
|
+
const { createTesseractProvider: createTesseractProvider2, createTesseractPoolProvider: createTesseractPoolProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
|
|
2232
|
+
if (concurrency && concurrency > 1) {
|
|
2233
|
+
return createTesseractPoolProvider2(concurrency);
|
|
2234
|
+
}
|
|
2198
2235
|
return createTesseractProvider2();
|
|
2199
2236
|
}
|
|
2200
2237
|
return createCliOcrProvider(mode);
|
|
@@ -2214,7 +2251,10 @@ async function resolveOcrProvider(mode, warnings) {
|
|
|
2214
2251
|
}
|
|
2215
2252
|
}
|
|
2216
2253
|
if (detected === "tesseract") {
|
|
2217
|
-
const { createTesseractProvider: createTesseractProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
|
|
2254
|
+
const { createTesseractProvider: createTesseractProvider2, createTesseractPoolProvider: createTesseractPoolProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
|
|
2255
|
+
if (concurrency && concurrency > 1) {
|
|
2256
|
+
return createTesseractPoolProvider2(concurrency);
|
|
2257
|
+
}
|
|
2218
2258
|
return createTesseractProvider2();
|
|
2219
2259
|
}
|
|
2220
2260
|
return createCliOcrProvider(detected);
|
|
@@ -2352,32 +2392,77 @@ var provider_exports = {};
|
|
|
2352
2392
|
__export(provider_exports, {
|
|
2353
2393
|
ocrPages: () => ocrPages
|
|
2354
2394
|
});
|
|
2355
|
-
async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings) {
|
|
2395
|
+
async function runWithConcurrency(tasks, limit) {
|
|
2396
|
+
const results = new Array(tasks.length);
|
|
2397
|
+
let nextIndex = 0;
|
|
2398
|
+
async function worker() {
|
|
2399
|
+
while (nextIndex < tasks.length) {
|
|
2400
|
+
const idx = nextIndex++;
|
|
2401
|
+
results[idx] = await tasks[idx]();
|
|
2402
|
+
}
|
|
2403
|
+
}
|
|
2404
|
+
await Promise.all(Array.from({ length: Math.min(limit, tasks.length) }, () => worker()));
|
|
2405
|
+
return results;
|
|
2406
|
+
}
|
|
2407
|
+
function ocrResultToBlocks(result, pageNum) {
|
|
2408
|
+
const pageBlocks = [];
|
|
2409
|
+
if (typeof result === "string") {
|
|
2410
|
+
if (result.trim()) {
|
|
2411
|
+
pageBlocks.push({ type: "paragraph", text: result.trim(), pageNumber: pageNum });
|
|
2412
|
+
}
|
|
2413
|
+
} else if (result && typeof result === "object" && "markdown" in result) {
|
|
2414
|
+
const structured = result;
|
|
2415
|
+
if (structured.markdown.trim()) {
|
|
2416
|
+
const converted = markdownToBlocks(structured.markdown, pageNum);
|
|
2417
|
+
for (const b of converted) pageBlocks.push(b);
|
|
2418
|
+
}
|
|
2419
|
+
}
|
|
2420
|
+
return pageBlocks;
|
|
2421
|
+
}
|
|
2422
|
+
async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1) {
|
|
2356
2423
|
const blocks = [];
|
|
2424
|
+
if (concurrency <= 1) {
|
|
2425
|
+
for (let i = 1; i <= effectivePageCount; i++) {
|
|
2426
|
+
if (pageFilter && !pageFilter.has(i)) continue;
|
|
2427
|
+
const page = await doc.getPage(i);
|
|
2428
|
+
try {
|
|
2429
|
+
const imageData = await renderPageToPng(page);
|
|
2430
|
+
const result = await provider(imageData, i, "image/png");
|
|
2431
|
+
for (const b of ocrResultToBlocks(result, i)) blocks.push(b);
|
|
2432
|
+
} catch (err) {
|
|
2433
|
+
warnings?.push({
|
|
2434
|
+
page: i,
|
|
2435
|
+
message: `\uD398\uC774\uC9C0 ${i} OCR \uC2E4\uD328: ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
|
|
2436
|
+
code: "OCR_PAGE_FAILED"
|
|
2437
|
+
});
|
|
2438
|
+
}
|
|
2439
|
+
}
|
|
2440
|
+
return blocks;
|
|
2441
|
+
}
|
|
2442
|
+
const pageNumbers = [];
|
|
2357
2443
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
2358
2444
|
if (pageFilter && !pageFilter.has(i)) continue;
|
|
2359
|
-
const page = await doc.getPage(i);
|
|
2445
|
+
pageNumbers.push(i);
|
|
2446
|
+
}
|
|
2447
|
+
const tasks = pageNumbers.map((pageNum) => async () => {
|
|
2360
2448
|
try {
|
|
2449
|
+
const page = await doc.getPage(pageNum);
|
|
2361
2450
|
const imageData = await renderPageToPng(page);
|
|
2362
|
-
const result = await provider(imageData, i, "image/png");
|
|
2363
|
-
if (typeof result === "string") {
|
|
2364
|
-
if (result.trim()) {
|
|
2365
|
-
blocks.push({ type: "paragraph", text: result.trim(), pageNumber: i });
|
|
2366
|
-
}
|
|
2367
|
-
} else if (result && typeof result === "object" && "markdown" in result) {
|
|
2368
|
-
const structured = result;
|
|
2369
|
-
if (structured.markdown.trim()) {
|
|
2370
|
-
const pageBlocks = markdownToBlocks(structured.markdown, i);
|
|
2371
|
-
for (const b of pageBlocks) blocks.push(b);
|
|
2372
|
-
}
|
|
2373
|
-
}
|
|
2451
|
+
const result = await provider(imageData, pageNum, "image/png");
|
|
2452
|
+
return { pageNum, pageBlocks: ocrResultToBlocks(result, pageNum) };
|
|
2374
2453
|
} catch (err) {
|
|
2375
2454
|
warnings?.push({
|
|
2376
|
-
page: i,
|
|
2377
|
-
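message: `\uD398\uC774\uC9C0 ${i} OCR \uC2E4\uD328: ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,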
|
|
2455
|
+
page: pageNum,
|
|
2456
|
+
message: `\uD398\uC774\uC9C0 ${pageNum} OCR \uC2E4\uD328: ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
|
|
2378
2457
|
code: "OCR_PAGE_FAILED"
|
|
2379
2458
|
});
|
|
2459
|
+
return null;
|
|
2380
2460
|
}
|
|
2461
|
+
});
|
|
2462
|
+
const taskResults = await runWithConcurrency(tasks, concurrency);
|
|
2463
|
+
for (const item of taskResults) {
|
|
2464
|
+
if (!item) continue;
|
|
2465
|
+
for (const b of item.pageBlocks) blocks.push(b);
|
|
2381
2466
|
}
|
|
2382
2467
|
return blocks;
|
|
2383
2468
|
}
|
|
@@ -2446,7 +2531,7 @@ import JSZip2 from "jszip";
|
|
|
2446
2531
|
import { DOMParser } from "@xmldom/xmldom";
|
|
2447
2532
|
|
|
2448
2533
|
// src/utils.ts
|
|
2449
|
-
var VERSION = true ? "2.2.6" : "0.0.0-dev";
|
|
2534
|
+
var VERSION = true ? "2.2.8" : "0.0.0-dev";
|
|
2450
2535
|
function toArrayBuffer(buf) {
|
|
2451
2536
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
2452
2537
|
return buf.buffer;
|
|
@@ -5963,7 +6048,8 @@ async function parsePdfDocument(buffer, options) {
|
|
|
5963
6048
|
if (!ocrProvider && ocrMode && ocrMode !== "off") {
|
|
5964
6049
|
try {
|
|
5965
6050
|
const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
|
|
5966
|
-
ocrProvider = await resolveOcrProvider2(ocrMode, warnings);
|
|
6051
|
+
const concurrency = options?.ocrConcurrency ?? 1;
|
|
6052
|
+
ocrProvider = await resolveOcrProvider2(ocrMode, warnings, concurrency);
|
|
5967
6053
|
} catch (resolveErr) {
|
|
5968
6054
|
if (ocrMode !== "auto") {
|
|
5969
6055
|
throw Object.assign(
|
|
@@ -5976,7 +6062,8 @@ async function parsePdfDocument(buffer, options) {
|
|
|
5976
6062
|
if (ocrProvider) {
|
|
5977
6063
|
try {
|
|
5978
6064
|
const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
|
|
5979
|
-
const ocrBlocks = await ocrPages2(doc, ocrProvider, pageFilter, effectivePageCount, warnings);
|
|
6065
|
+
const concurrency = options?.ocrConcurrency ?? 1;
|
|
6066
|
+
const ocrBlocks = await ocrPages2(doc, ocrProvider, pageFilter, effectivePageCount, warnings, concurrency);
|
|
5980
6067
|
if (ocrBlocks.length > 0) {
|
|
5981
6068
|
const ocrMarkdown = blocksToMarkdown(ocrBlocks);
|
|
5982
6069
|
return {
|