@clazic/kordoc 2.4.19 → 2.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-MZN7PLTZ.js → chunk-IJGNPAK2.js} +2 -2
- package/dist/{chunk-MZN7PLTZ.js.map → chunk-IJGNPAK2.js.map} +1 -1
- package/dist/{chunk-463YQ2WL.js → chunk-QG6BYZMR.js} +2 -2
- package/dist/{chunk-463YQ2WL.js.map → chunk-QG6BYZMR.js.map} +1 -1
- package/dist/cli.js +5 -5
- package/dist/index.cjs +192 -103
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +3 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.js +192 -103
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +2 -2
- package/dist/{utils-YUAT7LFD.js → utils-RBXHHCLI.js} +2 -2
- package/dist/{watch-WEOFVVDO.js → watch-5CCMTZ7F.js} +3 -3
- package/package.json +1 -1
- /package/dist/{utils-YUAT7LFD.js.map → utils-RBXHHCLI.js.map} +0 -0
- /package/dist/{watch-WEOFVVDO.js.map → watch-5CCMTZ7F.js.map} +0 -0
package/dist/index.d.cts
CHANGED
|
@@ -421,6 +421,8 @@ interface UnifiedOcrProgressEvent {
|
|
|
421
421
|
code?: UnifiedOcrErrorCode;
|
|
422
422
|
message?: string;
|
|
423
423
|
model?: string;
|
|
424
|
+
pageNumber?: number;
|
|
425
|
+
workerCount?: number;
|
|
424
426
|
}
|
|
425
427
|
interface UnifiedOcrOptions {
|
|
426
428
|
workspaceDir?: string;
|
|
@@ -436,6 +438,7 @@ interface UnifiedOcrOptions {
|
|
|
436
438
|
probeConcurrency?: number;
|
|
437
439
|
logger?: Logger;
|
|
438
440
|
runId?: string;
|
|
441
|
+
concurrencyPerKey?: number;
|
|
439
442
|
}
|
|
440
443
|
interface UnifiedOcrResult {
|
|
441
444
|
outputPath: string;
|
package/dist/index.d.ts
CHANGED
|
@@ -421,6 +421,8 @@ interface UnifiedOcrProgressEvent {
|
|
|
421
421
|
code?: UnifiedOcrErrorCode;
|
|
422
422
|
message?: string;
|
|
423
423
|
model?: string;
|
|
424
|
+
pageNumber?: number;
|
|
425
|
+
workerCount?: number;
|
|
424
426
|
}
|
|
425
427
|
interface UnifiedOcrOptions {
|
|
426
428
|
workspaceDir?: string;
|
|
@@ -436,6 +438,7 @@ interface UnifiedOcrOptions {
|
|
|
436
438
|
probeConcurrency?: number;
|
|
437
439
|
logger?: Logger;
|
|
438
440
|
runId?: string;
|
|
441
|
+
concurrencyPerKey?: number;
|
|
439
442
|
}
|
|
440
443
|
interface UnifiedOcrResult {
|
|
441
444
|
outputPath: string;
|
package/dist/index.js
CHANGED
|
@@ -3035,7 +3035,7 @@ import JSZip2 from "jszip";
|
|
|
3035
3035
|
import { DOMParser } from "@xmldom/xmldom";
|
|
3036
3036
|
|
|
3037
3037
|
// src/utils.ts
|
|
3038
|
-
var VERSION = true ? "2.
|
|
3038
|
+
var VERSION = true ? "2.5.0" : "0.0.0-dev";
|
|
3039
3039
|
function toArrayBuffer(buf) {
|
|
3040
3040
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
3041
3041
|
return buf.buffer;
|
|
@@ -11231,6 +11231,66 @@ import { spawn as spawn2 } from "child_process";
|
|
|
11231
11231
|
import { performance } from "perf_hooks";
|
|
11232
11232
|
import libre from "libreoffice-convert";
|
|
11233
11233
|
init_logger();
|
|
11234
|
+
|
|
11235
|
+
// src/pipeline/bounded-queue.ts
|
|
11236
|
+
var QUEUE_DONE = /* @__PURE__ */ Symbol("QUEUE_DONE");
|
|
11237
|
+
var BoundedQueue = class {
|
|
11238
|
+
buffer = [];
|
|
11239
|
+
capacity;
|
|
11240
|
+
closed = false;
|
|
11241
|
+
producerWaiters = [];
|
|
11242
|
+
consumerWaiters = [];
|
|
11243
|
+
constructor(capacity) {
|
|
11244
|
+
if (capacity < 1) throw new RangeError("BoundedQueue capacity must be >= 1");
|
|
11245
|
+
this.capacity = capacity;
|
|
11246
|
+
}
|
|
11247
|
+
async enqueue(item) {
|
|
11248
|
+
if (this.closed) throw new Error("BoundedQueue: cannot enqueue after close()");
|
|
11249
|
+
if (this.consumerWaiters.length > 0) {
|
|
11250
|
+
const resolve4 = this.consumerWaiters.shift();
|
|
11251
|
+
resolve4(item);
|
|
11252
|
+
return;
|
|
11253
|
+
}
|
|
11254
|
+
while (this.buffer.length >= this.capacity) {
|
|
11255
|
+
await new Promise((resolve4) => this.producerWaiters.push(resolve4));
|
|
11256
|
+
if (this.closed) throw new Error("BoundedQueue: closed while waiting to enqueue");
|
|
11257
|
+
}
|
|
11258
|
+
this.buffer.push(item);
|
|
11259
|
+
}
|
|
11260
|
+
async dequeue() {
|
|
11261
|
+
if (this.buffer.length > 0) {
|
|
11262
|
+
const item = this.buffer.shift();
|
|
11263
|
+
this._wakeProducer();
|
|
11264
|
+
return item;
|
|
11265
|
+
}
|
|
11266
|
+
if (this.closed) return QUEUE_DONE;
|
|
11267
|
+
return new Promise((resolve4) => {
|
|
11268
|
+
this.consumerWaiters.push(resolve4);
|
|
11269
|
+
});
|
|
11270
|
+
}
|
|
11271
|
+
close() {
|
|
11272
|
+
if (this.closed) return;
|
|
11273
|
+
this.closed = true;
|
|
11274
|
+
for (const resolve4 of this.consumerWaiters) {
|
|
11275
|
+
resolve4(QUEUE_DONE);
|
|
11276
|
+
}
|
|
11277
|
+
this.consumerWaiters = [];
|
|
11278
|
+
for (const wake of this.producerWaiters) {
|
|
11279
|
+
wake();
|
|
11280
|
+
}
|
|
11281
|
+
this.producerWaiters = [];
|
|
11282
|
+
}
|
|
11283
|
+
get size() {
|
|
11284
|
+
return this.buffer.length;
|
|
11285
|
+
}
|
|
11286
|
+
_wakeProducer() {
|
|
11287
|
+
if (this.producerWaiters.length > 0) {
|
|
11288
|
+
this.producerWaiters.shift()();
|
|
11289
|
+
}
|
|
11290
|
+
}
|
|
11291
|
+
};
|
|
11292
|
+
|
|
11293
|
+
// src/pipeline/unified-ocr.ts
|
|
11234
11294
|
var libreConvert = libre.convert;
|
|
11235
11295
|
var UnifiedOcrError = class extends Error {
|
|
11236
11296
|
code;
|
|
@@ -11262,11 +11322,11 @@ var DEFAULT_MODEL_MAX_TOKENS = {
|
|
|
11262
11322
|
};
|
|
11263
11323
|
var DEFAULT_STAGE_WEIGHTS = {
|
|
11264
11324
|
convert: 15,
|
|
11265
|
-
render:
|
|
11325
|
+
render: 15,
|
|
11266
11326
|
probe: 5,
|
|
11267
|
-
ocr:
|
|
11327
|
+
ocr: 55,
|
|
11268
11328
|
proofread: 0,
|
|
11269
|
-
merge:
|
|
11329
|
+
merge: 10
|
|
11270
11330
|
};
|
|
11271
11331
|
var OCR_PROMPT2 = [
|
|
11272
11332
|
"\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD45C\uB97C \uCD94\uCD9C\uD558\uC5EC Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uACE0, OCR \uC624\uC778\uC2DD \uC624\uB958\uB97C \uC989\uC2DC \uAD50\uC815\uD558\uC5EC \uCD5C\uC885 \uACB0\uACFC\uBB3C\uC744 \uCD9C\uB825\uD558\uC138\uC694.",
|
|
@@ -11355,39 +11415,19 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11355
11415
|
logStage("info", "convert", "done", "PDF \uBCC0\uD658 \uC644\uB8CC", { elapsedMs: timingsMs.convert });
|
|
11356
11416
|
const renderStart = performance.now();
|
|
11357
11417
|
currentStage = "render";
|
|
11418
|
+
const totalPages = await getPdfPageCount(workingPdfPath).catch(() => 0);
|
|
11419
|
+
if (totalPages === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uD398\uC774\uC9C0 \uC218\uB97C \uD655\uC778\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.");
|
|
11358
11420
|
markStageStart("render", "PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC911");
|
|
11359
|
-
logStage("info", "render", "start", "PDF \uD398\uC774\uC9C0 \uB80C\uB354\uB9C1 \uC2DC\uC791", { pdf: workingPdfPath, dpi });
|
|
11360
|
-
|
|
11361
|
-
|
|
11362
|
-
|
|
11363
|
-
|
|
11364
|
-
|
|
11365
|
-
markStageProgress(
|
|
11366
|
-
"render",
|
|
11367
|
-
Math.round(current / total * 100),
|
|
11368
|
-
current,
|
|
11369
|
-
total,
|
|
11370
|
-
`\uD398\uC774\uC9C0 ${current}/${total} \uB80C\uB354\uB9C1`
|
|
11371
|
-
);
|
|
11372
|
-
}
|
|
11373
|
-
);
|
|
11374
|
-
const images = await listPageImages(imagesDir);
|
|
11375
|
-
if (images.length === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC2E4\uD328: \uACB0\uACFC \uC774\uBBF8\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.");
|
|
11376
|
-
if (!renderWithProgress.emittedPerPageProgress) {
|
|
11377
|
-
markStageProgress("render", 100, images.length, images.length, `\uD398\uC774\uC9C0 ${images.length}\uC7A5 \uC0DD\uC131`);
|
|
11378
|
-
}
|
|
11379
|
-
timingsMs.render = elapsedMs(renderStart);
|
|
11380
|
-
markStageDone("render", "\uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC");
|
|
11381
|
-
logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", {
|
|
11382
|
-
pages: images.length,
|
|
11383
|
-
elapsedMs: timingsMs.render,
|
|
11384
|
-
pageCountSource: renderWithProgress.pageCountSource
|
|
11385
|
-
});
|
|
11421
|
+
logStage("info", "render", "start", "PDF \uD398\uC774\uC9C0 \uB80C\uB354\uB9C1 \uC2DC\uC791", { pdf: workingPdfPath, dpi, totalPages });
|
|
11422
|
+
await runCommand("pdftoppm", ["-png", "-r", String(dpi), "-f", "1", "-l", "1", workingPdfPath, join4(imagesDir, "page")]);
|
|
11423
|
+
const firstFiles = (await readdir(imagesDir)).filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
|
|
11424
|
+
if (firstFiles.length === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uCCAB \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC2E4\uD328");
|
|
11425
|
+
const probeImage = join4(imagesDir, firstFiles[0]);
|
|
11426
|
+
markStageProgress("render", Math.round(1 / totalPages * 100), 1, totalPages, `\uD398\uC774\uC9C0 1/${totalPages} \uB80C\uB354\uB9C1`);
|
|
11386
11427
|
const probeStart = performance.now();
|
|
11387
11428
|
currentStage = "probe";
|
|
11388
11429
|
markStageStart("probe", "\uBAA8\uB378 \uC18D\uB3C4 \uD504\uB85C\uBE0C \uC218\uD589 \uC911");
|
|
11389
11430
|
logStage("info", "probe", "start", "\uBAA8\uB378 \uC18D\uB3C4 \uD504\uB85C\uBE0C \uC2DC\uC791", { models, probeConcurrency });
|
|
11390
|
-
const probeImage = await pickRepresentativeImage(images);
|
|
11391
11431
|
let probeDone = 0;
|
|
11392
11432
|
const probeRuns = startParallelProbeRuns({
|
|
11393
11433
|
models,
|
|
@@ -11414,33 +11454,74 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11414
11454
|
await updateModelCache(modelCachePath, results);
|
|
11415
11455
|
return results;
|
|
11416
11456
|
});
|
|
11457
|
+
const concurrencyPerKey = Math.max(1, options.concurrencyPerKey ?? 2);
|
|
11458
|
+
const keyCount = keyPool.snapshot().length;
|
|
11459
|
+
const workerCount = Math.max(1, keyCount * concurrencyPerKey);
|
|
11460
|
+
const queueCapacity = workerCount * 2;
|
|
11461
|
+
const queue = new BoundedQueue(queueCapacity);
|
|
11417
11462
|
const ocrStart = performance.now();
|
|
11418
11463
|
currentStage = "ocr";
|
|
11419
|
-
markStageStart("ocr", `OCR \uC9C4\uD589 \uC911 (${
|
|
11420
|
-
logStage("info", "ocr", "start", "\uD398\uC774\uC9C0 OCR \uC2DC\uC791", {
|
|
11421
|
-
|
|
11422
|
-
|
|
11423
|
-
|
|
11424
|
-
|
|
11425
|
-
|
|
11426
|
-
|
|
11427
|
-
|
|
11428
|
-
|
|
11429
|
-
|
|
11430
|
-
|
|
11431
|
-
|
|
11432
|
-
|
|
11464
|
+
markStageStart("ocr", `OCR \uC9C4\uD589 \uC911 (\uC6CC\uCEE4 ${workerCount}\uAC1C)`);
|
|
11465
|
+
logStage("info", "ocr", "start", "\uD398\uC774\uC9C0 OCR \uC2DC\uC791", { workerCount, keyCount, pageCount: totalPages });
|
|
11466
|
+
let renderDone = 1;
|
|
11467
|
+
const renderProducer = (async () => {
|
|
11468
|
+
try {
|
|
11469
|
+
await queue.enqueue({ pageNumber: 1, imagePath: probeImage });
|
|
11470
|
+
if (totalPages > 1) {
|
|
11471
|
+
for await (const item of renderPdfToPngStream(workingPdfPath, join4(imagesDir, "page"), dpi, totalPages, 2)) {
|
|
11472
|
+
await queue.enqueue(item);
|
|
11473
|
+
renderDone++;
|
|
11474
|
+
markStageProgress("render", Math.round(renderDone / totalPages * 100), renderDone, totalPages, `\uD398\uC774\uC9C0 ${renderDone}/${totalPages} \uB80C\uB354\uB9C1`);
|
|
11475
|
+
logStage("debug", "render", "progress", "\uD398\uC774\uC9C0 \uB80C\uB354 \uC644\uB8CC", { page: item.pageNumber });
|
|
11476
|
+
}
|
|
11477
|
+
}
|
|
11478
|
+
} finally {
|
|
11479
|
+
queue.close();
|
|
11480
|
+
timingsMs.render = elapsedMs(renderStart);
|
|
11481
|
+
markStageDone("render", "\uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC");
|
|
11482
|
+
logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", { pages: renderDone, elapsedMs: timingsMs.render });
|
|
11483
|
+
}
|
|
11484
|
+
})();
|
|
11485
|
+
const [, pageResultsMap] = await Promise.all([
|
|
11486
|
+
renderProducer,
|
|
11487
|
+
ocrWorkerPool({
|
|
11488
|
+
queue,
|
|
11489
|
+
workerCount,
|
|
11490
|
+
totalPages,
|
|
11491
|
+
ocrInput: {
|
|
11492
|
+
prompt: OCR_PROMPT2,
|
|
11493
|
+
models: fallbackModelOrder,
|
|
11494
|
+
modelMaxTokens,
|
|
11495
|
+
baseUrl,
|
|
11496
|
+
keyPool,
|
|
11497
|
+
timeoutMs,
|
|
11498
|
+
maxRetriesPerPage,
|
|
11499
|
+
logger
|
|
11500
|
+
},
|
|
11501
|
+
onPageDone: (pageNumber, completedCount, model) => {
|
|
11502
|
+
markStageProgress(
|
|
11503
|
+
"ocr",
|
|
11504
|
+
Math.round(completedCount / totalPages * 100),
|
|
11505
|
+
completedCount,
|
|
11506
|
+
totalPages,
|
|
11507
|
+
`OCR ${completedCount}/${totalPages}`,
|
|
11508
|
+
model || void 0
|
|
11509
|
+
);
|
|
11510
|
+
logStage("debug", "ocr", "progress", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { page: pageNumber, total: totalPages, model });
|
|
11511
|
+
},
|
|
11433
11512
|
logger
|
|
11434
|
-
})
|
|
11435
|
-
|
|
11436
|
-
await writeFile(pagePath, markdown, "utf-8");
|
|
11437
|
-
rawPagePaths.push(pagePath);
|
|
11438
|
-
markStageProgress("ocr", Math.round((i + 1) / images.length * 100), i + 1, images.length, `OCR ${i + 1}/${images.length}`, selectedModel);
|
|
11439
|
-
logStage("debug", "ocr", "progress", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { page: i + 1, total: images.length });
|
|
11440
|
-
}
|
|
11513
|
+
})
|
|
11514
|
+
]);
|
|
11441
11515
|
timingsMs.ocr = elapsedMs(ocrStart);
|
|
11442
11516
|
markStageDone("ocr", "OCR \uC644\uB8CC");
|
|
11443
11517
|
logStage("info", "ocr", "done", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { elapsedMs: timingsMs.ocr });
|
|
11518
|
+
const sortedEntries = Array.from(pageResultsMap.entries()).sort((a, b) => a[0] - b[0]);
|
|
11519
|
+
const rawPagePaths = [];
|
|
11520
|
+
for (const [pageNum, markdown] of sortedEntries) {
|
|
11521
|
+
const pagePath = join4(rawDir, `page_${String(pageNum).padStart(4, "0")}.md`);
|
|
11522
|
+
await writeFile(pagePath, markdown, "utf-8");
|
|
11523
|
+
rawPagePaths.push(pagePath);
|
|
11524
|
+
}
|
|
11444
11525
|
const mergeStart = performance.now();
|
|
11445
11526
|
currentStage = "merge";
|
|
11446
11527
|
markStageStart("merge", "\uCD5C\uC885 Markdown \uBCD1\uD569 \uC911");
|
|
@@ -11457,7 +11538,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11457
11538
|
selectedModel,
|
|
11458
11539
|
probeImage,
|
|
11459
11540
|
probeResults: await probeResultsPromise,
|
|
11460
|
-
pageCount:
|
|
11541
|
+
pageCount: totalPages,
|
|
11461
11542
|
keyHealth: keyPool.snapshot(),
|
|
11462
11543
|
timingsMs,
|
|
11463
11544
|
modelCachePath
|
|
@@ -11535,13 +11616,6 @@ async function convertWithLibreOffice(buffer, ext) {
|
|
|
11535
11616
|
});
|
|
11536
11617
|
});
|
|
11537
11618
|
}
|
|
11538
|
-
async function renderPdfToPng(pdfPath, prefixPath, dpi) {
|
|
11539
|
-
try {
|
|
11540
|
-
await runCommand("pdftoppm", ["-png", "-r", String(dpi), pdfPath, prefixPath]);
|
|
11541
|
-
} catch (err) {
|
|
11542
|
-
throw new UnifiedOcrError("RENDER_FAILED", "render", err instanceof Error ? err.message : String(err));
|
|
11543
|
-
}
|
|
11544
|
-
}
|
|
11545
11619
|
async function getPdfPageCount(pdfPath) {
|
|
11546
11620
|
const stdout = await runCommandWithStdout("pdfinfo", [pdfPath]);
|
|
11547
11621
|
const m = stdout.match(/^\s*Pages:\s*(\d+)\s*$/mi);
|
|
@@ -11554,36 +11628,33 @@ async function getPdfPageCount(pdfPath) {
|
|
|
11554
11628
|
}
|
|
11555
11629
|
return n;
|
|
11556
11630
|
}
|
|
11557
|
-
async function
|
|
11558
|
-
|
|
11559
|
-
|
|
11560
|
-
totalPages = await getPdfPageCount(pdfPath);
|
|
11561
|
-
} catch {
|
|
11562
|
-
totalPages = 0;
|
|
11563
|
-
}
|
|
11564
|
-
if (totalPages > 0) {
|
|
11631
|
+
async function* renderPdfToPngStream(pdfPath, prefixPath, dpi, totalPages, startPage = 1) {
|
|
11632
|
+
const imagesDir = dirname3(prefixPath);
|
|
11633
|
+
for (let page = startPage; page <= totalPages; page++) {
|
|
11565
11634
|
try {
|
|
11566
|
-
|
|
11567
|
-
|
|
11568
|
-
|
|
11569
|
-
|
|
11570
|
-
|
|
11571
|
-
|
|
11572
|
-
|
|
11573
|
-
|
|
11574
|
-
|
|
11575
|
-
|
|
11576
|
-
|
|
11577
|
-
|
|
11578
|
-
|
|
11579
|
-
|
|
11580
|
-
|
|
11635
|
+
await runCommand("pdftoppm", [
|
|
11636
|
+
"-png",
|
|
11637
|
+
"-r",
|
|
11638
|
+
String(dpi),
|
|
11639
|
+
"-f",
|
|
11640
|
+
String(page),
|
|
11641
|
+
"-l",
|
|
11642
|
+
String(page),
|
|
11643
|
+
pdfPath,
|
|
11644
|
+
prefixPath
|
|
11645
|
+
]);
|
|
11646
|
+
const files = await readdir(imagesDir);
|
|
11647
|
+
const pageFiles = files.filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
|
|
11648
|
+
const imagePath = join4(imagesDir, pageFiles[pageFiles.length - 1]);
|
|
11649
|
+
yield { pageNumber: page, imagePath };
|
|
11581
11650
|
} catch (err) {
|
|
11582
|
-
|
|
11651
|
+
yield {
|
|
11652
|
+
pageNumber: page,
|
|
11653
|
+
imagePath: null,
|
|
11654
|
+
error: err instanceof Error ? err : new Error(String(err))
|
|
11655
|
+
};
|
|
11583
11656
|
}
|
|
11584
11657
|
}
|
|
11585
|
-
await renderPdfToPng(pdfPath, prefixPath, dpi);
|
|
11586
|
-
return { emittedPerPageProgress: false, pageCountSource: "fallback" };
|
|
11587
11658
|
}
|
|
11588
11659
|
async function runCommand(cmd, args) {
|
|
11589
11660
|
await new Promise((resolvePromise, reject) => {
|
|
@@ -11624,26 +11695,11 @@ async function assertSofficeAvailable() {
|
|
|
11624
11695
|
throw new UnifiedOcrError("SOFFICE_NOT_FOUND", "convert", "soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4. LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694.");
|
|
11625
11696
|
}
|
|
11626
11697
|
}
|
|
11627
|
-
async function listPageImages(imagesDir) {
|
|
11628
|
-
const files = await readdir(imagesDir);
|
|
11629
|
-
return files.filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b)).map((f) => join4(imagesDir, f));
|
|
11630
|
-
}
|
|
11631
11698
|
function naturalPageSort(a, b) {
|
|
11632
11699
|
const na = Number((a.match(/\d+/g) || ["0"]).at(-1) || 0);
|
|
11633
11700
|
const nb = Number((b.match(/\d+/g) || ["0"]).at(-1) || 0);
|
|
11634
11701
|
return na - nb;
|
|
11635
11702
|
}
|
|
11636
|
-
async function pickRepresentativeImage(images) {
|
|
11637
|
-
const sample = images.slice(0, Math.min(images.length, 8));
|
|
11638
|
-
const weighted = [];
|
|
11639
|
-
for (const p of sample) {
|
|
11640
|
-
const st = await stat(p);
|
|
11641
|
-
if (st.size > 8 * 1024) weighted.push({ path: p, size: st.size });
|
|
11642
|
-
}
|
|
11643
|
-
const use = weighted.length > 0 ? weighted : await Promise.all(sample.map(async (p) => ({ path: p, size: (await stat(p)).size })));
|
|
11644
|
-
use.sort((a, b) => a.size - b.size);
|
|
11645
|
-
return use[Math.floor(use.length / 2)].path;
|
|
11646
|
-
}
|
|
11647
11703
|
async function mapWithConcurrency(items, concurrency, mapper) {
|
|
11648
11704
|
const results = new Array(items.length);
|
|
11649
11705
|
let nextIndex = 0;
|
|
@@ -11745,11 +11801,43 @@ async function updateModelCache(path, probes) {
|
|
|
11745
11801
|
current.updatedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
11746
11802
|
await writeFile(path, JSON.stringify(current, null, 2), "utf-8");
|
|
11747
11803
|
}
|
|
11804
|
+
async function ocrWorkerPool(input) {
|
|
11805
|
+
const { queue, workerCount, ocrInput, onPageDone } = input;
|
|
11806
|
+
const results = /* @__PURE__ */ new Map();
|
|
11807
|
+
let completedCount = 0;
|
|
11808
|
+
async function worker() {
|
|
11809
|
+
while (true) {
|
|
11810
|
+
const item = await queue.dequeue();
|
|
11811
|
+
if (item === QUEUE_DONE) break;
|
|
11812
|
+
const { pageNumber, imagePath, error } = item;
|
|
11813
|
+
if (imagePath === null) {
|
|
11814
|
+
input.logger?.log({
|
|
11815
|
+
level: "warn",
|
|
11816
|
+
stage: "ocr",
|
|
11817
|
+
event: "message",
|
|
11818
|
+
message: `\uD398\uC774\uC9C0 ${pageNumber} \uB80C\uB354 \uC2E4\uD328 \u2014 \uBE48 \uD398\uC774\uC9C0\uB85C \uCC98\uB9AC`,
|
|
11819
|
+
meta: { error: String(error) }
|
|
11820
|
+
});
|
|
11821
|
+
results.set(pageNumber, "");
|
|
11822
|
+
completedCount++;
|
|
11823
|
+
onPageDone(pageNumber, completedCount, "");
|
|
11824
|
+
continue;
|
|
11825
|
+
}
|
|
11826
|
+
const { markdown, model } = await ocrImageWithFallback({ ...ocrInput, imagePath });
|
|
11827
|
+
results.set(pageNumber, markdown);
|
|
11828
|
+
completedCount++;
|
|
11829
|
+
onPageDone(pageNumber, completedCount, model);
|
|
11830
|
+
}
|
|
11831
|
+
}
|
|
11832
|
+
const workers = Array.from({ length: workerCount }, () => worker());
|
|
11833
|
+
await Promise.all(workers);
|
|
11834
|
+
return results;
|
|
11835
|
+
}
|
|
11748
11836
|
async function ocrImageWithFallback(input) {
|
|
11749
11837
|
let lastErr = "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958";
|
|
11750
11838
|
for (const model of input.models) {
|
|
11751
11839
|
try {
|
|
11752
|
-
|
|
11840
|
+
const markdown = await ocrImageViaNim({
|
|
11753
11841
|
imagePath: input.imagePath,
|
|
11754
11842
|
prompt: input.prompt,
|
|
11755
11843
|
model,
|
|
@@ -11761,6 +11849,7 @@ async function ocrImageWithFallback(input) {
|
|
|
11761
11849
|
logger: input.logger,
|
|
11762
11850
|
stage: "ocr"
|
|
11763
11851
|
});
|
|
11852
|
+
return { markdown, model };
|
|
11764
11853
|
} catch (err) {
|
|
11765
11854
|
lastErr = err instanceof Error ? err.message : String(err);
|
|
11766
11855
|
}
|