@clazic/kordoc 2.4.19 → 2.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-MZN7PLTZ.js → chunk-IJGNPAK2.js} +2 -2
- package/dist/{chunk-MZN7PLTZ.js.map → chunk-IJGNPAK2.js.map} +1 -1
- package/dist/{chunk-463YQ2WL.js → chunk-QG6BYZMR.js} +2 -2
- package/dist/{chunk-463YQ2WL.js.map → chunk-QG6BYZMR.js.map} +1 -1
- package/dist/cli.js +5 -5
- package/dist/index.cjs +192 -103
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +3 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.js +192 -103
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +2 -2
- package/dist/{utils-YUAT7LFD.js → utils-RBXHHCLI.js} +2 -2
- package/dist/{watch-WEOFVVDO.js → watch-5CCMTZ7F.js} +3 -3
- package/package.json +1 -1
- /package/dist/{utils-YUAT7LFD.js.map → utils-RBXHHCLI.js.map} +0 -0
- /package/dist/{watch-WEOFVVDO.js.map → watch-5CCMTZ7F.js.map} +0 -0
package/dist/cli.js
CHANGED
|
@@ -4,11 +4,11 @@ import {
|
|
|
4
4
|
markdownToHwpx,
|
|
5
5
|
markdownToXlsx,
|
|
6
6
|
parse
|
|
7
|
-
} from "./chunk-
|
|
7
|
+
} from "./chunk-QG6BYZMR.js";
|
|
8
8
|
import {
|
|
9
9
|
VERSION,
|
|
10
10
|
toArrayBuffer
|
|
11
|
-
} from "./chunk-
|
|
11
|
+
} from "./chunk-IJGNPAK2.js";
|
|
12
12
|
import "./chunk-MOL7MDBG.js";
|
|
13
13
|
import "./chunk-Y4WFKJ5P.js";
|
|
14
14
|
import "./chunk-YW5G6BCJ.js";
|
|
@@ -173,7 +173,7 @@ async function runParse(files, opts) {
|
|
|
173
173
|
saveImages(absPath);
|
|
174
174
|
}
|
|
175
175
|
} catch (err) {
|
|
176
|
-
const { sanitizeError } = await import("./utils-
|
|
176
|
+
const { sanitizeError } = await import("./utils-RBXHHCLI.js");
|
|
177
177
|
process.stderr.write(`
|
|
178
178
|
[kordoc] ERROR: ${fileName} \u2014 ${sanitizeError(err)}
|
|
179
179
|
`);
|
|
@@ -255,7 +255,7 @@ program.command("convert <input>").description("\uB9C8\uD06C\uB2E4\uC6B4 \uD30C\
|
|
|
255
255
|
`));
|
|
256
256
|
}
|
|
257
257
|
} catch (err) {
|
|
258
|
-
const { sanitizeError } = await import("./utils-
|
|
258
|
+
const { sanitizeError } = await import("./utils-RBXHHCLI.js");
|
|
259
259
|
process.stderr.write(` FAIL
|
|
260
260
|
`);
|
|
261
261
|
process.stderr.write(` \u2192 ${sanitizeError(err)}
|
|
@@ -287,7 +287,7 @@ program.command("init-env").description("kordoc\uC6A9 .env \uD15C\uD50C\uB9BF \u
|
|
|
287
287
|
}
|
|
288
288
|
});
|
|
289
289
|
program.command("watch <dir>").description("\uB514\uB809\uD1A0\uB9AC \uAC10\uC2DC \u2014 \uC0C8 \uBB38\uC11C \uC790\uB3D9 \uBCC0\uD658").option("--webhook <url>", "\uACB0\uACFC \uC804\uC1A1 \uC6F9\uD6C5 URL").option("-d, --out-dir <dir>", "\uBCC0\uD658 \uACB0\uACFC \uCD9C\uB825 \uB514\uB809\uD1A0\uB9AC").option("-p, --pages <range>", "\uD398\uC774\uC9C0/\uC139\uC158 \uBC94\uC704").option("--format <type>", "\uCD9C\uB825 \uD615\uC2DD: markdown \uB610\uB294 json", "markdown").option("--silent", "\uC9C4\uD589 \uBA54\uC2DC\uC9C0 \uC228\uAE30\uAE30").action(async (dir, opts) => {
|
|
290
|
-
const { watchDirectory } = await import("./watch-
|
|
290
|
+
const { watchDirectory } = await import("./watch-5CCMTZ7F.js");
|
|
291
291
|
await watchDirectory({
|
|
292
292
|
dir,
|
|
293
293
|
outDir: opts.outDir,
|
package/dist/index.cjs
CHANGED
|
@@ -3057,7 +3057,7 @@ var import_jszip2 = __toESM(require("jszip"), 1);
|
|
|
3057
3057
|
var import_xmldom = require("@xmldom/xmldom");
|
|
3058
3058
|
|
|
3059
3059
|
// src/utils.ts
|
|
3060
|
-
var VERSION = true ? "2.
|
|
3060
|
+
var VERSION = true ? "2.5.0" : "0.0.0-dev";
|
|
3061
3061
|
function toArrayBuffer(buf) {
|
|
3062
3062
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
3063
3063
|
return buf.buffer;
|
|
@@ -11253,6 +11253,66 @@ var import_child_process4 = require("child_process");
|
|
|
11253
11253
|
var import_node_perf_hooks = require("perf_hooks");
|
|
11254
11254
|
var import_libreoffice_convert = __toESM(require("libreoffice-convert"), 1);
|
|
11255
11255
|
init_logger();
|
|
11256
|
+
|
|
11257
|
+
// src/pipeline/bounded-queue.ts
|
|
11258
|
+
// Sentinel resolved by BoundedQueue#dequeue() once the queue is closed and its
// buffer is empty. Compare with `===`.
var QUEUE_DONE = /* @__PURE__ */ Symbol("QUEUE_DONE");

/**
 * Async FIFO queue with a fixed capacity, used to apply back-pressure between
 * producers and consumers.
 *
 * - `enqueue(item)` resolves once the item was handed to a waiting consumer or
 *   stored in the buffer; it waits while the buffer is full.
 * - `dequeue()` resolves with the next item, or with `QUEUE_DONE` once the
 *   queue is closed and drained.
 * - `close()` ends the stream: waiting consumers receive `QUEUE_DONE` and
 *   waiting producers reject.
 *
 * Invariant maintained by this class: a consumer is only ever parked in
 * `consumerWaiters` while `buffer` is empty.
 */
var BoundedQueue = class {
  buffer = [];
  capacity;
  closed = false;
  producerWaiters = [];
  consumerWaiters = [];

  /**
   * @param {number} capacity Maximum number of buffered items; must be >= 1.
   * @throws {RangeError} When capacity is below 1 or not comparable to a number.
   */
  constructor(capacity) {
    // `!(capacity >= 1)` also rejects NaN/undefined. The plain `capacity < 1`
    // test lets those through and every later `length >= capacity` check is
    // then false, which silently turns the queue into an unbounded one.
    if (!(capacity >= 1)) throw new RangeError("BoundedQueue capacity must be >= 1");
    this.capacity = capacity;
  }

  /**
   * Adds an item, waiting while the buffer is full.
   *
   * @param {*} item Value to deliver to a consumer.
   * @returns {Promise<void>}
   * @throws {Error} When the queue is already closed, or is closed while this
   *   call is waiting for free space (the item is dropped in that case).
   */
  async enqueue(item) {
    if (this.closed) throw new Error("BoundedQueue: cannot enqueue after close()");
    while (true) {
      // Re-evaluated after every wake-up: a consumer may have started waiting
      // between the dequeue() that woke this producer and this continuation.
      // Pushing to the buffer in that case would leave the consumer parked
      // while data is available, and close() would then end it prematurely.
      if (this.consumerWaiters.length > 0) {
        const deliver = this.consumerWaiters.shift();
        deliver(item);
        return;
      }
      if (this.buffer.length < this.capacity) {
        this.buffer.push(item);
        return;
      }
      await new Promise((wake) => this.producerWaiters.push(wake));
      if (this.closed) throw new Error("BoundedQueue: closed while waiting to enqueue");
    }
  }

  /**
   * Removes the oldest item.
   *
   * @returns {Promise<*>} The next item, or `QUEUE_DONE` when the queue is
   *   closed and nothing is buffered. Items buffered before close() are still
   *   delivered.
   */
  async dequeue() {
    if (this.buffer.length > 0) {
      const item = this.buffer.shift();
      // A slot was freed, so let one blocked producer retry.
      this._wakeProducer();
      return item;
    }
    if (this.closed) return QUEUE_DONE;
    return new Promise((deliver) => {
      this.consumerWaiters.push(deliver);
    });
  }

  /**
   * Closes the queue. Idempotent. Waiting consumers resolve with `QUEUE_DONE`;
   * waiting producers are woken so that their enqueue() call rejects.
   */
  close() {
    if (this.closed) return;
    this.closed = true;
    for (const deliver of this.consumerWaiters) {
      deliver(QUEUE_DONE);
    }
    this.consumerWaiters = [];
    for (const wake of this.producerWaiters) {
      wake();
    }
    this.producerWaiters = [];
  }

  /** Number of items currently buffered (excludes items held by blocked producers). */
  get size() {
    return this.buffer.length;
  }

  /** Wakes the longest-waiting blocked producer, if any. */
  _wakeProducer() {
    if (this.producerWaiters.length > 0) {
      this.producerWaiters.shift()();
    }
  }
};
|
|
11314
|
+
|
|
11315
|
+
// src/pipeline/unified-ocr.ts
|
|
11256
11316
|
var libreConvert = import_libreoffice_convert.default.convert;
|
|
11257
11317
|
var UnifiedOcrError = class extends Error {
|
|
11258
11318
|
code;
|
|
@@ -11284,11 +11344,11 @@ var DEFAULT_MODEL_MAX_TOKENS = {
|
|
|
11284
11344
|
};
|
|
11285
11345
|
var DEFAULT_STAGE_WEIGHTS = {
|
|
11286
11346
|
convert: 15,
|
|
11287
|
-
render:
|
|
11347
|
+
render: 15,
|
|
11288
11348
|
probe: 5,
|
|
11289
|
-
ocr:
|
|
11349
|
+
ocr: 55,
|
|
11290
11350
|
proofread: 0,
|
|
11291
|
-
merge:
|
|
11351
|
+
merge: 10
|
|
11292
11352
|
};
|
|
11293
11353
|
var OCR_PROMPT2 = [
|
|
11294
11354
|
"\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD45C\uB97C \uCD94\uCD9C\uD558\uC5EC Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uACE0, OCR \uC624\uC778\uC2DD \uC624\uB958\uB97C \uC989\uC2DC \uAD50\uC815\uD558\uC5EC \uCD5C\uC885 \uACB0\uACFC\uBB3C\uC744 \uCD9C\uB825\uD558\uC138\uC694.",
|
|
@@ -11377,39 +11437,19 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11377
11437
|
logStage("info", "convert", "done", "PDF \uBCC0\uD658 \uC644\uB8CC", { elapsedMs: timingsMs.convert });
|
|
11378
11438
|
const renderStart = import_node_perf_hooks.performance.now();
|
|
11379
11439
|
currentStage = "render";
|
|
11440
|
+
const totalPages = await getPdfPageCount(workingPdfPath).catch(() => 0);
|
|
11441
|
+
if (totalPages === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uD398\uC774\uC9C0 \uC218\uB97C \uD655\uC778\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.");
|
|
11380
11442
|
markStageStart("render", "PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC911");
|
|
11381
|
-
logStage("info", "render", "start", "PDF \uD398\uC774\uC9C0 \uB80C\uB354\uB9C1 \uC2DC\uC791", { pdf: workingPdfPath, dpi });
|
|
11382
|
-
|
|
11383
|
-
|
|
11384
|
-
|
|
11385
|
-
|
|
11386
|
-
|
|
11387
|
-
markStageProgress(
|
|
11388
|
-
"render",
|
|
11389
|
-
Math.round(current / total * 100),
|
|
11390
|
-
current,
|
|
11391
|
-
total,
|
|
11392
|
-
`\uD398\uC774\uC9C0 ${current}/${total} \uB80C\uB354\uB9C1`
|
|
11393
|
-
);
|
|
11394
|
-
}
|
|
11395
|
-
);
|
|
11396
|
-
const images = await listPageImages(imagesDir);
|
|
11397
|
-
if (images.length === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC2E4\uD328: \uACB0\uACFC \uC774\uBBF8\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.");
|
|
11398
|
-
if (!renderWithProgress.emittedPerPageProgress) {
|
|
11399
|
-
markStageProgress("render", 100, images.length, images.length, `\uD398\uC774\uC9C0 ${images.length}\uC7A5 \uC0DD\uC131`);
|
|
11400
|
-
}
|
|
11401
|
-
timingsMs.render = elapsedMs(renderStart);
|
|
11402
|
-
markStageDone("render", "\uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC");
|
|
11403
|
-
logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", {
|
|
11404
|
-
pages: images.length,
|
|
11405
|
-
elapsedMs: timingsMs.render,
|
|
11406
|
-
pageCountSource: renderWithProgress.pageCountSource
|
|
11407
|
-
});
|
|
11443
|
+
logStage("info", "render", "start", "PDF \uD398\uC774\uC9C0 \uB80C\uB354\uB9C1 \uC2DC\uC791", { pdf: workingPdfPath, dpi, totalPages });
|
|
11444
|
+
await runCommand("pdftoppm", ["-png", "-r", String(dpi), "-f", "1", "-l", "1", workingPdfPath, (0, import_path5.join)(imagesDir, "page")]);
|
|
11445
|
+
const firstFiles = (await (0, import_promises2.readdir)(imagesDir)).filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
|
|
11446
|
+
if (firstFiles.length === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uCCAB \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC2E4\uD328");
|
|
11447
|
+
const probeImage = (0, import_path5.join)(imagesDir, firstFiles[0]);
|
|
11448
|
+
markStageProgress("render", Math.round(1 / totalPages * 100), 1, totalPages, `\uD398\uC774\uC9C0 1/${totalPages} \uB80C\uB354\uB9C1`);
|
|
11408
11449
|
const probeStart = import_node_perf_hooks.performance.now();
|
|
11409
11450
|
currentStage = "probe";
|
|
11410
11451
|
markStageStart("probe", "\uBAA8\uB378 \uC18D\uB3C4 \uD504\uB85C\uBE0C \uC218\uD589 \uC911");
|
|
11411
11452
|
logStage("info", "probe", "start", "\uBAA8\uB378 \uC18D\uB3C4 \uD504\uB85C\uBE0C \uC2DC\uC791", { models, probeConcurrency });
|
|
11412
|
-
const probeImage = await pickRepresentativeImage(images);
|
|
11413
11453
|
let probeDone = 0;
|
|
11414
11454
|
const probeRuns = startParallelProbeRuns({
|
|
11415
11455
|
models,
|
|
@@ -11436,33 +11476,74 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11436
11476
|
await updateModelCache(modelCachePath, results);
|
|
11437
11477
|
return results;
|
|
11438
11478
|
});
|
|
11479
|
+
const concurrencyPerKey = Math.max(1, options.concurrencyPerKey ?? 2);
|
|
11480
|
+
const keyCount = keyPool.snapshot().length;
|
|
11481
|
+
const workerCount = Math.max(1, keyCount * concurrencyPerKey);
|
|
11482
|
+
const queueCapacity = workerCount * 2;
|
|
11483
|
+
const queue = new BoundedQueue(queueCapacity);
|
|
11439
11484
|
const ocrStart = import_node_perf_hooks.performance.now();
|
|
11440
11485
|
currentStage = "ocr";
|
|
11441
|
-
markStageStart("ocr", `OCR \uC9C4\uD589 \uC911 (${
|
|
11442
|
-
logStage("info", "ocr", "start", "\uD398\uC774\uC9C0 OCR \uC2DC\uC791", {
|
|
11443
|
-
|
|
11444
|
-
|
|
11445
|
-
|
|
11446
|
-
|
|
11447
|
-
|
|
11448
|
-
|
|
11449
|
-
|
|
11450
|
-
|
|
11451
|
-
|
|
11452
|
-
|
|
11453
|
-
|
|
11454
|
-
|
|
11486
|
+
markStageStart("ocr", `OCR \uC9C4\uD589 \uC911 (\uC6CC\uCEE4 ${workerCount}\uAC1C)`);
|
|
11487
|
+
logStage("info", "ocr", "start", "\uD398\uC774\uC9C0 OCR \uC2DC\uC791", { workerCount, keyCount, pageCount: totalPages });
|
|
11488
|
+
let renderDone = 1;
|
|
11489
|
+
const renderProducer = (async () => {
|
|
11490
|
+
try {
|
|
11491
|
+
await queue.enqueue({ pageNumber: 1, imagePath: probeImage });
|
|
11492
|
+
if (totalPages > 1) {
|
|
11493
|
+
for await (const item of renderPdfToPngStream(workingPdfPath, (0, import_path5.join)(imagesDir, "page"), dpi, totalPages, 2)) {
|
|
11494
|
+
await queue.enqueue(item);
|
|
11495
|
+
renderDone++;
|
|
11496
|
+
markStageProgress("render", Math.round(renderDone / totalPages * 100), renderDone, totalPages, `\uD398\uC774\uC9C0 ${renderDone}/${totalPages} \uB80C\uB354\uB9C1`);
|
|
11497
|
+
logStage("debug", "render", "progress", "\uD398\uC774\uC9C0 \uB80C\uB354 \uC644\uB8CC", { page: item.pageNumber });
|
|
11498
|
+
}
|
|
11499
|
+
}
|
|
11500
|
+
} finally {
|
|
11501
|
+
queue.close();
|
|
11502
|
+
timingsMs.render = elapsedMs(renderStart);
|
|
11503
|
+
markStageDone("render", "\uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC");
|
|
11504
|
+
logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", { pages: renderDone, elapsedMs: timingsMs.render });
|
|
11505
|
+
}
|
|
11506
|
+
})();
|
|
11507
|
+
const [, pageResultsMap] = await Promise.all([
|
|
11508
|
+
renderProducer,
|
|
11509
|
+
ocrWorkerPool({
|
|
11510
|
+
queue,
|
|
11511
|
+
workerCount,
|
|
11512
|
+
totalPages,
|
|
11513
|
+
ocrInput: {
|
|
11514
|
+
prompt: OCR_PROMPT2,
|
|
11515
|
+
models: fallbackModelOrder,
|
|
11516
|
+
modelMaxTokens,
|
|
11517
|
+
baseUrl,
|
|
11518
|
+
keyPool,
|
|
11519
|
+
timeoutMs,
|
|
11520
|
+
maxRetriesPerPage,
|
|
11521
|
+
logger
|
|
11522
|
+
},
|
|
11523
|
+
onPageDone: (pageNumber, completedCount, model) => {
|
|
11524
|
+
markStageProgress(
|
|
11525
|
+
"ocr",
|
|
11526
|
+
Math.round(completedCount / totalPages * 100),
|
|
11527
|
+
completedCount,
|
|
11528
|
+
totalPages,
|
|
11529
|
+
`OCR ${completedCount}/${totalPages}`,
|
|
11530
|
+
model || void 0
|
|
11531
|
+
);
|
|
11532
|
+
logStage("debug", "ocr", "progress", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { page: pageNumber, total: totalPages, model });
|
|
11533
|
+
},
|
|
11455
11534
|
logger
|
|
11456
|
-
})
|
|
11457
|
-
|
|
11458
|
-
await (0, import_promises2.writeFile)(pagePath, markdown, "utf-8");
|
|
11459
|
-
rawPagePaths.push(pagePath);
|
|
11460
|
-
markStageProgress("ocr", Math.round((i + 1) / images.length * 100), i + 1, images.length, `OCR ${i + 1}/${images.length}`, selectedModel);
|
|
11461
|
-
logStage("debug", "ocr", "progress", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { page: i + 1, total: images.length });
|
|
11462
|
-
}
|
|
11535
|
+
})
|
|
11536
|
+
]);
|
|
11463
11537
|
timingsMs.ocr = elapsedMs(ocrStart);
|
|
11464
11538
|
markStageDone("ocr", "OCR \uC644\uB8CC");
|
|
11465
11539
|
logStage("info", "ocr", "done", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { elapsedMs: timingsMs.ocr });
|
|
11540
|
+
const sortedEntries = Array.from(pageResultsMap.entries()).sort((a, b) => a[0] - b[0]);
|
|
11541
|
+
const rawPagePaths = [];
|
|
11542
|
+
for (const [pageNum, markdown] of sortedEntries) {
|
|
11543
|
+
const pagePath = (0, import_path5.join)(rawDir, `page_${String(pageNum).padStart(4, "0")}.md`);
|
|
11544
|
+
await (0, import_promises2.writeFile)(pagePath, markdown, "utf-8");
|
|
11545
|
+
rawPagePaths.push(pagePath);
|
|
11546
|
+
}
|
|
11466
11547
|
const mergeStart = import_node_perf_hooks.performance.now();
|
|
11467
11548
|
currentStage = "merge";
|
|
11468
11549
|
markStageStart("merge", "\uCD5C\uC885 Markdown \uBCD1\uD569 \uC911");
|
|
@@ -11479,7 +11560,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11479
11560
|
selectedModel,
|
|
11480
11561
|
probeImage,
|
|
11481
11562
|
probeResults: await probeResultsPromise,
|
|
11482
|
-
pageCount:
|
|
11563
|
+
pageCount: totalPages,
|
|
11483
11564
|
keyHealth: keyPool.snapshot(),
|
|
11484
11565
|
timingsMs,
|
|
11485
11566
|
modelCachePath
|
|
@@ -11557,13 +11638,6 @@ async function convertWithLibreOffice(buffer, ext) {
|
|
|
11557
11638
|
});
|
|
11558
11639
|
});
|
|
11559
11640
|
}
|
|
11560
|
-
async function renderPdfToPng(pdfPath, prefixPath, dpi) {
|
|
11561
|
-
try {
|
|
11562
|
-
await runCommand("pdftoppm", ["-png", "-r", String(dpi), pdfPath, prefixPath]);
|
|
11563
|
-
} catch (err) {
|
|
11564
|
-
throw new UnifiedOcrError("RENDER_FAILED", "render", err instanceof Error ? err.message : String(err));
|
|
11565
|
-
}
|
|
11566
|
-
}
|
|
11567
11641
|
async function getPdfPageCount(pdfPath) {
|
|
11568
11642
|
const stdout = await runCommandWithStdout("pdfinfo", [pdfPath]);
|
|
11569
11643
|
const m = stdout.match(/^\s*Pages:\s*(\d+)\s*$/mi);
|
|
@@ -11576,36 +11650,33 @@ async function getPdfPageCount(pdfPath) {
|
|
|
11576
11650
|
}
|
|
11577
11651
|
return n;
|
|
11578
11652
|
}
|
|
11579
|
-
async function
|
|
11580
|
-
|
|
11581
|
-
|
|
11582
|
-
totalPages = await getPdfPageCount(pdfPath);
|
|
11583
|
-
} catch {
|
|
11584
|
-
totalPages = 0;
|
|
11585
|
-
}
|
|
11586
|
-
if (totalPages > 0) {
|
|
11653
|
+
async function* renderPdfToPngStream(pdfPath, prefixPath, dpi, totalPages, startPage = 1) {
|
|
11654
|
+
const imagesDir = (0, import_path5.dirname)(prefixPath);
|
|
11655
|
+
for (let page = startPage; page <= totalPages; page++) {
|
|
11587
11656
|
try {
|
|
11588
|
-
|
|
11589
|
-
|
|
11590
|
-
|
|
11591
|
-
|
|
11592
|
-
|
|
11593
|
-
|
|
11594
|
-
|
|
11595
|
-
|
|
11596
|
-
|
|
11597
|
-
|
|
11598
|
-
|
|
11599
|
-
|
|
11600
|
-
|
|
11601
|
-
|
|
11602
|
-
|
|
11657
|
+
await runCommand("pdftoppm", [
|
|
11658
|
+
"-png",
|
|
11659
|
+
"-r",
|
|
11660
|
+
String(dpi),
|
|
11661
|
+
"-f",
|
|
11662
|
+
String(page),
|
|
11663
|
+
"-l",
|
|
11664
|
+
String(page),
|
|
11665
|
+
pdfPath,
|
|
11666
|
+
prefixPath
|
|
11667
|
+
]);
|
|
11668
|
+
const files = await (0, import_promises2.readdir)(imagesDir);
|
|
11669
|
+
const pageFiles = files.filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
|
|
11670
|
+
const imagePath = (0, import_path5.join)(imagesDir, pageFiles[pageFiles.length - 1]);
|
|
11671
|
+
yield { pageNumber: page, imagePath };
|
|
11603
11672
|
} catch (err) {
|
|
11604
|
-
|
|
11673
|
+
yield {
|
|
11674
|
+
pageNumber: page,
|
|
11675
|
+
imagePath: null,
|
|
11676
|
+
error: err instanceof Error ? err : new Error(String(err))
|
|
11677
|
+
};
|
|
11605
11678
|
}
|
|
11606
11679
|
}
|
|
11607
|
-
await renderPdfToPng(pdfPath, prefixPath, dpi);
|
|
11608
|
-
return { emittedPerPageProgress: false, pageCountSource: "fallback" };
|
|
11609
11680
|
}
|
|
11610
11681
|
async function runCommand(cmd, args) {
|
|
11611
11682
|
await new Promise((resolvePromise, reject) => {
|
|
@@ -11646,26 +11717,11 @@ async function assertSofficeAvailable() {
|
|
|
11646
11717
|
throw new UnifiedOcrError("SOFFICE_NOT_FOUND", "convert", "soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4. LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694.");
|
|
11647
11718
|
}
|
|
11648
11719
|
}
|
|
11649
|
-
async function listPageImages(imagesDir) {
|
|
11650
|
-
const files = await (0, import_promises2.readdir)(imagesDir);
|
|
11651
|
-
return files.filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b)).map((f) => (0, import_path5.join)(imagesDir, f));
|
|
11652
|
-
}
|
|
11653
11720
|
// Sort comparator for file names: orders two names ascending by the numeric
// value of the LAST run of digits found in each. A name containing no digits
// is treated as 0. Returns a negative number, zero, or a positive number.
function naturalPageSort(a, b) {
|
|
11654
11721
|
const na = Number((a.match(/\d+/g) || ["0"]).at(-1) || 0);
|
|
11655
11722
|
const nb = Number((b.match(/\d+/g) || ["0"]).at(-1) || 0);
|
|
11656
11723
|
return na - nb;
|
|
11657
11724
|
}
|
|
11658
|
-
async function pickRepresentativeImage(images) {
|
|
11659
|
-
const sample = images.slice(0, Math.min(images.length, 8));
|
|
11660
|
-
const weighted = [];
|
|
11661
|
-
for (const p of sample) {
|
|
11662
|
-
const st = await (0, import_promises2.stat)(p);
|
|
11663
|
-
if (st.size > 8 * 1024) weighted.push({ path: p, size: st.size });
|
|
11664
|
-
}
|
|
11665
|
-
const use = weighted.length > 0 ? weighted : await Promise.all(sample.map(async (p) => ({ path: p, size: (await (0, import_promises2.stat)(p)).size })));
|
|
11666
|
-
use.sort((a, b) => a.size - b.size);
|
|
11667
|
-
return use[Math.floor(use.length / 2)].path;
|
|
11668
|
-
}
|
|
11669
11725
|
async function mapWithConcurrency(items, concurrency, mapper) {
|
|
11670
11726
|
const results = new Array(items.length);
|
|
11671
11727
|
let nextIndex = 0;
|
|
@@ -11767,11 +11823,43 @@ async function updateModelCache(path, probes) {
|
|
|
11767
11823
|
current.updatedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
11768
11824
|
await (0, import_promises2.writeFile)(path, JSON.stringify(current, null, 2), "utf-8");
|
|
11769
11825
|
}
|
|
11826
|
+
/**
 * Consumes page items from `input.queue` with `input.workerCount` concurrent
 * workers and collects the OCR output per page.
 *
 * Each queue item is `{ pageNumber, imagePath, error? }`. An item whose
 * `imagePath` is `null` is logged as a warning and recorded as an empty
 * page; every other item is passed to `ocrImageWithFallback`.
 *
 * Failure handling: the returned promise rejects with the first error as soon
 * as it happens. From then on no new OCR request is started, but the workers
 * keep dequeuing (and discarding) items until `QUEUE_DONE`, so a producer
 * blocked on a full queue is never left waiting forever. OCR calls already in
 * flight are allowed to finish.
 *
 * @param {object} input
 * @param {{ dequeue(): Promise<*> }} input.queue Source of page items; must
 *   resolve `QUEUE_DONE` when exhausted.
 * @param {number} input.workerCount Number of concurrent workers.
 * @param {object} input.ocrInput Options spread into every
 *   `ocrImageWithFallback` call (together with `imagePath`).
 * @param {(pageNumber: number, completedCount: number, model: string) => void} input.onPageDone
 *   Called after each recorded page; `model` is "" for pages without an image.
 * @param {{ log(entry: object): void }} [input.logger] Optional logger.
 * @returns {Promise<Map<number, string>>} Markdown keyed by page number
 *   (insertion order is completion order, not page order).
 */
async function ocrWorkerPool(input) {
  const { queue, workerCount, ocrInput, onPageDone } = input;
  const results = /* @__PURE__ */ new Map();
  let completedCount = 0;
  let failed = false;

  // Rejected with the first worker error so the pool can fail immediately
  // while the workers themselves stay alive to drain the queue.
  let reportFailure;
  const failure = new Promise((_, reject) => {
    reportFailure = reject;
  });

  function recordPage(pageNumber, markdown, model) {
    results.set(pageNumber, markdown);
    completedCount++;
    onPageDone(pageNumber, completedCount, model);
  }

  async function processItem(item) {
    const { pageNumber, imagePath, error } = item;
    if (imagePath === null) {
      input.logger?.log({
        level: "warn",
        stage: "ocr",
        event: "message",
        message: `\uD398\uC774\uC9C0 ${pageNumber} \uB80C\uB354 \uC2E4\uD328 \u2014 \uBE48 \uD398\uC774\uC9C0\uB85C \uCC98\uB9AC`,
        meta: { error: String(error) }
      });
      recordPage(pageNumber, "", "");
      return;
    }
    const { markdown, model } = await ocrImageWithFallback({ ...ocrInput, imagePath });
    recordPage(pageNumber, markdown, model);
  }

  async function worker() {
    while (true) {
      const item = await queue.dequeue();
      if (item === QUEUE_DONE) return;
      // Drain mode: after a failure, items are discarded without OCR.
      if (failed) continue;
      try {
        await processItem(item);
      } catch (err) {
        failed = true;
        reportFailure(err); // only the first call has any effect
      }
    }
  }

  const workers = Array.from({ length: workerCount }, () => worker());
  await Promise.race([Promise.all(workers), failure]);
  return results;
}
|
|
11770
11858
|
async function ocrImageWithFallback(input) {
|
|
11771
11859
|
let lastErr = "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958";
|
|
11772
11860
|
for (const model of input.models) {
|
|
11773
11861
|
try {
|
|
11774
|
-
|
|
11862
|
+
const markdown = await ocrImageViaNim({
|
|
11775
11863
|
imagePath: input.imagePath,
|
|
11776
11864
|
prompt: input.prompt,
|
|
11777
11865
|
model,
|
|
@@ -11783,6 +11871,7 @@ async function ocrImageWithFallback(input) {
|
|
|
11783
11871
|
logger: input.logger,
|
|
11784
11872
|
stage: "ocr"
|
|
11785
11873
|
});
|
|
11874
|
+
return { markdown, model };
|
|
11786
11875
|
} catch (err) {
|
|
11787
11876
|
lastErr = err instanceof Error ? err.message : String(err);
|
|
11788
11877
|
}
|