@clazic/kordoc 2.4.19 → 2.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -4,11 +4,11 @@ import {
4
4
  markdownToHwpx,
5
5
  markdownToXlsx,
6
6
  parse
7
- } from "./chunk-463YQ2WL.js";
7
+ } from "./chunk-QG6BYZMR.js";
8
8
  import {
9
9
  VERSION,
10
10
  toArrayBuffer
11
- } from "./chunk-MZN7PLTZ.js";
11
+ } from "./chunk-IJGNPAK2.js";
12
12
  import "./chunk-MOL7MDBG.js";
13
13
  import "./chunk-Y4WFKJ5P.js";
14
14
  import "./chunk-YW5G6BCJ.js";
@@ -173,7 +173,7 @@ async function runParse(files, opts) {
173
173
  saveImages(absPath);
174
174
  }
175
175
  } catch (err) {
176
- const { sanitizeError } = await import("./utils-YUAT7LFD.js");
176
+ const { sanitizeError } = await import("./utils-RBXHHCLI.js");
177
177
  process.stderr.write(`
178
178
  [kordoc] ERROR: ${fileName} \u2014 ${sanitizeError(err)}
179
179
  `);
@@ -255,7 +255,7 @@ program.command("convert <input>").description("\uB9C8\uD06C\uB2E4\uC6B4 \uD30C\
255
255
  `));
256
256
  }
257
257
  } catch (err) {
258
- const { sanitizeError } = await import("./utils-YUAT7LFD.js");
258
+ const { sanitizeError } = await import("./utils-RBXHHCLI.js");
259
259
  process.stderr.write(` FAIL
260
260
  `);
261
261
  process.stderr.write(` \u2192 ${sanitizeError(err)}
@@ -287,7 +287,7 @@ program.command("init-env").description("kordoc\uC6A9 .env \uD15C\uD50C\uB9BF \u
287
287
  }
288
288
  });
289
289
  program.command("watch <dir>").description("\uB514\uB809\uD1A0\uB9AC \uAC10\uC2DC \u2014 \uC0C8 \uBB38\uC11C \uC790\uB3D9 \uBCC0\uD658").option("--webhook <url>", "\uACB0\uACFC \uC804\uC1A1 \uC6F9\uD6C5 URL").option("-d, --out-dir <dir>", "\uBCC0\uD658 \uACB0\uACFC \uCD9C\uB825 \uB514\uB809\uD1A0\uB9AC").option("-p, --pages <range>", "\uD398\uC774\uC9C0/\uC139\uC158 \uBC94\uC704").option("--format <type>", "\uCD9C\uB825 \uD615\uC2DD: markdown \uB610\uB294 json", "markdown").option("--silent", "\uC9C4\uD589 \uBA54\uC2DC\uC9C0 \uC228\uAE30\uAE30").action(async (dir, opts) => {
290
- const { watchDirectory } = await import("./watch-WEOFVVDO.js");
290
+ const { watchDirectory } = await import("./watch-5CCMTZ7F.js");
291
291
  await watchDirectory({
292
292
  dir,
293
293
  outDir: opts.outDir,
package/dist/index.cjs CHANGED
@@ -3057,7 +3057,7 @@ var import_jszip2 = __toESM(require("jszip"), 1);
3057
3057
  var import_xmldom = require("@xmldom/xmldom");
3058
3058
 
3059
3059
  // src/utils.ts
3060
- var VERSION = true ? "2.4.19" : "0.0.0-dev";
3060
+ var VERSION = true ? "2.5.0" : "0.0.0-dev";
3061
3061
  function toArrayBuffer(buf) {
3062
3062
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
3063
3063
  return buf.buffer;
@@ -11253,6 +11253,66 @@ var import_child_process4 = require("child_process");
11253
11253
  var import_node_perf_hooks = require("perf_hooks");
11254
11254
  var import_libreoffice_convert = __toESM(require("libreoffice-convert"), 1);
11255
11255
  init_logger();
11256
+
11257
+ // src/pipeline/bounded-queue.ts
11258
/** Sentinel that `dequeue()` resolves to once the queue is closed and its buffer is empty. */
var QUEUE_DONE = /* @__PURE__ */ Symbol("QUEUE_DONE");

/**
 * Async FIFO queue with back-pressure.
 *
 * - `enqueue(item)` resolves once the item has been handed to a waiting consumer or stored
 *   in the buffer; while the buffer holds `capacity` items it waits for a `dequeue()`.
 * - `dequeue()` resolves to the next item, or to `QUEUE_DONE` once the queue is closed and
 *   the buffer is empty. Items buffered before `close()` are still delivered.
 * - `close()` is idempotent. It resolves every waiting consumer with `QUEUE_DONE` and wakes
 *   every waiting producer, whose `enqueue()` then rejects.
 */
var BoundedQueue = class {
  buffer = [];
  capacity;
  closed = false;
  producerWaiters = [];
  consumerWaiters = [];
  /**
   * @param {number} capacity Maximum number of buffered items; must be >= 1.
   * @throws {RangeError} If `capacity` is not a number >= 1.
   */
  constructor(capacity) {
    // `!(capacity >= 1)` also rejects NaN and undefined. A plain `capacity < 1` check lets
    // them through, and `buffer.length >= NaN` is always false, i.e. no bound at all.
    if (!(capacity >= 1)) throw new RangeError("BoundedQueue capacity must be >= 1");
    this.capacity = capacity;
  }
  /**
   * Adds an item, waiting while the buffer is full.
   * @param {*} item Value to deliver to a consumer.
   * @returns {Promise<void>}
   * @throws {Error} If the queue is already closed, or is closed while this call is waiting.
   */
  async enqueue(item) {
    if (this.closed) throw new Error("BoundedQueue: cannot enqueue after close()");
    while (true) {
      // Re-checked on every pass: a consumer may have parked itself between the moment this
      // producer was woken and the moment its continuation runs. Pushing into the buffer in
      // that case would leave the consumer waiting forever with an item sitting in the buffer.
      if (this.consumerWaiters.length > 0) {
        const deliver = this.consumerWaiters.shift();
        deliver(item);
        return;
      }
      if (this.buffer.length < this.capacity) {
        this.buffer.push(item);
        return;
      }
      await new Promise((wake) => this.producerWaiters.push(wake));
      if (this.closed) throw new Error("BoundedQueue: closed while waiting to enqueue");
    }
  }
  /**
   * Removes the oldest item, waiting if the queue is empty and still open.
   * @returns {Promise<*>} The next item, or `QUEUE_DONE` when closed and drained.
   */
  async dequeue() {
    if (this.buffer.length > 0) {
      const item = this.buffer.shift();
      // A slot just became free: let one blocked producer retry.
      this._wakeProducer();
      return item;
    }
    if (this.closed) return QUEUE_DONE;
    return new Promise((deliver) => {
      this.consumerWaiters.push(deliver);
    });
  }
  /** Closes the queue; safe to call more than once. */
  close() {
    if (this.closed) return;
    this.closed = true;
    for (const deliver of this.consumerWaiters) {
      deliver(QUEUE_DONE);
    }
    this.consumerWaiters = [];
    // Woken producers observe `closed` and reject from enqueue().
    for (const wake of this.producerWaiters) {
      wake();
    }
    this.producerWaiters = [];
  }
  /** Number of items currently buffered (excludes items held by blocked producers). */
  get size() {
    return this.buffer.length;
  }
  /** Wakes the longest-waiting blocked producer, if there is one. */
  _wakeProducer() {
    if (this.producerWaiters.length > 0) {
      this.producerWaiters.shift()();
    }
  }
};
11314
+
11315
+ // src/pipeline/unified-ocr.ts
11256
11316
  var libreConvert = import_libreoffice_convert.default.convert;
11257
11317
  var UnifiedOcrError = class extends Error {
11258
11318
  code;
@@ -11284,11 +11344,11 @@ var DEFAULT_MODEL_MAX_TOKENS = {
11284
11344
  };
11285
11345
  var DEFAULT_STAGE_WEIGHTS = {
11286
11346
  convert: 15,
11287
- render: 20,
11347
+ render: 15,
11288
11348
  probe: 5,
11289
- ocr: 45,
11349
+ ocr: 55,
11290
11350
  proofread: 0,
11291
- merge: 5
11351
+ merge: 10
11292
11352
  };
11293
11353
  var OCR_PROMPT2 = [
11294
11354
  "\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD45C\uB97C \uCD94\uCD9C\uD558\uC5EC Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uACE0, OCR \uC624\uC778\uC2DD \uC624\uB958\uB97C \uC989\uC2DC \uAD50\uC815\uD558\uC5EC \uCD5C\uC885 \uACB0\uACFC\uBB3C\uC744 \uCD9C\uB825\uD558\uC138\uC694.",
@@ -11377,39 +11437,19 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11377
11437
  logStage("info", "convert", "done", "PDF \uBCC0\uD658 \uC644\uB8CC", { elapsedMs: timingsMs.convert });
11378
11438
  const renderStart = import_node_perf_hooks.performance.now();
11379
11439
  currentStage = "render";
11440
+ const totalPages = await getPdfPageCount(workingPdfPath).catch(() => 0);
11441
+ if (totalPages === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uD398\uC774\uC9C0 \uC218\uB97C \uD655\uC778\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.");
11380
11442
  markStageStart("render", "PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC911");
11381
- logStage("info", "render", "start", "PDF \uD398\uC774\uC9C0 \uB80C\uB354\uB9C1 \uC2DC\uC791", { pdf: workingPdfPath, dpi });
11382
- const renderWithProgress = await renderPdfToPngWithProgress(
11383
- workingPdfPath,
11384
- (0, import_path5.join)(imagesDir, "page"),
11385
- dpi,
11386
- (current, total) => {
11387
- markStageProgress(
11388
- "render",
11389
- Math.round(current / total * 100),
11390
- current,
11391
- total,
11392
- `\uD398\uC774\uC9C0 ${current}/${total} \uB80C\uB354\uB9C1`
11393
- );
11394
- }
11395
- );
11396
- const images = await listPageImages(imagesDir);
11397
- if (images.length === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC2E4\uD328: \uACB0\uACFC \uC774\uBBF8\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.");
11398
- if (!renderWithProgress.emittedPerPageProgress) {
11399
- markStageProgress("render", 100, images.length, images.length, `\uD398\uC774\uC9C0 ${images.length}\uC7A5 \uC0DD\uC131`);
11400
- }
11401
- timingsMs.render = elapsedMs(renderStart);
11402
- markStageDone("render", "\uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC");
11403
- logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", {
11404
- pages: images.length,
11405
- elapsedMs: timingsMs.render,
11406
- pageCountSource: renderWithProgress.pageCountSource
11407
- });
11443
+ logStage("info", "render", "start", "PDF \uD398\uC774\uC9C0 \uB80C\uB354\uB9C1 \uC2DC\uC791", { pdf: workingPdfPath, dpi, totalPages });
11444
+ await runCommand("pdftoppm", ["-png", "-r", String(dpi), "-f", "1", "-l", "1", workingPdfPath, (0, import_path5.join)(imagesDir, "page")]);
11445
+ const firstFiles = (await (0, import_promises2.readdir)(imagesDir)).filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
11446
+ if (firstFiles.length === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uCCAB \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC2E4\uD328");
11447
+ const probeImage = (0, import_path5.join)(imagesDir, firstFiles[0]);
11448
+ markStageProgress("render", Math.round(1 / totalPages * 100), 1, totalPages, `\uD398\uC774\uC9C0 1/${totalPages} \uB80C\uB354\uB9C1`);
11408
11449
  const probeStart = import_node_perf_hooks.performance.now();
11409
11450
  currentStage = "probe";
11410
11451
  markStageStart("probe", "\uBAA8\uB378 \uC18D\uB3C4 \uD504\uB85C\uBE0C \uC218\uD589 \uC911");
11411
11452
  logStage("info", "probe", "start", "\uBAA8\uB378 \uC18D\uB3C4 \uD504\uB85C\uBE0C \uC2DC\uC791", { models, probeConcurrency });
11412
- const probeImage = await pickRepresentativeImage(images);
11413
11453
  let probeDone = 0;
11414
11454
  const probeRuns = startParallelProbeRuns({
11415
11455
  models,
@@ -11436,33 +11476,74 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11436
11476
  await updateModelCache(modelCachePath, results);
11437
11477
  return results;
11438
11478
  });
11479
+ const concurrencyPerKey = Math.max(1, options.concurrencyPerKey ?? 2);
11480
+ const keyCount = keyPool.snapshot().length;
11481
+ const workerCount = Math.max(1, keyCount * concurrencyPerKey);
11482
+ const queueCapacity = workerCount * 2;
11483
+ const queue = new BoundedQueue(queueCapacity);
11439
11484
  const ocrStart = import_node_perf_hooks.performance.now();
11440
11485
  currentStage = "ocr";
11441
- markStageStart("ocr", `OCR \uC9C4\uD589 \uC911 (${selectedModel})`);
11442
- logStage("info", "ocr", "start", "\uD398\uC774\uC9C0 OCR \uC2DC\uC791", { selectedModel, pageCount: images.length });
11443
- const rawPagePaths = [];
11444
- for (let i = 0; i < images.length; i++) {
11445
- const imagePath = images[i];
11446
- const markdown = await ocrImageWithFallback({
11447
- imagePath,
11448
- prompt: OCR_PROMPT2,
11449
- models: fallbackModelOrder,
11450
- modelMaxTokens,
11451
- baseUrl,
11452
- keyPool,
11453
- timeoutMs,
11454
- maxRetriesPerPage,
11486
+ markStageStart("ocr", `OCR \uC9C4\uD589 \uC911 (\uC6CC\uCEE4 ${workerCount}\uAC1C)`);
11487
+ logStage("info", "ocr", "start", "\uD398\uC774\uC9C0 OCR \uC2DC\uC791", { workerCount, keyCount, pageCount: totalPages });
11488
+ let renderDone = 1;
11489
+ const renderProducer = (async () => {
11490
+ try {
11491
+ await queue.enqueue({ pageNumber: 1, imagePath: probeImage });
11492
+ if (totalPages > 1) {
11493
+ for await (const item of renderPdfToPngStream(workingPdfPath, (0, import_path5.join)(imagesDir, "page"), dpi, totalPages, 2)) {
11494
+ await queue.enqueue(item);
11495
+ renderDone++;
11496
+ markStageProgress("render", Math.round(renderDone / totalPages * 100), renderDone, totalPages, `\uD398\uC774\uC9C0 ${renderDone}/${totalPages} \uB80C\uB354\uB9C1`);
11497
+ logStage("debug", "render", "progress", "\uD398\uC774\uC9C0 \uB80C\uB354 \uC644\uB8CC", { page: item.pageNumber });
11498
+ }
11499
+ }
11500
+ } finally {
11501
+ queue.close();
11502
+ timingsMs.render = elapsedMs(renderStart);
11503
+ markStageDone("render", "\uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC");
11504
+ logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", { pages: renderDone, elapsedMs: timingsMs.render });
11505
+ }
11506
+ })();
11507
+ const [, pageResultsMap] = await Promise.all([
11508
+ renderProducer,
11509
+ ocrWorkerPool({
11510
+ queue,
11511
+ workerCount,
11512
+ totalPages,
11513
+ ocrInput: {
11514
+ prompt: OCR_PROMPT2,
11515
+ models: fallbackModelOrder,
11516
+ modelMaxTokens,
11517
+ baseUrl,
11518
+ keyPool,
11519
+ timeoutMs,
11520
+ maxRetriesPerPage,
11521
+ logger
11522
+ },
11523
+ onPageDone: (pageNumber, completedCount, model) => {
11524
+ markStageProgress(
11525
+ "ocr",
11526
+ Math.round(completedCount / totalPages * 100),
11527
+ completedCount,
11528
+ totalPages,
11529
+ `OCR ${completedCount}/${totalPages}`,
11530
+ model || void 0
11531
+ );
11532
+ logStage("debug", "ocr", "progress", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { page: pageNumber, total: totalPages, model });
11533
+ },
11455
11534
  logger
11456
- });
11457
- const pagePath = (0, import_path5.join)(rawDir, `page_${String(i + 1).padStart(4, "0")}.md`);
11458
- await (0, import_promises2.writeFile)(pagePath, markdown, "utf-8");
11459
- rawPagePaths.push(pagePath);
11460
- markStageProgress("ocr", Math.round((i + 1) / images.length * 100), i + 1, images.length, `OCR ${i + 1}/${images.length}`, selectedModel);
11461
- logStage("debug", "ocr", "progress", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { page: i + 1, total: images.length });
11462
- }
11535
+ })
11536
+ ]);
11463
11537
  timingsMs.ocr = elapsedMs(ocrStart);
11464
11538
  markStageDone("ocr", "OCR \uC644\uB8CC");
11465
11539
  logStage("info", "ocr", "done", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { elapsedMs: timingsMs.ocr });
11540
+ const sortedEntries = Array.from(pageResultsMap.entries()).sort((a, b) => a[0] - b[0]);
11541
+ const rawPagePaths = [];
11542
+ for (const [pageNum, markdown] of sortedEntries) {
11543
+ const pagePath = (0, import_path5.join)(rawDir, `page_${String(pageNum).padStart(4, "0")}.md`);
11544
+ await (0, import_promises2.writeFile)(pagePath, markdown, "utf-8");
11545
+ rawPagePaths.push(pagePath);
11546
+ }
11466
11547
  const mergeStart = import_node_perf_hooks.performance.now();
11467
11548
  currentStage = "merge";
11468
11549
  markStageStart("merge", "\uCD5C\uC885 Markdown \uBCD1\uD569 \uC911");
@@ -11479,7 +11560,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11479
11560
  selectedModel,
11480
11561
  probeImage,
11481
11562
  probeResults: await probeResultsPromise,
11482
- pageCount: images.length,
11563
+ pageCount: totalPages,
11483
11564
  keyHealth: keyPool.snapshot(),
11484
11565
  timingsMs,
11485
11566
  modelCachePath
@@ -11557,13 +11638,6 @@ async function convertWithLibreOffice(buffer, ext) {
11557
11638
  });
11558
11639
  });
11559
11640
  }
11560
- async function renderPdfToPng(pdfPath, prefixPath, dpi) {
11561
- try {
11562
- await runCommand("pdftoppm", ["-png", "-r", String(dpi), pdfPath, prefixPath]);
11563
- } catch (err) {
11564
- throw new UnifiedOcrError("RENDER_FAILED", "render", err instanceof Error ? err.message : String(err));
11565
- }
11566
- }
11567
11641
  async function getPdfPageCount(pdfPath) {
11568
11642
  const stdout = await runCommandWithStdout("pdfinfo", [pdfPath]);
11569
11643
  const m = stdout.match(/^\s*Pages:\s*(\d+)\s*$/mi);
@@ -11576,36 +11650,33 @@ async function getPdfPageCount(pdfPath) {
11576
11650
  }
11577
11651
  return n;
11578
11652
  }
11579
- async function renderPdfToPngWithProgress(pdfPath, prefixPath, dpi, onPageDone) {
11580
- let totalPages = 0;
11581
- try {
11582
- totalPages = await getPdfPageCount(pdfPath);
11583
- } catch {
11584
- totalPages = 0;
11585
- }
11586
- if (totalPages > 0) {
11653
+ async function* renderPdfToPngStream(pdfPath, prefixPath, dpi, totalPages, startPage = 1) {
11654
+ const imagesDir = (0, import_path5.dirname)(prefixPath);
11655
+ for (let page = startPage; page <= totalPages; page++) {
11587
11656
  try {
11588
- for (let page = 1; page <= totalPages; page++) {
11589
- await runCommand("pdftoppm", [
11590
- "-png",
11591
- "-r",
11592
- String(dpi),
11593
- "-f",
11594
- String(page),
11595
- "-l",
11596
- String(page),
11597
- pdfPath,
11598
- prefixPath
11599
- ]);
11600
- onPageDone(page, totalPages);
11601
- }
11602
- return { emittedPerPageProgress: true, pageCountSource: "pdfinfo" };
11657
+ await runCommand("pdftoppm", [
11658
+ "-png",
11659
+ "-r",
11660
+ String(dpi),
11661
+ "-f",
11662
+ String(page),
11663
+ "-l",
11664
+ String(page),
11665
+ pdfPath,
11666
+ prefixPath
11667
+ ]);
11668
+ const files = await (0, import_promises2.readdir)(imagesDir);
11669
+ const pageFiles = files.filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
11670
+ const imagePath = (0, import_path5.join)(imagesDir, pageFiles[pageFiles.length - 1]);
11671
+ yield { pageNumber: page, imagePath };
11603
11672
  } catch (err) {
11604
- throw new UnifiedOcrError("RENDER_FAILED", "render", err instanceof Error ? err.message : String(err));
11673
+ yield {
11674
+ pageNumber: page,
11675
+ imagePath: null,
11676
+ error: err instanceof Error ? err : new Error(String(err))
11677
+ };
11605
11678
  }
11606
11679
  }
11607
- await renderPdfToPng(pdfPath, prefixPath, dpi);
11608
- return { emittedPerPageProgress: false, pageCountSource: "fallback" };
11609
11680
  }
11610
11681
  async function runCommand(cmd, args) {
11611
11682
  await new Promise((resolvePromise, reject) => {
@@ -11646,26 +11717,11 @@ async function assertSofficeAvailable() {
11646
11717
  throw new UnifiedOcrError("SOFFICE_NOT_FOUND", "convert", "soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4. LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694.");
11647
11718
  }
11648
11719
  }
11649
- async function listPageImages(imagesDir) {
11650
- const files = await (0, import_promises2.readdir)(imagesDir);
11651
- return files.filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b)).map((f) => (0, import_path5.join)(imagesDir, f));
11652
- }
11653
11720
  function naturalPageSort(a, b) {
11654
11721
  const na = Number((a.match(/\d+/g) || ["0"]).at(-1) || 0);
11655
11722
  const nb = Number((b.match(/\d+/g) || ["0"]).at(-1) || 0);
11656
11723
  return na - nb;
11657
11724
  }
11658
- async function pickRepresentativeImage(images) {
11659
- const sample = images.slice(0, Math.min(images.length, 8));
11660
- const weighted = [];
11661
- for (const p of sample) {
11662
- const st = await (0, import_promises2.stat)(p);
11663
- if (st.size > 8 * 1024) weighted.push({ path: p, size: st.size });
11664
- }
11665
- const use = weighted.length > 0 ? weighted : await Promise.all(sample.map(async (p) => ({ path: p, size: (await (0, import_promises2.stat)(p)).size })));
11666
- use.sort((a, b) => a.size - b.size);
11667
- return use[Math.floor(use.length / 2)].path;
11668
- }
11669
11725
  async function mapWithConcurrency(items, concurrency, mapper) {
11670
11726
  const results = new Array(items.length);
11671
11727
  let nextIndex = 0;
@@ -11767,11 +11823,43 @@ async function updateModelCache(path, probes) {
11767
11823
  current.updatedAt = (/* @__PURE__ */ new Date()).toISOString();
11768
11824
  await (0, import_promises2.writeFile)(path, JSON.stringify(current, null, 2), "utf-8");
11769
11825
  }
11826
/**
 * Runs `workerCount` concurrent workers that pull page items from `input.queue` and OCR them.
 *
 * Each item is `{ pageNumber, imagePath, error? }`. An item whose `imagePath` is `null`
 * (render failure) is logged as a warning and recorded as an empty page instead of being OCR-ed.
 *
 * If any worker fails, the pool rejects with that error. The remaining workers then stop
 * issuing OCR requests and stop reporting progress, but keep dequeuing until `QUEUE_DONE`,
 * so whoever feeds the bounded queue is not left blocked on a full buffer.
 *
 * @param {object} input
 * @param {{ dequeue(): Promise<*> }} input.queue Source of page items; resolves `QUEUE_DONE` when exhausted.
 * @param {number} input.workerCount Number of concurrent workers to start.
 * @param {object} input.ocrInput Options spread into every `ocrImageWithFallback` call.
 * @param {(pageNumber: number, completedCount: number, model: string) => void} input.onPageDone
 *   Called after each page is recorded; `model` is "" for pages that failed to render.
 * @param {{ log(entry: object): void }} [input.logger] Optional logger for render-failure warnings.
 * @returns {Promise<Map<number, string>>} Markdown keyed by page number.
 * @throws Rethrows the first error raised while processing a page.
 */
async function ocrWorkerPool(input) {
  const { queue, workerCount, ocrInput, onPageDone } = input;
  const results = /* @__PURE__ */ new Map();
  let completedCount = 0;
  // Set once any worker throws; Promise.all rejects immediately but cannot stop siblings.
  let failed = false;
  const recordPage = (pageNumber, markdown, model) => {
    results.set(pageNumber, markdown);
    completedCount++;
    onPageDone(pageNumber, completedCount, model);
  };
  async function worker() {
    while (true) {
      const item = await queue.dequeue();
      if (item === QUEUE_DONE) break;
      // The run has already failed: discard the item without OCR, but keep draining.
      if (failed) continue;
      const { pageNumber, imagePath, error } = item;
      try {
        if (imagePath === null) {
          input.logger?.log({
            level: "warn",
            stage: "ocr",
            event: "message",
            message: `\uD398\uC774\uC9C0 ${pageNumber} \uB80C\uB354 \uC2E4\uD328 \u2014 \uBE48 \uD398\uC774\uC9C0\uB85C \uCC98\uB9AC`,
            meta: { error: String(error) }
          });
          recordPage(pageNumber, "", "");
          continue;
        }
        const { markdown, model } = await ocrImageWithFallback({ ...ocrInput, imagePath });
        // Another worker may have failed while this request was in flight.
        if (failed) continue;
        recordPage(pageNumber, markdown, model);
      } catch (err) {
        failed = true;
        throw err;
      }
    }
  }
  const workers = Array.from({ length: workerCount }, () => worker());
  await Promise.all(workers);
  return results;
}
11770
11858
  async function ocrImageWithFallback(input) {
11771
11859
  let lastErr = "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958";
11772
11860
  for (const model of input.models) {
11773
11861
  try {
11774
- return await ocrImageViaNim({
11862
+ const markdown = await ocrImageViaNim({
11775
11863
  imagePath: input.imagePath,
11776
11864
  prompt: input.prompt,
11777
11865
  model,
@@ -11783,6 +11871,7 @@ async function ocrImageWithFallback(input) {
11783
11871
  logger: input.logger,
11784
11872
  stage: "ocr"
11785
11873
  });
11874
+ return { markdown, model };
11786
11875
  } catch (err) {
11787
11876
  lastErr = err instanceof Error ? err.message : String(err);
11788
11877
  }