@clazic/kordoc 2.4.19 → 2.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -421,6 +421,8 @@ interface UnifiedOcrProgressEvent {
421
421
  code?: UnifiedOcrErrorCode;
422
422
  message?: string;
423
423
  model?: string;
424
+ pageNumber?: number;
425
+ workerCount?: number;
424
426
  }
425
427
  interface UnifiedOcrOptions {
426
428
  workspaceDir?: string;
@@ -436,6 +438,7 @@ interface UnifiedOcrOptions {
436
438
  probeConcurrency?: number;
437
439
  logger?: Logger;
438
440
  runId?: string;
441
+ concurrencyPerKey?: number;
439
442
  }
440
443
  interface UnifiedOcrResult {
441
444
  outputPath: string;
package/dist/index.d.ts CHANGED
@@ -421,6 +421,8 @@ interface UnifiedOcrProgressEvent {
421
421
  code?: UnifiedOcrErrorCode;
422
422
  message?: string;
423
423
  model?: string;
424
+ pageNumber?: number;
425
+ workerCount?: number;
424
426
  }
425
427
  interface UnifiedOcrOptions {
426
428
  workspaceDir?: string;
@@ -436,6 +438,7 @@ interface UnifiedOcrOptions {
436
438
  probeConcurrency?: number;
437
439
  logger?: Logger;
438
440
  runId?: string;
441
+ concurrencyPerKey?: number;
439
442
  }
440
443
  interface UnifiedOcrResult {
441
444
  outputPath: string;
package/dist/index.js CHANGED
@@ -3035,7 +3035,7 @@ import JSZip2 from "jszip";
3035
3035
  import { DOMParser } from "@xmldom/xmldom";
3036
3036
 
3037
3037
  // src/utils.ts
3038
- var VERSION = true ? "2.4.19" : "0.0.0-dev";
3038
+ var VERSION = true ? "2.5.0" : "0.0.0-dev";
3039
3039
  function toArrayBuffer(buf) {
3040
3040
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
3041
3041
  return buf.buffer;
@@ -11231,6 +11231,66 @@ import { spawn as spawn2 } from "child_process";
11231
11231
  import { performance } from "perf_hooks";
11232
11232
  import libre from "libreoffice-convert";
11233
11233
  init_logger();
11234
+
11235
+ // src/pipeline/bounded-queue.ts
11236
+ var QUEUE_DONE = /* @__PURE__ */ Symbol("QUEUE_DONE");
11237
+ var BoundedQueue = class {
11238
+ buffer = [];
11239
+ capacity;
11240
+ closed = false;
11241
+ producerWaiters = [];
11242
+ consumerWaiters = [];
11243
+ constructor(capacity) {
11244
+ if (capacity < 1) throw new RangeError("BoundedQueue capacity must be >= 1");
11245
+ this.capacity = capacity;
11246
+ }
11247
+ async enqueue(item) {
11248
+ if (this.closed) throw new Error("BoundedQueue: cannot enqueue after close()");
11249
+ if (this.consumerWaiters.length > 0) {
11250
+ const resolve4 = this.consumerWaiters.shift();
11251
+ resolve4(item);
11252
+ return;
11253
+ }
11254
+ while (this.buffer.length >= this.capacity) {
11255
+ await new Promise((resolve4) => this.producerWaiters.push(resolve4));
11256
+ if (this.closed) throw new Error("BoundedQueue: closed while waiting to enqueue");
11257
+ }
11258
+ this.buffer.push(item);
11259
+ }
11260
+ async dequeue() {
11261
+ if (this.buffer.length > 0) {
11262
+ const item = this.buffer.shift();
11263
+ this._wakeProducer();
11264
+ return item;
11265
+ }
11266
+ if (this.closed) return QUEUE_DONE;
11267
+ return new Promise((resolve4) => {
11268
+ this.consumerWaiters.push(resolve4);
11269
+ });
11270
+ }
11271
+ close() {
11272
+ if (this.closed) return;
11273
+ this.closed = true;
11274
+ for (const resolve4 of this.consumerWaiters) {
11275
+ resolve4(QUEUE_DONE);
11276
+ }
11277
+ this.consumerWaiters = [];
11278
+ for (const wake of this.producerWaiters) {
11279
+ wake();
11280
+ }
11281
+ this.producerWaiters = [];
11282
+ }
11283
+ get size() {
11284
+ return this.buffer.length;
11285
+ }
11286
+ _wakeProducer() {
11287
+ if (this.producerWaiters.length > 0) {
11288
+ this.producerWaiters.shift()();
11289
+ }
11290
+ }
11291
+ };
11292
+
11293
+ // src/pipeline/unified-ocr.ts
11234
11294
  var libreConvert = libre.convert;
11235
11295
  var UnifiedOcrError = class extends Error {
11236
11296
  code;
@@ -11262,11 +11322,11 @@ var DEFAULT_MODEL_MAX_TOKENS = {
11262
11322
  };
11263
11323
  var DEFAULT_STAGE_WEIGHTS = {
11264
11324
  convert: 15,
11265
- render: 20,
11325
+ render: 15,
11266
11326
  probe: 5,
11267
- ocr: 45,
11327
+ ocr: 55,
11268
11328
  proofread: 0,
11269
- merge: 5
11329
+ merge: 10
11270
11330
  };
11271
11331
  var OCR_PROMPT2 = [
11272
11332
  "\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD45C\uB97C \uCD94\uCD9C\uD558\uC5EC Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uACE0, OCR \uC624\uC778\uC2DD \uC624\uB958\uB97C \uC989\uC2DC \uAD50\uC815\uD558\uC5EC \uCD5C\uC885 \uACB0\uACFC\uBB3C\uC744 \uCD9C\uB825\uD558\uC138\uC694.",
@@ -11355,39 +11415,19 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11355
11415
  logStage("info", "convert", "done", "PDF \uBCC0\uD658 \uC644\uB8CC", { elapsedMs: timingsMs.convert });
11356
11416
  const renderStart = performance.now();
11357
11417
  currentStage = "render";
11418
+ const totalPages = await getPdfPageCount(workingPdfPath).catch(() => 0);
11419
+ if (totalPages === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uD398\uC774\uC9C0 \uC218\uB97C \uD655\uC778\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.");
11358
11420
  markStageStart("render", "PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC911");
11359
- logStage("info", "render", "start", "PDF \uD398\uC774\uC9C0 \uB80C\uB354\uB9C1 \uC2DC\uC791", { pdf: workingPdfPath, dpi });
11360
- const renderWithProgress = await renderPdfToPngWithProgress(
11361
- workingPdfPath,
11362
- join4(imagesDir, "page"),
11363
- dpi,
11364
- (current, total) => {
11365
- markStageProgress(
11366
- "render",
11367
- Math.round(current / total * 100),
11368
- current,
11369
- total,
11370
- `\uD398\uC774\uC9C0 ${current}/${total} \uB80C\uB354\uB9C1`
11371
- );
11372
- }
11373
- );
11374
- const images = await listPageImages(imagesDir);
11375
- if (images.length === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC2E4\uD328: \uACB0\uACFC \uC774\uBBF8\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.");
11376
- if (!renderWithProgress.emittedPerPageProgress) {
11377
- markStageProgress("render", 100, images.length, images.length, `\uD398\uC774\uC9C0 ${images.length}\uC7A5 \uC0DD\uC131`);
11378
- }
11379
- timingsMs.render = elapsedMs(renderStart);
11380
- markStageDone("render", "\uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC");
11381
- logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", {
11382
- pages: images.length,
11383
- elapsedMs: timingsMs.render,
11384
- pageCountSource: renderWithProgress.pageCountSource
11385
- });
11421
+ logStage("info", "render", "start", "PDF \uD398\uC774\uC9C0 \uB80C\uB354\uB9C1 \uC2DC\uC791", { pdf: workingPdfPath, dpi, totalPages });
11422
+ await runCommand("pdftoppm", ["-png", "-r", String(dpi), "-f", "1", "-l", "1", workingPdfPath, join4(imagesDir, "page")]);
11423
+ const firstFiles = (await readdir(imagesDir)).filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
11424
+ if (firstFiles.length === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uCCAB \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC2E4\uD328");
11425
+ const probeImage = join4(imagesDir, firstFiles[0]);
11426
+ markStageProgress("render", Math.round(1 / totalPages * 100), 1, totalPages, `\uD398\uC774\uC9C0 1/${totalPages} \uB80C\uB354\uB9C1`);
11386
11427
  const probeStart = performance.now();
11387
11428
  currentStage = "probe";
11388
11429
  markStageStart("probe", "\uBAA8\uB378 \uC18D\uB3C4 \uD504\uB85C\uBE0C \uC218\uD589 \uC911");
11389
11430
  logStage("info", "probe", "start", "\uBAA8\uB378 \uC18D\uB3C4 \uD504\uB85C\uBE0C \uC2DC\uC791", { models, probeConcurrency });
11390
- const probeImage = await pickRepresentativeImage(images);
11391
11431
  let probeDone = 0;
11392
11432
  const probeRuns = startParallelProbeRuns({
11393
11433
  models,
@@ -11414,33 +11454,74 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11414
11454
  await updateModelCache(modelCachePath, results);
11415
11455
  return results;
11416
11456
  });
11457
+ const concurrencyPerKey = Math.max(1, options.concurrencyPerKey ?? 2);
11458
+ const keyCount = keyPool.snapshot().length;
11459
+ const workerCount = Math.max(1, keyCount * concurrencyPerKey);
11460
+ const queueCapacity = workerCount * 2;
11461
+ const queue = new BoundedQueue(queueCapacity);
11417
11462
  const ocrStart = performance.now();
11418
11463
  currentStage = "ocr";
11419
- markStageStart("ocr", `OCR \uC9C4\uD589 \uC911 (${selectedModel})`);
11420
- logStage("info", "ocr", "start", "\uD398\uC774\uC9C0 OCR \uC2DC\uC791", { selectedModel, pageCount: images.length });
11421
- const rawPagePaths = [];
11422
- for (let i = 0; i < images.length; i++) {
11423
- const imagePath = images[i];
11424
- const markdown = await ocrImageWithFallback({
11425
- imagePath,
11426
- prompt: OCR_PROMPT2,
11427
- models: fallbackModelOrder,
11428
- modelMaxTokens,
11429
- baseUrl,
11430
- keyPool,
11431
- timeoutMs,
11432
- maxRetriesPerPage,
11464
+ markStageStart("ocr", `OCR \uC9C4\uD589 \uC911 (\uC6CC\uCEE4 ${workerCount}\uAC1C)`);
11465
+ logStage("info", "ocr", "start", "\uD398\uC774\uC9C0 OCR \uC2DC\uC791", { workerCount, keyCount, pageCount: totalPages });
11466
+ let renderDone = 1;
11467
+ const renderProducer = (async () => {
11468
+ try {
11469
+ await queue.enqueue({ pageNumber: 1, imagePath: probeImage });
11470
+ if (totalPages > 1) {
11471
+ for await (const item of renderPdfToPngStream(workingPdfPath, join4(imagesDir, "page"), dpi, totalPages, 2)) {
11472
+ await queue.enqueue(item);
11473
+ renderDone++;
11474
+ markStageProgress("render", Math.round(renderDone / totalPages * 100), renderDone, totalPages, `\uD398\uC774\uC9C0 ${renderDone}/${totalPages} \uB80C\uB354\uB9C1`);
11475
+ logStage("debug", "render", "progress", "\uD398\uC774\uC9C0 \uB80C\uB354 \uC644\uB8CC", { page: item.pageNumber });
11476
+ }
11477
+ }
11478
+ } finally {
11479
+ queue.close();
11480
+ timingsMs.render = elapsedMs(renderStart);
11481
+ markStageDone("render", "\uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC");
11482
+ logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", { pages: renderDone, elapsedMs: timingsMs.render });
11483
+ }
11484
+ })();
11485
+ const [, pageResultsMap] = await Promise.all([
11486
+ renderProducer,
11487
+ ocrWorkerPool({
11488
+ queue,
11489
+ workerCount,
11490
+ totalPages,
11491
+ ocrInput: {
11492
+ prompt: OCR_PROMPT2,
11493
+ models: fallbackModelOrder,
11494
+ modelMaxTokens,
11495
+ baseUrl,
11496
+ keyPool,
11497
+ timeoutMs,
11498
+ maxRetriesPerPage,
11499
+ logger
11500
+ },
11501
+ onPageDone: (pageNumber, completedCount, model) => {
11502
+ markStageProgress(
11503
+ "ocr",
11504
+ Math.round(completedCount / totalPages * 100),
11505
+ completedCount,
11506
+ totalPages,
11507
+ `OCR ${completedCount}/${totalPages}`,
11508
+ model || void 0
11509
+ );
11510
+ logStage("debug", "ocr", "progress", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { page: pageNumber, total: totalPages, model });
11511
+ },
11433
11512
  logger
11434
- });
11435
- const pagePath = join4(rawDir, `page_${String(i + 1).padStart(4, "0")}.md`);
11436
- await writeFile(pagePath, markdown, "utf-8");
11437
- rawPagePaths.push(pagePath);
11438
- markStageProgress("ocr", Math.round((i + 1) / images.length * 100), i + 1, images.length, `OCR ${i + 1}/${images.length}`, selectedModel);
11439
- logStage("debug", "ocr", "progress", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { page: i + 1, total: images.length });
11440
- }
11513
+ })
11514
+ ]);
11441
11515
  timingsMs.ocr = elapsedMs(ocrStart);
11442
11516
  markStageDone("ocr", "OCR \uC644\uB8CC");
11443
11517
  logStage("info", "ocr", "done", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { elapsedMs: timingsMs.ocr });
11518
+ const sortedEntries = Array.from(pageResultsMap.entries()).sort((a, b) => a[0] - b[0]);
11519
+ const rawPagePaths = [];
11520
+ for (const [pageNum, markdown] of sortedEntries) {
11521
+ const pagePath = join4(rawDir, `page_${String(pageNum).padStart(4, "0")}.md`);
11522
+ await writeFile(pagePath, markdown, "utf-8");
11523
+ rawPagePaths.push(pagePath);
11524
+ }
11444
11525
  const mergeStart = performance.now();
11445
11526
  currentStage = "merge";
11446
11527
  markStageStart("merge", "\uCD5C\uC885 Markdown \uBCD1\uD569 \uC911");
@@ -11457,7 +11538,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11457
11538
  selectedModel,
11458
11539
  probeImage,
11459
11540
  probeResults: await probeResultsPromise,
11460
- pageCount: images.length,
11541
+ pageCount: totalPages,
11461
11542
  keyHealth: keyPool.snapshot(),
11462
11543
  timingsMs,
11463
11544
  modelCachePath
@@ -11535,13 +11616,6 @@ async function convertWithLibreOffice(buffer, ext) {
11535
11616
  });
11536
11617
  });
11537
11618
  }
11538
- async function renderPdfToPng(pdfPath, prefixPath, dpi) {
11539
- try {
11540
- await runCommand("pdftoppm", ["-png", "-r", String(dpi), pdfPath, prefixPath]);
11541
- } catch (err) {
11542
- throw new UnifiedOcrError("RENDER_FAILED", "render", err instanceof Error ? err.message : String(err));
11543
- }
11544
- }
11545
11619
  async function getPdfPageCount(pdfPath) {
11546
11620
  const stdout = await runCommandWithStdout("pdfinfo", [pdfPath]);
11547
11621
  const m = stdout.match(/^\s*Pages:\s*(\d+)\s*$/mi);
@@ -11554,36 +11628,33 @@ async function getPdfPageCount(pdfPath) {
11554
11628
  }
11555
11629
  return n;
11556
11630
  }
11557
- async function renderPdfToPngWithProgress(pdfPath, prefixPath, dpi, onPageDone) {
11558
- let totalPages = 0;
11559
- try {
11560
- totalPages = await getPdfPageCount(pdfPath);
11561
- } catch {
11562
- totalPages = 0;
11563
- }
11564
- if (totalPages > 0) {
11631
+ async function* renderPdfToPngStream(pdfPath, prefixPath, dpi, totalPages, startPage = 1) {
11632
+ const imagesDir = dirname3(prefixPath);
11633
+ for (let page = startPage; page <= totalPages; page++) {
11565
11634
  try {
11566
- for (let page = 1; page <= totalPages; page++) {
11567
- await runCommand("pdftoppm", [
11568
- "-png",
11569
- "-r",
11570
- String(dpi),
11571
- "-f",
11572
- String(page),
11573
- "-l",
11574
- String(page),
11575
- pdfPath,
11576
- prefixPath
11577
- ]);
11578
- onPageDone(page, totalPages);
11579
- }
11580
- return { emittedPerPageProgress: true, pageCountSource: "pdfinfo" };
11635
+ await runCommand("pdftoppm", [
11636
+ "-png",
11637
+ "-r",
11638
+ String(dpi),
11639
+ "-f",
11640
+ String(page),
11641
+ "-l",
11642
+ String(page),
11643
+ pdfPath,
11644
+ prefixPath
11645
+ ]);
11646
+ const files = await readdir(imagesDir);
11647
+ const pageFiles = files.filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
11648
+ const imagePath = join4(imagesDir, pageFiles[pageFiles.length - 1]);
11649
+ yield { pageNumber: page, imagePath };
11581
11650
  } catch (err) {
11582
- throw new UnifiedOcrError("RENDER_FAILED", "render", err instanceof Error ? err.message : String(err));
11651
+ yield {
11652
+ pageNumber: page,
11653
+ imagePath: null,
11654
+ error: err instanceof Error ? err : new Error(String(err))
11655
+ };
11583
11656
  }
11584
11657
  }
11585
- await renderPdfToPng(pdfPath, prefixPath, dpi);
11586
- return { emittedPerPageProgress: false, pageCountSource: "fallback" };
11587
11658
  }
11588
11659
  async function runCommand(cmd, args) {
11589
11660
  await new Promise((resolvePromise, reject) => {
@@ -11624,26 +11695,11 @@ async function assertSofficeAvailable() {
11624
11695
  throw new UnifiedOcrError("SOFFICE_NOT_FOUND", "convert", "soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4. LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694.");
11625
11696
  }
11626
11697
  }
11627
- async function listPageImages(imagesDir) {
11628
- const files = await readdir(imagesDir);
11629
- return files.filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b)).map((f) => join4(imagesDir, f));
11630
- }
11631
11698
  function naturalPageSort(a, b) {
11632
11699
  const na = Number((a.match(/\d+/g) || ["0"]).at(-1) || 0);
11633
11700
  const nb = Number((b.match(/\d+/g) || ["0"]).at(-1) || 0);
11634
11701
  return na - nb;
11635
11702
  }
11636
- async function pickRepresentativeImage(images) {
11637
- const sample = images.slice(0, Math.min(images.length, 8));
11638
- const weighted = [];
11639
- for (const p of sample) {
11640
- const st = await stat(p);
11641
- if (st.size > 8 * 1024) weighted.push({ path: p, size: st.size });
11642
- }
11643
- const use = weighted.length > 0 ? weighted : await Promise.all(sample.map(async (p) => ({ path: p, size: (await stat(p)).size })));
11644
- use.sort((a, b) => a.size - b.size);
11645
- return use[Math.floor(use.length / 2)].path;
11646
- }
11647
11703
  async function mapWithConcurrency(items, concurrency, mapper) {
11648
11704
  const results = new Array(items.length);
11649
11705
  let nextIndex = 0;
@@ -11745,11 +11801,43 @@ async function updateModelCache(path, probes) {
11745
11801
  current.updatedAt = (/* @__PURE__ */ new Date()).toISOString();
11746
11802
  await writeFile(path, JSON.stringify(current, null, 2), "utf-8");
11747
11803
  }
11804
+ async function ocrWorkerPool(input) {
11805
+ const { queue, workerCount, ocrInput, onPageDone } = input;
11806
+ const results = /* @__PURE__ */ new Map();
11807
+ let completedCount = 0;
11808
+ async function worker() {
11809
+ while (true) {
11810
+ const item = await queue.dequeue();
11811
+ if (item === QUEUE_DONE) break;
11812
+ const { pageNumber, imagePath, error } = item;
11813
+ if (imagePath === null) {
11814
+ input.logger?.log({
11815
+ level: "warn",
11816
+ stage: "ocr",
11817
+ event: "message",
11818
+ message: `\uD398\uC774\uC9C0 ${pageNumber} \uB80C\uB354 \uC2E4\uD328 \u2014 \uBE48 \uD398\uC774\uC9C0\uB85C \uCC98\uB9AC`,
11819
+ meta: { error: String(error) }
11820
+ });
11821
+ results.set(pageNumber, "");
11822
+ completedCount++;
11823
+ onPageDone(pageNumber, completedCount, "");
11824
+ continue;
11825
+ }
11826
+ const { markdown, model } = await ocrImageWithFallback({ ...ocrInput, imagePath });
11827
+ results.set(pageNumber, markdown);
11828
+ completedCount++;
11829
+ onPageDone(pageNumber, completedCount, model);
11830
+ }
11831
+ }
11832
+ const workers = Array.from({ length: workerCount }, () => worker());
11833
+ await Promise.all(workers);
11834
+ return results;
11835
+ }
11748
11836
  async function ocrImageWithFallback(input) {
11749
11837
  let lastErr = "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958";
11750
11838
  for (const model of input.models) {
11751
11839
  try {
11752
- return await ocrImageViaNim({
11840
+ const markdown = await ocrImageViaNim({
11753
11841
  imagePath: input.imagePath,
11754
11842
  prompt: input.prompt,
11755
11843
  model,
@@ -11761,6 +11849,7 @@ async function ocrImageWithFallback(input) {
11761
11849
  logger: input.logger,
11762
11850
  stage: "ocr"
11763
11851
  });
11852
+ return { markdown, model };
11764
11853
  } catch (err) {
11765
11854
  lastErr = err instanceof Error ? err.message : String(err);
11766
11855
  }