@clazic/kordoc 2.4.15 → 2.4.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -421,6 +421,7 @@ interface UnifiedOcrProgressEvent {
421
421
  total?: number;
422
422
  code?: UnifiedOcrErrorCode;
423
423
  message?: string;
424
+ model?: string;
424
425
  }
425
426
  interface UnifiedOcrOptions {
426
427
  workspaceDir?: string;
package/dist/index.d.ts CHANGED
@@ -421,6 +421,7 @@ interface UnifiedOcrProgressEvent {
421
421
  total?: number;
422
422
  code?: UnifiedOcrErrorCode;
423
423
  message?: string;
424
+ model?: string;
424
425
  }
425
426
  interface UnifiedOcrOptions {
426
427
  workspaceDir?: string;
package/dist/index.js CHANGED
@@ -3115,7 +3115,7 @@ import JSZip2 from "jszip";
3115
3115
  import { DOMParser } from "@xmldom/xmldom";
3116
3116
 
3117
3117
  // src/utils.ts
3118
- var VERSION = true ? "2.4.14" : "0.0.0-dev";
3118
+ var VERSION = true ? "2.4.17" : "0.0.0-dev";
3119
3119
  function toArrayBuffer(buf) {
3120
3120
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
3121
3121
  return buf.buffer;
@@ -11349,14 +11349,14 @@ var DEFAULT_STAGE_WEIGHTS = {
11349
11349
  proofread: 10,
11350
11350
  merge: 5
11351
11351
  };
11352
- var OCR_PROMPT2 = "\uC774 \uC774\uBBF8\uC9C0 1\uC7A5\uC758 \uD14D\uC2A4\uD2B8\uC640 \uD45C\uB97C \uC694\uC57D \uC5C6\uC774 \uADF8\uB300\uB85C Markdown\uC73C\uB85C \uCD94\uCD9C\uD558\uC138\uC694. \uC808\uB300\uB85C \uB0B4\uC6A9\uC744 \uCD94\uCE21\uD558\uAC70\uB098 \uBC14\uAFB8\uC9C0 \uB9C8\uC138\uC694.";
11352
+ var OCR_PROMPT2 = "Extract all text and tables from this image exactly as-is into Markdown. Do not summarize, infer, or alter the content in any way.";
11353
11353
  var PROOFREAD_PROMPT = [
11354
- "\uC544\uB798 Markdown\uC744 \uBE44\uD30C\uAD34 \uAD50\uC815\uB9CC \uC218\uD589\uD558\uC138\uC694.",
11355
- "\uADDC\uCE59:",
11356
- "- \uC0AC\uC2E4 \uCD94\uAC00/\uC0AD\uC81C/\uCD94\uCE21 \uAE08\uC9C0",
11357
- "- \uC22B\uC790, \uB2E8\uC704, \uACE0\uC720\uBA85\uC0AC \uBCC0\uACBD \uAE08\uC9C0",
11358
- "- \uC624\uD0C8\uC790, \uB744\uC5B4\uC4F0\uAE30, \uC904\uBC14\uAFC8, Markdown \uAD6C\uC870\uB9CC \uAD50\uC815",
11359
- "- \uACB0\uACFC\uB294 Markdown \uBCF8\uBB38\uB9CC \uCD9C\uB825"
11354
+ "Perform non-destructive proofreading only on the Markdown below.",
11355
+ "Rules:",
11356
+ "- Do not add, remove, or infer any facts",
11357
+ "- Do not change numbers, units, or proper nouns",
11358
+ "- Correct only typos, spacing, line breaks, and Markdown structure",
11359
+ "- Output the corrected Markdown body only"
11360
11360
  ].join("\n");
11361
11361
  function elapsedMs(startAt) {
11362
11362
  return Math.round(performance.now() - startAt);
@@ -11391,7 +11391,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11391
11391
  await mkdir(diffDir, { recursive: true });
11392
11392
  const timingsMs = {};
11393
11393
  const markStageStart = (stage, message) => emitProgress(options.onEvent, stage, 0, stageWeights, { message, type: "stage_start" });
11394
- const markStageProgress = (stage, stagePercent, current, total, message) => emitProgress(options.onEvent, stage, stagePercent, stageWeights, { type: "stage_progress", current, total, message });
11394
+ const markStageProgress = (stage, stagePercent, current, total, message, model) => emitProgress(options.onEvent, stage, stagePercent, stageWeights, { type: "stage_progress", current, total, message, model });
11395
11395
  const markStageDone = (stage, message) => emitProgress(options.onEvent, stage, 100, stageWeights, { message, type: "stage_done" });
11396
11396
  let currentStage = "convert";
11397
11397
  const logStage = (level, stage, event, message, meta) => {
@@ -11418,13 +11418,32 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11418
11418
  currentStage = "render";
11419
11419
  markStageStart("render", "PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC911");
11420
11420
  logStage("info", "render", "start", "PDF \uD398\uC774\uC9C0 \uB80C\uB354\uB9C1 \uC2DC\uC791", { pdf: workingPdfPath, dpi });
11421
- await renderPdfToPng(workingPdfPath, join4(imagesDir, "page"), dpi);
11421
+ const renderWithProgress = await renderPdfToPngWithProgress(
11422
+ workingPdfPath,
11423
+ join4(imagesDir, "page"),
11424
+ dpi,
11425
+ (current, total) => {
11426
+ markStageProgress(
11427
+ "render",
11428
+ Math.round(current / total * 100),
11429
+ current,
11430
+ total,
11431
+ `\uD398\uC774\uC9C0 ${current}/${total} \uB80C\uB354\uB9C1`
11432
+ );
11433
+ }
11434
+ );
11422
11435
  const images = await listPageImages(imagesDir);
11423
11436
  if (images.length === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC2E4\uD328: \uACB0\uACFC \uC774\uBBF8\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.");
11424
- markStageProgress("render", 100, images.length, images.length, `\uD398\uC774\uC9C0 ${images.length}\uC7A5 \uC0DD\uC131`);
11437
+ if (!renderWithProgress.emittedPerPageProgress) {
11438
+ markStageProgress("render", 100, images.length, images.length, `\uD398\uC774\uC9C0 ${images.length}\uC7A5 \uC0DD\uC131`);
11439
+ }
11425
11440
  timingsMs.render = elapsedMs(renderStart);
11426
11441
  markStageDone("render", "\uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC");
11427
- logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", { pages: images.length, elapsedMs: timingsMs.render });
11442
+ logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", {
11443
+ pages: images.length,
11444
+ elapsedMs: timingsMs.render,
11445
+ pageCountSource: renderWithProgress.pageCountSource
11446
+ });
11428
11447
  const probeStart = performance.now();
11429
11448
  currentStage = "probe";
11430
11449
  markStageStart("probe", "\uBAA8\uB378 \uC18D\uB3C4 \uD504\uB85C\uBE0C \uC218\uD589 \uC911");
@@ -11477,7 +11496,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11477
11496
  const pagePath = join4(rawDir, `page_${String(i + 1).padStart(4, "0")}.md`);
11478
11497
  await writeFile(pagePath, markdown, "utf-8");
11479
11498
  rawPagePaths.push(pagePath);
11480
- markStageProgress("ocr", Math.round((i + 1) / images.length * 100), i + 1, images.length, `OCR ${i + 1}/${images.length}`);
11499
+ markStageProgress("ocr", Math.round((i + 1) / images.length * 100), i + 1, images.length, `OCR ${i + 1}/${images.length}`, selectedModel);
11481
11500
  logStage("debug", "ocr", "progress", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { page: i + 1, total: images.length });
11482
11501
  }
11483
11502
  timingsMs.ocr = elapsedMs(ocrStart);
@@ -11601,7 +11620,8 @@ function emitProgress(cb, stage, stagePercent, weights, extra) {
11601
11620
  current: extra.current,
11602
11621
  total: extra.total,
11603
11622
  code: extra.code,
11604
- message: extra.message
11623
+ message: extra.message,
11624
+ model: extra.model
11605
11625
  });
11606
11626
  }
11607
11627
  async function convertWithLibreOffice(buffer, ext) {
@@ -11622,6 +11642,49 @@ async function renderPdfToPng(pdfPath, prefixPath, dpi) {
11622
11642
  throw new UnifiedOcrError("RENDER_FAILED", "render", err instanceof Error ? err.message : String(err));
11623
11643
  }
11624
11644
  }
11645
/**
 * Determine how many pages a PDF has by running `pdfinfo` on it and parsing
 * the `Pages:` line of its output.
 *
 * @param {string} pdfPath - Path passed to `pdfinfo` as its only argument.
 * @returns {Promise<number>} The page count, always a positive safe integer.
 * @throws {Error} If `runCommandWithStdout` rejects, if the output contains no
 *   `Pages:` line, or if the reported count is not a positive safe integer.
 */
async function getPdfPageCount(pdfPath) {
  const stdout = await runCommandWithStdout("pdfinfo", [pdfPath]);
  // `m` lets ^/$ anchor on the individual `Pages:` line; `i` tolerates case differences.
  const match = /^\s*Pages:\s*(\d+)\s*$/im.exec(stdout);
  if (match === null) {
    throw new Error("pdfinfo \uCD9C\uB825\uC5D0\uC11C \uD398\uC774\uC9C0 \uC218\uB97C \uCC3E\uC9C0 \uBABB\uD588\uC2B5\uB2C8\uB2E4.");
  }
  const pageCount = Number(match[1]);
  // isSafeInteger rather than isFinite: a digit string above 2^53 would be
  // rounded by Number(), and a caller counting `page++` up to such a bound
  // could never reach it.
  if (!Number.isSafeInteger(pageCount) || pageCount <= 0) {
    throw new Error(`\uC798\uBABB\uB41C \uD398\uC774\uC9C0 \uC218: ${match[1]}`);
  }
  return pageCount;
}
11657
/**
 * Render a PDF to PNG files, reporting progress after each page when the page
 * count can be determined up front.
 *
 * When `getPdfPageCount` succeeds, `pdftoppm` is invoked once per page
 * (`-f N -l N`) and `onPageDone` is called after each invocation. When the
 * page count cannot be obtained, the whole document is rendered in one go via
 * `renderPdfToPng` and no per-page progress is emitted.
 *
 * @param {string} pdfPath - PDF to render.
 * @param {string} prefixPath - Output prefix handed to `pdftoppm` / `renderPdfToPng`.
 * @param {number} dpi - Resolution, passed as the `-r` value.
 * @param {(current: number, total: number) => void} [onPageDone] - Optional
 *   progress callback; skipped when not a function.
 * @returns {Promise<{emittedPerPageProgress: boolean, pageCountSource: "pdfinfo" | "fallback"}>}
 *   Which path was taken.
 * @throws {UnifiedOcrError} `RENDER_FAILED` when anything in the per-page loop
 *   throws; errors from the `renderPdfToPng` fallback propagate as-is.
 */
async function renderPdfToPngWithProgress(pdfPath, prefixPath, dpi, onPageDone) {
  let totalPages = 0;
  try {
    totalPages = await getPdfPageCount(pdfPath);
  } catch {
    // Deliberate best-effort: any failure to obtain a page count just means
    // we lose per-page progress and use the single-shot renderer below.
    totalPages = 0;
  }

  if (totalPages <= 0) {
    await renderPdfToPng(pdfPath, prefixPath, dpi);
    return { emittedPerPageProgress: false, pageCountSource: "fallback" };
  }

  // Without this guard a missing callback would throw a TypeError after the
  // first page and be reported as a render failure.
  const notify = typeof onPageDone === "function" ? onPageDone : null;
  try {
    for (let page = 1; page <= totalPages; page++) {
      const pageArg = String(page);
      await runCommand("pdftoppm", [
        "-png",
        "-r",
        String(dpi),
        "-f",
        pageArg,
        "-l",
        pageArg,
        pdfPath,
        prefixPath
      ]);
      if (notify !== null) {
        notify(page, totalPages);
      }
    }
  } catch (err) {
    throw new UnifiedOcrError("RENDER_FAILED", "render", err instanceof Error ? err.message : String(err));
  }
  return { emittedPerPageProgress: true, pageCountSource: "pdfinfo" };
}
11625
11688
  async function runCommand(cmd, args) {
11626
11689
  await new Promise((resolvePromise, reject) => {
11627
11690
  const child = spawn2(cmd, args, { stdio: "pipe" });
@@ -11636,6 +11699,24 @@ async function runCommand(cmd, args) {
11636
11699
  });
11637
11700
  });
11638
11701
  }
11702
/**
 * Spawn a command and resolve with everything it wrote to stdout.
 *
 * @param {string} cmd - Executable to spawn.
 * @param {string[]} args - Arguments passed to the executable.
 * @returns {Promise<string>} Collected stdout when the process closes with exit code 0.
 * @throws {Error} Rejects with the spawn error (the child's "error" event), or
 *   with an Error whose message carries the exit code and trimmed stderr when
 *   the exit code is anything other than 0.
 */
async function runCommandWithStdout(cmd, args) {
  return new Promise((resolvePromise, reject) => {
    const child = spawn2(cmd, args, { stdio: "pipe" });
    let stdout = "";
    let stderr = "";
    // Decode at the stream level so a multi-byte UTF-8 character that straddles
    // two chunks is reassembled instead of turning into U+FFFD.
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });
    child.on("error", reject);
    child.on("close", (code) => {
      if (code === 0) {
        resolvePromise(stdout);
        return;
      }
      reject(new Error(`${cmd} \uC2E4\uD328 (code=${code}): ${stderr.trim()}`));
    });
  });
}
11639
11720
  async function assertSofficeAvailable() {
11640
11721
  try {
11641
11722
  await runCommand("soffice", ["--version"]);