@clazic/kordoc 2.4.16 → 2.4.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -421,6 +421,7 @@ interface UnifiedOcrProgressEvent {
421
421
  total?: number;
422
422
  code?: UnifiedOcrErrorCode;
423
423
  message?: string;
424
+ model?: string;
424
425
  }
425
426
  interface UnifiedOcrOptions {
426
427
  workspaceDir?: string;
package/dist/index.d.ts CHANGED
@@ -421,6 +421,7 @@ interface UnifiedOcrProgressEvent {
421
421
  total?: number;
422
422
  code?: UnifiedOcrErrorCode;
423
423
  message?: string;
424
+ model?: string;
424
425
  }
425
426
  interface UnifiedOcrOptions {
426
427
  workspaceDir?: string;
package/dist/index.js CHANGED
@@ -3115,7 +3115,7 @@ import JSZip2 from "jszip";
3115
3115
  import { DOMParser } from "@xmldom/xmldom";
3116
3116
 
3117
3117
  // src/utils.ts
3118
- var VERSION = true ? "2.4.15" : "0.0.0-dev";
3118
+ var VERSION = true ? "2.4.17" : "0.0.0-dev";
3119
3119
  function toArrayBuffer(buf) {
3120
3120
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
3121
3121
  return buf.buffer;
@@ -11346,17 +11346,39 @@ var DEFAULT_STAGE_WEIGHTS = {
11346
11346
  render: 20,
11347
11347
  probe: 5,
11348
11348
  ocr: 45,
11349
- proofread: 10,
11349
+ proofread: 0,
11350
11350
  merge: 5
11351
11351
  };
11352
- var OCR_PROMPT2 = "\uC774 \uC774\uBBF8\uC9C0 1\uC7A5\uC758 \uD14D\uC2A4\uD2B8\uC640 \uD45C\uB97C \uC694\uC57D \uC5C6\uC774 \uADF8\uB300\uB85C Markdown\uC73C\uB85C \uCD94\uCD9C\uD558\uC138\uC694. \uC808\uB300\uB85C \uB0B4\uC6A9\uC744 \uCD94\uCE21\uD558\uAC70\uB098 \uBC14\uAFB8\uC9C0 \uB9C8\uC138\uC694.";
11353
- var PROOFREAD_PROMPT = [
11354
- "\uC544\uB798 Markdown\uC744 \uBE44\uD30C\uAD34 \uAD50\uC815\uB9CC \uC218\uD589\uD558\uC138\uC694.",
11355
- "\uADDC\uCE59:",
11356
- "- \uC0AC\uC2E4 \uCD94\uAC00/\uC0AD\uC81C/\uCD94\uCE21 \uAE08\uC9C0",
11357
- "- \uC22B\uC790, \uB2E8\uC704, \uACE0\uC720\uBA85\uC0AC \uBCC0\uACBD \uAE08\uC9C0",
11358
- "- \uC624\uD0C8\uC790, \uB744\uC5B4\uC4F0\uAE30, \uC904\uBC14\uAFC8, Markdown \uAD6C\uC870\uB9CC \uAD50\uC815",
11359
- "- \uACB0\uACFC\uB294 Markdown \uBCF8\uBB38\uB9CC \uCD9C\uB825"
11352
+ var OCR_PROMPT2 = [
11353
+ "\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD45C\uB97C \uCD94\uCD9C\uD558\uC5EC Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uACE0, OCR \uC624\uC778\uC2DD \uC624\uB958\uB97C \uC989\uC2DC \uAD50\uC815\uD558\uC5EC \uCD5C\uC885 \uACB0\uACFC\uBB3C\uC744 \uCD9C\uB825\uD558\uC138\uC694.",
11354
+ "",
11355
+ "\uCD94\uCD9C \uADDC\uCE59:",
11356
+ "- \uD14D\uC2A4\uD2B8, \uD45C, \uC81C\uBAA9, \uB9AC\uC2A4\uD2B8\uB97C \uC6D0\uBB38 \uAD6C\uC870 \uADF8\uB300\uB85C Markdown\uC73C\uB85C \uBCC0\uD658",
11357
+ "- \uD45C\uB294 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)",
11358
+ "- \uD5E4\uB529\uC740 \uC2DC\uAC01\uC801 \uD06C\uAE30\uC5D0 \uB530\uB77C # ~ ###### \uC0AC\uC6A9",
11359
+ "- \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9",
11360
+ "- \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC",
11361
+ "- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0",
11362
+ "",
11363
+ "\uC808\uB300 \uAE08\uC9C0 \uC0AC\uD56D:",
11364
+ "- \uBB38\uC7A5\xB7\uB2E8\uB77D\xB7\uD56D\uBAA9\uC744 \uCD94\uAC00\uD558\uAC70\uB098 \uC0AD\uC81C\uD558\uC9C0 \uB9D0 \uAC83",
11365
+ "- \uC22B\uC790, \uD37C\uC13C\uD2B8, \uB0A0\uC9DC, \uB2E8\uC704, \uAE08\uC561\uC744 \uC808\uB300 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83",
11366
+ "- \uACE0\uC720\uBA85\uC0AC, \uAE30\uAD00\uBA85, \uBC95\uB839\uBA85, \uC9C0\uBA85\uC744 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83",
11367
+ "- \uD45C\uC758 \uC81C\uBAA9\uC744 \uBCC0\uACBD \uB610\uB294 \uC0AD\uC81C\uD558\uC9C0 \uB9D0 \uAC83",
11368
+ "- \uD45C\uC758 \uD589\xB7\uC5F4 \uC218, \uC140 \uB0B4\uC6A9, \uD5E4\uB354\uB97C \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83",
11369
+ "- \uC81C\uBAA9 \uC218\uC900(#, ##, ### \uB4F1)\uC744 \uC784\uC758\uB85C \uBC14\uAFB8\uC9C0 \uB9D0 \uAC83",
11370
+ "- \uC6D0\uBB38\uC5D0 \uC5C6\uB294 \uB0B4\uC6A9\uC744 \uC694\uC57D\xB7\uBCF4\uC644\xB7\uCD94\uB860\uD558\uC9C0 \uB9D0 \uAC83",
11371
+ "- ` ``` `\uB85C \uAC10\uC2F8\uAC70\uB098 \uC124\uBA85 \uD14D\uC2A4\uD2B8\uB97C \uCD94\uAC00\uD558\uC9C0 \uB9D0 \uAC83",
11372
+ "",
11373
+ "\uD5C8\uC6A9\uB418\uB294 \uAD50\uC815 \uBC94\uC704 (OCR \uC624\uC778\uC2DD \uC218\uC815):",
11374
+ "- \uBA85\uBC31\uD55C \uAE00\uC790 \uC624\uC778\uC2DD \uC218\uC815 (\uC608: '0' \u2192 'O', 'l' \u2192 '1' \uB4F1 \uB9E5\uB77D\uC0C1 \uBA85\uD655\uD55C \uACBD\uC6B0\uB9CC)",
11375
+ "- \uB2E8\uC5B4 \uC911\uAC04\uC5D0 \uC798\uBABB \uC0BD\uC785\uB41C \uACF5\uBC31 \uC81C\uAC70",
11376
+ "- \uC904\uBC14\uAFC8 \uC624\uB958\uB85C \uBD84\uB9AC\uB41C \uBB38\uC7A5 \uBCD1\uD569 (\uC758\uBBF8 \uB2E8\uC704 \uAE30\uC900)",
11377
+ "- Markdown \uBB38\uBC95 \uC624\uB958 \uC218\uC815 (\uD45C \uAD6C\uBD84\uC120 \uB204\uB77D, \uB9AC\uC2A4\uD2B8 \uB4E4\uC5EC\uC4F0\uAE30 \uB4F1)",
11378
+ "",
11379
+ "\uCD9C\uB825 \uADDC\uCE59:",
11380
+ "- \uBCC0\uD658\uB41C Markdown \uBCF8\uBB38\uB9CC \uCD9C\uB825\uD560 \uAC83 (\uC124\uBA85, \uC8FC\uC11D, \uBA54\uD0C0 \uD14D\uC2A4\uD2B8 \uC5C6\uC774)",
11381
+ "- \uD655\uC2E4\uD558\uC9C0 \uC54A\uC73C\uBA74 \uC6D0\uBB38\uC744 \uADF8\uB300\uB85C \uC720\uC9C0\uD560 \uAC83"
11360
11382
  ].join("\n");
11361
11383
  function elapsedMs(startAt) {
11362
11384
  return Math.round(performance.now() - startAt);
@@ -11367,7 +11389,6 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11367
11389
  const workspaceDir = resolve3(options.workspaceDir ?? join4(dirname3(absInput), `${stem}_ocr_workspace`));
11368
11390
  const imagesDir = join4(workspaceDir, "images");
11369
11391
  const rawDir = join4(workspaceDir, "ocr", "raw");
11370
- const proofDir = join4(workspaceDir, "ocr", "proofread");
11371
11392
  const diffDir = join4(workspaceDir, "ocr", "diff");
11372
11393
  const outputPath = resolve3(options.outputPath ?? join4(dirname3(absInput), `${stem}.md`));
11373
11394
  const reportPath = join4(workspaceDir, "run-report.json");
@@ -11387,11 +11408,10 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11387
11408
  const logger = (options.logger ?? createLoggerFromEnv()).withRun(runId).child({ component: "pipeline/unified-ocr.ts" });
11388
11409
  await mkdir(imagesDir, { recursive: true });
11389
11410
  await mkdir(rawDir, { recursive: true });
11390
- await mkdir(proofDir, { recursive: true });
11391
11411
  await mkdir(diffDir, { recursive: true });
11392
11412
  const timingsMs = {};
11393
11413
  const markStageStart = (stage, message) => emitProgress(options.onEvent, stage, 0, stageWeights, { message, type: "stage_start" });
11394
- const markStageProgress = (stage, stagePercent, current, total, message) => emitProgress(options.onEvent, stage, stagePercent, stageWeights, { type: "stage_progress", current, total, message });
11414
+ const markStageProgress = (stage, stagePercent, current, total, message, model) => emitProgress(options.onEvent, stage, stagePercent, stageWeights, { type: "stage_progress", current, total, message, model });
11395
11415
  const markStageDone = (stage, message) => emitProgress(options.onEvent, stage, 100, stageWeights, { message, type: "stage_done" });
11396
11416
  let currentStage = "convert";
11397
11417
  const logStage = (level, stage, event, message, meta) => {
@@ -11496,56 +11516,17 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11496
11516
  const pagePath = join4(rawDir, `page_${String(i + 1).padStart(4, "0")}.md`);
11497
11517
  await writeFile(pagePath, markdown, "utf-8");
11498
11518
  rawPagePaths.push(pagePath);
11499
- markStageProgress("ocr", Math.round((i + 1) / images.length * 100), i + 1, images.length, `OCR ${i + 1}/${images.length}`);
11519
+ markStageProgress("ocr", Math.round((i + 1) / images.length * 100), i + 1, images.length, `OCR ${i + 1}/${images.length}`, selectedModel);
11500
11520
  logStage("debug", "ocr", "progress", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { page: i + 1, total: images.length });
11501
11521
  }
11502
11522
  timingsMs.ocr = elapsedMs(ocrStart);
11503
11523
  markStageDone("ocr", "OCR \uC644\uB8CC");
11504
11524
  logStage("info", "ocr", "done", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { elapsedMs: timingsMs.ocr });
11505
- const proofStart = performance.now();
11506
- currentStage = "proofread";
11507
- markStageStart("proofread", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC9C4\uD589 \uC911");
11508
- logStage("info", "proofread", "start", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC2DC\uC791", { pages: rawPagePaths.length });
11509
- const proofedPaths = [];
11510
- for (let i = 0; i < rawPagePaths.length; i++) {
11511
- const rawMd = await readFile(rawPagePaths[i], "utf-8");
11512
- const prompt = `${PROOFREAD_PROMPT}
11513
-
11514
- ---
11515
- ${rawMd}
11516
- ---`;
11517
- const corrected = await ocrImageViaNim({
11518
- textOnlyPrompt: prompt,
11519
- model: selectedModel,
11520
- maxTokens: modelMaxTokens[selectedModel] ?? 8192,
11521
- baseUrl,
11522
- keyPool,
11523
- timeoutMs,
11524
- maxRetries: maxRetriesPerPage,
11525
- logger,
11526
- stage: "proofread"
11527
- });
11528
- const safeCorrected = preserveNumericIntegrity(rawMd, corrected);
11529
- const taggedCorrected = addUncertainTag(rawMd, safeCorrected);
11530
- const pagePath = join4(proofDir, `page_${String(i + 1).padStart(4, "0")}.md`);
11531
- await writeFile(pagePath, taggedCorrected, "utf-8");
11532
- await writeFile(
11533
- join4(diffDir, `page_${String(i + 1).padStart(4, "0")}.json`),
11534
- JSON.stringify(buildDiffSummary(rawMd, taggedCorrected), null, 2),
11535
- "utf-8"
11536
- );
11537
- proofedPaths.push(pagePath);
11538
- markStageProgress("proofread", Math.round((i + 1) / rawPagePaths.length * 100), i + 1, rawPagePaths.length, `\uAD50\uC815 ${i + 1}/${rawPagePaths.length}`);
11539
- logStage("debug", "proofread", "progress", "\uD398\uC774\uC9C0 \uAD50\uC815 \uC644\uB8CC", { page: i + 1, total: rawPagePaths.length });
11540
- }
11541
- timingsMs.proofread = elapsedMs(proofStart);
11542
- markStageDone("proofread", "\uAD50\uC815 \uC644\uB8CC");
11543
- logStage("info", "proofread", "done", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC644\uB8CC", { elapsedMs: timingsMs.proofread });
11544
11525
  const mergeStart = performance.now();
11545
11526
  currentStage = "merge";
11546
11527
  markStageStart("merge", "\uCD5C\uC885 Markdown \uBCD1\uD569 \uC911");
11547
- logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages: proofedPaths.length });
11548
- const merged = await mergeMarkdownPages(proofedPaths);
11528
+ logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages: rawPagePaths.length });
11529
+ const merged = await mergeMarkdownPages(rawPagePaths);
11549
11530
  await writeFile(outputPath, merged, "utf-8");
11550
11531
  timingsMs.merge = elapsedMs(mergeStart);
11551
11532
  markStageDone("merge", "\uBCD1\uD569 \uC644\uB8CC");
@@ -11620,7 +11601,8 @@ function emitProgress(cb, stage, stagePercent, weights, extra) {
11620
11601
  current: extra.current,
11621
11602
  total: extra.total,
11622
11603
  code: extra.code,
11623
- message: extra.message
11604
+ message: extra.message,
11605
+ model: extra.model
11624
11606
  });
11625
11607
  }
11626
11608
  async function convertWithLibreOffice(buffer, ext) {
@@ -12003,40 +11985,6 @@ function ensureSupportedInput(path) {
12003
11985
  throw new UnifiedOcrError("UNSUPPORTED_INPUT", "convert", `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uC785\uB825 \uD3EC\uB9F7: ${ext}`);
12004
11986
  }
12005
11987
  }
12006
- function extractNumericTokens(text) {
12007
- return text.match(/\d[\d,./-]*/g) ?? [];
12008
- }
12009
- function preserveNumericIntegrity(rawText, correctedText) {
12010
- const rawTokens = extractNumericTokens(rawText);
12011
- const correctedTokens = extractNumericTokens(correctedText);
12012
- if (rawTokens.length !== correctedTokens.length) return rawText;
12013
- for (let i = 0; i < rawTokens.length; i++) {
12014
- if (rawTokens[i] !== correctedTokens[i]) return rawText;
12015
- }
12016
- return correctedText;
12017
- }
12018
- function addUncertainTag(rawText, correctedText) {
12019
- if (correctedText.includes("[\uD655\uC778\uD544\uC694:")) return correctedText;
12020
- const rawLen = rawText.trim().length;
12021
- const corrLen = correctedText.trim().length;
12022
- if (rawLen === 0 || corrLen === 0) return correctedText;
12023
- const rawLines = rawText.split("\n").filter(Boolean).length;
12024
- const corrLines = correctedText.split("\n").filter(Boolean).length;
12025
- const rawTableLines = rawText.split("\n").filter((l) => l.includes("|")).length;
12026
- const corrTableLines = correctedText.split("\n").filter((l) => l.includes("|")).length;
12027
- const suspicious = corrLen < rawLen * 0.75 || corrLines < Math.max(1, Math.floor(rawLines * 0.8)) || rawTableLines >= 2 && corrTableLines < Math.floor(rawTableLines * 0.7);
12028
- if (!suspicious) return correctedText;
12029
- return `${correctedText}
12030
-
12031
- [\uD655\uC778\uD544\uC694: \uAD50\uC815 \uACB0\uACFC\uAC00 \uCD95\uC57D\uB418\uC5C8\uC744 \uC218 \uC788\uC5B4 \uC6D0\uBB38\uACFC \uB300\uC870\uAC00 \uD544\uC694\uD569\uB2C8\uB2E4.]`;
12032
- }
12033
- function buildDiffSummary(before, after) {
12034
- return {
12035
- changed: before !== after,
12036
- beforeLength: before.length,
12037
- afterLength: after.length
12038
- };
12039
- }
12040
11988
  function normalizePipelineError(err, stage) {
12041
11989
  if (err instanceof UnifiedOcrError) return err;
12042
11990
  const message = err instanceof Error ? err.message : String(err);