@clazic/kordoc 2.4.17 → 2.4.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -4,7 +4,7 @@ import {
4
4
  markdownToHwpx,
5
5
  markdownToXlsx,
6
6
  parse
7
- } from "./chunk-WM3XI23V.js";
7
+ } from "./chunk-T7EBS5XP.js";
8
8
  import "./chunk-YW5G6BCJ.js";
9
9
  import {
10
10
  VERSION,
@@ -291,7 +291,7 @@ program.command("init-env").description("kordoc\uC6A9 .env \uD15C\uD50C\uB9BF \u
291
291
  }
292
292
  });
293
293
  program.command("watch <dir>").description("\uB514\uB809\uD1A0\uB9AC \uAC10\uC2DC \u2014 \uC0C8 \uBB38\uC11C \uC790\uB3D9 \uBCC0\uD658").option("--webhook <url>", "\uACB0\uACFC \uC804\uC1A1 \uC6F9\uD6C5 URL").option("-d, --out-dir <dir>", "\uBCC0\uD658 \uACB0\uACFC \uCD9C\uB825 \uB514\uB809\uD1A0\uB9AC").option("-p, --pages <range>", "\uD398\uC774\uC9C0/\uC139\uC158 \uBC94\uC704").option("--format <type>", "\uCD9C\uB825 \uD615\uC2DD: markdown \uB610\uB294 json", "markdown").option("--silent", "\uC9C4\uD589 \uBA54\uC2DC\uC9C0 \uC228\uAE30\uAE30").action(async (dir, opts) => {
294
- const { watchDirectory } = await import("./watch-RM4VNOL4.js");
294
+ const { watchDirectory } = await import("./watch-YGIU7RN7.js");
295
295
  await watchDirectory({
296
296
  dir,
297
297
  outDir: opts.outDir,
package/dist/index.cjs CHANGED
@@ -11369,17 +11369,39 @@ var DEFAULT_STAGE_WEIGHTS = {
11369
11369
  render: 20,
11370
11370
  probe: 5,
11371
11371
  ocr: 45,
11372
- proofread: 10,
11372
+ proofread: 0,
11373
11373
  merge: 5
11374
11374
  };
11375
- var OCR_PROMPT2 = "Extract all text and tables from this image exactly as-is into Markdown. Do not summarize, infer, or alter the content in any way.";
11376
- var PROOFREAD_PROMPT = [
11377
- "Perform non-destructive proofreading only on the Markdown below.",
11378
- "Rules:",
11379
- "- Do not add, remove, or infer any facts",
11380
- "- Do not change numbers, units, or proper nouns",
11381
- "- Correct only typos, spacing, line breaks, and Markdown structure",
11382
- "- Output the corrected Markdown body only"
11375
+ var OCR_PROMPT2 = [
11376
+ "\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD45C\uB97C \uCD94\uCD9C\uD558\uC5EC Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uACE0, OCR \uC624\uC778\uC2DD \uC624\uB958\uB97C \uC989\uC2DC \uAD50\uC815\uD558\uC5EC \uCD5C\uC885 \uACB0\uACFC\uBB3C\uC744 \uCD9C\uB825\uD558\uC138\uC694.",
11377
+ "",
11378
+ "\uCD94\uCD9C \uADDC\uCE59:",
11379
+ "- \uD14D\uC2A4\uD2B8, \uD45C, \uC81C\uBAA9, \uB9AC\uC2A4\uD2B8\uB97C \uC6D0\uBB38 \uAD6C\uC870 \uADF8\uB300\uB85C Markdown\uC73C\uB85C \uBCC0\uD658",
11380
+ "- \uD45C\uB294 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)",
11381
+ "- \uD5E4\uB529\uC740 \uC2DC\uAC01\uC801 \uD06C\uAE30\uC5D0 \uB530\uB77C # ~ ###### \uC0AC\uC6A9",
11382
+ "- \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9",
11383
+ "- \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC",
11384
+ "- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0",
11385
+ "",
11386
+ "\uC808\uB300 \uAE08\uC9C0 \uC0AC\uD56D:",
11387
+ "- \uBB38\uC7A5\xB7\uB2E8\uB77D\xB7\uD56D\uBAA9\uC744 \uCD94\uAC00\uD558\uAC70\uB098 \uC0AD\uC81C\uD558\uC9C0 \uB9D0 \uAC83",
11388
+ "- \uC22B\uC790, \uD37C\uC13C\uD2B8, \uB0A0\uC9DC, \uB2E8\uC704, \uAE08\uC561\uC744 \uC808\uB300 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83",
11389
+ "- \uACE0\uC720\uBA85\uC0AC, \uAE30\uAD00\uBA85, \uBC95\uB839\uBA85, \uC9C0\uBA85\uC744 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83",
11390
+ "- \uD45C\uC758 \uC81C\uBAA9\uC744 \uBCC0\uACBD \uB610\uB294 \uC0AD\uC81C\uD558\uC9C0 \uB9D0 \uAC83",
11391
+ "- \uD45C\uC758 \uD589\xB7\uC5F4 \uC218, \uC140 \uB0B4\uC6A9, \uD5E4\uB354\uB97C \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83",
11392
+ "- \uC81C\uBAA9 \uC218\uC900(#, ##, ### \uB4F1)\uC744 \uC784\uC758\uB85C \uBC14\uAFB8\uC9C0 \uB9D0 \uAC83",
11393
+ "- \uC6D0\uBB38\uC5D0 \uC5C6\uB294 \uB0B4\uC6A9\uC744 \uC694\uC57D\xB7\uBCF4\uC644\xB7\uCD94\uB860\uD558\uC9C0 \uB9D0 \uAC83",
11394
+ "- ` ``` `\uB85C \uAC10\uC2F8\uAC70\uB098 \uC124\uBA85 \uD14D\uC2A4\uD2B8\uB97C \uCD94\uAC00\uD558\uC9C0 \uB9D0 \uAC83",
11395
+ "",
11396
+ "\uD5C8\uC6A9\uB418\uB294 \uAD50\uC815 \uBC94\uC704 (OCR \uC624\uC778\uC2DD \uC218\uC815):",
11397
+ "- \uBA85\uBC31\uD55C \uAE00\uC790 \uC624\uC778\uC2DD \uC218\uC815 (\uC608: '0' \u2192 'O', 'l' \u2192 '1' \uB4F1 \uB9E5\uB77D\uC0C1 \uBA85\uD655\uD55C \uACBD\uC6B0\uB9CC)",
11398
+ "- \uB2E8\uC5B4 \uC911\uAC04\uC5D0 \uC798\uBABB \uC0BD\uC785\uB41C \uACF5\uBC31 \uC81C\uAC70",
11399
+ "- \uC904\uBC14\uAFC8 \uC624\uB958\uB85C \uBD84\uB9AC\uB41C \uBB38\uC7A5 \uBCD1\uD569 (\uC758\uBBF8 \uB2E8\uC704 \uAE30\uC900)",
11400
+ "- Markdown \uBB38\uBC95 \uC624\uB958 \uC218\uC815 (\uD45C \uAD6C\uBD84\uC120 \uB204\uB77D, \uB9AC\uC2A4\uD2B8 \uB4E4\uC5EC\uC4F0\uAE30 \uB4F1)",
11401
+ "",
11402
+ "\uCD9C\uB825 \uADDC\uCE59:",
11403
+ "- \uBCC0\uD658\uB41C Markdown \uBCF8\uBB38\uB9CC \uCD9C\uB825\uD560 \uAC83 (\uC124\uBA85, \uC8FC\uC11D, \uBA54\uD0C0 \uD14D\uC2A4\uD2B8 \uC5C6\uC774)",
11404
+ "- \uD655\uC2E4\uD558\uC9C0 \uC54A\uC73C\uBA74 \uC6D0\uBB38\uC744 \uADF8\uB300\uB85C \uC720\uC9C0\uD560 \uAC83"
11383
11405
  ].join("\n");
11384
11406
  function elapsedMs(startAt) {
11385
11407
  return Math.round(import_node_perf_hooks.performance.now() - startAt);
@@ -11390,7 +11412,6 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11390
11412
  const workspaceDir = (0, import_path5.resolve)(options.workspaceDir ?? (0, import_path5.join)((0, import_path5.dirname)(absInput), `${stem}_ocr_workspace`));
11391
11413
  const imagesDir = (0, import_path5.join)(workspaceDir, "images");
11392
11414
  const rawDir = (0, import_path5.join)(workspaceDir, "ocr", "raw");
11393
- const proofDir = (0, import_path5.join)(workspaceDir, "ocr", "proofread");
11394
11415
  const diffDir = (0, import_path5.join)(workspaceDir, "ocr", "diff");
11395
11416
  const outputPath = (0, import_path5.resolve)(options.outputPath ?? (0, import_path5.join)((0, import_path5.dirname)(absInput), `${stem}.md`));
11396
11417
  const reportPath = (0, import_path5.join)(workspaceDir, "run-report.json");
@@ -11410,7 +11431,6 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11410
11431
  const logger = (options.logger ?? createLoggerFromEnv()).withRun(runId).child({ component: "pipeline/unified-ocr.ts" });
11411
11432
  await (0, import_promises2.mkdir)(imagesDir, { recursive: true });
11412
11433
  await (0, import_promises2.mkdir)(rawDir, { recursive: true });
11413
- await (0, import_promises2.mkdir)(proofDir, { recursive: true });
11414
11434
  await (0, import_promises2.mkdir)(diffDir, { recursive: true });
11415
11435
  const timingsMs = {};
11416
11436
  const markStageStart = (stage, message) => emitProgress(options.onEvent, stage, 0, stageWeights, { message, type: "stage_start" });
@@ -11525,50 +11545,11 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11525
11545
  timingsMs.ocr = elapsedMs(ocrStart);
11526
11546
  markStageDone("ocr", "OCR \uC644\uB8CC");
11527
11547
  logStage("info", "ocr", "done", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { elapsedMs: timingsMs.ocr });
11528
- const proofStart = import_node_perf_hooks.performance.now();
11529
- currentStage = "proofread";
11530
- markStageStart("proofread", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC9C4\uD589 \uC911");
11531
- logStage("info", "proofread", "start", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC2DC\uC791", { pages: rawPagePaths.length });
11532
- const proofedPaths = [];
11533
- for (let i = 0; i < rawPagePaths.length; i++) {
11534
- const rawMd = await (0, import_promises2.readFile)(rawPagePaths[i], "utf-8");
11535
- const prompt = `${PROOFREAD_PROMPT}
11536
-
11537
- ---
11538
- ${rawMd}
11539
- ---`;
11540
- const corrected = await ocrImageViaNim({
11541
- textOnlyPrompt: prompt,
11542
- model: selectedModel,
11543
- maxTokens: modelMaxTokens[selectedModel] ?? 8192,
11544
- baseUrl,
11545
- keyPool,
11546
- timeoutMs,
11547
- maxRetries: maxRetriesPerPage,
11548
- logger,
11549
- stage: "proofread"
11550
- });
11551
- const safeCorrected = preserveNumericIntegrity(rawMd, corrected);
11552
- const taggedCorrected = addUncertainTag(rawMd, safeCorrected);
11553
- const pagePath = (0, import_path5.join)(proofDir, `page_${String(i + 1).padStart(4, "0")}.md`);
11554
- await (0, import_promises2.writeFile)(pagePath, taggedCorrected, "utf-8");
11555
- await (0, import_promises2.writeFile)(
11556
- (0, import_path5.join)(diffDir, `page_${String(i + 1).padStart(4, "0")}.json`),
11557
- JSON.stringify(buildDiffSummary(rawMd, taggedCorrected), null, 2),
11558
- "utf-8"
11559
- );
11560
- proofedPaths.push(pagePath);
11561
- markStageProgress("proofread", Math.round((i + 1) / rawPagePaths.length * 100), i + 1, rawPagePaths.length, `\uAD50\uC815 ${i + 1}/${rawPagePaths.length}`);
11562
- logStage("debug", "proofread", "progress", "\uD398\uC774\uC9C0 \uAD50\uC815 \uC644\uB8CC", { page: i + 1, total: rawPagePaths.length });
11563
- }
11564
- timingsMs.proofread = elapsedMs(proofStart);
11565
- markStageDone("proofread", "\uAD50\uC815 \uC644\uB8CC");
11566
- logStage("info", "proofread", "done", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC644\uB8CC", { elapsedMs: timingsMs.proofread });
11567
11548
  const mergeStart = import_node_perf_hooks.performance.now();
11568
11549
  currentStage = "merge";
11569
11550
  markStageStart("merge", "\uCD5C\uC885 Markdown \uBCD1\uD569 \uC911");
11570
- logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages: proofedPaths.length });
11571
- const merged = await mergeMarkdownPages(proofedPaths);
11551
+ logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages: rawPagePaths.length });
11552
+ const merged = await mergeMarkdownPages(rawPagePaths);
11572
11553
  await (0, import_promises2.writeFile)(outputPath, merged, "utf-8");
11573
11554
  timingsMs.merge = elapsedMs(mergeStart);
11574
11555
  markStageDone("merge", "\uBCD1\uD569 \uC644\uB8CC");
@@ -12027,40 +12008,6 @@ function ensureSupportedInput(path) {
12027
12008
  throw new UnifiedOcrError("UNSUPPORTED_INPUT", "convert", `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uC785\uB825 \uD3EC\uB9F7: ${ext}`);
12028
12009
  }
12029
12010
  }
12030
- function extractNumericTokens(text) {
12031
- return text.match(/\d[\d,./-]*/g) ?? [];
12032
- }
12033
- function preserveNumericIntegrity(rawText, correctedText) {
12034
- const rawTokens = extractNumericTokens(rawText);
12035
- const correctedTokens = extractNumericTokens(correctedText);
12036
- if (rawTokens.length !== correctedTokens.length) return rawText;
12037
- for (let i = 0; i < rawTokens.length; i++) {
12038
- if (rawTokens[i] !== correctedTokens[i]) return rawText;
12039
- }
12040
- return correctedText;
12041
- }
12042
- function addUncertainTag(rawText, correctedText) {
12043
- if (correctedText.includes("[\uD655\uC778\uD544\uC694:")) return correctedText;
12044
- const rawLen = rawText.trim().length;
12045
- const corrLen = correctedText.trim().length;
12046
- if (rawLen === 0 || corrLen === 0) return correctedText;
12047
- const rawLines = rawText.split("\n").filter(Boolean).length;
12048
- const corrLines = correctedText.split("\n").filter(Boolean).length;
12049
- const rawTableLines = rawText.split("\n").filter((l) => l.includes("|")).length;
12050
- const corrTableLines = correctedText.split("\n").filter((l) => l.includes("|")).length;
12051
- const suspicious = corrLen < rawLen * 0.75 || corrLines < Math.max(1, Math.floor(rawLines * 0.8)) || rawTableLines >= 2 && corrTableLines < Math.floor(rawTableLines * 0.7);
12052
- if (!suspicious) return correctedText;
12053
- return `${correctedText}
12054
-
12055
- [\uD655\uC778\uD544\uC694: \uAD50\uC815 \uACB0\uACFC\uAC00 \uCD95\uC57D\uB418\uC5C8\uC744 \uC218 \uC788\uC5B4 \uC6D0\uBB38\uACFC \uB300\uC870\uAC00 \uD544\uC694\uD569\uB2C8\uB2E4.]`;
12056
- }
12057
- function buildDiffSummary(before, after) {
12058
- return {
12059
- changed: before !== after,
12060
- beforeLength: before.length,
12061
- afterLength: after.length
12062
- };
12063
- }
12064
12011
  function normalizePipelineError(err, stage) {
12065
12012
  if (err instanceof UnifiedOcrError) return err;
12066
12013
  const message = err instanceof Error ? err.message : String(err);