RubyGems - completion-kit - Versions diffs - 0.5.36 → 0.5.37 - Mend

completion-kit 0.5.36 → 0.5.37

This diff compares the contents of two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (18)

checksums.yaml +4 -4
data/app/assets/stylesheets/completion_kit/application.css.erb +207 -19
data/app/controllers/completion_kit/metrics_controller.rb +52 -1
data/app/models/completion_kit/judge_version.rb +17 -1
data/app/models/completion_kit/metric.rb +19 -0
data/app/services/completion_kit/calibration_math.rb +84 -0
data/app/services/completion_kit/judge_variant_generator.rb +108 -0
data/app/services/completion_kit/mcp_dispatcher.rb +3 -1
data/app/services/completion_kit/mcp_tools/judges.rb +138 -0
data/app/services/completion_kit/metric_calibration_stats.rb +99 -0
data/app/views/completion_kit/calibrations/_buttons.html.erb +5 -4
data/app/views/completion_kit/calibrations/_trust_panel.html.erb +34 -0
data/app/views/completion_kit/metrics/show.html.erb +141 -0
data/config/routes.rb +7 -1
data/db/migrate/20260523000001_add_few_shot_examples_to_completion_kit_metrics.rb +5 -0
data/db/migrate/20260523000002_add_state_to_completion_kit_judge_versions.rb +15 -0
data/lib/completion_kit/version.rb +1 -1
metadata +8 -1

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: f3189fdba715e4750befcadf517462d0523ae7366611964388309031ad6a4d4f
-  data.tar.gz: 322e2dd4847c5e8bd3b8af83b4b09873efcd4238c945d928fdac7c7ffbbaf7de
+  metadata.gz: 1ccc7d1feb86aed6af17569a642d8b8e81fe522f0a7c68ca4ebb34abc113dbce
+  data.tar.gz: 88793eabe6b04c3497c761cde5b61511623c5a9844ce7e101560f8eb3b492e18
 SHA512:
-  metadata.gz: f611cfbc07196fd75eb16962bb1acf4a271d759f42473af9b94d049860839a12657223c72bb12aaeaa80a14adfbb29a20c4cb3c4c3176193ce44966bb876b011
-  data.tar.gz: 2806d017ce92c625e6c7f83e789d13afe53065107a5a803f0d309cb1eaee65d752a8b1e18734c42e9106fc6c38c2cbbfcc803f76477571d7e2330130ebe8eee1
+  metadata.gz: d133a9d0db55ee41eb07e290b9657e044c8a0836806bbd055d0b7b6d1cf8b981056b40e3a8795a951e36e7d7dbcc7626e249c2a2f4cba9492fad38aa931b6bfc
+  data.tar.gz: 71bbbe827f33648b12f121c949af74fe8d02702d44c70fd39beb4795c8f95d2b9941aa755d631a78029e0b412cdec7ca9e2bea107d1395ea001abe46dcfddf3f

data/app/assets/stylesheets/completion_kit/application.css.erb CHANGED

@@ -5123,53 +5123,241 @@ a.tag-mark {
 .ck-calibration {
   margin-top: 12px;
   padding-top: 12px;
-  border-top: 1px dashed rgba(255, 255, 255, 0.08);
+  border-top: 1px dashed var(--ck-line);
 }
 .ck-calibration__prompt {
-  font-size: 0.8rem;
+  font-family: var(--ck-mono);
+  font-size: 0.72rem;
+  letter-spacing: 0.06em;
+  text-transform: uppercase;
   color: var(--ck-dim);
-  margin: 0 0 8px;
+  margin: 0 0 10px;
   display: flex;
   align-items: center;
-  gap: 8px;
+  gap: 10px;
 }
 .ck-calibration__count {
-  font-size: 0.75rem;
+  font-family: var(--ck-mono);
+  font-size: 0.72rem;
+  letter-spacing: 0.03em;
   color: var(--ck-accent);
+  text-transform: none;
 }
 .ck-calibration__buttons {
   display: flex;
-  gap: 8px;
+  gap: 6px;
   flex-wrap: wrap;
 }
 .ck-calibration__pill {
   display: inline-flex;
   align-items: center;
-  gap: 6px;
-  padding: 6px 12px;
-  border-radius: 999px;
-  font-size: 0.85rem;
-  background: transparent;
-  border: 1px solid rgba(255, 255, 255, 0.18);
-  color: inherit;
+  gap: 0.4rem;
+  padding: 0.32rem 0.65rem;
+  border-radius: 4px;
+  font-family: var(--ck-mono);
+  font-size: 0.78rem;
+  font-weight: 500;
+  letter-spacing: 0.04em;
+  text-transform: uppercase;
+  background: var(--ck-surface-soft);
+  border: 1px solid var(--ck-line);
+  color: var(--ck-dim);
   cursor: pointer;
+  transition: background 0.12s, border-color 0.12s, color 0.12s;
+}
+.ck-calibration__pill svg {
+  width: 14px;
+  height: 14px;
 }
 .ck-calibration__pill:hover,
 .ck-calibration__pill:focus-visible {
-  border-color: var(--ck-accent);
+  color: var(--ck-text);
+  border-color: var(--ck-dim);
 }
-.ck-calibration__pill.is-active {
-  background: var(--ck-accent);
-  color: #0b1320;
-  border-color: var(--ck-accent);
+.ck-calibration__pill--agree.is-active {
+  background: var(--ck-success-soft);
+  border-color: rgba(45, 212, 168, 0.35);
+  color: var(--ck-success);
 }
+.ck-calibration__pill--disagree.is-active {
+  background: var(--ck-danger-soft);
+  border-color: rgba(248, 113, 113, 0.35);
+  color: var(--ck-danger);
+}
+.ck-calibration__pill--borderline.is-active {
+  background: var(--ck-warning-soft);
+  border-color: rgba(224, 164, 88, 0.35);
+  color: var(--ck-warning);
+}
+.ck-calibration__pill--agree:hover { border-color: rgba(45, 212, 168, 0.45); color: var(--ck-success); }
+.ck-calibration__pill--disagree:hover { border-color: rgba(248, 113, 113, 0.45); color: var(--ck-danger); }
+.ck-calibration__pill--borderline:hover { border-color: rgba(224, 164, 88, 0.45); color: var(--ck-warning); }
 .ck-calibration__detail {
-  margin-top: 10px;
+  margin-top: 12px;
   display: flex;
   flex-direction: column;
   gap: 8px;
+  padding: 12px;
+  background: var(--ck-surface-soft);
+  border: 1px solid var(--ck-line);
+  border-radius: 6px;
 }
 .ck-calibration__value {
   color: var(--ck-accent);
+  font-family: var(--ck-mono);
+  font-weight: 600;
+}
+.ck-trust-panel {
+  display: inline-flex;
+  flex-direction: column;
+  gap: 6px;
+  margin-top: 12px;
+  padding: 10px 14px;
+  background: var(--ck-surface-soft);
+  border: 1px solid var(--ck-line);
+  border-radius: 6px;
+}
+.ck-trust-panel__label {
+  margin: 0;
+  font-family: var(--ck-mono);
+  font-size: 0.7rem;
+  letter-spacing: 0.08em;
+  text-transform: uppercase;
+  color: var(--ck-dim);
+}
+.ck-trust-panel__body {
+  display: flex;
+  align-items: baseline;
+  gap: 10px;
+}
+.ck-trust-panel__counter {
+  font-family: var(--ck-mono);
+  font-size: 1.6rem;
+  font-weight: 600;
+  color: var(--ck-accent);
+}
+.ck-trust-panel__counter-of {
+  font-size: 0.9rem;
+  color: var(--ck-dim);
+  margin-left: 4px;
+}
+.ck-trust-panel__hint {
+  font-family: var(--ck-mono);
+  font-size: 0.72rem;
+  color: var(--ck-dim);
+  letter-spacing: 0.04em;
+}
+.ck-trust-panel__score {
+  font-family: var(--ck-mono);
+  font-size: 1.6rem;
   font-weight: 600;
+  color: var(--ck-success);
+}
+.ck-trust-panel__score-pct {
+  font-size: 0.9rem;
+  color: var(--ck-dim);
+  margin-left: 2px;
+}
+.ck-trust-panel__margin {
+  font-family: var(--ck-mono);
+  font-size: 0.8rem;
+  color: var(--ck-dim);
+}
+.ck-trust-panel__gate {
+  font-family: var(--ck-mono);
+  font-size: 0.66rem;
+  letter-spacing: 0.08em;
+  text-transform: uppercase;
+  padding: 2px 6px;
+  border-radius: 3px;
+  background: var(--ck-surface);
+  border: 1px solid var(--ck-line);
+  color: var(--ck-dim);
+}
+.ck-trust-panel--firm .ck-trust-panel__gate {
+  color: var(--ck-success);
+  border-color: rgba(45, 212, 168, 0.35);
+}
+.ck-trust-panel__details {
+  display: flex;
+  flex-wrap: wrap;
+  gap: 14px;
+  font-family: var(--ck-mono);
+  font-size: 0.72rem;
+  color: var(--ck-dim);
+}
+.ck-trust-panel__borderline {
+  color: var(--ck-warning);
+}
+.ck-trust-panel__borderline--ok { color: var(--ck-dim); }
+.ck-trust-panel__borderline--warning { color: var(--ck-warning); }
+.ck-trust-panel__borderline--danger { color: var(--ck-danger); }
+.ck-disagreements-table td .ck-meta-copy {
+  font-size: 0.78rem;
+}
+.ck-few-shot-list {
+  list-style: decimal;
+  padding-left: 1.4rem;
+  margin: 0;
+  display: flex;
+  flex-direction: column;
+  gap: 12px;
+}
+.ck-few-shot-item {
+  padding: 10px 12px;
+  background: var(--ck-surface-soft);
+  border: 1px solid var(--ck-line);
+  border-radius: 6px;
+}
+.ck-few-shot-item__scores {
+  display: flex;
+  align-items: center;
+  gap: 8px;
+  font-family: var(--ck-mono);
+  font-size: 0.75rem;
+  letter-spacing: 0.04em;
+  text-transform: uppercase;
+}
+.ck-draft-banner {
+  display: inline-flex;
+  align-items: center;
+  gap: 10px;
+  margin-top: 10px;
+  padding: 8px 12px;
+  background: var(--ck-accent-soft);
+  border: 1px dashed rgba(6, 182, 212, 0.4);
+  border-radius: 6px;
+}
+.ck-suggestion-list {
+  display: flex;
+  flex-direction: column;
+  gap: 12px;
+}
+.ck-suggestion-card {
+  padding: 12px 14px;
+  background: var(--ck-surface-soft);
+  border: 1px solid var(--ck-line);
+  border-radius: 6px;
+  display: flex;
+  flex-direction: column;
+  gap: 10px;
+}
+.ck-suggestion-card__header {
+  display: flex;
+  align-items: center;
+  gap: 10px;
+}
+.ck-suggestion-card__instruction {
+  margin: 0;
+  white-space: pre-wrap;
+  font-size: 0.85rem;
+  background: var(--ck-bg-strong);
+  padding: 10px 12px;
+  border-radius: 4px;
+  border: 1px solid var(--ck-line);
 }

data/app/controllers/completion_kit/metrics_controller.rb CHANGED

@@ -1,13 +1,19 @@
 module CompletionKit
   class MetricsController < ApplicationController
     include CompletionKit::TagFiltering
-    before_action :set_metric, only: [:show, :edit, :update, :destroy]
+    before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :publish_draft, :suggest_variants]
     def index
       @metrics = apply_tag_filter(Metric.includes(:metric_groups, :tags).order(:name))
     end
     def show
+      @disagreements = Calibration.where(metric_id: @metric.id, verdict: "disagree")
+                                  .includes(response: [:reviews, :run])
+                                  .order(created_at: :desc)
+                                  .limit(50)
+      @latest_draft = JudgeVersion.drafts.where(metric_id: @metric.id).order(created_at: :desc).first
+      @suggestion_drafts = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc)
     end
     def new
@@ -40,6 +46,51 @@ module CompletionKit
       redirect_to metrics_path, notice: "Metric was successfully destroyed."
     end
+    def suggest_variants
+      generator = JudgeVariantGenerator.new(@metric)
+      variants = generator.call
+      if variants.empty?
+        redirect_to metric_path(@metric), alert: "The model returned no usable variants. Try again with a different model."
+        return
+      end
+      generator.persist!(variants)
+      label = variants.length == 1 ? "judge variant" : "judge variants"
+      redirect_to metric_path(@metric), notice: "Generated #{variants.length} #{label} as drafts. Pick one to publish."
+    end
+    def publish_draft
+      draft = JudgeVersion.drafts.where(metric_id: @metric.id).order(created_at: :desc).first
+      if draft.nil?
+        redirect_to metric_path(@metric), alert: "No draft to publish."
+        return
+      end
+      JudgeVersion.transaction do
+        JudgeVersion.where(metric_id: @metric.id, state: "published").update_all(current: false)
+        draft.update!(state: "published", current: true)
+      end
+      redirect_to metric_path(@metric), notice: "Draft published as the current judge version."
+    end
+    def add_few_shot
+      calibration = Calibration.where(metric_id: @metric.id, verdict: "disagree").find(params[:calibration_id])
+      review = calibration.response.reviews.find_by(metric_id: @metric.id)
+      examples = Array(@metric.few_shot_examples)
+      examples << {
+        "input" => calibration.response.input_data.to_s.truncate(2000),
+        "response" => calibration.response.response_text.to_s.truncate(2000),
+        "judge_score" => review&.ai_score&.to_f,
+        "judge_feedback" => review&.ai_feedback.to_s.truncate(1000),
+        "human_score" => calibration.corrected_score&.to_f,
+        "human_note" => calibration.note.to_s.truncate(1000),
+        "calibration_id" => calibration.id,
+        "added_at" => Time.current.utc.iso8601
+      }
+      @metric.update!(few_shot_examples: examples)
+      redirect_to metric_path(@metric), notice: "Added as a judge few-shot."
+    end
     private
     def set_metric

data/app/models/completion_kit/judge_version.rb CHANGED

@@ -1,23 +1,37 @@
 module CompletionKit
   class JudgeVersion < ApplicationRecord
+    STATES = %w[draft published].freeze
     belongs_to :metric
     has_many :calibrations, dependent: :destroy
     serialize :rubric_bands, coder: JSON
     validates :metric_id, presence: true
+    validates :state, inclusion: { in: STATES }
     scope :current, -> { where(current: true) }
+    scope :published, -> { where(state: "published") }
+    scope :drafts, -> { where(state: "draft") }
     def self.ensure_current_for(metric)
       current.find_by(metric_id: metric.id) || create!(
         metric: metric,
         instruction: metric.instruction,
         rubric_bands: metric.rubric_bands,
-        current: true
+        current: true,
+        state: "published"
       )
     end
+    def draft?
+      state == "draft"
+    end
+    def published?
+      state == "published"
+    end
     def as_json(options = {})
       {
         id: id,
@@ -25,6 +39,8 @@ module CompletionKit
         instruction: instruction,
         rubric_bands: rubric_bands,
         current: current,
+        state: state,
+        source: source,
         created_at: created_at
       }
     end

data/app/models/completion_kit/metric.rb CHANGED

@@ -16,6 +16,7 @@ module CompletionKit
     has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
     serialize :rubric_bands, coder: JSON
+    serialize :few_shot_examples, coder: JSON, type: Array
     validates :name, presence: true
     validates :key, tenant_scoped_uniqueness: { allow_nil: true }
@@ -23,6 +24,7 @@ module CompletionKit
     before_validation :generate_key
     before_validation :normalize_rubric_bands
     before_validation :set_defaults
+    after_update :fork_draft_judge_version, if: :judge_relevant_changes?
     def self.default_rubric_bands
       DEFAULT_RUBRIC_BANDS.map(&:dup)
@@ -95,5 +97,22 @@ module CompletionKit
     def normalize_rubric_bands
       self.rubric_bands = self.class.normalize_rubric_bands(rubric_bands) if rubric_bands.present?
     end
+    def judge_relevant_changes?
+      saved_change_to_instruction? || saved_change_to_rubric_bands?
+    end
+    def fork_draft_judge_version
+      JudgeVersion.ensure_current_for(self)
+      JudgeVersion.where(metric_id: id, state: "draft").update_all(current: false)
+      JudgeVersion.create!(
+        metric: self,
+        instruction: instruction,
+        rubric_bands: rubric_bands,
+        current: false,
+        state: "draft",
+        source: "edit"
+      )
+    end
   end
 end

data/app/services/completion_kit/calibration_math.rb ADDED

@@ -0,0 +1,84 @@
+module CompletionKit
+  module CalibrationMath
+    Z_95 = 1.959963984540054
+    module_function
+    def wilson_interval(successes:, n:, z: Z_95)
+      return { point: nil, low: nil, high: nil } if n.to_i.zero?
+      p_hat = successes.to_f / n
+      denom = 1.0 + (z * z) / n
+      center = (p_hat + (z * z) / (2.0 * n)) / denom
+      margin = z * Math.sqrt((p_hat * (1 - p_hat) / n) + ((z * z) / (4.0 * n * n))) / denom
+      { point: p_hat, low: [center - margin, 0.0].max, high: [center + margin, 1.0].min }
+    end
+    def mae(pairs)
+      return nil if pairs.empty?
+      sum = pairs.sum { |ai, human| (ai.to_f - human.to_f).abs }
+      sum / pairs.length
+    end
+    def pearson(pairs)
+      return nil if pairs.length < 2
+      xs = pairs.map { |a, _| a.to_f }
+      ys = pairs.map { |_, h| h.to_f }
+      mx = xs.sum / xs.length
+      my = ys.sum / ys.length
+      num = xs.zip(ys).sum { |x, y| (x - mx) * (y - my) }
+      dx2 = xs.sum { |x| (x - mx)**2 }
+      dy2 = ys.sum { |y| (y - my)**2 }
+      denom = Math.sqrt(dx2 * dy2)
+      return nil if denom.zero?
+      num / denom
+    end
+    def quadratic_weighted_kappa(pairs, categories:)
+      return nil if pairs.empty?
+      ratings = categories.to_a
+      k = ratings.length
+      return nil if k < 2
+      index = ratings.each_with_index.to_h
+      observed = Array.new(k) { Array.new(k, 0) }
+      row_totals = Array.new(k, 0)
+      col_totals = Array.new(k, 0)
+      n = 0
+      pairs.each do |ai, human|
+        i = index[score_bucket(ai, ratings)]
+        j = index[score_bucket(human, ratings)]
+        next if i.nil? || j.nil?
+        observed[i][j] += 1
+        row_totals[i] += 1
+        col_totals[j] += 1
+        n += 1
+      end
+      return nil if n.zero?
+      max_dist_sq = (k - 1.0)**2
+      numerator = 0.0
+      denominator = 0.0
+      (0...k).each do |i|
+        (0...k).each do |j|
+          weight = ((i - j)**2) / max_dist_sq
+          expected = (row_totals[i] * col_totals[j]).to_f / n
+          numerator   += weight * observed[i][j]
+          denominator += weight * expected
+        end
+      end
+      return 1.0 if denominator.zero?
+      1.0 - (numerator / denominator)
+    end
+    def score_bucket(value, ratings)
+      rounded = value.to_f.round
+      return ratings.first if rounded <= ratings.first
+      return ratings.last if rounded >= ratings.last
+      rounded
+    end
+  end
+end

data/app/services/completion_kit/judge_variant_generator.rb ADDED

@@ -0,0 +1,108 @@
+module CompletionKit
+  class JudgeVariantGenerator
+    DEFAULT_VARIANT_COUNT = 3
+    DEFAULT_TEMPERATURE = 0.4
+    Variant = Struct.new(:reasoning, :instruction, keyword_init: true)
+    def initialize(metric, count: DEFAULT_VARIANT_COUNT, model: nil)
+      @metric = metric
+      @count = count
+      @model = model || CompletionKit.config.judge_model
+    end
+    def call
+      client = LlmClient.for_model(@model, ApiConfig.for_model(@model))
+      raw = client.generate_completion(build_meta_prompt, model: @model, max_tokens: 2500, temperature: DEFAULT_TEMPERATURE)
+      parse(raw).first(@count)
+    end
+    def persist!(variants)
+      JudgeVersion.where(metric_id: @metric.id, state: "draft", source: "suggestion").update_all(current: false)
+      versions = variants.map do |variant|
+        JudgeVersion.create!(
+          metric: @metric,
+          instruction: variant.instruction,
+          rubric_bands: @metric.rubric_bands,
+          state: "draft",
+          source: "suggestion",
+          current: false
+        )
+      end
+      ActiveSupport::Notifications.instrument("completion_kit.judge_suggestion.generated",
+                                              metric_id: @metric.id,
+                                              count: versions.length,
+                                              model: @model)
+      versions
+    end
+    private
+    def build_meta_prompt
+      examples = JudgeCalibrationExamples.for(@metric)
+      sections = []
+      sections << "You are an expert evaluator. Rewrite a judge's grading instruction so it agrees better with humans on the cases below."
+      sections << ""
+      sections << "## Current instruction"
+      sections << "```"
+      sections << @metric.instruction.to_s
+      sections << "```"
+      sections << ""
+      sections << "## Rubric (unchanged across variants — only rewrite the instruction)"
+      sections << @metric.display_rubric_text
+      sections << ""
+      sections << "## Recent disagreements (judge vs human)"
+      examples.each_with_index do |ex, i|
+        sections << "### Case #{i + 1}"
+        sections << "Input: #{ex[:input].to_s.truncate(200)}"
+        sections << "Output: #{ex[:output].to_s.truncate(200)}"
+        sections << "Judge said #{ex[:judge_score]}/5: #{ex[:judge_feedback].to_s.truncate(160)}"
+        sections << "Human said #{ex[:human_score]}/5: #{ex[:human_note].to_s.truncate(160)}"
+        sections << ""
+      end
+      sections << "## Task"
+      sections << "Propose #{@count} alternative instructions. Each should be a focused rewrite — not a wholesale rewrite of the rubric. Aim to close the disagreement gap."
+      sections << ""
+      sections << "Respond in EXACTLY this format, repeated #{@count} times:"
+      sections << ""
+      sections << "VARIANT:"
+      sections << "REASONING: <one sentence explaining what this variant changes>"
+      sections << "INSTRUCTION:"
+      sections << "<the rewritten instruction>"
+      sections << "END_VARIANT"
+      sections.join("\n")
+    end
+    def parse(text)
+      blocks = text.to_s.scan(/VARIANT:(.*?)END_VARIANT/m).flatten
+      blocks.filter_map do |raw|
+        reasoning = raw[/REASONING:\s*(.*?)(?=INSTRUCTION:|\z)/m, 1].to_s.strip
+        instruction = raw[/INSTRUCTION:\s*(.*)/m, 1].to_s.strip
+        next if instruction.empty?
+        Variant.new(reasoning: reasoning, instruction: instruction)
+      end
+    end
+  end
+  module JudgeCalibrationExamples
+    module_function
+    def for(metric, limit: 8)
+      disagreements = Calibration.where(metric_id: metric.id, verdict: "disagree")
+                                 .includes(response: :reviews)
+                                 .order(created_at: :desc)
+                                 .limit(limit)
+      disagreements.map do |cal|
+        review = cal.response.reviews.find { |r| r.metric_id == metric.id }
+        {
+          input: cal.response.input_data,
+          output: cal.response.response_text,
+          judge_score: review&.ai_score,
+          judge_feedback: review&.ai_feedback,
+          human_score: cal.corrected_score,
+          human_note: cal.note
+        }
+      end
+    end
+  end
+end

data/app/services/completion_kit/mcp_dispatcher.rb CHANGED

@@ -34,7 +34,8 @@ module CompletionKit
         McpTools::MetricGroups.definitions +
         McpTools::ProviderCredentials.definitions +
         McpTools::Tags.definitions +
-        McpTools::Calibrations.definitions
+        McpTools::Calibrations.definitions +
+        McpTools::Judges.definitions
     end
     def self.call_tool(name, arguments)
@@ -48,6 +49,7 @@ module CompletionKit
       when /\Aprovider_credentials_/ then McpTools::ProviderCredentials.call(name, arguments)
       when /\Atags_/                 then McpTools::Tags.call(name, arguments)
       when /\Acalibrations_/         then McpTools::Calibrations.call(name, arguments)
+      when /\Ajudges_/               then McpTools::Judges.call(name, arguments)
       else raise MethodNotFound, "Unknown tool: #{name}"
       end
     end

data/app/services/completion_kit/mcp_tools/judges.rb ADDED

@@ -0,0 +1,138 @@
+module CompletionKit
+  module McpTools
+    module Judges
+      extend Base
+      TOOLS = {
+        "judges_suggest" => {
+          description: "Ask the model to rewrite the metric's judge instruction in N variants targeted at the recent disagreements. Each variant is saved as a draft JudgeVersion with source=\"suggestion\". Returns the persisted drafts. Stripe-metering hooks fire via ActiveSupport::Notifications under completion_kit.judge_suggestion.generated.",
+          inputSchema: {
+            type: "object",
+            properties: {
+              metric_id: { type: "integer" },
+              count: { type: "integer", description: "How many variants to request (default 3, max 5)." },
+              model: { type: "string", description: "Override the model used to generate variants. Defaults to CompletionKit.config.judge_model." }
+            },
+            required: ["metric_id"]
+          },
+          handler: :suggest
+        },
+        "judges_replay" => {
+          description: "Run the current judge against a dataset (judge-only run). Wraps runs_create with prompt_id omitted and output_column supplied. Re-judges existing dataset outputs so you can compare against human verdicts.",
+          inputSchema: {
+            type: "object",
+            properties: {
+              name: { type: "string" },
+              metric_id: { type: "integer" },
+              dataset_id: { type: "integer" },
+              judge_model: { type: "string" },
+              output_column: { type: "string", description: "Dataset column with the existing outputs to grade. Defaults to actual_output." }
+            },
+            required: ["name", "metric_id", "dataset_id", "judge_model"]
+          },
+          handler: :replay
+        },
+        "judges_compare" => {
+          description: "Compare two judge versions' calibration stats side by side. Pass either two judge_version_ids or one metric_id with judge_version_a_id / judge_version_b_id.",
+          inputSchema: {
+            type: "object",
+            properties: {
+              metric_id: { type: "integer" },
+              judge_version_a_id: { type: "integer" },
+              judge_version_b_id: { type: "integer" }
+            },
+            required: ["metric_id", "judge_version_a_id", "judge_version_b_id"]
+          },
+          handler: :compare
+        }
+      }.freeze
+      def self.suggest(args)
+        metric = CompletionKit::Metric.find(args["metric_id"])
+        count = [args["count"].to_i, 5].min
+        count = CompletionKit::JudgeVariantGenerator::DEFAULT_VARIANT_COUNT if count <= 0
+        generator = CompletionKit::JudgeVariantGenerator.new(metric, count: count, model: args["model"])
+        variants = generator.call
+        return error_result("Variant generator returned no parseable variants. Try again or change the model.") if variants.empty?
+        versions = generator.persist!(variants)
+        text_result(versions.map(&:as_json))
+      end
+      def self.replay(args)
+        metric = CompletionKit::Metric.find(args["metric_id"])
+        dataset = CompletionKit::Dataset.find(args["dataset_id"])
+        run = CompletionKit::Run.new(
+          name: args["name"],
+          dataset: dataset,
+          judge_model: args["judge_model"],
+          output_column: args["output_column"].presence || "actual_output"
+        )
+        if run.save
+          run.replace_metrics!([metric.id])
+          text_result(run.reload.as_json)
+        else
+          error_result(run.errors.full_messages.join(", "))
+        end
+      end
+      def self.compare(args)
+        metric = CompletionKit::Metric.find(args["metric_id"])
+        a = CompletionKit::JudgeVersion.where(metric_id: metric.id).find(args["judge_version_a_id"])
+        b = CompletionKit::JudgeVersion.where(metric_id: metric.id).find(args["judge_version_b_id"])
+        stats_a = CompletionKit::MetricCalibrationStats.for(metric, judge_version: a)
+        stats_b = CompletionKit::MetricCalibrationStats.for(metric, judge_version: b)
+        text_result({
+          metric_id: metric.id,
+          a: judge_version_payload(a, stats_a),
+          b: judge_version_payload(b, stats_b),
+          delta: delta_payload(stats_a, stats_b),
+          recommendation: recommendation_for(stats_a, stats_b)
+        })
+      end
+      def self.judge_version_payload(version, stats)
+        {
+          id: version.id, state: version.state, current: version.current,
+          source: version.source, created_at: version.created_at,
+          sample_size: stats.sample_size,
+          agreement_point: stats.agreement_point,
+          agreement_low: stats.agreement_low,
+          agreement_high: stats.agreement_high,
+          borderline_rate: stats.borderline_rate,
+          mae: stats.mae, kappa: stats.kappa
+        }
+      end
+      def self.delta_payload(a, b)
+        {
+          agreement: pair_delta(a.agreement_point, b.agreement_point),
+          mae: pair_delta(a.mae, b.mae),
+          kappa: pair_delta(a.kappa, b.kappa),
+          sample_size: { a: a.sample_size, b: b.sample_size }
+        }
+      end
+      def self.pair_delta(a, b)
+        { a: a, b: b, delta: (a.nil? || b.nil?) ? nil : (b - a) }
+      end
+      def self.recommendation_for(a, b)
+        total = a.sample_size + b.sample_size
+        if total < 30
+          { state: "need_more_data", reason: "Combined n=#{total}; need 30+ to make a call." }
+        elsif a.agreement_point.nil? || b.agreement_point.nil?
+          { state: "no_change", reason: "Not enough verdicts on one of the versions to compare." }
+        else
+          lift = b.agreement_point - a.agreement_point
+          if lift > 0.03
+            { state: "recommend", reason: "B agreement +#{(lift * 100).round}pt over A." }
+          elsif lift < -0.03
+            { state: "hold", reason: "B agreement #{(lift * 100).round}pt vs A." }
+          else
+            { state: "no_change", reason: "Agreement within noise (#{(lift * 100).round}pt)." }
+          end
+        end
+      end
+    end
+  end
+end

data/app/services/completion_kit/metric_calibration_stats.rb ADDED

@@ -0,0 +1,99 @@
+module CompletionKit
+  class MetricCalibrationStats
+    PROVISIONAL_MIN = 10
+    FIRM_MIN = 30
+    Result = Struct.new(
+      :sample_size, :agree_count, :disagree_count, :borderline_count,
+      :agreement_point, :agreement_low, :agreement_high,
+      :borderline_rate, :mae, :pearson, :kappa, :gate,
+      keyword_init: true
+    ) do
+      def counter_only?
+        gate == :counter
+      end
+      def provisional?
+        gate == :provisional
+      end
+      def firm?
+        gate == :firm
+      end
+      def short_to_target
+        [PROVISIONAL_MIN - sample_size, 0].max
+      end
+      def margin
+        return nil if agreement_low.nil? || agreement_high.nil?
+        (agreement_high - agreement_low) / 2.0
+      end
+    end
+    def self.for(metric, judge_version: nil)
+      new(metric: metric, judge_version: judge_version).call
+    end
+    def initialize(metric:, judge_version: nil)
+      @metric = metric
+      @judge_version = judge_version
+    end
+    def call
+      scope = Calibration.where(metric_id: @metric.id)
+      scope = scope.where(judge_version_id: @judge_version.id) if @judge_version
+      verdicts = scope.pluck(:verdict, :corrected_score, :response_id)
+      n = verdicts.length
+      agrees = verdicts.count { |v, _, _| v == "agree" }
+      disagrees = verdicts.count { |v, _, _| v == "disagree" }
+      borderlines = verdicts.count { |v, _, _| v == "borderline" }
+      ci = CalibrationMath.wilson_interval(successes: agrees, n: n)
+      pairs = score_pairs(verdicts)
+      mae_value = CalibrationMath.mae(pairs)
+      pearson_value = CalibrationMath.pearson(pairs)
+      kappa_value = CalibrationMath.quadratic_weighted_kappa(pairs, categories: 1..5)
+      Result.new(
+        sample_size: n,
+        agree_count: agrees,
+        disagree_count: disagrees,
+        borderline_count: borderlines,
+        agreement_point: ci[:point],
+        agreement_low: ci[:low],
+        agreement_high: ci[:high],
+        borderline_rate: n.zero? ? nil : borderlines.to_f / n,
+        mae: mae_value,
+        pearson: pearson_value,
+        kappa: kappa_value,
+        gate: gate_for(n)
+      )
+    end
+    private
+    def score_pairs(verdicts)
+      response_ids = verdicts.map { |_, _, rid| rid }.uniq
+      ai_scores = Review.where(response_id: response_ids, metric_id: @metric.id)
+                       .pluck(:response_id, :ai_score).to_h
+      verdicts.filter_map do |verdict, corrected, response_id|
+        next if verdict == "borderline"
+        ai = ai_scores[response_id]
+        next if ai.nil?
+        human = verdict == "agree" ? ai : corrected
+        next if human.nil?
+        [ai.to_f, human.to_f]
+      end
+    end
+    def gate_for(n)
+      return :counter if n < PROVISIONAL_MIN
+      return :firm if n >= FIRM_MIN
+      :provisional
+    end
+  end
+end

data/app/views/completion_kit/calibrations/_buttons.html.erb CHANGED

@@ -2,20 +2,21 @@
   <% current_verdict = calibration&.verdict %>
   <% verdict_count = CompletionKit::Calibration.where(response_id: response_row.id, metric_id: metric.id).count %>
   <p class="ck-calibration__prompt">
-    How does this score feel?
+    Your verdict
     <% if verdict_count > 0 %>
-      <span class="ck-calibration__count"><%= pluralize(verdict_count, "verdict") %> collected</span>
+      <span class="ck-calibration__count"><%= pluralize(verdict_count, "verdict") %> on this score</span>
     <% end %>
   </p>
   <div class="ck-calibration__buttons">
+    <% verdict_icons = { "agree" => "hand-thumb-up", "disagree" => "hand-thumb-down", "borderline" => "scale" } %>
     <% CompletionKit::Calibration::VERDICTS.each do |verdict| %>
       <%= button_to run_response_calibrations_path(run, response_row, metric_id: metric.id, verdict: verdict),
               method: :post,
               form: { data: { turbo: "true" } },
               class: "ck-calibration__pill ck-calibration__pill--#{verdict}#{' is-active' if verdict == current_verdict}",
               "aria-pressed": (verdict == current_verdict).to_s do %>
-        <% case verdict
-           when "agree" %>👍 Agree<% when "disagree" %>👎 Disagree<% else %>🤔 Borderline<% end %>
+        <%= heroicon_tag verdict_icons[verdict], variant: :outline, size: 14, "aria-hidden": "true" %>
+        <span><%= verdict %></span>
       <% end %>
     <% end %>
   </div>

data/app/views/completion_kit/calibrations/_trust_panel.html.erb ADDED Viewed

@@ -0,0 +1,34 @@
+<%# Judge-trust panel. Expects a `stats` local exposing gate, counter_only?, sample_size, short_to_target, agreement_point, margin, firm?, borderline_rate, mae and kappa. %>
+<% stats = local_assigns[:stats] %>
+<div class="ck-trust-panel ck-trust-panel--<%= stats.gate %>">
+  <p class="ck-trust-panel__label">Judge trust</p>
+  <% if stats.counter_only? %>
+    <%# Too few verdicts for a score: show progress toward PROVISIONAL_MIN instead. %>
+    <div class="ck-trust-panel__body">
+      <span class="ck-trust-panel__counter"><%= stats.sample_size %><span class="ck-trust-panel__counter-of">/ <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></span></span>
+      <%# "more" is printed literally: pluralize(n, "more") would inflect it to "mores". %>
+      <span class="ck-trust-panel__hint">verdicts<% if stats.short_to_target > 0 %> · <%= stats.short_to_target %> more to score<% end %></span>
+    </div>
+  <% else %>
+    <%# Agreement point and margin are fractions; both are rendered as whole percentages. %>
+    <div class="ck-trust-panel__body">
+      <span class="ck-trust-panel__score">~<%= (stats.agreement_point * 100).round %><span class="ck-trust-panel__score-pct">%</span></span>
+      <span class="ck-trust-panel__margin">±<%= (stats.margin * 100).round %> pt</span>
+      <span class="ck-trust-panel__gate"><%= stats.firm? ? "settled" : "provisional" %></span>
+    </div>
+    <div class="ck-trust-panel__details">
+      <span><%= stats.sample_size %> verdicts</span>
+      <% if stats.borderline_rate && stats.borderline_rate > 0 %>
+        <%# Borderline share above 15% is flagged as a warning, above 30% as danger. %>
+        <% level = if stats.borderline_rate > 0.30 then "danger"
+                   elsif stats.borderline_rate > 0.15 then "warning"
+                   else "ok" end %>
+        <span class="ck-trust-panel__borderline ck-trust-panel__borderline--<%= level %>"
+              title="<%= level == 'ok' ? '' : 'Rubric ambiguous. Consider splitting the metric or clarifying the rubric.' %>">
+          <%= (stats.borderline_rate * 100).round %>% borderline
+        </span>
+      <% end %>
+      <% if stats.mae %>
+        <span>MAE <%= stats.mae.round(2) %></span>
+      <% end %>
+      <% if stats.kappa %>
+        <span>κ <%= stats.kappa.round(2) %></span>
+      <% end %>
+    </div>
+  <% end %>
+</div>

data/app/views/completion_kit/metrics/show.html.erb CHANGED Viewed

@@ -6,8 +6,27 @@
 <section class="ck-page-header">
   <div>
     <h1 class="ck-title"><%= @metric.name %></h1>
+    <% if CompletionKit.config.judge_calibration_enabled %>
+      <%= render "completion_kit/calibrations/trust_panel",
+            stats: CompletionKit::MetricCalibrationStats.for(@metric) %>
+      <% if @latest_draft %>
+        <div class="ck-draft-banner">
+          <span class="ck-chip ck-chip--soft">Draft pending</span>
+          <span class="ck-meta-copy">An edit forked a draft judge version. Publish it to make this the current judge.</span>
+          <%= button_to "Publish draft", publish_draft_metric_path(@metric),
+                method: :post, form_class: "inline-block",
+                class: ck_button_classes(:dark) %>
+        </div>
+      <% end %>
+    <% end %>
   </div>
   <div class="ck-actions">
+    <% if CompletionKit.config.judge_calibration_enabled %>
+      <%= button_to "Suggest improvements", suggest_variants_metric_path(@metric),
+            method: :post, form_class: "inline-block",
+            class: ck_button_classes(:light, variant: :outline),
+            data: { turbo_confirm: "Ask the model to propose new judge instructions based on the disagreements collected so far?" } %>
+    <% end %>
     <%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
   </div>
 </section>
@@ -42,3 +61,125 @@
     <% end %>
   </div>
 </section>
+<% if CompletionKit.config.judge_calibration_enabled %>
+  <section class="ck-card ck-card--spaced">
+    <div class="ck-prompt-preview__header">
+      <p class="ck-kicker">Disagreements</p>
+      <% if @disagreements.any? %>
+        <span class="ck-chip"><%= pluralize(@disagreements.size, "row") %></span>
+      <% end %>
+    </div>
+    <% if @disagreements.empty? %>
+      <p class="ck-meta-copy">No disagreements yet. As humans give the verdict "disagree" on individual rows, the judge's misses will show up here for review.</p>
+    <% else %>
+      <table class="ck-results-table ck-disagreements-table">
+        <thead>
+          <tr>
+            <th scope="col">Run · row</th>
+            <th scope="col">Judge</th>
+            <th scope="col">Human</th>
+            <th scope="col">Note</th>
+            <th scope="col"></th>
+          </tr>
+        </thead>
+        <tbody>
+          <% existing_ids = Array(@metric.few_shot_examples).map { |fs| fs["calibration_id"] } %>
+          <% @disagreements.each do |cal| %>
+            <% review = cal.response.reviews.find { |r| r.metric_id == @metric.id } %>
+            <% already = existing_ids.include?(cal.id) %>
+            <tr>
+              <td>
+                <%= link_to ck_run_path(cal.response.run), class: "ck-record-name" do %>
+                  <strong><%= cal.response.run.name.to_s.truncate(40) %></strong>
+                <% end %>
+                <span class="ck-meta-copy">· <%= link_to "row ##{cal.response.id}", run_response_path(cal.response.run, cal.response), class: "ck-link" %></span>
+              </td>
+              <td>
+                <% if review&.ai_score %>
+                  <span class="<%= ck_badge_classes(ck_score_kind(review.ai_score.to_f)) %>"><%= review.ai_score %></span>
+                <% else %>
+                  <span class="ck-meta-copy">—</span>
+                <% end %>
+              </td>
+              <td>
+                <% if cal.corrected_score %>
+                  <span class="<%= ck_badge_classes(ck_score_kind(cal.corrected_score.to_f)) %>"><%= cal.corrected_score %></span>
+                <% else %>
+                  <span class="ck-meta-copy">—</span>
+                <% end %>
+              </td>
+              <td class="ck-meta-copy"><%= cal.note.to_s.truncate(120) %></td>
+              <td>
+                <% if already %>
+                  <span class="ck-chip ck-chip--done">Added</span>
+                <% else %>
+                  <%= button_to "Add as judge few-shot",
+                        add_few_shot_metric_path(@metric, calibration_id: cal.id),
+                        method: :post,
+                        form_class: "inline-block",
+                        class: ck_button_classes(:light, variant: :outline) %>
+                <% end %>
+              </td>
+            </tr>
+          <% end %>
+        </tbody>
+      </table>
+    <% end %>
+  </section>
+  <% if @suggestion_drafts.any? %>
+    <section class="ck-card ck-card--spaced">
+      <div class="ck-prompt-preview__header">
+        <p class="ck-kicker">Suggested judge variants</p>
+        <span class="ck-chip"><%= @suggestion_drafts.size %> draft<%= @suggestion_drafts.size == 1 ? "" : "s" %></span>
+      </div>
+      <p class="ck-meta-copy">Pick one and publish it to make it the current judge. The previous published version stays in history.</p>
+      <div class="ck-suggestion-list">
+        <% @suggestion_drafts.each do |draft| %>
+          <article class="ck-suggestion-card">
+            <header class="ck-suggestion-card__header">
+              <span class="ck-chip ck-chip--soft">Draft #<%= draft.id %></span>
+              <time class="ck-meta-copy" data-relative-time datetime="<%= draft.created_at.utc.iso8601 %>"><%= time_ago_in_words(draft.created_at) %> ago</time>
+            </header>
+            <pre class="ck-code ck-suggestion-card__instruction"><%= draft.instruction %></pre>
+            <div class="ck-actions">
+              <%= button_to "Publish this draft", publish_draft_metric_path(@metric),
+                    method: :post, form_class: "inline-block",
+                    class: ck_button_classes(:dark) %>
+            </div>
+          </article>
+        <% end %>
+      </div>
+    </section>
+  <% end %>
+  <% if Array(@metric.few_shot_examples).any? %>
+    <section class="ck-card ck-card--spaced">
+      <div class="ck-prompt-preview__header">
+        <p class="ck-kicker">Judge few-shot examples</p>
+        <span class="ck-chip"><%= pluralize(Array(@metric.few_shot_examples).size, "example") %></span>
+      </div>
+      <p class="ck-meta-copy">Disagreements added here will be injected as worked examples when the judge runs on this metric. Used by Phase 4 / 5 to retrain the judge.</p>
+      <ol class="ck-few-shot-list">
+        <% Array(@metric.few_shot_examples).each do |fs| %>
+          <li class="ck-few-shot-item">
+            <div class="ck-few-shot-item__scores">
+              <span class="ck-meta-copy">judge said</span>
+              <% if fs["judge_score"] %>
+                <span class="<%= ck_badge_classes(ck_score_kind(fs["judge_score"].to_f)) %>"><%= fs["judge_score"] %></span>
+              <% end %>
+              <span class="ck-meta-copy">human said</span>
+              <% if fs["human_score"] %>
+                <span class="<%= ck_badge_classes(ck_score_kind(fs["human_score"].to_f)) %>"><%= fs["human_score"] %></span>
+              <% end %>
+            </div>
+            <% if fs["human_note"].to_s.present? %>
+              <p class="ck-copy"><%= fs["human_note"] %></p>
+            <% end %>
+          </li>
+        <% end %>
+      </ol>
+    </section>
+  <% end %>
+<% end %>

data/config/routes.rb CHANGED Viewed

@@ -12,7 +12,13 @@ CompletionKit::Engine.routes.draw do
   end
   resources :datasets
-  resources :metrics
+  resources :metrics do
+    member do
+      post :add_few_shot
+      post :publish_draft
+      post :suggest_variants
+    end
+  end
   resources :metric_groups
   resources :tags
   resources :dashboard_dismissals, only: [:create, :destroy]

data/db/migrate/20260523000001_add_few_shot_examples_to_completion_kit_metrics.rb ADDED Viewed

@@ -0,0 +1,5 @@
+class AddFewShotExamplesToCompletionKitMetrics < ActiveRecord::Migration[8.1]
+  def change
+    add_column :completion_kit_metrics, :few_shot_examples, :text
+  end
+end

data/db/migrate/20260523000002_add_state_to_completion_kit_judge_versions.rb ADDED Viewed

@@ -0,0 +1,15 @@
+# Adds `state` and `source` columns to completion_kit_judge_versions, marks
+# every existing row as published, and indexes (metric_id, state).
+class AddStateToCompletionKitJudgeVersions < ActiveRecord::Migration[8.1]
+  def change
+    # NOT NULL with a "published" default, so new rows are published unless told otherwise.
+    add_column :completion_kit_judge_versions, :state, :string, null: false, default: "published"
+    # Nullable; what populates it is not visible in this migration.
+    add_column :completion_kit_judge_versions, :source, :string
+    reversible do |dir|
+      dir.up do
+        # Explicit backfill of pre-existing rows; runs only when migrating up.
+        # NOTE(review): the column default above presumably already covers
+        # existing rows, which would make this UPDATE redundant — confirm
+        # against the target database before removing it.
+        execute "UPDATE completion_kit_judge_versions SET state = 'published'"
+      end
+    end
+    # Explicit index name given instead of the auto-generated one.
+    add_index :completion_kit_judge_versions, [:metric_id, :state],
+              name: "index_ck_judge_versions_on_metric_state"
+  end
+end

data/lib/completion_kit/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module CompletionKit
-  VERSION = "0.5.36"
+  VERSION = "0.5.37"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: completion-kit
 version: !ruby/object:Gem::Version
-  version: 0.5.36
+  version: 0.5.37
 platform: ruby
 authors:
 - Damien Bastin
@@ -290,14 +290,17 @@ files:
 - app/models/concerns/completion_kit/taggable.rb
 - app/services/completion_kit/anthropic_client.rb
 - app/services/completion_kit/api_config.rb
+- app/services/completion_kit/calibration_math.rb
 - app/services/completion_kit/csv_processor.rb
 - app/services/completion_kit/dashboard_stats.rb
 - app/services/completion_kit/judge_service.rb
+- app/services/completion_kit/judge_variant_generator.rb
 - app/services/completion_kit/llm_client.rb
 - app/services/completion_kit/mcp_dispatcher.rb
 - app/services/completion_kit/mcp_tools/base.rb
 - app/services/completion_kit/mcp_tools/calibrations.rb
 - app/services/completion_kit/mcp_tools/datasets.rb
+- app/services/completion_kit/mcp_tools/judges.rb
 - app/services/completion_kit/mcp_tools/metric_groups.rb
 - app/services/completion_kit/mcp_tools/metrics.rb
 - app/services/completion_kit/mcp_tools/prompts.rb
@@ -305,6 +308,7 @@ files:
 - app/services/completion_kit/mcp_tools/responses.rb
 - app/services/completion_kit/mcp_tools/runs.rb
 - app/services/completion_kit/mcp_tools/tags.rb
+- app/services/completion_kit/metric_calibration_stats.rb
 - app/services/completion_kit/model_discovery_service.rb
 - app/services/completion_kit/ollama_client.rb
 - app/services/completion_kit/onboarding/checklist.rb
@@ -323,6 +327,7 @@ files:
 - app/views/completion_kit/api_reference/_resource_list.html.erb
 - app/views/completion_kit/api_reference/index.html.erb
 - app/views/completion_kit/calibrations/_buttons.html.erb
+- app/views/completion_kit/calibrations/_trust_panel.html.erb
 - app/views/completion_kit/dashboard/_eye_icon.html.erb
 - app/views/completion_kit/dashboard/_eye_off_icon.html.erb
 - app/views/completion_kit/dashboard/_failures_card.html.erb
@@ -407,6 +412,8 @@ files:
 - db/migrate/20260516000001_create_completion_kit_dashboard_dismissals.rb
 - db/migrate/20260522000001_create_completion_kit_judge_versions.rb
 - db/migrate/20260522000002_create_completion_kit_calibrations.rb
+- db/migrate/20260523000001_add_few_shot_examples_to_completion_kit_metrics.rb
+- db/migrate/20260523000002_add_state_to_completion_kit_judge_versions.rb
 - lib/completion-kit.rb
 - lib/completion_kit.rb
 - lib/completion_kit/concurrency_check.rb