RubyGems - completion-kit - Versions diffs - 0.5.36 → 0.5.38 - Mend

completion-kit 0.5.36 → 0.5.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

checksums.yaml +4 -4
data/app/assets/stylesheets/completion_kit/application.css.erb +218 -19
data/app/controllers/completion_kit/metrics_controller.rb +58 -1
data/app/models/completion_kit/judge_version.rb +17 -1
data/app/models/completion_kit/metric.rb +19 -0
data/app/services/completion_kit/calibration_math.rb +84 -0
data/app/services/completion_kit/judge_variant_generator.rb +108 -0
data/app/services/completion_kit/mcp_dispatcher.rb +3 -1
data/app/services/completion_kit/mcp_tools/judges.rb +138 -0
data/app/services/completion_kit/metric_calibration_stats.rb +99 -0
data/app/views/completion_kit/calibrations/_buttons.html.erb +15 -6
data/app/views/completion_kit/calibrations/_trust_panel.html.erb +31 -0
data/app/views/completion_kit/metrics/index.html.erb +18 -0
data/app/views/completion_kit/metrics/show.html.erb +144 -0
data/config/routes.rb +7 -1
data/db/migrate/20260523000001_add_few_shot_examples_to_completion_kit_metrics.rb +5 -0
data/db/migrate/20260523000002_add_state_to_completion_kit_judge_versions.rb +15 -0
data/lib/completion_kit/version.rb +1 -1
metadata +8 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: f3189fdba715e4750befcadf517462d0523ae7366611964388309031ad6a4d4f
-  data.tar.gz: 322e2dd4847c5e8bd3b8af83b4b09873efcd4238c945d928fdac7c7ffbbaf7de
+  metadata.gz: fc7d527828189c2993060b315dca634fb958d2da11fd7fae63c4790179c46701
+  data.tar.gz: f0323b980bdfb35d36742b548ddd3629e66d39e587775521678dc80b4cd2f068
 SHA512:
-  metadata.gz: f611cfbc07196fd75eb16962bb1acf4a271d759f42473af9b94d049860839a12657223c72bb12aaeaa80a14adfbb29a20c4cb3c4c3176193ce44966bb876b011
-  data.tar.gz: 2806d017ce92c625e6c7f83e789d13afe53065107a5a803f0d309cb1eaee65d752a8b1e18734c42e9106fc6c38c2cbbfcc803f76477571d7e2330130ebe8eee1
+  metadata.gz: '020946bdac698194bb5246cfbe21fdf45c56006c80c15d1d7bcfda4d3494d95cde45645e090df14a411b172c83dcde42be777d74811b25a340e5710dba6ae7ce'
+  data.tar.gz: 1b4f0ea8cf4e613df783ac428404ef1ae19b285db04f8a6768119760c65fb81d9ee72d52905ab2ef30e077ae910eb00e622925c1dcc43aee0c9e3a0f748718b1

data/app/assets/stylesheets/completion_kit/application.css.erb CHANGED Viewed

@@ -5123,53 +5123,252 @@ a.tag-mark {
 .ck-calibration {
   margin-top: 12px;
   padding-top: 12px;
-  border-top: 1px dashed rgba(255, 255, 255, 0.08);
+  border-top: 1px dashed var(--ck-line);
 }
 .ck-calibration__prompt {
-  font-size: 0.8rem;
+  font-family: var(--ck-mono);
+  font-size: 0.72rem;
+  letter-spacing: 0.06em;
+  text-transform: uppercase;
   color: var(--ck-dim);
-  margin: 0 0 8px;
+  margin: 0 0 10px;
   display: flex;
   align-items: center;
-  gap: 8px;
+  gap: 10px;
 }
 .ck-calibration__count {
-  font-size: 0.75rem;
+  font-family: var(--ck-mono);
+  font-size: 0.72rem;
+  letter-spacing: 0.03em;
   color: var(--ck-accent);
+  text-transform: none;
 }
 .ck-calibration__buttons {
   display: flex;
-  gap: 8px;
+  gap: 6px;
   flex-wrap: wrap;
 }
 .ck-calibration__pill {
   display: inline-flex;
   align-items: center;
-  gap: 6px;
-  padding: 6px 12px;
-  border-radius: 999px;
-  font-size: 0.85rem;
-  background: transparent;
-  border: 1px solid rgba(255, 255, 255, 0.18);
-  color: inherit;
+  gap: 0.4rem;
+  padding: 0.32rem 0.65rem;
+  border-radius: 4px;
+  font-family: var(--ck-mono);
+  font-size: 0.78rem;
+  font-weight: 500;
+  letter-spacing: 0.04em;
+  text-transform: uppercase;
+  background: var(--ck-surface-soft);
+  border: 1px solid var(--ck-line);
+  color: var(--ck-dim);
   cursor: pointer;
+  transition: background 0.12s, border-color 0.12s, color 0.12s;
+}
+.ck-calibration__pill svg {
+  width: 14px;
+  height: 14px;
 }
 .ck-calibration__pill:hover,
 .ck-calibration__pill:focus-visible {
-  border-color: var(--ck-accent);
+  color: var(--ck-text);
+  border-color: var(--ck-dim);
 }
-.ck-calibration__pill.is-active {
-  background: var(--ck-accent);
-  color: #0b1320;
-  border-color: var(--ck-accent);
+.ck-calibration__pill--agree.is-active {
+  background: var(--ck-success-soft);
+  border-color: rgba(45, 212, 168, 0.35);
+  color: var(--ck-success);
 }
+.ck-calibration__pill--disagree.is-active {
+  background: var(--ck-danger-soft);
+  border-color: rgba(248, 113, 113, 0.35);
+  color: var(--ck-danger);
+}
+.ck-calibration__pill--borderline.is-active {
+  background: var(--ck-warning-soft);
+  border-color: rgba(224, 164, 88, 0.35);
+  color: var(--ck-warning);
+}
+.ck-calibration__pill--agree:hover { border-color: rgba(45, 212, 168, 0.45); color: var(--ck-success); }
+.ck-calibration__pill--disagree:hover { border-color: rgba(248, 113, 113, 0.45); color: var(--ck-danger); }
+.ck-calibration__pill--borderline:hover { border-color: rgba(224, 164, 88, 0.45); color: var(--ck-warning); }
 .ck-calibration__detail {
-  margin-top: 10px;
+  margin-top: 12px;
   display: flex;
   flex-direction: column;
   gap: 8px;
+  padding: 12px;
+  background: var(--ck-surface-soft);
+  border: 1px solid var(--ck-line);
+  border-radius: 6px;
 }
 .ck-calibration__value {
   color: var(--ck-accent);
+  font-family: var(--ck-mono);
+  font-weight: 600;
+}
+.ck-trust-panel {
+  display: inline-flex;
+  flex-direction: column;
+  gap: 6px;
+  margin-top: 12px;
+  padding: 10px 14px;
+  background: var(--ck-surface-soft);
+  border: 1px solid var(--ck-line);
+  border-radius: 6px;
+}
+.ck-trust-panel__label {
+  margin: 0;
+  font-family: var(--ck-mono);
+  font-size: 0.7rem;
+  letter-spacing: 0.08em;
+  text-transform: uppercase;
+  color: var(--ck-dim);
+}
+.ck-trust-panel__body {
+  display: flex;
+  align-items: baseline;
+  gap: 10px;
+}
+.ck-trust-panel__counter {
+  font-family: var(--ck-mono);
+  font-size: 1.6rem;
+  font-weight: 600;
+  color: var(--ck-accent);
+}
+.ck-trust-panel__counter-of {
+  font-size: 0.9rem;
+  color: var(--ck-dim);
+  margin-left: 4px;
+}
+.ck-trust-panel__hint {
+  font-family: var(--ck-mono);
+  font-size: 0.72rem;
+  color: var(--ck-dim);
+  letter-spacing: 0.04em;
+}
+.ck-trust-panel__score {
+  font-family: var(--ck-mono);
+  font-size: 1.6rem;
   font-weight: 600;
+  color: var(--ck-success);
+}
+.ck-trust-panel__score-pct {
+  font-size: 0.9rem;
+  color: var(--ck-dim);
+  margin-left: 2px;
+}
+.ck-trust-panel__margin {
+  font-family: var(--ck-mono);
+  font-size: 0.8rem;
+  color: var(--ck-dim);
+}
+.ck-trust-panel__gate {
+  font-family: var(--ck-mono);
+  font-size: 0.66rem;
+  letter-spacing: 0.08em;
+  text-transform: uppercase;
+  padding: 2px 6px;
+  border-radius: 3px;
+  background: var(--ck-surface);
+  border: 1px solid var(--ck-line);
+  color: var(--ck-dim);
+}
+.ck-trust-panel--firm .ck-trust-panel__gate {
+  color: var(--ck-success);
+  border-color: rgba(45, 212, 168, 0.35);
+}
+.ck-trust-panel__details {
+  display: flex;
+  flex-wrap: wrap;
+  gap: 14px;
+  font-family: var(--ck-mono);
+  font-size: 0.72rem;
+  color: var(--ck-dim);
+}
+.ck-trust-panel__borderline {
+  color: var(--ck-warning);
+}
+.ck-trust-panel__borderline--ok { color: var(--ck-dim); }
+.ck-trust-panel__borderline--warning { color: var(--ck-warning); }
+.ck-trust-panel__borderline--danger { color: var(--ck-danger); }
+.ck-disagreements-table td .ck-meta-copy {
+  font-size: 0.78rem;
+}
+.ck-few-shot-list {
+  list-style: decimal;
+  padding-left: 1.4rem;
+  margin: 0;
+  display: flex;
+  flex-direction: column;
+  gap: 12px;
+}
+.ck-few-shot-item {
+  padding: 10px 12px;
+  background: var(--ck-surface-soft);
+  border: 1px solid var(--ck-line);
+  border-radius: 6px;
+}
+.ck-few-shot-item__scores {
+  display: flex;
+  align-items: center;
+  gap: 8px;
+  font-family: var(--ck-mono);
+  font-size: 0.75rem;
+  letter-spacing: 0.04em;
+  text-transform: uppercase;
+}
+.ck-draft-banner {
+  display: inline-flex;
+  align-items: center;
+  gap: 10px;
+  margin-top: 10px;
+  padding: 8px 12px;
+  background: var(--ck-accent-soft);
+  border: 1px dashed rgba(6, 182, 212, 0.4);
+  border-radius: 6px;
+}
+.ck-suggestion-list {
+  display: flex;
+  flex-direction: column;
+  gap: 12px;
+}
+.ck-suggestion-card {
+  padding: 12px 14px;
+  background: var(--ck-surface-soft);
+  border: 1px solid var(--ck-line);
+  border-radius: 6px;
+  display: flex;
+  flex-direction: column;
+  gap: 10px;
+}
+.ck-suggestion-card__header {
+  display: flex;
+  align-items: center;
+  gap: 10px;
+}
+.ck-suggestion-card__instruction {
+  margin: 0;
+  white-space: pre-wrap;
+  font-size: 0.85rem;
+  background: var(--ck-bg-strong);
+  padding: 10px 12px;
+  border-radius: 4px;
+  border: 1px solid var(--ck-line);
+}
+.ck-metrics-table__trust {
+  font-family: var(--ck-mono);
+  font-size: 0.78rem;
+  letter-spacing: 0.03em;
+}
+.ck-metrics-table__trust-rate {
+  font-weight: 600;
+  color: var(--ck-success);
+  margin-right: 6px;
 }

data/app/controllers/completion_kit/metrics_controller.rb CHANGED Viewed

@@ -1,13 +1,19 @@
 module CompletionKit
   class MetricsController < ApplicationController
     include CompletionKit::TagFiltering
-    before_action :set_metric, only: [:show, :edit, :update, :destroy]
+    before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :publish_draft, :suggest_variants]
     def index
       @metrics = apply_tag_filter(Metric.includes(:metric_groups, :tags).order(:name))
     end
     def show
+      @disagreements = Calibration.where(metric_id: @metric.id, verdict: "disagree")
+                                  .includes(response: [:reviews, :run])
+                                  .order(created_at: :desc)
+                                  .limit(50)
+      @latest_draft = JudgeVersion.drafts.where(metric_id: @metric.id).order(created_at: :desc).first
+      @suggestion_drafts = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc)
     end
     def new
@@ -40,6 +46,57 @@ module CompletionKit
       redirect_to metrics_path, notice: "Metric was successfully destroyed."
     end
+    def suggest_variants
+      generator = JudgeVariantGenerator.new(@metric)
+      variants = generator.call
+      if variants.empty?
+        redirect_to metric_path(@metric), alert: "The model returned no usable variants. Try again with a different model."
+        return
+      end
+      generator.persist!(variants)
+      label = variants.length == 1 ? "alternative" : "alternatives"
+      redirect_to metric_path(@metric), notice: "Wrote #{variants.length} #{label} for the judge instruction. Pick one to make it live."
+    end
+    def publish_draft
+      scope = JudgeVersion.drafts.where(metric_id: @metric.id)
+      draft = params[:draft_id].present? ? scope.find_by(id: params[:draft_id]) : scope.order(created_at: :desc).first
+      if draft.nil?
+        redirect_to metric_path(@metric), alert: "No draft to publish."
+        return
+      end
+      JudgeVersion.transaction do
+        JudgeVersion.where(metric_id: @metric.id, state: "published").update_all(current: false)
+        draft.update!(state: "published", current: true)
+        @metric.update_columns(
+          instruction: draft.instruction,
+          rubric_bands: Array(draft.rubric_bands).to_json
+        )
+      end
+      redirect_to metric_path(@metric), notice: "This judge version is now live."
+    end
+    def add_few_shot
+      calibration = Calibration.where(metric_id: @metric.id, verdict: "disagree").find(params[:calibration_id])
+      review = calibration.response.reviews.find_by(metric_id: @metric.id)
+      examples = Array(@metric.few_shot_examples)
+      examples << {
+        "input" => calibration.response.input_data.to_s.truncate(2000),
+        "response" => calibration.response.response_text.to_s.truncate(2000),
+        "judge_score" => review&.ai_score&.to_f,
+        "judge_feedback" => review&.ai_feedback.to_s.truncate(1000),
+        "human_score" => calibration.corrected_score&.to_f,
+        "human_note" => calibration.note.to_s.truncate(1000),
+        "calibration_id" => calibration.id,
+        "added_at" => Time.current.utc.iso8601
+      }
+      @metric.update!(few_shot_examples: examples)
+      redirect_to metric_path(@metric), notice: "Saved as a teaching example. The judge will see it next time it grades."
+    end
     private
     def set_metric

data/app/models/completion_kit/judge_version.rb CHANGED Viewed

@@ -1,23 +1,37 @@
 module CompletionKit
   class JudgeVersion < ApplicationRecord
+    STATES = %w[draft published].freeze
     belongs_to :metric
     has_many :calibrations, dependent: :destroy
     serialize :rubric_bands, coder: JSON
     validates :metric_id, presence: true
+    validates :state, inclusion: { in: STATES }
     scope :current, -> { where(current: true) }
+    scope :published, -> { where(state: "published") }
+    scope :drafts, -> { where(state: "draft") }
     def self.ensure_current_for(metric)
       current.find_by(metric_id: metric.id) || create!(
         metric: metric,
         instruction: metric.instruction,
         rubric_bands: metric.rubric_bands,
-        current: true
+        current: true,
+        state: "published"
       )
     end
+    def draft?
+      state == "draft"
+    end
+    def published?
+      state == "published"
+    end
     def as_json(options = {})
       {
         id: id,
@@ -25,6 +39,8 @@ module CompletionKit
         instruction: instruction,
         rubric_bands: rubric_bands,
         current: current,
+        state: state,
+        source: source,
         created_at: created_at
       }
     end

data/app/models/completion_kit/metric.rb CHANGED Viewed

@@ -16,6 +16,7 @@ module CompletionKit
     has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
     serialize :rubric_bands, coder: JSON
+    serialize :few_shot_examples, coder: JSON, type: Array
     validates :name, presence: true
     validates :key, tenant_scoped_uniqueness: { allow_nil: true }
@@ -23,6 +24,7 @@ module CompletionKit
     before_validation :generate_key
     before_validation :normalize_rubric_bands
     before_validation :set_defaults
+    after_update :fork_draft_judge_version, if: :judge_relevant_changes?
     def self.default_rubric_bands
       DEFAULT_RUBRIC_BANDS.map(&:dup)
@@ -95,5 +97,22 @@ module CompletionKit
     def normalize_rubric_bands
       self.rubric_bands = self.class.normalize_rubric_bands(rubric_bands) if rubric_bands.present?
     end
+    def judge_relevant_changes?
+      saved_change_to_instruction? || saved_change_to_rubric_bands?
+    end
+    def fork_draft_judge_version
+      JudgeVersion.ensure_current_for(self)
+      JudgeVersion.where(metric_id: id, state: "draft").update_all(current: false)
+      JudgeVersion.create!(
+        metric: self,
+        instruction: instruction,
+        rubric_bands: rubric_bands,
+        current: false,
+        state: "draft",
+        source: "edit"
+      )
+    end
   end
 end

data/app/services/completion_kit/calibration_math.rb ADDED Viewed

@@ -0,0 +1,84 @@
+module CompletionKit
+  module CalibrationMath
+    Z_95 = 1.959963984540054
+    module_function
+    def wilson_interval(successes:, n:, z: Z_95)
+      return { point: nil, low: nil, high: nil } if n.to_i.zero?
+      p_hat = successes.to_f / n
+      denom = 1.0 + (z * z) / n
+      center = (p_hat + (z * z) / (2.0 * n)) / denom
+      margin = z * Math.sqrt((p_hat * (1 - p_hat) / n) + ((z * z) / (4.0 * n * n))) / denom
+      { point: p_hat, low: [center - margin, 0.0].max, high: [center + margin, 1.0].min }
+    end
+    def mae(pairs)
+      return nil if pairs.empty?
+      sum = pairs.sum { |ai, human| (ai.to_f - human.to_f).abs }
+      sum / pairs.length
+    end
+    def pearson(pairs)
+      return nil if pairs.length < 2
+      xs = pairs.map { |a, _| a.to_f }
+      ys = pairs.map { |_, h| h.to_f }
+      mx = xs.sum / xs.length
+      my = ys.sum / ys.length
+      num = xs.zip(ys).sum { |x, y| (x - mx) * (y - my) }
+      dx2 = xs.sum { |x| (x - mx)**2 }
+      dy2 = ys.sum { |y| (y - my)**2 }
+      denom = Math.sqrt(dx2 * dy2)
+      return nil if denom.zero?
+      num / denom
+    end
+    def quadratic_weighted_kappa(pairs, categories:)
+      return nil if pairs.empty?
+      ratings = categories.to_a
+      k = ratings.length
+      return nil if k < 2
+      index = ratings.each_with_index.to_h
+      observed = Array.new(k) { Array.new(k, 0) }
+      row_totals = Array.new(k, 0)
+      col_totals = Array.new(k, 0)
+      n = 0
+      pairs.each do |ai, human|
+        i = index[score_bucket(ai, ratings)]
+        j = index[score_bucket(human, ratings)]
+        next if i.nil? || j.nil?
+        observed[i][j] += 1
+        row_totals[i] += 1
+        col_totals[j] += 1
+        n += 1
+      end
+      return nil if n.zero?
+      max_dist_sq = (k - 1.0)**2
+      numerator = 0.0
+      denominator = 0.0
+      (0...k).each do |i|
+        (0...k).each do |j|
+          weight = ((i - j)**2) / max_dist_sq
+          expected = (row_totals[i] * col_totals[j]).to_f / n
+          numerator   += weight * observed[i][j]
+          denominator += weight * expected
+        end
+      end
+      return 1.0 if denominator.zero?
+      1.0 - (numerator / denominator)
+    end
+    def score_bucket(value, ratings)
+      rounded = value.to_f.round
+      return ratings.first if rounded <= ratings.first
+      return ratings.last if rounded >= ratings.last
+      rounded
+    end
+  end
+end

data/app/services/completion_kit/judge_variant_generator.rb ADDED Viewed

@@ -0,0 +1,108 @@
+module CompletionKit
+  class JudgeVariantGenerator
+    DEFAULT_VARIANT_COUNT = 3
+    DEFAULT_TEMPERATURE = 0.4
+    Variant = Struct.new(:reasoning, :instruction, keyword_init: true)
+    def initialize(metric, count: DEFAULT_VARIANT_COUNT, model: nil)
+      @metric = metric
+      @count = count
+      @model = model || CompletionKit.config.judge_model
+    end
+    def call
+      client = LlmClient.for_model(@model, ApiConfig.for_model(@model))
+      raw = client.generate_completion(build_meta_prompt, model: @model, max_tokens: 2500, temperature: DEFAULT_TEMPERATURE)
+      parse(raw).first(@count)
+    end
+    def persist!(variants)
+      JudgeVersion.where(metric_id: @metric.id, state: "draft", source: "suggestion").update_all(current: false)
+      versions = variants.map do |variant|
+        JudgeVersion.create!(
+          metric: @metric,
+          instruction: variant.instruction,
+          rubric_bands: @metric.rubric_bands,
+          state: "draft",
+          source: "suggestion",
+          current: false
+        )
+      end
+      ActiveSupport::Notifications.instrument("completion_kit.judge_suggestion.generated",
+                                              metric_id: @metric.id,
+                                              count: versions.length,
+                                              model: @model)
+      versions
+    end
+    private
+    def build_meta_prompt
+      examples = JudgeCalibrationExamples.for(@metric)
+      sections = []
+      sections << "You are an expert evaluator. Rewrite a judge's grading instruction so it agrees better with humans on the cases below."
+      sections << ""
+      sections << "## Current instruction"
+      sections << "```"
+      sections << @metric.instruction.to_s
+      sections << "```"
+      sections << ""
+      sections << "## Rubric (unchanged across variants — only rewrite the instruction)"
+      sections << @metric.display_rubric_text
+      sections << ""
+      sections << "## Recent disagreements (judge vs human)"
+      examples.each_with_index do |ex, i|
+        sections << "### Case #{i + 1}"
+        sections << "Input: #{ex[:input].to_s.truncate(200)}"
+        sections << "Output: #{ex[:output].to_s.truncate(200)}"
+        sections << "Judge said #{ex[:judge_score]}/5: #{ex[:judge_feedback].to_s.truncate(160)}"
+        sections << "Human said #{ex[:human_score]}/5: #{ex[:human_note].to_s.truncate(160)}"
+        sections << ""
+      end
+      sections << "## Task"
+      sections << "Propose #{@count} alternative instructions. Each should be a focused rewrite — not a wholesale rewrite of the rubric. Aim to close the disagreement gap."
+      sections << ""
+      sections << "Respond in EXACTLY this format, repeated #{@count} times:"
+      sections << ""
+      sections << "VARIANT:"
+      sections << "REASONING: <one sentence explaining what this variant changes>"
+      sections << "INSTRUCTION:"
+      sections << "<the rewritten instruction>"
+      sections << "END_VARIANT"
+      sections.join("\n")
+    end
+    def parse(text)
+      blocks = text.to_s.scan(/VARIANT:(.*?)END_VARIANT/m).flatten
+      blocks.filter_map do |raw|
+        reasoning = raw[/REASONING:\s*(.*?)(?=INSTRUCTION:|\z)/m, 1].to_s.strip
+        instruction = raw[/INSTRUCTION:\s*(.*)/m, 1].to_s.strip
+        next if instruction.empty?
+        Variant.new(reasoning: reasoning, instruction: instruction)
+      end
+    end
+  end
+  module JudgeCalibrationExamples
+    module_function
+    def for(metric, limit: 8)
+      disagreements = Calibration.where(metric_id: metric.id, verdict: "disagree")
+                                 .includes(response: :reviews)
+                                 .order(created_at: :desc)
+                                 .limit(limit)
+      disagreements.map do |cal|
+        review = cal.response.reviews.find { |r| r.metric_id == metric.id }
+        {
+          input: cal.response.input_data,
+          output: cal.response.response_text,
+          judge_score: review&.ai_score,
+          judge_feedback: review&.ai_feedback,
+          human_score: cal.corrected_score,
+          human_note: cal.note
+        }
+      end
+    end
+  end
+end

data/app/services/completion_kit/mcp_dispatcher.rb CHANGED Viewed

@@ -34,7 +34,8 @@ module CompletionKit
         McpTools::MetricGroups.definitions +
         McpTools::ProviderCredentials.definitions +
         McpTools::Tags.definitions +
-        McpTools::Calibrations.definitions
+        McpTools::Calibrations.definitions +
+        McpTools::Judges.definitions
     end
     def self.call_tool(name, arguments)
@@ -48,6 +49,7 @@ module CompletionKit
       when /\Aprovider_credentials_/ then McpTools::ProviderCredentials.call(name, arguments)
       when /\Atags_/                 then McpTools::Tags.call(name, arguments)
       when /\Acalibrations_/         then McpTools::Calibrations.call(name, arguments)
+      when /\Ajudges_/               then McpTools::Judges.call(name, arguments)
       else raise MethodNotFound, "Unknown tool: #{name}"
       end
     end

data/app/services/completion_kit/mcp_tools/judges.rb ADDED Viewed

@@ -0,0 +1,138 @@
+module CompletionKit
+  module McpTools
+    module Judges
+      extend Base
+      TOOLS = {
+        "judges_suggest" => {
+          description: "Ask the model to rewrite the metric's judge instruction in N variants targeted at the recent disagreements. Each variant is saved as a draft JudgeVersion with source=\"suggestion\". Returns the persisted drafts. Stripe-metering hooks fire via ActiveSupport::Notifications under completion_kit.judge_suggestion.generated.",
+          inputSchema: {
+            type: "object",
+            properties: {
+              metric_id: { type: "integer" },
+              count: { type: "integer", description: "How many variants to request (default 3, max 5)." },
+              model: { type: "string", description: "Override the model used to generate variants. Defaults to CompletionKit.config.judge_model." }
+            },
+            required: ["metric_id"]
+          },
+          handler: :suggest
+        },
+        "judges_replay" => {
+          description: "Run the current judge against a dataset (judge-only run). Wraps runs_create with prompt_id omitted and output_column supplied. Re-judges existing dataset outputs so you can compare against human verdicts.",
+          inputSchema: {
+            type: "object",
+            properties: {
+              name: { type: "string" },
+              metric_id: { type: "integer" },
+              dataset_id: { type: "integer" },
+              judge_model: { type: "string" },
+              output_column: { type: "string", description: "Dataset column with the existing outputs to grade. Defaults to actual_output." }
+            },
+            required: ["name", "metric_id", "dataset_id", "judge_model"]
+          },
+          handler: :replay
+        },
+        "judges_compare" => {
+          description: "Compare two judge versions' calibration stats side by side. Pass either two judge_version_ids or one metric_id with judge_version_a_id / judge_version_b_id.",
+          inputSchema: {
+            type: "object",
+            properties: {
+              metric_id: { type: "integer" },
+              judge_version_a_id: { type: "integer" },
+              judge_version_b_id: { type: "integer" }
+            },
+            required: ["metric_id", "judge_version_a_id", "judge_version_b_id"]
+          },
+          handler: :compare
+        }
+      }.freeze
+      def self.suggest(args)
+        metric = CompletionKit::Metric.find(args["metric_id"])
+        count = [args["count"].to_i, 5].min
+        count = CompletionKit::JudgeVariantGenerator::DEFAULT_VARIANT_COUNT if count <= 0
+        generator = CompletionKit::JudgeVariantGenerator.new(metric, count: count, model: args["model"])
+        variants = generator.call
+        return error_result("Variant generator returned no parseable variants. Try again or change the model.") if variants.empty?
+        versions = generator.persist!(variants)
+        text_result(versions.map(&:as_json))
+      end
+      def self.replay(args)
+        metric = CompletionKit::Metric.find(args["metric_id"])
+        dataset = CompletionKit::Dataset.find(args["dataset_id"])
+        run = CompletionKit::Run.new(
+          name: args["name"],
+          dataset: dataset,
+          judge_model: args["judge_model"],
+          output_column: args["output_column"].presence || "actual_output"
+        )
+        if run.save
+          run.replace_metrics!([metric.id])
+          text_result(run.reload.as_json)
+        else
+          error_result(run.errors.full_messages.join(", "))
+        end
+      end
+      def self.compare(args)
+        metric = CompletionKit::Metric.find(args["metric_id"])
+        a = CompletionKit::JudgeVersion.where(metric_id: metric.id).find(args["judge_version_a_id"])
+        b = CompletionKit::JudgeVersion.where(metric_id: metric.id).find(args["judge_version_b_id"])
+        stats_a = CompletionKit::MetricCalibrationStats.for(metric, judge_version: a)
+        stats_b = CompletionKit::MetricCalibrationStats.for(metric, judge_version: b)
+        text_result({
+          metric_id: metric.id,
+          a: judge_version_payload(a, stats_a),
+          b: judge_version_payload(b, stats_b),
+          delta: delta_payload(stats_a, stats_b),
+          recommendation: recommendation_for(stats_a, stats_b)
+        })
+      end
+      def self.judge_version_payload(version, stats)
+        {
+          id: version.id, state: version.state, current: version.current,
+          source: version.source, created_at: version.created_at,
+          sample_size: stats.sample_size,
+          agreement_point: stats.agreement_point,
+          agreement_low: stats.agreement_low,
+          agreement_high: stats.agreement_high,
+          borderline_rate: stats.borderline_rate,
+          mae: stats.mae, kappa: stats.kappa
+        }
+      end
+      def self.delta_payload(a, b)
+        {
+          agreement: pair_delta(a.agreement_point, b.agreement_point),
+          mae: pair_delta(a.mae, b.mae),
+          kappa: pair_delta(a.kappa, b.kappa),
+          sample_size: { a: a.sample_size, b: b.sample_size }
+        }
+      end
+      def self.pair_delta(a, b)
+        { a: a, b: b, delta: (a.nil? || b.nil?) ? nil : (b - a) }
+      end
+      def self.recommendation_for(a, b)
+        total = a.sample_size + b.sample_size
+        if total < 30
+          { state: "need_more_data", reason: "Combined n=#{total}; need 30+ to make a call." }
+        elsif a.agreement_point.nil? || b.agreement_point.nil?
+          { state: "no_change", reason: "Not enough verdicts on one of the versions to compare." }
+        else
+          lift = b.agreement_point - a.agreement_point
+          if lift > 0.03
+            { state: "recommend", reason: "B agreement +#{(lift * 100).round}pt over A." }
+          elsif lift < -0.03
+            { state: "hold", reason: "B agreement #{(lift * 100).round}pt vs A." }
+          else
+            { state: "no_change", reason: "Agreement within noise (#{(lift * 100).round}pt)." }
+          end
+        end
+      end
+    end
+  end
+end

data/app/services/completion_kit/metric_calibration_stats.rb ADDED Viewed

@@ -0,0 +1,99 @@
+module CompletionKit
+  class MetricCalibrationStats
+    PROVISIONAL_MIN = 10
+    FIRM_MIN = 30
+    Result = Struct.new(
+      :sample_size, :agree_count, :disagree_count, :borderline_count,
+      :agreement_point, :agreement_low, :agreement_high,
+      :borderline_rate, :mae, :pearson, :kappa, :gate,
+      keyword_init: true
+    ) do
+      def counter_only?
+        gate == :counter
+      end
+      def provisional?
+        gate == :provisional
+      end
+      def firm?
+        gate == :firm
+      end
+      def short_to_target
+        [PROVISIONAL_MIN - sample_size, 0].max
+      end
+      def margin
+        return nil if agreement_low.nil? || agreement_high.nil?
+        (agreement_high - agreement_low) / 2.0
+      end
+    end
+    def self.for(metric, judge_version: nil)
+      new(metric: metric, judge_version: judge_version).call
+    end
+    def initialize(metric:, judge_version: nil)
+      @metric = metric
+      @judge_version = judge_version
+    end
+    def call
+      scope = Calibration.where(metric_id: @metric.id)
+      scope = scope.where(judge_version_id: @judge_version.id) if @judge_version
+      verdicts = scope.pluck(:verdict, :corrected_score, :response_id)
+      n = verdicts.length
+      agrees = verdicts.count { |v, _, _| v == "agree" }
+      disagrees = verdicts.count { |v, _, _| v == "disagree" }
+      borderlines = verdicts.count { |v, _, _| v == "borderline" }
+      ci = CalibrationMath.wilson_interval(successes: agrees, n: n)
+      pairs = score_pairs(verdicts)
+      mae_value = CalibrationMath.mae(pairs)
+      pearson_value = CalibrationMath.pearson(pairs)
+      kappa_value = CalibrationMath.quadratic_weighted_kappa(pairs, categories: 1..5)
+      Result.new(
+        sample_size: n,
+        agree_count: agrees,
+        disagree_count: disagrees,
+        borderline_count: borderlines,
+        agreement_point: ci[:point],
+        agreement_low: ci[:low],
+        agreement_high: ci[:high],
+        borderline_rate: n.zero? ? nil : borderlines.to_f / n,
+        mae: mae_value,
+        pearson: pearson_value,
+        kappa: kappa_value,
+        gate: gate_for(n)
+      )
+    end
+    private
+    def score_pairs(verdicts)
+      response_ids = verdicts.map { |_, _, rid| rid }.uniq
+      ai_scores = Review.where(response_id: response_ids, metric_id: @metric.id)
+                       .pluck(:response_id, :ai_score).to_h
+      verdicts.filter_map do |verdict, corrected, response_id|
+        next if verdict == "borderline"
+        ai = ai_scores[response_id]
+        next if ai.nil?
+        human = verdict == "agree" ? ai : corrected
+        next if human.nil?
+        [ai.to_f, human.to_f]
+      end
+    end
+    def gate_for(n)
+      return :counter if n < PROVISIONAL_MIN
+      return :firm if n >= FIRM_MIN
+      :provisional
+    end
+  end
+end

data/app/views/completion_kit/calibrations/_buttons.html.erb CHANGED Viewed

@@ -2,20 +2,29 @@
   <% current_verdict = calibration&.verdict %>
   <% verdict_count = CompletionKit::Calibration.where(response_id: response_row.id, metric_id: metric.id).count %>
   <p class="ck-calibration__prompt">
-    How does this score feel?
+    Your verdict
     <% if verdict_count > 0 %>
-      <span class="ck-calibration__count"><%= pluralize(verdict_count, "verdict") %> collected</span>
+      <span class="ck-calibration__count"><%= pluralize(verdict_count, "verdict") %> on this score · <%= link_to "judge trust →", metric_path(metric), class: "ck-link" %></span>
+    <% else %>
+      <span class="ck-calibration__count">Tell us what you think — was the score right? Verdicts roll up into <%= link_to "judge trust", metric_path(metric), class: "ck-link" %>.</span>
     <% end %>
   </p>
   <div class="ck-calibration__buttons">
+    <% verdict_icons = { "agree" => "hand-thumb-up", "disagree" => "hand-thumb-down", "borderline" => "scale" } %>
+    <% verdict_hints = {
+         "agree" => "The score looks right.",
+         "disagree" => "The score is wrong — you'll pick the right one.",
+         "borderline" => "The rubric is unclear here; either score could be defensible."
+       } %>
     <% CompletionKit::Calibration::VERDICTS.each do |verdict| %>
       <%= button_to run_response_calibrations_path(run, response_row, metric_id: metric.id, verdict: verdict),
               method: :post,
               form: { data: { turbo: "true" } },
               class: "ck-calibration__pill ck-calibration__pill--#{verdict}#{' is-active' if verdict == current_verdict}",
-              "aria-pressed": (verdict == current_verdict).to_s do %>
-        <% case verdict
-           when "agree" %>👍 Agree<% when "disagree" %>👎 Disagree<% else %>🤔 Borderline<% end %>
+              "aria-pressed": (verdict == current_verdict).to_s,
+              title: verdict_hints[verdict] do %>
+        <%= heroicon_tag verdict_icons[verdict], variant: :outline, size: 14, "aria-hidden": "true" %>
+        <span><%= verdict %></span>
       <% end %>
     <% end %>
   </div>
@@ -27,7 +36,7 @@
       <%= hidden_field_tag :metric_id, metric.id %>
       <%= hidden_field_tag :verdict, "disagree" %>
       <label class="ck-label">
-        Your score
+        What should the score have been?
         <span class="ck-calibration__value" data-calibration-value><%= calibration.corrected_score || review&.ai_score || 3 %></span>
       </label>
       <input type="range" name="corrected_score" min="1" max="5" step="0.5"

data/app/views/completion_kit/calibrations/_trust_panel.html.erb ADDED Viewed

@@ -0,0 +1,31 @@
+<%# Judge-trust panel. Expects a `stats` local responding to gate, counter_only?, sample_size, short_to_target, agreement_point, margin, firm? and borderline_rate. %>
+<%# Renders a verdict counter while stats.counter_only? is true; otherwise the agreement percentage, its margin, a settled/early-read label and, when non-zero, the borderline ("unclear") rate. %>
+<% stats = local_assigns[:stats] %>
+<div class="ck-trust-panel ck-trust-panel--<%= stats.gate %>">
+  <p class="ck-trust-panel__label" title="How often the judge agrees with the humans who reviewed its scores.">Judge trust</p>
+  <% if stats.counter_only? %>
+    <div class="ck-trust-panel__body">
+      <span class="ck-trust-panel__counter"><%= stats.sample_size %><span class="ck-trust-panel__counter-of">/ <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></span></span>
+      <span class="ck-trust-panel__hint">verdicts so far<% if stats.short_to_target > 0 %> · <%= stats.short_to_target %> more before we can score the judge<% end %></span>
+    </div>
+  <% else %>
+    <div class="ck-trust-panel__body">
+      <span class="ck-trust-panel__score"
+            title="Roughly how often the judge and the humans agreed.">~<%= (stats.agreement_point * 100).round %><span class="ck-trust-panel__score-pct">%</span></span>
+      <span class="ck-trust-panel__margin"
+            title="The range we're confident the true rate sits in, given how few verdicts we have so far.">±<%= (stats.margin * 100).round %> pt</span>
+      <span class="ck-trust-panel__gate"
+            title="<%= stats.firm? ? 'Enough verdicts for a reliable read.' : 'Early read. Keep giving verdicts to tighten the margin.' %>"><%= stats.firm? ? "settled" : "early read" %></span>
+    </div>
+    <div class="ck-trust-panel__details">
+      <span><%= pluralize(stats.sample_size, "verdict") %></span>
+      <% if stats.borderline_rate && stats.borderline_rate > 0 %>
+        <% level = if stats.borderline_rate > 0.30 then "danger"
+                   elsif stats.borderline_rate > 0.15 then "warning"
+                   else "ok" end %>
+        <span class="ck-trust-panel__borderline ck-trust-panel__borderline--<%= level %>"
+              title="<%= level == 'ok' ? 'Some reviewers said the rubric was unclear here.' : 'A lot of reviewers say the rubric is unclear here. Consider splitting the metric or rewriting the rubric.' %>">
+          <%= (stats.borderline_rate * 100).round %>% said "unclear"
+        </span>
+      <% end %>
+    </div>
+  <% end %>
+</div>

data/app/views/completion_kit/metrics/index.html.erb CHANGED Viewed

@@ -19,6 +19,7 @@
       <tr>
         <th scope="col">Name</th>
         <th scope="col">Instruction</th>
+        <th scope="col" title="How often the judge agrees with the humans who reviewed its scores.">Judge trust</th>
         <th scope="col">In groups</th>
         <th scope="col"></th>
       </tr>
@@ -35,6 +36,23 @@
             <% end %>
           </td>
           <td data-label="Instruction" class="ck-meta-copy"><div class="ck-clamp-2"><%= metric.instruction.presence || "—" %></div></td>
+          <td data-label="Judge trust" class="ck-metrics-table__trust">
+            <% if CompletionKit.config.judge_calibration_enabled %>
+              <% s = CompletionKit::MetricCalibrationStats.for(metric) %>
+              <% if s.counter_only? %>
+                <% if s.sample_size.zero? %>
+                  <span class="ck-meta-copy">No verdicts yet</span>
+                <% else %>
+                  <span class="ck-meta-copy"><%= s.sample_size %> / <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts</span>
+                <% end %>
+              <% else %>
+                <span class="ck-metrics-table__trust-rate" title="<%= s.firm? ? 'Settled read.' : 'Early read — keep giving verdicts.' %>">~<%= (s.agreement_point * 100).round %>%</span>
+                <span class="ck-meta-copy">±<%= (s.margin * 100).round %> pt · <%= s.firm? ? "settled" : "early" %></span>
+              <% end %>
+            <% else %>
+              <span class="ck-meta-copy">—</span>
+            <% end %>
+          </td>
           <td data-label="In groups">
             <% groups = metric.metric_groups %>
             <% if groups.any? %>

data/app/views/completion_kit/metrics/show.html.erb CHANGED Viewed

@@ -6,8 +6,28 @@
 <section class="ck-page-header">
   <div>
     <h1 class="ck-title"><%= @metric.name %></h1>
+    <% if CompletionKit.config.judge_calibration_enabled %>
+      <%= render "completion_kit/calibrations/trust_panel",
+            stats: CompletionKit::MetricCalibrationStats.for(@metric) %>
+      <% if @latest_draft %>
+        <div class="ck-draft-banner">
+          <span class="ck-chip ck-chip--soft">Draft pending</span>
+          <span class="ck-meta-copy">A draft version of this judge is saved. Publishing it replaces the live instruction and rubric.</span>
+          <%= button_to "Publish this version", publish_draft_metric_path(@metric, draft_id: @latest_draft.id),
+                method: :post, form_class: "inline-block",
+                class: ck_button_classes(:dark) %>
+        </div>
+      <% end %>
+    <% end %>
   </div>
   <div class="ck-actions">
+    <% if CompletionKit.config.judge_calibration_enabled %>
+      <%= button_to "Suggest rewrites", suggest_variants_metric_path(@metric),
+            method: :post, form_class: "inline-block",
+            class: ck_button_classes(:light, variant: :outline),
+            title: "Ask the model to rewrite this judge instruction based on the disagreements collected so far.",
+            data: { turbo_confirm: "Ask the model to rewrite this judge instruction based on the disagreements collected so far?" } %>
+    <% end %>
     <%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
   </div>
 </section>
@@ -42,3 +62,127 @@
     <% end %>
   </div>
 </section>
+<% if CompletionKit.config.judge_calibration_enabled %>
+  <section class="ck-card ck-card--spaced">
+    <div class="ck-prompt-preview__header">
+      <p class="ck-kicker">Where the judge got it wrong</p>
+      <% if @disagreements.any? %>
+        <span class="ck-chip"><%= pluralize(@disagreements.size, "row") %></span>
+      <% end %>
+    </div>
+    <% if @disagreements.empty? %>
+      <p class="ck-meta-copy">Nothing here yet. As people give a "disagree" verdict on response rows, those rows show up below so you can review the judge's misses and turn them into teaching examples.</p>
+    <% else %>
+      <p class="ck-meta-copy">Rows where a reviewer said the judge got it wrong. Save the best ones as teaching examples — the judge will see them next time it grades.</p>
+      <table class="ck-results-table ck-disagreements-table">
+        <thead>
+          <tr>
+            <th scope="col">Run · row</th>
+            <th scope="col">Judge</th>
+            <th scope="col">Human</th>
+            <th scope="col">Note</th>
+            <th scope="col"></th>
+          </tr>
+        </thead>
+        <tbody>
+          <% existing_ids = Array(@metric.few_shot_examples).map { |fs| fs["calibration_id"] } %>
+          <% @disagreements.each do |cal| %>
+            <% review = cal.response.reviews.find { |r| r.metric_id == @metric.id } %>
+            <% already = existing_ids.include?(cal.id) %>
+            <tr>
+              <td>
+                <%= link_to ck_run_path(cal.response.run), class: "ck-record-name" do %>
+                  <strong><%= cal.response.run.name.to_s.truncate(40) %></strong>
+                <% end %>
+                <span class="ck-meta-copy">· <%= link_to "row ##{cal.response.id}", run_response_path(cal.response.run, cal.response), class: "ck-link" %></span>
+              </td>
+              <td>
+                <% if review&.ai_score %>
+                  <span class="<%= ck_badge_classes(ck_score_kind(review.ai_score.to_f)) %>"><%= review.ai_score %></span>
+                <% else %>
+                  <span class="ck-meta-copy">—</span>
+                <% end %>
+              </td>
+              <td>
+                <% if cal.corrected_score %>
+                  <span class="<%= ck_badge_classes(ck_score_kind(cal.corrected_score.to_f)) %>"><%= cal.corrected_score %></span>
+                <% else %>
+                  <span class="ck-meta-copy">—</span>
+                <% end %>
+              </td>
+              <td class="ck-meta-copy"><%= cal.note.to_s.truncate(120) %></td>
+              <td>
+                <% if already %>
+                  <span class="ck-chip ck-chip--done">Saved as example</span>
+                <% else %>
+                  <%= button_to "Teach the judge",
+                        add_few_shot_metric_path(@metric, calibration_id: cal.id),
+                        method: :post,
+                        form_class: "inline-block",
+                        class: ck_button_classes(:light, variant: :outline),
+                        title: "Save this row as a teaching example. The judge will see it next time it grades." %>
+                <% end %>
+              </td>
+            </tr>
+          <% end %>
+        </tbody>
+      </table>
+    <% end %>
+  </section>
+  <% if @suggestion_drafts.any? %>
+    <section class="ck-card ck-card--spaced">
+      <div class="ck-prompt-preview__header">
+        <p class="ck-kicker">Suggested rewrites</p>
+        <span class="ck-chip"><%= @suggestion_drafts.size %> option<%= @suggestion_drafts.size == 1 ? "" : "s" %></span>
+      </div>
+      <p class="ck-meta-copy">The model wrote these alternate instructions based on the disagreements above. Pick one to make it the live judge — the previous version is kept in history.</p>
+      <div class="ck-suggestion-list">
+        <% @suggestion_drafts.each do |draft| %>
+          <article class="ck-suggestion-card">
+            <header class="ck-suggestion-card__header">
+              <span class="ck-chip ck-chip--soft">Option #<%= draft.id %></span>
+              <time class="ck-meta-copy" data-relative-time datetime="<%= draft.created_at.utc.iso8601 %>"><%= time_ago_in_words(draft.created_at) %> ago</time>
+            </header>
+            <pre class="ck-code ck-suggestion-card__instruction"><%= draft.instruction %></pre>
+            <div class="ck-actions">
+              <%= button_to "Use this version", publish_draft_metric_path(@metric, draft_id: draft.id),
+                    method: :post, form_class: "inline-block",
+                    class: ck_button_classes(:dark) %>
+            </div>
+          </article>
+        <% end %>
+      </div>
+    </section>
+  <% end %>
+  <% if Array(@metric.few_shot_examples).any? %>
+    <section class="ck-card ck-card--spaced">
+      <div class="ck-prompt-preview__header">
+        <p class="ck-kicker">Teaching examples</p>
+        <span class="ck-chip"><%= pluralize(Array(@metric.few_shot_examples).size, "example") %></span>
+      </div>
+      <p class="ck-meta-copy">The judge sees these worked examples whenever it grades for this metric. Each shows what the judge gave and what a human said it should have been.</p>
+      <ol class="ck-few-shot-list">
+        <% Array(@metric.few_shot_examples).each do |fs| %>
+          <li class="ck-few-shot-item">
+            <div class="ck-few-shot-item__scores">
+              <span class="ck-meta-copy">judge said</span>
+              <% if fs["judge_score"] %>
+                <span class="<%= ck_badge_classes(ck_score_kind(fs["judge_score"].to_f)) %>"><%= fs["judge_score"] %></span>
+              <% end %>
+              <span class="ck-meta-copy">human said</span>
+              <% if fs["human_score"] %>
+                <span class="<%= ck_badge_classes(ck_score_kind(fs["human_score"].to_f)) %>"><%= fs["human_score"] %></span>
+              <% end %>
+            </div>
+            <% if fs["human_note"].to_s.present? %>
+              <p class="ck-copy"><%= fs["human_note"] %></p>
+            <% end %>
+          </li>
+        <% end %>
+      </ol>
+    </section>
+  <% end %>
+<% end %>

data/config/routes.rb CHANGED Viewed

@@ -12,7 +12,13 @@ CompletionKit::Engine.routes.draw do
   end
   resources :datasets
-  resources :metrics
+  resources :metrics do
+    member do
+      post :add_few_shot
+      post :publish_draft
+      post :suggest_variants
+    end
+  end
   resources :metric_groups
   resources :tags
   resources :dashboard_dismissals, only: [:create, :destroy]

data/db/migrate/20260523000001_add_few_shot_examples_to_completion_kit_metrics.rb ADDED Viewed

@@ -0,0 +1,5 @@
+class AddFewShotExamplesToCompletionKitMetrics < ActiveRecord::Migration[8.1]
+  def change
+    add_column :completion_kit_metrics, :few_shot_examples, :text
+  end
+end

data/db/migrate/20260523000002_add_state_to_completion_kit_judge_versions.rb ADDED Viewed

@@ -0,0 +1,15 @@
+class AddStateToCompletionKitJudgeVersions < ActiveRecord::Migration[8.1]
+  def change
+    add_column :completion_kit_judge_versions, :state, :string, null: false, default: "published"
+    add_column :completion_kit_judge_versions, :source, :string
+    reversible do |dir|
+      dir.up do
+        execute "UPDATE completion_kit_judge_versions SET state = 'published'"
+      end
+    end
+    add_index :completion_kit_judge_versions, [:metric_id, :state],
+              name: "index_ck_judge_versions_on_metric_state"
+  end
+end

data/lib/completion_kit/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module CompletionKit
-  VERSION = "0.5.36"
+  VERSION = "0.5.38"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: completion-kit
 version: !ruby/object:Gem::Version
-  version: 0.5.36
+  version: 0.5.38
 platform: ruby
 authors:
 - Damien Bastin
@@ -290,14 +290,17 @@ files:
 - app/models/concerns/completion_kit/taggable.rb
 - app/services/completion_kit/anthropic_client.rb
 - app/services/completion_kit/api_config.rb
+- app/services/completion_kit/calibration_math.rb
 - app/services/completion_kit/csv_processor.rb
 - app/services/completion_kit/dashboard_stats.rb
 - app/services/completion_kit/judge_service.rb
+- app/services/completion_kit/judge_variant_generator.rb
 - app/services/completion_kit/llm_client.rb
 - app/services/completion_kit/mcp_dispatcher.rb
 - app/services/completion_kit/mcp_tools/base.rb
 - app/services/completion_kit/mcp_tools/calibrations.rb
 - app/services/completion_kit/mcp_tools/datasets.rb
+- app/services/completion_kit/mcp_tools/judges.rb
 - app/services/completion_kit/mcp_tools/metric_groups.rb
 - app/services/completion_kit/mcp_tools/metrics.rb
 - app/services/completion_kit/mcp_tools/prompts.rb
@@ -305,6 +308,7 @@ files:
 - app/services/completion_kit/mcp_tools/responses.rb
 - app/services/completion_kit/mcp_tools/runs.rb
 - app/services/completion_kit/mcp_tools/tags.rb
+- app/services/completion_kit/metric_calibration_stats.rb
 - app/services/completion_kit/model_discovery_service.rb
 - app/services/completion_kit/ollama_client.rb
 - app/services/completion_kit/onboarding/checklist.rb
@@ -323,6 +327,7 @@ files:
 - app/views/completion_kit/api_reference/_resource_list.html.erb
 - app/views/completion_kit/api_reference/index.html.erb
 - app/views/completion_kit/calibrations/_buttons.html.erb
+- app/views/completion_kit/calibrations/_trust_panel.html.erb
 - app/views/completion_kit/dashboard/_eye_icon.html.erb
 - app/views/completion_kit/dashboard/_eye_off_icon.html.erb
 - app/views/completion_kit/dashboard/_failures_card.html.erb
@@ -407,6 +412,8 @@ files:
 - db/migrate/20260516000001_create_completion_kit_dashboard_dismissals.rb
 - db/migrate/20260522000001_create_completion_kit_judge_versions.rb
 - db/migrate/20260522000002_create_completion_kit_calibrations.rb
+- db/migrate/20260523000001_add_few_shot_examples_to_completion_kit_metrics.rb
+- db/migrate/20260523000002_add_state_to_completion_kit_judge_versions.rb
 - lib/completion-kit.rb
 - lib/completion_kit.rb
 - lib/completion_kit/concurrency_check.rb