RubyGems - completion-kit - Versions diffs - 0.5.43 → 0.6.0 - Mend

completion-kit 0.5.43 → 0.6.0

This diff shows the differences in content between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 7a9284c6a53b1b609de8ca2c081111687990d32f40f1fa4d2422670daeae9f2f
-  data.tar.gz: edb62bc8b34b3ecce534a1e4f0730066d6b56f591d05046b1904eb33f9f7cbc6
+  metadata.gz: d8454bbb11d5064ca0c6d4355c780425a28198280dffe7dd424d266fbeef6a09
+  data.tar.gz: 24c1da76e1e9118d5e2a732e8e45b684f588f553ad4d8bac89e239bc22c953c3
 SHA512:
-  metadata.gz: 0aaf95d75bdfee01b387d3ebe97434168815d58627f8d855ad3dd15534e33c2a69eca7ee8a25a964f6669f891026d350abb5c23e23006ada5a1c56df9ad616ea
-  data.tar.gz: 800fec24cee472a245fcfffbb025eabb2a3bc62cbfc513d1ec0a2c7aa8d1e304f59cc28aa9074712060d0f49ac6bbfba4597cc17e1d3b8db71c5e3b9c557dcab
+  metadata.gz: 6fbc5b8047a20240897e19c389bb3f6104d3e2a219794d190183b5433e14d524bb692eb0a27b36ab6471e596c2b9b8af2d70a4f56ae81aa327726fe92f092eb9
+  data.tar.gz: a3399003a48836fd457a8c8b488305fad6d006596c6f940a82b232e2a731dfbc3df5ded4ba8bc16b94690a88d56baaaff6edc6801f47f2bbf422ca8fb74270df

data/app/assets/stylesheets/completion_kit/application.css CHANGED Viewed

@@ -2816,6 +2816,44 @@ select.ck-input {
   line-height: 1.55;
 }
+.ck-review-card--stale {
+  border-left: 2px solid rgba(224, 164, 88, 0.45);
+}
+.ck-stale-versions-banner {
+  margin: 0 0 1rem;
+  padding: 0.9rem 1rem;
+  border: 1px solid rgba(224, 164, 88, 0.4);
+  background: rgba(224, 164, 88, 0.06);
+  border-radius: var(--ck-radius);
+  display: flex;
+  align-items: center;
+  justify-content: space-between;
+  gap: 1rem;
+  flex-wrap: wrap;
+}
+.ck-stale-versions-banner__body { min-width: 0; flex: 1 1 320px; }
+.ck-stale-versions-banner .ck-kicker { color: var(--ck-warning); }
+.ck-delta {
+  font-family: var(--ck-mono);
+  font-size: 0.78rem;
+  letter-spacing: 0.04em;
+  padding: 2px 6px;
+  border-radius: 4px;
+}
+.ck-delta--positive { color: var(--ck-success); background: var(--ck-success-soft); }
+.ck-delta--negative { color: var(--ck-danger); background: var(--ck-danger-soft); }
+.ck-delta--zero { color: var(--ck-dim); }
+.ck-run-compare-table td { vertical-align: middle; }
+.ck-review-card__stale-note {
+  margin: 0.4rem 0 0;
+  font-family: var(--ck-mono);
+  font-size: 0.78rem;
+  color: var(--ck-warning);
+}
 @media (max-width: 900px) {
   .ck-grid--sidebar,
   .ck-grid--cards,

data/app/controllers/completion_kit/api/v1/calibrations_controller.rb CHANGED Viewed

@@ -15,7 +15,7 @@ module CompletionKit
             run: @run,
             response: @response,
             metric: @metric,
-            judge_version: JudgeVersion.ensure_current_for(@metric),
+            metric_version: MetricVersion.ensure_current_for(@metric),
             **calibration_params
           )

data/app/controllers/completion_kit/api/v1/runs_controller.rb CHANGED Viewed

@@ -45,6 +45,10 @@ module CompletionKit
         end
         def retry_failures
+          if @run.stale_review_summary.any?
+            return render(json: { error: "Judge has changed since this run executed. Retry would mix versions in the same run; use POST /api/v1/runs/:id/rerun instead." }, status: :conflict)
+          end
           scope = @run.responses.where(status: "failed")
           scope = scope.where(id: params[:only]) if params[:only].present?

data/app/controllers/completion_kit/calibrations_controller.rb CHANGED Viewed

@@ -18,7 +18,7 @@ module CompletionKit
         run: @run, response: @response, metric: @metric, created_by: created_by
       )
       calibration.assign_attributes(
-        judge_version: JudgeVersion.ensure_current_for(@metric),
+        metric_version: MetricVersion.ensure_current_for(@metric),
         verdict: params[:verdict],
         corrected_score: params[:corrected_score].presence,
         note: params[:note].presence

data/app/controllers/completion_kit/metrics_controller.rb CHANGED Viewed

@@ -35,16 +35,15 @@ module CompletionKit
     end
     def show
-      @published_judge_version = JudgeVersion.ensure_current_for(@metric)
+      @published_metric_version = MetricVersion.ensure_current_for(@metric)
       @disagreements = Calibration.where(metric_id: @metric.id, verdict: "disagree")
-                                  .includes(:judge_version, response: [:reviews, :run])
+                                  .includes(:metric_version, response: [:reviews, :run])
                                   .order(created_at: :desc)
                                   .limit(50)
-      @edit_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
-      @suggestion_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
-      @improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree",
-                                                      judge_version_id: @published_judge_version.id).count
-      @versions = JudgeVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
+      @edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
+      @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
+      @improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
+      @versions = MetricVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
     end
     def new
@@ -52,9 +51,14 @@ module CompletionKit
     end
     def edit
-      @suggestion_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
-      @edit_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
-      @published_judge_version = JudgeVersion.published.where(metric_id: @metric.id, current: true).first
+      @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
+      @edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
+      @published_metric_version = MetricVersion.published.where(metric_id: @metric.id, current: true).first
+      if @edit_draft
+        @metric.instruction = @edit_draft.instruction
+        @metric.rubric_bands = @edit_draft.rubric_bands
+      end
     end
     def create
@@ -68,10 +72,42 @@ module CompletionKit
     end
     def update
-      if @metric.update(metric_params)
-        redirect_to metric_path(@metric), notice: "Metric was successfully updated."
+      judge_keys = %i[instruction rubric_bands]
+      meta_attrs = metric_params.except(*judge_keys)
+      proposed_instruction = metric_params[:instruction]
+      proposed_rubric = metric_params[:rubric_bands]
+      unless @metric.update(meta_attrs)
+        return render(:edit, status: :unprocessable_entity)
+      end
+      current_instruction = @metric.instruction.to_s
+      current_rubric = @metric.rubric_bands || []
+      normalized_proposed_rubric = normalize_rubric_bands_for_update(proposed_rubric)
+      instruction_changed = !proposed_instruction.nil? && proposed_instruction.to_s != current_instruction
+      rubric_changed = !normalized_proposed_rubric.nil? && normalized_proposed_rubric != current_rubric
+      unless instruction_changed || rubric_changed
+        return redirect_to(metric_path(@metric), notice: "Metric was successfully updated.")
+      end
+      new_instruction = instruction_changed ? proposed_instruction.to_s : current_instruction
+      new_rubric = rubric_changed ? normalized_proposed_rubric : current_rubric
+      if @metric.reviews.exists?
+        MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").destroy_all
+        draft = MetricVersion.create!(
+          metric: @metric, instruction: new_instruction, rubric_bands: new_rubric,
+          state: "draft", source: "edit", current: false
+        )
+        redirect_to edit_metric_path(@metric),
+                    notice: "Saved as draft #{draft.version_label}. Publish to push these changes to the live judge."
       else
-        render :edit, status: :unprocessable_entity
+        @metric.update!(instruction: new_instruction, rubric_bands: new_rubric)
+        current_pub = MetricVersion.published.where(metric_id: @metric.id, current: true).first
+        current_pub&.update!(instruction: @metric.instruction, rubric_bands: @metric.rubric_bands)
+        redirect_to metric_path(@metric), notice: "Metric was successfully updated."
       end
     end
@@ -88,9 +124,9 @@ module CompletionKit
         return
       end
-      JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
+      MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
-      generator = JudgeVariantGenerator.new(@metric, count: 1)
+      generator = MetricVariantGenerator.new(@metric, count: 1)
       variants = generator.call
       if variants.empty?
         redirect_to target, alert: "The model returned no usable variants. Try again with a different model."
@@ -101,18 +137,18 @@ module CompletionKit
     end
     def dismiss_suggestion
-      draft = JudgeVersion.drafts.where(metric_id: @metric.id).find_by(id: params[:draft_id])
+      draft = MetricVersion.drafts.where(metric_id: @metric.id).find_by(id: params[:draft_id])
       draft&.destroy
       target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
       redirect_to target, notice: "Dismissed."
     end
     def publish_draft
-      scope = JudgeVersion.where(metric_id: @metric.id)
+      scope = MetricVersion.where(metric_id: @metric.id)
       version = if params[:draft_id].present?
                   scope.find_by(id: params[:draft_id])
                 else
-                  JudgeVersion.drafts.where(metric_id: @metric.id).order(created_at: :desc).first
+                  MetricVersion.drafts.where(metric_id: @metric.id).order(created_at: :desc).first
                 end
       if version.nil?
@@ -120,9 +156,20 @@ module CompletionKit
         return
       end
+      was_published_already = version.published?
+      reverting = was_published_already && !version.current?
+      previously_current = MetricVersion.current.find_by(metric_id: @metric.id)
       version.publish!
-      redirect_to metric_path(@metric),
-                  notice: "#{@metric.name} #{version.version_label} is now the published version."
+      if reverting
+        prior_label = previously_current.version_label
+        redirect_to metric_path(@metric),
+                    notice: "Reverted to #{@metric.name} #{version.version_label}. Pinned cases still flow to the judge, and calibration verdicts collected against #{prior_label} stay tied to it."
+      else
+        redirect_to metric_path(@metric),
+                    notice: "#{@metric.name} #{version.version_label} is now the published version."
+      end
     end
     def add_few_shot
@@ -160,5 +207,14 @@ module CompletionKit
       params.require(:metric).permit(:name, :instruction,
         rubric_bands: [:stars, :description], tag_names: [])
     end
+    def normalize_rubric_bands_for_update(bands)
+      return nil if bands.nil?
+      array = bands.is_a?(ActionController::Parameters) ? bands.to_unsafe_h.values : bands
+      Array(array).map do |b|
+        h = b.respond_to?(:to_unsafe_h) ? b.to_unsafe_h : b
+        { "stars" => h["stars"].to_i, "description" => h["description"].to_s }
+      end.sort_by { |b| -b["stars"] }
+    end
   end
 end

data/app/controllers/completion_kit/runs_controller.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 module CompletionKit
   class RunsController < ApplicationController
     include CompletionKit::TagFiltering
-    before_action :set_run, only: [:show, :edit, :update, :destroy, :generate, :suggest, :retry_failures, :rerun, :refresh_status]
+    before_action :set_run, only: [:show, :edit, :update, :destroy, :generate, :suggest, :retry_failures, :rerun, :regrade, :refresh_status, :compare]
     before_action :load_form_collections, only: [:new, :edit, :create, :update]
     def index
@@ -78,6 +78,29 @@ module CompletionKit
       end
     end
+    def compare
+      other_id = params[:with]
+      if other_id.blank?
+        @other_runs = Run.where(dataset_id: @run.dataset_id, prompt_id: @run.prompt_id)
+                          .where.not(id: @run.id)
+                          .order(created_at: :desc)
+                          .limit(50)
+        return render(:compare_picker)
+      end
+      @other_run = Run.find(other_id)
+      @comparison = build_run_comparison(@run, @other_run)
+      render(:compare)
+    end
+    def regrade
+      if @run.regrade!
+        redirect_to run_path(@run), notice: "Re-grading existing responses with the current judge."
+      else
+        redirect_to run_path(@run), alert: "Nothing to re-grade. The run has no succeeded responses or no metrics attached."
+      end
+    end
     def rerun
       new_run = Run.create!(
         prompt_id: @run.prompt_id,
@@ -126,6 +149,12 @@ module CompletionKit
     end
     def retry_failures
+      if @run.stale_review_summary.any?
+        redirect_to run_path(@run),
+                    alert: "The judge has changed since this run executed. Retrying failed cases would mix scores from two metric versions in the same run. Use 'Re-run with current judge' to refresh everything against the live judge."
+        return
+      end
       scope = @run.responses.where(status: "failed")
       scope = scope.where(id: params[:only]) if params[:only].present?
@@ -157,6 +186,45 @@ module CompletionKit
       @run = Run.find(params[:id])
     end
+    def build_run_comparison(left, right)
+      left_responses = left.responses.includes(:reviews).order(:row_index, :id)
+      right_responses = right.responses.includes(:reviews).order(:row_index, :id)
+      right_by_input = right_responses.each_with_object({}) { |r, h| h[r.input_data.to_s] ||= r }
+      all_reviews = left_responses.flat_map(&:reviews) + right_responses.flat_map(&:reviews)
+      metric_ids = all_reviews.map(&:metric_id).compact.uniq
+      metric_versions = MetricVersion.where(id: all_reviews.map(&:metric_version_id).compact.uniq).index_by(&:id)
+      rows = left_responses.map do |lr|
+        rr = right_by_input[lr.input_data.to_s]
+        {
+          left_response: lr,
+          right_response: rr,
+          per_metric: metric_ids.map do |mid|
+            l_review = lr.reviews.find { |r| r.metric_id == mid }
+            r_review = rr && rr.reviews.find { |r| r.metric_id == mid }
+            next nil if l_review.nil? && r_review.nil?
+            anchor = l_review || r_review
+            {
+              metric_id: mid,
+              metric_name: anchor.metric_name,
+              left_score: l_review ? l_review.ai_score : nil,
+              right_score: r_review ? r_review.ai_score : nil,
+              left_version_label: version_label_for(l_review, metric_versions),
+              right_version_label: version_label_for(r_review, metric_versions),
+              delta: (l_review&.ai_score && r_review&.ai_score) ? (r_review.ai_score.to_f - l_review.ai_score.to_f).round(2) : nil
+            }
+          end.compact
+        }
+      end
+      { rows: rows, metric_ids: metric_ids }
+    end
+    def version_label_for(review, metric_versions)
+      return nil if review.nil? || review.metric_version_id.nil?
+      metric_versions[review.metric_version_id]&.version_label
+    end
     def load_form_collections
       @prompts = Prompt.order(:name)
       @datasets = Dataset.order(:name)

data/app/jobs/completion_kit/judge_review_job.rb CHANGED Viewed

@@ -62,9 +62,11 @@ module CompletionKit
       )
       review = response.reviews.find_or_initialize_by(metric_id: metric.id)
+      current_metric_version = MetricVersion.ensure_current_for(metric)
       review.assign_attributes(
         metric_name: metric.name,
         instruction: metric.instruction.to_s,
+        metric_version_id: current_metric_version.id,
         status: "succeeded",
         ai_score: evaluation[:score],
         ai_feedback: evaluation[:feedback],
@@ -122,6 +124,7 @@ module CompletionKit
     end
     def few_shot_payload(metric)
+      return nil unless CompletionKit.config.judge_calibration_enabled
       Array(metric.few_shot_examples).map do |fs|
         {
           human_score: fs["human_score"],

data/app/models/completion_kit/calibration.rb CHANGED Viewed

@@ -5,7 +5,7 @@ module CompletionKit
     belongs_to :run
     belongs_to :response
     belongs_to :metric
-    belongs_to :judge_version
+    belongs_to :metric_version
     validates :verdict, presence: true, inclusion: { in: VERDICTS }
     validates :response_id,
@@ -22,7 +22,7 @@ module CompletionKit
         run_id: run_id,
         response_id: response_id,
         metric_id: metric_id,
-        judge_version_id: judge_version_id,
+        metric_version_id: metric_version_id,
         verdict: verdict,
         corrected_score: corrected_score,
         note: note,

data/app/models/completion_kit/metric.rb CHANGED Viewed

@@ -24,7 +24,6 @@ module CompletionKit
     before_validation :generate_key
     before_validation :normalize_rubric_bands
     before_validation :set_defaults
-    after_update :fork_draft_judge_version, if: :judge_relevant_changes?
     def self.default_rubric_bands
       DEFAULT_RUBRIC_BANDS.map(&:dup)
@@ -98,21 +97,5 @@ module CompletionKit
       self.rubric_bands = self.class.normalize_rubric_bands(rubric_bands) if rubric_bands.present?
     end
-    def judge_relevant_changes?
-      saved_change_to_instruction? || saved_change_to_rubric_bands?
-    end
-    def fork_draft_judge_version
-      JudgeVersion.ensure_current_for(self)
-      JudgeVersion.where(metric_id: id, state: "draft").update_all(current: false)
-      JudgeVersion.create!(
-        metric: self,
-        instruction: instruction,
-        rubric_bands: rubric_bands,
-        current: false,
-        state: "draft",
-        source: "edit"
-      )
-    end
   end
 end

data/app/models/completion_kit/{judge_version.rb → metric_version.rb} RENAMED Viewed

@@ -1,5 +1,5 @@
 module CompletionKit
-  class JudgeVersion < ApplicationRecord
+  class MetricVersion < ApplicationRecord
     STATES = %w[draft published].freeze
     belongs_to :metric
@@ -41,7 +41,7 @@ module CompletionKit
     end
     def publish!
-      JudgeVersion.transaction do
+      MetricVersion.transaction do
         self.class.where(metric_id: metric_id).where.not(id: id).update_all(current: false)
         reload
         update!(state: "published", current: true, published_at: published_at || Time.current)
@@ -76,4 +76,5 @@ module CompletionKit
       self.version_number = max + 1
     end
   end
 end

data/app/models/completion_kit/review.rb CHANGED Viewed

@@ -5,8 +5,16 @@ module CompletionKit
     belongs_to :response
     belongs_to :metric, optional: true
+    belongs_to :metric_version, optional: true
     has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
+    def stale_against_current_judge?
+      return false unless metric_id && metric_version_id
+      current_id = MetricVersion.current.where(metric_id: metric_id).limit(1).pick(:id)
+      return false if current_id.nil?
+      metric_version_id != current_id
+    end
     validates :metric_name, presence: true
     validates :status, inclusion: { in: STATUSES }
     validates :ai_score, numericality: { greater_than_or_equal_to: 1, less_than_or_equal_to: 5 }, allow_nil: true
@@ -29,6 +37,7 @@ module CompletionKit
     def as_json(options = {})
       {
         id: id, response_id: response_id, metric_id: metric_id,
+        metric_version_id: metric_version_id,
         metric_name: metric_name, ai_score: ai_score,
         ai_feedback: ai_feedback, status: status, attempts: attempts,
         error: error_payload

data/app/models/completion_kit/run.rb CHANGED Viewed

@@ -89,6 +89,34 @@ module CompletionKit
       end
     end
+    def stale_review_summary
+      review_pairs = Review.where(response_id: response_ids)
+                          .where.not(metric_id: nil)
+                          .where.not(metric_version_id: nil)
+                          .pluck(:metric_id, :metric_version_id, :metric_name)
+      return {} if review_pairs.empty?
+      metric_ids = review_pairs.map(&:first).uniq
+      version_ids = review_pairs.map { |_, vid, _| vid }.uniq
+      current_by_metric = MetricVersion.current.where(metric_id: metric_ids).pluck(:metric_id, :id, :version_number).each_with_object({}) do |(mid, vid, vnum), h|
+        h[mid] = { id: vid, label: "v#{vnum}" }
+      end
+      label_by_version = MetricVersion.where(id: version_ids).pluck(:id, :version_number).each_with_object({}) { |(vid, vnum), h| h[vid] = "v#{vnum}" }
+      summary = {}
+      review_pairs.each do |metric_id, version_id, metric_name|
+        current = current_by_metric[metric_id]
+        next if current.nil?
+        next if version_id == current[:id]
+        label = label_by_version[version_id]
+        next if label.nil?
+        summary[metric_id] ||= { metric_name: metric_name, current_label: current[:label], stale_count: 0, scored_labels: [] }
+        summary[metric_id][:stale_count] += 1
+        summary[metric_id][:scored_labels] |= [label]
+      end
+      summary
+    end
     def start!
       rows = if dataset
                CsvProcessor.process_self(self)
@@ -151,6 +179,38 @@ module CompletionKit
       start!
     end
+    def regrade!
+      grading_metrics = metrics
+      return false if grading_metrics.empty? || !judge_configured?
+      eligible_responses = responses.where(status: "succeeded").where.not(response_text: nil)
+      response_ids = eligible_responses.pluck(:id)
+      return false if response_ids.empty?
+      transaction do
+        Review.where(response_id: response_ids).update_all(
+          status: "pending",
+          attempts: 0,
+          metric_version_id: nil,
+          ai_score: nil,
+          ai_feedback: nil,
+          error_provider: nil,
+          error_class: nil,
+          error_status: nil,
+          error_message: nil
+        )
+        update!(status: "running", failure_summary: nil, error_message: nil)
+        response_ids.each do |rid|
+          grading_metrics.each { |m| JudgeReviewJob.perform_later(rid, m.id) }
+        end
+        RunCompletionCheckJob.perform_later(id)
+      end
+      broadcast_ui
+      true
+    end
     def progress_snapshot
       generated_done = responses.where(status: "succeeded").count
       generated_failed = responses.where(status: "failed").count

data/app/services/completion_kit/mcp_tools/calibrations.rb CHANGED Viewed

@@ -56,7 +56,7 @@ module CompletionKit
           run_id: run.id, response_id: response.id, metric_id: metric.id, created_by: created_by
         )
         calibration.assign_attributes(
-          judge_version: CompletionKit::JudgeVersion.ensure_current_for(metric),
+          metric_version: CompletionKit::MetricVersion.ensure_current_for(metric),
           verdict: args["verdict"],
           corrected_score: args["corrected_score"],
           note: args["note"]

data/app/services/completion_kit/mcp_tools/judges.rb CHANGED Viewed

@@ -5,7 +5,7 @@ module CompletionKit
       TOOLS = {
         "judges_suggest" => {
-          description: "Ask the model to rewrite the metric's judge instruction in N variants targeted at the recent disagreements. Each variant is saved as a draft JudgeVersion with source=\"suggestion\". Returns the persisted drafts. Stripe-metering hooks fire via ActiveSupport::Notifications under completion_kit.judge_suggestion.generated.",
+          description: "Ask the model to rewrite the metric's judge instruction in N variants targeted at the recent disagreements. Each variant is saved as a draft MetricVersion with source=\"suggestion\". Returns the persisted drafts. Stripe-metering hooks fire via ActiveSupport::Notifications under completion_kit.judge_suggestion.generated.",
           inputSchema: {
             type: "object",
             properties: {
@@ -33,15 +33,15 @@ module CompletionKit
           handler: :replay
         },
         "judges_compare" => {
-          description: "Compare two judge versions' calibration stats side by side. Pass either two judge_version_ids or one metric_id with judge_version_a_id / judge_version_b_id.",
+          description: "Compare two metric versions' calibration stats side by side. Pass either two metric_version_ids or one metric_id with metric_version_a_id / metric_version_b_id.",
           inputSchema: {
             type: "object",
             properties: {
               metric_id: { type: "integer" },
-              judge_version_a_id: { type: "integer" },
-              judge_version_b_id: { type: "integer" }
+              metric_version_a_id: { type: "integer" },
+              metric_version_b_id: { type: "integer" }
             },
-            required: ["metric_id", "judge_version_a_id", "judge_version_b_id"]
+            required: ["metric_id", "metric_version_a_id", "metric_version_b_id"]
           },
           handler: :compare
         }
@@ -49,7 +49,7 @@ module CompletionKit
       def self.suggest(args)
         metric = CompletionKit::Metric.find(args["metric_id"])
-        generator = CompletionKit::JudgeVariantGenerator.new(metric, count: args["count"].to_i, model: args["model"])
+        generator = CompletionKit::MetricVariantGenerator.new(metric, count: args["count"].to_i, model: args["model"])
         variants = generator.call
         return error_result("Variant generator returned no parseable variants. Try again or change the model.") if variants.empty?
         versions = generator.persist!(variants)
@@ -75,20 +75,20 @@ module CompletionKit
       def self.compare(args)
         metric = CompletionKit::Metric.find(args["metric_id"])
-        a = CompletionKit::JudgeVersion.where(metric_id: metric.id).find(args["judge_version_a_id"])
-        b = CompletionKit::JudgeVersion.where(metric_id: metric.id).find(args["judge_version_b_id"])
-        stats_a = CompletionKit::MetricCalibrationStats.for(metric, judge_version: a)
-        stats_b = CompletionKit::MetricCalibrationStats.for(metric, judge_version: b)
+        a = CompletionKit::MetricVersion.where(metric_id: metric.id).find(args["metric_version_a_id"])
+        b = CompletionKit::MetricVersion.where(metric_id: metric.id).find(args["metric_version_b_id"])
+        stats_a = CompletionKit::MetricCalibrationStats.for(metric, metric_version: a)
+        stats_b = CompletionKit::MetricCalibrationStats.for(metric, metric_version: b)
         text_result({
           metric_id: metric.id,
-          a: judge_version_payload(a, stats_a),
-          b: judge_version_payload(b, stats_b),
+          a: metric_version_payload(a, stats_a),
+          b: metric_version_payload(b, stats_b),
           delta: delta_payload(stats_a, stats_b),
           recommendation: recommendation_for(stats_a, stats_b)
         })
       end
-      def self.judge_version_payload(version, stats)
+      def self.metric_version_payload(version, stats)
         {
           id: version.id, state: version.state, current: version.current,
           source: version.source, created_at: version.created_at,

data/app/services/completion_kit/metric_calibration_stats.rb CHANGED Viewed

@@ -33,25 +33,25 @@ module CompletionKit
     CURRENT = :current
-    def self.for(metric, judge_version: CURRENT)
-      resolved = case judge_version
-                 when CURRENT then JudgeVersion.current.find_by(metric_id: metric.id)
+    def self.for(metric, metric_version: CURRENT)
+      resolved = case metric_version
+                 when CURRENT then MetricVersion.current.find_by(metric_id: metric.id)
                  when nil then nil
-                 else judge_version
+                 else metric_version
                  end
-      new(metric: metric, judge_version: resolved, all_versions: judge_version.nil?).call
+      new(metric: metric, metric_version: resolved, all_versions: metric_version.nil?).call
     end
-    def initialize(metric:, judge_version: nil, all_versions: false)
+    def initialize(metric:, metric_version: nil, all_versions: false)
       @metric = metric
-      @judge_version = judge_version
+      @metric_version = metric_version
       @all_versions = all_versions
     end
     def call
       scope = Calibration.where(metric_id: @metric.id)
-      if @judge_version
-        scope = scope.where(judge_version_id: @judge_version.id)
+      if @metric_version
+        scope = scope.where(metric_version_id: @metric_version.id)
       elsif !@all_versions
         scope = scope.none
       end

data/app/services/completion_kit/{judge_variant_generator.rb → metric_variant_generator.rb} RENAMED Viewed

@@ -1,5 +1,5 @@
 module CompletionKit
-  class JudgeVariantGenerator
+  class MetricVariantGenerator
     DEFAULT_VARIANT_COUNT = 1
     MAX_VARIANT_COUNT = 3
     DEFAULT_TEMPERATURE = 0.4
@@ -20,9 +20,9 @@ module CompletionKit
     end
     def persist!(variants)
-      JudgeVersion.where(metric_id: @metric.id, state: "draft", source: "suggestion").update_all(current: false)
+      MetricVersion.where(metric_id: @metric.id, state: "draft", source: "suggestion").update_all(current: false)
       versions = variants.map do |variant|
-        JudgeVersion.create!(
+        MetricVersion.create!(
           metric: @metric,
           instruction: variant.instruction,
           rubric_bands: variant.rubric_bands.presence || @metric.rubric_bands,
@@ -41,8 +41,9 @@ module CompletionKit
     private
     def build_meta_prompt
-      disagreements = JudgeCalibrationExamples.disagreements_for(@metric)
-      borderlines = JudgeCalibrationExamples.borderlines_for(@metric)
+      disagreements = MetricCalibrationExamples.disagreements_for(@metric)
+      borderlines = MetricCalibrationExamples.borderlines_for(@metric)
+      pinned_examples = Array(@metric.few_shot_examples)
       sections = []
       sections << "You are an expert evaluator. The judge below is misaligned with humans. Propose #{@count == 1 ? "a single" : "#{@count}"} concrete rewrite that closes the gap."
       sections << ""
@@ -77,6 +78,18 @@ module CompletionKit
           sections << ""
         end
       end
+      if pinned_examples.any?
+        sections << "## Pinned cases the judge already references"
+        sections << "These are cases the operator pinned for the judge to remember. The improved rubric must remain consistent with these — that is, the new instruction + rubric should produce roughly the human_score on these inputs, not the judge_score."
+        pinned_examples.each_with_index do |ex, i|
+          sections << "### Pinned #{i + 1}"
+          sections << "Input: #{ex["input"].to_s.truncate(200)}"
+          sections << "Output: #{ex["response"].to_s.truncate(200)}"
+          sections << "Judge previously said #{ex["judge_score"]}/5: #{ex["judge_feedback"].to_s.truncate(160)}"
+          sections << "Human said #{ex["human_score"]}/5: #{ex["human_note"].to_s.truncate(160)}"
+          sections << ""
+        end
+      end
       sections << "## Task"
       sections << "Make one substantive change. Don't just reword. If the disagreements look like instruction problems, rewrite the instruction. If they look like rubric problems (overlapping bands, undefined edge cases), rewrite the rubric. Rewrite both if both are wrong."
       sections << ""
@@ -117,7 +130,7 @@ module CompletionKit
     end
   end
-  module JudgeCalibrationExamples
+  module MetricCalibrationExamples
     module_function
     def for(metric, limit: 8)
@@ -133,13 +146,14 @@ module CompletionKit
     end
     def calibrations_for(metric, verdict:, limit:)
-      scope = Calibration.where(metric_id: metric.id, verdict: verdict)
-      current_version = JudgeVersion.current.find_by(metric_id: metric.id)
-      scope = scope.where(judge_version_id: current_version.id) if current_version
-      scope.includes(response: :reviews)
-           .order(created_at: :desc)
-           .limit(limit)
-           .map do |cal|
+      base = Calibration.where(metric_id: metric.id, verdict: verdict)
+      current_version = MetricVersion.current.find_by(metric_id: metric.id)
+      scoped = current_version ? base.where(metric_version_id: current_version.id) : base
+      effective = scoped.exists? ? scoped : base
+      effective.includes(response: :reviews)
+               .order(created_at: :desc)
+               .limit(limit)
+               .map do |cal|
         review = cal.response.reviews.find { |r| r.metric_id == metric.id }
         {
           input: cal.response.input_data,

data/app/views/completion_kit/calibrations/_trust_panel.html.erb CHANGED Viewed

@@ -3,19 +3,29 @@
 <% anchor = metric&.name&.parameterize %>
 <% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric
      created_by = CompletionKit.config.username.presence || "operator"
-     verdicted_ids = CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by).pluck(:response_id)
+     current_metric_version = CompletionKit::MetricVersion.current.find_by(metric_id: metric.id)
+     verdicted_ids = if current_metric_version
+       CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
+     else
+       []
+     end
      CompletionKit::Response.joins(:reviews)
        .where(reviews: { metric_id: metric.id })
        .where.not(reviews: { ai_score: nil })
        .where.not(id: verdicted_ids)
        .order(created_at: :desc).first
    end %>
+<% prior_version_verdicts = if stats.sample_size.zero? && metric && current_metric_version
+     CompletionKit::Calibration.where(metric_id: metric.id).where.not(metric_version_id: current_metric_version.id).count
+   else
+     0
+   end %>
 <p class="ck-trust-line ck-trust-line--<%= stats.gate %>">
   <span class="ck-trust-line__label"><%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true" %>Calibration</span>
   <% if stats.sample_size.zero? %>
     <span class="ck-trust-line__state">Not measured yet.</span>
-    <span class="ck-trust-line__hint">Needs <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts on the judge's scores.<% if target_response %>
+    <span class="ck-trust-line__hint">Needs <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts on the judge's scores.<% if prior_version_verdicts > 0 %> (<%= pluralize(prior_version_verdicts, "verdict") %> on prior versions, tied to that version's history.)<% end %><% if target_response %>
       <%= link_to "Give a verdict →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-trust-line__link" %>
     <% else %> Run this metric on a dataset, then give a verdict.<% end %></span>
   <% elsif stats.counter_only? %>

data/app/views/completion_kit/metrics/_form.html.erb CHANGED Viewed

@@ -16,14 +16,14 @@
     </div>
   <% end %>
-  <% if edit_draft && !suggestion %>
-    <% pub = local_assigns[:published_judge_version] %>
+  <% if edit_draft %>
+    <% pub = local_assigns[:published_metric_version] %>
     <% draft_instr_changed = pub && pub.instruction.to_s != edit_draft.instruction.to_s %>
     <% draft_rubric_changed = pub && pub.rubric_bands != edit_draft.rubric_bands %>
     <div class="ck-suggestion-banner" role="status">
       <div class="ck-suggestion-banner__body">
         <p class="ck-kicker">Draft pending</p>
-        <p class="ck-meta-copy">An unpublished draft of this metric is saved. Publish to replace the live<%= " instruction" if draft_instr_changed %><%= " and" if draft_instr_changed && draft_rubric_changed %><%= " rubric" if draft_rubric_changed %> for future runs, or keep editing.</p>
+        <p class="ck-meta-copy">The form below shows your unpublished draft. Publish to replace the live<%= " instruction" if draft_instr_changed %><%= " and" if draft_instr_changed && draft_rubric_changed %><%= " rubric" if draft_rubric_changed %> for future runs, or keep editing.</p>
       </div>
       <div class="ck-suggestion-banner__actions">
         <%= button_to "Discard draft", dismiss_suggestion_metric_path(metric, draft_id: edit_draft.id, back_to: "edit"),

data/app/views/completion_kit/metrics/edit.html.erb CHANGED Viewed

@@ -14,4 +14,4 @@
       metric: @metric,
       suggestion_draft: @suggestion_draft,
       edit_draft: @edit_draft,
-      published_judge_version: @published_judge_version %>
+      published_metric_version: @published_metric_version %>

data/app/views/completion_kit/metrics/show.html.erb CHANGED Viewed

@@ -19,20 +19,17 @@
   </div>
   <div class="ck-actions">
     <% if CompletionKit.config.judge_calibration_enabled %>
-      <% if @suggestion_draft %>
-        <%= link_to "Review improvements →", edit_metric_path(@metric),
+      <% if @suggestion_draft || @edit_draft %>
+        <% review_title = @suggestion_draft ? "The model proposed improvements based on your disagreements. Review and apply what you want." : "An unpublished draft of this metric is saved. Review and publish, or keep editing." %>
+        <%= link_to "Review changes →", edit_metric_path(@metric),
               class: ck_button_classes(:dark),
-              title: "The model proposed improvements based on your disagreements. Review and apply what you want." %>
-      <% elsif @edit_draft %>
-        <%= link_to "Review draft →", edit_metric_path(@metric),
-              class: ck_button_classes(:dark),
-              title: "An unpublished draft of this metric is saved. Review and publish, or keep editing." %>
+              title: review_title %>
       <% elsif @improve_disagreement_count.positive? %>
         <%= button_to "Improve the metric", suggest_variants_metric_path(@metric),
               method: :post, form_class: "inline-block",
               class: ck_button_classes(:light, variant: :outline),
-              title: "Have the model rewrite this metric's instruction and rubric based on the disagreements collected so far.",
-              data: { turbo_confirm: "Rewrite this metric based on the disagreements collected so far?" } %>
+              title: "Ask the model to suggest improvements to this metric's instruction and rubric based on the disagreements collected so far.",
+              data: { turbo_confirm: "Ask the model for suggested improvements based on the disagreements collected so far?" } %>
       <% else %>
         <button type="button" class="<%= ck_button_classes(:light, variant: :outline) %>" disabled
                 title="Mark at least one case as Disagree before the model can suggest a change.">Improve the metric</button>
@@ -168,19 +165,20 @@
       <p class="ck-kicker">Cases to learn from</p>
       <span class="ck-chip"><%= pluralize(@disagreements.size, "case") %></span>
     </div>
-    <p class="ck-meta-copy">Cases where a reviewer's score didn't match the judge's. Pin useful ones with <strong>Remember this</strong> so the judge sees them next time it grades.</p>
+    <% mixed_versions = @disagreements.any? { |c| c.metric_version_id != @published_metric_version.id } %>
+    <p class="ck-meta-copy">Cases where a reviewer's score didn't match the judge's. Pin useful ones with <strong>Remember this</strong> so the judge sees them next time it grades<%= " (pins flow into the current judge regardless of which version produced the verdict)" if mixed_versions %>.</p>
     <% existing_ids = Array(@metric.few_shot_examples).map { |fs| fs["calibration_id"] } %>
     <ul class="ck-disagreement-list">
       <% @disagreements.each do |cal| %>
         <% review = cal.response.reviews.find { |r| r.metric_id == @metric.id } %>
         <% already = existing_ids.include?(cal.id) %>
-        <% cal_version = cal.judge_version %>
-        <% on_current = cal_version&.id == @published_judge_version.id %>
+        <% cal_metric_version = cal.metric_version %>
+        <% on_current = cal_metric_version&.id == @published_metric_version.id %>
         <li class="ck-disagreement<%= " ck-disagreement--remembered" if already %><%= " ck-disagreement--stale" unless on_current %>">
           <div class="ck-disagreement__head">
             <div class="ck-disagreement__scores">
-              <% if cal_version %>
-                <span class="ck-source-chip <%= on_current ? "ck-source-chip--current" : "ck-source-chip--past" %>" title="<%= on_current ? "Verdict on the live judge version." : "Verdict on a superseded judge version." %>"><%= cal_version.version_label %></span>
+              <% if cal_metric_version && mixed_versions %>
+                <span class="ck-source-chip <%= on_current ? "ck-source-chip--current" : "ck-source-chip--past" %>" title="<%= on_current ? "Verdict on the live judge version." : "Verdict on a superseded judge version." %>"><%= cal_metric_version.version_label %></span>
               <% end %>
               <span class="ck-disagreement__scores-label">Judge</span>
               <% if review&.ai_score %>

data/app/views/completion_kit/responses/show.html.erb CHANGED Viewed

@@ -98,10 +98,15 @@
     <div class="ck-review-list">
       <% @reviews.each do |review| %>
-        <div class="ck-review-card" id="<%= review.metric&.name&.parameterize || "review-#{review.id}" %>">
+        <% review_version = review.metric_version %>
+        <% stale = review.stale_against_current_judge? %>
+        <div class="ck-review-card<%= " ck-review-card--stale" if stale %>" id="<%= review.metric&.name&.parameterize || "review-#{review.id}" %>">
           <div class="ck-review-card__header">
             <span class="ck-review-card__metric"><% if review.metric %><%= link_to review.metric_name, metric_path(review.metric), class: "ck-link" %><% else %><%= review.metric_name %><% end %></span>
             <div class="ck-inline">
+              <% if review_version %>
+                <span class="ck-source-chip <%= stale ? "ck-source-chip--past" : "ck-source-chip--current" %>" title="<%= stale ? "Score produced by #{review_version.version_label} of this metric. The live judge has changed since." : "Score produced by the live judge (#{review_version.version_label})." %>"><%= review_version.version_label %></span>
+              <% end %>
               <% if review.ai_score %>
                 <% 5.times do |i| %>
                   <svg viewBox="0 0 24 24" width="16" height="16" stroke-width="1.75" class="ck-star <%= i < review.ai_score.to_i ? "ck-star--filled" : "ck-star--empty" %>"><polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2"/></svg>
@@ -111,6 +116,9 @@
               <% end %>
             </div>
           </div>
+          <% if stale %>
+            <p class="ck-review-card__stale-note">Scored against a superseded version of this metric. The live judge may score this differently.</p>
+          <% end %>
           <% if review.ai_feedback.present? %>
             <p class="ck-review-card__feedback"><%= review.ai_feedback %></p>
           <% end %>

data/app/views/completion_kit/runs/_actions.html.erb CHANGED Viewed

@@ -11,6 +11,7 @@
       <%= button_to "Retry", generate_run_path(run), method: :post, class: ck_button_classes(:light, variant: :outline), form_class: "inline-block" %>
       <%= button_to "Re-run as new", rerun_run_path(run), method: :post, class: ck_button_classes(:dark), form_class: "inline-block" %>
     <% elsif run.status == "completed" %>
+      <%= link_to "Compare", compare_run_path(run), class: ck_button_classes(:light, variant: :outline) %>
       <%= button_to "Re-run", rerun_run_path(run), method: :post, class: ck_button_classes(:dark), form_class: "inline-block" %>
     <% end %>
   <% end %>

data/app/views/completion_kit/runs/compare.html.erb ADDED Viewed

@@ -0,0 +1,85 @@
+<%# Side-by-side comparison of two runs (A = @run, B = @other_run).
+     Reads @comparison[:rows]; each row carries :left_response, an optional
+     :right_response, and :per_metric entries with :metric_name, :left_score,
+     :right_score, :delta, :left_version_label and :right_version_label. %>
+<ol class="ck-breadcrumb">
+  <li><%= link_to "Runs", runs_path %></li>
+  <li><%= link_to @run.name, run_path(@run) %></li>
+  <li>vs <%= @other_run.name %></li>
+</ol>
+<section class="ck-page-header">
+  <div>
+    <h1 class="ck-title">Comparing runs</h1>
+    <p class="ck-meta-copy"><strong>A</strong>: <%= link_to @run.name, run_path(@run), class: "ck-link" %> &middot; <strong>B</strong>: <%= link_to @other_run.name, run_path(@other_run), class: "ck-link" %></p>
+  </div>
+  <div class="ck-actions">
+    <%= link_to "Pick another", compare_run_path(@run), class: ck_button_classes(:light, variant: :outline) %>
+  </div>
+</section>
+<% if @comparison[:rows].empty? %>
+  <div class="ck-empty">
+    <p>No responses to compare yet.</p>
+  </div>
+<% else %>
+  <table class="ck-results-table ck-run-compare-table">
+    <thead>
+      <tr>
+        <th scope="col">Case</th>
+        <th scope="col">Metric</th>
+        <th scope="col">A score</th>
+        <th scope="col">B score</th>
+        <th scope="col">Δ</th>
+        <th scope="col">A version</th>
+        <th scope="col">B version</th>
+      </tr>
+    </thead>
+    <tbody>
+      <% @comparison[:rows].each do |row| %>
+        <%# Case label is the 1-based row index of the A-side response. %>
+        <% case_label = ((row[:left_response].row_index || 0) + 1).to_s %>
+        <% row[:per_metric].each_with_index do |pm, idx| %>
+          <tr>
+            <%# The case cell is emitted once and spans all of that case's metric rows. %>
+            <% if idx == 0 %>
+              <td rowspan="<%= row[:per_metric].size %>">
+                <%= link_to case_label, run_response_path(@run, row[:left_response]), class: "ck-link" %>
+                <% if row[:right_response] %>
+                  <span class="ck-meta-copy">/ <%= link_to "B", run_response_path(@other_run, row[:right_response]), class: "ck-link" %></span>
+                <% end %>
+              </td>
+            <% end %>
+            <td><%= pm[:metric_name] %></td>
+            <td>
+              <% if pm[:left_score] %>
+                <span class="<%= ck_badge_classes(ck_score_kind(pm[:left_score].to_f)) %>"><%= pm[:left_score] %></span>
+              <% else %>
+                <span class="ck-meta-copy">—</span>
+              <% end %>
+            </td>
+            <td>
+              <% if pm[:right_score] %>
+                <span class="<%= ck_badge_classes(ck_score_kind(pm[:right_score].to_f)) %>"><%= pm[:right_score] %></span>
+              <% else %>
+                <span class="ck-meta-copy">—</span>
+              <% end %>
+            </td>
+            <td>
+              <% if pm[:delta] %>
+                <%# Sign of the delta picks the modifier class; spelled out instead of a nested ternary. %>
+                <% delta_class = if pm[:delta] > 0
+                     "ck-delta--positive"
+                   elsif pm[:delta] < 0
+                     "ck-delta--negative"
+                   else
+                     "ck-delta--zero"
+                   end %>
+                <span class="ck-delta <%= delta_class %>"><%= pm[:delta].positive? ? "+#{pm[:delta]}" : pm[:delta].to_s %></span>
+              <% else %>
+                <span class="ck-meta-copy">—</span>
+              <% end %>
+            </td>
+            <td>
+              <% if pm[:left_version_label] %>
+                <span class="ck-source-chip ck-source-chip--current"><%= pm[:left_version_label] %></span>
+              <% end %>
+            </td>
+            <td>
+              <% if pm[:right_version_label] %>
+                <span class="ck-source-chip ck-source-chip--current"><%= pm[:right_version_label] %></span>
+              <% end %>
+            </td>
+          </tr>
+        <% end %>
+      <% end %>
+    </tbody>
+  </table>
+<% end %>

data/app/views/completion_kit/runs/compare_picker.html.erb ADDED Viewed

@@ -0,0 +1,39 @@
+<%# Picker shown before a comparison: lists @other_runs as candidates to
+     compare against @run, or an empty state when there are none. %>
+<ol class="ck-breadcrumb">
+  <li><%= link_to "Runs", runs_path %></li>
+  <li><%= link_to @run.name, run_path(@run) %></li>
+  <li>Compare</li>
+</ol>
+<section class="ck-page-header">
+  <div>
+    <h1 class="ck-title">Compare with another run</h1>
+    <p class="ck-lead">Pick a run on the same dataset and prompt to see per-case score deltas side by side.</p>
+  </div>
+</section>
+<% if @other_runs.any? %>
+  <table class="ck-results-table">
+    <thead>
+      <tr>
+        <th scope="col">Run</th>
+        <th scope="col">Judge</th>
+        <th scope="col">Created</th>
+        <th scope="col"></th>
+      </tr>
+    </thead>
+    <tbody>
+      <% @other_runs.each do |other| %>
+        <tr>
+          <td><%= link_to other.name, run_path(other), class: "ck-link" %></td>
+          <td class="ck-meta-copy"><%= other.judge_model %></td>
+          <td class="ck-meta-copy"><time datetime="<%= other.created_at.utc.iso8601 %>"><%= time_ago_in_words(other.created_at) %> ago</time></td>
+          <td class="ck-results-table__arrow"><%= link_to "Compare →", compare_run_path(@run, with: other.id), class: "ck-link" %></td>
+        </tr>
+      <% end %>
+    </tbody>
+  </table>
+<% else %>
+  <%# rerun is routed as POST only. Turbo reads data-turbo-method; a bare
+       `method: :post` on link_to is only honored by rails-ujs and would
+       otherwise degrade to a GET that matches no route. %>
+  <div class="ck-empty">
+    <p>No other runs on this dataset + prompt combination yet. <%= link_to "Re-run from this one", rerun_run_path(@run), data: { turbo_method: :post }, class: "ck-link" %> to create one.</p>
+  </div>
+<% end %>

data/app/views/completion_kit/runs/show.html.erb CHANGED Viewed

@@ -18,6 +18,35 @@
   <% dataset_preview_lines = dataset_lines.first(50) %>
 <% end %>
+<% if CompletionKit.config.judge_calibration_enabled %>
+  <% stale_summary = @run.stale_review_summary %>
+  <% if stale_summary.any? %>
+    <div class="ck-stale-versions-banner" role="status">
+      <div class="ck-stale-versions-banner__body">
+        <p class="ck-kicker">Stale judge versions</p>
+        <p class="ck-meta-copy">
+          This run was scored against metric versions that are no longer live.
+          <% stale_summary.values.each_with_index do |s, i| %>
+            <%= ", " if i > 0 %><strong><%= s[:metric_name] %></strong> (scored by <%= s[:scored_labels].join(", ") %>; live is <%= s[:current_label] %>)<% end %>.
+          Re-run to refresh the scores with the current judge.
+        </p>
+      </div>
+      <% if @run.status == "completed" %>
+        <%= button_to "Re-run from scratch",
+              rerun_run_path(@run), method: :post,
+              class: ck_button_classes(:light, variant: :outline), form_class: "inline-block",
+              title: "Create a new run that regenerates responses and grades them with the current judge.",
+              data: { turbo_confirm: "Create a new run with fresh responses and the current judge? The original run stays as a record." } %>
+        <%= button_to "Re-grade with current judge",
+              regrade_run_path(@run), method: :post,
+              class: ck_button_classes(:dark), form_class: "inline-block",
+              title: "Re-judge this run's existing responses against the current judge. Faster and cheaper than re-running.",
+              data: { turbo_confirm: "Re-judge this run's existing responses against the current judge?" } %>
+      <% end %>
+    </div>
+  <% end %>
+<% end %>
 <div class="ck-run-config">
   <div class="ck-run-config__row">
     <span class="ck-run-config__key">Created</span>

data/config/routes.rb CHANGED Viewed

@@ -37,7 +37,9 @@ CompletionKit::Engine.routes.draw do
       post :suggest
       post :retry_failures
       post :rerun
+      post :regrade
       get :refresh_status
+      get :compare
     end
     resources :responses, only: [:show] do
       resources :calibrations, only: [:create]

data/db/migrate/20260528000001_rename_judge_version_to_metric_version.rb ADDED Viewed

@@ -0,0 +1,22 @@
+# Moves the judge-version schema to the metric-version vocabulary: renames the
+# versions table, the calibrations `judge_version_id` column, and the
+# explicitly named indexes on both tables.
+class RenameJudgeVersionToMetricVersion < ActiveRecord::Migration[8.1]
+  # Old index name => new index name, all on completion_kit_metric_versions.
+  METRIC_VERSION_INDEX_RENAMES = {
+    "index_ck_judge_versions_on_metric_id" => "index_ck_metric_versions_on_metric_id",
+    "index_ck_judge_versions_on_metric_current" => "index_ck_metric_versions_on_metric_current",
+    "index_ck_judge_versions_on_metric_state" => "index_ck_metric_versions_on_metric_state",
+    "index_ck_judge_versions_on_metric_version" => "index_ck_metric_versions_on_metric_vnum"
+  }.freeze
+
+  def change
+    rename_table :completion_kit_judge_versions, :completion_kit_metric_versions
+    rename_column :completion_kit_calibrations, :judge_version_id, :metric_version_id
+
+    # Index renames run after the table rename, so they address the new table name.
+    METRIC_VERSION_INDEX_RENAMES.each do |old_name, new_name|
+      rename_index :completion_kit_metric_versions, old_name, new_name
+    end
+
+    rename_index :completion_kit_calibrations,
+                 "index_ck_calibrations_on_judge_version_id",
+                 "index_ck_calibrations_on_metric_version_id"
+  end
+end

data/db/migrate/20260528000002_add_metric_version_to_reviews.rb ADDED Viewed

@@ -0,0 +1,21 @@
+# Adds a nullable, indexed `metric_version_id` to reviews and backfills it.
+class AddMetricVersionToReviews < ActiveRecord::Migration[8.1]
+  def change
+    add_column :completion_kit_reviews, :metric_version_id, :bigint
+    add_index :completion_kit_reviews, :metric_version_id, name: "index_ck_reviews_on_metric_version_id"
+    # Data backfill only runs on the way up; no dir.down is defined here.
+    reversible do |dir|
+      dir.up do
+        # Point every review that has a metric at that metric's version flagged
+        # `current`; the subquery yields NULL when the metric has no such row.
+        # The boolean literal is quoted through the migration's own connection
+        # so it matches the adapter of the database being migrated.
+        execute <<~SQL
+          UPDATE completion_kit_reviews
+          SET metric_version_id = (
+            SELECT id FROM completion_kit_metric_versions mv
+            WHERE mv.metric_id = completion_kit_reviews.metric_id
+              AND mv.current = #{connection.quote(true)}
+            LIMIT 1
+          )
+          WHERE metric_id IS NOT NULL
+        SQL
+      end
+    end
+  end
+end

data/lib/completion_kit/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module CompletionKit
-  VERSION = "0.5.43"
+  VERSION = "0.6.0"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: completion-kit
 version: !ruby/object:Gem::Version
-  version: 0.5.43
+  version: 0.6.0
 platform: ruby
 authors:
 - Damien Bastin
@@ -272,11 +272,11 @@ files:
 - app/models/completion_kit/calibration.rb
 - app/models/completion_kit/dashboard_dismissal.rb
 - app/models/completion_kit/dataset.rb
-- app/models/completion_kit/judge_version.rb
 - app/models/completion_kit/mcp_session.rb
 - app/models/completion_kit/metric.rb
 - app/models/completion_kit/metric_group.rb
 - app/models/completion_kit/metric_group_membership.rb
+- app/models/completion_kit/metric_version.rb
 - app/models/completion_kit/model.rb
 - app/models/completion_kit/prompt.rb
 - app/models/completion_kit/provider_credential.rb
@@ -295,7 +295,6 @@ files:
 - app/services/completion_kit/csv_processor.rb
 - app/services/completion_kit/dashboard_stats.rb
 - app/services/completion_kit/judge_service.rb
-- app/services/completion_kit/judge_variant_generator.rb
 - app/services/completion_kit/llm_client.rb
 - app/services/completion_kit/mcp_dispatcher.rb
 - app/services/completion_kit/mcp_tools/base.rb
@@ -310,6 +309,7 @@ files:
 - app/services/completion_kit/mcp_tools/runs.rb
 - app/services/completion_kit/mcp_tools/tags.rb
 - app/services/completion_kit/metric_calibration_stats.rb
+- app/services/completion_kit/metric_variant_generator.rb
 - app/services/completion_kit/model_discovery_service.rb
 - app/services/completion_kit/ollama_client.rb
 - app/services/completion_kit/onboarding/checklist.rb
@@ -377,6 +377,8 @@ files:
 - app/views/completion_kit/runs/_status_header.html.erb
 - app/views/completion_kit/runs/_status_panel.html.erb
 - app/views/completion_kit/runs/_table.html.erb
+- app/views/completion_kit/runs/compare.html.erb
+- app/views/completion_kit/runs/compare_picker.html.erb
 - app/views/completion_kit/runs/edit.html.erb
 - app/views/completion_kit/runs/index.html.erb
 - app/views/completion_kit/runs/new.html.erb
@@ -422,6 +424,8 @@ files:
 - db/migrate/20260523000002_add_state_to_completion_kit_judge_versions.rb
 - db/migrate/20260524000001_create_completion_kit_starter_metric_dismissals.rb
 - db/migrate/20260525000001_add_version_number_and_published_at_to_judge_versions.rb
+- db/migrate/20260528000001_rename_judge_version_to_metric_version.rb
+- db/migrate/20260528000002_add_metric_version_to_reviews.rb
 - lib/completion-kit.rb
 - lib/completion_kit.rb
 - lib/completion_kit/concurrency_check.rb