RubyGems - completion-kit - Versions diffs - 0.5.43 → 0.5.44 - Mend

completion-kit 0.5.43 → 0.5.44

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 7a9284c6a53b1b609de8ca2c081111687990d32f40f1fa4d2422670daeae9f2f
-  data.tar.gz: edb62bc8b34b3ecce534a1e4f0730066d6b56f591d05046b1904eb33f9f7cbc6
+  metadata.gz: d81df0996441d12c0fb540b9f29bb514813adcdbea3ceefb515d318f28947731
+  data.tar.gz: 606764f41e74cec3284f1155d7ef86e77a61af708af2320d5b02640827741f7a
 SHA512:
-  metadata.gz: 0aaf95d75bdfee01b387d3ebe97434168815d58627f8d855ad3dd15534e33c2a69eca7ee8a25a964f6669f891026d350abb5c23e23006ada5a1c56df9ad616ea
-  data.tar.gz: 800fec24cee472a245fcfffbb025eabb2a3bc62cbfc513d1ec0a2c7aa8d1e304f59cc28aa9074712060d0f49ac6bbfba4597cc17e1d3b8db71c5e3b9c557dcab
+  metadata.gz: 9e468cd12eb143f4b5eb64333339199420db4c9d0c78ec548965972eee5e326d574a80c6c3092d63f4d99d88901ce3470ac688468d2813f5370e589568fba669
+  data.tar.gz: 7377f00a31d539297f9e79059083aa7bfef782d18d1ecfcb9f7da1ff648ce1eaf6f8a94bc55d56fcca22a47e09c7fcb1bc89981563aa351e4293c47f8d886570

data/app/assets/stylesheets/completion_kit/application.css CHANGED Viewed

@@ -2816,6 +2816,31 @@ select.ck-input {
   line-height: 1.55;
 }
+.ck-review-card--stale {
+  border-left: 2px solid rgba(224, 164, 88, 0.45);
+}
+.ck-stale-versions-banner {
+  margin: 0 0 1rem;
+  padding: 0.9rem 1rem;
+  border: 1px solid rgba(224, 164, 88, 0.4);
+  background: rgba(224, 164, 88, 0.06);
+  border-radius: var(--ck-radius);
+  display: flex;
+  align-items: center;
+  justify-content: space-between;
+  gap: 1rem;
+  flex-wrap: wrap;
+}
+.ck-stale-versions-banner__body { min-width: 0; flex: 1 1 320px; }
+.ck-stale-versions-banner .ck-kicker { color: var(--ck-warning); }
+.ck-review-card__stale-note {
+  margin: 0.4rem 0 0;
+  font-family: var(--ck-mono);
+  font-size: 0.78rem;
+  color: var(--ck-warning);
+}
 @media (max-width: 900px) {
   .ck-grid--sidebar,
   .ck-grid--cards,

data/app/controllers/completion_kit/api/v1/calibrations_controller.rb CHANGED Viewed

@@ -15,7 +15,7 @@ module CompletionKit
             run: @run,
             response: @response,
             metric: @metric,
-            judge_version: JudgeVersion.ensure_current_for(@metric),
+            metric_version: MetricVersion.ensure_current_for(@metric),
             **calibration_params
           )

data/app/controllers/completion_kit/api/v1/runs_controller.rb CHANGED Viewed

@@ -45,6 +45,10 @@ module CompletionKit
         end
         def retry_failures
+          if @run.stale_review_summary.any?
+            return render(json: { error: "Judge has changed since this run executed. Retry would mix versions in the same run; use POST /api/v1/runs/:id/rerun instead." }, status: :conflict)
+          end
           scope = @run.responses.where(status: "failed")
           scope = scope.where(id: params[:only]) if params[:only].present?

data/app/controllers/completion_kit/calibrations_controller.rb CHANGED Viewed

@@ -18,7 +18,7 @@ module CompletionKit
         run: @run, response: @response, metric: @metric, created_by: created_by
       )
       calibration.assign_attributes(
-        judge_version: JudgeVersion.ensure_current_for(@metric),
+        metric_version: MetricVersion.ensure_current_for(@metric),
         verdict: params[:verdict],
         corrected_score: params[:corrected_score].presence,
         note: params[:note].presence

data/app/controllers/completion_kit/metrics_controller.rb CHANGED Viewed

@@ -35,16 +35,16 @@ module CompletionKit
     end
     def show
-      @published_judge_version = JudgeVersion.ensure_current_for(@metric)
+      @published_metric_version = MetricVersion.ensure_current_for(@metric)
       @disagreements = Calibration.where(metric_id: @metric.id, verdict: "disagree")
-                                  .includes(:judge_version, response: [:reviews, :run])
+                                  .includes(:metric_version, response: [:reviews, :run])
                                   .order(created_at: :desc)
                                   .limit(50)
-      @edit_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
-      @suggestion_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
+      @edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
+      @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
       @improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree",
-                                                      judge_version_id: @published_judge_version.id).count
-      @versions = JudgeVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
+                                                      metric_version_id: @published_metric_version.id).count
+      @versions = MetricVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
     end
     def new
@@ -52,9 +52,14 @@ module CompletionKit
     end
     def edit
-      @suggestion_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
-      @edit_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
-      @published_judge_version = JudgeVersion.published.where(metric_id: @metric.id, current: true).first
+      @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
+      @edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
+      @published_metric_version = MetricVersion.published.where(metric_id: @metric.id, current: true).first
+      if @edit_draft
+        @metric.instruction = @edit_draft.instruction
+        @metric.rubric_bands = @edit_draft.rubric_bands
+      end
     end
     def create
@@ -68,10 +73,42 @@ module CompletionKit
     end
     def update
-      if @metric.update(metric_params)
-        redirect_to metric_path(@metric), notice: "Metric was successfully updated."
+      judge_keys = %i[instruction rubric_bands]
+      meta_attrs = metric_params.except(*judge_keys)
+      proposed_instruction = metric_params[:instruction]
+      proposed_rubric = metric_params[:rubric_bands]
+      unless @metric.update(meta_attrs)
+        return render(:edit, status: :unprocessable_entity)
+      end
+      current_instruction = @metric.instruction.to_s
+      current_rubric = @metric.rubric_bands || []
+      normalized_proposed_rubric = normalize_rubric_bands_for_update(proposed_rubric)
+      instruction_changed = !proposed_instruction.nil? && proposed_instruction.to_s != current_instruction
+      rubric_changed = !normalized_proposed_rubric.nil? && normalized_proposed_rubric != current_rubric
+      unless instruction_changed || rubric_changed
+        return redirect_to(metric_path(@metric), notice: "Metric was successfully updated.")
+      end
+      new_instruction = instruction_changed ? proposed_instruction.to_s : current_instruction
+      new_rubric = rubric_changed ? normalized_proposed_rubric : current_rubric
+      if @metric.reviews.exists?
+        MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").destroy_all
+        draft = MetricVersion.create!(
+          metric: @metric, instruction: new_instruction, rubric_bands: new_rubric,
+          state: "draft", source: "edit", current: false
+        )
+        redirect_to edit_metric_path(@metric),
+                    notice: "Saved as draft #{draft.version_label}. Publish to push these changes to the live judge."
       else
-        render :edit, status: :unprocessable_entity
+        @metric.update!(instruction: new_instruction, rubric_bands: new_rubric)
+        current_pub = MetricVersion.published.where(metric_id: @metric.id, current: true).first
+        current_pub&.update!(instruction: @metric.instruction, rubric_bands: @metric.rubric_bands)
+        redirect_to metric_path(@metric), notice: "Metric was successfully updated."
       end
     end
@@ -88,9 +125,9 @@ module CompletionKit
         return
       end
-      JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
+      MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
-      generator = JudgeVariantGenerator.new(@metric, count: 1)
+      generator = MetricVariantGenerator.new(@metric, count: 1)
       variants = generator.call
       if variants.empty?
         redirect_to target, alert: "The model returned no usable variants. Try again with a different model."
@@ -101,18 +138,18 @@ module CompletionKit
     end
     def dismiss_suggestion
-      draft = JudgeVersion.drafts.where(metric_id: @metric.id).find_by(id: params[:draft_id])
+      draft = MetricVersion.drafts.where(metric_id: @metric.id).find_by(id: params[:draft_id])
       draft&.destroy
       target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
       redirect_to target, notice: "Dismissed."
     end
     def publish_draft
-      scope = JudgeVersion.where(metric_id: @metric.id)
+      scope = MetricVersion.where(metric_id: @metric.id)
       version = if params[:draft_id].present?
                   scope.find_by(id: params[:draft_id])
                 else
-                  JudgeVersion.drafts.where(metric_id: @metric.id).order(created_at: :desc).first
+                  MetricVersion.drafts.where(metric_id: @metric.id).order(created_at: :desc).first
                 end
       if version.nil?
@@ -160,5 +197,14 @@ module CompletionKit
       params.require(:metric).permit(:name, :instruction,
         rubric_bands: [:stars, :description], tag_names: [])
     end
+    def normalize_rubric_bands_for_update(bands)
+      return nil if bands.nil?
+      array = bands.is_a?(ActionController::Parameters) ? bands.to_unsafe_h.values : bands
+      Array(array).map do |b|
+        h = b.respond_to?(:to_unsafe_h) ? b.to_unsafe_h : b
+        { "stars" => h["stars"].to_i, "description" => h["description"].to_s }
+      end.sort_by { |b| -b["stars"] }
+    end
   end
 end

data/app/controllers/completion_kit/runs_controller.rb CHANGED Viewed

@@ -126,6 +126,12 @@ module CompletionKit
     end
     def retry_failures
+      if @run.stale_review_summary.any?
+        redirect_to run_path(@run),
+                    alert: "The judge has changed since this run executed. Retrying failed cases would mix scores from two metric versions in the same run. Use 'Re-run with current judge' to refresh everything against the live judge."
+        return
+      end
       scope = @run.responses.where(status: "failed")
       scope = scope.where(id: params[:only]) if params[:only].present?

data/app/jobs/completion_kit/judge_review_job.rb CHANGED Viewed

@@ -62,9 +62,11 @@ module CompletionKit
       )
       review = response.reviews.find_or_initialize_by(metric_id: metric.id)
+      current_metric_version = MetricVersion.ensure_current_for(metric)
       review.assign_attributes(
         metric_name: metric.name,
         instruction: metric.instruction.to_s,
+        metric_version_id: current_metric_version.id,
         status: "succeeded",
         ai_score: evaluation[:score],
         ai_feedback: evaluation[:feedback],
@@ -122,6 +124,7 @@ module CompletionKit
     end
     def few_shot_payload(metric)
+      return nil unless CompletionKit.config.judge_calibration_enabled
       Array(metric.few_shot_examples).map do |fs|
         {
           human_score: fs["human_score"],

data/app/models/completion_kit/calibration.rb CHANGED Viewed

@@ -5,7 +5,11 @@ module CompletionKit
     belongs_to :run
     belongs_to :response
     belongs_to :metric
-    belongs_to :judge_version
+    belongs_to :metric_version
+    alias_attribute :judge_version_id, :metric_version_id
+    alias_method :judge_version, :metric_version
+    alias_method :judge_version=, :metric_version=
     validates :verdict, presence: true, inclusion: { in: VERDICTS }
     validates :response_id,
@@ -22,7 +26,7 @@ module CompletionKit
         run_id: run_id,
         response_id: response_id,
         metric_id: metric_id,
-        judge_version_id: judge_version_id,
+        metric_version_id: metric_version_id,
         verdict: verdict,
         corrected_score: corrected_score,
         note: note,

data/app/models/completion_kit/metric.rb CHANGED Viewed

@@ -24,7 +24,6 @@ module CompletionKit
     before_validation :generate_key
     before_validation :normalize_rubric_bands
     before_validation :set_defaults
-    after_update :fork_draft_judge_version, if: :judge_relevant_changes?
     def self.default_rubric_bands
       DEFAULT_RUBRIC_BANDS.map(&:dup)
@@ -98,21 +97,5 @@ module CompletionKit
       self.rubric_bands = self.class.normalize_rubric_bands(rubric_bands) if rubric_bands.present?
     end
-    def judge_relevant_changes?
-      saved_change_to_instruction? || saved_change_to_rubric_bands?
-    end
-    def fork_draft_judge_version
-      JudgeVersion.ensure_current_for(self)
-      JudgeVersion.where(metric_id: id, state: "draft").update_all(current: false)
-      JudgeVersion.create!(
-        metric: self,
-        instruction: instruction,
-        rubric_bands: rubric_bands,
-        current: false,
-        state: "draft",
-        source: "edit"
-      )
-    end
   end
 end

data/app/models/completion_kit/{judge_version.rb → metric_version.rb} RENAMED Viewed

@@ -1,5 +1,5 @@
 module CompletionKit
-  class JudgeVersion < ApplicationRecord
+  class MetricVersion < ApplicationRecord
     STATES = %w[draft published].freeze
     belongs_to :metric
@@ -41,7 +41,7 @@ module CompletionKit
     end
     def publish!
-      JudgeVersion.transaction do
+      MetricVersion.transaction do
         self.class.where(metric_id: metric_id).where.not(id: id).update_all(current: false)
         reload
         update!(state: "published", current: true, published_at: published_at || Time.current)
@@ -76,4 +76,6 @@ module CompletionKit
       self.version_number = max + 1
     end
   end
+  JudgeVersion = MetricVersion
 end

data/app/models/completion_kit/review.rb CHANGED Viewed

@@ -5,8 +5,16 @@ module CompletionKit
     belongs_to :response
     belongs_to :metric, optional: true
+    belongs_to :metric_version, optional: true
     has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
+    def stale_against_current_judge?
+      return false unless metric_id && metric_version_id
+      current_id = MetricVersion.current.where(metric_id: metric_id).limit(1).pick(:id)
+      return false if current_id.nil?
+      metric_version_id != current_id
+    end
     validates :metric_name, presence: true
     validates :status, inclusion: { in: STATUSES }
     validates :ai_score, numericality: { greater_than_or_equal_to: 1, less_than_or_equal_to: 5 }, allow_nil: true
@@ -29,6 +37,7 @@ module CompletionKit
     def as_json(options = {})
       {
         id: id, response_id: response_id, metric_id: metric_id,
+        metric_version_id: metric_version_id,
         metric_name: metric_name, ai_score: ai_score,
         ai_feedback: ai_feedback, status: status, attempts: attempts,
         error: error_payload

data/app/models/completion_kit/run.rb CHANGED Viewed

@@ -89,6 +89,34 @@ module CompletionKit
       end
     end
+    def stale_review_summary
+      review_pairs = Review.where(response_id: response_ids)
+                          .where.not(metric_id: nil)
+                          .where.not(metric_version_id: nil)
+                          .pluck(:metric_id, :metric_version_id, :metric_name)
+      return {} if review_pairs.empty?
+      metric_ids = review_pairs.map(&:first).uniq
+      version_ids = review_pairs.map { |_, vid, _| vid }.uniq
+      current_by_metric = MetricVersion.current.where(metric_id: metric_ids).pluck(:metric_id, :id, :version_number).each_with_object({}) do |(mid, vid, vnum), h|
+        h[mid] = { id: vid, label: "v#{vnum}" }
+      end
+      label_by_version = MetricVersion.where(id: version_ids).pluck(:id, :version_number).each_with_object({}) { |(vid, vnum), h| h[vid] = "v#{vnum}" }
+      summary = {}
+      review_pairs.each do |metric_id, version_id, metric_name|
+        current = current_by_metric[metric_id]
+        next if current.nil?
+        next if version_id == current[:id]
+        label = label_by_version[version_id]
+        next if label.nil?
+        summary[metric_id] ||= { metric_name: metric_name, current_label: current[:label], stale_count: 0, scored_labels: [] }
+        summary[metric_id][:stale_count] += 1
+        summary[metric_id][:scored_labels] |= [label]
+      end
+      summary
+    end
     def start!
       rows = if dataset
                CsvProcessor.process_self(self)

data/app/services/completion_kit/mcp_tools/calibrations.rb CHANGED Viewed

@@ -56,7 +56,7 @@ module CompletionKit
           run_id: run.id, response_id: response.id, metric_id: metric.id, created_by: created_by
         )
         calibration.assign_attributes(
-          judge_version: CompletionKit::JudgeVersion.ensure_current_for(metric),
+          metric_version: CompletionKit::MetricVersion.ensure_current_for(metric),
           verdict: args["verdict"],
           corrected_score: args["corrected_score"],
           note: args["note"]

data/app/services/completion_kit/mcp_tools/judges.rb CHANGED Viewed

@@ -5,7 +5,7 @@ module CompletionKit
       TOOLS = {
         "judges_suggest" => {
-          description: "Ask the model to rewrite the metric's judge instruction in N variants targeted at the recent disagreements. Each variant is saved as a draft JudgeVersion with source=\"suggestion\". Returns the persisted drafts. Stripe-metering hooks fire via ActiveSupport::Notifications under completion_kit.judge_suggestion.generated.",
+          description: "Ask the model to rewrite the metric's judge instruction in N variants targeted at the recent disagreements. Each variant is saved as a draft MetricVersion with source=\"suggestion\". Returns the persisted drafts. Stripe-metering hooks fire via ActiveSupport::Notifications under completion_kit.judge_suggestion.generated.",
           inputSchema: {
             type: "object",
             properties: {
@@ -33,15 +33,15 @@ module CompletionKit
           handler: :replay
         },
         "judges_compare" => {
-          description: "Compare two judge versions' calibration stats side by side. Pass either two judge_version_ids or one metric_id with judge_version_a_id / judge_version_b_id.",
+          description: "Compare two metric versions' calibration stats side by side. Pass either two metric_version_ids or one metric_id with metric_version_a_id / metric_version_b_id.",
           inputSchema: {
             type: "object",
             properties: {
               metric_id: { type: "integer" },
-              judge_version_a_id: { type: "integer" },
-              judge_version_b_id: { type: "integer" }
+              metric_version_a_id: { type: "integer" },
+              metric_version_b_id: { type: "integer" }
             },
-            required: ["metric_id", "judge_version_a_id", "judge_version_b_id"]
+            required: ["metric_id", "metric_version_a_id", "metric_version_b_id"]
           },
           handler: :compare
         }
@@ -49,7 +49,7 @@ module CompletionKit
       def self.suggest(args)
         metric = CompletionKit::Metric.find(args["metric_id"])
-        generator = CompletionKit::JudgeVariantGenerator.new(metric, count: args["count"].to_i, model: args["model"])
+        generator = CompletionKit::MetricVariantGenerator.new(metric, count: args["count"].to_i, model: args["model"])
         variants = generator.call
         return error_result("Variant generator returned no parseable variants. Try again or change the model.") if variants.empty?
         versions = generator.persist!(variants)
@@ -75,20 +75,22 @@ module CompletionKit
       def self.compare(args)
         metric = CompletionKit::Metric.find(args["metric_id"])
-        a = CompletionKit::JudgeVersion.where(metric_id: metric.id).find(args["judge_version_a_id"])
-        b = CompletionKit::JudgeVersion.where(metric_id: metric.id).find(args["judge_version_b_id"])
-        stats_a = CompletionKit::MetricCalibrationStats.for(metric, judge_version: a)
-        stats_b = CompletionKit::MetricCalibrationStats.for(metric, judge_version: b)
+        a_id = args["metric_version_a_id"] || args["judge_version_a_id"]
+        b_id = args["metric_version_b_id"] || args["judge_version_b_id"]
+        a = CompletionKit::MetricVersion.where(metric_id: metric.id).find(a_id)
+        b = CompletionKit::MetricVersion.where(metric_id: metric.id).find(b_id)
+        stats_a = CompletionKit::MetricCalibrationStats.for(metric, metric_version: a)
+        stats_b = CompletionKit::MetricCalibrationStats.for(metric, metric_version: b)
         text_result({
           metric_id: metric.id,
-          a: judge_version_payload(a, stats_a),
-          b: judge_version_payload(b, stats_b),
+          a: metric_version_payload(a, stats_a),
+          b: metric_version_payload(b, stats_b),
           delta: delta_payload(stats_a, stats_b),
           recommendation: recommendation_for(stats_a, stats_b)
         })
       end
-      def self.judge_version_payload(version, stats)
+      def self.metric_version_payload(version, stats)
         {
           id: version.id, state: version.state, current: version.current,
           source: version.source, created_at: version.created_at,

data/app/services/completion_kit/metric_calibration_stats.rb CHANGED Viewed

@@ -33,25 +33,25 @@ module CompletionKit
     CURRENT = :current
-    def self.for(metric, judge_version: CURRENT)
-      resolved = case judge_version
-                 when CURRENT then JudgeVersion.current.find_by(metric_id: metric.id)
+    def self.for(metric, metric_version: CURRENT)
+      resolved = case metric_version
+                 when CURRENT then MetricVersion.current.find_by(metric_id: metric.id)
                  when nil then nil
-                 else judge_version
+                 else metric_version
                  end
-      new(metric: metric, judge_version: resolved, all_versions: judge_version.nil?).call
+      new(metric: metric, metric_version: resolved, all_versions: metric_version.nil?).call
     end
-    def initialize(metric:, judge_version: nil, all_versions: false)
+    def initialize(metric:, metric_version: nil, all_versions: false)
       @metric = metric
-      @judge_version = judge_version
+      @metric_version = metric_version
       @all_versions = all_versions
     end
     def call
       scope = Calibration.where(metric_id: @metric.id)
-      if @judge_version
-        scope = scope.where(judge_version_id: @judge_version.id)
+      if @metric_version
+        scope = scope.where(metric_version_id: @metric_version.id)
       elsif !@all_versions
         scope = scope.none
       end

data/app/services/completion_kit/{judge_variant_generator.rb → metric_variant_generator.rb} RENAMED Viewed

@@ -1,5 +1,5 @@
 module CompletionKit
-  class JudgeVariantGenerator
+  class MetricVariantGenerator
     DEFAULT_VARIANT_COUNT = 1
     MAX_VARIANT_COUNT = 3
     DEFAULT_TEMPERATURE = 0.4
@@ -20,9 +20,9 @@ module CompletionKit
     end
     def persist!(variants)
-      JudgeVersion.where(metric_id: @metric.id, state: "draft", source: "suggestion").update_all(current: false)
+      MetricVersion.where(metric_id: @metric.id, state: "draft", source: "suggestion").update_all(current: false)
       versions = variants.map do |variant|
-        JudgeVersion.create!(
+        MetricVersion.create!(
           metric: @metric,
           instruction: variant.instruction,
           rubric_bands: variant.rubric_bands.presence || @metric.rubric_bands,
@@ -41,8 +41,8 @@ module CompletionKit
     private
     def build_meta_prompt
-      disagreements = JudgeCalibrationExamples.disagreements_for(@metric)
-      borderlines = JudgeCalibrationExamples.borderlines_for(@metric)
+      disagreements = MetricCalibrationExamples.disagreements_for(@metric)
+      borderlines = MetricCalibrationExamples.borderlines_for(@metric)
       sections = []
       sections << "You are an expert evaluator. The judge below is misaligned with humans. Propose #{@count == 1 ? "a single" : "#{@count}"} concrete rewrite that closes the gap."
       sections << ""
@@ -117,7 +117,7 @@ module CompletionKit
     end
   end
-  module JudgeCalibrationExamples
+  module MetricCalibrationExamples
     module_function
     def for(metric, limit: 8)
@@ -134,8 +134,8 @@ module CompletionKit
     def calibrations_for(metric, verdict:, limit:)
       scope = Calibration.where(metric_id: metric.id, verdict: verdict)
-      current_version = JudgeVersion.current.find_by(metric_id: metric.id)
-      scope = scope.where(judge_version_id: current_version.id) if current_version
+      current_version = MetricVersion.current.find_by(metric_id: metric.id)
+      scope = scope.where(metric_version_id: current_version.id) if current_version
       scope.includes(response: :reviews)
            .order(created_at: :desc)
            .limit(limit)

data/app/views/completion_kit/calibrations/_trust_panel.html.erb CHANGED Viewed

@@ -3,7 +3,12 @@
 <% anchor = metric&.name&.parameterize %>
 <% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric
      created_by = CompletionKit.config.username.presence || "operator"
-     verdicted_ids = CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by).pluck(:response_id)
+     current_metric_version = CompletionKit::MetricVersion.current.find_by(metric_id: metric.id)
+     verdicted_ids = if current_metric_version
+       CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
+     else
+       []
+     end
      CompletionKit::Response.joins(:reviews)
        .where(reviews: { metric_id: metric.id })
        .where.not(reviews: { ai_score: nil })

data/app/views/completion_kit/metrics/_form.html.erb CHANGED Viewed

@@ -16,14 +16,14 @@
     </div>
   <% end %>
-  <% if edit_draft && !suggestion %>
-    <% pub = local_assigns[:published_judge_version] %>
+  <% if edit_draft %>
+    <% pub = local_assigns[:published_metric_version] %>
     <% draft_instr_changed = pub && pub.instruction.to_s != edit_draft.instruction.to_s %>
     <% draft_rubric_changed = pub && pub.rubric_bands != edit_draft.rubric_bands %>
     <div class="ck-suggestion-banner" role="status">
       <div class="ck-suggestion-banner__body">
         <p class="ck-kicker">Draft pending</p>
-        <p class="ck-meta-copy">An unpublished draft of this metric is saved. Publish to replace the live<%= " instruction" if draft_instr_changed %><%= " and" if draft_instr_changed && draft_rubric_changed %><%= " rubric" if draft_rubric_changed %> for future runs, or keep editing.</p>
+        <p class="ck-meta-copy">The form below shows your unpublished draft. Publish to replace the live<%= " instruction" if draft_instr_changed %><%= " and" if draft_instr_changed && draft_rubric_changed %><%= " rubric" if draft_rubric_changed %> for future runs, or keep editing.</p>
       </div>
       <div class="ck-suggestion-banner__actions">
         <%= button_to "Discard draft", dismiss_suggestion_metric_path(metric, draft_id: edit_draft.id, back_to: "edit"),

data/app/views/completion_kit/metrics/edit.html.erb CHANGED Viewed

@@ -14,4 +14,4 @@
       metric: @metric,
       suggestion_draft: @suggestion_draft,
       edit_draft: @edit_draft,
-      published_judge_version: @published_judge_version %>
+      published_metric_version: @published_metric_version %>

data/app/views/completion_kit/metrics/show.html.erb CHANGED Viewed

@@ -19,20 +19,17 @@
   </div>
   <div class="ck-actions">
     <% if CompletionKit.config.judge_calibration_enabled %>
-      <% if @suggestion_draft %>
-        <%= link_to "Review improvements →", edit_metric_path(@metric),
+      <% if @suggestion_draft || @edit_draft %>
+        <% review_title = @suggestion_draft ? "The model proposed improvements based on your disagreements. Review and apply what you want." : "An unpublished draft of this metric is saved. Review and publish, or keep editing." %>
+        <%= link_to "Review changes →", edit_metric_path(@metric),
               class: ck_button_classes(:dark),
-              title: "The model proposed improvements based on your disagreements. Review and apply what you want." %>
-      <% elsif @edit_draft %>
-        <%= link_to "Review draft →", edit_metric_path(@metric),
-              class: ck_button_classes(:dark),
-              title: "An unpublished draft of this metric is saved. Review and publish, or keep editing." %>
+              title: review_title %>
       <% elsif @improve_disagreement_count.positive? %>
         <%= button_to "Improve the metric", suggest_variants_metric_path(@metric),
               method: :post, form_class: "inline-block",
               class: ck_button_classes(:light, variant: :outline),
-              title: "Have the model rewrite this metric's instruction and rubric based on the disagreements collected so far.",
-              data: { turbo_confirm: "Rewrite this metric based on the disagreements collected so far?" } %>
+              title: "Ask the model to suggest improvements to this metric's instruction and rubric based on the disagreements collected so far.",
+              data: { turbo_confirm: "Ask the model for suggested improvements based on the disagreements collected so far?" } %>
       <% else %>
         <button type="button" class="<%= ck_button_classes(:light, variant: :outline) %>" disabled
                 title="Mark at least one case as Disagree before the model can suggest a change.">Improve the metric</button>
@@ -168,19 +165,20 @@
       <p class="ck-kicker">Cases to learn from</p>
       <span class="ck-chip"><%= pluralize(@disagreements.size, "case") %></span>
     </div>
-    <p class="ck-meta-copy">Cases where a reviewer's score didn't match the judge's. Pin useful ones with <strong>Remember this</strong> so the judge sees them next time it grades.</p>
+    <% mixed_versions = @disagreements.map(&:metric_version_id).uniq.size > 1 %>
+    <p class="ck-meta-copy">Cases where a reviewer's score didn't match the judge's. Pin useful ones with <strong>Remember this</strong> so the judge sees them next time it grades<%= " (pins flow into the current judge regardless of which version produced the verdict)" if mixed_versions %>.</p>
     <% existing_ids = Array(@metric.few_shot_examples).map { |fs| fs["calibration_id"] } %>
     <ul class="ck-disagreement-list">
       <% @disagreements.each do |cal| %>
         <% review = cal.response.reviews.find { |r| r.metric_id == @metric.id } %>
         <% already = existing_ids.include?(cal.id) %>
-        <% cal_version = cal.judge_version %>
-        <% on_current = cal_version&.id == @published_judge_version.id %>
+        <% cal_metric_version = cal.metric_version %>
+        <% on_current = cal_metric_version&.id == @published_metric_version.id %>
         <li class="ck-disagreement<%= " ck-disagreement--remembered" if already %><%= " ck-disagreement--stale" unless on_current %>">
           <div class="ck-disagreement__head">
             <div class="ck-disagreement__scores">
-              <% if cal_version %>
-                <span class="ck-source-chip <%= on_current ? "ck-source-chip--current" : "ck-source-chip--past" %>" title="<%= on_current ? "Verdict on the live judge version." : "Verdict on a superseded judge version." %>"><%= cal_version.version_label %></span>
+              <% if cal_metric_version && mixed_versions %>
+                <span class="ck-source-chip <%= on_current ? "ck-source-chip--current" : "ck-source-chip--past" %>" title="<%= on_current ? "Verdict on the live judge version." : "Verdict on a superseded judge version." %>"><%= cal_metric_version.version_label %></span>
               <% end %>
               <span class="ck-disagreement__scores-label">Judge</span>
               <% if review&.ai_score %>

data/app/views/completion_kit/responses/show.html.erb CHANGED Viewed

@@ -98,10 +98,15 @@
     <div class="ck-review-list">
       <% @reviews.each do |review| %>
-        <div class="ck-review-card" id="<%= review.metric&.name&.parameterize || "review-#{review.id}" %>">
+        <% review_version = review.metric_version %>
+        <% stale = review.stale_against_current_judge? %>
+        <div class="ck-review-card<%= " ck-review-card--stale" if stale %>" id="<%= review.metric&.name&.parameterize || "review-#{review.id}" %>">
           <div class="ck-review-card__header">
             <span class="ck-review-card__metric"><% if review.metric %><%= link_to review.metric_name, metric_path(review.metric), class: "ck-link" %><% else %><%= review.metric_name %><% end %></span>
             <div class="ck-inline">
+              <% if review_version %>
+                <span class="ck-source-chip <%= stale ? "ck-source-chip--past" : "ck-source-chip--current" %>" title="<%= stale ? "Score produced by #{review_version.version_label} of this metric. The live judge has changed since." : "Score produced by the live judge (#{review_version.version_label})." %>"><%= review_version.version_label %></span>
+              <% end %>
               <% if review.ai_score %>
                 <% 5.times do |i| %>
                   <svg viewBox="0 0 24 24" width="16" height="16" stroke-width="1.75" class="ck-star <%= i < review.ai_score.to_i ? "ck-star--filled" : "ck-star--empty" %>"><polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2"/></svg>
@@ -111,6 +116,9 @@
               <% end %>
             </div>
           </div>
+          <% if stale %>
+            <p class="ck-review-card__stale-note">Scored against a superseded version of this metric. The live judge may score this differently.</p>
+          <% end %>
           <% if review.ai_feedback.present? %>
             <p class="ck-review-card__feedback"><%= review.ai_feedback %></p>
           <% end %>

data/app/views/completion_kit/runs/show.html.erb CHANGED Viewed

@@ -18,6 +18,29 @@
   <% dataset_preview_lines = dataset_lines.first(50) %>
 <% end %>
+<% if CompletionKit.config.judge_calibration_enabled %>
+  <% stale_summary = @run.stale_review_summary %>
+  <% if stale_summary.any? %>
+    <div class="ck-stale-versions-banner" role="status">
+      <div class="ck-stale-versions-banner__body">
+        <p class="ck-kicker">Stale judge versions</p>
+        <p class="ck-meta-copy">
+          This run was scored against metric versions that are no longer live.
+          <% stale_summary.values.each_with_index do |s, i| %>
+            <%= ", " if i > 0 %><strong><%= s[:metric_name] %></strong> (scored by <%= s[:scored_labels].join(", ") %>; live is <%= s[:current_label] %>)<% end %>.
+          Re-run to refresh the scores with the current judge.
+        </p>
+      </div>
+      <% if @run.status == "completed" %>
+        <%= button_to "Re-run with current judge",
+              rerun_run_path(@run), method: :post,
+              class: ck_button_classes(:dark), form_class: "inline-block",
+              data: { turbo_confirm: "Create a new run with the current metric versions? The original run stays as a record." } %>
+      <% end %>
+    </div>
+  <% end %>
+<% end %>
 <div class="ck-run-config">
   <div class="ck-run-config__row">
     <span class="ck-run-config__key">Created</span>

data/db/migrate/20260528000001_rename_judge_version_to_metric_version.rb ADDED Viewed

@@ -0,0 +1,22 @@
+# Renames the judge-version table to metric versions, together with the
+# calibrations column that points at it and the explicitly named indexes.
+class RenameJudgeVersionToMetricVersion < ActiveRecord::Migration[8.1]
+  def change
+    rename_table :completion_kit_judge_versions, :completion_kit_metric_versions
+    rename_column :completion_kit_calibrations, :judge_version_id, :metric_version_id
+    # Old => new index names on the renamed table, applied in the listed order.
+    {
+      "index_ck_judge_versions_on_metric_id" => "index_ck_metric_versions_on_metric_id",
+      "index_ck_judge_versions_on_metric_current" => "index_ck_metric_versions_on_metric_current",
+      "index_ck_judge_versions_on_metric_state" => "index_ck_metric_versions_on_metric_state",
+      "index_ck_judge_versions_on_metric_version" => "index_ck_metric_versions_on_metric_vnum"
+    }.each do |old_name, new_name|
+      rename_index :completion_kit_metric_versions, old_name, new_name
+    end
+    # The calibrations index lives on a different table, so it is renamed on its own.
+    rename_index :completion_kit_calibrations,
+                 "index_ck_calibrations_on_judge_version_id",
+                 "index_ck_calibrations_on_metric_version_id"
+  end
+end

data/db/migrate/20260528000002_add_metric_version_to_reviews.rb ADDED Viewed

@@ -0,0 +1,21 @@
+# Adds a nullable, indexed metric_version_id column to reviews and backfills
+# it for rows that already exist.
+#
+# NOTE(review): every pre-existing review with a metric is attributed to that
+# metric's row flagged `current`; reviews whose metric has no such row keep a
+# NULL metric_version_id.
+class AddMetricVersionToReviews < ActiveRecord::Migration[8.1]
+  def change
+    add_column :completion_kit_reviews, :metric_version_id, :bigint
+    add_index :completion_kit_reviews, :metric_version_id, name: "index_ck_reviews_on_metric_version_id"
+    # The backfill only makes sense when migrating up; on rollback the index
+    # and column above are simply removed again.
+    up_only do
+      # Quote the boolean through the migration's own connection so the literal
+      # matches the adapter of the database that is actually being migrated.
+      execute <<~SQL
+        UPDATE completion_kit_reviews
+        SET metric_version_id = (
+          SELECT mv.id FROM completion_kit_metric_versions mv
+          WHERE mv.metric_id = completion_kit_reviews.metric_id
+            AND mv.current = #{connection.quote(true)}
+          LIMIT 1
+        )
+        WHERE metric_id IS NOT NULL
+      SQL
+    end
+  end
+end

data/lib/completion_kit/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module CompletionKit
-  VERSION = "0.5.43"
+  VERSION = "0.5.44"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: completion-kit
 version: !ruby/object:Gem::Version
-  version: 0.5.43
+  version: 0.5.44
 platform: ruby
 authors:
 - Damien Bastin
@@ -272,11 +272,11 @@ files:
 - app/models/completion_kit/calibration.rb
 - app/models/completion_kit/dashboard_dismissal.rb
 - app/models/completion_kit/dataset.rb
-- app/models/completion_kit/judge_version.rb
 - app/models/completion_kit/mcp_session.rb
 - app/models/completion_kit/metric.rb
 - app/models/completion_kit/metric_group.rb
 - app/models/completion_kit/metric_group_membership.rb
+- app/models/completion_kit/metric_version.rb
 - app/models/completion_kit/model.rb
 - app/models/completion_kit/prompt.rb
 - app/models/completion_kit/provider_credential.rb
@@ -295,7 +295,6 @@ files:
 - app/services/completion_kit/csv_processor.rb
 - app/services/completion_kit/dashboard_stats.rb
 - app/services/completion_kit/judge_service.rb
-- app/services/completion_kit/judge_variant_generator.rb
 - app/services/completion_kit/llm_client.rb
 - app/services/completion_kit/mcp_dispatcher.rb
 - app/services/completion_kit/mcp_tools/base.rb
@@ -310,6 +309,7 @@ files:
 - app/services/completion_kit/mcp_tools/runs.rb
 - app/services/completion_kit/mcp_tools/tags.rb
 - app/services/completion_kit/metric_calibration_stats.rb
+- app/services/completion_kit/metric_variant_generator.rb
 - app/services/completion_kit/model_discovery_service.rb
 - app/services/completion_kit/ollama_client.rb
 - app/services/completion_kit/onboarding/checklist.rb
@@ -422,6 +422,8 @@ files:
 - db/migrate/20260523000002_add_state_to_completion_kit_judge_versions.rb
 - db/migrate/20260524000001_create_completion_kit_starter_metric_dismissals.rb
 - db/migrate/20260525000001_add_version_number_and_published_at_to_judge_versions.rb
+- db/migrate/20260528000001_rename_judge_version_to_metric_version.rb
+- db/migrate/20260528000002_add_metric_version_to_reviews.rb
 - lib/completion-kit.rb
 - lib/completion_kit.rb
 - lib/completion_kit/concurrency_check.rb