RubyGems - completion-kit - Versions diffs - 0.5.42 → 0.5.44 - Mend

completion-kit 0.5.42 → 0.5.44

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (38)

data/app/controllers/completion_kit/api/v1/runs_controller.rb CHANGED

@@ -45,6 +45,10 @@ module CompletionKit
         end
         def retry_failures
+          if @run.stale_review_summary.any?
+            return render(json: { error: "Judge has changed since this run executed. Retry would mix versions in the same run; use POST /api/v1/runs/:id/rerun instead." }, status: :conflict)
+          end
           scope = @run.responses.where(status: "failed")
           scope = scope.where(id: params[:only]) if params[:only].present?

data/app/controllers/completion_kit/calibrations_controller.rb CHANGED

@@ -18,7 +18,7 @@ module CompletionKit
         run: @run, response: @response, metric: @metric, created_by: created_by
       )
       calibration.assign_attributes(
-        judge_version: JudgeVersion.ensure_current_for(@metric),
+        metric_version: MetricVersion.ensure_current_for(@metric),
         verdict: params[:verdict],
         corrected_score: params[:corrected_score].presence,
         note: params[:note].presence

data/app/controllers/completion_kit/metrics_controller.rb CHANGED

@@ -1,7 +1,7 @@
 module CompletionKit
   class MetricsController < ApplicationController
     include CompletionKit::TagFiltering
-    before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :publish_draft, :suggest_variants, :dismiss_suggestion]
+    before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :remove_few_shot, :publish_draft, :suggest_variants, :dismiss_suggestion]
     def index
       @metrics = apply_tag_filter(Metric.includes(:metric_groups, :tags).order(:name))
@@ -35,14 +35,16 @@ module CompletionKit
     end
     def show
+      @published_metric_version = MetricVersion.ensure_current_for(@metric)
       @disagreements = Calibration.where(metric_id: @metric.id, verdict: "disagree")
-                                  .includes(response: [:reviews, :run])
+                                  .includes(:metric_version, response: [:reviews, :run])
                                   .order(created_at: :desc)
                                   .limit(50)
-      @edit_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
-      @published_judge_version = JudgeVersion.published.where(metric_id: @metric.id, current: true).first
-      @suggestion_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
-      @improve_disagreement_count = @disagreements.size
+      @edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
+      @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
+      @improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree",
+                                                      metric_version_id: @published_metric_version.id).count
+      @versions = MetricVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
     end
     def new
@@ -50,6 +52,14 @@ module CompletionKit
     end
     def edit
+      @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
+      @edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
+      @published_metric_version = MetricVersion.published.where(metric_id: @metric.id, current: true).first
+      if @edit_draft
+        @metric.instruction = @edit_draft.instruction
+        @metric.rubric_bands = @edit_draft.rubric_bands
+      end
     end
     def create
@@ -63,10 +73,42 @@ module CompletionKit
     end
     def update
-      if @metric.update(metric_params)
-        redirect_to metric_path(@metric), notice: "Metric was successfully updated."
+      judge_keys = %i[instruction rubric_bands]
+      meta_attrs = metric_params.except(*judge_keys)
+      proposed_instruction = metric_params[:instruction]
+      proposed_rubric = metric_params[:rubric_bands]
+      unless @metric.update(meta_attrs)
+        return render(:edit, status: :unprocessable_entity)
+      end
+      current_instruction = @metric.instruction.to_s
+      current_rubric = @metric.rubric_bands || []
+      normalized_proposed_rubric = normalize_rubric_bands_for_update(proposed_rubric)
+      instruction_changed = !proposed_instruction.nil? && proposed_instruction.to_s != current_instruction
+      rubric_changed = !normalized_proposed_rubric.nil? && normalized_proposed_rubric != current_rubric
+      unless instruction_changed || rubric_changed
+        return redirect_to(metric_path(@metric), notice: "Metric was successfully updated.")
+      end
+      new_instruction = instruction_changed ? proposed_instruction.to_s : current_instruction
+      new_rubric = rubric_changed ? normalized_proposed_rubric : current_rubric
+      if @metric.reviews.exists?
+        MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").destroy_all
+        draft = MetricVersion.create!(
+          metric: @metric, instruction: new_instruction, rubric_bands: new_rubric,
+          state: "draft", source: "edit", current: false
+        )
+        redirect_to edit_metric_path(@metric),
+                    notice: "Saved as draft #{draft.version_label}. Publish to push these changes to the live judge."
       else
-        render :edit, status: :unprocessable_entity
+        @metric.update!(instruction: new_instruction, rubric_bands: new_rubric)
+        current_pub = MetricVersion.published.where(metric_id: @metric.id, current: true).first
+        current_pub&.update!(instruction: @metric.instruction, rubric_bands: @metric.rubric_bands)
+        redirect_to metric_path(@metric), notice: "Metric was successfully updated."
       end
     end
@@ -76,49 +118,48 @@ module CompletionKit
     end
     def suggest_variants
+      target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
       disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
       if disagreement_count.zero?
-        redirect_to metric_path(@metric), alert: "Mark at least one row as Disagree before asking the model to suggest a change."
+        redirect_to target, alert: "Mark at least one row as Disagree before asking the model to suggest a change."
         return
       end
-      JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
+      MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
-      generator = JudgeVariantGenerator.new(@metric, count: 1)
+      generator = MetricVariantGenerator.new(@metric, count: 1)
       variants = generator.call
       if variants.empty?
-        redirect_to metric_path(@metric), alert: "The model returned no usable variants. Try again with a different model."
+        redirect_to target, alert: "The model returned no usable variants. Try again with a different model."
         return
       end
       generator.persist!(variants)
-      redirect_to metric_path(@metric), notice: "Drafted a new version. Review it below."
+      redirect_to target, notice: "Drafted a new version. Review it below."
     end
     def dismiss_suggestion
-      draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").find_by(id: params[:draft_id])
+      draft = MetricVersion.drafts.where(metric_id: @metric.id).find_by(id: params[:draft_id])
       draft&.destroy
-      redirect_to metric_path(@metric), notice: "Dismissed."
+      target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
+      redirect_to target, notice: "Dismissed."
     end
     def publish_draft
-      scope = JudgeVersion.drafts.where(metric_id: @metric.id)
-      draft = params[:draft_id].present? ? scope.find_by(id: params[:draft_id]) : scope.order(created_at: :desc).first
-      if draft.nil?
-        redirect_to metric_path(@metric), alert: "No draft to publish."
+      scope = MetricVersion.where(metric_id: @metric.id)
+      version = if params[:draft_id].present?
+                  scope.find_by(id: params[:draft_id])
+                else
+                  MetricVersion.drafts.where(metric_id: @metric.id).order(created_at: :desc).first
+                end
+      if version.nil?
+        redirect_to metric_path(@metric), alert: "No version to publish."
         return
       end
-      JudgeVersion.transaction do
-        JudgeVersion.where(metric_id: @metric.id, state: "published").update_all(current: false)
-        draft.update!(state: "published", current: true)
-        @metric.update_columns(
-          instruction: draft.instruction,
-          rubric_bands: Array(draft.rubric_bands).to_json
-        )
-      end
-      redirect_to metric_path(@metric), notice: "This judge version is now live."
+      version.publish!
+      redirect_to metric_path(@metric),
+                  notice: "#{@metric.name} #{version.version_label} is now the published version."
     end
     def add_few_shot
@@ -139,6 +180,13 @@ module CompletionKit
       redirect_to metric_path(@metric), notice: "Got it. The judge will remember this next time it grades."
     end
+    def remove_few_shot
+      cal_id = params[:calibration_id].to_i
+      remaining = Array(@metric.few_shot_examples).reject { |fs| fs["calibration_id"].to_i == cal_id }
+      @metric.update!(few_shot_examples: remaining)
+      redirect_to metric_path(@metric), notice: "Forgotten. The judge won't see this case next time."
+    end
     private
     def set_metric
@@ -149,5 +197,14 @@ module CompletionKit
       params.require(:metric).permit(:name, :instruction,
         rubric_bands: [:stars, :description], tag_names: [])
     end
+    def normalize_rubric_bands_for_update(bands)
+      return nil if bands.nil?
+      array = bands.is_a?(ActionController::Parameters) ? bands.to_unsafe_h.values : bands
+      Array(array).map do |b|
+        h = b.respond_to?(:to_unsafe_h) ? b.to_unsafe_h : b
+        { "stars" => h["stars"].to_i, "description" => h["description"].to_s }
+      end.sort_by { |b| -b["stars"] }
+    end
   end
 end

data/app/controllers/completion_kit/runs_controller.rb CHANGED

@@ -126,6 +126,12 @@ module CompletionKit
     end
     def retry_failures
+      if @run.stale_review_summary.any?
+        redirect_to run_path(@run),
+                    alert: "The judge has changed since this run executed. Retrying failed cases would mix scores from two metric versions in the same run. Use 'Re-run with current judge' to refresh everything against the live judge."
+        return
+      end
       scope = @run.responses.where(status: "failed")
       scope = scope.where(id: params[:only]) if params[:only].present?

data/app/jobs/completion_kit/judge_review_job.rb CHANGED

@@ -57,13 +57,16 @@ module CompletionKit
         run.prompt&.template,
         criteria: metric.instruction.to_s,
         rubric_text: metric.display_rubric_text,
+        human_examples: few_shot_payload(metric),
         input_data: response.input_data
       )
       review = response.reviews.find_or_initialize_by(metric_id: metric.id)
+      current_metric_version = MetricVersion.ensure_current_for(metric)
       review.assign_attributes(
         metric_name: metric.name,
         instruction: metric.instruction.to_s,
+        metric_version_id: current_metric_version.id,
         status: "succeeded",
         ai_score: evaluation[:score],
         ai_feedback: evaluation[:feedback],
@@ -119,5 +122,16 @@ module CompletionKit
       response = Response.find_by(id: response_id)
       RunCompletionCheckJob.perform_later(response.run_id) if response
     end
+    def few_shot_payload(metric)
+      return nil unless CompletionKit.config.judge_calibration_enabled
+      Array(metric.few_shot_examples).map do |fs|
+        {
+          human_score: fs["human_score"],
+          response_text: fs["response"].to_s,
+          human_note: fs["human_note"].to_s
+        }
+      end
+    end
   end
 end

data/app/models/completion_kit/calibration.rb CHANGED

@@ -5,7 +5,11 @@ module CompletionKit
     belongs_to :run
     belongs_to :response
     belongs_to :metric
-    belongs_to :judge_version
+    belongs_to :metric_version
+    alias_attribute :judge_version_id, :metric_version_id
+    alias_method :judge_version, :metric_version
+    alias_method :judge_version=, :metric_version=
     validates :verdict, presence: true, inclusion: { in: VERDICTS }
     validates :response_id,
@@ -22,7 +26,7 @@ module CompletionKit
         run_id: run_id,
         response_id: response_id,
         metric_id: metric_id,
-        judge_version_id: judge_version_id,
+        metric_version_id: metric_version_id,
         verdict: verdict,
         corrected_score: corrected_score,
         note: note,

data/app/models/completion_kit/metric.rb CHANGED

@@ -24,7 +24,6 @@ module CompletionKit
     before_validation :generate_key
     before_validation :normalize_rubric_bands
     before_validation :set_defaults
-    after_update :fork_draft_judge_version, if: :judge_relevant_changes?
     def self.default_rubric_bands
       DEFAULT_RUBRIC_BANDS.map(&:dup)
@@ -98,21 +97,5 @@ module CompletionKit
       self.rubric_bands = self.class.normalize_rubric_bands(rubric_bands) if rubric_bands.present?
     end
-    def judge_relevant_changes?
-      saved_change_to_instruction? || saved_change_to_rubric_bands?
-    end
-    def fork_draft_judge_version
-      JudgeVersion.ensure_current_for(self)
-      JudgeVersion.where(metric_id: id, state: "draft").update_all(current: false)
-      JudgeVersion.create!(
-        metric: self,
-        instruction: instruction,
-        rubric_bands: rubric_bands,
-        current: false,
-        state: "draft",
-        source: "edit"
-      )
-    end
   end
 end

data/app/models/completion_kit/{judge_version.rb → metric_version.rb} RENAMED

@@ -1,5 +1,5 @@
 module CompletionKit
-  class JudgeVersion < ApplicationRecord
+  class MetricVersion < ApplicationRecord
     STATES = %w[draft published].freeze
     belongs_to :metric
@@ -7,8 +7,11 @@ module CompletionKit
     serialize :rubric_bands, coder: JSON
+    before_validation :assign_version_number, on: :create
     validates :metric_id, presence: true
     validates :state, inclusion: { in: STATES }
+    validates :version_number, presence: true, uniqueness: { scope: :metric_id }
     scope :current, -> { where(current: true) }
     scope :published, -> { where(state: "published") }
@@ -20,7 +23,8 @@ module CompletionKit
         instruction: metric.instruction,
         rubric_bands: metric.rubric_bands,
         current: true,
-        state: "published"
+        state: "published",
+        published_at: Time.current
       )
     end
@@ -32,17 +36,46 @@ module CompletionKit
       state == "published"
     end
+    def version_label
+      "v#{version_number}"
+    end
+    def publish!
+      MetricVersion.transaction do
+        self.class.where(metric_id: metric_id).where.not(id: id).update_all(current: false)
+        reload
+        update!(state: "published", current: true, published_at: published_at || Time.current)
+        metric.update_columns(
+          instruction: instruction,
+          rubric_bands: Array(rubric_bands).to_json
+        )
+      end
+      self
+    end
     def as_json(options = {})
       {
         id: id,
         metric_id: metric_id,
+        version_number: version_number,
         instruction: instruction,
         rubric_bands: rubric_bands,
         current: current,
         state: state,
         source: source,
+        published_at: published_at,
         created_at: created_at
       }
     end
+    private
+    def assign_version_number
+      return if version_number.present?
+      max = self.class.where(metric_id: metric_id).maximum(:version_number).to_i
+      self.version_number = max + 1
+    end
   end
+  JudgeVersion = MetricVersion
 end

data/app/models/completion_kit/review.rb CHANGED

@@ -5,8 +5,16 @@ module CompletionKit
     belongs_to :response
     belongs_to :metric, optional: true
+    belongs_to :metric_version, optional: true
     has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
+    def stale_against_current_judge?
+      return false unless metric_id && metric_version_id
+      current_id = MetricVersion.current.where(metric_id: metric_id).limit(1).pick(:id)
+      return false if current_id.nil?
+      metric_version_id != current_id
+    end
     validates :metric_name, presence: true
     validates :status, inclusion: { in: STATUSES }
     validates :ai_score, numericality: { greater_than_or_equal_to: 1, less_than_or_equal_to: 5 }, allow_nil: true
@@ -29,6 +37,7 @@ module CompletionKit
     def as_json(options = {})
       {
         id: id, response_id: response_id, metric_id: metric_id,
+        metric_version_id: metric_version_id,
         metric_name: metric_name, ai_score: ai_score,
         ai_feedback: ai_feedback, status: status, attempts: attempts,
         error: error_payload

data/app/models/completion_kit/run.rb CHANGED

@@ -89,6 +89,34 @@ module CompletionKit
       end
     end
+    def stale_review_summary
+      review_pairs = Review.where(response_id: response_ids)
+                          .where.not(metric_id: nil)
+                          .where.not(metric_version_id: nil)
+                          .pluck(:metric_id, :metric_version_id, :metric_name)
+      return {} if review_pairs.empty?
+      metric_ids = review_pairs.map(&:first).uniq
+      version_ids = review_pairs.map { |_, vid, _| vid }.uniq
+      current_by_metric = MetricVersion.current.where(metric_id: metric_ids).pluck(:metric_id, :id, :version_number).each_with_object({}) do |(mid, vid, vnum), h|
+        h[mid] = { id: vid, label: "v#{vnum}" }
+      end
+      label_by_version = MetricVersion.where(id: version_ids).pluck(:id, :version_number).each_with_object({}) { |(vid, vnum), h| h[vid] = "v#{vnum}" }
+      summary = {}
+      review_pairs.each do |metric_id, version_id, metric_name|
+        current = current_by_metric[metric_id]
+        next if current.nil?
+        next if version_id == current[:id]
+        label = label_by_version[version_id]
+        next if label.nil?
+        summary[metric_id] ||= { metric_name: metric_name, current_label: current[:label], stale_count: 0, scored_labels: [] }
+        summary[metric_id][:stale_count] += 1
+        summary[metric_id][:scored_labels] |= [label]
+      end
+      summary
+    end
     def start!
       rows = if dataset
                CsvProcessor.process_self(self)

data/app/services/completion_kit/mcp_tools/calibrations.rb CHANGED

@@ -56,7 +56,7 @@ module CompletionKit
           run_id: run.id, response_id: response.id, metric_id: metric.id, created_by: created_by
         )
         calibration.assign_attributes(
-          judge_version: CompletionKit::JudgeVersion.ensure_current_for(metric),
+          metric_version: CompletionKit::MetricVersion.ensure_current_for(metric),
           verdict: args["verdict"],
           corrected_score: args["corrected_score"],
           note: args["note"]

data/app/services/completion_kit/mcp_tools/judges.rb CHANGED

@@ -5,7 +5,7 @@ module CompletionKit
       TOOLS = {
         "judges_suggest" => {
-          description: "Ask the model to rewrite the metric's judge instruction in N variants targeted at the recent disagreements. Each variant is saved as a draft JudgeVersion with source=\"suggestion\". Returns the persisted drafts. Stripe-metering hooks fire via ActiveSupport::Notifications under completion_kit.judge_suggestion.generated.",
+          description: "Ask the model to rewrite the metric's judge instruction in N variants targeted at the recent disagreements. Each variant is saved as a draft MetricVersion with source=\"suggestion\". Returns the persisted drafts. Stripe-metering hooks fire via ActiveSupport::Notifications under completion_kit.judge_suggestion.generated.",
           inputSchema: {
             type: "object",
             properties: {
@@ -33,15 +33,15 @@ module CompletionKit
           handler: :replay
         },
         "judges_compare" => {
-          description: "Compare two judge versions' calibration stats side by side. Pass either two judge_version_ids or one metric_id with judge_version_a_id / judge_version_b_id.",
+          description: "Compare two metric versions' calibration stats side by side. Pass either two metric_version_ids or one metric_id with metric_version_a_id / metric_version_b_id.",
           inputSchema: {
             type: "object",
             properties: {
               metric_id: { type: "integer" },
-              judge_version_a_id: { type: "integer" },
-              judge_version_b_id: { type: "integer" }
+              metric_version_a_id: { type: "integer" },
+              metric_version_b_id: { type: "integer" }
             },
-            required: ["metric_id", "judge_version_a_id", "judge_version_b_id"]
+            required: ["metric_id", "metric_version_a_id", "metric_version_b_id"]
           },
           handler: :compare
         }
@@ -49,7 +49,7 @@ module CompletionKit
       def self.suggest(args)
         metric = CompletionKit::Metric.find(args["metric_id"])
-        generator = CompletionKit::JudgeVariantGenerator.new(metric, count: args["count"].to_i, model: args["model"])
+        generator = CompletionKit::MetricVariantGenerator.new(metric, count: args["count"].to_i, model: args["model"])
         variants = generator.call
         return error_result("Variant generator returned no parseable variants. Try again or change the model.") if variants.empty?
         versions = generator.persist!(variants)
@@ -75,20 +75,22 @@ module CompletionKit
       def self.compare(args)
         metric = CompletionKit::Metric.find(args["metric_id"])
-        a = CompletionKit::JudgeVersion.where(metric_id: metric.id).find(args["judge_version_a_id"])
-        b = CompletionKit::JudgeVersion.where(metric_id: metric.id).find(args["judge_version_b_id"])
-        stats_a = CompletionKit::MetricCalibrationStats.for(metric, judge_version: a)
-        stats_b = CompletionKit::MetricCalibrationStats.for(metric, judge_version: b)
+        a_id = args["metric_version_a_id"] || args["judge_version_a_id"]
+        b_id = args["metric_version_b_id"] || args["judge_version_b_id"]
+        a = CompletionKit::MetricVersion.where(metric_id: metric.id).find(a_id)
+        b = CompletionKit::MetricVersion.where(metric_id: metric.id).find(b_id)
+        stats_a = CompletionKit::MetricCalibrationStats.for(metric, metric_version: a)
+        stats_b = CompletionKit::MetricCalibrationStats.for(metric, metric_version: b)
         text_result({
           metric_id: metric.id,
-          a: judge_version_payload(a, stats_a),
-          b: judge_version_payload(b, stats_b),
+          a: metric_version_payload(a, stats_a),
+          b: metric_version_payload(b, stats_b),
           delta: delta_payload(stats_a, stats_b),
           recommendation: recommendation_for(stats_a, stats_b)
         })
       end
-      def self.judge_version_payload(version, stats)
+      def self.metric_version_payload(version, stats)
         {
           id: version.id, state: version.state, current: version.current,
           source: version.source, created_at: version.created_at,

data/app/services/completion_kit/metric_calibration_stats.rb CHANGED

@@ -31,18 +31,30 @@ module CompletionKit
       end
     end
-    def self.for(metric, judge_version: nil)
-      new(metric: metric, judge_version: judge_version).call
+    CURRENT = :current
+    def self.for(metric, metric_version: CURRENT)
+      resolved = case metric_version
+                 when CURRENT then MetricVersion.current.find_by(metric_id: metric.id)
+                 when nil then nil
+                 else metric_version
+                 end
+      new(metric: metric, metric_version: resolved, all_versions: metric_version.nil?).call
     end
-    def initialize(metric:, judge_version: nil)
+    def initialize(metric:, metric_version: nil, all_versions: false)
       @metric = metric
-      @judge_version = judge_version
+      @metric_version = metric_version
+      @all_versions = all_versions
     end
     def call
       scope = Calibration.where(metric_id: @metric.id)
-      scope = scope.where(judge_version_id: @judge_version.id) if @judge_version
+      if @metric_version
+        scope = scope.where(metric_version_id: @metric_version.id)
+      elsif !@all_versions
+        scope = scope.none
+      end
       verdicts = scope.pluck(:verdict, :corrected_score, :response_id)
       n = verdicts.length

data/app/services/completion_kit/{judge_variant_generator.rb → metric_variant_generator.rb} RENAMED

@@ -1,5 +1,5 @@
 module CompletionKit
-  class JudgeVariantGenerator
+  class MetricVariantGenerator
     DEFAULT_VARIANT_COUNT = 1
     MAX_VARIANT_COUNT = 3
     DEFAULT_TEMPERATURE = 0.4
@@ -20,9 +20,9 @@ module CompletionKit
     end
     def persist!(variants)
-      JudgeVersion.where(metric_id: @metric.id, state: "draft", source: "suggestion").update_all(current: false)
+      MetricVersion.where(metric_id: @metric.id, state: "draft", source: "suggestion").update_all(current: false)
       versions = variants.map do |variant|
-        JudgeVersion.create!(
+        MetricVersion.create!(
           metric: @metric,
           instruction: variant.instruction,
           rubric_bands: variant.rubric_bands.presence || @metric.rubric_bands,
@@ -41,8 +41,8 @@ module CompletionKit
     private
     def build_meta_prompt
-      disagreements = JudgeCalibrationExamples.disagreements_for(@metric)
-      borderlines = JudgeCalibrationExamples.borderlines_for(@metric)
+      disagreements = MetricCalibrationExamples.disagreements_for(@metric)
+      borderlines = MetricCalibrationExamples.borderlines_for(@metric)
       sections = []
       sections << "You are an expert evaluator. The judge below is misaligned with humans. Propose #{@count == 1 ? "a single" : "#{@count}"} concrete rewrite that closes the gap."
       sections << ""
@@ -86,7 +86,7 @@ module CompletionKit
       sections << "REASONING: <one short sentence: what changes and why>"
       sections << "INSTRUCTION:"
       sections << "<the rewritten instruction>"
-      sections << "RUBRIC:                  # optional — omit this block if the rubric is unchanged"
+      sections << "RUBRIC:                  # optional. Omit this block if the rubric is unchanged."
       sections << "5: <description for 5 stars>"
       sections << "4: <description for 4 stars>"
       sections << "3: <description for 3 stars>"
@@ -117,7 +117,7 @@ module CompletionKit
     end
   end
-  module JudgeCalibrationExamples
+  module MetricCalibrationExamples
     module_function
     def for(metric, limit: 8)
@@ -133,11 +133,13 @@ module CompletionKit
     end
     def calibrations_for(metric, verdict:, limit:)
-      Calibration.where(metric_id: metric.id, verdict: verdict)
-                 .includes(response: :reviews)
-                 .order(created_at: :desc)
-                 .limit(limit)
-                 .map do |cal|
+      scope = Calibration.where(metric_id: metric.id, verdict: verdict)
+      current_version = MetricVersion.current.find_by(metric_id: metric.id)
+      scope = scope.where(metric_version_id: current_version.id) if current_version
+      scope.includes(response: :reviews)
+           .order(created_at: :desc)
+           .limit(limit)
+           .map do |cal|
         review = cal.response.reviews.find { |r| r.metric_id == metric.id }
         {
           input: cal.response.input_data,

data/app/views/completion_kit/api_reference/_body.html.erb CHANGED

@@ -187,7 +187,7 @@
       </div>
       <%= render "completion_kit/api_reference/resource_list", title: "Your datasets",
             items: datasets.map { |d|
-              { name: d.name, meta: pluralize([d.csv_data.to_s.lines.count - 1, 0].max, "row"),
+              { name: d.name, meta: pluralize([d.csv_data.to_s.lines.count - 1, 0].max, "entry"),
                 url: "#{base_url}/api/v1/datasets/#{d.id}", dom_id: "dataset_ep_#{d.id}" }
             } %>
     </div>