RubyGems - completion-kit - Versions diffs - 0.8.0 → 0.10.0 - Mend

completion-kit 0.8.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

data/app/controllers/completion_kit/api/v1/metrics_controller.rb CHANGED Viewed

@@ -2,7 +2,7 @@ module CompletionKit
   module Api
     module V1
       class MetricsController < BaseController
-        before_action :set_metric, only: [:show, :update, :destroy, :suggest_variants, :add_few_shot, :remove_few_shot]
+        before_action :set_metric, only: [:show, :update, :destroy, :suggest_variants]
         def index
           scope = Metric.includes(:tags)
@@ -54,33 +54,6 @@ module CompletionKit
           render json: versions, status: :created
         end
-        def add_few_shot
-          calibration = Calibration.where(metric_id: @metric.id, verdict: "disagree").find(params[:calibration_id])
-          review = calibration.response.reviews.find_by(metric_id: @metric.id)
-          examples = Array(@metric.few_shot_examples)
-          examples << {
-            "input" => calibration.response.input_data.to_s.truncate(2000),
-            "response" => calibration.response.response_text.to_s.truncate(2000),
-            "judge_score" => review&.ai_score&.to_f,
-            "judge_feedback" => review&.ai_feedback.to_s.truncate(1000),
-            "human_score" => calibration.corrected_score&.to_f,
-            "human_note" => calibration.note.to_s.truncate(1000),
-            "calibration_id" => calibration.id,
-            "added_at" => Time.current.utc.iso8601
-          }
-          @metric.update!(few_shot_examples: examples)
-          render json: @metric.reload
-        rescue ActiveRecord::RecordNotFound
-          render_error("Calibration not found or not a disagree on this metric.", status: :not_found)
-        end
-        def remove_few_shot
-          cal_id = params[:calibration_id].to_i
-          remaining = Array(@metric.few_shot_examples).reject { |fs| fs["calibration_id"].to_i == cal_id }
-          @metric.update!(few_shot_examples: remaining)
-          render json: @metric.reload
-        end
         private
         def set_metric

data/app/controllers/completion_kit/metrics_controller.rb CHANGED Viewed

@@ -1,11 +1,13 @@
 module CompletionKit
   class MetricsController < ApplicationController
     include CompletionKit::TagFiltering
-    before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :remove_few_shot, :publish_draft, :suggest_variants, :dismiss_suggestion]
+    before_action :set_metric, only: [:show, :edit, :update, :destroy, :publish_draft, :suggest_variants, :dismiss_suggestion, :exclude_example]
+    before_action :ensure_examples_from_reviews_enabled, only: [:exclude_example]
     def index
       @metrics = apply_tag_filter(Metric.includes(:metric_groups, :tags).order(:name))
       @available_starters = StarterMetrics.available
+      @current_versions = MetricVersion.published.current.where(metric_id: @metrics.map(&:id)).index_by(&:metric_id)
     end
     def starter_preview
@@ -35,15 +37,11 @@ module CompletionKit
     end
     def show
-      @published_metric_version = MetricVersion.ensure_current_for(@metric)
-      @disagreements = Calibration.where(metric_id: @metric.id, verdict: "disagree")
-                                  .includes(:metric_version, response: [:reviews, :run])
-                                  .order(created_at: :desc)
-                                  .limit(50)
       @edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
       @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
       @improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
       @versions = MetricVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
+      @guiding_examples = CompletionKit.config.judge_examples_from_reviews ? MetricCalibrationExamples.judge_examples_for(@metric) : []
     end
     def new
@@ -54,6 +52,7 @@ module CompletionKit
       @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
       @edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
       @published_metric_version = MetricVersion.published.where(metric_id: @metric.id, current: true).first
+      @improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
       if @edit_draft
         @metric.instruction = @edit_draft.instruction
@@ -102,7 +101,7 @@ module CompletionKit
           state: "draft", source: "edit", current: false
         )
         redirect_to edit_metric_path(@metric),
-                    notice: "Saved as draft #{draft.version_label}. Publish to push these changes to the live judge."
+                    notice: "Saved as draft #{draft.version_label}. Publish to make these changes the metric's live version."
       else
         @metric.update!(instruction: new_instruction, rubric_bands: new_rubric)
         current_pub = MetricVersion.published.where(metric_id: @metric.id, current: true).first
@@ -120,7 +119,7 @@ module CompletionKit
       target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
       disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
       if disagreement_count.zero?
-        redirect_to target, alert: "Mark at least one row as Disagree before asking the model to suggest a change."
+        redirect_to target, alert: "Mark at least one case as Disagree before asking the model to suggest a change."
         return
       end
@@ -132,15 +131,31 @@ module CompletionKit
         redirect_to target, alert: "The model returned no usable variants. Try again with a different model."
         return
       end
-      generator.persist!(variants)
-      redirect_to target, notice: "Drafted a new version. Review it below."
+      versions = generator.persist!(variants)
+      new_version = versions.max_by(&:version_number)
+      if params[:back_to] == "edit"
+        redirect_to edit_metric_path(@metric), notice: "Drafted #{new_version.version_label} from your reviews. Review the proposed changes below, then Publish to use it."
+      else
+        redirect_to metric_path(@metric, show_change: new_version.id), notice: "Drafted #{new_version.version_label} from your reviews."
+      end
     end
     def dismiss_suggestion
       draft = MetricVersion.drafts.where(metric_id: @metric.id).find_by(id: params[:draft_id])
+      label = draft&.version_label
       draft&.destroy
       target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
-      redirect_to target, notice: "Dismissed."
+      redirect_to target, notice: label ? "Discarded draft #{label}." : "Draft already gone."
+    end
+    def exclude_example
+      calibration = Calibration.where(metric_id: @metric.id).find(params[:calibration_id])
+      calibration.update!(excluded_from_examples: true)
+      render turbo_stream: turbo_stream.replace(
+        "ck-guiding-#{@metric.id}",
+        partial: "completion_kit/metrics/guiding_examples",
+        locals: { metric: @metric, examples: MetricCalibrationExamples.judge_examples_for(@metric) }
+      )
     end
     def publish_draft
@@ -164,7 +179,7 @@ module CompletionKit
         audit = version.revert!
         prior_label = previously_current.version_label
         redirect_to metric_path(@metric),
-                    notice: "Reverted to #{@metric.name} #{version.version_label} (now logged as #{audit.version_label}). Pinned cases still flow to the judge, and calibration verdicts collected against #{prior_label} stay tied to it."
+                    notice: "Reverted #{@metric.name} to #{version.version_label} (logged as #{audit.version_label}). Human reviews collected against #{prior_label} stay tied to it."
       else
         version.publish!
         redirect_to metric_path(@metric),
@@ -172,33 +187,12 @@ module CompletionKit
       end
     end
-    def add_few_shot
-      calibration = Calibration.where(metric_id: @metric.id, verdict: "disagree").find(params[:calibration_id])
-      review = calibration.response.reviews.find_by(metric_id: @metric.id)
-      examples = Array(@metric.few_shot_examples)
-      examples << {
-        "input" => calibration.response.input_data.to_s.truncate(2000),
-        "response" => calibration.response.response_text.to_s.truncate(2000),
-        "judge_score" => review&.ai_score&.to_f,
-        "judge_feedback" => review&.ai_feedback.to_s.truncate(1000),
-        "human_score" => calibration.corrected_score&.to_f,
-        "human_note" => calibration.note.to_s.truncate(1000),
-        "calibration_id" => calibration.id,
-        "added_at" => Time.current.utc.iso8601
-      }
-      @metric.update!(few_shot_examples: examples)
-      redirect_to metric_path(@metric), notice: "Got it. The judge will remember this next time it grades."
-    end
+    private
-    def remove_few_shot
-      cal_id = params[:calibration_id].to_i
-      remaining = Array(@metric.few_shot_examples).reject { |fs| fs["calibration_id"].to_i == cal_id }
-      @metric.update!(few_shot_examples: remaining)
-      redirect_to metric_path(@metric), notice: "Forgotten. The judge won't see this case next time."
+    def ensure_examples_from_reviews_enabled
+      head :not_found unless CompletionKit.config.judge_examples_from_reviews
     end
-    private
     def set_metric
       @metric = Metric.find(params[:id])
     end

data/app/controllers/completion_kit/runs_controller.rb CHANGED Viewed

@@ -95,7 +95,7 @@ module CompletionKit
     def regrade
       if @run.regrade!
-        redirect_to run_path(@run), notice: "Re-grading existing responses with the current judge."
+        redirect_to run_path(@run), notice: "Re-grading existing responses against the current metrics."
       else
         redirect_to run_path(@run), alert: "Nothing to re-grade. The run has no succeeded responses or no metrics attached."
       end
@@ -151,7 +151,7 @@ module CompletionKit
     def retry_failures
       if @run.stale_review_summary.any?
         redirect_to run_path(@run),
-                    alert: "The judge has changed since this run executed. Retrying failed cases would mix scores from two metric versions in the same run. Use 'Re-run with current judge' to refresh everything against the live judge."
+                    alert: "A metric has a newer version than the one this run was scored against. Retrying failed cases would mix scores from two versions in the same run. Use 'Re-run from scratch' to refresh everything against the current metrics."
         return
       end

data/app/jobs/completion_kit/judge_review_job.rb CHANGED Viewed

@@ -58,8 +58,8 @@ module CompletionKit
         run.prompt&.template,
         criteria: metric.instruction.to_s,
         rubric_text: metric.display_rubric_text,
-        human_examples: few_shot_payload(metric),
-        input_data: response.input_data
+        input_data: response.input_data,
+        human_examples: review_examples_for(metric, response)
       )
       review = response.reviews.find_or_initialize_by(metric_id: metric.id)
@@ -81,9 +81,13 @@ module CompletionKit
     private
-    # A model with supports_judging == nil ("untested") just produced a valid
-    # review — promote it to confirmed. No-op once confirmed (so repeated runs
-    # don't churn the row), and a model already flagged as a bad judge stays so.
+    def review_examples_for(metric, response)
+      return nil unless CompletionKit.config.judge_calibration_enabled
+      return nil unless CompletionKit.config.judge_examples_from_reviews
+      MetricCalibrationExamples.judge_examples_for(metric, exclude_response_id: response.id)
+    end
     def confirm_judging_capability(judge_model_id)
       model = Model.find_by(provider: ApiConfig.provider_for_model(judge_model_id), model_id: judge_model_id)
       return unless model && model.supports_judging.nil?
@@ -116,16 +120,5 @@ module CompletionKit
       response = Response.find_by(id: @response_id)
       RunCompletionCheckJob.perform_later(response.run_id) if response
     end
-    def few_shot_payload(metric)
-      return nil unless CompletionKit.config.judge_calibration_enabled
-      Array(metric.few_shot_examples).map do |fs|
-        {
-          human_score: fs["human_score"],
-          response_text: fs["response"].to_s,
-          human_note: fs["human_note"].to_s
-        }
-      end
-    end
   end
 end

data/app/models/completion_kit/metric.rb CHANGED Viewed

@@ -17,7 +17,6 @@ module CompletionKit
     has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
     serialize :rubric_bands, coder: JSON
-    serialize :few_shot_examples, coder: JSON, type: Array
     validates :name, presence: true
     validates :key, tenant_scoped_uniqueness: { allow_nil: true }

data/app/models/completion_kit/metric_version.rb CHANGED Viewed

@@ -40,6 +40,35 @@ module CompletionKit
       "v#{version_number}"
     end
+    def change_summary_against(previous)
+      return nil if previous.nil?
+      instruction_changed = previous.instruction.to_s.strip != instruction.to_s.strip
+      rubric_changes = rubric_band_change_count(previous)
+      return nil unless instruction_changed || rubric_changes.positive?
+      dimensions = []
+      dimensions << "instruction" if instruction_changed
+      dimensions << "rubric" if rubric_changes.positive?
+      words_changed = 0
+      if instruction_changed
+        old_words = previous.instruction.to_s.split
+        new_words = instruction.to_s.split
+        words_changed = (old_words - new_words).size + (new_words - old_words).size
+      end
+      magnitude = if rubric_changes >= 2 || (instruction_changed && rubric_changes >= 1) || words_changed >= 15
+        :major
+      elsif rubric_changes == 1 || words_changed >= 4
+        :minor
+      else
+        :trivial
+      end
+      { magnitude: magnitude, label: "#{magnitude.to_s.capitalize} #{dimensions.to_sentence} changes" }
+    end
     def publish!
       MetricVersion.transaction do
         self.class.where(metric_id: metric_id).where.not(id: id).update_all(current: false)
@@ -86,6 +115,12 @@ module CompletionKit
     private
+    def rubric_band_change_count(previous)
+      prev = Metric.normalize_rubric_bands(previous.rubric_bands)
+      curr = Metric.normalize_rubric_bands(rubric_bands)
+      prev.zip(curr).count { |p, c| p["description"].to_s.strip != c["description"].to_s.strip }
+    end
     def assign_version_number
       return if version_number.present?
       max = self.class.where(metric_id: metric_id).maximum(:version_number).to_i

data/app/services/completion_kit/judge_service.rb CHANGED Viewed

@@ -10,13 +10,14 @@ module CompletionKit
       @judge_client = LlmClient.for_model(@judge_model, ApiConfig.for_model(@judge_model))
     end
-    def evaluate(output, expected_output = nil, prompt = nil, criteria: nil, rubric_text: nil, human_examples: nil, input_data: nil, **_extras)
+    def evaluate(output, expected_output = nil, prompt = nil, criteria: nil, rubric_text: nil, input_data: nil, human_examples: nil, **_extras)
       raise CompletionKit::ConfigurationError, "Judge not configured" unless @judge_client.configured?
       judge_prompt = build_judge_prompt(output, expected_output, prompt,
         criteria: criteria,
-        rubric_text: rubric_text, human_examples: human_examples,
-        input_data: input_data)
+        rubric_text: rubric_text,
+        input_data: input_data,
+        human_examples: human_examples)
       response = @judge_client.generate_completion(judge_prompt, model: @judge_model)
       raise StandardError, response if response.start_with?("Error:")
@@ -25,7 +26,7 @@ module CompletionKit
     private
-    def build_judge_prompt(output, expected_output, prompt, criteria: nil, rubric_text: nil, human_examples: nil, input_data: nil)
+    def build_judge_prompt(output, expected_output, prompt, criteria: nil, rubric_text: nil, input_data: nil, human_examples: nil)
       judge_prompt = <<~PROMPT
         You are an expert evaluator. You MUST respond with ONLY two lines in this exact format, nothing else:
@@ -42,12 +43,7 @@ module CompletionKit
         judge_prompt += "\nCriteria: #{criteria}\n"
       end
-      if human_examples.present?
-        judge_prompt += "\nCalibration examples:\n"
-        human_examples.each_with_index do |example, index|
-          judge_prompt += "Example #{index + 1}: score=#{example[:human_score]} output=#{example[:response_text].to_s.truncate(200)}\n"
-        end
-      end
+      judge_prompt += human_examples_block(human_examples)
       judge_prompt += <<~PROMPT
@@ -60,6 +56,19 @@ module CompletionKit
       judge_prompt
     end
+    def human_examples_block(examples)
+      return "" if examples.blank?
+      lines = ["", "Reviewed examples where a human corrected the judge on this metric. Weigh them when scoring:"]
+      examples.each_with_index do |example, index|
+        note = example[:human_note].to_s
+        line = "Example #{index + 1}: Output: #{example[:output].to_s.truncate(200)}. The judge scored this #{example[:judge_score].to_i}/5. A reviewer corrected it to #{example[:human_score].to_i}/5"
+        line += note.present? ? ": #{note.truncate(160)}" : "."
+        lines << line
+      end
+      lines.join("\n") + "\n"
+    end
     def parse_judge_response(response)
       score_match = response.match(/\*{0,2}Score:?\*{0,2}\s*(\d+(?:\.\d+)?)/i)
       feedback_match = response.match(/\*{0,2}Feedback:?\*{0,2}\s*(.+)/mi)

data/app/services/completion_kit/mcp_tools/metric_versions.rb CHANGED Viewed

@@ -16,7 +16,7 @@ module CompletionKit
           handler: :list
         },
         "metric_versions_publish" => {
-          description: "Publish a MetricVersion as the live judge for its metric. Works for both 'draft → published' and 'revert to an older published version → current'. Transactionally flips current, demotes peers, and writes the version's instruction + rubric_bands back onto the metric so the judge actually uses them.",
+          description: "Publish a MetricVersion as the live version of its metric. Works for both 'draft → published' and 'revert to an older published version → current'. Transactionally flips current, demotes peers, and writes the version's instruction + rubric_bands back onto the metric so the judge grades against it.",
           inputSchema: {
             type: "object",
             properties: {

data/app/services/completion_kit/metric_calibration_examples.rb ADDED Viewed

@@ -0,0 +1,56 @@
+module CompletionKit
+  module MetricCalibrationExamples
+    DEFAULT_JUDGE_EXAMPLE_LIMIT = 5
+    module_function
+    def for(metric, limit: 8)
+      disagreements_for(metric, limit: limit)
+    end
+    def disagreements_for(metric, limit: 8)
+      calibrations_for(metric, verdict: "disagree", limit: limit)
+    end
+    def borderlines_for(metric, limit: 6)
+      calibrations_for(metric, verdict: "borderline", limit: limit)
+    end
+    def judge_examples_for(metric, exclude_response_id: nil, limit: DEFAULT_JUDGE_EXAMPLE_LIMIT)
+      current_version = MetricVersion.current.find_by(metric_id: metric.id)
+      return [] unless current_version
+      relation = Calibration
+                 .where(metric_id: metric.id, metric_version_id: current_version.id, excluded_from_examples: false)
+                 .where.not(corrected_score: nil)
+      relation = relation.where.not(response_id: exclude_response_id) if exclude_response_id
+      map_examples(relation.includes(response: :reviews).order(created_at: :desc).limit(limit), metric)
+        .reject { |example| example[:judge_score].nil? }
+    end
+    def calibrations_for(metric, verdict:, limit:)
+      base = Calibration.where(metric_id: metric.id, verdict: verdict)
+      current_version = MetricVersion.current.find_by(metric_id: metric.id)
+      scoped = current_version ? base.where(metric_version_id: current_version.id) : base
+      effective = scoped.exists? ? scoped : base
+      map_examples(effective.includes(response: :reviews).order(created_at: :desc).limit(limit), metric)
+    end
+    def map_examples(relation, metric)
+      relation.map do |cal|
+        review = cal.response.reviews.find { |r| r.metric_id == metric.id }
+        {
+          id: cal.id,
+          run_id: cal.run_id,
+          response_id: cal.response_id,
+          input: cal.response.input_data,
+          output: cal.response.response_text,
+          judge_score: review&.ai_score,
+          judge_feedback: review&.ai_feedback,
+          human_score: cal.corrected_score,
+          human_note: cal.note
+        }
+      end
+    end
+  end
+end

data/app/services/completion_kit/metric_variant_generator.rb CHANGED Viewed

@@ -43,7 +43,6 @@ module CompletionKit
     def build_meta_prompt
       disagreements = MetricCalibrationExamples.disagreements_for(@metric)
       borderlines = MetricCalibrationExamples.borderlines_for(@metric)
-      pinned_examples = Array(@metric.few_shot_examples)
       sections = []
       sections << "You are an expert evaluator. The judge below is misaligned with humans. Propose #{@count == 1 ? "a single" : "#{@count}"} concrete rewrite that closes the gap."
       sections << ""
@@ -78,18 +77,6 @@ module CompletionKit
           sections << ""
         end
       end
-      if pinned_examples.any?
-        sections << "## Pinned cases the judge already references"
-        sections << "These are cases the operator pinned for the judge to remember. The improved rubric must remain consistent with these — that is, the new instruction + rubric should produce roughly the human_score on these inputs, not the judge_score."
-        pinned_examples.each_with_index do |ex, i|
-          sections << "### Pinned #{i + 1}"
-          sections << "Input: #{ex["input"].to_s.truncate(200)}"
-          sections << "Output: #{ex["response"].to_s.truncate(200)}"
-          sections << "Judge previously said #{ex["judge_score"]}/5: #{ex["judge_feedback"].to_s.truncate(160)}"
-          sections << "Human said #{ex["human_score"]}/5: #{ex["human_note"].to_s.truncate(160)}"
-          sections << ""
-        end
-      end
       sections << "## Task"
       sections << "Make one substantive change. Don't just reword. If the disagreements look like instruction problems, rewrite the instruction. If they look like rubric problems (overlapping bands, undefined edge cases), rewrite the rubric. Rewrite both if both are wrong."
       sections << ""
@@ -130,40 +117,4 @@ module CompletionKit
     end
   end
-  module MetricCalibrationExamples
-    module_function
-    def for(metric, limit: 8)
-      disagreements_for(metric, limit: limit)
-    end
-    def disagreements_for(metric, limit: 8)
-      calibrations_for(metric, verdict: "disagree", limit: limit)
-    end
-    def borderlines_for(metric, limit: 6)
-      calibrations_for(metric, verdict: "borderline", limit: limit)
-    end
-    def calibrations_for(metric, verdict:, limit:)
-      base = Calibration.where(metric_id: metric.id, verdict: verdict)
-      current_version = MetricVersion.current.find_by(metric_id: metric.id)
-      scoped = current_version ? base.where(metric_version_id: current_version.id) : base
-      effective = scoped.exists? ? scoped : base
-      effective.includes(response: :reviews)
-               .order(created_at: :desc)
-               .limit(limit)
-               .map do |cal|
-        review = cal.response.reviews.find { |r| r.metric_id == metric.id }
-        {
-          input: cal.response.input_data,
-          output: cal.response.response_text,
-          judge_score: review&.ai_score,
-          judge_feedback: review&.ai_feedback,
-          human_score: cal.corrected_score,
-          human_note: cal.note
-        }
-      end
-    end
-  end
 end

data/app/views/completion_kit/api_reference/_body.html.erb CHANGED Viewed

@@ -27,7 +27,7 @@
     <label for="ck-tab-runs" class="ck-api-tabs__label">Runs <span class="ck-api-tabs__count">10</span></label>
     <label for="ck-tab-responses" class="ck-api-tabs__label">Responses <span class="ck-api-tabs__count">2</span></label>
     <label for="ck-tab-datasets" class="ck-api-tabs__label">Datasets <span class="ck-api-tabs__count">5</span></label>
-    <label for="ck-tab-metrics" class="ck-api-tabs__label">Metrics <span class="ck-api-tabs__count">12</span></label>
+    <label for="ck-tab-metrics" class="ck-api-tabs__label">Metrics <span class="ck-api-tabs__count">10</span></label>
     <label for="ck-tab-metric-groups" class="ck-api-tabs__label">Metric Groups <span class="ck-api-tabs__count">5</span></label>
     <label for="ck-tab-calibrations" class="ck-api-tabs__label">Calibrations <span class="ck-api-tabs__count">3</span></label>
     <label for="ck-tab-tags" class="ck-api-tabs__label">Tags <span class="ck-api-tabs__count">5</span></label>
@@ -239,7 +239,7 @@
       <div class="ck-api-endpoint" style="padding-top: 1.5rem;">
         <p class="ck-kicker" style="margin-bottom: 0.5rem;">Calibration loop</p>
-        <p class="ck-meta-copy">Drive metric improvement from disagree-flagged calibrations: ask the model for variants, then pin individual cases as few-shot examples on the metric.</p>
+        <p class="ck-meta-copy">Drive metric improvement from disagree-flagged calibrations: ask the model to rewrite the instruction and rubric into a new draft version.</p>
       </div>
       <div class="ck-api-endpoint">
         <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/metrics/:id/suggest_variants</p>
@@ -247,16 +247,6 @@
         <p class="ck-api-params"><strong>Optional:</strong>&ensp;<code>count</code>, <code>model</code></p>
         <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/metrics/1/suggest_variants \\\n  -H \"Authorization: Bearer #{token}\"" %>
       </div>
-      <div class="ck-api-endpoint">
-        <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/metrics/:id/add_few_shot</p>
-        <p class="ck-meta-copy">Pin a disagree calibration as a few-shot example on the metric. Returns the updated metric, or 404 if the calibration is not a disagree on this metric.</p>
-        <p class="ck-api-params"><strong>Required:</strong>&ensp;<code>calibration_id</code></p>
-      </div>
-      <div class="ck-api-endpoint">
-        <p class="ck-api-method"><span class="ck-chip" style="color: var(--ck-danger);">DELETE</span> /api/v1/metrics/:id/remove_few_shot</p>
-        <p class="ck-meta-copy">Drop the pinned few-shot example by calibration ID.</p>
-        <p class="ck-api-params"><strong>Required:</strong>&ensp;<code>calibration_id</code></p>
-      </div>
       <div class="ck-api-endpoint" style="padding-top: 1.5rem;">
         <p class="ck-kicker" style="margin-bottom: 0.5rem;">Metric versions</p>

data/app/views/completion_kit/calibrations/_trust_panel.html.erb CHANGED Viewed

@@ -1,16 +1,12 @@
 <% stats = local_assigns[:stats] %>
 <% metric = local_assigns[:metric] %>
 <% anchor = metric&.name&.parameterize %>
-<% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric
+<% current_metric_version = metric && CompletionKit::MetricVersion.current.find_by(metric_id: metric.id) %>
+<% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric && current_metric_version
      created_by = CompletionKit.config.username.presence || "operator"
-     current_metric_version = CompletionKit::MetricVersion.current.find_by(metric_id: metric.id)
-     verdicted_ids = if current_metric_version
-       CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
-     else
-       []
-     end
+     verdicted_ids = CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
      CompletionKit::Response.joins(:reviews)
-       .where(reviews: { metric_id: metric.id })
+       .where(reviews: { metric_id: metric.id, metric_version_id: current_metric_version.id })
        .where.not(reviews: { ai_score: nil })
        .where.not(id: verdicted_ids)
        .order(created_at: :desc).first
@@ -22,19 +18,29 @@
    end %>
 <p class="ck-trust-line ck-trust-line--<%= stats.gate %>">
-  <span class="ck-trust-line__label"><%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true" %>Calibration</span>
   <% if stats.sample_size.zero? %>
-    <span class="ck-trust-line__state">Not measured yet.</span>
-    <span class="ck-trust-line__hint">Needs <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts on the judge's scores.<% if prior_version_verdicts > 0 %> (<%= pluralize(prior_version_verdicts, "verdict") %> on prior versions, tied to that version's history.)<% end %><% if target_response %>
-      <%= link_to "Give a verdict →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-trust-line__link" %>
-    <% else %> Run this metric on a dataset, then give a verdict.<% end %></span>
+    <span class="ck-trust-line__lead">Not measured yet.</span>
+    <span class="ck-trust-line__hint"><%= current_metric_version ? "#{current_metric_version.version_label} needs" : "Needs" %> <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> human reviews of the judge's scores.</span>
+    <% if target_response %>
+      <%= link_to "Review a judge's score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
+    <% end %>
   <% elsif stats.counter_only? %>
-    <span class="ck-trust-line__counter"><%= stats.sample_size %>/<%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></span>
-    <span class="ck-trust-line__hint"><%= pluralize(stats.sample_size, "verdict") %><% if stats.short_to_target > 0 %> · <%= stats.short_to_target %> more before this can be measured<% end %><% if target_response %> · <%= link_to "Give another verdict →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-trust-line__link" %><% end %></span>
+    <span class="ck-cal-stat"><span class="ck-cal-stat__label">Human reviews</span> <strong><%= stats.sample_size %> / <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></strong></span>
+    <% if stats.short_to_target > 0 %><span class="ck-trust-line__hint"><%= stats.short_to_target %> more to report a rate</span><% end %>
+    <% if target_response %>
+      <%= link_to "Review another score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
+    <% end %>
   <% else %>
-    <span class="ck-trust-line__score" title="Roughly how often the judge and the humans agreed.">~<%= (stats.agreement_point * 100).round %>%</span>
-    <span class="ck-trust-line__margin" title="The range we're confident the true rate sits in.">±<%= (stats.margin * 100).round %> pt</span>
-    <span class="ck-trust-line__gate" title="<%= stats.firm? ? 'Enough verdicts for a reliable read.' : 'Early read. Keep giving verdicts.' %>"><%= stats.firm? ? "settled" : "early" %></span>
-    <span class="ck-trust-line__hint"><%= pluralize(stats.sample_size, "verdict") %><% if stats.borderline_rate && stats.borderline_rate > 0 %><% level = stats.borderline_rate > 0.30 ? "danger" : stats.borderline_rate > 0.15 ? "warning" : "ok" %> · <span class="ck-trust-line__borderline ck-trust-line__borderline--<%= level %>" title="<%= level == 'ok' ? '' : 'Reviewers said the rubric was unclear here.' %>"><%= (stats.borderline_rate * 100).round %>% unclear</span><% end %></span>
+    <span class="ck-cal-stat"><span class="ck-cal-stat__label">Agreement</span> <strong class="ck-trust-line__figure">~<%= (stats.agreement_point * 100).round %>%</strong></span>
+    <span class="ck-cal-stat"><span class="ck-cal-stat__label">Margin</span> ±<%= (stats.margin * 100).round %> pt</span>
+    <span class="ck-cal-stat"><span class="ck-cal-stat__label">Read</span> <%= stats.firm? ? "settled" : "early" %></span>
+    <span class="ck-cal-stat"><span class="ck-cal-stat__label">Sample</span> <%= stats.sample_size %></span>
+    <% if stats.borderline_rate && stats.borderline_rate > 0 %>
+      <% level = stats.borderline_rate > 0.30 ? "danger" : stats.borderline_rate > 0.15 ? "warning" : "ok" %>
+      <span class="ck-cal-stat"><span class="ck-cal-stat__label">Unclear</span> <span class="ck-trust-line__borderline ck-trust-line__borderline--<%= level %>"><%= (stats.borderline_rate * 100).round %>%</span></span>
+    <% end %>
   <% end %>
 </p>
+<% if stats.sample_size.zero? && prior_version_verdicts > 0 %>
+  <p class="ck-trust-line__aside"><%= pluralize(prior_version_verdicts, "review") %> from an earlier version <%= prior_version_verdicts == 1 ? "doesn't" : "don't" %> count toward this version.</p>
+<% end %>

data/app/views/completion_kit/metrics/_form.html.erb CHANGED Viewed

@@ -40,20 +40,19 @@
   <% if suggestion %>
     <div class="ck-suggestion-banner" role="status">
       <div class="ck-suggestion-banner__body">
-        <p class="ck-kicker">Proposed improvements</p>
-        <p class="ck-meta-copy">Based on your disagreements, the model proposed these changes to the instruction and rubric. Apply pieces inline below, take everything at once, try again, or discard.</p>
+        <p class="ck-kicker ck-kicker--icon"><%= heroicon_tag "sparkles", variant: :outline, class: "ck-magic-icon", "aria-hidden": "true" %>Proposed changes</p>
+        <p class="ck-meta-copy">Based on human reviews, here are some proposed changes to the metric.</p>
       </div>
       <div class="ck-suggestion-banner__actions">
-        <%= button_to "Try again", suggest_variants_metric_path(metric, back_to: "edit"),
-              method: :post, form_class: "inline-block",
-              class: ck_button_classes(:light, variant: :outline),
-              title: "Discard these improvements and ask the model for fresh ones.",
-              data: { turbo_confirm: "Replace these improvements with fresh ones from the model?" } %>
-        <%= button_to "Discard", dismiss_suggestion_metric_path(metric, draft_id: suggestion.id, back_to: "edit"),
-              method: :delete, form_class: "inline-block",
-              class: ck_button_classes(:light, variant: :outline),
-              data: { turbo_confirm: "Drop these improvements?" } %>
-        <%= button_to "Take everything", publish_draft_metric_path(metric, draft_id: suggestion.id),
+        <%= button_to suggest_variants_metric_path(metric, back_to: "edit"),
+              method: :post, form_class: "inline-block", class: "ck-icon-btn",
+              title: "Try again", "aria-label": "Try again",
+              data: { turbo_confirm: "Replace these changes with fresh ones from the model?" } do %><%= heroicon_tag "arrow-path", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
+        <%= button_to dismiss_suggestion_metric_path(metric, draft_id: suggestion.id, back_to: "edit"),
+              method: :delete, form_class: "inline-block", class: "ck-icon-btn",
+              title: "Discard these changes", "aria-label": "Discard",
+              data: { turbo_confirm: "Drop these changes?" } do %><%= heroicon_tag "trash", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
+        <%= button_to "Apply all", publish_draft_metric_path(metric, draft_id: suggestion.id),
               method: :post, form_class: "inline-block",
               class: ck_button_classes(:dark) %>
       </div>