RubyGems - completion-kit - Versions diffs - 0.9.0 → 0.11.0 - Mend

completion-kit 0.9.0 → 0.11.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: ddf80d4e74705494435d5ae2d9f0ed5ce0dd927f32bffcb1a13819076f94bced
-  data.tar.gz: 74eadf6abc0f173d0047c961502aeaaab9b5b3de7dd155a00d5d054fb5b8f6e6
+  metadata.gz: 0b32ec77fb60d07f40e4b83827c2510aaeb695c96c9c6df86e4b42a7ec57516b
+  data.tar.gz: ade912039e4942c87d73c13443bd405533eec2988478e02cd1ccb87550de2783
 SHA512:
-  metadata.gz: fa0c962d8282310584ff52a849eeb7efc3c66debe9246d8231e5e24e55c45e8566b4edf83a19d3a021dc4a41b6241c042e0af3059c1b47ba709412220628ed96
-  data.tar.gz: 20cdeabe363e212a572cbe6b1f08128a7aae88f5a2ef50a3b3d012e5fbef2a64c93571bc6ebb28e971dc819ac57d060a11a3006f1b7a60ebeb265957be221eab
+  metadata.gz: bb8664ea804d59e3761ab385d1af98ecf7d110dd7e68e7003e1a4b2c059c5e377a5e42d350a46310f58c6d8c41c0e31a6aa0cdaf8a6b50b4d9f419e6fa60e474
+  data.tar.gz: 3bbe72cf7e99a4ae899765829ee8bee83703885ebdfa5b6b9f7253f25f2373b1dd22aadfeb071f4b340c1b7d33dfc4e590e4f8a2df2239afbbc305de009af2cf

data/app/assets/stylesheets/completion_kit/application.css CHANGED Viewed

@@ -686,16 +686,6 @@ tr:hover .ck-chip--publish {
   justify-content: space-between;
   gap: 10px;
 }
-.ck-version-state {
-  font-family: var(--ck-mono);
-  font-size: 0.66rem;
-  letter-spacing: 0.07em;
-  text-transform: uppercase;
-  color: var(--ck-dim);
-}
-.ck-version-state--live {
-  color: var(--ck-text);
-}
 .ck-chip--soft {
   background: var(--ck-accent-soft);
@@ -2877,10 +2867,6 @@ select.ck-input {
   line-height: 1.55;
 }
-.ck-review-card--stale {
-  border-left: 2px solid rgba(224, 164, 88, 0.45);
-}
 .ck-stale-versions-banner {
   margin: 0 0 1rem;
   padding: 0.9rem 1rem;
@@ -2908,12 +2894,6 @@ select.ck-input {
 .ck-delta--zero { color: var(--ck-dim); }
 .ck-run-compare-table td { vertical-align: middle; }
-.ck-review-card__stale-note {
-  margin: 0.4rem 0 0;
-  font-family: var(--ck-mono);
-  font-size: 0.78rem;
-  color: var(--ck-warning);
-}
 @media (max-width: 900px) {
   .ck-grid--sidebar,
@@ -3617,9 +3597,10 @@ select.ck-input {
 }
 .ck-metrics-table th:nth-child(1), .ck-metrics-table td:nth-child(1) { width: 18rem; white-space: normal; }
-.ck-metrics-table th:nth-child(2), .ck-metrics-table td:nth-child(2) { width: auto; }
-.ck-metrics-table th:nth-child(3), .ck-metrics-table td:nth-child(3) { width: 16rem; }
-.ck-metrics-table th:nth-child(4), .ck-metrics-table td:nth-child(4) { width: 3rem; }
+.ck-metrics-table th:nth-child(2), .ck-metrics-table td:nth-child(2) { width: 6rem; }
+.ck-metrics-table th:nth-child(3), .ck-metrics-table td:nth-child(3) { width: auto; }
+.ck-metrics-table th:nth-child(4), .ck-metrics-table td:nth-child(4) { width: 16rem; }
+.ck-metrics-table th:nth-child(5), .ck-metrics-table td:nth-child(5) { width: 3rem; }
 .ck-metrics-table td:nth-child(1) strong { overflow-wrap: anywhere; }
 .ck-datasets-table th:nth-child(1), .ck-datasets-table td:nth-child(1) { width: auto; }
@@ -3638,32 +3619,10 @@ select.ck-input {
 .ck-prompt-versions-table th:nth-child(3), .ck-prompt-versions-table td:nth-child(3) { width: 8rem; white-space: nowrap; }
 .ck-prompt-versions-table th:nth-child(4), .ck-prompt-versions-table td:nth-child(4) { width: auto; }
-.ck-metric-versions-table th:nth-child(1), .ck-metric-versions-table td:nth-child(1) { width: 14rem; }
-.ck-metric-versions-table th:nth-child(2), .ck-metric-versions-table td:nth-child(2) { width: auto; }
-.ck-metric-versions-table th:nth-child(3), .ck-metric-versions-table td:nth-child(3) { width: 9rem; white-space: nowrap; }
-.ck-metric-versions-table th:nth-child(4), .ck-metric-versions-table td:nth-child(4) { width: 9rem; white-space: nowrap; }
+.ck-metric-versions-table th:nth-child(1), .ck-metric-versions-table td:nth-child(1) { width: 18rem; }
+.ck-metric-versions-table th:nth-child(2), .ck-metric-versions-table td:nth-child(2) { width: 16rem; white-space: nowrap; }
+.ck-metric-versions-table th:nth-child(3), .ck-metric-versions-table td:nth-child(3) { width: auto; white-space: nowrap; }
-.ck-change-link {
-  background: none;
-  border: 0;
-  padding: 0;
-  cursor: pointer;
-  font-family: inherit;
-  font-size: 0.86rem;
-  text-align: left;
-  color: var(--ck-text);
-}
-.ck-change-link:hover,
-.ck-change-link:focus-visible {
-  color: var(--ck-accent);
-  text-decoration: underline;
-}
-.ck-change-link--trivial {
-  color: var(--ck-dim);
-}
-.ck-change-link--major {
-  color: rgb(217, 119, 6);
-}
 .ck-source-chip {
   display: inline-block;
@@ -5632,6 +5591,11 @@ a.tag-mark {
 .ck-trust-line__hint {
   color: var(--ck-dim);
 }
+.ck-trust-line__aside {
+  margin: 4px 0 0;
+  font-size: 0.78rem;
+  color: var(--ck-muted);
+}
 .ck-cal-stat {
   display: inline-flex;
   align-items: baseline;
@@ -5945,3 +5909,158 @@ a.tag-mark {
 .ck-starter-actions .ck-button {
   line-height: 1;
 }
+.ck-guiding {
+  margin-top: 14px;
+  padding-top: 12px;
+  border-top: 1px solid var(--ck-line);
+}
+.ck-guiding__head {
+  display: flex;
+  align-items: baseline;
+  justify-content: space-between;
+  gap: 12px;
+}
+.ck-guiding__head .ck-kicker--inset {
+  margin-top: 0;
+}
+.ck-guiding__legend {
+  font-family: var(--ck-mono);
+  font-size: 0.64rem;
+  letter-spacing: 0.09em;
+  text-transform: uppercase;
+  color: var(--ck-muted);
+}
+.ck-guiding__list {
+  list-style: none;
+  margin: 8px -8px 0;
+  padding: 0;
+  display: flex;
+  flex-direction: column;
+}
+.ck-guiding__item {
+  display: flex;
+  align-items: center;
+  gap: 12px;
+  padding: 5px 8px;
+  border-radius: 7px;
+  transition: background 0.15s;
+}
+.ck-guiding__item:hover {
+  background: var(--ck-surface-hover);
+}
+.ck-guiding__item:hover .ck-guiding__output {
+  color: var(--ck-text);
+}
+.ck-guiding__link {
+  flex: 1;
+  min-width: 0;
+  display: flex;
+  align-items: center;
+  gap: 12px;
+  text-decoration: none;
+  color: inherit;
+}
+.ck-guiding__output {
+  flex: 1;
+  min-width: 0;
+  overflow: hidden;
+  text-overflow: ellipsis;
+  white-space: nowrap;
+  color: var(--ck-dim);
+  font-size: 0.86rem;
+}
+.ck-guiding__scores {
+  font-family: var(--ck-mono);
+  font-size: 0.78rem;
+  color: var(--ck-text);
+  white-space: nowrap;
+}
+.ck-guiding__judge {
+  color: var(--ck-dim);
+}
+.ck-guiding__human {
+  color: var(--ck-text);
+  font-weight: 600;
+}
+.ck-guiding__item .ck-icon-btn {
+  width: 2rem;
+  height: 2rem;
+}
+.ck-suggestion-status:empty { display: none; }
+.ck-suggestion-status {
+  margin-top: 10px;
+  display: flex;
+  align-items: baseline;
+  gap: 10px;
+  flex-wrap: wrap;
+}
+.ck-scoreboard {
+  margin-bottom: 16px;
+  padding-bottom: 14px;
+  border-bottom: 1px solid var(--ck-line);
+}
+.ck-scoreboard__headline {
+  margin: 0 0 8px;
+  font-size: 0.95rem;
+  color: var(--ck-text);
+}
+.ck-scoreboard__was {
+  font-family: var(--ck-mono);
+  font-size: 0.74rem;
+  color: var(--ck-muted);
+  margin-left: 6px;
+}
+.ck-scoreboard__tally {
+  list-style: none;
+  margin: 0;
+  padding: 0;
+  display: flex;
+  gap: 18px;
+}
+.ck-scoreboard__stat {
+  font-family: var(--ck-mono);
+  font-size: 0.72rem;
+  letter-spacing: 0.06em;
+  text-transform: uppercase;
+  color: var(--ck-muted);
+}
+.ck-scoreboard__stat strong { color: var(--ck-text); }
+.ck-scoreboard__stat--break strong { color: var(--ck-warning); }
+.ck-scoreboard__note {
+  margin: 8px 0 0;
+  font-size: 0.78rem;
+  color: var(--ck-muted);
+}
+.ck-version-change {
+  display: inline-flex;
+  align-items: baseline;
+  gap: 0.6rem;
+}
+.ck-version-score {
+  font-family: var(--ck-mono);
+  font-size: 0.74rem;
+  color: var(--ck-dim);
+}
+.ck-version-score__label {
+  font-size: 0.6rem;
+  letter-spacing: 0.08em;
+  text-transform: uppercase;
+  color: var(--ck-muted);
+  margin-right: 0.2rem;
+}

data/app/controllers/completion_kit/metrics_controller.rb CHANGED Viewed

@@ -1,11 +1,13 @@
 module CompletionKit
   class MetricsController < ApplicationController
     include CompletionKit::TagFiltering
-    before_action :set_metric, only: [:show, :edit, :update, :destroy, :publish_draft, :suggest_variants, :dismiss_suggestion]
+    before_action :set_metric, only: [:show, :edit, :update, :destroy, :publish_draft, :suggest_variants, :dismiss_suggestion, :exclude_example]
+    before_action :ensure_examples_from_reviews_enabled, only: [:exclude_example]
     def index
       @metrics = apply_tag_filter(Metric.includes(:metric_groups, :tags).order(:name))
       @available_starters = StarterMetrics.available
+      @current_versions = MetricVersion.published.current.where(metric_id: @metrics.map(&:id)).index_by(&:metric_id)
     end
     def starter_preview
@@ -39,6 +41,7 @@ module CompletionKit
       @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
       @improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
       @versions = MetricVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
+      @guiding_examples = CompletionKit.config.judge_examples_from_reviews ? MetricCalibrationExamples.judge_examples_for(@metric) : []
     end
     def new
@@ -114,26 +117,22 @@ module CompletionKit
     def suggest_variants
       target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
-      disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
-      if disagreement_count.zero?
+      counts = Calibration.where(metric_id: @metric.id, verdict: %w[agree disagree]).group(:verdict).count
+      if counts["disagree"].to_i.zero?
         redirect_to target, alert: "Mark at least one case as Disagree before asking the model to suggest a change."
         return
       end
-      MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
+      MetricSuggestionJob.perform_later(@metric.id)
-      generator = MetricVariantGenerator.new(@metric, count: 1)
-      variants = generator.call
-      if variants.empty?
-        redirect_to target, alert: "The model returned no usable variants. Try again with a different model."
-        return
-      end
-      versions = generator.persist!(variants)
-      new_version = versions.max_by(&:version_number)
       if params[:back_to] == "edit"
-        redirect_to edit_metric_path(@metric), notice: "Drafted #{new_version.version_label} from your reviews. Review the proposed changes below, then Publish to use it."
+        redirect_to metric_path(@metric), notice: "Drafting a change from your reviews. It will appear here once it's tested."
       else
-        redirect_to metric_path(@metric, show_change: new_version.id), notice: "Drafted #{new_version.version_label} from your reviews."
+        render turbo_stream: turbo_stream.replace(
+          "ck-suggestion-status-#{@metric.id}",
+          partial: "completion_kit/metrics/suggestion_pending",
+          locals: { metric: @metric, count: counts.values.sum }
+        )
       end
     end
@@ -145,6 +144,16 @@ module CompletionKit
       redirect_to target, notice: label ? "Discarded draft #{label}." : "Draft already gone."
     end
+    def exclude_example
+      calibration = Calibration.where(metric_id: @metric.id).find(params[:calibration_id])
+      calibration.update!(excluded_from_examples: true)
+      render turbo_stream: turbo_stream.replace(
+        "ck-guiding-#{@metric.id}",
+        partial: "completion_kit/metrics/guiding_examples",
+        locals: { metric: @metric, examples: MetricCalibrationExamples.judge_examples_for(@metric) }
+      )
+    end
     def publish_draft
       scope = MetricVersion.where(metric_id: @metric.id)
       version = if params[:draft_id].present?
@@ -176,6 +185,10 @@ module CompletionKit
     private
+    def ensure_examples_from_reviews_enabled
+      head :not_found unless CompletionKit.config.judge_examples_from_reviews
+    end
     def set_metric
       @metric = Metric.find(params[:id])
     end

data/app/jobs/completion_kit/judge_review_job.rb CHANGED Viewed

@@ -58,7 +58,8 @@ module CompletionKit
         run.prompt&.template,
         criteria: metric.instruction.to_s,
         rubric_text: metric.display_rubric_text,
-        input_data: response.input_data
+        input_data: response.input_data,
+        human_examples: review_examples_for(metric, response)
       )
       review = response.reviews.find_or_initialize_by(metric_id: metric.id)
@@ -80,9 +81,13 @@ module CompletionKit
     private
-    # A model with supports_judging == nil ("untested") just produced a valid
-    # review — promote it to confirmed. No-op once confirmed (so repeated runs
-    # don't churn the row), and a model already flagged as a bad judge stays so.
+    def review_examples_for(metric, response)
+      return nil unless CompletionKit.config.judge_calibration_enabled
+      return nil unless CompletionKit.config.judge_examples_from_reviews
+      MetricCalibrationExamples.judge_examples_for(metric, exclude_response_id: response.id)
+    end
     def confirm_judging_capability(judge_model_id)
       model = Model.find_by(provider: ApiConfig.provider_for_model(judge_model_id), model_id: judge_model_id)
       return unless model && model.supports_judging.nil?

data/app/jobs/completion_kit/metric_suggestion_job.rb ADDED Viewed

@@ -0,0 +1,46 @@
+require "faraday"
+module CompletionKit
+  class MetricSuggestionJob < ApplicationJob
+    queue_as :llm
+    retry_on Faraday::TimeoutError, Faraday::ConnectionFailed, wait: :polynomially_longer, attempts: 5
+    retry_on CompletionKit::RateLimitError, wait: :polynomially_longer, attempts: 5
+    rescue_from(StandardError) do |error|
+      Rails.error.report(error, handled: true, context: { job: self.class.name })
+      broadcast_status(@metric, partial: "completion_kit/metrics/suggestion_failed", locals: { metric: @metric })
+    end
+    def perform(metric_id)
+      @metric = Metric.find_by(id: metric_id)
+      return unless @metric
+      MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
+      generator = MetricVariantGenerator.new(@metric, count: 1)
+      variants = generator.call
+      if variants.empty?
+        broadcast_status(@metric, partial: "completion_kit/metrics/suggestion_failed", locals: { metric: @metric })
+        return
+      end
+      draft = generator.persist!(variants).max_by(&:version_number)
+      summary = MetricImprovementValidator.new(@metric, draft).call
+      draft.update!(validation_summary: summary)
+      broadcast_status(@metric, partial: "completion_kit/metrics/suggestion_ready", locals: { metric: @metric, draft: draft })
+    end
+    private
+    def broadcast_status(metric, partial:, locals:)
+      html = CompletionKit::ApplicationController.render(partial: partial, locals: locals)
+      Turbo::StreamsChannel.broadcast_replace_to(
+        "metric_#{metric.id}_suggestion",
+        target: "ck-suggestion-status-#{metric.id}",
+        html: html
+      )
+    end
+  end
+end

data/app/models/completion_kit/metric_version.rb CHANGED Viewed

@@ -6,6 +6,7 @@ module CompletionKit
     has_many :calibrations, dependent: :destroy
     serialize :rubric_bands, coder: JSON
+    serialize :validation_summary, coder: JSON
     before_validation :assign_version_number, on: :create

data/app/services/completion_kit/judge_service.rb CHANGED Viewed

@@ -10,13 +10,14 @@ module CompletionKit
       @judge_client = LlmClient.for_model(@judge_model, ApiConfig.for_model(@judge_model))
     end
-    def evaluate(output, expected_output = nil, prompt = nil, criteria: nil, rubric_text: nil, input_data: nil, **_extras)
+    def evaluate(output, expected_output = nil, prompt = nil, criteria: nil, rubric_text: nil, input_data: nil, human_examples: nil, **_extras)
       raise CompletionKit::ConfigurationError, "Judge not configured" unless @judge_client.configured?
       judge_prompt = build_judge_prompt(output, expected_output, prompt,
         criteria: criteria,
         rubric_text: rubric_text,
-        input_data: input_data)
+        input_data: input_data,
+        human_examples: human_examples)
       response = @judge_client.generate_completion(judge_prompt, model: @judge_model)
       raise StandardError, response if response.start_with?("Error:")
@@ -25,7 +26,7 @@ module CompletionKit
     private
-    def build_judge_prompt(output, expected_output, prompt, criteria: nil, rubric_text: nil, input_data: nil)
+    def build_judge_prompt(output, expected_output, prompt, criteria: nil, rubric_text: nil, input_data: nil, human_examples: nil)
       judge_prompt = <<~PROMPT
         You are an expert evaluator. You MUST respond with ONLY two lines in this exact format, nothing else:
@@ -42,6 +43,8 @@ module CompletionKit
         judge_prompt += "\nCriteria: #{criteria}\n"
       end
+      judge_prompt += human_examples_block(human_examples)
       judge_prompt += <<~PROMPT
         Original prompt: #{prompt || "Not provided"}
@@ -53,6 +56,19 @@ module CompletionKit
       judge_prompt
     end
+    def human_examples_block(examples)
+      return "" if examples.blank?
+      lines = ["", "Reviewed examples where a human corrected the judge on this metric. Weigh them when scoring:"]
+      examples.each_with_index do |example, index|
+        note = example[:human_note].to_s
+        line = "Example #{index + 1}: Output: #{example[:output].to_s.truncate(200)}. The judge scored this #{example[:judge_score].to_i}/5. A reviewer corrected it to #{example[:human_score].to_i}/5"
+        line += note.present? ? ": #{note.truncate(160)}" : "."
+        lines << line
+      end
+      lines.join("\n") + "\n"
+    end
     def parse_judge_response(response)
       score_match = response.match(/\*{0,2}Score:?\*{0,2}\s*(\d+(?:\.\d+)?)/i)
       feedback_match = response.match(/\*{0,2}Feedback:?\*{0,2}\s*(.+)/mi)

data/app/services/completion_kit/metric_calibration_examples.rb ADDED Viewed

@@ -0,0 +1,56 @@
+module CompletionKit
+  module MetricCalibrationExamples
+    DEFAULT_JUDGE_EXAMPLE_LIMIT = 5
+    module_function
+    def for(metric, limit: 8)
+      disagreements_for(metric, limit: limit)
+    end
+    def disagreements_for(metric, limit: 8)
+      calibrations_for(metric, verdict: "disagree", limit: limit)
+    end
+    def borderlines_for(metric, limit: 6)
+      calibrations_for(metric, verdict: "borderline", limit: limit)
+    end
+    def judge_examples_for(metric, exclude_response_id: nil, limit: DEFAULT_JUDGE_EXAMPLE_LIMIT)
+      current_version = MetricVersion.current.find_by(metric_id: metric.id)
+      return [] unless current_version
+      relation = Calibration
+                 .where(metric_id: metric.id, metric_version_id: current_version.id, excluded_from_examples: false)
+                 .where.not(corrected_score: nil)
+      relation = relation.where.not(response_id: exclude_response_id) if exclude_response_id
+      map_examples(relation.includes(response: :reviews).order(created_at: :desc).limit(limit), metric)
+        .reject { |example| example[:judge_score].nil? }
+    end
+    def calibrations_for(metric, verdict:, limit:)
+      base = Calibration.where(metric_id: metric.id, verdict: verdict)
+      current_version = MetricVersion.current.find_by(metric_id: metric.id)
+      scoped = current_version ? base.where(metric_version_id: current_version.id) : base
+      effective = scoped.exists? ? scoped : base
+      map_examples(effective.includes(response: :reviews).order(created_at: :desc).limit(limit), metric)
+    end
+    def map_examples(relation, metric)
+      relation.map do |cal|
+        review = cal.response.reviews.find { |r| r.metric_id == metric.id }
+        {
+          id: cal.id,
+          run_id: cal.run_id,
+          response_id: cal.response_id,
+          input: cal.response.input_data,
+          output: cal.response.response_text,
+          judge_score: review&.ai_score,
+          judge_feedback: review&.ai_feedback,
+          human_score: cal.corrected_score,
+          human_note: cal.note
+        }
+      end
+    end
+  end
+end

data/app/services/completion_kit/metric_improvement_validator.rb ADDED Viewed

@@ -0,0 +1,101 @@
+module CompletionKit
+  class MetricImprovementValidator
+    ANSWER_KEY_LIMIT = 30
+    def initialize(metric, candidate, scorer: nil)
+      @metric = metric
+      @candidate = candidate
+      @scorer = scorer || method(:rescore)
+    end
+    def call
+      key = answer_key
+      rows = []
+      key.each do |entry|
+        begin
+          score = @scorer.call(entry[:response], @candidate)
+        rescue StandardError
+          next
+        end
+        rows << classify(entry, score.to_i)
+      end
+      summarize(rows, key.size, key_capped?)
+    end
+    private
+    def answer_key
+      current = MetricVersion.current.find_by(metric_id: @metric.id)
+      return [] unless current
+      base = Calibration.where(metric_id: @metric.id, metric_version_id: current.id, verdict: %w[agree disagree])
+      @key_size_before_cap = base.count
+      base.includes(response: :reviews)
+          .order(created_at: :desc)
+          .limit(ANSWER_KEY_LIMIT)
+          .filter_map do |cal|
+        response = cal.response
+        next unless response.response_text.present?
+        review = response.reviews.find { |r| r.metric_id == @metric.id }
+        position = cal.verdict == "disagree" ? cal.corrected_score : review&.ai_score
+        next if position.nil?
+        { response: response, verdict: cal.verdict, position: position }
+      end
+    end
+    def key_capped?
+      @key_size_before_cap.to_i > ANSWER_KEY_LIMIT
+    end
+    def classify(entry, candidate_score)
+      matched = candidate_score == entry[:position].to_i
+      outcome = if entry[:verdict] == "disagree"
+        matched ? "fix" : "still_off"
+      else
+        matched ? "keep" : "break"
+      end
+      {
+        "response_id" => entry[:response].id,
+        "verdict" => entry[:verdict],
+        "position" => entry[:position].to_i,
+        "candidate_score" => candidate_score,
+        "outcome" => outcome
+      }
+    end
+    def summarize(rows, total, capped)
+      fixes = rows.count { |r| r["outcome"] == "fix" }
+      keeps = rows.count { |r| r["outcome"] == "keep" }
+      breaks = rows.count { |r| r["outcome"] == "break" }
+      still_off = rows.count { |r| r["outcome"] == "still_off" }
+      agreements = rows.count { |r| r["verdict"] == "agree" }
+      {
+        "total" => total,
+        "tested" => rows.size,
+        "capped" => capped,
+        "fixes" => fixes,
+        "keeps" => keeps,
+        "breaks" => breaks,
+        "still_off" => still_off,
+        "before" => agreements,
+        "after" => fixes + keeps,
+        "rows" => rows
+      }
+    end
+    def rescore(response, candidate)
+      run = response.run
+      config = ApiConfig.for_model(run.judge_model).merge(judge_model: run.judge_model)
+      rubric_text = Metric.rubric_text_for(Metric.normalize_rubric_bands(candidate.rubric_bands))
+      result = JudgeService.new(config).evaluate(
+        response.response_text,
+        response.expected_output,
+        run.prompt&.template,
+        criteria: candidate.instruction.to_s,
+        rubric_text: rubric_text,
+        input_data: response.input_data
+      )
+      result[:score]
+    end
+  end
+end

data/app/services/completion_kit/metric_variant_generator.rb CHANGED Viewed

@@ -117,40 +117,4 @@ module CompletionKit
     end
   end
-  module MetricCalibrationExamples
-    module_function
-    def for(metric, limit: 8)
-      disagreements_for(metric, limit: limit)
-    end
-    def disagreements_for(metric, limit: 8)
-      calibrations_for(metric, verdict: "disagree", limit: limit)
-    end
-    def borderlines_for(metric, limit: 6)
-      calibrations_for(metric, verdict: "borderline", limit: limit)
-    end
-    def calibrations_for(metric, verdict:, limit:)
-      base = Calibration.where(metric_id: metric.id, verdict: verdict)
-      current_version = MetricVersion.current.find_by(metric_id: metric.id)
-      scoped = current_version ? base.where(metric_version_id: current_version.id) : base
-      effective = scoped.exists? ? scoped : base
-      effective.includes(response: :reviews)
-               .order(created_at: :desc)
-               .limit(limit)
-               .map do |cal|
-        review = cal.response.reviews.find { |r| r.metric_id == metric.id }
-        {
-          input: cal.response.input_data,
-          output: cal.response.response_text,
-          judge_score: review&.ai_score,
-          judge_feedback: review&.ai_feedback,
-          human_score: cal.corrected_score,
-          human_note: cal.note
-        }
-      end
-    end
-  end
 end

data/app/views/completion_kit/calibrations/_trust_panel.html.erb CHANGED Viewed

@@ -2,15 +2,11 @@
 <% metric = local_assigns[:metric] %>
 <% anchor = metric&.name&.parameterize %>
 <% current_metric_version = metric && CompletionKit::MetricVersion.current.find_by(metric_id: metric.id) %>
-<% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric
+<% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric && current_metric_version
      created_by = CompletionKit.config.username.presence || "operator"
-     verdicted_ids = if current_metric_version
-       CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
-     else
-       []
-     end
+     verdicted_ids = CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
      CompletionKit::Response.joins(:reviews)
-       .where(reviews: { metric_id: metric.id })
+       .where(reviews: { metric_id: metric.id, metric_version_id: current_metric_version.id })
        .where.not(reviews: { ai_score: nil })
        .where.not(id: verdicted_ids)
        .order(created_at: :desc).first
@@ -24,7 +20,7 @@
 <p class="ck-trust-line ck-trust-line--<%= stats.gate %>">
   <% if stats.sample_size.zero? %>
     <span class="ck-trust-line__lead">Not measured yet.</span>
-    <span class="ck-trust-line__hint">Needs <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> human reviews of the judge's scores.<% if prior_version_verdicts > 0 %> (<%= pluralize(prior_version_verdicts, "earlier-version review") %> kept on file.)<% end %></span>
+    <span class="ck-trust-line__hint"><%= current_metric_version ? "#{current_metric_version.version_label} needs" : "Needs" %> <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> human reviews of the judge's scores.</span>
     <% if target_response %>
       <%= link_to "Review a judge's score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
     <% end %>
@@ -35,13 +31,13 @@
       <%= link_to "Review another score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
     <% end %>
   <% else %>
-    <span class="ck-cal-stat"><span class="ck-cal-stat__label">Agreement</span> <strong class="ck-trust-line__figure">~<%= (stats.agreement_point * 100).round %>%</strong></span>
-    <span class="ck-cal-stat"><span class="ck-cal-stat__label">Margin</span> ±<%= (stats.margin * 100).round %> pt</span>
-    <span class="ck-cal-stat"><span class="ck-cal-stat__label">Read</span> <%= stats.firm? ? "settled" : "early" %></span>
-    <span class="ck-cal-stat"><span class="ck-cal-stat__label">Sample</span> <%= stats.sample_size %></span>
+    <span class="ck-cal-stat"><span class="ck-cal-stat__label">Agrees with you</span> <strong class="ck-trust-line__figure">~<%= (stats.agreement_point * 100).round %>%</strong> of <%= stats.sample_size %> reviews</span>
     <% if stats.borderline_rate && stats.borderline_rate > 0 %>
       <% level = stats.borderline_rate > 0.30 ? "danger" : stats.borderline_rate > 0.15 ? "warning" : "ok" %>
       <span class="ck-cal-stat"><span class="ck-cal-stat__label">Unclear</span> <span class="ck-trust-line__borderline ck-trust-line__borderline--<%= level %>"><%= (stats.borderline_rate * 100).round %>%</span></span>
     <% end %>
   <% end %>
 </p>
+<% if stats.sample_size.zero? && prior_version_verdicts > 0 %>
+  <p class="ck-trust-line__aside"><%= pluralize(prior_version_verdicts, "review") %> from an earlier version <%= prior_version_verdicts == 1 ? "doesn't" : "don't" %> count toward this version.</p>
+<% end %>

data/app/views/completion_kit/metrics/_guiding_examples.html.erb ADDED Viewed

@@ -0,0 +1,23 @@
+<div id="ck-guiding-<%= metric.id %>" class="ck-guiding">
+  <% if examples.any? %>
+    <div class="ck-guiding__head">
+      <p class="ck-kicker ck-kicker--inset">Guiding the judge</p>
+      <span class="ck-guiding__legend">Judge &rarr; Human</span>
+    </div>
+    <ul class="ck-guiding__list">
+      <% examples.each do |example| %>
+        <li class="ck-guiding__item">
+          <%= link_to run_response_path(example[:run_id], example[:response_id], anchor: metric.name.parameterize),
+                class: "ck-guiding__link", title: "Open this review" do %>
+            <span class="ck-guiding__output"><%= truncate(example[:output].to_s, length: 90) %></span>
+            <span class="ck-guiding__scores"><span class="ck-guiding__judge"><%= example[:judge_score].to_i %></span> &rarr; <span class="ck-guiding__human"><%= example[:human_score].to_i %></span></span>
+          <% end %>
+          <%= button_to exclude_example_metric_path(metric, calibration_id: example[:id]),
+                method: :post, form_class: "inline-block", class: "ck-icon-btn",
+                title: "Stop using this case", "aria-label": "Stop using this case",
+                data: { turbo_confirm: "Stop using this corrected case to guide the judge?" } do %><%= heroicon_tag "x-mark", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
+        </li>
+      <% end %>
+    </ul>
+  <% end %>
+</div>

data/app/views/completion_kit/metrics/_suggestion_failed.html.erb ADDED Viewed

@@ -0,0 +1,3 @@
+<div id="ck-suggestion-status-<%= metric.id %>" class="ck-suggestion-status">
+  <span class="ck-cal-foot__note">The model returned no usable change. Try again, or review a few more scores first.</span>
+</div>

data/app/views/completion_kit/metrics/_suggestion_pending.html.erb ADDED Viewed

@@ -0,0 +1,3 @@
+<div id="ck-suggestion-status-<%= metric.id %>" class="ck-suggestion-status ck-suggestion-status--pending">
+  <span class="ck-cal-foot__note">Drafting a change and testing it against your <%= pluralize(count, "review") %>…</span>
+</div>

data/app/views/completion_kit/metrics/_suggestion_ready.html.erb ADDED Viewed

@@ -0,0 +1,4 @@
+<div id="ck-suggestion-status-<%= metric.id %>" class="ck-suggestion-status ck-suggestion-status--ready">
+  <span class="ck-cal-foot__note">Drafted <%= draft.version_label %> and tested it against your reviews.</span>
+  <%= link_to "Compare and publish →", CompletionKit::Engine.routes.url_helpers.metric_path(metric, show_change: draft.id), class: "ck-cal-link" %>
+</div>

data/app/views/completion_kit/metrics/_validation_scoreboard.html.erb ADDED Viewed

@@ -0,0 +1,12 @@
+<% s = summary %>
+<div class="ck-scoreboard">
+  <p class="ck-scoreboard__headline">Matches you on <strong><%= s["after"] %> of <%= s["tested"] %></strong> of your reviews <span class="ck-scoreboard__was">was <%= s["before"] %> of <%= s["tested"] %></span></p>
+  <ul class="ck-scoreboard__tally">
+    <li class="ck-scoreboard__stat ck-scoreboard__stat--fix">Fixes <strong><%= s["fixes"] %></strong></li>
+    <li class="ck-scoreboard__stat ck-scoreboard__stat--keep">Keeps <strong><%= s["keeps"] %></strong></li>
+    <li class="ck-scoreboard__stat ck-scoreboard__stat--break">Breaks <strong><%= s["breaks"] %></strong></li>
+  </ul>
+  <% if s["capped"] %>
+    <p class="ck-scoreboard__note">Tested against your 30 most recent reviews.</p>
+  <% end %>
+</div>

data/app/views/completion_kit/metrics/index.html.erb CHANGED Viewed

@@ -18,6 +18,7 @@
     <thead>
       <tr>
         <th scope="col">Name</th>
+        <th scope="col">Version</th>
         <th scope="col">Instruction</th>
         <th scope="col">In groups</th>
         <th scope="col"></th>
@@ -34,6 +35,10 @@
               </div>
             <% end %>
           </td>
+          <td data-label="Version">
+            <% v = @current_versions[metric.id] %>
+            <span class="ck-chip ck-chip--soft"><%= v ? v.version_label : "v1" %></span>
+          </td>
           <td data-label="Instruction" class="ck-meta-copy"><div class="ck-clamp-2"><%= metric.instruction.presence || "—" %></div></td>
           <td data-label="In groups">
             <% groups = metric.metric_groups %>

data/app/views/completion_kit/metrics/show.html.erb CHANGED Viewed

@@ -50,7 +50,6 @@
       <thead>
         <tr>
           <th scope="col">Version</th>
-          <th scope="col">&Delta; Change</th>
           <th scope="col">Source</th>
           <th scope="col">Created</th>
         </tr>
@@ -60,39 +59,37 @@
           <% pred = predecessor_of[v] %>
           <tr>
             <td>
+              <% summary = v.change_summary_against(pred) %>
               <div class="ck-version-cell">
                 <div class="ck-version-cell__label">
                   <strong><%= v.version_label %></strong>
                   <% if v.current? %>
-                    <span class="ck-version-state ck-version-state--live">Published</span>
+                    <span class="ck-chip">Published</span>
                   <% elsif v.draft? %>
-                    <span class="ck-version-state">Draft</span>
                     <%= button_to "Publish", publish_draft_metric_path(@metric, draft_id: v.id),
                           method: :post, form_class: "inline-block",
-                          class: "ck-chip ck-chip--cta" %>
+                          class: "ck-chip ck-chip--publish" %>
                   <% else %>
-                    <span class="ck-version-state">Past</span>
                     <%= button_to "Make current", publish_draft_metric_path(@metric, draft_id: v.id),
                           method: :post, form_class: "inline-block",
                           class: "ck-chip ck-chip--publish",
                           data: { turbo_confirm: "Make #{v.version_label} the version to use? #{v.version_label} will be used in test runs using this metric now. Reviews you have already given stay with the version they were made against." } %>
                   <% end %>
                 </div>
+                <% vs = v.validation_summary %>
+                <% if summary %>
+                  <div class="ck-version-change">
+                    <% if v.draft? && vs.present? %>
+                      <span class="ck-version-score"><span class="ck-version-score__label">Match</span> <%= vs["after"] %>/<%= vs["tested"] %></span>
+                    <% end %>
+                    <button type="button" class="ck-cell-link ck-cell-link--delta" title="What changed from <%= pred.version_label %>" onclick="document.getElementById('ck-mvdiff-<%= v.id %>').showModal()">&Delta;</button>
+                  </div>
+                <% end %>
               </div>
             </td>
-            <td>
-              <% summary = v.change_summary_against(pred) %>
-              <% if summary %>
-                <button type="button" class="ck-change-link ck-change-link--<%= summary[:magnitude] %>"
-                        title="Compare with <%= pred.version_label %>"
-                        onclick="document.getElementById('ck-mvdiff-<%= v.id %>').showModal()"><%= summary[:label] %></button>
-              <% else %>
-                <span class="ck-meta-copy">—</span>
-              <% end %>
-            </td>
             <td>
               <% source_label, source_class = case v.source
-                                              when "suggestion" then ["AI calibration", "ck-source-chip ck-source-chip--ai"]
+                                              when "suggestion" then ["AI suggestion", "ck-source-chip ck-source-chip--ai"]
                                               when "edit" then ["Manual edit", "ck-source-chip ck-source-chip--manual"]
                                               when "revert" then ["Reverted", "ck-source-chip ck-source-chip--revert"]
                                               else ["Original", "ck-source-chip ck-source-chip--initial"]
@@ -119,6 +116,7 @@
   <% @versions.each do |v| %>
     <% pred = predecessor_of[v] %>
     <% next unless v.change_summary_against(pred) %>
+    <% vs = v.validation_summary %>
     <dialog id="ck-mvdiff-<%= v.id %>" class="ck-modal" onclick="if(event.target===this)this.close()">
       <article class="ck-modal__panel" tabindex="-1" onclick="event.stopPropagation()">
         <header class="ck-modal__header">
@@ -129,6 +127,9 @@
           <button type="button" class="ck-modal__close" aria-label="Close" onclick="this.closest('dialog').close()">&times;</button>
         </header>
         <div class="ck-modal__body">
+          <% if v.draft? && vs.present? %>
+            <%= render "completion_kit/metrics/validation_scoreboard", summary: vs %>
+          <% end %>
           <% if pred.instruction.to_s != v.instruction.to_s %>
             <div class="ck-suggest-diff">
               <div class="ck-suggest-diff__pane">
@@ -161,8 +162,10 @@
                     title: "Discard draft #{v.version_label}", "aria-label": "Discard draft #{v.version_label}",
                     data: { turbo_confirm: "Discard draft #{v.version_label}? This can't be undone." } do %><%= heroicon_tag "trash", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
               <%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
+              <% net_negative = vs.present? && (vs["after"].to_i < vs["before"].to_i || vs["breaks"].to_i > vs["fixes"].to_i) %>
               <%= button_to "Publish #{v.version_label} →", publish_draft_metric_path(@metric, draft_id: v.id),
-                    method: :post, form_class: "inline-block", class: ck_button_classes(:dark) %>
+                    method: :post, form_class: "inline-block", class: ck_button_classes(:dark),
+                    data: net_negative ? { turbo_confirm: "This agrees with you less than the current version. Publish anyway?" } : {} %>
             </span>
           <% else %>
             <span class="ck-modal__foot-note">Roll this metric back to this version.</span>
@@ -177,20 +180,11 @@
 <% end %>
 <% if CompletionKit.config.judge_calibration_enabled %>
+  <% draft = @suggestion_draft || @edit_draft %>
   <section class="ck-card ck-card--spaced">
-    <p class="ck-kicker">Calibration</p>
-    <p class="ck-meta-copy">This is a measure of how often the judge's scores match a human reviewer. Review its scores to build that signal; the scores you disagree with become the cases the model learns from when you improve the metric.</p>
-    <%= render "completion_kit/calibrations/trust_panel",
-          stats: CompletionKit::MetricCalibrationStats.for(@metric),
-          metric: @metric %>
-    <% draft = @suggestion_draft || @edit_draft %>
-    <% if draft %>
-      <div class="ck-cal-foot">
-        <span class="ck-cal-foot__note">A draft improvement (<%= draft.version_label %>) is waiting in the Versions table above. Open its change to compare, then Publish to use it.</span>
-      </div>
-    <% elsif @improve_disagreement_count.positive? %>
-      <div class="ck-cal-foot">
-        <span class="ck-cal-foot__note"><%= pluralize(@improve_disagreement_count, "case") %> where a reviewer's score didn't match the judge.</span>
+    <div class="ck-prompt-preview__header">
+      <p class="ck-kicker">Agreement</p>
+      <% if draft.nil? && @improve_disagreement_count.positive? %>
         <%= button_to suggest_variants_metric_path(@metric),
               method: :post, form_class: "inline-block",
               class: ck_button_classes(:light, variant: :outline) + " ck-button--sm",
@@ -198,6 +192,20 @@
           <%= heroicon_tag "sparkles", variant: :outline, class: "ck-magic-icon", "aria-hidden": "true" %>
           Suggest improvements
         <% end %>
+      <% end %>
+    </div>
+    <%= turbo_stream_from "metric_#{@metric.id}_suggestion" %>
+    <div id="ck-suggestion-status-<%= @metric.id %>" class="ck-suggestion-status"></div>
+    <p class="ck-meta-copy">How often the judge lands on the same score you would. Review its scores to build that signal, and improve the metric to raise it.</p>
+    <%= render "completion_kit/calibrations/trust_panel",
+          stats: CompletionKit::MetricCalibrationStats.for(@metric),
+          metric: @metric %>
+    <% if CompletionKit.config.judge_examples_from_reviews %>
+      <%= render "completion_kit/metrics/guiding_examples", metric: @metric, examples: @guiding_examples %>
+    <% end %>
+    <% if draft %>
+      <div class="ck-cal-foot">
+        <span class="ck-cal-foot__note">A draft improvement (<%= draft.version_label %>) is waiting in the Versions table above. Open its change to compare, then Publish to use it.</span>
       </div>
     <% end %>
   </section>

data/app/views/completion_kit/responses/show.html.erb CHANGED Viewed

@@ -100,12 +100,17 @@
       <% @reviews.each do |review| %>
         <% review_version = review.metric_version %>
         <% stale = review.stale_against_current_judge? %>
-        <div class="ck-review-card<%= " ck-review-card--stale" if stale %>" id="<%= review.metric&.name&.parameterize || "review-#{review.id}" %>">
+        <div class="ck-review-card" id="<%= review.metric&.name&.parameterize || "review-#{review.id}" %>">
           <div class="ck-review-card__header">
             <span class="ck-review-card__metric"><% if review.metric %><%= link_to review.metric_name, metric_path(review.metric), class: "ck-link" %><% else %><%= review.metric_name %><% end %></span>
             <div class="ck-inline">
               <% if review_version %>
-                <span class="ck-source-chip <%= stale ? "ck-source-chip--past" : "ck-source-chip--current" %>" title="<%= stale ? "Scored against #{review_version.version_label} of this metric. The metric has been republished since." : "Scored against the metric's current version (#{review_version.version_label})." %>"><%= review_version.version_label %></span>
+                <% if stale %>
+                  <% current_version = CompletionKit::MetricVersion.current.find_by(metric_id: review.metric_id) %>
+                  <span class="ck-source-chip ck-source-chip--past" title="Scored on <%= review_version.version_label %>; the metric is now on <%= current_version.version_label %>, which may score this differently."><%= review_version.version_label %> &rarr; <%= current_version.version_label %></span>
+                <% else %>
+                  <span class="ck-source-chip ck-source-chip--current" title="Scored on the metric's current version (<%= review_version.version_label %>)."><%= review_version.version_label %></span>
+                <% end %>
               <% end %>
               <% if review.ai_score %>
                 <% 5.times do |i| %>
@@ -116,9 +121,6 @@
               <% end %>
             </div>
           </div>
-          <% if stale %>
-            <p class="ck-review-card__stale-note">Scored against a superseded version of this metric. Its current version may score this differently.</p>
-          <% end %>
           <% if review.ai_feedback.present? %>
             <p class="ck-review-card__feedback"><%= review.ai_feedback %></p>
           <% end %>

data/config/routes.rb CHANGED Viewed

@@ -22,6 +22,7 @@ CompletionKit::Engine.routes.draw do
       post :publish_draft
       post :suggest_variants
       delete :dismiss_suggestion
+      post :exclude_example
     end
   end
   resources :metric_groups

data/db/migrate/20260530000001_add_excluded_from_examples_to_completion_kit_calibrations.rb ADDED Viewed

@@ -0,0 +1,5 @@
+class AddExcludedFromExamplesToCompletionKitCalibrations < ActiveRecord::Migration[8.1]
+  def change
+    add_column :completion_kit_calibrations, :excluded_from_examples, :boolean, null: false, default: false
+  end
+end

data/db/migrate/20260531000001_add_validation_summary_to_completion_kit_metric_versions.rb ADDED Viewed

@@ -0,0 +1,5 @@
+class AddValidationSummaryToCompletionKitMetricVersions < ActiveRecord::Migration[8.1]
+  def change
+    add_column :completion_kit_metric_versions, :validation_summary, :text
+  end
+end

data/lib/completion_kit/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module CompletionKit
-  VERSION = "0.9.0"
+  VERSION = "0.11.0"
 end

data/lib/completion_kit.rb CHANGED Viewed

@@ -13,6 +13,7 @@ module CompletionKit
     attr_accessor :api_rate_limit, :web_rate_limit
     attr_accessor :allow_loopback_endpoints
     attr_accessor :judge_calibration_enabled
+    attr_accessor :judge_examples_from_reviews
     def initialize
       @openai_api_key = ENV['OPENAI_API_KEY']
@@ -29,6 +30,7 @@ module CompletionKit
       @allow_loopback_endpoints = true
       @judge_calibration_enabled = true
+      @judge_examples_from_reviews = false
       @api_reference_authentication_partial = "completion_kit/api_reference/authentication"
     end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: completion-kit
 version: !ruby/object:Gem::Version
-  version: 0.9.0
+  version: 0.11.0
 platform: ruby
 authors:
 - Damien Bastin
@@ -266,6 +266,7 @@ files:
 - app/jobs/completion_kit/application_job.rb
 - app/jobs/completion_kit/generate_row_job.rb
 - app/jobs/completion_kit/judge_review_job.rb
+- app/jobs/completion_kit/metric_suggestion_job.rb
 - app/jobs/completion_kit/model_discovery_job.rb
 - app/jobs/completion_kit/run_completion_check_job.rb
 - app/mailers/completion_kit/application_mailer.rb
@@ -311,7 +312,9 @@ files:
 - app/services/completion_kit/mcp_tools/responses.rb
 - app/services/completion_kit/mcp_tools/runs.rb
 - app/services/completion_kit/mcp_tools/tags.rb
+- app/services/completion_kit/metric_calibration_examples.rb
 - app/services/completion_kit/metric_calibration_stats.rb
+- app/services/completion_kit/metric_improvement_validator.rb
 - app/services/completion_kit/metric_variant_generator.rb
 - app/services/completion_kit/model_discovery_service.rb
 - app/services/completion_kit/ollama_client.rb
@@ -350,9 +353,14 @@ files:
 - app/views/completion_kit/metric_groups/new.html.erb
 - app/views/completion_kit/metric_groups/show.html.erb
 - app/views/completion_kit/metrics/_form.html.erb
+- app/views/completion_kit/metrics/_guiding_examples.html.erb
 - app/views/completion_kit/metrics/_rubric_diff.html.erb
 - app/views/completion_kit/metrics/_rubric_hint.html.erb
 - app/views/completion_kit/metrics/_starter_card.html.erb
+- app/views/completion_kit/metrics/_suggestion_failed.html.erb
+- app/views/completion_kit/metrics/_suggestion_pending.html.erb
+- app/views/completion_kit/metrics/_suggestion_ready.html.erb
+- app/views/completion_kit/metrics/_validation_scoreboard.html.erb
 - app/views/completion_kit/metrics/edit.html.erb
 - app/views/completion_kit/metrics/index.html.erb
 - app/views/completion_kit/metrics/new.html.erb
@@ -430,6 +438,8 @@ files:
 - db/migrate/20260528000001_rename_judge_version_to_metric_version.rb
 - db/migrate/20260528000002_add_metric_version_to_reviews.rb
 - db/migrate/20260529000001_remove_few_shot_examples_from_completion_kit_metrics.rb
+- db/migrate/20260530000001_add_excluded_from_examples_to_completion_kit_calibrations.rb
+- db/migrate/20260531000001_add_validation_summary_to_completion_kit_metric_versions.rb
 - lib/completion-kit.rb
 - lib/completion_kit.rb
 - lib/completion_kit/concurrency_check.rb