RubyGems - completion-kit - Versions diffs - 0.9.0 → 0.10.0 - Mend

completion-kit 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

checksums.yaml +4 -4
data/app/assets/stylesheets/completion_kit/application.css +104 -48
data/app/controllers/completion_kit/metrics_controller.rb +18 -1
data/app/jobs/completion_kit/judge_review_job.rb +9 -4
data/app/services/completion_kit/judge_service.rb +19 -3
data/app/services/completion_kit/metric_calibration_examples.rb +56 -0
data/app/services/completion_kit/metric_variant_generator.rb +0 -36
data/app/views/completion_kit/calibrations/_trust_panel.html.erb +7 -8
data/app/views/completion_kit/metrics/_guiding_examples.html.erb +23 -0
data/app/views/completion_kit/metrics/index.html.erb +5 -0
data/app/views/completion_kit/metrics/show.html.erb +22 -28
data/app/views/completion_kit/responses/show.html.erb +7 -5
data/config/routes.rb +1 -0
data/db/migrate/20260530000001_add_excluded_from_examples_to_completion_kit_calibrations.rb +5 -0
data/lib/completion_kit/version.rb +1 -1
data/lib/completion_kit.rb +2 -0
metadata +4 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: ddf80d4e74705494435d5ae2d9f0ed5ce0dd927f32bffcb1a13819076f94bced
-  data.tar.gz: 74eadf6abc0f173d0047c961502aeaaab9b5b3de7dd155a00d5d054fb5b8f6e6
+  metadata.gz: 4772b264a668a86e004f78c7bc2397d93f5266d8dc287b422728d951ab24fbcc
+  data.tar.gz: 63b221e144e930df9978607a78533d354f659ca2bf6141046291168af50d3cd7
 SHA512:
-  metadata.gz: fa0c962d8282310584ff52a849eeb7efc3c66debe9246d8231e5e24e55c45e8566b4edf83a19d3a021dc4a41b6241c042e0af3059c1b47ba709412220628ed96
-  data.tar.gz: 20cdeabe363e212a572cbe6b1f08128a7aae88f5a2ef50a3b3d012e5fbef2a64c93571bc6ebb28e971dc819ac57d060a11a3006f1b7a60ebeb265957be221eab
+  metadata.gz: a3249ae1c734dcee0c6f9410baf0400f4b16e091b220d86e8417dec91ff9943a165bbc6c8368629cc14054c3af7946bdb693f006a5756de40a809c41db5bbe3a
+  data.tar.gz: 541323c93b08f08f32c2f024e3709ee5f3e4e48144cd8e9cb2ba8891312fd9b5712e4ca39cbcc739b64ba61ff0ac890be3528e5bfd9c238e073d640b37b47e90

data/app/assets/stylesheets/completion_kit/application.css CHANGED Viewed

@@ -686,16 +686,6 @@ tr:hover .ck-chip--publish {
   justify-content: space-between;
   gap: 10px;
 }
-.ck-version-state {
-  font-family: var(--ck-mono);
-  font-size: 0.66rem;
-  letter-spacing: 0.07em;
-  text-transform: uppercase;
-  color: var(--ck-dim);
-}
-.ck-version-state--live {
-  color: var(--ck-text);
-}
 .ck-chip--soft {
   background: var(--ck-accent-soft);
@@ -2877,10 +2867,6 @@ select.ck-input {
   line-height: 1.55;
 }
-.ck-review-card--stale {
-  border-left: 2px solid rgba(224, 164, 88, 0.45);
-}
 .ck-stale-versions-banner {
   margin: 0 0 1rem;
   padding: 0.9rem 1rem;
@@ -2908,12 +2894,6 @@ select.ck-input {
 .ck-delta--zero { color: var(--ck-dim); }
 .ck-run-compare-table td { vertical-align: middle; }
-.ck-review-card__stale-note {
-  margin: 0.4rem 0 0;
-  font-family: var(--ck-mono);
-  font-size: 0.78rem;
-  color: var(--ck-warning);
-}
 @media (max-width: 900px) {
   .ck-grid--sidebar,
@@ -3617,9 +3597,10 @@ select.ck-input {
 }
 .ck-metrics-table th:nth-child(1), .ck-metrics-table td:nth-child(1) { width: 18rem; white-space: normal; }
-.ck-metrics-table th:nth-child(2), .ck-metrics-table td:nth-child(2) { width: auto; }
-.ck-metrics-table th:nth-child(3), .ck-metrics-table td:nth-child(3) { width: 16rem; }
-.ck-metrics-table th:nth-child(4), .ck-metrics-table td:nth-child(4) { width: 3rem; }
+.ck-metrics-table th:nth-child(2), .ck-metrics-table td:nth-child(2) { width: 6rem; }
+.ck-metrics-table th:nth-child(3), .ck-metrics-table td:nth-child(3) { width: auto; }
+.ck-metrics-table th:nth-child(4), .ck-metrics-table td:nth-child(4) { width: 16rem; }
+.ck-metrics-table th:nth-child(5), .ck-metrics-table td:nth-child(5) { width: 3rem; }
 .ck-metrics-table td:nth-child(1) strong { overflow-wrap: anywhere; }
 .ck-datasets-table th:nth-child(1), .ck-datasets-table td:nth-child(1) { width: auto; }
@@ -3638,32 +3619,11 @@ select.ck-input {
 .ck-prompt-versions-table th:nth-child(3), .ck-prompt-versions-table td:nth-child(3) { width: 8rem; white-space: nowrap; }
 .ck-prompt-versions-table th:nth-child(4), .ck-prompt-versions-table td:nth-child(4) { width: auto; }
-.ck-metric-versions-table th:nth-child(1), .ck-metric-versions-table td:nth-child(1) { width: 14rem; }
-.ck-metric-versions-table th:nth-child(2), .ck-metric-versions-table td:nth-child(2) { width: auto; }
-.ck-metric-versions-table th:nth-child(3), .ck-metric-versions-table td:nth-child(3) { width: 9rem; white-space: nowrap; }
-.ck-metric-versions-table th:nth-child(4), .ck-metric-versions-table td:nth-child(4) { width: 9rem; white-space: nowrap; }
+.ck-metric-versions-table th:nth-child(1), .ck-metric-versions-table td:nth-child(1) { width: 34%; }
+.ck-metric-versions-table th:nth-child(2), .ck-metric-versions-table td:nth-child(2) { width: 33%; white-space: nowrap; }
+.ck-metric-versions-table th:nth-child(3), .ck-metric-versions-table td:nth-child(3) { width: 33%; white-space: nowrap; }
+.ck-metric-versions-table .ck-version-cell { justify-content: flex-start; gap: 0.75rem; }
-.ck-change-link {
-  background: none;
-  border: 0;
-  padding: 0;
-  cursor: pointer;
-  font-family: inherit;
-  font-size: 0.86rem;
-  text-align: left;
-  color: var(--ck-text);
-}
-.ck-change-link:hover,
-.ck-change-link:focus-visible {
-  color: var(--ck-accent);
-  text-decoration: underline;
-}
-.ck-change-link--trivial {
-  color: var(--ck-dim);
-}
-.ck-change-link--major {
-  color: rgb(217, 119, 6);
-}
 .ck-source-chip {
   display: inline-block;
@@ -5632,6 +5592,11 @@ a.tag-mark {
 .ck-trust-line__hint {
   color: var(--ck-dim);
 }
+.ck-trust-line__aside {
+  margin: 4px 0 0;
+  font-size: 0.78rem;
+  color: var(--ck-muted);
+}
 .ck-cal-stat {
   display: inline-flex;
   align-items: baseline;
@@ -5945,3 +5910,94 @@ a.tag-mark {
 .ck-starter-actions .ck-button {
   line-height: 1;
 }
+.ck-guiding {
+  margin-top: 14px;
+  padding-top: 12px;
+  border-top: 1px solid var(--ck-line);
+}
+.ck-guiding__head {
+  display: flex;
+  align-items: baseline;
+  justify-content: space-between;
+  gap: 12px;
+}
+.ck-guiding__head .ck-kicker--inset {
+  margin-top: 0;
+}
+.ck-guiding__legend {
+  font-family: var(--ck-mono);
+  font-size: 0.64rem;
+  letter-spacing: 0.09em;
+  text-transform: uppercase;
+  color: var(--ck-muted);
+}
+.ck-guiding__list {
+  list-style: none;
+  margin: 8px -8px 0;
+  padding: 0;
+  display: flex;
+  flex-direction: column;
+}
+.ck-guiding__item {
+  display: flex;
+  align-items: center;
+  gap: 12px;
+  padding: 5px 8px;
+  border-radius: 7px;
+  transition: background 0.15s;
+}
+.ck-guiding__item:hover {
+  background: var(--ck-surface-hover);
+}
+.ck-guiding__item:hover .ck-guiding__output {
+  color: var(--ck-text);
+}
+.ck-guiding__link {
+  flex: 1;
+  min-width: 0;
+  display: flex;
+  align-items: center;
+  gap: 12px;
+  text-decoration: none;
+  color: inherit;
+}
+.ck-guiding__output {
+  flex: 1;
+  min-width: 0;
+  overflow: hidden;
+  text-overflow: ellipsis;
+  white-space: nowrap;
+  color: var(--ck-dim);
+  font-size: 0.86rem;
+}
+.ck-guiding__scores {
+  font-family: var(--ck-mono);
+  font-size: 0.78rem;
+  color: var(--ck-text);
+  white-space: nowrap;
+}
+.ck-guiding__judge {
+  color: var(--ck-dim);
+}
+.ck-guiding__human {
+  color: var(--ck-text);
+  font-weight: 600;
+}
+.ck-guiding__item .ck-icon-btn {
+  width: 2rem;
+  height: 2rem;
+}

data/app/controllers/completion_kit/metrics_controller.rb CHANGED Viewed

@@ -1,11 +1,13 @@
 module CompletionKit
   class MetricsController < ApplicationController
     include CompletionKit::TagFiltering
-    before_action :set_metric, only: [:show, :edit, :update, :destroy, :publish_draft, :suggest_variants, :dismiss_suggestion]
+    before_action :set_metric, only: [:show, :edit, :update, :destroy, :publish_draft, :suggest_variants, :dismiss_suggestion, :exclude_example]
+    before_action :ensure_examples_from_reviews_enabled, only: [:exclude_example]
     def index
       @metrics = apply_tag_filter(Metric.includes(:metric_groups, :tags).order(:name))
       @available_starters = StarterMetrics.available
+      @current_versions = MetricVersion.published.current.where(metric_id: @metrics.map(&:id)).index_by(&:metric_id)
     end
     def starter_preview
@@ -39,6 +41,7 @@ module CompletionKit
       @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
       @improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
       @versions = MetricVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
+      @guiding_examples = CompletionKit.config.judge_examples_from_reviews ? MetricCalibrationExamples.judge_examples_for(@metric) : []
     end
     def new
@@ -145,6 +148,16 @@ module CompletionKit
       redirect_to target, notice: label ? "Discarded draft #{label}." : "Draft already gone."
     end
+    def exclude_example
+      calibration = Calibration.where(metric_id: @metric.id).find(params[:calibration_id])
+      calibration.update!(excluded_from_examples: true)
+      render turbo_stream: turbo_stream.replace(
+        "ck-guiding-#{@metric.id}",
+        partial: "completion_kit/metrics/guiding_examples",
+        locals: { metric: @metric, examples: MetricCalibrationExamples.judge_examples_for(@metric) }
+      )
+    end
     def publish_draft
       scope = MetricVersion.where(metric_id: @metric.id)
       version = if params[:draft_id].present?
@@ -176,6 +189,10 @@ module CompletionKit
     private
+    def ensure_examples_from_reviews_enabled
+      head :not_found unless CompletionKit.config.judge_examples_from_reviews
+    end
     def set_metric
       @metric = Metric.find(params[:id])
     end

data/app/jobs/completion_kit/judge_review_job.rb CHANGED Viewed

@@ -58,7 +58,8 @@ module CompletionKit
         run.prompt&.template,
         criteria: metric.instruction.to_s,
         rubric_text: metric.display_rubric_text,
-        input_data: response.input_data
+        input_data: response.input_data,
+        human_examples: review_examples_for(metric, response)
       )
       review = response.reviews.find_or_initialize_by(metric_id: metric.id)
@@ -80,9 +81,13 @@ module CompletionKit
     private
-    # A model with supports_judging == nil ("untested") just produced a valid
-    # review — promote it to confirmed. No-op once confirmed (so repeated runs
-    # don't churn the row), and a model already flagged as a bad judge stays so.
+    def review_examples_for(metric, response)
+      return nil unless CompletionKit.config.judge_calibration_enabled
+      return nil unless CompletionKit.config.judge_examples_from_reviews
+      MetricCalibrationExamples.judge_examples_for(metric, exclude_response_id: response.id)
+    end
     def confirm_judging_capability(judge_model_id)
       model = Model.find_by(provider: ApiConfig.provider_for_model(judge_model_id), model_id: judge_model_id)
       return unless model && model.supports_judging.nil?

data/app/services/completion_kit/judge_service.rb CHANGED Viewed

@@ -10,13 +10,14 @@ module CompletionKit
       @judge_client = LlmClient.for_model(@judge_model, ApiConfig.for_model(@judge_model))
     end
-    def evaluate(output, expected_output = nil, prompt = nil, criteria: nil, rubric_text: nil, input_data: nil, **_extras)
+    def evaluate(output, expected_output = nil, prompt = nil, criteria: nil, rubric_text: nil, input_data: nil, human_examples: nil, **_extras)
       raise CompletionKit::ConfigurationError, "Judge not configured" unless @judge_client.configured?
       judge_prompt = build_judge_prompt(output, expected_output, prompt,
         criteria: criteria,
         rubric_text: rubric_text,
-        input_data: input_data)
+        input_data: input_data,
+        human_examples: human_examples)
       response = @judge_client.generate_completion(judge_prompt, model: @judge_model)
       raise StandardError, response if response.start_with?("Error:")
@@ -25,7 +26,7 @@ module CompletionKit
     private
-    def build_judge_prompt(output, expected_output, prompt, criteria: nil, rubric_text: nil, input_data: nil)
+    def build_judge_prompt(output, expected_output, prompt, criteria: nil, rubric_text: nil, input_data: nil, human_examples: nil)
       judge_prompt = <<~PROMPT
         You are an expert evaluator. You MUST respond with ONLY two lines in this exact format, nothing else:
@@ -42,6 +43,8 @@ module CompletionKit
         judge_prompt += "\nCriteria: #{criteria}\n"
       end
+      judge_prompt += human_examples_block(human_examples)
       judge_prompt += <<~PROMPT
         Original prompt: #{prompt || "Not provided"}
@@ -53,6 +56,19 @@ module CompletionKit
       judge_prompt
     end
+    def human_examples_block(examples)
+      return "" if examples.blank?
+      lines = ["", "Reviewed examples where a human corrected the judge on this metric. Weigh them when scoring:"]
+      examples.each_with_index do |example, index|
+        note = example[:human_note].to_s
+        line = "Example #{index + 1}: Output: #{example[:output].to_s.truncate(200)}. The judge scored this #{example[:judge_score].to_i}/5. A reviewer corrected it to #{example[:human_score].to_i}/5"
+        line += note.present? ? ": #{note.truncate(160)}" : "."
+        lines << line
+      end
+      lines.join("\n") + "\n"
+    end
     def parse_judge_response(response)
       score_match = response.match(/\*{0,2}Score:?\*{0,2}\s*(\d+(?:\.\d+)?)/i)
       feedback_match = response.match(/\*{0,2}Feedback:?\*{0,2}\s*(.+)/mi)

data/app/services/completion_kit/metric_calibration_examples.rb ADDED Viewed

@@ -0,0 +1,56 @@
+module CompletionKit
+  module MetricCalibrationExamples
+    DEFAULT_JUDGE_EXAMPLE_LIMIT = 5
+    module_function
+    def for(metric, limit: 8)
+      disagreements_for(metric, limit: limit)
+    end
+    def disagreements_for(metric, limit: 8)
+      calibrations_for(metric, verdict: "disagree", limit: limit)
+    end
+    def borderlines_for(metric, limit: 6)
+      calibrations_for(metric, verdict: "borderline", limit: limit)
+    end
+    def judge_examples_for(metric, exclude_response_id: nil, limit: DEFAULT_JUDGE_EXAMPLE_LIMIT)
+      current_version = MetricVersion.current.find_by(metric_id: metric.id)
+      return [] unless current_version
+      relation = Calibration
+                 .where(metric_id: metric.id, metric_version_id: current_version.id, excluded_from_examples: false)
+                 .where.not(corrected_score: nil)
+      relation = relation.where.not(response_id: exclude_response_id) if exclude_response_id
+      map_examples(relation.includes(response: :reviews).order(created_at: :desc).limit(limit), metric)
+        .reject { |example| example[:judge_score].nil? }
+    end
+    def calibrations_for(metric, verdict:, limit:)
+      base = Calibration.where(metric_id: metric.id, verdict: verdict)
+      current_version = MetricVersion.current.find_by(metric_id: metric.id)
+      scoped = current_version ? base.where(metric_version_id: current_version.id) : base
+      effective = scoped.exists? ? scoped : base
+      map_examples(effective.includes(response: :reviews).order(created_at: :desc).limit(limit), metric)
+    end
+    def map_examples(relation, metric)
+      relation.map do |cal|
+        review = cal.response.reviews.find { |r| r.metric_id == metric.id }
+        {
+          id: cal.id,
+          run_id: cal.run_id,
+          response_id: cal.response_id,
+          input: cal.response.input_data,
+          output: cal.response.response_text,
+          judge_score: review&.ai_score,
+          judge_feedback: review&.ai_feedback,
+          human_score: cal.corrected_score,
+          human_note: cal.note
+        }
+      end
+    end
+  end
+end

data/app/services/completion_kit/metric_variant_generator.rb CHANGED Viewed

@@ -117,40 +117,4 @@ module CompletionKit
     end
   end
-  module MetricCalibrationExamples
-    module_function
-    def for(metric, limit: 8)
-      disagreements_for(metric, limit: limit)
-    end
-    def disagreements_for(metric, limit: 8)
-      calibrations_for(metric, verdict: "disagree", limit: limit)
-    end
-    def borderlines_for(metric, limit: 6)
-      calibrations_for(metric, verdict: "borderline", limit: limit)
-    end
-    def calibrations_for(metric, verdict:, limit:)
-      base = Calibration.where(metric_id: metric.id, verdict: verdict)
-      current_version = MetricVersion.current.find_by(metric_id: metric.id)
-      scoped = current_version ? base.where(metric_version_id: current_version.id) : base
-      effective = scoped.exists? ? scoped : base
-      effective.includes(response: :reviews)
-               .order(created_at: :desc)
-               .limit(limit)
-               .map do |cal|
-        review = cal.response.reviews.find { |r| r.metric_id == metric.id }
-        {
-          input: cal.response.input_data,
-          output: cal.response.response_text,
-          judge_score: review&.ai_score,
-          judge_feedback: review&.ai_feedback,
-          human_score: cal.corrected_score,
-          human_note: cal.note
-        }
-      end
-    end
-  end
 end

data/app/views/completion_kit/calibrations/_trust_panel.html.erb CHANGED Viewed

@@ -2,15 +2,11 @@
 <% metric = local_assigns[:metric] %>
 <% anchor = metric&.name&.parameterize %>
 <% current_metric_version = metric && CompletionKit::MetricVersion.current.find_by(metric_id: metric.id) %>
-<% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric
+<% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric && current_metric_version
      created_by = CompletionKit.config.username.presence || "operator"
-     verdicted_ids = if current_metric_version
-       CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
-     else
-       []
-     end
+     verdicted_ids = CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
      CompletionKit::Response.joins(:reviews)
-       .where(reviews: { metric_id: metric.id })
+       .where(reviews: { metric_id: metric.id, metric_version_id: current_metric_version.id })
        .where.not(reviews: { ai_score: nil })
        .where.not(id: verdicted_ids)
        .order(created_at: :desc).first
@@ -24,7 +20,7 @@
 <p class="ck-trust-line ck-trust-line--<%= stats.gate %>">
   <% if stats.sample_size.zero? %>
     <span class="ck-trust-line__lead">Not measured yet.</span>
-    <span class="ck-trust-line__hint">Needs <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> human reviews of the judge's scores.<% if prior_version_verdicts > 0 %> (<%= pluralize(prior_version_verdicts, "earlier-version review") %> kept on file.)<% end %></span>
+    <span class="ck-trust-line__hint"><%= current_metric_version ? "#{current_metric_version.version_label} needs" : "Needs" %> <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> human reviews of the judge's scores.</span>
     <% if target_response %>
       <%= link_to "Review a judge's score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
     <% end %>
@@ -45,3 +41,6 @@
     <% end %>
   <% end %>
 </p>
+<% if stats.sample_size.zero? && prior_version_verdicts > 0 %>
+  <p class="ck-trust-line__aside"><%= pluralize(prior_version_verdicts, "review") %> from an earlier version <%= prior_version_verdicts == 1 ? "doesn't" : "don't" %> count toward this version.</p>
+<% end %>

data/app/views/completion_kit/metrics/_guiding_examples.html.erb ADDED Viewed

@@ -0,0 +1,23 @@
+<div id="ck-guiding-<%= metric.id %>" class="ck-guiding">
+  <% if examples.any? %>
+    <div class="ck-guiding__head">
+      <p class="ck-kicker ck-kicker--inset">Guiding the judge</p>
+      <span class="ck-guiding__legend">Judge &rarr; Human</span>
+    </div>
+    <ul class="ck-guiding__list">
+      <% examples.each do |example| %>
+        <li class="ck-guiding__item">
+          <%= link_to run_response_path(example[:run_id], example[:response_id], anchor: metric.name.parameterize),
+                class: "ck-guiding__link", title: "Open this review" do %>
+            <span class="ck-guiding__output"><%= truncate(example[:output].to_s, length: 90) %></span>
+            <span class="ck-guiding__scores"><span class="ck-guiding__judge"><%= example[:judge_score].to_i %></span> &rarr; <span class="ck-guiding__human"><%= example[:human_score].to_i %></span></span>
+          <% end %>
+          <%= button_to exclude_example_metric_path(metric, calibration_id: example[:id]),
+                method: :post, form_class: "inline-block", class: "ck-icon-btn",
+                title: "Stop using this case", "aria-label": "Stop using this case",
+                data: { turbo_confirm: "Stop using this corrected case to guide the judge?" } do %><%= heroicon_tag "x-mark", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
+        </li>
+      <% end %>
+    </ul>
+  <% end %>
+</div>

data/app/views/completion_kit/metrics/index.html.erb CHANGED Viewed

@@ -18,6 +18,7 @@
     <thead>
       <tr>
         <th scope="col">Name</th>
+        <th scope="col">Version</th>
         <th scope="col">Instruction</th>
         <th scope="col">In groups</th>
         <th scope="col"></th>
@@ -34,6 +35,10 @@
               </div>
             <% end %>
           </td>
+          <td data-label="Version">
+            <% v = @current_versions[metric.id] %>
+            <span class="ck-chip ck-chip--soft"><%= v ? v.version_label : "v1" %></span>
+          </td>
           <td data-label="Instruction" class="ck-meta-copy"><div class="ck-clamp-2"><%= metric.instruction.presence || "—" %></div></td>
           <td data-label="In groups">
             <% groups = metric.metric_groups %>

data/app/views/completion_kit/metrics/show.html.erb CHANGED Viewed

@@ -50,7 +50,6 @@
       <thead>
         <tr>
           <th scope="col">Version</th>
-          <th scope="col">&Delta; Change</th>
           <th scope="col">Source</th>
           <th scope="col">Created</th>
         </tr>
@@ -60,36 +59,28 @@
           <% pred = predecessor_of[v] %>
           <tr>
             <td>
+              <% summary = v.change_summary_against(pred) %>
               <div class="ck-version-cell">
                 <div class="ck-version-cell__label">
                   <strong><%= v.version_label %></strong>
                   <% if v.current? %>
-                    <span class="ck-version-state ck-version-state--live">Published</span>
+                    <span class="ck-chip">Published</span>
                   <% elsif v.draft? %>
-                    <span class="ck-version-state">Draft</span>
                     <%= button_to "Publish", publish_draft_metric_path(@metric, draft_id: v.id),
                           method: :post, form_class: "inline-block",
-                          class: "ck-chip ck-chip--cta" %>
+                          class: "ck-chip ck-chip--publish" %>
                   <% else %>
-                    <span class="ck-version-state">Past</span>
                     <%= button_to "Make current", publish_draft_metric_path(@metric, draft_id: v.id),
                           method: :post, form_class: "inline-block",
                           class: "ck-chip ck-chip--publish",
                           data: { turbo_confirm: "Make #{v.version_label} the version to use? #{v.version_label} will be used in test runs using this metric now. Reviews you have already given stay with the version they were made against." } %>
                   <% end %>
                 </div>
+                <% if summary %>
+                  <button type="button" class="ck-cell-link ck-cell-link--delta" title="What changed from <%= pred.version_label %>" onclick="document.getElementById('ck-mvdiff-<%= v.id %>').showModal()">&Delta;</button>
+                <% end %>
               </div>
             </td>
-            <td>
-              <% summary = v.change_summary_against(pred) %>
-              <% if summary %>
-                <button type="button" class="ck-change-link ck-change-link--<%= summary[:magnitude] %>"
-                        title="Compare with <%= pred.version_label %>"
-                        onclick="document.getElementById('ck-mvdiff-<%= v.id %>').showModal()"><%= summary[:label] %></button>
-              <% else %>
-                <span class="ck-meta-copy">—</span>
-              <% end %>
-            </td>
             <td>
               <% source_label, source_class = case v.source
                                               when "suggestion" then ["AI calibration", "ck-source-chip ck-source-chip--ai"]
@@ -177,20 +168,11 @@
 <% end %>
 <% if CompletionKit.config.judge_calibration_enabled %>
+  <% draft = @suggestion_draft || @edit_draft %>
   <section class="ck-card ck-card--spaced">
-    <p class="ck-kicker">Calibration</p>
-    <p class="ck-meta-copy">This is a measure of how often the judge's scores match a human reviewer. Review its scores to build that signal; the scores you disagree with become the cases the model learns from when you improve the metric.</p>
-    <%= render "completion_kit/calibrations/trust_panel",
-          stats: CompletionKit::MetricCalibrationStats.for(@metric),
-          metric: @metric %>
-    <% draft = @suggestion_draft || @edit_draft %>
-    <% if draft %>
-      <div class="ck-cal-foot">
-        <span class="ck-cal-foot__note">A draft improvement (<%= draft.version_label %>) is waiting in the Versions table above. Open its change to compare, then Publish to use it.</span>
-      </div>
-    <% elsif @improve_disagreement_count.positive? %>
-      <div class="ck-cal-foot">
-        <span class="ck-cal-foot__note"><%= pluralize(@improve_disagreement_count, "case") %> where a reviewer's score didn't match the judge.</span>
+    <div class="ck-prompt-preview__header">
+      <p class="ck-kicker">Calibration</p>
+      <% if draft.nil? && @improve_disagreement_count.positive? %>
         <%= button_to suggest_variants_metric_path(@metric),
               method: :post, form_class: "inline-block",
               class: ck_button_classes(:light, variant: :outline) + " ck-button--sm",
@@ -198,6 +180,18 @@
           <%= heroicon_tag "sparkles", variant: :outline, class: "ck-magic-icon", "aria-hidden": "true" %>
           Suggest improvements
         <% end %>
+      <% end %>
+    </div>
+    <p class="ck-meta-copy">This is a measure of how often the judge's scores match a human reviewer. Review its scores to build that signal; the scores you disagree with become the cases the model learns from when you improve the metric.</p>
+    <%= render "completion_kit/calibrations/trust_panel",
+          stats: CompletionKit::MetricCalibrationStats.for(@metric),
+          metric: @metric %>
+    <% if CompletionKit.config.judge_examples_from_reviews %>
+      <%= render "completion_kit/metrics/guiding_examples", metric: @metric, examples: @guiding_examples %>
+    <% end %>
+    <% if draft %>
+      <div class="ck-cal-foot">
+        <span class="ck-cal-foot__note">A draft improvement (<%= draft.version_label %>) is waiting in the Versions table above. Open its change to compare, then Publish to use it.</span>
       </div>
     <% end %>
   </section>

data/app/views/completion_kit/responses/show.html.erb CHANGED Viewed

@@ -100,12 +100,17 @@
       <% @reviews.each do |review| %>
         <% review_version = review.metric_version %>
         <% stale = review.stale_against_current_judge? %>
-        <div class="ck-review-card<%= " ck-review-card--stale" if stale %>" id="<%= review.metric&.name&.parameterize || "review-#{review.id}" %>">
+        <div class="ck-review-card" id="<%= review.metric&.name&.parameterize || "review-#{review.id}" %>">
           <div class="ck-review-card__header">
             <span class="ck-review-card__metric"><% if review.metric %><%= link_to review.metric_name, metric_path(review.metric), class: "ck-link" %><% else %><%= review.metric_name %><% end %></span>
             <div class="ck-inline">
               <% if review_version %>
-                <span class="ck-source-chip <%= stale ? "ck-source-chip--past" : "ck-source-chip--current" %>" title="<%= stale ? "Scored against #{review_version.version_label} of this metric. The metric has been republished since." : "Scored against the metric's current version (#{review_version.version_label})." %>"><%= review_version.version_label %></span>
+                <% if stale %>
+                  <% current_version = CompletionKit::MetricVersion.current.find_by(metric_id: review.metric_id) %>
+                  <span class="ck-source-chip ck-source-chip--past" title="Scored on <%= review_version.version_label %>; the metric is now on <%= current_version.version_label %>, which may score this differently."><%= review_version.version_label %> &rarr; <%= current_version.version_label %></span>
+                <% else %>
+                  <span class="ck-source-chip ck-source-chip--current" title="Scored on the metric's current version (<%= review_version.version_label %>)."><%= review_version.version_label %></span>
+                <% end %>
               <% end %>
               <% if review.ai_score %>
                 <% 5.times do |i| %>
@@ -116,9 +121,6 @@
               <% end %>
             </div>
           </div>
-          <% if stale %>
-            <p class="ck-review-card__stale-note">Scored against a superseded version of this metric. Its current version may score this differently.</p>
-          <% end %>
           <% if review.ai_feedback.present? %>
             <p class="ck-review-card__feedback"><%= review.ai_feedback %></p>
           <% end %>

data/config/routes.rb CHANGED Viewed

@@ -22,6 +22,7 @@ CompletionKit::Engine.routes.draw do
       post :publish_draft
       post :suggest_variants
       delete :dismiss_suggestion
+      post :exclude_example
     end
   end
   resources :metric_groups

data/db/migrate/20260530000001_add_excluded_from_examples_to_completion_kit_calibrations.rb ADDED Viewed

@@ -0,0 +1,5 @@
+class AddExcludedFromExamplesToCompletionKitCalibrations < ActiveRecord::Migration[8.1]
+  def change
+    add_column :completion_kit_calibrations, :excluded_from_examples, :boolean, null: false, default: false
+  end
+end

data/lib/completion_kit/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module CompletionKit
-  VERSION = "0.9.0"
+  VERSION = "0.10.0"
 end

data/lib/completion_kit.rb CHANGED Viewed

@@ -13,6 +13,7 @@ module CompletionKit
     attr_accessor :api_rate_limit, :web_rate_limit
     attr_accessor :allow_loopback_endpoints
     attr_accessor :judge_calibration_enabled
+    attr_accessor :judge_examples_from_reviews
     def initialize
       @openai_api_key = ENV['OPENAI_API_KEY']
@@ -29,6 +30,7 @@ module CompletionKit
       @allow_loopback_endpoints = true
       @judge_calibration_enabled = true
+      @judge_examples_from_reviews = false
       @api_reference_authentication_partial = "completion_kit/api_reference/authentication"
     end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: completion-kit
 version: !ruby/object:Gem::Version
-  version: 0.9.0
+  version: 0.10.0
 platform: ruby
 authors:
 - Damien Bastin
@@ -311,6 +311,7 @@ files:
 - app/services/completion_kit/mcp_tools/responses.rb
 - app/services/completion_kit/mcp_tools/runs.rb
 - app/services/completion_kit/mcp_tools/tags.rb
+- app/services/completion_kit/metric_calibration_examples.rb
 - app/services/completion_kit/metric_calibration_stats.rb
 - app/services/completion_kit/metric_variant_generator.rb
 - app/services/completion_kit/model_discovery_service.rb
@@ -350,6 +351,7 @@ files:
 - app/views/completion_kit/metric_groups/new.html.erb
 - app/views/completion_kit/metric_groups/show.html.erb
 - app/views/completion_kit/metrics/_form.html.erb
+- app/views/completion_kit/metrics/_guiding_examples.html.erb
 - app/views/completion_kit/metrics/_rubric_diff.html.erb
 - app/views/completion_kit/metrics/_rubric_hint.html.erb
 - app/views/completion_kit/metrics/_starter_card.html.erb
@@ -430,6 +432,7 @@ files:
 - db/migrate/20260528000001_rename_judge_version_to_metric_version.rb
 - db/migrate/20260528000002_add_metric_version_to_reviews.rb
 - db/migrate/20260529000001_remove_few_shot_examples_from_completion_kit_metrics.rb
+- db/migrate/20260530000001_add_excluded_from_examples_to_completion_kit_calibrations.rb
 - lib/completion-kit.rb
 - lib/completion_kit.rb
 - lib/completion_kit/concurrency_check.rb