RubyGems - completion-kit - Versions diffs - 0.5.38 → 0.5.40 - Mend

completion-kit 0.5.38 → 0.5.40

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (16)

checksums.yaml +4 -4
data/app/assets/stylesheets/completion_kit/{application.css.erb → application.css} +167 -54
data/app/controllers/completion_kit/calibrations_controller.rb +35 -8
data/app/controllers/completion_kit/metrics_controller.rb +21 -6
data/app/helpers/completion_kit/application_helper.rb +73 -2
data/app/services/completion_kit/judge_variant_generator.rb +70 -25
data/app/services/completion_kit/mcp_tools/judges.rb +2 -4
data/app/services/completion_kit/onboarding/sample_data.rb +3 -3
data/app/views/completion_kit/calibrations/_buttons.html.erb +32 -19
data/app/views/completion_kit/calibrations/_trust_panel.html.erb +1 -1
data/app/views/completion_kit/metrics/index.html.erb +2 -2
data/app/views/completion_kit/metrics/show.html.erb +91 -39
data/app/views/completion_kit/responses/show.html.erb +1 -3
data/config/routes.rb +1 -0
data/lib/completion_kit/version.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: fc7d527828189c2993060b315dca634fb958d2da11fd7fae63c4790179c46701
-  data.tar.gz: f0323b980bdfb35d36742b548ddd3629e66d39e587775521678dc80b4cd2f068
+  metadata.gz: d8875fa22de9c8626e401706818b271f3cb26bffa7faae1f583b29503228cead
+  data.tar.gz: 5bf39fad883b2eed2f505b11403ab60fddf9efc5735073ff61c290d53f59a36d
 SHA512:
-  metadata.gz: '020946bdac698194bb5246cfbe21fdf45c56006c80c15d1d7bcfda4d3494d95cde45645e090df14a411b172c83dcde42be777d74811b25a340e5710dba6ae7ce'
-  data.tar.gz: 1b4f0ea8cf4e613df783ac428404ef1ae19b285db04f8a6768119760c65fb81d9ee72d52905ab2ef30e077ae910eb00e622925c1dcc43aee0c9e3a0f748718b1
+  metadata.gz: a8f2a7f14235c1214b567b891defaf523a645a21a2409ed81df964893a260cccb1fc9bf63903794f952c91bc9c91f3c1e3850db751a08ce0edc49b360ad9642d
+  data.tar.gz: e148b500e498a00dc370bf203fea3e2618b9f1c8fccd1dc5f220ae77a4988a8934198ad32bbf25a29bc84f948c687fab9a0ae16b9c03fe575fe9fca4cf98a0ea

data/app/assets/stylesheets/completion_kit/{application.css.erb → application.css} RENAMED Viewed

@@ -1,12 +1,26 @@
-<% %w[400 500 700].each do |weight| %>
 @font-face {
   font-family: 'JetBrains Mono';
   font-style: normal;
-  font-weight: <%= weight %>;
+  font-weight: 400;
+  font-display: swap;
+  src: url('completion_kit/jetbrains-mono-400.woff2') format('woff2');
+}
+@font-face {
+  font-family: 'JetBrains Mono';
+  font-style: normal;
+  font-weight: 500;
+  font-display: swap;
+  src: url('completion_kit/jetbrains-mono-500.woff2') format('woff2');
+}
+@font-face {
+  font-family: 'JetBrains Mono';
+  font-style: normal;
+  font-weight: 700;
   font-display: swap;
-  src: url('<%= asset_path("completion_kit/jetbrains-mono-#{weight}.woff2") %>') format('woff2');
+  src: url('completion_kit/jetbrains-mono-700.woff2') format('woff2');
 }
-<% end %>
 .turbo-progress-bar {
   background-color: var(--ck-accent);
@@ -1527,6 +1541,25 @@ tr:hover .ck-chip--publish {
   border: 0;
   border-radius: 0;
   background: transparent;
+  white-space: pre;
+  color: #93c5fd;
+  font-size: 0.86rem;
+  line-height: 1.55;
+}
+.ck-code-scroll-wrap > .ck-code .ck-json-key {
+  color: #c4b5fd;
+}
+.ck-code-scroll-wrap > .ck-code .ck-json-string {
+  color: #93c5fd;
+}
+.ck-code-scroll-wrap > .ck-code .ck-json-number {
+  color: #fcd34d;
+}
+.ck-code-scroll-wrap > .ck-code .ck-json-keyword {
+  color: #f9a8d4;
+}
+.ck-code-scroll-wrap > .ck-code .ck-json-punct {
+  color: var(--ck-dim);
 }
 .ck-note-box {
@@ -2741,7 +2774,7 @@ select.ck-input {
   border: 1px solid var(--ck-line);
   border-radius: var(--ck-radius-lg);
   background: var(--ck-surface);
-  padding: 1.5rem;
+  padding: 1.25rem;
 }
 .ck-review-card__header {
@@ -2751,6 +2784,11 @@ select.ck-input {
   gap: 1rem;
 }
+.ck-review-card__header .ck-inline {
+  flex-wrap: nowrap;
+  flex-shrink: 0;
+}
 .ck-review-card__metric {
   font-family: var(--ck-mono);
   font-size: 0.95rem;
@@ -2759,11 +2797,9 @@ select.ck-input {
 }
 .ck-review-card__feedback {
-  margin-top: 0.75rem;
-}
-.ck-review-card__feedback .ck-note-box {
-  margin-top: 0;
+  margin: 0.6rem 0 0;
+  color: var(--ck-muted);
+  line-height: 1.55;
 }
 @media (max-width: 900px) {
@@ -2800,6 +2836,14 @@ select.ck-input {
     width: 100%;
   }
+  /* button_to renders a form.inline-block wrapping the button. When the inner
+     button is a full .ck-button (not an icon-button or chip), the form should
+     stretch with it. */
+  form.inline-block:has(> .ck-button) {
+    width: 100%;
+    display: block;
+  }
   /* Page header stacks: title, then lead text full-width, then action. */
   .ck-page-header {
     flex-direction: column;
@@ -2827,6 +2871,12 @@ select.ck-input {
     padding: 1rem;
   }
+  .ck-review-card__header {
+    flex-direction: column;
+    align-items: flex-start;
+    gap: 0.5rem;
+  }
   /* Topbar nav collapses behind the hamburger trigger. */
   .ck-nav-menu__trigger {
     display: inline-flex;
@@ -4559,9 +4609,8 @@ a.tag-mark {
 }
 .ck-launch__progress {
-  padding-bottom: 1.5rem;
-  margin-bottom: 0.5rem;
-  border-bottom: 1px solid var(--ck-line);
+  padding-bottom: 0;
+  margin-bottom: 1.25rem;
 }
 .ck-launch__progress-head {
   display: flex;
@@ -5126,22 +5175,37 @@ a.tag-mark {
   border-top: 1px dashed var(--ck-line);
 }
 .ck-calibration__prompt {
+  margin: 0 0 10px;
+  display: flex;
+  align-items: baseline;
+  flex-wrap: wrap;
+  gap: 8px 12px;
+}
+.ck-calibration__label {
   font-family: var(--ck-mono);
   font-size: 0.72rem;
   letter-spacing: 0.06em;
   text-transform: uppercase;
   color: var(--ck-dim);
-  margin: 0 0 10px;
-  display: flex;
-  align-items: center;
-  gap: 10px;
+  flex-shrink: 0;
 }
 .ck-calibration__count {
   font-family: var(--ck-mono);
   font-size: 0.72rem;
   letter-spacing: 0.03em;
   color: var(--ck-accent);
-  text-transform: none;
+}
+.ck-calibration__hint {
+  font-size: 0.82rem;
+  color: var(--ck-dim);
+  line-height: 1.4;
+}
+@media (max-width: 640px) {
+  .ck-calibration__prompt {
+    flex-direction: column;
+    align-items: flex-start;
+    gap: 4px;
+  }
 }
 .ck-calibration__buttons {
   display: flex;
@@ -5196,11 +5260,13 @@ a.tag-mark {
   margin-top: 12px;
   display: flex;
   flex-direction: column;
-  gap: 8px;
-  padding: 12px;
-  background: var(--ck-surface-soft);
-  border: 1px solid var(--ck-line);
-  border-radius: 6px;
+  gap: 12px;
+}
+.ck-calibration__detail > * {
+  margin: 0;
+}
+.ck-calibration__detail .ck-button {
+  align-self: flex-start;
 }
 .ck-calibration__value {
   color: var(--ck-accent);
@@ -5322,44 +5388,28 @@ a.tag-mark {
   text-transform: uppercase;
 }
-.ck-draft-banner {
+.ck-draft-pending {
+  border-color: rgba(6, 182, 212, 0.45);
+  background: linear-gradient(180deg, var(--ck-accent-soft), var(--ck-surface));
+}
+.ck-suggestion-banner {
   display: inline-flex;
   align-items: center;
   gap: 10px;
   margin-top: 10px;
-  padding: 8px 12px;
+  padding: 8px 14px;
   background: var(--ck-accent-soft);
-  border: 1px dashed rgba(6, 182, 212, 0.4);
-  border-radius: 6px;
-}
-.ck-suggestion-list {
-  display: flex;
-  flex-direction: column;
-  gap: 12px;
-}
-.ck-suggestion-card {
-  padding: 12px 14px;
-  background: var(--ck-surface-soft);
-  border: 1px solid var(--ck-line);
+  border: 1px solid rgba(6, 182, 212, 0.35);
   border-radius: 6px;
-  display: flex;
-  flex-direction: column;
-  gap: 10px;
-}
-.ck-suggestion-card__header {
-  display: flex;
-  align-items: center;
-  gap: 10px;
+  color: var(--ck-accent);
+  font-family: var(--ck-mono);
+  font-size: 0.82rem;
+  text-decoration: none;
 }
-.ck-suggestion-card__instruction {
-  margin: 0;
-  white-space: pre-wrap;
-  font-size: 0.85rem;
-  background: var(--ck-bg-strong);
-  padding: 10px 12px;
-  border-radius: 4px;
-  border: 1px solid var(--ck-line);
+.ck-suggestion-banner:hover,
+.ck-suggestion-banner:focus-visible {
+  border-color: var(--ck-accent);
 }
 .ck-metrics-table__trust {
@@ -5372,3 +5422,66 @@ a.tag-mark {
   color: var(--ck-success);
   margin-right: 6px;
 }
+.ck-calibration__error {
+  margin: 8px 0 0;
+  padding: 8px 10px;
+  background: var(--ck-danger-soft);
+  border: 1px solid rgba(248, 113, 113, 0.3);
+  border-radius: 4px;
+  color: var(--ck-danger);
+  font-size: 0.82rem;
+}
+.ck-star-picker {
+  border: 0;
+  padding: 0;
+  margin: 0;
+}
+.ck-star-picker__row {
+  display: inline-flex;
+  flex-direction: row-reverse;
+  gap: 2px;
+}
+.ck-star-picker input {
+  position: absolute;
+  width: 1px;
+  height: 1px;
+  opacity: 0;
+  pointer-events: none;
+}
+.ck-star-picker label {
+  cursor: pointer;
+  display: inline-flex;
+  padding: 4px;
+  border-radius: 4px;
+}
+.ck-star-picker label svg {
+  fill: transparent;
+  stroke: var(--ck-line-strong);
+  transition: fill 0.08s, stroke 0.08s;
+}
+.ck-star-picker input:checked ~ label svg {
+  fill: var(--ck-warning);
+  stroke: var(--ck-warning);
+}
+.ck-star-picker__row:hover label svg {
+  fill: transparent;
+  stroke: var(--ck-line-strong);
+}
+.ck-star-picker__row:hover label:hover svg,
+.ck-star-picker__row:hover label:hover ~ label svg {
+  fill: var(--ck-warning);
+  stroke: var(--ck-warning);
+}
+.ck-star-picker input:focus-visible + label {
+  outline: 2px solid var(--ck-accent);
+  outline-offset: 2px;
+}
+.ck-button--just-saved {
+  animation: ck-saved-flash 1.4s ease-out;
+}
+@keyframes ck-saved-flash {
+  0% { background: var(--ck-success); border-color: var(--ck-success); }
+}

data/app/controllers/completion_kit/calibrations_controller.rb CHANGED Viewed

@@ -5,9 +5,18 @@ module CompletionKit
     def create
       created_by = calibration_creator
-      calibration = Calibration.find_or_initialize_by(
+      existing = Calibration.find_by(
         run_id: @run.id, response_id: @response.id, metric_id: @metric.id, created_by: created_by
       )
+      if params[:verdict] == "disagree" && params[:corrected_score].blank?
+        render_calibration(calibration: existing, pending_verdict: "disagree")
+        return
+      end
+      calibration = existing || Calibration.new(
+        run: @run, response: @response, metric: @metric, created_by: created_by
+      )
       calibration.assign_attributes(
         judge_version: JudgeVersion.ensure_current_for(@metric),
         verdict: params[:verdict],
@@ -16,19 +25,37 @@ module CompletionKit
       )
       if calibration.save
-        render turbo_stream: turbo_stream.replace(
-          "calibration_#{@response.id}_#{@metric.id}",
-          partial: "completion_kit/calibrations/buttons",
-          locals: { review: review_for_metric, calibration: calibration, run: @run, response_row: @response, metric: @metric }
-        )
+        render_calibration(calibration: calibration, just_saved: true)
       else
-        flash[:alert] = calibration.errors.full_messages.to_sentence
-        redirect_to run_response_path(@run, @response)
+        render_calibration(
+          calibration: existing,
+          pending_verdict: params[:verdict],
+          error: calibration.errors.full_messages.to_sentence,
+          status: :unprocessable_entity
+        )
       end
     end
     private
+    def render_calibration(calibration:, pending_verdict: nil, error: nil, just_saved: false, status: :ok)
+      locals = {
+        review: review_for_metric,
+        calibration: calibration,
+        run: @run,
+        response_row: @response,
+        metric: @metric,
+        pending_verdict: pending_verdict,
+        error: error,
+        just_saved: just_saved
+      }
+      render turbo_stream: turbo_stream.replace(
+        "calibration_#{@response.id}_#{@metric.id}",
+        partial: "completion_kit/calibrations/buttons",
+        locals: locals
+      ), status: status
+    end
     def ensure_calibration_enabled
       head :not_found unless CompletionKit.config.judge_calibration_enabled
     end

data/app/controllers/completion_kit/metrics_controller.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 module CompletionKit
   class MetricsController < ApplicationController
     include CompletionKit::TagFiltering
-    before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :publish_draft, :suggest_variants]
+    before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :publish_draft, :suggest_variants, :dismiss_suggestion]
     def index
       @metrics = apply_tag_filter(Metric.includes(:metric_groups, :tags).order(:name))
@@ -12,8 +12,10 @@ module CompletionKit
                                   .includes(response: [:reviews, :run])
                                   .order(created_at: :desc)
                                   .limit(50)
-      @latest_draft = JudgeVersion.drafts.where(metric_id: @metric.id).order(created_at: :desc).first
-      @suggestion_drafts = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc)
+      @edit_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
+      @published_judge_version = JudgeVersion.published.where(metric_id: @metric.id, current: true).first
+      @suggestion_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
+      @improve_disagreement_count = @disagreements.size
     end
     def new
@@ -47,15 +49,28 @@ module CompletionKit
     end
     def suggest_variants
-      generator = JudgeVariantGenerator.new(@metric)
+      disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
+      if disagreement_count.zero?
+        redirect_to metric_path(@metric), alert: "Mark at least one row as Disagree before asking the model to suggest a change."
+        return
+      end
+      JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
+      generator = JudgeVariantGenerator.new(@metric, count: 1)
       variants = generator.call
       if variants.empty?
         redirect_to metric_path(@metric), alert: "The model returned no usable variants. Try again with a different model."
         return
       end
       generator.persist!(variants)
-      label = variants.length == 1 ? "alternative" : "alternatives"
-      redirect_to metric_path(@metric), notice: "Wrote #{variants.length} #{label} for the judge instruction. Pick one to make it live."
+      redirect_to metric_path(@metric), notice: "Drafted a new version. Review it below."
+    end
+    def dismiss_suggestion
+      draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").find_by(id: params[:draft_id])
+      draft&.destroy
+      redirect_to metric_path(@metric), notice: "Dismissed."
     end
     def publish_draft

data/app/helpers/completion_kit/application_helper.rb CHANGED Viewed

@@ -202,15 +202,86 @@ module CompletionKit
     def ck_format_maybe_json(text)
       s = text.to_s
       return s if s.strip.empty?
-      first = s.strip[0]
+      payload = ck_unwrap_json_fence(s.strip)
+      first = payload[0]
       return s unless first == "{" || first == "["
       begin
-        JSON.pretty_generate(JSON.parse(s))
+        ck_highlight_json(JSON.pretty_generate(JSON.parse(payload)))
       rescue JSON::ParserError
         s
       end
     end
+    def ck_unwrap_json_fence(text)
+      m = text.match(/\A```(?:json|JSON)?\s*\n(.*?)\n?```\s*\z/m)
+      m ? m[1].strip : text
+    end
+    def ck_highlight_json(text)
+      tokens = ck_tokenize_json(text)
+      is_key = ck_mark_json_keys(tokens)
+      parts = tokens.each_with_index.map do |(type, value), idx|
+        escaped = ERB::Util.html_escape(value)
+        case type
+        when :punct then %(<span class="ck-json-punct">#{escaped}</span>)
+        when :string
+          %(<span class="#{is_key[idx] ? "ck-json-key" : "ck-json-string"}">#{escaped}</span>)
+        when :number then %(<span class="ck-json-number">#{escaped}</span>)
+        when :keyword then %(<span class="ck-json-keyword">#{escaped}</span>)
+        else escaped
+        end
+      end
+      parts.join.html_safe
+    end
+    def ck_tokenize_json(text)
+      tokens = []
+      i = 0
+      len = text.length
+      while i < len
+        ch = text[i]
+        if ch == " " || ch == "\n" || ch == "\t"
+          tokens << [:ws, ch]
+          i += 1
+        elsif "{}[]:,".include?(ch)
+          tokens << [:punct, ch]
+          i += 1
+        elsif ch == '"'
+          j = i + 1
+          while j < len && text[j] != '"'
+            j += text[j] == "\\" ? 2 : 1
+          end
+          j = len - 1 if j >= len
+          tokens << [:string, text[i..j]]
+          i = j + 1
+        elsif ch == "-" || (ch >= "0" && ch <= "9")
+          j = i + 1
+          j += 1 while j < len && "0123456789.eE+-".include?(text[j])
+          tokens << [:number, text[i...j]]
+          i = j
+        elsif text[i, 4] == "true" || text[i, 4] == "null"
+          tokens << [:keyword, text[i, 4]]
+          i += 4
+        elsif text[i, 5] == "false"
+          tokens << [:keyword, "false"]
+          i += 5
+        else
+          tokens << [:other, ch]
+          i += 1
+        end
+      end
+      tokens
+    end
+    def ck_mark_json_keys(tokens)
+      tokens.each_with_index.map do |(type, _), idx|
+        next false unless type == :string
+        j = idx + 1
+        j += 1 while j < tokens.length && tokens[j][0] == :ws
+        j < tokens.length && tokens[j] == [:punct, ":"]
+      end
+    end
     def tag_filter_url(base_path, selected, toggling)
       remaining = selected.reject { |t| t.id == toggling.id }
       next_set = selected.include?(toggling) ? remaining : remaining + [toggling]

data/app/services/completion_kit/judge_variant_generator.rb CHANGED Viewed

@@ -1,13 +1,15 @@
 module CompletionKit
   class JudgeVariantGenerator
-    DEFAULT_VARIANT_COUNT = 3
+    DEFAULT_VARIANT_COUNT = 1
+    MAX_VARIANT_COUNT = 3
     DEFAULT_TEMPERATURE = 0.4
-    Variant = Struct.new(:reasoning, :instruction, keyword_init: true)
+    Variant = Struct.new(:reasoning, :instruction, :rubric_bands, keyword_init: true)
     def initialize(metric, count: DEFAULT_VARIANT_COUNT, model: nil)
       @metric = metric
-      @count = count
+      n = count.to_i
+      @count = n < 1 ? DEFAULT_VARIANT_COUNT : [n, MAX_VARIANT_COUNT].min
       @model = model || CompletionKit.config.judge_model
     end
@@ -23,7 +25,7 @@ module CompletionKit
         JudgeVersion.create!(
           metric: @metric,
           instruction: variant.instruction,
-          rubric_bands: @metric.rubric_bands,
+          rubric_bands: variant.rubric_bands.presence || @metric.rubric_bands,
           state: "draft",
           source: "suggestion",
           current: false
@@ -39,36 +41,57 @@ module CompletionKit
     private
     def build_meta_prompt
-      examples = JudgeCalibrationExamples.for(@metric)
+      disagreements = JudgeCalibrationExamples.disagreements_for(@metric)
+      borderlines = JudgeCalibrationExamples.borderlines_for(@metric)
       sections = []
-      sections << "You are an expert evaluator. Rewrite a judge's grading instruction so it agrees better with humans on the cases below."
+      sections << "You are an expert evaluator. The judge below is misaligned with humans. Propose #{@count == 1 ? "a single" : "#{@count}"} concrete rewrite that closes the gap."
       sections << ""
       sections << "## Current instruction"
       sections << "```"
       sections << @metric.instruction.to_s
       sections << "```"
       sections << ""
-      sections << "## Rubric (unchanged across variants — only rewrite the instruction)"
+      sections << "## Current rubric (5 to 1)"
       sections << @metric.display_rubric_text
       sections << ""
-      sections << "## Recent disagreements (judge vs human)"
-      examples.each_with_index do |ex, i|
-        sections << "### Case #{i + 1}"
-        sections << "Input: #{ex[:input].to_s.truncate(200)}"
-        sections << "Output: #{ex[:output].to_s.truncate(200)}"
-        sections << "Judge said #{ex[:judge_score]}/5: #{ex[:judge_feedback].to_s.truncate(160)}"
-        sections << "Human said #{ex[:human_score]}/5: #{ex[:human_note].to_s.truncate(160)}"
-        sections << ""
+      if disagreements.any?
+        sections << "## Recent disagreements (judge vs human)"
+        disagreements.each_with_index do |ex, i|
+          sections << "### Case #{i + 1}"
+          sections << "Input: #{ex[:input].to_s.truncate(200)}"
+          sections << "Output: #{ex[:output].to_s.truncate(200)}"
+          sections << "Judge said #{ex[:judge_score]}/5: #{ex[:judge_feedback].to_s.truncate(160)}"
+          sections << "Human said #{ex[:human_score]}/5: #{ex[:human_note].to_s.truncate(160)}"
+          sections << ""
+        end
+      end
+      if borderlines.any?
+        sections << "## Rubric-ambiguous cases (humans marked these borderline)"
+        sections << "These are cases where a human said the rubric itself was unclear. If the rubric needs sharpening, rewrite it."
+        borderlines.each_with_index do |ex, i|
+          sections << "### Borderline #{i + 1}"
+          sections << "Input: #{ex[:input].to_s.truncate(200)}"
+          sections << "Output: #{ex[:output].to_s.truncate(200)}"
+          sections << "Judge said #{ex[:judge_score]}/5: #{ex[:judge_feedback].to_s.truncate(160)}"
+          sections << "Human note: #{ex[:human_note].to_s.truncate(200)}" if ex[:human_note].to_s.present?
+          sections << ""
+        end
       end
       sections << "## Task"
-      sections << "Propose #{@count} alternative instructions. Each should be a focused rewrite — not a wholesale rewrite of the rubric. Aim to close the disagreement gap."
+      sections << "Make one substantive change. Don't just reword. If the disagreements look like instruction problems, rewrite the instruction. If they look like rubric problems (overlapping bands, undefined edge cases), rewrite the rubric. Rewrite both if both are wrong."
       sections << ""
-      sections << "Respond in EXACTLY this format, repeated #{@count} times:"
+      sections << "Respond in EXACTLY this format, repeated #{@count} time#{@count == 1 ? "" : "s"}:"
       sections << ""
       sections << "VARIANT:"
-      sections << "REASONING: <one sentence explaining what this variant changes>"
+      sections << "REASONING: <one short sentence: what changes and why>"
       sections << "INSTRUCTION:"
       sections << "<the rewritten instruction>"
+      sections << "RUBRIC:                  # optional — omit this block if the rubric is unchanged"
+      sections << "5: <description for 5 stars>"
+      sections << "4: <description for 4 stars>"
+      sections << "3: <description for 3 stars>"
+      sections << "2: <description for 2 stars>"
+      sections << "1: <description for 1 star>"
       sections << "END_VARIANT"
       sections.join("\n")
     end
@@ -77,22 +100,44 @@ module CompletionKit
       blocks = text.to_s.scan(/VARIANT:(.*?)END_VARIANT/m).flatten
       blocks.filter_map do |raw|
         reasoning = raw[/REASONING:\s*(.*?)(?=INSTRUCTION:|\z)/m, 1].to_s.strip
-        instruction = raw[/INSTRUCTION:\s*(.*)/m, 1].to_s.strip
+        instruction = raw[/INSTRUCTION:\s*(.*?)(?=RUBRIC:|\z)/m, 1].to_s.strip
         next if instruction.empty?
-        Variant.new(reasoning: reasoning, instruction: instruction)
+        rubric_block = raw[/RUBRIC:\s*(.*)/m, 1].to_s
+        Variant.new(reasoning: reasoning, instruction: instruction, rubric_bands: parse_rubric(rubric_block))
       end
     end
+    def parse_rubric(block)
+      return nil if block.strip.empty?
+      bands = block.scan(/^\s*([1-5])\s*[:\-]\s*(.+?)\s*$/).map do |stars, description|
+        { "stars" => stars.to_i, "description" => description.strip }
+      end
+      return nil if bands.length != 5
+      bands.sort_by { |b| -b["stars"] }
+    end
   end
   module JudgeCalibrationExamples
     module_function
     def for(metric, limit: 8)
-      disagreements = Calibration.where(metric_id: metric.id, verdict: "disagree")
-                                 .includes(response: :reviews)
-                                 .order(created_at: :desc)
-                                 .limit(limit)
-      disagreements.map do |cal|
+      disagreements_for(metric, limit: limit)
+    end
+    def disagreements_for(metric, limit: 8)
+      calibrations_for(metric, verdict: "disagree", limit: limit)
+    end
+    def borderlines_for(metric, limit: 6)
+      calibrations_for(metric, verdict: "borderline", limit: limit)
+    end
+    def calibrations_for(metric, verdict:, limit:)
+      Calibration.where(metric_id: metric.id, verdict: verdict)
+                 .includes(response: :reviews)
+                 .order(created_at: :desc)
+                 .limit(limit)
+                 .map do |cal|
         review = cal.response.reviews.find { |r| r.metric_id == metric.id }
         {
           input: cal.response.input_data,

data/app/services/completion_kit/mcp_tools/judges.rb CHANGED Viewed

@@ -10,7 +10,7 @@ module CompletionKit
             type: "object",
             properties: {
               metric_id: { type: "integer" },
-              count: { type: "integer", description: "How many variants to request (default 3, max 5)." },
+              count: { type: "integer", description: "How many variants to request (default 1, max 3). One focused rewrite beats five reworded copies." },
               model: { type: "string", description: "Override the model used to generate variants. Defaults to CompletionKit.config.judge_model." }
             },
             required: ["metric_id"]
@@ -49,9 +49,7 @@ module CompletionKit
       def self.suggest(args)
         metric = CompletionKit::Metric.find(args["metric_id"])
-        count = [args["count"].to_i, 5].min
-        count = CompletionKit::JudgeVariantGenerator::DEFAULT_VARIANT_COUNT if count <= 0
-        generator = CompletionKit::JudgeVariantGenerator.new(metric, count: count, model: args["model"])
+        generator = CompletionKit::JudgeVariantGenerator.new(metric, count: args["count"].to_i, model: args["model"])
         variants = generator.call
         return error_result("Variant generator returned no parseable variants. Try again or change the model.") if variants.empty?
         versions = generator.persist!(variants)

data/app/services/completion_kit/onboarding/sample_data.rb CHANGED Viewed

@@ -1,20 +1,20 @@
 module CompletionKit
   module Onboarding
     # Opt-in starter data for the onboarding page: one dataset + one prompt so a
-    # brand-new install has something to poke at. Idempotent — a no-op once the
+    # brand-new install has something to poke at. Idempotent. A no-op once the
     # workspace already has any prompt or dataset. Deliberately does NOT create a
     # provider credential (needs a real API key) or a run (user-initiated).
     module SampleData
       SAMPLE_CSV = <<~CSV.freeze
         ticket
         "My order #4827 arrived with a dented panel. I emailed photos 11 days ago and heard nothing. Today I was told the return window 'closed'. I paid $749. I want a refund or replacement, not store credit."
-        "Tracking says delivered to my porch Tuesday 3:47pm. I was home all day, nothing arrived, neighbours' cameras show no van. Order #5102 — a $315 mixer, wedding gift, wedding is Saturday. Can someone look today?"
+        "Tracking says delivered to my porch Tuesday 3:47pm. I was home all day, nothing arrived, neighbours' cameras show no van. Order #5102. A $315 mixer, wedding gift, wedding is Saturday. Can someone look today?"
         "WELCOME20 says 'invalid' at checkout but the promo email says it's good through May 31. Same email I'm signed in with. Tried Chrome and Safari. Cart is $186 waiting on you."
       CSV
       SAMPLE_PROMPT = {
         name: "Sample: Support reply",
-        description: "A starter prompt — drafts a warm, professional reply to a customer support ticket. Edit it or delete it; it's just here to get you going.",
+        description: "A starter prompt. Drafts a warm, professional reply to a customer support ticket. Edit it or delete it; it's just here to get you going.",
         template: "You are a senior customer-support specialist. Write a warm, professional reply to this ticket. Acknowledge the customer's situation, be specific about next steps, and don't be defensive.\n\nTicket:\n{{ticket}}",
         llm_model: "gpt-4o-mini"
       }.freeze

data/app/views/completion_kit/calibrations/_buttons.html.erb CHANGED Viewed

@@ -1,12 +1,15 @@
 <div id="calibration_<%= response_row.id %>_<%= metric.id %>" class="ck-calibration">
   <% current_verdict = calibration&.verdict %>
+  <% pending_verdict = local_assigns[:pending_verdict] %>
+  <% active_verdict = pending_verdict || current_verdict %>
+  <% error = local_assigns[:error] %>
   <% verdict_count = CompletionKit::Calibration.where(response_id: response_row.id, metric_id: metric.id).count %>
   <p class="ck-calibration__prompt">
-    Your verdict
+    <span class="ck-calibration__label">Your verdict</span>
     <% if verdict_count > 0 %>
-      <span class="ck-calibration__count"><%= pluralize(verdict_count, "verdict") %> on this score · <%= link_to "judge trust →", metric_path(metric), class: "ck-link" %></span>
+      <span class="ck-calibration__count"><%= pluralize(verdict_count, "verdict") %> on this score · <%= link_to "trust level →", metric_path(metric), class: "ck-link" %></span>
     <% else %>
-      <span class="ck-calibration__count">Tell us what you think — was the score right? Verdicts roll up into <%= link_to "judge trust", metric_path(metric), class: "ck-link" %>.</span>
+      <span class="ck-calibration__hint">Tell us what you think — was the score right? Verdicts roll up into the metric's <%= link_to "trust level", metric_path(metric), class: "ck-link" %>.</span>
     <% end %>
   </p>
   <div class="ck-calibration__buttons">
@@ -20,8 +23,8 @@
       <%= button_to run_response_calibrations_path(run, response_row, metric_id: metric.id, verdict: verdict),
               method: :post,
               form: { data: { turbo: "true" } },
-              class: "ck-calibration__pill ck-calibration__pill--#{verdict}#{' is-active' if verdict == current_verdict}",
-              "aria-pressed": (verdict == current_verdict).to_s,
+              class: "ck-calibration__pill ck-calibration__pill--#{verdict}#{' is-active' if verdict == active_verdict}",
+              "aria-pressed": (verdict == active_verdict).to_s,
               title: verdict_hints[verdict] do %>
         <%= heroicon_tag verdict_icons[verdict], variant: :outline, size: 14, "aria-hidden": "true" %>
         <span><%= verdict %></span>
@@ -29,31 +32,41 @@
     <% end %>
   </div>
-  <% if current_verdict == "disagree" %>
+  <% if error.present? %>
+    <p class="ck-calibration__error" role="alert"><%= error %></p>
+  <% end %>
+  <% if active_verdict == "disagree" %>
+    <% existing_score = (calibration&.corrected_score || review&.ai_score)&.round %>
     <%= form_with url: run_response_calibrations_path(run, response_row),
                   method: :post, local: false,
                   class: "ck-calibration__detail" do |f| %>
       <%= hidden_field_tag :metric_id, metric.id %>
       <%= hidden_field_tag :verdict, "disagree" %>
-      <label class="ck-label">
-        What should the score have been?
-        <span class="ck-calibration__value" data-calibration-value><%= calibration.corrected_score || review&.ai_score || 3 %></span>
-      </label>
-      <input type="range" name="corrected_score" min="1" max="5" step="0.5"
-             value="<%= calibration.corrected_score || review&.ai_score || 3 %>"
-             oninput="this.closest('.ck-calibration__detail').querySelector('[data-calibration-value]').textContent = this.value"
-             class="ck-slider">
-      <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="Why? (optional)"><%= calibration.note %></textarea>
-      <%= f.submit "Save", class: ck_button_classes(:dark) %>
+      <p class="ck-label">What should the score have been?</p>
+      <fieldset class="ck-star-picker">
+        <legend class="ck-visually-hidden">Pick a score from 1 to 5 stars</legend>
+        <div class="ck-star-picker__row">
+          <% [5, 4, 3, 2, 1].each do |n| %>
+            <% radio_id = "ck-star-#{response_row.id}-#{metric.id}-#{n}" %>
+            <input type="radio" name="corrected_score" id="<%= radio_id %>" value="<%= n %>" <%= "checked" if existing_score == n %> required>
+            <label for="<%= radio_id %>" title="<%= pluralize(n, 'star') %>" aria-label="<%= pluralize(n, 'star') %>">
+              <svg viewBox="0 0 24 24" width="28" height="28" stroke-width="1.5" aria-hidden="true"><polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2"/></svg>
+            </label>
+          <% end %>
+        </div>
+      </fieldset>
+      <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="Why? (optional)"><%= calibration&.note %></textarea>
+      <%= f.submit "Save", class: "#{ck_button_classes(:dark)}#{' ck-button--just-saved' if local_assigns[:just_saved]}" %>
     <% end %>
-  <% elsif current_verdict == "borderline" %>
+  <% elsif active_verdict == "borderline" %>
     <%= form_with url: run_response_calibrations_path(run, response_row),
                   method: :post, local: false,
                   class: "ck-calibration__detail" do |f| %>
       <%= hidden_field_tag :metric_id, metric.id %>
       <%= hidden_field_tag :verdict, "borderline" %>
-      <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="What made this borderline? (optional)"><%= calibration.note %></textarea>
-      <%= f.submit "Save", class: ck_button_classes(:dark) %>
+      <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="What made this borderline? (optional)"><%= calibration&.note %></textarea>
+      <%= f.submit "Save", class: "#{ck_button_classes(:dark)}#{' ck-button--just-saved' if local_assigns[:just_saved]}" %>
     <% end %>
   <% end %>
 </div>

data/app/views/completion_kit/calibrations/_trust_panel.html.erb CHANGED Viewed

@@ -1,6 +1,6 @@
 <% stats = local_assigns[:stats] %>
 <div class="ck-trust-panel ck-trust-panel--<%= stats.gate %>">
-  <p class="ck-trust-panel__label" title="How often the judge agrees with the humans who reviewed its scores.">Judge trust</p>
+  <p class="ck-trust-panel__label" title="How often this metric's scores match the humans who reviewed them.">Trust level</p>
   <% if stats.counter_only? %>
     <div class="ck-trust-panel__body">
       <span class="ck-trust-panel__counter"><%= stats.sample_size %><span class="ck-trust-panel__counter-of">/ <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></span></span>

data/app/views/completion_kit/metrics/index.html.erb CHANGED Viewed

@@ -19,7 +19,7 @@
       <tr>
         <th scope="col">Name</th>
         <th scope="col">Instruction</th>
-        <th scope="col" title="How often the judge agrees with the humans who reviewed its scores.">Judge trust</th>
+        <th scope="col" title="How often this metric's scores match the humans who reviewed them.">Trust level</th>
         <th scope="col">In groups</th>
         <th scope="col"></th>
       </tr>
@@ -36,7 +36,7 @@
             <% end %>
           </td>
           <td data-label="Instruction" class="ck-meta-copy"><div class="ck-clamp-2"><%= metric.instruction.presence || "—" %></div></td>
-          <td data-label="Judge trust" class="ck-metrics-table__trust">
+          <td data-label="Trust level" class="ck-metrics-table__trust">
             <% if CompletionKit.config.judge_calibration_enabled %>
               <% s = CompletionKit::MetricCalibrationStats.for(metric) %>
               <% if s.counter_only? %>

data/app/views/completion_kit/metrics/show.html.erb CHANGED Viewed

@@ -9,24 +9,58 @@
     <% if CompletionKit.config.judge_calibration_enabled %>
       <%= render "completion_kit/calibrations/trust_panel",
             stats: CompletionKit::MetricCalibrationStats.for(@metric) %>
-      <% if @latest_draft %>
-        <div class="ck-draft-banner">
-          <span class="ck-chip ck-chip--soft">Draft pending</span>
-          <span class="ck-meta-copy">A draft version of this judge is saved. Publishing it replaces the live instruction and rubric.</span>
-          <%= button_to "Publish this version", publish_draft_metric_path(@metric, draft_id: @latest_draft.id),
-                method: :post, form_class: "inline-block",
-                class: ck_button_classes(:dark) %>
-        </div>
+      <% if @edit_draft %>
+        <% pub_instr = @published_judge_version&.instruction.to_s %>
+        <% draft_instr = @edit_draft.instruction.to_s %>
+        <% instruction_changed = pub_instr != draft_instr %>
+        <% rubric_changed = @published_judge_version && @published_judge_version.rubric_bands != @edit_draft.rubric_bands %>
+        <section class="ck-card ck-card--spaced ck-draft-pending">
+          <div class="ck-prompt-preview__header">
+            <p class="ck-kicker">Draft pending</p>
+            <%= button_to "Publish this version", publish_draft_metric_path(@metric, draft_id: @edit_draft.id),
+                  method: :post, form_class: "inline-block",
+                  class: ck_button_classes(:dark) %>
+          </div>
+          <p class="ck-meta-copy">A draft of this metric is saved. Publishing it replaces the live instruction<%= ", rubric," if rubric_changed %> for future runs. Here's what changes.</p>
+          <% if instruction_changed %>
+            <div class="ck-suggest-diff">
+              <div class="ck-suggest-diff__pane">
+                <div class="ck-suggest-diff__header">
+                  <span class="ck-suggest-diff__label ck-suggest-diff__label--before">Currently live</span>
+                </div>
+                <pre class="ck-suggest-diff__code"><%= ck_word_diff_old(pub_instr, draft_instr) %></pre>
+              </div>
+              <div class="ck-suggest-diff__pane">
+                <div class="ck-suggest-diff__header">
+                  <span class="ck-suggest-diff__label ck-suggest-diff__label--after">Draft</span>
+                </div>
+                <pre class="ck-suggest-diff__code"><%= ck_word_diff_new(pub_instr, draft_instr) %></pre>
+              </div>
+            </div>
+          <% else %>
+            <p class="ck-meta-copy">The instruction is unchanged.</p>
+          <% end %>
+          <% if rubric_changed %>
+            <p class="ck-meta-copy"><strong>Rubric also changed.</strong> Edit the metric to inspect each band, or publish to apply the new wording.</p>
+          <% end %>
+        </section>
       <% end %>
     <% end %>
   </div>
   <div class="ck-actions">
     <% if CompletionKit.config.judge_calibration_enabled %>
-      <%= button_to "Suggest rewrites", suggest_variants_metric_path(@metric),
-            method: :post, form_class: "inline-block",
-            class: ck_button_classes(:light, variant: :outline),
-            title: "Ask the model to rewrite this judge instruction based on the disagreements collected so far.",
-            data: { turbo_confirm: "Ask the model to rewrite this judge instruction based on the disagreements collected so far?" } %>
+      <% if @improve_disagreement_count.positive? %>
+        <%= button_to "Improve the metric", suggest_variants_metric_path(@metric),
+              method: :post, form_class: "inline-block",
+              class: ck_button_classes(:light, variant: :outline),
+              title: "Rewrite this metric based on the disagreements collected so far.",
+              data: { turbo_confirm: "Rewrite this metric based on the disagreements collected so far?" } %>
+      <% else %>
+        <button type="button" class="<%= ck_button_classes(:light, variant: :outline) %>" disabled
+                title="Mark at least one row as Disagree before the model can suggest a change.">Improve the metric</button>
+      <% end %>
     <% end %>
     <%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
   </div>
@@ -41,7 +75,7 @@
 <% if @metric.instruction.present? %>
 <section class="ck-card">
   <p class="ck-kicker">Instruction</p>
-  <div class="ck-note-box"><%= simple_format(@metric.instruction) %></div>
+  <%= simple_format(@metric.instruction, {}, class: "ck-copy") %>
 </section>
 <% end %>
@@ -63,6 +97,49 @@
   </div>
 </section>
+<% if CompletionKit.config.judge_calibration_enabled && @suggestion_draft %>
+  <% sd_current_instr = @published_judge_version&.instruction.to_s %>
+  <% sd_draft_instr = @suggestion_draft.instruction.to_s %>
+  <% sd_current_rubric = @published_judge_version&.rubric_bands || [] %>
+  <% sd_rubric_changed = @suggestion_draft.rubric_bands != sd_current_rubric %>
+  <section class="ck-card ck-card--spaced ck-draft-pending">
+    <div class="ck-prompt-preview__header">
+      <p class="ck-kicker">Suggested change</p>
+      <time class="ck-meta-copy" data-relative-time datetime="<%= @suggestion_draft.created_at.utc.iso8601 %>"><%= time_ago_in_words(@suggestion_draft.created_at) %> ago</time>
+    </div>
+    <p class="ck-meta-copy">Based on your disagreements, the model proposed this rewrite. Use it to replace the live version, or discard.</p>
+    <div class="ck-suggest-diff">
+      <div class="ck-suggest-diff__pane">
+        <div class="ck-suggest-diff__header">
+          <span class="ck-suggest-diff__label ck-suggest-diff__label--before">Currently live</span>
+        </div>
+        <pre class="ck-suggest-diff__code"><%= ck_word_diff_old(sd_current_instr, sd_draft_instr) %></pre>
+      </div>
+      <div class="ck-suggest-diff__pane">
+        <div class="ck-suggest-diff__header">
+          <span class="ck-suggest-diff__label ck-suggest-diff__label--after">Proposed</span>
+        </div>
+        <pre class="ck-suggest-diff__code"><%= ck_word_diff_new(sd_current_instr, sd_draft_instr) %></pre>
+      </div>
+    </div>
+    <% if sd_rubric_changed %>
+      <p class="ck-meta-copy"><strong>Rubric also changed.</strong> Publishing applies the new rubric too.</p>
+    <% end %>
+    <div class="ck-actions">
+      <%= button_to "Discard", dismiss_suggestion_metric_path(@metric, draft_id: @suggestion_draft.id),
+            method: :delete, form_class: "inline-block",
+            class: ck_button_classes(:light, variant: :outline),
+            data: { turbo_confirm: "Drop this suggestion?" } %>
+      <%= button_to "Use this version", publish_draft_metric_path(@metric, draft_id: @suggestion_draft.id),
+            method: :post, form_class: "inline-block",
+            class: ck_button_classes(:dark) %>
+    </div>
+  </section>
+<% end %>
 <% if CompletionKit.config.judge_calibration_enabled %>
   <section class="ck-card ck-card--spaced">
     <div class="ck-prompt-preview__header">
@@ -131,31 +208,6 @@
     <% end %>
   </section>
-  <% if @suggestion_drafts.any? %>
-    <section class="ck-card ck-card--spaced">
-      <div class="ck-prompt-preview__header">
-        <p class="ck-kicker">Suggested rewrites</p>
-        <span class="ck-chip"><%= @suggestion_drafts.size %> option<%= @suggestion_drafts.size == 1 ? "" : "s" %></span>
-      </div>
-      <p class="ck-meta-copy">The model wrote these alternate instructions based on the disagreements above. Pick one to make it the live judge — the previous version is kept in history.</p>
-      <div class="ck-suggestion-list">
-        <% @suggestion_drafts.each do |draft| %>
-          <article class="ck-suggestion-card">
-            <header class="ck-suggestion-card__header">
-              <span class="ck-chip ck-chip--soft">Option #<%= draft.id %></span>
-              <time class="ck-meta-copy" data-relative-time datetime="<%= draft.created_at.utc.iso8601 %>"><%= time_ago_in_words(draft.created_at) %> ago</time>
-            </header>
-            <pre class="ck-code ck-suggestion-card__instruction"><%= draft.instruction %></pre>
-            <div class="ck-actions">
-              <%= button_to "Use this version", publish_draft_metric_path(@metric, draft_id: draft.id),
-                    method: :post, form_class: "inline-block",
-                    class: ck_button_classes(:dark) %>
-            </div>
-          </article>
-        <% end %>
-      </div>
-    </section>
-  <% end %>
   <% if Array(@metric.few_shot_examples).any? %>
     <section class="ck-card ck-card--spaced">

data/app/views/completion_kit/responses/show.html.erb CHANGED Viewed

@@ -112,9 +112,7 @@
             </div>
           </div>
           <% if review.ai_feedback.present? %>
-            <div class="ck-review-card__feedback">
-              <div class="ck-note-box"><%= review.ai_feedback %></div>
-            </div>
+            <p class="ck-review-card__feedback"><%= review.ai_feedback %></p>
           <% end %>
           <% if CompletionKit.config.judge_calibration_enabled && review.metric && review.ai_score %>
             <% existing = CompletionKit::Calibration.find_by(

data/config/routes.rb CHANGED Viewed

@@ -17,6 +17,7 @@ CompletionKit::Engine.routes.draw do
       post :add_few_shot
       post :publish_draft
       post :suggest_variants
+      delete :dismiss_suggestion
     end
   end
   resources :metric_groups

data/lib/completion_kit/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module CompletionKit
-  VERSION = "0.5.38"
+  VERSION = "0.5.40"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: completion-kit
 version: !ruby/object:Gem::Version
-  version: 0.5.38
+  version: 0.5.40
 platform: ruby
 authors:
 - Damien Bastin
@@ -233,7 +233,7 @@ files:
 - app/assets/images/completion_kit/favicon.ico
 - app/assets/images/completion_kit/logo.png
 - app/assets/javascripts/completion_kit/application.js
-- app/assets/stylesheets/completion_kit/application.css.erb
+- app/assets/stylesheets/completion_kit/application.css
 - app/controllers/completion_kit/api/v1/base_controller.rb
 - app/controllers/completion_kit/api/v1/calibrations_controller.rb
 - app/controllers/completion_kit/api/v1/datasets_controller.rb