RubyGems - completion-kit - Versions diffs - 0.5.39 → 0.5.40 - Mend

completion-kit 0.5.39 → 0.5.40

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

checksums.yaml +4 -4
data/app/assets/stylesheets/completion_kit/application.css +128 -50
data/app/controllers/completion_kit/calibrations_controller.rb +4 -3
data/app/controllers/completion_kit/metrics_controller.rb +21 -6
data/app/helpers/completion_kit/application_helper.rb +73 -2
data/app/services/completion_kit/judge_variant_generator.rb +30 -12
data/app/services/completion_kit/mcp_tools/judges.rb +2 -4
data/app/services/completion_kit/onboarding/sample_data.rb +3 -3
data/app/views/completion_kit/calibrations/_buttons.html.erb +19 -14
data/app/views/completion_kit/calibrations/_trust_panel.html.erb +1 -1
data/app/views/completion_kit/metrics/index.html.erb +2 -2
data/app/views/completion_kit/metrics/show.html.erb +90 -38
data/app/views/completion_kit/responses/show.html.erb +1 -3
data/config/routes.rb +1 -0
data/lib/completion_kit/version.rb +1 -1
metadata +1 -1

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: d8d052b5ce9253412be890b820439248547d767575969e4260566a63426ac612
-  data.tar.gz: 8e2f73e59c977c1923b90c9b36fae7dd8eadd35d0c499ae04cea1d63113e7655
+  metadata.gz: d8875fa22de9c8626e401706818b271f3cb26bffa7faae1f583b29503228cead
+  data.tar.gz: 5bf39fad883b2eed2f505b11403ab60fddf9efc5735073ff61c290d53f59a36d
 SHA512:
-  metadata.gz: 54dd9bd2a4b2e64f929865508649ca2ada6972840715552b920b2bcc156b74cc76fe957b8ac58ec2f9ad7d8594dbe2ef15c600efb10304963b66b226cdee959b
-  data.tar.gz: 2db1e93c654e7d0de826a9f9c0ffadae292cf57d1dfff1df71763c5a98da4fc6d547560808bee9c8364f64c08c747661546a91f330d0effcda7b3587547d35e8
+  metadata.gz: a8f2a7f14235c1214b567b891defaf523a645a21a2409ed81df964893a260cccb1fc9bf63903794f952c91bc9c91f3c1e3850db751a08ce0edc49b360ad9642d
+  data.tar.gz: e148b500e498a00dc370bf203fea3e2618b9f1c8fccd1dc5f220ae77a4988a8934198ad32bbf25a29bc84f948c687fab9a0ae16b9c03fe575fe9fca4cf98a0ea

data/app/assets/stylesheets/completion_kit/application.css CHANGED Viewed

@@ -1541,6 +1541,25 @@ tr:hover .ck-chip--publish {
   border: 0;
   border-radius: 0;
   background: transparent;
+  white-space: pre;
+  color: #93c5fd;
+  font-size: 0.86rem;
+  line-height: 1.55;
+}
+.ck-code-scroll-wrap > .ck-code .ck-json-key {
+  color: #c4b5fd;
+}
+.ck-code-scroll-wrap > .ck-code .ck-json-string {
+  color: #93c5fd;
+}
+.ck-code-scroll-wrap > .ck-code .ck-json-number {
+  color: #fcd34d;
+}
+.ck-code-scroll-wrap > .ck-code .ck-json-keyword {
+  color: #f9a8d4;
+}
+.ck-code-scroll-wrap > .ck-code .ck-json-punct {
+  color: var(--ck-dim);
 }
 .ck-note-box {
@@ -2755,7 +2774,7 @@ select.ck-input {
   border: 1px solid var(--ck-line);
   border-radius: var(--ck-radius-lg);
   background: var(--ck-surface);
-  padding: 1.5rem;
+  padding: 1.25rem;
 }
 .ck-review-card__header {
@@ -2778,11 +2797,9 @@ select.ck-input {
 }
 .ck-review-card__feedback {
-  margin-top: 0.75rem;
-}
-.ck-review-card__feedback .ck-note-box {
-  margin-top: 0;
+  margin: 0.6rem 0 0;
+  color: var(--ck-muted);
+  line-height: 1.55;
 }
 @media (max-width: 900px) {
@@ -2819,6 +2836,14 @@ select.ck-input {
     width: 100%;
   }
+  /* button_to renders a form.inline-block wrapping the button. When the inner
+     button is a full .ck-button (not an icon-button or chip), the form should
+     stretch with it. */
+  form.inline-block:has(> .ck-button) {
+    width: 100%;
+    display: block;
+  }
   /* Page header stacks: title, then lead text full-width, then action. */
   .ck-page-header {
     flex-direction: column;
@@ -4584,9 +4609,8 @@ a.tag-mark {
 }
 .ck-launch__progress {
-  padding-bottom: 1.5rem;
-  margin-bottom: 0.5rem;
-  border-bottom: 1px solid var(--ck-line);
+  padding-bottom: 0;
+  margin-bottom: 1.25rem;
 }
 .ck-launch__progress-head {
   display: flex;
@@ -5151,22 +5175,37 @@ a.tag-mark {
   border-top: 1px dashed var(--ck-line);
 }
 .ck-calibration__prompt {
+  margin: 0 0 10px;
+  display: flex;
+  align-items: baseline;
+  flex-wrap: wrap;
+  gap: 8px 12px;
+}
+.ck-calibration__label {
   font-family: var(--ck-mono);
   font-size: 0.72rem;
   letter-spacing: 0.06em;
   text-transform: uppercase;
   color: var(--ck-dim);
-  margin: 0 0 10px;
-  display: flex;
-  align-items: center;
-  gap: 10px;
+  flex-shrink: 0;
 }
 .ck-calibration__count {
   font-family: var(--ck-mono);
   font-size: 0.72rem;
   letter-spacing: 0.03em;
   color: var(--ck-accent);
-  text-transform: none;
+}
+.ck-calibration__hint {
+  font-size: 0.82rem;
+  color: var(--ck-dim);
+  line-height: 1.4;
+}
+@media (max-width: 640px) {
+  .ck-calibration__prompt {
+    flex-direction: column;
+    align-items: flex-start;
+    gap: 4px;
+  }
 }
 .ck-calibration__buttons {
   display: flex;
@@ -5221,11 +5260,13 @@ a.tag-mark {
   margin-top: 12px;
   display: flex;
   flex-direction: column;
-  gap: 8px;
-  padding: 12px;
-  background: var(--ck-surface-soft);
-  border: 1px solid var(--ck-line);
-  border-radius: 6px;
+  gap: 12px;
+}
+.ck-calibration__detail > * {
+  margin: 0;
+}
+.ck-calibration__detail .ck-button {
+  align-self: flex-start;
 }
 .ck-calibration__value {
   color: var(--ck-accent);
@@ -5347,44 +5388,28 @@ a.tag-mark {
   text-transform: uppercase;
 }
-.ck-draft-banner {
+.ck-draft-pending {
+  border-color: rgba(6, 182, 212, 0.45);
+  background: linear-gradient(180deg, var(--ck-accent-soft), var(--ck-surface));
+}
+.ck-suggestion-banner {
   display: inline-flex;
   align-items: center;
   gap: 10px;
   margin-top: 10px;
-  padding: 8px 12px;
+  padding: 8px 14px;
   background: var(--ck-accent-soft);
-  border: 1px dashed rgba(6, 182, 212, 0.4);
-  border-radius: 6px;
-}
-.ck-suggestion-list {
-  display: flex;
-  flex-direction: column;
-  gap: 12px;
-}
-.ck-suggestion-card {
-  padding: 12px 14px;
-  background: var(--ck-surface-soft);
-  border: 1px solid var(--ck-line);
+  border: 1px solid rgba(6, 182, 212, 0.35);
   border-radius: 6px;
-  display: flex;
-  flex-direction: column;
-  gap: 10px;
-}
-.ck-suggestion-card__header {
-  display: flex;
-  align-items: center;
-  gap: 10px;
+  color: var(--ck-accent);
+  font-family: var(--ck-mono);
+  font-size: 0.82rem;
+  text-decoration: none;
 }
-.ck-suggestion-card__instruction {
-  margin: 0;
-  white-space: pre-wrap;
-  font-size: 0.85rem;
-  background: var(--ck-bg-strong);
-  padding: 10px 12px;
-  border-radius: 4px;
-  border: 1px solid var(--ck-line);
+.ck-suggestion-banner:hover,
+.ck-suggestion-banner:focus-visible {
+  border-color: var(--ck-accent);
 }
 .ck-metrics-table__trust {
@@ -5407,3 +5432,56 @@ a.tag-mark {
   color: var(--ck-danger);
   font-size: 0.82rem;
 }
+.ck-star-picker {
+  border: 0;
+  padding: 0;
+  margin: 0;
+}
+.ck-star-picker__row {
+  display: inline-flex;
+  flex-direction: row-reverse;
+  gap: 2px;
+}
+.ck-star-picker input {
+  position: absolute;
+  width: 1px;
+  height: 1px;
+  opacity: 0;
+  pointer-events: none;
+}
+.ck-star-picker label {
+  cursor: pointer;
+  display: inline-flex;
+  padding: 4px;
+  border-radius: 4px;
+}
+.ck-star-picker label svg {
+  fill: transparent;
+  stroke: var(--ck-line-strong);
+  transition: fill 0.08s, stroke 0.08s;
+}
+.ck-star-picker input:checked ~ label svg {
+  fill: var(--ck-warning);
+  stroke: var(--ck-warning);
+}
+.ck-star-picker__row:hover label svg {
+  fill: transparent;
+  stroke: var(--ck-line-strong);
+}
+.ck-star-picker__row:hover label:hover svg,
+.ck-star-picker__row:hover label:hover ~ label svg {
+  fill: var(--ck-warning);
+  stroke: var(--ck-warning);
+}
+.ck-star-picker input:focus-visible + label {
+  outline: 2px solid var(--ck-accent);
+  outline-offset: 2px;
+}
+.ck-button--just-saved {
+  animation: ck-saved-flash 1.4s ease-out;
+}
+@keyframes ck-saved-flash {
+  0% { background: var(--ck-success); border-color: var(--ck-success); }
+}

data/app/controllers/completion_kit/calibrations_controller.rb CHANGED Viewed

@@ -25,7 +25,7 @@ module CompletionKit
       )
       if calibration.save
-        render_calibration(calibration: calibration)
+        render_calibration(calibration: calibration, just_saved: true)
       else
         render_calibration(
           calibration: existing,
@@ -38,7 +38,7 @@ module CompletionKit
     private
-    def render_calibration(calibration:, pending_verdict: nil, error: nil, status: :ok)
+    def render_calibration(calibration:, pending_verdict: nil, error: nil, just_saved: false, status: :ok)
       locals = {
         review: review_for_metric,
         calibration: calibration,
@@ -46,7 +46,8 @@ module CompletionKit
         response_row: @response,
         metric: @metric,
         pending_verdict: pending_verdict,
-        error: error
+        error: error,
+        just_saved: just_saved
       }
       render turbo_stream: turbo_stream.replace(
         "calibration_#{@response.id}_#{@metric.id}",

data/app/controllers/completion_kit/metrics_controller.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 module CompletionKit
   class MetricsController < ApplicationController
     include CompletionKit::TagFiltering
-    before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :publish_draft, :suggest_variants]
+    before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :publish_draft, :suggest_variants, :dismiss_suggestion]
     def index
       @metrics = apply_tag_filter(Metric.includes(:metric_groups, :tags).order(:name))
@@ -12,8 +12,10 @@ module CompletionKit
                                   .includes(response: [:reviews, :run])
                                   .order(created_at: :desc)
                                   .limit(50)
-      @latest_draft = JudgeVersion.drafts.where(metric_id: @metric.id).order(created_at: :desc).first
-      @suggestion_drafts = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc)
+      @edit_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
+      @published_judge_version = JudgeVersion.published.where(metric_id: @metric.id, current: true).first
+      @suggestion_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
+      @improve_disagreement_count = @disagreements.size
     end
     def new
@@ -47,15 +49,28 @@ module CompletionKit
     end
     def suggest_variants
-      generator = JudgeVariantGenerator.new(@metric)
+      disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
+      if disagreement_count.zero?
+        redirect_to metric_path(@metric), alert: "Mark at least one row as Disagree before asking the model to suggest a change."
+        return
+      end
+      JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
+      generator = JudgeVariantGenerator.new(@metric, count: 1)
       variants = generator.call
       if variants.empty?
         redirect_to metric_path(@metric), alert: "The model returned no usable variants. Try again with a different model."
         return
       end
       generator.persist!(variants)
-      label = variants.length == 1 ? "alternative" : "alternatives"
-      redirect_to metric_path(@metric), notice: "Wrote #{variants.length} #{label} for this metric. Pick one to make it live."
+      redirect_to metric_path(@metric), notice: "Drafted a new version. Review it below."
+    end
+    def dismiss_suggestion
+      draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").find_by(id: params[:draft_id])
+      draft&.destroy
+      redirect_to metric_path(@metric), notice: "Dismissed."
     end
     def publish_draft

data/app/helpers/completion_kit/application_helper.rb CHANGED Viewed

@@ -202,15 +202,86 @@ module CompletionKit
     def ck_format_maybe_json(text)
       s = text.to_s
       return s if s.strip.empty?
-      first = s.strip[0]
+      payload = ck_unwrap_json_fence(s.strip)
+      first = payload[0]
       return s unless first == "{" || first == "["
       begin
-        JSON.pretty_generate(JSON.parse(s))
+        ck_highlight_json(JSON.pretty_generate(JSON.parse(payload)))
       rescue JSON::ParserError
         s
       end
     end
+    def ck_unwrap_json_fence(text)
+      m = text.match(/\A```(?:json|JSON)?\s*\n(.*?)\n?```\s*\z/m)
+      m ? m[1].strip : text
+    end
+    def ck_highlight_json(text)
+      tokens = ck_tokenize_json(text)
+      is_key = ck_mark_json_keys(tokens)
+      parts = tokens.each_with_index.map do |(type, value), idx|
+        escaped = ERB::Util.html_escape(value)
+        case type
+        when :punct then %(<span class="ck-json-punct">#{escaped}</span>)
+        when :string
+          %(<span class="#{is_key[idx] ? "ck-json-key" : "ck-json-string"}">#{escaped}</span>)
+        when :number then %(<span class="ck-json-number">#{escaped}</span>)
+        when :keyword then %(<span class="ck-json-keyword">#{escaped}</span>)
+        else escaped
+        end
+      end
+      parts.join.html_safe
+    end
+    def ck_tokenize_json(text)
+      tokens = []
+      i = 0
+      len = text.length
+      while i < len
+        ch = text[i]
+        if ch == " " || ch == "\n" || ch == "\t"
+          tokens << [:ws, ch]
+          i += 1
+        elsif "{}[]:,".include?(ch)
+          tokens << [:punct, ch]
+          i += 1
+        elsif ch == '"'
+          j = i + 1
+          while j < len && text[j] != '"'
+            j += text[j] == "\\" ? 2 : 1
+          end
+          j = len - 1 if j >= len
+          tokens << [:string, text[i..j]]
+          i = j + 1
+        elsif ch == "-" || (ch >= "0" && ch <= "9")
+          j = i + 1
+          j += 1 while j < len && "0123456789.eE+-".include?(text[j])
+          tokens << [:number, text[i...j]]
+          i = j
+        elsif text[i, 4] == "true" || text[i, 4] == "null"
+          tokens << [:keyword, text[i, 4]]
+          i += 4
+        elsif text[i, 5] == "false"
+          tokens << [:keyword, "false"]
+          i += 5
+        else
+          tokens << [:other, ch]
+          i += 1
+        end
+      end
+      tokens
+    end
+    def ck_mark_json_keys(tokens)
+      tokens.each_with_index.map do |(type, _), idx|
+        next false unless type == :string
+        j = idx + 1
+        j += 1 while j < tokens.length && tokens[j][0] == :ws
+        j < tokens.length && tokens[j] == [:punct, ":"]
+      end
+    end
     def tag_filter_url(base_path, selected, toggling)
       remaining = selected.reject { |t| t.id == toggling.id }
       next_set = selected.include?(toggling) ? remaining : remaining + [toggling]

data/app/services/completion_kit/judge_variant_generator.rb CHANGED Viewed

@@ -1,13 +1,15 @@
 module CompletionKit
   class JudgeVariantGenerator
-    DEFAULT_VARIANT_COUNT = 3
+    DEFAULT_VARIANT_COUNT = 1
+    MAX_VARIANT_COUNT = 3
     DEFAULT_TEMPERATURE = 0.4
-    Variant = Struct.new(:reasoning, :instruction, keyword_init: true)
+    Variant = Struct.new(:reasoning, :instruction, :rubric_bands, keyword_init: true)
     def initialize(metric, count: DEFAULT_VARIANT_COUNT, model: nil)
       @metric = metric
-      @count = count
+      n = count.to_i
+      @count = n < 1 ? DEFAULT_VARIANT_COUNT : [n, MAX_VARIANT_COUNT].min
       @model = model || CompletionKit.config.judge_model
     end
@@ -23,7 +25,7 @@ module CompletionKit
         JudgeVersion.create!(
           metric: @metric,
           instruction: variant.instruction,
-          rubric_bands: @metric.rubric_bands,
+          rubric_bands: variant.rubric_bands.presence || @metric.rubric_bands,
           state: "draft",
           source: "suggestion",
           current: false
@@ -42,14 +44,14 @@ module CompletionKit
       disagreements = JudgeCalibrationExamples.disagreements_for(@metric)
       borderlines = JudgeCalibrationExamples.borderlines_for(@metric)
       sections = []
-      sections << "You are an expert evaluator. Rewrite a judge's grading instruction so it agrees better with humans on the cases below."
+      sections << "You are an expert evaluator. The judge below is misaligned with humans. Propose #{@count == 1 ? "a single" : "#{@count}"} concrete rewrite that closes the gap."
       sections << ""
       sections << "## Current instruction"
       sections << "```"
       sections << @metric.instruction.to_s
       sections << "```"
       sections << ""
-      sections << "## Rubric (unchanged across variants — only rewrite the instruction)"
+      sections << "## Current rubric (5 to 1)"
       sections << @metric.display_rubric_text
       sections << ""
       if disagreements.any?
@@ -65,7 +67,7 @@ module CompletionKit
       end
       if borderlines.any?
         sections << "## Rubric-ambiguous cases (humans marked these borderline)"
-        sections << "Each case below is one where a human said the rubric was unclear. Use these to sharpen language, split overlapping bands, or call out edge cases explicitly."
+        sections << "These are cases where a human said the rubric itself was unclear. If the rubric needs sharpening, rewrite it."
         borderlines.each_with_index do |ex, i|
           sections << "### Borderline #{i + 1}"
           sections << "Input: #{ex[:input].to_s.truncate(200)}"
@@ -76,14 +78,20 @@ module CompletionKit
         end
       end
       sections << "## Task"
-      sections << "Propose #{@count} alternative instructions. Each should be a focused rewrite — not a wholesale rewrite of the rubric. Close the disagreement gap and disambiguate the borderline cases."
+      sections << "Make one substantive change. Don't just reword. If the disagreements look like instruction problems, rewrite the instruction. If they look like rubric problems (overlapping bands, undefined edge cases), rewrite the rubric. Rewrite both if both are wrong."
       sections << ""
-      sections << "Respond in EXACTLY this format, repeated #{@count} times:"
+      sections << "Respond in EXACTLY this format, repeated #{@count} time#{@count == 1 ? "" : "s"}:"
       sections << ""
       sections << "VARIANT:"
-      sections << "REASONING: <one sentence explaining what this variant changes>"
+      sections << "REASONING: <one short sentence: what changes and why>"
       sections << "INSTRUCTION:"
       sections << "<the rewritten instruction>"
+      sections << "RUBRIC:                  # optional — omit this block if the rubric is unchanged"
+      sections << "5: <description for 5 stars>"
+      sections << "4: <description for 4 stars>"
+      sections << "3: <description for 3 stars>"
+      sections << "2: <description for 2 stars>"
+      sections << "1: <description for 1 star>"
       sections << "END_VARIANT"
       sections.join("\n")
     end
@@ -92,11 +100,21 @@ module CompletionKit
       blocks = text.to_s.scan(/VARIANT:(.*?)END_VARIANT/m).flatten
       blocks.filter_map do |raw|
         reasoning = raw[/REASONING:\s*(.*?)(?=INSTRUCTION:|\z)/m, 1].to_s.strip
-        instruction = raw[/INSTRUCTION:\s*(.*)/m, 1].to_s.strip
+        instruction = raw[/INSTRUCTION:\s*(.*?)(?=RUBRIC:|\z)/m, 1].to_s.strip
         next if instruction.empty?
-        Variant.new(reasoning: reasoning, instruction: instruction)
+        rubric_block = raw[/RUBRIC:\s*(.*)/m, 1].to_s
+        Variant.new(reasoning: reasoning, instruction: instruction, rubric_bands: parse_rubric(rubric_block))
       end
     end
+    def parse_rubric(block)
+      return nil if block.strip.empty?
+      bands = block.scan(/^\s*([1-5])\s*[:\-]\s*(.+?)\s*$/).map do |stars, description|
+        { "stars" => stars.to_i, "description" => description.strip }
+      end
+      return nil if bands.length != 5
+      bands.sort_by { |b| -b["stars"] }
+    end
   end
   module JudgeCalibrationExamples

data/app/services/completion_kit/mcp_tools/judges.rb CHANGED Viewed

@@ -10,7 +10,7 @@ module CompletionKit
             type: "object",
             properties: {
               metric_id: { type: "integer" },
-              count: { type: "integer", description: "How many variants to request (default 3, max 5)." },
+              count: { type: "integer", description: "How many variants to request (default 1, max 3). One focused rewrite beats five reworded copies." },
               model: { type: "string", description: "Override the model used to generate variants. Defaults to CompletionKit.config.judge_model." }
             },
             required: ["metric_id"]
@@ -49,9 +49,7 @@ module CompletionKit
       def self.suggest(args)
         metric = CompletionKit::Metric.find(args["metric_id"])
-        count = [args["count"].to_i, 5].min
-        count = CompletionKit::JudgeVariantGenerator::DEFAULT_VARIANT_COUNT if count <= 0
-        generator = CompletionKit::JudgeVariantGenerator.new(metric, count: count, model: args["model"])
+        generator = CompletionKit::JudgeVariantGenerator.new(metric, count: args["count"].to_i, model: args["model"])
         variants = generator.call
         return error_result("Variant generator returned no parseable variants. Try again or change the model.") if variants.empty?
         versions = generator.persist!(variants)

data/app/services/completion_kit/onboarding/sample_data.rb CHANGED Viewed

@@ -1,20 +1,20 @@
 module CompletionKit
   module Onboarding
     # Opt-in starter data for the onboarding page: one dataset + one prompt so a
-    # brand-new install has something to poke at. Idempotent — a no-op once the
+    # brand-new install has something to poke at. Idempotent. A no-op once the
     # workspace already has any prompt or dataset. Deliberately does NOT create a
     # provider credential (needs a real API key) or a run (user-initiated).
     module SampleData
       SAMPLE_CSV = <<~CSV.freeze
         ticket
         "My order #4827 arrived with a dented panel. I emailed photos 11 days ago and heard nothing. Today I was told the return window 'closed'. I paid $749. I want a refund or replacement, not store credit."
-        "Tracking says delivered to my porch Tuesday 3:47pm. I was home all day, nothing arrived, neighbours' cameras show no van. Order #5102 — a $315 mixer, wedding gift, wedding is Saturday. Can someone look today?"
+        "Tracking says delivered to my porch Tuesday 3:47pm. I was home all day, nothing arrived, neighbours' cameras show no van. Order #5102. A $315 mixer, wedding gift, wedding is Saturday. Can someone look today?"
         "WELCOME20 says 'invalid' at checkout but the promo email says it's good through May 31. Same email I'm signed in with. Tried Chrome and Safari. Cart is $186 waiting on you."
       CSV
       SAMPLE_PROMPT = {
         name: "Sample: Support reply",
-        description: "A starter prompt — drafts a warm, professional reply to a customer support ticket. Edit it or delete it; it's just here to get you going.",
+        description: "A starter prompt. Drafts a warm, professional reply to a customer support ticket. Edit it or delete it; it's just here to get you going.",
         template: "You are a senior customer-support specialist. Write a warm, professional reply to this ticket. Acknowledge the customer's situation, be specific about next steps, and don't be defensive.\n\nTicket:\n{{ticket}}",
         llm_model: "gpt-4o-mini"
       }.freeze

data/app/views/completion_kit/calibrations/_buttons.html.erb CHANGED Viewed

@@ -5,11 +5,11 @@
   <% error = local_assigns[:error] %>
   <% verdict_count = CompletionKit::Calibration.where(response_id: response_row.id, metric_id: metric.id).count %>
   <p class="ck-calibration__prompt">
-    Your verdict
+    <span class="ck-calibration__label">Your verdict</span>
     <% if verdict_count > 0 %>
-      <span class="ck-calibration__count"><%= pluralize(verdict_count, "verdict") %> on this score · <%= link_to "trust score →", metric_path(metric), class: "ck-link" %></span>
+      <span class="ck-calibration__count"><%= pluralize(verdict_count, "verdict") %> on this score · <%= link_to "trust level →", metric_path(metric), class: "ck-link" %></span>
     <% else %>
-      <span class="ck-calibration__count">Tell us what you think — was the score right? Verdicts roll up into the metric's <%= link_to "trust score", metric_path(metric), class: "ck-link" %>.</span>
+      <span class="ck-calibration__hint">Tell us what you think — was the score right? Verdicts roll up into the metric's <%= link_to "trust level", metric_path(metric), class: "ck-link" %>.</span>
     <% end %>
   </p>
   <div class="ck-calibration__buttons">
@@ -37,22 +37,27 @@
   <% end %>
   <% if active_verdict == "disagree" %>
+    <% existing_score = (calibration&.corrected_score || review&.ai_score)&.round %>
     <%= form_with url: run_response_calibrations_path(run, response_row),
                   method: :post, local: false,
                   class: "ck-calibration__detail" do |f| %>
       <%= hidden_field_tag :metric_id, metric.id %>
       <%= hidden_field_tag :verdict, "disagree" %>
-      <label class="ck-label">
-        What should the score have been?
-        <span class="ck-calibration__value" data-calibration-value><%= calibration&.corrected_score || review&.ai_score || 3 %></span>
-      </label>
-      <input type="range" name="corrected_score" min="1" max="5" step="0.5"
-             value="<%= calibration&.corrected_score || review&.ai_score || 3 %>"
-             oninput="this.closest('.ck-calibration__detail').querySelector('[data-calibration-value]').textContent = this.value"
-             class="ck-slider"
-             required>
+      <p class="ck-label">What should the score have been?</p>
+      <fieldset class="ck-star-picker">
+        <legend class="ck-visually-hidden">Pick a score from 1 to 5 stars</legend>
+        <div class="ck-star-picker__row">
+          <% [5, 4, 3, 2, 1].each do |n| %>
+            <% radio_id = "ck-star-#{response_row.id}-#{metric.id}-#{n}" %>
+            <input type="radio" name="corrected_score" id="<%= radio_id %>" value="<%= n %>" <%= "checked" if existing_score == n %> required>
+            <label for="<%= radio_id %>" title="<%= pluralize(n, 'star') %>" aria-label="<%= pluralize(n, 'star') %>">
+              <svg viewBox="0 0 24 24" width="28" height="28" stroke-width="1.5" aria-hidden="true"><polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2"/></svg>
+            </label>
+          <% end %>
+        </div>
+      </fieldset>
       <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="Why? (optional)"><%= calibration&.note %></textarea>
-      <%= f.submit (current_verdict == "disagree" ? "Update" : "Save disagree"), class: ck_button_classes(:dark) %>
+      <%= f.submit "Save", class: "#{ck_button_classes(:dark)}#{' ck-button--just-saved' if local_assigns[:just_saved]}" %>
     <% end %>
   <% elsif active_verdict == "borderline" %>
     <%= form_with url: run_response_calibrations_path(run, response_row),
@@ -61,7 +66,7 @@
       <%= hidden_field_tag :metric_id, metric.id %>
       <%= hidden_field_tag :verdict, "borderline" %>
       <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="What made this borderline? (optional)"><%= calibration&.note %></textarea>
-      <%= f.submit (current_verdict == "borderline" ? "Update" : "Save"), class: ck_button_classes(:dark) %>
+      <%= f.submit "Save", class: "#{ck_button_classes(:dark)}#{' ck-button--just-saved' if local_assigns[:just_saved]}" %>
     <% end %>
   <% end %>
 </div>

data/app/views/completion_kit/calibrations/_trust_panel.html.erb CHANGED Viewed

@@ -1,6 +1,6 @@
 <% stats = local_assigns[:stats] %>
 <div class="ck-trust-panel ck-trust-panel--<%= stats.gate %>">
-  <p class="ck-trust-panel__label" title="How often this metric's scores match the humans who reviewed them.">Trust score</p>
+  <p class="ck-trust-panel__label" title="How often this metric's scores match the humans who reviewed them.">Trust level</p>
   <% if stats.counter_only? %>
     <div class="ck-trust-panel__body">
       <span class="ck-trust-panel__counter"><%= stats.sample_size %><span class="ck-trust-panel__counter-of">/ <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></span></span>

data/app/views/completion_kit/metrics/index.html.erb CHANGED Viewed

@@ -19,7 +19,7 @@
       <tr>
         <th scope="col">Name</th>
         <th scope="col">Instruction</th>
-        <th scope="col" title="How often this metric's scores match the humans who reviewed them.">Trust score</th>
+        <th scope="col" title="How often this metric's scores match the humans who reviewed them.">Trust level</th>
         <th scope="col">In groups</th>
         <th scope="col"></th>
       </tr>
@@ -36,7 +36,7 @@
             <% end %>
           </td>
           <td data-label="Instruction" class="ck-meta-copy"><div class="ck-clamp-2"><%= metric.instruction.presence || "—" %></div></td>
-          <td data-label="Trust score" class="ck-metrics-table__trust">
+          <td data-label="Trust level" class="ck-metrics-table__trust">
             <% if CompletionKit.config.judge_calibration_enabled %>
               <% s = CompletionKit::MetricCalibrationStats.for(metric) %>
               <% if s.counter_only? %>

data/app/views/completion_kit/metrics/show.html.erb CHANGED Viewed

@@ -9,24 +9,58 @@
     <% if CompletionKit.config.judge_calibration_enabled %>
       <%= render "completion_kit/calibrations/trust_panel",
             stats: CompletionKit::MetricCalibrationStats.for(@metric) %>
-      <% if @latest_draft %>
-        <div class="ck-draft-banner">
-          <span class="ck-chip ck-chip--soft">Draft pending</span>
-          <span class="ck-meta-copy">A draft version of this judge is saved. Publishing it replaces the live instruction and rubric.</span>
-          <%= button_to "Publish this version", publish_draft_metric_path(@metric, draft_id: @latest_draft.id),
-                method: :post, form_class: "inline-block",
-                class: ck_button_classes(:dark) %>
-        </div>
+      <% if @edit_draft %>
+        <% pub_instr = @published_judge_version&.instruction.to_s %>
+        <% draft_instr = @edit_draft.instruction.to_s %>
+        <% instruction_changed = pub_instr != draft_instr %>
+        <% rubric_changed = @published_judge_version && @published_judge_version.rubric_bands != @edit_draft.rubric_bands %>
+        <section class="ck-card ck-card--spaced ck-draft-pending">
+          <div class="ck-prompt-preview__header">
+            <p class="ck-kicker">Draft pending</p>
+            <%= button_to "Publish this version", publish_draft_metric_path(@metric, draft_id: @edit_draft.id),
+                  method: :post, form_class: "inline-block",
+                  class: ck_button_classes(:dark) %>
+          </div>
+          <p class="ck-meta-copy">A draft of this metric is saved. Publishing it replaces the live instruction<%= ", rubric," if rubric_changed %> for future runs. Here's what changes.</p>
+          <% if instruction_changed %>
+            <div class="ck-suggest-diff">
+              <div class="ck-suggest-diff__pane">
+                <div class="ck-suggest-diff__header">
+                  <span class="ck-suggest-diff__label ck-suggest-diff__label--before">Currently live</span>
+                </div>
+                <pre class="ck-suggest-diff__code"><%= ck_word_diff_old(pub_instr, draft_instr) %></pre>
+              </div>
+              <div class="ck-suggest-diff__pane">
+                <div class="ck-suggest-diff__header">
+                  <span class="ck-suggest-diff__label ck-suggest-diff__label--after">Draft</span>
+                </div>
+                <pre class="ck-suggest-diff__code"><%= ck_word_diff_new(pub_instr, draft_instr) %></pre>
+              </div>
+            </div>
+          <% else %>
+            <p class="ck-meta-copy">The instruction is unchanged.</p>
+          <% end %>
+          <% if rubric_changed %>
+            <p class="ck-meta-copy"><strong>Rubric also changed.</strong> Edit the metric to inspect each band, or publish to apply the new wording.</p>
+          <% end %>
+        </section>
       <% end %>
     <% end %>
   </div>
   <div class="ck-actions">
     <% if CompletionKit.config.judge_calibration_enabled %>
-      <%= button_to "Improve the metric", suggest_variants_metric_path(@metric),
-            method: :post, form_class: "inline-block",
-            class: ck_button_classes(:light, variant: :outline),
-            title: "Ask the model to rewrite this metric's instruction based on the disagreements and rubric-ambiguous cases collected so far.",
-            data: { turbo_confirm: "Ask the model to rewrite this metric's instruction based on the disagreements and rubric-ambiguous cases collected so far?" } %>
+      <% if @improve_disagreement_count.positive? %>
+        <%= button_to "Improve the metric", suggest_variants_metric_path(@metric),
+              method: :post, form_class: "inline-block",
+              class: ck_button_classes(:light, variant: :outline),
+              title: "Rewrite this metric based on the disagreements collected so far.",
+              data: { turbo_confirm: "Rewrite this metric based on the disagreements collected so far?" } %>
+      <% else %>
+        <button type="button" class="<%= ck_button_classes(:light, variant: :outline) %>" disabled
+                title="Mark at least one row as Disagree before the model can suggest a change.">Improve the metric</button>
+      <% end %>
     <% end %>
     <%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
   </div>
@@ -63,6 +97,49 @@
   </div>
 </section>
+<% if CompletionKit.config.judge_calibration_enabled && @suggestion_draft %>
+  <% sd_current_instr = @published_judge_version&.instruction.to_s %>
+  <% sd_draft_instr = @suggestion_draft.instruction.to_s %>
+  <% sd_current_rubric = @published_judge_version&.rubric_bands || [] %>
+  <% sd_rubric_changed = @suggestion_draft.rubric_bands != sd_current_rubric %>
+  <section class="ck-card ck-card--spaced ck-draft-pending">
+    <div class="ck-prompt-preview__header">
+      <p class="ck-kicker">Suggested change</p>
+      <time class="ck-meta-copy" data-relative-time datetime="<%= @suggestion_draft.created_at.utc.iso8601 %>"><%= time_ago_in_words(@suggestion_draft.created_at) %> ago</time>
+    </div>
+    <p class="ck-meta-copy">Based on your disagreements, the model proposed this rewrite. Use it to replace the live version, or discard.</p>
+    <div class="ck-suggest-diff">
+      <div class="ck-suggest-diff__pane">
+        <div class="ck-suggest-diff__header">
+          <span class="ck-suggest-diff__label ck-suggest-diff__label--before">Currently live</span>
+        </div>
+        <pre class="ck-suggest-diff__code"><%= ck_word_diff_old(sd_current_instr, sd_draft_instr) %></pre>
+      </div>
+      <div class="ck-suggest-diff__pane">
+        <div class="ck-suggest-diff__header">
+          <span class="ck-suggest-diff__label ck-suggest-diff__label--after">Proposed</span>
+        </div>
+        <pre class="ck-suggest-diff__code"><%= ck_word_diff_new(sd_current_instr, sd_draft_instr) %></pre>
+      </div>
+    </div>
+    <% if sd_rubric_changed %>
+      <p class="ck-meta-copy"><strong>Rubric also changed.</strong> Publishing applies the new rubric too.</p>
+    <% end %>
+    <div class="ck-actions">
+      <%= button_to "Discard", dismiss_suggestion_metric_path(@metric, draft_id: @suggestion_draft.id),
+            method: :delete, form_class: "inline-block",
+            class: ck_button_classes(:light, variant: :outline),
+            data: { turbo_confirm: "Drop this suggestion?" } %>
+      <%= button_to "Use this version", publish_draft_metric_path(@metric, draft_id: @suggestion_draft.id),
+            method: :post, form_class: "inline-block",
+            class: ck_button_classes(:dark) %>
+    </div>
+  </section>
+<% end %>
 <% if CompletionKit.config.judge_calibration_enabled %>
   <section class="ck-card ck-card--spaced">
     <div class="ck-prompt-preview__header">
@@ -131,31 +208,6 @@
     <% end %>
   </section>
-  <% if @suggestion_drafts.any? %>
-    <section class="ck-card ck-card--spaced">
-      <div class="ck-prompt-preview__header">
-        <p class="ck-kicker">Suggested improvements</p>
-        <span class="ck-chip"><%= @suggestion_drafts.size %> option<%= @suggestion_drafts.size == 1 ? "" : "s" %></span>
-      </div>
-      <p class="ck-meta-copy">Based on your verdicts, the model proposed these alternative instructions for this metric. Pick one to make it live — the previous version stays in history.</p>
-      <div class="ck-suggestion-list">
-        <% @suggestion_drafts.each do |draft| %>
-          <article class="ck-suggestion-card">
-            <header class="ck-suggestion-card__header">
-              <span class="ck-chip ck-chip--soft">Option #<%= draft.id %></span>
-              <time class="ck-meta-copy" data-relative-time datetime="<%= draft.created_at.utc.iso8601 %>"><%= time_ago_in_words(draft.created_at) %> ago</time>
-            </header>
-            <pre class="ck-code ck-suggestion-card__instruction"><%= draft.instruction %></pre>
-            <div class="ck-actions">
-              <%= button_to "Use this version", publish_draft_metric_path(@metric, draft_id: draft.id),
-                    method: :post, form_class: "inline-block",
-                    class: ck_button_classes(:dark) %>
-            </div>
-          </article>
-        <% end %>
-      </div>
-    </section>
-  <% end %>
   <% if Array(@metric.few_shot_examples).any? %>
     <section class="ck-card ck-card--spaced">

data/app/views/completion_kit/responses/show.html.erb CHANGED Viewed

@@ -112,9 +112,7 @@
             </div>
           </div>
           <% if review.ai_feedback.present? %>
-            <div class="ck-review-card__feedback">
-              <div class="ck-note-box"><%= review.ai_feedback %></div>
-            </div>
+            <p class="ck-review-card__feedback"><%= review.ai_feedback %></p>
           <% end %>
           <% if CompletionKit.config.judge_calibration_enabled && review.metric && review.ai_score %>
             <% existing = CompletionKit::Calibration.find_by(

data/config/routes.rb CHANGED Viewed

@@ -17,6 +17,7 @@ CompletionKit::Engine.routes.draw do
       post :add_few_shot
       post :publish_draft
       post :suggest_variants
+      delete :dismiss_suggestion
     end
   end
   resources :metric_groups

data/lib/completion_kit/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module CompletionKit
-  VERSION = "0.5.39"
+  VERSION = "0.5.40"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: completion-kit
 version: !ruby/object:Gem::Version
-  version: 0.5.39
+  version: 0.5.40
 platform: ruby
 authors:
 - Damien Bastin