RubyGems - completion-kit - Versions diffs - 0.5.38 → 0.5.39 - Mend

completion-kit 0.5.38 → 0.5.39

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: fc7d527828189c2993060b315dca634fb958d2da11fd7fae63c4790179c46701
-  data.tar.gz: f0323b980bdfb35d36742b548ddd3629e66d39e587775521678dc80b4cd2f068
+  metadata.gz: d8d052b5ce9253412be890b820439248547d767575969e4260566a63426ac612
+  data.tar.gz: 8e2f73e59c977c1923b90c9b36fae7dd8eadd35d0c499ae04cea1d63113e7655
 SHA512:
-  metadata.gz: '020946bdac698194bb5246cfbe21fdf45c56006c80c15d1d7bcfda4d3494d95cde45645e090df14a411b172c83dcde42be777d74811b25a340e5710dba6ae7ce'
-  data.tar.gz: 1b4f0ea8cf4e613df783ac428404ef1ae19b285db04f8a6768119760c65fb81d9ee72d52905ab2ef30e077ae910eb00e622925c1dcc43aee0c9e3a0f748718b1
+  metadata.gz: 54dd9bd2a4b2e64f929865508649ca2ada6972840715552b920b2bcc156b74cc76fe957b8ac58ec2f9ad7d8594dbe2ef15c600efb10304963b66b226cdee959b
+  data.tar.gz: 2db1e93c654e7d0de826a9f9c0ffadae292cf57d1dfff1df71763c5a98da4fc6d547560808bee9c8364f64c08c747661546a91f330d0effcda7b3587547d35e8

data/app/assets/stylesheets/completion_kit/{application.css.erb → application.css} RENAMED Viewed

@@ -1,12 +1,26 @@
-<% %w[400 500 700].each do |weight| %>
 @font-face {
   font-family: 'JetBrains Mono';
   font-style: normal;
-  font-weight: <%= weight %>;
+  font-weight: 400;
+  font-display: swap;
+  src: url('completion_kit/jetbrains-mono-400.woff2') format('woff2');
+}
+@font-face {
+  font-family: 'JetBrains Mono';
+  font-style: normal;
+  font-weight: 500;
+  font-display: swap;
+  src: url('completion_kit/jetbrains-mono-500.woff2') format('woff2');
+}
+@font-face {
+  font-family: 'JetBrains Mono';
+  font-style: normal;
+  font-weight: 700;
   font-display: swap;
-  src: url('<%= asset_path("completion_kit/jetbrains-mono-#{weight}.woff2") %>') format('woff2');
+  src: url('completion_kit/jetbrains-mono-700.woff2') format('woff2');
 }
-<% end %>
 .turbo-progress-bar {
   background-color: var(--ck-accent);
@@ -2751,6 +2765,11 @@ select.ck-input {
   gap: 1rem;
 }
+.ck-review-card__header .ck-inline {
+  flex-wrap: nowrap;
+  flex-shrink: 0;
+}
 .ck-review-card__metric {
   font-family: var(--ck-mono);
   font-size: 0.95rem;
@@ -2827,6 +2846,12 @@ select.ck-input {
     padding: 1rem;
   }
+  .ck-review-card__header {
+    flex-direction: column;
+    align-items: flex-start;
+    gap: 0.5rem;
+  }
   /* Topbar nav collapses behind the hamburger trigger. */
   .ck-nav-menu__trigger {
     display: inline-flex;
@@ -5372,3 +5397,13 @@ a.tag-mark {
   color: var(--ck-success);
   margin-right: 6px;
 }
+.ck-calibration__error {
+  margin: 8px 0 0;
+  padding: 8px 10px;
+  background: var(--ck-danger-soft);
+  border: 1px solid rgba(248, 113, 113, 0.3);
+  border-radius: 4px;
+  color: var(--ck-danger);
+  font-size: 0.82rem;
+}

data/app/controllers/completion_kit/calibrations_controller.rb CHANGED Viewed

@@ -5,9 +5,18 @@ module CompletionKit
     def create
       created_by = calibration_creator
-      calibration = Calibration.find_or_initialize_by(
+      existing = Calibration.find_by(
         run_id: @run.id, response_id: @response.id, metric_id: @metric.id, created_by: created_by
       )
+      if params[:verdict] == "disagree" && params[:corrected_score].blank?
+        render_calibration(calibration: existing, pending_verdict: "disagree")
+        return
+      end
+      calibration = existing || Calibration.new(
+        run: @run, response: @response, metric: @metric, created_by: created_by
+      )
       calibration.assign_attributes(
         judge_version: JudgeVersion.ensure_current_for(@metric),
         verdict: params[:verdict],
@@ -16,19 +25,36 @@ module CompletionKit
       )
       if calibration.save
-        render turbo_stream: turbo_stream.replace(
-          "calibration_#{@response.id}_#{@metric.id}",
-          partial: "completion_kit/calibrations/buttons",
-          locals: { review: review_for_metric, calibration: calibration, run: @run, response_row: @response, metric: @metric }
-        )
+        render_calibration(calibration: calibration)
       else
-        flash[:alert] = calibration.errors.full_messages.to_sentence
-        redirect_to run_response_path(@run, @response)
+        render_calibration(
+          calibration: existing,
+          pending_verdict: params[:verdict],
+          error: calibration.errors.full_messages.to_sentence,
+          status: :unprocessable_entity
+        )
       end
     end
     private
+    def render_calibration(calibration:, pending_verdict: nil, error: nil, status: :ok)
+      locals = {
+        review: review_for_metric,
+        calibration: calibration,
+        run: @run,
+        response_row: @response,
+        metric: @metric,
+        pending_verdict: pending_verdict,
+        error: error
+      }
+      render turbo_stream: turbo_stream.replace(
+        "calibration_#{@response.id}_#{@metric.id}",
+        partial: "completion_kit/calibrations/buttons",
+        locals: locals
+      ), status: status
+    end
     def ensure_calibration_enabled
       head :not_found unless CompletionKit.config.judge_calibration_enabled
     end

data/app/controllers/completion_kit/metrics_controller.rb CHANGED Viewed

@@ -55,7 +55,7 @@ module CompletionKit
       end
       generator.persist!(variants)
       label = variants.length == 1 ? "alternative" : "alternatives"
-      redirect_to metric_path(@metric), notice: "Wrote #{variants.length} #{label} for the judge instruction. Pick one to make it live."
+      redirect_to metric_path(@metric), notice: "Wrote #{variants.length} #{label} for this metric. Pick one to make it live."
     end
     def publish_draft

data/app/services/completion_kit/judge_variant_generator.rb CHANGED Viewed

@@ -39,7 +39,8 @@ module CompletionKit
     private
     def build_meta_prompt
-      examples = JudgeCalibrationExamples.for(@metric)
+      disagreements = JudgeCalibrationExamples.disagreements_for(@metric)
+      borderlines = JudgeCalibrationExamples.borderlines_for(@metric)
       sections = []
       sections << "You are an expert evaluator. Rewrite a judge's grading instruction so it agrees better with humans on the cases below."
       sections << ""
@@ -51,17 +52,31 @@ module CompletionKit
       sections << "## Rubric (unchanged across variants — only rewrite the instruction)"
       sections << @metric.display_rubric_text
       sections << ""
-      sections << "## Recent disagreements (judge vs human)"
-      examples.each_with_index do |ex, i|
-        sections << "### Case #{i + 1}"
-        sections << "Input: #{ex[:input].to_s.truncate(200)}"
-        sections << "Output: #{ex[:output].to_s.truncate(200)}"
-        sections << "Judge said #{ex[:judge_score]}/5: #{ex[:judge_feedback].to_s.truncate(160)}"
-        sections << "Human said #{ex[:human_score]}/5: #{ex[:human_note].to_s.truncate(160)}"
-        sections << ""
+      if disagreements.any?
+        sections << "## Recent disagreements (judge vs human)"
+        disagreements.each_with_index do |ex, i|
+          sections << "### Case #{i + 1}"
+          sections << "Input: #{ex[:input].to_s.truncate(200)}"
+          sections << "Output: #{ex[:output].to_s.truncate(200)}"
+          sections << "Judge said #{ex[:judge_score]}/5: #{ex[:judge_feedback].to_s.truncate(160)}"
+          sections << "Human said #{ex[:human_score]}/5: #{ex[:human_note].to_s.truncate(160)}"
+          sections << ""
+        end
+      end
+      if borderlines.any?
+        sections << "## Rubric-ambiguous cases (humans marked these borderline)"
+        sections << "Each case below is one where a human said the rubric was unclear. Use these to sharpen language, split overlapping bands, or call out edge cases explicitly."
+        borderlines.each_with_index do |ex, i|
+          sections << "### Borderline #{i + 1}"
+          sections << "Input: #{ex[:input].to_s.truncate(200)}"
+          sections << "Output: #{ex[:output].to_s.truncate(200)}"
+          sections << "Judge said #{ex[:judge_score]}/5: #{ex[:judge_feedback].to_s.truncate(160)}"
+          sections << "Human note: #{ex[:human_note].to_s.truncate(200)}" if ex[:human_note].to_s.present?
+          sections << ""
+        end
       end
       sections << "## Task"
-      sections << "Propose #{@count} alternative instructions. Each should be a focused rewrite — not a wholesale rewrite of the rubric. Aim to close the disagreement gap."
+      sections << "Propose #{@count} alternative instructions. Each should be a focused rewrite — not a wholesale rewrite of the rubric. Close the disagreement gap and disambiguate the borderline cases."
       sections << ""
       sections << "Respond in EXACTLY this format, repeated #{@count} times:"
       sections << ""
@@ -88,11 +103,23 @@ module CompletionKit
     module_function
     def for(metric, limit: 8)
-      disagreements = Calibration.where(metric_id: metric.id, verdict: "disagree")
-                                 .includes(response: :reviews)
-                                 .order(created_at: :desc)
-                                 .limit(limit)
-      disagreements.map do |cal|
+      disagreements_for(metric, limit: limit)
+    end
+    def disagreements_for(metric, limit: 8)
+      calibrations_for(metric, verdict: "disagree", limit: limit)
+    end
+    def borderlines_for(metric, limit: 6)
+      calibrations_for(metric, verdict: "borderline", limit: limit)
+    end
+    def calibrations_for(metric, verdict:, limit:)
+      Calibration.where(metric_id: metric.id, verdict: verdict)
+                 .includes(response: :reviews)
+                 .order(created_at: :desc)
+                 .limit(limit)
+                 .map do |cal|
         review = cal.response.reviews.find { |r| r.metric_id == metric.id }
         {
           input: cal.response.input_data,

data/app/views/completion_kit/calibrations/_buttons.html.erb CHANGED Viewed

@@ -1,12 +1,15 @@
 <div id="calibration_<%= response_row.id %>_<%= metric.id %>" class="ck-calibration">
   <% current_verdict = calibration&.verdict %>
+  <% pending_verdict = local_assigns[:pending_verdict] %>
+  <% active_verdict = pending_verdict || current_verdict %>
+  <% error = local_assigns[:error] %>
   <% verdict_count = CompletionKit::Calibration.where(response_id: response_row.id, metric_id: metric.id).count %>
   <p class="ck-calibration__prompt">
     Your verdict
     <% if verdict_count > 0 %>
-      <span class="ck-calibration__count"><%= pluralize(verdict_count, "verdict") %> on this score · <%= link_to "judge trust →", metric_path(metric), class: "ck-link" %></span>
+      <span class="ck-calibration__count"><%= pluralize(verdict_count, "verdict") %> on this score · <%= link_to "trust score →", metric_path(metric), class: "ck-link" %></span>
     <% else %>
-      <span class="ck-calibration__count">Tell us what you think — was the score right? Verdicts roll up into <%= link_to "judge trust", metric_path(metric), class: "ck-link" %>.</span>
+      <span class="ck-calibration__count">Tell us what you think — was the score right? Verdicts roll up into the metric's <%= link_to "trust score", metric_path(metric), class: "ck-link" %>.</span>
     <% end %>
   </p>
   <div class="ck-calibration__buttons">
@@ -20,8 +23,8 @@
       <%= button_to run_response_calibrations_path(run, response_row, metric_id: metric.id, verdict: verdict),
               method: :post,
               form: { data: { turbo: "true" } },
-              class: "ck-calibration__pill ck-calibration__pill--#{verdict}#{' is-active' if verdict == current_verdict}",
-              "aria-pressed": (verdict == current_verdict).to_s,
+              class: "ck-calibration__pill ck-calibration__pill--#{verdict}#{' is-active' if verdict == active_verdict}",
+              "aria-pressed": (verdict == active_verdict).to_s,
               title: verdict_hints[verdict] do %>
         <%= heroicon_tag verdict_icons[verdict], variant: :outline, size: 14, "aria-hidden": "true" %>
         <span><%= verdict %></span>
@@ -29,7 +32,11 @@
     <% end %>
   </div>
-  <% if current_verdict == "disagree" %>
+  <% if error.present? %>
+    <p class="ck-calibration__error" role="alert"><%= error %></p>
+  <% end %>
+  <% if active_verdict == "disagree" %>
     <%= form_with url: run_response_calibrations_path(run, response_row),
                   method: :post, local: false,
                   class: "ck-calibration__detail" do |f| %>
@@ -37,23 +44,24 @@
       <%= hidden_field_tag :verdict, "disagree" %>
       <label class="ck-label">
         What should the score have been?
-        <span class="ck-calibration__value" data-calibration-value><%= calibration.corrected_score || review&.ai_score || 3 %></span>
+        <span class="ck-calibration__value" data-calibration-value><%= calibration&.corrected_score || review&.ai_score || 3 %></span>
       </label>
       <input type="range" name="corrected_score" min="1" max="5" step="0.5"
-             value="<%= calibration.corrected_score || review&.ai_score || 3 %>"
+             value="<%= calibration&.corrected_score || review&.ai_score || 3 %>"
              oninput="this.closest('.ck-calibration__detail').querySelector('[data-calibration-value]').textContent = this.value"
-             class="ck-slider">
-      <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="Why? (optional)"><%= calibration.note %></textarea>
-      <%= f.submit "Save", class: ck_button_classes(:dark) %>
+             class="ck-slider"
+             required>
+      <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="Why? (optional)"><%= calibration&.note %></textarea>
+      <%= f.submit (current_verdict == "disagree" ? "Update" : "Save disagree"), class: ck_button_classes(:dark) %>
     <% end %>
-  <% elsif current_verdict == "borderline" %>
+  <% elsif active_verdict == "borderline" %>
     <%= form_with url: run_response_calibrations_path(run, response_row),
                   method: :post, local: false,
                   class: "ck-calibration__detail" do |f| %>
       <%= hidden_field_tag :metric_id, metric.id %>
       <%= hidden_field_tag :verdict, "borderline" %>
-      <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="What made this borderline? (optional)"><%= calibration.note %></textarea>
-      <%= f.submit "Save", class: ck_button_classes(:dark) %>
+      <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="What made this borderline? (optional)"><%= calibration&.note %></textarea>
+      <%= f.submit (current_verdict == "borderline" ? "Update" : "Save"), class: ck_button_classes(:dark) %>
     <% end %>
   <% end %>
 </div>

data/app/views/completion_kit/calibrations/_trust_panel.html.erb CHANGED Viewed

@@ -1,6 +1,6 @@
 <% stats = local_assigns[:stats] %>
 <div class="ck-trust-panel ck-trust-panel--<%= stats.gate %>">
-  <p class="ck-trust-panel__label" title="How often the judge agrees with the humans who reviewed its scores.">Judge trust</p>
+  <p class="ck-trust-panel__label" title="How often this metric's scores match the humans who reviewed them.">Trust score</p>
   <% if stats.counter_only? %>
     <div class="ck-trust-panel__body">
       <span class="ck-trust-panel__counter"><%= stats.sample_size %><span class="ck-trust-panel__counter-of">/ <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></span></span>

data/app/views/completion_kit/metrics/index.html.erb CHANGED Viewed

@@ -19,7 +19,7 @@
       <tr>
         <th scope="col">Name</th>
         <th scope="col">Instruction</th>
-        <th scope="col" title="How often the judge agrees with the humans who reviewed its scores.">Judge trust</th>
+        <th scope="col" title="How often this metric's scores match the humans who reviewed them.">Trust score</th>
         <th scope="col">In groups</th>
         <th scope="col"></th>
       </tr>
@@ -36,7 +36,7 @@
             <% end %>
           </td>
           <td data-label="Instruction" class="ck-meta-copy"><div class="ck-clamp-2"><%= metric.instruction.presence || "—" %></div></td>
-          <td data-label="Judge trust" class="ck-metrics-table__trust">
+          <td data-label="Trust score" class="ck-metrics-table__trust">
             <% if CompletionKit.config.judge_calibration_enabled %>
               <% s = CompletionKit::MetricCalibrationStats.for(metric) %>
               <% if s.counter_only? %>

data/app/views/completion_kit/metrics/show.html.erb CHANGED Viewed

@@ -22,11 +22,11 @@
   </div>
   <div class="ck-actions">
     <% if CompletionKit.config.judge_calibration_enabled %>
-      <%= button_to "Suggest rewrites", suggest_variants_metric_path(@metric),
+      <%= button_to "Improve the metric", suggest_variants_metric_path(@metric),
             method: :post, form_class: "inline-block",
             class: ck_button_classes(:light, variant: :outline),
-            title: "Ask the model to rewrite this judge instruction based on the disagreements collected so far.",
-            data: { turbo_confirm: "Ask the model to rewrite this judge instruction based on the disagreements collected so far?" } %>
+            title: "Ask the model to rewrite this metric's instruction based on the disagreements and rubric-ambiguous cases collected so far.",
+            data: { turbo_confirm: "Ask the model to rewrite this metric's instruction based on the disagreements and rubric-ambiguous cases collected so far?" } %>
     <% end %>
     <%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
   </div>
@@ -41,7 +41,7 @@
 <% if @metric.instruction.present? %>
 <section class="ck-card">
   <p class="ck-kicker">Instruction</p>
-  <div class="ck-note-box"><%= simple_format(@metric.instruction) %></div>
+  <%= simple_format(@metric.instruction, {}, class: "ck-copy") %>
 </section>
 <% end %>
@@ -134,10 +134,10 @@
   <% if @suggestion_drafts.any? %>
     <section class="ck-card ck-card--spaced">
       <div class="ck-prompt-preview__header">
-        <p class="ck-kicker">Suggested rewrites</p>
+        <p class="ck-kicker">Suggested improvements</p>
         <span class="ck-chip"><%= @suggestion_drafts.size %> option<%= @suggestion_drafts.size == 1 ? "" : "s" %></span>
       </div>
-      <p class="ck-meta-copy">The model wrote these alternate instructions based on the disagreements above. Pick one to make it the live judge — the previous version is kept in history.</p>
+      <p class="ck-meta-copy">Based on your verdicts, the model proposed these alternative instructions for this metric. Pick one to make it live — the previous version stays in history.</p>
       <div class="ck-suggestion-list">
         <% @suggestion_drafts.each do |draft| %>
           <article class="ck-suggestion-card">

data/lib/completion_kit/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module CompletionKit
-  VERSION = "0.5.38"
+  VERSION = "0.5.39"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: completion-kit
 version: !ruby/object:Gem::Version
-  version: 0.5.38
+  version: 0.5.39
 platform: ruby
 authors:
 - Damien Bastin
@@ -233,7 +233,7 @@ files:
 - app/assets/images/completion_kit/favicon.ico
 - app/assets/images/completion_kit/logo.png
 - app/assets/javascripts/completion_kit/application.js
-- app/assets/stylesheets/completion_kit/application.css.erb
+- app/assets/stylesheets/completion_kit/application.css
 - app/controllers/completion_kit/api/v1/base_controller.rb
 - app/controllers/completion_kit/api/v1/calibrations_controller.rb
 - app/controllers/completion_kit/api/v1/datasets_controller.rb