RubyGems - completion-kit - Versions diffs - 0.10.0 → 0.11.0 - Mend

completion-kit 0.10.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 4772b264a668a86e004f78c7bc2397d93f5266d8dc287b422728d951ab24fbcc
-  data.tar.gz: 63b221e144e930df9978607a78533d354f659ca2bf6141046291168af50d3cd7
+  metadata.gz: 0b32ec77fb60d07f40e4b83827c2510aaeb695c96c9c6df86e4b42a7ec57516b
+  data.tar.gz: ade912039e4942c87d73c13443bd405533eec2988478e02cd1ccb87550de2783
 SHA512:
-  metadata.gz: a3249ae1c734dcee0c6f9410baf0400f4b16e091b220d86e8417dec91ff9943a165bbc6c8368629cc14054c3af7946bdb693f006a5756de40a809c41db5bbe3a
-  data.tar.gz: 541323c93b08f08f32c2f024e3709ee5f3e4e48144cd8e9cb2ba8891312fd9b5712e4ca39cbcc739b64ba61ff0ac890be3528e5bfd9c238e073d640b37b47e90
+  metadata.gz: bb8664ea804d59e3761ab385d1af98ecf7d110dd7e68e7003e1a4b2c059c5e377a5e42d350a46310f58c6d8c41c0e31a6aa0cdaf8a6b50b4d9f419e6fa60e474
+  data.tar.gz: 3bbe72cf7e99a4ae899765829ee8bee83703885ebdfa5b6b9f7253f25f2373b1dd22aadfeb071f4b340c1b7d33dfc4e590e4f8a2df2239afbbc305de009af2cf

data/app/assets/stylesheets/completion_kit/application.css CHANGED Viewed

@@ -3619,10 +3619,9 @@ select.ck-input {
 .ck-prompt-versions-table th:nth-child(3), .ck-prompt-versions-table td:nth-child(3) { width: 8rem; white-space: nowrap; }
 .ck-prompt-versions-table th:nth-child(4), .ck-prompt-versions-table td:nth-child(4) { width: auto; }
-.ck-metric-versions-table th:nth-child(1), .ck-metric-versions-table td:nth-child(1) { width: 34%; }
-.ck-metric-versions-table th:nth-child(2), .ck-metric-versions-table td:nth-child(2) { width: 33%; white-space: nowrap; }
-.ck-metric-versions-table th:nth-child(3), .ck-metric-versions-table td:nth-child(3) { width: 33%; white-space: nowrap; }
-.ck-metric-versions-table .ck-version-cell { justify-content: flex-start; gap: 0.75rem; }
+.ck-metric-versions-table th:nth-child(1), .ck-metric-versions-table td:nth-child(1) { width: 18rem; }
+.ck-metric-versions-table th:nth-child(2), .ck-metric-versions-table td:nth-child(2) { width: 16rem; white-space: nowrap; }
+.ck-metric-versions-table th:nth-child(3), .ck-metric-versions-table td:nth-child(3) { width: auto; white-space: nowrap; }
 .ck-source-chip {
@@ -6001,3 +6000,67 @@ a.tag-mark {
   width: 2rem;
   height: 2rem;
 }
+.ck-suggestion-status:empty { display: none; }
+.ck-suggestion-status {
+  margin-top: 10px;
+  display: flex;
+  align-items: baseline;
+  gap: 10px;
+  flex-wrap: wrap;
+}
+.ck-scoreboard {
+  margin-bottom: 16px;
+  padding-bottom: 14px;
+  border-bottom: 1px solid var(--ck-line);
+}
+.ck-scoreboard__headline {
+  margin: 0 0 8px;
+  font-size: 0.95rem;
+  color: var(--ck-text);
+}
+.ck-scoreboard__was {
+  font-family: var(--ck-mono);
+  font-size: 0.74rem;
+  color: var(--ck-muted);
+  margin-left: 6px;
+}
+.ck-scoreboard__tally {
+  list-style: none;
+  margin: 0;
+  padding: 0;
+  display: flex;
+  gap: 18px;
+}
+.ck-scoreboard__stat {
+  font-family: var(--ck-mono);
+  font-size: 0.72rem;
+  letter-spacing: 0.06em;
+  text-transform: uppercase;
+  color: var(--ck-muted);
+}
+.ck-scoreboard__stat strong { color: var(--ck-text); }
+.ck-scoreboard__stat--break strong { color: var(--ck-warning); }
+.ck-scoreboard__note {
+  margin: 8px 0 0;
+  font-size: 0.78rem;
+  color: var(--ck-muted);
+}
+.ck-version-change {
+  display: inline-flex;
+  align-items: baseline;
+  gap: 0.6rem;
+}
+.ck-version-score {
+  font-family: var(--ck-mono);
+  font-size: 0.74rem;
+  color: var(--ck-dim);
+}
+.ck-version-score__label {
+  font-size: 0.6rem;
+  letter-spacing: 0.08em;
+  text-transform: uppercase;
+  color: var(--ck-muted);
+  margin-right: 0.2rem;
+}

data/app/controllers/completion_kit/metrics_controller.rb CHANGED Viewed

@@ -117,26 +117,22 @@ module CompletionKit
     def suggest_variants
       target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
-      disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
-      if disagreement_count.zero?
+      counts = Calibration.where(metric_id: @metric.id, verdict: %w[agree disagree]).group(:verdict).count
+      if counts["disagree"].to_i.zero?
         redirect_to target, alert: "Mark at least one case as Disagree before asking the model to suggest a change."
         return
       end
-      MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
+      MetricSuggestionJob.perform_later(@metric.id)
-      generator = MetricVariantGenerator.new(@metric, count: 1)
-      variants = generator.call
-      if variants.empty?
-        redirect_to target, alert: "The model returned no usable variants. Try again with a different model."
-        return
-      end
-      versions = generator.persist!(variants)
-      new_version = versions.max_by(&:version_number)
       if params[:back_to] == "edit"
-        redirect_to edit_metric_path(@metric), notice: "Drafted #{new_version.version_label} from your reviews. Review the proposed changes below, then Publish to use it."
+        redirect_to metric_path(@metric), notice: "Drafting a change from your reviews. It will appear here once it's tested."
       else
-        redirect_to metric_path(@metric, show_change: new_version.id), notice: "Drafted #{new_version.version_label} from your reviews."
+        render turbo_stream: turbo_stream.replace(
+          "ck-suggestion-status-#{@metric.id}",
+          partial: "completion_kit/metrics/suggestion_pending",
+          locals: { metric: @metric, count: counts.values.sum }
+        )
       end
     end

data/app/jobs/completion_kit/metric_suggestion_job.rb ADDED Viewed

@@ -0,0 +1,46 @@
+require "faraday"
+module CompletionKit
+  class MetricSuggestionJob < ApplicationJob
+    queue_as :llm
+    retry_on Faraday::TimeoutError, Faraday::ConnectionFailed, wait: :polynomially_longer, attempts: 5
+    retry_on CompletionKit::RateLimitError, wait: :polynomially_longer, attempts: 5
+    rescue_from(StandardError) do |error|
+      Rails.error.report(error, handled: true, context: { job: self.class.name })
+      broadcast_status(@metric, partial: "completion_kit/metrics/suggestion_failed", locals: { metric: @metric })
+    end
+    def perform(metric_id)
+      @metric = Metric.find_by(id: metric_id)
+      return unless @metric
+      MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
+      generator = MetricVariantGenerator.new(@metric, count: 1)
+      variants = generator.call
+      if variants.empty?
+        broadcast_status(@metric, partial: "completion_kit/metrics/suggestion_failed", locals: { metric: @metric })
+        return
+      end
+      draft = generator.persist!(variants).max_by(&:version_number)
+      summary = MetricImprovementValidator.new(@metric, draft).call
+      draft.update!(validation_summary: summary)
+      broadcast_status(@metric, partial: "completion_kit/metrics/suggestion_ready", locals: { metric: @metric, draft: draft })
+    end
+    private
+    def broadcast_status(metric, partial:, locals:)
+      html = CompletionKit::ApplicationController.render(partial: partial, locals: locals)
+      Turbo::StreamsChannel.broadcast_replace_to(
+        "metric_#{metric.id}_suggestion",
+        target: "ck-suggestion-status-#{metric.id}",
+        html: html
+      )
+    end
+  end
+end

data/app/models/completion_kit/metric_version.rb CHANGED Viewed

@@ -6,6 +6,7 @@ module CompletionKit
     has_many :calibrations, dependent: :destroy
     serialize :rubric_bands, coder: JSON
+    serialize :validation_summary, coder: JSON
     before_validation :assign_version_number, on: :create

data/app/services/completion_kit/metric_improvement_validator.rb ADDED Viewed

@@ -0,0 +1,101 @@
+module CompletionKit
+  class MetricImprovementValidator
+    ANSWER_KEY_LIMIT = 30
+    def initialize(metric, candidate, scorer: nil)
+      @metric = metric
+      @candidate = candidate
+      @scorer = scorer || method(:rescore)
+    end
+    def call
+      key = answer_key
+      rows = []
+      key.each do |entry|
+        begin
+          score = @scorer.call(entry[:response], @candidate)
+        rescue StandardError
+          next
+        end
+        rows << classify(entry, score.to_i)
+      end
+      summarize(rows, key.size, key_capped?)
+    end
+    private
+    def answer_key
+      current = MetricVersion.current.find_by(metric_id: @metric.id)
+      return [] unless current
+      base = Calibration.where(metric_id: @metric.id, metric_version_id: current.id, verdict: %w[agree disagree])
+      @key_size_before_cap = base.count
+      base.includes(response: :reviews)
+          .order(created_at: :desc)
+          .limit(ANSWER_KEY_LIMIT)
+          .filter_map do |cal|
+        response = cal.response
+        next unless response.response_text.present?
+        review = response.reviews.find { |r| r.metric_id == @metric.id }
+        position = cal.verdict == "disagree" ? cal.corrected_score : review&.ai_score
+        next if position.nil?
+        { response: response, verdict: cal.verdict, position: position }
+      end
+    end
+    def key_capped?
+      @key_size_before_cap.to_i > ANSWER_KEY_LIMIT
+    end
+    def classify(entry, candidate_score)
+      matched = candidate_score == entry[:position].to_i
+      outcome = if entry[:verdict] == "disagree"
+        matched ? "fix" : "still_off"
+      else
+        matched ? "keep" : "break"
+      end
+      {
+        "response_id" => entry[:response].id,
+        "verdict" => entry[:verdict],
+        "position" => entry[:position].to_i,
+        "candidate_score" => candidate_score,
+        "outcome" => outcome
+      }
+    end
+    def summarize(rows, total, capped)
+      fixes = rows.count { |r| r["outcome"] == "fix" }
+      keeps = rows.count { |r| r["outcome"] == "keep" }
+      breaks = rows.count { |r| r["outcome"] == "break" }
+      still_off = rows.count { |r| r["outcome"] == "still_off" }
+      agreements = rows.count { |r| r["verdict"] == "agree" }
+      {
+        "total" => total,
+        "tested" => rows.size,
+        "capped" => capped,
+        "fixes" => fixes,
+        "keeps" => keeps,
+        "breaks" => breaks,
+        "still_off" => still_off,
+        "before" => agreements,
+        "after" => fixes + keeps,
+        "rows" => rows
+      }
+    end
+    def rescore(response, candidate)
+      run = response.run
+      config = ApiConfig.for_model(run.judge_model).merge(judge_model: run.judge_model)
+      rubric_text = Metric.rubric_text_for(Metric.normalize_rubric_bands(candidate.rubric_bands))
+      result = JudgeService.new(config).evaluate(
+        response.response_text,
+        response.expected_output,
+        run.prompt&.template,
+        criteria: candidate.instruction.to_s,
+        rubric_text: rubric_text,
+        input_data: response.input_data
+      )
+      result[:score]
+    end
+  end
+end

data/app/views/completion_kit/calibrations/_trust_panel.html.erb CHANGED Viewed

@@ -31,10 +31,7 @@
       <%= link_to "Review another score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
     <% end %>
   <% else %>
-    <span class="ck-cal-stat"><span class="ck-cal-stat__label">Agreement</span> <strong class="ck-trust-line__figure">~<%= (stats.agreement_point * 100).round %>%</strong></span>
-    <span class="ck-cal-stat"><span class="ck-cal-stat__label">Margin</span> ±<%= (stats.margin * 100).round %> pt</span>
-    <span class="ck-cal-stat"><span class="ck-cal-stat__label">Read</span> <%= stats.firm? ? "settled" : "early" %></span>
-    <span class="ck-cal-stat"><span class="ck-cal-stat__label">Sample</span> <%= stats.sample_size %></span>
+    <span class="ck-cal-stat"><span class="ck-cal-stat__label">Agrees with you</span> <strong class="ck-trust-line__figure">~<%= (stats.agreement_point * 100).round %>%</strong> of <%= stats.sample_size %> reviews</span>
     <% if stats.borderline_rate && stats.borderline_rate > 0 %>
       <% level = stats.borderline_rate > 0.30 ? "danger" : stats.borderline_rate > 0.15 ? "warning" : "ok" %>
       <span class="ck-cal-stat"><span class="ck-cal-stat__label">Unclear</span> <span class="ck-trust-line__borderline ck-trust-line__borderline--<%= level %>"><%= (stats.borderline_rate * 100).round %>%</span></span>

data/app/views/completion_kit/metrics/_suggestion_failed.html.erb ADDED Viewed

@@ -0,0 +1,3 @@
+<div id="ck-suggestion-status-<%= metric.id %>" class="ck-suggestion-status">
+  <span class="ck-cal-foot__note">The model returned no usable change. Try again, or review a few more scores first.</span>
+</div>

data/app/views/completion_kit/metrics/_suggestion_pending.html.erb ADDED Viewed

@@ -0,0 +1,3 @@
+<div id="ck-suggestion-status-<%= metric.id %>" class="ck-suggestion-status ck-suggestion-status--pending">
+  <span class="ck-cal-foot__note">Drafting a change and testing it against your <%= pluralize(count, "review") %>…</span>
+</div>

data/app/views/completion_kit/metrics/_suggestion_ready.html.erb ADDED Viewed

@@ -0,0 +1,4 @@
+<div id="ck-suggestion-status-<%= metric.id %>" class="ck-suggestion-status ck-suggestion-status--ready">
+  <span class="ck-cal-foot__note">Drafted <%= draft.version_label %> and tested it against your reviews.</span>
+  <%= link_to "Compare and publish →", CompletionKit::Engine.routes.url_helpers.metric_path(metric, show_change: draft.id), class: "ck-cal-link" %>
+</div>

data/app/views/completion_kit/metrics/_validation_scoreboard.html.erb ADDED Viewed

@@ -0,0 +1,12 @@
+<% s = summary %>
+<div class="ck-scoreboard">
+  <p class="ck-scoreboard__headline">Matches you on <strong><%= s["after"] %> of <%= s["tested"] %></strong> of your reviews <span class="ck-scoreboard__was">was <%= s["before"] %> of <%= s["tested"] %></span></p>
+  <ul class="ck-scoreboard__tally">
+    <li class="ck-scoreboard__stat ck-scoreboard__stat--fix">Fixes <strong><%= s["fixes"] %></strong></li>
+    <li class="ck-scoreboard__stat ck-scoreboard__stat--keep">Keeps <strong><%= s["keeps"] %></strong></li>
+    <li class="ck-scoreboard__stat ck-scoreboard__stat--break">Breaks <strong><%= s["breaks"] %></strong></li>
+  </ul>
+  <% if s["capped"] %>
+    <p class="ck-scoreboard__note">Tested against your 30 most recent reviews.</p>
+  <% end %>
+</div>

data/app/views/completion_kit/metrics/show.html.erb CHANGED Viewed

@@ -76,14 +76,20 @@
                           data: { turbo_confirm: "Make #{v.version_label} the version to use? #{v.version_label} will be used in test runs using this metric now. Reviews you have already given stay with the version they were made against." } %>
                   <% end %>
                 </div>
+                <% vs = v.validation_summary %>
                 <% if summary %>
-                  <button type="button" class="ck-cell-link ck-cell-link--delta" title="What changed from <%= pred.version_label %>" onclick="document.getElementById('ck-mvdiff-<%= v.id %>').showModal()">&Delta;</button>
+                  <div class="ck-version-change">
+                    <% if v.draft? && vs.present? %>
+                      <span class="ck-version-score"><span class="ck-version-score__label">Match</span> <%= vs["after"] %>/<%= vs["tested"] %></span>
+                    <% end %>
+                    <button type="button" class="ck-cell-link ck-cell-link--delta" title="What changed from <%= pred.version_label %>" onclick="document.getElementById('ck-mvdiff-<%= v.id %>').showModal()">&Delta;</button>
+                  </div>
                 <% end %>
               </div>
             </td>
             <td>
               <% source_label, source_class = case v.source
-                                              when "suggestion" then ["AI calibration", "ck-source-chip ck-source-chip--ai"]
+                                              when "suggestion" then ["AI suggestion", "ck-source-chip ck-source-chip--ai"]
                                               when "edit" then ["Manual edit", "ck-source-chip ck-source-chip--manual"]
                                               when "revert" then ["Reverted", "ck-source-chip ck-source-chip--revert"]
                                               else ["Original", "ck-source-chip ck-source-chip--initial"]
@@ -110,6 +116,7 @@
   <% @versions.each do |v| %>
     <% pred = predecessor_of[v] %>
     <% next unless v.change_summary_against(pred) %>
+    <% vs = v.validation_summary %>
     <dialog id="ck-mvdiff-<%= v.id %>" class="ck-modal" onclick="if(event.target===this)this.close()">
       <article class="ck-modal__panel" tabindex="-1" onclick="event.stopPropagation()">
         <header class="ck-modal__header">
@@ -120,6 +127,9 @@
           <button type="button" class="ck-modal__close" aria-label="Close" onclick="this.closest('dialog').close()">&times;</button>
         </header>
         <div class="ck-modal__body">
+          <% if v.draft? && vs.present? %>
+            <%= render "completion_kit/metrics/validation_scoreboard", summary: vs %>
+          <% end %>
           <% if pred.instruction.to_s != v.instruction.to_s %>
             <div class="ck-suggest-diff">
               <div class="ck-suggest-diff__pane">
@@ -152,8 +162,10 @@
                     title: "Discard draft #{v.version_label}", "aria-label": "Discard draft #{v.version_label}",
                     data: { turbo_confirm: "Discard draft #{v.version_label}? This can't be undone." } do %><%= heroicon_tag "trash", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
               <%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
+              <% net_negative = vs.present? && (vs["after"].to_i < vs["before"].to_i || vs["breaks"].to_i > vs["fixes"].to_i) %>
               <%= button_to "Publish #{v.version_label} →", publish_draft_metric_path(@metric, draft_id: v.id),
-                    method: :post, form_class: "inline-block", class: ck_button_classes(:dark) %>
+                    method: :post, form_class: "inline-block", class: ck_button_classes(:dark),
+                    data: net_negative ? { turbo_confirm: "This agrees with you less than the current version. Publish anyway?" } : {} %>
             </span>
           <% else %>
             <span class="ck-modal__foot-note">Roll this metric back to this version.</span>
@@ -171,7 +183,7 @@
   <% draft = @suggestion_draft || @edit_draft %>
   <section class="ck-card ck-card--spaced">
     <div class="ck-prompt-preview__header">
-      <p class="ck-kicker">Calibration</p>
+      <p class="ck-kicker">Agreement</p>
       <% if draft.nil? && @improve_disagreement_count.positive? %>
         <%= button_to suggest_variants_metric_path(@metric),
               method: :post, form_class: "inline-block",
@@ -182,7 +194,9 @@
         <% end %>
       <% end %>
     </div>
-    <p class="ck-meta-copy">This is a measure of how often the judge's scores match a human reviewer. Review its scores to build that signal; the scores you disagree with become the cases the model learns from when you improve the metric.</p>
+    <%= turbo_stream_from "metric_#{@metric.id}_suggestion" %>
+    <div id="ck-suggestion-status-<%= @metric.id %>" class="ck-suggestion-status"></div>
+    <p class="ck-meta-copy">How often the judge lands on the same score you would. Review its scores to build that signal, and improve the metric to raise it.</p>
     <%= render "completion_kit/calibrations/trust_panel",
           stats: CompletionKit::MetricCalibrationStats.for(@metric),
           metric: @metric %>

data/db/migrate/20260531000001_add_validation_summary_to_completion_kit_metric_versions.rb ADDED Viewed

@@ -0,0 +1,5 @@
+class AddValidationSummaryToCompletionKitMetricVersions < ActiveRecord::Migration[8.1]
+  def change
+    add_column :completion_kit_metric_versions, :validation_summary, :text
+  end
+end

data/lib/completion_kit/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module CompletionKit
-  VERSION = "0.10.0"
+  VERSION = "0.11.0"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: completion-kit
 version: !ruby/object:Gem::Version
-  version: 0.10.0
+  version: 0.11.0
 platform: ruby
 authors:
 - Damien Bastin
@@ -266,6 +266,7 @@ files:
 - app/jobs/completion_kit/application_job.rb
 - app/jobs/completion_kit/generate_row_job.rb
 - app/jobs/completion_kit/judge_review_job.rb
+- app/jobs/completion_kit/metric_suggestion_job.rb
 - app/jobs/completion_kit/model_discovery_job.rb
 - app/jobs/completion_kit/run_completion_check_job.rb
 - app/mailers/completion_kit/application_mailer.rb
@@ -313,6 +314,7 @@ files:
 - app/services/completion_kit/mcp_tools/tags.rb
 - app/services/completion_kit/metric_calibration_examples.rb
 - app/services/completion_kit/metric_calibration_stats.rb
+- app/services/completion_kit/metric_improvement_validator.rb
 - app/services/completion_kit/metric_variant_generator.rb
 - app/services/completion_kit/model_discovery_service.rb
 - app/services/completion_kit/ollama_client.rb
@@ -355,6 +357,10 @@ files:
 - app/views/completion_kit/metrics/_rubric_diff.html.erb
 - app/views/completion_kit/metrics/_rubric_hint.html.erb
 - app/views/completion_kit/metrics/_starter_card.html.erb
+- app/views/completion_kit/metrics/_suggestion_failed.html.erb
+- app/views/completion_kit/metrics/_suggestion_pending.html.erb
+- app/views/completion_kit/metrics/_suggestion_ready.html.erb
+- app/views/completion_kit/metrics/_validation_scoreboard.html.erb
 - app/views/completion_kit/metrics/edit.html.erb
 - app/views/completion_kit/metrics/index.html.erb
 - app/views/completion_kit/metrics/new.html.erb
@@ -433,6 +439,7 @@ files:
 - db/migrate/20260528000002_add_metric_version_to_reviews.rb
 - db/migrate/20260529000001_remove_few_shot_examples_from_completion_kit_metrics.rb
 - db/migrate/20260530000001_add_excluded_from_examples_to_completion_kit_calibrations.rb
+- db/migrate/20260531000001_add_validation_summary_to_completion_kit_metric_versions.rb
 - lib/completion-kit.rb
 - lib/completion_kit.rb
 - lib/completion_kit/concurrency_check.rb