RubyGems - completion-kit - Versions diffs - 0.11.0 → 0.12.0 - Mend

completion-kit 0.11.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35)

data/app/services/completion_kit/mcp_tools/{calibrations.rb → agreements.rb} RENAMED Viewed

@@ -1,11 +1,11 @@
 module CompletionKit
   module McpTools
-    module Calibrations
+    module Agreements
       extend Base
       TOOLS = {
-        "calibrations_list" => {
-          description: "List calibrations. Filter by run_id, response_id, metric_id, or created_by.",
+        "agreements_list" => {
+          description: "List agreements. Filter by run_id, response_id, metric_id, or created_by.",
           inputSchema: {
             type: "object",
             properties: {
@@ -18,8 +18,8 @@ module CompletionKit
           },
           handler: :list
         },
-        "calibrations_create" => {
-          description: "Upsert a calibration for (run, response, metric, created_by). Verdict is one of agree, disagree, borderline. corrected_score (1..5) is required when verdict is 'disagree'.",
+        "agreements_create" => {
+          description: "Upsert an agreement for (run, response, metric, created_by). Verdict is one of agree, disagree, borderline. corrected_score (1..5) is required when verdict is 'disagree'.",
           inputSchema: {
             type: "object",
             properties: {
@@ -38,7 +38,7 @@ module CompletionKit
       }.freeze
       def self.list(args)
-        scope = CompletionKit::Calibration.all
+        scope = CompletionKit::Agreement.all
         scope = scope.where(run_id: args["run_id"]) if args["run_id"]
         scope = scope.where(response_id: args["response_id"]) if args["response_id"]
         scope = scope.where(metric_id: args["metric_id"]) if args["metric_id"]
@@ -52,20 +52,20 @@ module CompletionKit
         metric = CompletionKit::Metric.find(args["metric_id"])
         created_by = args["created_by"].presence || "mcp"
-        calibration = CompletionKit::Calibration.find_or_initialize_by(
+        agreement = CompletionKit::Agreement.find_or_initialize_by(
           run_id: run.id, response_id: response.id, metric_id: metric.id, created_by: created_by
         )
-        calibration.assign_attributes(
+        agreement.assign_attributes(
           metric_version: CompletionKit::MetricVersion.ensure_current_for(metric),
           verdict: args["verdict"],
           corrected_score: args["corrected_score"],
           note: args["note"]
         )
-        if calibration.save
-          text_result(calibration.as_json)
+        if agreement.save
+          text_result(agreement.as_json)
         else
-          error_result(calibration.errors.full_messages.join(", "))
+          error_result(agreement.errors.full_messages.join(", "))
         end
       end
     end

data/app/services/completion_kit/mcp_tools/judges.rb CHANGED Viewed

@@ -33,7 +33,7 @@ module CompletionKit
           handler: :replay
         },
         "judges_compare" => {
-          description: "Compare two metric versions' calibration stats side by side. Pass either two metric_version_ids or one metric_id with metric_version_a_id / metric_version_b_id.",
+          description: "Compare two metric versions' agreement stats side by side. Pass either two metric_version_ids or one metric_id with metric_version_a_id / metric_version_b_id.",
           inputSchema: {
             type: "object",
             properties: {
@@ -77,8 +77,8 @@ module CompletionKit
         metric = CompletionKit::Metric.find(args["metric_id"])
         a = CompletionKit::MetricVersion.where(metric_id: metric.id).find(args["metric_version_a_id"])
         b = CompletionKit::MetricVersion.where(metric_id: metric.id).find(args["metric_version_b_id"])
-        stats_a = CompletionKit::MetricCalibrationStats.for(metric, metric_version: a)
-        stats_b = CompletionKit::MetricCalibrationStats.for(metric, metric_version: b)
+        stats_a = CompletionKit::MetricAgreementStats.for(metric, metric_version: a)
+        stats_b = CompletionKit::MetricAgreementStats.for(metric, metric_version: b)
         text_result({
           metric_id: metric.id,
           a: metric_version_payload(a, stats_a),

data/app/services/completion_kit/mcp_tools/metric_versions.rb CHANGED Viewed

@@ -47,13 +47,8 @@ module CompletionKit
       def self.publish(args)
         version = CompletionKit::MetricVersion.find(args["metric_version_id"])
-        if version.published? && !version.current?
-          audit = version.revert!
-          text_result(audit.as_json)
-        else
-          version.publish!
-          text_result(version.reload.as_json)
-        end
+        version.publish!
+        text_result(version.reload.as_json)
       end
       def self.dismiss(args)

data/app/services/completion_kit/{metric_calibration_examples.rb → metric_agreement_examples.rb} RENAMED Viewed

@@ -1,5 +1,5 @@
 module CompletionKit
-  module MetricCalibrationExamples
+  module MetricAgreementExamples
     DEFAULT_JUDGE_EXAMPLE_LIMIT = 5
     module_function
@@ -9,18 +9,18 @@ module CompletionKit
     end
     def disagreements_for(metric, limit: 8)
-      calibrations_for(metric, verdict: "disagree", limit: limit)
+      agreements_for(metric, verdict: "disagree", limit: limit)
     end
     def borderlines_for(metric, limit: 6)
-      calibrations_for(metric, verdict: "borderline", limit: limit)
+      agreements_for(metric, verdict: "borderline", limit: limit)
     end
     def judge_examples_for(metric, exclude_response_id: nil, limit: DEFAULT_JUDGE_EXAMPLE_LIMIT)
       current_version = MetricVersion.current.find_by(metric_id: metric.id)
       return [] unless current_version
-      relation = Calibration
+      relation = Agreement
                  .where(metric_id: metric.id, metric_version_id: current_version.id, excluded_from_examples: false)
                  .where.not(corrected_score: nil)
       relation = relation.where.not(response_id: exclude_response_id) if exclude_response_id
@@ -28,8 +28,8 @@ module CompletionKit
         .reject { |example| example[:judge_score].nil? }
     end
-    def calibrations_for(metric, verdict:, limit:)
-      base = Calibration.where(metric_id: metric.id, verdict: verdict)
+    def agreements_for(metric, verdict:, limit:)
+      base = Agreement.where(metric_id: metric.id, verdict: verdict)
       current_version = MetricVersion.current.find_by(metric_id: metric.id)
       scoped = current_version ? base.where(metric_version_id: current_version.id) : base
       effective = scoped.exists? ? scoped : base

data/app/services/completion_kit/{metric_calibration_stats.rb → metric_agreement_stats.rb} RENAMED Viewed

@@ -1,5 +1,5 @@
 module CompletionKit
-  class MetricCalibrationStats
+  class MetricAgreementStats
     PROVISIONAL_MIN = 10
     FIRM_MIN = 30
@@ -49,7 +49,7 @@ module CompletionKit
     end
     def call
-      scope = Calibration.where(metric_id: @metric.id)
+      scope = Agreement.where(metric_id: @metric.id)
       if @metric_version
         scope = scope.where(metric_version_id: @metric_version.id)
       elsif !@all_versions
@@ -62,12 +62,12 @@ module CompletionKit
       disagrees = verdicts.count { |v, _, _| v == "disagree" }
       borderlines = verdicts.count { |v, _, _| v == "borderline" }
-      ci = CalibrationMath.wilson_interval(successes: agrees, n: n)
+      ci = AgreementMath.wilson_interval(successes: agrees, n: n)
       pairs = score_pairs(verdicts)
-      mae_value = CalibrationMath.mae(pairs)
-      pearson_value = CalibrationMath.pearson(pairs)
-      kappa_value = CalibrationMath.quadratic_weighted_kappa(pairs, categories: 1..5)
+      mae_value = AgreementMath.mae(pairs)
+      pearson_value = AgreementMath.pearson(pairs)
+      kappa_value = AgreementMath.quadratic_weighted_kappa(pairs, categories: 1..5)
       Result.new(
         sample_size: n,

data/app/services/completion_kit/metric_improvement_validator.rb CHANGED Viewed

@@ -28,7 +28,7 @@ module CompletionKit
       current = MetricVersion.current.find_by(metric_id: @metric.id)
       return [] unless current
-      base = Calibration.where(metric_id: @metric.id, metric_version_id: current.id, verdict: %w[agree disagree])
+      base = Agreement.where(metric_id: @metric.id, metric_version_id: current.id, verdict: %w[agree disagree])
       @key_size_before_cap = base.count
       base.includes(response: :reviews)
           .order(created_at: :desc)

data/app/services/completion_kit/metric_variant_generator.rb CHANGED Viewed

@@ -41,8 +41,8 @@ module CompletionKit
     private
     def build_meta_prompt
-      disagreements = MetricCalibrationExamples.disagreements_for(@metric)
-      borderlines = MetricCalibrationExamples.borderlines_for(@metric)
+      disagreements = MetricAgreementExamples.disagreements_for(@metric)
+      borderlines = MetricAgreementExamples.borderlines_for(@metric)
       sections = []
       sections << "You are an expert evaluator. The judge below is misaligned with humans. Propose #{@count == 1 ? "a single" : "#{@count}"} concrete rewrite that closes the gap."
       sections << ""

data/app/views/completion_kit/{calibrations → agreements}/_buttons.html.erb RENAMED Viewed

@@ -1,34 +1,34 @@
-<div id="calibration_<%= response_row.id %>_<%= metric.id %>" class="ck-calibration">
-  <% current_verdict = calibration&.verdict %>
+<div id="agreement_<%= response_row.id %>_<%= metric.id %>" class="ck-agreement">
+  <% current_verdict = agreement&.verdict %>
   <% pending_verdict = local_assigns[:pending_verdict] %>
   <% active_verdict = pending_verdict || current_verdict %>
   <% error = local_assigns[:error] %>
   <% me = CompletionKit.config.username.presence || "operator" %>
-  <% other_calibrations = CompletionKit::Calibration
+  <% other_agreements = CompletionKit::Agreement
         .where(response_id: response_row.id, metric_id: metric.id)
         .where.not(created_by: me)
         .order(created_at: :asc).to_a %>
   <% verdict_icons = { "agree" => "hand-thumb-up", "disagree" => "hand-thumb-down", "borderline" => "scale" } %>
-  <p class="ck-calibration__prompt">
-    <span class="ck-calibration__label">Your verdict</span>
-    <% if other_calibrations.any? %>
-      <span class="ck-calibration__meta"><%= pluralize(other_calibrations.size, "other verdict") %> on this score</span>
-      <span class="ck-calibration__sep">·</span>
+  <p class="ck-agreement__prompt">
+    <span class="ck-agreement__label">Your verdict</span>
+    <% if other_agreements.any? %>
+      <span class="ck-agreement__meta"><%= pluralize(other_agreements.size, "other verdict") %> on this score</span>
+      <span class="ck-agreement__sep">·</span>
     <% end %>
-    <%= link_to metric_path(metric), class: "ck-calibration__meta-link" do %><%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true" %>Calibration →<% end %>
+    <%= link_to metric_path(metric, anchor: "agreement"), class: "ck-agreement__meta-link" do %><%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true" %>Agreement →<% end %>
   </p>
-  <div class="ck-calibration__buttons">
+  <div class="ck-agreement__buttons">
     <% verdict_icons = { "agree" => "hand-thumb-up", "disagree" => "hand-thumb-down", "borderline" => "scale" } %>
     <% verdict_hints = {
          "agree" => "The score looks right.",
          "disagree" => "The score is wrong — you'll pick the right one.",
          "borderline" => "The rubric is unclear here; either score could be defensible."
        } %>
-    <% CompletionKit::Calibration::VERDICTS.each do |verdict| %>
-      <%= button_to run_response_calibrations_path(run, response_row, metric_id: metric.id, verdict: verdict),
+    <% CompletionKit::Agreement::VERDICTS.each do |verdict| %>
+      <%= button_to run_response_agreements_path(run, response_row, metric_id: metric.id, verdict: verdict),
               method: :post,
               form: { data: { turbo: "true" } },
-              class: "ck-calibration__pill ck-calibration__pill--#{verdict}#{' is-active' if verdict == active_verdict}",
+              class: "ck-agreement__pill ck-agreement__pill--#{verdict}#{' is-active' if verdict == active_verdict}",
               "aria-pressed": (verdict == active_verdict).to_s,
               title: verdict_hints[verdict] do %>
         <%= heroicon_tag verdict_icons[verdict], variant: :outline, size: 14, "aria-hidden": "true" %>
@@ -38,26 +38,26 @@
   </div>
   <% if error.present? %>
-    <p class="ck-calibration__error" role="alert"><%= error %></p>
+    <p class="ck-agreement__error" role="alert"><%= error %></p>
   <% end %>
-  <% if other_calibrations.any? %>
-    <details class="ck-calibration__others">
-      <summary class="ck-calibration__others-summary">
+  <% if other_agreements.any? %>
+    <details class="ck-agreement__others">
+      <summary class="ck-agreement__others-summary">
         <%= heroicon_tag "chevron-right", variant: :outline, size: 14, "aria-hidden": "true" %>
-        <span>What others said (<%= other_calibrations.size %>)</span>
+        <span>What others said (<%= other_agreements.size %>)</span>
       </summary>
-      <ul class="ck-calibration__others-list">
-        <% other_calibrations.each do |other| %>
-          <li class="ck-calibration__others-item ck-calibration__others-item--<%= other.verdict %>">
-            <div class="ck-calibration__others-row">
-              <span class="ck-calibration__others-verdict">
+      <ul class="ck-agreement__others-list">
+        <% other_agreements.each do |other| %>
+          <li class="ck-agreement__others-item ck-agreement__others-item--<%= other.verdict %>">
+            <div class="ck-agreement__others-row">
+              <span class="ck-agreement__others-verdict">
                 <%= heroicon_tag verdict_icons[other.verdict], variant: :outline, size: 14, "aria-hidden": "true" %>
                 <%= other.verdict %>
               </span>
-              <span class="ck-calibration__others-by"><%= other.created_by %></span>
+              <span class="ck-agreement__others-by"><%= other.created_by %></span>
               <% if other.corrected_score %>
-                <span class="ck-calibration__others-stars" aria-label="<%= pluralize(other.corrected_score.to_i, 'star') %>" title="<%= pluralize(other.corrected_score.to_i, 'star') %>">
+                <span class="ck-agreement__others-stars" aria-label="<%= pluralize(other.corrected_score.to_i, 'star') %>" title="<%= pluralize(other.corrected_score.to_i, 'star') %>">
                   <% 5.times do |i| %>
                     <svg viewBox="0 0 24 24" width="12" height="12" stroke-width="1.75" class="ck-star <%= i < other.corrected_score.to_i ? "ck-star--filled" : "ck-star--empty" %>" aria-hidden="true"><polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2"/></svg>
                   <% end %>
@@ -65,7 +65,7 @@
               <% end %>
             </div>
             <% if other.note.to_s.present? %>
-              <p class="ck-calibration__others-note">"<%= other.note.to_s.truncate(140) %>"</p>
+              <p class="ck-agreement__others-note">"<%= other.note.to_s.truncate(140) %>"</p>
             <% end %>
           </li>
         <% end %>
@@ -74,10 +74,10 @@
   <% end %>
   <% if active_verdict == "disagree" %>
-    <% existing_score = (calibration&.corrected_score || review&.ai_score)&.round %>
-    <%= form_with url: run_response_calibrations_path(run, response_row),
+    <% existing_score = (agreement&.corrected_score || review&.ai_score)&.round %>
+    <%= form_with url: run_response_agreements_path(run, response_row),
                   method: :post, local: false,
-                  class: "ck-calibration__detail" do |f| %>
+                  class: "ck-agreement__detail" do |f| %>
       <%= hidden_field_tag :metric_id, metric.id %>
       <%= hidden_field_tag :verdict, "disagree" %>
       <p class="ck-label">What should the score have been?</p>
@@ -93,16 +93,16 @@
           <% end %>
         </div>
       </fieldset>
-      <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="Why? (optional)"><%= calibration&.note %></textarea>
+      <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="Why? (optional)"><%= agreement&.note %></textarea>
       <%= f.submit "Save", class: "#{ck_button_classes(:dark)}#{' ck-button--just-saved' if local_assigns[:just_saved]}" %>
     <% end %>
   <% elsif active_verdict == "borderline" %>
-    <%= form_with url: run_response_calibrations_path(run, response_row),
+    <%= form_with url: run_response_agreements_path(run, response_row),
                   method: :post, local: false,
-                  class: "ck-calibration__detail" do |f| %>
+                  class: "ck-agreement__detail" do |f| %>
       <%= hidden_field_tag :metric_id, metric.id %>
       <%= hidden_field_tag :verdict, "borderline" %>
-      <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="What made this borderline? (optional)"><%= calibration&.note %></textarea>
+      <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="What made this borderline? (optional)"><%= agreement&.note %></textarea>
       <%= f.submit "Save", class: "#{ck_button_classes(:dark)}#{' ck-button--just-saved' if local_assigns[:just_saved]}" %>
     <% end %>
   <% end %>

data/app/views/completion_kit/{calibrations → agreements}/_trust_panel.html.erb RENAMED Viewed

@@ -4,7 +4,7 @@
 <% current_metric_version = metric && CompletionKit::MetricVersion.current.find_by(metric_id: metric.id) %>
 <% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric && current_metric_version
      created_by = CompletionKit.config.username.presence || "operator"
-     verdicted_ids = CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
+     verdicted_ids = CompletionKit::Agreement.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
      CompletionKit::Response.joins(:reviews)
        .where(reviews: { metric_id: metric.id, metric_version_id: current_metric_version.id })
        .where.not(reviews: { ai_score: nil })
@@ -12,20 +12,20 @@
        .order(created_at: :desc).first
    end %>
 <% prior_version_verdicts = if stats.sample_size.zero? && metric && current_metric_version
-     CompletionKit::Calibration.where(metric_id: metric.id).where.not(metric_version_id: current_metric_version.id).count
+     CompletionKit::Agreement.where(metric_id: metric.id).where.not(metric_version_id: current_metric_version.id).count
    else
      0
    end %>
-<p class="ck-trust-line ck-trust-line--<%= stats.gate %>">
+<p id="agreement" class="ck-trust-line ck-trust-line--<%= stats.gate %>">
   <% if stats.sample_size.zero? %>
     <span class="ck-trust-line__lead">Not measured yet.</span>
-    <span class="ck-trust-line__hint"><%= current_metric_version ? "#{current_metric_version.version_label} needs" : "Needs" %> <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> human reviews of the judge's scores.</span>
+    <span class="ck-trust-line__hint"><%= current_metric_version ? "#{current_metric_version.version_label} needs" : "Needs" %> <%= CompletionKit::MetricAgreementStats::PROVISIONAL_MIN %> human reviews of the judge's scores.</span>
     <% if target_response %>
       <%= link_to "Review a judge's score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
     <% end %>
   <% elsif stats.counter_only? %>
-    <span class="ck-cal-stat"><span class="ck-cal-stat__label">Human reviews</span> <strong><%= stats.sample_size %> / <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></strong></span>
+    <span class="ck-cal-stat"><span class="ck-cal-stat__label">Human reviews</span> <strong><%= stats.sample_size %> / <%= CompletionKit::MetricAgreementStats::PROVISIONAL_MIN %></strong></span>
     <% if stats.short_to_target > 0 %><span class="ck-trust-line__hint"><%= stats.short_to_target %> more to report a rate</span><% end %>
     <% if target_response %>
       <%= link_to "Review another score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>

data/app/views/completion_kit/api_reference/_body.html.erb CHANGED Viewed

@@ -17,7 +17,7 @@
   <input type="radio" name="ck-api-tab" id="ck-tab-datasets" class="ck-api-tabs__radio">
   <input type="radio" name="ck-api-tab" id="ck-tab-metrics" class="ck-api-tabs__radio">
   <input type="radio" name="ck-api-tab" id="ck-tab-metric-groups" class="ck-api-tabs__radio">
-  <input type="radio" name="ck-api-tab" id="ck-tab-calibrations" class="ck-api-tabs__radio">
+  <input type="radio" name="ck-api-tab" id="ck-tab-agreements" class="ck-api-tabs__radio">
   <input type="radio" name="ck-api-tab" id="ck-tab-tags" class="ck-api-tabs__radio">
   <input type="radio" name="ck-api-tab" id="ck-tab-providers" class="ck-api-tabs__radio">
@@ -29,7 +29,7 @@
     <label for="ck-tab-datasets" class="ck-api-tabs__label">Datasets <span class="ck-api-tabs__count">5</span></label>
     <label for="ck-tab-metrics" class="ck-api-tabs__label">Metrics <span class="ck-api-tabs__count">10</span></label>
     <label for="ck-tab-metric-groups" class="ck-api-tabs__label">Metric Groups <span class="ck-api-tabs__count">5</span></label>
-    <label for="ck-tab-calibrations" class="ck-api-tabs__label">Calibrations <span class="ck-api-tabs__count">3</span></label>
+    <label for="ck-tab-agreements" class="ck-api-tabs__label">Agreements <span class="ck-api-tabs__count">3</span></label>
     <label for="ck-tab-tags" class="ck-api-tabs__label">Tags <span class="ck-api-tabs__count">5</span></label>
     <label for="ck-tab-providers" class="ck-api-tabs__label">Providers <span class="ck-api-tabs__count">5</span></label>
   </nav>
@@ -238,8 +238,8 @@
             } %>
       <div class="ck-api-endpoint" style="padding-top: 1.5rem;">
-        <p class="ck-kicker" style="margin-bottom: 0.5rem;">Calibration loop</p>
-        <p class="ck-meta-copy">Drive metric improvement from disagree-flagged calibrations: ask the model to rewrite the instruction and rubric into a new draft version.</p>
+        <p class="ck-kicker" style="margin-bottom: 0.5rem;">Agreement loop</p>
+        <p class="ck-meta-copy">Drive metric improvement from disagree-flagged agreements: ask the model to rewrite the instruction and rubric into a new draft version.</p>
       </div>
       <div class="ck-api-endpoint">
         <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/metrics/:id/suggest_variants</p>
@@ -250,7 +250,7 @@
       <div class="ck-api-endpoint" style="padding-top: 1.5rem;">
         <p class="ck-kicker" style="margin-bottom: 0.5rem;">Metric versions</p>
-        <p class="ck-meta-copy">Every metric carries a history of versions (the current published one, prior published ones, and any draft suggestions). Reviews and calibrations record the version they ran against, so the API can surface stale state and let you revert.</p>
+        <p class="ck-meta-copy">Every metric carries a history of versions (the current published one, prior published ones, and any draft suggestions). Reviews and agreements record the version they ran against, so the API can surface stale state and let you revert.</p>
       </div>
       <div class="ck-api-endpoint">
         <p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> /api/v1/metrics/:metric_id/metric_versions</p>
@@ -294,23 +294,23 @@
     </div>
     <div class="ck-api-tabs__panel">
-      <h2 class="ck-section-title">Calibrations</h2>
-      <p class="ck-copy">Per-verdict feedback events on a response/metric pair: agree, disagree (with a corrected score and note), or borderline. Calibrations capture the metric version that was current when the verdict was cast, which is what drives the trust signal and the "stale" indicators across the rest of the API.</p>
+      <h2 class="ck-section-title">Agreements</h2>
+      <p class="ck-copy">Per-verdict feedback events on a response/metric pair: agree, disagree (with a corrected score and note), or borderline. Agreements capture the metric version that was current when the verdict was cast, which is what drives the trust signal and the "stale" indicators across the rest of the API.</p>
       <div class="ck-api-endpoint">
-        <p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> /api/v1/calibrations</p>
-        <p class="ck-meta-copy">List calibrations across all runs. Supports filtering by any combination of the query params below.</p>
+        <p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> /api/v1/agreements</p>
+        <p class="ck-meta-copy">List agreements across all runs. Supports filtering by any combination of the query params below.</p>
         <p class="ck-api-params"><strong>Optional filters:</strong>&ensp;<code>run_id</code>, <code>response_id</code>, <code>metric_id</code>, <code>metric_version_id</code>, <code>created_by</code>, <code>verdict</code> (<code>agree</code>, <code>disagree</code>, or <code>borderline</code>)</p>
-        <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl \"#{base_url}/api/v1/calibrations?metric_id=1&verdict=disagree\" \\\n  -H \"Authorization: Bearer #{token}\"" %>
+        <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl \"#{base_url}/api/v1/agreements?metric_id=1&verdict=disagree\" \\\n  -H \"Authorization: Bearer #{token}\"" %>
       </div>
       <div class="ck-api-endpoint">
-        <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/runs/:run_id/responses/:response_id/metrics/:metric_id/calibrations</p>
-        <p class="ck-meta-copy">Cast a calibration on a specific response/metric pair. The metric version on the record is set automatically from the run's review.</p>
+        <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/runs/:run_id/responses/:response_id/metrics/:metric_id/agreements</p>
+        <p class="ck-meta-copy">Cast an agreement on a specific response/metric pair. The metric version on the record is set automatically from the run's review.</p>
         <p class="ck-api-params"><strong>Required:</strong>&ensp;<code>verdict</code>, <code>created_by</code>&emsp;<strong>Optional:</strong>&ensp;<code>corrected_score</code>, <code>note</code></p>
-        <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/runs/1/responses/42/metrics/3/calibrations \\\n  -H \"Authorization: Bearer #{token}\" \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\"verdict\": \"disagree\", \"corrected_score\": 3, \"note\": \"too generous\", \"created_by\": \"alice\"}'" %>
+        <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/runs/1/responses/42/metrics/3/agreements \\\n  -H \"Authorization: Bearer #{token}\" \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\"verdict\": \"disagree\", \"corrected_score\": 3, \"note\": \"too generous\", \"created_by\": \"alice\"}'" %>
       </div>
       <div class="ck-api-endpoint">
-        <p class="ck-api-method"><span class="ck-chip" style="color: var(--ck-danger);">DELETE</span> /api/v1/calibrations/:id</p>
-        <p class="ck-meta-copy">Delete a calibration. Returns 204 No Content.</p>
+        <p class="ck-api-method"><span class="ck-chip" style="color: var(--ck-danger);">DELETE</span> /api/v1/agreements/:id</p>
+        <p class="ck-meta-copy">Delete an agreement. Returns 204 No Content.</p>
       </div>
     </div>

data/app/views/completion_kit/metrics/_guiding_examples.html.erb CHANGED Viewed

@@ -12,7 +12,7 @@
             <span class="ck-guiding__output"><%= truncate(example[:output].to_s, length: 90) %></span>
             <span class="ck-guiding__scores"><span class="ck-guiding__judge"><%= example[:judge_score].to_i %></span> &rarr; <span class="ck-guiding__human"><%= example[:human_score].to_i %></span></span>
           <% end %>
-          <%= button_to exclude_example_metric_path(metric, calibration_id: example[:id]),
+          <%= button_to exclude_example_metric_path(metric, agreement_id: example[:id]),
                 method: :post, form_class: "inline-block", class: "ck-icon-btn",
                 title: "Stop using this case", "aria-label": "Stop using this case",
                 data: { turbo_confirm: "Stop using this corrected case to guide the judge?" } do %><%= heroicon_tag "x-mark", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>

data/app/views/completion_kit/metrics/edit.html.erb CHANGED Viewed

@@ -10,7 +10,7 @@
   </div>
 </section>
-<% if CompletionKit.config.judge_calibration_enabled && @suggestion_draft.nil? && @edit_draft.nil? && @improve_disagreement_count.to_i.positive? %>
+<% if CompletionKit.config.judge_agreement_enabled && @suggestion_draft.nil? && @edit_draft.nil? && @improve_disagreement_count.to_i.positive? %>
   <div class="ck-suggestion-banner" role="status">
     <div class="ck-suggestion-banner__body">
       <p class="ck-kicker">Improve from reviews</p>

data/app/views/completion_kit/metrics/show.html.erb CHANGED Viewed

@@ -42,7 +42,7 @@
   </div>
 </section>
-<% if CompletionKit.config.judge_calibration_enabled && @versions.any? %>
+<% if CompletionKit.config.judge_agreement_enabled && @versions.any? %>
   <% predecessor_of = @versions.index_with { |v| @versions.detect { |o| o.version_number < v.version_number } } %>
   <section class="ck-card ck-card--spaced">
     <p class="ck-kicker">Versions</p>
@@ -73,7 +73,7 @@
                     <%= button_to "Make current", publish_draft_metric_path(@metric, draft_id: v.id),
                           method: :post, form_class: "inline-block",
                           class: "ck-chip ck-chip--publish",
-                          data: { turbo_confirm: "Make #{v.version_label} the version to use? #{v.version_label} will be used in test runs using this metric now. Reviews you have already given stay with the version they were made against." } %>
+                          data: { turbo_confirm: "Make #{v.version_label} the version to use? It becomes the version used in test runs, and the reviews you gave on it count again. Reviews on the version you're leaving stay with it." } %>
                   <% end %>
                 </div>
                 <% vs = v.validation_summary %>
@@ -171,7 +171,7 @@
             <span class="ck-modal__foot-note">Roll this metric back to this version.</span>
             <%= button_to "Make #{v.version_label} current →", publish_draft_metric_path(@metric, draft_id: v.id),
                   method: :post, form_class: "inline-block", class: ck_button_classes(:dark),
-                  data: { turbo_confirm: "Make #{v.version_label} the version to use? #{v.version_label} will be used in test runs using this metric now. Reviews you have already given stay with the version they were made against." } %>
+                  data: { turbo_confirm: "Make #{v.version_label} the version to use? It becomes the version used in test runs, and the reviews you gave on it count again. Reviews on the version you're leaving stay with it." } %>
           <% end %>
         </footer>
       </article>
@@ -179,7 +179,7 @@
   <% end %>
 <% end %>
-<% if CompletionKit.config.judge_calibration_enabled %>
+<% if CompletionKit.config.judge_agreement_enabled %>
   <% draft = @suggestion_draft || @edit_draft %>
   <section class="ck-card ck-card--spaced">
     <div class="ck-prompt-preview__header">
@@ -197,8 +197,8 @@
     <%= turbo_stream_from "metric_#{@metric.id}_suggestion" %>
     <div id="ck-suggestion-status-<%= @metric.id %>" class="ck-suggestion-status"></div>
     <p class="ck-meta-copy">How often the judge lands on the same score you would. Review its scores to build that signal, and improve the metric to raise it.</p>
-    <%= render "completion_kit/calibrations/trust_panel",
-          stats: CompletionKit::MetricCalibrationStats.for(@metric),
+    <%= render "completion_kit/agreements/trust_panel",
+          stats: CompletionKit::MetricAgreementStats.for(@metric),
           metric: @metric %>
     <% if CompletionKit.config.judge_examples_from_reviews %>
       <%= render "completion_kit/metrics/guiding_examples", metric: @metric, examples: @guiding_examples %>

data/app/views/completion_kit/responses/show.html.erb CHANGED Viewed

@@ -124,13 +124,13 @@
           <% if review.ai_feedback.present? %>
             <p class="ck-review-card__feedback"><%= review.ai_feedback %></p>
           <% end %>
-          <% if CompletionKit.config.judge_calibration_enabled && review.metric && review.ai_score %>
-            <% existing = CompletionKit::Calibration.find_by(
+          <% if CompletionKit.config.judge_agreement_enabled && review.metric && review.ai_score %>
+            <% existing = CompletionKit::Agreement.find_by(
                  response_id: @response.id, metric_id: review.metric_id,
                  created_by: CompletionKit.config.username.presence || "operator"
                ) %>
-            <%= render "completion_kit/calibrations/buttons",
-                  review: review, calibration: existing, run: @run,
+            <%= render "completion_kit/agreements/buttons",
+                  review: review, agreement: existing, run: @run,
                   response_row: @response, metric: review.metric %>
           <% end %>
         </div>

data/app/views/completion_kit/runs/show.html.erb CHANGED Viewed

@@ -18,7 +18,7 @@
   <% dataset_preview_lines = dataset_lines.first(50) %>
 <% end %>
-<% if CompletionKit.config.judge_calibration_enabled %>
+<% if CompletionKit.config.judge_agreement_enabled %>
   <% stale_summary = @run.stale_review_summary %>
   <% if stale_summary.any? %>
     <div class="ck-stale-versions-banner" role="status">

data/config/routes.rb CHANGED Viewed

@@ -41,7 +41,7 @@ CompletionKit::Engine.routes.draw do
       get :compare
     end
     resources :responses, only: [:show] do
-      resources :calibrations, only: [:create]
+      resources :agreements, only: [:create]
     end
   end
@@ -75,7 +75,7 @@ CompletionKit::Engine.routes.draw do
         end
         resources :responses, only: [:index, :show] do
           resources :metrics, only: [] do
-            resources :calibrations, only: [:index, :create]
+            resources :agreements, only: [:index, :create]
           end
         end
       end
@@ -93,7 +93,7 @@ CompletionKit::Engine.routes.draw do
       resources :metric_groups
       resources :tags
       resources :provider_credentials
-      resources :calibrations, only: [:index, :destroy]
+      resources :agreements, only: [:index, :destroy]
     end
   end

data/db/migrate/20260531000002_backfill_review_metric_versions.rb ADDED Viewed

@@ -0,0 +1,33 @@
+class BackfillReviewMetricVersions < ActiveRecord::Migration[8.1]  # Data migration: seed a version row for metrics that have none, then repoint reviews at their metric's current version.
+  def up  # Runs two raw SQL statements: an INSERT that seeds versions, then an UPDATE that fixes review references.
+    quoted_true = ActiveRecord::Base.connection.quote(true)  # Boolean literal quoted by the connection so it can be interpolated into the SQL below.
+    now = ActiveRecord::Base.connection.quote(Time.current)  # One quoted timestamp reused for published_at, created_at and updated_at.
+    execute <<~SQL  # Insert a current, 'published' version 1 (copying instruction and rubric_bands) for every metric with no version rows.
+      INSERT INTO completion_kit_metric_versions
+        (metric_id, instruction, rubric_bands, current, state, version_number, published_at, created_at, updated_at)
+      SELECT m.id, m.instruction, m.rubric_bands, #{quoted_true}, 'published', 1, #{now}, #{now}, #{now}
+      FROM completion_kit_metrics m
+      WHERE NOT EXISTS (
+        SELECT 1 FROM completion_kit_metric_versions mv WHERE mv.metric_id = m.id
+      )
+    SQL
+    execute <<~SQL  # For reviews with a metric whose metric_version_id is NULL or matches no existing version row, set it to that metric's current version.
+      UPDATE completion_kit_reviews
+      SET metric_version_id = (
+        SELECT mv.id FROM completion_kit_metric_versions mv
+        WHERE mv.metric_id = completion_kit_reviews.metric_id AND mv.current = #{quoted_true}
+        LIMIT 1
+      )
+      WHERE metric_id IS NOT NULL
+        AND (
+          metric_version_id IS NULL
+          OR metric_version_id NOT IN (SELECT id FROM completion_kit_metric_versions)
+        )
+    SQL
+  end
+  def down  # No-op: rolling back leaves the inserted versions and updated review references in place.
+  end
+end

data/db/migrate/20260531000003_add_metric_version_fk_to_reviews.rb ADDED Viewed

@@ -0,0 +1,6 @@
+class AddMetricVersionFkToReviews < ActiveRecord::Migration[8.1]  # Adds a foreign key from completion_kit_reviews.metric_version_id to completion_kit_metric_versions.
+  def change  # Single add_foreign_key call; no explicit up/down pair is defined.
+    add_foreign_key :completion_kit_reviews, :completion_kit_metric_versions,
+                    column: :metric_version_id, on_delete: :nullify  # on_delete: :nullify sets the review's metric_version_id to NULL when the referenced version row is deleted.
+  end
+end