RubyGems - completion-kit - Versions diffs - 0.6.0 → 0.7.0 - Mend

completion-kit 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

data/app/models/completion_kit/review.rb CHANGED Viewed

@@ -1,37 +1,25 @@
 module CompletionKit
   class Review < ApplicationRecord
-    STATUSES = %w[pending retrying succeeded failed].freeze
-    TERMINAL_STATUSES = %w[succeeded failed].freeze
+    include HasJobStatus
     belongs_to :response
     belongs_to :metric, optional: true
     belongs_to :metric_version, optional: true
     has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
-    def stale_against_current_judge?
-      return false unless metric_id && metric_version_id
-      current_id = MetricVersion.current.where(metric_id: metric_id).limit(1).pick(:id)
-      return false if current_id.nil?
-      metric_version_id != current_id
-    end
     validates :metric_name, presence: true
-    validates :status, inclusion: { in: STATUSES }
     validates :ai_score, numericality: { greater_than_or_equal_to: 1, less_than_or_equal_to: 5 }, allow_nil: true
     before_validation :set_default_status
-    def terminal?
-      TERMINAL_STATUSES.include?(status)
-    end
-    def succeeded?
-      status == "succeeded"
-    end
+    after_save_commit :broadcast_parent_row_update, unless: :destroyed?
+    after_save_commit :broadcast_run_progress, if: :should_broadcast_progress?
-    def error_payload
-      return nil if error_class.blank?
-      { provider: error_provider, class: error_class, status: error_status, message: error_message }
+    def stale_against_current_judge?
+      return false unless metric_id && metric_version_id
+      current_id = MetricVersion.current.where(metric_id: metric_id).limit(1).pick(:id)
+      return false if current_id.nil?
+      metric_version_id != current_id
     end
     def as_json(options = {})
@@ -46,8 +34,16 @@ module CompletionKit
     private
-    def set_default_status
-      self.status ||= "pending"
+    def broadcast_parent_row_update
+      response.run.broadcast_response_update(response)
+    end
+    def broadcast_run_progress
+      response.run.broadcast_progress
+    end
+    def should_broadcast_progress?
+      saved_change_to_status? && terminal?
     end
   end
 end

data/app/models/completion_kit/run.rb CHANGED Viewed

@@ -43,7 +43,7 @@ module CompletionKit
     end
     def outstanding_work_zero?
-      return false if responses.where.not(status: Response::TERMINAL_STATUSES).exists?
+      return false if responses.where.not(status: HasJobStatus::TERMINAL_STATUSES).exists?
       metric_ids = metrics.pluck(:id)
       return true if metric_ids.empty?
@@ -55,7 +55,7 @@ module CompletionKit
       terminal_review_count = Review.where(
         response_id: succeeded_response_ids,
         metric_id: metric_ids,
-        status: Review::TERMINAL_STATUSES
+        status: HasJobStatus::TERMINAL_STATUSES
       ).count
       terminal_review_count >= expected_reviews
@@ -118,6 +118,10 @@ module CompletionKit
     end
     def start!
+      unless %w[pending failed].include?(status)
+        return fail_with_summary!("Cannot start a run in state \"#{status}\". Use rerun to create a fresh copy, or retry_failures / regrade to work with the existing responses.")
+      end
       rows = if dataset
                CsvProcessor.process_self(self)
              else
@@ -161,7 +165,7 @@ module CompletionKit
           response = responses.create!(attrs)
           if judge_only?
-            metrics.each { |m| JudgeReviewJob.perform_later(response.id, m.id) } if judge_configured?
+            metrics.each { |m| JudgeReviewJob.perform_later(response.id, m.id, id) } if judge_configured?
           else
             GenerateRowJob.perform_later(id, response.id)
           end
@@ -202,7 +206,7 @@ module CompletionKit
         update!(status: "running", failure_summary: nil, error_message: nil)
         response_ids.each do |rid|
-          grading_metrics.each { |m| JudgeReviewJob.perform_later(rid, m.id) }
+          grading_metrics.each { |m| JudgeReviewJob.perform_later(rid, m.id, id) }
         end
         RunCompletionCheckJob.perform_later(id)
       end
@@ -272,17 +276,6 @@ module CompletionKit
       }
     end
-    private
-    def fail_with_summary!(message)
-      errors.add(:base, message)
-      if persisted?
-        update_columns(status: "failed", failure_summary: message, error_message: message)
-        broadcast_ui
-      end
-      false
-    end
     def broadcast_ui
       broadcast_progress
       broadcast_status_header
@@ -290,14 +283,6 @@ module CompletionKit
       broadcast_sort_toolbar
     end
-    def render_engine_partial(partial, locals)
-      CompletionKit::Engine.warm_routes!
-      CompletionKit::ApplicationController.render(
-        partial: partial,
-        locals: locals
-      )
-    end
     def broadcast_progress
       reload
       broadcast_replace_to(
@@ -356,6 +341,25 @@ module CompletionKit
       )
     end
+    private
+    def fail_with_summary!(message)
+      errors.add(:base, message)
+      if persisted?
+        update_columns(status: "failed", failure_summary: message, error_message: message)
+        broadcast_ui
+      end
+      false
+    end
+    def render_engine_partial(partial, locals)
+      CompletionKit::Engine.warm_routes!
+      CompletionKit::ApplicationController.render(
+        partial: partial,
+        locals: locals
+      )
+    end
     def set_default_status
       self.status ||= "pending"
     end

data/app/models/concerns/completion_kit/has_job_status.rb ADDED Viewed

@@ -0,0 +1,31 @@
+module CompletionKit
+  module HasJobStatus # concern bundling the status constants, status validation, predicates and error payload
+    extend ActiveSupport::Concern
+    STATUSES = %w[pending retrying succeeded failed].freeze # the full set accepted by the status inclusion validation
+    TERMINAL_STATUSES = %w[succeeded failed].freeze # the subset for which terminal? answers true
+    included do
+      validates :status, inclusion: { in: STATUSES } # declared on every class that includes this concern
+    end
+    def terminal? # true when status is one of TERMINAL_STATUSES
+      TERMINAL_STATUSES.include?(status)
+    end
+    def succeeded? # true only when status equals the succeeded string exactly
+      status == "succeeded"
+    end
+    def error_payload # nil when error_class is blank, otherwise a hash built from the four error_* readers
+      return nil if error_class.blank?
+      { provider: error_provider, class: error_class, status: error_status, message: error_message } # assumes the including model exposes these readers - TODO confirm
+    end
+    private
+    def set_default_status # assigns pending only when status is nil or false; the concern itself registers no callback for it
+      self.status ||= "pending"
+    end
+  end
+end

data/app/services/completion_kit/mcp_dispatcher.rb CHANGED Viewed

@@ -32,6 +32,7 @@ module CompletionKit
         McpTools::Datasets.definitions +
         McpTools::Metrics.definitions +
         McpTools::MetricGroups.definitions +
+        McpTools::MetricVersions.definitions +
         McpTools::ProviderCredentials.definitions +
         McpTools::Tags.definitions +
         McpTools::Calibrations.definitions +
@@ -44,8 +45,9 @@ module CompletionKit
       when /\Aruns_/                 then McpTools::Runs.call(name, arguments)
       when /\Aresponses_/            then McpTools::Responses.call(name, arguments)
       when /\Adatasets_/             then McpTools::Datasets.call(name, arguments)
-      when /\Ametrics_/              then McpTools::Metrics.call(name, arguments)
+      when /\Ametric_versions_/      then McpTools::MetricVersions.call(name, arguments)
       when /\Ametric_groups_/        then McpTools::MetricGroups.call(name, arguments)
+      when /\Ametrics_/              then McpTools::Metrics.call(name, arguments)
       when /\Aprovider_credentials_/ then McpTools::ProviderCredentials.call(name, arguments)
       when /\Atags_/                 then McpTools::Tags.call(name, arguments)
       when /\Acalibrations_/         then McpTools::Calibrations.call(name, arguments)

data/app/services/completion_kit/mcp_tools/metric_versions.rb ADDED Viewed

@@ -0,0 +1,67 @@
+module CompletionKit
+  module McpTools
+    module MetricVersions # MCP tool group exposing list / publish / dismiss for MetricVersion records
+      extend Base # NOTE(review): text_result, error_result, call and definitions are not defined in this file - presumably supplied by Base, confirm
+      TOOLS = { # tool name => description, JSON input schema, and the symbol of the handler method below
+        "metric_versions_list" => {
+          description: "List every MetricVersion (drafts + published) for a metric, newest first. Each row carries version_number, state, source, current flag, and timestamps.",
+          inputSchema: {
+            type: "object",
+            properties: {
+              metric_id: { type: "integer" }
+            },
+            required: ["metric_id"]
+          },
+          handler: :list
+        },
+        "metric_versions_publish" => {
+          description: "Publish a MetricVersion as the live judge for its metric. Works for both 'draft → published' and 'revert to an older published version → current'. Transactionally flips current, demotes peers, and writes the version's instruction + rubric_bands back onto the metric so the judge actually uses them.",
+          inputSchema: {
+            type: "object",
+            properties: {
+              metric_version_id: { type: "integer" }
+            },
+            required: ["metric_version_id"]
+          },
+          handler: :publish
+        },
+        "metric_versions_dismiss" => {
+          description: "Destroy a draft MetricVersion (use for either source: 'edit' or source: 'suggestion'). Published versions are refused — to demote a published version, publish a different one as current instead.",
+          inputSchema: {
+            type: "object",
+            properties: {
+              metric_version_id: { type: "integer" }
+            },
+            required: ["metric_version_id"]
+          },
+          handler: :dismiss
+        }
+      }.freeze
+      def self.list(args) # returns the as_json form of every version of one metric, highest version_number first
+        metric = CompletionKit::Metric.find(args["metric_id"]) # the metric is loaded first; only its id is used in the query below
+        versions = CompletionKit::MetricVersion.where(metric_id: metric.id).order(version_number: :desc)
+        text_result(versions.map(&:as_json))
+      end
+      def self.publish(args) # makes the given version live, choosing between revert! and publish! by its current state
+        version = CompletionKit::MetricVersion.find(args["metric_version_id"])
+        if version.published? && !version.current? # published but not current: take the revert! path
+          audit = version.revert!
+          text_result(audit.as_json) # responds with whatever revert! returned rather than with the version looked up above
+        else
+          version.publish! # every other case lands here, including a version that is already published and current
+          text_result(version.reload.as_json) # reload before serialising the version
+        end
+      end
+      def self.dismiss(args) # destroys the version unless it is published, in which case an error result is returned instead
+        version = CompletionKit::MetricVersion.find(args["metric_version_id"])
+        return error_result("Cannot dismiss a published version. Publish a different version as current instead.") if version.published?
+        version.destroy!
+        text_result({id: version.id, destroyed: true}) # id is read from the in-memory object after destroy!
+      end
+    end
+  end
+end

data/app/services/completion_kit/starter_metrics.rb CHANGED Viewed

@@ -21,8 +21,8 @@ module CompletionKit
         key: "instruction_following",
         name: "Instruction following",
         description: "Did the model do everything that was asked?",
-        catches: "The response is factually right but ignores \"answer in two sentences\", \"use bullet points\", \"do not include X\". Distinct from Correctness — a response can be right and still fail this.",
-        instruction: "Did the model do every concrete thing the prompt asked for? Score against the explicit requirements in the prompt (format constraints, count limits, exclusions, audience cues). Factual accuracy is a different dimension — score that elsewhere.",
+        catches: "The response is factually right but ignores \"answer in two sentences\", \"use bullet points\", \"do not include X\". Distinct from Correctness; a response can be right and still fail this.",
+        instruction: "Did the model do every concrete thing the prompt asked for? Score against the explicit requirements in the prompt (format constraints, count limits, exclusions, audience cues). Factual accuracy is a different dimension; score that elsewhere.",
         rubric_bands: [
           { "stars" => 5, "description" => "Followed every requirement in the prompt exactly." },
           { "stars" => 4, "description" => "Followed every requirement with a small slip." },
@@ -36,7 +36,7 @@ module CompletionKit
         name: "Format compliance",
         description: "Does the output follow the required structure?",
         catches: "Invalid JSON, missing schema fields, extra prose around a structured response, wrong casing on keys. Critical for any LLM wired into an API.",
-        instruction: "Does the output match the format the prompt asked for — JSON shape, schema, keys, casing, no stray prose? Score on whether a downstream parser would accept it without massaging.",
+        instruction: "Does the output match the format the prompt asked for: JSON shape, schema, keys, casing, no stray prose? Score on whether a downstream parser would accept it without massaging.",
         rubric_bands: [
           { "stars" => 5, "description" => "Exact spec, ready to consume programmatically." },
           { "stars" => 4, "description" => "Spec-compliant with one cosmetic issue." },
@@ -62,9 +62,9 @@ module CompletionKit
       Starter.new(
         key: "conciseness",
         name: "Conciseness",
-        description: "Is it the right length — no padding, no missing detail?",
+        description: "Is it the right length, no padding, no missing detail?",
         catches: "Rambling responses, repetitive caveats, over-hedging. LLMs default to verbose. Conciseness is the dimension where users most often see scores move after tuning.",
-        instruction: "Is the output the right length for the task — no padding, no missing detail, no hedging filler? Penalise rambling, repetition, over-caveating, and unnecessary preamble. Penalise too-short outputs that drop information.",
+        instruction: "Is the output the right length for the task: no padding, no missing detail, no hedging filler? Penalise rambling, repetition, over-caveating, and unnecessary preamble. Penalise too-short outputs that drop information.",
         rubric_bands: [
           { "stars" => 5, "description" => "Exactly as long as the task needs, no more, no less." },
           { "stars" => 4, "description" => "Right length with a small redundancy." },

data/app/views/completion_kit/api_reference/_body.html.erb CHANGED Viewed

@@ -17,17 +17,19 @@
   <input type="radio" name="ck-api-tab" id="ck-tab-datasets" class="ck-api-tabs__radio">
   <input type="radio" name="ck-api-tab" id="ck-tab-metrics" class="ck-api-tabs__radio">
   <input type="radio" name="ck-api-tab" id="ck-tab-metric-groups" class="ck-api-tabs__radio">
+  <input type="radio" name="ck-api-tab" id="ck-tab-calibrations" class="ck-api-tabs__radio">
   <input type="radio" name="ck-api-tab" id="ck-tab-tags" class="ck-api-tabs__radio">
   <input type="radio" name="ck-api-tab" id="ck-tab-providers" class="ck-api-tabs__radio">
   <nav class="ck-api-tabs__nav">
-    <label for="ck-tab-mcp" class="ck-api-tabs__label">MCP <span class="ck-api-tabs__count">35</span></label>
+    <label for="ck-tab-mcp" class="ck-api-tabs__label">MCP <span class="ck-api-tabs__count"><%= CompletionKit::McpDispatcher.tool_definitions.size %></span></label>
     <label for="ck-tab-prompts" class="ck-api-tabs__label">Prompts <span class="ck-api-tabs__count">6</span></label>
-    <label for="ck-tab-runs" class="ck-api-tabs__label">Runs <span class="ck-api-tabs__count">7</span></label>
+    <label for="ck-tab-runs" class="ck-api-tabs__label">Runs <span class="ck-api-tabs__count">10</span></label>
     <label for="ck-tab-responses" class="ck-api-tabs__label">Responses <span class="ck-api-tabs__count">2</span></label>
     <label for="ck-tab-datasets" class="ck-api-tabs__label">Datasets <span class="ck-api-tabs__count">5</span></label>
-    <label for="ck-tab-metrics" class="ck-api-tabs__label">Metrics <span class="ck-api-tabs__count">5</span></label>
+    <label for="ck-tab-metrics" class="ck-api-tabs__label">Metrics <span class="ck-api-tabs__count">12</span></label>
     <label for="ck-tab-metric-groups" class="ck-api-tabs__label">Metric Groups <span class="ck-api-tabs__count">5</span></label>
+    <label for="ck-tab-calibrations" class="ck-api-tabs__label">Calibrations <span class="ck-api-tabs__count">3</span></label>
     <label for="ck-tab-tags" class="ck-api-tabs__label">Tags <span class="ck-api-tabs__count">5</span></label>
     <label for="ck-tab-providers" class="ck-api-tabs__label">Providers <span class="ck-api-tabs__count">5</span></label>
   </nav>
@@ -36,7 +38,7 @@
     <div class="ck-api-tabs__panel">
       <h2 class="ck-section-title">MCP Server</h2>
-      <p class="ck-copy">Connect Claude Code, Cursor, or any <a href="https://modelcontextprotocol.io" class="ck-link">MCP</a> client to manage prompts, runs, datasets, and metrics conversationally. 35 tools over streamable HTTP.</p>
+      <p class="ck-copy">Connect Claude Code, Cursor, or any <a href="https://modelcontextprotocol.io" class="ck-link">MCP</a> client to manage prompts, runs, datasets, and metrics conversationally. <%= CompletionKit::McpDispatcher.tool_definitions.size %> tools over streamable HTTP.</p>
       <div class="ck-mcp-install-grid">
         <div class="ck-mcp-install-card">
@@ -116,7 +118,8 @@
       <p class="ck-copy">Create runs, generate LLM responses, and judge them with metrics.</p>
       <div class="ck-api-endpoint">
         <p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> /api/v1/runs</p>
-        <p class="ck-meta-copy">List all runs with response counts and average scores.</p>
+        <p class="ck-meta-copy">List runs with response counts and average scores. Supports pagination (<code>limit</code>, <code>offset</code>) and the following filters.</p>
+        <p class="ck-api-params"><strong>Optional filters:</strong>&ensp;<code>status</code> (<code>pending</code>, <code>running</code>, <code>completed</code>, <code>failed</code>), <code>prompt_id</code>, <code>dataset_id</code>, <code>tag[]</code></p>
       </div>
       <div class="ck-api-endpoint">
         <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/runs</p>
@@ -139,6 +142,24 @@
         <p class="ck-meta-copy">Start generating responses. Returns 202 Accepted. Poll the run to check progress.</p>
         <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/runs/1/generate \\\n  -H \"Authorization: Bearer #{token}\"" %>
       </div>
+      <div class="ck-api-endpoint">
+        <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/runs/:id/retry_failures</p>
+        <p class="ck-meta-copy">Re-queue any responses that failed during generation. Returns 202 Accepted.</p>
+      </div>
+      <div class="ck-api-endpoint">
+        <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/runs/:id/rerun</p>
+        <p class="ck-meta-copy">Clone the run and start generating responses on the copy against the current prompt and metric versions. Returns the new run with 201 Created. Useful for capturing a fresh baseline after metric edits.</p>
+        <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/runs/1/rerun \\\n  -H \"Authorization: Bearer #{token}\"" %>
+      </div>
+      <div class="ck-api-endpoint">
+        <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/runs/:id/regrade</p>
+        <p class="ck-meta-copy">Re-judge the existing successful responses against the current metric versions without regenerating model output. Returns 202 Accepted, or 422 if no responses are eligible.</p>
+      </div>
+      <div class="ck-api-endpoint">
+        <p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> /api/v1/runs/:id/compare?with=:other_id</p>
+        <p class="ck-meta-copy">Side-by-side comparison against another run. Returns <code>{rows: [...], metric_ids: [...]}</code> with one row per input case, per-metric scores on both sides, and the delta. Cases that exist on only one side are still returned with the missing side nulled out.</p>
+        <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl \"#{base_url}/api/v1/runs/1/compare?with=2\" \\\n  -H \"Authorization: Bearer #{token}\"" %>
+      </div>
       <div class="ck-api-endpoint">
         <p class="ck-api-method"><span class="ck-chip ck-chip--soft">PATCH</span> /api/v1/runs/:id</p>
         <p class="ck-meta-copy">Update a run. Accepts same params as create.</p>
@@ -154,7 +175,8 @@
       <p class="ck-copy">Read-only access to generated responses and their review scores. Nested under runs.</p>
       <div class="ck-api-endpoint">
         <p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> /api/v1/runs/:run_id/responses</p>
-        <p class="ck-meta-copy">List all responses for a run, including nested review scores.</p>
+        <p class="ck-meta-copy">List responses for a run, including nested review scores.</p>
+        <p class="ck-api-params"><strong>Optional filters:</strong>&ensp;<code>status</code> (<code>pending</code>, <code>succeeded</code>, <code>failed</code>), plus <code>limit</code> and <code>offset</code></p>
         <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl #{base_url}/api/v1/runs/1/responses \\\n  -H \"Authorization: Bearer #{token}\"" %>
       </div>
       <div class="ck-api-endpoint">
@@ -214,6 +236,48 @@
               { name: m.name, subtitle: m.instruction.presence&.truncate(100),
                 url: "#{base_url}/api/v1/metrics/#{m.id}", dom_id: "metric_ep_#{m.id}" }
             } %>
+      <div class="ck-api-endpoint" style="padding-top: 1.5rem;">
+        <p class="ck-kicker" style="margin-bottom: 0.5rem;">Calibration loop</p>
+        <p class="ck-meta-copy">Drive metric improvement from disagree-flagged calibrations: ask the model for variants, then pin individual cases as few-shot examples on the metric.</p>
+      </div>
+      <div class="ck-api-endpoint">
+        <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/metrics/:id/suggest_variants</p>
+        <p class="ck-meta-copy">Generate draft metric versions from the current disagreements. Returns 201 with the new draft versions, 422 if no disagreements exist or the model produced nothing usable.</p>
+        <p class="ck-api-params"><strong>Optional:</strong>&ensp;<code>count</code>, <code>model</code></p>
+        <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/metrics/1/suggest_variants \\\n  -H \"Authorization: Bearer #{token}\"" %>
+      </div>
+      <div class="ck-api-endpoint">
+        <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/metrics/:id/add_few_shot</p>
+        <p class="ck-meta-copy">Pin a disagree calibration as a few-shot example on the metric. Returns the updated metric, or 404 if the calibration is not a disagree on this metric.</p>
+        <p class="ck-api-params"><strong>Required:</strong>&ensp;<code>calibration_id</code></p>
+      </div>
+      <div class="ck-api-endpoint">
+        <p class="ck-api-method"><span class="ck-chip" style="color: var(--ck-danger);">DELETE</span> /api/v1/metrics/:id/remove_few_shot</p>
+        <p class="ck-meta-copy">Drop the pinned few-shot example by calibration ID.</p>
+        <p class="ck-api-params"><strong>Required:</strong>&ensp;<code>calibration_id</code></p>
+      </div>
+      <div class="ck-api-endpoint" style="padding-top: 1.5rem;">
+        <p class="ck-kicker" style="margin-bottom: 0.5rem;">Metric versions</p>
+        <p class="ck-meta-copy">Every metric carries a history of versions (the current published one, prior published ones, and any draft suggestions). Reviews and calibrations record the version they ran against, so the API can surface stale state and let you revert.</p>
+      </div>
+      <div class="ck-api-endpoint">
+        <p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> /api/v1/metrics/:metric_id/metric_versions</p>
+        <p class="ck-meta-copy">List every version for the metric, newest version_number first.</p>
+      </div>
+      <div class="ck-api-endpoint">
+        <p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> /api/v1/metrics/:metric_id/metric_versions/:id</p>
+        <p class="ck-meta-copy">Get a single version with its instruction, rubric bands, state, and source.</p>
+      </div>
+      <div class="ck-api-endpoint">
+        <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/metrics/:metric_id/metric_versions/:id/publish</p>
+        <p class="ck-meta-copy">Publish the version as current. Works for a draft (promote) or a superseded published version (revert). Copies the version's instruction and rubric back onto the metric.</p>
+      </div>
+      <div class="ck-api-endpoint">
+        <p class="ck-api-method"><span class="ck-chip" style="color: var(--ck-danger);">DELETE</span> /api/v1/metrics/:metric_id/metric_versions/:id</p>
+        <p class="ck-meta-copy">Dismiss a draft version. Returns 204 No Content, or 409 Conflict if the version is published (published versions are immutable history).</p>
+      </div>
     </div>
     <div class="ck-api-tabs__panel">
@@ -239,6 +303,27 @@
             } %>
     </div>
+    <div class="ck-api-tabs__panel">
+      <h2 class="ck-section-title">Calibrations</h2>
+      <p class="ck-copy">Per-verdict feedback events on a response/metric pair: agree, disagree (with a corrected score and note), or borderline. Calibrations capture the metric version that was current when the verdict was cast, which is what drives the trust signal and the "stale" indicators across the rest of the API.</p>
+      <div class="ck-api-endpoint">
+        <p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> /api/v1/calibrations</p>
+        <p class="ck-meta-copy">List calibrations across all runs. Supports filtering by any combination of the query params below.</p>
+        <p class="ck-api-params"><strong>Optional filters:</strong>&ensp;<code>run_id</code>, <code>response_id</code>, <code>metric_id</code>, <code>metric_version_id</code>, <code>created_by</code>, <code>verdict</code> (<code>agree</code>, <code>disagree</code>, or <code>borderline</code>)</p>
+        <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl \"#{base_url}/api/v1/calibrations?metric_id=1&verdict=disagree\" \\\n  -H \"Authorization: Bearer #{token}\"" %>
+      </div>
+      <div class="ck-api-endpoint">
+        <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/runs/:run_id/responses/:response_id/metrics/:metric_id/calibrations</p>
+        <p class="ck-meta-copy">Cast a calibration on a specific response/metric pair. The metric version on the record is set automatically from the run's review.</p>
+        <p class="ck-api-params"><strong>Required:</strong>&ensp;<code>verdict</code>, <code>created_by</code>&emsp;<strong>Optional:</strong>&ensp;<code>corrected_score</code>, <code>note</code></p>
+        <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/runs/1/responses/42/metrics/3/calibrations \\\n  -H \"Authorization: Bearer #{token}\" \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\"verdict\": \"disagree\", \"corrected_score\": 3, \"note\": \"too generous\", \"created_by\": \"alice\"}'" %>
+      </div>
+      <div class="ck-api-endpoint">
+        <p class="ck-api-method"><span class="ck-chip" style="color: var(--ck-danger);">DELETE</span> /api/v1/calibrations/:id</p>
+        <p class="ck-meta-copy">Delete a calibration. Returns 204 No Content.</p>
+      </div>
+    </div>
     <div class="ck-api-tabs__panel">
       <h2 class="ck-section-title">Tags</h2>
       <p class="ck-copy">Domain labels you can attach to metrics, prompts, runs, and datasets. Tags are auto-assigned a color from a 10-color palette. Each index page can be filtered by one or more tags using <code>?tag[]=name</code> query params (OR semantics).</p>

data/app/views/completion_kit/api_reference/index.html.erb CHANGED Viewed

@@ -12,6 +12,14 @@
         <p class="ck-kicker">Authentication</p>
         <%= render CompletionKit.config.api_reference_authentication_partial, token: @token %>
       </div>
+      <div>
+        <p class="ck-kicker">Pagination</p>
+        <p class="ck-meta-copy">Every index endpoint accepts <code>?limit=</code> and <code>?offset=</code> (default limit 50, max 500). The server returns <code>X-Total-Count</code>, <code>X-Limit</code>, and <code>X-Offset</code> headers so the caller can build cursors without re-counting.</p>
+      </div>
+      <div>
+        <p class="ck-kicker">Tag filtering</p>
+        <p class="ck-meta-copy">Prompts, runs, metrics, datasets, and metric groups accept <code>?tag[]=name</code> (repeat for OR semantics).</p>
+      </div>
     </div>
   </div>
 </div>

data/app/views/completion_kit/metrics/index.html.erb CHANGED Viewed

@@ -80,7 +80,7 @@
   <% if @available_starters.any? %>
     <section class="ck-starter-row">
-      <p class="ck-kicker">Add a starter metric</p>
+      <p class="ck-kicker">Skip the blank page</p>
       <p class="ck-meta-copy">Pre-written rubrics for the dimensions most teams score against. Click a card to preview before it's created.</p>
       <div class="ck-starter-grid">
         <% @available_starters.each do |starter| %>
@@ -96,8 +96,8 @@
 <% else %>
   <% if @available_starters.any? %>
     <section class="ck-starter-row ck-starter-row--empty-state">
-      <h2 class="ck-title ck-title--sm">Start with a ready-made rubric</h2>
-      <p class="ck-lead">Pick one of the dimensions below to drop in a pre-written 1–5 rubric. You can edit anything after adding it. Or <%= link_to "write your own from scratch", new_metric_path, class: "ck-link" %>.</p>
+      <h2 class="ck-title ck-title--sm">Skip the blank page</h2>
+      <p class="ck-lead">Five rubrics we've worked through for common evaluation dimensions. Adopt one to drop in a pre-written 1&ndash;5 scale, edit anything after. Or <%= link_to "write your own from scratch", new_metric_path, class: "ck-link" %>.</p>
       <div class="ck-starter-grid">
         <% @available_starters.each do |starter| %>
           <%= render "starter_card", starter: starter %>

data/app/views/completion_kit/metrics/show.html.erb CHANGED Viewed

@@ -109,6 +109,7 @@
               <% source_label, source_class = case v.source
                                               when "suggestion" then ["AI suggestion", "ck-source-chip ck-source-chip--ai"]
                                               when "edit" then ["Manual edit", "ck-source-chip ck-source-chip--manual"]
+                                              when "revert" then ["Reverted", "ck-source-chip ck-source-chip--revert"]
                                               else ["Original", "ck-source-chip ck-source-chip--initial"]
                                               end %>
               <span class="<%= source_class %>"><%= source_label %></span>

data/config/routes.rb CHANGED Viewed

@@ -70,6 +70,9 @@ CompletionKit::Engine.routes.draw do
         member do
           post :generate
           post :retry_failures
+          post :rerun
+          post :regrade
+          get :compare
         end
         resources :responses, only: [:index, :show] do
           resources :metrics, only: [] do
@@ -78,10 +81,22 @@ CompletionKit::Engine.routes.draw do
         end
       end
       resources :datasets
-      resources :metrics
+      resources :metrics do
+        resources :metric_versions, only: [:index, :show, :destroy] do
+          member do
+            post :publish
+          end
+        end
+        member do
+          post :suggest_variants
+          post :add_few_shot
+          delete :remove_few_shot
+        end
+      end
       resources :metric_groups
       resources :tags
       resources :provider_credentials
+      resources :calibrations, only: [:index, :destroy]
     end
   end

data/lib/completion_kit/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module CompletionKit
-  VERSION = "0.6.0"
+  VERSION = "0.7.0"
 end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: completion-kit
 version: !ruby/object:Gem::Version
-  version: 0.6.0
+  version: 0.7.0
 platform: ruby
 authors:
 - Damien Bastin
@@ -238,6 +238,7 @@ files:
 - app/controllers/completion_kit/api/v1/calibrations_controller.rb
 - app/controllers/completion_kit/api/v1/datasets_controller.rb
 - app/controllers/completion_kit/api/v1/metric_groups_controller.rb
+- app/controllers/completion_kit/api/v1/metric_versions_controller.rb
 - app/controllers/completion_kit/api/v1/metrics_controller.rb
 - app/controllers/completion_kit/api/v1/prompts_controller.rb
 - app/controllers/completion_kit/api/v1/provider_credentials_controller.rb
@@ -288,6 +289,7 @@ files:
 - app/models/completion_kit/suggestion.rb
 - app/models/completion_kit/tag.rb
 - app/models/completion_kit/tagging.rb
+- app/models/concerns/completion_kit/has_job_status.rb
 - app/models/concerns/completion_kit/taggable.rb
 - app/services/completion_kit/anthropic_client.rb
 - app/services/completion_kit/api_config.rb
@@ -302,6 +304,7 @@ files:
 - app/services/completion_kit/mcp_tools/datasets.rb
 - app/services/completion_kit/mcp_tools/judges.rb
 - app/services/completion_kit/mcp_tools/metric_groups.rb
+- app/services/completion_kit/mcp_tools/metric_versions.rb
 - app/services/completion_kit/mcp_tools/metrics.rb
 - app/services/completion_kit/mcp_tools/prompts.rb
 - app/services/completion_kit/mcp_tools/provider_credentials.rb