RubyGems - completion-kit - Versions diffs - 0.7.0 → 0.9.0 - Mend

completion-kit 0.7.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35)

data/app/controllers/completion_kit/api/v1/metric_groups_controller.rb CHANGED Viewed

@@ -20,7 +20,7 @@ module CompletionKit
             metric_group.replace_metrics!(params[:metric_ids]) if params.key?(:metric_ids)
             render json: metric_group.reload, status: :created
           else
-            render json: {errors: metric_group.errors}, status: :unprocessable_entity
+            render_validation_errors(metric_group)
           end
         end
@@ -29,7 +29,7 @@ module CompletionKit
             @metric_group.replace_metrics!(params[:metric_ids]) if params.key?(:metric_ids)
             render json: @metric_group.reload
           else
-            render json: {errors: @metric_group.errors}, status: :unprocessable_entity
+            render_validation_errors(@metric_group)
           end
         end

data/app/controllers/completion_kit/api/v1/metric_versions_controller.rb CHANGED Viewed

@@ -25,7 +25,7 @@ module CompletionKit
         def destroy
           if @version.published?
-            render json: { error: "Cannot dismiss a published version. Publish a different version as current instead." }, status: :conflict
+            render_error("Cannot dismiss a published version. Publish a different version as current instead.", status: :conflict)
             return
           end
           @version.destroy!

data/app/controllers/completion_kit/api/v1/metrics_controller.rb CHANGED Viewed

@@ -2,7 +2,7 @@ module CompletionKit
   module Api
     module V1
       class MetricsController < BaseController
-        before_action :set_metric, only: [:show, :update, :destroy, :suggest_variants, :add_few_shot, :remove_few_shot]
+        before_action :set_metric, only: [:show, :update, :destroy, :suggest_variants]
         def index
           scope = Metric.includes(:tags)
@@ -19,7 +19,7 @@ module CompletionKit
           if metric.save
             render json: metric, status: :created
           else
-            render json: {errors: metric.errors}, status: :unprocessable_entity
+            render_validation_errors(metric)
           end
         end
@@ -27,7 +27,7 @@ module CompletionKit
           if @metric.update(metric_params)
             render json: @metric
           else
-            render json: {errors: @metric.errors}, status: :unprocessable_entity
+            render_validation_errors(@metric)
           end
         end
@@ -39,7 +39,7 @@ module CompletionKit
         def suggest_variants
           disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
           if disagreement_count.zero?
-            render json: { error: "Mark at least one case as Disagree before asking the model to suggest a change." }, status: :unprocessable_entity
+            render_error("Mark at least one case as Disagree before asking the model to suggest a change.", status: :unprocessable_entity)
             return
           end
@@ -47,40 +47,13 @@ module CompletionKit
           generator = MetricVariantGenerator.new(@metric, count: params[:count].to_i, model: params[:model])
           variants = generator.call
           if variants.empty?
-            render json: { error: "The model returned no usable variants. Try again with a different model." }, status: :unprocessable_entity
+            render_error("The model returned no usable variants. Try again with a different model.", status: :unprocessable_entity)
             return
           end
           versions = generator.persist!(variants)
           render json: versions, status: :created
         end
-        def add_few_shot
-          calibration = Calibration.where(metric_id: @metric.id, verdict: "disagree").find(params[:calibration_id])
-          review = calibration.response.reviews.find_by(metric_id: @metric.id)
-          examples = Array(@metric.few_shot_examples)
-          examples << {
-            "input" => calibration.response.input_data.to_s.truncate(2000),
-            "response" => calibration.response.response_text.to_s.truncate(2000),
-            "judge_score" => review&.ai_score&.to_f,
-            "judge_feedback" => review&.ai_feedback.to_s.truncate(1000),
-            "human_score" => calibration.corrected_score&.to_f,
-            "human_note" => calibration.note.to_s.truncate(1000),
-            "calibration_id" => calibration.id,
-            "added_at" => Time.current.utc.iso8601
-          }
-          @metric.update!(few_shot_examples: examples)
-          render json: @metric.reload
-        rescue ActiveRecord::RecordNotFound
-          render json: { error: "Calibration not found or not a disagree on this metric." }, status: :not_found
-        end
-        def remove_few_shot
-          cal_id = params[:calibration_id].to_i
-          remaining = Array(@metric.few_shot_examples).reject { |fs| fs["calibration_id"].to_i == cal_id }
-          @metric.update!(few_shot_examples: remaining)
-          render json: @metric.reload
-        end
         private
         def set_metric

data/app/controllers/completion_kit/api/v1/prompts_controller.rb CHANGED Viewed

@@ -19,7 +19,7 @@ module CompletionKit
           if prompt.save
             render json: prompt, status: :created
           else
-            render json: {errors: prompt.errors}, status: :unprocessable_entity
+            render_validation_errors(prompt)
           end
         end
@@ -32,7 +32,7 @@ module CompletionKit
           elsif @prompt.update(prompt_params)
             render json: @prompt
           else
-            render json: {errors: @prompt.errors}, status: :unprocessable_entity
+            render_validation_errors(@prompt)
           end
         end

data/app/controllers/completion_kit/api/v1/provider_credentials_controller.rb CHANGED Viewed

@@ -17,7 +17,7 @@ module CompletionKit
           if credential.save
             render json: credential, status: :created
           else
-            render json: {errors: credential.errors}, status: :unprocessable_entity
+            render_validation_errors(credential)
           end
         end
@@ -25,7 +25,7 @@ module CompletionKit
           if @credential.update(credential_params)
             render json: @credential
           else
-            render json: {errors: @credential.errors}, status: :unprocessable_entity
+            render_validation_errors(@credential)
           end
         end

data/app/controllers/completion_kit/api/v1/runs_controller.rb CHANGED Viewed

@@ -23,7 +23,7 @@ module CompletionKit
             run.replace_metrics!(params[:metric_ids])
             render json: run.reload, status: :created
           else
-            render json: {errors: run.errors}, status: :unprocessable_entity
+            render_validation_errors(run)
           end
         end
@@ -32,7 +32,7 @@ module CompletionKit
             @run.replace_metrics!(params[:metric_ids]) if params.key?(:metric_ids)
             render json: @run.reload
           else
-            render json: {errors: @run.errors}, status: :unprocessable_entity
+            render_validation_errors(@run)
           end
         end
@@ -45,13 +45,13 @@ module CompletionKit
           if @run.start!
             render json: @run.reload, status: :accepted
           else
-            render json: { errors: [@run.failure_summary || @run.errors.full_messages.to_sentence] }, status: :unprocessable_entity
+            render_error(@run.failure_summary || @run.errors.full_messages.to_sentence, status: :unprocessable_entity)
           end
         end
         def retry_failures
           if @run.stale_review_summary.any?
-            return render(json: { error: "Judge has changed since this run executed. Retry would mix versions in the same run; use POST /api/v1/runs/:id/rerun instead." }, status: :conflict)
+            return render_error("Judge has changed since this run executed. Retry would mix versions in the same run; use POST /api/v1/runs/:id/rerun instead.", status: :conflict)
           end
           scope = @run.responses.where(status: "failed")
@@ -90,7 +90,7 @@ module CompletionKit
           if new_run.start!
             render json: new_run.reload, status: :accepted
           else
-            render json: { errors: [new_run.failure_summary || "Could not start the new run."] }, status: :unprocessable_entity
+            render_error(new_run.failure_summary || "Could not start the new run.", status: :unprocessable_entity)
           end
         end
@@ -98,7 +98,7 @@ module CompletionKit
           if @run.regrade!
             render json: @run.reload, status: :accepted
           else
-            render json: { error: "Nothing to re-grade. The run has no succeeded responses or no metrics attached." }, status: :unprocessable_entity
+            render_error("Nothing to re-grade. The run has no succeeded responses or no metrics attached.", status: :unprocessable_entity)
           end
         end
@@ -107,7 +107,7 @@ module CompletionKit
           comparison = build_run_comparison(@run, other)
           render json: { left_run_id: @run.id, right_run_id: other.id, metric_ids: comparison[:metric_ids], rows: comparison[:rows] }
         rescue ActiveRecord::RecordNotFound
-          render json: { error: "Other run not found. Pass ?with=<run_id>." }, status: :not_found
+          render_error("Other run not found. Pass ?with=<run_id>.", status: :not_found)
         end
         private

data/app/controllers/completion_kit/api/v1/tags_controller.rb CHANGED Viewed

@@ -17,7 +17,7 @@ module CompletionKit
           if tag.save
             render json: tag, status: :created
           else
-            render json: {errors: tag.errors}, status: :unprocessable_entity
+            render_validation_errors(tag)
           end
         end
@@ -25,7 +25,7 @@ module CompletionKit
           if @tag.update(tag_params)
             render json: @tag
           else
-            render json: {errors: @tag.errors}, status: :unprocessable_entity
+            render_validation_errors(@tag)
           end
         end

data/app/controllers/completion_kit/metrics_controller.rb CHANGED Viewed

@@ -1,7 +1,7 @@
 module CompletionKit
   class MetricsController < ApplicationController
     include CompletionKit::TagFiltering
-    before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :remove_few_shot, :publish_draft, :suggest_variants, :dismiss_suggestion]
+    before_action :set_metric, only: [:show, :edit, :update, :destroy, :publish_draft, :suggest_variants, :dismiss_suggestion]
     def index
       @metrics = apply_tag_filter(Metric.includes(:metric_groups, :tags).order(:name))
@@ -35,11 +35,6 @@ module CompletionKit
     end
     def show
-      @published_metric_version = MetricVersion.ensure_current_for(@metric)
-      @disagreements = Calibration.where(metric_id: @metric.id, verdict: "disagree")
-                                  .includes(:metric_version, response: [:reviews, :run])
-                                  .order(created_at: :desc)
-                                  .limit(50)
       @edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
       @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
       @improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
@@ -54,6 +49,7 @@ module CompletionKit
       @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
       @edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
       @published_metric_version = MetricVersion.published.where(metric_id: @metric.id, current: true).first
+      @improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
       if @edit_draft
         @metric.instruction = @edit_draft.instruction
@@ -102,7 +98,7 @@ module CompletionKit
           state: "draft", source: "edit", current: false
         )
         redirect_to edit_metric_path(@metric),
-                    notice: "Saved as draft #{draft.version_label}. Publish to push these changes to the live judge."
+                    notice: "Saved as draft #{draft.version_label}. Publish to make these changes the metric's live version."
       else
         @metric.update!(instruction: new_instruction, rubric_bands: new_rubric)
         current_pub = MetricVersion.published.where(metric_id: @metric.id, current: true).first
@@ -120,7 +116,7 @@ module CompletionKit
       target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
       disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
       if disagreement_count.zero?
-        redirect_to target, alert: "Mark at least one row as Disagree before asking the model to suggest a change."
+        redirect_to target, alert: "Mark at least one case as Disagree before asking the model to suggest a change."
         return
       end
@@ -132,15 +128,21 @@ module CompletionKit
         redirect_to target, alert: "The model returned no usable variants. Try again with a different model."
         return
       end
-      generator.persist!(variants)
-      redirect_to target, notice: "Drafted a new version. Review it below."
+      versions = generator.persist!(variants)
+      new_version = versions.max_by(&:version_number)
+      if params[:back_to] == "edit"
+        redirect_to edit_metric_path(@metric), notice: "Drafted #{new_version.version_label} from your reviews. Review the proposed changes below, then Publish to use it."
+      else
+        redirect_to metric_path(@metric, show_change: new_version.id), notice: "Drafted #{new_version.version_label} from your reviews."
+      end
     end
     def dismiss_suggestion
       draft = MetricVersion.drafts.where(metric_id: @metric.id).find_by(id: params[:draft_id])
+      label = draft&.version_label
       draft&.destroy
       target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
-      redirect_to target, notice: "Dismissed."
+      redirect_to target, notice: label ? "Discarded draft #{label}." : "Draft already gone."
     end
     def publish_draft
@@ -164,7 +166,7 @@ module CompletionKit
         audit = version.revert!
         prior_label = previously_current.version_label
         redirect_to metric_path(@metric),
-                    notice: "Reverted to #{@metric.name} #{version.version_label} (now logged as #{audit.version_label}). Pinned cases still flow to the judge, and calibration verdicts collected against #{prior_label} stay tied to it."
+                    notice: "Reverted #{@metric.name} to #{version.version_label} (logged as #{audit.version_label}). Human reviews collected against #{prior_label} stay tied to it."
       else
         version.publish!
         redirect_to metric_path(@metric),
@@ -172,31 +174,6 @@ module CompletionKit
       end
     end
-    def add_few_shot
-      calibration = Calibration.where(metric_id: @metric.id, verdict: "disagree").find(params[:calibration_id])
-      review = calibration.response.reviews.find_by(metric_id: @metric.id)
-      examples = Array(@metric.few_shot_examples)
-      examples << {
-        "input" => calibration.response.input_data.to_s.truncate(2000),
-        "response" => calibration.response.response_text.to_s.truncate(2000),
-        "judge_score" => review&.ai_score&.to_f,
-        "judge_feedback" => review&.ai_feedback.to_s.truncate(1000),
-        "human_score" => calibration.corrected_score&.to_f,
-        "human_note" => calibration.note.to_s.truncate(1000),
-        "calibration_id" => calibration.id,
-        "added_at" => Time.current.utc.iso8601
-      }
-      @metric.update!(few_shot_examples: examples)
-      redirect_to metric_path(@metric), notice: "Got it. The judge will remember this next time it grades."
-    end
-    def remove_few_shot
-      cal_id = params[:calibration_id].to_i
-      remaining = Array(@metric.few_shot_examples).reject { |fs| fs["calibration_id"].to_i == cal_id }
-      @metric.update!(few_shot_examples: remaining)
-      redirect_to metric_path(@metric), notice: "Forgotten. The judge won't see this case next time."
-    end
     private
     def set_metric

data/app/controllers/completion_kit/runs_controller.rb CHANGED Viewed

@@ -95,7 +95,7 @@ module CompletionKit
     def regrade
       if @run.regrade!
-        redirect_to run_path(@run), notice: "Re-grading existing responses with the current judge."
+        redirect_to run_path(@run), notice: "Re-grading existing responses against the current metrics."
       else
         redirect_to run_path(@run), alert: "Nothing to re-grade. The run has no succeeded responses or no metrics attached."
       end
@@ -151,7 +151,7 @@ module CompletionKit
     def retry_failures
       if @run.stale_review_summary.any?
         redirect_to run_path(@run),
-                    alert: "The judge has changed since this run executed. Retrying failed cases would mix scores from two metric versions in the same run. Use 'Re-run with current judge' to refresh everything against the live judge."
+                    alert: "A metric has a newer version than the one this run was scored against. Retrying failed cases would mix scores from two versions in the same run. Use 'Re-run from scratch' to refresh everything against the current metrics."
         return
       end

data/app/jobs/completion_kit/generate_row_job.rb CHANGED Viewed

@@ -80,8 +80,7 @@ module CompletionKit
     end
     def record_terminal_failure!(error)
-      response_id = @response_id || arguments.last
-      response = Response.find_by(id: response_id)
+      response = Response.find_by(id: @response_id)
       return unless response
       response.update!(
@@ -98,8 +97,7 @@ module CompletionKit
     end
     def enqueue_completion_check
-      run_id = @run_id || arguments.first
-      RunCompletionCheckJob.perform_later(run_id)
+      RunCompletionCheckJob.perform_later(@run_id)
     end
   end
 end

data/app/jobs/completion_kit/judge_review_job.rb CHANGED Viewed

@@ -58,7 +58,6 @@ module CompletionKit
         run.prompt&.template,
         criteria: metric.instruction.to_s,
         rubric_text: metric.display_rubric_text,
-        human_examples: few_shot_payload(metric),
         input_data: response.input_data
       )
@@ -91,14 +90,12 @@ module CompletionKit
     end
     def record_terminal_failure!(error)
-      response_id = @response_id || arguments.first
-      metric_id = @metric_id || arguments.last
-      response = Response.find_by(id: response_id)
+      response = Response.find_by(id: @response_id)
       return unless response
-      review = response.reviews.find_or_initialize_by(metric_id: metric_id)
+      review = response.reviews.find_or_initialize_by(metric_id: @metric_id)
       review.assign_attributes(
-        metric_name: review.metric_name || Metric.find_by(id: metric_id)&.name || "(deleted metric)",
+        metric_name: review.metric_name || Metric.find_by(id: @metric_id)&.name || "(deleted metric)",
         status: "failed",
         error_provider: provider_for(response),
         error_class: error.class.name,
@@ -115,20 +112,8 @@ module CompletionKit
     end
     def enqueue_completion_check
-      response_id = @response_id || arguments.first
-      response = Response.find_by(id: response_id)
+      response = Response.find_by(id: @response_id)
       RunCompletionCheckJob.perform_later(response.run_id) if response
     end
-    def few_shot_payload(metric)
-      return nil unless CompletionKit.config.judge_calibration_enabled
-      Array(metric.few_shot_examples).map do |fs|
-        {
-          human_score: fs["human_score"],
-          response_text: fs["response"].to_s,
-          human_note: fs["human_note"].to_s
-        }
-      end
-    end
   end
 end

data/app/models/completion_kit/metric.rb CHANGED Viewed

@@ -17,7 +17,6 @@ module CompletionKit
     has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
     serialize :rubric_bands, coder: JSON
-    serialize :few_shot_examples, coder: JSON, type: Array
     validates :name, presence: true
     validates :key, tenant_scoped_uniqueness: { allow_nil: true }

data/app/models/completion_kit/metric_version.rb CHANGED Viewed

@@ -40,6 +40,35 @@ module CompletionKit
       "v#{version_number}"
     end
+    def change_summary_against(previous)
+      return nil if previous.nil?
+      instruction_changed = previous.instruction.to_s.strip != instruction.to_s.strip
+      rubric_changes = rubric_band_change_count(previous)
+      return nil unless instruction_changed || rubric_changes.positive?
+      dimensions = []
+      dimensions << "instruction" if instruction_changed
+      dimensions << "rubric" if rubric_changes.positive?
+      words_changed = 0
+      if instruction_changed
+        old_words = previous.instruction.to_s.split
+        new_words = instruction.to_s.split
+        words_changed = (old_words - new_words).size + (new_words - old_words).size
+      end
+      magnitude = if rubric_changes >= 2 || (instruction_changed && rubric_changes >= 1) || words_changed >= 15
+        :major
+      elsif rubric_changes == 1 || words_changed >= 4
+        :minor
+      else
+        :trivial
+      end
+      { magnitude: magnitude, label: "#{magnitude.to_s.capitalize} #{dimensions.to_sentence} changes" }
+    end
     def publish!
       MetricVersion.transaction do
         self.class.where(metric_id: metric_id).where.not(id: id).update_all(current: false)
@@ -86,6 +115,12 @@ module CompletionKit
     private
+    def rubric_band_change_count(previous)
+      prev = Metric.normalize_rubric_bands(previous.rubric_bands)
+      curr = Metric.normalize_rubric_bands(rubric_bands)
+      prev.zip(curr).count { |p, c| p["description"].to_s.strip != c["description"].to_s.strip }
+    end
     def assign_version_number
       return if version_number.present?
       max = self.class.where(metric_id: metric_id).maximum(:version_number).to_i

data/app/models/completion_kit/run.rb CHANGED Viewed

@@ -290,7 +290,6 @@ module CompletionKit
         target: "run_status_panel",
         html: render_engine_partial("completion_kit/runs/status_panel", run: self)
       )
-      broadcast_status_header
     end
     def broadcast_status_header

data/app/services/completion_kit/judge_service.rb CHANGED Viewed

@@ -10,12 +10,12 @@ module CompletionKit
       @judge_client = LlmClient.for_model(@judge_model, ApiConfig.for_model(@judge_model))
     end
-    def evaluate(output, expected_output = nil, prompt = nil, criteria: nil, rubric_text: nil, human_examples: nil, input_data: nil, **_extras)
+    def evaluate(output, expected_output = nil, prompt = nil, criteria: nil, rubric_text: nil, input_data: nil, **_extras)
       raise CompletionKit::ConfigurationError, "Judge not configured" unless @judge_client.configured?
       judge_prompt = build_judge_prompt(output, expected_output, prompt,
         criteria: criteria,
-        rubric_text: rubric_text, human_examples: human_examples,
+        rubric_text: rubric_text,
         input_data: input_data)
       response = @judge_client.generate_completion(judge_prompt, model: @judge_model)
@@ -25,7 +25,7 @@ module CompletionKit
     private
-    def build_judge_prompt(output, expected_output, prompt, criteria: nil, rubric_text: nil, human_examples: nil, input_data: nil)
+    def build_judge_prompt(output, expected_output, prompt, criteria: nil, rubric_text: nil, input_data: nil)
       judge_prompt = <<~PROMPT
         You are an expert evaluator. You MUST respond with ONLY two lines in this exact format, nothing else:
@@ -42,13 +42,6 @@ module CompletionKit
         judge_prompt += "\nCriteria: #{criteria}\n"
       end
-      if human_examples.present?
-        judge_prompt += "\nCalibration examples:\n"
-        human_examples.each_with_index do |example, index|
-          judge_prompt += "Example #{index + 1}: score=#{example[:human_score]} output=#{example[:response_text].to_s.truncate(200)}\n"
-        end
-      end
       judge_prompt += <<~PROMPT
         Original prompt: #{prompt || "Not provided"}

data/app/services/completion_kit/mcp_tools/metric_versions.rb CHANGED Viewed

@@ -16,7 +16,7 @@ module CompletionKit
           handler: :list
         },
         "metric_versions_publish" => {
-          description: "Publish a MetricVersion as the live judge for its metric. Works for both 'draft → published' and 'revert to an older published version → current'. Transactionally flips current, demotes peers, and writes the version's instruction + rubric_bands back onto the metric so the judge actually uses them.",
+          description: "Publish a MetricVersion as the live version of its metric. Works for both 'draft → published' and 'revert to an older published version → current'. Transactionally flips current, demotes peers, and writes the version's instruction + rubric_bands back onto the metric so the judge grades against it.",
           inputSchema: {
             type: "object",
             properties: {

data/app/services/completion_kit/metric_variant_generator.rb CHANGED Viewed

@@ -43,7 +43,6 @@ module CompletionKit
     def build_meta_prompt
       disagreements = MetricCalibrationExamples.disagreements_for(@metric)
       borderlines = MetricCalibrationExamples.borderlines_for(@metric)
-      pinned_examples = Array(@metric.few_shot_examples)
       sections = []
       sections << "You are an expert evaluator. The judge below is misaligned with humans. Propose #{@count == 1 ? "a single" : "#{@count}"} concrete rewrite that closes the gap."
       sections << ""
@@ -78,18 +77,6 @@ module CompletionKit
           sections << ""
         end
       end
-      if pinned_examples.any?
-        sections << "## Pinned cases the judge already references"
-        sections << "These are cases the operator pinned for the judge to remember. The improved rubric must remain consistent with these — that is, the new instruction + rubric should produce roughly the human_score on these inputs, not the judge_score."
-        pinned_examples.each_with_index do |ex, i|
-          sections << "### Pinned #{i + 1}"
-          sections << "Input: #{ex["input"].to_s.truncate(200)}"
-          sections << "Output: #{ex["response"].to_s.truncate(200)}"
-          sections << "Judge previously said #{ex["judge_score"]}/5: #{ex["judge_feedback"].to_s.truncate(160)}"
-          sections << "Human said #{ex["human_score"]}/5: #{ex["human_note"].to_s.truncate(160)}"
-          sections << ""
-        end
-      end
       sections << "## Task"
       sections << "Make one substantive change. Don't just reword. If the disagreements look like instruction problems, rewrite the instruction. If they look like rubric problems (overlapping bands, undefined edge cases), rewrite the rubric. Rewrite both if both are wrong."
       sections << ""

data/app/views/completion_kit/api_reference/_body.html.erb CHANGED Viewed

@@ -27,7 +27,7 @@
     <label for="ck-tab-runs" class="ck-api-tabs__label">Runs <span class="ck-api-tabs__count">10</span></label>
     <label for="ck-tab-responses" class="ck-api-tabs__label">Responses <span class="ck-api-tabs__count">2</span></label>
     <label for="ck-tab-datasets" class="ck-api-tabs__label">Datasets <span class="ck-api-tabs__count">5</span></label>
-    <label for="ck-tab-metrics" class="ck-api-tabs__label">Metrics <span class="ck-api-tabs__count">12</span></label>
+    <label for="ck-tab-metrics" class="ck-api-tabs__label">Metrics <span class="ck-api-tabs__count">10</span></label>
     <label for="ck-tab-metric-groups" class="ck-api-tabs__label">Metric Groups <span class="ck-api-tabs__count">5</span></label>
     <label for="ck-tab-calibrations" class="ck-api-tabs__label">Calibrations <span class="ck-api-tabs__count">3</span></label>
     <label for="ck-tab-tags" class="ck-api-tabs__label">Tags <span class="ck-api-tabs__count">5</span></label>
@@ -239,7 +239,7 @@
       <div class="ck-api-endpoint" style="padding-top: 1.5rem;">
         <p class="ck-kicker" style="margin-bottom: 0.5rem;">Calibration loop</p>
-        <p class="ck-meta-copy">Drive metric improvement from disagree-flagged calibrations: ask the model for variants, then pin individual cases as few-shot examples on the metric.</p>
+        <p class="ck-meta-copy">Drive metric improvement from disagree-flagged calibrations: ask the model to rewrite the instruction and rubric into a new draft version.</p>
       </div>
       <div class="ck-api-endpoint">
         <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/metrics/:id/suggest_variants</p>
@@ -247,16 +247,6 @@
         <p class="ck-api-params"><strong>Optional:</strong>&ensp;<code>count</code>, <code>model</code></p>
         <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/metrics/1/suggest_variants \\\n  -H \"Authorization: Bearer #{token}\"" %>
       </div>
-      <div class="ck-api-endpoint">
-        <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/metrics/:id/add_few_shot</p>
-        <p class="ck-meta-copy">Pin a disagree calibration as a few-shot example on the metric. Returns the updated metric, or 404 if the calibration is not a disagree on this metric.</p>
-        <p class="ck-api-params"><strong>Required:</strong>&ensp;<code>calibration_id</code></p>
-      </div>
-      <div class="ck-api-endpoint">
-        <p class="ck-api-method"><span class="ck-chip" style="color: var(--ck-danger);">DELETE</span> /api/v1/metrics/:id/remove_few_shot</p>
-        <p class="ck-meta-copy">Drop the pinned few-shot example by calibration ID.</p>
-        <p class="ck-api-params"><strong>Required:</strong>&ensp;<code>calibration_id</code></p>
-      </div>
       <div class="ck-api-endpoint" style="padding-top: 1.5rem;">
         <p class="ck-kicker" style="margin-bottom: 0.5rem;">Metric versions</p>

data/app/views/completion_kit/api_reference/index.html.erb CHANGED Viewed

@@ -20,6 +20,10 @@
         <p class="ck-kicker">Tag filtering</p>
         <p class="ck-meta-copy">Prompts, runs, metrics, datasets, and metric groups accept <code>?tag[]=name</code> (repeat for OR semantics).</p>
       </div>
+      <div>
+        <p class="ck-kicker">Error shape</p>
+        <p class="ck-meta-copy">Every error response carries a top-level <code>error</code> string. Validation failures (422) add a <code>details</code> object keyed by field: <code>{ "error": "Validation failed", "details": { "name": ["can't be blank"] } }</code>.</p>
+      </div>
     </div>
   </div>
 </div>