completion-kit 0.8.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. checksums.yaml +4 -4
  2. data/app/assets/stylesheets/completion_kit/application.css +203 -334
  3. data/app/controllers/completion_kit/api/v1/metrics_controller.rb +1 -28
  4. data/app/controllers/completion_kit/metrics_controller.rb +30 -36
  5. data/app/controllers/completion_kit/runs_controller.rb +2 -2
  6. data/app/jobs/completion_kit/judge_review_job.rb +9 -16
  7. data/app/models/completion_kit/metric.rb +0 -1
  8. data/app/models/completion_kit/metric_version.rb +35 -0
  9. data/app/services/completion_kit/judge_service.rb +19 -10
  10. data/app/services/completion_kit/mcp_tools/metric_versions.rb +1 -1
  11. data/app/services/completion_kit/metric_calibration_examples.rb +56 -0
  12. data/app/services/completion_kit/metric_variant_generator.rb +0 -49
  13. data/app/views/completion_kit/api_reference/_body.html.erb +2 -12
  14. data/app/views/completion_kit/calibrations/_trust_panel.html.erb +25 -19
  15. data/app/views/completion_kit/metrics/_form.html.erb +11 -12
  16. data/app/views/completion_kit/metrics/_guiding_examples.html.erb +23 -0
  17. data/app/views/completion_kit/metrics/edit.html.erb +18 -0
  18. data/app/views/completion_kit/metrics/index.html.erb +5 -17
  19. data/app/views/completion_kit/metrics/show.html.erb +76 -100
  20. data/app/views/completion_kit/responses/show.html.erb +7 -5
  21. data/app/views/completion_kit/runs/show.html.erb +7 -7
  22. data/config/routes.rb +1 -4
  23. data/db/migrate/20260529000001_remove_few_shot_examples_from_completion_kit_metrics.rb +5 -0
  24. data/db/migrate/20260530000001_add_excluded_from_examples_to_completion_kit_calibrations.rb +5 -0
  25. data/lib/completion_kit/version.rb +1 -1
  26. data/lib/completion_kit.rb +2 -0
  27. metadata +5 -1
@@ -2,7 +2,7 @@ module CompletionKit
2
2
  module Api
3
3
  module V1
4
4
  class MetricsController < BaseController
5
- before_action :set_metric, only: [:show, :update, :destroy, :suggest_variants, :add_few_shot, :remove_few_shot]
5
+ before_action :set_metric, only: [:show, :update, :destroy, :suggest_variants]
6
6
 
7
7
  def index
8
8
  scope = Metric.includes(:tags)
@@ -54,33 +54,6 @@ module CompletionKit
54
54
  render json: versions, status: :created
55
55
  end
56
56
 
57
- def add_few_shot
58
- calibration = Calibration.where(metric_id: @metric.id, verdict: "disagree").find(params[:calibration_id])
59
- review = calibration.response.reviews.find_by(metric_id: @metric.id)
60
- examples = Array(@metric.few_shot_examples)
61
- examples << {
62
- "input" => calibration.response.input_data.to_s.truncate(2000),
63
- "response" => calibration.response.response_text.to_s.truncate(2000),
64
- "judge_score" => review&.ai_score&.to_f,
65
- "judge_feedback" => review&.ai_feedback.to_s.truncate(1000),
66
- "human_score" => calibration.corrected_score&.to_f,
67
- "human_note" => calibration.note.to_s.truncate(1000),
68
- "calibration_id" => calibration.id,
69
- "added_at" => Time.current.utc.iso8601
70
- }
71
- @metric.update!(few_shot_examples: examples)
72
- render json: @metric.reload
73
- rescue ActiveRecord::RecordNotFound
74
- render_error("Calibration not found or not a disagree on this metric.", status: :not_found)
75
- end
76
-
77
- def remove_few_shot
78
- cal_id = params[:calibration_id].to_i
79
- remaining = Array(@metric.few_shot_examples).reject { |fs| fs["calibration_id"].to_i == cal_id }
80
- @metric.update!(few_shot_examples: remaining)
81
- render json: @metric.reload
82
- end
83
-
84
57
  private
85
58
 
86
59
  def set_metric
@@ -1,11 +1,13 @@
1
1
  module CompletionKit
2
2
  class MetricsController < ApplicationController
3
3
  include CompletionKit::TagFiltering
4
- before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :remove_few_shot, :publish_draft, :suggest_variants, :dismiss_suggestion]
4
+ before_action :set_metric, only: [:show, :edit, :update, :destroy, :publish_draft, :suggest_variants, :dismiss_suggestion, :exclude_example]
5
+ before_action :ensure_examples_from_reviews_enabled, only: [:exclude_example]
5
6
 
6
7
  def index
7
8
  @metrics = apply_tag_filter(Metric.includes(:metric_groups, :tags).order(:name))
8
9
  @available_starters = StarterMetrics.available
10
+ @current_versions = MetricVersion.published.current.where(metric_id: @metrics.map(&:id)).index_by(&:metric_id)
9
11
  end
10
12
 
11
13
  def starter_preview
@@ -35,15 +37,11 @@ module CompletionKit
35
37
  end
36
38
 
37
39
  def show
38
- @published_metric_version = MetricVersion.ensure_current_for(@metric)
39
- @disagreements = Calibration.where(metric_id: @metric.id, verdict: "disagree")
40
- .includes(:metric_version, response: [:reviews, :run])
41
- .order(created_at: :desc)
42
- .limit(50)
43
40
  @edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
44
41
  @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
45
42
  @improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
46
43
  @versions = MetricVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
44
+ @guiding_examples = CompletionKit.config.judge_examples_from_reviews ? MetricCalibrationExamples.judge_examples_for(@metric) : []
47
45
  end
48
46
 
49
47
  def new
@@ -54,6 +52,7 @@ module CompletionKit
54
52
  @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
55
53
  @edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
56
54
  @published_metric_version = MetricVersion.published.where(metric_id: @metric.id, current: true).first
55
+ @improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
57
56
 
58
57
  if @edit_draft
59
58
  @metric.instruction = @edit_draft.instruction
@@ -102,7 +101,7 @@ module CompletionKit
102
101
  state: "draft", source: "edit", current: false
103
102
  )
104
103
  redirect_to edit_metric_path(@metric),
105
- notice: "Saved as draft #{draft.version_label}. Publish to push these changes to the live judge."
104
+ notice: "Saved as draft #{draft.version_label}. Publish to make these changes the metric's live version."
106
105
  else
107
106
  @metric.update!(instruction: new_instruction, rubric_bands: new_rubric)
108
107
  current_pub = MetricVersion.published.where(metric_id: @metric.id, current: true).first
@@ -120,7 +119,7 @@ module CompletionKit
120
119
  target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
121
120
  disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
122
121
  if disagreement_count.zero?
123
- redirect_to target, alert: "Mark at least one row as Disagree before asking the model to suggest a change."
122
+ redirect_to target, alert: "Mark at least one case as Disagree before asking the model to suggest a change."
124
123
  return
125
124
  end
126
125
 
@@ -132,15 +131,31 @@ module CompletionKit
132
131
  redirect_to target, alert: "The model returned no usable variants. Try again with a different model."
133
132
  return
134
133
  end
135
- generator.persist!(variants)
136
- redirect_to target, notice: "Drafted a new version. Review it below."
134
+ versions = generator.persist!(variants)
135
+ new_version = versions.max_by(&:version_number)
136
+ if params[:back_to] == "edit"
137
+ redirect_to edit_metric_path(@metric), notice: "Drafted #{new_version.version_label} from your reviews. Review the proposed changes below, then Publish to use it."
138
+ else
139
+ redirect_to metric_path(@metric, show_change: new_version.id), notice: "Drafted #{new_version.version_label} from your reviews."
140
+ end
137
141
  end
138
142
 
139
143
  def dismiss_suggestion
140
144
  draft = MetricVersion.drafts.where(metric_id: @metric.id).find_by(id: params[:draft_id])
145
+ label = draft&.version_label
141
146
  draft&.destroy
142
147
  target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
143
- redirect_to target, notice: "Dismissed."
148
+ redirect_to target, notice: label ? "Discarded draft #{label}." : "Draft already gone."
149
+ end
150
+
151
+ def exclude_example
152
+ calibration = Calibration.where(metric_id: @metric.id).find(params[:calibration_id])
153
+ calibration.update!(excluded_from_examples: true)
154
+ render turbo_stream: turbo_stream.replace(
155
+ "ck-guiding-#{@metric.id}",
156
+ partial: "completion_kit/metrics/guiding_examples",
157
+ locals: { metric: @metric, examples: MetricCalibrationExamples.judge_examples_for(@metric) }
158
+ )
144
159
  end
145
160
 
146
161
  def publish_draft
@@ -164,7 +179,7 @@ module CompletionKit
164
179
  audit = version.revert!
165
180
  prior_label = previously_current.version_label
166
181
  redirect_to metric_path(@metric),
167
- notice: "Reverted to #{@metric.name} #{version.version_label} (now logged as #{audit.version_label}). Pinned cases still flow to the judge, and calibration verdicts collected against #{prior_label} stay tied to it."
182
+ notice: "Reverted #{@metric.name} to #{version.version_label} (logged as #{audit.version_label}). Human reviews collected against #{prior_label} stay tied to it."
168
183
  else
169
184
  version.publish!
170
185
  redirect_to metric_path(@metric),
@@ -172,33 +187,12 @@ module CompletionKit
172
187
  end
173
188
  end
174
189
 
175
- def add_few_shot
176
- calibration = Calibration.where(metric_id: @metric.id, verdict: "disagree").find(params[:calibration_id])
177
- review = calibration.response.reviews.find_by(metric_id: @metric.id)
178
- examples = Array(@metric.few_shot_examples)
179
- examples << {
180
- "input" => calibration.response.input_data.to_s.truncate(2000),
181
- "response" => calibration.response.response_text.to_s.truncate(2000),
182
- "judge_score" => review&.ai_score&.to_f,
183
- "judge_feedback" => review&.ai_feedback.to_s.truncate(1000),
184
- "human_score" => calibration.corrected_score&.to_f,
185
- "human_note" => calibration.note.to_s.truncate(1000),
186
- "calibration_id" => calibration.id,
187
- "added_at" => Time.current.utc.iso8601
188
- }
189
- @metric.update!(few_shot_examples: examples)
190
- redirect_to metric_path(@metric), notice: "Got it. The judge will remember this next time it grades."
191
- end
190
+ private
192
191
 
193
- def remove_few_shot
194
- cal_id = params[:calibration_id].to_i
195
- remaining = Array(@metric.few_shot_examples).reject { |fs| fs["calibration_id"].to_i == cal_id }
196
- @metric.update!(few_shot_examples: remaining)
197
- redirect_to metric_path(@metric), notice: "Forgotten. The judge won't see this case next time."
192
+ def ensure_examples_from_reviews_enabled
193
+ head :not_found unless CompletionKit.config.judge_examples_from_reviews
198
194
  end
199
195
 
200
- private
201
-
202
196
  def set_metric
203
197
  @metric = Metric.find(params[:id])
204
198
  end
@@ -95,7 +95,7 @@ module CompletionKit
95
95
 
96
96
  def regrade
97
97
  if @run.regrade!
98
- redirect_to run_path(@run), notice: "Re-grading existing responses with the current judge."
98
+ redirect_to run_path(@run), notice: "Re-grading existing responses against the current metrics."
99
99
  else
100
100
  redirect_to run_path(@run), alert: "Nothing to re-grade. The run has no succeeded responses or no metrics attached."
101
101
  end
@@ -151,7 +151,7 @@ module CompletionKit
151
151
  def retry_failures
152
152
  if @run.stale_review_summary.any?
153
153
  redirect_to run_path(@run),
154
- alert: "The judge has changed since this run executed. Retrying failed cases would mix scores from two metric versions in the same run. Use 'Re-run with current judge' to refresh everything against the live judge."
154
+ alert: "A metric has a newer version than the one this run was scored against. Retrying failed cases would mix scores from two versions in the same run. Use 'Re-run from scratch' to refresh everything against the current metrics."
155
155
  return
156
156
  end
157
157
 
@@ -58,8 +58,8 @@ module CompletionKit
58
58
  run.prompt&.template,
59
59
  criteria: metric.instruction.to_s,
60
60
  rubric_text: metric.display_rubric_text,
61
- human_examples: few_shot_payload(metric),
62
- input_data: response.input_data
61
+ input_data: response.input_data,
62
+ human_examples: review_examples_for(metric, response)
63
63
  )
64
64
 
65
65
  review = response.reviews.find_or_initialize_by(metric_id: metric.id)
@@ -81,9 +81,13 @@ module CompletionKit
81
81
 
82
82
  private
83
83
 
84
- # A model with supports_judging == nil ("untested") just produced a valid
85
- # review promote it to confirmed. No-op once confirmed (so repeated runs
86
- # don't churn the row), and a model already flagged as a bad judge stays so.
84
+ def review_examples_for(metric, response)
85
+ return nil unless CompletionKit.config.judge_calibration_enabled
86
+ return nil unless CompletionKit.config.judge_examples_from_reviews
87
+
88
+ MetricCalibrationExamples.judge_examples_for(metric, exclude_response_id: response.id)
89
+ end
90
+
87
91
  def confirm_judging_capability(judge_model_id)
88
92
  model = Model.find_by(provider: ApiConfig.provider_for_model(judge_model_id), model_id: judge_model_id)
89
93
  return unless model && model.supports_judging.nil?
@@ -116,16 +120,5 @@ module CompletionKit
116
120
  response = Response.find_by(id: @response_id)
117
121
  RunCompletionCheckJob.perform_later(response.run_id) if response
118
122
  end
119
-
120
- def few_shot_payload(metric)
121
- return nil unless CompletionKit.config.judge_calibration_enabled
122
- Array(metric.few_shot_examples).map do |fs|
123
- {
124
- human_score: fs["human_score"],
125
- response_text: fs["response"].to_s,
126
- human_note: fs["human_note"].to_s
127
- }
128
- end
129
- end
130
123
  end
131
124
  end
@@ -17,7 +17,6 @@ module CompletionKit
17
17
  has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
18
18
 
19
19
  serialize :rubric_bands, coder: JSON
20
- serialize :few_shot_examples, coder: JSON, type: Array
21
20
 
22
21
  validates :name, presence: true
23
22
  validates :key, tenant_scoped_uniqueness: { allow_nil: true }
@@ -40,6 +40,35 @@ module CompletionKit
40
40
  "v#{version_number}"
41
41
  end
42
42
 
43
+ def change_summary_against(previous)
44
+ return nil if previous.nil?
45
+
46
+ instruction_changed = previous.instruction.to_s.strip != instruction.to_s.strip
47
+ rubric_changes = rubric_band_change_count(previous)
48
+ return nil unless instruction_changed || rubric_changes.positive?
49
+
50
+ dimensions = []
51
+ dimensions << "instruction" if instruction_changed
52
+ dimensions << "rubric" if rubric_changes.positive?
53
+
54
+ words_changed = 0
55
+ if instruction_changed
56
+ old_words = previous.instruction.to_s.split
57
+ new_words = instruction.to_s.split
58
+ words_changed = (old_words - new_words).size + (new_words - old_words).size
59
+ end
60
+
61
+ magnitude = if rubric_changes >= 2 || (instruction_changed && rubric_changes >= 1) || words_changed >= 15
62
+ :major
63
+ elsif rubric_changes == 1 || words_changed >= 4
64
+ :minor
65
+ else
66
+ :trivial
67
+ end
68
+
69
+ { magnitude: magnitude, label: "#{magnitude.to_s.capitalize} #{dimensions.to_sentence} changes" }
70
+ end
71
+
43
72
  def publish!
44
73
  MetricVersion.transaction do
45
74
  self.class.where(metric_id: metric_id).where.not(id: id).update_all(current: false)
@@ -86,6 +115,12 @@ module CompletionKit
86
115
 
87
116
  private
88
117
 
118
+ def rubric_band_change_count(previous)
119
+ prev = Metric.normalize_rubric_bands(previous.rubric_bands)
120
+ curr = Metric.normalize_rubric_bands(rubric_bands)
121
+ prev.zip(curr).count { |p, c| p["description"].to_s.strip != c["description"].to_s.strip }
122
+ end
123
+
89
124
  def assign_version_number
90
125
  return if version_number.present?
91
126
  max = self.class.where(metric_id: metric_id).maximum(:version_number).to_i
@@ -10,13 +10,14 @@ module CompletionKit
10
10
  @judge_client = LlmClient.for_model(@judge_model, ApiConfig.for_model(@judge_model))
11
11
  end
12
12
 
13
- def evaluate(output, expected_output = nil, prompt = nil, criteria: nil, rubric_text: nil, human_examples: nil, input_data: nil, **_extras)
13
+ def evaluate(output, expected_output = nil, prompt = nil, criteria: nil, rubric_text: nil, input_data: nil, human_examples: nil, **_extras)
14
14
  raise CompletionKit::ConfigurationError, "Judge not configured" unless @judge_client.configured?
15
15
 
16
16
  judge_prompt = build_judge_prompt(output, expected_output, prompt,
17
17
  criteria: criteria,
18
- rubric_text: rubric_text, human_examples: human_examples,
19
- input_data: input_data)
18
+ rubric_text: rubric_text,
19
+ input_data: input_data,
20
+ human_examples: human_examples)
20
21
 
21
22
  response = @judge_client.generate_completion(judge_prompt, model: @judge_model)
22
23
  raise StandardError, response if response.start_with?("Error:")
@@ -25,7 +26,7 @@ module CompletionKit
25
26
 
26
27
  private
27
28
 
28
- def build_judge_prompt(output, expected_output, prompt, criteria: nil, rubric_text: nil, human_examples: nil, input_data: nil)
29
+ def build_judge_prompt(output, expected_output, prompt, criteria: nil, rubric_text: nil, input_data: nil, human_examples: nil)
29
30
  judge_prompt = <<~PROMPT
30
31
  You are an expert evaluator. You MUST respond with ONLY two lines in this exact format, nothing else:
31
32
 
@@ -42,12 +43,7 @@ module CompletionKit
42
43
  judge_prompt += "\nCriteria: #{criteria}\n"
43
44
  end
44
45
 
45
- if human_examples.present?
46
- judge_prompt += "\nCalibration examples:\n"
47
- human_examples.each_with_index do |example, index|
48
- judge_prompt += "Example #{index + 1}: score=#{example[:human_score]} output=#{example[:response_text].to_s.truncate(200)}\n"
49
- end
50
- end
46
+ judge_prompt += human_examples_block(human_examples)
51
47
 
52
48
  judge_prompt += <<~PROMPT
53
49
 
@@ -60,6 +56,19 @@ module CompletionKit
60
56
  judge_prompt
61
57
  end
62
58
 
59
+ def human_examples_block(examples)
60
+ return "" if examples.blank?
61
+
62
+ lines = ["", "Reviewed examples where a human corrected the judge on this metric. Weigh them when scoring:"]
63
+ examples.each_with_index do |example, index|
64
+ note = example[:human_note].to_s
65
+ line = "Example #{index + 1}: Output: #{example[:output].to_s.truncate(200)}. The judge scored this #{example[:judge_score].to_i}/5. A reviewer corrected it to #{example[:human_score].to_i}/5"
66
+ line += note.present? ? ": #{note.truncate(160)}" : "."
67
+ lines << line
68
+ end
69
+ lines.join("\n") + "\n"
70
+ end
71
+
63
72
  def parse_judge_response(response)
64
73
  score_match = response.match(/\*{0,2}Score:?\*{0,2}\s*(\d+(?:\.\d+)?)/i)
65
74
  feedback_match = response.match(/\*{0,2}Feedback:?\*{0,2}\s*(.+)/mi)
@@ -16,7 +16,7 @@ module CompletionKit
16
16
  handler: :list
17
17
  },
18
18
  "metric_versions_publish" => {
19
- description: "Publish a MetricVersion as the live judge for its metric. Works for both 'draft → published' and 'revert to an older published version → current'. Transactionally flips current, demotes peers, and writes the version's instruction + rubric_bands back onto the metric so the judge actually uses them.",
19
+ description: "Publish a MetricVersion as the live version of its metric. Works for both 'draft → published' and 'revert to an older published version → current'. Transactionally flips current, demotes peers, and writes the version's instruction + rubric_bands back onto the metric so the judge grades against it.",
20
20
  inputSchema: {
21
21
  type: "object",
22
22
  properties: {
@@ -0,0 +1,56 @@
1
+ module CompletionKit
2
+ module MetricCalibrationExamples
3
+ DEFAULT_JUDGE_EXAMPLE_LIMIT = 5
4
+
5
+ module_function
6
+
7
+ def for(metric, limit: 8)
8
+ disagreements_for(metric, limit: limit)
9
+ end
10
+
11
+ def disagreements_for(metric, limit: 8)
12
+ calibrations_for(metric, verdict: "disagree", limit: limit)
13
+ end
14
+
15
+ def borderlines_for(metric, limit: 6)
16
+ calibrations_for(metric, verdict: "borderline", limit: limit)
17
+ end
18
+
19
+ def judge_examples_for(metric, exclude_response_id: nil, limit: DEFAULT_JUDGE_EXAMPLE_LIMIT)
20
+ current_version = MetricVersion.current.find_by(metric_id: metric.id)
21
+ return [] unless current_version
22
+
23
+ relation = Calibration
24
+ .where(metric_id: metric.id, metric_version_id: current_version.id, excluded_from_examples: false)
25
+ .where.not(corrected_score: nil)
26
+ relation = relation.where.not(response_id: exclude_response_id) if exclude_response_id
27
+ map_examples(relation.includes(response: :reviews).order(created_at: :desc).limit(limit), metric)
28
+ .reject { |example| example[:judge_score].nil? }
29
+ end
30
+
31
+ def calibrations_for(metric, verdict:, limit:)
32
+ base = Calibration.where(metric_id: metric.id, verdict: verdict)
33
+ current_version = MetricVersion.current.find_by(metric_id: metric.id)
34
+ scoped = current_version ? base.where(metric_version_id: current_version.id) : base
35
+ effective = scoped.exists? ? scoped : base
36
+ map_examples(effective.includes(response: :reviews).order(created_at: :desc).limit(limit), metric)
37
+ end
38
+
39
+ def map_examples(relation, metric)
40
+ relation.map do |cal|
41
+ review = cal.response.reviews.find { |r| r.metric_id == metric.id }
42
+ {
43
+ id: cal.id,
44
+ run_id: cal.run_id,
45
+ response_id: cal.response_id,
46
+ input: cal.response.input_data,
47
+ output: cal.response.response_text,
48
+ judge_score: review&.ai_score,
49
+ judge_feedback: review&.ai_feedback,
50
+ human_score: cal.corrected_score,
51
+ human_note: cal.note
52
+ }
53
+ end
54
+ end
55
+ end
56
+ end
@@ -43,7 +43,6 @@ module CompletionKit
43
43
  def build_meta_prompt
44
44
  disagreements = MetricCalibrationExamples.disagreements_for(@metric)
45
45
  borderlines = MetricCalibrationExamples.borderlines_for(@metric)
46
- pinned_examples = Array(@metric.few_shot_examples)
47
46
  sections = []
48
47
  sections << "You are an expert evaluator. The judge below is misaligned with humans. Propose #{@count == 1 ? "a single" : "#{@count}"} concrete rewrite that closes the gap."
49
48
  sections << ""
@@ -78,18 +77,6 @@ module CompletionKit
78
77
  sections << ""
79
78
  end
80
79
  end
81
- if pinned_examples.any?
82
- sections << "## Pinned cases the judge already references"
83
- sections << "These are cases the operator pinned for the judge to remember. The improved rubric must remain consistent with these — that is, the new instruction + rubric should produce roughly the human_score on these inputs, not the judge_score."
84
- pinned_examples.each_with_index do |ex, i|
85
- sections << "### Pinned #{i + 1}"
86
- sections << "Input: #{ex["input"].to_s.truncate(200)}"
87
- sections << "Output: #{ex["response"].to_s.truncate(200)}"
88
- sections << "Judge previously said #{ex["judge_score"]}/5: #{ex["judge_feedback"].to_s.truncate(160)}"
89
- sections << "Human said #{ex["human_score"]}/5: #{ex["human_note"].to_s.truncate(160)}"
90
- sections << ""
91
- end
92
- end
93
80
  sections << "## Task"
94
81
  sections << "Make one substantive change. Don't just reword. If the disagreements look like instruction problems, rewrite the instruction. If they look like rubric problems (overlapping bands, undefined edge cases), rewrite the rubric. Rewrite both if both are wrong."
95
82
  sections << ""
@@ -130,40 +117,4 @@ module CompletionKit
130
117
  end
131
118
  end
132
119
 
133
- module MetricCalibrationExamples
134
- module_function
135
-
136
- def for(metric, limit: 8)
137
- disagreements_for(metric, limit: limit)
138
- end
139
-
140
- def disagreements_for(metric, limit: 8)
141
- calibrations_for(metric, verdict: "disagree", limit: limit)
142
- end
143
-
144
- def borderlines_for(metric, limit: 6)
145
- calibrations_for(metric, verdict: "borderline", limit: limit)
146
- end
147
-
148
- def calibrations_for(metric, verdict:, limit:)
149
- base = Calibration.where(metric_id: metric.id, verdict: verdict)
150
- current_version = MetricVersion.current.find_by(metric_id: metric.id)
151
- scoped = current_version ? base.where(metric_version_id: current_version.id) : base
152
- effective = scoped.exists? ? scoped : base
153
- effective.includes(response: :reviews)
154
- .order(created_at: :desc)
155
- .limit(limit)
156
- .map do |cal|
157
- review = cal.response.reviews.find { |r| r.metric_id == metric.id }
158
- {
159
- input: cal.response.input_data,
160
- output: cal.response.response_text,
161
- judge_score: review&.ai_score,
162
- judge_feedback: review&.ai_feedback,
163
- human_score: cal.corrected_score,
164
- human_note: cal.note
165
- }
166
- end
167
- end
168
- end
169
120
  end
@@ -27,7 +27,7 @@
27
27
  <label for="ck-tab-runs" class="ck-api-tabs__label">Runs <span class="ck-api-tabs__count">10</span></label>
28
28
  <label for="ck-tab-responses" class="ck-api-tabs__label">Responses <span class="ck-api-tabs__count">2</span></label>
29
29
  <label for="ck-tab-datasets" class="ck-api-tabs__label">Datasets <span class="ck-api-tabs__count">5</span></label>
30
- <label for="ck-tab-metrics" class="ck-api-tabs__label">Metrics <span class="ck-api-tabs__count">12</span></label>
30
+ <label for="ck-tab-metrics" class="ck-api-tabs__label">Metrics <span class="ck-api-tabs__count">10</span></label>
31
31
  <label for="ck-tab-metric-groups" class="ck-api-tabs__label">Metric Groups <span class="ck-api-tabs__count">5</span></label>
32
32
  <label for="ck-tab-calibrations" class="ck-api-tabs__label">Calibrations <span class="ck-api-tabs__count">3</span></label>
33
33
  <label for="ck-tab-tags" class="ck-api-tabs__label">Tags <span class="ck-api-tabs__count">5</span></label>
@@ -239,7 +239,7 @@
239
239
 
240
240
  <div class="ck-api-endpoint" style="padding-top: 1.5rem;">
241
241
  <p class="ck-kicker" style="margin-bottom: 0.5rem;">Calibration loop</p>
242
- <p class="ck-meta-copy">Drive metric improvement from disagree-flagged calibrations: ask the model for variants, then pin individual cases as few-shot examples on the metric.</p>
242
+ <p class="ck-meta-copy">Drive metric improvement from disagree-flagged calibrations: ask the model to rewrite the instruction and rubric into a new draft version.</p>
243
243
  </div>
244
244
  <div class="ck-api-endpoint">
245
245
  <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/metrics/:id/suggest_variants</p>
@@ -247,16 +247,6 @@
247
247
  <p class="ck-api-params"><strong>Optional:</strong>&ensp;<code>count</code>, <code>model</code></p>
248
248
  <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/metrics/1/suggest_variants \\\n -H \"Authorization: Bearer #{token}\"" %>
249
249
  </div>
250
- <div class="ck-api-endpoint">
251
- <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/metrics/:id/add_few_shot</p>
252
- <p class="ck-meta-copy">Pin a disagree calibration as a few-shot example on the metric. Returns the updated metric, or 404 if the calibration is not a disagree on this metric.</p>
253
- <p class="ck-api-params"><strong>Required:</strong>&ensp;<code>calibration_id</code></p>
254
- </div>
255
- <div class="ck-api-endpoint">
256
- <p class="ck-api-method"><span class="ck-chip" style="color: var(--ck-danger);">DELETE</span> /api/v1/metrics/:id/remove_few_shot</p>
257
- <p class="ck-meta-copy">Drop the pinned few-shot example by calibration ID.</p>
258
- <p class="ck-api-params"><strong>Required:</strong>&ensp;<code>calibration_id</code></p>
259
- </div>
260
250
 
261
251
  <div class="ck-api-endpoint" style="padding-top: 1.5rem;">
262
252
  <p class="ck-kicker" style="margin-bottom: 0.5rem;">Metric versions</p>
@@ -1,16 +1,12 @@
1
1
  <% stats = local_assigns[:stats] %>
2
2
  <% metric = local_assigns[:metric] %>
3
3
  <% anchor = metric&.name&.parameterize %>
4
- <% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric
4
+ <% current_metric_version = metric && CompletionKit::MetricVersion.current.find_by(metric_id: metric.id) %>
5
+ <% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric && current_metric_version
5
6
  created_by = CompletionKit.config.username.presence || "operator"
6
- current_metric_version = CompletionKit::MetricVersion.current.find_by(metric_id: metric.id)
7
- verdicted_ids = if current_metric_version
8
- CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
9
- else
10
- []
11
- end
7
+ verdicted_ids = CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
12
8
  CompletionKit::Response.joins(:reviews)
13
- .where(reviews: { metric_id: metric.id })
9
+ .where(reviews: { metric_id: metric.id, metric_version_id: current_metric_version.id })
14
10
  .where.not(reviews: { ai_score: nil })
15
11
  .where.not(id: verdicted_ids)
16
12
  .order(created_at: :desc).first
@@ -22,19 +18,29 @@
22
18
  end %>
23
19
 
24
20
  <p class="ck-trust-line ck-trust-line--<%= stats.gate %>">
25
- <span class="ck-trust-line__label"><%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true" %>Calibration</span>
26
21
  <% if stats.sample_size.zero? %>
27
- <span class="ck-trust-line__state">Not measured yet.</span>
28
- <span class="ck-trust-line__hint">Needs <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts on the judge's scores.<% if prior_version_verdicts > 0 %> (<%= pluralize(prior_version_verdicts, "verdict") %> on prior versions, tied to that version's history.)<% end %><% if target_response %>
29
- <%= link_to "Give a verdict →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-trust-line__link" %>
30
- <% else %> Run this metric on a dataset, then give a verdict.<% end %></span>
22
+ <span class="ck-trust-line__lead">Not measured yet.</span>
23
+ <span class="ck-trust-line__hint"><%= current_metric_version ? "#{current_metric_version.version_label} needs" : "Needs" %> <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> human reviews of the judge's scores.</span>
24
+ <% if target_response %>
25
+ <%= link_to "Review a judge's score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
26
+ <% end %>
31
27
  <% elsif stats.counter_only? %>
32
- <span class="ck-trust-line__counter"><%= stats.sample_size %>/<%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></span>
33
- <span class="ck-trust-line__hint"><%= pluralize(stats.sample_size, "verdict") %><% if stats.short_to_target > 0 %> · <%= stats.short_to_target %> more before this can be measured<% end %><% if target_response %> · <%= link_to "Give another verdict →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-trust-line__link" %><% end %></span>
28
+ <span class="ck-cal-stat"><span class="ck-cal-stat__label">Human reviews</span> <strong><%= stats.sample_size %> / <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></strong></span>
29
+ <% if stats.short_to_target > 0 %><span class="ck-trust-line__hint"><%= stats.short_to_target %> more to report a rate</span><% end %>
30
+ <% if target_response %>
31
+ <%= link_to "Review another score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
32
+ <% end %>
34
33
  <% else %>
35
- <span class="ck-trust-line__score" title="Roughly how often the judge and the humans agreed.">~<%= (stats.agreement_point * 100).round %>%</span>
36
- <span class="ck-trust-line__margin" title="The range we're confident the true rate sits in.">±<%= (stats.margin * 100).round %> pt</span>
37
- <span class="ck-trust-line__gate" title="<%= stats.firm? ? 'Enough verdicts for a reliable read.' : 'Early read. Keep giving verdicts.' %>"><%= stats.firm? ? "settled" : "early" %></span>
38
- <span class="ck-trust-line__hint"><%= pluralize(stats.sample_size, "verdict") %><% if stats.borderline_rate && stats.borderline_rate > 0 %><% level = stats.borderline_rate > 0.30 ? "danger" : stats.borderline_rate > 0.15 ? "warning" : "ok" %> · <span class="ck-trust-line__borderline ck-trust-line__borderline--<%= level %>" title="<%= level == 'ok' ? '' : 'Reviewers said the rubric was unclear here.' %>"><%= (stats.borderline_rate * 100).round %>% unclear</span><% end %></span>
34
+ <span class="ck-cal-stat"><span class="ck-cal-stat__label">Agreement</span> <strong class="ck-trust-line__figure">~<%= (stats.agreement_point * 100).round %>%</strong></span>
35
+ <span class="ck-cal-stat"><span class="ck-cal-stat__label">Margin</span> ±<%= (stats.margin * 100).round %> pt</span>
36
+ <span class="ck-cal-stat"><span class="ck-cal-stat__label">Read</span> <%= stats.firm? ? "settled" : "early" %></span>
37
+ <span class="ck-cal-stat"><span class="ck-cal-stat__label">Sample</span> <%= stats.sample_size %></span>
38
+ <% if stats.borderline_rate && stats.borderline_rate > 0 %>
39
+ <% level = stats.borderline_rate > 0.30 ? "danger" : stats.borderline_rate > 0.15 ? "warning" : "ok" %>
40
+ <span class="ck-cal-stat"><span class="ck-cal-stat__label">Unclear</span> <span class="ck-trust-line__borderline ck-trust-line__borderline--<%= level %>"><%= (stats.borderline_rate * 100).round %>%</span></span>
41
+ <% end %>
39
42
  <% end %>
40
43
  </p>
44
+ <% if stats.sample_size.zero? && prior_version_verdicts > 0 %>
45
+ <p class="ck-trust-line__aside"><%= pluralize(prior_version_verdicts, "review") %> from an earlier version <%= prior_version_verdicts == 1 ? "doesn't" : "don't" %> count toward this version.</p>
46
+ <% end %>
@@ -40,20 +40,19 @@
40
40
  <% if suggestion %>
41
41
  <div class="ck-suggestion-banner" role="status">
42
42
  <div class="ck-suggestion-banner__body">
43
- <p class="ck-kicker">Proposed improvements</p>
44
- <p class="ck-meta-copy">Based on your disagreements, the model proposed these changes to the instruction and rubric. Apply pieces inline below, take everything at once, try again, or discard.</p>
43
+ <p class="ck-kicker ck-kicker--icon"><%= heroicon_tag "sparkles", variant: :outline, class: "ck-magic-icon", "aria-hidden": "true" %>Proposed changes</p>
44
+ <p class="ck-meta-copy">Based on human reviews, here are some proposed changes to the metric.</p>
45
45
  </div>
46
46
  <div class="ck-suggestion-banner__actions">
47
- <%= button_to "Try again", suggest_variants_metric_path(metric, back_to: "edit"),
48
- method: :post, form_class: "inline-block",
49
- class: ck_button_classes(:light, variant: :outline),
50
- title: "Discard these improvements and ask the model for fresh ones.",
51
- data: { turbo_confirm: "Replace these improvements with fresh ones from the model?" } %>
52
- <%= button_to "Discard", dismiss_suggestion_metric_path(metric, draft_id: suggestion.id, back_to: "edit"),
53
- method: :delete, form_class: "inline-block",
54
- class: ck_button_classes(:light, variant: :outline),
55
- data: { turbo_confirm: "Drop these improvements?" } %>
56
- <%= button_to "Take everything", publish_draft_metric_path(metric, draft_id: suggestion.id),
47
+ <%= button_to suggest_variants_metric_path(metric, back_to: "edit"),
48
+ method: :post, form_class: "inline-block", class: "ck-icon-btn",
49
+ title: "Try again", "aria-label": "Try again",
50
+ data: { turbo_confirm: "Replace these changes with fresh ones from the model?" } do %><%= heroicon_tag "arrow-path", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
51
+ <%= button_to dismiss_suggestion_metric_path(metric, draft_id: suggestion.id, back_to: "edit"),
52
+ method: :delete, form_class: "inline-block", class: "ck-icon-btn",
53
+ title: "Discard these changes", "aria-label": "Discard",
54
+ data: { turbo_confirm: "Drop these changes?" } do %><%= heroicon_tag "trash", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
55
+ <%= button_to "Apply all", publish_draft_metric_path(metric, draft_id: suggestion.id),
57
56
  method: :post, form_class: "inline-block",
58
57
  class: ck_button_classes(:dark) %>
59
58
  </div>