completion-kit 0.8.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  module CompletionKit
2
2
  class MetricsController < ApplicationController
3
3
  include CompletionKit::TagFiltering
4
- before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :remove_few_shot, :publish_draft, :suggest_variants, :dismiss_suggestion]
4
+ before_action :set_metric, only: [:show, :edit, :update, :destroy, :publish_draft, :suggest_variants, :dismiss_suggestion]
5
5
 
6
6
  def index
7
7
  @metrics = apply_tag_filter(Metric.includes(:metric_groups, :tags).order(:name))
@@ -35,11 +35,6 @@ module CompletionKit
35
35
  end
36
36
 
37
37
  def show
38
- @published_metric_version = MetricVersion.ensure_current_for(@metric)
39
- @disagreements = Calibration.where(metric_id: @metric.id, verdict: "disagree")
40
- .includes(:metric_version, response: [:reviews, :run])
41
- .order(created_at: :desc)
42
- .limit(50)
43
38
  @edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
44
39
  @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
45
40
  @improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
@@ -54,6 +49,7 @@ module CompletionKit
54
49
  @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
55
50
  @edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
56
51
  @published_metric_version = MetricVersion.published.where(metric_id: @metric.id, current: true).first
52
+ @improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
57
53
 
58
54
  if @edit_draft
59
55
  @metric.instruction = @edit_draft.instruction
@@ -102,7 +98,7 @@ module CompletionKit
102
98
  state: "draft", source: "edit", current: false
103
99
  )
104
100
  redirect_to edit_metric_path(@metric),
105
- notice: "Saved as draft #{draft.version_label}. Publish to push these changes to the live judge."
101
+ notice: "Saved as draft #{draft.version_label}. Publish to make these changes the metric's live version."
106
102
  else
107
103
  @metric.update!(instruction: new_instruction, rubric_bands: new_rubric)
108
104
  current_pub = MetricVersion.published.where(metric_id: @metric.id, current: true).first
@@ -120,7 +116,7 @@ module CompletionKit
120
116
  target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
121
117
  disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
122
118
  if disagreement_count.zero?
123
- redirect_to target, alert: "Mark at least one row as Disagree before asking the model to suggest a change."
119
+ redirect_to target, alert: "Mark at least one case as Disagree before asking the model to suggest a change."
124
120
  return
125
121
  end
126
122
 
@@ -132,15 +128,21 @@ module CompletionKit
132
128
  redirect_to target, alert: "The model returned no usable variants. Try again with a different model."
133
129
  return
134
130
  end
135
- generator.persist!(variants)
136
- redirect_to target, notice: "Drafted a new version. Review it below."
131
+ versions = generator.persist!(variants)
132
+ new_version = versions.max_by(&:version_number)
133
+ if params[:back_to] == "edit"
134
+ redirect_to edit_metric_path(@metric), notice: "Drafted #{new_version.version_label} from your reviews. Review the proposed changes below, then Publish to use it."
135
+ else
136
+ redirect_to metric_path(@metric, show_change: new_version.id), notice: "Drafted #{new_version.version_label} from your reviews."
137
+ end
137
138
  end
138
139
 
139
140
  def dismiss_suggestion
140
141
  draft = MetricVersion.drafts.where(metric_id: @metric.id).find_by(id: params[:draft_id])
142
+ label = draft&.version_label
141
143
  draft&.destroy
142
144
  target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
143
- redirect_to target, notice: "Dismissed."
145
+ redirect_to target, notice: label ? "Discarded draft #{label}." : "Draft already gone."
144
146
  end
145
147
 
146
148
  def publish_draft
@@ -164,7 +166,7 @@ module CompletionKit
164
166
  audit = version.revert!
165
167
  prior_label = previously_current.version_label
166
168
  redirect_to metric_path(@metric),
167
- notice: "Reverted to #{@metric.name} #{version.version_label} (now logged as #{audit.version_label}). Pinned cases still flow to the judge, and calibration verdicts collected against #{prior_label} stay tied to it."
169
+ notice: "Reverted #{@metric.name} to #{version.version_label} (logged as #{audit.version_label}). Human reviews collected against #{prior_label} stay tied to it."
168
170
  else
169
171
  version.publish!
170
172
  redirect_to metric_path(@metric),
@@ -172,31 +174,6 @@ module CompletionKit
172
174
  end
173
175
  end
174
176
 
175
- def add_few_shot
176
- calibration = Calibration.where(metric_id: @metric.id, verdict: "disagree").find(params[:calibration_id])
177
- review = calibration.response.reviews.find_by(metric_id: @metric.id)
178
- examples = Array(@metric.few_shot_examples)
179
- examples << {
180
- "input" => calibration.response.input_data.to_s.truncate(2000),
181
- "response" => calibration.response.response_text.to_s.truncate(2000),
182
- "judge_score" => review&.ai_score&.to_f,
183
- "judge_feedback" => review&.ai_feedback.to_s.truncate(1000),
184
- "human_score" => calibration.corrected_score&.to_f,
185
- "human_note" => calibration.note.to_s.truncate(1000),
186
- "calibration_id" => calibration.id,
187
- "added_at" => Time.current.utc.iso8601
188
- }
189
- @metric.update!(few_shot_examples: examples)
190
- redirect_to metric_path(@metric), notice: "Got it. The judge will remember this next time it grades."
191
- end
192
-
193
- def remove_few_shot
194
- cal_id = params[:calibration_id].to_i
195
- remaining = Array(@metric.few_shot_examples).reject { |fs| fs["calibration_id"].to_i == cal_id }
196
- @metric.update!(few_shot_examples: remaining)
197
- redirect_to metric_path(@metric), notice: "Forgotten. The judge won't see this case next time."
198
- end
199
-
200
177
  private
201
178
 
202
179
  def set_metric
@@ -95,7 +95,7 @@ module CompletionKit
95
95
 
96
96
  def regrade
97
97
  if @run.regrade!
98
- redirect_to run_path(@run), notice: "Re-grading existing responses with the current judge."
98
+ redirect_to run_path(@run), notice: "Re-grading existing responses against the current metrics."
99
99
  else
100
100
  redirect_to run_path(@run), alert: "Nothing to re-grade. The run has no succeeded responses or no metrics attached."
101
101
  end
@@ -151,7 +151,7 @@ module CompletionKit
151
151
  def retry_failures
152
152
  if @run.stale_review_summary.any?
153
153
  redirect_to run_path(@run),
154
- alert: "The judge has changed since this run executed. Retrying failed cases would mix scores from two metric versions in the same run. Use 'Re-run with current judge' to refresh everything against the live judge."
154
+ alert: "A metric has a newer version than the one this run was scored against. Retrying failed cases would mix scores from two versions in the same run. Use 'Re-run from scratch' to refresh everything against the current metrics."
155
155
  return
156
156
  end
157
157
 
@@ -58,7 +58,6 @@ module CompletionKit
58
58
  run.prompt&.template,
59
59
  criteria: metric.instruction.to_s,
60
60
  rubric_text: metric.display_rubric_text,
61
- human_examples: few_shot_payload(metric),
62
61
  input_data: response.input_data
63
62
  )
64
63
 
@@ -116,16 +115,5 @@ module CompletionKit
116
115
  response = Response.find_by(id: @response_id)
117
116
  RunCompletionCheckJob.perform_later(response.run_id) if response
118
117
  end
119
-
120
- def few_shot_payload(metric)
121
- return nil unless CompletionKit.config.judge_calibration_enabled
122
- Array(metric.few_shot_examples).map do |fs|
123
- {
124
- human_score: fs["human_score"],
125
- response_text: fs["response"].to_s,
126
- human_note: fs["human_note"].to_s
127
- }
128
- end
129
- end
130
118
  end
131
119
  end
@@ -17,7 +17,6 @@ module CompletionKit
17
17
  has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
18
18
 
19
19
  serialize :rubric_bands, coder: JSON
20
- serialize :few_shot_examples, coder: JSON, type: Array
21
20
 
22
21
  validates :name, presence: true
23
22
  validates :key, tenant_scoped_uniqueness: { allow_nil: true }
@@ -40,6 +40,35 @@ module CompletionKit
40
40
  "v#{version_number}"
41
41
  end
42
42
 
43
+ def change_summary_against(previous)
44
+ return nil if previous.nil?
45
+
46
+ instruction_changed = previous.instruction.to_s.strip != instruction.to_s.strip
47
+ rubric_changes = rubric_band_change_count(previous)
48
+ return nil unless instruction_changed || rubric_changes.positive?
49
+
50
+ dimensions = []
51
+ dimensions << "instruction" if instruction_changed
52
+ dimensions << "rubric" if rubric_changes.positive?
53
+
54
+ words_changed = 0
55
+ if instruction_changed
56
+ old_words = previous.instruction.to_s.split
57
+ new_words = instruction.to_s.split
58
+ words_changed = (old_words - new_words).size + (new_words - old_words).size
59
+ end
60
+
61
+ magnitude = if rubric_changes >= 2 || (instruction_changed && rubric_changes >= 1) || words_changed >= 15
62
+ :major
63
+ elsif rubric_changes == 1 || words_changed >= 4
64
+ :minor
65
+ else
66
+ :trivial
67
+ end
68
+
69
+ { magnitude: magnitude, label: "#{magnitude.to_s.capitalize} #{dimensions.to_sentence} changes" }
70
+ end
71
+
43
72
  def publish!
44
73
  MetricVersion.transaction do
45
74
  self.class.where(metric_id: metric_id).where.not(id: id).update_all(current: false)
@@ -86,6 +115,12 @@ module CompletionKit
86
115
 
87
116
  private
88
117
 
118
+ def rubric_band_change_count(previous)
119
+ prev = Metric.normalize_rubric_bands(previous.rubric_bands)
120
+ curr = Metric.normalize_rubric_bands(rubric_bands)
121
+ prev.zip(curr).count { |p, c| p["description"].to_s.strip != c["description"].to_s.strip }
122
+ end
123
+
89
124
  def assign_version_number
90
125
  return if version_number.present?
91
126
  max = self.class.where(metric_id: metric_id).maximum(:version_number).to_i
@@ -10,12 +10,12 @@ module CompletionKit
10
10
  @judge_client = LlmClient.for_model(@judge_model, ApiConfig.for_model(@judge_model))
11
11
  end
12
12
 
13
- def evaluate(output, expected_output = nil, prompt = nil, criteria: nil, rubric_text: nil, human_examples: nil, input_data: nil, **_extras)
13
+ def evaluate(output, expected_output = nil, prompt = nil, criteria: nil, rubric_text: nil, input_data: nil, **_extras)
14
14
  raise CompletionKit::ConfigurationError, "Judge not configured" unless @judge_client.configured?
15
15
 
16
16
  judge_prompt = build_judge_prompt(output, expected_output, prompt,
17
17
  criteria: criteria,
18
- rubric_text: rubric_text, human_examples: human_examples,
18
+ rubric_text: rubric_text,
19
19
  input_data: input_data)
20
20
 
21
21
  response = @judge_client.generate_completion(judge_prompt, model: @judge_model)
@@ -25,7 +25,7 @@ module CompletionKit
25
25
 
26
26
  private
27
27
 
28
- def build_judge_prompt(output, expected_output, prompt, criteria: nil, rubric_text: nil, human_examples: nil, input_data: nil)
28
+ def build_judge_prompt(output, expected_output, prompt, criteria: nil, rubric_text: nil, input_data: nil)
29
29
  judge_prompt = <<~PROMPT
30
30
  You are an expert evaluator. You MUST respond with ONLY two lines in this exact format, nothing else:
31
31
 
@@ -42,13 +42,6 @@ module CompletionKit
42
42
  judge_prompt += "\nCriteria: #{criteria}\n"
43
43
  end
44
44
 
45
- if human_examples.present?
46
- judge_prompt += "\nCalibration examples:\n"
47
- human_examples.each_with_index do |example, index|
48
- judge_prompt += "Example #{index + 1}: score=#{example[:human_score]} output=#{example[:response_text].to_s.truncate(200)}\n"
49
- end
50
- end
51
-
52
45
  judge_prompt += <<~PROMPT
53
46
 
54
47
  Original prompt: #{prompt || "Not provided"}
@@ -16,7 +16,7 @@ module CompletionKit
16
16
  handler: :list
17
17
  },
18
18
  "metric_versions_publish" => {
19
- description: "Publish a MetricVersion as the live judge for its metric. Works for both 'draft → published' and 'revert to an older published version → current'. Transactionally flips current, demotes peers, and writes the version's instruction + rubric_bands back onto the metric so the judge actually uses them.",
19
+ description: "Publish a MetricVersion as the live version of its metric. Works for both 'draft → published' and 'revert to an older published version → current'. Transactionally flips current, demotes peers, and writes the version's instruction + rubric_bands back onto the metric so the judge grades against it.",
20
20
  inputSchema: {
21
21
  type: "object",
22
22
  properties: {
@@ -43,7 +43,6 @@ module CompletionKit
43
43
  def build_meta_prompt
44
44
  disagreements = MetricCalibrationExamples.disagreements_for(@metric)
45
45
  borderlines = MetricCalibrationExamples.borderlines_for(@metric)
46
- pinned_examples = Array(@metric.few_shot_examples)
47
46
  sections = []
48
47
  sections << "You are an expert evaluator. The judge below is misaligned with humans. Propose #{@count == 1 ? "a single" : "#{@count}"} concrete rewrite that closes the gap."
49
48
  sections << ""
@@ -78,18 +77,6 @@ module CompletionKit
78
77
  sections << ""
79
78
  end
80
79
  end
81
- if pinned_examples.any?
82
- sections << "## Pinned cases the judge already references"
83
- sections << "These are cases the operator pinned for the judge to remember. The improved rubric must remain consistent with these — that is, the new instruction + rubric should produce roughly the human_score on these inputs, not the judge_score."
84
- pinned_examples.each_with_index do |ex, i|
85
- sections << "### Pinned #{i + 1}"
86
- sections << "Input: #{ex["input"].to_s.truncate(200)}"
87
- sections << "Output: #{ex["response"].to_s.truncate(200)}"
88
- sections << "Judge previously said #{ex["judge_score"]}/5: #{ex["judge_feedback"].to_s.truncate(160)}"
89
- sections << "Human said #{ex["human_score"]}/5: #{ex["human_note"].to_s.truncate(160)}"
90
- sections << ""
91
- end
92
- end
93
80
  sections << "## Task"
94
81
  sections << "Make one substantive change. Don't just reword. If the disagreements look like instruction problems, rewrite the instruction. If they look like rubric problems (overlapping bands, undefined edge cases), rewrite the rubric. Rewrite both if both are wrong."
95
82
  sections << ""
@@ -27,7 +27,7 @@
27
27
  <label for="ck-tab-runs" class="ck-api-tabs__label">Runs <span class="ck-api-tabs__count">10</span></label>
28
28
  <label for="ck-tab-responses" class="ck-api-tabs__label">Responses <span class="ck-api-tabs__count">2</span></label>
29
29
  <label for="ck-tab-datasets" class="ck-api-tabs__label">Datasets <span class="ck-api-tabs__count">5</span></label>
30
- <label for="ck-tab-metrics" class="ck-api-tabs__label">Metrics <span class="ck-api-tabs__count">12</span></label>
30
+ <label for="ck-tab-metrics" class="ck-api-tabs__label">Metrics <span class="ck-api-tabs__count">10</span></label>
31
31
  <label for="ck-tab-metric-groups" class="ck-api-tabs__label">Metric Groups <span class="ck-api-tabs__count">5</span></label>
32
32
  <label for="ck-tab-calibrations" class="ck-api-tabs__label">Calibrations <span class="ck-api-tabs__count">3</span></label>
33
33
  <label for="ck-tab-tags" class="ck-api-tabs__label">Tags <span class="ck-api-tabs__count">5</span></label>
@@ -239,7 +239,7 @@
239
239
 
240
240
  <div class="ck-api-endpoint" style="padding-top: 1.5rem;">
241
241
  <p class="ck-kicker" style="margin-bottom: 0.5rem;">Calibration loop</p>
242
- <p class="ck-meta-copy">Drive metric improvement from disagree-flagged calibrations: ask the model for variants, then pin individual cases as few-shot examples on the metric.</p>
242
+ <p class="ck-meta-copy">Drive metric improvement from disagree-flagged calibrations: ask the model to rewrite the instruction and rubric into a new draft version.</p>
243
243
  </div>
244
244
  <div class="ck-api-endpoint">
245
245
  <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/metrics/:id/suggest_variants</p>
@@ -247,16 +247,6 @@
247
247
  <p class="ck-api-params"><strong>Optional:</strong>&ensp;<code>count</code>, <code>model</code></p>
248
248
  <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/metrics/1/suggest_variants \\\n -H \"Authorization: Bearer #{token}\"" %>
249
249
  </div>
250
- <div class="ck-api-endpoint">
251
- <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/metrics/:id/add_few_shot</p>
252
- <p class="ck-meta-copy">Pin a disagree calibration as a few-shot example on the metric. Returns the updated metric, or 404 if the calibration is not a disagree on this metric.</p>
253
- <p class="ck-api-params"><strong>Required:</strong>&ensp;<code>calibration_id</code></p>
254
- </div>
255
- <div class="ck-api-endpoint">
256
- <p class="ck-api-method"><span class="ck-chip" style="color: var(--ck-danger);">DELETE</span> /api/v1/metrics/:id/remove_few_shot</p>
257
- <p class="ck-meta-copy">Drop the pinned few-shot example by calibration ID.</p>
258
- <p class="ck-api-params"><strong>Required:</strong>&ensp;<code>calibration_id</code></p>
259
- </div>
260
250
 
261
251
  <div class="ck-api-endpoint" style="padding-top: 1.5rem;">
262
252
  <p class="ck-kicker" style="margin-bottom: 0.5rem;">Metric versions</p>
@@ -1,9 +1,9 @@
1
1
  <% stats = local_assigns[:stats] %>
2
2
  <% metric = local_assigns[:metric] %>
3
3
  <% anchor = metric&.name&.parameterize %>
4
+ <% current_metric_version = metric && CompletionKit::MetricVersion.current.find_by(metric_id: metric.id) %>
4
5
  <% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric
5
6
  created_by = CompletionKit.config.username.presence || "operator"
6
- current_metric_version = CompletionKit::MetricVersion.current.find_by(metric_id: metric.id)
7
7
  verdicted_ids = if current_metric_version
8
8
  CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
9
9
  else
@@ -22,19 +22,26 @@
22
22
  end %>
23
23
 
24
24
  <p class="ck-trust-line ck-trust-line--<%= stats.gate %>">
25
- <span class="ck-trust-line__label"><%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true" %>Calibration</span>
26
25
  <% if stats.sample_size.zero? %>
27
- <span class="ck-trust-line__state">Not measured yet.</span>
28
- <span class="ck-trust-line__hint">Needs <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts on the judge's scores.<% if prior_version_verdicts > 0 %> (<%= pluralize(prior_version_verdicts, "verdict") %> on prior versions, tied to that version's history.)<% end %><% if target_response %>
29
- <%= link_to "Give a verdict →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-trust-line__link" %>
30
- <% else %> Run this metric on a dataset, then give a verdict.<% end %></span>
26
+ <span class="ck-trust-line__lead">Not measured yet.</span>
27
+ <span class="ck-trust-line__hint">Needs <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> human reviews of the judge's scores.<% if prior_version_verdicts > 0 %> (<%= pluralize(prior_version_verdicts, "earlier-version review") %> kept on file.)<% end %></span>
28
+ <% if target_response %>
29
+ <%= link_to "Review a judge's score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
30
+ <% end %>
31
31
  <% elsif stats.counter_only? %>
32
- <span class="ck-trust-line__counter"><%= stats.sample_size %>/<%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></span>
33
- <span class="ck-trust-line__hint"><%= pluralize(stats.sample_size, "verdict") %><% if stats.short_to_target > 0 %> · <%= stats.short_to_target %> more before this can be measured<% end %><% if target_response %> · <%= link_to "Give another verdict →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-trust-line__link" %><% end %></span>
32
+ <span class="ck-cal-stat"><span class="ck-cal-stat__label">Human reviews</span> <strong><%= stats.sample_size %> / <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></strong></span>
33
+ <% if stats.short_to_target > 0 %><span class="ck-trust-line__hint"><%= stats.short_to_target %> more to report a rate</span><% end %>
34
+ <% if target_response %>
35
+ <%= link_to "Review another score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
36
+ <% end %>
34
37
  <% else %>
35
- <span class="ck-trust-line__score" title="Roughly how often the judge and the humans agreed.">~<%= (stats.agreement_point * 100).round %>%</span>
36
- <span class="ck-trust-line__margin" title="The range we're confident the true rate sits in.">±<%= (stats.margin * 100).round %> pt</span>
37
- <span class="ck-trust-line__gate" title="<%= stats.firm? ? 'Enough verdicts for a reliable read.' : 'Early read. Keep giving verdicts.' %>"><%= stats.firm? ? "settled" : "early" %></span>
38
- <span class="ck-trust-line__hint"><%= pluralize(stats.sample_size, "verdict") %><% if stats.borderline_rate && stats.borderline_rate > 0 %><% level = stats.borderline_rate > 0.30 ? "danger" : stats.borderline_rate > 0.15 ? "warning" : "ok" %> · <span class="ck-trust-line__borderline ck-trust-line__borderline--<%= level %>" title="<%= level == 'ok' ? '' : 'Reviewers said the rubric was unclear here.' %>"><%= (stats.borderline_rate * 100).round %>% unclear</span><% end %></span>
38
+ <span class="ck-cal-stat"><span class="ck-cal-stat__label">Agreement</span> <strong class="ck-trust-line__figure">~<%= (stats.agreement_point * 100).round %>%</strong></span>
39
+ <span class="ck-cal-stat"><span class="ck-cal-stat__label">Margin</span> ±<%= (stats.margin * 100).round %> pt</span>
40
+ <span class="ck-cal-stat"><span class="ck-cal-stat__label">Read</span> <%= stats.firm? ? "settled" : "early" %></span>
41
+ <span class="ck-cal-stat"><span class="ck-cal-stat__label">Sample</span> <%= stats.sample_size %></span>
42
+ <% if stats.borderline_rate && stats.borderline_rate > 0 %>
43
+ <% level = stats.borderline_rate > 0.30 ? "danger" : stats.borderline_rate > 0.15 ? "warning" : "ok" %>
44
+ <span class="ck-cal-stat"><span class="ck-cal-stat__label">Unclear</span> <span class="ck-trust-line__borderline ck-trust-line__borderline--<%= level %>"><%= (stats.borderline_rate * 100).round %>%</span></span>
45
+ <% end %>
39
46
  <% end %>
40
47
  </p>
@@ -40,20 +40,19 @@
40
40
  <% if suggestion %>
41
41
  <div class="ck-suggestion-banner" role="status">
42
42
  <div class="ck-suggestion-banner__body">
43
- <p class="ck-kicker">Proposed improvements</p>
44
- <p class="ck-meta-copy">Based on your disagreements, the model proposed these changes to the instruction and rubric. Apply pieces inline below, take everything at once, try again, or discard.</p>
43
+ <p class="ck-kicker ck-kicker--icon"><%= heroicon_tag "sparkles", variant: :outline, class: "ck-magic-icon", "aria-hidden": "true" %>Proposed changes</p>
44
+ <p class="ck-meta-copy">Based on human reviews, here are some proposed changes to the metric.</p>
45
45
  </div>
46
46
  <div class="ck-suggestion-banner__actions">
47
- <%= button_to "Try again", suggest_variants_metric_path(metric, back_to: "edit"),
48
- method: :post, form_class: "inline-block",
49
- class: ck_button_classes(:light, variant: :outline),
50
- title: "Discard these improvements and ask the model for fresh ones.",
51
- data: { turbo_confirm: "Replace these improvements with fresh ones from the model?" } %>
52
- <%= button_to "Discard", dismiss_suggestion_metric_path(metric, draft_id: suggestion.id, back_to: "edit"),
53
- method: :delete, form_class: "inline-block",
54
- class: ck_button_classes(:light, variant: :outline),
55
- data: { turbo_confirm: "Drop these improvements?" } %>
56
- <%= button_to "Take everything", publish_draft_metric_path(metric, draft_id: suggestion.id),
47
+ <%= button_to suggest_variants_metric_path(metric, back_to: "edit"),
48
+ method: :post, form_class: "inline-block", class: "ck-icon-btn",
49
+ title: "Try again", "aria-label": "Try again",
50
+ data: { turbo_confirm: "Replace these changes with fresh ones from the model?" } do %><%= heroicon_tag "arrow-path", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
51
+ <%= button_to dismiss_suggestion_metric_path(metric, draft_id: suggestion.id, back_to: "edit"),
52
+ method: :delete, form_class: "inline-block", class: "ck-icon-btn",
53
+ title: "Discard these changes", "aria-label": "Discard",
54
+ data: { turbo_confirm: "Drop these changes?" } do %><%= heroicon_tag "trash", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
55
+ <%= button_to "Apply all", publish_draft_metric_path(metric, draft_id: suggestion.id),
57
56
  method: :post, form_class: "inline-block",
58
57
  class: ck_button_classes(:dark) %>
59
58
  </div>
@@ -10,6 +10,24 @@
10
10
  </div>
11
11
  </section>
12
12
 
13
+ <% if CompletionKit.config.judge_calibration_enabled && @suggestion_draft.nil? && @edit_draft.nil? && @improve_disagreement_count.to_i.positive? %>
14
+ <div class="ck-suggestion-banner" role="status">
15
+ <div class="ck-suggestion-banner__body">
16
+ <p class="ck-kicker">Improve from reviews</p>
17
+ <p class="ck-meta-copy">Based on human reviews, the model can propose changes to this metric.</p>
18
+ </div>
19
+ <div class="ck-suggestion-banner__actions">
20
+ <%= button_to suggest_variants_metric_path(@metric, back_to: "edit"),
21
+ method: :post, form_class: "inline-block",
22
+ class: ck_button_classes(:light, variant: :outline) + " ck-button--sm",
23
+ data: { turbo_confirm: "Draft improvements to this metric from your human reviews? You can edit or apply them here before publishing." } do %>
24
+ <%= heroicon_tag "sparkles", variant: :outline, class: "ck-magic-icon", "aria-hidden": "true" %>
25
+ Suggest improvements
26
+ <% end %>
27
+ </div>
28
+ </div>
29
+ <% end %>
30
+
13
31
  <%= render "form",
14
32
  metric: @metric,
15
33
  suggestion_draft: @suggestion_draft,
@@ -28,23 +28,6 @@
28
28
  <tr onclick="window.location='<%= metric_path(metric) %>'" style="cursor: pointer;">
29
29
  <td>
30
30
  <%= link_to metric_path(metric), class: "ck-record-name" do %><strong><%= metric.name %></strong><% end %>
31
- <% if CompletionKit.config.judge_calibration_enabled %>
32
- <% s = CompletionKit::MetricCalibrationStats.for(metric) %>
33
- <p class="ck-metrics-table__trust" title="Calibration: how often this metric's scores match the humans who reviewed them.">
34
- <%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true", class: "ck-trust-icon" %>
35
- <span class="ck-metrics-table__trust-label">Calibration</span>
36
- <% if s.counter_only? %>
37
- <% if s.sample_size.zero? %>
38
- <span class="ck-metrics-table__trust-state">Not measured yet</span>
39
- <% else %>
40
- <%= s.sample_size %> / <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts
41
- <% end %>
42
- <% else %>
43
- <span class="ck-metrics-table__trust-rate" title="<%= s.firm? ? 'Settled read.' : 'Early read. Keep giving verdicts.' %>">~<%= (s.agreement_point * 100).round %>%</span>
44
- ±<%= (s.margin * 100).round %> pt · <%= s.firm? ? "settled" : "early" %>
45
- <% end %>
46
- </p>
47
- <% end %>
48
31
  <% if metric.tags.any? %>
49
32
  <div class="tag-marks-row">
50
33
  <%= render "completion_kit/tags/marks", tags: metric.tags %>