completion-kit 0.5.42 → 0.5.43

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. checksums.yaml +4 -4
  2. data/app/assets/javascripts/completion_kit/application.js +17 -0
  3. data/app/assets/stylesheets/completion_kit/application.css +505 -39
  4. data/app/controllers/completion_kit/metrics_controller.rb +35 -24
  5. data/app/jobs/completion_kit/judge_review_job.rb +11 -0
  6. data/app/models/completion_kit/judge_version.rb +32 -1
  7. data/app/services/completion_kit/judge_variant_generator.rb +8 -6
  8. data/app/services/completion_kit/metric_calibration_stats.rb +16 -4
  9. data/app/views/completion_kit/api_reference/_body.html.erb +1 -1
  10. data/app/views/completion_kit/calibrations/_buttons.html.erb +43 -6
  11. data/app/views/completion_kit/calibrations/_trust_panel.html.erb +27 -28
  12. data/app/views/completion_kit/metrics/_form.html.erb +90 -4
  13. data/app/views/completion_kit/metrics/_rubric_diff.html.erb +25 -0
  14. data/app/views/completion_kit/metrics/_rubric_hint.html.erb +4 -0
  15. data/app/views/completion_kit/metrics/_starter_card.html.erb +13 -9
  16. data/app/views/completion_kit/metrics/edit.html.erb +5 -1
  17. data/app/views/completion_kit/metrics/index.html.erb +5 -3
  18. data/app/views/completion_kit/metrics/show.html.erb +132 -126
  19. data/app/views/completion_kit/metrics/starter_preview.html.erb +6 -6
  20. data/app/views/completion_kit/responses/show.html.erb +1 -1
  21. data/app/views/completion_kit/runs/_status_panel.html.erb +2 -2
  22. data/config/routes.rb +2 -1
  23. data/db/migrate/20260525000001_add_version_number_and_published_at_to_judge_versions.rb +24 -0
  24. data/lib/completion_kit/version.rb +1 -1
  25. metadata +4 -1
@@ -1,7 +1,7 @@
1
1
  module CompletionKit
2
2
  class MetricsController < ApplicationController
3
3
  include CompletionKit::TagFiltering
4
- before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :publish_draft, :suggest_variants, :dismiss_suggestion]
4
+ before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :remove_few_shot, :publish_draft, :suggest_variants, :dismiss_suggestion]
5
5
 
6
6
  def index
7
7
  @metrics = apply_tag_filter(Metric.includes(:metric_groups, :tags).order(:name))
@@ -35,14 +35,16 @@ module CompletionKit
35
35
  end
36
36
 
37
37
  def show
38
+ @published_judge_version = JudgeVersion.ensure_current_for(@metric)
38
39
  @disagreements = Calibration.where(metric_id: @metric.id, verdict: "disagree")
39
- .includes(response: [:reviews, :run])
40
+ .includes(:judge_version, response: [:reviews, :run])
40
41
  .order(created_at: :desc)
41
42
  .limit(50)
42
43
  @edit_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
43
- @published_judge_version = JudgeVersion.published.where(metric_id: @metric.id, current: true).first
44
44
  @suggestion_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
45
- @improve_disagreement_count = @disagreements.size
45
+ @improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree",
46
+ judge_version_id: @published_judge_version.id).count
47
+ @versions = JudgeVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
46
48
  end
47
49
 
48
50
  def new
@@ -50,6 +52,9 @@ module CompletionKit
50
52
  end
51
53
 
52
54
  def edit
55
+ @suggestion_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
56
+ @edit_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
57
+ @published_judge_version = JudgeVersion.published.where(metric_id: @metric.id, current: true).first
53
58
  end
54
59
 
55
60
  def create
@@ -76,9 +81,10 @@ module CompletionKit
76
81
  end
77
82
 
78
83
  def suggest_variants
84
+ target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
79
85
  disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
80
86
  if disagreement_count.zero?
81
- redirect_to metric_path(@metric), alert: "Mark at least one row as Disagree before asking the model to suggest a change."
87
+ redirect_to target, alert: "Mark at least one row as Disagree before asking the model to suggest a change."
82
88
  return
83
89
  end
84
90
 
@@ -87,38 +93,36 @@ module CompletionKit
87
93
  generator = JudgeVariantGenerator.new(@metric, count: 1)
88
94
  variants = generator.call
89
95
  if variants.empty?
90
- redirect_to metric_path(@metric), alert: "The model returned no usable variants. Try again with a different model."
96
+ redirect_to target, alert: "The model returned no usable variants. Try again with a different model."
91
97
  return
92
98
  end
93
99
  generator.persist!(variants)
94
- redirect_to metric_path(@metric), notice: "Drafted a new version. Review it below."
100
+ redirect_to target, notice: "Drafted a new version. Review it below."
95
101
  end
96
102
 
97
103
  def dismiss_suggestion
98
- draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").find_by(id: params[:draft_id])
104
+ draft = JudgeVersion.drafts.where(metric_id: @metric.id).find_by(id: params[:draft_id])
99
105
  draft&.destroy
100
- redirect_to metric_path(@metric), notice: "Dismissed."
106
+ target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
107
+ redirect_to target, notice: "Dismissed."
101
108
  end
102
109
 
103
110
  def publish_draft
104
- scope = JudgeVersion.drafts.where(metric_id: @metric.id)
105
- draft = params[:draft_id].present? ? scope.find_by(id: params[:draft_id]) : scope.order(created_at: :desc).first
106
-
107
- if draft.nil?
108
- redirect_to metric_path(@metric), alert: "No draft to publish."
111
+ scope = JudgeVersion.where(metric_id: @metric.id)
112
+ version = if params[:draft_id].present?
113
+ scope.find_by(id: params[:draft_id])
114
+ else
115
+ JudgeVersion.drafts.where(metric_id: @metric.id).order(created_at: :desc).first
116
+ end
117
+
118
+ if version.nil?
119
+ redirect_to metric_path(@metric), alert: "No version to publish."
109
120
  return
110
121
  end
111
122
 
112
- JudgeVersion.transaction do
113
- JudgeVersion.where(metric_id: @metric.id, state: "published").update_all(current: false)
114
- draft.update!(state: "published", current: true)
115
- @metric.update_columns(
116
- instruction: draft.instruction,
117
- rubric_bands: Array(draft.rubric_bands).to_json
118
- )
119
- end
120
-
121
- redirect_to metric_path(@metric), notice: "This judge version is now live."
123
+ version.publish!
124
+ redirect_to metric_path(@metric),
125
+ notice: "#{@metric.name} #{version.version_label} is now the published version."
122
126
  end
123
127
 
124
128
  def add_few_shot
@@ -139,6 +143,13 @@ module CompletionKit
139
143
  redirect_to metric_path(@metric), notice: "Got it. The judge will remember this next time it grades."
140
144
  end
141
145
 
146
+ def remove_few_shot
147
+ cal_id = params[:calibration_id].to_i
148
+ remaining = Array(@metric.few_shot_examples).reject { |fs| fs["calibration_id"].to_i == cal_id }
149
+ @metric.update!(few_shot_examples: remaining)
150
+ redirect_to metric_path(@metric), notice: "Forgotten. The judge won't see this case next time."
151
+ end
152
+
142
153
  private
143
154
 
144
155
  def set_metric
@@ -57,6 +57,7 @@ module CompletionKit
57
57
  run.prompt&.template,
58
58
  criteria: metric.instruction.to_s,
59
59
  rubric_text: metric.display_rubric_text,
60
+ human_examples: few_shot_payload(metric),
60
61
  input_data: response.input_data
61
62
  )
62
63
 
@@ -119,5 +120,15 @@ module CompletionKit
119
120
  response = Response.find_by(id: response_id)
120
121
  RunCompletionCheckJob.perform_later(response.run_id) if response
121
122
  end
123
+
124
+ def few_shot_payload(metric)
125
+ Array(metric.few_shot_examples).map do |fs|
126
+ {
127
+ human_score: fs["human_score"],
128
+ response_text: fs["response"].to_s,
129
+ human_note: fs["human_note"].to_s
130
+ }
131
+ end
132
+ end
122
133
  end
123
134
  end
@@ -7,8 +7,11 @@ module CompletionKit
7
7
 
8
8
  serialize :rubric_bands, coder: JSON
9
9
 
10
+ before_validation :assign_version_number, on: :create
11
+
10
12
  validates :metric_id, presence: true
11
13
  validates :state, inclusion: { in: STATES }
14
+ validates :version_number, presence: true, uniqueness: { scope: :metric_id }
12
15
 
13
16
  scope :current, -> { where(current: true) }
14
17
  scope :published, -> { where(state: "published") }
@@ -20,7 +23,8 @@ module CompletionKit
20
23
  instruction: metric.instruction,
21
24
  rubric_bands: metric.rubric_bands,
22
25
  current: true,
23
- state: "published"
26
+ state: "published",
27
+ published_at: Time.current
24
28
  )
25
29
  end
26
30
 
@@ -32,17 +36,44 @@ module CompletionKit
32
36
  state == "published"
33
37
  end
34
38
 
39
+ def version_label
40
+ "v#{version_number}"
41
+ end
42
+
43
+ def publish!
44
+ JudgeVersion.transaction do
45
+ self.class.where(metric_id: metric_id).where.not(id: id).update_all(current: false)
46
+ reload
47
+ update!(state: "published", current: true, published_at: published_at || Time.current)
48
+ metric.update_columns(
49
+ instruction: instruction,
50
+ rubric_bands: Array(rubric_bands).to_json
51
+ )
52
+ end
53
+ self
54
+ end
55
+
35
56
  def as_json(options = {})
36
57
  {
37
58
  id: id,
38
59
  metric_id: metric_id,
60
+ version_number: version_number,
39
61
  instruction: instruction,
40
62
  rubric_bands: rubric_bands,
41
63
  current: current,
42
64
  state: state,
43
65
  source: source,
66
+ published_at: published_at,
44
67
  created_at: created_at
45
68
  }
46
69
  end
70
+
71
+ private
72
+
73
+ def assign_version_number
74
+ return if version_number.present?
75
+ max = self.class.where(metric_id: metric_id).maximum(:version_number).to_i
76
+ self.version_number = max + 1
77
+ end
47
78
  end
48
79
  end
@@ -86,7 +86,7 @@ module CompletionKit
86
86
  sections << "REASONING: <one short sentence: what changes and why>"
87
87
  sections << "INSTRUCTION:"
88
88
  sections << "<the rewritten instruction>"
89
- sections << "RUBRIC: # optional omit this block if the rubric is unchanged"
89
+ sections << "RUBRIC: # optional. Omit this block if the rubric is unchanged."
90
90
  sections << "5: <description for 5 stars>"
91
91
  sections << "4: <description for 4 stars>"
92
92
  sections << "3: <description for 3 stars>"
@@ -133,11 +133,13 @@ module CompletionKit
133
133
  end
134
134
 
135
135
  def calibrations_for(metric, verdict:, limit:)
136
- Calibration.where(metric_id: metric.id, verdict: verdict)
137
- .includes(response: :reviews)
138
- .order(created_at: :desc)
139
- .limit(limit)
140
- .map do |cal|
136
+ scope = Calibration.where(metric_id: metric.id, verdict: verdict)
137
+ current_version = JudgeVersion.current.find_by(metric_id: metric.id)
138
+ scope = scope.where(judge_version_id: current_version.id) if current_version
139
+ scope.includes(response: :reviews)
140
+ .order(created_at: :desc)
141
+ .limit(limit)
142
+ .map do |cal|
141
143
  review = cal.response.reviews.find { |r| r.metric_id == metric.id }
142
144
  {
143
145
  input: cal.response.input_data,
@@ -31,18 +31,30 @@ module CompletionKit
31
31
  end
32
32
  end
33
33
 
34
- def self.for(metric, judge_version: nil)
35
- new(metric: metric, judge_version: judge_version).call
34
+ CURRENT = :current
35
+
36
+ def self.for(metric, judge_version: CURRENT)
37
+ resolved = case judge_version
38
+ when CURRENT then JudgeVersion.current.find_by(metric_id: metric.id)
39
+ when nil then nil
40
+ else judge_version
41
+ end
42
+ new(metric: metric, judge_version: resolved, all_versions: judge_version.nil?).call
36
43
  end
37
44
 
38
- def initialize(metric:, judge_version: nil)
45
+ def initialize(metric:, judge_version: nil, all_versions: false)
39
46
  @metric = metric
40
47
  @judge_version = judge_version
48
+ @all_versions = all_versions
41
49
  end
42
50
 
43
51
  def call
44
52
  scope = Calibration.where(metric_id: @metric.id)
45
- scope = scope.where(judge_version_id: @judge_version.id) if @judge_version
53
+ if @judge_version
54
+ scope = scope.where(judge_version_id: @judge_version.id)
55
+ elsif !@all_versions
56
+ scope = scope.none
57
+ end
46
58
 
47
59
  verdicts = scope.pluck(:verdict, :corrected_score, :response_id)
48
60
  n = verdicts.length
@@ -187,7 +187,7 @@
187
187
  </div>
188
188
  <%= render "completion_kit/api_reference/resource_list", title: "Your datasets",
189
189
  items: datasets.map { |d|
190
- { name: d.name, meta: pluralize([d.csv_data.to_s.lines.count - 1, 0].max, "row"),
190
+ { name: d.name, meta: pluralize([d.csv_data.to_s.lines.count - 1, 0].max, "entry"),
191
191
  url: "#{base_url}/api/v1/datasets/#{d.id}", dom_id: "dataset_ep_#{d.id}" }
192
192
  } %>
193
193
  </div>
@@ -3,14 +3,19 @@
3
3
  <% pending_verdict = local_assigns[:pending_verdict] %>
4
4
  <% active_verdict = pending_verdict || current_verdict %>
5
5
  <% error = local_assigns[:error] %>
6
- <% verdict_count = CompletionKit::Calibration.where(response_id: response_row.id, metric_id: metric.id).count %>
6
+ <% me = CompletionKit.config.username.presence || "operator" %>
7
+ <% other_calibrations = CompletionKit::Calibration
8
+ .where(response_id: response_row.id, metric_id: metric.id)
9
+ .where.not(created_by: me)
10
+ .order(created_at: :asc).to_a %>
11
+ <% verdict_icons = { "agree" => "hand-thumb-up", "disagree" => "hand-thumb-down", "borderline" => "scale" } %>
7
12
  <p class="ck-calibration__prompt">
8
13
  <span class="ck-calibration__label">Your verdict</span>
9
- <% if verdict_count > 0 %>
10
- <span class="ck-calibration__count"><%= pluralize(verdict_count, "verdict") %> on this score · <%= link_to "trust level →", metric_path(metric), class: "ck-link" %></span>
11
- <% else %>
12
- <span class="ck-calibration__hint">Tell us what you think — was the score right? Verdicts roll up into the metric's <%= link_to "trust level", metric_path(metric), class: "ck-link" %>.</span>
14
+ <% if other_calibrations.any? %>
15
+ <span class="ck-calibration__meta"><%= pluralize(other_calibrations.size, "other verdict") %> on this score</span>
16
+ <span class="ck-calibration__sep">·</span>
13
17
  <% end %>
18
+ <%= link_to metric_path(metric), class: "ck-calibration__meta-link" do %><%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true" %>Calibration →<% end %>
14
19
  </p>
15
20
  <div class="ck-calibration__buttons">
16
21
  <% verdict_icons = { "agree" => "hand-thumb-up", "disagree" => "hand-thumb-down", "borderline" => "scale" } %>
@@ -36,6 +41,38 @@
36
41
  <p class="ck-calibration__error" role="alert"><%= error %></p>
37
42
  <% end %>
38
43
 
44
+ <% if other_calibrations.any? %>
45
+ <details class="ck-calibration__others">
46
+ <summary class="ck-calibration__others-summary">
47
+ <%= heroicon_tag "chevron-right", variant: :outline, size: 14, "aria-hidden": "true" %>
48
+ <span>What others said (<%= other_calibrations.size %>)</span>
49
+ </summary>
50
+ <ul class="ck-calibration__others-list">
51
+ <% other_calibrations.each do |other| %>
52
+ <li class="ck-calibration__others-item ck-calibration__others-item--<%= other.verdict %>">
53
+ <div class="ck-calibration__others-row">
54
+ <span class="ck-calibration__others-verdict">
55
+ <%= heroicon_tag verdict_icons[other.verdict], variant: :outline, size: 14, "aria-hidden": "true" %>
56
+ <%= other.verdict %>
57
+ </span>
58
+ <span class="ck-calibration__others-by"><%= other.created_by %></span>
59
+ <% if other.corrected_score %>
60
+ <span class="ck-calibration__others-stars" aria-label="<%= pluralize(other.corrected_score.to_i, 'star') %>" title="<%= pluralize(other.corrected_score.to_i, 'star') %>">
61
+ <% 5.times do |i| %>
62
+ <svg viewBox="0 0 24 24" width="12" height="12" stroke-width="1.75" class="ck-star <%= i < other.corrected_score.to_i ? "ck-star--filled" : "ck-star--empty" %>" aria-hidden="true"><polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2"/></svg>
63
+ <% end %>
64
+ </span>
65
+ <% end %>
66
+ </div>
67
+ <% if other.note.to_s.present? %>
68
+ <p class="ck-calibration__others-note">"<%= other.note.to_s.truncate(140) %>"</p>
69
+ <% end %>
70
+ </li>
71
+ <% end %>
72
+ </ul>
73
+ </details>
74
+ <% end %>
75
+
39
76
  <% if active_verdict == "disagree" %>
40
77
  <% existing_score = (calibration&.corrected_score || review&.ai_score)&.round %>
41
78
  <%= form_with url: run_response_calibrations_path(run, response_row),
@@ -51,7 +88,7 @@
51
88
  <% radio_id = "ck-star-#{response_row.id}-#{metric.id}-#{n}" %>
52
89
  <input type="radio" name="corrected_score" id="<%= radio_id %>" value="<%= n %>" <%= "checked" if existing_score == n %> required>
53
90
  <label for="<%= radio_id %>" title="<%= pluralize(n, 'star') %>" aria-label="<%= pluralize(n, 'star') %>">
54
- <svg viewBox="0 0 24 24" width="28" height="28" stroke-width="1.5" aria-hidden="true"><polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2"/></svg>
91
+ <svg viewBox="0 0 24 24" stroke-width="1.5" aria-hidden="true"><polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2"/></svg>
55
92
  </label>
56
93
  <% end %>
57
94
  </div>
@@ -1,31 +1,30 @@
1
1
  <% stats = local_assigns[:stats] %>
2
- <div class="ck-trust-panel ck-trust-panel--<%= stats.gate %>">
3
- <p class="ck-trust-panel__label" title="How often this metric's scores match the humans who reviewed them.">Trust level</p>
4
- <% if stats.counter_only? %>
5
- <div class="ck-trust-panel__body">
6
- <span class="ck-trust-panel__counter"><%= stats.sample_size %><span class="ck-trust-panel__counter-of">/ <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></span></span>
7
- <span class="ck-trust-panel__hint">verdicts so far<% if stats.short_to_target > 0 %> · <%= stats.short_to_target %> more before we can score the judge<% end %></span>
8
- </div>
2
+ <% metric = local_assigns[:metric] %>
3
+ <% anchor = metric&.name&.parameterize %>
4
+ <% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric
5
+ created_by = CompletionKit.config.username.presence || "operator"
6
+ verdicted_ids = CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by).pluck(:response_id)
7
+ CompletionKit::Response.joins(:reviews)
8
+ .where(reviews: { metric_id: metric.id })
9
+ .where.not(reviews: { ai_score: nil })
10
+ .where.not(id: verdicted_ids)
11
+ .order(created_at: :desc).first
12
+ end %>
13
+
14
+ <p class="ck-trust-line ck-trust-line--<%= stats.gate %>">
15
+ <span class="ck-trust-line__label"><%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true" %>Calibration</span>
16
+ <% if stats.sample_size.zero? %>
17
+ <span class="ck-trust-line__state">Not measured yet.</span>
18
+ <span class="ck-trust-line__hint">Needs <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts on the judge's scores.<% if target_response %>
19
+ <%= link_to "Give a verdict →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-trust-line__link" %>
20
+ <% else %> Run this metric on a dataset, then give a verdict.<% end %></span>
21
+ <% elsif stats.counter_only? %>
22
+ <span class="ck-trust-line__counter"><%= stats.sample_size %>/<%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></span>
23
+ <span class="ck-trust-line__hint"><%= pluralize(stats.sample_size, "verdict") %><% if stats.short_to_target > 0 %> · <%= stats.short_to_target %> more before this can be measured<% end %><% if target_response %> · <%= link_to "Give another verdict →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-trust-line__link" %><% end %></span>
9
24
  <% else %>
10
- <div class="ck-trust-panel__body">
11
- <span class="ck-trust-panel__score"
12
- title="Roughly how often the judge and the humans agreed.">~<%= (stats.agreement_point * 100).round %><span class="ck-trust-panel__score-pct">%</span></span>
13
- <span class="ck-trust-panel__margin"
14
- title="The range we're confident the true rate sits in, given how few verdicts we have so far.">±<%= (stats.margin * 100).round %> pt</span>
15
- <span class="ck-trust-panel__gate"
16
- title="<%= stats.firm? ? 'Enough verdicts for a reliable read.' : 'Early read. Keep giving verdicts to tighten the margin.' %>"><%= stats.firm? ? "settled" : "early read" %></span>
17
- </div>
18
- <div class="ck-trust-panel__details">
19
- <span><%= pluralize(stats.sample_size, "verdict") %></span>
20
- <% if stats.borderline_rate && stats.borderline_rate > 0 %>
21
- <% level = if stats.borderline_rate > 0.30 then "danger"
22
- elsif stats.borderline_rate > 0.15 then "warning"
23
- else "ok" end %>
24
- <span class="ck-trust-panel__borderline ck-trust-panel__borderline--<%= level %>"
25
- title="<%= level == 'ok' ? 'Some reviewers said the rubric was unclear here.' : 'A lot of reviewers say the rubric is unclear here. Consider splitting the metric or rewriting the rubric.' %>">
26
- <%= (stats.borderline_rate * 100).round %>% said "unclear"
27
- </span>
28
- <% end %>
29
- </div>
25
+ <span class="ck-trust-line__score" title="Roughly how often the judge and the humans agreed.">~<%= (stats.agreement_point * 100).round %>%</span>
26
+ <span class="ck-trust-line__margin" title="The range we're confident the true rate sits in.">±<%= (stats.margin * 100).round %> pt</span>
27
+ <span class="ck-trust-line__gate" title="<%= stats.firm? ? 'Enough verdicts for a reliable read.' : 'Early read. Keep giving verdicts.' %>"><%= stats.firm? ? "settled" : "early" %></span>
28
+ <span class="ck-trust-line__hint"><%= pluralize(stats.sample_size, "verdict") %><% if stats.borderline_rate && stats.borderline_rate > 0 %><% level = stats.borderline_rate > 0.30 ? "danger" : stats.borderline_rate > 0.15 ? "warning" : "ok" %> · <span class="ck-trust-line__borderline ck-trust-line__borderline--<%= level %>" title="<%= level == 'ok' ? '' : 'Reviewers said the rubric was unclear here.' %>"><%= (stats.borderline_rate * 100).round %>% unclear</span><% end %></span>
30
29
  <% end %>
31
- </div>
30
+ </p>
@@ -1,3 +1,9 @@
1
+ <% suggestion = local_assigns[:suggestion_draft] %>
2
+ <% edit_draft = local_assigns[:edit_draft] %>
3
+ <% suggestion_bands = suggestion ? Array(suggestion.rubric_bands).each_with_object({}) { |b, h| h[b["stars"].to_i] = b["description"].to_s } : {} %>
4
+ <% suggested_instruction = suggestion&.instruction.to_s %>
5
+ <% instruction_changed = suggestion && suggested_instruction.present? && suggested_instruction != metric.instruction.to_s %>
6
+
1
7
  <%= form_with(model: metric, local: true) do |form| %>
2
8
  <% if metric.errors.any? %>
3
9
  <div class="ck-flash ck-flash--alert" role="alert">
@@ -10,6 +16,50 @@
10
16
  </div>
11
17
  <% end %>
12
18
 
19
+ <% if edit_draft && !suggestion %>
20
+ <% pub = local_assigns[:published_judge_version] %>
21
+ <% draft_instr_changed = pub && pub.instruction.to_s != edit_draft.instruction.to_s %>
22
+ <% draft_rubric_changed = pub && pub.rubric_bands != edit_draft.rubric_bands %>
23
+ <div class="ck-suggestion-banner" role="status">
24
+ <div class="ck-suggestion-banner__body">
25
+ <p class="ck-kicker">Draft pending</p>
26
+ <p class="ck-meta-copy">An unpublished draft of this metric is saved. Publish to replace the live<%= " instruction" if draft_instr_changed %><%= " and" if draft_instr_changed && draft_rubric_changed %><%= " rubric" if draft_rubric_changed %> for future runs, or keep editing.</p>
27
+ </div>
28
+ <div class="ck-suggestion-banner__actions">
29
+ <%= button_to "Discard draft", dismiss_suggestion_metric_path(metric, draft_id: edit_draft.id, back_to: "edit"),
30
+ method: :delete, form_class: "inline-block",
31
+ class: ck_button_classes(:light, variant: :outline),
32
+ data: { turbo_confirm: "Drop this draft?" } %>
33
+ <%= button_to "Publish this version", publish_draft_metric_path(metric, draft_id: edit_draft.id),
34
+ method: :post, form_class: "inline-block",
35
+ class: ck_button_classes(:dark) %>
36
+ </div>
37
+ </div>
38
+ <% end %>
39
+
40
+ <% if suggestion %>
41
+ <div class="ck-suggestion-banner" role="status">
42
+ <div class="ck-suggestion-banner__body">
43
+ <p class="ck-kicker">Proposed improvements</p>
44
+ <p class="ck-meta-copy">Based on your disagreements, the model proposed these changes to the instruction and rubric. Apply pieces inline below, take everything at once, try again, or discard.</p>
45
+ </div>
46
+ <div class="ck-suggestion-banner__actions">
47
+ <%= button_to "Try again", suggest_variants_metric_path(metric, back_to: "edit"),
48
+ method: :post, form_class: "inline-block",
49
+ class: ck_button_classes(:light, variant: :outline),
50
+ title: "Discard these improvements and ask the model for fresh ones.",
51
+ data: { turbo_confirm: "Replace these improvements with fresh ones from the model?" } %>
52
+ <%= button_to "Discard", dismiss_suggestion_metric_path(metric, draft_id: suggestion.id, back_to: "edit"),
53
+ method: :delete, form_class: "inline-block",
54
+ class: ck_button_classes(:light, variant: :outline),
55
+ data: { turbo_confirm: "Drop these improvements?" } %>
56
+ <%= button_to "Take everything", publish_draft_metric_path(metric, draft_id: suggestion.id),
57
+ method: :post, form_class: "inline-block",
58
+ class: ck_button_classes(:dark) %>
59
+ </div>
60
+ </div>
61
+ <% end %>
62
+
13
63
  <div class="ck-card ck-form-card">
14
64
  <div class="ck-field">
15
65
  <%= form.label :name, "Metric name", class: "ck-label" %>
@@ -22,14 +72,34 @@
22
72
  <p class="ck-hint">What should the judge assess? This instruction is sent to the LLM judge when scoring outputs.</p>
23
73
  <%= form.text_area :instruction, rows: 8, class: "ck-input ck-input--area", placeholder: "Evaluate whether the output...", **ck_field_aria(form, :instruction) %>
24
74
  <%= ck_field_error(form, :instruction) %>
75
+
76
+ <% if instruction_changed %>
77
+ <div class="ck-inline-suggestion">
78
+ <div class="ck-inline-suggestion__head">
79
+ <p class="ck-kicker">Suggested wording</p>
80
+ <button type="button"
81
+ class="<%= ck_button_classes(:light, variant: :outline) %> ck-inline-suggestion__apply"
82
+ data-ck-apply
83
+ data-target="metric[instruction]"
84
+ data-value="<%= h(suggested_instruction) %>">Use this wording</button>
85
+ </div>
86
+ <div class="ck-inline-suggestion__diff">
87
+ <pre class="ck-inline-suggestion__pane ck-inline-suggestion__pane--before"><%= ck_word_diff_old(metric.instruction.to_s, suggested_instruction) %></pre>
88
+ <pre class="ck-inline-suggestion__pane ck-inline-suggestion__pane--after"><%= ck_word_diff_new(metric.instruction.to_s, suggested_instruction) %></pre>
89
+ </div>
90
+ </div>
91
+ <% end %>
25
92
  </div>
26
93
 
27
94
  <div class="ck-field ck-field--spacious">
28
- <p class="ck-section-title">Rubric</p>
95
+ <p class="ck-section-title">Rubric<%= render "completion_kit/metrics/rubric_hint" %></p>
29
96
  <p class="ck-hint">What each star rating means for this metric.</p>
30
97
 
31
98
  <div class="ck-rubric-builder">
32
99
  <% metric.rubric_bands_for_form.each_with_index do |band, index| %>
100
+ <% suggested_band = suggestion_bands[band["stars"].to_i].to_s %>
101
+ <% band_changed = suggestion && suggested_band.present? && suggested_band != band["description"].to_s %>
102
+ <% target_name = "metric[rubric_bands][#{index}][description]" %>
33
103
  <div class="ck-rubric-row">
34
104
  <div class="ck-rubric-row__stars">
35
105
  <% 5.times do |i| %>
@@ -38,7 +108,23 @@
38
108
  <input type="hidden" name="metric[rubric_bands][<%= index %>][stars]" value="<%= band["stars"] %>">
39
109
  </div>
40
110
  <div class="ck-rubric-row__fields">
41
- <textarea name="metric[rubric_bands][<%= index %>][description]" rows="2" class="ck-input ck-input--area"><%= band["description"] %></textarea>
111
+ <textarea name="<%= target_name %>" rows="2" class="ck-input ck-input--area"><%= band["description"] %></textarea>
112
+ <% if band_changed %>
113
+ <div class="ck-inline-suggestion ck-inline-suggestion--band">
114
+ <div class="ck-inline-suggestion__head">
115
+ <p class="ck-kicker">Suggested band</p>
116
+ <button type="button"
117
+ class="<%= ck_button_classes(:light, variant: :outline) %> ck-inline-suggestion__apply"
118
+ data-ck-apply
119
+ data-target="<%= target_name %>"
120
+ data-value="<%= h(suggested_band) %>">Use this band</button>
121
+ </div>
122
+ <div class="ck-inline-suggestion__diff">
123
+ <pre class="ck-inline-suggestion__pane ck-inline-suggestion__pane--before"><%= ck_word_diff_old(band["description"].to_s, suggested_band) %></pre>
124
+ <pre class="ck-inline-suggestion__pane ck-inline-suggestion__pane--after"><%= ck_word_diff_new(band["description"].to_s, suggested_band) %></pre>
125
+ </div>
126
+ </div>
127
+ <% end %>
42
128
  </div>
43
129
  </div>
44
130
  <% end %>
@@ -57,11 +143,11 @@
57
143
  <% confirm = parts.empty? ? "Delete \"#{metric.name}\"? It's not in use." : "Delete \"#{metric.name}\"? It's #{parts.to_sentence}." %>
58
144
  <%= button_to metric_path(metric), method: :delete,
59
145
  form_class: "inline-block",
60
- class: "ck-icon-btn",
146
+ class: "ck-icon-btn ck-icon-btn--form",
61
147
  title: "Delete metric",
62
148
  "aria-label": "Delete metric",
63
149
  data: { turbo_confirm: confirm } do %>
64
- <%= heroicon_tag "trash", variant: :outline, size: 16, "aria-hidden": "true" %>
150
+ <%= heroicon_tag "trash", variant: :outline, size: 24, "aria-hidden": "true" %>
65
151
  <% end %>
66
152
  <% end %>
67
153
  <%= link_to "Cancel", metrics_path, class: ck_button_classes(:light, variant: :outline), tabindex: "0" %>
@@ -0,0 +1,25 @@
1
+ <% current_bands = local_assigns[:current_bands] || [] %>
2
+ <% draft_bands = local_assigns[:draft_bands] || [] %>
3
+ <% lookup = ->(bands, stars) { bands.find { |b| b["stars"].to_i == stars }&.dig("description").to_s } %>
4
+ <div class="ck-rubric-diff">
5
+ <% 5.downto(1) do |stars| %>
6
+ <% old_band = lookup.call(current_bands, stars) %>
7
+ <% new_band = lookup.call(draft_bands, stars) %>
8
+ <% changed = old_band != new_band %>
9
+ <div class="ck-rubric-diff__row ck-rubric-diff__row--<%= changed ? "changed" : "unchanged" %>">
10
+ <div class="ck-rubric-diff__stars">
11
+ <% 5.times do |i| %>
12
+ <svg viewBox="0 0 24 24" width="14" height="14" stroke-width="1.75" class="ck-star <%= i < stars ? "ck-star--filled" : "ck-star--empty" %>"><polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2"/></svg>
13
+ <% end %>
14
+ </div>
15
+ <% if changed %>
16
+ <div class="ck-rubric-diff__panes">
17
+ <pre class="ck-rubric-diff__pane ck-rubric-diff__pane--before"><%= ck_word_diff_old(old_band, new_band) %></pre>
18
+ <pre class="ck-rubric-diff__pane ck-rubric-diff__pane--after"><%= ck_word_diff_new(old_band, new_band) %></pre>
19
+ </div>
20
+ <% else %>
21
+ <p class="ck-rubric-diff__unchanged"><%= old_band.presence || "—" %></p>
22
+ <% end %>
23
+ </div>
24
+ <% end %>
25
+ </div>
@@ -0,0 +1,4 @@
1
+ <span class="ck-info-hint" tabindex="0" role="button" aria-label="What is a rubric?">
2
+ <%= heroicon_tag "information-circle", variant: :outline, "aria-hidden": "true" %>
3
+ <span class="ck-info-popup">How the judge picks 1 to 5. Each row says what an output has to look like to earn that many stars. The judge reads these descriptions when it scores, so clearer rows give you more consistent scoring.</span>
4
+ </span>
@@ -1,11 +1,15 @@
1
- <article class="ck-starter-card">
2
- <h3 class="ck-starter-card__name"><%= link_to starter.name, starter_preview_metrics_path(key: starter.key), class: "ck-link" %></h3>
3
- <p class="ck-starter-card__desc"><%= starter.description %></p>
4
- <div class="ck-starter-card__actions">
5
- <%= link_to "Preview", starter_preview_metrics_path(key: starter.key), class: ck_button_classes(:dark) + " ck-button--sm" %>
6
- <%= button_to "Don't show this one", dismiss_starter_metrics_path(key: starter.key),
7
- method: :post, form_class: "inline-block",
8
- class: "ck-link ck-starter-card__dismiss",
1
+ <%= link_to starter_preview_metrics_path(key: starter.key), class: "ck-starter-card" do %>
2
+ <div class="ck-starter-card__body">
3
+ <p class="ck-starter-card__name"><strong><%= starter.name %></strong></p>
4
+ <p class="ck-starter-card__desc"><%= starter.description %></p>
5
+ </div>
6
+ <div class="ck-starter-card__foot">
7
+ <span class="ck-starter-card__cta">Preview →</span>
8
+ <%= button_to "dismiss", dismiss_starter_metrics_path(key: starter.key),
9
+ method: :post,
10
+ form: { onclick: "event.stopPropagation();" },
11
+ form_class: "inline-block ck-starter-card__dismiss-form",
12
+ class: "ck-starter-card__dismiss",
9
13
  data: { turbo_confirm: "Hide \"#{starter.name}\" from this list?" } %>
10
14
  </div>
11
- </article>
15
+ <% end %>
@@ -10,4 +10,8 @@
10
10
  </div>
11
11
  </section>
12
12
 
13
- <%= render "form", metric: @metric %>
13
+ <%= render "form",
14
+ metric: @metric,
15
+ suggestion_draft: @suggestion_draft,
16
+ edit_draft: @edit_draft,
17
+ published_judge_version: @published_judge_version %>