completion-kit 0.5.41 → 0.5.43

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. checksums.yaml +4 -4
  2. data/app/assets/javascripts/completion_kit/application.js +17 -0
  3. data/app/assets/stylesheets/completion_kit/application.css +557 -23
  4. data/app/controllers/completion_kit/metrics_controller.rb +62 -24
  5. data/app/jobs/completion_kit/judge_review_job.rb +11 -0
  6. data/app/models/completion_kit/judge_version.rb +32 -1
  7. data/app/models/completion_kit/starter_metric_dismissal.rb +5 -0
  8. data/app/services/completion_kit/judge_variant_generator.rb +8 -6
  9. data/app/services/completion_kit/metric_calibration_stats.rb +16 -4
  10. data/app/services/completion_kit/starter_metrics.rb +94 -0
  11. data/app/views/completion_kit/api_reference/_body.html.erb +1 -1
  12. data/app/views/completion_kit/calibrations/_buttons.html.erb +43 -6
  13. data/app/views/completion_kit/calibrations/_trust_panel.html.erb +27 -28
  14. data/app/views/completion_kit/metrics/_form.html.erb +90 -4
  15. data/app/views/completion_kit/metrics/_rubric_diff.html.erb +25 -0
  16. data/app/views/completion_kit/metrics/_rubric_hint.html.erb +4 -0
  17. data/app/views/completion_kit/metrics/_starter_card.html.erb +15 -0
  18. data/app/views/completion_kit/metrics/edit.html.erb +5 -1
  19. data/app/views/completion_kit/metrics/index.html.erb +32 -6
  20. data/app/views/completion_kit/metrics/show.html.erb +132 -126
  21. data/app/views/completion_kit/metrics/starter_preview.html.erb +45 -0
  22. data/app/views/completion_kit/responses/show.html.erb +1 -1
  23. data/app/views/completion_kit/runs/_status_panel.html.erb +2 -2
  24. data/config/routes.rb +7 -1
  25. data/db/migrate/20260524000001_create_completion_kit_starter_metric_dismissals.rb +12 -0
  26. data/db/migrate/20260525000001_add_version_number_and_published_at_to_judge_versions.rb +24 -0
  27. data/lib/completion_kit/version.rb +1 -1
  28. metadata +9 -1
@@ -1,3 +1,9 @@
1
+ <% suggestion = local_assigns[:suggestion_draft] %>
2
+ <% edit_draft = local_assigns[:edit_draft] %>
3
+ <% suggestion_bands = suggestion ? Array(suggestion.rubric_bands).each_with_object({}) { |b, h| h[b["stars"].to_i] = b["description"].to_s } : {} %>
4
+ <% suggested_instruction = suggestion&.instruction.to_s %>
5
+ <% instruction_changed = suggestion && suggested_instruction.present? && suggested_instruction != metric.instruction.to_s %>
6
+
1
7
  <%= form_with(model: metric, local: true) do |form| %>
2
8
  <% if metric.errors.any? %>
3
9
  <div class="ck-flash ck-flash--alert" role="alert">
@@ -10,6 +16,50 @@
10
16
  </div>
11
17
  <% end %>
12
18
 
19
+ <% if edit_draft && !suggestion %>
20
+ <% pub = local_assigns[:published_judge_version] %>
21
+ <% draft_instr_changed = pub && pub.instruction.to_s != edit_draft.instruction.to_s %>
22
+ <% draft_rubric_changed = pub && pub.rubric_bands != edit_draft.rubric_bands %>
23
+ <div class="ck-suggestion-banner" role="status">
24
+ <div class="ck-suggestion-banner__body">
25
+ <p class="ck-kicker">Draft pending</p>
26
+ <%# Names whichever parts differ from the published version; falls back to " version" when neither flag is truthy (no published version to compare against, or no difference) so the sentence never reads "replace the live for future runs". %>
+ <p class="ck-meta-copy">An unpublished draft of this metric is saved. Publish to replace the live<%= " instruction" if draft_instr_changed %><%= " and" if draft_instr_changed && draft_rubric_changed %><%= " rubric" if draft_rubric_changed %><%= " version" unless draft_instr_changed || draft_rubric_changed %> for future runs, or keep editing.</p>
27
+ </div>
28
+ <div class="ck-suggestion-banner__actions">
29
+ <%= button_to "Discard draft", dismiss_suggestion_metric_path(metric, draft_id: edit_draft.id, back_to: "edit"),
30
+ method: :delete, form_class: "inline-block",
31
+ class: ck_button_classes(:light, variant: :outline),
32
+ data: { turbo_confirm: "Drop this draft?" } %>
33
+ <%= button_to "Publish this version", publish_draft_metric_path(metric, draft_id: edit_draft.id),
34
+ method: :post, form_class: "inline-block",
35
+ class: ck_button_classes(:dark) %>
36
+ </div>
37
+ </div>
38
+ <% end %>
39
+
40
+ <% if suggestion %>
41
+ <div class="ck-suggestion-banner" role="status">
42
+ <div class="ck-suggestion-banner__body">
43
+ <p class="ck-kicker">Proposed improvements</p>
44
+ <p class="ck-meta-copy">Based on your disagreements, the model proposed these changes to the instruction and rubric. Apply pieces inline below, take everything at once, try again, or discard.</p>
45
+ </div>
46
+ <div class="ck-suggestion-banner__actions">
47
+ <%= button_to "Try again", suggest_variants_metric_path(metric, back_to: "edit"),
48
+ method: :post, form_class: "inline-block",
49
+ class: ck_button_classes(:light, variant: :outline),
50
+ title: "Discard these improvements and ask the model for fresh ones.",
51
+ data: { turbo_confirm: "Replace these improvements with fresh ones from the model?" } %>
52
+ <%= button_to "Discard", dismiss_suggestion_metric_path(metric, draft_id: suggestion.id, back_to: "edit"),
53
+ method: :delete, form_class: "inline-block",
54
+ class: ck_button_classes(:light, variant: :outline),
55
+ data: { turbo_confirm: "Drop these improvements?" } %>
56
+ <%= button_to "Take everything", publish_draft_metric_path(metric, draft_id: suggestion.id),
57
+ method: :post, form_class: "inline-block",
58
+ class: ck_button_classes(:dark) %>
59
+ </div>
60
+ </div>
61
+ <% end %>
62
+
13
63
  <div class="ck-card ck-form-card">
14
64
  <div class="ck-field">
15
65
  <%= form.label :name, "Metric name", class: "ck-label" %>
@@ -22,14 +72,34 @@
22
72
  <p class="ck-hint">What should the judge assess? This instruction is sent to the LLM judge when scoring outputs.</p>
23
73
  <%= form.text_area :instruction, rows: 8, class: "ck-input ck-input--area", placeholder: "Evaluate whether the output...", **ck_field_aria(form, :instruction) %>
24
74
  <%= ck_field_error(form, :instruction) %>
75
+
76
+ <% if instruction_changed %>
77
+ <div class="ck-inline-suggestion">
78
+ <div class="ck-inline-suggestion__head">
79
+ <p class="ck-kicker">Suggested wording</p>
80
+ <button type="button"
81
+ class="<%= ck_button_classes(:light, variant: :outline) %> ck-inline-suggestion__apply"
82
+ data-ck-apply
83
+ data-target="metric[instruction]"
84
+ data-value="<%= h(suggested_instruction) %>">Use this wording</button>
85
+ </div>
86
+ <div class="ck-inline-suggestion__diff">
87
+ <pre class="ck-inline-suggestion__pane ck-inline-suggestion__pane--before"><%= ck_word_diff_old(metric.instruction.to_s, suggested_instruction) %></pre>
88
+ <pre class="ck-inline-suggestion__pane ck-inline-suggestion__pane--after"><%= ck_word_diff_new(metric.instruction.to_s, suggested_instruction) %></pre>
89
+ </div>
90
+ </div>
91
+ <% end %>
25
92
  </div>
26
93
 
27
94
  <div class="ck-field ck-field--spacious">
28
- <p class="ck-section-title">Rubric</p>
95
+ <p class="ck-section-title">Rubric<%= render "completion_kit/metrics/rubric_hint" %></p>
29
96
  <p class="ck-hint">What each star rating means for this metric.</p>
30
97
 
31
98
  <div class="ck-rubric-builder">
32
99
  <% metric.rubric_bands_for_form.each_with_index do |band, index| %>
100
+ <% suggested_band = suggestion_bands[band["stars"].to_i].to_s %>
101
+ <% band_changed = suggestion && suggested_band.present? && suggested_band != band["description"].to_s %>
102
+ <% target_name = "metric[rubric_bands][#{index}][description]" %>
33
103
  <div class="ck-rubric-row">
34
104
  <div class="ck-rubric-row__stars">
35
105
  <% 5.times do |i| %>
@@ -38,7 +108,23 @@
38
108
  <input type="hidden" name="metric[rubric_bands][<%= index %>][stars]" value="<%= band["stars"] %>">
39
109
  </div>
40
110
  <div class="ck-rubric-row__fields">
41
- <textarea name="metric[rubric_bands][<%= index %>][description]" rows="2" class="ck-input ck-input--area"><%= band["description"] %></textarea>
111
+ <textarea name="<%= target_name %>" rows="2" class="ck-input ck-input--area"><%= band["description"] %></textarea>
112
+ <% if band_changed %>
113
+ <div class="ck-inline-suggestion ck-inline-suggestion--band">
114
+ <div class="ck-inline-suggestion__head">
115
+ <p class="ck-kicker">Suggested band</p>
116
+ <button type="button"
117
+ class="<%= ck_button_classes(:light, variant: :outline) %> ck-inline-suggestion__apply"
118
+ data-ck-apply
119
+ data-target="<%= target_name %>"
120
+ data-value="<%= h(suggested_band) %>">Use this band</button>
121
+ </div>
122
+ <div class="ck-inline-suggestion__diff">
123
+ <pre class="ck-inline-suggestion__pane ck-inline-suggestion__pane--before"><%= ck_word_diff_old(band["description"].to_s, suggested_band) %></pre>
124
+ <pre class="ck-inline-suggestion__pane ck-inline-suggestion__pane--after"><%= ck_word_diff_new(band["description"].to_s, suggested_band) %></pre>
125
+ </div>
126
+ </div>
127
+ <% end %>
42
128
  </div>
43
129
  </div>
44
130
  <% end %>
@@ -57,11 +143,11 @@
57
143
  <% confirm = parts.empty? ? "Delete \"#{metric.name}\"? It's not in use." : "Delete \"#{metric.name}\"? It's #{parts.to_sentence}." %>
58
144
  <%= button_to metric_path(metric), method: :delete,
59
145
  form_class: "inline-block",
60
- class: "ck-icon-btn",
146
+ class: "ck-icon-btn ck-icon-btn--form",
61
147
  title: "Delete metric",
62
148
  "aria-label": "Delete metric",
63
149
  data: { turbo_confirm: confirm } do %>
64
- <%= heroicon_tag "trash", variant: :outline, size: 16, "aria-hidden": "true" %>
150
+ <%= heroicon_tag "trash", variant: :outline, size: 24, "aria-hidden": "true" %>
65
151
  <% end %>
66
152
  <% end %>
67
153
  <%= link_to "Cancel", metrics_path, class: ck_button_classes(:light, variant: :outline), tabindex: "0" %>
@@ -0,0 +1,25 @@
1
+ <% current_bands = local_assigns[:current_bands] || [] %>
2
+ <% draft_bands = local_assigns[:draft_bands] || [] %>
3
+ <% lookup = ->(bands, stars) { bands.find { |b| b["stars"].to_i == stars }&.dig("description").to_s } %>
4
+ <div class="ck-rubric-diff">
5
+ <% 5.downto(1) do |stars| %>
6
+ <% old_band = lookup.call(current_bands, stars) %>
7
+ <% new_band = lookup.call(draft_bands, stars) %>
8
+ <% changed = old_band != new_band %>
9
+ <div class="ck-rubric-diff__row ck-rubric-diff__row--<%= changed ? "changed" : "unchanged" %>">
10
+ <div class="ck-rubric-diff__stars">
11
+ <% 5.times do |i| %>
12
+ <svg viewBox="0 0 24 24" width="14" height="14" stroke-width="1.75" class="ck-star <%= i < stars ? "ck-star--filled" : "ck-star--empty" %>"><polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2"/></svg>
13
+ <% end %>
14
+ </div>
15
+ <% if changed %>
16
+ <div class="ck-rubric-diff__panes">
17
+ <pre class="ck-rubric-diff__pane ck-rubric-diff__pane--before"><%= ck_word_diff_old(old_band, new_band) %></pre>
18
+ <pre class="ck-rubric-diff__pane ck-rubric-diff__pane--after"><%= ck_word_diff_new(old_band, new_band) %></pre>
19
+ </div>
20
+ <% else %>
21
+ <p class="ck-rubric-diff__unchanged"><%= old_band.presence || "—" %></p>
22
+ <% end %>
23
+ </div>
24
+ <% end %>
25
+ </div>
@@ -0,0 +1,4 @@
1
+ <span class="ck-info-hint" tabindex="0" role="button" aria-label="What is a rubric?">
2
+ <%= heroicon_tag "information-circle", variant: :outline, "aria-hidden": "true" %>
3
+ <span class="ck-info-popup">How the judge picks 1 to 5. Each row says what an output has to look like to earn that many stars. The judge reads these descriptions when it scores, so clearer rows give you more consistent scoring.</span>
4
+ </span>
@@ -0,0 +1,15 @@
1
+ <%= link_to starter_preview_metrics_path(key: starter.key), class: "ck-starter-card" do %>
2
+ <div class="ck-starter-card__body">
3
+ <p class="ck-starter-card__name"><strong><%= starter.name %></strong></p>
4
+ <p class="ck-starter-card__desc"><%= starter.description %></p>
5
+ </div>
6
+ <div class="ck-starter-card__foot">
7
+ <span class="ck-starter-card__cta">Preview →</span>
8
+ <%= button_to "dismiss", dismiss_starter_metrics_path(key: starter.key),
9
+ method: :post,
10
+ form: { onclick: "event.stopPropagation();" },
11
+ form_class: "inline-block ck-starter-card__dismiss-form",
12
+ class: "ck-starter-card__dismiss",
13
+ data: { turbo_confirm: "Hide \"#{starter.name}\" from this list?" } %>
14
+ </div>
15
+ <% end %>
@@ -10,4 +10,8 @@
10
10
  </div>
11
11
  </section>
12
12
 
13
- <%= render "form", metric: @metric %>
13
+ <%= render "form",
14
+ metric: @metric,
15
+ suggestion_draft: @suggestion_draft,
16
+ edit_draft: @edit_draft,
17
+ published_judge_version: @published_judge_version %>
@@ -30,15 +30,17 @@
30
30
  <%= link_to metric_path(metric), class: "ck-record-name" do %><strong><%= metric.name %></strong><% end %>
31
31
  <% if CompletionKit.config.judge_calibration_enabled %>
32
32
  <% s = CompletionKit::MetricCalibrationStats.for(metric) %>
33
- <p class="ck-metrics-table__trust">
33
+ <p class="ck-metrics-table__trust" title="Calibration: how often this metric's scores match the humans who reviewed them.">
34
+ <%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true", class: "ck-trust-icon" %>
35
+ <span class="ck-metrics-table__trust-label">Calibration</span>
34
36
  <% if s.counter_only? %>
35
37
  <% if s.sample_size.zero? %>
36
- No verdicts yet
38
+ <span class="ck-metrics-table__trust-state">Not measured yet</span>
37
39
  <% else %>
38
40
  <%= s.sample_size %> / <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts
39
41
  <% end %>
40
42
  <% else %>
41
- <span class="ck-metrics-table__trust-rate" title="<%= s.firm? ? 'Settled read.' : 'Early read keep giving verdicts.' %>">~<%= (s.agreement_point * 100).round %>%</span>
43
+ <span class="ck-metrics-table__trust-rate" title="<%= s.firm? ? 'Settled read.' : 'Early read. Keep giving verdicts.' %>">~<%= (s.agreement_point * 100).round %>%</span>
42
44
  ±<%= (s.margin * 100).round %> pt · <%= s.firm? ? "settled" : "early" %>
43
45
  <% end %>
44
46
  </p>
@@ -75,12 +77,36 @@
75
77
  Use the same metrics on multiple runs? <%= link_to "Group them →", metric_groups_path, class: "ck-link" %>
76
78
  </p>
77
79
  <% end %>
80
+
81
+ <% if @available_starters.any? %>
82
+ <section class="ck-starter-row">
83
+ <p class="ck-kicker">Add a starter metric</p>
84
+ <p class="ck-meta-copy">Pre-written rubrics for the dimensions most teams score against. Click a card to preview before it's created.</p>
85
+ <div class="ck-starter-grid">
86
+ <% @available_starters.each do |starter| %>
87
+ <%= render "starter_card", starter: starter %>
88
+ <% end %>
89
+ </div>
90
+ </section>
91
+ <% end %>
78
92
  <% elsif @selected_tags.any? %>
79
93
  <div class="ck-empty">
80
94
  <p>No metrics match these tags. <%= link_to "Clear filters", metrics_path, class: "ck-link" %>.</p>
81
95
  </div>
82
96
  <% else %>
83
- <div class="ck-empty">
84
- <p>No metrics yet. <%= link_to "Create your first metric", new_metric_path, class: "ck-link" %> to start scoring prompt outputs.</p>
85
- </div>
97
+ <% if @available_starters.any? %>
98
+ <section class="ck-starter-row ck-starter-row--empty-state">
99
+ <h2 class="ck-title ck-title--sm">Start with a ready-made rubric</h2>
100
+ <p class="ck-lead">Pick one of the dimensions below to drop in a pre-written 1–5 rubric. You can edit anything after adding it. Or <%= link_to "write your own from scratch", new_metric_path, class: "ck-link" %>.</p>
101
+ <div class="ck-starter-grid">
102
+ <% @available_starters.each do |starter| %>
103
+ <%= render "starter_card", starter: starter %>
104
+ <% end %>
105
+ </div>
106
+ </section>
107
+ <% else %>
108
+ <div class="ck-empty">
109
+ <p>No metrics yet. <%= link_to "Create your first metric", new_metric_path, class: "ck-link" %> to start scoring prompt outputs.</p>
110
+ </div>
111
+ <% end %>
86
112
  <% end %>
@@ -8,70 +8,40 @@
8
8
  <h1 class="ck-title"><%= @metric.name %></h1>
9
9
  <% if CompletionKit.config.judge_calibration_enabled %>
10
10
  <%= render "completion_kit/calibrations/trust_panel",
11
- stats: CompletionKit::MetricCalibrationStats.for(@metric) %>
12
- <% if @edit_draft %>
13
- <% pub_instr = @published_judge_version&.instruction.to_s %>
14
- <% draft_instr = @edit_draft.instruction.to_s %>
15
- <% instruction_changed = pub_instr != draft_instr %>
16
- <% rubric_changed = @published_judge_version && @published_judge_version.rubric_bands != @edit_draft.rubric_bands %>
17
- <section class="ck-card ck-card--spaced ck-draft-pending">
18
- <div class="ck-prompt-preview__header">
19
- <p class="ck-kicker">Draft pending</p>
20
- <%= button_to "Publish this version", publish_draft_metric_path(@metric, draft_id: @edit_draft.id),
21
- method: :post, form_class: "inline-block",
22
- class: ck_button_classes(:dark) %>
23
- </div>
24
- <p class="ck-meta-copy">A draft of this metric is saved. Publishing it replaces the live instruction<%= ", rubric," if rubric_changed %> for future runs. Here's what changes.</p>
25
-
26
- <% if instruction_changed %>
27
- <div class="ck-suggest-diff">
28
- <div class="ck-suggest-diff__pane">
29
- <div class="ck-suggest-diff__header">
30
- <span class="ck-suggest-diff__label ck-suggest-diff__label--before">Currently live</span>
31
- </div>
32
- <pre class="ck-suggest-diff__code"><%= ck_word_diff_old(pub_instr, draft_instr) %></pre>
33
- </div>
34
- <div class="ck-suggest-diff__pane">
35
- <div class="ck-suggest-diff__header">
36
- <span class="ck-suggest-diff__label ck-suggest-diff__label--after">Draft</span>
37
- </div>
38
- <pre class="ck-suggest-diff__code"><%= ck_word_diff_new(pub_instr, draft_instr) %></pre>
39
- </div>
40
- </div>
41
- <% else %>
42
- <p class="ck-meta-copy">The instruction is unchanged.</p>
43
- <% end %>
44
-
45
- <% if rubric_changed %>
46
- <p class="ck-meta-copy"><strong>Rubric also changed.</strong> Edit the metric to inspect each band, or publish to apply the new wording.</p>
47
- <% end %>
48
- </section>
49
- <% end %>
11
+ stats: CompletionKit::MetricCalibrationStats.for(@metric),
12
+ metric: @metric %>
13
+ <% end %>
14
+ <% if @metric.tags.any? %>
15
+ <div class="tag-marks-row tag-marks-row--header">
16
+ <%= render "completion_kit/tags/marks", tags: @metric.tags %>
17
+ </div>
50
18
  <% end %>
51
19
  </div>
52
20
  <div class="ck-actions">
53
21
  <% if CompletionKit.config.judge_calibration_enabled %>
54
- <% if @improve_disagreement_count.positive? %>
22
+ <% if @suggestion_draft %>
23
+ <%= link_to "Review improvements →", edit_metric_path(@metric),
24
+ class: ck_button_classes(:dark),
25
+ title: "The model proposed improvements based on your disagreements. Review and apply what you want." %>
26
+ <% elsif @edit_draft %>
27
+ <%= link_to "Review draft →", edit_metric_path(@metric),
28
+ class: ck_button_classes(:dark),
29
+ title: "An unpublished draft of this metric is saved. Review and publish, or keep editing." %>
30
+ <% elsif @improve_disagreement_count.positive? %>
55
31
  <%= button_to "Improve the metric", suggest_variants_metric_path(@metric),
56
32
  method: :post, form_class: "inline-block",
57
33
  class: ck_button_classes(:light, variant: :outline),
58
- title: "Rewrite this metric based on the disagreements collected so far.",
34
+ title: "Have the model rewrite this metric's instruction and rubric based on the disagreements collected so far.",
59
35
  data: { turbo_confirm: "Rewrite this metric based on the disagreements collected so far?" } %>
60
36
  <% else %>
61
37
  <button type="button" class="<%= ck_button_classes(:light, variant: :outline) %>" disabled
62
- title="Mark at least one row as Disagree before the model can suggest a change.">Improve the metric</button>
38
+ title="Mark at least one case as Disagree before the model can suggest a change.">Improve the metric</button>
63
39
  <% end %>
64
40
  <% end %>
65
41
  <%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
66
42
  </div>
67
43
  </section>
68
44
 
69
- <% if @metric.tags.any? %>
70
- <div class="tag-marks-row tag-marks-row--header">
71
- <%= render "completion_kit/tags/marks", tags: @metric.tags %>
72
- </div>
73
- <% end %>
74
-
75
45
  <% if @metric.instruction.present? %>
76
46
  <section class="ck-card">
77
47
  <p class="ck-kicker">Instruction</p>
@@ -80,7 +50,7 @@
80
50
  <% end %>
81
51
 
82
52
  <section class="ck-card ck-card--spaced">
83
- <p class="ck-kicker">Rubric</p>
53
+ <p class="ck-kicker">Rubric<%= render "completion_kit/metrics/rubric_hint" %></p>
84
54
  <div class="ck-rubric-display">
85
55
  <% @metric.rubric_bands_for_form.each do |band| %>
86
56
  <div class="ck-rubric-row ck-rubric-row--display">
@@ -97,64 +67,121 @@
97
67
  </div>
98
68
  </section>
99
69
 
100
- <% if CompletionKit.config.judge_calibration_enabled && @suggestion_draft %>
101
- <% sd_current_instr = @published_judge_version&.instruction.to_s %>
102
- <% sd_draft_instr = @suggestion_draft.instruction.to_s %>
103
- <% sd_current_rubric = @published_judge_version&.rubric_bands || [] %>
104
- <% sd_rubric_changed = @suggestion_draft.rubric_bands != sd_current_rubric %>
105
- <section class="ck-card ck-card--spaced ck-draft-pending">
106
- <div class="ck-prompt-preview__header">
107
- <p class="ck-kicker">Suggested change</p>
108
- <time class="ck-meta-copy" data-relative-time datetime="<%= @suggestion_draft.created_at.utc.iso8601 %>"><%= time_ago_in_words(@suggestion_draft.created_at) %> ago</time>
109
- </div>
110
- <p class="ck-meta-copy">Based on your disagreements, the model proposed this rewrite. Use it to replace the live version, or discard.</p>
70
+ <% if CompletionKit.config.judge_calibration_enabled && @versions.size > 1 %>
71
+ <%# Maps each version to its immediate predecessor: the highest version_number below its own (nil for the oldest). Computed with max_by so the result does not depend on how @versions is ordered. %>
+ <% predecessor_of = @versions.index_with { |v| @versions.select { |o| o.version_number < v.version_number }.max_by(&:version_number) } %>
72
+ <% version_changed = ->(v, pred) { pred && (pred.instruction.to_s != v.instruction.to_s || pred.rubric_bands != v.rubric_bands) } %>
73
+ <section class="ck-card ck-card--spaced">
74
+ <p class="ck-kicker">Versions</p>
75
+ <table class="ck-results-table ck-metric-versions-table">
76
+ <thead>
77
+ <tr>
78
+ <th scope="col">Version</th>
79
+ <th scope="col">Source</th>
80
+ <th scope="col">Created</th>
81
+ </tr>
82
+ </thead>
83
+ <tbody>
84
+ <% @versions.each do |v| %>
85
+ <% pred = predecessor_of[v] %>
86
+ <tr>
87
+ <td>
88
+ <div class="ck-version-cell">
89
+ <div class="ck-version-cell__label">
90
+ <strong><%= v.version_label %></strong>
91
+ <% if v.current? %>
92
+ <span class="ck-chip">Published</span>
93
+ <% elsif v.draft? %>
94
+ <%= button_to "Publish", publish_draft_metric_path(@metric, draft_id: v.id),
95
+ method: :post, form_class: "inline-block",
96
+ class: "ck-chip ck-chip--publish" %>
97
+ <% else %>
98
+ <%= button_to "Make current", publish_draft_metric_path(@metric, draft_id: v.id),
99
+ method: :post, form_class: "inline-block",
100
+ class: "ck-chip ck-chip--publish",
101
+ data: { turbo_confirm: "Roll the live judge back to #{v.version_label}? Calibration verdicts collected against the current version stay tied to it." } %>
102
+ <% end %>
103
+ </div>
104
+ <% if version_changed.call(v, pred) %>
105
+ <%# Opens the diff dialog for this version. The title must use an ERB output tag: "#{...}" in plain HTML is not interpolated and would be shown literally. %>
+ <button type="button" class="ck-cell-link ck-cell-link--delta"
106
+ title="What changed from <%= pred.version_label %>"
107
+ onclick="document.getElementById('ck-mvdiff-<%= v.id %>').showModal()">&Delta;</button>
108
+ <% end %>
109
+ </div>
110
+ </td>
111
+ <td>
112
+ <% source_label, source_class = case v.source
113
+ when "suggestion" then ["AI suggestion", "ck-source-chip ck-source-chip--ai"]
114
+ when "edit" then ["Manual edit", "ck-source-chip ck-source-chip--manual"]
115
+ else ["Original", "ck-source-chip ck-source-chip--initial"]
116
+ end %>
117
+ <span class="<%= source_class %>"><%= source_label %></span>
118
+ </td>
119
+ <td class="ck-meta-copy">
120
+ <time datetime="<%= v.created_at.utc.iso8601 %>" data-relative-time><%= time_ago_in_words(v.created_at) %> ago</time>
121
+ </td>
122
+ </tr>
123
+ <% end %>
124
+ </tbody>
125
+ </table>
126
+ </section>
111
127
 
112
- <div class="ck-suggest-diff">
113
- <div class="ck-suggest-diff__pane">
114
- <div class="ck-suggest-diff__header">
115
- <span class="ck-suggest-diff__label ck-suggest-diff__label--before">Currently live</span>
116
- </div>
117
- <pre class="ck-suggest-diff__code"><%= ck_word_diff_old(sd_current_instr, sd_draft_instr) %></pre>
118
- </div>
119
- <div class="ck-suggest-diff__pane">
120
- <div class="ck-suggest-diff__header">
121
- <span class="ck-suggest-diff__label ck-suggest-diff__label--after">Proposed</span>
128
+ <% @versions.each do |v| %>
129
+ <% pred = predecessor_of[v] %>
130
+ <% next unless version_changed.call(v, pred) %>
131
+ <dialog id="ck-mvdiff-<%= v.id %>" class="ck-modal" onclick="if(event.target===this)this.close()">
132
+ <article class="ck-modal__panel" tabindex="-1" onclick="event.stopPropagation()">
133
+ <header class="ck-modal__header">
134
+ <div class="ck-modal__heading">
135
+ <h2 class="ck-modal__title"><%= pred.version_label %> &rarr; <%= v.version_label %></h2>
136
+ <span class="ck-modal__meta">What changed in <%= v.version_label %><% if v.current? %> (live)<% elsif v.draft? %> (draft)<% end %></span>
137
+ </div>
138
+ <button type="button" class="ck-modal__close" aria-label="Close" onclick="this.closest('dialog').close()">&times;</button>
139
+ </header>
140
+ <div class="ck-modal__body">
141
+ <% if pred.instruction.to_s != v.instruction.to_s %>
142
+ <div class="ck-suggest-diff">
143
+ <div class="ck-suggest-diff__pane">
144
+ <div class="ck-suggest-diff__header"><span class="ck-suggest-diff__label ck-suggest-diff__label--before"><%= pred.version_label %> instruction</span></div>
145
+ <pre class="ck-suggest-diff__code"><%= ck_word_diff_old(pred.instruction.to_s, v.instruction.to_s) %></pre>
146
+ </div>
147
+ <div class="ck-suggest-diff__pane">
148
+ <div class="ck-suggest-diff__header"><span class="ck-suggest-diff__label ck-suggest-diff__label--after"><%= v.version_label %> instruction</span></div>
149
+ <pre class="ck-suggest-diff__code"><%= ck_word_diff_new(pred.instruction.to_s, v.instruction.to_s) %></pre>
150
+ </div>
151
+ </div>
152
+ <% end %>
153
+ <% if pred.rubric_bands != v.rubric_bands %>
154
+ <p class="ck-kicker ck-kicker--inset">Rubric changes</p>
155
+ <%= render "completion_kit/metrics/rubric_diff",
156
+ current_bands: pred.rubric_bands || [],
157
+ draft_bands: v.rubric_bands || [] %>
158
+ <% end %>
122
159
  </div>
123
- <pre class="ck-suggest-diff__code"><%= ck_word_diff_new(sd_current_instr, sd_draft_instr) %></pre>
124
- </div>
125
- </div>
126
-
127
- <% if sd_rubric_changed %>
128
- <p class="ck-meta-copy"><strong>Rubric also changed.</strong> Publishing applies the new rubric too.</p>
129
- <% end %>
130
-
131
- <div class="ck-actions">
132
- <%= button_to "Discard", dismiss_suggestion_metric_path(@metric, draft_id: @suggestion_draft.id),
133
- method: :delete, form_class: "inline-block",
134
- class: ck_button_classes(:light, variant: :outline),
135
- data: { turbo_confirm: "Drop this suggestion?" } %>
136
- <%= button_to "Use this version", publish_draft_metric_path(@metric, draft_id: @suggestion_draft.id),
137
- method: :post, form_class: "inline-block",
138
- class: ck_button_classes(:dark) %>
139
- </div>
140
- </section>
160
+ </article>
161
+ </dialog>
162
+ <% end %>
141
163
  <% end %>
142
164
 
143
165
  <% if CompletionKit.config.judge_calibration_enabled && @disagreements.any? %>
144
166
  <section class="ck-card ck-card--spaced">
145
167
  <div class="ck-prompt-preview__header">
146
168
  <p class="ck-kicker">Cases to learn from</p>
147
- <span class="ck-chip"><%= pluralize(@disagreements.size, "row") %></span>
169
+ <span class="ck-chip"><%= pluralize(@disagreements.size, "case") %></span>
148
170
  </div>
149
- <p class="ck-meta-copy">Rows where a reviewer's score didn't match the judge's. Pin useful ones with <strong>Remember this</strong> the judge sees them next time it grades.</p>
171
+ <p class="ck-meta-copy">Cases where a reviewer's score didn't match the judge's. Pin useful ones with <strong>Remember this</strong> so the judge sees them next time it grades.</p>
150
172
  <% existing_ids = Array(@metric.few_shot_examples).map { |fs| fs["calibration_id"] } %>
151
173
  <ul class="ck-disagreement-list">
152
174
  <% @disagreements.each do |cal| %>
153
175
  <% review = cal.response.reviews.find { |r| r.metric_id == @metric.id } %>
154
176
  <% already = existing_ids.include?(cal.id) %>
155
- <li class="ck-disagreement">
177
+ <% cal_version = cal.judge_version %>
178
+ <%# True only when this verdict was recorded against the currently published judge version. Safe navigation keeps the page rendering when no version is published; a calibration without a judge version is never treated as current. %>
+ <% on_current = cal_version.present? && cal_version.id == @published_judge_version&.id %>
179
+ <li class="ck-disagreement<%= " ck-disagreement--remembered" if already %><%= " ck-disagreement--stale" unless on_current %>">
156
180
  <div class="ck-disagreement__head">
157
181
  <div class="ck-disagreement__scores">
182
+ <% if cal_version %>
183
+ <span class="ck-source-chip <%= on_current ? "ck-source-chip--current" : "ck-source-chip--past" %>" title="<%= on_current ? "Verdict on the live judge version." : "Verdict on a superseded judge version." %>"><%= cal_version.version_label %></span>
184
+ <% end %>
158
185
  <span class="ck-disagreement__scores-label">Judge</span>
159
186
  <% if review&.ai_score %>
160
187
  <span class="<%= ck_badge_classes(ck_score_kind(review.ai_score.to_f)) %>"><%= review.ai_score %></span>
@@ -171,14 +198,21 @@
171
198
  </div>
172
199
  <div class="ck-disagreement__action">
173
200
  <% if already %>
174
- <span class="ck-chip ck-chip--done">Remembered</span>
201
+ <%= button_to "Forget",
202
+ remove_few_shot_metric_path(@metric, calibration_id: cal.id),
203
+ method: :delete,
204
+ form_class: "inline-block",
205
+ class: ck_button_classes(:light, variant: :outline),
206
+ title: "Stop showing this case to the judge.",
207
+ data: { turbo_confirm: "Stop showing this case to the judge?" } %>
208
+ <span class="ck-chip ck-chip--done" title="The judge sees this case when it grades for this metric.">Remembered</span>
175
209
  <% else %>
176
210
  <%= button_to "Remember this",
177
211
  add_few_shot_metric_path(@metric, calibration_id: cal.id),
178
212
  method: :post,
179
213
  form_class: "inline-block",
180
214
  class: ck_button_classes(:light, variant: :outline),
181
- title: "Pin this row so the judge sees it next time it grades for this metric." %>
215
+ title: "Pin this case so the judge sees it next time it grades for this metric." %>
182
216
  <% end %>
183
217
  </div>
184
218
  </div>
@@ -186,42 +220,14 @@
186
220
  <p class="ck-disagreement__note"><%= cal.note %></p>
187
221
  <% end %>
188
222
  <p class="ck-disagreement__source ck-meta-copy">
189
- <%= link_to cal.response.run.name.to_s.truncate(50), ck_run_path(cal.response.run), class: "ck-link" %>
190
- ·
191
- <%= link_to "row ##{cal.response.id}", run_response_path(cal.response.run, cal.response), class: "ck-link" %>
223
+ <%= link_to run_response_path(cal.response.run, cal.response, anchor: @metric.name.parameterize),
224
+ class: "ck-disagreement__source-link" do %>
225
+ <% case_display = cal.response.row_index.nil? ? "##{cal.response.id}" : (cal.response.row_index + 1).to_s %>
226
+ View case <%= case_display %> in <%= cal.response.run.name.to_s.truncate(50) %> →
227
+ <% end %>
192
228
  </p>
193
229
  </li>
194
230
  <% end %>
195
231
  </ul>
196
232
  </section>
197
-
198
-
199
- <% if Array(@metric.few_shot_examples).any? %>
200
- <section class="ck-card ck-card--spaced">
201
- <div class="ck-prompt-preview__header">
202
- <p class="ck-kicker">What the judge remembers</p>
203
- <span class="ck-chip"><%= pluralize(Array(@metric.few_shot_examples).size, "case") %></span>
204
- </div>
205
- <p class="ck-meta-copy">Rows you've pinned so the judge sees them next time it grades. Each one shows what the judge gave and what a human said it should have been.</p>
206
- <ol class="ck-few-shot-list">
207
- <% Array(@metric.few_shot_examples).each do |fs| %>
208
- <li class="ck-few-shot-item">
209
- <div class="ck-few-shot-item__scores">
210
- <span class="ck-meta-copy">judge said</span>
211
- <% if fs["judge_score"] %>
212
- <span class="<%= ck_badge_classes(ck_score_kind(fs["judge_score"].to_f)) %>"><%= fs["judge_score"] %></span>
213
- <% end %>
214
- <span class="ck-meta-copy">human said</span>
215
- <% if fs["human_score"] %>
216
- <span class="<%= ck_badge_classes(ck_score_kind(fs["human_score"].to_f)) %>"><%= fs["human_score"] %></span>
217
- <% end %>
218
- </div>
219
- <% if fs["human_note"].to_s.present? %>
220
- <p class="ck-copy"><%= fs["human_note"] %></p>
221
- <% end %>
222
- </li>
223
- <% end %>
224
- </ol>
225
- </section>
226
- <% end %>
227
233
  <% end %>