completion-kit 0.7.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. checksums.yaml +4 -4
  2. data/app/assets/stylesheets/completion_kit/application.css +146 -325
  3. data/app/controllers/completion_kit/api/v1/base_controller.rb +14 -4
  4. data/app/controllers/completion_kit/api/v1/calibrations_controller.rb +2 -2
  5. data/app/controllers/completion_kit/api/v1/datasets_controller.rb +2 -2
  6. data/app/controllers/completion_kit/api/v1/metric_groups_controller.rb +2 -2
  7. data/app/controllers/completion_kit/api/v1/metric_versions_controller.rb +1 -1
  8. data/app/controllers/completion_kit/api/v1/metrics_controller.rb +5 -32
  9. data/app/controllers/completion_kit/api/v1/prompts_controller.rb +2 -2
  10. data/app/controllers/completion_kit/api/v1/provider_credentials_controller.rb +2 -2
  11. data/app/controllers/completion_kit/api/v1/runs_controller.rb +7 -7
  12. data/app/controllers/completion_kit/api/v1/tags_controller.rb +2 -2
  13. data/app/controllers/completion_kit/metrics_controller.rb +14 -37
  14. data/app/controllers/completion_kit/runs_controller.rb +2 -2
  15. data/app/jobs/completion_kit/generate_row_job.rb +2 -4
  16. data/app/jobs/completion_kit/judge_review_job.rb +4 -19
  17. data/app/models/completion_kit/metric.rb +0 -1
  18. data/app/models/completion_kit/metric_version.rb +35 -0
  19. data/app/models/completion_kit/run.rb +0 -1
  20. data/app/services/completion_kit/judge_service.rb +3 -10
  21. data/app/services/completion_kit/mcp_tools/metric_versions.rb +1 -1
  22. data/app/services/completion_kit/metric_variant_generator.rb +0 -13
  23. data/app/views/completion_kit/api_reference/_body.html.erb +2 -12
  24. data/app/views/completion_kit/api_reference/index.html.erb +4 -0
  25. data/app/views/completion_kit/calibrations/_trust_panel.html.erb +19 -12
  26. data/app/views/completion_kit/metrics/_form.html.erb +11 -12
  27. data/app/views/completion_kit/metrics/edit.html.erb +18 -0
  28. data/app/views/completion_kit/metrics/index.html.erb +0 -17
  29. data/app/views/completion_kit/metrics/show.html.erb +87 -105
  30. data/app/views/completion_kit/responses/show.html.erb +2 -2
  31. data/app/views/completion_kit/runs/show.html.erb +7 -7
  32. data/config/routes.rb +0 -4
  33. data/db/migrate/20260529000001_remove_few_shot_examples_from_completion_kit_metrics.rb +5 -0
  34. data/lib/completion_kit/version.rb +1 -1
  35. metadata +2 -1
@@ -1,9 +1,9 @@
1
1
  <% stats = local_assigns[:stats] %>
2
2
  <% metric = local_assigns[:metric] %>
3
3
  <% anchor = metric&.name&.parameterize %>
4
+ <% current_metric_version = metric && CompletionKit::MetricVersion.current.find_by(metric_id: metric.id) %>
4
5
  <% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric
5
6
  created_by = CompletionKit.config.username.presence || "operator"
6
- current_metric_version = CompletionKit::MetricVersion.current.find_by(metric_id: metric.id)
7
7
  verdicted_ids = if current_metric_version
8
8
  CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
9
9
  else
@@ -22,19 +22,26 @@
22
22
  end %>
23
23
 
24
24
  <p class="ck-trust-line ck-trust-line--<%= stats.gate %>">
25
- <span class="ck-trust-line__label"><%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true" %>Calibration</span>
26
25
  <% if stats.sample_size.zero? %>
27
- <span class="ck-trust-line__state">Not measured yet.</span>
28
- <span class="ck-trust-line__hint">Needs <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts on the judge's scores.<% if prior_version_verdicts > 0 %> (<%= pluralize(prior_version_verdicts, "verdict") %> on prior versions, tied to that version's history.)<% end %><% if target_response %>
29
- <%= link_to "Give a verdict →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-trust-line__link" %>
30
- <% else %> Run this metric on a dataset, then give a verdict.<% end %></span>
26
+ <span class="ck-trust-line__lead">Not measured yet.</span>
27
+ <span class="ck-trust-line__hint">Needs <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> human reviews of the judge's scores.<% if prior_version_verdicts > 0 %> (<%= pluralize(prior_version_verdicts, "earlier-version review") %> kept on file.)<% end %></span>
28
+ <% if target_response %>
29
+ <%= link_to "Review a judge's score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
30
+ <% end %>
31
31
  <% elsif stats.counter_only? %>
32
- <span class="ck-trust-line__counter"><%= stats.sample_size %>/<%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></span>
33
- <span class="ck-trust-line__hint"><%= pluralize(stats.sample_size, "verdict") %><% if stats.short_to_target > 0 %> · <%= stats.short_to_target %> more before this can be measured<% end %><% if target_response %> · <%= link_to "Give another verdict →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-trust-line__link" %><% end %></span>
32
+ <span class="ck-cal-stat"><span class="ck-cal-stat__label">Human reviews</span> <strong><%= stats.sample_size %> / <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></strong></span>
33
+ <% if stats.short_to_target > 0 %><span class="ck-trust-line__hint"><%= stats.short_to_target %> more to report a rate</span><% end %>
34
+ <% if target_response %>
35
+ <%= link_to "Review another score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
36
+ <% end %>
34
37
  <% else %>
35
- <span class="ck-trust-line__score" title="Roughly how often the judge and the humans agreed.">~<%= (stats.agreement_point * 100).round %>%</span>
36
- <span class="ck-trust-line__margin" title="The range we're confident the true rate sits in.">±<%= (stats.margin * 100).round %> pt</span>
37
- <span class="ck-trust-line__gate" title="<%= stats.firm? ? 'Enough verdicts for a reliable read.' : 'Early read. Keep giving verdicts.' %>"><%= stats.firm? ? "settled" : "early" %></span>
38
- <span class="ck-trust-line__hint"><%= pluralize(stats.sample_size, "verdict") %><% if stats.borderline_rate && stats.borderline_rate > 0 %><% level = stats.borderline_rate > 0.30 ? "danger" : stats.borderline_rate > 0.15 ? "warning" : "ok" %> · <span class="ck-trust-line__borderline ck-trust-line__borderline--<%= level %>" title="<%= level == 'ok' ? '' : 'Reviewers said the rubric was unclear here.' %>"><%= (stats.borderline_rate * 100).round %>% unclear</span><% end %></span>
38
+ <span class="ck-cal-stat"><span class="ck-cal-stat__label">Agreement</span> <strong class="ck-trust-line__figure">~<%= (stats.agreement_point * 100).round %>%</strong></span>
39
+ <span class="ck-cal-stat"><span class="ck-cal-stat__label">Margin</span> ±<%= (stats.margin * 100).round %> pt</span>
40
+ <span class="ck-cal-stat"><span class="ck-cal-stat__label">Read</span> <%= stats.firm? ? "settled" : "early" %></span>
41
+ <span class="ck-cal-stat"><span class="ck-cal-stat__label">Sample</span> <%= stats.sample_size %></span>
42
+ <% if stats.borderline_rate && stats.borderline_rate > 0 %>
43
+ <% level = stats.borderline_rate > 0.30 ? "danger" : stats.borderline_rate > 0.15 ? "warning" : "ok" %>
44
+ <span class="ck-cal-stat"><span class="ck-cal-stat__label">Unclear</span> <span class="ck-trust-line__borderline ck-trust-line__borderline--<%= level %>"><%= (stats.borderline_rate * 100).round %>%</span></span>
45
+ <% end %>
39
46
  <% end %>
40
47
  </p>
@@ -40,20 +40,19 @@
40
40
  <% if suggestion %>
41
41
  <div class="ck-suggestion-banner" role="status">
42
42
  <div class="ck-suggestion-banner__body">
43
- <p class="ck-kicker">Proposed improvements</p>
44
- <p class="ck-meta-copy">Based on your disagreements, the model proposed these changes to the instruction and rubric. Apply pieces inline below, take everything at once, try again, or discard.</p>
43
+ <p class="ck-kicker ck-kicker--icon"><%= heroicon_tag "sparkles", variant: :outline, class: "ck-magic-icon", "aria-hidden": "true" %>Proposed changes</p>
44
+ <p class="ck-meta-copy">Based on human reviews, here are some proposed changes to the metric.</p>
45
45
  </div>
46
46
  <div class="ck-suggestion-banner__actions">
47
- <%= button_to "Try again", suggest_variants_metric_path(metric, back_to: "edit"),
48
- method: :post, form_class: "inline-block",
49
- class: ck_button_classes(:light, variant: :outline),
50
- title: "Discard these improvements and ask the model for fresh ones.",
51
- data: { turbo_confirm: "Replace these improvements with fresh ones from the model?" } %>
52
- <%= button_to "Discard", dismiss_suggestion_metric_path(metric, draft_id: suggestion.id, back_to: "edit"),
53
- method: :delete, form_class: "inline-block",
54
- class: ck_button_classes(:light, variant: :outline),
55
- data: { turbo_confirm: "Drop these improvements?" } %>
56
- <%= button_to "Take everything", publish_draft_metric_path(metric, draft_id: suggestion.id),
47
+ <%= button_to suggest_variants_metric_path(metric, back_to: "edit"),
48
+ method: :post, form_class: "inline-block", class: "ck-icon-btn",
49
+ title: "Try again", "aria-label": "Try again",
50
+ data: { turbo_confirm: "Replace these changes with fresh ones from the model?" } do %><%= heroicon_tag "arrow-path", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
51
+ <%= button_to dismiss_suggestion_metric_path(metric, draft_id: suggestion.id, back_to: "edit"),
52
+ method: :delete, form_class: "inline-block", class: "ck-icon-btn",
53
+ title: "Discard these changes", "aria-label": "Discard",
54
+ data: { turbo_confirm: "Drop these changes?" } do %><%= heroicon_tag "trash", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
55
+ <%= button_to "Apply all", publish_draft_metric_path(metric, draft_id: suggestion.id),
57
56
  method: :post, form_class: "inline-block",
58
57
  class: ck_button_classes(:dark) %>
59
58
  </div>
@@ -10,6 +10,24 @@
10
10
  </div>
11
11
  </section>
12
12
 
13
+ <% if CompletionKit.config.judge_calibration_enabled && @suggestion_draft.nil? && @edit_draft.nil? && @improve_disagreement_count.to_i.positive? %>
14
+ <div class="ck-suggestion-banner" role="status">
15
+ <div class="ck-suggestion-banner__body">
16
+ <p class="ck-kicker">Improve from reviews</p>
17
+ <p class="ck-meta-copy">Based on human reviews, the model can propose changes to this metric.</p>
18
+ </div>
19
+ <div class="ck-suggestion-banner__actions">
20
+ <%= button_to suggest_variants_metric_path(@metric, back_to: "edit"),
21
+ method: :post, form_class: "inline-block",
22
+ class: ck_button_classes(:light, variant: :outline) + " ck-button--sm",
23
+ data: { turbo_confirm: "Draft improvements to this metric from your human reviews? You can edit or apply them here before publishing." } do %>
24
+ <%= heroicon_tag "sparkles", variant: :outline, class: "ck-magic-icon", "aria-hidden": "true" %>
25
+ Suggest improvements
26
+ <% end %>
27
+ </div>
28
+ </div>
29
+ <% end %>
30
+
13
31
  <%= render "form",
14
32
  metric: @metric,
15
33
  suggestion_draft: @suggestion_draft,
@@ -28,23 +28,6 @@
28
28
  <tr onclick="window.location='<%= metric_path(metric) %>'" style="cursor: pointer;">
29
29
  <td>
30
30
  <%= link_to metric_path(metric), class: "ck-record-name" do %><strong><%= metric.name %></strong><% end %>
31
- <% if CompletionKit.config.judge_calibration_enabled %>
32
- <% s = CompletionKit::MetricCalibrationStats.for(metric) %>
33
- <p class="ck-metrics-table__trust" title="Calibration: how often this metric's scores match the humans who reviewed them.">
34
- <%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true", class: "ck-trust-icon" %>
35
- <span class="ck-metrics-table__trust-label">Calibration</span>
36
- <% if s.counter_only? %>
37
- <% if s.sample_size.zero? %>
38
- <span class="ck-metrics-table__trust-state">Not measured yet</span>
39
- <% else %>
40
- <%= s.sample_size %> / <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts
41
- <% end %>
42
- <% else %>
43
- <span class="ck-metrics-table__trust-rate" title="<%= s.firm? ? 'Settled read.' : 'Early read. Keep giving verdicts.' %>">~<%= (s.agreement_point * 100).round %>%</span>
44
- ±<%= (s.margin * 100).round %> pt · <%= s.firm? ? "settled" : "early" %>
45
- <% end %>
46
- </p>
47
- <% end %>
48
31
  <% if metric.tags.any? %>
49
32
  <div class="tag-marks-row">
50
33
  <%= render "completion_kit/tags/marks", tags: metric.tags %>
@@ -6,11 +6,6 @@
6
6
  <section class="ck-page-header">
7
7
  <div>
8
8
  <h1 class="ck-title"><%= @metric.name %></h1>
9
- <% if CompletionKit.config.judge_calibration_enabled %>
10
- <%= render "completion_kit/calibrations/trust_panel",
11
- stats: CompletionKit::MetricCalibrationStats.for(@metric),
12
- metric: @metric %>
13
- <% end %>
14
9
  <% if @metric.tags.any? %>
15
10
  <div class="tag-marks-row tag-marks-row--header">
16
11
  <%= render "completion_kit/tags/marks", tags: @metric.tags %>
@@ -18,23 +13,6 @@
18
13
  <% end %>
19
14
  </div>
20
15
  <div class="ck-actions">
21
- <% if CompletionKit.config.judge_calibration_enabled %>
22
- <% if @suggestion_draft || @edit_draft %>
23
- <% review_title = @suggestion_draft ? "The model proposed improvements based on your disagreements. Review and apply what you want." : "An unpublished draft of this metric is saved. Review and publish, or keep editing." %>
24
- <%= link_to "Review changes →", edit_metric_path(@metric),
25
- class: ck_button_classes(:dark),
26
- title: review_title %>
27
- <% elsif @improve_disagreement_count.positive? %>
28
- <%= button_to "Improve the metric", suggest_variants_metric_path(@metric),
29
- method: :post, form_class: "inline-block",
30
- class: ck_button_classes(:light, variant: :outline),
31
- title: "Ask the model to suggest improvements to this metric's instruction and rubric based on the disagreements collected so far.",
32
- data: { turbo_confirm: "Ask the model for suggested improvements based on the disagreements collected so far?" } %>
33
- <% else %>
34
- <button type="button" class="<%= ck_button_classes(:light, variant: :outline) %>" disabled
35
- title="Mark at least one case as Disagree before the model can suggest a change.">Improve the metric</button>
36
- <% end %>
37
- <% end %>
38
16
  <%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
39
17
  </div>
40
18
  </section>
@@ -64,15 +42,15 @@
64
42
  </div>
65
43
  </section>
66
44
 
67
- <% if CompletionKit.config.judge_calibration_enabled && @versions.size > 1 %>
45
+ <% if CompletionKit.config.judge_calibration_enabled && @versions.any? %>
68
46
  <% predecessor_of = @versions.index_with { |v| @versions.detect { |o| o.version_number < v.version_number } } %>
69
- <% version_changed = ->(v, pred) { pred && (pred.instruction.to_s != v.instruction.to_s || pred.rubric_bands != v.rubric_bands) } %>
70
47
  <section class="ck-card ck-card--spaced">
71
48
  <p class="ck-kicker">Versions</p>
72
49
  <table class="ck-results-table ck-metric-versions-table">
73
50
  <thead>
74
51
  <tr>
75
52
  <th scope="col">Version</th>
53
+ <th scope="col">&Delta; Change</th>
76
54
  <th scope="col">Source</th>
77
55
  <th scope="col">Created</th>
78
56
  </tr>
@@ -86,28 +64,35 @@
86
64
  <div class="ck-version-cell__label">
87
65
  <strong><%= v.version_label %></strong>
88
66
  <% if v.current? %>
89
- <span class="ck-chip">Published</span>
67
+ <span class="ck-version-state ck-version-state--live">Published</span>
90
68
  <% elsif v.draft? %>
69
+ <span class="ck-version-state">Draft</span>
91
70
  <%= button_to "Publish", publish_draft_metric_path(@metric, draft_id: v.id),
92
71
  method: :post, form_class: "inline-block",
93
- class: "ck-chip ck-chip--publish" %>
72
+ class: "ck-chip ck-chip--cta" %>
94
73
  <% else %>
74
+ <span class="ck-version-state">Past</span>
95
75
  <%= button_to "Make current", publish_draft_metric_path(@metric, draft_id: v.id),
96
76
  method: :post, form_class: "inline-block",
97
77
  class: "ck-chip ck-chip--publish",
98
- data: { turbo_confirm: "Roll the live judge back to #{v.version_label}? Calibration verdicts collected against the current version stay tied to it." } %>
78
+ data: { turbo_confirm: "Make #{v.version_label} the version to use? #{v.version_label} will be used in test runs using this metric now. Reviews you have already given stay with the version they were made against." } %>
99
79
  <% end %>
100
80
  </div>
101
- <% if version_changed.call(v, pred) %>
102
- <button type="button" class="ck-cell-link ck-cell-link--delta"
103
- title="What changed from #{pred.version_label}"
104
- onclick="document.getElementById('ck-mvdiff-<%= v.id %>').showModal()">&Delta;</button>
105
- <% end %>
106
81
  </div>
107
82
  </td>
83
+ <td>
84
+ <% summary = v.change_summary_against(pred) %>
85
+ <% if summary %>
86
+ <button type="button" class="ck-change-link ck-change-link--<%= summary[:magnitude] %>"
87
+ title="Compare with <%= pred.version_label %>"
88
+ onclick="document.getElementById('ck-mvdiff-<%= v.id %>').showModal()"><%= summary[:label] %></button>
89
+ <% else %>
90
+ <span class="ck-meta-copy">—</span>
91
+ <% end %>
92
+ </td>
108
93
  <td>
109
94
  <% source_label, source_class = case v.source
110
- when "suggestion" then ["AI suggestion", "ck-source-chip ck-source-chip--ai"]
95
+ when "suggestion" then ["AI calibration", "ck-source-chip ck-source-chip--ai"]
111
96
  when "edit" then ["Manual edit", "ck-source-chip ck-source-chip--manual"]
112
97
  when "revert" then ["Reverted", "ck-source-chip ck-source-chip--revert"]
113
98
  else ["Original", "ck-source-chip ck-source-chip--initial"]
@@ -115,7 +100,15 @@
115
100
  <span class="<%= source_class %>"><%= source_label %></span>
116
101
  </td>
117
102
  <td class="ck-meta-copy">
118
- <time datetime="<%= v.created_at.utc.iso8601 %>" data-relative-time><%= time_ago_in_words(v.created_at) %> ago</time>
103
+ <div class="ck-version-created">
104
+ <time datetime="<%= v.created_at.utc.iso8601 %>" data-relative-time><%= time_ago_in_words(v.created_at) %> ago</time>
105
+ <% if v.draft? %>
106
+ <%= button_to dismiss_suggestion_metric_path(@metric, draft_id: v.id),
107
+ method: :delete, form_class: "inline-block", class: "ck-icon-btn",
108
+ title: "Discard draft #{v.version_label}", "aria-label": "Discard draft #{v.version_label}",
109
+ data: { turbo_confirm: "Discard draft #{v.version_label}? This can't be undone." } do %><%= heroicon_tag "trash", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
110
+ <% end %>
111
+ </div>
119
112
  </td>
120
113
  </tr>
121
114
  <% end %>
@@ -125,7 +118,7 @@
125
118
 
126
119
  <% @versions.each do |v| %>
127
120
  <% pred = predecessor_of[v] %>
128
- <% next unless version_changed.call(v, pred) %>
121
+ <% next unless v.change_summary_against(pred) %>
129
122
  <dialog id="ck-mvdiff-<%= v.id %>" class="ck-modal" onclick="if(event.target===this)this.close()">
130
123
  <article class="ck-modal__panel" tabindex="-1" onclick="event.stopPropagation()">
131
124
  <header class="ck-modal__header">
@@ -148,85 +141,74 @@
148
141
  </div>
149
142
  </div>
150
143
  <% end %>
151
- <% if pred.rubric_bands != v.rubric_bands %>
144
+ <% pred_bands = CompletionKit::Metric.normalize_rubric_bands(pred.rubric_bands) %>
145
+ <% v_bands = CompletionKit::Metric.normalize_rubric_bands(v.rubric_bands) %>
146
+ <% if pred_bands != v_bands %>
152
147
  <p class="ck-kicker ck-kicker--inset">Rubric changes</p>
153
148
  <%= render "completion_kit/metrics/rubric_diff",
154
- current_bands: pred.rubric_bands || [],
155
- draft_bands: v.rubric_bands || [] %>
149
+ current_bands: pred_bands,
150
+ draft_bands: v_bands %>
156
151
  <% end %>
157
152
  </div>
153
+ <footer class="ck-modal__footer ck-modal__footer--split">
154
+ <% if v.current? %>
155
+ <span class="ck-modal__foot-note">This is the metric's published version.</span>
156
+ <% elsif v.draft? %>
157
+ <span class="ck-modal__foot-note">Happy with it? Publish to use <%= v.version_label %> for this metric from now on. Tweak it with Edit.</span>
158
+ <span class="ck-modal__foot-actions">
159
+ <%= button_to dismiss_suggestion_metric_path(@metric, draft_id: v.id),
160
+ method: :delete, form_class: "inline-block", class: "ck-icon-btn",
161
+ title: "Discard draft #{v.version_label}", "aria-label": "Discard draft #{v.version_label}",
162
+ data: { turbo_confirm: "Discard draft #{v.version_label}? This can't be undone." } do %><%= heroicon_tag "trash", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
163
+ <%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
164
+ <%= button_to "Publish #{v.version_label} →", publish_draft_metric_path(@metric, draft_id: v.id),
165
+ method: :post, form_class: "inline-block", class: ck_button_classes(:dark) %>
166
+ </span>
167
+ <% else %>
168
+ <span class="ck-modal__foot-note">Roll this metric back to this version.</span>
169
+ <%= button_to "Make #{v.version_label} current →", publish_draft_metric_path(@metric, draft_id: v.id),
170
+ method: :post, form_class: "inline-block", class: ck_button_classes(:dark),
171
+ data: { turbo_confirm: "Make #{v.version_label} the version to use? #{v.version_label} will be used in test runs using this metric now. Reviews you have already given stay with the version they were made against." } %>
172
+ <% end %>
173
+ </footer>
158
174
  </article>
159
175
  </dialog>
160
176
  <% end %>
161
177
  <% end %>
162
178
 
163
- <% if CompletionKit.config.judge_calibration_enabled && @disagreements.any? %>
179
+ <% if CompletionKit.config.judge_calibration_enabled %>
164
180
  <section class="ck-card ck-card--spaced">
165
- <div class="ck-prompt-preview__header">
166
- <p class="ck-kicker">Cases to learn from</p>
167
- <span class="ck-chip"><%= pluralize(@disagreements.size, "case") %></span>
168
- </div>
169
- <% mixed_versions = @disagreements.any? { |c| c.metric_version_id != @published_metric_version.id } %>
170
- <p class="ck-meta-copy">Cases where a reviewer's score didn't match the judge's. Pin useful ones with <strong>Remember this</strong> so the judge sees them next time it grades<%= " (pins flow into the current judge regardless of which version produced the verdict)" if mixed_versions %>.</p>
171
- <% existing_ids = Array(@metric.few_shot_examples).map { |fs| fs["calibration_id"] } %>
172
- <ul class="ck-disagreement-list">
173
- <% @disagreements.each do |cal| %>
174
- <% review = cal.response.reviews.find { |r| r.metric_id == @metric.id } %>
175
- <% already = existing_ids.include?(cal.id) %>
176
- <% cal_metric_version = cal.metric_version %>
177
- <% on_current = cal_metric_version&.id == @published_metric_version.id %>
178
- <li class="ck-disagreement<%= " ck-disagreement--remembered" if already %><%= " ck-disagreement--stale" unless on_current %>">
179
- <div class="ck-disagreement__head">
180
- <div class="ck-disagreement__scores">
181
- <% if cal_metric_version && mixed_versions %>
182
- <span class="ck-source-chip <%= on_current ? "ck-source-chip--current" : "ck-source-chip--past" %>" title="<%= on_current ? "Verdict on the live judge version." : "Verdict on a superseded judge version." %>"><%= cal_metric_version.version_label %></span>
183
- <% end %>
184
- <span class="ck-disagreement__scores-label">Judge</span>
185
- <% if review&.ai_score %>
186
- <span class="<%= ck_badge_classes(ck_score_kind(review.ai_score.to_f)) %>"><%= review.ai_score %></span>
187
- <% else %>
188
- <span class="ck-meta-copy">—</span>
189
- <% end %>
190
- <span class="ck-disagreement__scores-arrow">→</span>
191
- <span class="ck-disagreement__scores-label">Human</span>
192
- <% if cal.corrected_score %>
193
- <span class="<%= ck_badge_classes(ck_score_kind(cal.corrected_score.to_f)) %>"><%= cal.corrected_score %></span>
194
- <% else %>
195
- <span class="ck-meta-copy">—</span>
196
- <% end %>
197
- </div>
198
- <div class="ck-disagreement__action">
199
- <% if already %>
200
- <%= button_to "Forget",
201
- remove_few_shot_metric_path(@metric, calibration_id: cal.id),
202
- method: :delete,
203
- form_class: "inline-block",
204
- class: ck_button_classes(:light, variant: :outline),
205
- title: "Stop showing this case to the judge.",
206
- data: { turbo_confirm: "Stop showing this case to the judge?" } %>
207
- <span class="ck-chip ck-chip--done" title="The judge sees this row when it grades for this metric.">Remembered</span>
208
- <% else %>
209
- <%= button_to "Remember this",
210
- add_few_shot_metric_path(@metric, calibration_id: cal.id),
211
- method: :post,
212
- form_class: "inline-block",
213
- class: ck_button_classes(:light, variant: :outline),
214
- title: "Pin this case so the judge sees it next time it grades for this metric." %>
215
- <% end %>
216
- </div>
217
- </div>
218
- <% if cal.note.to_s.present? %>
219
- <p class="ck-disagreement__note"><%= cal.note %></p>
220
- <% end %>
221
- <p class="ck-disagreement__source ck-meta-copy">
222
- <%= link_to run_response_path(cal.response.run, cal.response, anchor: @metric.name.parameterize),
223
- class: "ck-disagreement__source-link" do %>
224
- <% case_display = cal.response.row_index.nil? ? "##{cal.response.id}" : (cal.response.row_index + 1).to_s %>
225
- View case <%= case_display %> in <%= cal.response.run.name.to_s.truncate(50) %> →
226
- <% end %>
227
- </p>
228
- </li>
229
- <% end %>
230
- </ul>
181
+ <p class="ck-kicker">Calibration</p>
182
+ <p class="ck-meta-copy">This is a measure of how often the judge's scores match a human reviewer. Review its scores to build that signal; the scores you disagree with become the cases the model learns from when you improve the metric.</p>
183
+ <%= render "completion_kit/calibrations/trust_panel",
184
+ stats: CompletionKit::MetricCalibrationStats.for(@metric),
185
+ metric: @metric %>
186
+ <% draft = @suggestion_draft || @edit_draft %>
187
+ <% if draft %>
188
+ <div class="ck-cal-foot">
189
+ <span class="ck-cal-foot__note">A draft improvement (<%= draft.version_label %>) is waiting in the Versions table above. Open its change to compare, then Publish to use it.</span>
190
+ </div>
191
+ <% elsif @improve_disagreement_count.positive? %>
192
+ <div class="ck-cal-foot">
193
+ <span class="ck-cal-foot__note"><%= pluralize(@improve_disagreement_count, "case") %> where a reviewer's score didn't match the judge.</span>
194
+ <%= button_to suggest_variants_metric_path(@metric),
195
+ method: :post, form_class: "inline-block",
196
+ class: ck_button_classes(:light, variant: :outline) + " ck-button--sm",
197
+ data: { turbo_confirm: "Draft improvements to this metric from your human reviews? It stays a draft until you compare it and publish." } do %>
198
+ <%= heroicon_tag "sparkles", variant: :outline, class: "ck-magic-icon", "aria-hidden": "true" %>
199
+ Suggest improvements
200
+ <% end %>
201
+ </div>
202
+ <% end %>
231
203
  </section>
232
204
  <% end %>
205
+
206
+ <% if params[:show_change].present? %>
207
+ <script>
208
+ (function () {
209
+ var dialog = document.getElementById("ck-mvdiff-<%= params[:show_change].to_i %>");
210
+ if (dialog && typeof dialog.showModal === "function") dialog.showModal();
211
+ })();
212
+ </script>
213
+ <% end %>
214
+
@@ -105,7 +105,7 @@
105
105
  <span class="ck-review-card__metric"><% if review.metric %><%= link_to review.metric_name, metric_path(review.metric), class: "ck-link" %><% else %><%= review.metric_name %><% end %></span>
106
106
  <div class="ck-inline">
107
107
  <% if review_version %>
108
- <span class="ck-source-chip <%= stale ? "ck-source-chip--past" : "ck-source-chip--current" %>" title="<%= stale ? "Score produced by #{review_version.version_label} of this metric. The live judge has changed since." : "Score produced by the live judge (#{review_version.version_label})." %>"><%= review_version.version_label %></span>
108
+ <span class="ck-source-chip <%= stale ? "ck-source-chip--past" : "ck-source-chip--current" %>" title="<%= stale ? "Scored against #{review_version.version_label} of this metric. The metric has been republished since." : "Scored against the metric's current version (#{review_version.version_label})." %>"><%= review_version.version_label %></span>
109
109
  <% end %>
110
110
  <% if review.ai_score %>
111
111
  <% 5.times do |i| %>
@@ -117,7 +117,7 @@
117
117
  </div>
118
118
  </div>
119
119
  <% if stale %>
120
- <p class="ck-review-card__stale-note">Scored against a superseded version of this metric. The live judge may score this differently.</p>
120
+ <p class="ck-review-card__stale-note">Scored against a superseded version of this metric. Its current version may score this differently.</p>
121
121
  <% end %>
122
122
  <% if review.ai_feedback.present? %>
123
123
  <p class="ck-review-card__feedback"><%= review.ai_feedback %></p>
@@ -23,25 +23,25 @@
23
23
  <% if stale_summary.any? %>
24
24
  <div class="ck-stale-versions-banner" role="status">
25
25
  <div class="ck-stale-versions-banner__body">
26
- <p class="ck-kicker">Stale judge versions</p>
26
+ <p class="ck-kicker">Stale metric versions</p>
27
27
  <p class="ck-meta-copy">
28
28
  This run was scored against metric versions that are no longer live.
29
29
  <% stale_summary.values.each_with_index do |s, i| %>
30
30
  <%= ", " if i > 0 %><strong><%= s[:metric_name] %></strong> (scored by <%= s[:scored_labels].join(", ") %>; live is <%= s[:current_label] %>)<% end %>.
31
- Re-run to refresh the scores with the current judge.
31
+ Re-run to refresh the scores with the current metrics.
32
32
  </p>
33
33
  </div>
34
34
  <% if @run.status == "completed" %>
35
35
  <%= button_to "Re-run from scratch",
36
36
  rerun_run_path(@run), method: :post,
37
37
  class: ck_button_classes(:light, variant: :outline), form_class: "inline-block",
38
- title: "Create a new run that regenerates responses and grades them with the current judge.",
39
- data: { turbo_confirm: "Create a new run with fresh responses and the current judge? The original run stays as a record." } %>
40
- <%= button_to "Re-grade with current judge",
38
+ title: "Create a new run that regenerates responses and grades them with the current metrics.",
39
+ data: { turbo_confirm: "Create a new run with fresh responses and the current metrics? The original run stays as a record." } %>
40
+ <%= button_to "Re-grade with current metrics",
41
41
  regrade_run_path(@run), method: :post,
42
42
  class: ck_button_classes(:dark), form_class: "inline-block",
43
- title: "Re-judge this run's existing responses against the current judge. Faster and cheaper than re-running.",
44
- data: { turbo_confirm: "Re-judge this run's existing responses against the current judge?" } %>
43
+ title: "Re-grade this run's existing responses against the current metrics. Faster and cheaper than re-running.",
44
+ data: { turbo_confirm: "Re-grade this run's existing responses against the current metrics?" } %>
45
45
  <% end %>
46
46
  </div>
47
47
  <% end %>
data/config/routes.rb CHANGED
@@ -19,8 +19,6 @@ CompletionKit::Engine.routes.draw do
19
19
  post "starters/:key/dismiss", to: "metrics#dismiss_starter", as: :dismiss_starter
20
20
  end
21
21
  member do
22
- post :add_few_shot
23
- delete :remove_few_shot
24
22
  post :publish_draft
25
23
  post :suggest_variants
26
24
  delete :dismiss_suggestion
@@ -89,8 +87,6 @@ CompletionKit::Engine.routes.draw do
89
87
  end
90
88
  member do
91
89
  post :suggest_variants
92
- post :add_few_shot
93
- delete :remove_few_shot
94
90
  end
95
91
  end
96
92
  resources :metric_groups
@@ -0,0 +1,5 @@
1
+ class RemoveFewShotExamplesFromCompletionKitMetrics < ActiveRecord::Migration[8.1]
2
+ def change
3
+ remove_column :completion_kit_metrics, :few_shot_examples, :text
4
+ end
5
+ end
@@ -1,3 +1,3 @@
1
1
  module CompletionKit
2
- VERSION = "0.7.0"
2
+ VERSION = "0.9.0"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: completion-kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.0
4
+ version: 0.9.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Damien Bastin
@@ -429,6 +429,7 @@ files:
429
429
  - db/migrate/20260525000001_add_version_number_and_published_at_to_judge_versions.rb
430
430
  - db/migrate/20260528000001_rename_judge_version_to_metric_version.rb
431
431
  - db/migrate/20260528000002_add_metric_version_to_reviews.rb
432
+ - db/migrate/20260529000001_remove_few_shot_examples_from_completion_kit_metrics.rb
432
433
  - lib/completion-kit.rb
433
434
  - lib/completion_kit.rb
434
435
  - lib/completion_kit/concurrency_check.rb