completion-kit 0.8.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. checksums.yaml +4 -4
  2. data/app/assets/stylesheets/completion_kit/application.css +203 -334
  3. data/app/controllers/completion_kit/api/v1/metrics_controller.rb +1 -28
  4. data/app/controllers/completion_kit/metrics_controller.rb +30 -36
  5. data/app/controllers/completion_kit/runs_controller.rb +2 -2
  6. data/app/jobs/completion_kit/judge_review_job.rb +9 -16
  7. data/app/models/completion_kit/metric.rb +0 -1
  8. data/app/models/completion_kit/metric_version.rb +35 -0
  9. data/app/services/completion_kit/judge_service.rb +19 -10
  10. data/app/services/completion_kit/mcp_tools/metric_versions.rb +1 -1
  11. data/app/services/completion_kit/metric_calibration_examples.rb +56 -0
  12. data/app/services/completion_kit/metric_variant_generator.rb +0 -49
  13. data/app/views/completion_kit/api_reference/_body.html.erb +2 -12
  14. data/app/views/completion_kit/calibrations/_trust_panel.html.erb +25 -19
  15. data/app/views/completion_kit/metrics/_form.html.erb +11 -12
  16. data/app/views/completion_kit/metrics/_guiding_examples.html.erb +23 -0
  17. data/app/views/completion_kit/metrics/edit.html.erb +18 -0
  18. data/app/views/completion_kit/metrics/index.html.erb +5 -17
  19. data/app/views/completion_kit/metrics/show.html.erb +76 -100
  20. data/app/views/completion_kit/responses/show.html.erb +7 -5
  21. data/app/views/completion_kit/runs/show.html.erb +7 -7
  22. data/config/routes.rb +1 -4
  23. data/db/migrate/20260529000001_remove_few_shot_examples_from_completion_kit_metrics.rb +5 -0
  24. data/db/migrate/20260530000001_add_excluded_from_examples_to_completion_kit_calibrations.rb +5 -0
  25. data/lib/completion_kit/version.rb +1 -1
  26. data/lib/completion_kit.rb +2 -0
  27. metadata +5 -1
@@ -0,0 +1,23 @@
1
+ <div id="ck-guiding-<%= metric.id %>" class="ck-guiding">
2
+ <% if examples.any? %>
3
+ <div class="ck-guiding__head">
4
+ <p class="ck-kicker ck-kicker--inset">Guiding the judge</p>
5
+ <span class="ck-guiding__legend">Judge &rarr; Human</span>
6
+ </div>
7
+ <ul class="ck-guiding__list">
8
+ <% examples.each do |example| %>
9
+ <li class="ck-guiding__item">
10
+ <%= link_to run_response_path(example[:run_id], example[:response_id], anchor: metric.name.parameterize),
11
+ class: "ck-guiding__link", title: "Open this review" do %>
12
+ <span class="ck-guiding__output"><%= truncate(example[:output].to_s, length: 90) %></span>
13
+ <span class="ck-guiding__scores"><span class="ck-guiding__judge"><%= example[:judge_score].to_i %></span> &rarr; <span class="ck-guiding__human"><%= example[:human_score].to_i %></span></span>
14
+ <% end %>
15
+ <%= button_to exclude_example_metric_path(metric, calibration_id: example[:id]),
16
+ method: :post, form_class: "inline-block", class: "ck-icon-btn",
17
+ title: "Stop using this case", "aria-label": "Stop using this case",
18
+ data: { turbo_confirm: "Stop using this corrected case to guide the judge?" } do %><%= heroicon_tag "x-mark", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
19
+ </li>
20
+ <% end %>
21
+ </ul>
22
+ <% end %>
23
+ </div>
@@ -10,6 +10,24 @@
10
10
  </div>
11
11
  </section>
12
12
 
13
+ <% if CompletionKit.config.judge_calibration_enabled && @suggestion_draft.nil? && @edit_draft.nil? && @improve_disagreement_count.to_i.positive? %>
14
+ <div class="ck-suggestion-banner" role="status">
15
+ <div class="ck-suggestion-banner__body">
16
+ <p class="ck-kicker">Improve from reviews</p>
17
+ <p class="ck-meta-copy">Based on human reviews, the model can propose changes to this metric.</p>
18
+ </div>
19
+ <div class="ck-suggestion-banner__actions">
20
+ <%= button_to suggest_variants_metric_path(@metric, back_to: "edit"),
21
+ method: :post, form_class: "inline-block",
22
+ class: ck_button_classes(:light, variant: :outline) + " ck-button--sm",
23
+ data: { turbo_confirm: "Draft improvements to this metric from your human reviews? You can edit or apply them here before publishing." } do %>
24
+ <%= heroicon_tag "sparkles", variant: :outline, class: "ck-magic-icon", "aria-hidden": "true" %>
25
+ Suggest improvements
26
+ <% end %>
27
+ </div>
28
+ </div>
29
+ <% end %>
30
+
13
31
  <%= render "form",
14
32
  metric: @metric,
15
33
  suggestion_draft: @suggestion_draft,
@@ -18,6 +18,7 @@
18
18
  <thead>
19
19
  <tr>
20
20
  <th scope="col">Name</th>
21
+ <th scope="col">Version</th>
21
22
  <th scope="col">Instruction</th>
22
23
  <th scope="col">In groups</th>
23
24
  <th scope="col"></th>
@@ -28,29 +29,16 @@
28
29
  <tr onclick="window.location='<%= metric_path(metric) %>'" style="cursor: pointer;">
29
30
  <td>
30
31
  <%= link_to metric_path(metric), class: "ck-record-name" do %><strong><%= metric.name %></strong><% end %>
31
- <% if CompletionKit.config.judge_calibration_enabled %>
32
- <% s = CompletionKit::MetricCalibrationStats.for(metric) %>
33
- <p class="ck-metrics-table__trust" title="Calibration: how often this metric's scores match the humans who reviewed them.">
34
- <%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true", class: "ck-trust-icon" %>
35
- <span class="ck-metrics-table__trust-label">Calibration</span>
36
- <% if s.counter_only? %>
37
- <% if s.sample_size.zero? %>
38
- <span class="ck-metrics-table__trust-state">Not measured yet</span>
39
- <% else %>
40
- <%= s.sample_size %> / <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts
41
- <% end %>
42
- <% else %>
43
- <span class="ck-metrics-table__trust-rate" title="<%= s.firm? ? 'Settled read.' : 'Early read. Keep giving verdicts.' %>">~<%= (s.agreement_point * 100).round %>%</span>
44
- ±<%= (s.margin * 100).round %> pt · <%= s.firm? ? "settled" : "early" %>
45
- <% end %>
46
- </p>
47
- <% end %>
48
32
  <% if metric.tags.any? %>
49
33
  <div class="tag-marks-row">
50
34
  <%= render "completion_kit/tags/marks", tags: metric.tags %>
51
35
  </div>
52
36
  <% end %>
53
37
  </td>
38
+ <td data-label="Version">
39
+ <% v = @current_versions[metric.id] %>
40
+ <span class="ck-chip ck-chip--soft"><%= v ? v.version_label : "v1" %></span>
41
+ </td>
54
42
  <td data-label="Instruction" class="ck-meta-copy"><div class="ck-clamp-2"><%= metric.instruction.presence || "—" %></div></td>
55
43
  <td data-label="In groups">
56
44
  <% groups = metric.metric_groups %>
@@ -6,11 +6,6 @@
6
6
  <section class="ck-page-header">
7
7
  <div>
8
8
  <h1 class="ck-title"><%= @metric.name %></h1>
9
- <% if CompletionKit.config.judge_calibration_enabled %>
10
- <%= render "completion_kit/calibrations/trust_panel",
11
- stats: CompletionKit::MetricCalibrationStats.for(@metric),
12
- metric: @metric %>
13
- <% end %>
14
9
  <% if @metric.tags.any? %>
15
10
  <div class="tag-marks-row tag-marks-row--header">
16
11
  <%= render "completion_kit/tags/marks", tags: @metric.tags %>
@@ -18,23 +13,6 @@
18
13
  <% end %>
19
14
  </div>
20
15
  <div class="ck-actions">
21
- <% if CompletionKit.config.judge_calibration_enabled %>
22
- <% if @suggestion_draft || @edit_draft %>
23
- <% review_title = @suggestion_draft ? "The model proposed improvements based on your disagreements. Review and apply what you want." : "An unpublished draft of this metric is saved. Review and publish, or keep editing." %>
24
- <%= link_to "Review changes →", edit_metric_path(@metric),
25
- class: ck_button_classes(:dark),
26
- title: review_title %>
27
- <% elsif @improve_disagreement_count.positive? %>
28
- <%= button_to "Improve the metric", suggest_variants_metric_path(@metric),
29
- method: :post, form_class: "inline-block",
30
- class: ck_button_classes(:light, variant: :outline),
31
- title: "Ask the model to suggest improvements to this metric's instruction and rubric based on the disagreements collected so far.",
32
- data: { turbo_confirm: "Ask the model for suggested improvements based on the disagreements collected so far?" } %>
33
- <% else %>
34
- <button type="button" class="<%= ck_button_classes(:light, variant: :outline) %>" disabled
35
- title="Mark at least one case as Disagree before the model can suggest a change.">Improve the metric</button>
36
- <% end %>
37
- <% end %>
38
16
  <%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
39
17
  </div>
40
18
  </section>
@@ -64,9 +42,8 @@
64
42
  </div>
65
43
  </section>
66
44
 
67
- <% if CompletionKit.config.judge_calibration_enabled && @versions.size > 1 %>
45
+ <% if CompletionKit.config.judge_calibration_enabled && @versions.any? %>
68
46
  <% predecessor_of = @versions.index_with { |v| @versions.detect { |o| o.version_number < v.version_number } } %>
69
- <% version_changed = ->(v, pred) { pred && (pred.instruction.to_s != v.instruction.to_s || pred.rubric_bands != v.rubric_bands) } %>
70
47
  <section class="ck-card ck-card--spaced">
71
48
  <p class="ck-kicker">Versions</p>
72
49
  <table class="ck-results-table ck-metric-versions-table">
@@ -82,6 +59,7 @@
82
59
  <% pred = predecessor_of[v] %>
83
60
  <tr>
84
61
  <td>
62
+ <% summary = v.change_summary_against(pred) %>
85
63
  <div class="ck-version-cell">
86
64
  <div class="ck-version-cell__label">
87
65
  <strong><%= v.version_label %></strong>
@@ -95,19 +73,17 @@
95
73
  <%= button_to "Make current", publish_draft_metric_path(@metric, draft_id: v.id),
96
74
  method: :post, form_class: "inline-block",
97
75
  class: "ck-chip ck-chip--publish",
98
- data: { turbo_confirm: "Roll the live judge back to #{v.version_label}? Calibration verdicts collected against the current version stay tied to it." } %>
76
+ data: { turbo_confirm: "Make #{v.version_label} the version to use? #{v.version_label} will be used in test runs using this metric now. Reviews you have already given stay with the version they were made against." } %>
99
77
  <% end %>
100
78
  </div>
101
- <% if version_changed.call(v, pred) %>
102
- <button type="button" class="ck-cell-link ck-cell-link--delta"
103
- title="What changed from #{pred.version_label}"
104
- onclick="document.getElementById('ck-mvdiff-<%= v.id %>').showModal()">&Delta;</button>
79
+ <% if summary %>
80
+ <button type="button" class="ck-cell-link ck-cell-link--delta" title="What changed from <%= pred.version_label %>" onclick="document.getElementById('ck-mvdiff-<%= v.id %>').showModal()">&Delta;</button>
105
81
  <% end %>
106
82
  </div>
107
83
  </td>
108
84
  <td>
109
85
  <% source_label, source_class = case v.source
110
- when "suggestion" then ["AI suggestion", "ck-source-chip ck-source-chip--ai"]
86
+ when "suggestion" then ["AI calibration", "ck-source-chip ck-source-chip--ai"]
111
87
  when "edit" then ["Manual edit", "ck-source-chip ck-source-chip--manual"]
112
88
  when "revert" then ["Reverted", "ck-source-chip ck-source-chip--revert"]
113
89
  else ["Original", "ck-source-chip ck-source-chip--initial"]
@@ -115,7 +91,15 @@
115
91
  <span class="<%= source_class %>"><%= source_label %></span>
116
92
  </td>
117
93
  <td class="ck-meta-copy">
118
- <time datetime="<%= v.created_at.utc.iso8601 %>" data-relative-time><%= time_ago_in_words(v.created_at) %> ago</time>
94
+ <div class="ck-version-created">
95
+ <time datetime="<%= v.created_at.utc.iso8601 %>" data-relative-time><%= time_ago_in_words(v.created_at) %> ago</time>
96
+ <% if v.draft? %>
97
+ <%= button_to dismiss_suggestion_metric_path(@metric, draft_id: v.id),
98
+ method: :delete, form_class: "inline-block", class: "ck-icon-btn",
99
+ title: "Discard draft #{v.version_label}", "aria-label": "Discard draft #{v.version_label}",
100
+ data: { turbo_confirm: "Discard draft #{v.version_label}? This can't be undone." } do %><%= heroicon_tag "trash", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
101
+ <% end %>
102
+ </div>
119
103
  </td>
120
104
  </tr>
121
105
  <% end %>
@@ -125,7 +109,7 @@
125
109
 
126
110
  <% @versions.each do |v| %>
127
111
  <% pred = predecessor_of[v] %>
128
- <% next unless version_changed.call(v, pred) %>
112
+ <% next unless v.change_summary_against(pred) %>
129
113
  <dialog id="ck-mvdiff-<%= v.id %>" class="ck-modal" onclick="if(event.target===this)this.close()">
130
114
  <article class="ck-modal__panel" tabindex="-1" onclick="event.stopPropagation()">
131
115
  <header class="ck-modal__header">
@@ -148,85 +132,77 @@
148
132
  </div>
149
133
  </div>
150
134
  <% end %>
151
- <% if pred.rubric_bands != v.rubric_bands %>
135
+ <% pred_bands = CompletionKit::Metric.normalize_rubric_bands(pred.rubric_bands) %>
136
+ <% v_bands = CompletionKit::Metric.normalize_rubric_bands(v.rubric_bands) %>
137
+ <% if pred_bands != v_bands %>
152
138
  <p class="ck-kicker ck-kicker--inset">Rubric changes</p>
153
139
  <%= render "completion_kit/metrics/rubric_diff",
154
- current_bands: pred.rubric_bands || [],
155
- draft_bands: v.rubric_bands || [] %>
140
+ current_bands: pred_bands,
141
+ draft_bands: v_bands %>
156
142
  <% end %>
157
143
  </div>
144
+ <footer class="ck-modal__footer ck-modal__footer--split">
145
+ <% if v.current? %>
146
+ <span class="ck-modal__foot-note">This is the metric's published version.</span>
147
+ <% elsif v.draft? %>
148
+ <span class="ck-modal__foot-note">Happy with it? Publish to use <%= v.version_label %> for this metric from now on. Tweak it with Edit.</span>
149
+ <span class="ck-modal__foot-actions">
150
+ <%= button_to dismiss_suggestion_metric_path(@metric, draft_id: v.id),
151
+ method: :delete, form_class: "inline-block", class: "ck-icon-btn",
152
+ title: "Discard draft #{v.version_label}", "aria-label": "Discard draft #{v.version_label}",
153
+ data: { turbo_confirm: "Discard draft #{v.version_label}? This can't be undone." } do %><%= heroicon_tag "trash", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
154
+ <%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
155
+ <%= button_to "Publish #{v.version_label} →", publish_draft_metric_path(@metric, draft_id: v.id),
156
+ method: :post, form_class: "inline-block", class: ck_button_classes(:dark) %>
157
+ </span>
158
+ <% else %>
159
+ <span class="ck-modal__foot-note">Roll this metric back to this version.</span>
160
+ <%= button_to "Make #{v.version_label} current →", publish_draft_metric_path(@metric, draft_id: v.id),
161
+ method: :post, form_class: "inline-block", class: ck_button_classes(:dark),
162
+ data: { turbo_confirm: "Make #{v.version_label} the version to use? #{v.version_label} will be used in test runs using this metric now. Reviews you have already given stay with the version they were made against." } %>
163
+ <% end %>
164
+ </footer>
158
165
  </article>
159
166
  </dialog>
160
167
  <% end %>
161
168
  <% end %>
162
169
 
163
- <% if CompletionKit.config.judge_calibration_enabled && @disagreements.any? %>
170
+ <% if CompletionKit.config.judge_calibration_enabled %>
171
+ <% draft = @suggestion_draft || @edit_draft %>
164
172
  <section class="ck-card ck-card--spaced">
165
173
  <div class="ck-prompt-preview__header">
166
- <p class="ck-kicker">Cases to learn from</p>
167
- <span class="ck-chip"><%= pluralize(@disagreements.size, "case") %></span>
168
- </div>
169
- <% mixed_versions = @disagreements.any? { |c| c.metric_version_id != @published_metric_version.id } %>
170
- <p class="ck-meta-copy">Cases where a reviewer's score didn't match the judge's. Pin useful ones with <strong>Remember this</strong> so the judge sees them next time it grades<%= " (pins flow into the current judge regardless of which version produced the verdict)" if mixed_versions %>.</p>
171
- <% existing_ids = Array(@metric.few_shot_examples).map { |fs| fs["calibration_id"] } %>
172
- <ul class="ck-disagreement-list">
173
- <% @disagreements.each do |cal| %>
174
- <% review = cal.response.reviews.find { |r| r.metric_id == @metric.id } %>
175
- <% already = existing_ids.include?(cal.id) %>
176
- <% cal_metric_version = cal.metric_version %>
177
- <% on_current = cal_metric_version&.id == @published_metric_version.id %>
178
- <li class="ck-disagreement<%= " ck-disagreement--remembered" if already %><%= " ck-disagreement--stale" unless on_current %>">
179
- <div class="ck-disagreement__head">
180
- <div class="ck-disagreement__scores">
181
- <% if cal_metric_version && mixed_versions %>
182
- <span class="ck-source-chip <%= on_current ? "ck-source-chip--current" : "ck-source-chip--past" %>" title="<%= on_current ? "Verdict on the live judge version." : "Verdict on a superseded judge version." %>"><%= cal_metric_version.version_label %></span>
183
- <% end %>
184
- <span class="ck-disagreement__scores-label">Judge</span>
185
- <% if review&.ai_score %>
186
- <span class="<%= ck_badge_classes(ck_score_kind(review.ai_score.to_f)) %>"><%= review.ai_score %></span>
187
- <% else %>
188
- <span class="ck-meta-copy">—</span>
189
- <% end %>
190
- <span class="ck-disagreement__scores-arrow">→</span>
191
- <span class="ck-disagreement__scores-label">Human</span>
192
- <% if cal.corrected_score %>
193
- <span class="<%= ck_badge_classes(ck_score_kind(cal.corrected_score.to_f)) %>"><%= cal.corrected_score %></span>
194
- <% else %>
195
- <span class="ck-meta-copy">—</span>
196
- <% end %>
197
- </div>
198
- <div class="ck-disagreement__action">
199
- <% if already %>
200
- <%= button_to "Forget",
201
- remove_few_shot_metric_path(@metric, calibration_id: cal.id),
202
- method: :delete,
203
- form_class: "inline-block",
204
- class: ck_button_classes(:light, variant: :outline),
205
- title: "Stop showing this case to the judge.",
206
- data: { turbo_confirm: "Stop showing this case to the judge?" } %>
207
- <span class="ck-chip ck-chip--done" title="The judge sees this row when it grades for this metric.">Remembered</span>
208
- <% else %>
209
- <%= button_to "Remember this",
210
- add_few_shot_metric_path(@metric, calibration_id: cal.id),
211
- method: :post,
212
- form_class: "inline-block",
213
- class: ck_button_classes(:light, variant: :outline),
214
- title: "Pin this case so the judge sees it next time it grades for this metric." %>
215
- <% end %>
216
- </div>
217
- </div>
218
- <% if cal.note.to_s.present? %>
219
- <p class="ck-disagreement__note"><%= cal.note %></p>
220
- <% end %>
221
- <p class="ck-disagreement__source ck-meta-copy">
222
- <%= link_to run_response_path(cal.response.run, cal.response, anchor: @metric.name.parameterize),
223
- class: "ck-disagreement__source-link" do %>
224
- <% case_display = cal.response.row_index.nil? ? "##{cal.response.id}" : (cal.response.row_index + 1).to_s %>
225
- View case <%= case_display %> in <%= cal.response.run.name.to_s.truncate(50) %> →
226
- <% end %>
227
- </p>
228
- </li>
174
+ <p class="ck-kicker">Calibration</p>
175
+ <% if draft.nil? && @improve_disagreement_count.positive? %>
176
+ <%= button_to suggest_variants_metric_path(@metric),
177
+ method: :post, form_class: "inline-block",
178
+ class: ck_button_classes(:light, variant: :outline) + " ck-button--sm",
179
+ data: { turbo_confirm: "Draft improvements to this metric from your human reviews? It stays a draft until you compare it and publish." } do %>
180
+ <%= heroicon_tag "sparkles", variant: :outline, class: "ck-magic-icon", "aria-hidden": "true" %>
181
+ Suggest improvements
182
+ <% end %>
229
183
  <% end %>
230
- </ul>
184
+ </div>
185
+ <p class="ck-meta-copy">This is a measure of how often the judge's scores match a human reviewer. Review its scores to build that signal; the scores you disagree with become the cases the model learns from when you improve the metric.</p>
186
+ <%= render "completion_kit/calibrations/trust_panel",
187
+ stats: CompletionKit::MetricCalibrationStats.for(@metric),
188
+ metric: @metric %>
189
+ <% if CompletionKit.config.judge_examples_from_reviews %>
190
+ <%= render "completion_kit/metrics/guiding_examples", metric: @metric, examples: @guiding_examples %>
191
+ <% end %>
192
+ <% if draft %>
193
+ <div class="ck-cal-foot">
194
+ <span class="ck-cal-foot__note">A draft improvement (<%= draft.version_label %>) is waiting in the Versions table above. Open its change to compare, then Publish to use it.</span>
195
+ </div>
196
+ <% end %>
231
197
  </section>
232
198
  <% end %>
199
+
200
+ <% if params[:show_change].present? %>
201
+ <script>
202
+ (function () {
203
+ var dialog = document.getElementById("ck-mvdiff-<%= params[:show_change].to_i %>");
204
+ if (dialog && typeof dialog.showModal === "function") dialog.showModal();
205
+ })();
206
+ </script>
207
+ <% end %>
208
+
@@ -100,12 +100,17 @@
100
100
  <% @reviews.each do |review| %>
101
101
  <% review_version = review.metric_version %>
102
102
  <% stale = review.stale_against_current_judge? %>
103
- <div class="ck-review-card<%= " ck-review-card--stale" if stale %>" id="<%= review.metric&.name&.parameterize || "review-#{review.id}" %>">
103
+ <div class="ck-review-card" id="<%= review.metric&.name&.parameterize || "review-#{review.id}" %>">
104
104
  <div class="ck-review-card__header">
105
105
  <span class="ck-review-card__metric"><% if review.metric %><%= link_to review.metric_name, metric_path(review.metric), class: "ck-link" %><% else %><%= review.metric_name %><% end %></span>
106
106
  <div class="ck-inline">
107
107
  <% if review_version %>
108
- <span class="ck-source-chip <%= stale ? "ck-source-chip--past" : "ck-source-chip--current" %>" title="<%= stale ? "Score produced by #{review_version.version_label} of this metric. The live judge has changed since." : "Score produced by the live judge (#{review_version.version_label})." %>"><%= review_version.version_label %></span>
108
+ <% if stale %>
109
+ <% current_version = CompletionKit::MetricVersion.current.find_by(metric_id: review.metric_id) %>
110
+ <span class="ck-source-chip ck-source-chip--past" title="Scored on <%= review_version.version_label %>; the metric is now on <%= current_version.version_label %>, which may score this differently."><%= review_version.version_label %> &rarr; <%= current_version.version_label %></span>
111
+ <% else %>
112
+ <span class="ck-source-chip ck-source-chip--current" title="Scored on the metric's current version (<%= review_version.version_label %>)."><%= review_version.version_label %></span>
113
+ <% end %>
109
114
  <% end %>
110
115
  <% if review.ai_score %>
111
116
  <% 5.times do |i| %>
@@ -116,9 +121,6 @@
116
121
  <% end %>
117
122
  </div>
118
123
  </div>
119
- <% if stale %>
120
- <p class="ck-review-card__stale-note">Scored against a superseded version of this metric. The live judge may score this differently.</p>
121
- <% end %>
122
124
  <% if review.ai_feedback.present? %>
123
125
  <p class="ck-review-card__feedback"><%= review.ai_feedback %></p>
124
126
  <% end %>
@@ -23,25 +23,25 @@
23
23
  <% if stale_summary.any? %>
24
24
  <div class="ck-stale-versions-banner" role="status">
25
25
  <div class="ck-stale-versions-banner__body">
26
- <p class="ck-kicker">Stale judge versions</p>
26
+ <p class="ck-kicker">Stale metric versions</p>
27
27
  <p class="ck-meta-copy">
28
28
  This run was scored against metric versions that are no longer live.
29
29
  <% stale_summary.values.each_with_index do |s, i| %>
30
30
  <%= ", " if i > 0 %><strong><%= s[:metric_name] %></strong> (scored by <%= s[:scored_labels].join(", ") %>; live is <%= s[:current_label] %>)<% end %>.
31
- Re-run to refresh the scores with the current judge.
31
+ Re-run to refresh the scores with the current metrics.
32
32
  </p>
33
33
  </div>
34
34
  <% if @run.status == "completed" %>
35
35
  <%= button_to "Re-run from scratch",
36
36
  rerun_run_path(@run), method: :post,
37
37
  class: ck_button_classes(:light, variant: :outline), form_class: "inline-block",
38
- title: "Create a new run that regenerates responses and grades them with the current judge.",
39
- data: { turbo_confirm: "Create a new run with fresh responses and the current judge? The original run stays as a record." } %>
40
- <%= button_to "Re-grade with current judge",
38
+ title: "Create a new run that regenerates responses and grades them with the current metrics.",
39
+ data: { turbo_confirm: "Create a new run with fresh responses and the current metrics? The original run stays as a record." } %>
40
+ <%= button_to "Re-grade with current metrics",
41
41
  regrade_run_path(@run), method: :post,
42
42
  class: ck_button_classes(:dark), form_class: "inline-block",
43
- title: "Re-judge this run's existing responses against the current judge. Faster and cheaper than re-running.",
44
- data: { turbo_confirm: "Re-judge this run's existing responses against the current judge?" } %>
43
+ title: "Re-grade this run's existing responses against the current metrics. Faster and cheaper than re-running.",
44
+ data: { turbo_confirm: "Re-grade this run's existing responses against the current metrics?" } %>
45
45
  <% end %>
46
46
  </div>
47
47
  <% end %>
data/config/routes.rb CHANGED
@@ -19,11 +19,10 @@ CompletionKit::Engine.routes.draw do
19
19
  post "starters/:key/dismiss", to: "metrics#dismiss_starter", as: :dismiss_starter
20
20
  end
21
21
  member do
22
- post :add_few_shot
23
- delete :remove_few_shot
24
22
  post :publish_draft
25
23
  post :suggest_variants
26
24
  delete :dismiss_suggestion
25
+ post :exclude_example
27
26
  end
28
27
  end
29
28
  resources :metric_groups
@@ -89,8 +88,6 @@ CompletionKit::Engine.routes.draw do
89
88
  end
90
89
  member do
91
90
  post :suggest_variants
92
- post :add_few_shot
93
- delete :remove_few_shot
94
91
  end
95
92
  end
96
93
  resources :metric_groups
@@ -0,0 +1,5 @@
1
+ class RemoveFewShotExamplesFromCompletionKitMetrics < ActiveRecord::Migration[8.1]
2
+ def change
3
+ remove_column :completion_kit_metrics, :few_shot_examples, :text
4
+ end
5
+ end
@@ -0,0 +1,5 @@
1
+ class AddExcludedFromExamplesToCompletionKitCalibrations < ActiveRecord::Migration[8.1]
2
+ def change
3
+ add_column :completion_kit_calibrations, :excluded_from_examples, :boolean, null: false, default: false
4
+ end
5
+ end
@@ -1,3 +1,3 @@
1
1
  module CompletionKit
2
- VERSION = "0.8.0"
2
+ VERSION = "0.10.0"
3
3
  end
@@ -13,6 +13,7 @@ module CompletionKit
13
13
  attr_accessor :api_rate_limit, :web_rate_limit
14
14
  attr_accessor :allow_loopback_endpoints
15
15
  attr_accessor :judge_calibration_enabled
16
+ attr_accessor :judge_examples_from_reviews
16
17
 
17
18
  def initialize
18
19
  @openai_api_key = ENV['OPENAI_API_KEY']
@@ -29,6 +30,7 @@ module CompletionKit
29
30
 
30
31
  @allow_loopback_endpoints = true
31
32
  @judge_calibration_enabled = true
33
+ @judge_examples_from_reviews = false
32
34
 
33
35
  @api_reference_authentication_partial = "completion_kit/api_reference/authentication"
34
36
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: completion-kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.0
4
+ version: 0.10.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Damien Bastin
@@ -311,6 +311,7 @@ files:
311
311
  - app/services/completion_kit/mcp_tools/responses.rb
312
312
  - app/services/completion_kit/mcp_tools/runs.rb
313
313
  - app/services/completion_kit/mcp_tools/tags.rb
314
+ - app/services/completion_kit/metric_calibration_examples.rb
314
315
  - app/services/completion_kit/metric_calibration_stats.rb
315
316
  - app/services/completion_kit/metric_variant_generator.rb
316
317
  - app/services/completion_kit/model_discovery_service.rb
@@ -350,6 +351,7 @@ files:
350
351
  - app/views/completion_kit/metric_groups/new.html.erb
351
352
  - app/views/completion_kit/metric_groups/show.html.erb
352
353
  - app/views/completion_kit/metrics/_form.html.erb
354
+ - app/views/completion_kit/metrics/_guiding_examples.html.erb
353
355
  - app/views/completion_kit/metrics/_rubric_diff.html.erb
354
356
  - app/views/completion_kit/metrics/_rubric_hint.html.erb
355
357
  - app/views/completion_kit/metrics/_starter_card.html.erb
@@ -429,6 +431,8 @@ files:
429
431
  - db/migrate/20260525000001_add_version_number_and_published_at_to_judge_versions.rb
430
432
  - db/migrate/20260528000001_rename_judge_version_to_metric_version.rb
431
433
  - db/migrate/20260528000002_add_metric_version_to_reviews.rb
434
+ - db/migrate/20260529000001_remove_few_shot_examples_from_completion_kit_metrics.rb
435
+ - db/migrate/20260530000001_add_excluded_from_examples_to_completion_kit_calibrations.rb
432
436
  - lib/completion-kit.rb
433
437
  - lib/completion_kit.rb
434
438
  - lib/completion_kit/concurrency_check.rb