completion-kit 0.8.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/stylesheets/completion_kit/application.css +203 -334
- data/app/controllers/completion_kit/api/v1/metrics_controller.rb +1 -28
- data/app/controllers/completion_kit/metrics_controller.rb +30 -36
- data/app/controllers/completion_kit/runs_controller.rb +2 -2
- data/app/jobs/completion_kit/judge_review_job.rb +9 -16
- data/app/models/completion_kit/metric.rb +0 -1
- data/app/models/completion_kit/metric_version.rb +35 -0
- data/app/services/completion_kit/judge_service.rb +19 -10
- data/app/services/completion_kit/mcp_tools/metric_versions.rb +1 -1
- data/app/services/completion_kit/metric_calibration_examples.rb +56 -0
- data/app/services/completion_kit/metric_variant_generator.rb +0 -49
- data/app/views/completion_kit/api_reference/_body.html.erb +2 -12
- data/app/views/completion_kit/calibrations/_trust_panel.html.erb +25 -19
- data/app/views/completion_kit/metrics/_form.html.erb +11 -12
- data/app/views/completion_kit/metrics/_guiding_examples.html.erb +23 -0
- data/app/views/completion_kit/metrics/edit.html.erb +18 -0
- data/app/views/completion_kit/metrics/index.html.erb +5 -17
- data/app/views/completion_kit/metrics/show.html.erb +76 -100
- data/app/views/completion_kit/responses/show.html.erb +7 -5
- data/app/views/completion_kit/runs/show.html.erb +7 -7
- data/config/routes.rb +1 -4
- data/db/migrate/20260529000001_remove_few_shot_examples_from_completion_kit_metrics.rb +5 -0
- data/db/migrate/20260530000001_add_excluded_from_examples_to_completion_kit_calibrations.rb +5 -0
- data/lib/completion_kit/version.rb +1 -1
- data/lib/completion_kit.rb +2 -0
- metadata +5 -1
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
<div id="ck-guiding-<%= metric.id %>" class="ck-guiding">
|
|
2
|
+
<% if examples.any? %>
|
|
3
|
+
<div class="ck-guiding__head">
|
|
4
|
+
<p class="ck-kicker ck-kicker--inset">Guiding the judge</p>
|
|
5
|
+
<span class="ck-guiding__legend">Judge → Human</span>
|
|
6
|
+
</div>
|
|
7
|
+
<ul class="ck-guiding__list">
|
|
8
|
+
<% examples.each do |example| %>
|
|
9
|
+
<li class="ck-guiding__item">
|
|
10
|
+
<%= link_to run_response_path(example[:run_id], example[:response_id], anchor: metric.name.parameterize),
|
|
11
|
+
class: "ck-guiding__link", title: "Open this review" do %>
|
|
12
|
+
<span class="ck-guiding__output"><%= truncate(example[:output].to_s, length: 90) %></span>
|
|
13
|
+
<span class="ck-guiding__scores"><span class="ck-guiding__judge"><%= example[:judge_score].to_i %></span> → <span class="ck-guiding__human"><%= example[:human_score].to_i %></span></span>
|
|
14
|
+
<% end %>
|
|
15
|
+
<%= button_to exclude_example_metric_path(metric, calibration_id: example[:id]),
|
|
16
|
+
method: :post, form_class: "inline-block", class: "ck-icon-btn",
|
|
17
|
+
title: "Stop using this case", "aria-label": "Stop using this case",
|
|
18
|
+
data: { turbo_confirm: "Stop using this corrected case to guide the judge?" } do %><%= heroicon_tag "x-mark", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
|
|
19
|
+
</li>
|
|
20
|
+
<% end %>
|
|
21
|
+
</ul>
|
|
22
|
+
<% end %>
|
|
23
|
+
</div>
|
|
@@ -10,6 +10,24 @@
|
|
|
10
10
|
</div>
|
|
11
11
|
</section>
|
|
12
12
|
|
|
13
|
+
<% if CompletionKit.config.judge_calibration_enabled && @suggestion_draft.nil? && @edit_draft.nil? && @improve_disagreement_count.to_i.positive? %>
|
|
14
|
+
<div class="ck-suggestion-banner" role="status">
|
|
15
|
+
<div class="ck-suggestion-banner__body">
|
|
16
|
+
<p class="ck-kicker">Improve from reviews</p>
|
|
17
|
+
<p class="ck-meta-copy">Based on human reviews, the model can propose changes to this metric.</p>
|
|
18
|
+
</div>
|
|
19
|
+
<div class="ck-suggestion-banner__actions">
|
|
20
|
+
<%= button_to suggest_variants_metric_path(@metric, back_to: "edit"),
|
|
21
|
+
method: :post, form_class: "inline-block",
|
|
22
|
+
class: ck_button_classes(:light, variant: :outline) + " ck-button--sm",
|
|
23
|
+
data: { turbo_confirm: "Draft improvements to this metric from your human reviews? You can edit or apply them here before publishing." } do %>
|
|
24
|
+
<%= heroicon_tag "sparkles", variant: :outline, class: "ck-magic-icon", "aria-hidden": "true" %>
|
|
25
|
+
Suggest improvements
|
|
26
|
+
<% end %>
|
|
27
|
+
</div>
|
|
28
|
+
</div>
|
|
29
|
+
<% end %>
|
|
30
|
+
|
|
13
31
|
<%= render "form",
|
|
14
32
|
metric: @metric,
|
|
15
33
|
suggestion_draft: @suggestion_draft,
|
|
@@ -18,6 +18,7 @@
|
|
|
18
18
|
<thead>
|
|
19
19
|
<tr>
|
|
20
20
|
<th scope="col">Name</th>
|
|
21
|
+
<th scope="col">Version</th>
|
|
21
22
|
<th scope="col">Instruction</th>
|
|
22
23
|
<th scope="col">In groups</th>
|
|
23
24
|
<th scope="col"></th>
|
|
@@ -28,29 +29,16 @@
|
|
|
28
29
|
<tr onclick="window.location='<%= metric_path(metric) %>'" style="cursor: pointer;">
|
|
29
30
|
<td>
|
|
30
31
|
<%= link_to metric_path(metric), class: "ck-record-name" do %><strong><%= metric.name %></strong><% end %>
|
|
31
|
-
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
32
|
-
<% s = CompletionKit::MetricCalibrationStats.for(metric) %>
|
|
33
|
-
<p class="ck-metrics-table__trust" title="Calibration: how often this metric's scores match the humans who reviewed them.">
|
|
34
|
-
<%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true", class: "ck-trust-icon" %>
|
|
35
|
-
<span class="ck-metrics-table__trust-label">Calibration</span>
|
|
36
|
-
<% if s.counter_only? %>
|
|
37
|
-
<% if s.sample_size.zero? %>
|
|
38
|
-
<span class="ck-metrics-table__trust-state">Not measured yet</span>
|
|
39
|
-
<% else %>
|
|
40
|
-
<%= s.sample_size %> / <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts
|
|
41
|
-
<% end %>
|
|
42
|
-
<% else %>
|
|
43
|
-
<span class="ck-metrics-table__trust-rate" title="<%= s.firm? ? 'Settled read.' : 'Early read. Keep giving verdicts.' %>">~<%= (s.agreement_point * 100).round %>%</span>
|
|
44
|
-
±<%= (s.margin * 100).round %> pt · <%= s.firm? ? "settled" : "early" %>
|
|
45
|
-
<% end %>
|
|
46
|
-
</p>
|
|
47
|
-
<% end %>
|
|
48
32
|
<% if metric.tags.any? %>
|
|
49
33
|
<div class="tag-marks-row">
|
|
50
34
|
<%= render "completion_kit/tags/marks", tags: metric.tags %>
|
|
51
35
|
</div>
|
|
52
36
|
<% end %>
|
|
53
37
|
</td>
|
|
38
|
+
<td data-label="Version">
|
|
39
|
+
<% v = @current_versions[metric.id] %>
|
|
40
|
+
<span class="ck-chip ck-chip--soft"><%= v ? v.version_label : "v1" %></span>
|
|
41
|
+
</td>
|
|
54
42
|
<td data-label="Instruction" class="ck-meta-copy"><div class="ck-clamp-2"><%= metric.instruction.presence || "—" %></div></td>
|
|
55
43
|
<td data-label="In groups">
|
|
56
44
|
<% groups = metric.metric_groups %>
|
|
@@ -6,11 +6,6 @@
|
|
|
6
6
|
<section class="ck-page-header">
|
|
7
7
|
<div>
|
|
8
8
|
<h1 class="ck-title"><%= @metric.name %></h1>
|
|
9
|
-
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
10
|
-
<%= render "completion_kit/calibrations/trust_panel",
|
|
11
|
-
stats: CompletionKit::MetricCalibrationStats.for(@metric),
|
|
12
|
-
metric: @metric %>
|
|
13
|
-
<% end %>
|
|
14
9
|
<% if @metric.tags.any? %>
|
|
15
10
|
<div class="tag-marks-row tag-marks-row--header">
|
|
16
11
|
<%= render "completion_kit/tags/marks", tags: @metric.tags %>
|
|
@@ -18,23 +13,6 @@
|
|
|
18
13
|
<% end %>
|
|
19
14
|
</div>
|
|
20
15
|
<div class="ck-actions">
|
|
21
|
-
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
22
|
-
<% if @suggestion_draft || @edit_draft %>
|
|
23
|
-
<% review_title = @suggestion_draft ? "The model proposed improvements based on your disagreements. Review and apply what you want." : "An unpublished draft of this metric is saved. Review and publish, or keep editing." %>
|
|
24
|
-
<%= link_to "Review changes →", edit_metric_path(@metric),
|
|
25
|
-
class: ck_button_classes(:dark),
|
|
26
|
-
title: review_title %>
|
|
27
|
-
<% elsif @improve_disagreement_count.positive? %>
|
|
28
|
-
<%= button_to "Improve the metric", suggest_variants_metric_path(@metric),
|
|
29
|
-
method: :post, form_class: "inline-block",
|
|
30
|
-
class: ck_button_classes(:light, variant: :outline),
|
|
31
|
-
title: "Ask the model to suggest improvements to this metric's instruction and rubric based on the disagreements collected so far.",
|
|
32
|
-
data: { turbo_confirm: "Ask the model for suggested improvements based on the disagreements collected so far?" } %>
|
|
33
|
-
<% else %>
|
|
34
|
-
<button type="button" class="<%= ck_button_classes(:light, variant: :outline) %>" disabled
|
|
35
|
-
title="Mark at least one case as Disagree before the model can suggest a change.">Improve the metric</button>
|
|
36
|
-
<% end %>
|
|
37
|
-
<% end %>
|
|
38
16
|
<%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
|
|
39
17
|
</div>
|
|
40
18
|
</section>
|
|
@@ -64,9 +42,8 @@
|
|
|
64
42
|
</div>
|
|
65
43
|
</section>
|
|
66
44
|
|
|
67
|
-
<% if CompletionKit.config.judge_calibration_enabled && @versions.
|
|
45
|
+
<% if CompletionKit.config.judge_calibration_enabled && @versions.any? %>
|
|
68
46
|
<% predecessor_of = @versions.index_with { |v| @versions.detect { |o| o.version_number < v.version_number } } %>
|
|
69
|
-
<% version_changed = ->(v, pred) { pred && (pred.instruction.to_s != v.instruction.to_s || pred.rubric_bands != v.rubric_bands) } %>
|
|
70
47
|
<section class="ck-card ck-card--spaced">
|
|
71
48
|
<p class="ck-kicker">Versions</p>
|
|
72
49
|
<table class="ck-results-table ck-metric-versions-table">
|
|
@@ -82,6 +59,7 @@
|
|
|
82
59
|
<% pred = predecessor_of[v] %>
|
|
83
60
|
<tr>
|
|
84
61
|
<td>
|
|
62
|
+
<% summary = v.change_summary_against(pred) %>
|
|
85
63
|
<div class="ck-version-cell">
|
|
86
64
|
<div class="ck-version-cell__label">
|
|
87
65
|
<strong><%= v.version_label %></strong>
|
|
@@ -95,19 +73,17 @@
|
|
|
95
73
|
<%= button_to "Make current", publish_draft_metric_path(@metric, draft_id: v.id),
|
|
96
74
|
method: :post, form_class: "inline-block",
|
|
97
75
|
class: "ck-chip ck-chip--publish",
|
|
98
|
-
data: { turbo_confirm: "
|
|
76
|
+
data: { turbo_confirm: "Make #{v.version_label} the version to use? #{v.version_label} will be used in test runs using this metric now. Reviews you have already given stay with the version they were made against." } %>
|
|
99
77
|
<% end %>
|
|
100
78
|
</div>
|
|
101
|
-
<% if
|
|
102
|
-
<button type="button" class="ck-cell-link ck-cell-link--delta"
|
|
103
|
-
title="What changed from #{pred.version_label}"
|
|
104
|
-
onclick="document.getElementById('ck-mvdiff-<%= v.id %>').showModal()">Δ</button>
|
|
79
|
+
<% if summary %>
|
|
80
|
+
<button type="button" class="ck-cell-link ck-cell-link--delta" title="What changed from <%= pred.version_label %>" onclick="document.getElementById('ck-mvdiff-<%= v.id %>').showModal()">Δ</button>
|
|
105
81
|
<% end %>
|
|
106
82
|
</div>
|
|
107
83
|
</td>
|
|
108
84
|
<td>
|
|
109
85
|
<% source_label, source_class = case v.source
|
|
110
|
-
when "suggestion" then ["AI
|
|
86
|
+
when "suggestion" then ["AI calibration", "ck-source-chip ck-source-chip--ai"]
|
|
111
87
|
when "edit" then ["Manual edit", "ck-source-chip ck-source-chip--manual"]
|
|
112
88
|
when "revert" then ["Reverted", "ck-source-chip ck-source-chip--revert"]
|
|
113
89
|
else ["Original", "ck-source-chip ck-source-chip--initial"]
|
|
@@ -115,7 +91,15 @@
|
|
|
115
91
|
<span class="<%= source_class %>"><%= source_label %></span>
|
|
116
92
|
</td>
|
|
117
93
|
<td class="ck-meta-copy">
|
|
118
|
-
<
|
|
94
|
+
<div class="ck-version-created">
|
|
95
|
+
<time datetime="<%= v.created_at.utc.iso8601 %>" data-relative-time><%= time_ago_in_words(v.created_at) %> ago</time>
|
|
96
|
+
<% if v.draft? %>
|
|
97
|
+
<%= button_to dismiss_suggestion_metric_path(@metric, draft_id: v.id),
|
|
98
|
+
method: :delete, form_class: "inline-block", class: "ck-icon-btn",
|
|
99
|
+
title: "Discard draft #{v.version_label}", "aria-label": "Discard draft #{v.version_label}",
|
|
100
|
+
data: { turbo_confirm: "Discard draft #{v.version_label}? This can't be undone." } do %><%= heroicon_tag "trash", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
|
|
101
|
+
<% end %>
|
|
102
|
+
</div>
|
|
119
103
|
</td>
|
|
120
104
|
</tr>
|
|
121
105
|
<% end %>
|
|
@@ -125,7 +109,7 @@
|
|
|
125
109
|
|
|
126
110
|
<% @versions.each do |v| %>
|
|
127
111
|
<% pred = predecessor_of[v] %>
|
|
128
|
-
<% next unless
|
|
112
|
+
<% next unless v.change_summary_against(pred) %>
|
|
129
113
|
<dialog id="ck-mvdiff-<%= v.id %>" class="ck-modal" onclick="if(event.target===this)this.close()">
|
|
130
114
|
<article class="ck-modal__panel" tabindex="-1" onclick="event.stopPropagation()">
|
|
131
115
|
<header class="ck-modal__header">
|
|
@@ -148,85 +132,77 @@
|
|
|
148
132
|
</div>
|
|
149
133
|
</div>
|
|
150
134
|
<% end %>
|
|
151
|
-
<%
|
|
135
|
+
<% pred_bands = CompletionKit::Metric.normalize_rubric_bands(pred.rubric_bands) %>
|
|
136
|
+
<% v_bands = CompletionKit::Metric.normalize_rubric_bands(v.rubric_bands) %>
|
|
137
|
+
<% if pred_bands != v_bands %>
|
|
152
138
|
<p class="ck-kicker ck-kicker--inset">Rubric changes</p>
|
|
153
139
|
<%= render "completion_kit/metrics/rubric_diff",
|
|
154
|
-
current_bands:
|
|
155
|
-
draft_bands:
|
|
140
|
+
current_bands: pred_bands,
|
|
141
|
+
draft_bands: v_bands %>
|
|
156
142
|
<% end %>
|
|
157
143
|
</div>
|
|
144
|
+
<footer class="ck-modal__footer ck-modal__footer--split">
|
|
145
|
+
<% if v.current? %>
|
|
146
|
+
<span class="ck-modal__foot-note">This is the metric's published version.</span>
|
|
147
|
+
<% elsif v.draft? %>
|
|
148
|
+
<span class="ck-modal__foot-note">Happy with it? Publish to use <%= v.version_label %> for this metric from now on. Tweak it with Edit.</span>
|
|
149
|
+
<span class="ck-modal__foot-actions">
|
|
150
|
+
<%= button_to dismiss_suggestion_metric_path(@metric, draft_id: v.id),
|
|
151
|
+
method: :delete, form_class: "inline-block", class: "ck-icon-btn",
|
|
152
|
+
title: "Discard draft #{v.version_label}", "aria-label": "Discard draft #{v.version_label}",
|
|
153
|
+
data: { turbo_confirm: "Discard draft #{v.version_label}? This can't be undone." } do %><%= heroicon_tag "trash", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
|
|
154
|
+
<%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
|
|
155
|
+
<%= button_to "Publish #{v.version_label} →", publish_draft_metric_path(@metric, draft_id: v.id),
|
|
156
|
+
method: :post, form_class: "inline-block", class: ck_button_classes(:dark) %>
|
|
157
|
+
</span>
|
|
158
|
+
<% else %>
|
|
159
|
+
<span class="ck-modal__foot-note">Roll this metric back to this version.</span>
|
|
160
|
+
<%= button_to "Make #{v.version_label} current →", publish_draft_metric_path(@metric, draft_id: v.id),
|
|
161
|
+
method: :post, form_class: "inline-block", class: ck_button_classes(:dark),
|
|
162
|
+
data: { turbo_confirm: "Make #{v.version_label} the version to use? #{v.version_label} will be used in test runs using this metric now. Reviews you have already given stay with the version they were made against." } %>
|
|
163
|
+
<% end %>
|
|
164
|
+
</footer>
|
|
158
165
|
</article>
|
|
159
166
|
</dialog>
|
|
160
167
|
<% end %>
|
|
161
168
|
<% end %>
|
|
162
169
|
|
|
163
|
-
<% if CompletionKit.config.judge_calibration_enabled
|
|
170
|
+
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
171
|
+
<% draft = @suggestion_draft || @edit_draft %>
|
|
164
172
|
<section class="ck-card ck-card--spaced">
|
|
165
173
|
<div class="ck-prompt-preview__header">
|
|
166
|
-
<p class="ck-kicker">
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
<%
|
|
175
|
-
<% already = existing_ids.include?(cal.id) %>
|
|
176
|
-
<% cal_metric_version = cal.metric_version %>
|
|
177
|
-
<% on_current = cal_metric_version&.id == @published_metric_version.id %>
|
|
178
|
-
<li class="ck-disagreement<%= " ck-disagreement--remembered" if already %><%= " ck-disagreement--stale" unless on_current %>">
|
|
179
|
-
<div class="ck-disagreement__head">
|
|
180
|
-
<div class="ck-disagreement__scores">
|
|
181
|
-
<% if cal_metric_version && mixed_versions %>
|
|
182
|
-
<span class="ck-source-chip <%= on_current ? "ck-source-chip--current" : "ck-source-chip--past" %>" title="<%= on_current ? "Verdict on the live judge version." : "Verdict on a superseded judge version." %>"><%= cal_metric_version.version_label %></span>
|
|
183
|
-
<% end %>
|
|
184
|
-
<span class="ck-disagreement__scores-label">Judge</span>
|
|
185
|
-
<% if review&.ai_score %>
|
|
186
|
-
<span class="<%= ck_badge_classes(ck_score_kind(review.ai_score.to_f)) %>"><%= review.ai_score %></span>
|
|
187
|
-
<% else %>
|
|
188
|
-
<span class="ck-meta-copy">—</span>
|
|
189
|
-
<% end %>
|
|
190
|
-
<span class="ck-disagreement__scores-arrow">→</span>
|
|
191
|
-
<span class="ck-disagreement__scores-label">Human</span>
|
|
192
|
-
<% if cal.corrected_score %>
|
|
193
|
-
<span class="<%= ck_badge_classes(ck_score_kind(cal.corrected_score.to_f)) %>"><%= cal.corrected_score %></span>
|
|
194
|
-
<% else %>
|
|
195
|
-
<span class="ck-meta-copy">—</span>
|
|
196
|
-
<% end %>
|
|
197
|
-
</div>
|
|
198
|
-
<div class="ck-disagreement__action">
|
|
199
|
-
<% if already %>
|
|
200
|
-
<%= button_to "Forget",
|
|
201
|
-
remove_few_shot_metric_path(@metric, calibration_id: cal.id),
|
|
202
|
-
method: :delete,
|
|
203
|
-
form_class: "inline-block",
|
|
204
|
-
class: ck_button_classes(:light, variant: :outline),
|
|
205
|
-
title: "Stop showing this case to the judge.",
|
|
206
|
-
data: { turbo_confirm: "Stop showing this case to the judge?" } %>
|
|
207
|
-
<span class="ck-chip ck-chip--done" title="The judge sees this row when it grades for this metric.">Remembered</span>
|
|
208
|
-
<% else %>
|
|
209
|
-
<%= button_to "Remember this",
|
|
210
|
-
add_few_shot_metric_path(@metric, calibration_id: cal.id),
|
|
211
|
-
method: :post,
|
|
212
|
-
form_class: "inline-block",
|
|
213
|
-
class: ck_button_classes(:light, variant: :outline),
|
|
214
|
-
title: "Pin this case so the judge sees it next time it grades for this metric." %>
|
|
215
|
-
<% end %>
|
|
216
|
-
</div>
|
|
217
|
-
</div>
|
|
218
|
-
<% if cal.note.to_s.present? %>
|
|
219
|
-
<p class="ck-disagreement__note"><%= cal.note %></p>
|
|
220
|
-
<% end %>
|
|
221
|
-
<p class="ck-disagreement__source ck-meta-copy">
|
|
222
|
-
<%= link_to run_response_path(cal.response.run, cal.response, anchor: @metric.name.parameterize),
|
|
223
|
-
class: "ck-disagreement__source-link" do %>
|
|
224
|
-
<% case_display = cal.response.row_index.nil? ? "##{cal.response.id}" : (cal.response.row_index + 1).to_s %>
|
|
225
|
-
View case <%= case_display %> in <%= cal.response.run.name.to_s.truncate(50) %> →
|
|
226
|
-
<% end %>
|
|
227
|
-
</p>
|
|
228
|
-
</li>
|
|
174
|
+
<p class="ck-kicker">Calibration</p>
|
|
175
|
+
<% if draft.nil? && @improve_disagreement_count.positive? %>
|
|
176
|
+
<%= button_to suggest_variants_metric_path(@metric),
|
|
177
|
+
method: :post, form_class: "inline-block",
|
|
178
|
+
class: ck_button_classes(:light, variant: :outline) + " ck-button--sm",
|
|
179
|
+
data: { turbo_confirm: "Draft improvements to this metric from your human reviews? It stays a draft until you compare it and publish." } do %>
|
|
180
|
+
<%= heroicon_tag "sparkles", variant: :outline, class: "ck-magic-icon", "aria-hidden": "true" %>
|
|
181
|
+
Suggest improvements
|
|
182
|
+
<% end %>
|
|
229
183
|
<% end %>
|
|
230
|
-
</
|
|
184
|
+
</div>
|
|
185
|
+
<p class="ck-meta-copy">This is a measure of how often the judge's scores match a human reviewer. Review its scores to build that signal; the scores you disagree with become the cases the model learns from when you improve the metric.</p>
|
|
186
|
+
<%= render "completion_kit/calibrations/trust_panel",
|
|
187
|
+
stats: CompletionKit::MetricCalibrationStats.for(@metric),
|
|
188
|
+
metric: @metric %>
|
|
189
|
+
<% if CompletionKit.config.judge_examples_from_reviews %>
|
|
190
|
+
<%= render "completion_kit/metrics/guiding_examples", metric: @metric, examples: @guiding_examples %>
|
|
191
|
+
<% end %>
|
|
192
|
+
<% if draft %>
|
|
193
|
+
<div class="ck-cal-foot">
|
|
194
|
+
<span class="ck-cal-foot__note">A draft improvement (<%= draft.version_label %>) is waiting in the Versions table above. Open its change to compare, then Publish to use it.</span>
|
|
195
|
+
</div>
|
|
196
|
+
<% end %>
|
|
231
197
|
</section>
|
|
232
198
|
<% end %>
|
|
199
|
+
|
|
200
|
+
<% if params[:show_change].present? %>
|
|
201
|
+
<script>
|
|
202
|
+
(function () {
|
|
203
|
+
var dialog = document.getElementById("ck-mvdiff-<%= params[:show_change].to_i %>");
|
|
204
|
+
if (dialog && typeof dialog.showModal === "function") dialog.showModal();
|
|
205
|
+
})();
|
|
206
|
+
</script>
|
|
207
|
+
<% end %>
|
|
208
|
+
|
|
@@ -100,12 +100,17 @@
|
|
|
100
100
|
<% @reviews.each do |review| %>
|
|
101
101
|
<% review_version = review.metric_version %>
|
|
102
102
|
<% stale = review.stale_against_current_judge? %>
|
|
103
|
-
<div class="ck-review-card
|
|
103
|
+
<div class="ck-review-card" id="<%= review.metric&.name&.parameterize || "review-#{review.id}" %>">
|
|
104
104
|
<div class="ck-review-card__header">
|
|
105
105
|
<span class="ck-review-card__metric"><% if review.metric %><%= link_to review.metric_name, metric_path(review.metric), class: "ck-link" %><% else %><%= review.metric_name %><% end %></span>
|
|
106
106
|
<div class="ck-inline">
|
|
107
107
|
<% if review_version %>
|
|
108
|
-
|
|
108
|
+
<% if stale %>
|
|
109
|
+
<% current_version = CompletionKit::MetricVersion.current.find_by(metric_id: review.metric_id) %>
|
|
110
|
+
<span class="ck-source-chip ck-source-chip--past" title="Scored on <%= review_version.version_label %>; the metric is now on <%= current_version.version_label %>, which may score this differently."><%= review_version.version_label %> → <%= current_version.version_label %></span>
|
|
111
|
+
<% else %>
|
|
112
|
+
<span class="ck-source-chip ck-source-chip--current" title="Scored on the metric's current version (<%= review_version.version_label %>)."><%= review_version.version_label %></span>
|
|
113
|
+
<% end %>
|
|
109
114
|
<% end %>
|
|
110
115
|
<% if review.ai_score %>
|
|
111
116
|
<% 5.times do |i| %>
|
|
@@ -116,9 +121,6 @@
|
|
|
116
121
|
<% end %>
|
|
117
122
|
</div>
|
|
118
123
|
</div>
|
|
119
|
-
<% if stale %>
|
|
120
|
-
<p class="ck-review-card__stale-note">Scored against a superseded version of this metric. The live judge may score this differently.</p>
|
|
121
|
-
<% end %>
|
|
122
124
|
<% if review.ai_feedback.present? %>
|
|
123
125
|
<p class="ck-review-card__feedback"><%= review.ai_feedback %></p>
|
|
124
126
|
<% end %>
|
|
@@ -23,25 +23,25 @@
|
|
|
23
23
|
<% if stale_summary.any? %>
|
|
24
24
|
<div class="ck-stale-versions-banner" role="status">
|
|
25
25
|
<div class="ck-stale-versions-banner__body">
|
|
26
|
-
<p class="ck-kicker">Stale
|
|
26
|
+
<p class="ck-kicker">Stale metric versions</p>
|
|
27
27
|
<p class="ck-meta-copy">
|
|
28
28
|
This run was scored against metric versions that are no longer live.
|
|
29
29
|
<% stale_summary.values.each_with_index do |s, i| %>
|
|
30
30
|
<%= ", " if i > 0 %><strong><%= s[:metric_name] %></strong> (scored by <%= s[:scored_labels].join(", ") %>; live is <%= s[:current_label] %>)<% end %>.
|
|
31
|
-
Re-run to refresh the scores with the current
|
|
31
|
+
Re-run to refresh the scores with the current metrics.
|
|
32
32
|
</p>
|
|
33
33
|
</div>
|
|
34
34
|
<% if @run.status == "completed" %>
|
|
35
35
|
<%= button_to "Re-run from scratch",
|
|
36
36
|
rerun_run_path(@run), method: :post,
|
|
37
37
|
class: ck_button_classes(:light, variant: :outline), form_class: "inline-block",
|
|
38
|
-
title: "Create a new run that regenerates responses and grades them with the current
|
|
39
|
-
data: { turbo_confirm: "Create a new run with fresh responses and the current
|
|
40
|
-
<%= button_to "Re-grade with current
|
|
38
|
+
title: "Create a new run that regenerates responses and grades them with the current metrics.",
|
|
39
|
+
data: { turbo_confirm: "Create a new run with fresh responses and the current metrics? The original run stays as a record." } %>
|
|
40
|
+
<%= button_to "Re-grade with current metrics",
|
|
41
41
|
regrade_run_path(@run), method: :post,
|
|
42
42
|
class: ck_button_classes(:dark), form_class: "inline-block",
|
|
43
|
-
title: "Re-
|
|
44
|
-
data: { turbo_confirm: "Re-
|
|
43
|
+
title: "Re-grade this run's existing responses against the current metrics. Faster and cheaper than re-running.",
|
|
44
|
+
data: { turbo_confirm: "Re-grade this run's existing responses against the current metrics?" } %>
|
|
45
45
|
<% end %>
|
|
46
46
|
</div>
|
|
47
47
|
<% end %>
|
data/config/routes.rb
CHANGED
|
@@ -19,11 +19,10 @@ CompletionKit::Engine.routes.draw do
|
|
|
19
19
|
post "starters/:key/dismiss", to: "metrics#dismiss_starter", as: :dismiss_starter
|
|
20
20
|
end
|
|
21
21
|
member do
|
|
22
|
-
post :add_few_shot
|
|
23
|
-
delete :remove_few_shot
|
|
24
22
|
post :publish_draft
|
|
25
23
|
post :suggest_variants
|
|
26
24
|
delete :dismiss_suggestion
|
|
25
|
+
post :exclude_example
|
|
27
26
|
end
|
|
28
27
|
end
|
|
29
28
|
resources :metric_groups
|
|
@@ -89,8 +88,6 @@ CompletionKit::Engine.routes.draw do
|
|
|
89
88
|
end
|
|
90
89
|
member do
|
|
91
90
|
post :suggest_variants
|
|
92
|
-
post :add_few_shot
|
|
93
|
-
delete :remove_few_shot
|
|
94
91
|
end
|
|
95
92
|
end
|
|
96
93
|
resources :metric_groups
|
data/lib/completion_kit.rb
CHANGED
|
@@ -13,6 +13,7 @@ module CompletionKit
|
|
|
13
13
|
attr_accessor :api_rate_limit, :web_rate_limit
|
|
14
14
|
attr_accessor :allow_loopback_endpoints
|
|
15
15
|
attr_accessor :judge_calibration_enabled
|
|
16
|
+
attr_accessor :judge_examples_from_reviews
|
|
16
17
|
|
|
17
18
|
def initialize
|
|
18
19
|
@openai_api_key = ENV['OPENAI_API_KEY']
|
|
@@ -29,6 +30,7 @@ module CompletionKit
|
|
|
29
30
|
|
|
30
31
|
@allow_loopback_endpoints = true
|
|
31
32
|
@judge_calibration_enabled = true
|
|
33
|
+
@judge_examples_from_reviews = false
|
|
32
34
|
|
|
33
35
|
@api_reference_authentication_partial = "completion_kit/api_reference/authentication"
|
|
34
36
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: completion-kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.8.0
|
|
4
|
+
version: 0.10.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Damien Bastin
|
|
@@ -311,6 +311,7 @@ files:
|
|
|
311
311
|
- app/services/completion_kit/mcp_tools/responses.rb
|
|
312
312
|
- app/services/completion_kit/mcp_tools/runs.rb
|
|
313
313
|
- app/services/completion_kit/mcp_tools/tags.rb
|
|
314
|
+
- app/services/completion_kit/metric_calibration_examples.rb
|
|
314
315
|
- app/services/completion_kit/metric_calibration_stats.rb
|
|
315
316
|
- app/services/completion_kit/metric_variant_generator.rb
|
|
316
317
|
- app/services/completion_kit/model_discovery_service.rb
|
|
@@ -350,6 +351,7 @@ files:
|
|
|
350
351
|
- app/views/completion_kit/metric_groups/new.html.erb
|
|
351
352
|
- app/views/completion_kit/metric_groups/show.html.erb
|
|
352
353
|
- app/views/completion_kit/metrics/_form.html.erb
|
|
354
|
+
- app/views/completion_kit/metrics/_guiding_examples.html.erb
|
|
353
355
|
- app/views/completion_kit/metrics/_rubric_diff.html.erb
|
|
354
356
|
- app/views/completion_kit/metrics/_rubric_hint.html.erb
|
|
355
357
|
- app/views/completion_kit/metrics/_starter_card.html.erb
|
|
@@ -429,6 +431,8 @@ files:
|
|
|
429
431
|
- db/migrate/20260525000001_add_version_number_and_published_at_to_judge_versions.rb
|
|
430
432
|
- db/migrate/20260528000001_rename_judge_version_to_metric_version.rb
|
|
431
433
|
- db/migrate/20260528000002_add_metric_version_to_reviews.rb
|
|
434
|
+
- db/migrate/20260529000001_remove_few_shot_examples_from_completion_kit_metrics.rb
|
|
435
|
+
- db/migrate/20260530000001_add_excluded_from_examples_to_completion_kit_calibrations.rb
|
|
432
436
|
- lib/completion-kit.rb
|
|
433
437
|
- lib/completion_kit.rb
|
|
434
438
|
- lib/completion_kit/concurrency_check.rb
|