completion-kit 0.5.42 → 0.5.44
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/javascripts/completion_kit/application.js +17 -0
- data/app/assets/stylesheets/completion_kit/application.css +530 -39
- data/app/controllers/completion_kit/api/v1/calibrations_controller.rb +1 -1
- data/app/controllers/completion_kit/api/v1/runs_controller.rb +4 -0
- data/app/controllers/completion_kit/calibrations_controller.rb +1 -1
- data/app/controllers/completion_kit/metrics_controller.rb +88 -31
- data/app/controllers/completion_kit/runs_controller.rb +6 -0
- data/app/jobs/completion_kit/judge_review_job.rb +14 -0
- data/app/models/completion_kit/calibration.rb +6 -2
- data/app/models/completion_kit/metric.rb +0 -17
- data/app/models/completion_kit/{judge_version.rb → metric_version.rb} +35 -2
- data/app/models/completion_kit/review.rb +9 -0
- data/app/models/completion_kit/run.rb +28 -0
- data/app/services/completion_kit/mcp_tools/calibrations.rb +1 -1
- data/app/services/completion_kit/mcp_tools/judges.rb +15 -13
- data/app/services/completion_kit/metric_calibration_stats.rb +17 -5
- data/app/services/completion_kit/{judge_variant_generator.rb → metric_variant_generator.rb} +14 -12
- data/app/views/completion_kit/api_reference/_body.html.erb +1 -1
- data/app/views/completion_kit/calibrations/_buttons.html.erb +43 -6
- data/app/views/completion_kit/calibrations/_trust_panel.html.erb +32 -28
- data/app/views/completion_kit/metrics/_form.html.erb +90 -4
- data/app/views/completion_kit/metrics/_rubric_diff.html.erb +25 -0
- data/app/views/completion_kit/metrics/_rubric_hint.html.erb +4 -0
- data/app/views/completion_kit/metrics/_starter_card.html.erb +13 -9
- data/app/views/completion_kit/metrics/edit.html.erb +5 -1
- data/app/views/completion_kit/metrics/index.html.erb +5 -3
- data/app/views/completion_kit/metrics/show.html.erb +131 -127
- data/app/views/completion_kit/metrics/starter_preview.html.erb +6 -6
- data/app/views/completion_kit/responses/show.html.erb +9 -1
- data/app/views/completion_kit/runs/_status_panel.html.erb +2 -2
- data/app/views/completion_kit/runs/show.html.erb +23 -0
- data/config/routes.rb +2 -1
- data/db/migrate/20260525000001_add_version_number_and_published_at_to_judge_versions.rb +24 -0
- data/db/migrate/20260528000001_rename_judge_version_to_metric_version.rb +22 -0
- data/db/migrate/20260528000002_add_metric_version_to_reviews.rb +21 -0
- data/lib/completion_kit/version.rb +1 -1
- metadata +8 -3
|
@@ -8,70 +8,37 @@
|
|
|
8
8
|
<h1 class="ck-title"><%= @metric.name %></h1>
|
|
9
9
|
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
10
10
|
<%= render "completion_kit/calibrations/trust_panel",
|
|
11
|
-
stats: CompletionKit::MetricCalibrationStats.for(@metric)
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
<div class="ck-prompt-preview__header">
|
|
19
|
-
<p class="ck-kicker">Draft pending</p>
|
|
20
|
-
<%= button_to "Publish this version", publish_draft_metric_path(@metric, draft_id: @edit_draft.id),
|
|
21
|
-
method: :post, form_class: "inline-block",
|
|
22
|
-
class: ck_button_classes(:dark) %>
|
|
23
|
-
</div>
|
|
24
|
-
<p class="ck-meta-copy">A draft of this metric is saved. Publishing it replaces the live instruction<%= ", rubric," if rubric_changed %> for future runs. Here's what changes.</p>
|
|
25
|
-
|
|
26
|
-
<% if instruction_changed %>
|
|
27
|
-
<div class="ck-suggest-diff">
|
|
28
|
-
<div class="ck-suggest-diff__pane">
|
|
29
|
-
<div class="ck-suggest-diff__header">
|
|
30
|
-
<span class="ck-suggest-diff__label ck-suggest-diff__label--before">Currently live</span>
|
|
31
|
-
</div>
|
|
32
|
-
<pre class="ck-suggest-diff__code"><%= ck_word_diff_old(pub_instr, draft_instr) %></pre>
|
|
33
|
-
</div>
|
|
34
|
-
<div class="ck-suggest-diff__pane">
|
|
35
|
-
<div class="ck-suggest-diff__header">
|
|
36
|
-
<span class="ck-suggest-diff__label ck-suggest-diff__label--after">Draft</span>
|
|
37
|
-
</div>
|
|
38
|
-
<pre class="ck-suggest-diff__code"><%= ck_word_diff_new(pub_instr, draft_instr) %></pre>
|
|
39
|
-
</div>
|
|
40
|
-
</div>
|
|
41
|
-
<% else %>
|
|
42
|
-
<p class="ck-meta-copy">The instruction is unchanged.</p>
|
|
43
|
-
<% end %>
|
|
44
|
-
|
|
45
|
-
<% if rubric_changed %>
|
|
46
|
-
<p class="ck-meta-copy"><strong>Rubric also changed.</strong> Edit the metric to inspect each band, or publish to apply the new wording.</p>
|
|
47
|
-
<% end %>
|
|
48
|
-
</section>
|
|
49
|
-
<% end %>
|
|
11
|
+
stats: CompletionKit::MetricCalibrationStats.for(@metric),
|
|
12
|
+
metric: @metric %>
|
|
13
|
+
<% end %>
|
|
14
|
+
<% if @metric.tags.any? %>
|
|
15
|
+
<div class="tag-marks-row tag-marks-row--header">
|
|
16
|
+
<%= render "completion_kit/tags/marks", tags: @metric.tags %>
|
|
17
|
+
</div>
|
|
50
18
|
<% end %>
|
|
51
19
|
</div>
|
|
52
20
|
<div class="ck-actions">
|
|
53
21
|
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
54
|
-
<% if @
|
|
22
|
+
<% if @suggestion_draft || @edit_draft %>
|
|
23
|
+
<% review_title = @suggestion_draft ? "The model proposed improvements based on your disagreements. Review and apply what you want." : "An unpublished draft of this metric is saved. Review and publish, or keep editing." %>
|
|
24
|
+
<%= link_to "Review changes →", edit_metric_path(@metric),
|
|
25
|
+
class: ck_button_classes(:dark),
|
|
26
|
+
title: review_title %>
|
|
27
|
+
<% elsif @improve_disagreement_count.positive? %>
|
|
55
28
|
<%= button_to "Improve the metric", suggest_variants_metric_path(@metric),
|
|
56
29
|
method: :post, form_class: "inline-block",
|
|
57
30
|
class: ck_button_classes(:light, variant: :outline),
|
|
58
|
-
title: "
|
|
59
|
-
data: { turbo_confirm: "
|
|
31
|
+
title: "Ask the model to suggest improvements to this metric's instruction and rubric based on the disagreements collected so far.",
|
|
32
|
+
data: { turbo_confirm: "Ask the model for suggested improvements based on the disagreements collected so far?" } %>
|
|
60
33
|
<% else %>
|
|
61
34
|
<button type="button" class="<%= ck_button_classes(:light, variant: :outline) %>" disabled
|
|
62
|
-
title="Mark at least one
|
|
35
|
+
title="Mark at least one case as Disagree before the model can suggest a change.">Improve the metric</button>
|
|
63
36
|
<% end %>
|
|
64
37
|
<% end %>
|
|
65
38
|
<%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
|
|
66
39
|
</div>
|
|
67
40
|
</section>
|
|
68
41
|
|
|
69
|
-
<% if @metric.tags.any? %>
|
|
70
|
-
<div class="tag-marks-row tag-marks-row--header">
|
|
71
|
-
<%= render "completion_kit/tags/marks", tags: @metric.tags %>
|
|
72
|
-
</div>
|
|
73
|
-
<% end %>
|
|
74
|
-
|
|
75
42
|
<% if @metric.instruction.present? %>
|
|
76
43
|
<section class="ck-card">
|
|
77
44
|
<p class="ck-kicker">Instruction</p>
|
|
@@ -80,7 +47,7 @@
|
|
|
80
47
|
<% end %>
|
|
81
48
|
|
|
82
49
|
<section class="ck-card ck-card--spaced">
|
|
83
|
-
<p class="ck-kicker">Rubric
|
|
50
|
+
<p class="ck-kicker">Rubric<%= render "completion_kit/metrics/rubric_hint" %></p>
|
|
84
51
|
<div class="ck-rubric-display">
|
|
85
52
|
<% @metric.rubric_bands_for_form.each do |band| %>
|
|
86
53
|
<div class="ck-rubric-row ck-rubric-row--display">
|
|
@@ -97,64 +64,122 @@
|
|
|
97
64
|
</div>
|
|
98
65
|
</section>
|
|
99
66
|
|
|
100
|
-
<% if CompletionKit.config.judge_calibration_enabled && @
|
|
101
|
-
<%
|
|
102
|
-
<%
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
67
|
+
<% if CompletionKit.config.judge_calibration_enabled && @versions.size > 1 %>
|
|
68
|
+
<% predecessor_of = @versions.index_with { |v| @versions.detect { |o| o.version_number < v.version_number } } %>
|
|
69
|
+
<% version_changed = ->(v, pred) { pred && (pred.instruction.to_s != v.instruction.to_s || pred.rubric_bands != v.rubric_bands) } %>
|
|
70
|
+
<section class="ck-card ck-card--spaced">
|
|
71
|
+
<p class="ck-kicker">Versions</p>
|
|
72
|
+
<table class="ck-results-table ck-metric-versions-table">
|
|
73
|
+
<thead>
|
|
74
|
+
<tr>
|
|
75
|
+
<th scope="col">Version</th>
|
|
76
|
+
<th scope="col">Source</th>
|
|
77
|
+
<th scope="col">Created</th>
|
|
78
|
+
</tr>
|
|
79
|
+
</thead>
|
|
80
|
+
<tbody>
|
|
81
|
+
<% @versions.each do |v| %>
|
|
82
|
+
<% pred = predecessor_of[v] %>
|
|
83
|
+
<tr>
|
|
84
|
+
<td>
|
|
85
|
+
<div class="ck-version-cell">
|
|
86
|
+
<div class="ck-version-cell__label">
|
|
87
|
+
<strong><%= v.version_label %></strong>
|
|
88
|
+
<% if v.current? %>
|
|
89
|
+
<span class="ck-chip">Published</span>
|
|
90
|
+
<% elsif v.draft? %>
|
|
91
|
+
<%= button_to "Publish", publish_draft_metric_path(@metric, draft_id: v.id),
|
|
92
|
+
method: :post, form_class: "inline-block",
|
|
93
|
+
class: "ck-chip ck-chip--publish" %>
|
|
94
|
+
<% else %>
|
|
95
|
+
<%= button_to "Make current", publish_draft_metric_path(@metric, draft_id: v.id),
|
|
96
|
+
method: :post, form_class: "inline-block",
|
|
97
|
+
class: "ck-chip ck-chip--publish",
|
|
98
|
+
data: { turbo_confirm: "Roll the live judge back to #{v.version_label}? Calibration verdicts collected against the current version stay tied to it." } %>
|
|
99
|
+
<% end %>
|
|
100
|
+
</div>
|
|
101
|
+
<% if version_changed.call(v, pred) %>
|
|
102
|
+
<button type="button" class="ck-cell-link ck-cell-link--delta"
|
|
103
|
+
title="What changed from <%= pred.version_label %>"
|
|
104
|
+
onclick="document.getElementById('ck-mvdiff-<%= v.id %>').showModal()">Δ</button>
|
|
105
|
+
<% end %>
|
|
106
|
+
</div>
|
|
107
|
+
</td>
|
|
108
|
+
<td>
|
|
109
|
+
<% source_label, source_class = case v.source
|
|
110
|
+
when "suggestion" then ["AI suggestion", "ck-source-chip ck-source-chip--ai"]
|
|
111
|
+
when "edit" then ["Manual edit", "ck-source-chip ck-source-chip--manual"]
|
|
112
|
+
else ["Original", "ck-source-chip ck-source-chip--initial"]
|
|
113
|
+
end %>
|
|
114
|
+
<span class="<%= source_class %>"><%= source_label %></span>
|
|
115
|
+
</td>
|
|
116
|
+
<td class="ck-meta-copy">
|
|
117
|
+
<time datetime="<%= v.created_at.utc.iso8601 %>" data-relative-time><%= time_ago_in_words(v.created_at) %> ago</time>
|
|
118
|
+
</td>
|
|
119
|
+
</tr>
|
|
120
|
+
<% end %>
|
|
121
|
+
</tbody>
|
|
122
|
+
</table>
|
|
123
|
+
</section>
|
|
111
124
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
<
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
125
|
+
<% @versions.each do |v| %>
|
|
126
|
+
<% pred = predecessor_of[v] %>
|
|
127
|
+
<% next unless version_changed.call(v, pred) %>
|
|
128
|
+
<dialog id="ck-mvdiff-<%= v.id %>" class="ck-modal" onclick="if(event.target===this)this.close()">
|
|
129
|
+
<article class="ck-modal__panel" tabindex="-1" onclick="event.stopPropagation()">
|
|
130
|
+
<header class="ck-modal__header">
|
|
131
|
+
<div class="ck-modal__heading">
|
|
132
|
+
<h2 class="ck-modal__title"><%= pred.version_label %> → <%= v.version_label %></h2>
|
|
133
|
+
<span class="ck-modal__meta">What changed in <%= v.version_label %><% if v.current? %> (live)<% elsif v.draft? %> (draft)<% end %></span>
|
|
134
|
+
</div>
|
|
135
|
+
<button type="button" class="ck-modal__close" aria-label="Close" onclick="this.closest('dialog').close()">×</button>
|
|
136
|
+
</header>
|
|
137
|
+
<div class="ck-modal__body">
|
|
138
|
+
<% if pred.instruction.to_s != v.instruction.to_s %>
|
|
139
|
+
<div class="ck-suggest-diff">
|
|
140
|
+
<div class="ck-suggest-diff__pane">
|
|
141
|
+
<div class="ck-suggest-diff__header"><span class="ck-suggest-diff__label ck-suggest-diff__label--before"><%= pred.version_label %> instruction</span></div>
|
|
142
|
+
<pre class="ck-suggest-diff__code"><%= ck_word_diff_old(pred.instruction.to_s, v.instruction.to_s) %></pre>
|
|
143
|
+
</div>
|
|
144
|
+
<div class="ck-suggest-diff__pane">
|
|
145
|
+
<div class="ck-suggest-diff__header"><span class="ck-suggest-diff__label ck-suggest-diff__label--after"><%= v.version_label %> instruction</span></div>
|
|
146
|
+
<pre class="ck-suggest-diff__code"><%= ck_word_diff_new(pred.instruction.to_s, v.instruction.to_s) %></pre>
|
|
147
|
+
</div>
|
|
148
|
+
</div>
|
|
149
|
+
<% end %>
|
|
150
|
+
<% if pred.rubric_bands != v.rubric_bands %>
|
|
151
|
+
<p class="ck-kicker ck-kicker--inset">Rubric changes</p>
|
|
152
|
+
<%= render "completion_kit/metrics/rubric_diff",
|
|
153
|
+
current_bands: pred.rubric_bands || [],
|
|
154
|
+
draft_bands: v.rubric_bands || [] %>
|
|
155
|
+
<% end %>
|
|
122
156
|
</div>
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
<% if sd_rubric_changed %>
|
|
128
|
-
<p class="ck-meta-copy"><strong>Rubric also changed.</strong> Publishing applies the new rubric too.</p>
|
|
129
|
-
<% end %>
|
|
130
|
-
|
|
131
|
-
<div class="ck-actions">
|
|
132
|
-
<%= button_to "Discard", dismiss_suggestion_metric_path(@metric, draft_id: @suggestion_draft.id),
|
|
133
|
-
method: :delete, form_class: "inline-block",
|
|
134
|
-
class: ck_button_classes(:light, variant: :outline),
|
|
135
|
-
data: { turbo_confirm: "Drop this suggestion?" } %>
|
|
136
|
-
<%= button_to "Use this version", publish_draft_metric_path(@metric, draft_id: @suggestion_draft.id),
|
|
137
|
-
method: :post, form_class: "inline-block",
|
|
138
|
-
class: ck_button_classes(:dark) %>
|
|
139
|
-
</div>
|
|
140
|
-
</section>
|
|
157
|
+
</article>
|
|
158
|
+
</dialog>
|
|
159
|
+
<% end %>
|
|
141
160
|
<% end %>
|
|
142
161
|
|
|
143
162
|
<% if CompletionKit.config.judge_calibration_enabled && @disagreements.any? %>
|
|
144
163
|
<section class="ck-card ck-card--spaced">
|
|
145
164
|
<div class="ck-prompt-preview__header">
|
|
146
165
|
<p class="ck-kicker">Cases to learn from</p>
|
|
147
|
-
<span class="ck-chip"><%= pluralize(@disagreements.size, "
|
|
166
|
+
<span class="ck-chip"><%= pluralize(@disagreements.size, "case") %></span>
|
|
148
167
|
</div>
|
|
149
|
-
|
|
168
|
+
<% mixed_versions = @disagreements.map(&:metric_version_id).uniq.size > 1 %>
|
|
169
|
+
<p class="ck-meta-copy">Cases where a reviewer's score didn't match the judge's. Pin useful ones with <strong>Remember this</strong> so the judge sees them next time it grades<%= " (pins flow into the current judge regardless of which version produced the verdict)" if mixed_versions %>.</p>
|
|
150
170
|
<% existing_ids = Array(@metric.few_shot_examples).map { |fs| fs["calibration_id"] } %>
|
|
151
171
|
<ul class="ck-disagreement-list">
|
|
152
172
|
<% @disagreements.each do |cal| %>
|
|
153
173
|
<% review = cal.response.reviews.find { |r| r.metric_id == @metric.id } %>
|
|
154
174
|
<% already = existing_ids.include?(cal.id) %>
|
|
155
|
-
|
|
175
|
+
<% cal_metric_version = cal.metric_version %>
|
|
176
|
+
<% on_current = cal_metric_version&.id == @published_metric_version.id %>
|
|
177
|
+
<li class="ck-disagreement<%= " ck-disagreement--remembered" if already %><%= " ck-disagreement--stale" unless on_current %>">
|
|
156
178
|
<div class="ck-disagreement__head">
|
|
157
179
|
<div class="ck-disagreement__scores">
|
|
180
|
+
<% if cal_metric_version && mixed_versions %>
|
|
181
|
+
<span class="ck-source-chip <%= on_current ? "ck-source-chip--current" : "ck-source-chip--past" %>" title="<%= on_current ? "Verdict on the live judge version." : "Verdict on a superseded judge version." %>"><%= cal_metric_version.version_label %></span>
|
|
182
|
+
<% end %>
|
|
158
183
|
<span class="ck-disagreement__scores-label">Judge</span>
|
|
159
184
|
<% if review&.ai_score %>
|
|
160
185
|
<span class="<%= ck_badge_classes(ck_score_kind(review.ai_score.to_f)) %>"><%= review.ai_score %></span>
|
|
@@ -171,14 +196,21 @@
|
|
|
171
196
|
</div>
|
|
172
197
|
<div class="ck-disagreement__action">
|
|
173
198
|
<% if already %>
|
|
174
|
-
|
|
199
|
+
<%= button_to "Forget",
|
|
200
|
+
remove_few_shot_metric_path(@metric, calibration_id: cal.id),
|
|
201
|
+
method: :delete,
|
|
202
|
+
form_class: "inline-block",
|
|
203
|
+
class: ck_button_classes(:light, variant: :outline),
|
|
204
|
+
title: "Stop showing this case to the judge.",
|
|
205
|
+
data: { turbo_confirm: "Stop showing this case to the judge?" } %>
|
|
206
|
+
<span class="ck-chip ck-chip--done" title="The judge sees this row when it grades for this metric.">Remembered</span>
|
|
175
207
|
<% else %>
|
|
176
208
|
<%= button_to "Remember this",
|
|
177
209
|
add_few_shot_metric_path(@metric, calibration_id: cal.id),
|
|
178
210
|
method: :post,
|
|
179
211
|
form_class: "inline-block",
|
|
180
212
|
class: ck_button_classes(:light, variant: :outline),
|
|
181
|
-
title: "Pin this
|
|
213
|
+
title: "Pin this case so the judge sees it next time it grades for this metric." %>
|
|
182
214
|
<% end %>
|
|
183
215
|
</div>
|
|
184
216
|
</div>
|
|
@@ -186,42 +218,14 @@
|
|
|
186
218
|
<p class="ck-disagreement__note"><%= cal.note %></p>
|
|
187
219
|
<% end %>
|
|
188
220
|
<p class="ck-disagreement__source ck-meta-copy">
|
|
189
|
-
<%= link_to cal.response.run
|
|
190
|
-
|
|
191
|
-
|
|
221
|
+
<%= link_to run_response_path(cal.response.run, cal.response, anchor: @metric.name.parameterize),
|
|
222
|
+
class: "ck-disagreement__source-link" do %>
|
|
223
|
+
<% case_display = cal.response.row_index.nil? ? "##{cal.response.id}" : (cal.response.row_index + 1).to_s %>
|
|
224
|
+
View case <%= case_display %> in <%= cal.response.run.name.to_s.truncate(50) %> →
|
|
225
|
+
<% end %>
|
|
192
226
|
</p>
|
|
193
227
|
</li>
|
|
194
228
|
<% end %>
|
|
195
229
|
</ul>
|
|
196
230
|
</section>
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
<% if Array(@metric.few_shot_examples).any? %>
|
|
200
|
-
<section class="ck-card ck-card--spaced">
|
|
201
|
-
<div class="ck-prompt-preview__header">
|
|
202
|
-
<p class="ck-kicker">What the judge remembers</p>
|
|
203
|
-
<span class="ck-chip"><%= pluralize(Array(@metric.few_shot_examples).size, "case") %></span>
|
|
204
|
-
</div>
|
|
205
|
-
<p class="ck-meta-copy">Rows you've pinned so the judge sees them next time it grades. Each one shows what the judge gave and what a human said it should have been.</p>
|
|
206
|
-
<ol class="ck-few-shot-list">
|
|
207
|
-
<% Array(@metric.few_shot_examples).each do |fs| %>
|
|
208
|
-
<li class="ck-few-shot-item">
|
|
209
|
-
<div class="ck-few-shot-item__scores">
|
|
210
|
-
<span class="ck-meta-copy">judge said</span>
|
|
211
|
-
<% if fs["judge_score"] %>
|
|
212
|
-
<span class="<%= ck_badge_classes(ck_score_kind(fs["judge_score"].to_f)) %>"><%= fs["judge_score"] %></span>
|
|
213
|
-
<% end %>
|
|
214
|
-
<span class="ck-meta-copy">human said</span>
|
|
215
|
-
<% if fs["human_score"] %>
|
|
216
|
-
<span class="<%= ck_badge_classes(ck_score_kind(fs["human_score"].to_f)) %>"><%= fs["human_score"] %></span>
|
|
217
|
-
<% end %>
|
|
218
|
-
</div>
|
|
219
|
-
<% if fs["human_note"].to_s.present? %>
|
|
220
|
-
<p class="ck-copy"><%= fs["human_note"] %></p>
|
|
221
|
-
<% end %>
|
|
222
|
-
</li>
|
|
223
|
-
<% end %>
|
|
224
|
-
</ol>
|
|
225
|
-
</section>
|
|
226
|
-
<% end %>
|
|
227
231
|
<% end %>
|
|
@@ -2,7 +2,6 @@
|
|
|
2
2
|
<div>
|
|
3
3
|
<p class="ck-kicker">Starter metric</p>
|
|
4
4
|
<h1 class="ck-title"><%= @starter.name %></h1>
|
|
5
|
-
<p class="ck-lead"><%= @starter.description %></p>
|
|
6
5
|
</div>
|
|
7
6
|
<div class="ck-actions">
|
|
8
7
|
<%= link_to "← Back to metrics", metrics_path, class: ck_button_classes(:light, variant: :outline) %>
|
|
@@ -10,17 +9,18 @@
|
|
|
10
9
|
</section>
|
|
11
10
|
|
|
12
11
|
<section class="ck-card ck-card--spaced">
|
|
13
|
-
<p class="ck-kicker">
|
|
12
|
+
<p class="ck-kicker">Why use this</p>
|
|
13
|
+
<p class="ck-copy"><strong><%= @starter.description %></strong></p>
|
|
14
14
|
<p class="ck-copy"><%= @starter.catches %></p>
|
|
15
15
|
</section>
|
|
16
16
|
|
|
17
17
|
<section class="ck-card ck-card--spaced">
|
|
18
|
-
<p class="ck-kicker">
|
|
18
|
+
<p class="ck-kicker">Judge instruction</p>
|
|
19
19
|
<p class="ck-copy"><%= @starter.instruction %></p>
|
|
20
20
|
</section>
|
|
21
21
|
|
|
22
22
|
<section class="ck-card ck-card--spaced">
|
|
23
|
-
<p class="ck-kicker">Rubric
|
|
23
|
+
<p class="ck-kicker">Rubric<%= render "completion_kit/metrics/rubric_hint" %></p>
|
|
24
24
|
<div class="ck-rubric-display">
|
|
25
25
|
<% @starter.rubric_bands.sort_by { |b| -b["stars"] }.each do |band| %>
|
|
26
26
|
<div class="ck-rubric-row ck-rubric-row--display">
|
|
@@ -37,9 +37,9 @@
|
|
|
37
37
|
</div>
|
|
38
38
|
</section>
|
|
39
39
|
|
|
40
|
-
<div class="ck-
|
|
40
|
+
<div class="ck-starter-actions">
|
|
41
41
|
<%= link_to "Cancel", metrics_path, class: ck_button_classes(:light, variant: :outline) %>
|
|
42
|
-
<%= button_to "Add
|
|
42
|
+
<%= button_to "Add #{@starter.name} to my metrics", adopt_starter_metrics_path(key: @starter.key),
|
|
43
43
|
method: :post, form_class: "inline-block",
|
|
44
44
|
class: ck_button_classes(:dark) %>
|
|
45
45
|
</div>
|
|
@@ -98,10 +98,15 @@
|
|
|
98
98
|
|
|
99
99
|
<div class="ck-review-list">
|
|
100
100
|
<% @reviews.each do |review| %>
|
|
101
|
-
|
|
101
|
+
<% review_version = review.metric_version %>
|
|
102
|
+
<% stale = review.stale_against_current_judge? %>
|
|
103
|
+
<div class="ck-review-card<%= " ck-review-card--stale" if stale %>" id="<%= review.metric&.name&.parameterize || "review-#{review.id}" %>">
|
|
102
104
|
<div class="ck-review-card__header">
|
|
103
105
|
<span class="ck-review-card__metric"><% if review.metric %><%= link_to review.metric_name, metric_path(review.metric), class: "ck-link" %><% else %><%= review.metric_name %><% end %></span>
|
|
104
106
|
<div class="ck-inline">
|
|
107
|
+
<% if review_version %>
|
|
108
|
+
<span class="ck-source-chip <%= stale ? "ck-source-chip--past" : "ck-source-chip--current" %>" title="<%= stale ? "Score produced by #{review_version.version_label} of this metric. The live judge has changed since." : "Score produced by the live judge (#{review_version.version_label})." %>"><%= review_version.version_label %></span>
|
|
109
|
+
<% end %>
|
|
105
110
|
<% if review.ai_score %>
|
|
106
111
|
<% 5.times do |i| %>
|
|
107
112
|
<svg viewBox="0 0 24 24" width="16" height="16" stroke-width="1.75" class="ck-star <%= i < review.ai_score.to_i ? "ck-star--filled" : "ck-star--empty" %>"><polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2"/></svg>
|
|
@@ -111,6 +116,9 @@
|
|
|
111
116
|
<% end %>
|
|
112
117
|
</div>
|
|
113
118
|
</div>
|
|
119
|
+
<% if stale %>
|
|
120
|
+
<p class="ck-review-card__stale-note">Scored against a superseded version of this metric. The live judge may score this differently.</p>
|
|
121
|
+
<% end %>
|
|
114
122
|
<% if review.ai_feedback.present? %>
|
|
115
123
|
<p class="ck-review-card__feedback"><%= review.ai_feedback %></p>
|
|
116
124
|
<% end %>
|
|
@@ -46,7 +46,7 @@
|
|
|
46
46
|
<% else %>
|
|
47
47
|
<span class="ck-metric-pip ck-metric-pip--pending">
|
|
48
48
|
<span class="ck-metric-pip__bar"></span>
|
|
49
|
-
<span class="ck-metric-pip__label"><%= metric.name %> <
|
|
49
|
+
<span class="ck-metric-pip__label"><%= metric.name %> <span class="ck-metric-pip__sub">pending</span></span>
|
|
50
50
|
</span>
|
|
51
51
|
<% end %>
|
|
52
52
|
<% end %>
|
|
@@ -72,7 +72,7 @@
|
|
|
72
72
|
|
|
73
73
|
<% if failed_count > 0 %>
|
|
74
74
|
<%= button_to retry_failures_run_path(run), method: :post, class: "ck-run-status__retry", form_class: "ck-run-status__action" do %>
|
|
75
|
-
Retry <%= failed_count %> failed <%= "
|
|
75
|
+
Retry <%= failed_count %> failed <%= "case".pluralize(failed_count) %>
|
|
76
76
|
<% end %>
|
|
77
77
|
<% end %>
|
|
78
78
|
</section>
|
|
@@ -18,6 +18,29 @@
|
|
|
18
18
|
<% dataset_preview_lines = dataset_lines.first(50) %>
|
|
19
19
|
<% end %>
|
|
20
20
|
|
|
21
|
+
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
22
|
+
<% stale_summary = @run.stale_review_summary %>
|
|
23
|
+
<% if stale_summary.any? %>
|
|
24
|
+
<div class="ck-stale-versions-banner" role="status">
|
|
25
|
+
<div class="ck-stale-versions-banner__body">
|
|
26
|
+
<p class="ck-kicker">Stale judge versions</p>
|
|
27
|
+
<p class="ck-meta-copy">
|
|
28
|
+
This run was scored against metric versions that are no longer live.
|
|
29
|
+
<% stale_summary.values.each_with_index do |s, i| %>
|
|
30
|
+
<%= ", " if i > 0 %><strong><%= s[:metric_name] %></strong> (scored by <%= s[:scored_labels].join(", ") %>; live is <%= s[:current_label] %>)<% end %>.
|
|
31
|
+
Re-run to refresh the scores with the current judge.
|
|
32
|
+
</p>
|
|
33
|
+
</div>
|
|
34
|
+
<% if @run.status == "completed" %>
|
|
35
|
+
<%= button_to "Re-run with current judge",
|
|
36
|
+
rerun_run_path(@run), method: :post,
|
|
37
|
+
class: ck_button_classes(:dark), form_class: "inline-block",
|
|
38
|
+
data: { turbo_confirm: "Create a new run with the current metric versions? The original run stays as a record." } %>
|
|
39
|
+
<% end %>
|
|
40
|
+
</div>
|
|
41
|
+
<% end %>
|
|
42
|
+
<% end %>
|
|
43
|
+
|
|
21
44
|
<div class="ck-run-config">
|
|
22
45
|
<div class="ck-run-config__row">
|
|
23
46
|
<span class="ck-run-config__key">Created</span>
|
data/config/routes.rb
CHANGED
|
@@ -19,7 +19,8 @@ CompletionKit::Engine.routes.draw do
|
|
|
19
19
|
post "starters/:key/dismiss", to: "metrics#dismiss_starter", as: :dismiss_starter
|
|
20
20
|
end
|
|
21
21
|
member do
|
|
22
|
-
post
|
|
22
|
+
post :add_few_shot
|
|
23
|
+
delete :remove_few_shot
|
|
23
24
|
post :publish_draft
|
|
24
25
|
post :suggest_variants
|
|
25
26
|
delete :dismiss_suggestion
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
class AddVersionNumberAndPublishedAtToJudgeVersions < ActiveRecord::Migration[8.1]
|
|
2
|
+
def change
|
|
3
|
+
add_column :completion_kit_judge_versions, :version_number, :integer
|
|
4
|
+
add_column :completion_kit_judge_versions, :published_at, :datetime
|
|
5
|
+
|
|
6
|
+
reversible do |dir|
|
|
7
|
+
dir.up do
|
|
8
|
+
jv = Class.new(ActiveRecord::Base) { self.table_name = "completion_kit_judge_versions" }
|
|
9
|
+
jv.distinct.pluck(:metric_id).each do |metric_id|
|
|
10
|
+
jv.where(metric_id: metric_id).order(:created_at, :id).each_with_index do |row, i|
|
|
11
|
+
updates = { version_number: i + 1 }
|
|
12
|
+
updates[:published_at] = row.created_at if row[:state] == "published"
|
|
13
|
+
jv.where(id: row.id).update_all(updates)
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
end
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
change_column_null :completion_kit_judge_versions, :version_number, false
|
|
20
|
+
add_index :completion_kit_judge_versions,
|
|
21
|
+
[:metric_id, :version_number],
|
|
22
|
+
name: "index_ck_judge_versions_on_metric_version"
|
|
23
|
+
end
|
|
24
|
+
end
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
class RenameJudgeVersionToMetricVersion < ActiveRecord::Migration[8.1]
|
|
2
|
+
def change
|
|
3
|
+
rename_table :completion_kit_judge_versions, :completion_kit_metric_versions
|
|
4
|
+
rename_column :completion_kit_calibrations, :judge_version_id, :metric_version_id
|
|
5
|
+
|
|
6
|
+
rename_index :completion_kit_metric_versions,
|
|
7
|
+
"index_ck_judge_versions_on_metric_id",
|
|
8
|
+
"index_ck_metric_versions_on_metric_id"
|
|
9
|
+
rename_index :completion_kit_metric_versions,
|
|
10
|
+
"index_ck_judge_versions_on_metric_current",
|
|
11
|
+
"index_ck_metric_versions_on_metric_current"
|
|
12
|
+
rename_index :completion_kit_metric_versions,
|
|
13
|
+
"index_ck_judge_versions_on_metric_state",
|
|
14
|
+
"index_ck_metric_versions_on_metric_state"
|
|
15
|
+
rename_index :completion_kit_metric_versions,
|
|
16
|
+
"index_ck_judge_versions_on_metric_version",
|
|
17
|
+
"index_ck_metric_versions_on_metric_vnum"
|
|
18
|
+
rename_index :completion_kit_calibrations,
|
|
19
|
+
"index_ck_calibrations_on_judge_version_id",
|
|
20
|
+
"index_ck_calibrations_on_metric_version_id"
|
|
21
|
+
end
|
|
22
|
+
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
class AddMetricVersionToReviews < ActiveRecord::Migration[8.1]
|
|
2
|
+
def change
|
|
3
|
+
add_column :completion_kit_reviews, :metric_version_id, :bigint
|
|
4
|
+
add_index :completion_kit_reviews, :metric_version_id, name: "index_ck_reviews_on_metric_version_id"
|
|
5
|
+
|
|
6
|
+
reversible do |dir|
|
|
7
|
+
dir.up do
|
|
8
|
+
execute <<~SQL
|
|
9
|
+
UPDATE completion_kit_reviews
|
|
10
|
+
SET metric_version_id = (
|
|
11
|
+
SELECT id FROM completion_kit_metric_versions mv
|
|
12
|
+
WHERE mv.metric_id = completion_kit_reviews.metric_id
|
|
13
|
+
AND mv.current = #{ActiveRecord::Base.connection.quote(true)}
|
|
14
|
+
LIMIT 1
|
|
15
|
+
)
|
|
16
|
+
WHERE metric_id IS NOT NULL
|
|
17
|
+
SQL
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: completion-kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.5.
|
|
4
|
+
version: 0.5.44
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Damien Bastin
|
|
@@ -272,11 +272,11 @@ files:
|
|
|
272
272
|
- app/models/completion_kit/calibration.rb
|
|
273
273
|
- app/models/completion_kit/dashboard_dismissal.rb
|
|
274
274
|
- app/models/completion_kit/dataset.rb
|
|
275
|
-
- app/models/completion_kit/judge_version.rb
|
|
276
275
|
- app/models/completion_kit/mcp_session.rb
|
|
277
276
|
- app/models/completion_kit/metric.rb
|
|
278
277
|
- app/models/completion_kit/metric_group.rb
|
|
279
278
|
- app/models/completion_kit/metric_group_membership.rb
|
|
279
|
+
- app/models/completion_kit/metric_version.rb
|
|
280
280
|
- app/models/completion_kit/model.rb
|
|
281
281
|
- app/models/completion_kit/prompt.rb
|
|
282
282
|
- app/models/completion_kit/provider_credential.rb
|
|
@@ -295,7 +295,6 @@ files:
|
|
|
295
295
|
- app/services/completion_kit/csv_processor.rb
|
|
296
296
|
- app/services/completion_kit/dashboard_stats.rb
|
|
297
297
|
- app/services/completion_kit/judge_service.rb
|
|
298
|
-
- app/services/completion_kit/judge_variant_generator.rb
|
|
299
298
|
- app/services/completion_kit/llm_client.rb
|
|
300
299
|
- app/services/completion_kit/mcp_dispatcher.rb
|
|
301
300
|
- app/services/completion_kit/mcp_tools/base.rb
|
|
@@ -310,6 +309,7 @@ files:
|
|
|
310
309
|
- app/services/completion_kit/mcp_tools/runs.rb
|
|
311
310
|
- app/services/completion_kit/mcp_tools/tags.rb
|
|
312
311
|
- app/services/completion_kit/metric_calibration_stats.rb
|
|
312
|
+
- app/services/completion_kit/metric_variant_generator.rb
|
|
313
313
|
- app/services/completion_kit/model_discovery_service.rb
|
|
314
314
|
- app/services/completion_kit/ollama_client.rb
|
|
315
315
|
- app/services/completion_kit/onboarding/checklist.rb
|
|
@@ -347,6 +347,8 @@ files:
|
|
|
347
347
|
- app/views/completion_kit/metric_groups/new.html.erb
|
|
348
348
|
- app/views/completion_kit/metric_groups/show.html.erb
|
|
349
349
|
- app/views/completion_kit/metrics/_form.html.erb
|
|
350
|
+
- app/views/completion_kit/metrics/_rubric_diff.html.erb
|
|
351
|
+
- app/views/completion_kit/metrics/_rubric_hint.html.erb
|
|
350
352
|
- app/views/completion_kit/metrics/_starter_card.html.erb
|
|
351
353
|
- app/views/completion_kit/metrics/edit.html.erb
|
|
352
354
|
- app/views/completion_kit/metrics/index.html.erb
|
|
@@ -419,6 +421,9 @@ files:
|
|
|
419
421
|
- db/migrate/20260523000001_add_few_shot_examples_to_completion_kit_metrics.rb
|
|
420
422
|
- db/migrate/20260523000002_add_state_to_completion_kit_judge_versions.rb
|
|
421
423
|
- db/migrate/20260524000001_create_completion_kit_starter_metric_dismissals.rb
|
|
424
|
+
- db/migrate/20260525000001_add_version_number_and_published_at_to_judge_versions.rb
|
|
425
|
+
- db/migrate/20260528000001_rename_judge_version_to_metric_version.rb
|
|
426
|
+
- db/migrate/20260528000002_add_metric_version_to_reviews.rb
|
|
422
427
|
- lib/completion-kit.rb
|
|
423
428
|
- lib/completion_kit.rb
|
|
424
429
|
- lib/completion_kit/concurrency_check.rb
|