completion-kit 0.5.40 → 0.5.41
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/stylesheets/completion_kit/application.css +54 -1
- data/app/controllers/completion_kit/metrics_controller.rb +1 -1
- data/app/views/completion_kit/metrics/index.html.erb +15 -18
- data/app/views/completion_kit/metrics/show.html.erb +52 -65
- data/lib/completion_kit/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c92ab72bfe3b2fe9fa296a21e8430f7b8c3d9f949a933f7675f4ffd8059dd8ae
|
|
4
|
+
data.tar.gz: ec53ee3e3b29a4d283db95d90e7a1dd8afef3572647ae243c9458500600714c9
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 8b783ec1b478a365f9e8a04da486fc7a87251d53dd6c63720c7d88795a524546bf71da8a07eca79337863c6787f52c776e51c75940b854a6e477f8074bf1d016
|
|
7
|
+
data.tar.gz: 7444db06e0adf5e29c7d68426496e643c743187d9cc8d974367cfdf025c993c0e061ad2b4b3f9d6a70f1aca3d70d124744b2a7ae416d5827efab9b6339b496f7
|
|
@@ -5413,9 +5413,11 @@ a.tag-mark {
|
|
|
5413
5413
|
}
|
|
5414
5414
|
|
|
5415
5415
|
.ck-metrics-table__trust {
|
|
5416
|
+
margin: 4px 0 0;
|
|
5416
5417
|
font-family: var(--ck-mono);
|
|
5417
|
-
font-size: 0.
|
|
5418
|
+
font-size: 0.72rem;
|
|
5418
5419
|
letter-spacing: 0.03em;
|
|
5420
|
+
color: var(--ck-dim);
|
|
5419
5421
|
}
|
|
5420
5422
|
.ck-metrics-table__trust-rate {
|
|
5421
5423
|
font-weight: 600;
|
|
@@ -5485,3 +5487,54 @@ a.tag-mark {
|
|
|
5485
5487
|
@keyframes ck-saved-flash {
|
|
5486
5488
|
0% { background: var(--ck-success); border-color: var(--ck-success); }
|
|
5487
5489
|
}
|
|
5490
|
+
|
|
5491
|
+
.ck-disagreement-list {
|
|
5492
|
+
list-style: none;
|
|
5493
|
+
padding: 0;
|
|
5494
|
+
margin: 12px 0 0;
|
|
5495
|
+
display: flex;
|
|
5496
|
+
flex-direction: column;
|
|
5497
|
+
gap: 12px;
|
|
5498
|
+
}
|
|
5499
|
+
.ck-disagreement {
|
|
5500
|
+
padding: 14px;
|
|
5501
|
+
background: var(--ck-surface-soft);
|
|
5502
|
+
border: 1px solid var(--ck-line);
|
|
5503
|
+
border-radius: 6px;
|
|
5504
|
+
display: flex;
|
|
5505
|
+
flex-direction: column;
|
|
5506
|
+
gap: 8px;
|
|
5507
|
+
}
|
|
5508
|
+
.ck-disagreement__head {
|
|
5509
|
+
display: flex;
|
|
5510
|
+
align-items: center;
|
|
5511
|
+
justify-content: space-between;
|
|
5512
|
+
gap: 12px;
|
|
5513
|
+
flex-wrap: wrap;
|
|
5514
|
+
}
|
|
5515
|
+
.ck-disagreement__scores {
|
|
5516
|
+
display: inline-flex;
|
|
5517
|
+
align-items: center;
|
|
5518
|
+
gap: 8px;
|
|
5519
|
+
flex-wrap: wrap;
|
|
5520
|
+
}
|
|
5521
|
+
.ck-disagreement__scores-label {
|
|
5522
|
+
font-family: var(--ck-mono);
|
|
5523
|
+
font-size: 0.7rem;
|
|
5524
|
+
letter-spacing: 0.08em;
|
|
5525
|
+
text-transform: uppercase;
|
|
5526
|
+
color: var(--ck-dim);
|
|
5527
|
+
}
|
|
5528
|
+
.ck-disagreement__scores-arrow {
|
|
5529
|
+
color: var(--ck-dim);
|
|
5530
|
+
}
|
|
5531
|
+
.ck-disagreement__note {
|
|
5532
|
+
margin: 0;
|
|
5533
|
+
color: var(--ck-text);
|
|
5534
|
+
font-size: 0.92rem;
|
|
5535
|
+
line-height: 1.45;
|
|
5536
|
+
}
|
|
5537
|
+
.ck-disagreement__source {
|
|
5538
|
+
margin: 0;
|
|
5539
|
+
font-size: 0.78rem;
|
|
5540
|
+
}
|
|
@@ -109,7 +109,7 @@ module CompletionKit
|
|
|
109
109
|
"added_at" => Time.current.utc.iso8601
|
|
110
110
|
}
|
|
111
111
|
@metric.update!(few_shot_examples: examples)
|
|
112
|
-
redirect_to metric_path(@metric), notice: "
|
|
112
|
+
redirect_to metric_path(@metric), notice: "Got it. The judge will remember this next time it grades."
|
|
113
113
|
end
|
|
114
114
|
|
|
115
115
|
private
|
|
@@ -19,7 +19,6 @@
|
|
|
19
19
|
<tr>
|
|
20
20
|
<th scope="col">Name</th>
|
|
21
21
|
<th scope="col">Instruction</th>
|
|
22
|
-
<th scope="col" title="How often this metric's scores match the humans who reviewed them.">Trust level</th>
|
|
23
22
|
<th scope="col">In groups</th>
|
|
24
23
|
<th scope="col"></th>
|
|
25
24
|
</tr>
|
|
@@ -29,6 +28,21 @@
|
|
|
29
28
|
<tr onclick="window.location='<%= metric_path(metric) %>'" style="cursor: pointer;">
|
|
30
29
|
<td>
|
|
31
30
|
<%= link_to metric_path(metric), class: "ck-record-name" do %><strong><%= metric.name %></strong><% end %>
|
|
31
|
+
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
32
|
+
<% s = CompletionKit::MetricCalibrationStats.for(metric) %>
|
|
33
|
+
<p class="ck-metrics-table__trust">
|
|
34
|
+
<% if s.counter_only? %>
|
|
35
|
+
<% if s.sample_size.zero? %>
|
|
36
|
+
No verdicts yet
|
|
37
|
+
<% else %>
|
|
38
|
+
<%= s.sample_size %> / <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts
|
|
39
|
+
<% end %>
|
|
40
|
+
<% else %>
|
|
41
|
+
<span class="ck-metrics-table__trust-rate" title="<%= s.firm? ? 'Settled read.' : 'Early read — keep giving verdicts.' %>">~<%= (s.agreement_point * 100).round %>%</span>
|
|
42
|
+
±<%= (s.margin * 100).round %> pt · <%= s.firm? ? "settled" : "early" %>
|
|
43
|
+
<% end %>
|
|
44
|
+
</p>
|
|
45
|
+
<% end %>
|
|
32
46
|
<% if metric.tags.any? %>
|
|
33
47
|
<div class="tag-marks-row">
|
|
34
48
|
<%= render "completion_kit/tags/marks", tags: metric.tags %>
|
|
@@ -36,23 +50,6 @@
|
|
|
36
50
|
<% end %>
|
|
37
51
|
</td>
|
|
38
52
|
<td data-label="Instruction" class="ck-meta-copy"><div class="ck-clamp-2"><%= metric.instruction.presence || "—" %></div></td>
|
|
39
|
-
<td data-label="Trust level" class="ck-metrics-table__trust">
|
|
40
|
-
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
41
|
-
<% s = CompletionKit::MetricCalibrationStats.for(metric) %>
|
|
42
|
-
<% if s.counter_only? %>
|
|
43
|
-
<% if s.sample_size.zero? %>
|
|
44
|
-
<span class="ck-meta-copy">No verdicts yet</span>
|
|
45
|
-
<% else %>
|
|
46
|
-
<span class="ck-meta-copy"><%= s.sample_size %> / <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts</span>
|
|
47
|
-
<% end %>
|
|
48
|
-
<% else %>
|
|
49
|
-
<span class="ck-metrics-table__trust-rate" title="<%= s.firm? ? 'Settled read.' : 'Early read — keep giving verdicts.' %>">~<%= (s.agreement_point * 100).round %>%</span>
|
|
50
|
-
<span class="ck-meta-copy">±<%= (s.margin * 100).round %> pt · <%= s.firm? ? "settled" : "early" %></span>
|
|
51
|
-
<% end %>
|
|
52
|
-
<% else %>
|
|
53
|
-
<span class="ck-meta-copy">—</span>
|
|
54
|
-
<% end %>
|
|
55
|
-
</td>
|
|
56
53
|
<td data-label="In groups">
|
|
57
54
|
<% groups = metric.metric_groups %>
|
|
58
55
|
<% if groups.any? %>
|
|
@@ -140,82 +140,69 @@
|
|
|
140
140
|
</section>
|
|
141
141
|
<% end %>
|
|
142
142
|
|
|
143
|
-
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
143
|
+
<% if CompletionKit.config.judge_calibration_enabled && @disagreements.any? %>
|
|
144
144
|
<section class="ck-card ck-card--spaced">
|
|
145
145
|
<div class="ck-prompt-preview__header">
|
|
146
|
-
<p class="ck-kicker">
|
|
147
|
-
|
|
148
|
-
<span class="ck-chip"><%= pluralize(@disagreements.size, "row") %></span>
|
|
149
|
-
<% end %>
|
|
146
|
+
<p class="ck-kicker">Cases to learn from</p>
|
|
147
|
+
<span class="ck-chip"><%= pluralize(@disagreements.size, "row") %></span>
|
|
150
148
|
</div>
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
<
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
<% end %>
|
|
190
|
-
</td>
|
|
191
|
-
<td class="ck-meta-copy"><%= cal.note.to_s.truncate(120) %></td>
|
|
192
|
-
<td>
|
|
193
|
-
<% if already %>
|
|
194
|
-
<span class="ck-chip ck-chip--done">Saved as example</span>
|
|
195
|
-
<% else %>
|
|
196
|
-
<%= button_to "Teach the judge",
|
|
197
|
-
add_few_shot_metric_path(@metric, calibration_id: cal.id),
|
|
198
|
-
method: :post,
|
|
199
|
-
form_class: "inline-block",
|
|
200
|
-
class: ck_button_classes(:light, variant: :outline),
|
|
201
|
-
title: "Save this row as a teaching example. The judge will see it next time it grades." %>
|
|
202
|
-
<% end %>
|
|
203
|
-
</td>
|
|
204
|
-
</tr>
|
|
149
|
+
<p class="ck-meta-copy">Rows where a reviewer's score didn't match the judge's. Pin useful ones with <strong>Remember this</strong> — the judge sees them next time it grades.</p>
|
|
150
|
+
<% existing_ids = Array(@metric.few_shot_examples).map { |fs| fs["calibration_id"] } %>
|
|
151
|
+
<ul class="ck-disagreement-list">
|
|
152
|
+
<% @disagreements.each do |cal| %>
|
|
153
|
+
<% review = cal.response.reviews.find { |r| r.metric_id == @metric.id } %>
|
|
154
|
+
<% already = existing_ids.include?(cal.id) %>
|
|
155
|
+
<li class="ck-disagreement">
|
|
156
|
+
<div class="ck-disagreement__head">
|
|
157
|
+
<div class="ck-disagreement__scores">
|
|
158
|
+
<span class="ck-disagreement__scores-label">Judge</span>
|
|
159
|
+
<% if review&.ai_score %>
|
|
160
|
+
<span class="<%= ck_badge_classes(ck_score_kind(review.ai_score.to_f)) %>"><%= review.ai_score %></span>
|
|
161
|
+
<% else %>
|
|
162
|
+
<span class="ck-meta-copy">—</span>
|
|
163
|
+
<% end %>
|
|
164
|
+
<span class="ck-disagreement__scores-arrow">→</span>
|
|
165
|
+
<span class="ck-disagreement__scores-label">Human</span>
|
|
166
|
+
<% if cal.corrected_score %>
|
|
167
|
+
<span class="<%= ck_badge_classes(ck_score_kind(cal.corrected_score.to_f)) %>"><%= cal.corrected_score %></span>
|
|
168
|
+
<% else %>
|
|
169
|
+
<span class="ck-meta-copy">—</span>
|
|
170
|
+
<% end %>
|
|
171
|
+
</div>
|
|
172
|
+
<div class="ck-disagreement__action">
|
|
173
|
+
<% if already %>
|
|
174
|
+
<span class="ck-chip ck-chip--done">Remembered</span>
|
|
175
|
+
<% else %>
|
|
176
|
+
<%= button_to "Remember this",
|
|
177
|
+
add_few_shot_metric_path(@metric, calibration_id: cal.id),
|
|
178
|
+
method: :post,
|
|
179
|
+
form_class: "inline-block",
|
|
180
|
+
class: ck_button_classes(:light, variant: :outline),
|
|
181
|
+
title: "Pin this row so the judge sees it next time it grades for this metric." %>
|
|
182
|
+
<% end %>
|
|
183
|
+
</div>
|
|
184
|
+
</div>
|
|
185
|
+
<% if cal.note.to_s.present? %>
|
|
186
|
+
<p class="ck-disagreement__note"><%= cal.note %></p>
|
|
205
187
|
<% end %>
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
188
|
+
<p class="ck-disagreement__source ck-meta-copy">
|
|
189
|
+
<%= link_to cal.response.run.name.to_s.truncate(50), ck_run_path(cal.response.run), class: "ck-link" %>
|
|
190
|
+
·
|
|
191
|
+
<%= link_to "row ##{cal.response.id}", run_response_path(cal.response.run, cal.response), class: "ck-link" %>
|
|
192
|
+
</p>
|
|
193
|
+
</li>
|
|
194
|
+
<% end %>
|
|
195
|
+
</ul>
|
|
209
196
|
</section>
|
|
210
197
|
|
|
211
198
|
|
|
212
199
|
<% if Array(@metric.few_shot_examples).any? %>
|
|
213
200
|
<section class="ck-card ck-card--spaced">
|
|
214
201
|
<div class="ck-prompt-preview__header">
|
|
215
|
-
<p class="ck-kicker">
|
|
216
|
-
<span class="ck-chip"><%= pluralize(Array(@metric.few_shot_examples).size, "
|
|
202
|
+
<p class="ck-kicker">What the judge remembers</p>
|
|
203
|
+
<span class="ck-chip"><%= pluralize(Array(@metric.few_shot_examples).size, "case") %></span>
|
|
217
204
|
</div>
|
|
218
|
-
<p class="ck-meta-copy">
|
|
205
|
+
<p class="ck-meta-copy">Rows you've pinned so the judge sees them next time it grades. Each one shows what the judge gave and what a human said it should have been.</p>
|
|
219
206
|
<ol class="ck-few-shot-list">
|
|
220
207
|
<% Array(@metric.few_shot_examples).each do |fs| %>
|
|
221
208
|
<li class="ck-few-shot-item">
|