completion-kit 0.5.40 → 0.5.41

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d8875fa22de9c8626e401706818b271f3cb26bffa7faae1f583b29503228cead
4
- data.tar.gz: 5bf39fad883b2eed2f505b11403ab60fddf9efc5735073ff61c290d53f59a36d
3
+ metadata.gz: c92ab72bfe3b2fe9fa296a21e8430f7b8c3d9f949a933f7675f4ffd8059dd8ae
4
+ data.tar.gz: ec53ee3e3b29a4d283db95d90e7a1dd8afef3572647ae243c9458500600714c9
5
5
  SHA512:
6
- metadata.gz: a8f2a7f14235c1214b567b891defaf523a645a21a2409ed81df964893a260cccb1fc9bf63903794f952c91bc9c91f3c1e3850db751a08ce0edc49b360ad9642d
7
- data.tar.gz: e148b500e498a00dc370bf203fea3e2618b9f1c8fccd1dc5f220ae77a4988a8934198ad32bbf25a29bc84f948c687fab9a0ae16b9c03fe575fe9fca4cf98a0ea
6
+ metadata.gz: 8b783ec1b478a365f9e8a04da486fc7a87251d53dd6c63720c7d88795a524546bf71da8a07eca79337863c6787f52c776e51c75940b854a6e477f8074bf1d016
7
+ data.tar.gz: 7444db06e0adf5e29c7d68426496e643c743187d9cc8d974367cfdf025c993c0e061ad2b4b3f9d6a70f1aca3d70d124744b2a7ae416d5827efab9b6339b496f7
@@ -5413,9 +5413,11 @@ a.tag-mark {
5413
5413
  }
5414
5414
 
5415
5415
  .ck-metrics-table__trust {
5416
+ margin: 4px 0 0;
5416
5417
  font-family: var(--ck-mono);
5417
- font-size: 0.78rem;
5418
+ font-size: 0.72rem;
5418
5419
  letter-spacing: 0.03em;
5420
+ color: var(--ck-dim);
5419
5421
  }
5420
5422
  .ck-metrics-table__trust-rate {
5421
5423
  font-weight: 600;
@@ -5485,3 +5487,54 @@ a.tag-mark {
5485
5487
  @keyframes ck-saved-flash {
5486
5488
  0% { background: var(--ck-success); border-color: var(--ck-success); }
5487
5489
  }
5490
+
5491
+ .ck-disagreement-list {
5492
+ list-style: none;
5493
+ padding: 0;
5494
+ margin: 12px 0 0;
5495
+ display: flex;
5496
+ flex-direction: column;
5497
+ gap: 12px;
5498
+ }
5499
+ .ck-disagreement {
5500
+ padding: 14px;
5501
+ background: var(--ck-surface-soft);
5502
+ border: 1px solid var(--ck-line);
5503
+ border-radius: 6px;
5504
+ display: flex;
5505
+ flex-direction: column;
5506
+ gap: 8px;
5507
+ }
5508
+ .ck-disagreement__head {
5509
+ display: flex;
5510
+ align-items: center;
5511
+ justify-content: space-between;
5512
+ gap: 12px;
5513
+ flex-wrap: wrap;
5514
+ }
5515
+ .ck-disagreement__scores {
5516
+ display: inline-flex;
5517
+ align-items: center;
5518
+ gap: 8px;
5519
+ flex-wrap: wrap;
5520
+ }
5521
+ .ck-disagreement__scores-label {
5522
+ font-family: var(--ck-mono);
5523
+ font-size: 0.7rem;
5524
+ letter-spacing: 0.08em;
5525
+ text-transform: uppercase;
5526
+ color: var(--ck-dim);
5527
+ }
5528
+ .ck-disagreement__scores-arrow {
5529
+ color: var(--ck-dim);
5530
+ }
5531
+ .ck-disagreement__note {
5532
+ margin: 0;
5533
+ color: var(--ck-text);
5534
+ font-size: 0.92rem;
5535
+ line-height: 1.45;
5536
+ }
5537
+ .ck-disagreement__source {
5538
+ margin: 0;
5539
+ font-size: 0.78rem;
5540
+ }
@@ -109,7 +109,7 @@ module CompletionKit
109
109
  "added_at" => Time.current.utc.iso8601
110
110
  }
111
111
  @metric.update!(few_shot_examples: examples)
112
- redirect_to metric_path(@metric), notice: "Saved as a teaching example. The judge will see it next time it grades."
112
+ redirect_to metric_path(@metric), notice: "Got it. The judge will remember this next time it grades."
113
113
  end
114
114
 
115
115
  private
@@ -19,7 +19,6 @@
19
19
  <tr>
20
20
  <th scope="col">Name</th>
21
21
  <th scope="col">Instruction</th>
22
- <th scope="col" title="How often this metric's scores match the humans who reviewed them.">Trust level</th>
23
22
  <th scope="col">In groups</th>
24
23
  <th scope="col"></th>
25
24
  </tr>
@@ -29,6 +28,21 @@
29
28
  <tr onclick="window.location='<%= metric_path(metric) %>'" style="cursor: pointer;">
30
29
  <td>
31
30
  <%= link_to metric_path(metric), class: "ck-record-name" do %><strong><%= metric.name %></strong><% end %>
31
+ <% if CompletionKit.config.judge_calibration_enabled %>
32
+ <% s = CompletionKit::MetricCalibrationStats.for(metric) %>
33
+ <p class="ck-metrics-table__trust">
34
+ <% if s.counter_only? %>
35
+ <% if s.sample_size.zero? %>
36
+ No verdicts yet
37
+ <% else %>
38
+ <%= s.sample_size %> / <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts
39
+ <% end %>
40
+ <% else %>
41
+ <span class="ck-metrics-table__trust-rate" title="<%= s.firm? ? 'Settled read.' : 'Early read — keep giving verdicts.' %>">~<%= (s.agreement_point * 100).round %>%</span>
42
+ ±<%= (s.margin * 100).round %> pt · <%= s.firm? ? "settled" : "early" %>
43
+ <% end %>
44
+ </p>
45
+ <% end %>
32
46
  <% if metric.tags.any? %>
33
47
  <div class="tag-marks-row">
34
48
  <%= render "completion_kit/tags/marks", tags: metric.tags %>
@@ -36,23 +50,6 @@
36
50
  <% end %>
37
51
  </td>
38
52
  <td data-label="Instruction" class="ck-meta-copy"><div class="ck-clamp-2"><%= metric.instruction.presence || "—" %></div></td>
39
- <td data-label="Trust level" class="ck-metrics-table__trust">
40
- <% if CompletionKit.config.judge_calibration_enabled %>
41
- <% s = CompletionKit::MetricCalibrationStats.for(metric) %>
42
- <% if s.counter_only? %>
43
- <% if s.sample_size.zero? %>
44
- <span class="ck-meta-copy">No verdicts yet</span>
45
- <% else %>
46
- <span class="ck-meta-copy"><%= s.sample_size %> / <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts</span>
47
- <% end %>
48
- <% else %>
49
- <span class="ck-metrics-table__trust-rate" title="<%= s.firm? ? 'Settled read.' : 'Early read — keep giving verdicts.' %>">~<%= (s.agreement_point * 100).round %>%</span>
50
- <span class="ck-meta-copy">±<%= (s.margin * 100).round %> pt · <%= s.firm? ? "settled" : "early" %></span>
51
- <% end %>
52
- <% else %>
53
- <span class="ck-meta-copy">—</span>
54
- <% end %>
55
- </td>
56
53
  <td data-label="In groups">
57
54
  <% groups = metric.metric_groups %>
58
55
  <% if groups.any? %>
@@ -140,82 +140,69 @@
140
140
  </section>
141
141
  <% end %>
142
142
 
143
- <% if CompletionKit.config.judge_calibration_enabled %>
143
+ <% if CompletionKit.config.judge_calibration_enabled && @disagreements.any? %>
144
144
  <section class="ck-card ck-card--spaced">
145
145
  <div class="ck-prompt-preview__header">
146
- <p class="ck-kicker">Where the judge got it wrong</p>
147
- <% if @disagreements.any? %>
148
- <span class="ck-chip"><%= pluralize(@disagreements.size, "row") %></span>
149
- <% end %>
146
+ <p class="ck-kicker">Cases to learn from</p>
147
+ <span class="ck-chip"><%= pluralize(@disagreements.size, "row") %></span>
150
148
  </div>
151
- <% if @disagreements.empty? %>
152
- <p class="ck-meta-copy">Nothing here yet. As people give a "disagree" verdict on response rows, those rows show up below so you can review the judge's misses and turn them into teaching examples.</p>
153
- <% else %>
154
- <p class="ck-meta-copy">Rows where a reviewer said the judge got it wrong. Save the best ones as teaching examples — the judge will see them next time it grades.</p>
155
- <table class="ck-results-table ck-disagreements-table">
156
- <thead>
157
- <tr>
158
- <th scope="col">Run · row</th>
159
- <th scope="col">Judge</th>
160
- <th scope="col">Human</th>
161
- <th scope="col">Note</th>
162
- <th scope="col"></th>
163
- </tr>
164
- </thead>
165
- <tbody>
166
- <% existing_ids = Array(@metric.few_shot_examples).map { |fs| fs["calibration_id"] } %>
167
- <% @disagreements.each do |cal| %>
168
- <% review = cal.response.reviews.find { |r| r.metric_id == @metric.id } %>
169
- <% already = existing_ids.include?(cal.id) %>
170
- <tr>
171
- <td>
172
- <%= link_to ck_run_path(cal.response.run), class: "ck-record-name" do %>
173
- <strong><%= cal.response.run.name.to_s.truncate(40) %></strong>
174
- <% end %>
175
- <span class="ck-meta-copy">· <%= link_to "row ##{cal.response.id}", run_response_path(cal.response.run, cal.response), class: "ck-link" %></span>
176
- </td>
177
- <td>
178
- <% if review&.ai_score %>
179
- <span class="<%= ck_badge_classes(ck_score_kind(review.ai_score.to_f)) %>"><%= review.ai_score %></span>
180
- <% else %>
181
- <span class="ck-meta-copy">—</span>
182
- <% end %>
183
- </td>
184
- <td>
185
- <% if cal.corrected_score %>
186
- <span class="<%= ck_badge_classes(ck_score_kind(cal.corrected_score.to_f)) %>"><%= cal.corrected_score %></span>
187
- <% else %>
188
- <span class="ck-meta-copy">—</span>
189
- <% end %>
190
- </td>
191
- <td class="ck-meta-copy"><%= cal.note.to_s.truncate(120) %></td>
192
- <td>
193
- <% if already %>
194
- <span class="ck-chip ck-chip--done">Saved as example</span>
195
- <% else %>
196
- <%= button_to "Teach the judge",
197
- add_few_shot_metric_path(@metric, calibration_id: cal.id),
198
- method: :post,
199
- form_class: "inline-block",
200
- class: ck_button_classes(:light, variant: :outline),
201
- title: "Save this row as a teaching example. The judge will see it next time it grades." %>
202
- <% end %>
203
- </td>
204
- </tr>
149
+ <p class="ck-meta-copy">Rows where a reviewer's score didn't match the judge's. Pin useful ones with <strong>Remember this</strong> — the judge sees them next time it grades.</p>
150
+ <% existing_ids = Array(@metric.few_shot_examples).map { |fs| fs["calibration_id"] } %>
151
+ <ul class="ck-disagreement-list">
152
+ <% @disagreements.each do |cal| %>
153
+ <% review = cal.response.reviews.find { |r| r.metric_id == @metric.id } %>
154
+ <% already = existing_ids.include?(cal.id) %>
155
+ <li class="ck-disagreement">
156
+ <div class="ck-disagreement__head">
157
+ <div class="ck-disagreement__scores">
158
+ <span class="ck-disagreement__scores-label">Judge</span>
159
+ <% if review&.ai_score %>
160
+ <span class="<%= ck_badge_classes(ck_score_kind(review.ai_score.to_f)) %>"><%= review.ai_score %></span>
161
+ <% else %>
162
+ <span class="ck-meta-copy">—</span>
163
+ <% end %>
164
+ <span class="ck-disagreement__scores-arrow">→</span>
165
+ <span class="ck-disagreement__scores-label">Human</span>
166
+ <% if cal.corrected_score %>
167
+ <span class="<%= ck_badge_classes(ck_score_kind(cal.corrected_score.to_f)) %>"><%= cal.corrected_score %></span>
168
+ <% else %>
169
+ <span class="ck-meta-copy">—</span>
170
+ <% end %>
171
+ </div>
172
+ <div class="ck-disagreement__action">
173
+ <% if already %>
174
+ <span class="ck-chip ck-chip--done">Remembered</span>
175
+ <% else %>
176
+ <%= button_to "Remember this",
177
+ add_few_shot_metric_path(@metric, calibration_id: cal.id),
178
+ method: :post,
179
+ form_class: "inline-block",
180
+ class: ck_button_classes(:light, variant: :outline),
181
+ title: "Pin this row so the judge sees it next time it grades for this metric." %>
182
+ <% end %>
183
+ </div>
184
+ </div>
185
+ <% if cal.note.to_s.present? %>
186
+ <p class="ck-disagreement__note"><%= cal.note %></p>
205
187
  <% end %>
206
- </tbody>
207
- </table>
208
- <% end %>
188
+ <p class="ck-disagreement__source ck-meta-copy">
189
+ <%= link_to cal.response.run.name.to_s.truncate(50), ck_run_path(cal.response.run), class: "ck-link" %>
190
+ ·
191
+ <%= link_to "row ##{cal.response.id}", run_response_path(cal.response.run, cal.response), class: "ck-link" %>
192
+ </p>
193
+ </li>
194
+ <% end %>
195
+ </ul>
209
196
  </section>
210
197
 
211
198
 
212
199
  <% if Array(@metric.few_shot_examples).any? %>
213
200
  <section class="ck-card ck-card--spaced">
214
201
  <div class="ck-prompt-preview__header">
215
- <p class="ck-kicker">Teaching examples</p>
216
- <span class="ck-chip"><%= pluralize(Array(@metric.few_shot_examples).size, "example") %></span>
202
+ <p class="ck-kicker">What the judge remembers</p>
203
+ <span class="ck-chip"><%= pluralize(Array(@metric.few_shot_examples).size, "case") %></span>
217
204
  </div>
218
- <p class="ck-meta-copy">The judge sees these worked examples whenever it grades for this metric. Each shows what the judge gave and what a human said it should have been.</p>
205
+ <p class="ck-meta-copy">Rows you've pinned so the judge sees them next time it grades. Each one shows what the judge gave and what a human said it should have been.</p>
219
206
  <ol class="ck-few-shot-list">
220
207
  <% Array(@metric.few_shot_examples).each do |fs| %>
221
208
  <li class="ck-few-shot-item">
@@ -1,3 +1,3 @@
1
1
  module CompletionKit
2
- VERSION = "0.5.40"
2
+ VERSION = "0.5.41"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: completion-kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.40
4
+ version: 0.5.41
5
5
  platform: ruby
6
6
  authors:
7
7
  - Damien Bastin