completion-kit 0.5.37 → 0.5.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1ccc7d1feb86aed6af17569a642d8b8e81fe522f0a7c68ca4ebb34abc113dbce
4
- data.tar.gz: 88793eabe6b04c3497c761cde5b61511623c5a9844ce7e101560f8eb3b492e18
3
+ metadata.gz: fc7d527828189c2993060b315dca634fb958d2da11fd7fae63c4790179c46701
4
+ data.tar.gz: f0323b980bdfb35d36742b548ddd3629e66d39e587775521678dc80b4cd2f068
5
5
  SHA512:
6
- metadata.gz: d133a9d0db55ee41eb07e290b9657e044c8a0836806bbd055d0b7b6d1cf8b981056b40e3a8795a951e36e7d7dbcc7626e249c2a2f4cba9492fad38aa931b6bfc
7
- data.tar.gz: 71bbbe827f33648b12f121c949af74fe8d02702d44c70fd39beb4795c8f95d2b9941aa755d631a78029e0b412cdec7ca9e2bea107d1395ea001abe46dcfddf3f
6
+ metadata.gz: '020946bdac698194bb5246cfbe21fdf45c56006c80c15d1d7bcfda4d3494d95cde45645e090df14a411b172c83dcde42be777d74811b25a340e5710dba6ae7ce'
7
+ data.tar.gz: 1b4f0ea8cf4e613df783ac428404ef1ae19b285db04f8a6768119760c65fb81d9ee72d52905ab2ef30e077ae910eb00e622925c1dcc43aee0c9e3a0f748718b1
@@ -5361,3 +5361,14 @@ a.tag-mark {
5361
5361
  border-radius: 4px;
5362
5362
  border: 1px solid var(--ck-line);
5363
5363
  }
5364
+
5365
+ .ck-metrics-table__trust {
5366
+ font-family: var(--ck-mono);
5367
+ font-size: 0.78rem;
5368
+ letter-spacing: 0.03em;
5369
+ }
5370
+ .ck-metrics-table__trust-rate {
5371
+ font-weight: 600;
5372
+ color: var(--ck-success);
5373
+ margin-right: 6px;
5374
+ }
@@ -54,12 +54,14 @@ module CompletionKit
54
54
  return
55
55
  end
56
56
  generator.persist!(variants)
57
- label = variants.length == 1 ? "judge variant" : "judge variants"
58
- redirect_to metric_path(@metric), notice: "Generated #{variants.length} #{label} as drafts. Pick one to publish."
57
+ label = variants.length == 1 ? "alternative" : "alternatives"
58
+ redirect_to metric_path(@metric), notice: "Wrote #{variants.length} #{label} for the judge instruction. Pick one to make it live."
59
59
  end
60
60
 
61
61
  def publish_draft
62
- draft = JudgeVersion.drafts.where(metric_id: @metric.id).order(created_at: :desc).first
62
+ scope = JudgeVersion.drafts.where(metric_id: @metric.id)
63
+ draft = params[:draft_id].present? ? scope.find_by(id: params[:draft_id]) : scope.order(created_at: :desc).first
64
+
63
65
  if draft.nil?
64
66
  redirect_to metric_path(@metric), alert: "No draft to publish."
65
67
  return
@@ -68,9 +70,13 @@ module CompletionKit
68
70
  JudgeVersion.transaction do
69
71
  JudgeVersion.where(metric_id: @metric.id, state: "published").update_all(current: false)
70
72
  draft.update!(state: "published", current: true)
73
+ @metric.update_columns(
74
+ instruction: draft.instruction,
75
+ rubric_bands: Array(draft.rubric_bands).to_json
76
+ )
71
77
  end
72
78
 
73
- redirect_to metric_path(@metric), notice: "Draft published as the current judge version."
79
+ redirect_to metric_path(@metric), notice: "This judge version is now live."
74
80
  end
75
81
 
76
82
  def add_few_shot
@@ -88,7 +94,7 @@ module CompletionKit
88
94
  "added_at" => Time.current.utc.iso8601
89
95
  }
90
96
  @metric.update!(few_shot_examples: examples)
91
- redirect_to metric_path(@metric), notice: "Added as a judge few-shot."
97
+ redirect_to metric_path(@metric), notice: "Saved as a teaching example. The judge will see it next time it grades."
92
98
  end
93
99
 
94
100
  private
@@ -4,17 +4,25 @@
4
4
  <p class="ck-calibration__prompt">
5
5
  Your verdict
6
6
  <% if verdict_count > 0 %>
7
- <span class="ck-calibration__count"><%= pluralize(verdict_count, "verdict") %> on this score</span>
7
+ <span class="ck-calibration__count"><%= pluralize(verdict_count, "verdict") %> on this score · <%= link_to "judge trust →", metric_path(metric), class: "ck-link" %></span>
8
+ <% else %>
9
+ <span class="ck-calibration__count">Tell us what you think — was the score right? Verdicts roll up into <%= link_to "judge trust", metric_path(metric), class: "ck-link" %>.</span>
8
10
  <% end %>
9
11
  </p>
10
12
  <div class="ck-calibration__buttons">
11
13
  <% verdict_icons = { "agree" => "hand-thumb-up", "disagree" => "hand-thumb-down", "borderline" => "scale" } %>
14
+ <% verdict_hints = {
15
+ "agree" => "The score looks right.",
16
+ "disagree" => "The score is wrong — you'll pick the right one.",
17
+ "borderline" => "The rubric is unclear here; either score could be defensible."
18
+ } %>
12
19
  <% CompletionKit::Calibration::VERDICTS.each do |verdict| %>
13
20
  <%= button_to run_response_calibrations_path(run, response_row, metric_id: metric.id, verdict: verdict),
14
21
  method: :post,
15
22
  form: { data: { turbo: "true" } },
16
23
  class: "ck-calibration__pill ck-calibration__pill--#{verdict}#{' is-active' if verdict == current_verdict}",
17
- "aria-pressed": (verdict == current_verdict).to_s do %>
24
+ "aria-pressed": (verdict == current_verdict).to_s,
25
+ title: verdict_hints[verdict] do %>
18
26
  <%= heroicon_tag verdict_icons[verdict], variant: :outline, size: 14, "aria-hidden": "true" %>
19
27
  <span><%= verdict %></span>
20
28
  <% end %>
@@ -28,7 +36,7 @@
28
36
  <%= hidden_field_tag :metric_id, metric.id %>
29
37
  <%= hidden_field_tag :verdict, "disagree" %>
30
38
  <label class="ck-label">
31
- Your score
39
+ What should the score have been?
32
40
  <span class="ck-calibration__value" data-calibration-value><%= calibration.corrected_score || review&.ai_score || 3 %></span>
33
41
  </label>
34
42
  <input type="range" name="corrected_score" min="1" max="5" step="0.5"
@@ -1,34 +1,31 @@
1
1
  <% stats = local_assigns[:stats] %>
2
2
  <div class="ck-trust-panel ck-trust-panel--<%= stats.gate %>">
3
- <p class="ck-trust-panel__label">Judge trust</p>
3
+ <p class="ck-trust-panel__label" title="How often the judge agrees with the humans who reviewed its scores.">Judge trust</p>
4
4
  <% if stats.counter_only? %>
5
5
  <div class="ck-trust-panel__body">
6
6
  <span class="ck-trust-panel__counter"><%= stats.sample_size %><span class="ck-trust-panel__counter-of">/ <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></span></span>
7
- <span class="ck-trust-panel__hint">verdicts<% if stats.short_to_target > 0 %> · <%= pluralize(stats.short_to_target, "more") %> to score<% end %></span>
7
+ <span class="ck-trust-panel__hint">verdicts so far<% if stats.short_to_target > 0 %> · <%= stats.short_to_target %> more before we can score the judge<% end %></span>
8
8
  </div>
9
9
  <% else %>
10
10
  <div class="ck-trust-panel__body">
11
- <span class="ck-trust-panel__score">~<%= (stats.agreement_point * 100).round %><span class="ck-trust-panel__score-pct">%</span></span>
12
- <span class="ck-trust-panel__margin">±<%= (stats.margin * 100).round %> pt</span>
13
- <span class="ck-trust-panel__gate"><%= stats.firm? ? "settled" : "provisional" %></span>
11
+ <span class="ck-trust-panel__score"
12
+ title="Roughly how often the judge and the humans agreed.">~<%= (stats.agreement_point * 100).round %><span class="ck-trust-panel__score-pct">%</span></span>
13
+ <span class="ck-trust-panel__margin"
14
+ title="The range we're confident the true rate sits in, given how few verdicts we have so far.">±<%= (stats.margin * 100).round %> pt</span>
15
+ <span class="ck-trust-panel__gate"
16
+ title="<%= stats.firm? ? 'Enough verdicts for a reliable read.' : 'Early read. Keep giving verdicts to tighten the margin.' %>"><%= stats.firm? ? "settled" : "early read" %></span>
14
17
  </div>
15
18
  <div class="ck-trust-panel__details">
16
- <span><%= stats.sample_size %> verdicts</span>
19
+ <span><%= pluralize(stats.sample_size, "verdict") %></span>
17
20
  <% if stats.borderline_rate && stats.borderline_rate > 0 %>
18
21
  <% level = if stats.borderline_rate > 0.30 then "danger"
19
22
  elsif stats.borderline_rate > 0.15 then "warning"
20
23
  else "ok" end %>
21
24
  <span class="ck-trust-panel__borderline ck-trust-panel__borderline--<%= level %>"
22
- title="<%= level == 'ok' ? '' : 'Rubric ambiguous. Consider splitting the metric or clarifying the rubric.' %>">
23
- <%= (stats.borderline_rate * 100).round %>% borderline
25
+ title="<%= level == 'ok' ? 'Some reviewers said the rubric was unclear here.' : 'A lot of reviewers say the rubric is unclear here. Consider splitting the metric or rewriting the rubric.' %>">
26
+ <%= (stats.borderline_rate * 100).round %>% said "unclear"
24
27
  </span>
25
28
  <% end %>
26
- <% if stats.mae %>
27
- <span>MAE <%= stats.mae.round(2) %></span>
28
- <% end %>
29
- <% if stats.kappa %>
30
- <span>κ <%= stats.kappa.round(2) %></span>
31
- <% end %>
32
29
  </div>
33
30
  <% end %>
34
31
  </div>
@@ -19,6 +19,7 @@
19
19
  <tr>
20
20
  <th scope="col">Name</th>
21
21
  <th scope="col">Instruction</th>
22
+ <th scope="col" title="How often the judge agrees with the humans who reviewed its scores.">Judge trust</th>
22
23
  <th scope="col">In groups</th>
23
24
  <th scope="col"></th>
24
25
  </tr>
@@ -35,6 +36,23 @@
35
36
  <% end %>
36
37
  </td>
37
38
  <td data-label="Instruction" class="ck-meta-copy"><div class="ck-clamp-2"><%= metric.instruction.presence || "—" %></div></td>
39
+ <td data-label="Judge trust" class="ck-metrics-table__trust">
40
+ <% if CompletionKit.config.judge_calibration_enabled %>
41
+ <% s = CompletionKit::MetricCalibrationStats.for(metric) %>
42
+ <% if s.counter_only? %>
43
+ <% if s.sample_size.zero? %>
44
+ <span class="ck-meta-copy">No verdicts yet</span>
45
+ <% else %>
46
+ <span class="ck-meta-copy"><%= s.sample_size %> / <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts</span>
47
+ <% end %>
48
+ <% else %>
49
+ <span class="ck-metrics-table__trust-rate" title="<%= s.firm? ? 'Settled read.' : 'Early read — keep giving verdicts.' %>">~<%= (s.agreement_point * 100).round %>%</span>
50
+ <span class="ck-meta-copy">±<%= (s.margin * 100).round %> pt · <%= s.firm? ? "settled" : "early" %></span>
51
+ <% end %>
52
+ <% else %>
53
+ <span class="ck-meta-copy">—</span>
54
+ <% end %>
55
+ </td>
38
56
  <td data-label="In groups">
39
57
  <% groups = metric.metric_groups %>
40
58
  <% if groups.any? %>
@@ -12,8 +12,8 @@
12
12
  <% if @latest_draft %>
13
13
  <div class="ck-draft-banner">
14
14
  <span class="ck-chip ck-chip--soft">Draft pending</span>
15
- <span class="ck-meta-copy">An edit forked a draft judge version. Publish it to make this the current judge.</span>
16
- <%= button_to "Publish draft", publish_draft_metric_path(@metric),
15
+ <span class="ck-meta-copy">A draft version of this judge is saved. Publishing it replaces the live instruction and rubric.</span>
16
+ <%= button_to "Publish this version", publish_draft_metric_path(@metric, draft_id: @latest_draft.id),
17
17
  method: :post, form_class: "inline-block",
18
18
  class: ck_button_classes(:dark) %>
19
19
  </div>
@@ -22,10 +22,11 @@
22
22
  </div>
23
23
  <div class="ck-actions">
24
24
  <% if CompletionKit.config.judge_calibration_enabled %>
25
- <%= button_to "Suggest improvements", suggest_variants_metric_path(@metric),
25
+ <%= button_to "Suggest rewrites", suggest_variants_metric_path(@metric),
26
26
  method: :post, form_class: "inline-block",
27
27
  class: ck_button_classes(:light, variant: :outline),
28
- data: { turbo_confirm: "Ask the model to propose new judge instructions based on the disagreements collected so far?" } %>
28
+ title: "Ask the model to rewrite this judge instruction based on the disagreements collected so far.",
29
+ data: { turbo_confirm: "Ask the model to rewrite this judge instruction based on the disagreements collected so far?" } %>
29
30
  <% end %>
30
31
  <%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
31
32
  </div>
@@ -65,14 +66,15 @@
65
66
  <% if CompletionKit.config.judge_calibration_enabled %>
66
67
  <section class="ck-card ck-card--spaced">
67
68
  <div class="ck-prompt-preview__header">
68
- <p class="ck-kicker">Disagreements</p>
69
+ <p class="ck-kicker">Where the judge got it wrong</p>
69
70
  <% if @disagreements.any? %>
70
71
  <span class="ck-chip"><%= pluralize(@disagreements.size, "row") %></span>
71
72
  <% end %>
72
73
  </div>
73
74
  <% if @disagreements.empty? %>
74
- <p class="ck-meta-copy">No disagreements yet. As humans give the verdict "disagree" on individual rows, the judge's misses will show up here for review.</p>
75
+ <p class="ck-meta-copy">Nothing here yet. As people give a "disagree" verdict on response rows, those rows show up below so you can review the judge's misses and turn them into teaching examples.</p>
75
76
  <% else %>
77
+ <p class="ck-meta-copy">Rows where a reviewer said the judge got it wrong. Save the best ones as teaching examples — the judge will see them next time it grades.</p>
76
78
  <table class="ck-results-table ck-disagreements-table">
77
79
  <thead>
78
80
  <tr>
@@ -112,13 +114,14 @@
112
114
  <td class="ck-meta-copy"><%= cal.note.to_s.truncate(120) %></td>
113
115
  <td>
114
116
  <% if already %>
115
- <span class="ck-chip ck-chip--done">Added</span>
117
+ <span class="ck-chip ck-chip--done">Saved as example</span>
116
118
  <% else %>
117
- <%= button_to "Add as judge few-shot",
119
+ <%= button_to "Teach the judge",
118
120
  add_few_shot_metric_path(@metric, calibration_id: cal.id),
119
121
  method: :post,
120
122
  form_class: "inline-block",
121
- class: ck_button_classes(:light, variant: :outline) %>
123
+ class: ck_button_classes(:light, variant: :outline),
124
+ title: "Save this row as a teaching example. The judge will see it next time it grades." %>
122
125
  <% end %>
123
126
  </td>
124
127
  </tr>
@@ -131,20 +134,20 @@
131
134
  <% if @suggestion_drafts.any? %>
132
135
  <section class="ck-card ck-card--spaced">
133
136
  <div class="ck-prompt-preview__header">
134
- <p class="ck-kicker">Suggested judge variants</p>
135
- <span class="ck-chip"><%= @suggestion_drafts.size %> draft<%= @suggestion_drafts.size == 1 ? "" : "s" %></span>
137
+ <p class="ck-kicker">Suggested rewrites</p>
138
+ <span class="ck-chip"><%= @suggestion_drafts.size %> option<%= @suggestion_drafts.size == 1 ? "" : "s" %></span>
136
139
  </div>
137
- <p class="ck-meta-copy">Pick one and publish it to make it the current judge. The previous published version stays in history.</p>
140
+ <p class="ck-meta-copy">The model wrote these alternate instructions based on the disagreements above. Pick one to make it the live judge — the previous version is kept in history.</p>
138
141
  <div class="ck-suggestion-list">
139
142
  <% @suggestion_drafts.each do |draft| %>
140
143
  <article class="ck-suggestion-card">
141
144
  <header class="ck-suggestion-card__header">
142
- <span class="ck-chip ck-chip--soft">Draft #<%= draft.id %></span>
145
+ <span class="ck-chip ck-chip--soft">Option #<%= draft.id %></span>
143
146
  <time class="ck-meta-copy" data-relative-time datetime="<%= draft.created_at.utc.iso8601 %>"><%= time_ago_in_words(draft.created_at) %> ago</time>
144
147
  </header>
145
148
  <pre class="ck-code ck-suggestion-card__instruction"><%= draft.instruction %></pre>
146
149
  <div class="ck-actions">
147
- <%= button_to "Publish this draft", publish_draft_metric_path(@metric),
150
+ <%= button_to "Use this version", publish_draft_metric_path(@metric, draft_id: draft.id),
148
151
  method: :post, form_class: "inline-block",
149
152
  class: ck_button_classes(:dark) %>
150
153
  </div>
@@ -157,10 +160,10 @@
157
160
  <% if Array(@metric.few_shot_examples).any? %>
158
161
  <section class="ck-card ck-card--spaced">
159
162
  <div class="ck-prompt-preview__header">
160
- <p class="ck-kicker">Judge few-shot examples</p>
163
+ <p class="ck-kicker">Teaching examples</p>
161
164
  <span class="ck-chip"><%= pluralize(Array(@metric.few_shot_examples).size, "example") %></span>
162
165
  </div>
163
- <p class="ck-meta-copy">Disagreements added here will be injected as worked examples when the judge runs on this metric. Used by Phase 4 / 5 to retrain the judge.</p>
166
+ <p class="ck-meta-copy">The judge sees these worked examples whenever it grades for this metric. Each shows what the judge gave and what a human said it should have been.</p>
164
167
  <ol class="ck-few-shot-list">
165
168
  <% Array(@metric.few_shot_examples).each do |fs| %>
166
169
  <li class="ck-few-shot-item">
@@ -1,3 +1,3 @@
1
1
  module CompletionKit
2
- VERSION = "0.5.37"
2
+ VERSION = "0.5.38"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: completion-kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.37
4
+ version: 0.5.38
5
5
  platform: ruby
6
6
  authors:
7
7
  - Damien Bastin