completion-kit 0.5.38 → 0.5.39

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fc7d527828189c2993060b315dca634fb958d2da11fd7fae63c4790179c46701
4
- data.tar.gz: f0323b980bdfb35d36742b548ddd3629e66d39e587775521678dc80b4cd2f068
3
+ metadata.gz: d8d052b5ce9253412be890b820439248547d767575969e4260566a63426ac612
4
+ data.tar.gz: 8e2f73e59c977c1923b90c9b36fae7dd8eadd35d0c499ae04cea1d63113e7655
5
5
  SHA512:
6
- metadata.gz: '020946bdac698194bb5246cfbe21fdf45c56006c80c15d1d7bcfda4d3494d95cde45645e090df14a411b172c83dcde42be777d74811b25a340e5710dba6ae7ce'
7
- data.tar.gz: 1b4f0ea8cf4e613df783ac428404ef1ae19b285db04f8a6768119760c65fb81d9ee72d52905ab2ef30e077ae910eb00e622925c1dcc43aee0c9e3a0f748718b1
6
+ metadata.gz: 54dd9bd2a4b2e64f929865508649ca2ada6972840715552b920b2bcc156b74cc76fe957b8ac58ec2f9ad7d8594dbe2ef15c600efb10304963b66b226cdee959b
7
+ data.tar.gz: 2db1e93c654e7d0de826a9f9c0ffadae292cf57d1dfff1df71763c5a98da4fc6d547560808bee9c8364f64c08c747661546a91f330d0effcda7b3587547d35e8
@@ -1,12 +1,26 @@
1
- <% %w[400 500 700].each do |weight| %>
2
1
  @font-face {
3
2
  font-family: 'JetBrains Mono';
4
3
  font-style: normal;
5
- font-weight: <%= weight %>;
4
+ font-weight: 400;
5
+ font-display: swap;
6
+ src: url('completion_kit/jetbrains-mono-400.woff2') format('woff2');
7
+ }
8
+
9
+ @font-face {
10
+ font-family: 'JetBrains Mono';
11
+ font-style: normal;
12
+ font-weight: 500;
13
+ font-display: swap;
14
+ src: url('completion_kit/jetbrains-mono-500.woff2') format('woff2');
15
+ }
16
+
17
+ @font-face {
18
+ font-family: 'JetBrains Mono';
19
+ font-style: normal;
20
+ font-weight: 700;
6
21
  font-display: swap;
7
- src: url('<%= asset_path("completion_kit/jetbrains-mono-#{weight}.woff2") %>') format('woff2');
22
+ src: url('completion_kit/jetbrains-mono-700.woff2') format('woff2');
8
23
  }
9
- <% end %>
10
24
 
11
25
  .turbo-progress-bar {
12
26
  background-color: var(--ck-accent);
@@ -2751,6 +2765,11 @@ select.ck-input {
2751
2765
  gap: 1rem;
2752
2766
  }
2753
2767
 
2768
+ .ck-review-card__header .ck-inline {
2769
+ flex-wrap: nowrap;
2770
+ flex-shrink: 0;
2771
+ }
2772
+
2754
2773
  .ck-review-card__metric {
2755
2774
  font-family: var(--ck-mono);
2756
2775
  font-size: 0.95rem;
@@ -2827,6 +2846,12 @@ select.ck-input {
2827
2846
  padding: 1rem;
2828
2847
  }
2829
2848
 
2849
+ .ck-review-card__header {
2850
+ flex-direction: column;
2851
+ align-items: flex-start;
2852
+ gap: 0.5rem;
2853
+ }
2854
+
2830
2855
  /* Topbar nav collapses behind the hamburger trigger. */
2831
2856
  .ck-nav-menu__trigger {
2832
2857
  display: inline-flex;
@@ -5372,3 +5397,13 @@ a.tag-mark {
5372
5397
  color: var(--ck-success);
5373
5398
  margin-right: 6px;
5374
5399
  }
5400
+
5401
+ .ck-calibration__error {
5402
+ margin: 8px 0 0;
5403
+ padding: 8px 10px;
5404
+ background: var(--ck-danger-soft);
5405
+ border: 1px solid rgba(248, 113, 113, 0.3);
5406
+ border-radius: 4px;
5407
+ color: var(--ck-danger);
5408
+ font-size: 0.82rem;
5409
+ }
@@ -5,9 +5,18 @@ module CompletionKit
5
5
 
6
6
  def create
7
7
  created_by = calibration_creator
8
- calibration = Calibration.find_or_initialize_by(
8
+ existing = Calibration.find_by(
9
9
  run_id: @run.id, response_id: @response.id, metric_id: @metric.id, created_by: created_by
10
10
  )
11
+
12
+ if params[:verdict] == "disagree" && params[:corrected_score].blank?
13
+ render_calibration(calibration: existing, pending_verdict: "disagree")
14
+ return
15
+ end
16
+
17
+ calibration = existing || Calibration.new(
18
+ run: @run, response: @response, metric: @metric, created_by: created_by
19
+ )
11
20
  calibration.assign_attributes(
12
21
  judge_version: JudgeVersion.ensure_current_for(@metric),
13
22
  verdict: params[:verdict],
@@ -16,19 +25,36 @@ module CompletionKit
16
25
  )
17
26
 
18
27
  if calibration.save
19
- render turbo_stream: turbo_stream.replace(
20
- "calibration_#{@response.id}_#{@metric.id}",
21
- partial: "completion_kit/calibrations/buttons",
22
- locals: { review: review_for_metric, calibration: calibration, run: @run, response_row: @response, metric: @metric }
23
- )
28
+ render_calibration(calibration: calibration)
24
29
  else
25
- flash[:alert] = calibration.errors.full_messages.to_sentence
26
- redirect_to run_response_path(@run, @response)
30
+ render_calibration(
31
+ calibration: existing,
32
+ pending_verdict: params[:verdict],
33
+ error: calibration.errors.full_messages.to_sentence,
34
+ status: :unprocessable_entity
35
+ )
27
36
  end
28
37
  end
29
38
 
30
39
  private
31
40
 
41
+ def render_calibration(calibration:, pending_verdict: nil, error: nil, status: :ok)
42
+ locals = {
43
+ review: review_for_metric,
44
+ calibration: calibration,
45
+ run: @run,
46
+ response_row: @response,
47
+ metric: @metric,
48
+ pending_verdict: pending_verdict,
49
+ error: error
50
+ }
51
+ render turbo_stream: turbo_stream.replace(
52
+ "calibration_#{@response.id}_#{@metric.id}",
53
+ partial: "completion_kit/calibrations/buttons",
54
+ locals: locals
55
+ ), status: status
56
+ end
57
+
32
58
  def ensure_calibration_enabled
33
59
  head :not_found unless CompletionKit.config.judge_calibration_enabled
34
60
  end
@@ -55,7 +55,7 @@ module CompletionKit
55
55
  end
56
56
  generator.persist!(variants)
57
57
  label = variants.length == 1 ? "alternative" : "alternatives"
58
- redirect_to metric_path(@metric), notice: "Wrote #{variants.length} #{label} for the judge instruction. Pick one to make it live."
58
+ redirect_to metric_path(@metric), notice: "Wrote #{variants.length} #{label} for this metric. Pick one to make it live."
59
59
  end
60
60
 
61
61
  def publish_draft
@@ -39,7 +39,8 @@ module CompletionKit
39
39
  private
40
40
 
41
41
  def build_meta_prompt
42
- examples = JudgeCalibrationExamples.for(@metric)
42
+ disagreements = JudgeCalibrationExamples.disagreements_for(@metric)
43
+ borderlines = JudgeCalibrationExamples.borderlines_for(@metric)
43
44
  sections = []
44
45
  sections << "You are an expert evaluator. Rewrite a judge's grading instruction so it agrees better with humans on the cases below."
45
46
  sections << ""
@@ -51,17 +52,31 @@ module CompletionKit
51
52
  sections << "## Rubric (unchanged across variants — only rewrite the instruction)"
52
53
  sections << @metric.display_rubric_text
53
54
  sections << ""
54
- sections << "## Recent disagreements (judge vs human)"
55
- examples.each_with_index do |ex, i|
56
- sections << "### Case #{i + 1}"
57
- sections << "Input: #{ex[:input].to_s.truncate(200)}"
58
- sections << "Output: #{ex[:output].to_s.truncate(200)}"
59
- sections << "Judge said #{ex[:judge_score]}/5: #{ex[:judge_feedback].to_s.truncate(160)}"
60
- sections << "Human said #{ex[:human_score]}/5: #{ex[:human_note].to_s.truncate(160)}"
61
- sections << ""
55
+ if disagreements.any?
56
+ sections << "## Recent disagreements (judge vs human)"
57
+ disagreements.each_with_index do |ex, i|
58
+ sections << "### Case #{i + 1}"
59
+ sections << "Input: #{ex[:input].to_s.truncate(200)}"
60
+ sections << "Output: #{ex[:output].to_s.truncate(200)}"
61
+ sections << "Judge said #{ex[:judge_score]}/5: #{ex[:judge_feedback].to_s.truncate(160)}"
62
+ sections << "Human said #{ex[:human_score]}/5: #{ex[:human_note].to_s.truncate(160)}"
63
+ sections << ""
64
+ end
65
+ end
66
+ if borderlines.any?
67
+ sections << "## Rubric-ambiguous cases (humans marked these borderline)"
68
+ sections << "Each case below is one where a human said the rubric was unclear. Use these to sharpen language, split overlapping bands, or call out edge cases explicitly."
69
+ borderlines.each_with_index do |ex, i|
70
+ sections << "### Borderline #{i + 1}"
71
+ sections << "Input: #{ex[:input].to_s.truncate(200)}"
72
+ sections << "Output: #{ex[:output].to_s.truncate(200)}"
73
+ sections << "Judge said #{ex[:judge_score]}/5: #{ex[:judge_feedback].to_s.truncate(160)}"
74
+ sections << "Human note: #{ex[:human_note].to_s.truncate(200)}" if ex[:human_note].to_s.present?
75
+ sections << ""
76
+ end
62
77
  end
63
78
  sections << "## Task"
64
- sections << "Propose #{@count} alternative instructions. Each should be a focused rewrite — not a wholesale rewrite of the rubric. Aim to close the disagreement gap."
79
+ sections << "Propose #{@count} alternative instructions. Each should be a focused rewrite — not a wholesale rewrite of the rubric. Close the disagreement gap and disambiguate the borderline cases."
65
80
  sections << ""
66
81
  sections << "Respond in EXACTLY this format, repeated #{@count} times:"
67
82
  sections << ""
@@ -88,11 +103,23 @@ module CompletionKit
88
103
  module_function
89
104
 
90
105
  def for(metric, limit: 8)
91
- disagreements = Calibration.where(metric_id: metric.id, verdict: "disagree")
92
- .includes(response: :reviews)
93
- .order(created_at: :desc)
94
- .limit(limit)
95
- disagreements.map do |cal|
106
+ disagreements_for(metric, limit: limit)
107
+ end
108
+
109
+ def disagreements_for(metric, limit: 8)
110
+ calibrations_for(metric, verdict: "disagree", limit: limit)
111
+ end
112
+
113
+ def borderlines_for(metric, limit: 6)
114
+ calibrations_for(metric, verdict: "borderline", limit: limit)
115
+ end
116
+
117
+ def calibrations_for(metric, verdict:, limit:)
118
+ Calibration.where(metric_id: metric.id, verdict: verdict)
119
+ .includes(response: :reviews)
120
+ .order(created_at: :desc)
121
+ .limit(limit)
122
+ .map do |cal|
96
123
  review = cal.response.reviews.find { |r| r.metric_id == metric.id }
97
124
  {
98
125
  input: cal.response.input_data,
@@ -1,12 +1,15 @@
1
1
  <div id="calibration_<%= response_row.id %>_<%= metric.id %>" class="ck-calibration">
2
2
  <% current_verdict = calibration&.verdict %>
3
+ <% pending_verdict = local_assigns[:pending_verdict] %>
4
+ <% active_verdict = pending_verdict || current_verdict %>
5
+ <% error = local_assigns[:error] %>
3
6
  <% verdict_count = CompletionKit::Calibration.where(response_id: response_row.id, metric_id: metric.id).count %>
4
7
  <p class="ck-calibration__prompt">
5
8
  Your verdict
6
9
  <% if verdict_count > 0 %>
7
- <span class="ck-calibration__count"><%= pluralize(verdict_count, "verdict") %> on this score · <%= link_to "judge trust →", metric_path(metric), class: "ck-link" %></span>
10
+ <span class="ck-calibration__count"><%= pluralize(verdict_count, "verdict") %> on this score · <%= link_to "trust score →", metric_path(metric), class: "ck-link" %></span>
8
11
  <% else %>
9
- <span class="ck-calibration__count">Tell us what you think — was the score right? Verdicts roll up into <%= link_to "judge trust", metric_path(metric), class: "ck-link" %>.</span>
12
+ <span class="ck-calibration__count">Tell us what you think — was the score right? Verdicts roll up into the metric's <%= link_to "trust score", metric_path(metric), class: "ck-link" %>.</span>
10
13
  <% end %>
11
14
  </p>
12
15
  <div class="ck-calibration__buttons">
@@ -20,8 +23,8 @@
20
23
  <%= button_to run_response_calibrations_path(run, response_row, metric_id: metric.id, verdict: verdict),
21
24
  method: :post,
22
25
  form: { data: { turbo: "true" } },
23
- class: "ck-calibration__pill ck-calibration__pill--#{verdict}#{' is-active' if verdict == current_verdict}",
24
- "aria-pressed": (verdict == current_verdict).to_s,
26
+ class: "ck-calibration__pill ck-calibration__pill--#{verdict}#{' is-active' if verdict == active_verdict}",
27
+ "aria-pressed": (verdict == active_verdict).to_s,
25
28
  title: verdict_hints[verdict] do %>
26
29
  <%= heroicon_tag verdict_icons[verdict], variant: :outline, size: 14, "aria-hidden": "true" %>
27
30
  <span><%= verdict %></span>
@@ -29,7 +32,11 @@
29
32
  <% end %>
30
33
  </div>
31
34
 
32
- <% if current_verdict == "disagree" %>
35
+ <% if error.present? %>
36
+ <p class="ck-calibration__error" role="alert"><%= error %></p>
37
+ <% end %>
38
+
39
+ <% if active_verdict == "disagree" %>
33
40
  <%= form_with url: run_response_calibrations_path(run, response_row),
34
41
  method: :post, local: false,
35
42
  class: "ck-calibration__detail" do |f| %>
@@ -37,23 +44,24 @@
37
44
  <%= hidden_field_tag :verdict, "disagree" %>
38
45
  <label class="ck-label">
39
46
  What should the score have been?
40
- <span class="ck-calibration__value" data-calibration-value><%= calibration.corrected_score || review&.ai_score || 3 %></span>
47
+ <span class="ck-calibration__value" data-calibration-value><%= calibration&.corrected_score || review&.ai_score || 3 %></span>
41
48
  </label>
42
49
  <input type="range" name="corrected_score" min="1" max="5" step="0.5"
43
- value="<%= calibration.corrected_score || review&.ai_score || 3 %>"
50
+ value="<%= calibration&.corrected_score || review&.ai_score || 3 %>"
44
51
  oninput="this.closest('.ck-calibration__detail').querySelector('[data-calibration-value]').textContent = this.value"
45
- class="ck-slider">
46
- <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="Why? (optional)"><%= calibration.note %></textarea>
47
- <%= f.submit "Save", class: ck_button_classes(:dark) %>
52
+ class="ck-slider"
53
+ required>
54
+ <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="Why? (optional)"><%= calibration&.note %></textarea>
55
+ <%= f.submit (current_verdict == "disagree" ? "Update" : "Save disagree"), class: ck_button_classes(:dark) %>
48
56
  <% end %>
49
- <% elsif current_verdict == "borderline" %>
57
+ <% elsif active_verdict == "borderline" %>
50
58
  <%= form_with url: run_response_calibrations_path(run, response_row),
51
59
  method: :post, local: false,
52
60
  class: "ck-calibration__detail" do |f| %>
53
61
  <%= hidden_field_tag :metric_id, metric.id %>
54
62
  <%= hidden_field_tag :verdict, "borderline" %>
55
- <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="What made this borderline? (optional)"><%= calibration.note %></textarea>
56
- <%= f.submit "Save", class: ck_button_classes(:dark) %>
63
+ <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="What made this borderline? (optional)"><%= calibration&.note %></textarea>
64
+ <%= f.submit (current_verdict == "borderline" ? "Update" : "Save"), class: ck_button_classes(:dark) %>
57
65
  <% end %>
58
66
  <% end %>
59
67
  </div>
@@ -1,6 +1,6 @@
1
1
  <% stats = local_assigns[:stats] %>
2
2
  <div class="ck-trust-panel ck-trust-panel--<%= stats.gate %>">
3
- <p class="ck-trust-panel__label" title="How often the judge agrees with the humans who reviewed its scores.">Judge trust</p>
3
+ <p class="ck-trust-panel__label" title="How often this metric's scores match the humans who reviewed them.">Trust score</p>
4
4
  <% if stats.counter_only? %>
5
5
  <div class="ck-trust-panel__body">
6
6
  <span class="ck-trust-panel__counter"><%= stats.sample_size %><span class="ck-trust-panel__counter-of">/ <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></span></span>
@@ -19,7 +19,7 @@
19
19
  <tr>
20
20
  <th scope="col">Name</th>
21
21
  <th scope="col">Instruction</th>
22
- <th scope="col" title="How often the judge agrees with the humans who reviewed its scores.">Judge trust</th>
22
+ <th scope="col" title="How often this metric's scores match the humans who reviewed them.">Trust score</th>
23
23
  <th scope="col">In groups</th>
24
24
  <th scope="col"></th>
25
25
  </tr>
@@ -36,7 +36,7 @@
36
36
  <% end %>
37
37
  </td>
38
38
  <td data-label="Instruction" class="ck-meta-copy"><div class="ck-clamp-2"><%= metric.instruction.presence || "—" %></div></td>
39
- <td data-label="Judge trust" class="ck-metrics-table__trust">
39
+ <td data-label="Trust score" class="ck-metrics-table__trust">
40
40
  <% if CompletionKit.config.judge_calibration_enabled %>
41
41
  <% s = CompletionKit::MetricCalibrationStats.for(metric) %>
42
42
  <% if s.counter_only? %>
@@ -22,11 +22,11 @@
22
22
  </div>
23
23
  <div class="ck-actions">
24
24
  <% if CompletionKit.config.judge_calibration_enabled %>
25
- <%= button_to "Suggest rewrites", suggest_variants_metric_path(@metric),
25
+ <%= button_to "Improve the metric", suggest_variants_metric_path(@metric),
26
26
  method: :post, form_class: "inline-block",
27
27
  class: ck_button_classes(:light, variant: :outline),
28
- title: "Ask the model to rewrite this judge instruction based on the disagreements collected so far.",
29
- data: { turbo_confirm: "Ask the model to rewrite this judge instruction based on the disagreements collected so far?" } %>
28
+ title: "Ask the model to rewrite this metric's instruction based on the disagreements and rubric-ambiguous cases collected so far.",
29
+ data: { turbo_confirm: "Ask the model to rewrite this metric's instruction based on the disagreements and rubric-ambiguous cases collected so far?" } %>
30
30
  <% end %>
31
31
  <%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
32
32
  </div>
@@ -41,7 +41,7 @@
41
41
  <% if @metric.instruction.present? %>
42
42
  <section class="ck-card">
43
43
  <p class="ck-kicker">Instruction</p>
44
- <div class="ck-note-box"><%= simple_format(@metric.instruction) %></div>
44
+ <%= simple_format(@metric.instruction, {}, class: "ck-copy") %>
45
45
  </section>
46
46
  <% end %>
47
47
 
@@ -134,10 +134,10 @@
134
134
  <% if @suggestion_drafts.any? %>
135
135
  <section class="ck-card ck-card--spaced">
136
136
  <div class="ck-prompt-preview__header">
137
- <p class="ck-kicker">Suggested rewrites</p>
137
+ <p class="ck-kicker">Suggested improvements</p>
138
138
  <span class="ck-chip"><%= @suggestion_drafts.size %> option<%= @suggestion_drafts.size == 1 ? "" : "s" %></span>
139
139
  </div>
140
- <p class="ck-meta-copy">The model wrote these alternate instructions based on the disagreements above. Pick one to make it the live judge — the previous version is kept in history.</p>
140
+ <p class="ck-meta-copy">Based on your verdicts, the model proposed these alternative instructions for this metric. Pick one to make it live — the previous version stays in history.</p>
141
141
  <div class="ck-suggestion-list">
142
142
  <% @suggestion_drafts.each do |draft| %>
143
143
  <article class="ck-suggestion-card">
@@ -1,3 +1,3 @@
1
1
  module CompletionKit
2
- VERSION = "0.5.38"
2
+ VERSION = "0.5.39"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: completion-kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.38
4
+ version: 0.5.39
5
5
  platform: ruby
6
6
  authors:
7
7
  - Damien Bastin
@@ -233,7 +233,7 @@ files:
233
233
  - app/assets/images/completion_kit/favicon.ico
234
234
  - app/assets/images/completion_kit/logo.png
235
235
  - app/assets/javascripts/completion_kit/application.js
236
- - app/assets/stylesheets/completion_kit/application.css.erb
236
+ - app/assets/stylesheets/completion_kit/application.css
237
237
  - app/controllers/completion_kit/api/v1/base_controller.rb
238
238
  - app/controllers/completion_kit/api/v1/calibrations_controller.rb
239
239
  - app/controllers/completion_kit/api/v1/datasets_controller.rb