completion-kit 0.5.37 → 0.5.39
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/stylesheets/completion_kit/{application.css.erb → application.css} +50 -4
- data/app/controllers/completion_kit/calibrations_controller.rb +34 -8
- data/app/controllers/completion_kit/metrics_controller.rb +11 -5
- data/app/services/completion_kit/judge_variant_generator.rb +42 -15
- data/app/views/completion_kit/calibrations/_buttons.html.erb +29 -13
- data/app/views/completion_kit/calibrations/_trust_panel.html.erb +11 -14
- data/app/views/completion_kit/metrics/index.html.erb +18 -0
- data/app/views/completion_kit/metrics/show.html.erb +20 -17
- data/lib/completion_kit/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: d8d052b5ce9253412be890b820439248547d767575969e4260566a63426ac612
|
|
4
|
+
data.tar.gz: 8e2f73e59c977c1923b90c9b36fae7dd8eadd35d0c499ae04cea1d63113e7655
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 54dd9bd2a4b2e64f929865508649ca2ada6972840715552b920b2bcc156b74cc76fe957b8ac58ec2f9ad7d8594dbe2ef15c600efb10304963b66b226cdee959b
|
|
7
|
+
data.tar.gz: 2db1e93c654e7d0de826a9f9c0ffadae292cf57d1dfff1df71763c5a98da4fc6d547560808bee9c8364f64c08c747661546a91f330d0effcda7b3587547d35e8
|
|
@@ -1,12 +1,26 @@
|
|
|
1
|
-
<% %w[400 500 700].each do |weight| %>
|
|
2
1
|
@font-face {
|
|
3
2
|
font-family: 'JetBrains Mono';
|
|
4
3
|
font-style: normal;
|
|
5
|
-
font-weight:
|
|
4
|
+
font-weight: 400;
|
|
5
|
+
font-display: swap;
|
|
6
|
+
src: url('completion_kit/jetbrains-mono-400.woff2') format('woff2');
|
|
7
|
+
}
|
|
8
|
+
|
|
9
|
+
@font-face {
|
|
10
|
+
font-family: 'JetBrains Mono';
|
|
11
|
+
font-style: normal;
|
|
12
|
+
font-weight: 500;
|
|
6
13
|
font-display: swap;
|
|
7
|
-
src: url('
|
|
14
|
+
src: url('completion_kit/jetbrains-mono-500.woff2') format('woff2');
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
@font-face {
|
|
18
|
+
font-family: 'JetBrains Mono';
|
|
19
|
+
font-style: normal;
|
|
20
|
+
font-weight: 700;
|
|
21
|
+
font-display: swap;
|
|
22
|
+
src: url('completion_kit/jetbrains-mono-700.woff2') format('woff2');
|
|
8
23
|
}
|
|
9
|
-
<% end %>
|
|
10
24
|
|
|
11
25
|
.turbo-progress-bar {
|
|
12
26
|
background-color: var(--ck-accent);
|
|
@@ -2751,6 +2765,11 @@ select.ck-input {
|
|
|
2751
2765
|
gap: 1rem;
|
|
2752
2766
|
}
|
|
2753
2767
|
|
|
2768
|
+
.ck-review-card__header .ck-inline {
|
|
2769
|
+
flex-wrap: nowrap;
|
|
2770
|
+
flex-shrink: 0;
|
|
2771
|
+
}
|
|
2772
|
+
|
|
2754
2773
|
.ck-review-card__metric {
|
|
2755
2774
|
font-family: var(--ck-mono);
|
|
2756
2775
|
font-size: 0.95rem;
|
|
@@ -2827,6 +2846,12 @@ select.ck-input {
|
|
|
2827
2846
|
padding: 1rem;
|
|
2828
2847
|
}
|
|
2829
2848
|
|
|
2849
|
+
.ck-review-card__header {
|
|
2850
|
+
flex-direction: column;
|
|
2851
|
+
align-items: flex-start;
|
|
2852
|
+
gap: 0.5rem;
|
|
2853
|
+
}
|
|
2854
|
+
|
|
2830
2855
|
/* Topbar nav collapses behind the hamburger trigger. */
|
|
2831
2856
|
.ck-nav-menu__trigger {
|
|
2832
2857
|
display: inline-flex;
|
|
@@ -5361,3 +5386,24 @@ a.tag-mark {
|
|
|
5361
5386
|
border-radius: 4px;
|
|
5362
5387
|
border: 1px solid var(--ck-line);
|
|
5363
5388
|
}
|
|
5389
|
+
|
|
5390
|
+
.ck-metrics-table__trust {
|
|
5391
|
+
font-family: var(--ck-mono);
|
|
5392
|
+
font-size: 0.78rem;
|
|
5393
|
+
letter-spacing: 0.03em;
|
|
5394
|
+
}
|
|
5395
|
+
.ck-metrics-table__trust-rate {
|
|
5396
|
+
font-weight: 600;
|
|
5397
|
+
color: var(--ck-success);
|
|
5398
|
+
margin-right: 6px;
|
|
5399
|
+
}
|
|
5400
|
+
|
|
5401
|
+
.ck-calibration__error {
|
|
5402
|
+
margin: 8px 0 0;
|
|
5403
|
+
padding: 8px 10px;
|
|
5404
|
+
background: var(--ck-danger-soft);
|
|
5405
|
+
border: 1px solid rgba(248, 113, 113, 0.3);
|
|
5406
|
+
border-radius: 4px;
|
|
5407
|
+
color: var(--ck-danger);
|
|
5408
|
+
font-size: 0.82rem;
|
|
5409
|
+
}
|
|
@@ -5,9 +5,18 @@ module CompletionKit
|
|
|
5
5
|
|
|
6
6
|
def create
|
|
7
7
|
created_by = calibration_creator
|
|
8
|
-
|
|
8
|
+
existing = Calibration.find_by(
|
|
9
9
|
run_id: @run.id, response_id: @response.id, metric_id: @metric.id, created_by: created_by
|
|
10
10
|
)
|
|
11
|
+
|
|
12
|
+
if params[:verdict] == "disagree" && params[:corrected_score].blank?
|
|
13
|
+
render_calibration(calibration: existing, pending_verdict: "disagree")
|
|
14
|
+
return
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
calibration = existing || Calibration.new(
|
|
18
|
+
run: @run, response: @response, metric: @metric, created_by: created_by
|
|
19
|
+
)
|
|
11
20
|
calibration.assign_attributes(
|
|
12
21
|
judge_version: JudgeVersion.ensure_current_for(@metric),
|
|
13
22
|
verdict: params[:verdict],
|
|
@@ -16,19 +25,36 @@ module CompletionKit
|
|
|
16
25
|
)
|
|
17
26
|
|
|
18
27
|
if calibration.save
|
|
19
|
-
|
|
20
|
-
"calibration_#{@response.id}_#{@metric.id}",
|
|
21
|
-
partial: "completion_kit/calibrations/buttons",
|
|
22
|
-
locals: { review: review_for_metric, calibration: calibration, run: @run, response_row: @response, metric: @metric }
|
|
23
|
-
)
|
|
28
|
+
render_calibration(calibration: calibration)
|
|
24
29
|
else
|
|
25
|
-
|
|
26
|
-
|
|
30
|
+
render_calibration(
|
|
31
|
+
calibration: existing,
|
|
32
|
+
pending_verdict: params[:verdict],
|
|
33
|
+
error: calibration.errors.full_messages.to_sentence,
|
|
34
|
+
status: :unprocessable_entity
|
|
35
|
+
)
|
|
27
36
|
end
|
|
28
37
|
end
|
|
29
38
|
|
|
30
39
|
private
|
|
31
40
|
|
|
41
|
+
def render_calibration(calibration:, pending_verdict: nil, error: nil, status: :ok)
|
|
42
|
+
locals = {
|
|
43
|
+
review: review_for_metric,
|
|
44
|
+
calibration: calibration,
|
|
45
|
+
run: @run,
|
|
46
|
+
response_row: @response,
|
|
47
|
+
metric: @metric,
|
|
48
|
+
pending_verdict: pending_verdict,
|
|
49
|
+
error: error
|
|
50
|
+
}
|
|
51
|
+
render turbo_stream: turbo_stream.replace(
|
|
52
|
+
"calibration_#{@response.id}_#{@metric.id}",
|
|
53
|
+
partial: "completion_kit/calibrations/buttons",
|
|
54
|
+
locals: locals
|
|
55
|
+
), status: status
|
|
56
|
+
end
|
|
57
|
+
|
|
32
58
|
def ensure_calibration_enabled
|
|
33
59
|
head :not_found unless CompletionKit.config.judge_calibration_enabled
|
|
34
60
|
end
|
|
@@ -54,12 +54,14 @@ module CompletionKit
|
|
|
54
54
|
return
|
|
55
55
|
end
|
|
56
56
|
generator.persist!(variants)
|
|
57
|
-
label = variants.length == 1 ? "
|
|
58
|
-
redirect_to metric_path(@metric), notice: "
|
|
57
|
+
label = variants.length == 1 ? "alternative" : "alternatives"
|
|
58
|
+
redirect_to metric_path(@metric), notice: "Wrote #{variants.length} #{label} for this metric. Pick one to make it live."
|
|
59
59
|
end
|
|
60
60
|
|
|
61
61
|
def publish_draft
|
|
62
|
-
|
|
62
|
+
scope = JudgeVersion.drafts.where(metric_id: @metric.id)
|
|
63
|
+
draft = params[:draft_id].present? ? scope.find_by(id: params[:draft_id]) : scope.order(created_at: :desc).first
|
|
64
|
+
|
|
63
65
|
if draft.nil?
|
|
64
66
|
redirect_to metric_path(@metric), alert: "No draft to publish."
|
|
65
67
|
return
|
|
@@ -68,9 +70,13 @@ module CompletionKit
|
|
|
68
70
|
JudgeVersion.transaction do
|
|
69
71
|
JudgeVersion.where(metric_id: @metric.id, state: "published").update_all(current: false)
|
|
70
72
|
draft.update!(state: "published", current: true)
|
|
73
|
+
@metric.update_columns(
|
|
74
|
+
instruction: draft.instruction,
|
|
75
|
+
rubric_bands: Array(draft.rubric_bands).to_json
|
|
76
|
+
)
|
|
71
77
|
end
|
|
72
78
|
|
|
73
|
-
redirect_to metric_path(@metric), notice: "
|
|
79
|
+
redirect_to metric_path(@metric), notice: "This judge version is now live."
|
|
74
80
|
end
|
|
75
81
|
|
|
76
82
|
def add_few_shot
|
|
@@ -88,7 +94,7 @@ module CompletionKit
|
|
|
88
94
|
"added_at" => Time.current.utc.iso8601
|
|
89
95
|
}
|
|
90
96
|
@metric.update!(few_shot_examples: examples)
|
|
91
|
-
redirect_to metric_path(@metric), notice: "
|
|
97
|
+
redirect_to metric_path(@metric), notice: "Saved as a teaching example. The judge will see it next time it grades."
|
|
92
98
|
end
|
|
93
99
|
|
|
94
100
|
private
|
|
@@ -39,7 +39,8 @@ module CompletionKit
|
|
|
39
39
|
private
|
|
40
40
|
|
|
41
41
|
def build_meta_prompt
|
|
42
|
-
|
|
42
|
+
disagreements = JudgeCalibrationExamples.disagreements_for(@metric)
|
|
43
|
+
borderlines = JudgeCalibrationExamples.borderlines_for(@metric)
|
|
43
44
|
sections = []
|
|
44
45
|
sections << "You are an expert evaluator. Rewrite a judge's grading instruction so it agrees better with humans on the cases below."
|
|
45
46
|
sections << ""
|
|
@@ -51,17 +52,31 @@ module CompletionKit
|
|
|
51
52
|
sections << "## Rubric (unchanged across variants — only rewrite the instruction)"
|
|
52
53
|
sections << @metric.display_rubric_text
|
|
53
54
|
sections << ""
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
55
|
+
if disagreements.any?
|
|
56
|
+
sections << "## Recent disagreements (judge vs human)"
|
|
57
|
+
disagreements.each_with_index do |ex, i|
|
|
58
|
+
sections << "### Case #{i + 1}"
|
|
59
|
+
sections << "Input: #{ex[:input].to_s.truncate(200)}"
|
|
60
|
+
sections << "Output: #{ex[:output].to_s.truncate(200)}"
|
|
61
|
+
sections << "Judge said #{ex[:judge_score]}/5: #{ex[:judge_feedback].to_s.truncate(160)}"
|
|
62
|
+
sections << "Human said #{ex[:human_score]}/5: #{ex[:human_note].to_s.truncate(160)}"
|
|
63
|
+
sections << ""
|
|
64
|
+
end
|
|
65
|
+
end
|
|
66
|
+
if borderlines.any?
|
|
67
|
+
sections << "## Rubric-ambiguous cases (humans marked these borderline)"
|
|
68
|
+
sections << "Each case below is one where a human said the rubric was unclear. Use these to sharpen language, split overlapping bands, or call out edge cases explicitly."
|
|
69
|
+
borderlines.each_with_index do |ex, i|
|
|
70
|
+
sections << "### Borderline #{i + 1}"
|
|
71
|
+
sections << "Input: #{ex[:input].to_s.truncate(200)}"
|
|
72
|
+
sections << "Output: #{ex[:output].to_s.truncate(200)}"
|
|
73
|
+
sections << "Judge said #{ex[:judge_score]}/5: #{ex[:judge_feedback].to_s.truncate(160)}"
|
|
74
|
+
sections << "Human note: #{ex[:human_note].to_s.truncate(200)}" if ex[:human_note].to_s.present?
|
|
75
|
+
sections << ""
|
|
76
|
+
end
|
|
62
77
|
end
|
|
63
78
|
sections << "## Task"
|
|
64
|
-
sections << "Propose #{@count} alternative instructions. Each should be a focused rewrite — not a wholesale rewrite of the rubric.
|
|
79
|
+
sections << "Propose #{@count} alternative instructions. Each should be a focused rewrite — not a wholesale rewrite of the rubric. Close the disagreement gap and disambiguate the borderline cases."
|
|
65
80
|
sections << ""
|
|
66
81
|
sections << "Respond in EXACTLY this format, repeated #{@count} times:"
|
|
67
82
|
sections << ""
|
|
@@ -88,11 +103,23 @@ module CompletionKit
|
|
|
88
103
|
module_function
|
|
89
104
|
|
|
90
105
|
def for(metric, limit: 8)
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
106
|
+
disagreements_for(metric, limit: limit)
|
|
107
|
+
end
|
|
108
|
+
|
|
109
|
+
def disagreements_for(metric, limit: 8)
|
|
110
|
+
calibrations_for(metric, verdict: "disagree", limit: limit)
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
def borderlines_for(metric, limit: 6)
|
|
114
|
+
calibrations_for(metric, verdict: "borderline", limit: limit)
|
|
115
|
+
end
|
|
116
|
+
|
|
117
|
+
def calibrations_for(metric, verdict:, limit:)
|
|
118
|
+
Calibration.where(metric_id: metric.id, verdict: verdict)
|
|
119
|
+
.includes(response: :reviews)
|
|
120
|
+
.order(created_at: :desc)
|
|
121
|
+
.limit(limit)
|
|
122
|
+
.map do |cal|
|
|
96
123
|
review = cal.response.reviews.find { |r| r.metric_id == metric.id }
|
|
97
124
|
{
|
|
98
125
|
input: cal.response.input_data,
|
|
@@ -1,51 +1,67 @@
|
|
|
1
1
|
<div id="calibration_<%= response_row.id %>_<%= metric.id %>" class="ck-calibration">
|
|
2
2
|
<% current_verdict = calibration&.verdict %>
|
|
3
|
+
<% pending_verdict = local_assigns[:pending_verdict] %>
|
|
4
|
+
<% active_verdict = pending_verdict || current_verdict %>
|
|
5
|
+
<% error = local_assigns[:error] %>
|
|
3
6
|
<% verdict_count = CompletionKit::Calibration.where(response_id: response_row.id, metric_id: metric.id).count %>
|
|
4
7
|
<p class="ck-calibration__prompt">
|
|
5
8
|
Your verdict
|
|
6
9
|
<% if verdict_count > 0 %>
|
|
7
|
-
<span class="ck-calibration__count"><%= pluralize(verdict_count, "verdict") %> on this score
|
|
10
|
+
<span class="ck-calibration__count"><%= pluralize(verdict_count, "verdict") %> on this score · <%= link_to "trust score →", metric_path(metric), class: "ck-link" %></span>
|
|
11
|
+
<% else %>
|
|
12
|
+
<span class="ck-calibration__count">Tell us what you think — was the score right? Verdicts roll up into the metric's <%= link_to "trust score", metric_path(metric), class: "ck-link" %>.</span>
|
|
8
13
|
<% end %>
|
|
9
14
|
</p>
|
|
10
15
|
<div class="ck-calibration__buttons">
|
|
11
16
|
<% verdict_icons = { "agree" => "hand-thumb-up", "disagree" => "hand-thumb-down", "borderline" => "scale" } %>
|
|
17
|
+
<% verdict_hints = {
|
|
18
|
+
"agree" => "The score looks right.",
|
|
19
|
+
"disagree" => "The score is wrong — you'll pick the right one.",
|
|
20
|
+
"borderline" => "The rubric is unclear here; either score could be defensible."
|
|
21
|
+
} %>
|
|
12
22
|
<% CompletionKit::Calibration::VERDICTS.each do |verdict| %>
|
|
13
23
|
<%= button_to run_response_calibrations_path(run, response_row, metric_id: metric.id, verdict: verdict),
|
|
14
24
|
method: :post,
|
|
15
25
|
form: { data: { turbo: "true" } },
|
|
16
|
-
class: "ck-calibration__pill ck-calibration__pill--#{verdict}#{' is-active' if verdict ==
|
|
17
|
-
"aria-pressed": (verdict ==
|
|
26
|
+
class: "ck-calibration__pill ck-calibration__pill--#{verdict}#{' is-active' if verdict == active_verdict}",
|
|
27
|
+
"aria-pressed": (verdict == active_verdict).to_s,
|
|
28
|
+
title: verdict_hints[verdict] do %>
|
|
18
29
|
<%= heroicon_tag verdict_icons[verdict], variant: :outline, size: 14, "aria-hidden": "true" %>
|
|
19
30
|
<span><%= verdict %></span>
|
|
20
31
|
<% end %>
|
|
21
32
|
<% end %>
|
|
22
33
|
</div>
|
|
23
34
|
|
|
24
|
-
<% if
|
|
35
|
+
<% if error.present? %>
|
|
36
|
+
<p class="ck-calibration__error" role="alert"><%= error %></p>
|
|
37
|
+
<% end %>
|
|
38
|
+
|
|
39
|
+
<% if active_verdict == "disagree" %>
|
|
25
40
|
<%= form_with url: run_response_calibrations_path(run, response_row),
|
|
26
41
|
method: :post, local: false,
|
|
27
42
|
class: "ck-calibration__detail" do |f| %>
|
|
28
43
|
<%= hidden_field_tag :metric_id, metric.id %>
|
|
29
44
|
<%= hidden_field_tag :verdict, "disagree" %>
|
|
30
45
|
<label class="ck-label">
|
|
31
|
-
|
|
32
|
-
<span class="ck-calibration__value" data-calibration-value><%= calibration
|
|
46
|
+
What should the score have been?
|
|
47
|
+
<span class="ck-calibration__value" data-calibration-value><%= calibration&.corrected_score || review&.ai_score || 3 %></span>
|
|
33
48
|
</label>
|
|
34
49
|
<input type="range" name="corrected_score" min="1" max="5" step="0.5"
|
|
35
|
-
value="<%= calibration
|
|
50
|
+
value="<%= calibration&.corrected_score || review&.ai_score || 3 %>"
|
|
36
51
|
oninput="this.closest('.ck-calibration__detail').querySelector('[data-calibration-value]').textContent = this.value"
|
|
37
|
-
class="ck-slider"
|
|
38
|
-
|
|
39
|
-
|
|
52
|
+
class="ck-slider"
|
|
53
|
+
required>
|
|
54
|
+
<textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="Why? (optional)"><%= calibration&.note %></textarea>
|
|
55
|
+
<%= f.submit (current_verdict == "disagree" ? "Update" : "Save disagree"), class: ck_button_classes(:dark) %>
|
|
40
56
|
<% end %>
|
|
41
|
-
<% elsif
|
|
57
|
+
<% elsif active_verdict == "borderline" %>
|
|
42
58
|
<%= form_with url: run_response_calibrations_path(run, response_row),
|
|
43
59
|
method: :post, local: false,
|
|
44
60
|
class: "ck-calibration__detail" do |f| %>
|
|
45
61
|
<%= hidden_field_tag :metric_id, metric.id %>
|
|
46
62
|
<%= hidden_field_tag :verdict, "borderline" %>
|
|
47
|
-
<textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="What made this borderline? (optional)"><%= calibration
|
|
48
|
-
<%= f.submit "Save", class: ck_button_classes(:dark) %>
|
|
63
|
+
<textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="What made this borderline? (optional)"><%= calibration&.note %></textarea>
|
|
64
|
+
<%= f.submit (current_verdict == "borderline" ? "Update" : "Save"), class: ck_button_classes(:dark) %>
|
|
49
65
|
<% end %>
|
|
50
66
|
<% end %>
|
|
51
67
|
</div>
|
|
@@ -1,34 +1,31 @@
|
|
|
1
1
|
<% stats = local_assigns[:stats] %>
|
|
2
2
|
<div class="ck-trust-panel ck-trust-panel--<%= stats.gate %>">
|
|
3
|
-
<p class="ck-trust-panel__label">
|
|
3
|
+
<p class="ck-trust-panel__label" title="How often this metric's scores match the humans who reviewed them.">Trust score</p>
|
|
4
4
|
<% if stats.counter_only? %>
|
|
5
5
|
<div class="ck-trust-panel__body">
|
|
6
6
|
<span class="ck-trust-panel__counter"><%= stats.sample_size %><span class="ck-trust-panel__counter-of">/ <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></span></span>
|
|
7
|
-
<span class="ck-trust-panel__hint">verdicts<% if stats.short_to_target > 0 %> · <%=
|
|
7
|
+
<span class="ck-trust-panel__hint">verdicts so far<% if stats.short_to_target > 0 %> · <%= stats.short_to_target %> more before we can score the judge<% end %></span>
|
|
8
8
|
</div>
|
|
9
9
|
<% else %>
|
|
10
10
|
<div class="ck-trust-panel__body">
|
|
11
|
-
<span class="ck-trust-panel__score"
|
|
12
|
-
|
|
13
|
-
<span class="ck-trust-
|
|
11
|
+
<span class="ck-trust-panel__score"
|
|
12
|
+
title="Roughly how often the judge and the humans agreed.">~<%= (stats.agreement_point * 100).round %><span class="ck-trust-panel__score-pct">%</span></span>
|
|
13
|
+
<span class="ck-trust-panel__margin"
|
|
14
|
+
title="The range we're confident the true rate sits in, given how few verdicts we have so far.">±<%= (stats.margin * 100).round %> pt</span>
|
|
15
|
+
<span class="ck-trust-panel__gate"
|
|
16
|
+
title="<%= stats.firm? ? 'Enough verdicts for a reliable read.' : 'Early read. Keep giving verdicts to tighten the margin.' %>"><%= stats.firm? ? "settled" : "early read" %></span>
|
|
14
17
|
</div>
|
|
15
18
|
<div class="ck-trust-panel__details">
|
|
16
|
-
<span><%= stats.sample_size
|
|
19
|
+
<span><%= pluralize(stats.sample_size, "verdict") %></span>
|
|
17
20
|
<% if stats.borderline_rate && stats.borderline_rate > 0 %>
|
|
18
21
|
<% level = if stats.borderline_rate > 0.30 then "danger"
|
|
19
22
|
elsif stats.borderline_rate > 0.15 then "warning"
|
|
20
23
|
else "ok" end %>
|
|
21
24
|
<span class="ck-trust-panel__borderline ck-trust-panel__borderline--<%= level %>"
|
|
22
|
-
title="<%= level == 'ok' ? '' : '
|
|
23
|
-
<%= (stats.borderline_rate * 100).round %>%
|
|
25
|
+
title="<%= level == 'ok' ? 'Some reviewers said the rubric was unclear here.' : 'A lot of reviewers say the rubric is unclear here. Consider splitting the metric or rewriting the rubric.' %>">
|
|
26
|
+
<%= (stats.borderline_rate * 100).round %>% said "unclear"
|
|
24
27
|
</span>
|
|
25
28
|
<% end %>
|
|
26
|
-
<% if stats.mae %>
|
|
27
|
-
<span>MAE <%= stats.mae.round(2) %></span>
|
|
28
|
-
<% end %>
|
|
29
|
-
<% if stats.kappa %>
|
|
30
|
-
<span>κ <%= stats.kappa.round(2) %></span>
|
|
31
|
-
<% end %>
|
|
32
29
|
</div>
|
|
33
30
|
<% end %>
|
|
34
31
|
</div>
|
|
@@ -19,6 +19,7 @@
|
|
|
19
19
|
<tr>
|
|
20
20
|
<th scope="col">Name</th>
|
|
21
21
|
<th scope="col">Instruction</th>
|
|
22
|
+
<th scope="col" title="How often this metric's scores match the humans who reviewed them.">Trust score</th>
|
|
22
23
|
<th scope="col">In groups</th>
|
|
23
24
|
<th scope="col"></th>
|
|
24
25
|
</tr>
|
|
@@ -35,6 +36,23 @@
|
|
|
35
36
|
<% end %>
|
|
36
37
|
</td>
|
|
37
38
|
<td data-label="Instruction" class="ck-meta-copy"><div class="ck-clamp-2"><%= metric.instruction.presence || "—" %></div></td>
|
|
39
|
+
<td data-label="Trust score" class="ck-metrics-table__trust">
|
|
40
|
+
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
41
|
+
<% s = CompletionKit::MetricCalibrationStats.for(metric) %>
|
|
42
|
+
<% if s.counter_only? %>
|
|
43
|
+
<% if s.sample_size.zero? %>
|
|
44
|
+
<span class="ck-meta-copy">No verdicts yet</span>
|
|
45
|
+
<% else %>
|
|
46
|
+
<span class="ck-meta-copy"><%= s.sample_size %> / <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts</span>
|
|
47
|
+
<% end %>
|
|
48
|
+
<% else %>
|
|
49
|
+
<span class="ck-metrics-table__trust-rate" title="<%= s.firm? ? 'Settled read.' : 'Early read — keep giving verdicts.' %>">~<%= (s.agreement_point * 100).round %>%</span>
|
|
50
|
+
<span class="ck-meta-copy">±<%= (s.margin * 100).round %> pt · <%= s.firm? ? "settled" : "early" %></span>
|
|
51
|
+
<% end %>
|
|
52
|
+
<% else %>
|
|
53
|
+
<span class="ck-meta-copy">—</span>
|
|
54
|
+
<% end %>
|
|
55
|
+
</td>
|
|
38
56
|
<td data-label="In groups">
|
|
39
57
|
<% groups = metric.metric_groups %>
|
|
40
58
|
<% if groups.any? %>
|
|
@@ -12,8 +12,8 @@
|
|
|
12
12
|
<% if @latest_draft %>
|
|
13
13
|
<div class="ck-draft-banner">
|
|
14
14
|
<span class="ck-chip ck-chip--soft">Draft pending</span>
|
|
15
|
-
<span class="ck-meta-copy">
|
|
16
|
-
<%= button_to "Publish
|
|
15
|
+
<span class="ck-meta-copy">A draft version of this judge is saved. Publishing it replaces the live instruction and rubric.</span>
|
|
16
|
+
<%= button_to "Publish this version", publish_draft_metric_path(@metric, draft_id: @latest_draft.id),
|
|
17
17
|
method: :post, form_class: "inline-block",
|
|
18
18
|
class: ck_button_classes(:dark) %>
|
|
19
19
|
</div>
|
|
@@ -22,10 +22,11 @@
|
|
|
22
22
|
</div>
|
|
23
23
|
<div class="ck-actions">
|
|
24
24
|
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
25
|
-
<%= button_to "
|
|
25
|
+
<%= button_to "Improve the metric", suggest_variants_metric_path(@metric),
|
|
26
26
|
method: :post, form_class: "inline-block",
|
|
27
27
|
class: ck_button_classes(:light, variant: :outline),
|
|
28
|
-
|
|
28
|
+
title: "Ask the model to rewrite this metric's instruction based on the disagreements and rubric-ambiguous cases collected so far.",
|
|
29
|
+
data: { turbo_confirm: "Ask the model to rewrite this metric's instruction based on the disagreements and rubric-ambiguous cases collected so far?" } %>
|
|
29
30
|
<% end %>
|
|
30
31
|
<%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
|
|
31
32
|
</div>
|
|
@@ -40,7 +41,7 @@
|
|
|
40
41
|
<% if @metric.instruction.present? %>
|
|
41
42
|
<section class="ck-card">
|
|
42
43
|
<p class="ck-kicker">Instruction</p>
|
|
43
|
-
|
|
44
|
+
<%= simple_format(@metric.instruction, {}, class: "ck-copy") %>
|
|
44
45
|
</section>
|
|
45
46
|
<% end %>
|
|
46
47
|
|
|
@@ -65,14 +66,15 @@
|
|
|
65
66
|
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
66
67
|
<section class="ck-card ck-card--spaced">
|
|
67
68
|
<div class="ck-prompt-preview__header">
|
|
68
|
-
<p class="ck-kicker">
|
|
69
|
+
<p class="ck-kicker">Where the judge got it wrong</p>
|
|
69
70
|
<% if @disagreements.any? %>
|
|
70
71
|
<span class="ck-chip"><%= pluralize(@disagreements.size, "row") %></span>
|
|
71
72
|
<% end %>
|
|
72
73
|
</div>
|
|
73
74
|
<% if @disagreements.empty? %>
|
|
74
|
-
<p class="ck-meta-copy">
|
|
75
|
+
<p class="ck-meta-copy">Nothing here yet. As people give a "disagree" verdict on response rows, those rows show up below so you can review the judge's misses and turn them into teaching examples.</p>
|
|
75
76
|
<% else %>
|
|
77
|
+
<p class="ck-meta-copy">Rows where a reviewer said the judge got it wrong. Save the best ones as teaching examples — the judge will see them next time it grades.</p>
|
|
76
78
|
<table class="ck-results-table ck-disagreements-table">
|
|
77
79
|
<thead>
|
|
78
80
|
<tr>
|
|
@@ -112,13 +114,14 @@
|
|
|
112
114
|
<td class="ck-meta-copy"><%= cal.note.to_s.truncate(120) %></td>
|
|
113
115
|
<td>
|
|
114
116
|
<% if already %>
|
|
115
|
-
<span class="ck-chip ck-chip--done">
|
|
117
|
+
<span class="ck-chip ck-chip--done">Saved as example</span>
|
|
116
118
|
<% else %>
|
|
117
|
-
<%= button_to "
|
|
119
|
+
<%= button_to "Teach the judge",
|
|
118
120
|
add_few_shot_metric_path(@metric, calibration_id: cal.id),
|
|
119
121
|
method: :post,
|
|
120
122
|
form_class: "inline-block",
|
|
121
|
-
class: ck_button_classes(:light, variant: :outline)
|
|
123
|
+
class: ck_button_classes(:light, variant: :outline),
|
|
124
|
+
title: "Save this row as a teaching example. The judge will see it next time it grades." %>
|
|
122
125
|
<% end %>
|
|
123
126
|
</td>
|
|
124
127
|
</tr>
|
|
@@ -131,20 +134,20 @@
|
|
|
131
134
|
<% if @suggestion_drafts.any? %>
|
|
132
135
|
<section class="ck-card ck-card--spaced">
|
|
133
136
|
<div class="ck-prompt-preview__header">
|
|
134
|
-
<p class="ck-kicker">Suggested
|
|
135
|
-
<span class="ck-chip"><%= @suggestion_drafts.size %>
|
|
137
|
+
<p class="ck-kicker">Suggested improvements</p>
|
|
138
|
+
<span class="ck-chip"><%= @suggestion_drafts.size %> option<%= @suggestion_drafts.size == 1 ? "" : "s" %></span>
|
|
136
139
|
</div>
|
|
137
|
-
<p class="ck-meta-copy">
|
|
140
|
+
<p class="ck-meta-copy">Based on your verdicts, the model proposed these alternative instructions for this metric. Pick one to make it live — the previous version stays in history.</p>
|
|
138
141
|
<div class="ck-suggestion-list">
|
|
139
142
|
<% @suggestion_drafts.each do |draft| %>
|
|
140
143
|
<article class="ck-suggestion-card">
|
|
141
144
|
<header class="ck-suggestion-card__header">
|
|
142
|
-
<span class="ck-chip ck-chip--soft">
|
|
145
|
+
<span class="ck-chip ck-chip--soft">Option #<%= draft.id %></span>
|
|
143
146
|
<time class="ck-meta-copy" data-relative-time datetime="<%= draft.created_at.utc.iso8601 %>"><%= time_ago_in_words(draft.created_at) %> ago</time>
|
|
144
147
|
</header>
|
|
145
148
|
<pre class="ck-code ck-suggestion-card__instruction"><%= draft.instruction %></pre>
|
|
146
149
|
<div class="ck-actions">
|
|
147
|
-
<%= button_to "
|
|
150
|
+
<%= button_to "Use this version", publish_draft_metric_path(@metric, draft_id: draft.id),
|
|
148
151
|
method: :post, form_class: "inline-block",
|
|
149
152
|
class: ck_button_classes(:dark) %>
|
|
150
153
|
</div>
|
|
@@ -157,10 +160,10 @@
|
|
|
157
160
|
<% if Array(@metric.few_shot_examples).any? %>
|
|
158
161
|
<section class="ck-card ck-card--spaced">
|
|
159
162
|
<div class="ck-prompt-preview__header">
|
|
160
|
-
<p class="ck-kicker">
|
|
163
|
+
<p class="ck-kicker">Teaching examples</p>
|
|
161
164
|
<span class="ck-chip"><%= pluralize(Array(@metric.few_shot_examples).size, "example") %></span>
|
|
162
165
|
</div>
|
|
163
|
-
<p class="ck-meta-copy">
|
|
166
|
+
<p class="ck-meta-copy">The judge sees these worked examples whenever it grades for this metric. Each shows what the judge gave and what a human said it should have been.</p>
|
|
164
167
|
<ol class="ck-few-shot-list">
|
|
165
168
|
<% Array(@metric.few_shot_examples).each do |fs| %>
|
|
166
169
|
<li class="ck-few-shot-item">
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: completion-kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.5.
|
|
4
|
+
version: 0.5.39
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Damien Bastin
|
|
@@ -233,7 +233,7 @@ files:
|
|
|
233
233
|
- app/assets/images/completion_kit/favicon.ico
|
|
234
234
|
- app/assets/images/completion_kit/logo.png
|
|
235
235
|
- app/assets/javascripts/completion_kit/application.js
|
|
236
|
-
- app/assets/stylesheets/completion_kit/application.css
|
|
236
|
+
- app/assets/stylesheets/completion_kit/application.css
|
|
237
237
|
- app/controllers/completion_kit/api/v1/base_controller.rb
|
|
238
238
|
- app/controllers/completion_kit/api/v1/calibrations_controller.rb
|
|
239
239
|
- app/controllers/completion_kit/api/v1/datasets_controller.rb
|