completion-kit 0.5.42 → 0.5.43
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/javascripts/completion_kit/application.js +17 -0
- data/app/assets/stylesheets/completion_kit/application.css +505 -39
- data/app/controllers/completion_kit/metrics_controller.rb +35 -24
- data/app/jobs/completion_kit/judge_review_job.rb +11 -0
- data/app/models/completion_kit/judge_version.rb +32 -1
- data/app/services/completion_kit/judge_variant_generator.rb +8 -6
- data/app/services/completion_kit/metric_calibration_stats.rb +16 -4
- data/app/views/completion_kit/api_reference/_body.html.erb +1 -1
- data/app/views/completion_kit/calibrations/_buttons.html.erb +43 -6
- data/app/views/completion_kit/calibrations/_trust_panel.html.erb +27 -28
- data/app/views/completion_kit/metrics/_form.html.erb +90 -4
- data/app/views/completion_kit/metrics/_rubric_diff.html.erb +25 -0
- data/app/views/completion_kit/metrics/_rubric_hint.html.erb +4 -0
- data/app/views/completion_kit/metrics/_starter_card.html.erb +13 -9
- data/app/views/completion_kit/metrics/edit.html.erb +5 -1
- data/app/views/completion_kit/metrics/index.html.erb +5 -3
- data/app/views/completion_kit/metrics/show.html.erb +132 -126
- data/app/views/completion_kit/metrics/starter_preview.html.erb +6 -6
- data/app/views/completion_kit/responses/show.html.erb +1 -1
- data/app/views/completion_kit/runs/_status_panel.html.erb +2 -2
- data/config/routes.rb +2 -1
- data/db/migrate/20260525000001_add_version_number_and_published_at_to_judge_versions.rb +24 -0
- data/lib/completion_kit/version.rb +1 -1
- metadata +4 -1
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
2
|
class MetricsController < ApplicationController
|
|
3
3
|
include CompletionKit::TagFiltering
|
|
4
|
-
before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :publish_draft, :suggest_variants, :dismiss_suggestion]
|
|
4
|
+
before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :remove_few_shot, :publish_draft, :suggest_variants, :dismiss_suggestion]
|
|
5
5
|
|
|
6
6
|
def index
|
|
7
7
|
@metrics = apply_tag_filter(Metric.includes(:metric_groups, :tags).order(:name))
|
|
@@ -35,14 +35,16 @@ module CompletionKit
|
|
|
35
35
|
end
|
|
36
36
|
|
|
37
37
|
def show
|
|
38
|
+
@published_judge_version = JudgeVersion.ensure_current_for(@metric)
|
|
38
39
|
@disagreements = Calibration.where(metric_id: @metric.id, verdict: "disagree")
|
|
39
|
-
.includes(response: [:reviews, :run])
|
|
40
|
+
.includes(:judge_version, response: [:reviews, :run])
|
|
40
41
|
.order(created_at: :desc)
|
|
41
42
|
.limit(50)
|
|
42
43
|
@edit_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
|
|
43
|
-
@published_judge_version = JudgeVersion.published.where(metric_id: @metric.id, current: true).first
|
|
44
44
|
@suggestion_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
|
|
45
|
-
@improve_disagreement_count = @
|
|
45
|
+
@improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree",
|
|
46
|
+
judge_version_id: @published_judge_version.id).count
|
|
47
|
+
@versions = JudgeVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
|
|
46
48
|
end
|
|
47
49
|
|
|
48
50
|
def new
|
|
@@ -50,6 +52,9 @@ module CompletionKit
|
|
|
50
52
|
end
|
|
51
53
|
|
|
52
54
|
def edit
|
|
55
|
+
@suggestion_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
|
|
56
|
+
@edit_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
|
|
57
|
+
@published_judge_version = JudgeVersion.published.where(metric_id: @metric.id, current: true).first
|
|
53
58
|
end
|
|
54
59
|
|
|
55
60
|
def create
|
|
@@ -76,9 +81,10 @@ module CompletionKit
|
|
|
76
81
|
end
|
|
77
82
|
|
|
78
83
|
def suggest_variants
|
|
84
|
+
target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
|
|
79
85
|
disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
|
|
80
86
|
if disagreement_count.zero?
|
|
81
|
-
redirect_to
|
|
87
|
+
redirect_to target, alert: "Mark at least one row as Disagree before asking the model to suggest a change."
|
|
82
88
|
return
|
|
83
89
|
end
|
|
84
90
|
|
|
@@ -87,38 +93,36 @@ module CompletionKit
|
|
|
87
93
|
generator = JudgeVariantGenerator.new(@metric, count: 1)
|
|
88
94
|
variants = generator.call
|
|
89
95
|
if variants.empty?
|
|
90
|
-
redirect_to
|
|
96
|
+
redirect_to target, alert: "The model returned no usable variants. Try again with a different model."
|
|
91
97
|
return
|
|
92
98
|
end
|
|
93
99
|
generator.persist!(variants)
|
|
94
|
-
redirect_to
|
|
100
|
+
redirect_to target, notice: "Drafted a new version. Review it below."
|
|
95
101
|
end
|
|
96
102
|
|
|
97
103
|
def dismiss_suggestion
|
|
98
|
-
draft = JudgeVersion.drafts.where(metric_id: @metric.id
|
|
104
|
+
draft = JudgeVersion.drafts.where(metric_id: @metric.id).find_by(id: params[:draft_id])
|
|
99
105
|
draft&.destroy
|
|
100
|
-
|
|
106
|
+
target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
|
|
107
|
+
redirect_to target, notice: "Dismissed."
|
|
101
108
|
end
|
|
102
109
|
|
|
103
110
|
def publish_draft
|
|
104
|
-
scope = JudgeVersion.
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
111
|
+
scope = JudgeVersion.where(metric_id: @metric.id)
|
|
112
|
+
version = if params[:draft_id].present?
|
|
113
|
+
scope.find_by(id: params[:draft_id])
|
|
114
|
+
else
|
|
115
|
+
JudgeVersion.drafts.where(metric_id: @metric.id).order(created_at: :desc).first
|
|
116
|
+
end
|
|
117
|
+
|
|
118
|
+
if version.nil?
|
|
119
|
+
redirect_to metric_path(@metric), alert: "No version to publish."
|
|
109
120
|
return
|
|
110
121
|
end
|
|
111
122
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
@metric.update_columns(
|
|
116
|
-
instruction: draft.instruction,
|
|
117
|
-
rubric_bands: Array(draft.rubric_bands).to_json
|
|
118
|
-
)
|
|
119
|
-
end
|
|
120
|
-
|
|
121
|
-
redirect_to metric_path(@metric), notice: "This judge version is now live."
|
|
123
|
+
version.publish!
|
|
124
|
+
redirect_to metric_path(@metric),
|
|
125
|
+
notice: "#{@metric.name} #{version.version_label} is now the published version."
|
|
122
126
|
end
|
|
123
127
|
|
|
124
128
|
def add_few_shot
|
|
@@ -139,6 +143,13 @@ module CompletionKit
|
|
|
139
143
|
redirect_to metric_path(@metric), notice: "Got it. The judge will remember this next time it grades."
|
|
140
144
|
end
|
|
141
145
|
|
|
146
|
+
def remove_few_shot
|
|
147
|
+
cal_id = params[:calibration_id].to_i
|
|
148
|
+
remaining = Array(@metric.few_shot_examples).reject { |fs| fs["calibration_id"].to_i == cal_id }
|
|
149
|
+
@metric.update!(few_shot_examples: remaining)
|
|
150
|
+
redirect_to metric_path(@metric), notice: "Forgotten. The judge won't see this case next time."
|
|
151
|
+
end
|
|
152
|
+
|
|
142
153
|
private
|
|
143
154
|
|
|
144
155
|
def set_metric
|
|
@@ -57,6 +57,7 @@ module CompletionKit
|
|
|
57
57
|
run.prompt&.template,
|
|
58
58
|
criteria: metric.instruction.to_s,
|
|
59
59
|
rubric_text: metric.display_rubric_text,
|
|
60
|
+
human_examples: few_shot_payload(metric),
|
|
60
61
|
input_data: response.input_data
|
|
61
62
|
)
|
|
62
63
|
|
|
@@ -119,5 +120,15 @@ module CompletionKit
|
|
|
119
120
|
response = Response.find_by(id: response_id)
|
|
120
121
|
RunCompletionCheckJob.perform_later(response.run_id) if response
|
|
121
122
|
end
|
|
123
|
+
|
|
124
|
+
def few_shot_payload(metric)
|
|
125
|
+
Array(metric.few_shot_examples).map do |fs|
|
|
126
|
+
{
|
|
127
|
+
human_score: fs["human_score"],
|
|
128
|
+
response_text: fs["response"].to_s,
|
|
129
|
+
human_note: fs["human_note"].to_s
|
|
130
|
+
}
|
|
131
|
+
end
|
|
132
|
+
end
|
|
122
133
|
end
|
|
123
134
|
end
|
|
@@ -7,8 +7,11 @@ module CompletionKit
|
|
|
7
7
|
|
|
8
8
|
serialize :rubric_bands, coder: JSON
|
|
9
9
|
|
|
10
|
+
before_validation :assign_version_number, on: :create
|
|
11
|
+
|
|
10
12
|
validates :metric_id, presence: true
|
|
11
13
|
validates :state, inclusion: { in: STATES }
|
|
14
|
+
validates :version_number, presence: true, uniqueness: { scope: :metric_id }
|
|
12
15
|
|
|
13
16
|
scope :current, -> { where(current: true) }
|
|
14
17
|
scope :published, -> { where(state: "published") }
|
|
@@ -20,7 +23,8 @@ module CompletionKit
|
|
|
20
23
|
instruction: metric.instruction,
|
|
21
24
|
rubric_bands: metric.rubric_bands,
|
|
22
25
|
current: true,
|
|
23
|
-
state: "published"
|
|
26
|
+
state: "published",
|
|
27
|
+
published_at: Time.current
|
|
24
28
|
)
|
|
25
29
|
end
|
|
26
30
|
|
|
@@ -32,17 +36,44 @@ module CompletionKit
|
|
|
32
36
|
state == "published"
|
|
33
37
|
end
|
|
34
38
|
|
|
39
|
+
def version_label
|
|
40
|
+
"v#{version_number}"
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
def publish!
|
|
44
|
+
JudgeVersion.transaction do
|
|
45
|
+
self.class.where(metric_id: metric_id).where.not(id: id).update_all(current: false)
|
|
46
|
+
reload
|
|
47
|
+
update!(state: "published", current: true, published_at: published_at || Time.current)
|
|
48
|
+
metric.update_columns(
|
|
49
|
+
instruction: instruction,
|
|
50
|
+
rubric_bands: Array(rubric_bands).to_json
|
|
51
|
+
)
|
|
52
|
+
end
|
|
53
|
+
self
|
|
54
|
+
end
|
|
55
|
+
|
|
35
56
|
def as_json(options = {})
|
|
36
57
|
{
|
|
37
58
|
id: id,
|
|
38
59
|
metric_id: metric_id,
|
|
60
|
+
version_number: version_number,
|
|
39
61
|
instruction: instruction,
|
|
40
62
|
rubric_bands: rubric_bands,
|
|
41
63
|
current: current,
|
|
42
64
|
state: state,
|
|
43
65
|
source: source,
|
|
66
|
+
published_at: published_at,
|
|
44
67
|
created_at: created_at
|
|
45
68
|
}
|
|
46
69
|
end
|
|
70
|
+
|
|
71
|
+
private
|
|
72
|
+
|
|
73
|
+
def assign_version_number
|
|
74
|
+
return if version_number.present?
|
|
75
|
+
max = self.class.where(metric_id: metric_id).maximum(:version_number).to_i
|
|
76
|
+
self.version_number = max + 1
|
|
77
|
+
end
|
|
47
78
|
end
|
|
48
79
|
end
|
|
@@ -86,7 +86,7 @@ module CompletionKit
|
|
|
86
86
|
sections << "REASONING: <one short sentence: what changes and why>"
|
|
87
87
|
sections << "INSTRUCTION:"
|
|
88
88
|
sections << "<the rewritten instruction>"
|
|
89
|
-
sections << "RUBRIC: # optional
|
|
89
|
+
sections << "RUBRIC: # optional. Omit this block if the rubric is unchanged."
|
|
90
90
|
sections << "5: <description for 5 stars>"
|
|
91
91
|
sections << "4: <description for 4 stars>"
|
|
92
92
|
sections << "3: <description for 3 stars>"
|
|
@@ -133,11 +133,13 @@ module CompletionKit
|
|
|
133
133
|
end
|
|
134
134
|
|
|
135
135
|
def calibrations_for(metric, verdict:, limit:)
|
|
136
|
-
Calibration.where(metric_id: metric.id, verdict: verdict)
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
136
|
+
scope = Calibration.where(metric_id: metric.id, verdict: verdict)
|
|
137
|
+
current_version = JudgeVersion.current.find_by(metric_id: metric.id)
|
|
138
|
+
scope = scope.where(judge_version_id: current_version.id) if current_version
|
|
139
|
+
scope.includes(response: :reviews)
|
|
140
|
+
.order(created_at: :desc)
|
|
141
|
+
.limit(limit)
|
|
142
|
+
.map do |cal|
|
|
141
143
|
review = cal.response.reviews.find { |r| r.metric_id == metric.id }
|
|
142
144
|
{
|
|
143
145
|
input: cal.response.input_data,
|
|
@@ -31,18 +31,30 @@ module CompletionKit
|
|
|
31
31
|
end
|
|
32
32
|
end
|
|
33
33
|
|
|
34
|
-
|
|
35
|
-
|
|
34
|
+
CURRENT = :current
|
|
35
|
+
|
|
36
|
+
def self.for(metric, judge_version: CURRENT)
|
|
37
|
+
resolved = case judge_version
|
|
38
|
+
when CURRENT then JudgeVersion.current.find_by(metric_id: metric.id)
|
|
39
|
+
when nil then nil
|
|
40
|
+
else judge_version
|
|
41
|
+
end
|
|
42
|
+
new(metric: metric, judge_version: resolved, all_versions: judge_version.nil?).call
|
|
36
43
|
end
|
|
37
44
|
|
|
38
|
-
def initialize(metric:, judge_version: nil)
|
|
45
|
+
def initialize(metric:, judge_version: nil, all_versions: false)
|
|
39
46
|
@metric = metric
|
|
40
47
|
@judge_version = judge_version
|
|
48
|
+
@all_versions = all_versions
|
|
41
49
|
end
|
|
42
50
|
|
|
43
51
|
def call
|
|
44
52
|
scope = Calibration.where(metric_id: @metric.id)
|
|
45
|
-
|
|
53
|
+
if @judge_version
|
|
54
|
+
scope = scope.where(judge_version_id: @judge_version.id)
|
|
55
|
+
elsif !@all_versions
|
|
56
|
+
scope = scope.none
|
|
57
|
+
end
|
|
46
58
|
|
|
47
59
|
verdicts = scope.pluck(:verdict, :corrected_score, :response_id)
|
|
48
60
|
n = verdicts.length
|
|
@@ -187,7 +187,7 @@
|
|
|
187
187
|
</div>
|
|
188
188
|
<%= render "completion_kit/api_reference/resource_list", title: "Your datasets",
|
|
189
189
|
items: datasets.map { |d|
|
|
190
|
-
{ name: d.name, meta: pluralize([d.csv_data.to_s.lines.count - 1, 0].max, "
|
|
190
|
+
{ name: d.name, meta: pluralize([d.csv_data.to_s.lines.count - 1, 0].max, "entry"),
|
|
191
191
|
url: "#{base_url}/api/v1/datasets/#{d.id}", dom_id: "dataset_ep_#{d.id}" }
|
|
192
192
|
} %>
|
|
193
193
|
</div>
|
|
@@ -3,14 +3,19 @@
|
|
|
3
3
|
<% pending_verdict = local_assigns[:pending_verdict] %>
|
|
4
4
|
<% active_verdict = pending_verdict || current_verdict %>
|
|
5
5
|
<% error = local_assigns[:error] %>
|
|
6
|
-
<%
|
|
6
|
+
<% me = CompletionKit.config.username.presence || "operator" %>
|
|
7
|
+
<% other_calibrations = CompletionKit::Calibration
|
|
8
|
+
.where(response_id: response_row.id, metric_id: metric.id)
|
|
9
|
+
.where.not(created_by: me)
|
|
10
|
+
.order(created_at: :asc).to_a %>
|
|
11
|
+
<% verdict_icons = { "agree" => "hand-thumb-up", "disagree" => "hand-thumb-down", "borderline" => "scale" } %>
|
|
7
12
|
<p class="ck-calibration__prompt">
|
|
8
13
|
<span class="ck-calibration__label">Your verdict</span>
|
|
9
|
-
<% if
|
|
10
|
-
<span class="ck-
|
|
11
|
-
|
|
12
|
-
<span class="ck-calibration__hint">Tell us what you think — was the score right? Verdicts roll up into the metric's <%= link_to "trust level", metric_path(metric), class: "ck-link" %>.</span>
|
|
14
|
+
<% if other_calibrations.any? %>
|
|
15
|
+
<span class="ck-calibration__meta"><%= pluralize(other_calibrations.size, "other verdict") %> on this score</span>
|
|
16
|
+
<span class="ck-calibration__sep">·</span>
|
|
13
17
|
<% end %>
|
|
18
|
+
<%= link_to metric_path(metric), class: "ck-calibration__meta-link" do %><%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true" %>Calibration →<% end %>
|
|
14
19
|
</p>
|
|
15
20
|
<div class="ck-calibration__buttons">
|
|
16
21
|
<% verdict_icons = { "agree" => "hand-thumb-up", "disagree" => "hand-thumb-down", "borderline" => "scale" } %>
|
|
@@ -36,6 +41,38 @@
|
|
|
36
41
|
<p class="ck-calibration__error" role="alert"><%= error %></p>
|
|
37
42
|
<% end %>
|
|
38
43
|
|
|
44
|
+
<% if other_calibrations.any? %>
|
|
45
|
+
<details class="ck-calibration__others">
|
|
46
|
+
<summary class="ck-calibration__others-summary">
|
|
47
|
+
<%= heroicon_tag "chevron-right", variant: :outline, size: 14, "aria-hidden": "true" %>
|
|
48
|
+
<span>What others said (<%= other_calibrations.size %>)</span>
|
|
49
|
+
</summary>
|
|
50
|
+
<ul class="ck-calibration__others-list">
|
|
51
|
+
<% other_calibrations.each do |other| %>
|
|
52
|
+
<li class="ck-calibration__others-item ck-calibration__others-item--<%= other.verdict %>">
|
|
53
|
+
<div class="ck-calibration__others-row">
|
|
54
|
+
<span class="ck-calibration__others-verdict">
|
|
55
|
+
<%= heroicon_tag verdict_icons[other.verdict], variant: :outline, size: 14, "aria-hidden": "true" %>
|
|
56
|
+
<%= other.verdict %>
|
|
57
|
+
</span>
|
|
58
|
+
<span class="ck-calibration__others-by"><%= other.created_by %></span>
|
|
59
|
+
<% if other.corrected_score %>
|
|
60
|
+
<span class="ck-calibration__others-stars" aria-label="<%= pluralize(other.corrected_score.to_i, 'star') %>" title="<%= pluralize(other.corrected_score.to_i, 'star') %>">
|
|
61
|
+
<% 5.times do |i| %>
|
|
62
|
+
<svg viewBox="0 0 24 24" width="12" height="12" stroke-width="1.75" class="ck-star <%= i < other.corrected_score.to_i ? "ck-star--filled" : "ck-star--empty" %>" aria-hidden="true"><polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2"/></svg>
|
|
63
|
+
<% end %>
|
|
64
|
+
</span>
|
|
65
|
+
<% end %>
|
|
66
|
+
</div>
|
|
67
|
+
<% if other.note.to_s.present? %>
|
|
68
|
+
<p class="ck-calibration__others-note">"<%= other.note.to_s.truncate(140) %>"</p>
|
|
69
|
+
<% end %>
|
|
70
|
+
</li>
|
|
71
|
+
<% end %>
|
|
72
|
+
</ul>
|
|
73
|
+
</details>
|
|
74
|
+
<% end %>
|
|
75
|
+
|
|
39
76
|
<% if active_verdict == "disagree" %>
|
|
40
77
|
<% existing_score = (calibration&.corrected_score || review&.ai_score)&.round %>
|
|
41
78
|
<%= form_with url: run_response_calibrations_path(run, response_row),
|
|
@@ -51,7 +88,7 @@
|
|
|
51
88
|
<% radio_id = "ck-star-#{response_row.id}-#{metric.id}-#{n}" %>
|
|
52
89
|
<input type="radio" name="corrected_score" id="<%= radio_id %>" value="<%= n %>" <%= "checked" if existing_score == n %> required>
|
|
53
90
|
<label for="<%= radio_id %>" title="<%= pluralize(n, 'star') %>" aria-label="<%= pluralize(n, 'star') %>">
|
|
54
|
-
<svg viewBox="0 0 24 24"
|
|
91
|
+
<svg viewBox="0 0 24 24" stroke-width="1.5" aria-hidden="true"><polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2"/></svg>
|
|
55
92
|
</label>
|
|
56
93
|
<% end %>
|
|
57
94
|
</div>
|
|
@@ -1,31 +1,30 @@
|
|
|
1
1
|
<% stats = local_assigns[:stats] %>
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
2
|
+
<% metric = local_assigns[:metric] %>
|
|
3
|
+
<% anchor = metric&.name&.parameterize %>
|
|
4
|
+
<% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric
|
|
5
|
+
created_by = CompletionKit.config.username.presence || "operator"
|
|
6
|
+
verdicted_ids = CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by).pluck(:response_id)
|
|
7
|
+
CompletionKit::Response.joins(:reviews)
|
|
8
|
+
.where(reviews: { metric_id: metric.id })
|
|
9
|
+
.where.not(reviews: { ai_score: nil })
|
|
10
|
+
.where.not(id: verdicted_ids)
|
|
11
|
+
.order(created_at: :desc).first
|
|
12
|
+
end %>
|
|
13
|
+
|
|
14
|
+
<p class="ck-trust-line ck-trust-line--<%= stats.gate %>">
|
|
15
|
+
<span class="ck-trust-line__label"><%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true" %>Calibration</span>
|
|
16
|
+
<% if stats.sample_size.zero? %>
|
|
17
|
+
<span class="ck-trust-line__state">Not measured yet.</span>
|
|
18
|
+
<span class="ck-trust-line__hint">Needs <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts on the judge's scores.<% if target_response %>
|
|
19
|
+
<%= link_to "Give a verdict →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-trust-line__link" %>
|
|
20
|
+
<% else %> Run this metric on a dataset, then give a verdict.<% end %></span>
|
|
21
|
+
<% elsif stats.counter_only? %>
|
|
22
|
+
<span class="ck-trust-line__counter"><%= stats.sample_size %>/<%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></span>
|
|
23
|
+
<span class="ck-trust-line__hint"><%= pluralize(stats.sample_size, "verdict") %><% if stats.short_to_target > 0 %> · <%= stats.short_to_target %> more before this can be measured<% end %><% if target_response %> · <%= link_to "Give another verdict →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-trust-line__link" %><% end %></span>
|
|
9
24
|
<% else %>
|
|
10
|
-
<
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
title="The range we're confident the true rate sits in, given how few verdicts we have so far.">±<%= (stats.margin * 100).round %> pt</span>
|
|
15
|
-
<span class="ck-trust-panel__gate"
|
|
16
|
-
title="<%= stats.firm? ? 'Enough verdicts for a reliable read.' : 'Early read. Keep giving verdicts to tighten the margin.' %>"><%= stats.firm? ? "settled" : "early read" %></span>
|
|
17
|
-
</div>
|
|
18
|
-
<div class="ck-trust-panel__details">
|
|
19
|
-
<span><%= pluralize(stats.sample_size, "verdict") %></span>
|
|
20
|
-
<% if stats.borderline_rate && stats.borderline_rate > 0 %>
|
|
21
|
-
<% level = if stats.borderline_rate > 0.30 then "danger"
|
|
22
|
-
elsif stats.borderline_rate > 0.15 then "warning"
|
|
23
|
-
else "ok" end %>
|
|
24
|
-
<span class="ck-trust-panel__borderline ck-trust-panel__borderline--<%= level %>"
|
|
25
|
-
title="<%= level == 'ok' ? 'Some reviewers said the rubric was unclear here.' : 'A lot of reviewers say the rubric is unclear here. Consider splitting the metric or rewriting the rubric.' %>">
|
|
26
|
-
<%= (stats.borderline_rate * 100).round %>% said "unclear"
|
|
27
|
-
</span>
|
|
28
|
-
<% end %>
|
|
29
|
-
</div>
|
|
25
|
+
<span class="ck-trust-line__score" title="Roughly how often the judge and the humans agreed.">~<%= (stats.agreement_point * 100).round %>%</span>
|
|
26
|
+
<span class="ck-trust-line__margin" title="The range we're confident the true rate sits in.">±<%= (stats.margin * 100).round %> pt</span>
|
|
27
|
+
<span class="ck-trust-line__gate" title="<%= stats.firm? ? 'Enough verdicts for a reliable read.' : 'Early read. Keep giving verdicts.' %>"><%= stats.firm? ? "settled" : "early" %></span>
|
|
28
|
+
<span class="ck-trust-line__hint"><%= pluralize(stats.sample_size, "verdict") %><% if stats.borderline_rate && stats.borderline_rate > 0 %><% level = stats.borderline_rate > 0.30 ? "danger" : stats.borderline_rate > 0.15 ? "warning" : "ok" %> · <span class="ck-trust-line__borderline ck-trust-line__borderline--<%= level %>" title="<%= level == 'ok' ? '' : 'Reviewers said the rubric was unclear here.' %>"><%= (stats.borderline_rate * 100).round %>% unclear</span><% end %></span>
|
|
30
29
|
<% end %>
|
|
31
|
-
</
|
|
30
|
+
</p>
|
|
@@ -1,3 +1,9 @@
|
|
|
1
|
+
<% suggestion = local_assigns[:suggestion_draft] %>
|
|
2
|
+
<% edit_draft = local_assigns[:edit_draft] %>
|
|
3
|
+
<% suggestion_bands = suggestion ? Array(suggestion.rubric_bands).each_with_object({}) { |b, h| h[b["stars"].to_i] = b["description"].to_s } : {} %>
|
|
4
|
+
<% suggested_instruction = suggestion&.instruction.to_s %>
|
|
5
|
+
<% instruction_changed = suggestion && suggested_instruction.present? && suggested_instruction != metric.instruction.to_s %>
|
|
6
|
+
|
|
1
7
|
<%= form_with(model: metric, local: true) do |form| %>
|
|
2
8
|
<% if metric.errors.any? %>
|
|
3
9
|
<div class="ck-flash ck-flash--alert" role="alert">
|
|
@@ -10,6 +16,50 @@
|
|
|
10
16
|
</div>
|
|
11
17
|
<% end %>
|
|
12
18
|
|
|
19
|
+
<% if edit_draft && !suggestion %>
|
|
20
|
+
<% pub = local_assigns[:published_judge_version] %>
|
|
21
|
+
<% draft_instr_changed = pub && pub.instruction.to_s != edit_draft.instruction.to_s %>
|
|
22
|
+
<% draft_rubric_changed = pub && pub.rubric_bands != edit_draft.rubric_bands %>
|
|
23
|
+
<div class="ck-suggestion-banner" role="status">
|
|
24
|
+
<div class="ck-suggestion-banner__body">
|
|
25
|
+
<p class="ck-kicker">Draft pending</p>
|
|
26
|
+
<p class="ck-meta-copy">An unpublished draft of this metric is saved. Publish to replace the live<%= " instruction" if draft_instr_changed %><%= " and" if draft_instr_changed && draft_rubric_changed %><%= " rubric" if draft_rubric_changed %> for future runs, or keep editing.</p>
|
|
27
|
+
</div>
|
|
28
|
+
<div class="ck-suggestion-banner__actions">
|
|
29
|
+
<%= button_to "Discard draft", dismiss_suggestion_metric_path(metric, draft_id: edit_draft.id, back_to: "edit"),
|
|
30
|
+
method: :delete, form_class: "inline-block",
|
|
31
|
+
class: ck_button_classes(:light, variant: :outline),
|
|
32
|
+
data: { turbo_confirm: "Drop this draft?" } %>
|
|
33
|
+
<%= button_to "Publish this version", publish_draft_metric_path(metric, draft_id: edit_draft.id),
|
|
34
|
+
method: :post, form_class: "inline-block",
|
|
35
|
+
class: ck_button_classes(:dark) %>
|
|
36
|
+
</div>
|
|
37
|
+
</div>
|
|
38
|
+
<% end %>
|
|
39
|
+
|
|
40
|
+
<% if suggestion %>
|
|
41
|
+
<div class="ck-suggestion-banner" role="status">
|
|
42
|
+
<div class="ck-suggestion-banner__body">
|
|
43
|
+
<p class="ck-kicker">Proposed improvements</p>
|
|
44
|
+
<p class="ck-meta-copy">Based on your disagreements, the model proposed these changes to the instruction and rubric. Apply pieces inline below, take everything at once, try again, or discard.</p>
|
|
45
|
+
</div>
|
|
46
|
+
<div class="ck-suggestion-banner__actions">
|
|
47
|
+
<%= button_to "Try again", suggest_variants_metric_path(metric, back_to: "edit"),
|
|
48
|
+
method: :post, form_class: "inline-block",
|
|
49
|
+
class: ck_button_classes(:light, variant: :outline),
|
|
50
|
+
title: "Discard these improvements and ask the model for fresh ones.",
|
|
51
|
+
data: { turbo_confirm: "Replace these improvements with fresh ones from the model?" } %>
|
|
52
|
+
<%= button_to "Discard", dismiss_suggestion_metric_path(metric, draft_id: suggestion.id, back_to: "edit"),
|
|
53
|
+
method: :delete, form_class: "inline-block",
|
|
54
|
+
class: ck_button_classes(:light, variant: :outline),
|
|
55
|
+
data: { turbo_confirm: "Drop these improvements?" } %>
|
|
56
|
+
<%= button_to "Take everything", publish_draft_metric_path(metric, draft_id: suggestion.id),
|
|
57
|
+
method: :post, form_class: "inline-block",
|
|
58
|
+
class: ck_button_classes(:dark) %>
|
|
59
|
+
</div>
|
|
60
|
+
</div>
|
|
61
|
+
<% end %>
|
|
62
|
+
|
|
13
63
|
<div class="ck-card ck-form-card">
|
|
14
64
|
<div class="ck-field">
|
|
15
65
|
<%= form.label :name, "Metric name", class: "ck-label" %>
|
|
@@ -22,14 +72,34 @@
|
|
|
22
72
|
<p class="ck-hint">What should the judge assess? This instruction is sent to the LLM judge when scoring outputs.</p>
|
|
23
73
|
<%= form.text_area :instruction, rows: 8, class: "ck-input ck-input--area", placeholder: "Evaluate whether the output...", **ck_field_aria(form, :instruction) %>
|
|
24
74
|
<%= ck_field_error(form, :instruction) %>
|
|
75
|
+
|
|
76
|
+
<% if instruction_changed %>
|
|
77
|
+
<div class="ck-inline-suggestion">
|
|
78
|
+
<div class="ck-inline-suggestion__head">
|
|
79
|
+
<p class="ck-kicker">Suggested wording</p>
|
|
80
|
+
<button type="button"
|
|
81
|
+
class="<%= ck_button_classes(:light, variant: :outline) %> ck-inline-suggestion__apply"
|
|
82
|
+
data-ck-apply
|
|
83
|
+
data-target="metric[instruction]"
|
|
84
|
+
data-value="<%= h(suggested_instruction) %>">Use this wording</button>
|
|
85
|
+
</div>
|
|
86
|
+
<div class="ck-inline-suggestion__diff">
|
|
87
|
+
<pre class="ck-inline-suggestion__pane ck-inline-suggestion__pane--before"><%= ck_word_diff_old(metric.instruction.to_s, suggested_instruction) %></pre>
|
|
88
|
+
<pre class="ck-inline-suggestion__pane ck-inline-suggestion__pane--after"><%= ck_word_diff_new(metric.instruction.to_s, suggested_instruction) %></pre>
|
|
89
|
+
</div>
|
|
90
|
+
</div>
|
|
91
|
+
<% end %>
|
|
25
92
|
</div>
|
|
26
93
|
|
|
27
94
|
<div class="ck-field ck-field--spacious">
|
|
28
|
-
<p class="ck-section-title">Rubric
|
|
95
|
+
<p class="ck-section-title">Rubric<%= render "completion_kit/metrics/rubric_hint" %></p>
|
|
29
96
|
<p class="ck-hint">What each star rating means for this metric.</p>
|
|
30
97
|
|
|
31
98
|
<div class="ck-rubric-builder">
|
|
32
99
|
<% metric.rubric_bands_for_form.each_with_index do |band, index| %>
|
|
100
|
+
<% suggested_band = suggestion_bands[band["stars"].to_i].to_s %>
|
|
101
|
+
<% band_changed = suggestion && suggested_band.present? && suggested_band != band["description"].to_s %>
|
|
102
|
+
<% target_name = "metric[rubric_bands][#{index}][description]" %>
|
|
33
103
|
<div class="ck-rubric-row">
|
|
34
104
|
<div class="ck-rubric-row__stars">
|
|
35
105
|
<% 5.times do |i| %>
|
|
@@ -38,7 +108,23 @@
|
|
|
38
108
|
<input type="hidden" name="metric[rubric_bands][<%= index %>][stars]" value="<%= band["stars"] %>">
|
|
39
109
|
</div>
|
|
40
110
|
<div class="ck-rubric-row__fields">
|
|
41
|
-
<textarea name="
|
|
111
|
+
<textarea name="<%= target_name %>" rows="2" class="ck-input ck-input--area"><%= band["description"] %></textarea>
|
|
112
|
+
<% if band_changed %>
|
|
113
|
+
<div class="ck-inline-suggestion ck-inline-suggestion--band">
|
|
114
|
+
<div class="ck-inline-suggestion__head">
|
|
115
|
+
<p class="ck-kicker">Suggested band</p>
|
|
116
|
+
<button type="button"
|
|
117
|
+
class="<%= ck_button_classes(:light, variant: :outline) %> ck-inline-suggestion__apply"
|
|
118
|
+
data-ck-apply
|
|
119
|
+
data-target="<%= target_name %>"
|
|
120
|
+
data-value="<%= h(suggested_band) %>">Use this band</button>
|
|
121
|
+
</div>
|
|
122
|
+
<div class="ck-inline-suggestion__diff">
|
|
123
|
+
<pre class="ck-inline-suggestion__pane ck-inline-suggestion__pane--before"><%= ck_word_diff_old(band["description"].to_s, suggested_band) %></pre>
|
|
124
|
+
<pre class="ck-inline-suggestion__pane ck-inline-suggestion__pane--after"><%= ck_word_diff_new(band["description"].to_s, suggested_band) %></pre>
|
|
125
|
+
</div>
|
|
126
|
+
</div>
|
|
127
|
+
<% end %>
|
|
42
128
|
</div>
|
|
43
129
|
</div>
|
|
44
130
|
<% end %>
|
|
@@ -57,11 +143,11 @@
|
|
|
57
143
|
<% confirm = parts.empty? ? "Delete \"#{metric.name}\"? It's not in use." : "Delete \"#{metric.name}\"? It's #{parts.to_sentence}." %>
|
|
58
144
|
<%= button_to metric_path(metric), method: :delete,
|
|
59
145
|
form_class: "inline-block",
|
|
60
|
-
class: "ck-icon-btn",
|
|
146
|
+
class: "ck-icon-btn ck-icon-btn--form",
|
|
61
147
|
title: "Delete metric",
|
|
62
148
|
"aria-label": "Delete metric",
|
|
63
149
|
data: { turbo_confirm: confirm } do %>
|
|
64
|
-
<%= heroicon_tag "trash", variant: :outline, size:
|
|
150
|
+
<%= heroicon_tag "trash", variant: :outline, size: 24, "aria-hidden": "true" %>
|
|
65
151
|
<% end %>
|
|
66
152
|
<% end %>
|
|
67
153
|
<%= link_to "Cancel", metrics_path, class: ck_button_classes(:light, variant: :outline), tabindex: "0" %>
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
<% current_bands = local_assigns[:current_bands] || [] %>
|
|
2
|
+
<% draft_bands = local_assigns[:draft_bands] || [] %>
|
|
3
|
+
<% lookup = ->(bands, stars) { bands.find { |b| b["stars"].to_i == stars }&.dig("description").to_s } %>
|
|
4
|
+
<div class="ck-rubric-diff">
|
|
5
|
+
<% 5.downto(1) do |stars| %>
|
|
6
|
+
<% old_band = lookup.call(current_bands, stars) %>
|
|
7
|
+
<% new_band = lookup.call(draft_bands, stars) %>
|
|
8
|
+
<% changed = old_band != new_band %>
|
|
9
|
+
<div class="ck-rubric-diff__row ck-rubric-diff__row--<%= changed ? "changed" : "unchanged" %>">
|
|
10
|
+
<div class="ck-rubric-diff__stars">
|
|
11
|
+
<% 5.times do |i| %>
|
|
12
|
+
<svg viewBox="0 0 24 24" width="14" height="14" stroke-width="1.75" class="ck-star <%= i < stars ? "ck-star--filled" : "ck-star--empty" %>"><polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2"/></svg>
|
|
13
|
+
<% end %>
|
|
14
|
+
</div>
|
|
15
|
+
<% if changed %>
|
|
16
|
+
<div class="ck-rubric-diff__panes">
|
|
17
|
+
<pre class="ck-rubric-diff__pane ck-rubric-diff__pane--before"><%= ck_word_diff_old(old_band, new_band) %></pre>
|
|
18
|
+
<pre class="ck-rubric-diff__pane ck-rubric-diff__pane--after"><%= ck_word_diff_new(old_band, new_band) %></pre>
|
|
19
|
+
</div>
|
|
20
|
+
<% else %>
|
|
21
|
+
<p class="ck-rubric-diff__unchanged"><%= old_band.presence || "—" %></p>
|
|
22
|
+
<% end %>
|
|
23
|
+
</div>
|
|
24
|
+
<% end %>
|
|
25
|
+
</div>
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
<span class="ck-info-hint" tabindex="0" role="button" aria-label="What is a rubric?">
|
|
2
|
+
<%= heroicon_tag "information-circle", variant: :outline, "aria-hidden": "true" %>
|
|
3
|
+
<span class="ck-info-popup">How the judge picks 1 to 5. Each row says what an output has to look like to earn that many stars. The judge reads these descriptions when it scores, so clearer rows give you more consistent scoring.</span>
|
|
4
|
+
</span>
|
|
@@ -1,11 +1,15 @@
|
|
|
1
|
-
|
|
2
|
-
<
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
1
|
+
<%= link_to starter_preview_metrics_path(key: starter.key), class: "ck-starter-card" do %>
|
|
2
|
+
<div class="ck-starter-card__body">
|
|
3
|
+
<p class="ck-starter-card__name"><strong><%= starter.name %></strong></p>
|
|
4
|
+
<p class="ck-starter-card__desc"><%= starter.description %></p>
|
|
5
|
+
</div>
|
|
6
|
+
<div class="ck-starter-card__foot">
|
|
7
|
+
<span class="ck-starter-card__cta">Preview →</span>
|
|
8
|
+
<%= button_to "dismiss", dismiss_starter_metrics_path(key: starter.key),
|
|
9
|
+
method: :post,
|
|
10
|
+
form: { onclick: "event.stopPropagation();" },
|
|
11
|
+
form_class: "inline-block ck-starter-card__dismiss-form",
|
|
12
|
+
class: "ck-starter-card__dismiss",
|
|
9
13
|
data: { turbo_confirm: "Hide \"#{starter.name}\" from this list?" } %>
|
|
10
14
|
</div>
|
|
11
|
-
|
|
15
|
+
<% end %>
|