completion-kit 0.8.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/stylesheets/completion_kit/application.css +135 -322
- data/app/controllers/completion_kit/api/v1/metrics_controller.rb +1 -28
- data/app/controllers/completion_kit/metrics_controller.rb +14 -37
- data/app/controllers/completion_kit/runs_controller.rb +2 -2
- data/app/jobs/completion_kit/judge_review_job.rb +0 -12
- data/app/models/completion_kit/metric.rb +0 -1
- data/app/models/completion_kit/metric_version.rb +35 -0
- data/app/services/completion_kit/judge_service.rb +3 -10
- data/app/services/completion_kit/mcp_tools/metric_versions.rb +1 -1
- data/app/services/completion_kit/metric_variant_generator.rb +0 -13
- data/app/views/completion_kit/api_reference/_body.html.erb +2 -12
- data/app/views/completion_kit/calibrations/_trust_panel.html.erb +19 -12
- data/app/views/completion_kit/metrics/_form.html.erb +11 -12
- data/app/views/completion_kit/metrics/edit.html.erb +18 -0
- data/app/views/completion_kit/metrics/index.html.erb +0 -17
- data/app/views/completion_kit/metrics/show.html.erb +87 -105
- data/app/views/completion_kit/responses/show.html.erb +2 -2
- data/app/views/completion_kit/runs/show.html.erb +7 -7
- data/config/routes.rb +0 -4
- data/db/migrate/20260529000001_remove_few_shot_examples_from_completion_kit_metrics.rb +5 -0
- data/lib/completion_kit/version.rb +1 -1
- metadata +2 -1
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
2
|
class MetricsController < ApplicationController
|
|
3
3
|
include CompletionKit::TagFiltering
|
|
4
|
-
before_action :set_metric, only: [:show, :edit, :update, :destroy, :
|
|
4
|
+
before_action :set_metric, only: [:show, :edit, :update, :destroy, :publish_draft, :suggest_variants, :dismiss_suggestion]
|
|
5
5
|
|
|
6
6
|
def index
|
|
7
7
|
@metrics = apply_tag_filter(Metric.includes(:metric_groups, :tags).order(:name))
|
|
@@ -35,11 +35,6 @@ module CompletionKit
|
|
|
35
35
|
end
|
|
36
36
|
|
|
37
37
|
def show
|
|
38
|
-
@published_metric_version = MetricVersion.ensure_current_for(@metric)
|
|
39
|
-
@disagreements = Calibration.where(metric_id: @metric.id, verdict: "disagree")
|
|
40
|
-
.includes(:metric_version, response: [:reviews, :run])
|
|
41
|
-
.order(created_at: :desc)
|
|
42
|
-
.limit(50)
|
|
43
38
|
@edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
|
|
44
39
|
@suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
|
|
45
40
|
@improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
|
|
@@ -54,6 +49,7 @@ module CompletionKit
|
|
|
54
49
|
@suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
|
|
55
50
|
@edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
|
|
56
51
|
@published_metric_version = MetricVersion.published.where(metric_id: @metric.id, current: true).first
|
|
52
|
+
@improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
|
|
57
53
|
|
|
58
54
|
if @edit_draft
|
|
59
55
|
@metric.instruction = @edit_draft.instruction
|
|
@@ -102,7 +98,7 @@ module CompletionKit
|
|
|
102
98
|
state: "draft", source: "edit", current: false
|
|
103
99
|
)
|
|
104
100
|
redirect_to edit_metric_path(@metric),
|
|
105
|
-
notice: "Saved as draft #{draft.version_label}. Publish to
|
|
101
|
+
notice: "Saved as draft #{draft.version_label}. Publish to make these changes the metric's live version."
|
|
106
102
|
else
|
|
107
103
|
@metric.update!(instruction: new_instruction, rubric_bands: new_rubric)
|
|
108
104
|
current_pub = MetricVersion.published.where(metric_id: @metric.id, current: true).first
|
|
@@ -120,7 +116,7 @@ module CompletionKit
|
|
|
120
116
|
target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
|
|
121
117
|
disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
|
|
122
118
|
if disagreement_count.zero?
|
|
123
|
-
redirect_to target, alert: "Mark at least one
|
|
119
|
+
redirect_to target, alert: "Mark at least one case as Disagree before asking the model to suggest a change."
|
|
124
120
|
return
|
|
125
121
|
end
|
|
126
122
|
|
|
@@ -132,15 +128,21 @@ module CompletionKit
|
|
|
132
128
|
redirect_to target, alert: "The model returned no usable variants. Try again with a different model."
|
|
133
129
|
return
|
|
134
130
|
end
|
|
135
|
-
generator.persist!(variants)
|
|
136
|
-
|
|
131
|
+
versions = generator.persist!(variants)
|
|
132
|
+
new_version = versions.max_by(&:version_number)
|
|
133
|
+
if params[:back_to] == "edit"
|
|
134
|
+
redirect_to edit_metric_path(@metric), notice: "Drafted #{new_version.version_label} from your reviews. Review the proposed changes below, then Publish to use it."
|
|
135
|
+
else
|
|
136
|
+
redirect_to metric_path(@metric, show_change: new_version.id), notice: "Drafted #{new_version.version_label} from your reviews."
|
|
137
|
+
end
|
|
137
138
|
end
|
|
138
139
|
|
|
139
140
|
def dismiss_suggestion
|
|
140
141
|
draft = MetricVersion.drafts.where(metric_id: @metric.id).find_by(id: params[:draft_id])
|
|
142
|
+
label = draft&.version_label
|
|
141
143
|
draft&.destroy
|
|
142
144
|
target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
|
|
143
|
-
redirect_to target, notice: "
|
|
145
|
+
redirect_to target, notice: label ? "Discarded draft #{label}." : "Draft already gone."
|
|
144
146
|
end
|
|
145
147
|
|
|
146
148
|
def publish_draft
|
|
@@ -164,7 +166,7 @@ module CompletionKit
|
|
|
164
166
|
audit = version.revert!
|
|
165
167
|
prior_label = previously_current.version_label
|
|
166
168
|
redirect_to metric_path(@metric),
|
|
167
|
-
notice: "Reverted
|
|
169
|
+
notice: "Reverted #{@metric.name} to #{version.version_label} (logged as #{audit.version_label}). Human reviews collected against #{prior_label} stay tied to it."
|
|
168
170
|
else
|
|
169
171
|
version.publish!
|
|
170
172
|
redirect_to metric_path(@metric),
|
|
@@ -172,31 +174,6 @@ module CompletionKit
|
|
|
172
174
|
end
|
|
173
175
|
end
|
|
174
176
|
|
|
175
|
-
def add_few_shot
|
|
176
|
-
calibration = Calibration.where(metric_id: @metric.id, verdict: "disagree").find(params[:calibration_id])
|
|
177
|
-
review = calibration.response.reviews.find_by(metric_id: @metric.id)
|
|
178
|
-
examples = Array(@metric.few_shot_examples)
|
|
179
|
-
examples << {
|
|
180
|
-
"input" => calibration.response.input_data.to_s.truncate(2000),
|
|
181
|
-
"response" => calibration.response.response_text.to_s.truncate(2000),
|
|
182
|
-
"judge_score" => review&.ai_score&.to_f,
|
|
183
|
-
"judge_feedback" => review&.ai_feedback.to_s.truncate(1000),
|
|
184
|
-
"human_score" => calibration.corrected_score&.to_f,
|
|
185
|
-
"human_note" => calibration.note.to_s.truncate(1000),
|
|
186
|
-
"calibration_id" => calibration.id,
|
|
187
|
-
"added_at" => Time.current.utc.iso8601
|
|
188
|
-
}
|
|
189
|
-
@metric.update!(few_shot_examples: examples)
|
|
190
|
-
redirect_to metric_path(@metric), notice: "Got it. The judge will remember this next time it grades."
|
|
191
|
-
end
|
|
192
|
-
|
|
193
|
-
def remove_few_shot
|
|
194
|
-
cal_id = params[:calibration_id].to_i
|
|
195
|
-
remaining = Array(@metric.few_shot_examples).reject { |fs| fs["calibration_id"].to_i == cal_id }
|
|
196
|
-
@metric.update!(few_shot_examples: remaining)
|
|
197
|
-
redirect_to metric_path(@metric), notice: "Forgotten. The judge won't see this case next time."
|
|
198
|
-
end
|
|
199
|
-
|
|
200
177
|
private
|
|
201
178
|
|
|
202
179
|
def set_metric
|
|
@@ -95,7 +95,7 @@ module CompletionKit
|
|
|
95
95
|
|
|
96
96
|
def regrade
|
|
97
97
|
if @run.regrade!
|
|
98
|
-
redirect_to run_path(@run), notice: "Re-grading existing responses
|
|
98
|
+
redirect_to run_path(@run), notice: "Re-grading existing responses against the current metrics."
|
|
99
99
|
else
|
|
100
100
|
redirect_to run_path(@run), alert: "Nothing to re-grade. The run has no succeeded responses or no metrics attached."
|
|
101
101
|
end
|
|
@@ -151,7 +151,7 @@ module CompletionKit
|
|
|
151
151
|
def retry_failures
|
|
152
152
|
if @run.stale_review_summary.any?
|
|
153
153
|
redirect_to run_path(@run),
|
|
154
|
-
alert: "
|
|
154
|
+
alert: "A metric has a newer version than the one this run was scored against. Retrying failed cases would mix scores from two versions in the same run. Use 'Re-run from scratch' to refresh everything against the current metrics."
|
|
155
155
|
return
|
|
156
156
|
end
|
|
157
157
|
|
|
@@ -58,7 +58,6 @@ module CompletionKit
|
|
|
58
58
|
run.prompt&.template,
|
|
59
59
|
criteria: metric.instruction.to_s,
|
|
60
60
|
rubric_text: metric.display_rubric_text,
|
|
61
|
-
human_examples: few_shot_payload(metric),
|
|
62
61
|
input_data: response.input_data
|
|
63
62
|
)
|
|
64
63
|
|
|
@@ -116,16 +115,5 @@ module CompletionKit
|
|
|
116
115
|
response = Response.find_by(id: @response_id)
|
|
117
116
|
RunCompletionCheckJob.perform_later(response.run_id) if response
|
|
118
117
|
end
|
|
119
|
-
|
|
120
|
-
def few_shot_payload(metric)
|
|
121
|
-
return nil unless CompletionKit.config.judge_calibration_enabled
|
|
122
|
-
Array(metric.few_shot_examples).map do |fs|
|
|
123
|
-
{
|
|
124
|
-
human_score: fs["human_score"],
|
|
125
|
-
response_text: fs["response"].to_s,
|
|
126
|
-
human_note: fs["human_note"].to_s
|
|
127
|
-
}
|
|
128
|
-
end
|
|
129
|
-
end
|
|
130
118
|
end
|
|
131
119
|
end
|
|
@@ -17,7 +17,6 @@ module CompletionKit
|
|
|
17
17
|
has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
|
|
18
18
|
|
|
19
19
|
serialize :rubric_bands, coder: JSON
|
|
20
|
-
serialize :few_shot_examples, coder: JSON, type: Array
|
|
21
20
|
|
|
22
21
|
validates :name, presence: true
|
|
23
22
|
validates :key, tenant_scoped_uniqueness: { allow_nil: true }
|
|
@@ -40,6 +40,35 @@ module CompletionKit
|
|
|
40
40
|
"v#{version_number}"
|
|
41
41
|
end
|
|
42
42
|
|
|
43
|
+
def change_summary_against(previous)
|
|
44
|
+
return nil if previous.nil?
|
|
45
|
+
|
|
46
|
+
instruction_changed = previous.instruction.to_s.strip != instruction.to_s.strip
|
|
47
|
+
rubric_changes = rubric_band_change_count(previous)
|
|
48
|
+
return nil unless instruction_changed || rubric_changes.positive?
|
|
49
|
+
|
|
50
|
+
dimensions = []
|
|
51
|
+
dimensions << "instruction" if instruction_changed
|
|
52
|
+
dimensions << "rubric" if rubric_changes.positive?
|
|
53
|
+
|
|
54
|
+
words_changed = 0
|
|
55
|
+
if instruction_changed
|
|
56
|
+
old_words = previous.instruction.to_s.split
|
|
57
|
+
new_words = instruction.to_s.split
|
|
58
|
+
words_changed = (old_words - new_words).size + (new_words - old_words).size
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
magnitude = if rubric_changes >= 2 || (instruction_changed && rubric_changes >= 1) || words_changed >= 15
|
|
62
|
+
:major
|
|
63
|
+
elsif rubric_changes == 1 || words_changed >= 4
|
|
64
|
+
:minor
|
|
65
|
+
else
|
|
66
|
+
:trivial
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
{ magnitude: magnitude, label: "#{magnitude.to_s.capitalize} #{dimensions.to_sentence} changes" }
|
|
70
|
+
end
|
|
71
|
+
|
|
43
72
|
def publish!
|
|
44
73
|
MetricVersion.transaction do
|
|
45
74
|
self.class.where(metric_id: metric_id).where.not(id: id).update_all(current: false)
|
|
@@ -86,6 +115,12 @@ module CompletionKit
|
|
|
86
115
|
|
|
87
116
|
private
|
|
88
117
|
|
|
118
|
+
def rubric_band_change_count(previous)
|
|
119
|
+
prev = Metric.normalize_rubric_bands(previous.rubric_bands)
|
|
120
|
+
curr = Metric.normalize_rubric_bands(rubric_bands)
|
|
121
|
+
prev.zip(curr).count { |p, c| p["description"].to_s.strip != c["description"].to_s.strip }
|
|
122
|
+
end
|
|
123
|
+
|
|
89
124
|
def assign_version_number
|
|
90
125
|
return if version_number.present?
|
|
91
126
|
max = self.class.where(metric_id: metric_id).maximum(:version_number).to_i
|
|
@@ -10,12 +10,12 @@ module CompletionKit
|
|
|
10
10
|
@judge_client = LlmClient.for_model(@judge_model, ApiConfig.for_model(@judge_model))
|
|
11
11
|
end
|
|
12
12
|
|
|
13
|
-
def evaluate(output, expected_output = nil, prompt = nil, criteria: nil, rubric_text: nil,
|
|
13
|
+
def evaluate(output, expected_output = nil, prompt = nil, criteria: nil, rubric_text: nil, input_data: nil, **_extras)
|
|
14
14
|
raise CompletionKit::ConfigurationError, "Judge not configured" unless @judge_client.configured?
|
|
15
15
|
|
|
16
16
|
judge_prompt = build_judge_prompt(output, expected_output, prompt,
|
|
17
17
|
criteria: criteria,
|
|
18
|
-
rubric_text: rubric_text,
|
|
18
|
+
rubric_text: rubric_text,
|
|
19
19
|
input_data: input_data)
|
|
20
20
|
|
|
21
21
|
response = @judge_client.generate_completion(judge_prompt, model: @judge_model)
|
|
@@ -25,7 +25,7 @@ module CompletionKit
|
|
|
25
25
|
|
|
26
26
|
private
|
|
27
27
|
|
|
28
|
-
def build_judge_prompt(output, expected_output, prompt, criteria: nil, rubric_text: nil,
|
|
28
|
+
def build_judge_prompt(output, expected_output, prompt, criteria: nil, rubric_text: nil, input_data: nil)
|
|
29
29
|
judge_prompt = <<~PROMPT
|
|
30
30
|
You are an expert evaluator. You MUST respond with ONLY two lines in this exact format, nothing else:
|
|
31
31
|
|
|
@@ -42,13 +42,6 @@ module CompletionKit
|
|
|
42
42
|
judge_prompt += "\nCriteria: #{criteria}\n"
|
|
43
43
|
end
|
|
44
44
|
|
|
45
|
-
if human_examples.present?
|
|
46
|
-
judge_prompt += "\nCalibration examples:\n"
|
|
47
|
-
human_examples.each_with_index do |example, index|
|
|
48
|
-
judge_prompt += "Example #{index + 1}: score=#{example[:human_score]} output=#{example[:response_text].to_s.truncate(200)}\n"
|
|
49
|
-
end
|
|
50
|
-
end
|
|
51
|
-
|
|
52
45
|
judge_prompt += <<~PROMPT
|
|
53
46
|
|
|
54
47
|
Original prompt: #{prompt || "Not provided"}
|
|
@@ -16,7 +16,7 @@ module CompletionKit
|
|
|
16
16
|
handler: :list
|
|
17
17
|
},
|
|
18
18
|
"metric_versions_publish" => {
|
|
19
|
-
description: "Publish a MetricVersion as the live
|
|
19
|
+
description: "Publish a MetricVersion as the live version of its metric. Works for both 'draft → published' and 'revert to an older published version → current'. Transactionally flips current, demotes peers, and writes the version's instruction + rubric_bands back onto the metric so the judge grades against it.",
|
|
20
20
|
inputSchema: {
|
|
21
21
|
type: "object",
|
|
22
22
|
properties: {
|
|
@@ -43,7 +43,6 @@ module CompletionKit
|
|
|
43
43
|
def build_meta_prompt
|
|
44
44
|
disagreements = MetricCalibrationExamples.disagreements_for(@metric)
|
|
45
45
|
borderlines = MetricCalibrationExamples.borderlines_for(@metric)
|
|
46
|
-
pinned_examples = Array(@metric.few_shot_examples)
|
|
47
46
|
sections = []
|
|
48
47
|
sections << "You are an expert evaluator. The judge below is misaligned with humans. Propose #{@count == 1 ? "a single" : "#{@count}"} concrete rewrite that closes the gap."
|
|
49
48
|
sections << ""
|
|
@@ -78,18 +77,6 @@ module CompletionKit
|
|
|
78
77
|
sections << ""
|
|
79
78
|
end
|
|
80
79
|
end
|
|
81
|
-
if pinned_examples.any?
|
|
82
|
-
sections << "## Pinned cases the judge already references"
|
|
83
|
-
sections << "These are cases the operator pinned for the judge to remember. The improved rubric must remain consistent with these — that is, the new instruction + rubric should produce roughly the human_score on these inputs, not the judge_score."
|
|
84
|
-
pinned_examples.each_with_index do |ex, i|
|
|
85
|
-
sections << "### Pinned #{i + 1}"
|
|
86
|
-
sections << "Input: #{ex["input"].to_s.truncate(200)}"
|
|
87
|
-
sections << "Output: #{ex["response"].to_s.truncate(200)}"
|
|
88
|
-
sections << "Judge previously said #{ex["judge_score"]}/5: #{ex["judge_feedback"].to_s.truncate(160)}"
|
|
89
|
-
sections << "Human said #{ex["human_score"]}/5: #{ex["human_note"].to_s.truncate(160)}"
|
|
90
|
-
sections << ""
|
|
91
|
-
end
|
|
92
|
-
end
|
|
93
80
|
sections << "## Task"
|
|
94
81
|
sections << "Make one substantive change. Don't just reword. If the disagreements look like instruction problems, rewrite the instruction. If they look like rubric problems (overlapping bands, undefined edge cases), rewrite the rubric. Rewrite both if both are wrong."
|
|
95
82
|
sections << ""
|
|
@@ -27,7 +27,7 @@
|
|
|
27
27
|
<label for="ck-tab-runs" class="ck-api-tabs__label">Runs <span class="ck-api-tabs__count">10</span></label>
|
|
28
28
|
<label for="ck-tab-responses" class="ck-api-tabs__label">Responses <span class="ck-api-tabs__count">2</span></label>
|
|
29
29
|
<label for="ck-tab-datasets" class="ck-api-tabs__label">Datasets <span class="ck-api-tabs__count">5</span></label>
|
|
30
|
-
<label for="ck-tab-metrics" class="ck-api-tabs__label">Metrics <span class="ck-api-tabs__count">
|
|
30
|
+
<label for="ck-tab-metrics" class="ck-api-tabs__label">Metrics <span class="ck-api-tabs__count">10</span></label>
|
|
31
31
|
<label for="ck-tab-metric-groups" class="ck-api-tabs__label">Metric Groups <span class="ck-api-tabs__count">5</span></label>
|
|
32
32
|
<label for="ck-tab-calibrations" class="ck-api-tabs__label">Calibrations <span class="ck-api-tabs__count">3</span></label>
|
|
33
33
|
<label for="ck-tab-tags" class="ck-api-tabs__label">Tags <span class="ck-api-tabs__count">5</span></label>
|
|
@@ -239,7 +239,7 @@
|
|
|
239
239
|
|
|
240
240
|
<div class="ck-api-endpoint" style="padding-top: 1.5rem;">
|
|
241
241
|
<p class="ck-kicker" style="margin-bottom: 0.5rem;">Calibration loop</p>
|
|
242
|
-
<p class="ck-meta-copy">Drive metric improvement from disagree-flagged calibrations: ask the model
|
|
242
|
+
<p class="ck-meta-copy">Drive metric improvement from disagree-flagged calibrations: ask the model to rewrite the instruction and rubric into a new draft version.</p>
|
|
243
243
|
</div>
|
|
244
244
|
<div class="ck-api-endpoint">
|
|
245
245
|
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/metrics/:id/suggest_variants</p>
|
|
@@ -247,16 +247,6 @@
|
|
|
247
247
|
<p class="ck-api-params"><strong>Optional:</strong> <code>count</code>, <code>model</code></p>
|
|
248
248
|
<%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/metrics/1/suggest_variants \\\n -H \"Authorization: Bearer #{token}\"" %>
|
|
249
249
|
</div>
|
|
250
|
-
<div class="ck-api-endpoint">
|
|
251
|
-
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/metrics/:id/add_few_shot</p>
|
|
252
|
-
<p class="ck-meta-copy">Pin a disagree calibration as a few-shot example on the metric. Returns the updated metric, or 404 if the calibration is not a disagree on this metric.</p>
|
|
253
|
-
<p class="ck-api-params"><strong>Required:</strong> <code>calibration_id</code></p>
|
|
254
|
-
</div>
|
|
255
|
-
<div class="ck-api-endpoint">
|
|
256
|
-
<p class="ck-api-method"><span class="ck-chip" style="color: var(--ck-danger);">DELETE</span> /api/v1/metrics/:id/remove_few_shot</p>
|
|
257
|
-
<p class="ck-meta-copy">Drop the pinned few-shot example by calibration ID.</p>
|
|
258
|
-
<p class="ck-api-params"><strong>Required:</strong> <code>calibration_id</code></p>
|
|
259
|
-
</div>
|
|
260
250
|
|
|
261
251
|
<div class="ck-api-endpoint" style="padding-top: 1.5rem;">
|
|
262
252
|
<p class="ck-kicker" style="margin-bottom: 0.5rem;">Metric versions</p>
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
<% stats = local_assigns[:stats] %>
|
|
2
2
|
<% metric = local_assigns[:metric] %>
|
|
3
3
|
<% anchor = metric&.name&.parameterize %>
|
|
4
|
+
<% current_metric_version = metric && CompletionKit::MetricVersion.current.find_by(metric_id: metric.id) %>
|
|
4
5
|
<% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric
|
|
5
6
|
created_by = CompletionKit.config.username.presence || "operator"
|
|
6
|
-
current_metric_version = CompletionKit::MetricVersion.current.find_by(metric_id: metric.id)
|
|
7
7
|
verdicted_ids = if current_metric_version
|
|
8
8
|
CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
|
|
9
9
|
else
|
|
@@ -22,19 +22,26 @@
|
|
|
22
22
|
end %>
|
|
23
23
|
|
|
24
24
|
<p class="ck-trust-line ck-trust-line--<%= stats.gate %>">
|
|
25
|
-
<span class="ck-trust-line__label"><%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true" %>Calibration</span>
|
|
26
25
|
<% if stats.sample_size.zero? %>
|
|
27
|
-
<span class="ck-trust-
|
|
28
|
-
<span class="ck-trust-line__hint">Needs <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %>
|
|
29
|
-
|
|
30
|
-
|
|
26
|
+
<span class="ck-trust-line__lead">Not measured yet.</span>
|
|
27
|
+
<span class="ck-trust-line__hint">Needs <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> human reviews of the judge's scores.<% if prior_version_verdicts > 0 %> (<%= pluralize(prior_version_verdicts, "earlier-version review") %> kept on file.)<% end %></span>
|
|
28
|
+
<% if target_response %>
|
|
29
|
+
<%= link_to "Review a judge's score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
|
|
30
|
+
<% end %>
|
|
31
31
|
<% elsif stats.counter_only? %>
|
|
32
|
-
<span class="ck-
|
|
33
|
-
|
|
32
|
+
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Human reviews</span> <strong><%= stats.sample_size %> / <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></strong></span>
|
|
33
|
+
<% if stats.short_to_target > 0 %><span class="ck-trust-line__hint"><%= stats.short_to_target %> more to report a rate</span><% end %>
|
|
34
|
+
<% if target_response %>
|
|
35
|
+
<%= link_to "Review another score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
|
|
36
|
+
<% end %>
|
|
34
37
|
<% else %>
|
|
35
|
-
<span class="ck-
|
|
36
|
-
<span class="ck-
|
|
37
|
-
<span class="ck-
|
|
38
|
-
<span class="ck-
|
|
38
|
+
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Agreement</span> <strong class="ck-trust-line__figure">~<%= (stats.agreement_point * 100).round %>%</strong></span>
|
|
39
|
+
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Margin</span> ±<%= (stats.margin * 100).round %> pt</span>
|
|
40
|
+
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Read</span> <%= stats.firm? ? "settled" : "early" %></span>
|
|
41
|
+
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Sample</span> <%= stats.sample_size %></span>
|
|
42
|
+
<% if stats.borderline_rate && stats.borderline_rate > 0 %>
|
|
43
|
+
<% level = stats.borderline_rate > 0.30 ? "danger" : stats.borderline_rate > 0.15 ? "warning" : "ok" %>
|
|
44
|
+
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Unclear</span> <span class="ck-trust-line__borderline ck-trust-line__borderline--<%= level %>"><%= (stats.borderline_rate * 100).round %>%</span></span>
|
|
45
|
+
<% end %>
|
|
39
46
|
<% end %>
|
|
40
47
|
</p>
|
|
@@ -40,20 +40,19 @@
|
|
|
40
40
|
<% if suggestion %>
|
|
41
41
|
<div class="ck-suggestion-banner" role="status">
|
|
42
42
|
<div class="ck-suggestion-banner__body">
|
|
43
|
-
<p class="ck-kicker"
|
|
44
|
-
<p class="ck-meta-copy">Based on
|
|
43
|
+
<p class="ck-kicker ck-kicker--icon"><%= heroicon_tag "sparkles", variant: :outline, class: "ck-magic-icon", "aria-hidden": "true" %>Proposed changes</p>
|
|
44
|
+
<p class="ck-meta-copy">Based on human reviews, here are some proposed changes to the metric.</p>
|
|
45
45
|
</div>
|
|
46
46
|
<div class="ck-suggestion-banner__actions">
|
|
47
|
-
<%= button_to
|
|
48
|
-
method: :post, form_class: "inline-block",
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
<%= button_to "Take everything", publish_draft_metric_path(metric, draft_id: suggestion.id),
|
|
47
|
+
<%= button_to suggest_variants_metric_path(metric, back_to: "edit"),
|
|
48
|
+
method: :post, form_class: "inline-block", class: "ck-icon-btn",
|
|
49
|
+
title: "Try again", "aria-label": "Try again",
|
|
50
|
+
data: { turbo_confirm: "Replace these changes with fresh ones from the model?" } do %><%= heroicon_tag "arrow-path", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
|
|
51
|
+
<%= button_to dismiss_suggestion_metric_path(metric, draft_id: suggestion.id, back_to: "edit"),
|
|
52
|
+
method: :delete, form_class: "inline-block", class: "ck-icon-btn",
|
|
53
|
+
title: "Discard these changes", "aria-label": "Discard",
|
|
54
|
+
data: { turbo_confirm: "Drop these changes?" } do %><%= heroicon_tag "trash", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
|
|
55
|
+
<%= button_to "Apply all", publish_draft_metric_path(metric, draft_id: suggestion.id),
|
|
57
56
|
method: :post, form_class: "inline-block",
|
|
58
57
|
class: ck_button_classes(:dark) %>
|
|
59
58
|
</div>
|
|
@@ -10,6 +10,24 @@
|
|
|
10
10
|
</div>
|
|
11
11
|
</section>
|
|
12
12
|
|
|
13
|
+
<% if CompletionKit.config.judge_calibration_enabled && @suggestion_draft.nil? && @edit_draft.nil? && @improve_disagreement_count.to_i.positive? %>
|
|
14
|
+
<div class="ck-suggestion-banner" role="status">
|
|
15
|
+
<div class="ck-suggestion-banner__body">
|
|
16
|
+
<p class="ck-kicker">Improve from reviews</p>
|
|
17
|
+
<p class="ck-meta-copy">Based on human reviews, the model can propose changes to this metric.</p>
|
|
18
|
+
</div>
|
|
19
|
+
<div class="ck-suggestion-banner__actions">
|
|
20
|
+
<%= button_to suggest_variants_metric_path(@metric, back_to: "edit"),
|
|
21
|
+
method: :post, form_class: "inline-block",
|
|
22
|
+
class: ck_button_classes(:light, variant: :outline) + " ck-button--sm",
|
|
23
|
+
data: { turbo_confirm: "Draft improvements to this metric from your human reviews? You can edit or apply them here before publishing." } do %>
|
|
24
|
+
<%= heroicon_tag "sparkles", variant: :outline, class: "ck-magic-icon", "aria-hidden": "true" %>
|
|
25
|
+
Suggest improvements
|
|
26
|
+
<% end %>
|
|
27
|
+
</div>
|
|
28
|
+
</div>
|
|
29
|
+
<% end %>
|
|
30
|
+
|
|
13
31
|
<%= render "form",
|
|
14
32
|
metric: @metric,
|
|
15
33
|
suggestion_draft: @suggestion_draft,
|
|
@@ -28,23 +28,6 @@
|
|
|
28
28
|
<tr onclick="window.location='<%= metric_path(metric) %>'" style="cursor: pointer;">
|
|
29
29
|
<td>
|
|
30
30
|
<%= link_to metric_path(metric), class: "ck-record-name" do %><strong><%= metric.name %></strong><% end %>
|
|
31
|
-
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
32
|
-
<% s = CompletionKit::MetricCalibrationStats.for(metric) %>
|
|
33
|
-
<p class="ck-metrics-table__trust" title="Calibration: how often this metric's scores match the humans who reviewed them.">
|
|
34
|
-
<%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true", class: "ck-trust-icon" %>
|
|
35
|
-
<span class="ck-metrics-table__trust-label">Calibration</span>
|
|
36
|
-
<% if s.counter_only? %>
|
|
37
|
-
<% if s.sample_size.zero? %>
|
|
38
|
-
<span class="ck-metrics-table__trust-state">Not measured yet</span>
|
|
39
|
-
<% else %>
|
|
40
|
-
<%= s.sample_size %> / <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts
|
|
41
|
-
<% end %>
|
|
42
|
-
<% else %>
|
|
43
|
-
<span class="ck-metrics-table__trust-rate" title="<%= s.firm? ? 'Settled read.' : 'Early read. Keep giving verdicts.' %>">~<%= (s.agreement_point * 100).round %>%</span>
|
|
44
|
-
±<%= (s.margin * 100).round %> pt · <%= s.firm? ? "settled" : "early" %>
|
|
45
|
-
<% end %>
|
|
46
|
-
</p>
|
|
47
|
-
<% end %>
|
|
48
31
|
<% if metric.tags.any? %>
|
|
49
32
|
<div class="tag-marks-row">
|
|
50
33
|
<%= render "completion_kit/tags/marks", tags: metric.tags %>
|