completion-kit 0.8.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/stylesheets/completion_kit/application.css +203 -334
- data/app/controllers/completion_kit/api/v1/metrics_controller.rb +1 -28
- data/app/controllers/completion_kit/metrics_controller.rb +30 -36
- data/app/controllers/completion_kit/runs_controller.rb +2 -2
- data/app/jobs/completion_kit/judge_review_job.rb +9 -16
- data/app/models/completion_kit/metric.rb +0 -1
- data/app/models/completion_kit/metric_version.rb +35 -0
- data/app/services/completion_kit/judge_service.rb +19 -10
- data/app/services/completion_kit/mcp_tools/metric_versions.rb +1 -1
- data/app/services/completion_kit/metric_calibration_examples.rb +56 -0
- data/app/services/completion_kit/metric_variant_generator.rb +0 -49
- data/app/views/completion_kit/api_reference/_body.html.erb +2 -12
- data/app/views/completion_kit/calibrations/_trust_panel.html.erb +25 -19
- data/app/views/completion_kit/metrics/_form.html.erb +11 -12
- data/app/views/completion_kit/metrics/_guiding_examples.html.erb +23 -0
- data/app/views/completion_kit/metrics/edit.html.erb +18 -0
- data/app/views/completion_kit/metrics/index.html.erb +5 -17
- data/app/views/completion_kit/metrics/show.html.erb +76 -100
- data/app/views/completion_kit/responses/show.html.erb +7 -5
- data/app/views/completion_kit/runs/show.html.erb +7 -7
- data/config/routes.rb +1 -4
- data/db/migrate/20260529000001_remove_few_shot_examples_from_completion_kit_metrics.rb +5 -0
- data/db/migrate/20260530000001_add_excluded_from_examples_to_completion_kit_calibrations.rb +5 -0
- data/lib/completion_kit/version.rb +1 -1
- data/lib/completion_kit.rb +2 -0
- metadata +5 -1
|
@@ -2,7 +2,7 @@ module CompletionKit
|
|
|
2
2
|
module Api
|
|
3
3
|
module V1
|
|
4
4
|
class MetricsController < BaseController
|
|
5
|
-
before_action :set_metric, only: [:show, :update, :destroy, :suggest_variants
|
|
5
|
+
before_action :set_metric, only: [:show, :update, :destroy, :suggest_variants]
|
|
6
6
|
|
|
7
7
|
def index
|
|
8
8
|
scope = Metric.includes(:tags)
|
|
@@ -54,33 +54,6 @@ module CompletionKit
|
|
|
54
54
|
render json: versions, status: :created
|
|
55
55
|
end
|
|
56
56
|
|
|
57
|
-
def add_few_shot
|
|
58
|
-
calibration = Calibration.where(metric_id: @metric.id, verdict: "disagree").find(params[:calibration_id])
|
|
59
|
-
review = calibration.response.reviews.find_by(metric_id: @metric.id)
|
|
60
|
-
examples = Array(@metric.few_shot_examples)
|
|
61
|
-
examples << {
|
|
62
|
-
"input" => calibration.response.input_data.to_s.truncate(2000),
|
|
63
|
-
"response" => calibration.response.response_text.to_s.truncate(2000),
|
|
64
|
-
"judge_score" => review&.ai_score&.to_f,
|
|
65
|
-
"judge_feedback" => review&.ai_feedback.to_s.truncate(1000),
|
|
66
|
-
"human_score" => calibration.corrected_score&.to_f,
|
|
67
|
-
"human_note" => calibration.note.to_s.truncate(1000),
|
|
68
|
-
"calibration_id" => calibration.id,
|
|
69
|
-
"added_at" => Time.current.utc.iso8601
|
|
70
|
-
}
|
|
71
|
-
@metric.update!(few_shot_examples: examples)
|
|
72
|
-
render json: @metric.reload
|
|
73
|
-
rescue ActiveRecord::RecordNotFound
|
|
74
|
-
render_error("Calibration not found or not a disagree on this metric.", status: :not_found)
|
|
75
|
-
end
|
|
76
|
-
|
|
77
|
-
def remove_few_shot
|
|
78
|
-
cal_id = params[:calibration_id].to_i
|
|
79
|
-
remaining = Array(@metric.few_shot_examples).reject { |fs| fs["calibration_id"].to_i == cal_id }
|
|
80
|
-
@metric.update!(few_shot_examples: remaining)
|
|
81
|
-
render json: @metric.reload
|
|
82
|
-
end
|
|
83
|
-
|
|
84
57
|
private
|
|
85
58
|
|
|
86
59
|
def set_metric
|
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
2
|
class MetricsController < ApplicationController
|
|
3
3
|
include CompletionKit::TagFiltering
|
|
4
|
-
before_action :set_metric, only: [:show, :edit, :update, :destroy, :
|
|
4
|
+
before_action :set_metric, only: [:show, :edit, :update, :destroy, :publish_draft, :suggest_variants, :dismiss_suggestion, :exclude_example]
|
|
5
|
+
before_action :ensure_examples_from_reviews_enabled, only: [:exclude_example]
|
|
5
6
|
|
|
6
7
|
def index
|
|
7
8
|
@metrics = apply_tag_filter(Metric.includes(:metric_groups, :tags).order(:name))
|
|
8
9
|
@available_starters = StarterMetrics.available
|
|
10
|
+
@current_versions = MetricVersion.published.current.where(metric_id: @metrics.map(&:id)).index_by(&:metric_id)
|
|
9
11
|
end
|
|
10
12
|
|
|
11
13
|
def starter_preview
|
|
@@ -35,15 +37,11 @@ module CompletionKit
|
|
|
35
37
|
end
|
|
36
38
|
|
|
37
39
|
def show
|
|
38
|
-
@published_metric_version = MetricVersion.ensure_current_for(@metric)
|
|
39
|
-
@disagreements = Calibration.where(metric_id: @metric.id, verdict: "disagree")
|
|
40
|
-
.includes(:metric_version, response: [:reviews, :run])
|
|
41
|
-
.order(created_at: :desc)
|
|
42
|
-
.limit(50)
|
|
43
40
|
@edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
|
|
44
41
|
@suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
|
|
45
42
|
@improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
|
|
46
43
|
@versions = MetricVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
|
|
44
|
+
@guiding_examples = CompletionKit.config.judge_examples_from_reviews ? MetricCalibrationExamples.judge_examples_for(@metric) : []
|
|
47
45
|
end
|
|
48
46
|
|
|
49
47
|
def new
|
|
@@ -54,6 +52,7 @@ module CompletionKit
|
|
|
54
52
|
@suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
|
|
55
53
|
@edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
|
|
56
54
|
@published_metric_version = MetricVersion.published.where(metric_id: @metric.id, current: true).first
|
|
55
|
+
@improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
|
|
57
56
|
|
|
58
57
|
if @edit_draft
|
|
59
58
|
@metric.instruction = @edit_draft.instruction
|
|
@@ -102,7 +101,7 @@ module CompletionKit
|
|
|
102
101
|
state: "draft", source: "edit", current: false
|
|
103
102
|
)
|
|
104
103
|
redirect_to edit_metric_path(@metric),
|
|
105
|
-
notice: "Saved as draft #{draft.version_label}. Publish to
|
|
104
|
+
notice: "Saved as draft #{draft.version_label}. Publish to make these changes the metric's live version."
|
|
106
105
|
else
|
|
107
106
|
@metric.update!(instruction: new_instruction, rubric_bands: new_rubric)
|
|
108
107
|
current_pub = MetricVersion.published.where(metric_id: @metric.id, current: true).first
|
|
@@ -120,7 +119,7 @@ module CompletionKit
|
|
|
120
119
|
target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
|
|
121
120
|
disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
|
|
122
121
|
if disagreement_count.zero?
|
|
123
|
-
redirect_to target, alert: "Mark at least one
|
|
122
|
+
redirect_to target, alert: "Mark at least one case as Disagree before asking the model to suggest a change."
|
|
124
123
|
return
|
|
125
124
|
end
|
|
126
125
|
|
|
@@ -132,15 +131,31 @@ module CompletionKit
|
|
|
132
131
|
redirect_to target, alert: "The model returned no usable variants. Try again with a different model."
|
|
133
132
|
return
|
|
134
133
|
end
|
|
135
|
-
generator.persist!(variants)
|
|
136
|
-
|
|
134
|
+
versions = generator.persist!(variants)
|
|
135
|
+
new_version = versions.max_by(&:version_number)
|
|
136
|
+
if params[:back_to] == "edit"
|
|
137
|
+
redirect_to edit_metric_path(@metric), notice: "Drafted #{new_version.version_label} from your reviews. Review the proposed changes below, then Publish to use it."
|
|
138
|
+
else
|
|
139
|
+
redirect_to metric_path(@metric, show_change: new_version.id), notice: "Drafted #{new_version.version_label} from your reviews."
|
|
140
|
+
end
|
|
137
141
|
end
|
|
138
142
|
|
|
139
143
|
def dismiss_suggestion
|
|
140
144
|
draft = MetricVersion.drafts.where(metric_id: @metric.id).find_by(id: params[:draft_id])
|
|
145
|
+
label = draft&.version_label
|
|
141
146
|
draft&.destroy
|
|
142
147
|
target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
|
|
143
|
-
redirect_to target, notice: "
|
|
148
|
+
redirect_to target, notice: label ? "Discarded draft #{label}." : "Draft already gone."
|
|
149
|
+
end
|
|
150
|
+
|
|
151
|
+
def exclude_example
|
|
152
|
+
calibration = Calibration.where(metric_id: @metric.id).find(params[:calibration_id])
|
|
153
|
+
calibration.update!(excluded_from_examples: true)
|
|
154
|
+
render turbo_stream: turbo_stream.replace(
|
|
155
|
+
"ck-guiding-#{@metric.id}",
|
|
156
|
+
partial: "completion_kit/metrics/guiding_examples",
|
|
157
|
+
locals: { metric: @metric, examples: MetricCalibrationExamples.judge_examples_for(@metric) }
|
|
158
|
+
)
|
|
144
159
|
end
|
|
145
160
|
|
|
146
161
|
def publish_draft
|
|
@@ -164,7 +179,7 @@ module CompletionKit
|
|
|
164
179
|
audit = version.revert!
|
|
165
180
|
prior_label = previously_current.version_label
|
|
166
181
|
redirect_to metric_path(@metric),
|
|
167
|
-
notice: "Reverted
|
|
182
|
+
notice: "Reverted #{@metric.name} to #{version.version_label} (logged as #{audit.version_label}). Human reviews collected against #{prior_label} stay tied to it."
|
|
168
183
|
else
|
|
169
184
|
version.publish!
|
|
170
185
|
redirect_to metric_path(@metric),
|
|
@@ -172,33 +187,12 @@ module CompletionKit
|
|
|
172
187
|
end
|
|
173
188
|
end
|
|
174
189
|
|
|
175
|
-
|
|
176
|
-
calibration = Calibration.where(metric_id: @metric.id, verdict: "disagree").find(params[:calibration_id])
|
|
177
|
-
review = calibration.response.reviews.find_by(metric_id: @metric.id)
|
|
178
|
-
examples = Array(@metric.few_shot_examples)
|
|
179
|
-
examples << {
|
|
180
|
-
"input" => calibration.response.input_data.to_s.truncate(2000),
|
|
181
|
-
"response" => calibration.response.response_text.to_s.truncate(2000),
|
|
182
|
-
"judge_score" => review&.ai_score&.to_f,
|
|
183
|
-
"judge_feedback" => review&.ai_feedback.to_s.truncate(1000),
|
|
184
|
-
"human_score" => calibration.corrected_score&.to_f,
|
|
185
|
-
"human_note" => calibration.note.to_s.truncate(1000),
|
|
186
|
-
"calibration_id" => calibration.id,
|
|
187
|
-
"added_at" => Time.current.utc.iso8601
|
|
188
|
-
}
|
|
189
|
-
@metric.update!(few_shot_examples: examples)
|
|
190
|
-
redirect_to metric_path(@metric), notice: "Got it. The judge will remember this next time it grades."
|
|
191
|
-
end
|
|
190
|
+
private
|
|
192
191
|
|
|
193
|
-
def
|
|
194
|
-
|
|
195
|
-
remaining = Array(@metric.few_shot_examples).reject { |fs| fs["calibration_id"].to_i == cal_id }
|
|
196
|
-
@metric.update!(few_shot_examples: remaining)
|
|
197
|
-
redirect_to metric_path(@metric), notice: "Forgotten. The judge won't see this case next time."
|
|
192
|
+
def ensure_examples_from_reviews_enabled
|
|
193
|
+
head :not_found unless CompletionKit.config.judge_examples_from_reviews
|
|
198
194
|
end
|
|
199
195
|
|
|
200
|
-
private
|
|
201
|
-
|
|
202
196
|
def set_metric
|
|
203
197
|
@metric = Metric.find(params[:id])
|
|
204
198
|
end
|
|
@@ -95,7 +95,7 @@ module CompletionKit
|
|
|
95
95
|
|
|
96
96
|
def regrade
|
|
97
97
|
if @run.regrade!
|
|
98
|
-
redirect_to run_path(@run), notice: "Re-grading existing responses
|
|
98
|
+
redirect_to run_path(@run), notice: "Re-grading existing responses against the current metrics."
|
|
99
99
|
else
|
|
100
100
|
redirect_to run_path(@run), alert: "Nothing to re-grade. The run has no succeeded responses or no metrics attached."
|
|
101
101
|
end
|
|
@@ -151,7 +151,7 @@ module CompletionKit
|
|
|
151
151
|
def retry_failures
|
|
152
152
|
if @run.stale_review_summary.any?
|
|
153
153
|
redirect_to run_path(@run),
|
|
154
|
-
alert: "
|
|
154
|
+
alert: "A metric has a newer version than the one this run was scored against. Retrying failed cases would mix scores from two versions in the same run. Use 'Re-run from scratch' to refresh everything against the current metrics."
|
|
155
155
|
return
|
|
156
156
|
end
|
|
157
157
|
|
|
@@ -58,8 +58,8 @@ module CompletionKit
|
|
|
58
58
|
run.prompt&.template,
|
|
59
59
|
criteria: metric.instruction.to_s,
|
|
60
60
|
rubric_text: metric.display_rubric_text,
|
|
61
|
-
|
|
62
|
-
|
|
61
|
+
input_data: response.input_data,
|
|
62
|
+
human_examples: review_examples_for(metric, response)
|
|
63
63
|
)
|
|
64
64
|
|
|
65
65
|
review = response.reviews.find_or_initialize_by(metric_id: metric.id)
|
|
@@ -81,9 +81,13 @@ module CompletionKit
|
|
|
81
81
|
|
|
82
82
|
private
|
|
83
83
|
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
84
|
+
def review_examples_for(metric, response)
|
|
85
|
+
return nil unless CompletionKit.config.judge_calibration_enabled
|
|
86
|
+
return nil unless CompletionKit.config.judge_examples_from_reviews
|
|
87
|
+
|
|
88
|
+
MetricCalibrationExamples.judge_examples_for(metric, exclude_response_id: response.id)
|
|
89
|
+
end
|
|
90
|
+
|
|
87
91
|
def confirm_judging_capability(judge_model_id)
|
|
88
92
|
model = Model.find_by(provider: ApiConfig.provider_for_model(judge_model_id), model_id: judge_model_id)
|
|
89
93
|
return unless model && model.supports_judging.nil?
|
|
@@ -116,16 +120,5 @@ module CompletionKit
|
|
|
116
120
|
response = Response.find_by(id: @response_id)
|
|
117
121
|
RunCompletionCheckJob.perform_later(response.run_id) if response
|
|
118
122
|
end
|
|
119
|
-
|
|
120
|
-
def few_shot_payload(metric)
|
|
121
|
-
return nil unless CompletionKit.config.judge_calibration_enabled
|
|
122
|
-
Array(metric.few_shot_examples).map do |fs|
|
|
123
|
-
{
|
|
124
|
-
human_score: fs["human_score"],
|
|
125
|
-
response_text: fs["response"].to_s,
|
|
126
|
-
human_note: fs["human_note"].to_s
|
|
127
|
-
}
|
|
128
|
-
end
|
|
129
|
-
end
|
|
130
123
|
end
|
|
131
124
|
end
|
|
@@ -17,7 +17,6 @@ module CompletionKit
|
|
|
17
17
|
has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
|
|
18
18
|
|
|
19
19
|
serialize :rubric_bands, coder: JSON
|
|
20
|
-
serialize :few_shot_examples, coder: JSON, type: Array
|
|
21
20
|
|
|
22
21
|
validates :name, presence: true
|
|
23
22
|
validates :key, tenant_scoped_uniqueness: { allow_nil: true }
|
|
@@ -40,6 +40,35 @@ module CompletionKit
|
|
|
40
40
|
"v#{version_number}"
|
|
41
41
|
end
|
|
42
42
|
|
|
43
|
+
def change_summary_against(previous)
|
|
44
|
+
return nil if previous.nil?
|
|
45
|
+
|
|
46
|
+
instruction_changed = previous.instruction.to_s.strip != instruction.to_s.strip
|
|
47
|
+
rubric_changes = rubric_band_change_count(previous)
|
|
48
|
+
return nil unless instruction_changed || rubric_changes.positive?
|
|
49
|
+
|
|
50
|
+
dimensions = []
|
|
51
|
+
dimensions << "instruction" if instruction_changed
|
|
52
|
+
dimensions << "rubric" if rubric_changes.positive?
|
|
53
|
+
|
|
54
|
+
words_changed = 0
|
|
55
|
+
if instruction_changed
|
|
56
|
+
old_words = previous.instruction.to_s.split
|
|
57
|
+
new_words = instruction.to_s.split
|
|
58
|
+
words_changed = (old_words - new_words).size + (new_words - old_words).size
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
magnitude = if rubric_changes >= 2 || (instruction_changed && rubric_changes >= 1) || words_changed >= 15
|
|
62
|
+
:major
|
|
63
|
+
elsif rubric_changes == 1 || words_changed >= 4
|
|
64
|
+
:minor
|
|
65
|
+
else
|
|
66
|
+
:trivial
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
{ magnitude: magnitude, label: "#{magnitude.to_s.capitalize} #{dimensions.to_sentence} changes" }
|
|
70
|
+
end
|
|
71
|
+
|
|
43
72
|
def publish!
|
|
44
73
|
MetricVersion.transaction do
|
|
45
74
|
self.class.where(metric_id: metric_id).where.not(id: id).update_all(current: false)
|
|
@@ -86,6 +115,12 @@ module CompletionKit
|
|
|
86
115
|
|
|
87
116
|
private
|
|
88
117
|
|
|
118
|
+
def rubric_band_change_count(previous)
|
|
119
|
+
prev = Metric.normalize_rubric_bands(previous.rubric_bands)
|
|
120
|
+
curr = Metric.normalize_rubric_bands(rubric_bands)
|
|
121
|
+
prev.zip(curr).count { |p, c| p["description"].to_s.strip != c["description"].to_s.strip }
|
|
122
|
+
end
|
|
123
|
+
|
|
89
124
|
def assign_version_number
|
|
90
125
|
return if version_number.present?
|
|
91
126
|
max = self.class.where(metric_id: metric_id).maximum(:version_number).to_i
|
|
@@ -10,13 +10,14 @@ module CompletionKit
|
|
|
10
10
|
@judge_client = LlmClient.for_model(@judge_model, ApiConfig.for_model(@judge_model))
|
|
11
11
|
end
|
|
12
12
|
|
|
13
|
-
def evaluate(output, expected_output = nil, prompt = nil, criteria: nil, rubric_text: nil,
|
|
13
|
+
def evaluate(output, expected_output = nil, prompt = nil, criteria: nil, rubric_text: nil, input_data: nil, human_examples: nil, **_extras)
|
|
14
14
|
raise CompletionKit::ConfigurationError, "Judge not configured" unless @judge_client.configured?
|
|
15
15
|
|
|
16
16
|
judge_prompt = build_judge_prompt(output, expected_output, prompt,
|
|
17
17
|
criteria: criteria,
|
|
18
|
-
rubric_text: rubric_text,
|
|
19
|
-
input_data: input_data
|
|
18
|
+
rubric_text: rubric_text,
|
|
19
|
+
input_data: input_data,
|
|
20
|
+
human_examples: human_examples)
|
|
20
21
|
|
|
21
22
|
response = @judge_client.generate_completion(judge_prompt, model: @judge_model)
|
|
22
23
|
raise StandardError, response if response.start_with?("Error:")
|
|
@@ -25,7 +26,7 @@ module CompletionKit
|
|
|
25
26
|
|
|
26
27
|
private
|
|
27
28
|
|
|
28
|
-
def build_judge_prompt(output, expected_output, prompt, criteria: nil, rubric_text: nil,
|
|
29
|
+
def build_judge_prompt(output, expected_output, prompt, criteria: nil, rubric_text: nil, input_data: nil, human_examples: nil)
|
|
29
30
|
judge_prompt = <<~PROMPT
|
|
30
31
|
You are an expert evaluator. You MUST respond with ONLY two lines in this exact format, nothing else:
|
|
31
32
|
|
|
@@ -42,12 +43,7 @@ module CompletionKit
|
|
|
42
43
|
judge_prompt += "\nCriteria: #{criteria}\n"
|
|
43
44
|
end
|
|
44
45
|
|
|
45
|
-
|
|
46
|
-
judge_prompt += "\nCalibration examples:\n"
|
|
47
|
-
human_examples.each_with_index do |example, index|
|
|
48
|
-
judge_prompt += "Example #{index + 1}: score=#{example[:human_score]} output=#{example[:response_text].to_s.truncate(200)}\n"
|
|
49
|
-
end
|
|
50
|
-
end
|
|
46
|
+
judge_prompt += human_examples_block(human_examples)
|
|
51
47
|
|
|
52
48
|
judge_prompt += <<~PROMPT
|
|
53
49
|
|
|
@@ -60,6 +56,19 @@ module CompletionKit
|
|
|
60
56
|
judge_prompt
|
|
61
57
|
end
|
|
62
58
|
|
|
59
|
+
def human_examples_block(examples)
|
|
60
|
+
return "" if examples.blank?
|
|
61
|
+
|
|
62
|
+
lines = ["", "Reviewed examples where a human corrected the judge on this metric. Weigh them when scoring:"]
|
|
63
|
+
examples.each_with_index do |example, index|
|
|
64
|
+
note = example[:human_note].to_s
|
|
65
|
+
line = "Example #{index + 1}: Output: #{example[:output].to_s.truncate(200)}. The judge scored this #{example[:judge_score].to_i}/5. A reviewer corrected it to #{example[:human_score].to_i}/5"
|
|
66
|
+
line += note.present? ? ": #{note.truncate(160)}" : "."
|
|
67
|
+
lines << line
|
|
68
|
+
end
|
|
69
|
+
lines.join("\n") + "\n"
|
|
70
|
+
end
|
|
71
|
+
|
|
63
72
|
def parse_judge_response(response)
|
|
64
73
|
score_match = response.match(/\*{0,2}Score:?\*{0,2}\s*(\d+(?:\.\d+)?)/i)
|
|
65
74
|
feedback_match = response.match(/\*{0,2}Feedback:?\*{0,2}\s*(.+)/mi)
|
|
@@ -16,7 +16,7 @@ module CompletionKit
|
|
|
16
16
|
handler: :list
|
|
17
17
|
},
|
|
18
18
|
"metric_versions_publish" => {
|
|
19
|
-
description: "Publish a MetricVersion as the live
|
|
19
|
+
description: "Publish a MetricVersion as the live version of its metric. Works for both 'draft → published' and 'revert to an older published version → current'. Transactionally flips current, demotes peers, and writes the version's instruction + rubric_bands back onto the metric so the judge grades against it.",
|
|
20
20
|
inputSchema: {
|
|
21
21
|
type: "object",
|
|
22
22
|
properties: {
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
module CompletionKit
|
|
2
|
+
module MetricCalibrationExamples
|
|
3
|
+
DEFAULT_JUDGE_EXAMPLE_LIMIT = 5
|
|
4
|
+
|
|
5
|
+
module_function
|
|
6
|
+
|
|
7
|
+
def for(metric, limit: 8)
|
|
8
|
+
disagreements_for(metric, limit: limit)
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def disagreements_for(metric, limit: 8)
|
|
12
|
+
calibrations_for(metric, verdict: "disagree", limit: limit)
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
def borderlines_for(metric, limit: 6)
|
|
16
|
+
calibrations_for(metric, verdict: "borderline", limit: limit)
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
def judge_examples_for(metric, exclude_response_id: nil, limit: DEFAULT_JUDGE_EXAMPLE_LIMIT)
|
|
20
|
+
current_version = MetricVersion.current.find_by(metric_id: metric.id)
|
|
21
|
+
return [] unless current_version
|
|
22
|
+
|
|
23
|
+
relation = Calibration
|
|
24
|
+
.where(metric_id: metric.id, metric_version_id: current_version.id, excluded_from_examples: false)
|
|
25
|
+
.where.not(corrected_score: nil)
|
|
26
|
+
relation = relation.where.not(response_id: exclude_response_id) if exclude_response_id
|
|
27
|
+
map_examples(relation.includes(response: :reviews).order(created_at: :desc).limit(limit), metric)
|
|
28
|
+
.reject { |example| example[:judge_score].nil? }
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
def calibrations_for(metric, verdict:, limit:)
|
|
32
|
+
base = Calibration.where(metric_id: metric.id, verdict: verdict)
|
|
33
|
+
current_version = MetricVersion.current.find_by(metric_id: metric.id)
|
|
34
|
+
scoped = current_version ? base.where(metric_version_id: current_version.id) : base
|
|
35
|
+
effective = scoped.exists? ? scoped : base
|
|
36
|
+
map_examples(effective.includes(response: :reviews).order(created_at: :desc).limit(limit), metric)
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
def map_examples(relation, metric)
|
|
40
|
+
relation.map do |cal|
|
|
41
|
+
review = cal.response.reviews.find { |r| r.metric_id == metric.id }
|
|
42
|
+
{
|
|
43
|
+
id: cal.id,
|
|
44
|
+
run_id: cal.run_id,
|
|
45
|
+
response_id: cal.response_id,
|
|
46
|
+
input: cal.response.input_data,
|
|
47
|
+
output: cal.response.response_text,
|
|
48
|
+
judge_score: review&.ai_score,
|
|
49
|
+
judge_feedback: review&.ai_feedback,
|
|
50
|
+
human_score: cal.corrected_score,
|
|
51
|
+
human_note: cal.note
|
|
52
|
+
}
|
|
53
|
+
end
|
|
54
|
+
end
|
|
55
|
+
end
|
|
56
|
+
end
|
|
@@ -43,7 +43,6 @@ module CompletionKit
|
|
|
43
43
|
def build_meta_prompt
|
|
44
44
|
disagreements = MetricCalibrationExamples.disagreements_for(@metric)
|
|
45
45
|
borderlines = MetricCalibrationExamples.borderlines_for(@metric)
|
|
46
|
-
pinned_examples = Array(@metric.few_shot_examples)
|
|
47
46
|
sections = []
|
|
48
47
|
sections << "You are an expert evaluator. The judge below is misaligned with humans. Propose #{@count == 1 ? "a single" : "#{@count}"} concrete rewrite that closes the gap."
|
|
49
48
|
sections << ""
|
|
@@ -78,18 +77,6 @@ module CompletionKit
|
|
|
78
77
|
sections << ""
|
|
79
78
|
end
|
|
80
79
|
end
|
|
81
|
-
if pinned_examples.any?
|
|
82
|
-
sections << "## Pinned cases the judge already references"
|
|
83
|
-
sections << "These are cases the operator pinned for the judge to remember. The improved rubric must remain consistent with these — that is, the new instruction + rubric should produce roughly the human_score on these inputs, not the judge_score."
|
|
84
|
-
pinned_examples.each_with_index do |ex, i|
|
|
85
|
-
sections << "### Pinned #{i + 1}"
|
|
86
|
-
sections << "Input: #{ex["input"].to_s.truncate(200)}"
|
|
87
|
-
sections << "Output: #{ex["response"].to_s.truncate(200)}"
|
|
88
|
-
sections << "Judge previously said #{ex["judge_score"]}/5: #{ex["judge_feedback"].to_s.truncate(160)}"
|
|
89
|
-
sections << "Human said #{ex["human_score"]}/5: #{ex["human_note"].to_s.truncate(160)}"
|
|
90
|
-
sections << ""
|
|
91
|
-
end
|
|
92
|
-
end
|
|
93
80
|
sections << "## Task"
|
|
94
81
|
sections << "Make one substantive change. Don't just reword. If the disagreements look like instruction problems, rewrite the instruction. If they look like rubric problems (overlapping bands, undefined edge cases), rewrite the rubric. Rewrite both if both are wrong."
|
|
95
82
|
sections << ""
|
|
@@ -130,40 +117,4 @@ module CompletionKit
|
|
|
130
117
|
end
|
|
131
118
|
end
|
|
132
119
|
|
|
133
|
-
module MetricCalibrationExamples
|
|
134
|
-
module_function
|
|
135
|
-
|
|
136
|
-
def for(metric, limit: 8)
|
|
137
|
-
disagreements_for(metric, limit: limit)
|
|
138
|
-
end
|
|
139
|
-
|
|
140
|
-
def disagreements_for(metric, limit: 8)
|
|
141
|
-
calibrations_for(metric, verdict: "disagree", limit: limit)
|
|
142
|
-
end
|
|
143
|
-
|
|
144
|
-
def borderlines_for(metric, limit: 6)
|
|
145
|
-
calibrations_for(metric, verdict: "borderline", limit: limit)
|
|
146
|
-
end
|
|
147
|
-
|
|
148
|
-
def calibrations_for(metric, verdict:, limit:)
|
|
149
|
-
base = Calibration.where(metric_id: metric.id, verdict: verdict)
|
|
150
|
-
current_version = MetricVersion.current.find_by(metric_id: metric.id)
|
|
151
|
-
scoped = current_version ? base.where(metric_version_id: current_version.id) : base
|
|
152
|
-
effective = scoped.exists? ? scoped : base
|
|
153
|
-
effective.includes(response: :reviews)
|
|
154
|
-
.order(created_at: :desc)
|
|
155
|
-
.limit(limit)
|
|
156
|
-
.map do |cal|
|
|
157
|
-
review = cal.response.reviews.find { |r| r.metric_id == metric.id }
|
|
158
|
-
{
|
|
159
|
-
input: cal.response.input_data,
|
|
160
|
-
output: cal.response.response_text,
|
|
161
|
-
judge_score: review&.ai_score,
|
|
162
|
-
judge_feedback: review&.ai_feedback,
|
|
163
|
-
human_score: cal.corrected_score,
|
|
164
|
-
human_note: cal.note
|
|
165
|
-
}
|
|
166
|
-
end
|
|
167
|
-
end
|
|
168
|
-
end
|
|
169
120
|
end
|
|
@@ -27,7 +27,7 @@
|
|
|
27
27
|
<label for="ck-tab-runs" class="ck-api-tabs__label">Runs <span class="ck-api-tabs__count">10</span></label>
|
|
28
28
|
<label for="ck-tab-responses" class="ck-api-tabs__label">Responses <span class="ck-api-tabs__count">2</span></label>
|
|
29
29
|
<label for="ck-tab-datasets" class="ck-api-tabs__label">Datasets <span class="ck-api-tabs__count">5</span></label>
|
|
30
|
-
<label for="ck-tab-metrics" class="ck-api-tabs__label">Metrics <span class="ck-api-tabs__count">
|
|
30
|
+
<label for="ck-tab-metrics" class="ck-api-tabs__label">Metrics <span class="ck-api-tabs__count">10</span></label>
|
|
31
31
|
<label for="ck-tab-metric-groups" class="ck-api-tabs__label">Metric Groups <span class="ck-api-tabs__count">5</span></label>
|
|
32
32
|
<label for="ck-tab-calibrations" class="ck-api-tabs__label">Calibrations <span class="ck-api-tabs__count">3</span></label>
|
|
33
33
|
<label for="ck-tab-tags" class="ck-api-tabs__label">Tags <span class="ck-api-tabs__count">5</span></label>
|
|
@@ -239,7 +239,7 @@
|
|
|
239
239
|
|
|
240
240
|
<div class="ck-api-endpoint" style="padding-top: 1.5rem;">
|
|
241
241
|
<p class="ck-kicker" style="margin-bottom: 0.5rem;">Calibration loop</p>
|
|
242
|
-
<p class="ck-meta-copy">Drive metric improvement from disagree-flagged calibrations: ask the model
|
|
242
|
+
<p class="ck-meta-copy">Drive metric improvement from disagree-flagged calibrations: ask the model to rewrite the instruction and rubric into a new draft version.</p>
|
|
243
243
|
</div>
|
|
244
244
|
<div class="ck-api-endpoint">
|
|
245
245
|
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/metrics/:id/suggest_variants</p>
|
|
@@ -247,16 +247,6 @@
|
|
|
247
247
|
<p class="ck-api-params"><strong>Optional:</strong> <code>count</code>, <code>model</code></p>
|
|
248
248
|
<%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/metrics/1/suggest_variants \\\n -H \"Authorization: Bearer #{token}\"" %>
|
|
249
249
|
</div>
|
|
250
|
-
<div class="ck-api-endpoint">
|
|
251
|
-
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/metrics/:id/add_few_shot</p>
|
|
252
|
-
<p class="ck-meta-copy">Pin a disagree calibration as a few-shot example on the metric. Returns the updated metric, or 404 if the calibration is not a disagree on this metric.</p>
|
|
253
|
-
<p class="ck-api-params"><strong>Required:</strong> <code>calibration_id</code></p>
|
|
254
|
-
</div>
|
|
255
|
-
<div class="ck-api-endpoint">
|
|
256
|
-
<p class="ck-api-method"><span class="ck-chip" style="color: var(--ck-danger);">DELETE</span> /api/v1/metrics/:id/remove_few_shot</p>
|
|
257
|
-
<p class="ck-meta-copy">Drop the pinned few-shot example by calibration ID.</p>
|
|
258
|
-
<p class="ck-api-params"><strong>Required:</strong> <code>calibration_id</code></p>
|
|
259
|
-
</div>
|
|
260
250
|
|
|
261
251
|
<div class="ck-api-endpoint" style="padding-top: 1.5rem;">
|
|
262
252
|
<p class="ck-kicker" style="margin-bottom: 0.5rem;">Metric versions</p>
|
|
@@ -1,16 +1,12 @@
|
|
|
1
1
|
<% stats = local_assigns[:stats] %>
|
|
2
2
|
<% metric = local_assigns[:metric] %>
|
|
3
3
|
<% anchor = metric&.name&.parameterize %>
|
|
4
|
-
<%
|
|
4
|
+
<% current_metric_version = metric && CompletionKit::MetricVersion.current.find_by(metric_id: metric.id) %>
|
|
5
|
+
<% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric && current_metric_version
|
|
5
6
|
created_by = CompletionKit.config.username.presence || "operator"
|
|
6
|
-
|
|
7
|
-
verdicted_ids = if current_metric_version
|
|
8
|
-
CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
|
|
9
|
-
else
|
|
10
|
-
[]
|
|
11
|
-
end
|
|
7
|
+
verdicted_ids = CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
|
|
12
8
|
CompletionKit::Response.joins(:reviews)
|
|
13
|
-
.where(reviews: { metric_id: metric.id })
|
|
9
|
+
.where(reviews: { metric_id: metric.id, metric_version_id: current_metric_version.id })
|
|
14
10
|
.where.not(reviews: { ai_score: nil })
|
|
15
11
|
.where.not(id: verdicted_ids)
|
|
16
12
|
.order(created_at: :desc).first
|
|
@@ -22,19 +18,29 @@
|
|
|
22
18
|
end %>
|
|
23
19
|
|
|
24
20
|
<p class="ck-trust-line ck-trust-line--<%= stats.gate %>">
|
|
25
|
-
<span class="ck-trust-line__label"><%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true" %>Calibration</span>
|
|
26
21
|
<% if stats.sample_size.zero? %>
|
|
27
|
-
<span class="ck-trust-
|
|
28
|
-
<span class="ck-trust-line__hint"
|
|
29
|
-
|
|
30
|
-
|
|
22
|
+
<span class="ck-trust-line__lead">Not measured yet.</span>
|
|
23
|
+
<span class="ck-trust-line__hint"><%= current_metric_version ? "#{current_metric_version.version_label} needs" : "Needs" %> <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> human reviews of the judge's scores.</span>
|
|
24
|
+
<% if target_response %>
|
|
25
|
+
<%= link_to "Review a judge's score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
|
|
26
|
+
<% end %>
|
|
31
27
|
<% elsif stats.counter_only? %>
|
|
32
|
-
<span class="ck-
|
|
33
|
-
|
|
28
|
+
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Human reviews</span> <strong><%= stats.sample_size %> / <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></strong></span>
|
|
29
|
+
<% if stats.short_to_target > 0 %><span class="ck-trust-line__hint"><%= stats.short_to_target %> more to report a rate</span><% end %>
|
|
30
|
+
<% if target_response %>
|
|
31
|
+
<%= link_to "Review another score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
|
|
32
|
+
<% end %>
|
|
34
33
|
<% else %>
|
|
35
|
-
<span class="ck-
|
|
36
|
-
<span class="ck-
|
|
37
|
-
<span class="ck-
|
|
38
|
-
<span class="ck-
|
|
34
|
+
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Agreement</span> <strong class="ck-trust-line__figure">~<%= (stats.agreement_point * 100).round %>%</strong></span>
|
|
35
|
+
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Margin</span> ±<%= (stats.margin * 100).round %> pt</span>
|
|
36
|
+
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Read</span> <%= stats.firm? ? "settled" : "early" %></span>
|
|
37
|
+
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Sample</span> <%= stats.sample_size %></span>
|
|
38
|
+
<% if stats.borderline_rate && stats.borderline_rate > 0 %>
|
|
39
|
+
<% level = stats.borderline_rate > 0.30 ? "danger" : stats.borderline_rate > 0.15 ? "warning" : "ok" %>
|
|
40
|
+
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Unclear</span> <span class="ck-trust-line__borderline ck-trust-line__borderline--<%= level %>"><%= (stats.borderline_rate * 100).round %>%</span></span>
|
|
41
|
+
<% end %>
|
|
39
42
|
<% end %>
|
|
40
43
|
</p>
|
|
44
|
+
<% if stats.sample_size.zero? && prior_version_verdicts > 0 %>
|
|
45
|
+
<p class="ck-trust-line__aside"><%= pluralize(prior_version_verdicts, "review") %> from an earlier version <%= prior_version_verdicts == 1 ? "doesn't" : "don't" %> count toward this version.</p>
|
|
46
|
+
<% end %>
|
|
@@ -40,20 +40,19 @@
|
|
|
40
40
|
<% if suggestion %>
|
|
41
41
|
<div class="ck-suggestion-banner" role="status">
|
|
42
42
|
<div class="ck-suggestion-banner__body">
|
|
43
|
-
<p class="ck-kicker"
|
|
44
|
-
<p class="ck-meta-copy">Based on
|
|
43
|
+
<p class="ck-kicker ck-kicker--icon"><%= heroicon_tag "sparkles", variant: :outline, class: "ck-magic-icon", "aria-hidden": "true" %>Proposed changes</p>
|
|
44
|
+
<p class="ck-meta-copy">Based on human reviews, here are some proposed changes to the metric.</p>
|
|
45
45
|
</div>
|
|
46
46
|
<div class="ck-suggestion-banner__actions">
|
|
47
|
-
<%= button_to
|
|
48
|
-
method: :post, form_class: "inline-block",
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
<%= button_to "Take everything", publish_draft_metric_path(metric, draft_id: suggestion.id),
|
|
47
|
+
<%= button_to suggest_variants_metric_path(metric, back_to: "edit"),
|
|
48
|
+
method: :post, form_class: "inline-block", class: "ck-icon-btn",
|
|
49
|
+
title: "Try again", "aria-label": "Try again",
|
|
50
|
+
data: { turbo_confirm: "Replace these changes with fresh ones from the model?" } do %><%= heroicon_tag "arrow-path", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
|
|
51
|
+
<%= button_to dismiss_suggestion_metric_path(metric, draft_id: suggestion.id, back_to: "edit"),
|
|
52
|
+
method: :delete, form_class: "inline-block", class: "ck-icon-btn",
|
|
53
|
+
title: "Discard these changes", "aria-label": "Discard",
|
|
54
|
+
data: { turbo_confirm: "Drop these changes?" } do %><%= heroicon_tag "trash", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
|
|
55
|
+
<%= button_to "Apply all", publish_draft_metric_path(metric, draft_id: suggestion.id),
|
|
57
56
|
method: :post, form_class: "inline-block",
|
|
58
57
|
class: ck_button_classes(:dark) %>
|
|
59
58
|
</div>
|