completion-kit 0.7.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/stylesheets/completion_kit/application.css +146 -325
- data/app/controllers/completion_kit/api/v1/base_controller.rb +14 -4
- data/app/controllers/completion_kit/api/v1/calibrations_controller.rb +2 -2
- data/app/controllers/completion_kit/api/v1/datasets_controller.rb +2 -2
- data/app/controllers/completion_kit/api/v1/metric_groups_controller.rb +2 -2
- data/app/controllers/completion_kit/api/v1/metric_versions_controller.rb +1 -1
- data/app/controllers/completion_kit/api/v1/metrics_controller.rb +5 -32
- data/app/controllers/completion_kit/api/v1/prompts_controller.rb +2 -2
- data/app/controllers/completion_kit/api/v1/provider_credentials_controller.rb +2 -2
- data/app/controllers/completion_kit/api/v1/runs_controller.rb +7 -7
- data/app/controllers/completion_kit/api/v1/tags_controller.rb +2 -2
- data/app/controllers/completion_kit/metrics_controller.rb +14 -37
- data/app/controllers/completion_kit/runs_controller.rb +2 -2
- data/app/jobs/completion_kit/generate_row_job.rb +2 -4
- data/app/jobs/completion_kit/judge_review_job.rb +4 -19
- data/app/models/completion_kit/metric.rb +0 -1
- data/app/models/completion_kit/metric_version.rb +35 -0
- data/app/models/completion_kit/run.rb +0 -1
- data/app/services/completion_kit/judge_service.rb +3 -10
- data/app/services/completion_kit/mcp_tools/metric_versions.rb +1 -1
- data/app/services/completion_kit/metric_variant_generator.rb +0 -13
- data/app/views/completion_kit/api_reference/_body.html.erb +2 -12
- data/app/views/completion_kit/api_reference/index.html.erb +4 -0
- data/app/views/completion_kit/calibrations/_trust_panel.html.erb +19 -12
- data/app/views/completion_kit/metrics/_form.html.erb +11 -12
- data/app/views/completion_kit/metrics/edit.html.erb +18 -0
- data/app/views/completion_kit/metrics/index.html.erb +0 -17
- data/app/views/completion_kit/metrics/show.html.erb +87 -105
- data/app/views/completion_kit/responses/show.html.erb +2 -2
- data/app/views/completion_kit/runs/show.html.erb +7 -7
- data/config/routes.rb +0 -4
- data/db/migrate/20260529000001_remove_few_shot_examples_from_completion_kit_metrics.rb +5 -0
- data/lib/completion_kit/version.rb +1 -1
- metadata +2 -1
|
@@ -20,7 +20,7 @@ module CompletionKit
|
|
|
20
20
|
metric_group.replace_metrics!(params[:metric_ids]) if params.key?(:metric_ids)
|
|
21
21
|
render json: metric_group.reload, status: :created
|
|
22
22
|
else
|
|
23
|
-
|
|
23
|
+
render_validation_errors(metric_group)
|
|
24
24
|
end
|
|
25
25
|
end
|
|
26
26
|
|
|
@@ -29,7 +29,7 @@ module CompletionKit
|
|
|
29
29
|
@metric_group.replace_metrics!(params[:metric_ids]) if params.key?(:metric_ids)
|
|
30
30
|
render json: @metric_group.reload
|
|
31
31
|
else
|
|
32
|
-
|
|
32
|
+
render_validation_errors(@metric_group)
|
|
33
33
|
end
|
|
34
34
|
end
|
|
35
35
|
|
|
@@ -25,7 +25,7 @@ module CompletionKit
|
|
|
25
25
|
|
|
26
26
|
def destroy
|
|
27
27
|
if @version.published?
|
|
28
|
-
|
|
28
|
+
render_error("Cannot dismiss a published version. Publish a different version as current instead.", status: :conflict)
|
|
29
29
|
return
|
|
30
30
|
end
|
|
31
31
|
@version.destroy!
|
|
@@ -2,7 +2,7 @@ module CompletionKit
|
|
|
2
2
|
module Api
|
|
3
3
|
module V1
|
|
4
4
|
class MetricsController < BaseController
|
|
5
|
-
before_action :set_metric, only: [:show, :update, :destroy, :suggest_variants
|
|
5
|
+
before_action :set_metric, only: [:show, :update, :destroy, :suggest_variants]
|
|
6
6
|
|
|
7
7
|
def index
|
|
8
8
|
scope = Metric.includes(:tags)
|
|
@@ -19,7 +19,7 @@ module CompletionKit
|
|
|
19
19
|
if metric.save
|
|
20
20
|
render json: metric, status: :created
|
|
21
21
|
else
|
|
22
|
-
|
|
22
|
+
render_validation_errors(metric)
|
|
23
23
|
end
|
|
24
24
|
end
|
|
25
25
|
|
|
@@ -27,7 +27,7 @@ module CompletionKit
|
|
|
27
27
|
if @metric.update(metric_params)
|
|
28
28
|
render json: @metric
|
|
29
29
|
else
|
|
30
|
-
|
|
30
|
+
render_validation_errors(@metric)
|
|
31
31
|
end
|
|
32
32
|
end
|
|
33
33
|
|
|
@@ -39,7 +39,7 @@ module CompletionKit
|
|
|
39
39
|
def suggest_variants
|
|
40
40
|
disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
|
|
41
41
|
if disagreement_count.zero?
|
|
42
|
-
|
|
42
|
+
render_error("Mark at least one case as Disagree before asking the model to suggest a change.", status: :unprocessable_entity)
|
|
43
43
|
return
|
|
44
44
|
end
|
|
45
45
|
|
|
@@ -47,40 +47,13 @@ module CompletionKit
|
|
|
47
47
|
generator = MetricVariantGenerator.new(@metric, count: params[:count].to_i, model: params[:model])
|
|
48
48
|
variants = generator.call
|
|
49
49
|
if variants.empty?
|
|
50
|
-
|
|
50
|
+
render_error("The model returned no usable variants. Try again with a different model.", status: :unprocessable_entity)
|
|
51
51
|
return
|
|
52
52
|
end
|
|
53
53
|
versions = generator.persist!(variants)
|
|
54
54
|
render json: versions, status: :created
|
|
55
55
|
end
|
|
56
56
|
|
|
57
|
-
def add_few_shot
|
|
58
|
-
calibration = Calibration.where(metric_id: @metric.id, verdict: "disagree").find(params[:calibration_id])
|
|
59
|
-
review = calibration.response.reviews.find_by(metric_id: @metric.id)
|
|
60
|
-
examples = Array(@metric.few_shot_examples)
|
|
61
|
-
examples << {
|
|
62
|
-
"input" => calibration.response.input_data.to_s.truncate(2000),
|
|
63
|
-
"response" => calibration.response.response_text.to_s.truncate(2000),
|
|
64
|
-
"judge_score" => review&.ai_score&.to_f,
|
|
65
|
-
"judge_feedback" => review&.ai_feedback.to_s.truncate(1000),
|
|
66
|
-
"human_score" => calibration.corrected_score&.to_f,
|
|
67
|
-
"human_note" => calibration.note.to_s.truncate(1000),
|
|
68
|
-
"calibration_id" => calibration.id,
|
|
69
|
-
"added_at" => Time.current.utc.iso8601
|
|
70
|
-
}
|
|
71
|
-
@metric.update!(few_shot_examples: examples)
|
|
72
|
-
render json: @metric.reload
|
|
73
|
-
rescue ActiveRecord::RecordNotFound
|
|
74
|
-
render json: { error: "Calibration not found or not a disagree on this metric." }, status: :not_found
|
|
75
|
-
end
|
|
76
|
-
|
|
77
|
-
def remove_few_shot
|
|
78
|
-
cal_id = params[:calibration_id].to_i
|
|
79
|
-
remaining = Array(@metric.few_shot_examples).reject { |fs| fs["calibration_id"].to_i == cal_id }
|
|
80
|
-
@metric.update!(few_shot_examples: remaining)
|
|
81
|
-
render json: @metric.reload
|
|
82
|
-
end
|
|
83
|
-
|
|
84
57
|
private
|
|
85
58
|
|
|
86
59
|
def set_metric
|
|
@@ -19,7 +19,7 @@ module CompletionKit
|
|
|
19
19
|
if prompt.save
|
|
20
20
|
render json: prompt, status: :created
|
|
21
21
|
else
|
|
22
|
-
|
|
22
|
+
render_validation_errors(prompt)
|
|
23
23
|
end
|
|
24
24
|
end
|
|
25
25
|
|
|
@@ -32,7 +32,7 @@ module CompletionKit
|
|
|
32
32
|
elsif @prompt.update(prompt_params)
|
|
33
33
|
render json: @prompt
|
|
34
34
|
else
|
|
35
|
-
|
|
35
|
+
render_validation_errors(@prompt)
|
|
36
36
|
end
|
|
37
37
|
end
|
|
38
38
|
|
|
@@ -17,7 +17,7 @@ module CompletionKit
|
|
|
17
17
|
if credential.save
|
|
18
18
|
render json: credential, status: :created
|
|
19
19
|
else
|
|
20
|
-
|
|
20
|
+
render_validation_errors(credential)
|
|
21
21
|
end
|
|
22
22
|
end
|
|
23
23
|
|
|
@@ -25,7 +25,7 @@ module CompletionKit
|
|
|
25
25
|
if @credential.update(credential_params)
|
|
26
26
|
render json: @credential
|
|
27
27
|
else
|
|
28
|
-
|
|
28
|
+
render_validation_errors(@credential)
|
|
29
29
|
end
|
|
30
30
|
end
|
|
31
31
|
|
|
@@ -23,7 +23,7 @@ module CompletionKit
|
|
|
23
23
|
run.replace_metrics!(params[:metric_ids])
|
|
24
24
|
render json: run.reload, status: :created
|
|
25
25
|
else
|
|
26
|
-
|
|
26
|
+
render_validation_errors(run)
|
|
27
27
|
end
|
|
28
28
|
end
|
|
29
29
|
|
|
@@ -32,7 +32,7 @@ module CompletionKit
|
|
|
32
32
|
@run.replace_metrics!(params[:metric_ids]) if params.key?(:metric_ids)
|
|
33
33
|
render json: @run.reload
|
|
34
34
|
else
|
|
35
|
-
|
|
35
|
+
render_validation_errors(@run)
|
|
36
36
|
end
|
|
37
37
|
end
|
|
38
38
|
|
|
@@ -45,13 +45,13 @@ module CompletionKit
|
|
|
45
45
|
if @run.start!
|
|
46
46
|
render json: @run.reload, status: :accepted
|
|
47
47
|
else
|
|
48
|
-
|
|
48
|
+
render_error(@run.failure_summary || @run.errors.full_messages.to_sentence, status: :unprocessable_entity)
|
|
49
49
|
end
|
|
50
50
|
end
|
|
51
51
|
|
|
52
52
|
def retry_failures
|
|
53
53
|
if @run.stale_review_summary.any?
|
|
54
|
-
return
|
|
54
|
+
return render_error("Judge has changed since this run executed. Retry would mix versions in the same run; use POST /api/v1/runs/:id/rerun instead.", status: :conflict)
|
|
55
55
|
end
|
|
56
56
|
|
|
57
57
|
scope = @run.responses.where(status: "failed")
|
|
@@ -90,7 +90,7 @@ module CompletionKit
|
|
|
90
90
|
if new_run.start!
|
|
91
91
|
render json: new_run.reload, status: :accepted
|
|
92
92
|
else
|
|
93
|
-
|
|
93
|
+
render_error(new_run.failure_summary || "Could not start the new run.", status: :unprocessable_entity)
|
|
94
94
|
end
|
|
95
95
|
end
|
|
96
96
|
|
|
@@ -98,7 +98,7 @@ module CompletionKit
|
|
|
98
98
|
if @run.regrade!
|
|
99
99
|
render json: @run.reload, status: :accepted
|
|
100
100
|
else
|
|
101
|
-
|
|
101
|
+
render_error("Nothing to re-grade. The run has no succeeded responses or no metrics attached.", status: :unprocessable_entity)
|
|
102
102
|
end
|
|
103
103
|
end
|
|
104
104
|
|
|
@@ -107,7 +107,7 @@ module CompletionKit
|
|
|
107
107
|
comparison = build_run_comparison(@run, other)
|
|
108
108
|
render json: { left_run_id: @run.id, right_run_id: other.id, metric_ids: comparison[:metric_ids], rows: comparison[:rows] }
|
|
109
109
|
rescue ActiveRecord::RecordNotFound
|
|
110
|
-
|
|
110
|
+
render_error("Other run not found. Pass ?with=<run_id>.", status: :not_found)
|
|
111
111
|
end
|
|
112
112
|
|
|
113
113
|
private
|
|
@@ -17,7 +17,7 @@ module CompletionKit
|
|
|
17
17
|
if tag.save
|
|
18
18
|
render json: tag, status: :created
|
|
19
19
|
else
|
|
20
|
-
|
|
20
|
+
render_validation_errors(tag)
|
|
21
21
|
end
|
|
22
22
|
end
|
|
23
23
|
|
|
@@ -25,7 +25,7 @@ module CompletionKit
|
|
|
25
25
|
if @tag.update(tag_params)
|
|
26
26
|
render json: @tag
|
|
27
27
|
else
|
|
28
|
-
|
|
28
|
+
render_validation_errors(@tag)
|
|
29
29
|
end
|
|
30
30
|
end
|
|
31
31
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
2
|
class MetricsController < ApplicationController
|
|
3
3
|
include CompletionKit::TagFiltering
|
|
4
|
-
before_action :set_metric, only: [:show, :edit, :update, :destroy, :
|
|
4
|
+
before_action :set_metric, only: [:show, :edit, :update, :destroy, :publish_draft, :suggest_variants, :dismiss_suggestion]
|
|
5
5
|
|
|
6
6
|
def index
|
|
7
7
|
@metrics = apply_tag_filter(Metric.includes(:metric_groups, :tags).order(:name))
|
|
@@ -35,11 +35,6 @@ module CompletionKit
|
|
|
35
35
|
end
|
|
36
36
|
|
|
37
37
|
def show
|
|
38
|
-
@published_metric_version = MetricVersion.ensure_current_for(@metric)
|
|
39
|
-
@disagreements = Calibration.where(metric_id: @metric.id, verdict: "disagree")
|
|
40
|
-
.includes(:metric_version, response: [:reviews, :run])
|
|
41
|
-
.order(created_at: :desc)
|
|
42
|
-
.limit(50)
|
|
43
38
|
@edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
|
|
44
39
|
@suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
|
|
45
40
|
@improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
|
|
@@ -54,6 +49,7 @@ module CompletionKit
|
|
|
54
49
|
@suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
|
|
55
50
|
@edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
|
|
56
51
|
@published_metric_version = MetricVersion.published.where(metric_id: @metric.id, current: true).first
|
|
52
|
+
@improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
|
|
57
53
|
|
|
58
54
|
if @edit_draft
|
|
59
55
|
@metric.instruction = @edit_draft.instruction
|
|
@@ -102,7 +98,7 @@ module CompletionKit
|
|
|
102
98
|
state: "draft", source: "edit", current: false
|
|
103
99
|
)
|
|
104
100
|
redirect_to edit_metric_path(@metric),
|
|
105
|
-
notice: "Saved as draft #{draft.version_label}. Publish to
|
|
101
|
+
notice: "Saved as draft #{draft.version_label}. Publish to make these changes the metric's live version."
|
|
106
102
|
else
|
|
107
103
|
@metric.update!(instruction: new_instruction, rubric_bands: new_rubric)
|
|
108
104
|
current_pub = MetricVersion.published.where(metric_id: @metric.id, current: true).first
|
|
@@ -120,7 +116,7 @@ module CompletionKit
|
|
|
120
116
|
target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
|
|
121
117
|
disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
|
|
122
118
|
if disagreement_count.zero?
|
|
123
|
-
redirect_to target, alert: "Mark at least one
|
|
119
|
+
redirect_to target, alert: "Mark at least one case as Disagree before asking the model to suggest a change."
|
|
124
120
|
return
|
|
125
121
|
end
|
|
126
122
|
|
|
@@ -132,15 +128,21 @@ module CompletionKit
|
|
|
132
128
|
redirect_to target, alert: "The model returned no usable variants. Try again with a different model."
|
|
133
129
|
return
|
|
134
130
|
end
|
|
135
|
-
generator.persist!(variants)
|
|
136
|
-
|
|
131
|
+
versions = generator.persist!(variants)
|
|
132
|
+
new_version = versions.max_by(&:version_number)
|
|
133
|
+
if params[:back_to] == "edit"
|
|
134
|
+
redirect_to edit_metric_path(@metric), notice: "Drafted #{new_version.version_label} from your reviews. Review the proposed changes below, then Publish to use it."
|
|
135
|
+
else
|
|
136
|
+
redirect_to metric_path(@metric, show_change: new_version.id), notice: "Drafted #{new_version.version_label} from your reviews."
|
|
137
|
+
end
|
|
137
138
|
end
|
|
138
139
|
|
|
139
140
|
def dismiss_suggestion
|
|
140
141
|
draft = MetricVersion.drafts.where(metric_id: @metric.id).find_by(id: params[:draft_id])
|
|
142
|
+
label = draft&.version_label
|
|
141
143
|
draft&.destroy
|
|
142
144
|
target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
|
|
143
|
-
redirect_to target, notice: "
|
|
145
|
+
redirect_to target, notice: label ? "Discarded draft #{label}." : "Draft already gone."
|
|
144
146
|
end
|
|
145
147
|
|
|
146
148
|
def publish_draft
|
|
@@ -164,7 +166,7 @@ module CompletionKit
|
|
|
164
166
|
audit = version.revert!
|
|
165
167
|
prior_label = previously_current.version_label
|
|
166
168
|
redirect_to metric_path(@metric),
|
|
167
|
-
notice: "Reverted
|
|
169
|
+
notice: "Reverted #{@metric.name} to #{version.version_label} (logged as #{audit.version_label}). Human reviews collected against #{prior_label} stay tied to it."
|
|
168
170
|
else
|
|
169
171
|
version.publish!
|
|
170
172
|
redirect_to metric_path(@metric),
|
|
@@ -172,31 +174,6 @@ module CompletionKit
|
|
|
172
174
|
end
|
|
173
175
|
end
|
|
174
176
|
|
|
175
|
-
def add_few_shot
|
|
176
|
-
calibration = Calibration.where(metric_id: @metric.id, verdict: "disagree").find(params[:calibration_id])
|
|
177
|
-
review = calibration.response.reviews.find_by(metric_id: @metric.id)
|
|
178
|
-
examples = Array(@metric.few_shot_examples)
|
|
179
|
-
examples << {
|
|
180
|
-
"input" => calibration.response.input_data.to_s.truncate(2000),
|
|
181
|
-
"response" => calibration.response.response_text.to_s.truncate(2000),
|
|
182
|
-
"judge_score" => review&.ai_score&.to_f,
|
|
183
|
-
"judge_feedback" => review&.ai_feedback.to_s.truncate(1000),
|
|
184
|
-
"human_score" => calibration.corrected_score&.to_f,
|
|
185
|
-
"human_note" => calibration.note.to_s.truncate(1000),
|
|
186
|
-
"calibration_id" => calibration.id,
|
|
187
|
-
"added_at" => Time.current.utc.iso8601
|
|
188
|
-
}
|
|
189
|
-
@metric.update!(few_shot_examples: examples)
|
|
190
|
-
redirect_to metric_path(@metric), notice: "Got it. The judge will remember this next time it grades."
|
|
191
|
-
end
|
|
192
|
-
|
|
193
|
-
def remove_few_shot
|
|
194
|
-
cal_id = params[:calibration_id].to_i
|
|
195
|
-
remaining = Array(@metric.few_shot_examples).reject { |fs| fs["calibration_id"].to_i == cal_id }
|
|
196
|
-
@metric.update!(few_shot_examples: remaining)
|
|
197
|
-
redirect_to metric_path(@metric), notice: "Forgotten. The judge won't see this case next time."
|
|
198
|
-
end
|
|
199
|
-
|
|
200
177
|
private
|
|
201
178
|
|
|
202
179
|
def set_metric
|
|
@@ -95,7 +95,7 @@ module CompletionKit
|
|
|
95
95
|
|
|
96
96
|
def regrade
|
|
97
97
|
if @run.regrade!
|
|
98
|
-
redirect_to run_path(@run), notice: "Re-grading existing responses
|
|
98
|
+
redirect_to run_path(@run), notice: "Re-grading existing responses against the current metrics."
|
|
99
99
|
else
|
|
100
100
|
redirect_to run_path(@run), alert: "Nothing to re-grade. The run has no succeeded responses or no metrics attached."
|
|
101
101
|
end
|
|
@@ -151,7 +151,7 @@ module CompletionKit
|
|
|
151
151
|
def retry_failures
|
|
152
152
|
if @run.stale_review_summary.any?
|
|
153
153
|
redirect_to run_path(@run),
|
|
154
|
-
alert: "
|
|
154
|
+
alert: "A metric has a newer version than the one this run was scored against. Retrying failed cases would mix scores from two versions in the same run. Use 'Re-run from scratch' to refresh everything against the current metrics."
|
|
155
155
|
return
|
|
156
156
|
end
|
|
157
157
|
|
|
@@ -80,8 +80,7 @@ module CompletionKit
|
|
|
80
80
|
end
|
|
81
81
|
|
|
82
82
|
def record_terminal_failure!(error)
|
|
83
|
-
|
|
84
|
-
response = Response.find_by(id: response_id)
|
|
83
|
+
response = Response.find_by(id: @response_id)
|
|
85
84
|
return unless response
|
|
86
85
|
|
|
87
86
|
response.update!(
|
|
@@ -98,8 +97,7 @@ module CompletionKit
|
|
|
98
97
|
end
|
|
99
98
|
|
|
100
99
|
def enqueue_completion_check
|
|
101
|
-
|
|
102
|
-
RunCompletionCheckJob.perform_later(run_id)
|
|
100
|
+
RunCompletionCheckJob.perform_later(@run_id)
|
|
103
101
|
end
|
|
104
102
|
end
|
|
105
103
|
end
|
|
@@ -58,7 +58,6 @@ module CompletionKit
|
|
|
58
58
|
run.prompt&.template,
|
|
59
59
|
criteria: metric.instruction.to_s,
|
|
60
60
|
rubric_text: metric.display_rubric_text,
|
|
61
|
-
human_examples: few_shot_payload(metric),
|
|
62
61
|
input_data: response.input_data
|
|
63
62
|
)
|
|
64
63
|
|
|
@@ -91,14 +90,12 @@ module CompletionKit
|
|
|
91
90
|
end
|
|
92
91
|
|
|
93
92
|
def record_terminal_failure!(error)
|
|
94
|
-
|
|
95
|
-
metric_id = @metric_id || arguments.last
|
|
96
|
-
response = Response.find_by(id: response_id)
|
|
93
|
+
response = Response.find_by(id: @response_id)
|
|
97
94
|
return unless response
|
|
98
95
|
|
|
99
|
-
review = response.reviews.find_or_initialize_by(metric_id: metric_id)
|
|
96
|
+
review = response.reviews.find_or_initialize_by(metric_id: @metric_id)
|
|
100
97
|
review.assign_attributes(
|
|
101
|
-
metric_name: review.metric_name || Metric.find_by(id: metric_id)&.name || "(deleted metric)",
|
|
98
|
+
metric_name: review.metric_name || Metric.find_by(id: @metric_id)&.name || "(deleted metric)",
|
|
102
99
|
status: "failed",
|
|
103
100
|
error_provider: provider_for(response),
|
|
104
101
|
error_class: error.class.name,
|
|
@@ -115,20 +112,8 @@ module CompletionKit
|
|
|
115
112
|
end
|
|
116
113
|
|
|
117
114
|
def enqueue_completion_check
|
|
118
|
-
|
|
119
|
-
response = Response.find_by(id: response_id)
|
|
115
|
+
response = Response.find_by(id: @response_id)
|
|
120
116
|
RunCompletionCheckJob.perform_later(response.run_id) if response
|
|
121
117
|
end
|
|
122
|
-
|
|
123
|
-
def few_shot_payload(metric)
|
|
124
|
-
return nil unless CompletionKit.config.judge_calibration_enabled
|
|
125
|
-
Array(metric.few_shot_examples).map do |fs|
|
|
126
|
-
{
|
|
127
|
-
human_score: fs["human_score"],
|
|
128
|
-
response_text: fs["response"].to_s,
|
|
129
|
-
human_note: fs["human_note"].to_s
|
|
130
|
-
}
|
|
131
|
-
end
|
|
132
|
-
end
|
|
133
118
|
end
|
|
134
119
|
end
|
|
@@ -17,7 +17,6 @@ module CompletionKit
|
|
|
17
17
|
has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
|
|
18
18
|
|
|
19
19
|
serialize :rubric_bands, coder: JSON
|
|
20
|
-
serialize :few_shot_examples, coder: JSON, type: Array
|
|
21
20
|
|
|
22
21
|
validates :name, presence: true
|
|
23
22
|
validates :key, tenant_scoped_uniqueness: { allow_nil: true }
|
|
@@ -40,6 +40,35 @@ module CompletionKit
|
|
|
40
40
|
"v#{version_number}"
|
|
41
41
|
end
|
|
42
42
|
|
|
43
|
+
def change_summary_against(previous)
|
|
44
|
+
return nil if previous.nil?
|
|
45
|
+
|
|
46
|
+
instruction_changed = previous.instruction.to_s.strip != instruction.to_s.strip
|
|
47
|
+
rubric_changes = rubric_band_change_count(previous)
|
|
48
|
+
return nil unless instruction_changed || rubric_changes.positive?
|
|
49
|
+
|
|
50
|
+
dimensions = []
|
|
51
|
+
dimensions << "instruction" if instruction_changed
|
|
52
|
+
dimensions << "rubric" if rubric_changes.positive?
|
|
53
|
+
|
|
54
|
+
words_changed = 0
|
|
55
|
+
if instruction_changed
|
|
56
|
+
old_words = previous.instruction.to_s.split
|
|
57
|
+
new_words = instruction.to_s.split
|
|
58
|
+
words_changed = (old_words - new_words).size + (new_words - old_words).size
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
magnitude = if rubric_changes >= 2 || (instruction_changed && rubric_changes >= 1) || words_changed >= 15
|
|
62
|
+
:major
|
|
63
|
+
elsif rubric_changes == 1 || words_changed >= 4
|
|
64
|
+
:minor
|
|
65
|
+
else
|
|
66
|
+
:trivial
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
{ magnitude: magnitude, label: "#{magnitude.to_s.capitalize} #{dimensions.to_sentence} changes" }
|
|
70
|
+
end
|
|
71
|
+
|
|
43
72
|
def publish!
|
|
44
73
|
MetricVersion.transaction do
|
|
45
74
|
self.class.where(metric_id: metric_id).where.not(id: id).update_all(current: false)
|
|
@@ -86,6 +115,12 @@ module CompletionKit
|
|
|
86
115
|
|
|
87
116
|
private
|
|
88
117
|
|
|
118
|
+
def rubric_band_change_count(previous)
|
|
119
|
+
prev = Metric.normalize_rubric_bands(previous.rubric_bands)
|
|
120
|
+
curr = Metric.normalize_rubric_bands(rubric_bands)
|
|
121
|
+
prev.zip(curr).count { |p, c| p["description"].to_s.strip != c["description"].to_s.strip }
|
|
122
|
+
end
|
|
123
|
+
|
|
89
124
|
def assign_version_number
|
|
90
125
|
return if version_number.present?
|
|
91
126
|
max = self.class.where(metric_id: metric_id).maximum(:version_number).to_i
|
|
@@ -10,12 +10,12 @@ module CompletionKit
|
|
|
10
10
|
@judge_client = LlmClient.for_model(@judge_model, ApiConfig.for_model(@judge_model))
|
|
11
11
|
end
|
|
12
12
|
|
|
13
|
-
def evaluate(output, expected_output = nil, prompt = nil, criteria: nil, rubric_text: nil,
|
|
13
|
+
def evaluate(output, expected_output = nil, prompt = nil, criteria: nil, rubric_text: nil, input_data: nil, **_extras)
|
|
14
14
|
raise CompletionKit::ConfigurationError, "Judge not configured" unless @judge_client.configured?
|
|
15
15
|
|
|
16
16
|
judge_prompt = build_judge_prompt(output, expected_output, prompt,
|
|
17
17
|
criteria: criteria,
|
|
18
|
-
rubric_text: rubric_text,
|
|
18
|
+
rubric_text: rubric_text,
|
|
19
19
|
input_data: input_data)
|
|
20
20
|
|
|
21
21
|
response = @judge_client.generate_completion(judge_prompt, model: @judge_model)
|
|
@@ -25,7 +25,7 @@ module CompletionKit
|
|
|
25
25
|
|
|
26
26
|
private
|
|
27
27
|
|
|
28
|
-
def build_judge_prompt(output, expected_output, prompt, criteria: nil, rubric_text: nil,
|
|
28
|
+
def build_judge_prompt(output, expected_output, prompt, criteria: nil, rubric_text: nil, input_data: nil)
|
|
29
29
|
judge_prompt = <<~PROMPT
|
|
30
30
|
You are an expert evaluator. You MUST respond with ONLY two lines in this exact format, nothing else:
|
|
31
31
|
|
|
@@ -42,13 +42,6 @@ module CompletionKit
|
|
|
42
42
|
judge_prompt += "\nCriteria: #{criteria}\n"
|
|
43
43
|
end
|
|
44
44
|
|
|
45
|
-
if human_examples.present?
|
|
46
|
-
judge_prompt += "\nCalibration examples:\n"
|
|
47
|
-
human_examples.each_with_index do |example, index|
|
|
48
|
-
judge_prompt += "Example #{index + 1}: score=#{example[:human_score]} output=#{example[:response_text].to_s.truncate(200)}\n"
|
|
49
|
-
end
|
|
50
|
-
end
|
|
51
|
-
|
|
52
45
|
judge_prompt += <<~PROMPT
|
|
53
46
|
|
|
54
47
|
Original prompt: #{prompt || "Not provided"}
|
|
@@ -16,7 +16,7 @@ module CompletionKit
|
|
|
16
16
|
handler: :list
|
|
17
17
|
},
|
|
18
18
|
"metric_versions_publish" => {
|
|
19
|
-
description: "Publish a MetricVersion as the live
|
|
19
|
+
description: "Publish a MetricVersion as the live version of its metric. Works for both 'draft → published' and 'revert to an older published version → current'. Transactionally flips current, demotes peers, and writes the version's instruction + rubric_bands back onto the metric so the judge grades against it.",
|
|
20
20
|
inputSchema: {
|
|
21
21
|
type: "object",
|
|
22
22
|
properties: {
|
|
@@ -43,7 +43,6 @@ module CompletionKit
|
|
|
43
43
|
def build_meta_prompt
|
|
44
44
|
disagreements = MetricCalibrationExamples.disagreements_for(@metric)
|
|
45
45
|
borderlines = MetricCalibrationExamples.borderlines_for(@metric)
|
|
46
|
-
pinned_examples = Array(@metric.few_shot_examples)
|
|
47
46
|
sections = []
|
|
48
47
|
sections << "You are an expert evaluator. The judge below is misaligned with humans. Propose #{@count == 1 ? "a single" : "#{@count}"} concrete rewrite that closes the gap."
|
|
49
48
|
sections << ""
|
|
@@ -78,18 +77,6 @@ module CompletionKit
|
|
|
78
77
|
sections << ""
|
|
79
78
|
end
|
|
80
79
|
end
|
|
81
|
-
if pinned_examples.any?
|
|
82
|
-
sections << "## Pinned cases the judge already references"
|
|
83
|
-
sections << "These are cases the operator pinned for the judge to remember. The improved rubric must remain consistent with these — that is, the new instruction + rubric should produce roughly the human_score on these inputs, not the judge_score."
|
|
84
|
-
pinned_examples.each_with_index do |ex, i|
|
|
85
|
-
sections << "### Pinned #{i + 1}"
|
|
86
|
-
sections << "Input: #{ex["input"].to_s.truncate(200)}"
|
|
87
|
-
sections << "Output: #{ex["response"].to_s.truncate(200)}"
|
|
88
|
-
sections << "Judge previously said #{ex["judge_score"]}/5: #{ex["judge_feedback"].to_s.truncate(160)}"
|
|
89
|
-
sections << "Human said #{ex["human_score"]}/5: #{ex["human_note"].to_s.truncate(160)}"
|
|
90
|
-
sections << ""
|
|
91
|
-
end
|
|
92
|
-
end
|
|
93
80
|
sections << "## Task"
|
|
94
81
|
sections << "Make one substantive change. Don't just reword. If the disagreements look like instruction problems, rewrite the instruction. If they look like rubric problems (overlapping bands, undefined edge cases), rewrite the rubric. Rewrite both if both are wrong."
|
|
95
82
|
sections << ""
|
|
@@ -27,7 +27,7 @@
|
|
|
27
27
|
<label for="ck-tab-runs" class="ck-api-tabs__label">Runs <span class="ck-api-tabs__count">10</span></label>
|
|
28
28
|
<label for="ck-tab-responses" class="ck-api-tabs__label">Responses <span class="ck-api-tabs__count">2</span></label>
|
|
29
29
|
<label for="ck-tab-datasets" class="ck-api-tabs__label">Datasets <span class="ck-api-tabs__count">5</span></label>
|
|
30
|
-
<label for="ck-tab-metrics" class="ck-api-tabs__label">Metrics <span class="ck-api-tabs__count">
|
|
30
|
+
<label for="ck-tab-metrics" class="ck-api-tabs__label">Metrics <span class="ck-api-tabs__count">10</span></label>
|
|
31
31
|
<label for="ck-tab-metric-groups" class="ck-api-tabs__label">Metric Groups <span class="ck-api-tabs__count">5</span></label>
|
|
32
32
|
<label for="ck-tab-calibrations" class="ck-api-tabs__label">Calibrations <span class="ck-api-tabs__count">3</span></label>
|
|
33
33
|
<label for="ck-tab-tags" class="ck-api-tabs__label">Tags <span class="ck-api-tabs__count">5</span></label>
|
|
@@ -239,7 +239,7 @@
|
|
|
239
239
|
|
|
240
240
|
<div class="ck-api-endpoint" style="padding-top: 1.5rem;">
|
|
241
241
|
<p class="ck-kicker" style="margin-bottom: 0.5rem;">Calibration loop</p>
|
|
242
|
-
<p class="ck-meta-copy">Drive metric improvement from disagree-flagged calibrations: ask the model
|
|
242
|
+
<p class="ck-meta-copy">Drive metric improvement from disagree-flagged calibrations: ask the model to rewrite the instruction and rubric into a new draft version.</p>
|
|
243
243
|
</div>
|
|
244
244
|
<div class="ck-api-endpoint">
|
|
245
245
|
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/metrics/:id/suggest_variants</p>
|
|
@@ -247,16 +247,6 @@
|
|
|
247
247
|
<p class="ck-api-params"><strong>Optional:</strong> <code>count</code>, <code>model</code></p>
|
|
248
248
|
<%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/metrics/1/suggest_variants \\\n -H \"Authorization: Bearer #{token}\"" %>
|
|
249
249
|
</div>
|
|
250
|
-
<div class="ck-api-endpoint">
|
|
251
|
-
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/metrics/:id/add_few_shot</p>
|
|
252
|
-
<p class="ck-meta-copy">Pin a disagree calibration as a few-shot example on the metric. Returns the updated metric, or 404 if the calibration is not a disagree on this metric.</p>
|
|
253
|
-
<p class="ck-api-params"><strong>Required:</strong> <code>calibration_id</code></p>
|
|
254
|
-
</div>
|
|
255
|
-
<div class="ck-api-endpoint">
|
|
256
|
-
<p class="ck-api-method"><span class="ck-chip" style="color: var(--ck-danger);">DELETE</span> /api/v1/metrics/:id/remove_few_shot</p>
|
|
257
|
-
<p class="ck-meta-copy">Drop the pinned few-shot example by calibration ID.</p>
|
|
258
|
-
<p class="ck-api-params"><strong>Required:</strong> <code>calibration_id</code></p>
|
|
259
|
-
</div>
|
|
260
250
|
|
|
261
251
|
<div class="ck-api-endpoint" style="padding-top: 1.5rem;">
|
|
262
252
|
<p class="ck-kicker" style="margin-bottom: 0.5rem;">Metric versions</p>
|
|
@@ -20,6 +20,10 @@
|
|
|
20
20
|
<p class="ck-kicker">Tag filtering</p>
|
|
21
21
|
<p class="ck-meta-copy">Prompts, runs, metrics, datasets, and metric groups accept <code>?tag[]=name</code> (repeat for OR semantics).</p>
|
|
22
22
|
</div>
|
|
23
|
+
<div>
|
|
24
|
+
<p class="ck-kicker">Error shape</p>
|
|
25
|
+
<p class="ck-meta-copy">Every error response carries a top-level <code>error</code> string. Validation failures (422) add a <code>details</code> object keyed by field: <code>{ "error": "Validation failed", "details": { "name": ["can't be blank"] } }</code>.</p>
|
|
26
|
+
</div>
|
|
23
27
|
</div>
|
|
24
28
|
</div>
|
|
25
29
|
</div>
|