completion-kit 0.5.43 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/stylesheets/completion_kit/application.css +38 -0
- data/app/controllers/completion_kit/api/v1/calibrations_controller.rb +1 -1
- data/app/controllers/completion_kit/api/v1/runs_controller.rb +4 -0
- data/app/controllers/completion_kit/calibrations_controller.rb +1 -1
- data/app/controllers/completion_kit/metrics_controller.rb +76 -20
- data/app/controllers/completion_kit/runs_controller.rb +69 -1
- data/app/jobs/completion_kit/judge_review_job.rb +3 -0
- data/app/models/completion_kit/calibration.rb +2 -2
- data/app/models/completion_kit/metric.rb +0 -17
- data/app/models/completion_kit/{judge_version.rb → metric_version.rb} +3 -2
- data/app/models/completion_kit/review.rb +9 -0
- data/app/models/completion_kit/run.rb +60 -0
- data/app/services/completion_kit/mcp_tools/calibrations.rb +1 -1
- data/app/services/completion_kit/mcp_tools/judges.rb +13 -13
- data/app/services/completion_kit/metric_calibration_stats.rb +9 -9
- data/app/services/completion_kit/{judge_variant_generator.rb → metric_variant_generator.rb} +27 -13
- data/app/views/completion_kit/calibrations/_trust_panel.html.erb +12 -2
- data/app/views/completion_kit/metrics/_form.html.erb +3 -3
- data/app/views/completion_kit/metrics/edit.html.erb +1 -1
- data/app/views/completion_kit/metrics/show.html.erb +12 -14
- data/app/views/completion_kit/responses/show.html.erb +9 -1
- data/app/views/completion_kit/runs/_actions.html.erb +1 -0
- data/app/views/completion_kit/runs/compare.html.erb +85 -0
- data/app/views/completion_kit/runs/compare_picker.html.erb +39 -0
- data/app/views/completion_kit/runs/show.html.erb +29 -0
- data/config/routes.rb +2 -0
- data/db/migrate/20260528000001_rename_judge_version_to_metric_version.rb +22 -0
- data/db/migrate/20260528000002_add_metric_version_to_reviews.rb +21 -0
- data/lib/completion_kit/version.rb +1 -1
- metadata +7 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: d8454bbb11d5064ca0c6d4355c780425a28198280dffe7dd424d266fbeef6a09
|
|
4
|
+
data.tar.gz: 24c1da76e1e9118d5e2a732e8e45b684f588f553ad4d8bac89e239bc22c953c3
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 6fbc5b8047a20240897e19c389bb3f6104d3e2a219794d190183b5433e14d524bb692eb0a27b36ab6471e596c2b9b8af2d70a4f56ae81aa327726fe92f092eb9
|
|
7
|
+
data.tar.gz: a3399003a48836fd457a8c8b488305fad6d006596c6f940a82b232e2a731dfbc3df5ded4ba8bc16b94690a88d56baaaff6edc6801f47f2bbf422ca8fb74270df
|
|
@@ -2816,6 +2816,44 @@ select.ck-input {
|
|
|
2816
2816
|
line-height: 1.55;
|
|
2817
2817
|
}
|
|
2818
2818
|
|
|
2819
|
+
.ck-review-card--stale {
|
|
2820
|
+
border-left: 2px solid rgba(224, 164, 88, 0.45);
|
|
2821
|
+
}
|
|
2822
|
+
|
|
2823
|
+
.ck-stale-versions-banner {
|
|
2824
|
+
margin: 0 0 1rem;
|
|
2825
|
+
padding: 0.9rem 1rem;
|
|
2826
|
+
border: 1px solid rgba(224, 164, 88, 0.4);
|
|
2827
|
+
background: rgba(224, 164, 88, 0.06);
|
|
2828
|
+
border-radius: var(--ck-radius);
|
|
2829
|
+
display: flex;
|
|
2830
|
+
align-items: center;
|
|
2831
|
+
justify-content: space-between;
|
|
2832
|
+
gap: 1rem;
|
|
2833
|
+
flex-wrap: wrap;
|
|
2834
|
+
}
|
|
2835
|
+
.ck-stale-versions-banner__body { min-width: 0; flex: 1 1 320px; }
|
|
2836
|
+
.ck-stale-versions-banner .ck-kicker { color: var(--ck-warning); }
|
|
2837
|
+
|
|
2838
|
+
.ck-delta {
|
|
2839
|
+
font-family: var(--ck-mono);
|
|
2840
|
+
font-size: 0.78rem;
|
|
2841
|
+
letter-spacing: 0.04em;
|
|
2842
|
+
padding: 2px 6px;
|
|
2843
|
+
border-radius: 4px;
|
|
2844
|
+
}
|
|
2845
|
+
.ck-delta--positive { color: var(--ck-success); background: var(--ck-success-soft); }
|
|
2846
|
+
.ck-delta--negative { color: var(--ck-danger); background: var(--ck-danger-soft); }
|
|
2847
|
+
.ck-delta--zero { color: var(--ck-dim); }
|
|
2848
|
+
|
|
2849
|
+
.ck-run-compare-table td { vertical-align: middle; }
|
|
2850
|
+
.ck-review-card__stale-note {
|
|
2851
|
+
margin: 0.4rem 0 0;
|
|
2852
|
+
font-family: var(--ck-mono);
|
|
2853
|
+
font-size: 0.78rem;
|
|
2854
|
+
color: var(--ck-warning);
|
|
2855
|
+
}
|
|
2856
|
+
|
|
2819
2857
|
@media (max-width: 900px) {
|
|
2820
2858
|
.ck-grid--sidebar,
|
|
2821
2859
|
.ck-grid--cards,
|
|
@@ -45,6 +45,10 @@ module CompletionKit
|
|
|
45
45
|
end
|
|
46
46
|
|
|
47
47
|
def retry_failures
|
|
48
|
+
if @run.stale_review_summary.any?
|
|
49
|
+
return render(json: { error: "Judge has changed since this run executed. Retry would mix versions in the same run; use POST /api/v1/runs/:id/rerun instead." }, status: :conflict)
|
|
50
|
+
end
|
|
51
|
+
|
|
48
52
|
scope = @run.responses.where(status: "failed")
|
|
49
53
|
scope = scope.where(id: params[:only]) if params[:only].present?
|
|
50
54
|
|
|
@@ -18,7 +18,7 @@ module CompletionKit
|
|
|
18
18
|
run: @run, response: @response, metric: @metric, created_by: created_by
|
|
19
19
|
)
|
|
20
20
|
calibration.assign_attributes(
|
|
21
|
-
|
|
21
|
+
metric_version: MetricVersion.ensure_current_for(@metric),
|
|
22
22
|
verdict: params[:verdict],
|
|
23
23
|
corrected_score: params[:corrected_score].presence,
|
|
24
24
|
note: params[:note].presence
|
|
@@ -35,16 +35,15 @@ module CompletionKit
|
|
|
35
35
|
end
|
|
36
36
|
|
|
37
37
|
def show
|
|
38
|
-
@
|
|
38
|
+
@published_metric_version = MetricVersion.ensure_current_for(@metric)
|
|
39
39
|
@disagreements = Calibration.where(metric_id: @metric.id, verdict: "disagree")
|
|
40
|
-
.includes(:
|
|
40
|
+
.includes(:metric_version, response: [:reviews, :run])
|
|
41
41
|
.order(created_at: :desc)
|
|
42
42
|
.limit(50)
|
|
43
|
-
@edit_draft =
|
|
44
|
-
@suggestion_draft =
|
|
45
|
-
@improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree"
|
|
46
|
-
|
|
47
|
-
@versions = JudgeVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
|
|
43
|
+
@edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
|
|
44
|
+
@suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
|
|
45
|
+
@improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
|
|
46
|
+
@versions = MetricVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
|
|
48
47
|
end
|
|
49
48
|
|
|
50
49
|
def new
|
|
@@ -52,9 +51,14 @@ module CompletionKit
|
|
|
52
51
|
end
|
|
53
52
|
|
|
54
53
|
def edit
|
|
55
|
-
@suggestion_draft =
|
|
56
|
-
@edit_draft =
|
|
57
|
-
@
|
|
54
|
+
@suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
|
|
55
|
+
@edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
|
|
56
|
+
@published_metric_version = MetricVersion.published.where(metric_id: @metric.id, current: true).first
|
|
57
|
+
|
|
58
|
+
if @edit_draft
|
|
59
|
+
@metric.instruction = @edit_draft.instruction
|
|
60
|
+
@metric.rubric_bands = @edit_draft.rubric_bands
|
|
61
|
+
end
|
|
58
62
|
end
|
|
59
63
|
|
|
60
64
|
def create
|
|
@@ -68,10 +72,42 @@ module CompletionKit
|
|
|
68
72
|
end
|
|
69
73
|
|
|
70
74
|
def update
|
|
71
|
-
|
|
72
|
-
|
|
75
|
+
judge_keys = %i[instruction rubric_bands]
|
|
76
|
+
meta_attrs = metric_params.except(*judge_keys)
|
|
77
|
+
proposed_instruction = metric_params[:instruction]
|
|
78
|
+
proposed_rubric = metric_params[:rubric_bands]
|
|
79
|
+
|
|
80
|
+
unless @metric.update(meta_attrs)
|
|
81
|
+
return render(:edit, status: :unprocessable_entity)
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
current_instruction = @metric.instruction.to_s
|
|
85
|
+
current_rubric = @metric.rubric_bands || []
|
|
86
|
+
normalized_proposed_rubric = normalize_rubric_bands_for_update(proposed_rubric)
|
|
87
|
+
|
|
88
|
+
instruction_changed = !proposed_instruction.nil? && proposed_instruction.to_s != current_instruction
|
|
89
|
+
rubric_changed = !normalized_proposed_rubric.nil? && normalized_proposed_rubric != current_rubric
|
|
90
|
+
|
|
91
|
+
unless instruction_changed || rubric_changed
|
|
92
|
+
return redirect_to(metric_path(@metric), notice: "Metric was successfully updated.")
|
|
93
|
+
end
|
|
94
|
+
|
|
95
|
+
new_instruction = instruction_changed ? proposed_instruction.to_s : current_instruction
|
|
96
|
+
new_rubric = rubric_changed ? normalized_proposed_rubric : current_rubric
|
|
97
|
+
|
|
98
|
+
if @metric.reviews.exists?
|
|
99
|
+
MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").destroy_all
|
|
100
|
+
draft = MetricVersion.create!(
|
|
101
|
+
metric: @metric, instruction: new_instruction, rubric_bands: new_rubric,
|
|
102
|
+
state: "draft", source: "edit", current: false
|
|
103
|
+
)
|
|
104
|
+
redirect_to edit_metric_path(@metric),
|
|
105
|
+
notice: "Saved as draft #{draft.version_label}. Publish to push these changes to the live judge."
|
|
73
106
|
else
|
|
74
|
-
|
|
107
|
+
@metric.update!(instruction: new_instruction, rubric_bands: new_rubric)
|
|
108
|
+
current_pub = MetricVersion.published.where(metric_id: @metric.id, current: true).first
|
|
109
|
+
current_pub&.update!(instruction: @metric.instruction, rubric_bands: @metric.rubric_bands)
|
|
110
|
+
redirect_to metric_path(@metric), notice: "Metric was successfully updated."
|
|
75
111
|
end
|
|
76
112
|
end
|
|
77
113
|
|
|
@@ -88,9 +124,9 @@ module CompletionKit
|
|
|
88
124
|
return
|
|
89
125
|
end
|
|
90
126
|
|
|
91
|
-
|
|
127
|
+
MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
|
|
92
128
|
|
|
93
|
-
generator =
|
|
129
|
+
generator = MetricVariantGenerator.new(@metric, count: 1)
|
|
94
130
|
variants = generator.call
|
|
95
131
|
if variants.empty?
|
|
96
132
|
redirect_to target, alert: "The model returned no usable variants. Try again with a different model."
|
|
@@ -101,18 +137,18 @@ module CompletionKit
|
|
|
101
137
|
end
|
|
102
138
|
|
|
103
139
|
def dismiss_suggestion
|
|
104
|
-
draft =
|
|
140
|
+
draft = MetricVersion.drafts.where(metric_id: @metric.id).find_by(id: params[:draft_id])
|
|
105
141
|
draft&.destroy
|
|
106
142
|
target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
|
|
107
143
|
redirect_to target, notice: "Dismissed."
|
|
108
144
|
end
|
|
109
145
|
|
|
110
146
|
def publish_draft
|
|
111
|
-
scope =
|
|
147
|
+
scope = MetricVersion.where(metric_id: @metric.id)
|
|
112
148
|
version = if params[:draft_id].present?
|
|
113
149
|
scope.find_by(id: params[:draft_id])
|
|
114
150
|
else
|
|
115
|
-
|
|
151
|
+
MetricVersion.drafts.where(metric_id: @metric.id).order(created_at: :desc).first
|
|
116
152
|
end
|
|
117
153
|
|
|
118
154
|
if version.nil?
|
|
@@ -120,9 +156,20 @@ module CompletionKit
|
|
|
120
156
|
return
|
|
121
157
|
end
|
|
122
158
|
|
|
159
|
+
was_published_already = version.published?
|
|
160
|
+
reverting = was_published_already && !version.current?
|
|
161
|
+
previously_current = MetricVersion.current.find_by(metric_id: @metric.id)
|
|
162
|
+
|
|
123
163
|
version.publish!
|
|
124
|
-
|
|
125
|
-
|
|
164
|
+
|
|
165
|
+
if reverting
|
|
166
|
+
prior_label = previously_current.version_label
|
|
167
|
+
redirect_to metric_path(@metric),
|
|
168
|
+
notice: "Reverted to #{@metric.name} #{version.version_label}. Pinned cases still flow to the judge, and calibration verdicts collected against #{prior_label} stay tied to it."
|
|
169
|
+
else
|
|
170
|
+
redirect_to metric_path(@metric),
|
|
171
|
+
notice: "#{@metric.name} #{version.version_label} is now the published version."
|
|
172
|
+
end
|
|
126
173
|
end
|
|
127
174
|
|
|
128
175
|
def add_few_shot
|
|
@@ -160,5 +207,14 @@ module CompletionKit
|
|
|
160
207
|
params.require(:metric).permit(:name, :instruction,
|
|
161
208
|
rubric_bands: [:stars, :description], tag_names: [])
|
|
162
209
|
end
|
|
210
|
+
|
|
211
|
+
def normalize_rubric_bands_for_update(bands)
|
|
212
|
+
return nil if bands.nil?
|
|
213
|
+
array = bands.is_a?(ActionController::Parameters) ? bands.to_unsafe_h.values : bands
|
|
214
|
+
Array(array).map do |b|
|
|
215
|
+
h = b.respond_to?(:to_unsafe_h) ? b.to_unsafe_h : b
|
|
216
|
+
{ "stars" => h["stars"].to_i, "description" => h["description"].to_s }
|
|
217
|
+
end.sort_by { |b| -b["stars"] }
|
|
218
|
+
end
|
|
163
219
|
end
|
|
164
220
|
end
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
2
|
class RunsController < ApplicationController
|
|
3
3
|
include CompletionKit::TagFiltering
|
|
4
|
-
before_action :set_run, only: [:show, :edit, :update, :destroy, :generate, :suggest, :retry_failures, :rerun, :refresh_status]
|
|
4
|
+
before_action :set_run, only: [:show, :edit, :update, :destroy, :generate, :suggest, :retry_failures, :rerun, :regrade, :refresh_status, :compare]
|
|
5
5
|
before_action :load_form_collections, only: [:new, :edit, :create, :update]
|
|
6
6
|
|
|
7
7
|
def index
|
|
@@ -78,6 +78,29 @@ module CompletionKit
|
|
|
78
78
|
end
|
|
79
79
|
end
|
|
80
80
|
|
|
81
|
+
def compare
|
|
82
|
+
other_id = params[:with]
|
|
83
|
+
if other_id.blank?
|
|
84
|
+
@other_runs = Run.where(dataset_id: @run.dataset_id, prompt_id: @run.prompt_id)
|
|
85
|
+
.where.not(id: @run.id)
|
|
86
|
+
.order(created_at: :desc)
|
|
87
|
+
.limit(50)
|
|
88
|
+
return render(:compare_picker)
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
@other_run = Run.find(other_id)
|
|
92
|
+
@comparison = build_run_comparison(@run, @other_run)
|
|
93
|
+
render(:compare)
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
def regrade
|
|
97
|
+
if @run.regrade!
|
|
98
|
+
redirect_to run_path(@run), notice: "Re-grading existing responses with the current judge."
|
|
99
|
+
else
|
|
100
|
+
redirect_to run_path(@run), alert: "Nothing to re-grade. The run has no succeeded responses or no metrics attached."
|
|
101
|
+
end
|
|
102
|
+
end
|
|
103
|
+
|
|
81
104
|
def rerun
|
|
82
105
|
new_run = Run.create!(
|
|
83
106
|
prompt_id: @run.prompt_id,
|
|
@@ -126,6 +149,12 @@ module CompletionKit
|
|
|
126
149
|
end
|
|
127
150
|
|
|
128
151
|
def retry_failures
|
|
152
|
+
if @run.stale_review_summary.any?
|
|
153
|
+
redirect_to run_path(@run),
|
|
154
|
+
alert: "The judge has changed since this run executed. Retrying failed cases would mix scores from two metric versions in the same run. Use 'Re-run with current judge' to refresh everything against the live judge."
|
|
155
|
+
return
|
|
156
|
+
end
|
|
157
|
+
|
|
129
158
|
scope = @run.responses.where(status: "failed")
|
|
130
159
|
scope = scope.where(id: params[:only]) if params[:only].present?
|
|
131
160
|
|
|
@@ -157,6 +186,45 @@ module CompletionKit
|
|
|
157
186
|
@run = Run.find(params[:id])
|
|
158
187
|
end
|
|
159
188
|
|
|
189
|
+
def build_run_comparison(left, right)
|
|
190
|
+
left_responses = left.responses.includes(:reviews).order(:row_index, :id)
|
|
191
|
+
right_responses = right.responses.includes(:reviews).order(:row_index, :id)
|
|
192
|
+
right_by_input = right_responses.each_with_object({}) { |r, h| h[r.input_data.to_s] ||= r }
|
|
193
|
+
|
|
194
|
+
all_reviews = left_responses.flat_map(&:reviews) + right_responses.flat_map(&:reviews)
|
|
195
|
+
metric_ids = all_reviews.map(&:metric_id).compact.uniq
|
|
196
|
+
metric_versions = MetricVersion.where(id: all_reviews.map(&:metric_version_id).compact.uniq).index_by(&:id)
|
|
197
|
+
|
|
198
|
+
rows = left_responses.map do |lr|
|
|
199
|
+
rr = right_by_input[lr.input_data.to_s]
|
|
200
|
+
{
|
|
201
|
+
left_response: lr,
|
|
202
|
+
right_response: rr,
|
|
203
|
+
per_metric: metric_ids.map do |mid|
|
|
204
|
+
l_review = lr.reviews.find { |r| r.metric_id == mid }
|
|
205
|
+
r_review = rr && rr.reviews.find { |r| r.metric_id == mid }
|
|
206
|
+
next nil if l_review.nil? && r_review.nil?
|
|
207
|
+
anchor = l_review || r_review
|
|
208
|
+
{
|
|
209
|
+
metric_id: mid,
|
|
210
|
+
metric_name: anchor.metric_name,
|
|
211
|
+
left_score: l_review ? l_review.ai_score : nil,
|
|
212
|
+
right_score: r_review ? r_review.ai_score : nil,
|
|
213
|
+
left_version_label: version_label_for(l_review, metric_versions),
|
|
214
|
+
right_version_label: version_label_for(r_review, metric_versions),
|
|
215
|
+
delta: (l_review&.ai_score && r_review&.ai_score) ? (r_review.ai_score.to_f - l_review.ai_score.to_f).round(2) : nil
|
|
216
|
+
}
|
|
217
|
+
end.compact
|
|
218
|
+
}
|
|
219
|
+
end
|
|
220
|
+
{ rows: rows, metric_ids: metric_ids }
|
|
221
|
+
end
|
|
222
|
+
|
|
223
|
+
def version_label_for(review, metric_versions)
|
|
224
|
+
return nil if review.nil? || review.metric_version_id.nil?
|
|
225
|
+
metric_versions[review.metric_version_id]&.version_label
|
|
226
|
+
end
|
|
227
|
+
|
|
160
228
|
def load_form_collections
|
|
161
229
|
@prompts = Prompt.order(:name)
|
|
162
230
|
@datasets = Dataset.order(:name)
|
|
@@ -62,9 +62,11 @@ module CompletionKit
|
|
|
62
62
|
)
|
|
63
63
|
|
|
64
64
|
review = response.reviews.find_or_initialize_by(metric_id: metric.id)
|
|
65
|
+
current_metric_version = MetricVersion.ensure_current_for(metric)
|
|
65
66
|
review.assign_attributes(
|
|
66
67
|
metric_name: metric.name,
|
|
67
68
|
instruction: metric.instruction.to_s,
|
|
69
|
+
metric_version_id: current_metric_version.id,
|
|
68
70
|
status: "succeeded",
|
|
69
71
|
ai_score: evaluation[:score],
|
|
70
72
|
ai_feedback: evaluation[:feedback],
|
|
@@ -122,6 +124,7 @@ module CompletionKit
|
|
|
122
124
|
end
|
|
123
125
|
|
|
124
126
|
def few_shot_payload(metric)
|
|
127
|
+
return nil unless CompletionKit.config.judge_calibration_enabled
|
|
125
128
|
Array(metric.few_shot_examples).map do |fs|
|
|
126
129
|
{
|
|
127
130
|
human_score: fs["human_score"],
|
|
@@ -5,7 +5,7 @@ module CompletionKit
|
|
|
5
5
|
belongs_to :run
|
|
6
6
|
belongs_to :response
|
|
7
7
|
belongs_to :metric
|
|
8
|
-
belongs_to :
|
|
8
|
+
belongs_to :metric_version
|
|
9
9
|
|
|
10
10
|
validates :verdict, presence: true, inclusion: { in: VERDICTS }
|
|
11
11
|
validates :response_id,
|
|
@@ -22,7 +22,7 @@ module CompletionKit
|
|
|
22
22
|
run_id: run_id,
|
|
23
23
|
response_id: response_id,
|
|
24
24
|
metric_id: metric_id,
|
|
25
|
-
|
|
25
|
+
metric_version_id: metric_version_id,
|
|
26
26
|
verdict: verdict,
|
|
27
27
|
corrected_score: corrected_score,
|
|
28
28
|
note: note,
|
|
@@ -24,7 +24,6 @@ module CompletionKit
|
|
|
24
24
|
before_validation :generate_key
|
|
25
25
|
before_validation :normalize_rubric_bands
|
|
26
26
|
before_validation :set_defaults
|
|
27
|
-
after_update :fork_draft_judge_version, if: :judge_relevant_changes?
|
|
28
27
|
|
|
29
28
|
def self.default_rubric_bands
|
|
30
29
|
DEFAULT_RUBRIC_BANDS.map(&:dup)
|
|
@@ -98,21 +97,5 @@ module CompletionKit
|
|
|
98
97
|
self.rubric_bands = self.class.normalize_rubric_bands(rubric_bands) if rubric_bands.present?
|
|
99
98
|
end
|
|
100
99
|
|
|
101
|
-
def judge_relevant_changes?
|
|
102
|
-
saved_change_to_instruction? || saved_change_to_rubric_bands?
|
|
103
|
-
end
|
|
104
|
-
|
|
105
|
-
def fork_draft_judge_version
|
|
106
|
-
JudgeVersion.ensure_current_for(self)
|
|
107
|
-
JudgeVersion.where(metric_id: id, state: "draft").update_all(current: false)
|
|
108
|
-
JudgeVersion.create!(
|
|
109
|
-
metric: self,
|
|
110
|
-
instruction: instruction,
|
|
111
|
-
rubric_bands: rubric_bands,
|
|
112
|
-
current: false,
|
|
113
|
-
state: "draft",
|
|
114
|
-
source: "edit"
|
|
115
|
-
)
|
|
116
|
-
end
|
|
117
100
|
end
|
|
118
101
|
end
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
|
-
class
|
|
2
|
+
class MetricVersion < ApplicationRecord
|
|
3
3
|
STATES = %w[draft published].freeze
|
|
4
4
|
|
|
5
5
|
belongs_to :metric
|
|
@@ -41,7 +41,7 @@ module CompletionKit
|
|
|
41
41
|
end
|
|
42
42
|
|
|
43
43
|
def publish!
|
|
44
|
-
|
|
44
|
+
MetricVersion.transaction do
|
|
45
45
|
self.class.where(metric_id: metric_id).where.not(id: id).update_all(current: false)
|
|
46
46
|
reload
|
|
47
47
|
update!(state: "published", current: true, published_at: published_at || Time.current)
|
|
@@ -76,4 +76,5 @@ module CompletionKit
|
|
|
76
76
|
self.version_number = max + 1
|
|
77
77
|
end
|
|
78
78
|
end
|
|
79
|
+
|
|
79
80
|
end
|
|
@@ -5,8 +5,16 @@ module CompletionKit
|
|
|
5
5
|
|
|
6
6
|
belongs_to :response
|
|
7
7
|
belongs_to :metric, optional: true
|
|
8
|
+
belongs_to :metric_version, optional: true
|
|
8
9
|
has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
|
|
9
10
|
|
|
11
|
+
def stale_against_current_judge?
|
|
12
|
+
return false unless metric_id && metric_version_id
|
|
13
|
+
current_id = MetricVersion.current.where(metric_id: metric_id).limit(1).pick(:id)
|
|
14
|
+
return false if current_id.nil?
|
|
15
|
+
metric_version_id != current_id
|
|
16
|
+
end
|
|
17
|
+
|
|
10
18
|
validates :metric_name, presence: true
|
|
11
19
|
validates :status, inclusion: { in: STATUSES }
|
|
12
20
|
validates :ai_score, numericality: { greater_than_or_equal_to: 1, less_than_or_equal_to: 5 }, allow_nil: true
|
|
@@ -29,6 +37,7 @@ module CompletionKit
|
|
|
29
37
|
def as_json(options = {})
|
|
30
38
|
{
|
|
31
39
|
id: id, response_id: response_id, metric_id: metric_id,
|
|
40
|
+
metric_version_id: metric_version_id,
|
|
32
41
|
metric_name: metric_name, ai_score: ai_score,
|
|
33
42
|
ai_feedback: ai_feedback, status: status, attempts: attempts,
|
|
34
43
|
error: error_payload
|
|
@@ -89,6 +89,34 @@ module CompletionKit
|
|
|
89
89
|
end
|
|
90
90
|
end
|
|
91
91
|
|
|
92
|
+
def stale_review_summary
|
|
93
|
+
review_pairs = Review.where(response_id: response_ids)
|
|
94
|
+
.where.not(metric_id: nil)
|
|
95
|
+
.where.not(metric_version_id: nil)
|
|
96
|
+
.pluck(:metric_id, :metric_version_id, :metric_name)
|
|
97
|
+
return {} if review_pairs.empty?
|
|
98
|
+
|
|
99
|
+
metric_ids = review_pairs.map(&:first).uniq
|
|
100
|
+
version_ids = review_pairs.map { |_, vid, _| vid }.uniq
|
|
101
|
+
current_by_metric = MetricVersion.current.where(metric_id: metric_ids).pluck(:metric_id, :id, :version_number).each_with_object({}) do |(mid, vid, vnum), h|
|
|
102
|
+
h[mid] = { id: vid, label: "v#{vnum}" }
|
|
103
|
+
end
|
|
104
|
+
label_by_version = MetricVersion.where(id: version_ids).pluck(:id, :version_number).each_with_object({}) { |(vid, vnum), h| h[vid] = "v#{vnum}" }
|
|
105
|
+
|
|
106
|
+
summary = {}
|
|
107
|
+
review_pairs.each do |metric_id, version_id, metric_name|
|
|
108
|
+
current = current_by_metric[metric_id]
|
|
109
|
+
next if current.nil?
|
|
110
|
+
next if version_id == current[:id]
|
|
111
|
+
label = label_by_version[version_id]
|
|
112
|
+
next if label.nil?
|
|
113
|
+
summary[metric_id] ||= { metric_name: metric_name, current_label: current[:label], stale_count: 0, scored_labels: [] }
|
|
114
|
+
summary[metric_id][:stale_count] += 1
|
|
115
|
+
summary[metric_id][:scored_labels] |= [label]
|
|
116
|
+
end
|
|
117
|
+
summary
|
|
118
|
+
end
|
|
119
|
+
|
|
92
120
|
def start!
|
|
93
121
|
rows = if dataset
|
|
94
122
|
CsvProcessor.process_self(self)
|
|
@@ -151,6 +179,38 @@ module CompletionKit
|
|
|
151
179
|
start!
|
|
152
180
|
end
|
|
153
181
|
|
|
182
|
+
def regrade!
|
|
183
|
+
grading_metrics = metrics
|
|
184
|
+
return false if grading_metrics.empty? || !judge_configured?
|
|
185
|
+
|
|
186
|
+
eligible_responses = responses.where(status: "succeeded").where.not(response_text: nil)
|
|
187
|
+
response_ids = eligible_responses.pluck(:id)
|
|
188
|
+
return false if response_ids.empty?
|
|
189
|
+
|
|
190
|
+
transaction do
|
|
191
|
+
Review.where(response_id: response_ids).update_all(
|
|
192
|
+
status: "pending",
|
|
193
|
+
attempts: 0,
|
|
194
|
+
metric_version_id: nil,
|
|
195
|
+
ai_score: nil,
|
|
196
|
+
ai_feedback: nil,
|
|
197
|
+
error_provider: nil,
|
|
198
|
+
error_class: nil,
|
|
199
|
+
error_status: nil,
|
|
200
|
+
error_message: nil
|
|
201
|
+
)
|
|
202
|
+
update!(status: "running", failure_summary: nil, error_message: nil)
|
|
203
|
+
|
|
204
|
+
response_ids.each do |rid|
|
|
205
|
+
grading_metrics.each { |m| JudgeReviewJob.perform_later(rid, m.id) }
|
|
206
|
+
end
|
|
207
|
+
RunCompletionCheckJob.perform_later(id)
|
|
208
|
+
end
|
|
209
|
+
|
|
210
|
+
broadcast_ui
|
|
211
|
+
true
|
|
212
|
+
end
|
|
213
|
+
|
|
154
214
|
def progress_snapshot
|
|
155
215
|
generated_done = responses.where(status: "succeeded").count
|
|
156
216
|
generated_failed = responses.where(status: "failed").count
|
|
@@ -56,7 +56,7 @@ module CompletionKit
|
|
|
56
56
|
run_id: run.id, response_id: response.id, metric_id: metric.id, created_by: created_by
|
|
57
57
|
)
|
|
58
58
|
calibration.assign_attributes(
|
|
59
|
-
|
|
59
|
+
metric_version: CompletionKit::MetricVersion.ensure_current_for(metric),
|
|
60
60
|
verdict: args["verdict"],
|
|
61
61
|
corrected_score: args["corrected_score"],
|
|
62
62
|
note: args["note"]
|
|
@@ -5,7 +5,7 @@ module CompletionKit
|
|
|
5
5
|
|
|
6
6
|
TOOLS = {
|
|
7
7
|
"judges_suggest" => {
|
|
8
|
-
description: "Ask the model to rewrite the metric's judge instruction in N variants targeted at the recent disagreements. Each variant is saved as a draft
|
|
8
|
+
description: "Ask the model to rewrite the metric's judge instruction in N variants targeted at the recent disagreements. Each variant is saved as a draft MetricVersion with source=\"suggestion\". Returns the persisted drafts. Stripe-metering hooks fire via ActiveSupport::Notifications under completion_kit.judge_suggestion.generated.",
|
|
9
9
|
inputSchema: {
|
|
10
10
|
type: "object",
|
|
11
11
|
properties: {
|
|
@@ -33,15 +33,15 @@ module CompletionKit
|
|
|
33
33
|
handler: :replay
|
|
34
34
|
},
|
|
35
35
|
"judges_compare" => {
|
|
36
|
-
description: "Compare two
|
|
36
|
+
description: "Compare two metric versions' calibration stats side by side. Pass either two metric_version_ids or one metric_id with metric_version_a_id / metric_version_b_id.",
|
|
37
37
|
inputSchema: {
|
|
38
38
|
type: "object",
|
|
39
39
|
properties: {
|
|
40
40
|
metric_id: { type: "integer" },
|
|
41
|
-
|
|
42
|
-
|
|
41
|
+
metric_version_a_id: { type: "integer" },
|
|
42
|
+
metric_version_b_id: { type: "integer" }
|
|
43
43
|
},
|
|
44
|
-
required: ["metric_id", "
|
|
44
|
+
required: ["metric_id", "metric_version_a_id", "metric_version_b_id"]
|
|
45
45
|
},
|
|
46
46
|
handler: :compare
|
|
47
47
|
}
|
|
@@ -49,7 +49,7 @@ module CompletionKit
|
|
|
49
49
|
|
|
50
50
|
def self.suggest(args)
|
|
51
51
|
metric = CompletionKit::Metric.find(args["metric_id"])
|
|
52
|
-
generator = CompletionKit::
|
|
52
|
+
generator = CompletionKit::MetricVariantGenerator.new(metric, count: args["count"].to_i, model: args["model"])
|
|
53
53
|
variants = generator.call
|
|
54
54
|
return error_result("Variant generator returned no parseable variants. Try again or change the model.") if variants.empty?
|
|
55
55
|
versions = generator.persist!(variants)
|
|
@@ -75,20 +75,20 @@ module CompletionKit
|
|
|
75
75
|
|
|
76
76
|
def self.compare(args)
|
|
77
77
|
metric = CompletionKit::Metric.find(args["metric_id"])
|
|
78
|
-
a = CompletionKit::
|
|
79
|
-
b = CompletionKit::
|
|
80
|
-
stats_a = CompletionKit::MetricCalibrationStats.for(metric,
|
|
81
|
-
stats_b = CompletionKit::MetricCalibrationStats.for(metric,
|
|
78
|
+
a = CompletionKit::MetricVersion.where(metric_id: metric.id).find(args["metric_version_a_id"])
|
|
79
|
+
b = CompletionKit::MetricVersion.where(metric_id: metric.id).find(args["metric_version_b_id"])
|
|
80
|
+
stats_a = CompletionKit::MetricCalibrationStats.for(metric, metric_version: a)
|
|
81
|
+
stats_b = CompletionKit::MetricCalibrationStats.for(metric, metric_version: b)
|
|
82
82
|
text_result({
|
|
83
83
|
metric_id: metric.id,
|
|
84
|
-
a:
|
|
85
|
-
b:
|
|
84
|
+
a: metric_version_payload(a, stats_a),
|
|
85
|
+
b: metric_version_payload(b, stats_b),
|
|
86
86
|
delta: delta_payload(stats_a, stats_b),
|
|
87
87
|
recommendation: recommendation_for(stats_a, stats_b)
|
|
88
88
|
})
|
|
89
89
|
end
|
|
90
90
|
|
|
91
|
-
def self.
|
|
91
|
+
def self.metric_version_payload(version, stats)
|
|
92
92
|
{
|
|
93
93
|
id: version.id, state: version.state, current: version.current,
|
|
94
94
|
source: version.source, created_at: version.created_at,
|
|
@@ -33,25 +33,25 @@ module CompletionKit
|
|
|
33
33
|
|
|
34
34
|
CURRENT = :current
|
|
35
35
|
|
|
36
|
-
def self.for(metric,
|
|
37
|
-
resolved = case
|
|
38
|
-
when CURRENT then
|
|
36
|
+
def self.for(metric, metric_version: CURRENT)
|
|
37
|
+
resolved = case metric_version
|
|
38
|
+
when CURRENT then MetricVersion.current.find_by(metric_id: metric.id)
|
|
39
39
|
when nil then nil
|
|
40
|
-
else
|
|
40
|
+
else metric_version
|
|
41
41
|
end
|
|
42
|
-
new(metric: metric,
|
|
42
|
+
new(metric: metric, metric_version: resolved, all_versions: metric_version.nil?).call
|
|
43
43
|
end
|
|
44
44
|
|
|
45
|
-
def initialize(metric:,
|
|
45
|
+
def initialize(metric:, metric_version: nil, all_versions: false)
|
|
46
46
|
@metric = metric
|
|
47
|
-
@
|
|
47
|
+
@metric_version = metric_version
|
|
48
48
|
@all_versions = all_versions
|
|
49
49
|
end
|
|
50
50
|
|
|
51
51
|
def call
|
|
52
52
|
scope = Calibration.where(metric_id: @metric.id)
|
|
53
|
-
if @
|
|
54
|
-
scope = scope.where(
|
|
53
|
+
if @metric_version
|
|
54
|
+
scope = scope.where(metric_version_id: @metric_version.id)
|
|
55
55
|
elsif !@all_versions
|
|
56
56
|
scope = scope.none
|
|
57
57
|
end
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
|
-
class
|
|
2
|
+
class MetricVariantGenerator
|
|
3
3
|
DEFAULT_VARIANT_COUNT = 1
|
|
4
4
|
MAX_VARIANT_COUNT = 3
|
|
5
5
|
DEFAULT_TEMPERATURE = 0.4
|
|
@@ -20,9 +20,9 @@ module CompletionKit
|
|
|
20
20
|
end
|
|
21
21
|
|
|
22
22
|
def persist!(variants)
|
|
23
|
-
|
|
23
|
+
MetricVersion.where(metric_id: @metric.id, state: "draft", source: "suggestion").update_all(current: false)
|
|
24
24
|
versions = variants.map do |variant|
|
|
25
|
-
|
|
25
|
+
MetricVersion.create!(
|
|
26
26
|
metric: @metric,
|
|
27
27
|
instruction: variant.instruction,
|
|
28
28
|
rubric_bands: variant.rubric_bands.presence || @metric.rubric_bands,
|
|
@@ -41,8 +41,9 @@ module CompletionKit
|
|
|
41
41
|
private
|
|
42
42
|
|
|
43
43
|
def build_meta_prompt
|
|
44
|
-
disagreements =
|
|
45
|
-
borderlines =
|
|
44
|
+
disagreements = MetricCalibrationExamples.disagreements_for(@metric)
|
|
45
|
+
borderlines = MetricCalibrationExamples.borderlines_for(@metric)
|
|
46
|
+
pinned_examples = Array(@metric.few_shot_examples)
|
|
46
47
|
sections = []
|
|
47
48
|
sections << "You are an expert evaluator. The judge below is misaligned with humans. Propose #{@count == 1 ? "a single" : "#{@count}"} concrete rewrite that closes the gap."
|
|
48
49
|
sections << ""
|
|
@@ -77,6 +78,18 @@ module CompletionKit
|
|
|
77
78
|
sections << ""
|
|
78
79
|
end
|
|
79
80
|
end
|
|
81
|
+
if pinned_examples.any?
|
|
82
|
+
sections << "## Pinned cases the judge already references"
|
|
83
|
+
sections << "These are cases the operator pinned for the judge to remember. The improved rubric must remain consistent with these — that is, the new instruction + rubric should produce roughly the human_score on these inputs, not the judge_score."
|
|
84
|
+
pinned_examples.each_with_index do |ex, i|
|
|
85
|
+
sections << "### Pinned #{i + 1}"
|
|
86
|
+
sections << "Input: #{ex["input"].to_s.truncate(200)}"
|
|
87
|
+
sections << "Output: #{ex["response"].to_s.truncate(200)}"
|
|
88
|
+
sections << "Judge previously said #{ex["judge_score"]}/5: #{ex["judge_feedback"].to_s.truncate(160)}"
|
|
89
|
+
sections << "Human said #{ex["human_score"]}/5: #{ex["human_note"].to_s.truncate(160)}"
|
|
90
|
+
sections << ""
|
|
91
|
+
end
|
|
92
|
+
end
|
|
80
93
|
sections << "## Task"
|
|
81
94
|
sections << "Make one substantive change. Don't just reword. If the disagreements look like instruction problems, rewrite the instruction. If they look like rubric problems (overlapping bands, undefined edge cases), rewrite the rubric. Rewrite both if both are wrong."
|
|
82
95
|
sections << ""
|
|
@@ -117,7 +130,7 @@ module CompletionKit
|
|
|
117
130
|
end
|
|
118
131
|
end
|
|
119
132
|
|
|
120
|
-
module
|
|
133
|
+
module MetricCalibrationExamples
|
|
121
134
|
module_function
|
|
122
135
|
|
|
123
136
|
def for(metric, limit: 8)
|
|
@@ -133,13 +146,14 @@ module CompletionKit
|
|
|
133
146
|
end
|
|
134
147
|
|
|
135
148
|
def calibrations_for(metric, verdict:, limit:)
|
|
136
|
-
|
|
137
|
-
current_version =
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
149
|
+
base = Calibration.where(metric_id: metric.id, verdict: verdict)
|
|
150
|
+
current_version = MetricVersion.current.find_by(metric_id: metric.id)
|
|
151
|
+
scoped = current_version ? base.where(metric_version_id: current_version.id) : base
|
|
152
|
+
effective = scoped.exists? ? scoped : base
|
|
153
|
+
effective.includes(response: :reviews)
|
|
154
|
+
.order(created_at: :desc)
|
|
155
|
+
.limit(limit)
|
|
156
|
+
.map do |cal|
|
|
143
157
|
review = cal.response.reviews.find { |r| r.metric_id == metric.id }
|
|
144
158
|
{
|
|
145
159
|
input: cal.response.input_data,
|
|
@@ -3,19 +3,29 @@
|
|
|
3
3
|
<% anchor = metric&.name&.parameterize %>
|
|
4
4
|
<% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric
|
|
5
5
|
created_by = CompletionKit.config.username.presence || "operator"
|
|
6
|
-
|
|
6
|
+
current_metric_version = CompletionKit::MetricVersion.current.find_by(metric_id: metric.id)
|
|
7
|
+
verdicted_ids = if current_metric_version
|
|
8
|
+
CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
|
|
9
|
+
else
|
|
10
|
+
[]
|
|
11
|
+
end
|
|
7
12
|
CompletionKit::Response.joins(:reviews)
|
|
8
13
|
.where(reviews: { metric_id: metric.id })
|
|
9
14
|
.where.not(reviews: { ai_score: nil })
|
|
10
15
|
.where.not(id: verdicted_ids)
|
|
11
16
|
.order(created_at: :desc).first
|
|
12
17
|
end %>
|
|
18
|
+
<% prior_version_verdicts = if stats.sample_size.zero? && metric && current_metric_version
|
|
19
|
+
CompletionKit::Calibration.where(metric_id: metric.id).where.not(metric_version_id: current_metric_version.id).count
|
|
20
|
+
else
|
|
21
|
+
0
|
|
22
|
+
end %>
|
|
13
23
|
|
|
14
24
|
<p class="ck-trust-line ck-trust-line--<%= stats.gate %>">
|
|
15
25
|
<span class="ck-trust-line__label"><%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true" %>Calibration</span>
|
|
16
26
|
<% if stats.sample_size.zero? %>
|
|
17
27
|
<span class="ck-trust-line__state">Not measured yet.</span>
|
|
18
|
-
<span class="ck-trust-line__hint">Needs <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts on the judge's scores.<% if target_response %>
|
|
28
|
+
<span class="ck-trust-line__hint">Needs <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts on the judge's scores.<% if prior_version_verdicts > 0 %> (<%= pluralize(prior_version_verdicts, "verdict") %> on prior versions, tied to that version's history.)<% end %><% if target_response %>
|
|
19
29
|
<%= link_to "Give a verdict →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-trust-line__link" %>
|
|
20
30
|
<% else %> Run this metric on a dataset, then give a verdict.<% end %></span>
|
|
21
31
|
<% elsif stats.counter_only? %>
|
|
@@ -16,14 +16,14 @@
|
|
|
16
16
|
</div>
|
|
17
17
|
<% end %>
|
|
18
18
|
|
|
19
|
-
<% if edit_draft
|
|
20
|
-
<% pub = local_assigns[:
|
|
19
|
+
<% if edit_draft %>
|
|
20
|
+
<% pub = local_assigns[:published_metric_version] %>
|
|
21
21
|
<% draft_instr_changed = pub && pub.instruction.to_s != edit_draft.instruction.to_s %>
|
|
22
22
|
<% draft_rubric_changed = pub && pub.rubric_bands != edit_draft.rubric_bands %>
|
|
23
23
|
<div class="ck-suggestion-banner" role="status">
|
|
24
24
|
<div class="ck-suggestion-banner__body">
|
|
25
25
|
<p class="ck-kicker">Draft pending</p>
|
|
26
|
-
<p class="ck-meta-copy">
|
|
26
|
+
<p class="ck-meta-copy">The form below shows your unpublished draft. Publish to replace the live<%= " instruction" if draft_instr_changed %><%= " and" if draft_instr_changed && draft_rubric_changed %><%= " rubric" if draft_rubric_changed %> for future runs, or keep editing.</p>
|
|
27
27
|
</div>
|
|
28
28
|
<div class="ck-suggestion-banner__actions">
|
|
29
29
|
<%= button_to "Discard draft", dismiss_suggestion_metric_path(metric, draft_id: edit_draft.id, back_to: "edit"),
|
|
@@ -19,20 +19,17 @@
|
|
|
19
19
|
</div>
|
|
20
20
|
<div class="ck-actions">
|
|
21
21
|
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
22
|
-
<% if @suggestion_draft %>
|
|
23
|
-
|
|
22
|
+
<% if @suggestion_draft || @edit_draft %>
|
|
23
|
+
<% review_title = @suggestion_draft ? "The model proposed improvements based on your disagreements. Review and apply what you want." : "An unpublished draft of this metric is saved. Review and publish, or keep editing." %>
|
|
24
|
+
<%= link_to "Review changes →", edit_metric_path(@metric),
|
|
24
25
|
class: ck_button_classes(:dark),
|
|
25
|
-
title:
|
|
26
|
-
<% elsif @edit_draft %>
|
|
27
|
-
<%= link_to "Review draft →", edit_metric_path(@metric),
|
|
28
|
-
class: ck_button_classes(:dark),
|
|
29
|
-
title: "An unpublished draft of this metric is saved. Review and publish, or keep editing." %>
|
|
26
|
+
title: review_title %>
|
|
30
27
|
<% elsif @improve_disagreement_count.positive? %>
|
|
31
28
|
<%= button_to "Improve the metric", suggest_variants_metric_path(@metric),
|
|
32
29
|
method: :post, form_class: "inline-block",
|
|
33
30
|
class: ck_button_classes(:light, variant: :outline),
|
|
34
|
-
title: "
|
|
35
|
-
data: { turbo_confirm: "
|
|
31
|
+
title: "Ask the model to suggest improvements to this metric's instruction and rubric based on the disagreements collected so far.",
|
|
32
|
+
data: { turbo_confirm: "Ask the model for suggested improvements based on the disagreements collected so far?" } %>
|
|
36
33
|
<% else %>
|
|
37
34
|
<button type="button" class="<%= ck_button_classes(:light, variant: :outline) %>" disabled
|
|
38
35
|
title="Mark at least one case as Disagree before the model can suggest a change.">Improve the metric</button>
|
|
@@ -168,19 +165,20 @@
|
|
|
168
165
|
<p class="ck-kicker">Cases to learn from</p>
|
|
169
166
|
<span class="ck-chip"><%= pluralize(@disagreements.size, "case") %></span>
|
|
170
167
|
</div>
|
|
171
|
-
|
|
168
|
+
<% mixed_versions = @disagreements.any? { |c| c.metric_version_id != @published_metric_version.id } %>
|
|
169
|
+
<p class="ck-meta-copy">Cases where a reviewer's score didn't match the judge's. Pin useful ones with <strong>Remember this</strong> so the judge sees them next time it grades<%= " (pins flow into the current judge regardless of which version produced the verdict)" if mixed_versions %>.</p>
|
|
172
170
|
<% existing_ids = Array(@metric.few_shot_examples).map { |fs| fs["calibration_id"] } %>
|
|
173
171
|
<ul class="ck-disagreement-list">
|
|
174
172
|
<% @disagreements.each do |cal| %>
|
|
175
173
|
<% review = cal.response.reviews.find { |r| r.metric_id == @metric.id } %>
|
|
176
174
|
<% already = existing_ids.include?(cal.id) %>
|
|
177
|
-
<%
|
|
178
|
-
<% on_current =
|
|
175
|
+
<% cal_metric_version = cal.metric_version %>
|
|
176
|
+
<% on_current = cal_metric_version&.id == @published_metric_version.id %>
|
|
179
177
|
<li class="ck-disagreement<%= " ck-disagreement--remembered" if already %><%= " ck-disagreement--stale" unless on_current %>">
|
|
180
178
|
<div class="ck-disagreement__head">
|
|
181
179
|
<div class="ck-disagreement__scores">
|
|
182
|
-
<% if
|
|
183
|
-
<span class="ck-source-chip <%= on_current ? "ck-source-chip--current" : "ck-source-chip--past" %>" title="<%= on_current ? "Verdict on the live judge version." : "Verdict on a superseded judge version." %>"><%=
|
|
180
|
+
<% if cal_metric_version && mixed_versions %>
|
|
181
|
+
<span class="ck-source-chip <%= on_current ? "ck-source-chip--current" : "ck-source-chip--past" %>" title="<%= on_current ? "Verdict on the live judge version." : "Verdict on a superseded judge version." %>"><%= cal_metric_version.version_label %></span>
|
|
184
182
|
<% end %>
|
|
185
183
|
<span class="ck-disagreement__scores-label">Judge</span>
|
|
186
184
|
<% if review&.ai_score %>
|
|
@@ -98,10 +98,15 @@
|
|
|
98
98
|
|
|
99
99
|
<div class="ck-review-list">
|
|
100
100
|
<% @reviews.each do |review| %>
|
|
101
|
-
|
|
101
|
+
<% review_version = review.metric_version %>
|
|
102
|
+
<% stale = review.stale_against_current_judge? %>
|
|
103
|
+
<div class="ck-review-card<%= " ck-review-card--stale" if stale %>" id="<%= review.metric&.name&.parameterize || "review-#{review.id}" %>">
|
|
102
104
|
<div class="ck-review-card__header">
|
|
103
105
|
<span class="ck-review-card__metric"><% if review.metric %><%= link_to review.metric_name, metric_path(review.metric), class: "ck-link" %><% else %><%= review.metric_name %><% end %></span>
|
|
104
106
|
<div class="ck-inline">
|
|
107
|
+
<% if review_version %>
|
|
108
|
+
<span class="ck-source-chip <%= stale ? "ck-source-chip--past" : "ck-source-chip--current" %>" title="<%= stale ? "Score produced by #{review_version.version_label} of this metric. The live judge has changed since." : "Score produced by the live judge (#{review_version.version_label})." %>"><%= review_version.version_label %></span>
|
|
109
|
+
<% end %>
|
|
105
110
|
<% if review.ai_score %>
|
|
106
111
|
<% 5.times do |i| %>
|
|
107
112
|
<svg viewBox="0 0 24 24" width="16" height="16" stroke-width="1.75" class="ck-star <%= i < review.ai_score.to_i ? "ck-star--filled" : "ck-star--empty" %>"><polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2"/></svg>
|
|
@@ -111,6 +116,9 @@
|
|
|
111
116
|
<% end %>
|
|
112
117
|
</div>
|
|
113
118
|
</div>
|
|
119
|
+
<% if stale %>
|
|
120
|
+
<p class="ck-review-card__stale-note">Scored against a superseded version of this metric. The live judge may score this differently.</p>
|
|
121
|
+
<% end %>
|
|
114
122
|
<% if review.ai_feedback.present? %>
|
|
115
123
|
<p class="ck-review-card__feedback"><%= review.ai_feedback %></p>
|
|
116
124
|
<% end %>
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
<%= button_to "Retry", generate_run_path(run), method: :post, class: ck_button_classes(:light, variant: :outline), form_class: "inline-block" %>
|
|
12
12
|
<%= button_to "Re-run as new", rerun_run_path(run), method: :post, class: ck_button_classes(:dark), form_class: "inline-block" %>
|
|
13
13
|
<% elsif run.status == "completed" %>
|
|
14
|
+
<%= link_to "Compare", compare_run_path(run), class: ck_button_classes(:light, variant: :outline) %>
|
|
14
15
|
<%= button_to "Re-run", rerun_run_path(run), method: :post, class: ck_button_classes(:dark), form_class: "inline-block" %>
|
|
15
16
|
<% end %>
|
|
16
17
|
<% end %>
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
<ol class="ck-breadcrumb">
|
|
2
|
+
<li><%= link_to "Runs", runs_path %></li>
|
|
3
|
+
<li><%= link_to @run.name, run_path(@run) %></li>
|
|
4
|
+
<li>vs <%= @other_run.name %></li>
|
|
5
|
+
</ol>
|
|
6
|
+
|
|
7
|
+
<section class="ck-page-header">
|
|
8
|
+
<div>
|
|
9
|
+
<h1 class="ck-title">Comparing runs</h1>
|
|
10
|
+
<p class="ck-meta-copy"><strong>A</strong>: <%= link_to @run.name, run_path(@run), class: "ck-link" %> · <strong>B</strong>: <%= link_to @other_run.name, run_path(@other_run), class: "ck-link" %></p>
|
|
11
|
+
</div>
|
|
12
|
+
<div class="ck-actions">
|
|
13
|
+
<%= link_to "Pick another", compare_run_path(@run), class: ck_button_classes(:light, variant: :outline) %>
|
|
14
|
+
</div>
|
|
15
|
+
</section>
|
|
16
|
+
|
|
17
|
+
<% if @comparison[:rows].empty? %>
|
|
18
|
+
<div class="ck-empty">
|
|
19
|
+
<p>No responses to compare yet.</p>
|
|
20
|
+
</div>
|
|
21
|
+
<% else %>
|
|
22
|
+
<table class="ck-results-table ck-run-compare-table">
|
|
23
|
+
<thead>
|
|
24
|
+
<tr>
|
|
25
|
+
<th scope="col">Case</th>
|
|
26
|
+
<th scope="col">Metric</th>
|
|
27
|
+
<th scope="col">A score</th>
|
|
28
|
+
<th scope="col">B score</th>
|
|
29
|
+
<th scope="col">Δ</th>
|
|
30
|
+
<th scope="col">A version</th>
|
|
31
|
+
<th scope="col">B version</th>
|
|
32
|
+
</tr>
|
|
33
|
+
</thead>
|
|
34
|
+
<tbody>
|
|
35
|
+
<% @comparison[:rows].each do |row| %>
|
|
36
|
+
<% case_label = ((row[:left_response].row_index || 0) + 1).to_s %>
|
|
37
|
+
<% row[:per_metric].each_with_index do |pm, idx| %>
|
|
38
|
+
<tr>
|
|
39
|
+
<% if idx == 0 %>
|
|
40
|
+
<td rowspan="<%= row[:per_metric].size %>">
|
|
41
|
+
<%= link_to case_label, run_response_path(@run, row[:left_response]), class: "ck-link" %>
|
|
42
|
+
<% if row[:right_response] %>
|
|
43
|
+
<span class="ck-meta-copy">/ <%= link_to "B", run_response_path(@other_run, row[:right_response]), class: "ck-link" %></span>
|
|
44
|
+
<% end %>
|
|
45
|
+
</td>
|
|
46
|
+
<% end %>
|
|
47
|
+
<td><%= pm[:metric_name] %></td>
|
|
48
|
+
<td>
|
|
49
|
+
<% if pm[:left_score] %>
|
|
50
|
+
<span class="<%= ck_badge_classes(ck_score_kind(pm[:left_score].to_f)) %>"><%= pm[:left_score] %></span>
|
|
51
|
+
<% else %>
|
|
52
|
+
<span class="ck-meta-copy">—</span>
|
|
53
|
+
<% end %>
|
|
54
|
+
</td>
|
|
55
|
+
<td>
|
|
56
|
+
<% if pm[:right_score] %>
|
|
57
|
+
<span class="<%= ck_badge_classes(ck_score_kind(pm[:right_score].to_f)) %>"><%= pm[:right_score] %></span>
|
|
58
|
+
<% else %>
|
|
59
|
+
<span class="ck-meta-copy">—</span>
|
|
60
|
+
<% end %>
|
|
61
|
+
</td>
|
|
62
|
+
<td>
|
|
63
|
+
<% if pm[:delta] %>
|
|
64
|
+
<% delta_class = pm[:delta] > 0 ? "ck-delta--positive" : pm[:delta] < 0 ? "ck-delta--negative" : "ck-delta--zero" %>
|
|
65
|
+
<span class="ck-delta <%= delta_class %>"><%= pm[:delta].positive? ? "+#{pm[:delta]}" : pm[:delta].to_s %></span>
|
|
66
|
+
<% else %>
|
|
67
|
+
<span class="ck-meta-copy">—</span>
|
|
68
|
+
<% end %>
|
|
69
|
+
</td>
|
|
70
|
+
<td>
|
|
71
|
+
<% if pm[:left_version_label] %>
|
|
72
|
+
<span class="ck-source-chip ck-source-chip--current"><%= pm[:left_version_label] %></span>
|
|
73
|
+
<% end %>
|
|
74
|
+
</td>
|
|
75
|
+
<td>
|
|
76
|
+
<% if pm[:right_version_label] %>
|
|
77
|
+
<span class="ck-source-chip ck-source-chip--current"><%= pm[:right_version_label] %></span>
|
|
78
|
+
<% end %>
|
|
79
|
+
</td>
|
|
80
|
+
</tr>
|
|
81
|
+
<% end %>
|
|
82
|
+
<% end %>
|
|
83
|
+
</tbody>
|
|
84
|
+
</table>
|
|
85
|
+
<% end %>
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
<ol class="ck-breadcrumb">
|
|
2
|
+
<li><%= link_to "Runs", runs_path %></li>
|
|
3
|
+
<li><%= link_to @run.name, run_path(@run) %></li>
|
|
4
|
+
<li>Compare</li>
|
|
5
|
+
</ol>
|
|
6
|
+
|
|
7
|
+
<section class="ck-page-header">
|
|
8
|
+
<div>
|
|
9
|
+
<h1 class="ck-title">Compare with another run</h1>
|
|
10
|
+
<p class="ck-lead">Pick a run on the same dataset and prompt to see per-case score deltas side by side.</p>
|
|
11
|
+
</div>
|
|
12
|
+
</section>
|
|
13
|
+
|
|
14
|
+
<% if @other_runs.any? %>
|
|
15
|
+
<table class="ck-results-table">
|
|
16
|
+
<thead>
|
|
17
|
+
<tr>
|
|
18
|
+
<th scope="col">Run</th>
|
|
19
|
+
<th scope="col">Judge</th>
|
|
20
|
+
<th scope="col">Created</th>
|
|
21
|
+
<th scope="col"></th>
|
|
22
|
+
</tr>
|
|
23
|
+
</thead>
|
|
24
|
+
<tbody>
|
|
25
|
+
<% @other_runs.each do |other| %>
|
|
26
|
+
<tr>
|
|
27
|
+
<td><%= link_to other.name, run_path(other), class: "ck-link" %></td>
|
|
28
|
+
<td class="ck-meta-copy"><%= other.judge_model %></td>
|
|
29
|
+
<td class="ck-meta-copy"><time datetime="<%= other.created_at.utc.iso8601 %>"><%= time_ago_in_words(other.created_at) %> ago</time></td>
|
|
30
|
+
<td class="ck-results-table__arrow"><%= link_to "Compare →", compare_run_path(@run, with: other.id), class: "ck-link" %></td>
|
|
31
|
+
</tr>
|
|
32
|
+
<% end %>
|
|
33
|
+
</tbody>
|
|
34
|
+
</table>
|
|
35
|
+
<% else %>
|
|
36
|
+
<div class="ck-empty">
|
|
37
|
+
<p>No other runs on this dataset + prompt combination yet. <%= link_to "Re-run from this one", rerun_run_path(@run), method: :post, class: "ck-link" %> to create one.</p>
|
|
38
|
+
</div>
|
|
39
|
+
<% end %>
|
|
@@ -18,6 +18,35 @@
|
|
|
18
18
|
<% dataset_preview_lines = dataset_lines.first(50) %>
|
|
19
19
|
<% end %>
|
|
20
20
|
|
|
21
|
+
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
22
|
+
<% stale_summary = @run.stale_review_summary %>
|
|
23
|
+
<% if stale_summary.any? %>
|
|
24
|
+
<div class="ck-stale-versions-banner" role="status">
|
|
25
|
+
<div class="ck-stale-versions-banner__body">
|
|
26
|
+
<p class="ck-kicker">Stale judge versions</p>
|
|
27
|
+
<p class="ck-meta-copy">
|
|
28
|
+
This run was scored against metric versions that are no longer live.
|
|
29
|
+
<% stale_summary.values.each_with_index do |s, i| %>
|
|
30
|
+
<%= ", " if i > 0 %><strong><%= s[:metric_name] %></strong> (scored by <%= s[:scored_labels].join(", ") %>; live is <%= s[:current_label] %>)<% end %>.
|
|
31
|
+
Re-run to refresh the scores with the current judge.
|
|
32
|
+
</p>
|
|
33
|
+
</div>
|
|
34
|
+
<% if @run.status == "completed" %>
|
|
35
|
+
<%= button_to "Re-run from scratch",
|
|
36
|
+
rerun_run_path(@run), method: :post,
|
|
37
|
+
class: ck_button_classes(:light, variant: :outline), form_class: "inline-block",
|
|
38
|
+
title: "Create a new run that regenerates responses and grades them with the current judge.",
|
|
39
|
+
data: { turbo_confirm: "Create a new run with fresh responses and the current judge? The original run stays as a record." } %>
|
|
40
|
+
<%= button_to "Re-grade with current judge",
|
|
41
|
+
regrade_run_path(@run), method: :post,
|
|
42
|
+
class: ck_button_classes(:dark), form_class: "inline-block",
|
|
43
|
+
title: "Re-judge this run's existing responses against the current judge. Faster and cheaper than re-running.",
|
|
44
|
+
data: { turbo_confirm: "Re-judge this run's existing responses against the current judge?" } %>
|
|
45
|
+
<% end %>
|
|
46
|
+
</div>
|
|
47
|
+
<% end %>
|
|
48
|
+
<% end %>
|
|
49
|
+
|
|
21
50
|
<div class="ck-run-config">
|
|
22
51
|
<div class="ck-run-config__row">
|
|
23
52
|
<span class="ck-run-config__key">Created</span>
|
data/config/routes.rb
CHANGED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
class RenameJudgeVersionToMetricVersion < ActiveRecord::Migration[8.1]
|
|
2
|
+
def change
|
|
3
|
+
rename_table :completion_kit_judge_versions, :completion_kit_metric_versions
|
|
4
|
+
rename_column :completion_kit_calibrations, :judge_version_id, :metric_version_id
|
|
5
|
+
|
|
6
|
+
rename_index :completion_kit_metric_versions,
|
|
7
|
+
"index_ck_judge_versions_on_metric_id",
|
|
8
|
+
"index_ck_metric_versions_on_metric_id"
|
|
9
|
+
rename_index :completion_kit_metric_versions,
|
|
10
|
+
"index_ck_judge_versions_on_metric_current",
|
|
11
|
+
"index_ck_metric_versions_on_metric_current"
|
|
12
|
+
rename_index :completion_kit_metric_versions,
|
|
13
|
+
"index_ck_judge_versions_on_metric_state",
|
|
14
|
+
"index_ck_metric_versions_on_metric_state"
|
|
15
|
+
rename_index :completion_kit_metric_versions,
|
|
16
|
+
"index_ck_judge_versions_on_metric_version",
|
|
17
|
+
"index_ck_metric_versions_on_metric_vnum"
|
|
18
|
+
rename_index :completion_kit_calibrations,
|
|
19
|
+
"index_ck_calibrations_on_judge_version_id",
|
|
20
|
+
"index_ck_calibrations_on_metric_version_id"
|
|
21
|
+
end
|
|
22
|
+
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
class AddMetricVersionToReviews < ActiveRecord::Migration[8.1]
|
|
2
|
+
def change
|
|
3
|
+
add_column :completion_kit_reviews, :metric_version_id, :bigint
|
|
4
|
+
add_index :completion_kit_reviews, :metric_version_id, name: "index_ck_reviews_on_metric_version_id"
|
|
5
|
+
|
|
6
|
+
reversible do |dir|
|
|
7
|
+
dir.up do
|
|
8
|
+
execute <<~SQL
|
|
9
|
+
UPDATE completion_kit_reviews
|
|
10
|
+
SET metric_version_id = (
|
|
11
|
+
SELECT id FROM completion_kit_metric_versions mv
|
|
12
|
+
WHERE mv.metric_id = completion_kit_reviews.metric_id
|
|
13
|
+
AND mv.current = #{ActiveRecord::Base.connection.quote(true)}
|
|
14
|
+
LIMIT 1
|
|
15
|
+
)
|
|
16
|
+
WHERE metric_id IS NOT NULL
|
|
17
|
+
SQL
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: completion-kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.6.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Damien Bastin
|
|
@@ -272,11 +272,11 @@ files:
|
|
|
272
272
|
- app/models/completion_kit/calibration.rb
|
|
273
273
|
- app/models/completion_kit/dashboard_dismissal.rb
|
|
274
274
|
- app/models/completion_kit/dataset.rb
|
|
275
|
-
- app/models/completion_kit/judge_version.rb
|
|
276
275
|
- app/models/completion_kit/mcp_session.rb
|
|
277
276
|
- app/models/completion_kit/metric.rb
|
|
278
277
|
- app/models/completion_kit/metric_group.rb
|
|
279
278
|
- app/models/completion_kit/metric_group_membership.rb
|
|
279
|
+
- app/models/completion_kit/metric_version.rb
|
|
280
280
|
- app/models/completion_kit/model.rb
|
|
281
281
|
- app/models/completion_kit/prompt.rb
|
|
282
282
|
- app/models/completion_kit/provider_credential.rb
|
|
@@ -295,7 +295,6 @@ files:
|
|
|
295
295
|
- app/services/completion_kit/csv_processor.rb
|
|
296
296
|
- app/services/completion_kit/dashboard_stats.rb
|
|
297
297
|
- app/services/completion_kit/judge_service.rb
|
|
298
|
-
- app/services/completion_kit/judge_variant_generator.rb
|
|
299
298
|
- app/services/completion_kit/llm_client.rb
|
|
300
299
|
- app/services/completion_kit/mcp_dispatcher.rb
|
|
301
300
|
- app/services/completion_kit/mcp_tools/base.rb
|
|
@@ -310,6 +309,7 @@ files:
|
|
|
310
309
|
- app/services/completion_kit/mcp_tools/runs.rb
|
|
311
310
|
- app/services/completion_kit/mcp_tools/tags.rb
|
|
312
311
|
- app/services/completion_kit/metric_calibration_stats.rb
|
|
312
|
+
- app/services/completion_kit/metric_variant_generator.rb
|
|
313
313
|
- app/services/completion_kit/model_discovery_service.rb
|
|
314
314
|
- app/services/completion_kit/ollama_client.rb
|
|
315
315
|
- app/services/completion_kit/onboarding/checklist.rb
|
|
@@ -377,6 +377,8 @@ files:
|
|
|
377
377
|
- app/views/completion_kit/runs/_status_header.html.erb
|
|
378
378
|
- app/views/completion_kit/runs/_status_panel.html.erb
|
|
379
379
|
- app/views/completion_kit/runs/_table.html.erb
|
|
380
|
+
- app/views/completion_kit/runs/compare.html.erb
|
|
381
|
+
- app/views/completion_kit/runs/compare_picker.html.erb
|
|
380
382
|
- app/views/completion_kit/runs/edit.html.erb
|
|
381
383
|
- app/views/completion_kit/runs/index.html.erb
|
|
382
384
|
- app/views/completion_kit/runs/new.html.erb
|
|
@@ -422,6 +424,8 @@ files:
|
|
|
422
424
|
- db/migrate/20260523000002_add_state_to_completion_kit_judge_versions.rb
|
|
423
425
|
- db/migrate/20260524000001_create_completion_kit_starter_metric_dismissals.rb
|
|
424
426
|
- db/migrate/20260525000001_add_version_number_and_published_at_to_judge_versions.rb
|
|
427
|
+
- db/migrate/20260528000001_rename_judge_version_to_metric_version.rb
|
|
428
|
+
- db/migrate/20260528000002_add_metric_version_to_reviews.rb
|
|
425
429
|
- lib/completion-kit.rb
|
|
426
430
|
- lib/completion_kit.rb
|
|
427
431
|
- lib/completion_kit/concurrency_check.rb
|