completion-kit 0.5.43 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. checksums.yaml +4 -4
  2. data/app/assets/stylesheets/completion_kit/application.css +38 -0
  3. data/app/controllers/completion_kit/api/v1/calibrations_controller.rb +1 -1
  4. data/app/controllers/completion_kit/api/v1/runs_controller.rb +4 -0
  5. data/app/controllers/completion_kit/calibrations_controller.rb +1 -1
  6. data/app/controllers/completion_kit/metrics_controller.rb +76 -20
  7. data/app/controllers/completion_kit/runs_controller.rb +69 -1
  8. data/app/jobs/completion_kit/judge_review_job.rb +3 -0
  9. data/app/models/completion_kit/calibration.rb +2 -2
  10. data/app/models/completion_kit/metric.rb +0 -17
  11. data/app/models/completion_kit/{judge_version.rb → metric_version.rb} +3 -2
  12. data/app/models/completion_kit/review.rb +9 -0
  13. data/app/models/completion_kit/run.rb +60 -0
  14. data/app/services/completion_kit/mcp_tools/calibrations.rb +1 -1
  15. data/app/services/completion_kit/mcp_tools/judges.rb +13 -13
  16. data/app/services/completion_kit/metric_calibration_stats.rb +9 -9
  17. data/app/services/completion_kit/{judge_variant_generator.rb → metric_variant_generator.rb} +27 -13
  18. data/app/views/completion_kit/calibrations/_trust_panel.html.erb +12 -2
  19. data/app/views/completion_kit/metrics/_form.html.erb +3 -3
  20. data/app/views/completion_kit/metrics/edit.html.erb +1 -1
  21. data/app/views/completion_kit/metrics/show.html.erb +12 -14
  22. data/app/views/completion_kit/responses/show.html.erb +9 -1
  23. data/app/views/completion_kit/runs/_actions.html.erb +1 -0
  24. data/app/views/completion_kit/runs/compare.html.erb +85 -0
  25. data/app/views/completion_kit/runs/compare_picker.html.erb +39 -0
  26. data/app/views/completion_kit/runs/show.html.erb +29 -0
  27. data/config/routes.rb +2 -0
  28. data/db/migrate/20260528000001_rename_judge_version_to_metric_version.rb +22 -0
  29. data/db/migrate/20260528000002_add_metric_version_to_reviews.rb +21 -0
  30. data/lib/completion_kit/version.rb +1 -1
  31. metadata +7 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7a9284c6a53b1b609de8ca2c081111687990d32f40f1fa4d2422670daeae9f2f
4
- data.tar.gz: edb62bc8b34b3ecce534a1e4f0730066d6b56f591d05046b1904eb33f9f7cbc6
3
+ metadata.gz: d8454bbb11d5064ca0c6d4355c780425a28198280dffe7dd424d266fbeef6a09
4
+ data.tar.gz: 24c1da76e1e9118d5e2a732e8e45b684f588f553ad4d8bac89e239bc22c953c3
5
5
  SHA512:
6
- metadata.gz: 0aaf95d75bdfee01b387d3ebe97434168815d58627f8d855ad3dd15534e33c2a69eca7ee8a25a964f6669f891026d350abb5c23e23006ada5a1c56df9ad616ea
7
- data.tar.gz: 800fec24cee472a245fcfffbb025eabb2a3bc62cbfc513d1ec0a2c7aa8d1e304f59cc28aa9074712060d0f49ac6bbfba4597cc17e1d3b8db71c5e3b9c557dcab
6
+ metadata.gz: 6fbc5b8047a20240897e19c389bb3f6104d3e2a219794d190183b5433e14d524bb692eb0a27b36ab6471e596c2b9b8af2d70a4f56ae81aa327726fe92f092eb9
7
+ data.tar.gz: a3399003a48836fd457a8c8b488305fad6d006596c6f940a82b232e2a731dfbc3df5ded4ba8bc16b94690a88d56baaaff6edc6801f47f2bbf422ca8fb74270df
@@ -2816,6 +2816,44 @@ select.ck-input {
2816
2816
  line-height: 1.55;
2817
2817
  }
2818
2818
 
2819
+ .ck-review-card--stale {
2820
+ border-left: 2px solid rgba(224, 164, 88, 0.45);
2821
+ }
2822
+
2823
+ .ck-stale-versions-banner {
2824
+ margin: 0 0 1rem;
2825
+ padding: 0.9rem 1rem;
2826
+ border: 1px solid rgba(224, 164, 88, 0.4);
2827
+ background: rgba(224, 164, 88, 0.06);
2828
+ border-radius: var(--ck-radius);
2829
+ display: flex;
2830
+ align-items: center;
2831
+ justify-content: space-between;
2832
+ gap: 1rem;
2833
+ flex-wrap: wrap;
2834
+ }
2835
+ .ck-stale-versions-banner__body { min-width: 0; flex: 1 1 320px; }
2836
+ .ck-stale-versions-banner .ck-kicker { color: var(--ck-warning); }
2837
+
2838
+ .ck-delta {
2839
+ font-family: var(--ck-mono);
2840
+ font-size: 0.78rem;
2841
+ letter-spacing: 0.04em;
2842
+ padding: 2px 6px;
2843
+ border-radius: 4px;
2844
+ }
2845
+ .ck-delta--positive { color: var(--ck-success); background: var(--ck-success-soft); }
2846
+ .ck-delta--negative { color: var(--ck-danger); background: var(--ck-danger-soft); }
2847
+ .ck-delta--zero { color: var(--ck-dim); }
2848
+
2849
+ .ck-run-compare-table td { vertical-align: middle; }
2850
+ .ck-review-card__stale-note {
2851
+ margin: 0.4rem 0 0;
2852
+ font-family: var(--ck-mono);
2853
+ font-size: 0.78rem;
2854
+ color: var(--ck-warning);
2855
+ }
2856
+
2819
2857
  @media (max-width: 900px) {
2820
2858
  .ck-grid--sidebar,
2821
2859
  .ck-grid--cards,
@@ -15,7 +15,7 @@ module CompletionKit
15
15
  run: @run,
16
16
  response: @response,
17
17
  metric: @metric,
18
- judge_version: JudgeVersion.ensure_current_for(@metric),
18
+ metric_version: MetricVersion.ensure_current_for(@metric),
19
19
  **calibration_params
20
20
  )
21
21
 
@@ -45,6 +45,10 @@ module CompletionKit
45
45
  end
46
46
 
47
47
  def retry_failures
48
+ if @run.stale_review_summary.any?
49
+ return render(json: { error: "Judge has changed since this run executed. Retry would mix versions in the same run; use POST /api/v1/runs/:id/rerun instead." }, status: :conflict)
50
+ end
51
+
48
52
  scope = @run.responses.where(status: "failed")
49
53
  scope = scope.where(id: params[:only]) if params[:only].present?
50
54
 
@@ -18,7 +18,7 @@ module CompletionKit
18
18
  run: @run, response: @response, metric: @metric, created_by: created_by
19
19
  )
20
20
  calibration.assign_attributes(
21
- judge_version: JudgeVersion.ensure_current_for(@metric),
21
+ metric_version: MetricVersion.ensure_current_for(@metric),
22
22
  verdict: params[:verdict],
23
23
  corrected_score: params[:corrected_score].presence,
24
24
  note: params[:note].presence
@@ -35,16 +35,15 @@ module CompletionKit
35
35
  end
36
36
 
37
37
  def show
38
- @published_judge_version = JudgeVersion.ensure_current_for(@metric)
38
+ @published_metric_version = MetricVersion.ensure_current_for(@metric)
39
39
  @disagreements = Calibration.where(metric_id: @metric.id, verdict: "disagree")
40
- .includes(:judge_version, response: [:reviews, :run])
40
+ .includes(:metric_version, response: [:reviews, :run])
41
41
  .order(created_at: :desc)
42
42
  .limit(50)
43
- @edit_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
44
- @suggestion_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
45
- @improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree",
46
- judge_version_id: @published_judge_version.id).count
47
- @versions = JudgeVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
43
+ @edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
44
+ @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
45
+ @improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
46
+ @versions = MetricVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
48
47
  end
49
48
 
50
49
  def new
@@ -52,9 +51,14 @@ module CompletionKit
52
51
  end
53
52
 
54
53
  def edit
55
- @suggestion_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
56
- @edit_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
57
- @published_judge_version = JudgeVersion.published.where(metric_id: @metric.id, current: true).first
54
+ @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
55
+ @edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
56
+ @published_metric_version = MetricVersion.published.where(metric_id: @metric.id, current: true).first
57
+
58
+ if @edit_draft
59
+ @metric.instruction = @edit_draft.instruction
60
+ @metric.rubric_bands = @edit_draft.rubric_bands
61
+ end
58
62
  end
59
63
 
60
64
  def create
@@ -68,10 +72,42 @@ module CompletionKit
68
72
  end
69
73
 
70
74
  def update
71
- if @metric.update(metric_params)
72
- redirect_to metric_path(@metric), notice: "Metric was successfully updated."
75
+ judge_keys = %i[instruction rubric_bands]
76
+ meta_attrs = metric_params.except(*judge_keys)
77
+ proposed_instruction = metric_params[:instruction]
78
+ proposed_rubric = metric_params[:rubric_bands]
79
+
80
+ unless @metric.update(meta_attrs)
81
+ return render(:edit, status: :unprocessable_entity)
82
+ end
83
+
84
+ current_instruction = @metric.instruction.to_s
85
+ current_rubric = @metric.rubric_bands || []
86
+ normalized_proposed_rubric = normalize_rubric_bands_for_update(proposed_rubric)
87
+
88
+ instruction_changed = !proposed_instruction.nil? && proposed_instruction.to_s != current_instruction
89
+ rubric_changed = !normalized_proposed_rubric.nil? && normalized_proposed_rubric != current_rubric
90
+
91
+ unless instruction_changed || rubric_changed
92
+ return redirect_to(metric_path(@metric), notice: "Metric was successfully updated.")
93
+ end
94
+
95
+ new_instruction = instruction_changed ? proposed_instruction.to_s : current_instruction
96
+ new_rubric = rubric_changed ? normalized_proposed_rubric : current_rubric
97
+
98
+ if @metric.reviews.exists?
99
+ MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").destroy_all
100
+ draft = MetricVersion.create!(
101
+ metric: @metric, instruction: new_instruction, rubric_bands: new_rubric,
102
+ state: "draft", source: "edit", current: false
103
+ )
104
+ redirect_to edit_metric_path(@metric),
105
+ notice: "Saved as draft #{draft.version_label}. Publish to push these changes to the live judge."
73
106
  else
74
- render :edit, status: :unprocessable_entity
107
+ @metric.update!(instruction: new_instruction, rubric_bands: new_rubric)
108
+ current_pub = MetricVersion.published.where(metric_id: @metric.id, current: true).first
109
+ current_pub&.update!(instruction: @metric.instruction, rubric_bands: @metric.rubric_bands)
110
+ redirect_to metric_path(@metric), notice: "Metric was successfully updated."
75
111
  end
76
112
  end
77
113
 
@@ -88,9 +124,9 @@ module CompletionKit
88
124
  return
89
125
  end
90
126
 
91
- JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
127
+ MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
92
128
 
93
- generator = JudgeVariantGenerator.new(@metric, count: 1)
129
+ generator = MetricVariantGenerator.new(@metric, count: 1)
94
130
  variants = generator.call
95
131
  if variants.empty?
96
132
  redirect_to target, alert: "The model returned no usable variants. Try again with a different model."
@@ -101,18 +137,18 @@ module CompletionKit
101
137
  end
102
138
 
103
139
  def dismiss_suggestion
104
- draft = JudgeVersion.drafts.where(metric_id: @metric.id).find_by(id: params[:draft_id])
140
+ draft = MetricVersion.drafts.where(metric_id: @metric.id).find_by(id: params[:draft_id])
105
141
  draft&.destroy
106
142
  target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
107
143
  redirect_to target, notice: "Dismissed."
108
144
  end
109
145
 
110
146
  def publish_draft
111
- scope = JudgeVersion.where(metric_id: @metric.id)
147
+ scope = MetricVersion.where(metric_id: @metric.id)
112
148
  version = if params[:draft_id].present?
113
149
  scope.find_by(id: params[:draft_id])
114
150
  else
115
- JudgeVersion.drafts.where(metric_id: @metric.id).order(created_at: :desc).first
151
+ MetricVersion.drafts.where(metric_id: @metric.id).order(created_at: :desc).first
116
152
  end
117
153
 
118
154
  if version.nil?
@@ -120,9 +156,20 @@ module CompletionKit
120
156
  return
121
157
  end
122
158
 
159
+ was_published_already = version.published?
160
+ reverting = was_published_already && !version.current?
161
+ previously_current = MetricVersion.current.find_by(metric_id: @metric.id)
162
+
123
163
  version.publish!
124
- redirect_to metric_path(@metric),
125
- notice: "#{@metric.name} #{version.version_label} is now the published version."
164
+
165
+ if reverting
166
+ prior_label = previously_current.version_label
167
+ redirect_to metric_path(@metric),
168
+ notice: "Reverted to #{@metric.name} #{version.version_label}. Pinned cases still flow to the judge, and calibration verdicts collected against #{prior_label} stay tied to it."
169
+ else
170
+ redirect_to metric_path(@metric),
171
+ notice: "#{@metric.name} #{version.version_label} is now the published version."
172
+ end
126
173
  end
127
174
 
128
175
  def add_few_shot
@@ -160,5 +207,14 @@ module CompletionKit
160
207
  params.require(:metric).permit(:name, :instruction,
161
208
  rubric_bands: [:stars, :description], tag_names: [])
162
209
  end
210
+
211
+ def normalize_rubric_bands_for_update(bands)
212
+ return nil if bands.nil?
213
+ array = bands.is_a?(ActionController::Parameters) ? bands.to_unsafe_h.values : bands
214
+ Array(array).map do |b|
215
+ h = b.respond_to?(:to_unsafe_h) ? b.to_unsafe_h : b
216
+ { "stars" => h["stars"].to_i, "description" => h["description"].to_s }
217
+ end.sort_by { |b| -b["stars"] }
218
+ end
163
219
  end
164
220
  end
@@ -1,7 +1,7 @@
1
1
  module CompletionKit
2
2
  class RunsController < ApplicationController
3
3
  include CompletionKit::TagFiltering
4
- before_action :set_run, only: [:show, :edit, :update, :destroy, :generate, :suggest, :retry_failures, :rerun, :refresh_status]
4
+ before_action :set_run, only: [:show, :edit, :update, :destroy, :generate, :suggest, :retry_failures, :rerun, :regrade, :refresh_status, :compare]
5
5
  before_action :load_form_collections, only: [:new, :edit, :create, :update]
6
6
 
7
7
  def index
@@ -78,6 +78,29 @@ module CompletionKit
78
78
  end
79
79
  end
80
80
 
81
+ def compare
82
+ other_id = params[:with]
83
+ if other_id.blank?
84
+ @other_runs = Run.where(dataset_id: @run.dataset_id, prompt_id: @run.prompt_id)
85
+ .where.not(id: @run.id)
86
+ .order(created_at: :desc)
87
+ .limit(50)
88
+ return render(:compare_picker)
89
+ end
90
+
91
+ @other_run = Run.find(other_id)
92
+ @comparison = build_run_comparison(@run, @other_run)
93
+ render(:compare)
94
+ end
95
+
96
+ def regrade
97
+ if @run.regrade!
98
+ redirect_to run_path(@run), notice: "Re-grading existing responses with the current judge."
99
+ else
100
+ redirect_to run_path(@run), alert: "Nothing to re-grade. The run has no succeeded responses or no metrics attached."
101
+ end
102
+ end
103
+
81
104
  def rerun
82
105
  new_run = Run.create!(
83
106
  prompt_id: @run.prompt_id,
@@ -126,6 +149,12 @@ module CompletionKit
126
149
  end
127
150
 
128
151
  def retry_failures
152
+ if @run.stale_review_summary.any?
153
+ redirect_to run_path(@run),
154
+ alert: "The judge has changed since this run executed. Retrying failed cases would mix scores from two metric versions in the same run. Use 'Re-run with current judge' to refresh everything against the live judge."
155
+ return
156
+ end
157
+
129
158
  scope = @run.responses.where(status: "failed")
130
159
  scope = scope.where(id: params[:only]) if params[:only].present?
131
160
 
@@ -157,6 +186,45 @@ module CompletionKit
157
186
  @run = Run.find(params[:id])
158
187
  end
159
188
 
189
+ def build_run_comparison(left, right)
190
+ left_responses = left.responses.includes(:reviews).order(:row_index, :id)
191
+ right_responses = right.responses.includes(:reviews).order(:row_index, :id)
192
+ right_by_input = right_responses.each_with_object({}) { |r, h| h[r.input_data.to_s] ||= r }
193
+
194
+ all_reviews = left_responses.flat_map(&:reviews) + right_responses.flat_map(&:reviews)
195
+ metric_ids = all_reviews.map(&:metric_id).compact.uniq
196
+ metric_versions = MetricVersion.where(id: all_reviews.map(&:metric_version_id).compact.uniq).index_by(&:id)
197
+
198
+ rows = left_responses.map do |lr|
199
+ rr = right_by_input[lr.input_data.to_s]
200
+ {
201
+ left_response: lr,
202
+ right_response: rr,
203
+ per_metric: metric_ids.map do |mid|
204
+ l_review = lr.reviews.find { |r| r.metric_id == mid }
205
+ r_review = rr && rr.reviews.find { |r| r.metric_id == mid }
206
+ next nil if l_review.nil? && r_review.nil?
207
+ anchor = l_review || r_review
208
+ {
209
+ metric_id: mid,
210
+ metric_name: anchor.metric_name,
211
+ left_score: l_review ? l_review.ai_score : nil,
212
+ right_score: r_review ? r_review.ai_score : nil,
213
+ left_version_label: version_label_for(l_review, metric_versions),
214
+ right_version_label: version_label_for(r_review, metric_versions),
215
+ delta: (l_review&.ai_score && r_review&.ai_score) ? (r_review.ai_score.to_f - l_review.ai_score.to_f).round(2) : nil
216
+ }
217
+ end.compact
218
+ }
219
+ end
220
+ { rows: rows, metric_ids: metric_ids }
221
+ end
222
+
223
+ def version_label_for(review, metric_versions)
224
+ return nil if review.nil? || review.metric_version_id.nil?
225
+ metric_versions[review.metric_version_id]&.version_label
226
+ end
227
+
160
228
  def load_form_collections
161
229
  @prompts = Prompt.order(:name)
162
230
  @datasets = Dataset.order(:name)
@@ -62,9 +62,11 @@ module CompletionKit
62
62
  )
63
63
 
64
64
  review = response.reviews.find_or_initialize_by(metric_id: metric.id)
65
+ current_metric_version = MetricVersion.ensure_current_for(metric)
65
66
  review.assign_attributes(
66
67
  metric_name: metric.name,
67
68
  instruction: metric.instruction.to_s,
69
+ metric_version_id: current_metric_version.id,
68
70
  status: "succeeded",
69
71
  ai_score: evaluation[:score],
70
72
  ai_feedback: evaluation[:feedback],
@@ -122,6 +124,7 @@ module CompletionKit
122
124
  end
123
125
 
124
126
  def few_shot_payload(metric)
127
+ return nil unless CompletionKit.config.judge_calibration_enabled
125
128
  Array(metric.few_shot_examples).map do |fs|
126
129
  {
127
130
  human_score: fs["human_score"],
@@ -5,7 +5,7 @@ module CompletionKit
5
5
  belongs_to :run
6
6
  belongs_to :response
7
7
  belongs_to :metric
8
- belongs_to :judge_version
8
+ belongs_to :metric_version
9
9
 
10
10
  validates :verdict, presence: true, inclusion: { in: VERDICTS }
11
11
  validates :response_id,
@@ -22,7 +22,7 @@ module CompletionKit
22
22
  run_id: run_id,
23
23
  response_id: response_id,
24
24
  metric_id: metric_id,
25
- judge_version_id: judge_version_id,
25
+ metric_version_id: metric_version_id,
26
26
  verdict: verdict,
27
27
  corrected_score: corrected_score,
28
28
  note: note,
@@ -24,7 +24,6 @@ module CompletionKit
24
24
  before_validation :generate_key
25
25
  before_validation :normalize_rubric_bands
26
26
  before_validation :set_defaults
27
- after_update :fork_draft_judge_version, if: :judge_relevant_changes?
28
27
 
29
28
  def self.default_rubric_bands
30
29
  DEFAULT_RUBRIC_BANDS.map(&:dup)
@@ -98,21 +97,5 @@ module CompletionKit
98
97
  self.rubric_bands = self.class.normalize_rubric_bands(rubric_bands) if rubric_bands.present?
99
98
  end
100
99
 
101
- def judge_relevant_changes?
102
- saved_change_to_instruction? || saved_change_to_rubric_bands?
103
- end
104
-
105
- def fork_draft_judge_version
106
- JudgeVersion.ensure_current_for(self)
107
- JudgeVersion.where(metric_id: id, state: "draft").update_all(current: false)
108
- JudgeVersion.create!(
109
- metric: self,
110
- instruction: instruction,
111
- rubric_bands: rubric_bands,
112
- current: false,
113
- state: "draft",
114
- source: "edit"
115
- )
116
- end
117
100
  end
118
101
  end
@@ -1,5 +1,5 @@
1
1
  module CompletionKit
2
- class JudgeVersion < ApplicationRecord
2
+ class MetricVersion < ApplicationRecord
3
3
  STATES = %w[draft published].freeze
4
4
 
5
5
  belongs_to :metric
@@ -41,7 +41,7 @@ module CompletionKit
41
41
  end
42
42
 
43
43
  def publish!
44
- JudgeVersion.transaction do
44
+ MetricVersion.transaction do
45
45
  self.class.where(metric_id: metric_id).where.not(id: id).update_all(current: false)
46
46
  reload
47
47
  update!(state: "published", current: true, published_at: published_at || Time.current)
@@ -76,4 +76,5 @@ module CompletionKit
76
76
  self.version_number = max + 1
77
77
  end
78
78
  end
79
+
79
80
  end
@@ -5,8 +5,16 @@ module CompletionKit
5
5
 
6
6
  belongs_to :response
7
7
  belongs_to :metric, optional: true
8
+ belongs_to :metric_version, optional: true
8
9
  has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
9
10
 
11
+ def stale_against_current_judge?
12
+ return false unless metric_id && metric_version_id
13
+ current_id = MetricVersion.current.where(metric_id: metric_id).limit(1).pick(:id)
14
+ return false if current_id.nil?
15
+ metric_version_id != current_id
16
+ end
17
+
10
18
  validates :metric_name, presence: true
11
19
  validates :status, inclusion: { in: STATUSES }
12
20
  validates :ai_score, numericality: { greater_than_or_equal_to: 1, less_than_or_equal_to: 5 }, allow_nil: true
@@ -29,6 +37,7 @@ module CompletionKit
29
37
  def as_json(options = {})
30
38
  {
31
39
  id: id, response_id: response_id, metric_id: metric_id,
40
+ metric_version_id: metric_version_id,
32
41
  metric_name: metric_name, ai_score: ai_score,
33
42
  ai_feedback: ai_feedback, status: status, attempts: attempts,
34
43
  error: error_payload
@@ -89,6 +89,34 @@ module CompletionKit
89
89
  end
90
90
  end
91
91
 
92
+ def stale_review_summary
93
+ review_pairs = Review.where(response_id: response_ids)
94
+ .where.not(metric_id: nil)
95
+ .where.not(metric_version_id: nil)
96
+ .pluck(:metric_id, :metric_version_id, :metric_name)
97
+ return {} if review_pairs.empty?
98
+
99
+ metric_ids = review_pairs.map(&:first).uniq
100
+ version_ids = review_pairs.map { |_, vid, _| vid }.uniq
101
+ current_by_metric = MetricVersion.current.where(metric_id: metric_ids).pluck(:metric_id, :id, :version_number).each_with_object({}) do |(mid, vid, vnum), h|
102
+ h[mid] = { id: vid, label: "v#{vnum}" }
103
+ end
104
+ label_by_version = MetricVersion.where(id: version_ids).pluck(:id, :version_number).each_with_object({}) { |(vid, vnum), h| h[vid] = "v#{vnum}" }
105
+
106
+ summary = {}
107
+ review_pairs.each do |metric_id, version_id, metric_name|
108
+ current = current_by_metric[metric_id]
109
+ next if current.nil?
110
+ next if version_id == current[:id]
111
+ label = label_by_version[version_id]
112
+ next if label.nil?
113
+ summary[metric_id] ||= { metric_name: metric_name, current_label: current[:label], stale_count: 0, scored_labels: [] }
114
+ summary[metric_id][:stale_count] += 1
115
+ summary[metric_id][:scored_labels] |= [label]
116
+ end
117
+ summary
118
+ end
119
+
92
120
  def start!
93
121
  rows = if dataset
94
122
  CsvProcessor.process_self(self)
@@ -151,6 +179,38 @@ module CompletionKit
151
179
  start!
152
180
  end
153
181
 
182
+ def regrade!
183
+ grading_metrics = metrics
184
+ return false if grading_metrics.empty? || !judge_configured?
185
+
186
+ eligible_responses = responses.where(status: "succeeded").where.not(response_text: nil)
187
+ response_ids = eligible_responses.pluck(:id)
188
+ return false if response_ids.empty?
189
+
190
+ transaction do
191
+ Review.where(response_id: response_ids).update_all(
192
+ status: "pending",
193
+ attempts: 0,
194
+ metric_version_id: nil,
195
+ ai_score: nil,
196
+ ai_feedback: nil,
197
+ error_provider: nil,
198
+ error_class: nil,
199
+ error_status: nil,
200
+ error_message: nil
201
+ )
202
+ update!(status: "running", failure_summary: nil, error_message: nil)
203
+
204
+ response_ids.each do |rid|
205
+ grading_metrics.each { |m| JudgeReviewJob.perform_later(rid, m.id) }
206
+ end
207
+ RunCompletionCheckJob.perform_later(id)
208
+ end
209
+
210
+ broadcast_ui
211
+ true
212
+ end
213
+
154
214
  def progress_snapshot
155
215
  generated_done = responses.where(status: "succeeded").count
156
216
  generated_failed = responses.where(status: "failed").count
@@ -56,7 +56,7 @@ module CompletionKit
56
56
  run_id: run.id, response_id: response.id, metric_id: metric.id, created_by: created_by
57
57
  )
58
58
  calibration.assign_attributes(
59
- judge_version: CompletionKit::JudgeVersion.ensure_current_for(metric),
59
+ metric_version: CompletionKit::MetricVersion.ensure_current_for(metric),
60
60
  verdict: args["verdict"],
61
61
  corrected_score: args["corrected_score"],
62
62
  note: args["note"]
@@ -5,7 +5,7 @@ module CompletionKit
5
5
 
6
6
  TOOLS = {
7
7
  "judges_suggest" => {
8
- description: "Ask the model to rewrite the metric's judge instruction in N variants targeted at the recent disagreements. Each variant is saved as a draft JudgeVersion with source=\"suggestion\". Returns the persisted drafts. Stripe-metering hooks fire via ActiveSupport::Notifications under completion_kit.judge_suggestion.generated.",
8
+ description: "Ask the model to rewrite the metric's judge instruction in N variants targeted at the recent disagreements. Each variant is saved as a draft MetricVersion with source=\"suggestion\". Returns the persisted drafts. Stripe-metering hooks fire via ActiveSupport::Notifications under completion_kit.judge_suggestion.generated.",
9
9
  inputSchema: {
10
10
  type: "object",
11
11
  properties: {
@@ -33,15 +33,15 @@ module CompletionKit
33
33
  handler: :replay
34
34
  },
35
35
  "judges_compare" => {
36
- description: "Compare two judge versions' calibration stats side by side. Pass either two judge_version_ids or one metric_id with judge_version_a_id / judge_version_b_id.",
36
+ description: "Compare two metric versions' calibration stats side by side. Pass either two metric_version_ids or one metric_id with metric_version_a_id / metric_version_b_id.",
37
37
  inputSchema: {
38
38
  type: "object",
39
39
  properties: {
40
40
  metric_id: { type: "integer" },
41
- judge_version_a_id: { type: "integer" },
42
- judge_version_b_id: { type: "integer" }
41
+ metric_version_a_id: { type: "integer" },
42
+ metric_version_b_id: { type: "integer" }
43
43
  },
44
- required: ["metric_id", "judge_version_a_id", "judge_version_b_id"]
44
+ required: ["metric_id", "metric_version_a_id", "metric_version_b_id"]
45
45
  },
46
46
  handler: :compare
47
47
  }
@@ -49,7 +49,7 @@ module CompletionKit
49
49
 
50
50
  def self.suggest(args)
51
51
  metric = CompletionKit::Metric.find(args["metric_id"])
52
- generator = CompletionKit::JudgeVariantGenerator.new(metric, count: args["count"].to_i, model: args["model"])
52
+ generator = CompletionKit::MetricVariantGenerator.new(metric, count: args["count"].to_i, model: args["model"])
53
53
  variants = generator.call
54
54
  return error_result("Variant generator returned no parseable variants. Try again or change the model.") if variants.empty?
55
55
  versions = generator.persist!(variants)
@@ -75,20 +75,20 @@ module CompletionKit
75
75
 
76
76
  def self.compare(args)
77
77
  metric = CompletionKit::Metric.find(args["metric_id"])
78
- a = CompletionKit::JudgeVersion.where(metric_id: metric.id).find(args["judge_version_a_id"])
79
- b = CompletionKit::JudgeVersion.where(metric_id: metric.id).find(args["judge_version_b_id"])
80
- stats_a = CompletionKit::MetricCalibrationStats.for(metric, judge_version: a)
81
- stats_b = CompletionKit::MetricCalibrationStats.for(metric, judge_version: b)
78
+ a = CompletionKit::MetricVersion.where(metric_id: metric.id).find(args["metric_version_a_id"])
79
+ b = CompletionKit::MetricVersion.where(metric_id: metric.id).find(args["metric_version_b_id"])
80
+ stats_a = CompletionKit::MetricCalibrationStats.for(metric, metric_version: a)
81
+ stats_b = CompletionKit::MetricCalibrationStats.for(metric, metric_version: b)
82
82
  text_result({
83
83
  metric_id: metric.id,
84
- a: judge_version_payload(a, stats_a),
85
- b: judge_version_payload(b, stats_b),
84
+ a: metric_version_payload(a, stats_a),
85
+ b: metric_version_payload(b, stats_b),
86
86
  delta: delta_payload(stats_a, stats_b),
87
87
  recommendation: recommendation_for(stats_a, stats_b)
88
88
  })
89
89
  end
90
90
 
91
- def self.judge_version_payload(version, stats)
91
+ def self.metric_version_payload(version, stats)
92
92
  {
93
93
  id: version.id, state: version.state, current: version.current,
94
94
  source: version.source, created_at: version.created_at,
@@ -33,25 +33,25 @@ module CompletionKit
33
33
 
34
34
  CURRENT = :current
35
35
 
36
- def self.for(metric, judge_version: CURRENT)
37
- resolved = case judge_version
38
- when CURRENT then JudgeVersion.current.find_by(metric_id: metric.id)
36
+ def self.for(metric, metric_version: CURRENT)
37
+ resolved = case metric_version
38
+ when CURRENT then MetricVersion.current.find_by(metric_id: metric.id)
39
39
  when nil then nil
40
- else judge_version
40
+ else metric_version
41
41
  end
42
- new(metric: metric, judge_version: resolved, all_versions: judge_version.nil?).call
42
+ new(metric: metric, metric_version: resolved, all_versions: metric_version.nil?).call
43
43
  end
44
44
 
45
- def initialize(metric:, judge_version: nil, all_versions: false)
45
+ def initialize(metric:, metric_version: nil, all_versions: false)
46
46
  @metric = metric
47
- @judge_version = judge_version
47
+ @metric_version = metric_version
48
48
  @all_versions = all_versions
49
49
  end
50
50
 
51
51
  def call
52
52
  scope = Calibration.where(metric_id: @metric.id)
53
- if @judge_version
54
- scope = scope.where(judge_version_id: @judge_version.id)
53
+ if @metric_version
54
+ scope = scope.where(metric_version_id: @metric_version.id)
55
55
  elsif !@all_versions
56
56
  scope = scope.none
57
57
  end
@@ -1,5 +1,5 @@
1
1
  module CompletionKit
2
- class JudgeVariantGenerator
2
+ class MetricVariantGenerator
3
3
  DEFAULT_VARIANT_COUNT = 1
4
4
  MAX_VARIANT_COUNT = 3
5
5
  DEFAULT_TEMPERATURE = 0.4
@@ -20,9 +20,9 @@ module CompletionKit
20
20
  end
21
21
 
22
22
  def persist!(variants)
23
- JudgeVersion.where(metric_id: @metric.id, state: "draft", source: "suggestion").update_all(current: false)
23
+ MetricVersion.where(metric_id: @metric.id, state: "draft", source: "suggestion").update_all(current: false)
24
24
  versions = variants.map do |variant|
25
- JudgeVersion.create!(
25
+ MetricVersion.create!(
26
26
  metric: @metric,
27
27
  instruction: variant.instruction,
28
28
  rubric_bands: variant.rubric_bands.presence || @metric.rubric_bands,
@@ -41,8 +41,9 @@ module CompletionKit
41
41
  private
42
42
 
43
43
  def build_meta_prompt
44
- disagreements = JudgeCalibrationExamples.disagreements_for(@metric)
45
- borderlines = JudgeCalibrationExamples.borderlines_for(@metric)
44
+ disagreements = MetricCalibrationExamples.disagreements_for(@metric)
45
+ borderlines = MetricCalibrationExamples.borderlines_for(@metric)
46
+ pinned_examples = Array(@metric.few_shot_examples)
46
47
  sections = []
47
48
  sections << "You are an expert evaluator. The judge below is misaligned with humans. Propose #{@count == 1 ? "a single" : "#{@count}"} concrete rewrite that closes the gap."
48
49
  sections << ""
@@ -77,6 +78,18 @@ module CompletionKit
77
78
  sections << ""
78
79
  end
79
80
  end
81
+ if pinned_examples.any?
82
+ sections << "## Pinned cases the judge already references"
83
+ sections << "These are cases the operator pinned for the judge to remember. The improved rubric must remain consistent with these — that is, the new instruction + rubric should produce roughly the human_score on these inputs, not the judge_score."
84
+ pinned_examples.each_with_index do |ex, i|
85
+ sections << "### Pinned #{i + 1}"
86
+ sections << "Input: #{ex["input"].to_s.truncate(200)}"
87
+ sections << "Output: #{ex["response"].to_s.truncate(200)}"
88
+ sections << "Judge previously said #{ex["judge_score"]}/5: #{ex["judge_feedback"].to_s.truncate(160)}"
89
+ sections << "Human said #{ex["human_score"]}/5: #{ex["human_note"].to_s.truncate(160)}"
90
+ sections << ""
91
+ end
92
+ end
80
93
  sections << "## Task"
81
94
  sections << "Make one substantive change. Don't just reword. If the disagreements look like instruction problems, rewrite the instruction. If they look like rubric problems (overlapping bands, undefined edge cases), rewrite the rubric. Rewrite both if both are wrong."
82
95
  sections << ""
@@ -117,7 +130,7 @@ module CompletionKit
117
130
  end
118
131
  end
119
132
 
120
- module JudgeCalibrationExamples
133
+ module MetricCalibrationExamples
121
134
  module_function
122
135
 
123
136
  def for(metric, limit: 8)
@@ -133,13 +146,14 @@ module CompletionKit
133
146
  end
134
147
 
135
148
  def calibrations_for(metric, verdict:, limit:)
136
- scope = Calibration.where(metric_id: metric.id, verdict: verdict)
137
- current_version = JudgeVersion.current.find_by(metric_id: metric.id)
138
- scope = scope.where(judge_version_id: current_version.id) if current_version
139
- scope.includes(response: :reviews)
140
- .order(created_at: :desc)
141
- .limit(limit)
142
- .map do |cal|
149
+ base = Calibration.where(metric_id: metric.id, verdict: verdict)
150
+ current_version = MetricVersion.current.find_by(metric_id: metric.id)
151
+ scoped = current_version ? base.where(metric_version_id: current_version.id) : base
152
+ effective = scoped.exists? ? scoped : base
153
+ effective.includes(response: :reviews)
154
+ .order(created_at: :desc)
155
+ .limit(limit)
156
+ .map do |cal|
143
157
  review = cal.response.reviews.find { |r| r.metric_id == metric.id }
144
158
  {
145
159
  input: cal.response.input_data,
@@ -3,19 +3,29 @@
3
3
  <% anchor = metric&.name&.parameterize %>
4
4
  <% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric
5
5
  created_by = CompletionKit.config.username.presence || "operator"
6
- verdicted_ids = CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by).pluck(:response_id)
6
+ current_metric_version = CompletionKit::MetricVersion.current.find_by(metric_id: metric.id)
7
+ verdicted_ids = if current_metric_version
8
+ CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
9
+ else
10
+ []
11
+ end
7
12
  CompletionKit::Response.joins(:reviews)
8
13
  .where(reviews: { metric_id: metric.id })
9
14
  .where.not(reviews: { ai_score: nil })
10
15
  .where.not(id: verdicted_ids)
11
16
  .order(created_at: :desc).first
12
17
  end %>
18
+ <% prior_version_verdicts = if stats.sample_size.zero? && metric && current_metric_version
19
+ CompletionKit::Calibration.where(metric_id: metric.id).where.not(metric_version_id: current_metric_version.id).count
20
+ else
21
+ 0
22
+ end %>
13
23
 
14
24
  <p class="ck-trust-line ck-trust-line--<%= stats.gate %>">
15
25
  <span class="ck-trust-line__label"><%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true" %>Calibration</span>
16
26
  <% if stats.sample_size.zero? %>
17
27
  <span class="ck-trust-line__state">Not measured yet.</span>
18
- <span class="ck-trust-line__hint">Needs <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts on the judge's scores.<% if target_response %>
28
+ <span class="ck-trust-line__hint">Needs <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts on the judge's scores.<% if prior_version_verdicts > 0 %> (<%= pluralize(prior_version_verdicts, "verdict") %> on prior versions, tied to that version's history.)<% end %><% if target_response %>
19
29
  <%= link_to "Give a verdict →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-trust-line__link" %>
20
30
  <% else %> Run this metric on a dataset, then give a verdict.<% end %></span>
21
31
  <% elsif stats.counter_only? %>
@@ -16,14 +16,14 @@
16
16
  </div>
17
17
  <% end %>
18
18
 
19
- <% if edit_draft && !suggestion %>
20
- <% pub = local_assigns[:published_judge_version] %>
19
+ <% if edit_draft %>
20
+ <% pub = local_assigns[:published_metric_version] %>
21
21
  <% draft_instr_changed = pub && pub.instruction.to_s != edit_draft.instruction.to_s %>
22
22
  <% draft_rubric_changed = pub && pub.rubric_bands != edit_draft.rubric_bands %>
23
23
  <div class="ck-suggestion-banner" role="status">
24
24
  <div class="ck-suggestion-banner__body">
25
25
  <p class="ck-kicker">Draft pending</p>
26
- <p class="ck-meta-copy">An unpublished draft of this metric is saved. Publish to replace the live<%= " instruction" if draft_instr_changed %><%= " and" if draft_instr_changed && draft_rubric_changed %><%= " rubric" if draft_rubric_changed %> for future runs, or keep editing.</p>
26
+ <p class="ck-meta-copy">The form below shows your unpublished draft. Publish to replace the live<%= " instruction" if draft_instr_changed %><%= " and" if draft_instr_changed && draft_rubric_changed %><%= " rubric" if draft_rubric_changed %> for future runs, or keep editing.</p>
27
27
  </div>
28
28
  <div class="ck-suggestion-banner__actions">
29
29
  <%= button_to "Discard draft", dismiss_suggestion_metric_path(metric, draft_id: edit_draft.id, back_to: "edit"),
@@ -14,4 +14,4 @@
14
14
  metric: @metric,
15
15
  suggestion_draft: @suggestion_draft,
16
16
  edit_draft: @edit_draft,
17
- published_judge_version: @published_judge_version %>
17
+ published_metric_version: @published_metric_version %>
@@ -19,20 +19,17 @@
19
19
  </div>
20
20
  <div class="ck-actions">
21
21
  <% if CompletionKit.config.judge_calibration_enabled %>
22
- <% if @suggestion_draft %>
23
- <%= link_to "Review improvements ", edit_metric_path(@metric),
22
+ <% if @suggestion_draft || @edit_draft %>
23
+ <% review_title = @suggestion_draft ? "The model proposed improvements based on your disagreements. Review and apply what you want." : "An unpublished draft of this metric is saved. Review and publish, or keep editing." %>
24
+ <%= link_to "Review changes →", edit_metric_path(@metric),
24
25
  class: ck_button_classes(:dark),
25
- title: "The model proposed improvements based on your disagreements. Review and apply what you want." %>
26
- <% elsif @edit_draft %>
27
- <%= link_to "Review draft →", edit_metric_path(@metric),
28
- class: ck_button_classes(:dark),
29
- title: "An unpublished draft of this metric is saved. Review and publish, or keep editing." %>
26
+ title: review_title %>
30
27
  <% elsif @improve_disagreement_count.positive? %>
31
28
  <%= button_to "Improve the metric", suggest_variants_metric_path(@metric),
32
29
  method: :post, form_class: "inline-block",
33
30
  class: ck_button_classes(:light, variant: :outline),
34
- title: "Have the model rewrite this metric's instruction and rubric based on the disagreements collected so far.",
35
- data: { turbo_confirm: "Rewrite this metric based on the disagreements collected so far?" } %>
31
+ title: "Ask the model to suggest improvements to this metric's instruction and rubric based on the disagreements collected so far.",
32
+ data: { turbo_confirm: "Ask the model for suggested improvements based on the disagreements collected so far?" } %>
36
33
  <% else %>
37
34
  <button type="button" class="<%= ck_button_classes(:light, variant: :outline) %>" disabled
38
35
  title="Mark at least one case as Disagree before the model can suggest a change.">Improve the metric</button>
@@ -168,19 +165,20 @@
168
165
  <p class="ck-kicker">Cases to learn from</p>
169
166
  <span class="ck-chip"><%= pluralize(@disagreements.size, "case") %></span>
170
167
  </div>
171
- <p class="ck-meta-copy">Cases where a reviewer's score didn't match the judge's. Pin useful ones with <strong>Remember this</strong> so the judge sees them next time it grades.</p>
168
+ <% mixed_versions = @disagreements.any? { |c| c.metric_version_id != @published_metric_version.id } %>
169
+ <p class="ck-meta-copy">Cases where a reviewer's score didn't match the judge's. Pin useful ones with <strong>Remember this</strong> so the judge sees them next time it grades<%= " (pins flow into the current judge regardless of which version produced the verdict)" if mixed_versions %>.</p>
172
170
  <% existing_ids = Array(@metric.few_shot_examples).map { |fs| fs["calibration_id"] } %>
173
171
  <ul class="ck-disagreement-list">
174
172
  <% @disagreements.each do |cal| %>
175
173
  <% review = cal.response.reviews.find { |r| r.metric_id == @metric.id } %>
176
174
  <% already = existing_ids.include?(cal.id) %>
177
- <% cal_version = cal.judge_version %>
178
- <% on_current = cal_version&.id == @published_judge_version.id %>
175
+ <% cal_metric_version = cal.metric_version %>
176
+ <% on_current = cal_metric_version&.id == @published_metric_version.id %>
179
177
  <li class="ck-disagreement<%= " ck-disagreement--remembered" if already %><%= " ck-disagreement--stale" unless on_current %>">
180
178
  <div class="ck-disagreement__head">
181
179
  <div class="ck-disagreement__scores">
182
- <% if cal_version %>
183
- <span class="ck-source-chip <%= on_current ? "ck-source-chip--current" : "ck-source-chip--past" %>" title="<%= on_current ? "Verdict on the live judge version." : "Verdict on a superseded judge version." %>"><%= cal_version.version_label %></span>
180
+ <% if cal_metric_version && mixed_versions %>
181
+ <span class="ck-source-chip <%= on_current ? "ck-source-chip--current" : "ck-source-chip--past" %>" title="<%= on_current ? "Verdict on the live judge version." : "Verdict on a superseded judge version." %>"><%= cal_metric_version.version_label %></span>
184
182
  <% end %>
185
183
  <span class="ck-disagreement__scores-label">Judge</span>
186
184
  <% if review&.ai_score %>
@@ -98,10 +98,15 @@
98
98
 
99
99
  <div class="ck-review-list">
100
100
  <% @reviews.each do |review| %>
101
- <div class="ck-review-card" id="<%= review.metric&.name&.parameterize || "review-#{review.id}" %>">
101
+ <% review_version = review.metric_version %>
102
+ <% stale = review.stale_against_current_judge? %>
103
+ <div class="ck-review-card<%= " ck-review-card--stale" if stale %>" id="<%= review.metric&.name&.parameterize || "review-#{review.id}" %>">
102
104
  <div class="ck-review-card__header">
103
105
  <span class="ck-review-card__metric"><% if review.metric %><%= link_to review.metric_name, metric_path(review.metric), class: "ck-link" %><% else %><%= review.metric_name %><% end %></span>
104
106
  <div class="ck-inline">
107
+ <% if review_version %>
108
+ <span class="ck-source-chip <%= stale ? "ck-source-chip--past" : "ck-source-chip--current" %>" title="<%= stale ? "Score produced by #{review_version.version_label} of this metric. The live judge has changed since." : "Score produced by the live judge (#{review_version.version_label})." %>"><%= review_version.version_label %></span>
109
+ <% end %>
105
110
  <% if review.ai_score %>
106
111
  <% 5.times do |i| %>
107
112
  <svg viewBox="0 0 24 24" width="16" height="16" stroke-width="1.75" class="ck-star <%= i < review.ai_score.to_i ? "ck-star--filled" : "ck-star--empty" %>"><polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2"/></svg>
@@ -111,6 +116,9 @@
111
116
  <% end %>
112
117
  </div>
113
118
  </div>
119
+ <% if stale %>
120
+ <p class="ck-review-card__stale-note">Scored against a superseded version of this metric. The live judge may score this differently.</p>
121
+ <% end %>
114
122
  <% if review.ai_feedback.present? %>
115
123
  <p class="ck-review-card__feedback"><%= review.ai_feedback %></p>
116
124
  <% end %>
@@ -11,6 +11,7 @@
11
11
  <%= button_to "Retry", generate_run_path(run), method: :post, class: ck_button_classes(:light, variant: :outline), form_class: "inline-block" %>
12
12
  <%= button_to "Re-run as new", rerun_run_path(run), method: :post, class: ck_button_classes(:dark), form_class: "inline-block" %>
13
13
  <% elsif run.status == "completed" %>
14
+ <%= link_to "Compare", compare_run_path(run), class: ck_button_classes(:light, variant: :outline) %>
14
15
  <%= button_to "Re-run", rerun_run_path(run), method: :post, class: ck_button_classes(:dark), form_class: "inline-block" %>
15
16
  <% end %>
16
17
  <% end %>
@@ -0,0 +1,85 @@
1
+ <ol class="ck-breadcrumb">
2
+ <li><%= link_to "Runs", runs_path %></li>
3
+ <li><%= link_to @run.name, run_path(@run) %></li>
4
+ <li>vs <%= @other_run.name %></li>
5
+ </ol>
6
+
7
+ <section class="ck-page-header">
8
+ <div>
9
+ <h1 class="ck-title">Comparing runs</h1>
10
+ <p class="ck-meta-copy"><strong>A</strong>: <%= link_to @run.name, run_path(@run), class: "ck-link" %> &middot; <strong>B</strong>: <%= link_to @other_run.name, run_path(@other_run), class: "ck-link" %></p>
11
+ </div>
12
+ <div class="ck-actions">
13
+ <%= link_to "Pick another", compare_run_path(@run), class: ck_button_classes(:light, variant: :outline) %>
14
+ </div>
15
+ </section>
16
+
17
+ <% if @comparison[:rows].empty? %>
18
+ <div class="ck-empty">
19
+ <p>No responses to compare yet.</p>
20
+ </div>
21
+ <% else %>
22
+ <table class="ck-results-table ck-run-compare-table">
23
+ <thead>
24
+ <tr>
25
+ <th scope="col">Case</th>
26
+ <th scope="col">Metric</th>
27
+ <th scope="col">A score</th>
28
+ <th scope="col">B score</th>
29
+ <th scope="col">Δ</th>
30
+ <th scope="col">A version</th>
31
+ <th scope="col">B version</th>
32
+ </tr>
33
+ </thead>
34
+ <tbody>
35
+ <% @comparison[:rows].each do |row| %>
36
+ <% case_label = ((row[:left_response].row_index || 0) + 1).to_s %>
37
+ <% row[:per_metric].each_with_index do |pm, idx| %>
38
+ <tr>
39
+ <% if idx == 0 %>
40
+ <td rowspan="<%= row[:per_metric].size %>">
41
+ <%= link_to case_label, run_response_path(@run, row[:left_response]), class: "ck-link" %>
42
+ <% if row[:right_response] %>
43
+ <span class="ck-meta-copy">/ <%= link_to "B", run_response_path(@other_run, row[:right_response]), class: "ck-link" %></span>
44
+ <% end %>
45
+ </td>
46
+ <% end %>
47
+ <td><%= pm[:metric_name] %></td>
48
+ <td>
49
+ <% if pm[:left_score] %>
50
+ <span class="<%= ck_badge_classes(ck_score_kind(pm[:left_score].to_f)) %>"><%= pm[:left_score] %></span>
51
+ <% else %>
52
+ <span class="ck-meta-copy">—</span>
53
+ <% end %>
54
+ </td>
55
+ <td>
56
+ <% if pm[:right_score] %>
57
+ <span class="<%= ck_badge_classes(ck_score_kind(pm[:right_score].to_f)) %>"><%= pm[:right_score] %></span>
58
+ <% else %>
59
+ <span class="ck-meta-copy">—</span>
60
+ <% end %>
61
+ </td>
62
+ <td>
63
+ <% if pm[:delta] %>
64
+ <% delta_class = pm[:delta] > 0 ? "ck-delta--positive" : pm[:delta] < 0 ? "ck-delta--negative" : "ck-delta--zero" %>
65
+ <span class="ck-delta <%= delta_class %>"><%= pm[:delta].positive? ? "+#{pm[:delta]}" : pm[:delta].to_s %></span>
66
+ <% else %>
67
+ <span class="ck-meta-copy">—</span>
68
+ <% end %>
69
+ </td>
70
+ <td>
71
+ <% if pm[:left_version_label] %>
72
+ <span class="ck-source-chip ck-source-chip--current"><%= pm[:left_version_label] %></span>
73
+ <% end %>
74
+ </td>
75
+ <td>
76
+ <% if pm[:right_version_label] %>
77
+ <span class="ck-source-chip ck-source-chip--current"><%= pm[:right_version_label] %></span>
78
+ <% end %>
79
+ </td>
80
+ </tr>
81
+ <% end %>
82
+ <% end %>
83
+ </tbody>
84
+ </table>
85
+ <% end %>
@@ -0,0 +1,39 @@
1
+ <ol class="ck-breadcrumb">
2
+ <li><%= link_to "Runs", runs_path %></li>
3
+ <li><%= link_to @run.name, run_path(@run) %></li>
4
+ <li>Compare</li>
5
+ </ol>
6
+
7
+ <section class="ck-page-header">
8
+ <div>
9
+ <h1 class="ck-title">Compare with another run</h1>
10
+ <p class="ck-lead">Pick a run on the same dataset and prompt to see per-case score deltas side by side.</p>
11
+ </div>
12
+ </section>
13
+
14
+ <% if @other_runs.any? %>
15
+ <table class="ck-results-table">
16
+ <thead>
17
+ <tr>
18
+ <th scope="col">Run</th>
19
+ <th scope="col">Judge</th>
20
+ <th scope="col">Created</th>
21
+ <th scope="col"></th>
22
+ </tr>
23
+ </thead>
24
+ <tbody>
25
+ <% @other_runs.each do |other| %>
26
+ <tr>
27
+ <td><%= link_to other.name, run_path(other), class: "ck-link" %></td>
28
+ <td class="ck-meta-copy"><%= other.judge_model %></td>
29
+ <td class="ck-meta-copy"><time datetime="<%= other.created_at.utc.iso8601 %>"><%= time_ago_in_words(other.created_at) %> ago</time></td>
30
+ <td class="ck-results-table__arrow"><%= link_to "Compare →", compare_run_path(@run, with: other.id), class: "ck-link" %></td>
31
+ </tr>
32
+ <% end %>
33
+ </tbody>
34
+ </table>
35
+ <% else %>
36
+ <div class="ck-empty">
37
+ <%# rerun is a POST-only member route. data-turbo-method makes Turbo submit the link as POST; method: :post is kept so hosts still on rails-ujs behave as before. %><p>No other runs on this dataset + prompt combination yet. <%= link_to "Re-run from this one", rerun_run_path(@run), method: :post, data: { turbo_method: :post }, class: "ck-link" %> to create one.</p>
38
+ </div>
39
+ <% end %>
@@ -18,6 +18,35 @@
18
18
  <% dataset_preview_lines = dataset_lines.first(50) %>
19
19
  <% end %>
20
20
 
21
+ <% if CompletionKit.config.judge_calibration_enabled %>
22
+ <% stale_summary = @run.stale_review_summary %>
23
+ <% if stale_summary.any? %>
24
+ <div class="ck-stale-versions-banner" role="status">
25
+ <div class="ck-stale-versions-banner__body">
26
+ <p class="ck-kicker">Stale judge versions</p>
27
+ <p class="ck-meta-copy">
28
+ This run was scored against metric versions that are no longer live.
29
+ <% stale_summary.values.each_with_index do |s, i| %>
30
+ <%= ", " if i > 0 %><strong><%= s[:metric_name] %></strong> (scored by <%= s[:scored_labels].join(", ") %>; live is <%= s[:current_label] %>)<% end %>.
31
+ Re-run to refresh the scores with the current judge.
32
+ </p>
33
+ </div>
34
+ <% if @run.status == "completed" %>
35
+ <%= button_to "Re-run from scratch",
36
+ rerun_run_path(@run), method: :post,
37
+ class: ck_button_classes(:light, variant: :outline), form_class: "inline-block",
38
+ title: "Create a new run that regenerates responses and grades them with the current judge.",
39
+ data: { turbo_confirm: "Create a new run with fresh responses and the current judge? The original run stays as a record." } %>
40
+ <%= button_to "Re-grade with current judge",
41
+ regrade_run_path(@run), method: :post,
42
+ class: ck_button_classes(:dark), form_class: "inline-block",
43
+ title: "Re-judge this run's existing responses against the current judge. Faster and cheaper than re-running.",
44
+ data: { turbo_confirm: "Re-judge this run's existing responses against the current judge?" } %>
45
+ <% end %>
46
+ </div>
47
+ <% end %>
48
+ <% end %>
49
+
21
50
  <div class="ck-run-config">
22
51
  <div class="ck-run-config__row">
23
52
  <span class="ck-run-config__key">Created</span>
data/config/routes.rb CHANGED
@@ -37,7 +37,9 @@ CompletionKit::Engine.routes.draw do
37
37
  post :suggest
38
38
  post :retry_failures
39
39
  post :rerun
40
+ post :regrade
40
41
  get :refresh_status
42
+ get :compare
41
43
  end
42
44
  resources :responses, only: [:show] do
43
45
  resources :calibrations, only: [:create]
@@ -0,0 +1,22 @@
1
+ class RenameJudgeVersionToMetricVersion < ActiveRecord::Migration[8.1] # Moves the "judge version" schema objects to "metric version" naming: one table, one column, five indexes.
2
+ def change # Uses only rename_* calls; no data is rewritten here.
3
+ rename_table :completion_kit_judge_versions, :completion_kit_metric_versions
4
+ rename_column :completion_kit_calibrations, :judge_version_id, :metric_version_id # calibrations now reference the version through metric_version_id
5
+ # The indexes below carry explicit index_ck_* names, so each one is renamed by name. The table argument is already the new table name because rename_table ran first.
6
+ rename_index :completion_kit_metric_versions,
7
+ "index_ck_judge_versions_on_metric_id",
8
+ "index_ck_metric_versions_on_metric_id"
9
+ rename_index :completion_kit_metric_versions,
10
+ "index_ck_judge_versions_on_metric_current",
11
+ "index_ck_metric_versions_on_metric_current"
12
+ rename_index :completion_kit_metric_versions,
13
+ "index_ck_judge_versions_on_metric_state",
14
+ "index_ck_metric_versions_on_metric_state"
15
+ rename_index :completion_kit_metric_versions,
16
+ "index_ck_judge_versions_on_metric_version",
17
+ "index_ck_metric_versions_on_metric_vnum" # NOTE: new suffix is _vnum, not a straight judge -> metric substitution
18
+ rename_index :completion_kit_calibrations,
19
+ "index_ck_calibrations_on_judge_version_id",
20
+ "index_ck_calibrations_on_metric_version_id"
21
+ end
22
+ end
@@ -0,0 +1,21 @@
1
+ class AddMetricVersionToReviews < ActiveRecord::Migration[8.1] # Adds reviews.metric_version_id and backfills it for existing rows.
2
+ def change
3
+ add_column :completion_kit_reviews, :metric_version_id, :bigint # plain bigint column; no foreign-key constraint is added in this migration
4
+ add_index :completion_kit_reviews, :metric_version_id, name: "index_ck_reviews_on_metric_version_id"
5
+ # Backfill runs on the way up only (no dir.down is given). Every review with a metric_id is pointed at the metric version flagged current for that metric, so pre-existing reviews are all attributed to the live version. If a metric has no current version the subquery yields NULL.
6
+ reversible do |dir|
7
+ dir.up do
8
+ execute <<~SQL
9
+ UPDATE completion_kit_reviews
10
+ SET metric_version_id = (
11
+ SELECT id FROM completion_kit_metric_versions mv
12
+ WHERE mv.metric_id = completion_kit_reviews.metric_id
13
+ AND mv.current = #{ActiveRecord::Base.connection.quote(true)}
14
+ LIMIT 1
15
+ )
16
+ WHERE metric_id IS NOT NULL
17
+ SQL
18
+ end
19
+ end
20
+ end
21
+ end
@@ -1,3 +1,3 @@
1
1
  module CompletionKit
2
- VERSION = "0.5.43"
2
+ VERSION = "0.6.0"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: completion-kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.43
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Damien Bastin
@@ -272,11 +272,11 @@ files:
272
272
  - app/models/completion_kit/calibration.rb
273
273
  - app/models/completion_kit/dashboard_dismissal.rb
274
274
  - app/models/completion_kit/dataset.rb
275
- - app/models/completion_kit/judge_version.rb
276
275
  - app/models/completion_kit/mcp_session.rb
277
276
  - app/models/completion_kit/metric.rb
278
277
  - app/models/completion_kit/metric_group.rb
279
278
  - app/models/completion_kit/metric_group_membership.rb
279
+ - app/models/completion_kit/metric_version.rb
280
280
  - app/models/completion_kit/model.rb
281
281
  - app/models/completion_kit/prompt.rb
282
282
  - app/models/completion_kit/provider_credential.rb
@@ -295,7 +295,6 @@ files:
295
295
  - app/services/completion_kit/csv_processor.rb
296
296
  - app/services/completion_kit/dashboard_stats.rb
297
297
  - app/services/completion_kit/judge_service.rb
298
- - app/services/completion_kit/judge_variant_generator.rb
299
298
  - app/services/completion_kit/llm_client.rb
300
299
  - app/services/completion_kit/mcp_dispatcher.rb
301
300
  - app/services/completion_kit/mcp_tools/base.rb
@@ -310,6 +309,7 @@ files:
310
309
  - app/services/completion_kit/mcp_tools/runs.rb
311
310
  - app/services/completion_kit/mcp_tools/tags.rb
312
311
  - app/services/completion_kit/metric_calibration_stats.rb
312
+ - app/services/completion_kit/metric_variant_generator.rb
313
313
  - app/services/completion_kit/model_discovery_service.rb
314
314
  - app/services/completion_kit/ollama_client.rb
315
315
  - app/services/completion_kit/onboarding/checklist.rb
@@ -377,6 +377,8 @@ files:
377
377
  - app/views/completion_kit/runs/_status_header.html.erb
378
378
  - app/views/completion_kit/runs/_status_panel.html.erb
379
379
  - app/views/completion_kit/runs/_table.html.erb
380
+ - app/views/completion_kit/runs/compare.html.erb
381
+ - app/views/completion_kit/runs/compare_picker.html.erb
380
382
  - app/views/completion_kit/runs/edit.html.erb
381
383
  - app/views/completion_kit/runs/index.html.erb
382
384
  - app/views/completion_kit/runs/new.html.erb
@@ -422,6 +424,8 @@ files:
422
424
  - db/migrate/20260523000002_add_state_to_completion_kit_judge_versions.rb
423
425
  - db/migrate/20260524000001_create_completion_kit_starter_metric_dismissals.rb
424
426
  - db/migrate/20260525000001_add_version_number_and_published_at_to_judge_versions.rb
427
+ - db/migrate/20260528000001_rename_judge_version_to_metric_version.rb
428
+ - db/migrate/20260528000002_add_metric_version_to_reviews.rb
425
429
  - lib/completion-kit.rb
426
430
  - lib/completion_kit.rb
427
431
  - lib/completion_kit/concurrency_check.rb