completion-kit 0.5.43 → 0.5.44

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27) hide show
  1. checksums.yaml +4 -4
  2. data/app/assets/stylesheets/completion_kit/application.css +25 -0
  3. data/app/controllers/completion_kit/api/v1/calibrations_controller.rb +1 -1
  4. data/app/controllers/completion_kit/api/v1/runs_controller.rb +4 -0
  5. data/app/controllers/completion_kit/calibrations_controller.rb +1 -1
  6. data/app/controllers/completion_kit/metrics_controller.rb +63 -17
  7. data/app/controllers/completion_kit/runs_controller.rb +6 -0
  8. data/app/jobs/completion_kit/judge_review_job.rb +3 -0
  9. data/app/models/completion_kit/calibration.rb +6 -2
  10. data/app/models/completion_kit/metric.rb +0 -17
  11. data/app/models/completion_kit/{judge_version.rb → metric_version.rb} +4 -2
  12. data/app/models/completion_kit/review.rb +9 -0
  13. data/app/models/completion_kit/run.rb +28 -0
  14. data/app/services/completion_kit/mcp_tools/calibrations.rb +1 -1
  15. data/app/services/completion_kit/mcp_tools/judges.rb +15 -13
  16. data/app/services/completion_kit/metric_calibration_stats.rb +9 -9
  17. data/app/services/completion_kit/{judge_variant_generator.rb → metric_variant_generator.rb} +8 -8
  18. data/app/views/completion_kit/calibrations/_trust_panel.html.erb +6 -1
  19. data/app/views/completion_kit/metrics/_form.html.erb +3 -3
  20. data/app/views/completion_kit/metrics/edit.html.erb +1 -1
  21. data/app/views/completion_kit/metrics/show.html.erb +12 -14
  22. data/app/views/completion_kit/responses/show.html.erb +9 -1
  23. data/app/views/completion_kit/runs/show.html.erb +23 -0
  24. data/db/migrate/20260528000001_rename_judge_version_to_metric_version.rb +22 -0
  25. data/db/migrate/20260528000002_add_metric_version_to_reviews.rb +21 -0
  26. data/lib/completion_kit/version.rb +1 -1
  27. metadata +5 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 7a9284c6a53b1b609de8ca2c081111687990d32f40f1fa4d2422670daeae9f2f
4
- data.tar.gz: edb62bc8b34b3ecce534a1e4f0730066d6b56f591d05046b1904eb33f9f7cbc6
3
+ metadata.gz: d81df0996441d12c0fb540b9f29bb514813adcdbea3ceefb515d318f28947731
4
+ data.tar.gz: 606764f41e74cec3284f1155d7ef86e77a61af708af2320d5b02640827741f7a
5
5
  SHA512:
6
- metadata.gz: 0aaf95d75bdfee01b387d3ebe97434168815d58627f8d855ad3dd15534e33c2a69eca7ee8a25a964f6669f891026d350abb5c23e23006ada5a1c56df9ad616ea
7
- data.tar.gz: 800fec24cee472a245fcfffbb025eabb2a3bc62cbfc513d1ec0a2c7aa8d1e304f59cc28aa9074712060d0f49ac6bbfba4597cc17e1d3b8db71c5e3b9c557dcab
6
+ metadata.gz: 9e468cd12eb143f4b5eb64333339199420db4c9d0c78ec548965972eee5e326d574a80c6c3092d63f4d99d88901ce3470ac688468d2813f5370e589568fba669
7
+ data.tar.gz: 7377f00a31d539297f9e79059083aa7bfef782d18d1ecfcb9f7da1ff648ce1eaf6f8a94bc55d56fcca22a47e09c7fcb1bc89981563aa351e4293c47f8d886570
@@ -2816,6 +2816,31 @@ select.ck-input {
2816
2816
  line-height: 1.55;
2817
2817
  }
2818
2818
 
2819
+ .ck-review-card--stale {
2820
+ border-left: 2px solid rgba(224, 164, 88, 0.45);
2821
+ }
2822
+
2823
+ .ck-stale-versions-banner {
2824
+ margin: 0 0 1rem;
2825
+ padding: 0.9rem 1rem;
2826
+ border: 1px solid rgba(224, 164, 88, 0.4);
2827
+ background: rgba(224, 164, 88, 0.06);
2828
+ border-radius: var(--ck-radius);
2829
+ display: flex;
2830
+ align-items: center;
2831
+ justify-content: space-between;
2832
+ gap: 1rem;
2833
+ flex-wrap: wrap;
2834
+ }
2835
+ .ck-stale-versions-banner__body { min-width: 0; flex: 1 1 320px; }
2836
+ .ck-stale-versions-banner .ck-kicker { color: var(--ck-warning); }
2837
+ .ck-review-card__stale-note {
2838
+ margin: 0.4rem 0 0;
2839
+ font-family: var(--ck-mono);
2840
+ font-size: 0.78rem;
2841
+ color: var(--ck-warning);
2842
+ }
2843
+
2819
2844
  @media (max-width: 900px) {
2820
2845
  .ck-grid--sidebar,
2821
2846
  .ck-grid--cards,
@@ -15,7 +15,7 @@ module CompletionKit
15
15
  run: @run,
16
16
  response: @response,
17
17
  metric: @metric,
18
- judge_version: JudgeVersion.ensure_current_for(@metric),
18
+ metric_version: MetricVersion.ensure_current_for(@metric),
19
19
  **calibration_params
20
20
  )
21
21
 
@@ -45,6 +45,10 @@ module CompletionKit
45
45
  end
46
46
 
47
47
  def retry_failures
48
+ if @run.stale_review_summary.any?
49
+ return render(json: { error: "Judge has changed since this run executed. Retry would mix versions in the same run; use POST /api/v1/runs/:id/rerun instead." }, status: :conflict)
50
+ end
51
+
48
52
  scope = @run.responses.where(status: "failed")
49
53
  scope = scope.where(id: params[:only]) if params[:only].present?
50
54
 
@@ -18,7 +18,7 @@ module CompletionKit
18
18
  run: @run, response: @response, metric: @metric, created_by: created_by
19
19
  )
20
20
  calibration.assign_attributes(
21
- judge_version: JudgeVersion.ensure_current_for(@metric),
21
+ metric_version: MetricVersion.ensure_current_for(@metric),
22
22
  verdict: params[:verdict],
23
23
  corrected_score: params[:corrected_score].presence,
24
24
  note: params[:note].presence
@@ -35,16 +35,16 @@ module CompletionKit
35
35
  end
36
36
 
37
37
  def show
38
- @published_judge_version = JudgeVersion.ensure_current_for(@metric)
38
+ @published_metric_version = MetricVersion.ensure_current_for(@metric)
39
39
  @disagreements = Calibration.where(metric_id: @metric.id, verdict: "disagree")
40
- .includes(:judge_version, response: [:reviews, :run])
40
+ .includes(:metric_version, response: [:reviews, :run])
41
41
  .order(created_at: :desc)
42
42
  .limit(50)
43
- @edit_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
44
- @suggestion_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
43
+ @edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
44
+ @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
45
45
  @improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree",
46
- judge_version_id: @published_judge_version.id).count
47
- @versions = JudgeVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
46
+ metric_version_id: @published_metric_version.id).count
47
+ @versions = MetricVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
48
48
  end
49
49
 
50
50
  def new
@@ -52,9 +52,14 @@ module CompletionKit
52
52
  end
53
53
 
54
54
  def edit
55
- @suggestion_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
56
- @edit_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
57
- @published_judge_version = JudgeVersion.published.where(metric_id: @metric.id, current: true).first
55
+ @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
56
+ @edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
57
+ @published_metric_version = MetricVersion.published.where(metric_id: @metric.id, current: true).first
58
+
59
+ if @edit_draft
60
+ @metric.instruction = @edit_draft.instruction
61
+ @metric.rubric_bands = @edit_draft.rubric_bands
62
+ end
58
63
  end
59
64
 
60
65
  def create
@@ -68,10 +73,42 @@ module CompletionKit
68
73
  end
69
74
 
70
75
  def update
71
- if @metric.update(metric_params)
72
- redirect_to metric_path(@metric), notice: "Metric was successfully updated."
76
+ judge_keys = %i[instruction rubric_bands]
77
+ meta_attrs = metric_params.except(*judge_keys)
78
+ proposed_instruction = metric_params[:instruction]
79
+ proposed_rubric = metric_params[:rubric_bands]
80
+
81
+ unless @metric.update(meta_attrs)
82
+ return render(:edit, status: :unprocessable_entity)
83
+ end
84
+
85
+ current_instruction = @metric.instruction.to_s
86
+ current_rubric = @metric.rubric_bands || []
87
+ normalized_proposed_rubric = normalize_rubric_bands_for_update(proposed_rubric)
88
+
89
+ instruction_changed = !proposed_instruction.nil? && proposed_instruction.to_s != current_instruction
90
+ rubric_changed = !normalized_proposed_rubric.nil? && normalized_proposed_rubric != current_rubric
91
+
92
+ unless instruction_changed || rubric_changed
93
+ return redirect_to(metric_path(@metric), notice: "Metric was successfully updated.")
94
+ end
95
+
96
+ new_instruction = instruction_changed ? proposed_instruction.to_s : current_instruction
97
+ new_rubric = rubric_changed ? normalized_proposed_rubric : current_rubric
98
+
99
+ if @metric.reviews.exists?
100
+ MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").destroy_all
101
+ draft = MetricVersion.create!(
102
+ metric: @metric, instruction: new_instruction, rubric_bands: new_rubric,
103
+ state: "draft", source: "edit", current: false
104
+ )
105
+ redirect_to edit_metric_path(@metric),
106
+ notice: "Saved as draft #{draft.version_label}. Publish to push these changes to the live judge."
73
107
  else
74
- render :edit, status: :unprocessable_entity
108
+ @metric.update!(instruction: new_instruction, rubric_bands: new_rubric)
109
+ current_pub = MetricVersion.published.where(metric_id: @metric.id, current: true).first
110
+ current_pub&.update!(instruction: @metric.instruction, rubric_bands: @metric.rubric_bands)
111
+ redirect_to metric_path(@metric), notice: "Metric was successfully updated."
75
112
  end
76
113
  end
77
114
 
@@ -88,9 +125,9 @@ module CompletionKit
88
125
  return
89
126
  end
90
127
 
91
- JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
128
+ MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
92
129
 
93
- generator = JudgeVariantGenerator.new(@metric, count: 1)
130
+ generator = MetricVariantGenerator.new(@metric, count: 1)
94
131
  variants = generator.call
95
132
  if variants.empty?
96
133
  redirect_to target, alert: "The model returned no usable variants. Try again with a different model."
@@ -101,18 +138,18 @@ module CompletionKit
101
138
  end
102
139
 
103
140
  def dismiss_suggestion
104
- draft = JudgeVersion.drafts.where(metric_id: @metric.id).find_by(id: params[:draft_id])
141
+ draft = MetricVersion.drafts.where(metric_id: @metric.id).find_by(id: params[:draft_id])
105
142
  draft&.destroy
106
143
  target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
107
144
  redirect_to target, notice: "Dismissed."
108
145
  end
109
146
 
110
147
  def publish_draft
111
- scope = JudgeVersion.where(metric_id: @metric.id)
148
+ scope = MetricVersion.where(metric_id: @metric.id)
112
149
  version = if params[:draft_id].present?
113
150
  scope.find_by(id: params[:draft_id])
114
151
  else
115
- JudgeVersion.drafts.where(metric_id: @metric.id).order(created_at: :desc).first
152
+ MetricVersion.drafts.where(metric_id: @metric.id).order(created_at: :desc).first
116
153
  end
117
154
 
118
155
  if version.nil?
@@ -160,5 +197,14 @@ module CompletionKit
160
197
  params.require(:metric).permit(:name, :instruction,
161
198
  rubric_bands: [:stars, :description], tag_names: [])
162
199
  end
200
+
201
+ def normalize_rubric_bands_for_update(bands)
202
+ return nil if bands.nil?
203
+ array = bands.is_a?(ActionController::Parameters) ? bands.to_unsafe_h.values : bands
204
+ Array(array).map do |b|
205
+ h = b.respond_to?(:to_unsafe_h) ? b.to_unsafe_h : b
206
+ { "stars" => h["stars"].to_i, "description" => h["description"].to_s }
207
+ end.sort_by { |b| -b["stars"] }
208
+ end
163
209
  end
164
210
  end
@@ -126,6 +126,12 @@ module CompletionKit
126
126
  end
127
127
 
128
128
  def retry_failures
129
+ if @run.stale_review_summary.any?
130
+ redirect_to run_path(@run),
131
+ alert: "The judge has changed since this run executed. Retrying failed cases would mix scores from two metric versions in the same run. Use 'Re-run with current judge' to refresh everything against the live judge."
132
+ return
133
+ end
134
+
129
135
  scope = @run.responses.where(status: "failed")
130
136
  scope = scope.where(id: params[:only]) if params[:only].present?
131
137
 
@@ -62,9 +62,11 @@ module CompletionKit
62
62
  )
63
63
 
64
64
  review = response.reviews.find_or_initialize_by(metric_id: metric.id)
65
+ current_metric_version = MetricVersion.ensure_current_for(metric)
65
66
  review.assign_attributes(
66
67
  metric_name: metric.name,
67
68
  instruction: metric.instruction.to_s,
69
+ metric_version_id: current_metric_version.id,
68
70
  status: "succeeded",
69
71
  ai_score: evaluation[:score],
70
72
  ai_feedback: evaluation[:feedback],
@@ -122,6 +124,7 @@ module CompletionKit
122
124
  end
123
125
 
124
126
  def few_shot_payload(metric)
127
+ return nil unless CompletionKit.config.judge_calibration_enabled
125
128
  Array(metric.few_shot_examples).map do |fs|
126
129
  {
127
130
  human_score: fs["human_score"],
@@ -5,7 +5,11 @@ module CompletionKit
5
5
  belongs_to :run
6
6
  belongs_to :response
7
7
  belongs_to :metric
8
- belongs_to :judge_version
8
+ belongs_to :metric_version
9
+
10
+ alias_attribute :judge_version_id, :metric_version_id
11
+ alias_method :judge_version, :metric_version
12
+ alias_method :judge_version=, :metric_version=
9
13
 
10
14
  validates :verdict, presence: true, inclusion: { in: VERDICTS }
11
15
  validates :response_id,
@@ -22,7 +26,7 @@ module CompletionKit
22
26
  run_id: run_id,
23
27
  response_id: response_id,
24
28
  metric_id: metric_id,
25
- judge_version_id: judge_version_id,
29
+ metric_version_id: metric_version_id,
26
30
  verdict: verdict,
27
31
  corrected_score: corrected_score,
28
32
  note: note,
@@ -24,7 +24,6 @@ module CompletionKit
24
24
  before_validation :generate_key
25
25
  before_validation :normalize_rubric_bands
26
26
  before_validation :set_defaults
27
- after_update :fork_draft_judge_version, if: :judge_relevant_changes?
28
27
 
29
28
  def self.default_rubric_bands
30
29
  DEFAULT_RUBRIC_BANDS.map(&:dup)
@@ -98,21 +97,5 @@ module CompletionKit
98
97
  self.rubric_bands = self.class.normalize_rubric_bands(rubric_bands) if rubric_bands.present?
99
98
  end
100
99
 
101
- def judge_relevant_changes?
102
- saved_change_to_instruction? || saved_change_to_rubric_bands?
103
- end
104
-
105
- def fork_draft_judge_version
106
- JudgeVersion.ensure_current_for(self)
107
- JudgeVersion.where(metric_id: id, state: "draft").update_all(current: false)
108
- JudgeVersion.create!(
109
- metric: self,
110
- instruction: instruction,
111
- rubric_bands: rubric_bands,
112
- current: false,
113
- state: "draft",
114
- source: "edit"
115
- )
116
- end
117
100
  end
118
101
  end
@@ -1,5 +1,5 @@
1
1
  module CompletionKit
2
- class JudgeVersion < ApplicationRecord
2
+ class MetricVersion < ApplicationRecord
3
3
  STATES = %w[draft published].freeze
4
4
 
5
5
  belongs_to :metric
@@ -41,7 +41,7 @@ module CompletionKit
41
41
  end
42
42
 
43
43
  def publish!
44
- JudgeVersion.transaction do
44
+ MetricVersion.transaction do
45
45
  self.class.where(metric_id: metric_id).where.not(id: id).update_all(current: false)
46
46
  reload
47
47
  update!(state: "published", current: true, published_at: published_at || Time.current)
@@ -76,4 +76,6 @@ module CompletionKit
76
76
  self.version_number = max + 1
77
77
  end
78
78
  end
79
+
80
+ JudgeVersion = MetricVersion
79
81
  end
@@ -5,8 +5,16 @@ module CompletionKit
5
5
 
6
6
  belongs_to :response
7
7
  belongs_to :metric, optional: true
8
+ belongs_to :metric_version, optional: true
8
9
  has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
9
10
 
11
+ def stale_against_current_judge?
12
+ return false unless metric_id && metric_version_id
13
+ current_id = MetricVersion.current.where(metric_id: metric_id).limit(1).pick(:id)
14
+ return false if current_id.nil?
15
+ metric_version_id != current_id
16
+ end
17
+
10
18
  validates :metric_name, presence: true
11
19
  validates :status, inclusion: { in: STATUSES }
12
20
  validates :ai_score, numericality: { greater_than_or_equal_to: 1, less_than_or_equal_to: 5 }, allow_nil: true
@@ -29,6 +37,7 @@ module CompletionKit
29
37
  def as_json(options = {})
30
38
  {
31
39
  id: id, response_id: response_id, metric_id: metric_id,
40
+ metric_version_id: metric_version_id,
32
41
  metric_name: metric_name, ai_score: ai_score,
33
42
  ai_feedback: ai_feedback, status: status, attempts: attempts,
34
43
  error: error_payload
@@ -89,6 +89,34 @@ module CompletionKit
89
89
  end
90
90
  end
91
91
 
92
+ def stale_review_summary
93
+ review_pairs = Review.where(response_id: response_ids)
94
+ .where.not(metric_id: nil)
95
+ .where.not(metric_version_id: nil)
96
+ .pluck(:metric_id, :metric_version_id, :metric_name)
97
+ return {} if review_pairs.empty?
98
+
99
+ metric_ids = review_pairs.map(&:first).uniq
100
+ version_ids = review_pairs.map { |_, vid, _| vid }.uniq
101
+ current_by_metric = MetricVersion.current.where(metric_id: metric_ids).pluck(:metric_id, :id, :version_number).each_with_object({}) do |(mid, vid, vnum), h|
102
+ h[mid] = { id: vid, label: "v#{vnum}" }
103
+ end
104
+ label_by_version = MetricVersion.where(id: version_ids).pluck(:id, :version_number).each_with_object({}) { |(vid, vnum), h| h[vid] = "v#{vnum}" }
105
+
106
+ summary = {}
107
+ review_pairs.each do |metric_id, version_id, metric_name|
108
+ current = current_by_metric[metric_id]
109
+ next if current.nil?
110
+ next if version_id == current[:id]
111
+ label = label_by_version[version_id]
112
+ next if label.nil?
113
+ summary[metric_id] ||= { metric_name: metric_name, current_label: current[:label], stale_count: 0, scored_labels: [] }
114
+ summary[metric_id][:stale_count] += 1
115
+ summary[metric_id][:scored_labels] |= [label]
116
+ end
117
+ summary
118
+ end
119
+
92
120
  def start!
93
121
  rows = if dataset
94
122
  CsvProcessor.process_self(self)
@@ -56,7 +56,7 @@ module CompletionKit
56
56
  run_id: run.id, response_id: response.id, metric_id: metric.id, created_by: created_by
57
57
  )
58
58
  calibration.assign_attributes(
59
- judge_version: CompletionKit::JudgeVersion.ensure_current_for(metric),
59
+ metric_version: CompletionKit::MetricVersion.ensure_current_for(metric),
60
60
  verdict: args["verdict"],
61
61
  corrected_score: args["corrected_score"],
62
62
  note: args["note"]
@@ -5,7 +5,7 @@ module CompletionKit
5
5
 
6
6
  TOOLS = {
7
7
  "judges_suggest" => {
8
- description: "Ask the model to rewrite the metric's judge instruction in N variants targeted at the recent disagreements. Each variant is saved as a draft JudgeVersion with source=\"suggestion\". Returns the persisted drafts. Stripe-metering hooks fire via ActiveSupport::Notifications under completion_kit.judge_suggestion.generated.",
8
+ description: "Ask the model to rewrite the metric's judge instruction in N variants targeted at the recent disagreements. Each variant is saved as a draft MetricVersion with source=\"suggestion\". Returns the persisted drafts. Stripe-metering hooks fire via ActiveSupport::Notifications under completion_kit.judge_suggestion.generated.",
9
9
  inputSchema: {
10
10
  type: "object",
11
11
  properties: {
@@ -33,15 +33,15 @@ module CompletionKit
33
33
  handler: :replay
34
34
  },
35
35
  "judges_compare" => {
36
- description: "Compare two judge versions' calibration stats side by side. Pass either two judge_version_ids or one metric_id with judge_version_a_id / judge_version_b_id.",
36
+ description: "Compare two metric versions' calibration stats side by side. Pass either two metric_version_ids or one metric_id with metric_version_a_id / metric_version_b_id.",
37
37
  inputSchema: {
38
38
  type: "object",
39
39
  properties: {
40
40
  metric_id: { type: "integer" },
41
- judge_version_a_id: { type: "integer" },
42
- judge_version_b_id: { type: "integer" }
41
+ metric_version_a_id: { type: "integer" },
42
+ metric_version_b_id: { type: "integer" }
43
43
  },
44
- required: ["metric_id", "judge_version_a_id", "judge_version_b_id"]
44
+ required: ["metric_id", "metric_version_a_id", "metric_version_b_id"]
45
45
  },
46
46
  handler: :compare
47
47
  }
@@ -49,7 +49,7 @@ module CompletionKit
49
49
 
50
50
  def self.suggest(args)
51
51
  metric = CompletionKit::Metric.find(args["metric_id"])
52
- generator = CompletionKit::JudgeVariantGenerator.new(metric, count: args["count"].to_i, model: args["model"])
52
+ generator = CompletionKit::MetricVariantGenerator.new(metric, count: args["count"].to_i, model: args["model"])
53
53
  variants = generator.call
54
54
  return error_result("Variant generator returned no parseable variants. Try again or change the model.") if variants.empty?
55
55
  versions = generator.persist!(variants)
@@ -75,20 +75,22 @@ module CompletionKit
75
75
 
76
76
  def self.compare(args)
77
77
  metric = CompletionKit::Metric.find(args["metric_id"])
78
- a = CompletionKit::JudgeVersion.where(metric_id: metric.id).find(args["judge_version_a_id"])
79
- b = CompletionKit::JudgeVersion.where(metric_id: metric.id).find(args["judge_version_b_id"])
80
- stats_a = CompletionKit::MetricCalibrationStats.for(metric, judge_version: a)
81
- stats_b = CompletionKit::MetricCalibrationStats.for(metric, judge_version: b)
78
+ a_id = args["metric_version_a_id"] || args["judge_version_a_id"]
79
+ b_id = args["metric_version_b_id"] || args["judge_version_b_id"]
80
+ a = CompletionKit::MetricVersion.where(metric_id: metric.id).find(a_id)
81
+ b = CompletionKit::MetricVersion.where(metric_id: metric.id).find(b_id)
82
+ stats_a = CompletionKit::MetricCalibrationStats.for(metric, metric_version: a)
83
+ stats_b = CompletionKit::MetricCalibrationStats.for(metric, metric_version: b)
82
84
  text_result({
83
85
  metric_id: metric.id,
84
- a: judge_version_payload(a, stats_a),
85
- b: judge_version_payload(b, stats_b),
86
+ a: metric_version_payload(a, stats_a),
87
+ b: metric_version_payload(b, stats_b),
86
88
  delta: delta_payload(stats_a, stats_b),
87
89
  recommendation: recommendation_for(stats_a, stats_b)
88
90
  })
89
91
  end
90
92
 
91
- def self.judge_version_payload(version, stats)
93
+ def self.metric_version_payload(version, stats)
92
94
  {
93
95
  id: version.id, state: version.state, current: version.current,
94
96
  source: version.source, created_at: version.created_at,
@@ -33,25 +33,25 @@ module CompletionKit
33
33
 
34
34
  CURRENT = :current
35
35
 
36
- def self.for(metric, judge_version: CURRENT)
37
- resolved = case judge_version
38
- when CURRENT then JudgeVersion.current.find_by(metric_id: metric.id)
36
+ def self.for(metric, metric_version: CURRENT)
37
+ resolved = case metric_version
38
+ when CURRENT then MetricVersion.current.find_by(metric_id: metric.id)
39
39
  when nil then nil
40
- else judge_version
40
+ else metric_version
41
41
  end
42
- new(metric: metric, judge_version: resolved, all_versions: judge_version.nil?).call
42
+ new(metric: metric, metric_version: resolved, all_versions: metric_version.nil?).call
43
43
  end
44
44
 
45
- def initialize(metric:, judge_version: nil, all_versions: false)
45
+ def initialize(metric:, metric_version: nil, all_versions: false)
46
46
  @metric = metric
47
- @judge_version = judge_version
47
+ @metric_version = metric_version
48
48
  @all_versions = all_versions
49
49
  end
50
50
 
51
51
  def call
52
52
  scope = Calibration.where(metric_id: @metric.id)
53
- if @judge_version
54
- scope = scope.where(judge_version_id: @judge_version.id)
53
+ if @metric_version
54
+ scope = scope.where(metric_version_id: @metric_version.id)
55
55
  elsif !@all_versions
56
56
  scope = scope.none
57
57
  end
@@ -1,5 +1,5 @@
1
1
  module CompletionKit
2
- class JudgeVariantGenerator
2
+ class MetricVariantGenerator
3
3
  DEFAULT_VARIANT_COUNT = 1
4
4
  MAX_VARIANT_COUNT = 3
5
5
  DEFAULT_TEMPERATURE = 0.4
@@ -20,9 +20,9 @@ module CompletionKit
20
20
  end
21
21
 
22
22
  def persist!(variants)
23
- JudgeVersion.where(metric_id: @metric.id, state: "draft", source: "suggestion").update_all(current: false)
23
+ MetricVersion.where(metric_id: @metric.id, state: "draft", source: "suggestion").update_all(current: false)
24
24
  versions = variants.map do |variant|
25
- JudgeVersion.create!(
25
+ MetricVersion.create!(
26
26
  metric: @metric,
27
27
  instruction: variant.instruction,
28
28
  rubric_bands: variant.rubric_bands.presence || @metric.rubric_bands,
@@ -41,8 +41,8 @@ module CompletionKit
41
41
  private
42
42
 
43
43
  def build_meta_prompt
44
- disagreements = JudgeCalibrationExamples.disagreements_for(@metric)
45
- borderlines = JudgeCalibrationExamples.borderlines_for(@metric)
44
+ disagreements = MetricCalibrationExamples.disagreements_for(@metric)
45
+ borderlines = MetricCalibrationExamples.borderlines_for(@metric)
46
46
  sections = []
47
47
  sections << "You are an expert evaluator. The judge below is misaligned with humans. Propose #{@count == 1 ? "a single" : "#{@count}"} concrete rewrite that closes the gap."
48
48
  sections << ""
@@ -117,7 +117,7 @@ module CompletionKit
117
117
  end
118
118
  end
119
119
 
120
- module JudgeCalibrationExamples
120
+ module MetricCalibrationExamples
121
121
  module_function
122
122
 
123
123
  def for(metric, limit: 8)
@@ -134,8 +134,8 @@ module CompletionKit
134
134
 
135
135
  def calibrations_for(metric, verdict:, limit:)
136
136
  scope = Calibration.where(metric_id: metric.id, verdict: verdict)
137
- current_version = JudgeVersion.current.find_by(metric_id: metric.id)
138
- scope = scope.where(judge_version_id: current_version.id) if current_version
137
+ current_version = MetricVersion.current.find_by(metric_id: metric.id)
138
+ scope = scope.where(metric_version_id: current_version.id) if current_version
139
139
  scope.includes(response: :reviews)
140
140
  .order(created_at: :desc)
141
141
  .limit(limit)
@@ -3,7 +3,12 @@
3
3
  <% anchor = metric&.name&.parameterize %>
4
4
  <% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric
5
5
  created_by = CompletionKit.config.username.presence || "operator"
6
- verdicted_ids = CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by).pluck(:response_id)
6
+ current_metric_version = CompletionKit::MetricVersion.current.find_by(metric_id: metric.id)
7
+ verdicted_ids = if current_metric_version
8
+ CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
9
+ else
10
+ []
11
+ end
7
12
  CompletionKit::Response.joins(:reviews)
8
13
  .where(reviews: { metric_id: metric.id })
9
14
  .where.not(reviews: { ai_score: nil })
@@ -16,14 +16,14 @@
16
16
  </div>
17
17
  <% end %>
18
18
 
19
- <% if edit_draft && !suggestion %>
20
- <% pub = local_assigns[:published_judge_version] %>
19
+ <% if edit_draft %>
20
+ <% pub = local_assigns[:published_metric_version] %>
21
21
  <% draft_instr_changed = pub && pub.instruction.to_s != edit_draft.instruction.to_s %>
22
22
  <% draft_rubric_changed = pub && pub.rubric_bands != edit_draft.rubric_bands %>
23
23
  <div class="ck-suggestion-banner" role="status">
24
24
  <div class="ck-suggestion-banner__body">
25
25
  <p class="ck-kicker">Draft pending</p>
26
- <p class="ck-meta-copy">An unpublished draft of this metric is saved. Publish to replace the live<%= " instruction" if draft_instr_changed %><%= " and" if draft_instr_changed && draft_rubric_changed %><%= " rubric" if draft_rubric_changed %> for future runs, or keep editing.</p>
26
+ <p class="ck-meta-copy">The form below shows your unpublished draft. Publish to replace the live<%= " instruction" if draft_instr_changed %><%= " and" if draft_instr_changed && draft_rubric_changed %><%= " rubric" if draft_rubric_changed %> for future runs, or keep editing.</p>
27
27
  </div>
28
28
  <div class="ck-suggestion-banner__actions">
29
29
  <%= button_to "Discard draft", dismiss_suggestion_metric_path(metric, draft_id: edit_draft.id, back_to: "edit"),
@@ -14,4 +14,4 @@
14
14
  metric: @metric,
15
15
  suggestion_draft: @suggestion_draft,
16
16
  edit_draft: @edit_draft,
17
- published_judge_version: @published_judge_version %>
17
+ published_metric_version: @published_metric_version %>
@@ -19,20 +19,17 @@
19
19
  </div>
20
20
  <div class="ck-actions">
21
21
  <% if CompletionKit.config.judge_calibration_enabled %>
22
- <% if @suggestion_draft %>
23
- <%= link_to "Review improvements ", edit_metric_path(@metric),
22
+ <% if @suggestion_draft || @edit_draft %>
23
+ <% review_title = @suggestion_draft ? "The model proposed improvements based on your disagreements. Review and apply what you want." : "An unpublished draft of this metric is saved. Review and publish, or keep editing." %>
24
+ <%= link_to "Review changes →", edit_metric_path(@metric),
24
25
  class: ck_button_classes(:dark),
25
- title: "The model proposed improvements based on your disagreements. Review and apply what you want." %>
26
- <% elsif @edit_draft %>
27
- <%= link_to "Review draft →", edit_metric_path(@metric),
28
- class: ck_button_classes(:dark),
29
- title: "An unpublished draft of this metric is saved. Review and publish, or keep editing." %>
26
+ title: review_title %>
30
27
  <% elsif @improve_disagreement_count.positive? %>
31
28
  <%= button_to "Improve the metric", suggest_variants_metric_path(@metric),
32
29
  method: :post, form_class: "inline-block",
33
30
  class: ck_button_classes(:light, variant: :outline),
34
- title: "Have the model rewrite this metric's instruction and rubric based on the disagreements collected so far.",
35
- data: { turbo_confirm: "Rewrite this metric based on the disagreements collected so far?" } %>
31
+ title: "Ask the model to suggest improvements to this metric's instruction and rubric based on the disagreements collected so far.",
32
+ data: { turbo_confirm: "Ask the model for suggested improvements based on the disagreements collected so far?" } %>
36
33
  <% else %>
37
34
  <button type="button" class="<%= ck_button_classes(:light, variant: :outline) %>" disabled
38
35
  title="Mark at least one case as Disagree before the model can suggest a change.">Improve the metric</button>
@@ -168,19 +165,20 @@
168
165
  <p class="ck-kicker">Cases to learn from</p>
169
166
  <span class="ck-chip"><%= pluralize(@disagreements.size, "case") %></span>
170
167
  </div>
171
- <p class="ck-meta-copy">Cases where a reviewer's score didn't match the judge's. Pin useful ones with <strong>Remember this</strong> so the judge sees them next time it grades.</p>
168
+ <% mixed_versions = @disagreements.map(&:metric_version_id).uniq.size > 1 %>
169
+ <p class="ck-meta-copy">Cases where a reviewer's score didn't match the judge's. Pin useful ones with <strong>Remember this</strong> so the judge sees them next time it grades<%= " (pins flow into the current judge regardless of which version produced the verdict)" if mixed_versions %>.</p>
172
170
  <% existing_ids = Array(@metric.few_shot_examples).map { |fs| fs["calibration_id"] } %>
173
171
  <ul class="ck-disagreement-list">
174
172
  <% @disagreements.each do |cal| %>
175
173
  <% review = cal.response.reviews.find { |r| r.metric_id == @metric.id } %>
176
174
  <% already = existing_ids.include?(cal.id) %>
177
- <% cal_version = cal.judge_version %>
178
- <% on_current = cal_version&.id == @published_judge_version.id %>
175
+ <% cal_metric_version = cal.metric_version %>
176
+ <% on_current = cal_metric_version&.id == @published_metric_version.id %>
179
177
  <li class="ck-disagreement<%= " ck-disagreement--remembered" if already %><%= " ck-disagreement--stale" unless on_current %>">
180
178
  <div class="ck-disagreement__head">
181
179
  <div class="ck-disagreement__scores">
182
- <% if cal_version %>
183
- <span class="ck-source-chip <%= on_current ? "ck-source-chip--current" : "ck-source-chip--past" %>" title="<%= on_current ? "Verdict on the live judge version." : "Verdict on a superseded judge version." %>"><%= cal_version.version_label %></span>
180
+ <% if cal_metric_version && mixed_versions %>
181
+ <span class="ck-source-chip <%= on_current ? "ck-source-chip--current" : "ck-source-chip--past" %>" title="<%= on_current ? "Verdict on the live judge version." : "Verdict on a superseded judge version." %>"><%= cal_metric_version.version_label %></span>
184
182
  <% end %>
185
183
  <span class="ck-disagreement__scores-label">Judge</span>
186
184
  <% if review&.ai_score %>
@@ -98,10 +98,15 @@
98
98
 
99
99
  <div class="ck-review-list">
100
100
  <% @reviews.each do |review| %>
101
- <div class="ck-review-card" id="<%= review.metric&.name&.parameterize || "review-#{review.id}" %>">
101
+ <% review_version = review.metric_version %>
102
+ <% stale = review.stale_against_current_judge? %>
103
+ <div class="ck-review-card<%= " ck-review-card--stale" if stale %>" id="<%= review.metric&.name&.parameterize || "review-#{review.id}" %>">
102
104
  <div class="ck-review-card__header">
103
105
  <span class="ck-review-card__metric"><% if review.metric %><%= link_to review.metric_name, metric_path(review.metric), class: "ck-link" %><% else %><%= review.metric_name %><% end %></span>
104
106
  <div class="ck-inline">
107
+ <% if review_version %>
108
+ <span class="ck-source-chip <%= stale ? "ck-source-chip--past" : "ck-source-chip--current" %>" title="<%= stale ? "Score produced by #{review_version.version_label} of this metric. The live judge has changed since." : "Score produced by the live judge (#{review_version.version_label})." %>"><%= review_version.version_label %></span>
109
+ <% end %>
105
110
  <% if review.ai_score %>
106
111
  <% 5.times do |i| %>
107
112
  <svg viewBox="0 0 24 24" width="16" height="16" stroke-width="1.75" class="ck-star <%= i < review.ai_score.to_i ? "ck-star--filled" : "ck-star--empty" %>"><polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2"/></svg>
@@ -111,6 +116,9 @@
111
116
  <% end %>
112
117
  </div>
113
118
  </div>
119
+ <% if stale %>
120
+ <p class="ck-review-card__stale-note">Scored against a superseded version of this metric. The live judge may score this differently.</p>
121
+ <% end %>
114
122
  <% if review.ai_feedback.present? %>
115
123
  <p class="ck-review-card__feedback"><%= review.ai_feedback %></p>
116
124
  <% end %>
@@ -18,6 +18,29 @@
18
18
  <% dataset_preview_lines = dataset_lines.first(50) %>
19
19
  <% end %>
20
20
 
21
+ <% if CompletionKit.config.judge_calibration_enabled %>
22
+ <% stale_summary = @run.stale_review_summary %>
23
+ <% if stale_summary.any? %>
24
+ <div class="ck-stale-versions-banner" role="status">
25
+ <div class="ck-stale-versions-banner__body">
26
+ <p class="ck-kicker">Stale judge versions</p>
27
+ <p class="ck-meta-copy">
28
+ This run was scored against metric versions that are no longer live.
29
+ <% stale_summary.values.each_with_index do |s, i| %>
30
+ <%= ", " if i > 0 %><strong><%= s[:metric_name] %></strong> (scored by <%= s[:scored_labels].join(", ") %>; live is <%= s[:current_label] %>)<% end %>.
31
+ Re-run to refresh the scores with the current judge.
32
+ </p>
33
+ </div>
34
+ <% if @run.status == "completed" %>
35
+ <%= button_to "Re-run with current judge",
36
+ rerun_run_path(@run), method: :post,
37
+ class: ck_button_classes(:dark), form_class: "inline-block",
38
+ data: { turbo_confirm: "Create a new run with the current metric versions? The original run stays as a record." } %>
39
+ <% end %>
40
+ </div>
41
+ <% end %>
42
+ <% end %>
43
+
21
44
  <div class="ck-run-config">
22
45
  <div class="ck-run-config__row">
23
46
  <span class="ck-run-config__key">Created</span>
@@ -0,0 +1,22 @@
1
# Moves the "judge version" schema over to "metric version" naming: the table
# itself, the referencing column on calibrations, and every explicitly named
# index on both tables.
class RenameJudgeVersionToMetricVersion < ActiveRecord::Migration[8.1]
  # Old name => new name for the indexes that live on the renamed table.
  # Note the last entry ends in `_vnum`, not `_version`.
  METRIC_VERSION_INDEX_RENAMES = {
    "index_ck_judge_versions_on_metric_id" => "index_ck_metric_versions_on_metric_id",
    "index_ck_judge_versions_on_metric_current" => "index_ck_metric_versions_on_metric_current",
    "index_ck_judge_versions_on_metric_state" => "index_ck_metric_versions_on_metric_state",
    "index_ck_judge_versions_on_metric_version" => "index_ck_metric_versions_on_metric_vnum"
  }.freeze
  private_constant :METRIC_VERSION_INDEX_RENAMES

  # Every call below is a rename, so the migration is written as a single
  # `change` method rather than separate up/down methods.
  def change
    rename_table :completion_kit_judge_versions, :completion_kit_metric_versions
    rename_column :completion_kit_calibrations, :judge_version_id, :metric_version_id

    # The table has already been renamed at this point, so the indexes are
    # addressed through the new table name.
    METRIC_VERSION_INDEX_RENAMES.each do |old_name, new_name|
      rename_index :completion_kit_metric_versions, old_name, new_name
    end

    rename_index :completion_kit_calibrations,
                 "index_ck_calibrations_on_judge_version_id",
                 "index_ck_calibrations_on_metric_version_id"
  end
end
@@ -0,0 +1,21 @@
1
# Adds a nullable `metric_version_id` column (plus an index) to reviews and,
# when migrating up, backfills it for existing rows.
class AddMetricVersionToReviews < ActiveRecord::Migration[8.1]
  def change
    add_column :completion_kit_reviews, :metric_version_id, :bigint
    add_index :completion_kit_reviews, :metric_version_id, name: "index_ck_reviews_on_metric_version_id"

    reversible do |dir|
      # Only an `up` step is needed: rolling back drops the column, which
      # discards the backfilled values with it.
      dir.up do
        # Quote the boolean with the connection this migration actually runs
        # on (the same one `execute` uses) instead of
        # ActiveRecord::Base.connection, which may point at a different
        # database in a multi-database host app.
        current_flag = connection.quote(true)

        # Each review with a metric is pointed at the version of that metric
        # flagged `current`; when no such version exists the scalar subquery
        # yields NULL and the column stays empty.
        # NOTE(review): this attributes pre-existing reviews to the current
        # version even if an earlier version produced the score — confirm
        # that is the intended backfill.
        execute <<~SQL
          UPDATE completion_kit_reviews
          SET metric_version_id = (
            SELECT mv.id FROM completion_kit_metric_versions mv
            WHERE mv.metric_id = completion_kit_reviews.metric_id
              AND mv.current = #{current_flag}
            LIMIT 1
          )
          WHERE metric_id IS NOT NULL
        SQL
      end
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module CompletionKit
2
- VERSION = "0.5.43"
2
+ VERSION = "0.5.44"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: completion-kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.43
4
+ version: 0.5.44
5
5
  platform: ruby
6
6
  authors:
7
7
  - Damien Bastin
@@ -272,11 +272,11 @@ files:
272
272
  - app/models/completion_kit/calibration.rb
273
273
  - app/models/completion_kit/dashboard_dismissal.rb
274
274
  - app/models/completion_kit/dataset.rb
275
- - app/models/completion_kit/judge_version.rb
276
275
  - app/models/completion_kit/mcp_session.rb
277
276
  - app/models/completion_kit/metric.rb
278
277
  - app/models/completion_kit/metric_group.rb
279
278
  - app/models/completion_kit/metric_group_membership.rb
279
+ - app/models/completion_kit/metric_version.rb
280
280
  - app/models/completion_kit/model.rb
281
281
  - app/models/completion_kit/prompt.rb
282
282
  - app/models/completion_kit/provider_credential.rb
@@ -295,7 +295,6 @@ files:
295
295
  - app/services/completion_kit/csv_processor.rb
296
296
  - app/services/completion_kit/dashboard_stats.rb
297
297
  - app/services/completion_kit/judge_service.rb
298
- - app/services/completion_kit/judge_variant_generator.rb
299
298
  - app/services/completion_kit/llm_client.rb
300
299
  - app/services/completion_kit/mcp_dispatcher.rb
301
300
  - app/services/completion_kit/mcp_tools/base.rb
@@ -310,6 +309,7 @@ files:
310
309
  - app/services/completion_kit/mcp_tools/runs.rb
311
310
  - app/services/completion_kit/mcp_tools/tags.rb
312
311
  - app/services/completion_kit/metric_calibration_stats.rb
312
+ - app/services/completion_kit/metric_variant_generator.rb
313
313
  - app/services/completion_kit/model_discovery_service.rb
314
314
  - app/services/completion_kit/ollama_client.rb
315
315
  - app/services/completion_kit/onboarding/checklist.rb
@@ -422,6 +422,8 @@ files:
422
422
  - db/migrate/20260523000002_add_state_to_completion_kit_judge_versions.rb
423
423
  - db/migrate/20260524000001_create_completion_kit_starter_metric_dismissals.rb
424
424
  - db/migrate/20260525000001_add_version_number_and_published_at_to_judge_versions.rb
425
+ - db/migrate/20260528000001_rename_judge_version_to_metric_version.rb
426
+ - db/migrate/20260528000002_add_metric_version_to_reviews.rb
425
427
  - lib/completion-kit.rb
426
428
  - lib/completion_kit.rb
427
429
  - lib/completion_kit/concurrency_check.rb