completion-kit 0.5.42 → 0.5.44

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. checksums.yaml +4 -4
  2. data/app/assets/javascripts/completion_kit/application.js +17 -0
  3. data/app/assets/stylesheets/completion_kit/application.css +530 -39
  4. data/app/controllers/completion_kit/api/v1/calibrations_controller.rb +1 -1
  5. data/app/controllers/completion_kit/api/v1/runs_controller.rb +4 -0
  6. data/app/controllers/completion_kit/calibrations_controller.rb +1 -1
  7. data/app/controllers/completion_kit/metrics_controller.rb +88 -31
  8. data/app/controllers/completion_kit/runs_controller.rb +6 -0
  9. data/app/jobs/completion_kit/judge_review_job.rb +14 -0
  10. data/app/models/completion_kit/calibration.rb +6 -2
  11. data/app/models/completion_kit/metric.rb +0 -17
  12. data/app/models/completion_kit/{judge_version.rb → metric_version.rb} +35 -2
  13. data/app/models/completion_kit/review.rb +9 -0
  14. data/app/models/completion_kit/run.rb +28 -0
  15. data/app/services/completion_kit/mcp_tools/calibrations.rb +1 -1
  16. data/app/services/completion_kit/mcp_tools/judges.rb +15 -13
  17. data/app/services/completion_kit/metric_calibration_stats.rb +17 -5
  18. data/app/services/completion_kit/{judge_variant_generator.rb → metric_variant_generator.rb} +14 -12
  19. data/app/views/completion_kit/api_reference/_body.html.erb +1 -1
  20. data/app/views/completion_kit/calibrations/_buttons.html.erb +43 -6
  21. data/app/views/completion_kit/calibrations/_trust_panel.html.erb +32 -28
  22. data/app/views/completion_kit/metrics/_form.html.erb +90 -4
  23. data/app/views/completion_kit/metrics/_rubric_diff.html.erb +25 -0
  24. data/app/views/completion_kit/metrics/_rubric_hint.html.erb +4 -0
  25. data/app/views/completion_kit/metrics/_starter_card.html.erb +13 -9
  26. data/app/views/completion_kit/metrics/edit.html.erb +5 -1
  27. data/app/views/completion_kit/metrics/index.html.erb +5 -3
  28. data/app/views/completion_kit/metrics/show.html.erb +131 -127
  29. data/app/views/completion_kit/metrics/starter_preview.html.erb +6 -6
  30. data/app/views/completion_kit/responses/show.html.erb +9 -1
  31. data/app/views/completion_kit/runs/_status_panel.html.erb +2 -2
  32. data/app/views/completion_kit/runs/show.html.erb +23 -0
  33. data/config/routes.rb +2 -1
  34. data/db/migrate/20260525000001_add_version_number_and_published_at_to_judge_versions.rb +24 -0
  35. data/db/migrate/20260528000001_rename_judge_version_to_metric_version.rb +22 -0
  36. data/db/migrate/20260528000002_add_metric_version_to_reviews.rb +21 -0
  37. data/lib/completion_kit/version.rb +1 -1
  38. metadata +8 -3
@@ -45,6 +45,10 @@ module CompletionKit
45
45
  end
46
46
 
47
47
  def retry_failures
48
+ if @run.stale_review_summary.any?
49
+ return render(json: { error: "Judge has changed since this run executed. Retry would mix versions in the same run; use POST /api/v1/runs/:id/rerun instead." }, status: :conflict)
50
+ end
51
+
48
52
  scope = @run.responses.where(status: "failed")
49
53
  scope = scope.where(id: params[:only]) if params[:only].present?
50
54
 
@@ -18,7 +18,7 @@ module CompletionKit
18
18
  run: @run, response: @response, metric: @metric, created_by: created_by
19
19
  )
20
20
  calibration.assign_attributes(
21
- judge_version: JudgeVersion.ensure_current_for(@metric),
21
+ metric_version: MetricVersion.ensure_current_for(@metric),
22
22
  verdict: params[:verdict],
23
23
  corrected_score: params[:corrected_score].presence,
24
24
  note: params[:note].presence
@@ -1,7 +1,7 @@
1
1
  module CompletionKit
2
2
  class MetricsController < ApplicationController
3
3
  include CompletionKit::TagFiltering
4
- before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :publish_draft, :suggest_variants, :dismiss_suggestion]
4
+ before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :remove_few_shot, :publish_draft, :suggest_variants, :dismiss_suggestion]
5
5
 
6
6
  def index
7
7
  @metrics = apply_tag_filter(Metric.includes(:metric_groups, :tags).order(:name))
@@ -35,14 +35,16 @@ module CompletionKit
35
35
  end
36
36
 
37
37
  def show
38
+ @published_metric_version = MetricVersion.ensure_current_for(@metric)
38
39
  @disagreements = Calibration.where(metric_id: @metric.id, verdict: "disagree")
39
- .includes(response: [:reviews, :run])
40
+ .includes(:metric_version, response: [:reviews, :run])
40
41
  .order(created_at: :desc)
41
42
  .limit(50)
42
- @edit_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
43
- @published_judge_version = JudgeVersion.published.where(metric_id: @metric.id, current: true).first
44
- @suggestion_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
45
- @improve_disagreement_count = @disagreements.size
43
+ @edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
44
+ @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
45
+ @improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree",
46
+ metric_version_id: @published_metric_version.id).count
47
+ @versions = MetricVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
46
48
  end
47
49
 
48
50
  def new
@@ -50,6 +52,14 @@ module CompletionKit
50
52
  end
51
53
 
52
54
  def edit
55
+ @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
56
+ @edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
57
+ @published_metric_version = MetricVersion.published.where(metric_id: @metric.id, current: true).first
58
+
59
+ if @edit_draft
60
+ @metric.instruction = @edit_draft.instruction
61
+ @metric.rubric_bands = @edit_draft.rubric_bands
62
+ end
53
63
  end
54
64
 
55
65
  def create
@@ -63,10 +73,42 @@ module CompletionKit
63
73
  end
64
74
 
65
75
  def update
66
- if @metric.update(metric_params)
67
- redirect_to metric_path(@metric), notice: "Metric was successfully updated."
76
+ judge_keys = %i[instruction rubric_bands]
77
+ meta_attrs = metric_params.except(*judge_keys)
78
+ proposed_instruction = metric_params[:instruction]
79
+ proposed_rubric = metric_params[:rubric_bands]
80
+
81
+ unless @metric.update(meta_attrs)
82
+ return render(:edit, status: :unprocessable_entity)
83
+ end
84
+
85
+ current_instruction = @metric.instruction.to_s
86
+ current_rubric = @metric.rubric_bands || []
87
+ normalized_proposed_rubric = normalize_rubric_bands_for_update(proposed_rubric)
88
+
89
+ instruction_changed = !proposed_instruction.nil? && proposed_instruction.to_s != current_instruction
90
+ rubric_changed = !normalized_proposed_rubric.nil? && normalized_proposed_rubric != current_rubric
91
+
92
+ unless instruction_changed || rubric_changed
93
+ return redirect_to(metric_path(@metric), notice: "Metric was successfully updated.")
94
+ end
95
+
96
+ new_instruction = instruction_changed ? proposed_instruction.to_s : current_instruction
97
+ new_rubric = rubric_changed ? normalized_proposed_rubric : current_rubric
98
+
99
+ if @metric.reviews.exists?
100
+ MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").destroy_all
101
+ draft = MetricVersion.create!(
102
+ metric: @metric, instruction: new_instruction, rubric_bands: new_rubric,
103
+ state: "draft", source: "edit", current: false
104
+ )
105
+ redirect_to edit_metric_path(@metric),
106
+ notice: "Saved as draft #{draft.version_label}. Publish to push these changes to the live judge."
68
107
  else
69
- render :edit, status: :unprocessable_entity
108
+ @metric.update!(instruction: new_instruction, rubric_bands: new_rubric)
109
+ current_pub = MetricVersion.published.where(metric_id: @metric.id, current: true).first
110
+ current_pub&.update!(instruction: @metric.instruction, rubric_bands: @metric.rubric_bands)
111
+ redirect_to metric_path(@metric), notice: "Metric was successfully updated."
70
112
  end
71
113
  end
72
114
 
@@ -76,49 +118,48 @@ module CompletionKit
76
118
  end
77
119
 
78
120
  def suggest_variants
121
+ target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
79
122
  disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
80
123
  if disagreement_count.zero?
81
- redirect_to metric_path(@metric), alert: "Mark at least one row as Disagree before asking the model to suggest a change."
124
+ redirect_to target, alert: "Mark at least one row as Disagree before asking the model to suggest a change."
82
125
  return
83
126
  end
84
127
 
85
- JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
128
+ MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
86
129
 
87
- generator = JudgeVariantGenerator.new(@metric, count: 1)
130
+ generator = MetricVariantGenerator.new(@metric, count: 1)
88
131
  variants = generator.call
89
132
  if variants.empty?
90
- redirect_to metric_path(@metric), alert: "The model returned no usable variants. Try again with a different model."
133
+ redirect_to target, alert: "The model returned no usable variants. Try again with a different model."
91
134
  return
92
135
  end
93
136
  generator.persist!(variants)
94
- redirect_to metric_path(@metric), notice: "Drafted a new version. Review it below."
137
+ redirect_to target, notice: "Drafted a new version. Review it below."
95
138
  end
96
139
 
97
140
  def dismiss_suggestion
98
- draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").find_by(id: params[:draft_id])
141
+ draft = MetricVersion.drafts.where(metric_id: @metric.id).find_by(id: params[:draft_id])
99
142
  draft&.destroy
100
- redirect_to metric_path(@metric), notice: "Dismissed."
143
+ target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
144
+ redirect_to target, notice: "Dismissed."
101
145
  end
102
146
 
103
147
  def publish_draft
104
- scope = JudgeVersion.drafts.where(metric_id: @metric.id)
105
- draft = params[:draft_id].present? ? scope.find_by(id: params[:draft_id]) : scope.order(created_at: :desc).first
106
-
107
- if draft.nil?
108
- redirect_to metric_path(@metric), alert: "No draft to publish."
148
+ scope = MetricVersion.where(metric_id: @metric.id)
149
+ version = if params[:draft_id].present?
150
+ scope.find_by(id: params[:draft_id])
151
+ else
152
+ MetricVersion.drafts.where(metric_id: @metric.id).order(created_at: :desc).first
153
+ end
154
+
155
+ if version.nil?
156
+ redirect_to metric_path(@metric), alert: "No version to publish."
109
157
  return
110
158
  end
111
159
 
112
- JudgeVersion.transaction do
113
- JudgeVersion.where(metric_id: @metric.id, state: "published").update_all(current: false)
114
- draft.update!(state: "published", current: true)
115
- @metric.update_columns(
116
- instruction: draft.instruction,
117
- rubric_bands: Array(draft.rubric_bands).to_json
118
- )
119
- end
120
-
121
- redirect_to metric_path(@metric), notice: "This judge version is now live."
160
+ version.publish!
161
+ redirect_to metric_path(@metric),
162
+ notice: "#{@metric.name} #{version.version_label} is now the published version."
122
163
  end
123
164
 
124
165
  def add_few_shot
@@ -139,6 +180,13 @@ module CompletionKit
139
180
  redirect_to metric_path(@metric), notice: "Got it. The judge will remember this next time it grades."
140
181
  end
141
182
 
183
+ def remove_few_shot
184
+ cal_id = params[:calibration_id].to_i
185
+ remaining = Array(@metric.few_shot_examples).reject { |fs| fs["calibration_id"].to_i == cal_id }
186
+ @metric.update!(few_shot_examples: remaining)
187
+ redirect_to metric_path(@metric), notice: "Forgotten. The judge won't see this case next time."
188
+ end
189
+
142
190
  private
143
191
 
144
192
  def set_metric
@@ -149,5 +197,14 @@ module CompletionKit
149
197
  params.require(:metric).permit(:name, :instruction,
150
198
  rubric_bands: [:stars, :description], tag_names: [])
151
199
  end
200
+
201
+ def normalize_rubric_bands_for_update(bands)
202
+ return nil if bands.nil?
203
+ array = bands.is_a?(ActionController::Parameters) ? bands.to_unsafe_h.values : bands
204
+ Array(array).map do |b|
205
+ h = b.respond_to?(:to_unsafe_h) ? b.to_unsafe_h : b
206
+ { "stars" => h["stars"].to_i, "description" => h["description"].to_s }
207
+ end.sort_by { |b| -b["stars"] }
208
+ end
152
209
  end
153
210
  end
@@ -126,6 +126,12 @@ module CompletionKit
126
126
  end
127
127
 
128
128
  def retry_failures
129
+ if @run.stale_review_summary.any?
130
+ redirect_to run_path(@run),
131
+ alert: "The judge has changed since this run executed. Retrying failed cases would mix scores from two metric versions in the same run. Use 'Re-run with current judge' to refresh everything against the live judge."
132
+ return
133
+ end
134
+
129
135
  scope = @run.responses.where(status: "failed")
130
136
  scope = scope.where(id: params[:only]) if params[:only].present?
131
137
 
@@ -57,13 +57,16 @@ module CompletionKit
57
57
  run.prompt&.template,
58
58
  criteria: metric.instruction.to_s,
59
59
  rubric_text: metric.display_rubric_text,
60
+ human_examples: few_shot_payload(metric),
60
61
  input_data: response.input_data
61
62
  )
62
63
 
63
64
  review = response.reviews.find_or_initialize_by(metric_id: metric.id)
65
+ current_metric_version = MetricVersion.ensure_current_for(metric)
64
66
  review.assign_attributes(
65
67
  metric_name: metric.name,
66
68
  instruction: metric.instruction.to_s,
69
+ metric_version_id: current_metric_version.id,
67
70
  status: "succeeded",
68
71
  ai_score: evaluation[:score],
69
72
  ai_feedback: evaluation[:feedback],
@@ -119,5 +122,16 @@ module CompletionKit
119
122
  response = Response.find_by(id: response_id)
120
123
  RunCompletionCheckJob.perform_later(response.run_id) if response
121
124
  end
125
+
126
+ def few_shot_payload(metric)
127
+ return nil unless CompletionKit.config.judge_calibration_enabled
128
+ Array(metric.few_shot_examples).map do |fs|
129
+ {
130
+ human_score: fs["human_score"],
131
+ response_text: fs["response"].to_s,
132
+ human_note: fs["human_note"].to_s
133
+ }
134
+ end
135
+ end
122
136
  end
123
137
  end
@@ -5,7 +5,11 @@ module CompletionKit
5
5
  belongs_to :run
6
6
  belongs_to :response
7
7
  belongs_to :metric
8
- belongs_to :judge_version
8
+ belongs_to :metric_version
9
+
10
+ alias_attribute :judge_version_id, :metric_version_id
11
+ alias_method :judge_version, :metric_version
12
+ alias_method :judge_version=, :metric_version=
9
13
 
10
14
  validates :verdict, presence: true, inclusion: { in: VERDICTS }
11
15
  validates :response_id,
@@ -22,7 +26,7 @@ module CompletionKit
22
26
  run_id: run_id,
23
27
  response_id: response_id,
24
28
  metric_id: metric_id,
25
- judge_version_id: judge_version_id,
29
+ metric_version_id: metric_version_id,
26
30
  verdict: verdict,
27
31
  corrected_score: corrected_score,
28
32
  note: note,
@@ -24,7 +24,6 @@ module CompletionKit
24
24
  before_validation :generate_key
25
25
  before_validation :normalize_rubric_bands
26
26
  before_validation :set_defaults
27
- after_update :fork_draft_judge_version, if: :judge_relevant_changes?
28
27
 
29
28
  def self.default_rubric_bands
30
29
  DEFAULT_RUBRIC_BANDS.map(&:dup)
@@ -98,21 +97,5 @@ module CompletionKit
98
97
  self.rubric_bands = self.class.normalize_rubric_bands(rubric_bands) if rubric_bands.present?
99
98
  end
100
99
 
101
- def judge_relevant_changes?
102
- saved_change_to_instruction? || saved_change_to_rubric_bands?
103
- end
104
-
105
- def fork_draft_judge_version
106
- JudgeVersion.ensure_current_for(self)
107
- JudgeVersion.where(metric_id: id, state: "draft").update_all(current: false)
108
- JudgeVersion.create!(
109
- metric: self,
110
- instruction: instruction,
111
- rubric_bands: rubric_bands,
112
- current: false,
113
- state: "draft",
114
- source: "edit"
115
- )
116
- end
117
100
  end
118
101
  end
@@ -1,5 +1,5 @@
1
1
  module CompletionKit
2
- class JudgeVersion < ApplicationRecord
2
+ class MetricVersion < ApplicationRecord
3
3
  STATES = %w[draft published].freeze
4
4
 
5
5
  belongs_to :metric
@@ -7,8 +7,11 @@ module CompletionKit
7
7
 
8
8
  serialize :rubric_bands, coder: JSON
9
9
 
10
+ before_validation :assign_version_number, on: :create
11
+
10
12
  validates :metric_id, presence: true
11
13
  validates :state, inclusion: { in: STATES }
14
+ validates :version_number, presence: true, uniqueness: { scope: :metric_id }
12
15
 
13
16
  scope :current, -> { where(current: true) }
14
17
  scope :published, -> { where(state: "published") }
@@ -20,7 +23,8 @@ module CompletionKit
20
23
  instruction: metric.instruction,
21
24
  rubric_bands: metric.rubric_bands,
22
25
  current: true,
23
- state: "published"
26
+ state: "published",
27
+ published_at: Time.current
24
28
  )
25
29
  end
26
30
 
@@ -32,17 +36,46 @@ module CompletionKit
32
36
  state == "published"
33
37
  end
34
38
 
39
+ def version_label
40
+ "v#{version_number}"
41
+ end
42
+
43
+ def publish!
44
+ MetricVersion.transaction do
45
+ self.class.where(metric_id: metric_id).where.not(id: id).update_all(current: false)
46
+ reload
47
+ update!(state: "published", current: true, published_at: published_at || Time.current)
48
+ metric.update_columns(
49
+ instruction: instruction,
50
+ rubric_bands: Array(rubric_bands).to_json
51
+ )
52
+ end
53
+ self
54
+ end
55
+
35
56
  def as_json(options = {})
36
57
  {
37
58
  id: id,
38
59
  metric_id: metric_id,
60
+ version_number: version_number,
39
61
  instruction: instruction,
40
62
  rubric_bands: rubric_bands,
41
63
  current: current,
42
64
  state: state,
43
65
  source: source,
66
+ published_at: published_at,
44
67
  created_at: created_at
45
68
  }
46
69
  end
70
+
71
+ private
72
+
73
+ def assign_version_number
74
+ return if version_number.present?
75
+ max = self.class.where(metric_id: metric_id).maximum(:version_number).to_i
76
+ self.version_number = max + 1
77
+ end
47
78
  end
79
+
80
+ JudgeVersion = MetricVersion
48
81
  end
@@ -5,8 +5,16 @@ module CompletionKit
5
5
 
6
6
  belongs_to :response
7
7
  belongs_to :metric, optional: true
8
+ belongs_to :metric_version, optional: true
8
9
  has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
9
10
 
11
+ def stale_against_current_judge?
12
+ return false unless metric_id && metric_version_id
13
+ current_id = MetricVersion.current.where(metric_id: metric_id).limit(1).pick(:id)
14
+ return false if current_id.nil?
15
+ metric_version_id != current_id
16
+ end
17
+
10
18
  validates :metric_name, presence: true
11
19
  validates :status, inclusion: { in: STATUSES }
12
20
  validates :ai_score, numericality: { greater_than_or_equal_to: 1, less_than_or_equal_to: 5 }, allow_nil: true
@@ -29,6 +37,7 @@ module CompletionKit
29
37
  def as_json(options = {})
30
38
  {
31
39
  id: id, response_id: response_id, metric_id: metric_id,
40
+ metric_version_id: metric_version_id,
32
41
  metric_name: metric_name, ai_score: ai_score,
33
42
  ai_feedback: ai_feedback, status: status, attempts: attempts,
34
43
  error: error_payload
@@ -89,6 +89,34 @@ module CompletionKit
89
89
  end
90
90
  end
91
91
 
92
+ def stale_review_summary
93
+ review_pairs = Review.where(response_id: response_ids)
94
+ .where.not(metric_id: nil)
95
+ .where.not(metric_version_id: nil)
96
+ .pluck(:metric_id, :metric_version_id, :metric_name)
97
+ return {} if review_pairs.empty?
98
+
99
+ metric_ids = review_pairs.map(&:first).uniq
100
+ version_ids = review_pairs.map { |_, vid, _| vid }.uniq
101
+ current_by_metric = MetricVersion.current.where(metric_id: metric_ids).pluck(:metric_id, :id, :version_number).each_with_object({}) do |(mid, vid, vnum), h|
102
+ h[mid] = { id: vid, label: "v#{vnum}" }
103
+ end
104
+ label_by_version = MetricVersion.where(id: version_ids).pluck(:id, :version_number).each_with_object({}) { |(vid, vnum), h| h[vid] = "v#{vnum}" }
105
+
106
+ summary = {}
107
+ review_pairs.each do |metric_id, version_id, metric_name|
108
+ current = current_by_metric[metric_id]
109
+ next if current.nil?
110
+ next if version_id == current[:id]
111
+ label = label_by_version[version_id]
112
+ next if label.nil?
113
+ summary[metric_id] ||= { metric_name: metric_name, current_label: current[:label], stale_count: 0, scored_labels: [] }
114
+ summary[metric_id][:stale_count] += 1
115
+ summary[metric_id][:scored_labels] |= [label]
116
+ end
117
+ summary
118
+ end
119
+
92
120
  def start!
93
121
  rows = if dataset
94
122
  CsvProcessor.process_self(self)
@@ -56,7 +56,7 @@ module CompletionKit
56
56
  run_id: run.id, response_id: response.id, metric_id: metric.id, created_by: created_by
57
57
  )
58
58
  calibration.assign_attributes(
59
- judge_version: CompletionKit::JudgeVersion.ensure_current_for(metric),
59
+ metric_version: CompletionKit::MetricVersion.ensure_current_for(metric),
60
60
  verdict: args["verdict"],
61
61
  corrected_score: args["corrected_score"],
62
62
  note: args["note"]
@@ -5,7 +5,7 @@ module CompletionKit
5
5
 
6
6
  TOOLS = {
7
7
  "judges_suggest" => {
8
- description: "Ask the model to rewrite the metric's judge instruction in N variants targeted at the recent disagreements. Each variant is saved as a draft JudgeVersion with source=\"suggestion\". Returns the persisted drafts. Stripe-metering hooks fire via ActiveSupport::Notifications under completion_kit.judge_suggestion.generated.",
8
+ description: "Ask the model to rewrite the metric's judge instruction in N variants targeted at the recent disagreements. Each variant is saved as a draft MetricVersion with source=\"suggestion\". Returns the persisted drafts. Stripe-metering hooks fire via ActiveSupport::Notifications under completion_kit.judge_suggestion.generated.",
9
9
  inputSchema: {
10
10
  type: "object",
11
11
  properties: {
@@ -33,15 +33,15 @@ module CompletionKit
33
33
  handler: :replay
34
34
  },
35
35
  "judges_compare" => {
36
- description: "Compare two judge versions' calibration stats side by side. Pass either two judge_version_ids or one metric_id with judge_version_a_id / judge_version_b_id.",
36
+ description: "Compare two metric versions' calibration stats side by side. Pass either two metric_version_ids or one metric_id with metric_version_a_id / metric_version_b_id.",
37
37
  inputSchema: {
38
38
  type: "object",
39
39
  properties: {
40
40
  metric_id: { type: "integer" },
41
- judge_version_a_id: { type: "integer" },
42
- judge_version_b_id: { type: "integer" }
41
+ metric_version_a_id: { type: "integer" },
42
+ metric_version_b_id: { type: "integer" }
43
43
  },
44
- required: ["metric_id", "judge_version_a_id", "judge_version_b_id"]
44
+ required: ["metric_id", "metric_version_a_id", "metric_version_b_id"]
45
45
  },
46
46
  handler: :compare
47
47
  }
@@ -49,7 +49,7 @@ module CompletionKit
49
49
 
50
50
  def self.suggest(args)
51
51
  metric = CompletionKit::Metric.find(args["metric_id"])
52
- generator = CompletionKit::JudgeVariantGenerator.new(metric, count: args["count"].to_i, model: args["model"])
52
+ generator = CompletionKit::MetricVariantGenerator.new(metric, count: args["count"].to_i, model: args["model"])
53
53
  variants = generator.call
54
54
  return error_result("Variant generator returned no parseable variants. Try again or change the model.") if variants.empty?
55
55
  versions = generator.persist!(variants)
@@ -75,20 +75,22 @@ module CompletionKit
75
75
 
76
76
  def self.compare(args)
77
77
  metric = CompletionKit::Metric.find(args["metric_id"])
78
- a = CompletionKit::JudgeVersion.where(metric_id: metric.id).find(args["judge_version_a_id"])
79
- b = CompletionKit::JudgeVersion.where(metric_id: metric.id).find(args["judge_version_b_id"])
80
- stats_a = CompletionKit::MetricCalibrationStats.for(metric, judge_version: a)
81
- stats_b = CompletionKit::MetricCalibrationStats.for(metric, judge_version: b)
78
+ a_id = args["metric_version_a_id"] || args["judge_version_a_id"]
79
+ b_id = args["metric_version_b_id"] || args["judge_version_b_id"]
80
+ a = CompletionKit::MetricVersion.where(metric_id: metric.id).find(a_id)
81
+ b = CompletionKit::MetricVersion.where(metric_id: metric.id).find(b_id)
82
+ stats_a = CompletionKit::MetricCalibrationStats.for(metric, metric_version: a)
83
+ stats_b = CompletionKit::MetricCalibrationStats.for(metric, metric_version: b)
82
84
  text_result({
83
85
  metric_id: metric.id,
84
- a: judge_version_payload(a, stats_a),
85
- b: judge_version_payload(b, stats_b),
86
+ a: metric_version_payload(a, stats_a),
87
+ b: metric_version_payload(b, stats_b),
86
88
  delta: delta_payload(stats_a, stats_b),
87
89
  recommendation: recommendation_for(stats_a, stats_b)
88
90
  })
89
91
  end
90
92
 
91
- def self.judge_version_payload(version, stats)
93
+ def self.metric_version_payload(version, stats)
92
94
  {
93
95
  id: version.id, state: version.state, current: version.current,
94
96
  source: version.source, created_at: version.created_at,
@@ -31,18 +31,30 @@ module CompletionKit
31
31
  end
32
32
  end
33
33
 
34
- def self.for(metric, judge_version: nil)
35
- new(metric: metric, judge_version: judge_version).call
34
+ CURRENT = :current
35
+
36
+ def self.for(metric, metric_version: CURRENT)
37
+ resolved = case metric_version
38
+ when CURRENT then MetricVersion.current.find_by(metric_id: metric.id)
39
+ when nil then nil
40
+ else metric_version
41
+ end
42
+ new(metric: metric, metric_version: resolved, all_versions: metric_version.nil?).call
36
43
  end
37
44
 
38
- def initialize(metric:, judge_version: nil)
45
+ def initialize(metric:, metric_version: nil, all_versions: false)
39
46
  @metric = metric
40
- @judge_version = judge_version
47
+ @metric_version = metric_version
48
+ @all_versions = all_versions
41
49
  end
42
50
 
43
51
  def call
44
52
  scope = Calibration.where(metric_id: @metric.id)
45
- scope = scope.where(judge_version_id: @judge_version.id) if @judge_version
53
+ if @metric_version
54
+ scope = scope.where(metric_version_id: @metric_version.id)
55
+ elsif !@all_versions
56
+ scope = scope.none
57
+ end
46
58
 
47
59
  verdicts = scope.pluck(:verdict, :corrected_score, :response_id)
48
60
  n = verdicts.length
@@ -1,5 +1,5 @@
1
1
  module CompletionKit
2
- class JudgeVariantGenerator
2
+ class MetricVariantGenerator
3
3
  DEFAULT_VARIANT_COUNT = 1
4
4
  MAX_VARIANT_COUNT = 3
5
5
  DEFAULT_TEMPERATURE = 0.4
@@ -20,9 +20,9 @@ module CompletionKit
20
20
  end
21
21
 
22
22
  def persist!(variants)
23
- JudgeVersion.where(metric_id: @metric.id, state: "draft", source: "suggestion").update_all(current: false)
23
+ MetricVersion.where(metric_id: @metric.id, state: "draft", source: "suggestion").update_all(current: false)
24
24
  versions = variants.map do |variant|
25
- JudgeVersion.create!(
25
+ MetricVersion.create!(
26
26
  metric: @metric,
27
27
  instruction: variant.instruction,
28
28
  rubric_bands: variant.rubric_bands.presence || @metric.rubric_bands,
@@ -41,8 +41,8 @@ module CompletionKit
41
41
  private
42
42
 
43
43
  def build_meta_prompt
44
- disagreements = JudgeCalibrationExamples.disagreements_for(@metric)
45
- borderlines = JudgeCalibrationExamples.borderlines_for(@metric)
44
+ disagreements = MetricCalibrationExamples.disagreements_for(@metric)
45
+ borderlines = MetricCalibrationExamples.borderlines_for(@metric)
46
46
  sections = []
47
47
  sections << "You are an expert evaluator. The judge below is misaligned with humans. Propose #{@count == 1 ? "a single" : "#{@count}"} concrete rewrite that closes the gap."
48
48
  sections << ""
@@ -86,7 +86,7 @@ module CompletionKit
86
86
  sections << "REASONING: <one short sentence: what changes and why>"
87
87
  sections << "INSTRUCTION:"
88
88
  sections << "<the rewritten instruction>"
89
- sections << "RUBRIC: # optional omit this block if the rubric is unchanged"
89
+ sections << "RUBRIC: # optional. Omit this block if the rubric is unchanged."
90
90
  sections << "5: <description for 5 stars>"
91
91
  sections << "4: <description for 4 stars>"
92
92
  sections << "3: <description for 3 stars>"
@@ -117,7 +117,7 @@ module CompletionKit
117
117
  end
118
118
  end
119
119
 
120
- module JudgeCalibrationExamples
120
+ module MetricCalibrationExamples
121
121
  module_function
122
122
 
123
123
  def for(metric, limit: 8)
@@ -133,11 +133,13 @@ module CompletionKit
133
133
  end
134
134
 
135
135
  def calibrations_for(metric, verdict:, limit:)
136
- Calibration.where(metric_id: metric.id, verdict: verdict)
137
- .includes(response: :reviews)
138
- .order(created_at: :desc)
139
- .limit(limit)
140
- .map do |cal|
136
+ scope = Calibration.where(metric_id: metric.id, verdict: verdict)
137
+ current_version = MetricVersion.current.find_by(metric_id: metric.id)
138
+ scope = scope.where(metric_version_id: current_version.id) if current_version
139
+ scope.includes(response: :reviews)
140
+ .order(created_at: :desc)
141
+ .limit(limit)
142
+ .map do |cal|
141
143
  review = cal.response.reviews.find { |r| r.metric_id == metric.id }
142
144
  {
143
145
  input: cal.response.input_data,
@@ -187,7 +187,7 @@
187
187
  </div>
188
188
  <%= render "completion_kit/api_reference/resource_list", title: "Your datasets",
189
189
  items: datasets.map { |d|
190
- { name: d.name, meta: pluralize([d.csv_data.to_s.lines.count - 1, 0].max, "row"),
190
+ { name: d.name, meta: pluralize([d.csv_data.to_s.lines.count - 1, 0].max, "entry"),
191
191
  url: "#{base_url}/api/v1/datasets/#{d.id}", dom_id: "dataset_ep_#{d.id}" }
192
192
  } %>
193
193
  </div>