completion-kit 0.7.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. checksums.yaml +4 -4
  2. data/app/assets/stylesheets/completion_kit/application.css +146 -325
  3. data/app/controllers/completion_kit/api/v1/base_controller.rb +14 -4
  4. data/app/controllers/completion_kit/api/v1/calibrations_controller.rb +2 -2
  5. data/app/controllers/completion_kit/api/v1/datasets_controller.rb +2 -2
  6. data/app/controllers/completion_kit/api/v1/metric_groups_controller.rb +2 -2
  7. data/app/controllers/completion_kit/api/v1/metric_versions_controller.rb +1 -1
  8. data/app/controllers/completion_kit/api/v1/metrics_controller.rb +5 -32
  9. data/app/controllers/completion_kit/api/v1/prompts_controller.rb +2 -2
  10. data/app/controllers/completion_kit/api/v1/provider_credentials_controller.rb +2 -2
  11. data/app/controllers/completion_kit/api/v1/runs_controller.rb +7 -7
  12. data/app/controllers/completion_kit/api/v1/tags_controller.rb +2 -2
  13. data/app/controllers/completion_kit/metrics_controller.rb +14 -37
  14. data/app/controllers/completion_kit/runs_controller.rb +2 -2
  15. data/app/jobs/completion_kit/generate_row_job.rb +2 -4
  16. data/app/jobs/completion_kit/judge_review_job.rb +4 -19
  17. data/app/models/completion_kit/metric.rb +0 -1
  18. data/app/models/completion_kit/metric_version.rb +35 -0
  19. data/app/models/completion_kit/run.rb +0 -1
  20. data/app/services/completion_kit/judge_service.rb +3 -10
  21. data/app/services/completion_kit/mcp_tools/metric_versions.rb +1 -1
  22. data/app/services/completion_kit/metric_variant_generator.rb +0 -13
  23. data/app/views/completion_kit/api_reference/_body.html.erb +2 -12
  24. data/app/views/completion_kit/api_reference/index.html.erb +4 -0
  25. data/app/views/completion_kit/calibrations/_trust_panel.html.erb +19 -12
  26. data/app/views/completion_kit/metrics/_form.html.erb +11 -12
  27. data/app/views/completion_kit/metrics/edit.html.erb +18 -0
  28. data/app/views/completion_kit/metrics/index.html.erb +0 -17
  29. data/app/views/completion_kit/metrics/show.html.erb +87 -105
  30. data/app/views/completion_kit/responses/show.html.erb +2 -2
  31. data/app/views/completion_kit/runs/show.html.erb +7 -7
  32. data/config/routes.rb +0 -4
  33. data/db/migrate/20260529000001_remove_few_shot_examples_from_completion_kit_metrics.rb +5 -0
  34. data/lib/completion_kit/version.rb +1 -1
  35. metadata +2 -1
@@ -20,7 +20,7 @@ module CompletionKit
20
20
  metric_group.replace_metrics!(params[:metric_ids]) if params.key?(:metric_ids)
21
21
  render json: metric_group.reload, status: :created
22
22
  else
23
- render json: {errors: metric_group.errors}, status: :unprocessable_entity
23
+ render_validation_errors(metric_group)
24
24
  end
25
25
  end
26
26
 
@@ -29,7 +29,7 @@ module CompletionKit
29
29
  @metric_group.replace_metrics!(params[:metric_ids]) if params.key?(:metric_ids)
30
30
  render json: @metric_group.reload
31
31
  else
32
- render json: {errors: @metric_group.errors}, status: :unprocessable_entity
32
+ render_validation_errors(@metric_group)
33
33
  end
34
34
  end
35
35
 
@@ -25,7 +25,7 @@ module CompletionKit
25
25
 
26
26
  def destroy
27
27
  if @version.published?
28
- render json: { error: "Cannot dismiss a published version. Publish a different version as current instead." }, status: :conflict
28
+ render_error("Cannot dismiss a published version. Publish a different version as current instead.", status: :conflict)
29
29
  return
30
30
  end
31
31
  @version.destroy!
@@ -2,7 +2,7 @@ module CompletionKit
2
2
  module Api
3
3
  module V1
4
4
  class MetricsController < BaseController
5
- before_action :set_metric, only: [:show, :update, :destroy, :suggest_variants, :add_few_shot, :remove_few_shot]
5
+ before_action :set_metric, only: [:show, :update, :destroy, :suggest_variants]
6
6
 
7
7
  def index
8
8
  scope = Metric.includes(:tags)
@@ -19,7 +19,7 @@ module CompletionKit
19
19
  if metric.save
20
20
  render json: metric, status: :created
21
21
  else
22
- render json: {errors: metric.errors}, status: :unprocessable_entity
22
+ render_validation_errors(metric)
23
23
  end
24
24
  end
25
25
 
@@ -27,7 +27,7 @@ module CompletionKit
27
27
  if @metric.update(metric_params)
28
28
  render json: @metric
29
29
  else
30
- render json: {errors: @metric.errors}, status: :unprocessable_entity
30
+ render_validation_errors(@metric)
31
31
  end
32
32
  end
33
33
 
@@ -39,7 +39,7 @@ module CompletionKit
39
39
  def suggest_variants
40
40
  disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
41
41
  if disagreement_count.zero?
42
- render json: { error: "Mark at least one case as Disagree before asking the model to suggest a change." }, status: :unprocessable_entity
42
+ render_error("Mark at least one case as Disagree before asking the model to suggest a change.", status: :unprocessable_entity)
43
43
  return
44
44
  end
45
45
 
@@ -47,40 +47,13 @@ module CompletionKit
47
47
  generator = MetricVariantGenerator.new(@metric, count: params[:count].to_i, model: params[:model])
48
48
  variants = generator.call
49
49
  if variants.empty?
50
- render json: { error: "The model returned no usable variants. Try again with a different model." }, status: :unprocessable_entity
50
+ render_error("The model returned no usable variants. Try again with a different model.", status: :unprocessable_entity)
51
51
  return
52
52
  end
53
53
  versions = generator.persist!(variants)
54
54
  render json: versions, status: :created
55
55
  end
56
56
 
57
- def add_few_shot
58
- calibration = Calibration.where(metric_id: @metric.id, verdict: "disagree").find(params[:calibration_id])
59
- review = calibration.response.reviews.find_by(metric_id: @metric.id)
60
- examples = Array(@metric.few_shot_examples)
61
- examples << {
62
- "input" => calibration.response.input_data.to_s.truncate(2000),
63
- "response" => calibration.response.response_text.to_s.truncate(2000),
64
- "judge_score" => review&.ai_score&.to_f,
65
- "judge_feedback" => review&.ai_feedback.to_s.truncate(1000),
66
- "human_score" => calibration.corrected_score&.to_f,
67
- "human_note" => calibration.note.to_s.truncate(1000),
68
- "calibration_id" => calibration.id,
69
- "added_at" => Time.current.utc.iso8601
70
- }
71
- @metric.update!(few_shot_examples: examples)
72
- render json: @metric.reload
73
- rescue ActiveRecord::RecordNotFound
74
- render json: { error: "Calibration not found or not a disagree on this metric." }, status: :not_found
75
- end
76
-
77
- def remove_few_shot
78
- cal_id = params[:calibration_id].to_i
79
- remaining = Array(@metric.few_shot_examples).reject { |fs| fs["calibration_id"].to_i == cal_id }
80
- @metric.update!(few_shot_examples: remaining)
81
- render json: @metric.reload
82
- end
83
-
84
57
  private
85
58
 
86
59
  def set_metric
@@ -19,7 +19,7 @@ module CompletionKit
19
19
  if prompt.save
20
20
  render json: prompt, status: :created
21
21
  else
22
- render json: {errors: prompt.errors}, status: :unprocessable_entity
22
+ render_validation_errors(prompt)
23
23
  end
24
24
  end
25
25
 
@@ -32,7 +32,7 @@ module CompletionKit
32
32
  elsif @prompt.update(prompt_params)
33
33
  render json: @prompt
34
34
  else
35
- render json: {errors: @prompt.errors}, status: :unprocessable_entity
35
+ render_validation_errors(@prompt)
36
36
  end
37
37
  end
38
38
 
@@ -17,7 +17,7 @@ module CompletionKit
17
17
  if credential.save
18
18
  render json: credential, status: :created
19
19
  else
20
- render json: {errors: credential.errors}, status: :unprocessable_entity
20
+ render_validation_errors(credential)
21
21
  end
22
22
  end
23
23
 
@@ -25,7 +25,7 @@ module CompletionKit
25
25
  if @credential.update(credential_params)
26
26
  render json: @credential
27
27
  else
28
- render json: {errors: @credential.errors}, status: :unprocessable_entity
28
+ render_validation_errors(@credential)
29
29
  end
30
30
  end
31
31
 
@@ -23,7 +23,7 @@ module CompletionKit
23
23
  run.replace_metrics!(params[:metric_ids])
24
24
  render json: run.reload, status: :created
25
25
  else
26
- render json: {errors: run.errors}, status: :unprocessable_entity
26
+ render_validation_errors(run)
27
27
  end
28
28
  end
29
29
 
@@ -32,7 +32,7 @@ module CompletionKit
32
32
  @run.replace_metrics!(params[:metric_ids]) if params.key?(:metric_ids)
33
33
  render json: @run.reload
34
34
  else
35
- render json: {errors: @run.errors}, status: :unprocessable_entity
35
+ render_validation_errors(@run)
36
36
  end
37
37
  end
38
38
 
@@ -45,13 +45,13 @@ module CompletionKit
45
45
  if @run.start!
46
46
  render json: @run.reload, status: :accepted
47
47
  else
48
- render json: { errors: [@run.failure_summary || @run.errors.full_messages.to_sentence] }, status: :unprocessable_entity
48
+ render_error(@run.failure_summary || @run.errors.full_messages.to_sentence, status: :unprocessable_entity)
49
49
  end
50
50
  end
51
51
 
52
52
  def retry_failures
53
53
  if @run.stale_review_summary.any?
54
- return render(json: { error: "Judge has changed since this run executed. Retry would mix versions in the same run; use POST /api/v1/runs/:id/rerun instead." }, status: :conflict)
54
+ return render_error("Judge has changed since this run executed. Retry would mix versions in the same run; use POST /api/v1/runs/:id/rerun instead.", status: :conflict)
55
55
  end
56
56
 
57
57
  scope = @run.responses.where(status: "failed")
@@ -90,7 +90,7 @@ module CompletionKit
90
90
  if new_run.start!
91
91
  render json: new_run.reload, status: :accepted
92
92
  else
93
- render json: { errors: [new_run.failure_summary || "Could not start the new run."] }, status: :unprocessable_entity
93
+ render_error(new_run.failure_summary || "Could not start the new run.", status: :unprocessable_entity)
94
94
  end
95
95
  end
96
96
 
@@ -98,7 +98,7 @@ module CompletionKit
98
98
  if @run.regrade!
99
99
  render json: @run.reload, status: :accepted
100
100
  else
101
- render json: { error: "Nothing to re-grade. The run has no succeeded responses or no metrics attached." }, status: :unprocessable_entity
101
+ render_error("Nothing to re-grade. The run has no succeeded responses or no metrics attached.", status: :unprocessable_entity)
102
102
  end
103
103
  end
104
104
 
@@ -107,7 +107,7 @@ module CompletionKit
107
107
  comparison = build_run_comparison(@run, other)
108
108
  render json: { left_run_id: @run.id, right_run_id: other.id, metric_ids: comparison[:metric_ids], rows: comparison[:rows] }
109
109
  rescue ActiveRecord::RecordNotFound
110
- render json: { error: "Other run not found. Pass ?with=<run_id>." }, status: :not_found
110
+ render_error("Other run not found. Pass ?with=<run_id>.", status: :not_found)
111
111
  end
112
112
 
113
113
  private
@@ -17,7 +17,7 @@ module CompletionKit
17
17
  if tag.save
18
18
  render json: tag, status: :created
19
19
  else
20
- render json: {errors: tag.errors}, status: :unprocessable_entity
20
+ render_validation_errors(tag)
21
21
  end
22
22
  end
23
23
 
@@ -25,7 +25,7 @@ module CompletionKit
25
25
  if @tag.update(tag_params)
26
26
  render json: @tag
27
27
  else
28
- render json: {errors: @tag.errors}, status: :unprocessable_entity
28
+ render_validation_errors(@tag)
29
29
  end
30
30
  end
31
31
 
@@ -1,7 +1,7 @@
1
1
  module CompletionKit
2
2
  class MetricsController < ApplicationController
3
3
  include CompletionKit::TagFiltering
4
- before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :remove_few_shot, :publish_draft, :suggest_variants, :dismiss_suggestion]
4
+ before_action :set_metric, only: [:show, :edit, :update, :destroy, :publish_draft, :suggest_variants, :dismiss_suggestion]
5
5
 
6
6
  def index
7
7
  @metrics = apply_tag_filter(Metric.includes(:metric_groups, :tags).order(:name))
@@ -35,11 +35,6 @@ module CompletionKit
35
35
  end
36
36
 
37
37
  def show
38
- @published_metric_version = MetricVersion.ensure_current_for(@metric)
39
- @disagreements = Calibration.where(metric_id: @metric.id, verdict: "disagree")
40
- .includes(:metric_version, response: [:reviews, :run])
41
- .order(created_at: :desc)
42
- .limit(50)
43
38
  @edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
44
39
  @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
45
40
  @improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
@@ -54,6 +49,7 @@ module CompletionKit
54
49
  @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
55
50
  @edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
56
51
  @published_metric_version = MetricVersion.published.where(metric_id: @metric.id, current: true).first
52
+ @improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
57
53
 
58
54
  if @edit_draft
59
55
  @metric.instruction = @edit_draft.instruction
@@ -102,7 +98,7 @@ module CompletionKit
102
98
  state: "draft", source: "edit", current: false
103
99
  )
104
100
  redirect_to edit_metric_path(@metric),
105
- notice: "Saved as draft #{draft.version_label}. Publish to push these changes to the live judge."
101
+ notice: "Saved as draft #{draft.version_label}. Publish to make these changes the metric's live version."
106
102
  else
107
103
  @metric.update!(instruction: new_instruction, rubric_bands: new_rubric)
108
104
  current_pub = MetricVersion.published.where(metric_id: @metric.id, current: true).first
@@ -120,7 +116,7 @@ module CompletionKit
120
116
  target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
121
117
  disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
122
118
  if disagreement_count.zero?
123
- redirect_to target, alert: "Mark at least one row as Disagree before asking the model to suggest a change."
119
+ redirect_to target, alert: "Mark at least one case as Disagree before asking the model to suggest a change."
124
120
  return
125
121
  end
126
122
 
@@ -132,15 +128,21 @@ module CompletionKit
132
128
  redirect_to target, alert: "The model returned no usable variants. Try again with a different model."
133
129
  return
134
130
  end
135
- generator.persist!(variants)
136
- redirect_to target, notice: "Drafted a new version. Review it below."
131
+ versions = generator.persist!(variants)
132
+ new_version = versions.max_by(&:version_number)
133
+ if params[:back_to] == "edit"
134
+ redirect_to edit_metric_path(@metric), notice: "Drafted #{new_version.version_label} from your reviews. Review the proposed changes below, then Publish to use it."
135
+ else
136
+ redirect_to metric_path(@metric, show_change: new_version.id), notice: "Drafted #{new_version.version_label} from your reviews."
137
+ end
137
138
  end
138
139
 
139
140
  def dismiss_suggestion
140
141
  draft = MetricVersion.drafts.where(metric_id: @metric.id).find_by(id: params[:draft_id])
142
+ label = draft&.version_label
141
143
  draft&.destroy
142
144
  target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
143
- redirect_to target, notice: "Dismissed."
145
+ redirect_to target, notice: label ? "Discarded draft #{label}." : "Draft already gone."
144
146
  end
145
147
 
146
148
  def publish_draft
@@ -164,7 +166,7 @@ module CompletionKit
164
166
  audit = version.revert!
165
167
  prior_label = previously_current.version_label
166
168
  redirect_to metric_path(@metric),
167
- notice: "Reverted to #{@metric.name} #{version.version_label} (now logged as #{audit.version_label}). Pinned cases still flow to the judge, and calibration verdicts collected against #{prior_label} stay tied to it."
169
+ notice: "Reverted #{@metric.name} to #{version.version_label} (logged as #{audit.version_label}). Human reviews collected against #{prior_label} stay tied to it."
168
170
  else
169
171
  version.publish!
170
172
  redirect_to metric_path(@metric),
@@ -172,31 +174,6 @@ module CompletionKit
172
174
  end
173
175
  end
174
176
 
175
- def add_few_shot
176
- calibration = Calibration.where(metric_id: @metric.id, verdict: "disagree").find(params[:calibration_id])
177
- review = calibration.response.reviews.find_by(metric_id: @metric.id)
178
- examples = Array(@metric.few_shot_examples)
179
- examples << {
180
- "input" => calibration.response.input_data.to_s.truncate(2000),
181
- "response" => calibration.response.response_text.to_s.truncate(2000),
182
- "judge_score" => review&.ai_score&.to_f,
183
- "judge_feedback" => review&.ai_feedback.to_s.truncate(1000),
184
- "human_score" => calibration.corrected_score&.to_f,
185
- "human_note" => calibration.note.to_s.truncate(1000),
186
- "calibration_id" => calibration.id,
187
- "added_at" => Time.current.utc.iso8601
188
- }
189
- @metric.update!(few_shot_examples: examples)
190
- redirect_to metric_path(@metric), notice: "Got it. The judge will remember this next time it grades."
191
- end
192
-
193
- def remove_few_shot
194
- cal_id = params[:calibration_id].to_i
195
- remaining = Array(@metric.few_shot_examples).reject { |fs| fs["calibration_id"].to_i == cal_id }
196
- @metric.update!(few_shot_examples: remaining)
197
- redirect_to metric_path(@metric), notice: "Forgotten. The judge won't see this case next time."
198
- end
199
-
200
177
  private
201
178
 
202
179
  def set_metric
@@ -95,7 +95,7 @@ module CompletionKit
95
95
 
96
96
  def regrade
97
97
  if @run.regrade!
98
- redirect_to run_path(@run), notice: "Re-grading existing responses with the current judge."
98
+ redirect_to run_path(@run), notice: "Re-grading existing responses against the current metrics."
99
99
  else
100
100
  redirect_to run_path(@run), alert: "Nothing to re-grade. The run has no succeeded responses or no metrics attached."
101
101
  end
@@ -151,7 +151,7 @@ module CompletionKit
151
151
  def retry_failures
152
152
  if @run.stale_review_summary.any?
153
153
  redirect_to run_path(@run),
154
- alert: "The judge has changed since this run executed. Retrying failed cases would mix scores from two metric versions in the same run. Use 'Re-run with current judge' to refresh everything against the live judge."
154
+ alert: "A metric has a newer version than the one this run was scored against. Retrying failed cases would mix scores from two versions in the same run. Use 'Re-run from scratch' to refresh everything against the current metrics."
155
155
  return
156
156
  end
157
157
 
@@ -80,8 +80,7 @@ module CompletionKit
80
80
  end
81
81
 
82
82
  def record_terminal_failure!(error)
83
- response_id = @response_id || arguments.last
84
- response = Response.find_by(id: response_id)
83
+ response = Response.find_by(id: @response_id)
85
84
  return unless response
86
85
 
87
86
  response.update!(
@@ -98,8 +97,7 @@ module CompletionKit
98
97
  end
99
98
 
100
99
  def enqueue_completion_check
101
- run_id = @run_id || arguments.first
102
- RunCompletionCheckJob.perform_later(run_id)
100
+ RunCompletionCheckJob.perform_later(@run_id)
103
101
  end
104
102
  end
105
103
  end
@@ -58,7 +58,6 @@ module CompletionKit
58
58
  run.prompt&.template,
59
59
  criteria: metric.instruction.to_s,
60
60
  rubric_text: metric.display_rubric_text,
61
- human_examples: few_shot_payload(metric),
62
61
  input_data: response.input_data
63
62
  )
64
63
 
@@ -91,14 +90,12 @@ module CompletionKit
91
90
  end
92
91
 
93
92
  def record_terminal_failure!(error)
94
- response_id = @response_id || arguments.first
95
- metric_id = @metric_id || arguments.last
96
- response = Response.find_by(id: response_id)
93
+ response = Response.find_by(id: @response_id)
97
94
  return unless response
98
95
 
99
- review = response.reviews.find_or_initialize_by(metric_id: metric_id)
96
+ review = response.reviews.find_or_initialize_by(metric_id: @metric_id)
100
97
  review.assign_attributes(
101
- metric_name: review.metric_name || Metric.find_by(id: metric_id)&.name || "(deleted metric)",
98
+ metric_name: review.metric_name || Metric.find_by(id: @metric_id)&.name || "(deleted metric)",
102
99
  status: "failed",
103
100
  error_provider: provider_for(response),
104
101
  error_class: error.class.name,
@@ -115,20 +112,8 @@ module CompletionKit
115
112
  end
116
113
 
117
114
  def enqueue_completion_check
118
- response_id = @response_id || arguments.first
119
- response = Response.find_by(id: response_id)
115
+ response = Response.find_by(id: @response_id)
120
116
  RunCompletionCheckJob.perform_later(response.run_id) if response
121
117
  end
122
-
123
- def few_shot_payload(metric)
124
- return nil unless CompletionKit.config.judge_calibration_enabled
125
- Array(metric.few_shot_examples).map do |fs|
126
- {
127
- human_score: fs["human_score"],
128
- response_text: fs["response"].to_s,
129
- human_note: fs["human_note"].to_s
130
- }
131
- end
132
- end
133
118
  end
134
119
  end
@@ -17,7 +17,6 @@ module CompletionKit
17
17
  has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
18
18
 
19
19
  serialize :rubric_bands, coder: JSON
20
- serialize :few_shot_examples, coder: JSON, type: Array
21
20
 
22
21
  validates :name, presence: true
23
22
  validates :key, tenant_scoped_uniqueness: { allow_nil: true }
@@ -40,6 +40,35 @@ module CompletionKit
40
40
  "v#{version_number}"
41
41
  end
42
42
 
43
+ def change_summary_against(previous)
44
+ return nil if previous.nil?
45
+
46
+ instruction_changed = previous.instruction.to_s.strip != instruction.to_s.strip
47
+ rubric_changes = rubric_band_change_count(previous)
48
+ return nil unless instruction_changed || rubric_changes.positive?
49
+
50
+ dimensions = []
51
+ dimensions << "instruction" if instruction_changed
52
+ dimensions << "rubric" if rubric_changes.positive?
53
+
54
+ words_changed = 0
55
+ if instruction_changed
56
+ old_words = previous.instruction.to_s.split
57
+ new_words = instruction.to_s.split
58
+ words_changed = (old_words - new_words).size + (new_words - old_words).size
59
+ end
60
+
61
+ magnitude = if rubric_changes >= 2 || (instruction_changed && rubric_changes >= 1) || words_changed >= 15
62
+ :major
63
+ elsif rubric_changes == 1 || words_changed >= 4
64
+ :minor
65
+ else
66
+ :trivial
67
+ end
68
+
69
+ { magnitude: magnitude, label: "#{magnitude.to_s.capitalize} #{dimensions.to_sentence} changes" }
70
+ end
71
+
43
72
  def publish!
44
73
  MetricVersion.transaction do
45
74
  self.class.where(metric_id: metric_id).where.not(id: id).update_all(current: false)
@@ -86,6 +115,12 @@ module CompletionKit
86
115
 
87
116
  private
88
117
 
118
+ def rubric_band_change_count(previous)
119
+ prev = Metric.normalize_rubric_bands(previous.rubric_bands)
120
+ curr = Metric.normalize_rubric_bands(rubric_bands)
121
+ prev.zip(curr).count { |p, c| p["description"].to_s.strip != c["description"].to_s.strip }
122
+ end
123
+
89
124
  def assign_version_number
90
125
  return if version_number.present?
91
126
  max = self.class.where(metric_id: metric_id).maximum(:version_number).to_i
@@ -290,7 +290,6 @@ module CompletionKit
290
290
  target: "run_status_panel",
291
291
  html: render_engine_partial("completion_kit/runs/status_panel", run: self)
292
292
  )
293
- broadcast_status_header
294
293
  end
295
294
 
296
295
  def broadcast_status_header
@@ -10,12 +10,12 @@ module CompletionKit
10
10
  @judge_client = LlmClient.for_model(@judge_model, ApiConfig.for_model(@judge_model))
11
11
  end
12
12
 
13
- def evaluate(output, expected_output = nil, prompt = nil, criteria: nil, rubric_text: nil, human_examples: nil, input_data: nil, **_extras)
13
+ def evaluate(output, expected_output = nil, prompt = nil, criteria: nil, rubric_text: nil, input_data: nil, **_extras)
14
14
  raise CompletionKit::ConfigurationError, "Judge not configured" unless @judge_client.configured?
15
15
 
16
16
  judge_prompt = build_judge_prompt(output, expected_output, prompt,
17
17
  criteria: criteria,
18
- rubric_text: rubric_text, human_examples: human_examples,
18
+ rubric_text: rubric_text,
19
19
  input_data: input_data)
20
20
 
21
21
  response = @judge_client.generate_completion(judge_prompt, model: @judge_model)
@@ -25,7 +25,7 @@ module CompletionKit
25
25
 
26
26
  private
27
27
 
28
- def build_judge_prompt(output, expected_output, prompt, criteria: nil, rubric_text: nil, human_examples: nil, input_data: nil)
28
+ def build_judge_prompt(output, expected_output, prompt, criteria: nil, rubric_text: nil, input_data: nil)
29
29
  judge_prompt = <<~PROMPT
30
30
  You are an expert evaluator. You MUST respond with ONLY two lines in this exact format, nothing else:
31
31
 
@@ -42,13 +42,6 @@ module CompletionKit
42
42
  judge_prompt += "\nCriteria: #{criteria}\n"
43
43
  end
44
44
 
45
- if human_examples.present?
46
- judge_prompt += "\nCalibration examples:\n"
47
- human_examples.each_with_index do |example, index|
48
- judge_prompt += "Example #{index + 1}: score=#{example[:human_score]} output=#{example[:response_text].to_s.truncate(200)}\n"
49
- end
50
- end
51
-
52
45
  judge_prompt += <<~PROMPT
53
46
 
54
47
  Original prompt: #{prompt || "Not provided"}
@@ -16,7 +16,7 @@ module CompletionKit
16
16
  handler: :list
17
17
  },
18
18
  "metric_versions_publish" => {
19
- description: "Publish a MetricVersion as the live judge for its metric. Works for both 'draft → published' and 'revert to an older published version → current'. Transactionally flips current, demotes peers, and writes the version's instruction + rubric_bands back onto the metric so the judge actually uses them.",
19
+ description: "Publish a MetricVersion as the live version of its metric. Works for both 'draft → published' and 'revert to an older published version → current'. Transactionally flips current, demotes peers, and writes the version's instruction + rubric_bands back onto the metric so the judge grades against it.",
20
20
  inputSchema: {
21
21
  type: "object",
22
22
  properties: {
@@ -43,7 +43,6 @@ module CompletionKit
43
43
  def build_meta_prompt
44
44
  disagreements = MetricCalibrationExamples.disagreements_for(@metric)
45
45
  borderlines = MetricCalibrationExamples.borderlines_for(@metric)
46
- pinned_examples = Array(@metric.few_shot_examples)
47
46
  sections = []
48
47
  sections << "You are an expert evaluator. The judge below is misaligned with humans. Propose #{@count == 1 ? "a single" : "#{@count}"} concrete rewrite that closes the gap."
49
48
  sections << ""
@@ -78,18 +77,6 @@ module CompletionKit
78
77
  sections << ""
79
78
  end
80
79
  end
81
- if pinned_examples.any?
82
- sections << "## Pinned cases the judge already references"
83
- sections << "These are cases the operator pinned for the judge to remember. The improved rubric must remain consistent with these — that is, the new instruction + rubric should produce roughly the human_score on these inputs, not the judge_score."
84
- pinned_examples.each_with_index do |ex, i|
85
- sections << "### Pinned #{i + 1}"
86
- sections << "Input: #{ex["input"].to_s.truncate(200)}"
87
- sections << "Output: #{ex["response"].to_s.truncate(200)}"
88
- sections << "Judge previously said #{ex["judge_score"]}/5: #{ex["judge_feedback"].to_s.truncate(160)}"
89
- sections << "Human said #{ex["human_score"]}/5: #{ex["human_note"].to_s.truncate(160)}"
90
- sections << ""
91
- end
92
- end
93
80
  sections << "## Task"
94
81
  sections << "Make one substantive change. Don't just reword. If the disagreements look like instruction problems, rewrite the instruction. If they look like rubric problems (overlapping bands, undefined edge cases), rewrite the rubric. Rewrite both if both are wrong."
95
82
  sections << ""
@@ -27,7 +27,7 @@
27
27
  <label for="ck-tab-runs" class="ck-api-tabs__label">Runs <span class="ck-api-tabs__count">10</span></label>
28
28
  <label for="ck-tab-responses" class="ck-api-tabs__label">Responses <span class="ck-api-tabs__count">2</span></label>
29
29
  <label for="ck-tab-datasets" class="ck-api-tabs__label">Datasets <span class="ck-api-tabs__count">5</span></label>
30
- <label for="ck-tab-metrics" class="ck-api-tabs__label">Metrics <span class="ck-api-tabs__count">12</span></label>
30
+ <label for="ck-tab-metrics" class="ck-api-tabs__label">Metrics <span class="ck-api-tabs__count">10</span></label>
31
31
  <label for="ck-tab-metric-groups" class="ck-api-tabs__label">Metric Groups <span class="ck-api-tabs__count">5</span></label>
32
32
  <label for="ck-tab-calibrations" class="ck-api-tabs__label">Calibrations <span class="ck-api-tabs__count">3</span></label>
33
33
  <label for="ck-tab-tags" class="ck-api-tabs__label">Tags <span class="ck-api-tabs__count">5</span></label>
@@ -239,7 +239,7 @@
239
239
 
240
240
  <div class="ck-api-endpoint" style="padding-top: 1.5rem;">
241
241
  <p class="ck-kicker" style="margin-bottom: 0.5rem;">Calibration loop</p>
242
- <p class="ck-meta-copy">Drive metric improvement from disagree-flagged calibrations: ask the model for variants, then pin individual cases as few-shot examples on the metric.</p>
242
+ <p class="ck-meta-copy">Drive metric improvement from disagree-flagged calibrations: ask the model to rewrite the instruction and rubric into a new draft version.</p>
243
243
  </div>
244
244
  <div class="ck-api-endpoint">
245
245
  <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/metrics/:id/suggest_variants</p>
@@ -247,16 +247,6 @@
247
247
  <p class="ck-api-params"><strong>Optional:</strong>&ensp;<code>count</code>, <code>model</code></p>
248
248
  <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/metrics/1/suggest_variants \\\n -H \"Authorization: Bearer #{token}\"" %>
249
249
  </div>
250
- <div class="ck-api-endpoint">
251
- <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/metrics/:id/add_few_shot</p>
252
- <p class="ck-meta-copy">Pin a disagree calibration as a few-shot example on the metric. Returns the updated metric, or 404 if the calibration is not a disagree on this metric.</p>
253
- <p class="ck-api-params"><strong>Required:</strong>&ensp;<code>calibration_id</code></p>
254
- </div>
255
- <div class="ck-api-endpoint">
256
- <p class="ck-api-method"><span class="ck-chip" style="color: var(--ck-danger);">DELETE</span> /api/v1/metrics/:id/remove_few_shot</p>
257
- <p class="ck-meta-copy">Drop the pinned few-shot example by calibration ID.</p>
258
- <p class="ck-api-params"><strong>Required:</strong>&ensp;<code>calibration_id</code></p>
259
- </div>
260
250
 
261
251
  <div class="ck-api-endpoint" style="padding-top: 1.5rem;">
262
252
  <p class="ck-kicker" style="margin-bottom: 0.5rem;">Metric versions</p>
@@ -20,6 +20,10 @@
20
20
  <p class="ck-kicker">Tag filtering</p>
21
21
  <p class="ck-meta-copy">Prompts, runs, metrics, datasets, and metric groups accept <code>?tag[]=name</code> (repeat for OR semantics).</p>
22
22
  </div>
23
+ <div>
24
+ <p class="ck-kicker">Error shape</p>
25
+ <p class="ck-meta-copy">Every error response carries a top-level <code>error</code> string. Validation failures (422) add a <code>details</code> object keyed by field: <code>{ "error": "Validation failed", "details": { "name": ["can't be blank"] } }</code>.</p>
26
+ </div>
23
27
  </div>
24
28
  </div>
25
29
  </div>