completion-kit 0.5.42 → 0.5.44
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/javascripts/completion_kit/application.js +17 -0
- data/app/assets/stylesheets/completion_kit/application.css +530 -39
- data/app/controllers/completion_kit/api/v1/calibrations_controller.rb +1 -1
- data/app/controllers/completion_kit/api/v1/runs_controller.rb +4 -0
- data/app/controllers/completion_kit/calibrations_controller.rb +1 -1
- data/app/controllers/completion_kit/metrics_controller.rb +88 -31
- data/app/controllers/completion_kit/runs_controller.rb +6 -0
- data/app/jobs/completion_kit/judge_review_job.rb +14 -0
- data/app/models/completion_kit/calibration.rb +6 -2
- data/app/models/completion_kit/metric.rb +0 -17
- data/app/models/completion_kit/{judge_version.rb → metric_version.rb} +35 -2
- data/app/models/completion_kit/review.rb +9 -0
- data/app/models/completion_kit/run.rb +28 -0
- data/app/services/completion_kit/mcp_tools/calibrations.rb +1 -1
- data/app/services/completion_kit/mcp_tools/judges.rb +15 -13
- data/app/services/completion_kit/metric_calibration_stats.rb +17 -5
- data/app/services/completion_kit/{judge_variant_generator.rb → metric_variant_generator.rb} +14 -12
- data/app/views/completion_kit/api_reference/_body.html.erb +1 -1
- data/app/views/completion_kit/calibrations/_buttons.html.erb +43 -6
- data/app/views/completion_kit/calibrations/_trust_panel.html.erb +32 -28
- data/app/views/completion_kit/metrics/_form.html.erb +90 -4
- data/app/views/completion_kit/metrics/_rubric_diff.html.erb +25 -0
- data/app/views/completion_kit/metrics/_rubric_hint.html.erb +4 -0
- data/app/views/completion_kit/metrics/_starter_card.html.erb +13 -9
- data/app/views/completion_kit/metrics/edit.html.erb +5 -1
- data/app/views/completion_kit/metrics/index.html.erb +5 -3
- data/app/views/completion_kit/metrics/show.html.erb +131 -127
- data/app/views/completion_kit/metrics/starter_preview.html.erb +6 -6
- data/app/views/completion_kit/responses/show.html.erb +9 -1
- data/app/views/completion_kit/runs/_status_panel.html.erb +2 -2
- data/app/views/completion_kit/runs/show.html.erb +23 -0
- data/config/routes.rb +2 -1
- data/db/migrate/20260525000001_add_version_number_and_published_at_to_judge_versions.rb +24 -0
- data/db/migrate/20260528000001_rename_judge_version_to_metric_version.rb +22 -0
- data/db/migrate/20260528000002_add_metric_version_to_reviews.rb +21 -0
- data/lib/completion_kit/version.rb +1 -1
- metadata +8 -3
|
@@ -45,6 +45,10 @@ module CompletionKit
|
|
|
45
45
|
end
|
|
46
46
|
|
|
47
47
|
def retry_failures
|
|
48
|
+
if @run.stale_review_summary.any?
|
|
49
|
+
return render(json: { error: "Judge has changed since this run executed. Retry would mix versions in the same run; use POST /api/v1/runs/:id/rerun instead." }, status: :conflict)
|
|
50
|
+
end
|
|
51
|
+
|
|
48
52
|
scope = @run.responses.where(status: "failed")
|
|
49
53
|
scope = scope.where(id: params[:only]) if params[:only].present?
|
|
50
54
|
|
|
@@ -18,7 +18,7 @@ module CompletionKit
|
|
|
18
18
|
run: @run, response: @response, metric: @metric, created_by: created_by
|
|
19
19
|
)
|
|
20
20
|
calibration.assign_attributes(
|
|
21
|
-
|
|
21
|
+
metric_version: MetricVersion.ensure_current_for(@metric),
|
|
22
22
|
verdict: params[:verdict],
|
|
23
23
|
corrected_score: params[:corrected_score].presence,
|
|
24
24
|
note: params[:note].presence
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
2
|
class MetricsController < ApplicationController
|
|
3
3
|
include CompletionKit::TagFiltering
|
|
4
|
-
before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :publish_draft, :suggest_variants, :dismiss_suggestion]
|
|
4
|
+
before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :remove_few_shot, :publish_draft, :suggest_variants, :dismiss_suggestion]
|
|
5
5
|
|
|
6
6
|
def index
|
|
7
7
|
@metrics = apply_tag_filter(Metric.includes(:metric_groups, :tags).order(:name))
|
|
@@ -35,14 +35,16 @@ module CompletionKit
|
|
|
35
35
|
end
|
|
36
36
|
|
|
37
37
|
def show
|
|
38
|
+
@published_metric_version = MetricVersion.ensure_current_for(@metric)
|
|
38
39
|
@disagreements = Calibration.where(metric_id: @metric.id, verdict: "disagree")
|
|
39
|
-
.includes(response: [:reviews, :run])
|
|
40
|
+
.includes(:metric_version, response: [:reviews, :run])
|
|
40
41
|
.order(created_at: :desc)
|
|
41
42
|
.limit(50)
|
|
42
|
-
@edit_draft =
|
|
43
|
-
@
|
|
44
|
-
@
|
|
45
|
-
|
|
43
|
+
@edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
|
|
44
|
+
@suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
|
|
45
|
+
@improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree",
|
|
46
|
+
metric_version_id: @published_metric_version.id).count
|
|
47
|
+
@versions = MetricVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
|
|
46
48
|
end
|
|
47
49
|
|
|
48
50
|
def new
|
|
@@ -50,6 +52,14 @@ module CompletionKit
|
|
|
50
52
|
end
|
|
51
53
|
|
|
52
54
|
def edit
|
|
55
|
+
@suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
|
|
56
|
+
@edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
|
|
57
|
+
@published_metric_version = MetricVersion.published.where(metric_id: @metric.id, current: true).first
|
|
58
|
+
|
|
59
|
+
if @edit_draft
|
|
60
|
+
@metric.instruction = @edit_draft.instruction
|
|
61
|
+
@metric.rubric_bands = @edit_draft.rubric_bands
|
|
62
|
+
end
|
|
53
63
|
end
|
|
54
64
|
|
|
55
65
|
def create
|
|
@@ -63,10 +73,42 @@ module CompletionKit
|
|
|
63
73
|
end
|
|
64
74
|
|
|
65
75
|
def update
|
|
66
|
-
|
|
67
|
-
|
|
76
|
+
judge_keys = %i[instruction rubric_bands]
|
|
77
|
+
meta_attrs = metric_params.except(*judge_keys)
|
|
78
|
+
proposed_instruction = metric_params[:instruction]
|
|
79
|
+
proposed_rubric = metric_params[:rubric_bands]
|
|
80
|
+
|
|
81
|
+
unless @metric.update(meta_attrs)
|
|
82
|
+
return render(:edit, status: :unprocessable_entity)
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
current_instruction = @metric.instruction.to_s
|
|
86
|
+
current_rubric = @metric.rubric_bands || []
|
|
87
|
+
normalized_proposed_rubric = normalize_rubric_bands_for_update(proposed_rubric)
|
|
88
|
+
|
|
89
|
+
instruction_changed = !proposed_instruction.nil? && proposed_instruction.to_s != current_instruction
|
|
90
|
+
rubric_changed = !normalized_proposed_rubric.nil? && normalized_proposed_rubric != current_rubric
|
|
91
|
+
|
|
92
|
+
unless instruction_changed || rubric_changed
|
|
93
|
+
return redirect_to(metric_path(@metric), notice: "Metric was successfully updated.")
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
new_instruction = instruction_changed ? proposed_instruction.to_s : current_instruction
|
|
97
|
+
new_rubric = rubric_changed ? normalized_proposed_rubric : current_rubric
|
|
98
|
+
|
|
99
|
+
if @metric.reviews.exists?
|
|
100
|
+
MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").destroy_all
|
|
101
|
+
draft = MetricVersion.create!(
|
|
102
|
+
metric: @metric, instruction: new_instruction, rubric_bands: new_rubric,
|
|
103
|
+
state: "draft", source: "edit", current: false
|
|
104
|
+
)
|
|
105
|
+
redirect_to edit_metric_path(@metric),
|
|
106
|
+
notice: "Saved as draft #{draft.version_label}. Publish to push these changes to the live judge."
|
|
68
107
|
else
|
|
69
|
-
|
|
108
|
+
@metric.update!(instruction: new_instruction, rubric_bands: new_rubric)
|
|
109
|
+
current_pub = MetricVersion.published.where(metric_id: @metric.id, current: true).first
|
|
110
|
+
current_pub&.update!(instruction: @metric.instruction, rubric_bands: @metric.rubric_bands)
|
|
111
|
+
redirect_to metric_path(@metric), notice: "Metric was successfully updated."
|
|
70
112
|
end
|
|
71
113
|
end
|
|
72
114
|
|
|
@@ -76,49 +118,48 @@ module CompletionKit
|
|
|
76
118
|
end
|
|
77
119
|
|
|
78
120
|
def suggest_variants
|
|
121
|
+
target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
|
|
79
122
|
disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
|
|
80
123
|
if disagreement_count.zero?
|
|
81
|
-
redirect_to
|
|
124
|
+
redirect_to target, alert: "Mark at least one row as Disagree before asking the model to suggest a change."
|
|
82
125
|
return
|
|
83
126
|
end
|
|
84
127
|
|
|
85
|
-
|
|
128
|
+
MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
|
|
86
129
|
|
|
87
|
-
generator =
|
|
130
|
+
generator = MetricVariantGenerator.new(@metric, count: 1)
|
|
88
131
|
variants = generator.call
|
|
89
132
|
if variants.empty?
|
|
90
|
-
redirect_to
|
|
133
|
+
redirect_to target, alert: "The model returned no usable variants. Try again with a different model."
|
|
91
134
|
return
|
|
92
135
|
end
|
|
93
136
|
generator.persist!(variants)
|
|
94
|
-
redirect_to
|
|
137
|
+
redirect_to target, notice: "Drafted a new version. Review it below."
|
|
95
138
|
end
|
|
96
139
|
|
|
97
140
|
def dismiss_suggestion
|
|
98
|
-
draft =
|
|
141
|
+
draft = MetricVersion.drafts.where(metric_id: @metric.id).find_by(id: params[:draft_id])
|
|
99
142
|
draft&.destroy
|
|
100
|
-
|
|
143
|
+
target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
|
|
144
|
+
redirect_to target, notice: "Dismissed."
|
|
101
145
|
end
|
|
102
146
|
|
|
103
147
|
def publish_draft
|
|
104
|
-
scope =
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
148
|
+
scope = MetricVersion.where(metric_id: @metric.id)
|
|
149
|
+
version = if params[:draft_id].present?
|
|
150
|
+
scope.find_by(id: params[:draft_id])
|
|
151
|
+
else
|
|
152
|
+
MetricVersion.drafts.where(metric_id: @metric.id).order(created_at: :desc).first
|
|
153
|
+
end
|
|
154
|
+
|
|
155
|
+
if version.nil?
|
|
156
|
+
redirect_to metric_path(@metric), alert: "No version to publish."
|
|
109
157
|
return
|
|
110
158
|
end
|
|
111
159
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
@metric.update_columns(
|
|
116
|
-
instruction: draft.instruction,
|
|
117
|
-
rubric_bands: Array(draft.rubric_bands).to_json
|
|
118
|
-
)
|
|
119
|
-
end
|
|
120
|
-
|
|
121
|
-
redirect_to metric_path(@metric), notice: "This judge version is now live."
|
|
160
|
+
version.publish!
|
|
161
|
+
redirect_to metric_path(@metric),
|
|
162
|
+
notice: "#{@metric.name} #{version.version_label} is now the published version."
|
|
122
163
|
end
|
|
123
164
|
|
|
124
165
|
def add_few_shot
|
|
@@ -139,6 +180,13 @@ module CompletionKit
|
|
|
139
180
|
redirect_to metric_path(@metric), notice: "Got it. The judge will remember this next time it grades."
|
|
140
181
|
end
|
|
141
182
|
|
|
183
|
+
def remove_few_shot
|
|
184
|
+
cal_id = params[:calibration_id].to_i
|
|
185
|
+
remaining = Array(@metric.few_shot_examples).reject { |fs| fs["calibration_id"].to_i == cal_id }
|
|
186
|
+
@metric.update!(few_shot_examples: remaining)
|
|
187
|
+
redirect_to metric_path(@metric), notice: "Forgotten. The judge won't see this case next time."
|
|
188
|
+
end
|
|
189
|
+
|
|
142
190
|
private
|
|
143
191
|
|
|
144
192
|
def set_metric
|
|
@@ -149,5 +197,14 @@ module CompletionKit
|
|
|
149
197
|
params.require(:metric).permit(:name, :instruction,
|
|
150
198
|
rubric_bands: [:stars, :description], tag_names: [])
|
|
151
199
|
end
|
|
200
|
+
|
|
201
|
+
def normalize_rubric_bands_for_update(bands)
|
|
202
|
+
return nil if bands.nil?
|
|
203
|
+
array = bands.is_a?(ActionController::Parameters) ? bands.to_unsafe_h.values : bands
|
|
204
|
+
Array(array).map do |b|
|
|
205
|
+
h = b.respond_to?(:to_unsafe_h) ? b.to_unsafe_h : b
|
|
206
|
+
{ "stars" => h["stars"].to_i, "description" => h["description"].to_s }
|
|
207
|
+
end.sort_by { |b| -b["stars"] }
|
|
208
|
+
end
|
|
152
209
|
end
|
|
153
210
|
end
|
|
@@ -126,6 +126,12 @@ module CompletionKit
|
|
|
126
126
|
end
|
|
127
127
|
|
|
128
128
|
def retry_failures
|
|
129
|
+
if @run.stale_review_summary.any?
|
|
130
|
+
redirect_to run_path(@run),
|
|
131
|
+
alert: "The judge has changed since this run executed. Retrying failed cases would mix scores from two metric versions in the same run. Use 'Re-run with current judge' to refresh everything against the live judge."
|
|
132
|
+
return
|
|
133
|
+
end
|
|
134
|
+
|
|
129
135
|
scope = @run.responses.where(status: "failed")
|
|
130
136
|
scope = scope.where(id: params[:only]) if params[:only].present?
|
|
131
137
|
|
|
@@ -57,13 +57,16 @@ module CompletionKit
|
|
|
57
57
|
run.prompt&.template,
|
|
58
58
|
criteria: metric.instruction.to_s,
|
|
59
59
|
rubric_text: metric.display_rubric_text,
|
|
60
|
+
human_examples: few_shot_payload(metric),
|
|
60
61
|
input_data: response.input_data
|
|
61
62
|
)
|
|
62
63
|
|
|
63
64
|
review = response.reviews.find_or_initialize_by(metric_id: metric.id)
|
|
65
|
+
current_metric_version = MetricVersion.ensure_current_for(metric)
|
|
64
66
|
review.assign_attributes(
|
|
65
67
|
metric_name: metric.name,
|
|
66
68
|
instruction: metric.instruction.to_s,
|
|
69
|
+
metric_version_id: current_metric_version.id,
|
|
67
70
|
status: "succeeded",
|
|
68
71
|
ai_score: evaluation[:score],
|
|
69
72
|
ai_feedback: evaluation[:feedback],
|
|
@@ -119,5 +122,16 @@ module CompletionKit
|
|
|
119
122
|
response = Response.find_by(id: response_id)
|
|
120
123
|
RunCompletionCheckJob.perform_later(response.run_id) if response
|
|
121
124
|
end
|
|
125
|
+
|
|
126
|
+
def few_shot_payload(metric)
|
|
127
|
+
return nil unless CompletionKit.config.judge_calibration_enabled
|
|
128
|
+
Array(metric.few_shot_examples).map do |fs|
|
|
129
|
+
{
|
|
130
|
+
human_score: fs["human_score"],
|
|
131
|
+
response_text: fs["response"].to_s,
|
|
132
|
+
human_note: fs["human_note"].to_s
|
|
133
|
+
}
|
|
134
|
+
end
|
|
135
|
+
end
|
|
122
136
|
end
|
|
123
137
|
end
|
|
@@ -5,7 +5,11 @@ module CompletionKit
|
|
|
5
5
|
belongs_to :run
|
|
6
6
|
belongs_to :response
|
|
7
7
|
belongs_to :metric
|
|
8
|
-
belongs_to :
|
|
8
|
+
belongs_to :metric_version
|
|
9
|
+
|
|
10
|
+
alias_attribute :judge_version_id, :metric_version_id
|
|
11
|
+
alias_method :judge_version, :metric_version
|
|
12
|
+
alias_method :judge_version=, :metric_version=
|
|
9
13
|
|
|
10
14
|
validates :verdict, presence: true, inclusion: { in: VERDICTS }
|
|
11
15
|
validates :response_id,
|
|
@@ -22,7 +26,7 @@ module CompletionKit
|
|
|
22
26
|
run_id: run_id,
|
|
23
27
|
response_id: response_id,
|
|
24
28
|
metric_id: metric_id,
|
|
25
|
-
|
|
29
|
+
metric_version_id: metric_version_id,
|
|
26
30
|
verdict: verdict,
|
|
27
31
|
corrected_score: corrected_score,
|
|
28
32
|
note: note,
|
|
@@ -24,7 +24,6 @@ module CompletionKit
|
|
|
24
24
|
before_validation :generate_key
|
|
25
25
|
before_validation :normalize_rubric_bands
|
|
26
26
|
before_validation :set_defaults
|
|
27
|
-
after_update :fork_draft_judge_version, if: :judge_relevant_changes?
|
|
28
27
|
|
|
29
28
|
def self.default_rubric_bands
|
|
30
29
|
DEFAULT_RUBRIC_BANDS.map(&:dup)
|
|
@@ -98,21 +97,5 @@ module CompletionKit
|
|
|
98
97
|
self.rubric_bands = self.class.normalize_rubric_bands(rubric_bands) if rubric_bands.present?
|
|
99
98
|
end
|
|
100
99
|
|
|
101
|
-
def judge_relevant_changes?
|
|
102
|
-
saved_change_to_instruction? || saved_change_to_rubric_bands?
|
|
103
|
-
end
|
|
104
|
-
|
|
105
|
-
def fork_draft_judge_version
|
|
106
|
-
JudgeVersion.ensure_current_for(self)
|
|
107
|
-
JudgeVersion.where(metric_id: id, state: "draft").update_all(current: false)
|
|
108
|
-
JudgeVersion.create!(
|
|
109
|
-
metric: self,
|
|
110
|
-
instruction: instruction,
|
|
111
|
-
rubric_bands: rubric_bands,
|
|
112
|
-
current: false,
|
|
113
|
-
state: "draft",
|
|
114
|
-
source: "edit"
|
|
115
|
-
)
|
|
116
|
-
end
|
|
117
100
|
end
|
|
118
101
|
end
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
|
-
class
|
|
2
|
+
class MetricVersion < ApplicationRecord
|
|
3
3
|
STATES = %w[draft published].freeze
|
|
4
4
|
|
|
5
5
|
belongs_to :metric
|
|
@@ -7,8 +7,11 @@ module CompletionKit
|
|
|
7
7
|
|
|
8
8
|
serialize :rubric_bands, coder: JSON
|
|
9
9
|
|
|
10
|
+
before_validation :assign_version_number, on: :create
|
|
11
|
+
|
|
10
12
|
validates :metric_id, presence: true
|
|
11
13
|
validates :state, inclusion: { in: STATES }
|
|
14
|
+
validates :version_number, presence: true, uniqueness: { scope: :metric_id }
|
|
12
15
|
|
|
13
16
|
scope :current, -> { where(current: true) }
|
|
14
17
|
scope :published, -> { where(state: "published") }
|
|
@@ -20,7 +23,8 @@ module CompletionKit
|
|
|
20
23
|
instruction: metric.instruction,
|
|
21
24
|
rubric_bands: metric.rubric_bands,
|
|
22
25
|
current: true,
|
|
23
|
-
state: "published"
|
|
26
|
+
state: "published",
|
|
27
|
+
published_at: Time.current
|
|
24
28
|
)
|
|
25
29
|
end
|
|
26
30
|
|
|
@@ -32,17 +36,46 @@ module CompletionKit
|
|
|
32
36
|
state == "published"
|
|
33
37
|
end
|
|
34
38
|
|
|
39
|
+
def version_label
|
|
40
|
+
"v#{version_number}"
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
def publish!
|
|
44
|
+
MetricVersion.transaction do
|
|
45
|
+
self.class.where(metric_id: metric_id).where.not(id: id).update_all(current: false)
|
|
46
|
+
reload
|
|
47
|
+
update!(state: "published", current: true, published_at: published_at || Time.current)
|
|
48
|
+
metric.update_columns(
|
|
49
|
+
instruction: instruction,
|
|
50
|
+
rubric_bands: Array(rubric_bands).to_json
|
|
51
|
+
)
|
|
52
|
+
end
|
|
53
|
+
self
|
|
54
|
+
end
|
|
55
|
+
|
|
35
56
|
def as_json(options = {})
|
|
36
57
|
{
|
|
37
58
|
id: id,
|
|
38
59
|
metric_id: metric_id,
|
|
60
|
+
version_number: version_number,
|
|
39
61
|
instruction: instruction,
|
|
40
62
|
rubric_bands: rubric_bands,
|
|
41
63
|
current: current,
|
|
42
64
|
state: state,
|
|
43
65
|
source: source,
|
|
66
|
+
published_at: published_at,
|
|
44
67
|
created_at: created_at
|
|
45
68
|
}
|
|
46
69
|
end
|
|
70
|
+
|
|
71
|
+
private
|
|
72
|
+
|
|
73
|
+
def assign_version_number
|
|
74
|
+
return if version_number.present?
|
|
75
|
+
max = self.class.where(metric_id: metric_id).maximum(:version_number).to_i
|
|
76
|
+
self.version_number = max + 1
|
|
77
|
+
end
|
|
47
78
|
end
|
|
79
|
+
|
|
80
|
+
JudgeVersion = MetricVersion
|
|
48
81
|
end
|
|
@@ -5,8 +5,16 @@ module CompletionKit
|
|
|
5
5
|
|
|
6
6
|
belongs_to :response
|
|
7
7
|
belongs_to :metric, optional: true
|
|
8
|
+
belongs_to :metric_version, optional: true
|
|
8
9
|
has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
|
|
9
10
|
|
|
11
|
+
def stale_against_current_judge?
|
|
12
|
+
return false unless metric_id && metric_version_id
|
|
13
|
+
current_id = MetricVersion.current.where(metric_id: metric_id).limit(1).pick(:id)
|
|
14
|
+
return false if current_id.nil?
|
|
15
|
+
metric_version_id != current_id
|
|
16
|
+
end
|
|
17
|
+
|
|
10
18
|
validates :metric_name, presence: true
|
|
11
19
|
validates :status, inclusion: { in: STATUSES }
|
|
12
20
|
validates :ai_score, numericality: { greater_than_or_equal_to: 1, less_than_or_equal_to: 5 }, allow_nil: true
|
|
@@ -29,6 +37,7 @@ module CompletionKit
|
|
|
29
37
|
def as_json(options = {})
|
|
30
38
|
{
|
|
31
39
|
id: id, response_id: response_id, metric_id: metric_id,
|
|
40
|
+
metric_version_id: metric_version_id,
|
|
32
41
|
metric_name: metric_name, ai_score: ai_score,
|
|
33
42
|
ai_feedback: ai_feedback, status: status, attempts: attempts,
|
|
34
43
|
error: error_payload
|
|
@@ -89,6 +89,34 @@ module CompletionKit
|
|
|
89
89
|
end
|
|
90
90
|
end
|
|
91
91
|
|
|
92
|
+
def stale_review_summary
|
|
93
|
+
review_pairs = Review.where(response_id: response_ids)
|
|
94
|
+
.where.not(metric_id: nil)
|
|
95
|
+
.where.not(metric_version_id: nil)
|
|
96
|
+
.pluck(:metric_id, :metric_version_id, :metric_name)
|
|
97
|
+
return {} if review_pairs.empty?
|
|
98
|
+
|
|
99
|
+
metric_ids = review_pairs.map(&:first).uniq
|
|
100
|
+
version_ids = review_pairs.map { |_, vid, _| vid }.uniq
|
|
101
|
+
current_by_metric = MetricVersion.current.where(metric_id: metric_ids).pluck(:metric_id, :id, :version_number).each_with_object({}) do |(mid, vid, vnum), h|
|
|
102
|
+
h[mid] = { id: vid, label: "v#{vnum}" }
|
|
103
|
+
end
|
|
104
|
+
label_by_version = MetricVersion.where(id: version_ids).pluck(:id, :version_number).each_with_object({}) { |(vid, vnum), h| h[vid] = "v#{vnum}" }
|
|
105
|
+
|
|
106
|
+
summary = {}
|
|
107
|
+
review_pairs.each do |metric_id, version_id, metric_name|
|
|
108
|
+
current = current_by_metric[metric_id]
|
|
109
|
+
next if current.nil?
|
|
110
|
+
next if version_id == current[:id]
|
|
111
|
+
label = label_by_version[version_id]
|
|
112
|
+
next if label.nil?
|
|
113
|
+
summary[metric_id] ||= { metric_name: metric_name, current_label: current[:label], stale_count: 0, scored_labels: [] }
|
|
114
|
+
summary[metric_id][:stale_count] += 1
|
|
115
|
+
summary[metric_id][:scored_labels] |= [label]
|
|
116
|
+
end
|
|
117
|
+
summary
|
|
118
|
+
end
|
|
119
|
+
|
|
92
120
|
def start!
|
|
93
121
|
rows = if dataset
|
|
94
122
|
CsvProcessor.process_self(self)
|
|
@@ -56,7 +56,7 @@ module CompletionKit
|
|
|
56
56
|
run_id: run.id, response_id: response.id, metric_id: metric.id, created_by: created_by
|
|
57
57
|
)
|
|
58
58
|
calibration.assign_attributes(
|
|
59
|
-
|
|
59
|
+
metric_version: CompletionKit::MetricVersion.ensure_current_for(metric),
|
|
60
60
|
verdict: args["verdict"],
|
|
61
61
|
corrected_score: args["corrected_score"],
|
|
62
62
|
note: args["note"]
|
|
@@ -5,7 +5,7 @@ module CompletionKit
|
|
|
5
5
|
|
|
6
6
|
TOOLS = {
|
|
7
7
|
"judges_suggest" => {
|
|
8
|
-
description: "Ask the model to rewrite the metric's judge instruction in N variants targeted at the recent disagreements. Each variant is saved as a draft
|
|
8
|
+
description: "Ask the model to rewrite the metric's judge instruction in N variants targeted at the recent disagreements. Each variant is saved as a draft MetricVersion with source=\"suggestion\". Returns the persisted drafts. Stripe-metering hooks fire via ActiveSupport::Notifications under completion_kit.judge_suggestion.generated.",
|
|
9
9
|
inputSchema: {
|
|
10
10
|
type: "object",
|
|
11
11
|
properties: {
|
|
@@ -33,15 +33,15 @@ module CompletionKit
|
|
|
33
33
|
handler: :replay
|
|
34
34
|
},
|
|
35
35
|
"judges_compare" => {
|
|
36
|
-
description: "Compare two
|
|
36
|
+
description: "Compare two metric versions' calibration stats side by side. Pass either two metric_version_ids or one metric_id with metric_version_a_id / metric_version_b_id.",
|
|
37
37
|
inputSchema: {
|
|
38
38
|
type: "object",
|
|
39
39
|
properties: {
|
|
40
40
|
metric_id: { type: "integer" },
|
|
41
|
-
|
|
42
|
-
|
|
41
|
+
metric_version_a_id: { type: "integer" },
|
|
42
|
+
metric_version_b_id: { type: "integer" }
|
|
43
43
|
},
|
|
44
|
-
required: ["metric_id", "
|
|
44
|
+
required: ["metric_id", "metric_version_a_id", "metric_version_b_id"]
|
|
45
45
|
},
|
|
46
46
|
handler: :compare
|
|
47
47
|
}
|
|
@@ -49,7 +49,7 @@ module CompletionKit
|
|
|
49
49
|
|
|
50
50
|
def self.suggest(args)
|
|
51
51
|
metric = CompletionKit::Metric.find(args["metric_id"])
|
|
52
|
-
generator = CompletionKit::
|
|
52
|
+
generator = CompletionKit::MetricVariantGenerator.new(metric, count: args["count"].to_i, model: args["model"])
|
|
53
53
|
variants = generator.call
|
|
54
54
|
return error_result("Variant generator returned no parseable variants. Try again or change the model.") if variants.empty?
|
|
55
55
|
versions = generator.persist!(variants)
|
|
@@ -75,20 +75,22 @@ module CompletionKit
|
|
|
75
75
|
|
|
76
76
|
def self.compare(args)
|
|
77
77
|
metric = CompletionKit::Metric.find(args["metric_id"])
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
78
|
+
a_id = args["metric_version_a_id"] || args["judge_version_a_id"]
|
|
79
|
+
b_id = args["metric_version_b_id"] || args["judge_version_b_id"]
|
|
80
|
+
a = CompletionKit::MetricVersion.where(metric_id: metric.id).find(a_id)
|
|
81
|
+
b = CompletionKit::MetricVersion.where(metric_id: metric.id).find(b_id)
|
|
82
|
+
stats_a = CompletionKit::MetricCalibrationStats.for(metric, metric_version: a)
|
|
83
|
+
stats_b = CompletionKit::MetricCalibrationStats.for(metric, metric_version: b)
|
|
82
84
|
text_result({
|
|
83
85
|
metric_id: metric.id,
|
|
84
|
-
a:
|
|
85
|
-
b:
|
|
86
|
+
a: metric_version_payload(a, stats_a),
|
|
87
|
+
b: metric_version_payload(b, stats_b),
|
|
86
88
|
delta: delta_payload(stats_a, stats_b),
|
|
87
89
|
recommendation: recommendation_for(stats_a, stats_b)
|
|
88
90
|
})
|
|
89
91
|
end
|
|
90
92
|
|
|
91
|
-
def self.
|
|
93
|
+
def self.metric_version_payload(version, stats)
|
|
92
94
|
{
|
|
93
95
|
id: version.id, state: version.state, current: version.current,
|
|
94
96
|
source: version.source, created_at: version.created_at,
|
|
@@ -31,18 +31,30 @@ module CompletionKit
|
|
|
31
31
|
end
|
|
32
32
|
end
|
|
33
33
|
|
|
34
|
-
|
|
35
|
-
|
|
34
|
+
CURRENT = :current
|
|
35
|
+
|
|
36
|
+
def self.for(metric, metric_version: CURRENT)
|
|
37
|
+
resolved = case metric_version
|
|
38
|
+
when CURRENT then MetricVersion.current.find_by(metric_id: metric.id)
|
|
39
|
+
when nil then nil
|
|
40
|
+
else metric_version
|
|
41
|
+
end
|
|
42
|
+
new(metric: metric, metric_version: resolved, all_versions: metric_version.nil?).call
|
|
36
43
|
end
|
|
37
44
|
|
|
38
|
-
def initialize(metric:,
|
|
45
|
+
def initialize(metric:, metric_version: nil, all_versions: false)
|
|
39
46
|
@metric = metric
|
|
40
|
-
@
|
|
47
|
+
@metric_version = metric_version
|
|
48
|
+
@all_versions = all_versions
|
|
41
49
|
end
|
|
42
50
|
|
|
43
51
|
def call
|
|
44
52
|
scope = Calibration.where(metric_id: @metric.id)
|
|
45
|
-
|
|
53
|
+
if @metric_version
|
|
54
|
+
scope = scope.where(metric_version_id: @metric_version.id)
|
|
55
|
+
elsif !@all_versions
|
|
56
|
+
scope = scope.none
|
|
57
|
+
end
|
|
46
58
|
|
|
47
59
|
verdicts = scope.pluck(:verdict, :corrected_score, :response_id)
|
|
48
60
|
n = verdicts.length
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
|
-
class
|
|
2
|
+
class MetricVariantGenerator
|
|
3
3
|
DEFAULT_VARIANT_COUNT = 1
|
|
4
4
|
MAX_VARIANT_COUNT = 3
|
|
5
5
|
DEFAULT_TEMPERATURE = 0.4
|
|
@@ -20,9 +20,9 @@ module CompletionKit
|
|
|
20
20
|
end
|
|
21
21
|
|
|
22
22
|
def persist!(variants)
|
|
23
|
-
|
|
23
|
+
MetricVersion.where(metric_id: @metric.id, state: "draft", source: "suggestion").update_all(current: false)
|
|
24
24
|
versions = variants.map do |variant|
|
|
25
|
-
|
|
25
|
+
MetricVersion.create!(
|
|
26
26
|
metric: @metric,
|
|
27
27
|
instruction: variant.instruction,
|
|
28
28
|
rubric_bands: variant.rubric_bands.presence || @metric.rubric_bands,
|
|
@@ -41,8 +41,8 @@ module CompletionKit
|
|
|
41
41
|
private
|
|
42
42
|
|
|
43
43
|
def build_meta_prompt
|
|
44
|
-
disagreements =
|
|
45
|
-
borderlines =
|
|
44
|
+
disagreements = MetricCalibrationExamples.disagreements_for(@metric)
|
|
45
|
+
borderlines = MetricCalibrationExamples.borderlines_for(@metric)
|
|
46
46
|
sections = []
|
|
47
47
|
sections << "You are an expert evaluator. The judge below is misaligned with humans. Propose #{@count == 1 ? "a single" : "#{@count}"} concrete rewrite that closes the gap."
|
|
48
48
|
sections << ""
|
|
@@ -86,7 +86,7 @@ module CompletionKit
|
|
|
86
86
|
sections << "REASONING: <one short sentence: what changes and why>"
|
|
87
87
|
sections << "INSTRUCTION:"
|
|
88
88
|
sections << "<the rewritten instruction>"
|
|
89
|
-
sections << "RUBRIC: # optional
|
|
89
|
+
sections << "RUBRIC: # optional. Omit this block if the rubric is unchanged."
|
|
90
90
|
sections << "5: <description for 5 stars>"
|
|
91
91
|
sections << "4: <description for 4 stars>"
|
|
92
92
|
sections << "3: <description for 3 stars>"
|
|
@@ -117,7 +117,7 @@ module CompletionKit
|
|
|
117
117
|
end
|
|
118
118
|
end
|
|
119
119
|
|
|
120
|
-
module
|
|
120
|
+
module MetricCalibrationExamples
|
|
121
121
|
module_function
|
|
122
122
|
|
|
123
123
|
def for(metric, limit: 8)
|
|
@@ -133,11 +133,13 @@ module CompletionKit
|
|
|
133
133
|
end
|
|
134
134
|
|
|
135
135
|
def calibrations_for(metric, verdict:, limit:)
|
|
136
|
-
Calibration.where(metric_id: metric.id, verdict: verdict)
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
136
|
+
scope = Calibration.where(metric_id: metric.id, verdict: verdict)
|
|
137
|
+
current_version = MetricVersion.current.find_by(metric_id: metric.id)
|
|
138
|
+
scope = scope.where(metric_version_id: current_version.id) if current_version
|
|
139
|
+
scope.includes(response: :reviews)
|
|
140
|
+
.order(created_at: :desc)
|
|
141
|
+
.limit(limit)
|
|
142
|
+
.map do |cal|
|
|
141
143
|
review = cal.response.reviews.find { |r| r.metric_id == metric.id }
|
|
142
144
|
{
|
|
143
145
|
input: cal.response.input_data,
|
|
@@ -187,7 +187,7 @@
|
|
|
187
187
|
</div>
|
|
188
188
|
<%= render "completion_kit/api_reference/resource_list", title: "Your datasets",
|
|
189
189
|
items: datasets.map { |d|
|
|
190
|
-
{ name: d.name, meta: pluralize([d.csv_data.to_s.lines.count - 1, 0].max, "
|
|
190
|
+
{ name: d.name, meta: pluralize([d.csv_data.to_s.lines.count - 1, 0].max, "entry"),
|
|
191
191
|
url: "#{base_url}/api/v1/datasets/#{d.id}", dom_id: "dataset_ep_#{d.id}" }
|
|
192
192
|
} %>
|
|
193
193
|
</div>
|