completion-kit 0.5.44 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/stylesheets/completion_kit/application.css +31 -4
- data/app/controllers/completion_kit/api/v1/base_controller.rb +22 -0
- data/app/controllers/completion_kit/api/v1/calibrations_controller.rb +22 -3
- data/app/controllers/completion_kit/api/v1/datasets_controller.rb +3 -1
- data/app/controllers/completion_kit/api/v1/metric_groups_controller.rb +3 -1
- data/app/controllers/completion_kit/api/v1/metric_versions_controller.rb +51 -0
- data/app/controllers/completion_kit/api/v1/metrics_controller.rb +49 -2
- data/app/controllers/completion_kit/api/v1/prompts_controller.rb +3 -1
- data/app/controllers/completion_kit/api/v1/provider_credentials_controller.rb +1 -1
- data/app/controllers/completion_kit/api/v1/responses_controller.rb +3 -1
- data/app/controllers/completion_kit/api/v1/runs_controller.rb +75 -2
- data/app/controllers/completion_kit/api/v1/tags_controller.rb +1 -1
- data/app/controllers/completion_kit/metrics_controller.rb +15 -5
- data/app/controllers/completion_kit/runs_controller.rb +64 -2
- data/app/helpers/completion_kit/application_helper.rb +0 -14
- data/app/jobs/completion_kit/generate_row_job.rb +3 -8
- data/app/jobs/completion_kit/judge_review_job.rb +6 -9
- data/app/models/completion_kit/calibration.rb +0 -4
- data/app/models/completion_kit/metric.rb +1 -0
- data/app/models/completion_kit/metric_version.rb +16 -1
- data/app/models/completion_kit/response.rb +13 -17
- data/app/models/completion_kit/review.rb +18 -22
- data/app/models/completion_kit/run.rb +58 -22
- data/app/models/concerns/completion_kit/has_job_status.rb +31 -0
- data/app/services/completion_kit/mcp_dispatcher.rb +3 -1
- data/app/services/completion_kit/mcp_tools/judges.rb +2 -4
- data/app/services/completion_kit/mcp_tools/metric_versions.rb +67 -0
- data/app/services/completion_kit/metric_variant_generator.rb +20 -6
- data/app/services/completion_kit/starter_metrics.rb +5 -5
- data/app/views/completion_kit/api_reference/_body.html.erb +91 -6
- data/app/views/completion_kit/api_reference/index.html.erb +8 -0
- data/app/views/completion_kit/calibrations/_trust_panel.html.erb +6 -1
- data/app/views/completion_kit/metrics/index.html.erb +3 -3
- data/app/views/completion_kit/metrics/show.html.erb +2 -1
- data/app/views/completion_kit/runs/_actions.html.erb +1 -0
- data/app/views/completion_kit/runs/compare.html.erb +85 -0
- data/app/views/completion_kit/runs/compare_picker.html.erb +39 -0
- data/app/views/completion_kit/runs/show.html.erb +8 -2
- data/config/routes.rb +18 -1
- data/lib/completion_kit/version.rb +1 -1
- metadata +6 -1
|
@@ -5,7 +5,9 @@ module CompletionKit
|
|
|
5
5
|
queue_as :llm
|
|
6
6
|
|
|
7
7
|
limits_concurrency to: ENV.fetch("COMPLETION_KIT_PER_RUN_CONCURRENCY", 5).to_i,
|
|
8
|
-
key: ->(response_id,
|
|
8
|
+
key: ->(response_id, _metric_id, run_id = nil) {
|
|
9
|
+
"run:#{run_id || Response.where(id: response_id).pick(:run_id)}"
|
|
10
|
+
},
|
|
9
11
|
duration: 10.minutes
|
|
10
12
|
|
|
11
13
|
def self.rate_limit_wait(executions)
|
|
@@ -29,7 +31,7 @@ module CompletionKit
|
|
|
29
31
|
end
|
|
30
32
|
|
|
31
33
|
before_perform do |job|
|
|
32
|
-
response_id, metric_id = job.arguments
|
|
34
|
+
response_id, metric_id, _run_id = job.arguments
|
|
33
35
|
response = Response.find_by(id: response_id)
|
|
34
36
|
next unless response
|
|
35
37
|
review = response.reviews.find_or_initialize_by(metric_id: metric_id)
|
|
@@ -37,10 +39,9 @@ module CompletionKit
|
|
|
37
39
|
review.attempts = (review.attempts || 0) + 1
|
|
38
40
|
review.status = "retrying"
|
|
39
41
|
review.save!(validate: false)
|
|
40
|
-
response.run.send(:broadcast_response_update, response) if response.run
|
|
41
42
|
end
|
|
42
43
|
|
|
43
|
-
def perform(response_id, metric_id)
|
|
44
|
+
def perform(response_id, metric_id, _run_id = nil)
|
|
44
45
|
@response_id = response_id
|
|
45
46
|
@metric_id = metric_id
|
|
46
47
|
|
|
@@ -75,8 +76,6 @@ module CompletionKit
|
|
|
75
76
|
review.save!
|
|
76
77
|
|
|
77
78
|
confirm_judging_capability(run.judge_model)
|
|
78
|
-
run.send(:broadcast_response_update, response)
|
|
79
|
-
run.send(:broadcast_progress)
|
|
80
79
|
enqueue_completion_check
|
|
81
80
|
end
|
|
82
81
|
|
|
@@ -107,13 +106,11 @@ module CompletionKit
|
|
|
107
106
|
error_message: error.message.to_s.truncate(2000)
|
|
108
107
|
)
|
|
109
108
|
review.save!(validate: false)
|
|
110
|
-
response.run&.send(:broadcast_response_update, response)
|
|
111
|
-
response.run&.send(:broadcast_progress)
|
|
112
109
|
end
|
|
113
110
|
|
|
114
111
|
def provider_for(response)
|
|
115
112
|
run = response.run
|
|
116
|
-
return nil unless run
|
|
113
|
+
return nil unless run.judge_model
|
|
117
114
|
ApiConfig.provider_for_model(run.judge_model)
|
|
118
115
|
end
|
|
119
116
|
|
|
@@ -7,10 +7,6 @@ module CompletionKit
|
|
|
7
7
|
belongs_to :metric
|
|
8
8
|
belongs_to :metric_version
|
|
9
9
|
|
|
10
|
-
alias_attribute :judge_version_id, :metric_version_id
|
|
11
|
-
alias_method :judge_version, :metric_version
|
|
12
|
-
alias_method :judge_version=, :metric_version=
|
|
13
|
-
|
|
14
10
|
validates :verdict, presence: true, inclusion: { in: VERDICTS }
|
|
15
11
|
validates :response_id,
|
|
16
12
|
uniqueness: { scope: [:metric_id, :created_by] }
|
|
@@ -12,6 +12,7 @@ module CompletionKit
|
|
|
12
12
|
|
|
13
13
|
has_many :metric_group_memberships, dependent: :destroy
|
|
14
14
|
has_many :metric_groups, through: :metric_group_memberships, source: :metric_group
|
|
15
|
+
has_many :metric_versions, dependent: :destroy
|
|
15
16
|
has_many :reviews, dependent: :nullify
|
|
16
17
|
has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
|
|
17
18
|
|
|
@@ -53,6 +53,22 @@ module CompletionKit
|
|
|
53
53
|
self
|
|
54
54
|
end
|
|
55
55
|
|
|
56
|
+
def revert!
|
|
57
|
+
raise ArgumentError, "only a published version can be reverted to" unless published?
|
|
58
|
+
audit = nil
|
|
59
|
+
MetricVersion.transaction do
|
|
60
|
+
audit = self.class.create!(
|
|
61
|
+
metric: metric,
|
|
62
|
+
instruction: instruction,
|
|
63
|
+
rubric_bands: rubric_bands,
|
|
64
|
+
state: "draft",
|
|
65
|
+
source: "revert"
|
|
66
|
+
)
|
|
67
|
+
audit.publish!
|
|
68
|
+
end
|
|
69
|
+
audit
|
|
70
|
+
end
|
|
71
|
+
|
|
56
72
|
def as_json(options = {})
|
|
57
73
|
{
|
|
58
74
|
id: id,
|
|
@@ -77,5 +93,4 @@ module CompletionKit
|
|
|
77
93
|
end
|
|
78
94
|
end
|
|
79
95
|
|
|
80
|
-
JudgeVersion = MetricVersion
|
|
81
96
|
end
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
2
|
class Response < ApplicationRecord
|
|
3
|
-
|
|
4
|
-
TERMINAL_STATUSES = %w[succeeded failed].freeze
|
|
3
|
+
include HasJobStatus
|
|
5
4
|
|
|
6
5
|
belongs_to :run
|
|
7
6
|
has_many :reviews, dependent: :destroy
|
|
@@ -10,17 +9,11 @@ module CompletionKit
|
|
|
10
9
|
delegate :prompt, to: :run
|
|
11
10
|
|
|
12
11
|
validates :response_text, presence: true, if: :succeeded?
|
|
13
|
-
validates :status, inclusion: { in: STATUSES }
|
|
14
12
|
|
|
15
13
|
before_validation :set_default_status, on: :create
|
|
16
14
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
end
|
|
20
|
-
|
|
21
|
-
def succeeded?
|
|
22
|
-
status == "succeeded"
|
|
23
|
-
end
|
|
15
|
+
after_save_commit :broadcast_row_update, unless: :destroyed?
|
|
16
|
+
after_save_commit :broadcast_run_progress, if: :should_broadcast_progress?
|
|
24
17
|
|
|
25
18
|
def as_json(options = {})
|
|
26
19
|
{
|
|
@@ -47,19 +40,22 @@ module CompletionKit
|
|
|
47
40
|
def fully_reviewed?
|
|
48
41
|
metric_ids = run.metric_ids
|
|
49
42
|
return true if metric_ids.empty?
|
|
50
|
-
reviewed_metric_ids = reviews.where(status:
|
|
43
|
+
reviewed_metric_ids = reviews.where(status: HasJobStatus::TERMINAL_STATUSES).pluck(:metric_id).uniq
|
|
51
44
|
(metric_ids - reviewed_metric_ids).empty?
|
|
52
45
|
end
|
|
53
46
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
47
|
+
private
|
|
48
|
+
|
|
49
|
+
def broadcast_row_update
|
|
50
|
+
run.broadcast_response_update(self)
|
|
57
51
|
end
|
|
58
52
|
|
|
59
|
-
|
|
53
|
+
def broadcast_run_progress
|
|
54
|
+
run.broadcast_progress
|
|
55
|
+
end
|
|
60
56
|
|
|
61
|
-
def
|
|
62
|
-
|
|
57
|
+
def should_broadcast_progress?
|
|
58
|
+
saved_change_to_status? && terminal?
|
|
63
59
|
end
|
|
64
60
|
end
|
|
65
61
|
end
|
|
@@ -1,37 +1,25 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
2
|
class Review < ApplicationRecord
|
|
3
|
-
|
|
4
|
-
TERMINAL_STATUSES = %w[succeeded failed].freeze
|
|
3
|
+
include HasJobStatus
|
|
5
4
|
|
|
6
5
|
belongs_to :response
|
|
7
6
|
belongs_to :metric, optional: true
|
|
8
7
|
belongs_to :metric_version, optional: true
|
|
9
8
|
has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
|
|
10
9
|
|
|
11
|
-
def stale_against_current_judge?
|
|
12
|
-
return false unless metric_id && metric_version_id
|
|
13
|
-
current_id = MetricVersion.current.where(metric_id: metric_id).limit(1).pick(:id)
|
|
14
|
-
return false if current_id.nil?
|
|
15
|
-
metric_version_id != current_id
|
|
16
|
-
end
|
|
17
|
-
|
|
18
10
|
validates :metric_name, presence: true
|
|
19
|
-
validates :status, inclusion: { in: STATUSES }
|
|
20
11
|
validates :ai_score, numericality: { greater_than_or_equal_to: 1, less_than_or_equal_to: 5 }, allow_nil: true
|
|
21
12
|
|
|
22
13
|
before_validation :set_default_status
|
|
23
14
|
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
end
|
|
27
|
-
|
|
28
|
-
def succeeded?
|
|
29
|
-
status == "succeeded"
|
|
30
|
-
end
|
|
15
|
+
after_save_commit :broadcast_parent_row_update, unless: :destroyed?
|
|
16
|
+
after_save_commit :broadcast_run_progress, if: :should_broadcast_progress?
|
|
31
17
|
|
|
32
|
-
def
|
|
33
|
-
return
|
|
34
|
-
|
|
18
|
+
def stale_against_current_judge?
|
|
19
|
+
return false unless metric_id && metric_version_id
|
|
20
|
+
current_id = MetricVersion.current.where(metric_id: metric_id).limit(1).pick(:id)
|
|
21
|
+
return false if current_id.nil?
|
|
22
|
+
metric_version_id != current_id
|
|
35
23
|
end
|
|
36
24
|
|
|
37
25
|
def as_json(options = {})
|
|
@@ -46,8 +34,16 @@ module CompletionKit
|
|
|
46
34
|
|
|
47
35
|
private
|
|
48
36
|
|
|
49
|
-
def
|
|
50
|
-
|
|
37
|
+
def broadcast_parent_row_update
|
|
38
|
+
response.run.broadcast_response_update(response)
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
def broadcast_run_progress
|
|
42
|
+
response.run.broadcast_progress
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
def should_broadcast_progress?
|
|
46
|
+
saved_change_to_status? && terminal?
|
|
51
47
|
end
|
|
52
48
|
end
|
|
53
49
|
end
|
|
@@ -43,7 +43,7 @@ module CompletionKit
|
|
|
43
43
|
end
|
|
44
44
|
|
|
45
45
|
def outstanding_work_zero?
|
|
46
|
-
return false if responses.where.not(status:
|
|
46
|
+
return false if responses.where.not(status: HasJobStatus::TERMINAL_STATUSES).exists?
|
|
47
47
|
|
|
48
48
|
metric_ids = metrics.pluck(:id)
|
|
49
49
|
return true if metric_ids.empty?
|
|
@@ -55,7 +55,7 @@ module CompletionKit
|
|
|
55
55
|
terminal_review_count = Review.where(
|
|
56
56
|
response_id: succeeded_response_ids,
|
|
57
57
|
metric_id: metric_ids,
|
|
58
|
-
status:
|
|
58
|
+
status: HasJobStatus::TERMINAL_STATUSES
|
|
59
59
|
).count
|
|
60
60
|
|
|
61
61
|
terminal_review_count >= expected_reviews
|
|
@@ -118,6 +118,10 @@ module CompletionKit
|
|
|
118
118
|
end
|
|
119
119
|
|
|
120
120
|
def start!
|
|
121
|
+
unless %w[pending failed].include?(status)
|
|
122
|
+
return fail_with_summary!("Cannot start a run in state \"#{status}\". Use rerun to create a fresh copy, or retry_failures / regrade to work with the existing responses.")
|
|
123
|
+
end
|
|
124
|
+
|
|
121
125
|
rows = if dataset
|
|
122
126
|
CsvProcessor.process_self(self)
|
|
123
127
|
else
|
|
@@ -161,7 +165,7 @@ module CompletionKit
|
|
|
161
165
|
response = responses.create!(attrs)
|
|
162
166
|
|
|
163
167
|
if judge_only?
|
|
164
|
-
metrics.each { |m| JudgeReviewJob.perform_later(response.id, m.id) } if judge_configured?
|
|
168
|
+
metrics.each { |m| JudgeReviewJob.perform_later(response.id, m.id, id) } if judge_configured?
|
|
165
169
|
else
|
|
166
170
|
GenerateRowJob.perform_later(id, response.id)
|
|
167
171
|
end
|
|
@@ -179,6 +183,38 @@ module CompletionKit
|
|
|
179
183
|
start!
|
|
180
184
|
end
|
|
181
185
|
|
|
186
|
+
def regrade!
|
|
187
|
+
grading_metrics = metrics
|
|
188
|
+
return false if grading_metrics.empty? || !judge_configured?
|
|
189
|
+
|
|
190
|
+
eligible_responses = responses.where(status: "succeeded").where.not(response_text: nil)
|
|
191
|
+
response_ids = eligible_responses.pluck(:id)
|
|
192
|
+
return false if response_ids.empty?
|
|
193
|
+
|
|
194
|
+
transaction do
|
|
195
|
+
Review.where(response_id: response_ids).update_all(
|
|
196
|
+
status: "pending",
|
|
197
|
+
attempts: 0,
|
|
198
|
+
metric_version_id: nil,
|
|
199
|
+
ai_score: nil,
|
|
200
|
+
ai_feedback: nil,
|
|
201
|
+
error_provider: nil,
|
|
202
|
+
error_class: nil,
|
|
203
|
+
error_status: nil,
|
|
204
|
+
error_message: nil
|
|
205
|
+
)
|
|
206
|
+
update!(status: "running", failure_summary: nil, error_message: nil)
|
|
207
|
+
|
|
208
|
+
response_ids.each do |rid|
|
|
209
|
+
grading_metrics.each { |m| JudgeReviewJob.perform_later(rid, m.id, id) }
|
|
210
|
+
end
|
|
211
|
+
RunCompletionCheckJob.perform_later(id)
|
|
212
|
+
end
|
|
213
|
+
|
|
214
|
+
broadcast_ui
|
|
215
|
+
true
|
|
216
|
+
end
|
|
217
|
+
|
|
182
218
|
def progress_snapshot
|
|
183
219
|
generated_done = responses.where(status: "succeeded").count
|
|
184
220
|
generated_failed = responses.where(status: "failed").count
|
|
@@ -240,17 +276,6 @@ module CompletionKit
|
|
|
240
276
|
}
|
|
241
277
|
end
|
|
242
278
|
|
|
243
|
-
private
|
|
244
|
-
|
|
245
|
-
def fail_with_summary!(message)
|
|
246
|
-
errors.add(:base, message)
|
|
247
|
-
if persisted?
|
|
248
|
-
update_columns(status: "failed", failure_summary: message, error_message: message)
|
|
249
|
-
broadcast_ui
|
|
250
|
-
end
|
|
251
|
-
false
|
|
252
|
-
end
|
|
253
|
-
|
|
254
279
|
def broadcast_ui
|
|
255
280
|
broadcast_progress
|
|
256
281
|
broadcast_status_header
|
|
@@ -258,14 +283,6 @@ module CompletionKit
|
|
|
258
283
|
broadcast_sort_toolbar
|
|
259
284
|
end
|
|
260
285
|
|
|
261
|
-
def render_engine_partial(partial, locals)
|
|
262
|
-
CompletionKit::Engine.warm_routes!
|
|
263
|
-
CompletionKit::ApplicationController.render(
|
|
264
|
-
partial: partial,
|
|
265
|
-
locals: locals
|
|
266
|
-
)
|
|
267
|
-
end
|
|
268
|
-
|
|
269
286
|
def broadcast_progress
|
|
270
287
|
reload
|
|
271
288
|
broadcast_replace_to(
|
|
@@ -324,6 +341,25 @@ module CompletionKit
|
|
|
324
341
|
)
|
|
325
342
|
end
|
|
326
343
|
|
|
344
|
+
private
|
|
345
|
+
|
|
346
|
+
def fail_with_summary!(message)
|
|
347
|
+
errors.add(:base, message)
|
|
348
|
+
if persisted?
|
|
349
|
+
update_columns(status: "failed", failure_summary: message, error_message: message)
|
|
350
|
+
broadcast_ui
|
|
351
|
+
end
|
|
352
|
+
false
|
|
353
|
+
end
|
|
354
|
+
|
|
355
|
+
def render_engine_partial(partial, locals)
|
|
356
|
+
CompletionKit::Engine.warm_routes!
|
|
357
|
+
CompletionKit::ApplicationController.render(
|
|
358
|
+
partial: partial,
|
|
359
|
+
locals: locals
|
|
360
|
+
)
|
|
361
|
+
end
|
|
362
|
+
|
|
327
363
|
def set_default_status
|
|
328
364
|
self.status ||= "pending"
|
|
329
365
|
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
module CompletionKit
|
|
2
|
+
module HasJobStatus
|
|
3
|
+
extend ActiveSupport::Concern
|
|
4
|
+
|
|
5
|
+
STATUSES = %w[pending retrying succeeded failed].freeze
|
|
6
|
+
TERMINAL_STATUSES = %w[succeeded failed].freeze
|
|
7
|
+
|
|
8
|
+
included do
|
|
9
|
+
validates :status, inclusion: { in: STATUSES }
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
def terminal?
|
|
13
|
+
TERMINAL_STATUSES.include?(status)
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
def succeeded?
|
|
17
|
+
status == "succeeded"
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
def error_payload
|
|
21
|
+
return nil if error_class.blank?
|
|
22
|
+
{ provider: error_provider, class: error_class, status: error_status, message: error_message }
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
private
|
|
26
|
+
|
|
27
|
+
def set_default_status
|
|
28
|
+
self.status ||= "pending"
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
end
|
|
@@ -32,6 +32,7 @@ module CompletionKit
|
|
|
32
32
|
McpTools::Datasets.definitions +
|
|
33
33
|
McpTools::Metrics.definitions +
|
|
34
34
|
McpTools::MetricGroups.definitions +
|
|
35
|
+
McpTools::MetricVersions.definitions +
|
|
35
36
|
McpTools::ProviderCredentials.definitions +
|
|
36
37
|
McpTools::Tags.definitions +
|
|
37
38
|
McpTools::Calibrations.definitions +
|
|
@@ -44,8 +45,9 @@ module CompletionKit
|
|
|
44
45
|
when /\Aruns_/ then McpTools::Runs.call(name, arguments)
|
|
45
46
|
when /\Aresponses_/ then McpTools::Responses.call(name, arguments)
|
|
46
47
|
when /\Adatasets_/ then McpTools::Datasets.call(name, arguments)
|
|
47
|
-
when /\
|
|
48
|
+
when /\Ametric_versions_/ then McpTools::MetricVersions.call(name, arguments)
|
|
48
49
|
when /\Ametric_groups_/ then McpTools::MetricGroups.call(name, arguments)
|
|
50
|
+
when /\Ametrics_/ then McpTools::Metrics.call(name, arguments)
|
|
49
51
|
when /\Aprovider_credentials_/ then McpTools::ProviderCredentials.call(name, arguments)
|
|
50
52
|
when /\Atags_/ then McpTools::Tags.call(name, arguments)
|
|
51
53
|
when /\Acalibrations_/ then McpTools::Calibrations.call(name, arguments)
|
|
@@ -75,10 +75,8 @@ module CompletionKit
|
|
|
75
75
|
|
|
76
76
|
def self.compare(args)
|
|
77
77
|
metric = CompletionKit::Metric.find(args["metric_id"])
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
a = CompletionKit::MetricVersion.where(metric_id: metric.id).find(a_id)
|
|
81
|
-
b = CompletionKit::MetricVersion.where(metric_id: metric.id).find(b_id)
|
|
78
|
+
a = CompletionKit::MetricVersion.where(metric_id: metric.id).find(args["metric_version_a_id"])
|
|
79
|
+
b = CompletionKit::MetricVersion.where(metric_id: metric.id).find(args["metric_version_b_id"])
|
|
82
80
|
stats_a = CompletionKit::MetricCalibrationStats.for(metric, metric_version: a)
|
|
83
81
|
stats_b = CompletionKit::MetricCalibrationStats.for(metric, metric_version: b)
|
|
84
82
|
text_result({
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
module CompletionKit
|
|
2
|
+
module McpTools
|
|
3
|
+
module MetricVersions
|
|
4
|
+
extend Base
|
|
5
|
+
|
|
6
|
+
TOOLS = {
|
|
7
|
+
"metric_versions_list" => {
|
|
8
|
+
description: "List every MetricVersion (drafts + published) for a metric, newest first. Each row carries version_number, state, source, current flag, and timestamps.",
|
|
9
|
+
inputSchema: {
|
|
10
|
+
type: "object",
|
|
11
|
+
properties: {
|
|
12
|
+
metric_id: { type: "integer" }
|
|
13
|
+
},
|
|
14
|
+
required: ["metric_id"]
|
|
15
|
+
},
|
|
16
|
+
handler: :list
|
|
17
|
+
},
|
|
18
|
+
"metric_versions_publish" => {
|
|
19
|
+
description: "Publish a MetricVersion as the live judge for its metric. Works for both 'draft → published' and 'revert to an older published version → current'. Transactionally flips current, demotes peers, and writes the version's instruction + rubric_bands back onto the metric so the judge actually uses them.",
|
|
20
|
+
inputSchema: {
|
|
21
|
+
type: "object",
|
|
22
|
+
properties: {
|
|
23
|
+
metric_version_id: { type: "integer" }
|
|
24
|
+
},
|
|
25
|
+
required: ["metric_version_id"]
|
|
26
|
+
},
|
|
27
|
+
handler: :publish
|
|
28
|
+
},
|
|
29
|
+
"metric_versions_dismiss" => {
|
|
30
|
+
description: "Destroy a draft MetricVersion (use for either source: 'edit' or source: 'suggestion'). Published versions are refused — to demote a published version, publish a different one as current instead.",
|
|
31
|
+
inputSchema: {
|
|
32
|
+
type: "object",
|
|
33
|
+
properties: {
|
|
34
|
+
metric_version_id: { type: "integer" }
|
|
35
|
+
},
|
|
36
|
+
required: ["metric_version_id"]
|
|
37
|
+
},
|
|
38
|
+
handler: :dismiss
|
|
39
|
+
}
|
|
40
|
+
}.freeze
|
|
41
|
+
|
|
42
|
+
def self.list(args)
|
|
43
|
+
metric = CompletionKit::Metric.find(args["metric_id"])
|
|
44
|
+
versions = CompletionKit::MetricVersion.where(metric_id: metric.id).order(version_number: :desc)
|
|
45
|
+
text_result(versions.map(&:as_json))
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
def self.publish(args)
|
|
49
|
+
version = CompletionKit::MetricVersion.find(args["metric_version_id"])
|
|
50
|
+
if version.published? && !version.current?
|
|
51
|
+
audit = version.revert!
|
|
52
|
+
text_result(audit.as_json)
|
|
53
|
+
else
|
|
54
|
+
version.publish!
|
|
55
|
+
text_result(version.reload.as_json)
|
|
56
|
+
end
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
def self.dismiss(args)
|
|
60
|
+
version = CompletionKit::MetricVersion.find(args["metric_version_id"])
|
|
61
|
+
return error_result("Cannot dismiss a published version. Publish a different version as current instead.") if version.published?
|
|
62
|
+
version.destroy!
|
|
63
|
+
text_result({id: version.id, destroyed: true})
|
|
64
|
+
end
|
|
65
|
+
end
|
|
66
|
+
end
|
|
67
|
+
end
|
|
@@ -43,6 +43,7 @@ module CompletionKit
|
|
|
43
43
|
def build_meta_prompt
|
|
44
44
|
disagreements = MetricCalibrationExamples.disagreements_for(@metric)
|
|
45
45
|
borderlines = MetricCalibrationExamples.borderlines_for(@metric)
|
|
46
|
+
pinned_examples = Array(@metric.few_shot_examples)
|
|
46
47
|
sections = []
|
|
47
48
|
sections << "You are an expert evaluator. The judge below is misaligned with humans. Propose #{@count == 1 ? "a single" : "#{@count}"} concrete rewrite that closes the gap."
|
|
48
49
|
sections << ""
|
|
@@ -77,6 +78,18 @@ module CompletionKit
|
|
|
77
78
|
sections << ""
|
|
78
79
|
end
|
|
79
80
|
end
|
|
81
|
+
if pinned_examples.any?
|
|
82
|
+
sections << "## Pinned cases the judge already references"
|
|
83
|
+
sections << "These are cases the operator pinned for the judge to remember. The improved rubric must remain consistent with these — that is, the new instruction + rubric should produce roughly the human_score on these inputs, not the judge_score."
|
|
84
|
+
pinned_examples.each_with_index do |ex, i|
|
|
85
|
+
sections << "### Pinned #{i + 1}"
|
|
86
|
+
sections << "Input: #{ex["input"].to_s.truncate(200)}"
|
|
87
|
+
sections << "Output: #{ex["response"].to_s.truncate(200)}"
|
|
88
|
+
sections << "Judge previously said #{ex["judge_score"]}/5: #{ex["judge_feedback"].to_s.truncate(160)}"
|
|
89
|
+
sections << "Human said #{ex["human_score"]}/5: #{ex["human_note"].to_s.truncate(160)}"
|
|
90
|
+
sections << ""
|
|
91
|
+
end
|
|
92
|
+
end
|
|
80
93
|
sections << "## Task"
|
|
81
94
|
sections << "Make one substantive change. Don't just reword. If the disagreements look like instruction problems, rewrite the instruction. If they look like rubric problems (overlapping bands, undefined edge cases), rewrite the rubric. Rewrite both if both are wrong."
|
|
82
95
|
sections << ""
|
|
@@ -133,13 +146,14 @@ module CompletionKit
|
|
|
133
146
|
end
|
|
134
147
|
|
|
135
148
|
def calibrations_for(metric, verdict:, limit:)
|
|
136
|
-
|
|
149
|
+
base = Calibration.where(metric_id: metric.id, verdict: verdict)
|
|
137
150
|
current_version = MetricVersion.current.find_by(metric_id: metric.id)
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
151
|
+
scoped = current_version ? base.where(metric_version_id: current_version.id) : base
|
|
152
|
+
effective = scoped.exists? ? scoped : base
|
|
153
|
+
effective.includes(response: :reviews)
|
|
154
|
+
.order(created_at: :desc)
|
|
155
|
+
.limit(limit)
|
|
156
|
+
.map do |cal|
|
|
143
157
|
review = cal.response.reviews.find { |r| r.metric_id == metric.id }
|
|
144
158
|
{
|
|
145
159
|
input: cal.response.input_data,
|
|
@@ -21,8 +21,8 @@ module CompletionKit
|
|
|
21
21
|
key: "instruction_following",
|
|
22
22
|
name: "Instruction following",
|
|
23
23
|
description: "Did the model do everything that was asked?",
|
|
24
|
-
catches: "The response is factually right but ignores \"answer in two sentences\", \"use bullet points\", \"do not include X\". Distinct from Correctness
|
|
25
|
-
instruction: "Did the model do every concrete thing the prompt asked for? Score against the explicit requirements in the prompt (format constraints, count limits, exclusions, audience cues). Factual accuracy is a different dimension
|
|
24
|
+
catches: "The response is factually right but ignores \"answer in two sentences\", \"use bullet points\", \"do not include X\". Distinct from Correctness; a response can be right and still fail this.",
|
|
25
|
+
instruction: "Did the model do every concrete thing the prompt asked for? Score against the explicit requirements in the prompt (format constraints, count limits, exclusions, audience cues). Factual accuracy is a different dimension; score that elsewhere.",
|
|
26
26
|
rubric_bands: [
|
|
27
27
|
{ "stars" => 5, "description" => "Followed every requirement in the prompt exactly." },
|
|
28
28
|
{ "stars" => 4, "description" => "Followed every requirement with a small slip." },
|
|
@@ -36,7 +36,7 @@ module CompletionKit
|
|
|
36
36
|
name: "Format compliance",
|
|
37
37
|
description: "Does the output follow the required structure?",
|
|
38
38
|
catches: "Invalid JSON, missing schema fields, extra prose around a structured response, wrong casing on keys. Critical for any LLM wired into an API.",
|
|
39
|
-
instruction: "Does the output match the format the prompt asked for
|
|
39
|
+
instruction: "Does the output match the format the prompt asked for: JSON shape, schema, keys, casing, no stray prose? Score on whether a downstream parser would accept it without massaging.",
|
|
40
40
|
rubric_bands: [
|
|
41
41
|
{ "stars" => 5, "description" => "Exact spec, ready to consume programmatically." },
|
|
42
42
|
{ "stars" => 4, "description" => "Spec-compliant with one cosmetic issue." },
|
|
@@ -62,9 +62,9 @@ module CompletionKit
|
|
|
62
62
|
Starter.new(
|
|
63
63
|
key: "conciseness",
|
|
64
64
|
name: "Conciseness",
|
|
65
|
-
description: "Is it the right length
|
|
65
|
+
description: "Is it the right length, no padding, no missing detail?",
|
|
66
66
|
catches: "Rambling responses, repetitive caveats, over-hedging. LLMs default to verbose. Conciseness is the dimension where users most often see scores move after tuning.",
|
|
67
|
-
instruction: "Is the output the right length for the task
|
|
67
|
+
instruction: "Is the output the right length for the task: no padding, no missing detail, no hedging filler? Penalise rambling, repetition, over-caveating, and unnecessary preamble. Penalise too-short outputs that drop information.",
|
|
68
68
|
rubric_bands: [
|
|
69
69
|
{ "stars" => 5, "description" => "Exactly as long as the task needs, no more, no less." },
|
|
70
70
|
{ "stars" => 4, "description" => "Right length with a small redundancy." },
|