completion-kit 0.5.44 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. checksums.yaml +4 -4
  2. data/app/assets/stylesheets/completion_kit/application.css +31 -4
  3. data/app/controllers/completion_kit/api/v1/base_controller.rb +22 -0
  4. data/app/controllers/completion_kit/api/v1/calibrations_controller.rb +22 -3
  5. data/app/controllers/completion_kit/api/v1/datasets_controller.rb +3 -1
  6. data/app/controllers/completion_kit/api/v1/metric_groups_controller.rb +3 -1
  7. data/app/controllers/completion_kit/api/v1/metric_versions_controller.rb +51 -0
  8. data/app/controllers/completion_kit/api/v1/metrics_controller.rb +49 -2
  9. data/app/controllers/completion_kit/api/v1/prompts_controller.rb +3 -1
  10. data/app/controllers/completion_kit/api/v1/provider_credentials_controller.rb +1 -1
  11. data/app/controllers/completion_kit/api/v1/responses_controller.rb +3 -1
  12. data/app/controllers/completion_kit/api/v1/runs_controller.rb +75 -2
  13. data/app/controllers/completion_kit/api/v1/tags_controller.rb +1 -1
  14. data/app/controllers/completion_kit/metrics_controller.rb +15 -5
  15. data/app/controllers/completion_kit/runs_controller.rb +64 -2
  16. data/app/helpers/completion_kit/application_helper.rb +0 -14
  17. data/app/jobs/completion_kit/generate_row_job.rb +3 -8
  18. data/app/jobs/completion_kit/judge_review_job.rb +6 -9
  19. data/app/models/completion_kit/calibration.rb +0 -4
  20. data/app/models/completion_kit/metric.rb +1 -0
  21. data/app/models/completion_kit/metric_version.rb +16 -1
  22. data/app/models/completion_kit/response.rb +13 -17
  23. data/app/models/completion_kit/review.rb +18 -22
  24. data/app/models/completion_kit/run.rb +58 -22
  25. data/app/models/concerns/completion_kit/has_job_status.rb +31 -0
  26. data/app/services/completion_kit/mcp_dispatcher.rb +3 -1
  27. data/app/services/completion_kit/mcp_tools/judges.rb +2 -4
  28. data/app/services/completion_kit/mcp_tools/metric_versions.rb +67 -0
  29. data/app/services/completion_kit/metric_variant_generator.rb +20 -6
  30. data/app/services/completion_kit/starter_metrics.rb +5 -5
  31. data/app/views/completion_kit/api_reference/_body.html.erb +91 -6
  32. data/app/views/completion_kit/api_reference/index.html.erb +8 -0
  33. data/app/views/completion_kit/calibrations/_trust_panel.html.erb +6 -1
  34. data/app/views/completion_kit/metrics/index.html.erb +3 -3
  35. data/app/views/completion_kit/metrics/show.html.erb +2 -1
  36. data/app/views/completion_kit/runs/_actions.html.erb +1 -0
  37. data/app/views/completion_kit/runs/compare.html.erb +85 -0
  38. data/app/views/completion_kit/runs/compare_picker.html.erb +39 -0
  39. data/app/views/completion_kit/runs/show.html.erb +8 -2
  40. data/config/routes.rb +18 -1
  41. data/lib/completion_kit/version.rb +1 -1
  42. metadata +6 -1
@@ -5,7 +5,9 @@ module CompletionKit
5
5
  queue_as :llm
6
6
 
7
7
  limits_concurrency to: ENV.fetch("COMPLETION_KIT_PER_RUN_CONCURRENCY", 5).to_i,
8
- key: ->(response_id, _) { "run:#{Response.find_by(id: response_id)&.run_id}" },
8
+ key: ->(response_id, _metric_id, run_id = nil) {
9
+ "run:#{run_id || Response.where(id: response_id).pick(:run_id)}"
10
+ },
9
11
  duration: 10.minutes
10
12
 
11
13
  def self.rate_limit_wait(executions)
@@ -29,7 +31,7 @@ module CompletionKit
29
31
  end
30
32
 
31
33
  before_perform do |job|
32
- response_id, metric_id = job.arguments
34
+ response_id, metric_id, _run_id = job.arguments
33
35
  response = Response.find_by(id: response_id)
34
36
  next unless response
35
37
  review = response.reviews.find_or_initialize_by(metric_id: metric_id)
@@ -37,10 +39,9 @@ module CompletionKit
37
39
  review.attempts = (review.attempts || 0) + 1
38
40
  review.status = "retrying"
39
41
  review.save!(validate: false)
40
- response.run.send(:broadcast_response_update, response) if response.run
41
42
  end
42
43
 
43
- def perform(response_id, metric_id)
44
+ def perform(response_id, metric_id, _run_id = nil)
44
45
  @response_id = response_id
45
46
  @metric_id = metric_id
46
47
 
@@ -75,8 +76,6 @@ module CompletionKit
75
76
  review.save!
76
77
 
77
78
  confirm_judging_capability(run.judge_model)
78
- run.send(:broadcast_response_update, response)
79
- run.send(:broadcast_progress)
80
79
  enqueue_completion_check
81
80
  end
82
81
 
@@ -107,13 +106,11 @@ module CompletionKit
107
106
  error_message: error.message.to_s.truncate(2000)
108
107
  )
109
108
  review.save!(validate: false)
110
- response.run&.send(:broadcast_response_update, response)
111
- response.run&.send(:broadcast_progress)
112
109
  end
113
110
 
114
111
  def provider_for(response)
115
112
  run = response.run
116
- return nil unless run&.judge_model
113
+ return nil unless run.judge_model
117
114
  ApiConfig.provider_for_model(run.judge_model)
118
115
  end
119
116
 
@@ -7,10 +7,6 @@ module CompletionKit
7
7
  belongs_to :metric
8
8
  belongs_to :metric_version
9
9
 
10
- alias_attribute :judge_version_id, :metric_version_id
11
- alias_method :judge_version, :metric_version
12
- alias_method :judge_version=, :metric_version=
13
-
14
10
  validates :verdict, presence: true, inclusion: { in: VERDICTS }
15
11
  validates :response_id,
16
12
  uniqueness: { scope: [:metric_id, :created_by] }
@@ -12,6 +12,7 @@ module CompletionKit
12
12
 
13
13
  has_many :metric_group_memberships, dependent: :destroy
14
14
  has_many :metric_groups, through: :metric_group_memberships, source: :metric_group
15
+ has_many :metric_versions, dependent: :destroy
15
16
  has_many :reviews, dependent: :nullify
16
17
  has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
17
18
 
@@ -53,6 +53,22 @@ module CompletionKit
53
53
  self
54
54
  end
55
55
 
56
+ def revert!
57
+ raise ArgumentError, "only a published version can be reverted to" unless published?
58
+ audit = nil
59
+ MetricVersion.transaction do
60
+ audit = self.class.create!(
61
+ metric: metric,
62
+ instruction: instruction,
63
+ rubric_bands: rubric_bands,
64
+ state: "draft",
65
+ source: "revert"
66
+ )
67
+ audit.publish!
68
+ end
69
+ audit
70
+ end
71
+
56
72
  def as_json(options = {})
57
73
  {
58
74
  id: id,
@@ -77,5 +93,4 @@ module CompletionKit
77
93
  end
78
94
  end
79
95
 
80
- JudgeVersion = MetricVersion
81
96
  end
@@ -1,7 +1,6 @@
1
1
  module CompletionKit
2
2
  class Response < ApplicationRecord
3
- STATUSES = %w[pending retrying succeeded failed].freeze
4
- TERMINAL_STATUSES = %w[succeeded failed].freeze
3
+ include HasJobStatus
5
4
 
6
5
  belongs_to :run
7
6
  has_many :reviews, dependent: :destroy
@@ -10,17 +9,11 @@ module CompletionKit
10
9
  delegate :prompt, to: :run
11
10
 
12
11
  validates :response_text, presence: true, if: :succeeded?
13
- validates :status, inclusion: { in: STATUSES }
14
12
 
15
13
  before_validation :set_default_status, on: :create
16
14
 
17
- def terminal?
18
- TERMINAL_STATUSES.include?(status)
19
- end
20
-
21
- def succeeded?
22
- status == "succeeded"
23
- end
15
+ after_save_commit :broadcast_row_update, unless: :destroyed?
16
+ after_save_commit :broadcast_run_progress, if: :should_broadcast_progress?
24
17
 
25
18
  def as_json(options = {})
26
19
  {
@@ -47,19 +40,22 @@ module CompletionKit
47
40
  def fully_reviewed?
48
41
  metric_ids = run.metric_ids
49
42
  return true if metric_ids.empty?
50
- reviewed_metric_ids = reviews.where(status: Review::TERMINAL_STATUSES).pluck(:metric_id).uniq
43
+ reviewed_metric_ids = reviews.where(status: HasJobStatus::TERMINAL_STATUSES).pluck(:metric_id).uniq
51
44
  (metric_ids - reviewed_metric_ids).empty?
52
45
  end
53
46
 
54
- def error_payload
55
- return nil if error_class.blank?
56
- { provider: error_provider, class: error_class, status: error_status, message: error_message }
47
+ private
48
+
49
+ def broadcast_row_update
50
+ run.broadcast_response_update(self)
57
51
  end
58
52
 
59
- private
53
+ def broadcast_run_progress
54
+ run.broadcast_progress
55
+ end
60
56
 
61
- def set_default_status
62
- self.status ||= "pending"
57
+ def should_broadcast_progress?
58
+ saved_change_to_status? && terminal?
63
59
  end
64
60
  end
65
61
  end
@@ -1,37 +1,25 @@
1
1
  module CompletionKit
2
2
  class Review < ApplicationRecord
3
- STATUSES = %w[pending retrying succeeded failed].freeze
4
- TERMINAL_STATUSES = %w[succeeded failed].freeze
3
+ include HasJobStatus
5
4
 
6
5
  belongs_to :response
7
6
  belongs_to :metric, optional: true
8
7
  belongs_to :metric_version, optional: true
9
8
  has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
10
9
 
11
- def stale_against_current_judge?
12
- return false unless metric_id && metric_version_id
13
- current_id = MetricVersion.current.where(metric_id: metric_id).limit(1).pick(:id)
14
- return false if current_id.nil?
15
- metric_version_id != current_id
16
- end
17
-
18
10
  validates :metric_name, presence: true
19
- validates :status, inclusion: { in: STATUSES }
20
11
  validates :ai_score, numericality: { greater_than_or_equal_to: 1, less_than_or_equal_to: 5 }, allow_nil: true
21
12
 
22
13
  before_validation :set_default_status
23
14
 
24
- def terminal?
25
- TERMINAL_STATUSES.include?(status)
26
- end
27
-
28
- def succeeded?
29
- status == "succeeded"
30
- end
15
+ after_save_commit :broadcast_parent_row_update, unless: :destroyed?
16
+ after_save_commit :broadcast_run_progress, if: :should_broadcast_progress?
31
17
 
32
- def error_payload
33
- return nil if error_class.blank?
34
- { provider: error_provider, class: error_class, status: error_status, message: error_message }
18
+ def stale_against_current_judge?
19
+ return false unless metric_id && metric_version_id
20
+ current_id = MetricVersion.current.where(metric_id: metric_id).limit(1).pick(:id)
21
+ return false if current_id.nil?
22
+ metric_version_id != current_id
35
23
  end
36
24
 
37
25
  def as_json(options = {})
@@ -46,8 +34,16 @@ module CompletionKit
46
34
 
47
35
  private
48
36
 
49
- def set_default_status
50
- self.status ||= "pending"
37
+ def broadcast_parent_row_update
38
+ response.run.broadcast_response_update(response)
39
+ end
40
+
41
+ def broadcast_run_progress
42
+ response.run.broadcast_progress
43
+ end
44
+
45
+ def should_broadcast_progress?
46
+ saved_change_to_status? && terminal?
51
47
  end
52
48
  end
53
49
  end
@@ -43,7 +43,7 @@ module CompletionKit
43
43
  end
44
44
 
45
45
  def outstanding_work_zero?
46
- return false if responses.where.not(status: Response::TERMINAL_STATUSES).exists?
46
+ return false if responses.where.not(status: HasJobStatus::TERMINAL_STATUSES).exists?
47
47
 
48
48
  metric_ids = metrics.pluck(:id)
49
49
  return true if metric_ids.empty?
@@ -55,7 +55,7 @@ module CompletionKit
55
55
  terminal_review_count = Review.where(
56
56
  response_id: succeeded_response_ids,
57
57
  metric_id: metric_ids,
58
- status: Review::TERMINAL_STATUSES
58
+ status: HasJobStatus::TERMINAL_STATUSES
59
59
  ).count
60
60
 
61
61
  terminal_review_count >= expected_reviews
@@ -118,6 +118,10 @@ module CompletionKit
118
118
  end
119
119
 
120
120
  def start!
121
+ unless %w[pending failed].include?(status)
122
+ return fail_with_summary!("Cannot start a run in state \"#{status}\". Use rerun to create a fresh copy, or retry_failures / regrade to work with the existing responses.")
123
+ end
124
+
121
125
  rows = if dataset
122
126
  CsvProcessor.process_self(self)
123
127
  else
@@ -161,7 +165,7 @@ module CompletionKit
161
165
  response = responses.create!(attrs)
162
166
 
163
167
  if judge_only?
164
- metrics.each { |m| JudgeReviewJob.perform_later(response.id, m.id) } if judge_configured?
168
+ metrics.each { |m| JudgeReviewJob.perform_later(response.id, m.id, id) } if judge_configured?
165
169
  else
166
170
  GenerateRowJob.perform_later(id, response.id)
167
171
  end
@@ -179,6 +183,38 @@ module CompletionKit
179
183
  start!
180
184
  end
181
185
 
186
+ def regrade!
187
+ grading_metrics = metrics
188
+ return false if grading_metrics.empty? || !judge_configured?
189
+
190
+ eligible_responses = responses.where(status: "succeeded").where.not(response_text: nil)
191
+ response_ids = eligible_responses.pluck(:id)
192
+ return false if response_ids.empty?
193
+
194
+ transaction do
195
+ Review.where(response_id: response_ids).update_all(
196
+ status: "pending",
197
+ attempts: 0,
198
+ metric_version_id: nil,
199
+ ai_score: nil,
200
+ ai_feedback: nil,
201
+ error_provider: nil,
202
+ error_class: nil,
203
+ error_status: nil,
204
+ error_message: nil
205
+ )
206
+ update!(status: "running", failure_summary: nil, error_message: nil)
207
+
208
+ response_ids.each do |rid|
209
+ grading_metrics.each { |m| JudgeReviewJob.perform_later(rid, m.id, id) }
210
+ end
211
+ RunCompletionCheckJob.perform_later(id)
212
+ end
213
+
214
+ broadcast_ui
215
+ true
216
+ end
217
+
182
218
  def progress_snapshot
183
219
  generated_done = responses.where(status: "succeeded").count
184
220
  generated_failed = responses.where(status: "failed").count
@@ -240,17 +276,6 @@ module CompletionKit
240
276
  }
241
277
  end
242
278
 
243
- private
244
-
245
- def fail_with_summary!(message)
246
- errors.add(:base, message)
247
- if persisted?
248
- update_columns(status: "failed", failure_summary: message, error_message: message)
249
- broadcast_ui
250
- end
251
- false
252
- end
253
-
254
279
  def broadcast_ui
255
280
  broadcast_progress
256
281
  broadcast_status_header
@@ -258,14 +283,6 @@ module CompletionKit
258
283
  broadcast_sort_toolbar
259
284
  end
260
285
 
261
- def render_engine_partial(partial, locals)
262
- CompletionKit::Engine.warm_routes!
263
- CompletionKit::ApplicationController.render(
264
- partial: partial,
265
- locals: locals
266
- )
267
- end
268
-
269
286
  def broadcast_progress
270
287
  reload
271
288
  broadcast_replace_to(
@@ -324,6 +341,25 @@ module CompletionKit
324
341
  )
325
342
  end
326
343
 
344
+ private
345
+
346
+ def fail_with_summary!(message)
347
+ errors.add(:base, message)
348
+ if persisted?
349
+ update_columns(status: "failed", failure_summary: message, error_message: message)
350
+ broadcast_ui
351
+ end
352
+ false
353
+ end
354
+
355
+ def render_engine_partial(partial, locals)
356
+ CompletionKit::Engine.warm_routes!
357
+ CompletionKit::ApplicationController.render(
358
+ partial: partial,
359
+ locals: locals
360
+ )
361
+ end
362
+
327
363
  def set_default_status
328
364
  self.status ||= "pending"
329
365
  end
@@ -0,0 +1,31 @@
1
+ module CompletionKit
2
+ module HasJobStatus
3
+ extend ActiveSupport::Concern
4
+
5
+ STATUSES = %w[pending retrying succeeded failed].freeze
6
+ TERMINAL_STATUSES = %w[succeeded failed].freeze
7
+
8
+ included do
9
+ validates :status, inclusion: { in: STATUSES }
10
+ end
11
+
12
+ def terminal?
13
+ TERMINAL_STATUSES.include?(status)
14
+ end
15
+
16
+ def succeeded?
17
+ status == "succeeded"
18
+ end
19
+
20
+ def error_payload
21
+ return nil if error_class.blank?
22
+ { provider: error_provider, class: error_class, status: error_status, message: error_message }
23
+ end
24
+
25
+ private
26
+
27
+ def set_default_status
28
+ self.status ||= "pending"
29
+ end
30
+ end
31
+ end
@@ -32,6 +32,7 @@ module CompletionKit
32
32
  McpTools::Datasets.definitions +
33
33
  McpTools::Metrics.definitions +
34
34
  McpTools::MetricGroups.definitions +
35
+ McpTools::MetricVersions.definitions +
35
36
  McpTools::ProviderCredentials.definitions +
36
37
  McpTools::Tags.definitions +
37
38
  McpTools::Calibrations.definitions +
@@ -44,8 +45,9 @@ module CompletionKit
44
45
  when /\Aruns_/ then McpTools::Runs.call(name, arguments)
45
46
  when /\Aresponses_/ then McpTools::Responses.call(name, arguments)
46
47
  when /\Adatasets_/ then McpTools::Datasets.call(name, arguments)
47
- when /\Ametrics_/ then McpTools::Metrics.call(name, arguments)
48
+ when /\Ametric_versions_/ then McpTools::MetricVersions.call(name, arguments)
48
49
  when /\Ametric_groups_/ then McpTools::MetricGroups.call(name, arguments)
50
+ when /\Ametrics_/ then McpTools::Metrics.call(name, arguments)
49
51
  when /\Aprovider_credentials_/ then McpTools::ProviderCredentials.call(name, arguments)
50
52
  when /\Atags_/ then McpTools::Tags.call(name, arguments)
51
53
  when /\Acalibrations_/ then McpTools::Calibrations.call(name, arguments)
@@ -75,10 +75,8 @@ module CompletionKit
75
75
 
76
76
  def self.compare(args)
77
77
  metric = CompletionKit::Metric.find(args["metric_id"])
78
- a_id = args["metric_version_a_id"] || args["judge_version_a_id"]
79
- b_id = args["metric_version_b_id"] || args["judge_version_b_id"]
80
- a = CompletionKit::MetricVersion.where(metric_id: metric.id).find(a_id)
81
- b = CompletionKit::MetricVersion.where(metric_id: metric.id).find(b_id)
78
+ a = CompletionKit::MetricVersion.where(metric_id: metric.id).find(args["metric_version_a_id"])
79
+ b = CompletionKit::MetricVersion.where(metric_id: metric.id).find(args["metric_version_b_id"])
82
80
  stats_a = CompletionKit::MetricCalibrationStats.for(metric, metric_version: a)
83
81
  stats_b = CompletionKit::MetricCalibrationStats.for(metric, metric_version: b)
84
82
  text_result({
@@ -0,0 +1,67 @@
1
+ module CompletionKit
2
+ module McpTools
3
+ module MetricVersions
4
+ extend Base
5
+
6
+ TOOLS = {
7
+ "metric_versions_list" => {
8
+ description: "List every MetricVersion (drafts + published) for a metric, newest first. Each row carries version_number, state, source, current flag, and timestamps.",
9
+ inputSchema: {
10
+ type: "object",
11
+ properties: {
12
+ metric_id: { type: "integer" }
13
+ },
14
+ required: ["metric_id"]
15
+ },
16
+ handler: :list
17
+ },
18
+ "metric_versions_publish" => {
19
+ description: "Publish a MetricVersion as the live judge for its metric. Works for both 'draft → published' and 'revert to an older published version → current'. Transactionally flips current, demotes peers, and writes the version's instruction + rubric_bands back onto the metric so the judge actually uses them.",
20
+ inputSchema: {
21
+ type: "object",
22
+ properties: {
23
+ metric_version_id: { type: "integer" }
24
+ },
25
+ required: ["metric_version_id"]
26
+ },
27
+ handler: :publish
28
+ },
29
+ "metric_versions_dismiss" => {
30
+ description: "Destroy a draft MetricVersion (use for either source: 'edit' or source: 'suggestion'). Published versions are refused — to demote a published version, publish a different one as current instead.",
31
+ inputSchema: {
32
+ type: "object",
33
+ properties: {
34
+ metric_version_id: { type: "integer" }
35
+ },
36
+ required: ["metric_version_id"]
37
+ },
38
+ handler: :dismiss
39
+ }
40
+ }.freeze
41
+
42
+ def self.list(args)
43
+ metric = CompletionKit::Metric.find(args["metric_id"])
44
+ versions = CompletionKit::MetricVersion.where(metric_id: metric.id).order(version_number: :desc)
45
+ text_result(versions.map(&:as_json))
46
+ end
47
+
48
+ def self.publish(args)
49
+ version = CompletionKit::MetricVersion.find(args["metric_version_id"])
50
+ if version.published? && !version.current?
51
+ audit = version.revert!
52
+ text_result(audit.as_json)
53
+ else
54
+ version.publish!
55
+ text_result(version.reload.as_json)
56
+ end
57
+ end
58
+
59
+ def self.dismiss(args)
60
+ version = CompletionKit::MetricVersion.find(args["metric_version_id"])
61
+ return error_result("Cannot dismiss a published version. Publish a different version as current instead.") if version.published?
62
+ version.destroy!
63
+ text_result({id: version.id, destroyed: true})
64
+ end
65
+ end
66
+ end
67
+ end
@@ -43,6 +43,7 @@ module CompletionKit
43
43
  def build_meta_prompt
44
44
  disagreements = MetricCalibrationExamples.disagreements_for(@metric)
45
45
  borderlines = MetricCalibrationExamples.borderlines_for(@metric)
46
+ pinned_examples = Array(@metric.few_shot_examples)
46
47
  sections = []
47
48
  sections << "You are an expert evaluator. The judge below is misaligned with humans. Propose #{@count == 1 ? "a single" : "#{@count}"} concrete rewrite that closes the gap."
48
49
  sections << ""
@@ -77,6 +78,18 @@ module CompletionKit
77
78
  sections << ""
78
79
  end
79
80
  end
81
+ if pinned_examples.any?
82
+ sections << "## Pinned cases the judge already references"
83
+ sections << "These are cases the operator pinned for the judge to remember. The improved rubric must remain consistent with these — that is, the new instruction + rubric should produce roughly the human_score on these inputs, not the judge_score."
84
+ pinned_examples.each_with_index do |ex, i|
85
+ sections << "### Pinned #{i + 1}"
86
+ sections << "Input: #{ex["input"].to_s.truncate(200)}"
87
+ sections << "Output: #{ex["response"].to_s.truncate(200)}"
88
+ sections << "Judge previously said #{ex["judge_score"]}/5: #{ex["judge_feedback"].to_s.truncate(160)}"
89
+ sections << "Human said #{ex["human_score"]}/5: #{ex["human_note"].to_s.truncate(160)}"
90
+ sections << ""
91
+ end
92
+ end
80
93
  sections << "## Task"
81
94
  sections << "Make one substantive change. Don't just reword. If the disagreements look like instruction problems, rewrite the instruction. If they look like rubric problems (overlapping bands, undefined edge cases), rewrite the rubric. Rewrite both if both are wrong."
82
95
  sections << ""
@@ -133,13 +146,14 @@ module CompletionKit
133
146
  end
134
147
 
135
148
  def calibrations_for(metric, verdict:, limit:)
136
- scope = Calibration.where(metric_id: metric.id, verdict: verdict)
149
+ base = Calibration.where(metric_id: metric.id, verdict: verdict)
137
150
  current_version = MetricVersion.current.find_by(metric_id: metric.id)
138
- scope = scope.where(metric_version_id: current_version.id) if current_version
139
- scope.includes(response: :reviews)
140
- .order(created_at: :desc)
141
- .limit(limit)
142
- .map do |cal|
151
+ scoped = current_version ? base.where(metric_version_id: current_version.id) : base
152
+ effective = scoped.exists? ? scoped : base
153
+ effective.includes(response: :reviews)
154
+ .order(created_at: :desc)
155
+ .limit(limit)
156
+ .map do |cal|
143
157
  review = cal.response.reviews.find { |r| r.metric_id == metric.id }
144
158
  {
145
159
  input: cal.response.input_data,
@@ -21,8 +21,8 @@ module CompletionKit
21
21
  key: "instruction_following",
22
22
  name: "Instruction following",
23
23
  description: "Did the model do everything that was asked?",
24
- catches: "The response is factually right but ignores \"answer in two sentences\", \"use bullet points\", \"do not include X\". Distinct from Correctness a response can be right and still fail this.",
25
- instruction: "Did the model do every concrete thing the prompt asked for? Score against the explicit requirements in the prompt (format constraints, count limits, exclusions, audience cues). Factual accuracy is a different dimension score that elsewhere.",
24
+ catches: "The response is factually right but ignores \"answer in two sentences\", \"use bullet points\", \"do not include X\". Distinct from Correctness; a response can be right and still fail this.",
25
+ instruction: "Did the model do every concrete thing the prompt asked for? Score against the explicit requirements in the prompt (format constraints, count limits, exclusions, audience cues). Factual accuracy is a different dimension; score that elsewhere.",
26
26
  rubric_bands: [
27
27
  { "stars" => 5, "description" => "Followed every requirement in the prompt exactly." },
28
28
  { "stars" => 4, "description" => "Followed every requirement with a small slip." },
@@ -36,7 +36,7 @@ module CompletionKit
36
36
  name: "Format compliance",
37
37
  description: "Does the output follow the required structure?",
38
38
  catches: "Invalid JSON, missing schema fields, extra prose around a structured response, wrong casing on keys. Critical for any LLM wired into an API.",
39
- instruction: "Does the output match the format the prompt asked for JSON shape, schema, keys, casing, no stray prose? Score on whether a downstream parser would accept it without massaging.",
39
+ instruction: "Does the output match the format the prompt asked for: JSON shape, schema, keys, casing, no stray prose? Score on whether a downstream parser would accept it without massaging.",
40
40
  rubric_bands: [
41
41
  { "stars" => 5, "description" => "Exact spec, ready to consume programmatically." },
42
42
  { "stars" => 4, "description" => "Spec-compliant with one cosmetic issue." },
@@ -62,9 +62,9 @@ module CompletionKit
62
62
  Starter.new(
63
63
  key: "conciseness",
64
64
  name: "Conciseness",
65
- description: "Is it the right length no padding, no missing detail?",
65
+ description: "Is it the right length, no padding, no missing detail?",
66
66
  catches: "Rambling responses, repetitive caveats, over-hedging. LLMs default to verbose. Conciseness is the dimension where users most often see scores move after tuning.",
67
- instruction: "Is the output the right length for the task no padding, no missing detail, no hedging filler? Penalise rambling, repetition, over-caveating, and unnecessary preamble. Penalise too-short outputs that drop information.",
67
+ instruction: "Is the output the right length for the task: no padding, no missing detail, no hedging filler? Penalise rambling, repetition, over-caveating, and unnecessary preamble. Penalise too-short outputs that drop information.",
68
68
  rubric_bands: [
69
69
  { "stars" => 5, "description" => "Exactly as long as the task needs, no more, no less." },
70
70
  { "stars" => 4, "description" => "Right length with a small redundancy." },