completion-kit 0.17.1 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of completion-kit might be problematic. Click here for more details.

Files changed (48)
  1. checksums.yaml +4 -4
  2. data/app/controllers/completion_kit/agreements_controller.rb +5 -0
  3. data/app/controllers/completion_kit/api/v1/agreements_controller.rb +5 -0
  4. data/app/controllers/completion_kit/api/v1/metrics_controller.rb +9 -2
  5. data/app/controllers/completion_kit/api/v1/runs_controller.rb +1 -1
  6. data/app/controllers/completion_kit/metrics_controller.rb +97 -36
  7. data/app/controllers/completion_kit/runs_controller.rb +1 -1
  8. data/app/jobs/completion_kit/check_review_job.rb +66 -0
  9. data/app/jobs/completion_kit/generate_row_job.rb +5 -2
  10. data/app/jobs/completion_kit/metric_suggestion_job.rb +1 -0
  11. data/app/models/completion_kit/metric.rb +91 -5
  12. data/app/models/completion_kit/metric_version.rb +34 -7
  13. data/app/models/completion_kit/response.rb +18 -2
  14. data/app/models/completion_kit/review.rb +5 -1
  15. data/app/models/completion_kit/run.rb +70 -14
  16. data/app/services/completion_kit/checks/contains.rb +21 -0
  17. data/app/services/completion_kit/checks/equals.rb +26 -0
  18. data/app/services/completion_kit/checks/json_path_equals.rb +32 -0
  19. data/app/services/completion_kit/checks/length_bounds.rb +19 -0
  20. data/app/services/completion_kit/checks/no_refusal.rb +23 -0
  21. data/app/services/completion_kit/checks/not_contains.rb +21 -0
  22. data/app/services/completion_kit/checks/regex.rb +20 -0
  23. data/app/services/completion_kit/checks/registry.rb +41 -0
  24. data/app/services/completion_kit/checks/result.rb +5 -0
  25. data/app/services/completion_kit/checks/target_resolver.rb +31 -0
  26. data/app/services/completion_kit/checks/valid_json.rb +12 -0
  27. data/app/services/completion_kit/mcp_tools/agreements.rb +2 -0
  28. data/app/services/completion_kit/mcp_tools/judges.rb +2 -0
  29. data/app/services/completion_kit/mcp_tools/metrics.rb +32 -4
  30. data/app/services/completion_kit/metric_agreement_examples.rb +2 -0
  31. data/app/services/completion_kit/metric_improvement_validator.rb +2 -0
  32. data/app/services/completion_kit/metric_variant_generator.rb +1 -0
  33. data/app/services/completion_kit/onboarding/concepts.rb +1 -1
  34. data/app/services/completion_kit/prompt_improvement_service.rb +8 -4
  35. data/app/services/completion_kit/prompt_improvement_validator.rb +1 -1
  36. data/app/services/completion_kit/starter_metrics.rb +25 -1
  37. data/app/views/completion_kit/api_reference/_body.html.erb +4 -4
  38. data/app/views/completion_kit/metrics/_check_spec.html.erb +17 -0
  39. data/app/views/completion_kit/metrics/_form.html.erb +104 -1
  40. data/app/views/completion_kit/metrics/index.html.erb +4 -3
  41. data/app/views/completion_kit/metrics/show.html.erb +26 -14
  42. data/app/views/completion_kit/metrics/starter_preview.html.erb +8 -0
  43. data/app/views/completion_kit/responses/show.html.erb +1 -1
  44. data/db/migrate/20260629000001_add_check_type_to_completion_kit_metrics.rb +6 -0
  45. data/db/migrate/20260629000002_add_check_type_to_completion_kit_metric_versions.rb +6 -0
  46. data/db/migrate/20260629000003_add_passed_to_completion_kit_reviews.rb +5 -0
  47. data/lib/completion_kit/version.rb +1 -1
  48. metadata +17 -1
@@ -51,10 +51,16 @@ module CompletionKit
51
51
  broadcast_ui
52
52
  end
53
53
 
54
# Ids of the attached metrics that can be graded right now: every check
# metric, followed by the LLM-judge metrics when llm_judge_configured? holds.
def gradable_metric_ids
  check_ids = check_metrics.pluck(:id)
  return check_ids unless llm_judge_configured?

  check_ids + llm_metrics.pluck(:id)
end
59
+
54
60
  def outstanding_work_zero?
55
61
  return false if responses.where.not(status: HasJobStatus::TERMINAL_STATUSES).exists?
56
62
 
57
- metric_ids = metrics.pluck(:id)
63
+ metric_ids = gradable_metric_ids
58
64
  return true if metric_ids.empty?
59
65
 
60
66
  succeeded_response_ids = responses.where(status: "succeeded").pluck(:id)
@@ -74,6 +80,31 @@ module CompletionKit
74
80
  judge_model.present? && metrics.any? && ApiConfig.valid_for_model?(judge_model)
75
81
  end
76
82
 
83
# Attached metrics whose metric_type is "llm_judge".
def llm_metrics
  judge_filter = { metric_type: "llm_judge" }
  metrics.where(judge_filter)
end
86
+
87
# Attached metrics whose metric_type is "check".
def check_metrics
  check_filter = { metric_type: "check" }
  metrics.where(check_filter)
end
90
+
91
# True when a judge model is set, at least one LLM-judge metric is attached,
# and ApiConfig reports the judge model as valid.
def llm_judge_configured?
  return false unless judge_model.present?
  return false unless llm_metrics.any?

  ApiConfig.valid_for_model?(judge_model)
end
94
+
95
# A run can be graded when the LLM judge is usable or at least one check
# metric is attached.
def gradable?
  judge_ready = llm_judge_configured?
  return judge_ready if judge_ready

  check_metrics.any?
end
98
+
99
# True only for a judge-only run whose attached metrics are all checks
# targeting "input_data". Run-metric rows without a metric are ignored, and
# a run with no attached metrics answers false.
def judge_only_input_data_checks?
  return false unless judge_only?

  attached_metrics = run_metrics.filter_map { |link| link.metric }
  return false if attached_metrics.empty?
  return false unless attached_metrics.all?(&:check?)

  attached_metrics.all? { |metric| metric.check_config.to_h["target"] == "input_data" }
end
107
+
77
108
  def replace_metrics!(metric_ids)
78
109
  return unless metric_ids
79
110
  run_metrics.delete_all
@@ -91,13 +122,29 @@ module CompletionKit
91
122
  end
92
123
 
93
124
  def metric_averages
94
- all_reviews = responses.flat_map(&:reviews).select { |r| r.ai_score.present? }
95
- all_reviews.group_by(&:metric_name).map do |name, reviews|
96
- scores = reviews.map { |r| r.ai_score.to_f }
97
- { name: name, avg: (scores.sum / scores.length).round(1) }
125
+ responses.flat_map(&:reviews).group_by(&:metric_name).filter_map do |name, reviews|
126
+ scored = reviews.select { |r| r.ai_score.present? }
127
+ if scored.any?
128
+ scores = scored.map { |r| r.ai_score.to_f }
129
+ { name: name, avg: (scores.sum / scores.length).round(1) }
130
+ else
131
+ resolved = reviews.reject { |r| r.passed.nil? }
132
+ next if resolved.empty?
133
+
134
+ passed = resolved.count { |r| r.passed == true }
135
+ { name: name, kind: "check", pass_rate: (passed.to_f / resolved.length).round(2) }
136
+ end
98
137
  end
99
138
  end
100
139
 
140
# Share of resolved reviews (those with a non-nil `passed`) across all
# responses that passed, rounded to two decimals; nil when none are resolved.
def check_pass_rate
  verdicts = responses.flat_map(&:reviews).map(&:passed).compact
  return if verdicts.empty?

  (verdicts.count(true).to_f / verdicts.length).round(2)
end
147
+
101
148
  def stale_review_summary
102
149
  review_pairs = Review.where(response_id: response_ids)
103
150
  .where.not(metric_id: nil)
@@ -141,7 +188,9 @@ module CompletionKit
141
188
 
142
189
  if judge_only?
143
190
  column = output_column.presence || "actual_output"
144
- return fail_with_summary!("Dataset has no \"#{column}\" column") unless dataset && dataset.headers.include?(column)
191
+ unless judge_only_input_data_checks? || (dataset && dataset.headers.include?(column))
192
+ return fail_with_summary!("Dataset has no \"#{column}\" column")
193
+ end
145
194
  else
146
195
  client = LlmClient.for_model(prompt.llm_model, ApiConfig.for_model(prompt.llm_model))
147
196
  unless client.configured?
@@ -168,13 +217,15 @@ module CompletionKit
168
217
  }
169
218
  if judge_only?
170
219
  attrs[:status] = "succeeded"
171
- attrs[:response_text] = row[output_column.presence || "actual_output"].to_s
220
+ column = output_column.presence || "actual_output"
221
+ attrs[:response_text] = row[column].to_s if dataset && dataset.headers.include?(column)
172
222
  end
173
223
 
174
224
  response = responses.create!(attrs)
175
225
 
176
226
  if judge_only?
177
- metrics.each { |m| JudgeReviewJob.perform_later(response.id, m.id, id) } if judge_configured?
227
+ llm_metrics.each { |m| JudgeReviewJob.perform_later(response.id, m.id, id) } if llm_judge_configured?
228
+ check_metrics.each { |m| CheckReviewJob.perform_later(response.id, m.id, id) }
178
229
  else
179
230
  GenerateRowJob.perform_later(id, response.id)
180
231
  end
@@ -195,10 +246,10 @@ module CompletionKit
195
246
  end
196
247
 
197
248
  def regrade!
198
- grading_metrics = metrics
199
- return false if grading_metrics.empty? || !judge_configured?
249
+ return false if metrics.empty? || !gradable?
200
250
 
201
- eligible_responses = responses.where(status: "succeeded").where.not(response_text: nil)
251
+ eligible_responses = responses.where(status: "succeeded")
252
+ eligible_responses = eligible_responses.where.not(response_text: nil) unless judge_only_input_data_checks?
202
253
  response_ids = eligible_responses.pluck(:id)
203
254
  return false if response_ids.empty?
204
255
 
@@ -208,6 +259,7 @@ module CompletionKit
208
259
  attempts: 0,
209
260
  metric_version_id: nil,
210
261
  ai_score: nil,
262
+ passed: nil,
211
263
  ai_feedback: nil,
212
264
  error_provider: nil,
213
265
  error_class: nil,
@@ -217,7 +269,8 @@ module CompletionKit
217
269
  update!(status: "running", failure_summary: nil, error_message: nil)
218
270
 
219
271
  response_ids.each do |rid|
220
- grading_metrics.each { |m| JudgeReviewJob.perform_later(rid, m.id, id) }
272
+ llm_metrics.each { |m| JudgeReviewJob.perform_later(rid, m.id, id) } if llm_judge_configured?
273
+ check_metrics.each { |m| CheckReviewJob.perform_later(rid, m.id, id) }
221
274
  end
222
275
  RunCompletionCheckJob.perform_later(id)
223
276
  end
@@ -231,14 +284,14 @@ module CompletionKit
231
284
  generated_failed = responses.where(status: "failed").count
232
285
  generated_total = progress_total
233
286
 
234
- metric_count = metrics.count
287
+ metric_ids = gradable_metric_ids
288
+ metric_count = metric_ids.size
235
289
  judged_total = metric_count > 0 ? generated_done : 0
236
290
  judged_done = 0
237
291
  judged_failed = 0
238
292
 
239
293
  if metric_count > 0 && judged_total > 0
240
294
  succeeded_response_ids = responses.where(status: "succeeded").pluck(:id)
241
- metric_ids = metrics.pluck(:id)
242
295
  review_counts = Review
243
296
  .where(response_id: succeeded_response_ids, metric_id: metric_ids)
244
297
  .group(:response_id, :status)
@@ -273,6 +326,7 @@ module CompletionKit
273
326
  output_column: output_column,
274
327
  created_at: created_at, updated_at: updated_at,
275
328
  responses_count: responses.count, avg_score: avg_score,
329
+ check_pass_rate: check_pass_rate,
276
330
  progress_current: snap[:generated_done],
277
331
  progress_total: snap[:generated_total],
278
332
  progress: {
@@ -411,6 +465,8 @@ module CompletionKit
411
465
  return
412
466
  end
413
467
 
468
+ return if judge_only_input_data_checks?
469
+
414
470
  column = output_column.presence || "actual_output"
415
471
  unless dataset.headers.include?(column)
416
472
  errors.add(:output_column, "\"#{column}\" is not a column on dataset \"#{dataset.name}\"")
@@ -0,0 +1,21 @@
1
module CompletionKit
  module Checks
    # Deterministic check that passes when the target text includes
    # config["value"].
    #
    # Matching is case-insensitive unless config["case_sensitive"] is exactly
    # true. Case-insensitive matching uses Unicode case folding, the same
    # rule String#casecmp? applies, so "STRASSE" is found in "Straße".
    class Contains
      # @param target [#to_s] text to search; nil is treated as ""
      # @param config [Hash] string-keyed options ("value", "case_sensitive")
      # @return [Result] passed plus a human-readable detail
      def call(target, config)
        value = config["value"].to_s
        case_sensitive = config["case_sensitive"] == true

        if includes?(target.to_s, value, case_sensitive)
          Result.new(passed: true, detail: "contains #{value.inspect}")
        else
          Result.new(passed: false, detail: "does not contain #{value.inspect}")
        end
      end

      private

      # Substring test, folding both sides when case is to be ignored.
      def includes?(haystack, needle, case_sensitive)
        return haystack.include?(needle) if case_sensitive

        haystack.downcase(:fold).include?(needle.downcase(:fold))
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
module CompletionKit
  module Checks
    # Deterministic check that passes when the target text equals
    # config["value"]. Both sides are stripped when config["trim"] is exactly
    # true; comparison ignores case unless config["case_sensitive"] is
    # exactly true.
    class Equals
      # @param target [#to_s] text under test
      # @param config [Hash] string-keyed options ("value", "trim", "case_sensitive")
      # @return [Result]
      def call(target, config)
        actual, expected = [target, config["value"]].map(&:to_s)
        actual, expected = actual.strip, expected.strip if config["trim"] == true

        same = config["case_sensitive"] == true ? actual == expected : actual.casecmp?(expected)
        return Result.new(passed: true, detail: "equals #{expected.inspect}") if same

        Result.new(passed: false, detail: "#{actual.inspect} != #{expected.inspect}")
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
module CompletionKit
  module Checks
    # Deterministic check that parses the target as JSON, walks the
    # dot-separated path in config["json_path"], and passes when the value
    # found there == config["expected"].
    #
    # Path segments address Hash keys; a segment made only of digits also
    # addresses an Array element by position (e.g. "items.0.name"). An empty
    # path refers to the whole document.
    class JsonPathEquals
      # Sentinel for "no value at this path"; distinct from a JSON null.
      MISSING = Object.new.freeze
      ARRAY_INDEX = /\A\d+\z/

      # @param target [#to_s] JSON text
      # @param config [Hash] string-keyed options ("json_path", "expected")
      # @return [Result] failed with "not valid JSON" when parsing fails
      def call(target, config)
        path = config["json_path"]
        expected = config["expected"]
        value = dig(JSON.parse(target.to_s), path.to_s)

        if value.equal?(MISSING)
          Result.new(passed: false, detail: "path #{path} not found")
        elsif value == expected
          Result.new(passed: true, detail: "#{path} == #{expected.inspect}")
        else
          Result.new(passed: false, detail: "#{path} was #{value.inspect}, expected #{expected.inspect}")
        end
      rescue JSON::ParserError
        Result.new(passed: false, detail: "not valid JSON")
      end

      private

      # Follows each path segment in turn; returns MISSING as soon as a
      # segment cannot be resolved against the current node.
      def dig(data, path)
        path.split(".").reduce(data) do |node, key|
          case node
          when Hash
            return MISSING unless node.key?(key)

            node[key]
          when Array
            return MISSING unless ARRAY_INDEX.match?(key) && key.to_i < node.length

            node[key.to_i]
          else
            return MISSING
          end
        end
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module CompletionKit
  module Checks
    # Deterministic check that passes when the target's character length lies
    # within the optional config["min"] / config["max"] bounds (inclusive).
    #
    # A bound that is nil, false, or a blank string is treated as "no bound";
    # previously a blank string was coerced to 0, so a blank max rejected
    # every non-empty text.
    class LengthBounds
      # @param target [#to_s] text to measure; nil counts as length 0
      # @param config [Hash] string-keyed options ("min", "max")
      # @return [Result]
      def call(target, config)
        length = target.to_s.length
        min = bound(config["min"])
        max = bound(config["max"])

        if min && length < min
          Result.new(passed: false, detail: "length #{length} below min #{min}")
        elsif max && length > max
          Result.new(passed: false, detail: "length #{length} above max #{max}")
        else
          Result.new(passed: true, detail: "length #{length} within bounds")
        end
      end

      private

      # Integer value of a configured bound, or nil when it is absent/blank.
      def bound(raw)
        return nil if !raw || raw.to_s.strip.empty?

        raw.to_i
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
module CompletionKit
  module Checks
    # Deterministic check that fails when the target text matches any known
    # refusal phrase ("I'm sorry", "I can't help", "as an AI", ...).
    class NoRefusal
      # Contractions are matched with a straight (') or typographic (U+2019)
      # apostrophe, or none at all; model output frequently uses the curly
      # form, which the straight-only patterns missed.
      APOSTROPHE = "['\u2019]".freeze

      PATTERNS = [
        /\bi#{APOSTROPHE}?m sorry\b/i,
        /\bi can#{APOSTROPHE}?t (?:help|assist|comply|do that|provide)/i,
        /\bi (?:cannot|can#{APOSTROPHE}?t) (?:help|assist|fulfill|comply|provide)/i,
        /\bi#{APOSTROPHE}?m (?:unable|not able) to\b/i,
        /\bi (?:won#{APOSTROPHE}?t|will not) (?:be able|help|assist)\b/i,
        /\bas an ai\b/i
      ].freeze

      # @param target [#to_s] text to scan
      # @param _config [Hash] unused; this check takes no options
      # @return [Result] failed with "refusal detected" on any pattern match
      def call(target, _config)
        text = target.to_s
        if PATTERNS.any? { |pattern| pattern.match?(text) }
          Result.new(passed: false, detail: "refusal detected")
        else
          Result.new(passed: true, detail: "no refusal detected")
        end
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
module CompletionKit
  module Checks
    # Deterministic check that passes when the target text does NOT include
    # config["value"].
    #
    # Matching is case-insensitive unless config["case_sensitive"] is exactly
    # true. Case-insensitive matching uses Unicode case folding, the same
    # rule String#casecmp? applies, so "STRASSE" is detected in "Straße".
    class NotContains
      # @param target [#to_s] text to search; nil is treated as ""
      # @param config [Hash] string-keyed options ("value", "case_sensitive")
      # @return [Result] passed plus a human-readable detail
      def call(target, config)
        value = config["value"].to_s
        case_sensitive = config["case_sensitive"] == true

        if includes?(target.to_s, value, case_sensitive)
          Result.new(passed: false, detail: "contains #{value.inspect}")
        else
          Result.new(passed: true, detail: "does not contain #{value.inspect}")
        end
      end

      private

      # Substring test, folding both sides when case is to be ignored.
      def includes?(haystack, needle, case_sensitive)
        return haystack.include?(needle) if case_sensitive

        haystack.downcase(:fold).include?(needle.downcase(:fold))
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
module CompletionKit
  module Checks
    # Deterministic check that passes when config["pattern"] matches the
    # target text.
    #
    # The pattern is case-sensitive unless config["case_sensitive"] is exactly
    # false; config["multiline"] == true lets "." match newlines.
    class Regex
      # Upper bound on a single match. The pattern is user-supplied and runs
      # against model output, so an unbounded match could stall a worker on
      # catastrophic backtracking.
      MATCH_TIMEOUT_SECONDS = 1.0

      # @param target [#to_s] text to match against
      # @param config [Hash] string-keyed options ("pattern", "case_sensitive", "multiline")
      # @return [Result] failed (not raised) for an invalid or timed-out pattern
      def call(target, config)
        source = config["pattern"]
        options = 0
        options |= Regexp::IGNORECASE if config["case_sensitive"] == false
        options |= Regexp::MULTILINE if config["multiline"] == true

        if compile(source.to_s, options).match?(target.to_s)
          Result.new(passed: true, detail: "matched /#{source}/")
        else
          Result.new(passed: false, detail: "no match for /#{source}/")
        end
      rescue RegexpError => e
        Result.new(passed: false, detail: failure_detail(e))
      end

      private

      # Builds the Regexp, attaching the match timeout when the running Ruby
      # supports per-pattern timeouts.
      def compile(source, options)
        return Regexp.new(source, options) unless Regexp.respond_to?(:timeout)

        Regexp.new(source, options, timeout: MATCH_TIMEOUT_SECONDS)
      end

      # Distinguishes a timed-out match from a pattern that failed to compile.
      def failure_detail(error)
        timed_out = defined?(Regexp::TimeoutError) && error.is_a?(Regexp::TimeoutError)
        return "pattern timed out after #{MATCH_TIMEOUT_SECONDS}s" if timed_out

        "invalid pattern: #{error.message}"
      end
    end
  end
end
@@ -0,0 +1,41 @@
1
module CompletionKit
  module Checks
    # Lookup table from a check kind (the "check_kind" string) to the class
    # implementing it, plus the config keys each kind lists as required.
    module Registry
      CHECKS = {
        "contains" => Contains,
        "not_contains" => NotContains,
        "equals" => Equals,
        "regex" => Regex,
        "valid_json" => ValidJson,
        "json_path_equals" => JsonPathEquals,
        "length_bounds" => LengthBounds,
        "no_refusal" => NoRefusal
      }.freeze

      REQUIRED_KEYS = {
        "contains" => %w[value],
        "not_contains" => %w[value],
        "equals" => %w[value],
        "regex" => %w[pattern],
        "valid_json" => [],
        "json_path_equals" => %w[json_path expected],
        "length_bounds" => [],
        "no_refusal" => []
      }.freeze

      KINDS = CHECKS.keys.freeze

      class << self
        # @return [Array<String>] every registered check kind
        def kinds
          KINDS
        end

        # @return [Hash{String => Array<String>}] required config keys per kind
        def required_keys
          REQUIRED_KEYS
        end

        # Builds a fresh instance of the check registered under `kind`.
        #
        # @raise [KeyError] when `kind` is not registered
        def fetch(kind)
          check_class = CHECKS.fetch(kind)
          check_class.new
        end
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
module CompletionKit
  module Checks
    # Immutable outcome of a single check run: `passed` is the verdict and
    # `detail` a short human-readable explanation.
    Result = Data.define(:passed, :detail)
  end
end
@@ -0,0 +1,31 @@
1
module CompletionKit
  module Checks
    # Picks the text a check runs against, based on config["target"]:
    # "input_data" reads response.input_data, "json_path" extracts a value
    # from the JSON in response.response_text, and anything else falls back
    # to response.response_text.
    module TargetResolver
      TARGETS = %w[response_text input_data json_path].freeze
      # Returned when a json_path target cannot be resolved (invalid JSON or
      # a path segment that does not exist).
      UNRESOLVED = Object.new.freeze
      ARRAY_INDEX = /\A\d+\z/

      # @param response [#response_text, #input_data]
      # @param config [Hash] string-keyed options ("target", "target_path")
      # @return [Object] the target value, or UNRESOLVED
      def self.call(response, config)
        case config["target"]
        when "input_data"
          response.input_data
        when "json_path"
          resolve_json_path(response.response_text, config["target_path"].to_s)
        else
          response.response_text
        end
      end

      # Walks a dot-separated path through parsed JSON. Segments address Hash
      # keys; an all-digit segment also addresses an Array element by
      # position. The value found is returned as a String.
      #
      # @return [String, Object] the stringified value, or UNRESOLVED
      def self.resolve_json_path(text, path)
        value = path.split(".").reduce(JSON.parse(text.to_s)) do |node, key|
          child = descend(node, key)
          return UNRESOLVED if child.equal?(UNRESOLVED)

          child
        end
        stringify(value)
      rescue JSON::ParserError
        UNRESOLVED
      end

      # One path step: the child of `node` named by `key`, or UNRESOLVED.
      def self.descend(node, key)
        case node
        when Hash
          node.key?(key) ? node[key] : UNRESOLVED
        when Array
          ARRAY_INDEX.match?(key) && key.to_i < node.length ? node[key.to_i] : UNRESOLVED
        else
          UNRESOLVED
        end
      end

      # Objects and arrays are re-serialized as JSON so downstream checks see
      # JSON text rather than Ruby's inspect format, whose Hash rendering
      # differs between Ruby versions. Scalars keep their to_s form.
      def self.stringify(value)
        case value
        when Hash, Array then JSON.generate(value)
        else value.to_s
        end
      end

      private_class_method :descend, :stringify
    end
  end
end
@@ -0,0 +1,12 @@
1
module CompletionKit
  module Checks
    # Deterministic check that passes when the target text parses as JSON.
    class ValidJson
      # @param target [#to_s] text to parse
      # @param _config [Hash] unused; this check takes no options
      # @return [Result]
      def call(target, _config)
        parseable = begin
          JSON.parse(target.to_s)
          true
        rescue JSON::ParserError
          false
        end

        Result.new(passed: parseable, detail: parseable ? "valid JSON" : "not valid JSON")
      end
    end
  end
end
@@ -50,6 +50,8 @@ module CompletionKit
50
50
  run = CompletionKit::Run.find(args["run_id"])
51
51
  response = run.responses.find(args["response_id"])
52
52
  metric = CompletionKit::Metric.find(args["metric_id"])
53
+ return error_result("Checks have nothing to calibrate; agreements are only for llm_judge metrics.") if metric.check?
54
+
53
55
  created_by = args["created_by"].presence || "mcp"
54
56
 
55
57
  agreement = CompletionKit::Agreement.find_or_initialize_by(
@@ -53,6 +53,8 @@ module CompletionKit
53
53
 
54
54
  def self.compare(args)
55
55
  metric = CompletionKit::Metric.find(args["metric_id"])
56
+ return error_result("judges_compare is unavailable for check metrics.") if metric.check?
57
+
56
58
  a = CompletionKit::MetricVersion.where(metric_id: metric.id).find(args["metric_version_a_id"])
57
59
  b = CompletionKit::MetricVersion.where(metric_id: metric.id).find(args["metric_version_b_id"])
58
60
  stats_a = CompletionKit::MetricAgreementStats.for(metric, metric_version: a)
@@ -3,6 +3,28 @@ module CompletionKit
3
3
  module Metrics
4
4
  extend Base
5
5
 
6
+ CHECK_CONFIG_SCHEMA = {
7
+ type: "object",
8
+ properties: {
9
+ check_kind: {type: "string", enum: CompletionKit::Checks::Registry.kinds},
10
+ target: {type: "string", enum: CompletionKit::Checks::TargetResolver::TARGETS},
11
+ target_path: {type: "string"},
12
+ value: {type: "string"},
13
+ pattern: {type: "string"},
14
+ json_path: {type: "string"},
15
+ expected: {},
16
+ min: {type: "integer"},
17
+ max: {type: "integer"},
18
+ case_sensitive: {type: "boolean"},
19
+ multiline: {type: "boolean"},
20
+ trim: {type: "boolean"}
21
+ }
22
+ }.freeze
23
+
24
+ CHECK_CONFIG_HINT = "For a deterministic check set metric_type:\"check\" and check_config. Per-kind required keys: " \
25
+ "value (contains/not_contains/equals), pattern (regex), json_path+expected (json_path_equals), " \
26
+ "min and/or max (length_bounds); valid_json and no_refusal take no extra keys. target_path is required when target is json_path."
27
+
6
28
  TOOLS = {
7
29
  "metrics_list" => {
8
30
  description: "List all metrics",
@@ -15,12 +37,14 @@ module CompletionKit
15
37
  handler: :get
16
38
  },
17
39
  "metrics_create" => {
18
- description: "Create a metric with evaluation criteria",
40
+ description: "Create a metric with evaluation criteria. #{CHECK_CONFIG_HINT}",
19
41
  inputSchema: {
20
42
  type: "object",
21
43
  properties: {
22
44
  name: {type: "string"}, instruction: {type: "string"},
45
+ metric_type: {type: "string", enum: CompletionKit::Metric::METRIC_TYPES},
23
46
  rubric_bands: {type: "array", items: {type: "object", properties: {stars: {type: "integer"}, description: {type: "string"}}}},
47
+ check_config: CHECK_CONFIG_SCHEMA,
24
48
  tag_names: {type: "array", items: {type: "string"}}
25
49
  },
26
50
  required: ["name"]
@@ -28,12 +52,14 @@ module CompletionKit
28
52
  handler: :create
29
53
  },
30
54
  "metrics_update" => {
31
- description: "Update a metric",
55
+ description: "Update a metric. #{CHECK_CONFIG_HINT}",
32
56
  inputSchema: {
33
57
  type: "object",
34
58
  properties: {
35
59
  id: {type: "integer"}, name: {type: "string"}, instruction: {type: "string"},
60
+ metric_type: {type: "string", enum: CompletionKit::Metric::METRIC_TYPES},
36
61
  rubric_bands: {type: "array", items: {type: "object", properties: {stars: {type: "integer"}, description: {type: "string"}}}},
62
+ check_config: CHECK_CONFIG_SCHEMA,
37
63
  tag_names: {type: "array", items: {type: "string"}}
38
64
  },
39
65
  required: ["id"]
@@ -69,7 +95,7 @@ module CompletionKit
69
95
  end
70
96
 
71
97
  def self.create(args)
72
- metric = Metric.new(args.slice("name", "instruction", "rubric_bands"))
98
+ metric = Metric.new(args.slice("name", "instruction", "rubric_bands", "metric_type", "check_config"))
73
99
  metric.tag_names = args["tag_names"] if args.key?("tag_names")
74
100
  if metric.save
75
101
  text_result(metric.reload.as_json)
@@ -80,7 +106,7 @@ module CompletionKit
80
106
 
81
107
  def self.update(args)
82
108
  metric = Metric.find(args["id"])
83
- if metric.update(args.except("id").slice("name", "instruction", "rubric_bands"))
109
+ if metric.update(args.except("id").slice("name", "instruction", "rubric_bands", "metric_type", "check_config"))
84
110
  metric.update!(tag_names: args["tag_names"]) if args.key?("tag_names")
85
111
  text_result(metric.reload.as_json)
86
112
  else
@@ -95,6 +121,8 @@ module CompletionKit
95
121
 
96
122
  def self.suggest_variants(args)
97
123
  metric = Metric.find(args["metric_id"])
124
+ return error_result("Metric ##{metric.id} is a check; checks are exact and have no variants to suggest.") if metric.check?
125
+
98
126
  generator = MetricVariantGenerator.new(metric, count: args["count"].to_i, model: args["model"])
99
127
  variants = generator.call
100
128
  return error_result("Variant generator returned no parseable variants. Try again or change the model.") if variants.empty?
@@ -17,6 +17,8 @@ module CompletionKit
17
17
  end
18
18
 
19
19
  def judge_examples_for(metric, exclude_response_id: nil, limit: DEFAULT_JUDGE_EXAMPLE_LIMIT)
20
+ return [] if metric.check?
21
+
20
22
  current_version = MetricVersion.current.find_by(metric_id: metric.id)
21
23
  return [] unless current_version
22
24
 
@@ -9,6 +9,8 @@ module CompletionKit
9
9
  end
10
10
 
11
11
  def call
12
+ return summarize([], 0, false) if @metric.check?
13
+
12
14
  key = answer_key
13
15
  rows = []
14
16
  key.each do |entry|
@@ -14,6 +14,7 @@ module CompletionKit
14
14
  end
15
15
 
16
16
  def call
17
+ return [] if @metric.check?
17
18
  raise CompletionKit::ConfigurationError, "No judging model available; set CompletionKit.config.judge_model or add a provider with a judging model" if @model.blank?
18
19
 
19
20
  client = LlmClient.for_model(@model, ApiConfig.for_model(@model))
@@ -24,7 +24,7 @@ module CompletionKit
24
24
  },
25
25
  metric: {
26
26
  name: "Metric",
27
- definition: "An evaluation dimension with its own 1-5 rubric. The LLM judge scores every response against it."
27
+ definition: "An evaluation dimension. An LLM judge scores each response on a 1-5 rubric, or a deterministic check passes or fails it with no model call."
28
28
  }
29
29
  }.freeze
30
30
  end
@@ -37,7 +37,11 @@ module CompletionKit
37
37
  sections << "Expected: #{resp.expected_output.truncate(200)}"
38
38
  end
39
39
  resp.reviews.each do |review|
40
- sections << " #{review.metric_name}: #{review.ai_score}/5 — #{review.ai_feedback}"
40
+ if review.check?
41
+ sections << " #{review.metric_name}: #{review.passed ? "PASS" : "FAIL"}"
42
+ else
43
+ sections << " #{review.metric_name}: #{review.ai_score}/5 — #{review.ai_feedback}"
44
+ end
41
45
  end
42
46
  sections << ""
43
47
  end
@@ -45,10 +49,10 @@ module CompletionKit
45
49
  avg = @run.avg_score
46
50
  sections << "## Overall Score: #{avg}/5" if avg
47
51
 
48
- metric_avgs = @run.metric_averages
49
- if metric_avgs.any?
52
+ rubric_avgs = @run.metric_averages.select { |m| m.key?(:avg) }
53
+ if rubric_avgs.any?
50
54
  sections << "## Metric Averages"
51
- metric_avgs.each { |m| sections << " #{m[:name]}: #{m[:avg]}/5" }
55
+ rubric_avgs.each { |m| sections << " #{m[:name]}: #{m[:avg]}/5" }
52
56
  sections << ""
53
57
  end
54
58
 
@@ -86,7 +86,7 @@ module CompletionKit
86
86
  def judge_score(response, new_text)
87
87
  config = ApiConfig.for_model(@run.judge_model).merge(judge_model: @run.judge_model)
88
88
  judge = JudgeService.new(config)
89
- scores = @run.metrics.filter_map do |metric|
89
+ scores = @run.metrics.select(&:llm_judge?).filter_map do |metric|
90
90
  judge.evaluate(
91
91
  new_text, response.expected_output, @candidate,
92
92
  criteria: metric.instruction.to_s,
@@ -1,6 +1,6 @@
1
1
  module CompletionKit
2
2
  module StarterMetrics
3
- Starter = Struct.new(:key, :name, :description, :catches, :instruction, :rubric_bands, keyword_init: true)
3
+ Starter = Struct.new(:key, :name, :description, :catches, :instruction, :rubric_bands, :metric_type, :check_config, keyword_init: true)
4
4
 
5
5
  ALL = [
6
6
  Starter.new(
@@ -72,6 +72,30 @@ module CompletionKit
72
72
  { "stars" => 2, "description" => "Noticeable filler or visible gaps." },
73
73
  { "stars" => 1, "description" => "Padded, repetitive, or so short it loses information." }
74
74
  ]
75
+ ),
76
+ Starter.new(
77
+ key: "valid_json",
78
+ name: "Valid JSON",
79
+ description: "Does the output parse as JSON?",
80
+ catches: "Broken or partial JSON, prose wrapped around a structured response, trailing commas. A deterministic pass/fail with no LLM judgement.",
81
+ metric_type: "check",
82
+ check_config: { "check_kind" => "valid_json", "target" => "response_text" }
83
+ ),
84
+ Starter.new(
85
+ key: "no_refusal",
86
+ name: "No refusal",
87
+ description: "Did the model answer instead of refusing?",
88
+ catches: "\"I'm sorry, I can't help with that\" and other refusal boilerplate when a real answer was expected. Deterministic, no judge call.",
89
+ metric_type: "check",
90
+ check_config: { "check_kind" => "no_refusal", "target" => "response_text" }
91
+ ),
92
+ Starter.new(
93
+ key: "contains_token",
94
+ name: "Contains required token",
95
+ description: "Does the output contain a required substring?",
96
+ catches: "A required marker, citation, or keyword the output must always include. Set the value to the token you require.",
97
+ metric_type: "check",
98
+ check_config: { "check_kind" => "contains", "target" => "response_text", "value" => "REQUIRED" }
75
99
  )
76
100
  ].freeze
77
101