completion-kit 0.17.1 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of completion-kit might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/app/controllers/completion_kit/agreements_controller.rb +5 -0
- data/app/controllers/completion_kit/api/v1/agreements_controller.rb +5 -0
- data/app/controllers/completion_kit/api/v1/metrics_controller.rb +9 -2
- data/app/controllers/completion_kit/api/v1/runs_controller.rb +1 -1
- data/app/controllers/completion_kit/metrics_controller.rb +97 -36
- data/app/controllers/completion_kit/runs_controller.rb +1 -1
- data/app/jobs/completion_kit/check_review_job.rb +66 -0
- data/app/jobs/completion_kit/generate_row_job.rb +5 -2
- data/app/jobs/completion_kit/metric_suggestion_job.rb +1 -0
- data/app/models/completion_kit/metric.rb +91 -5
- data/app/models/completion_kit/metric_version.rb +34 -7
- data/app/models/completion_kit/response.rb +18 -2
- data/app/models/completion_kit/review.rb +5 -1
- data/app/models/completion_kit/run.rb +70 -14
- data/app/services/completion_kit/checks/contains.rb +21 -0
- data/app/services/completion_kit/checks/equals.rb +26 -0
- data/app/services/completion_kit/checks/json_path_equals.rb +32 -0
- data/app/services/completion_kit/checks/length_bounds.rb +19 -0
- data/app/services/completion_kit/checks/no_refusal.rb +23 -0
- data/app/services/completion_kit/checks/not_contains.rb +21 -0
- data/app/services/completion_kit/checks/regex.rb +20 -0
- data/app/services/completion_kit/checks/registry.rb +41 -0
- data/app/services/completion_kit/checks/result.rb +5 -0
- data/app/services/completion_kit/checks/target_resolver.rb +31 -0
- data/app/services/completion_kit/checks/valid_json.rb +12 -0
- data/app/services/completion_kit/mcp_tools/agreements.rb +2 -0
- data/app/services/completion_kit/mcp_tools/judges.rb +2 -0
- data/app/services/completion_kit/mcp_tools/metrics.rb +32 -4
- data/app/services/completion_kit/metric_agreement_examples.rb +2 -0
- data/app/services/completion_kit/metric_improvement_validator.rb +2 -0
- data/app/services/completion_kit/metric_variant_generator.rb +1 -0
- data/app/services/completion_kit/onboarding/concepts.rb +1 -1
- data/app/services/completion_kit/prompt_improvement_service.rb +8 -4
- data/app/services/completion_kit/prompt_improvement_validator.rb +1 -1
- data/app/services/completion_kit/starter_metrics.rb +25 -1
- data/app/views/completion_kit/api_reference/_body.html.erb +4 -4
- data/app/views/completion_kit/metrics/_check_spec.html.erb +17 -0
- data/app/views/completion_kit/metrics/_form.html.erb +104 -1
- data/app/views/completion_kit/metrics/index.html.erb +4 -3
- data/app/views/completion_kit/metrics/show.html.erb +26 -14
- data/app/views/completion_kit/metrics/starter_preview.html.erb +8 -0
- data/app/views/completion_kit/responses/show.html.erb +1 -1
- data/db/migrate/20260629000001_add_check_type_to_completion_kit_metrics.rb +6 -0
- data/db/migrate/20260629000002_add_check_type_to_completion_kit_metric_versions.rb +6 -0
- data/db/migrate/20260629000003_add_passed_to_completion_kit_reviews.rb +5 -0
- data/lib/completion_kit/version.rb +1 -1
- metadata +17 -1
|
@@ -51,10 +51,16 @@ module CompletionKit
|
|
|
51
51
|
broadcast_ui
|
|
52
52
|
end
|
|
53
53
|
|
|
54
|
+
# Ids of the metrics this run can actually grade right now.
#
# Check metrics are always included; LLM-judge metrics are appended only
# when llm_judge_configured? is truthy.
#
# @return [Array] check metric ids first, then LLM-judge metric ids
def gradable_metric_ids
  check_ids = check_metrics.pluck(:id)
  return check_ids unless llm_judge_configured?

  check_ids + llm_metrics.pluck(:id)
end
|
|
59
|
+
|
|
54
60
|
def outstanding_work_zero?
|
|
55
61
|
return false if responses.where.not(status: HasJobStatus::TERMINAL_STATUSES).exists?
|
|
56
62
|
|
|
57
|
-
metric_ids =
|
|
63
|
+
metric_ids = gradable_metric_ids
|
|
58
64
|
return true if metric_ids.empty?
|
|
59
65
|
|
|
60
66
|
succeeded_response_ids = responses.where(status: "succeeded").pluck(:id)
|
|
@@ -74,6 +80,31 @@ module CompletionKit
|
|
|
74
80
|
judge_model.present? && metrics.any? && ApiConfig.valid_for_model?(judge_model)
|
|
75
81
|
end
|
|
76
82
|
|
|
83
|
+
# Narrows +metrics+ to those whose metric_type is "llm_judge".
def llm_metrics
  metrics.where(metric_type: "llm_judge")
end
|
|
86
|
+
|
|
87
|
+
# Narrows +metrics+ to those whose metric_type is "check".
def check_metrics
  metrics.where(metric_type: "check")
end
|
|
90
|
+
|
|
91
|
+
# Whether LLM-judge grading can run: a judge model is set, at least one
# LLM-judge metric is attached, and ApiConfig accepts the judge model.
def llm_judge_configured?
  return false unless judge_model.present?
  return false unless llm_metrics.any?

  ApiConfig.valid_for_model?(judge_model)
end
|
|
94
|
+
|
|
95
|
+
# True when there is anything to grade with: a usable LLM judge or at
# least one check metric.
def gradable?
  judge_ready = llm_judge_configured?
  judge_ready || check_metrics.any?
end
|
|
98
|
+
|
|
99
|
+
# True only for a judge-only run whose attached metrics are all checks
# targeting "input_data".
#
# Run metrics without a metric are ignored; a run with no attached metrics
# returns false. A missing check_config is treated as an empty hash.
def judge_only_input_data_checks?
  return false unless judge_only?

  attached_metrics = run_metrics.filter_map(&:metric)
  return false if attached_metrics.empty?

  attached_metrics.all? do |metric|
    metric.check? && metric.check_config.to_h["target"] == "input_data"
  end
end
|
|
107
|
+
|
|
77
108
|
def replace_metrics!(metric_ids)
|
|
78
109
|
return unless metric_ids
|
|
79
110
|
run_metrics.delete_all
|
|
@@ -91,13 +122,29 @@ module CompletionKit
|
|
|
91
122
|
end
|
|
92
123
|
|
|
93
124
|
# Per-metric summary across every review on this run's responses.
#
# Reviews are grouped by metric_name. A group with at least one ai_score
# yields { name:, avg: } (mean of the scores, one decimal). Otherwise a
# group with resolved pass/fail verdicts yields
# { name:, kind: "check", pass_rate: } (two decimals). Groups with
# neither are dropped.
#
# @return [Array<Hash>]
def metric_averages
  grouped = responses.flat_map(&:reviews).group_by(&:metric_name)

  grouped.filter_map do |metric_name, metric_reviews|
    rubric_scores = metric_reviews.filter_map do |review|
      review.ai_score.to_f if review.ai_score.present?
    end

    if rubric_scores.empty?
      # No rubric scores: fall back to pass/fail verdicts, skipping unresolved (nil) ones.
      verdicts = metric_reviews.map(&:passed).compact
      next if verdicts.empty?

      { name: metric_name, kind: "check", pass_rate: (verdicts.count(true).to_f / verdicts.length).round(2) }
    else
      { name: metric_name, avg: (rubric_scores.sum / rubric_scores.length).round(1) }
    end
  end
end
|
|
100
139
|
|
|
140
|
+
# Share of resolved pass/fail verdicts that passed, across every review on
# this run's responses, rounded to two decimals.
#
# @return [Float, nil] nil when no review has a non-nil +passed+
def check_pass_rate
  verdicts = responses.flat_map(&:reviews).map(&:passed).compact
  return if verdicts.empty?

  (verdicts.count(true).to_f / verdicts.length).round(2)
end
|
|
147
|
+
|
|
101
148
|
def stale_review_summary
|
|
102
149
|
review_pairs = Review.where(response_id: response_ids)
|
|
103
150
|
.where.not(metric_id: nil)
|
|
@@ -141,7 +188,9 @@ module CompletionKit
|
|
|
141
188
|
|
|
142
189
|
if judge_only?
|
|
143
190
|
column = output_column.presence || "actual_output"
|
|
144
|
-
|
|
191
|
+
unless judge_only_input_data_checks? || (dataset && dataset.headers.include?(column))
|
|
192
|
+
return fail_with_summary!("Dataset has no \"#{column}\" column")
|
|
193
|
+
end
|
|
145
194
|
else
|
|
146
195
|
client = LlmClient.for_model(prompt.llm_model, ApiConfig.for_model(prompt.llm_model))
|
|
147
196
|
unless client.configured?
|
|
@@ -168,13 +217,15 @@ module CompletionKit
|
|
|
168
217
|
}
|
|
169
218
|
if judge_only?
|
|
170
219
|
attrs[:status] = "succeeded"
|
|
171
|
-
|
|
220
|
+
column = output_column.presence || "actual_output"
|
|
221
|
+
attrs[:response_text] = row[column].to_s if dataset && dataset.headers.include?(column)
|
|
172
222
|
end
|
|
173
223
|
|
|
174
224
|
response = responses.create!(attrs)
|
|
175
225
|
|
|
176
226
|
if judge_only?
|
|
177
|
-
|
|
227
|
+
llm_metrics.each { |m| JudgeReviewJob.perform_later(response.id, m.id, id) } if llm_judge_configured?
|
|
228
|
+
check_metrics.each { |m| CheckReviewJob.perform_later(response.id, m.id, id) }
|
|
178
229
|
else
|
|
179
230
|
GenerateRowJob.perform_later(id, response.id)
|
|
180
231
|
end
|
|
@@ -195,10 +246,10 @@ module CompletionKit
|
|
|
195
246
|
end
|
|
196
247
|
|
|
197
248
|
def regrade!
|
|
198
|
-
|
|
199
|
-
return false if grading_metrics.empty? || !judge_configured?
|
|
249
|
+
return false if metrics.empty? || !gradable?
|
|
200
250
|
|
|
201
|
-
eligible_responses = responses.where(status: "succeeded")
|
|
251
|
+
eligible_responses = responses.where(status: "succeeded")
|
|
252
|
+
eligible_responses = eligible_responses.where.not(response_text: nil) unless judge_only_input_data_checks?
|
|
202
253
|
response_ids = eligible_responses.pluck(:id)
|
|
203
254
|
return false if response_ids.empty?
|
|
204
255
|
|
|
@@ -208,6 +259,7 @@ module CompletionKit
|
|
|
208
259
|
attempts: 0,
|
|
209
260
|
metric_version_id: nil,
|
|
210
261
|
ai_score: nil,
|
|
262
|
+
passed: nil,
|
|
211
263
|
ai_feedback: nil,
|
|
212
264
|
error_provider: nil,
|
|
213
265
|
error_class: nil,
|
|
@@ -217,7 +269,8 @@ module CompletionKit
|
|
|
217
269
|
update!(status: "running", failure_summary: nil, error_message: nil)
|
|
218
270
|
|
|
219
271
|
response_ids.each do |rid|
|
|
220
|
-
|
|
272
|
+
llm_metrics.each { |m| JudgeReviewJob.perform_later(rid, m.id, id) } if llm_judge_configured?
|
|
273
|
+
check_metrics.each { |m| CheckReviewJob.perform_later(rid, m.id, id) }
|
|
221
274
|
end
|
|
222
275
|
RunCompletionCheckJob.perform_later(id)
|
|
223
276
|
end
|
|
@@ -231,14 +284,14 @@ module CompletionKit
|
|
|
231
284
|
generated_failed = responses.where(status: "failed").count
|
|
232
285
|
generated_total = progress_total
|
|
233
286
|
|
|
234
|
-
|
|
287
|
+
metric_ids = gradable_metric_ids
|
|
288
|
+
metric_count = metric_ids.size
|
|
235
289
|
judged_total = metric_count > 0 ? generated_done : 0
|
|
236
290
|
judged_done = 0
|
|
237
291
|
judged_failed = 0
|
|
238
292
|
|
|
239
293
|
if metric_count > 0 && judged_total > 0
|
|
240
294
|
succeeded_response_ids = responses.where(status: "succeeded").pluck(:id)
|
|
241
|
-
metric_ids = metrics.pluck(:id)
|
|
242
295
|
review_counts = Review
|
|
243
296
|
.where(response_id: succeeded_response_ids, metric_id: metric_ids)
|
|
244
297
|
.group(:response_id, :status)
|
|
@@ -273,6 +326,7 @@ module CompletionKit
|
|
|
273
326
|
output_column: output_column,
|
|
274
327
|
created_at: created_at, updated_at: updated_at,
|
|
275
328
|
responses_count: responses.count, avg_score: avg_score,
|
|
329
|
+
check_pass_rate: check_pass_rate,
|
|
276
330
|
progress_current: snap[:generated_done],
|
|
277
331
|
progress_total: snap[:generated_total],
|
|
278
332
|
progress: {
|
|
@@ -411,6 +465,8 @@ module CompletionKit
|
|
|
411
465
|
return
|
|
412
466
|
end
|
|
413
467
|
|
|
468
|
+
return if judge_only_input_data_checks?
|
|
469
|
+
|
|
414
470
|
column = output_column.presence || "actual_output"
|
|
415
471
|
unless dataset.headers.include?(column)
|
|
416
472
|
errors.add(:output_column, "\"#{column}\" is not a column on dataset \"#{dataset.name}\"")
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
module CompletionKit
  module Checks
    # Deterministic check that passes when the target text includes a
    # configured substring.
    class Contains
      # @param target [#to_s] value under test; coerced with +to_s+
      # @param config [Hash] reads "value" (the substring) and "case_sensitive"
      # @return [Result] passed is true when the substring is found
      def call(target, config)
        needle = config["value"].to_s
        text = target.to_s
        # Matching ignores case unless case_sensitive is exactly true.
        exact = config["case_sensitive"] == true
        found = exact ? text.include?(needle) : text.downcase.include?(needle.downcase)

        verdict = found ? "contains" : "does not contain"
        Result.new(passed: found, detail: "#{verdict} #{needle.inspect}")
      end
    end
  end
end
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
module CompletionKit
  module Checks
    # Deterministic check that passes when the target text equals a
    # configured value.
    class Equals
      # @param target [#to_s] value under test; coerced with +to_s+
      # @param config [Hash] reads "value", "trim" and "case_sensitive"
      # @return [Result]
      def call(target, config)
        actual = target.to_s
        expected = config["value"].to_s

        # Surrounding whitespace is stripped from both sides only when trim is exactly true.
        if config["trim"] == true
          actual = actual.strip
          expected = expected.strip
        end

        # Comparison ignores case unless case_sensitive is exactly true.
        same = config["case_sensitive"] == true ? actual == expected : actual.casecmp?(expected)
        return Result.new(passed: true, detail: "equals #{expected.inspect}") if same

        Result.new(passed: false, detail: "#{actual.inspect} != #{expected.inspect}")
      end
    end
  end
end
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
module CompletionKit
  module Checks
    # Deterministic check that parses the target as JSON, walks a
    # dot-separated path of object keys and compares the value found there
    # with an expected value.
    class JsonPathEquals
      # Sentinel returned by #dig when the path cannot be followed.
      MISSING = Object.new

      # @param target [#to_s] JSON text under test
      # @param config [Hash] reads "json_path" and "expected"
      # @return [Result] fails when the text is not JSON, the path is absent,
      #   or the value differs from "expected"
      def call(target, config)
        document = JSON.parse(target.to_s)
        path = config["json_path"]
        expected = config["expected"]
        found = dig(document, path.to_s)

        return Result.new(passed: false, detail: "path #{path} not found") if found.equal?(MISSING)
        return Result.new(passed: true, detail: "#{path} == #{expected.inspect}") if found == expected

        Result.new(passed: false, detail: "#{path} was #{found.inspect}, expected #{expected.inspect}")
      rescue JSON::ParserError
        Result.new(passed: false, detail: "not valid JSON")
      end

      private

      # Follows each path segment through nested Hashes; any non-Hash node or
      # absent key yields MISSING.
      def dig(data, path)
        node = data
        path.split(".").each do |key|
          return MISSING unless node.is_a?(Hash) && node.key?(key)

          node = node[key]
        end
        node
      end
    end
  end
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
module CompletionKit
  module Checks
    # Deterministic check that passes when the character length of the
    # target text lies within optional "min" / "max" bounds (inclusive).
    class LengthBounds
      # @param target [#to_s] value under test; its +to_s+ length is measured
      # @param config [Hash] reads "min" and "max"; each may be absent
      # @return [Result]
      def call(target, config)
        length = target.to_s.length
        min = bound(config["min"])
        max = bound(config["max"])

        if min && length < min
          Result.new(passed: false, detail: "length #{length} below min #{min}")
        elsif max && length > max
          Result.new(passed: false, detail: "length #{length} above max #{max}")
        else
          Result.new(passed: true, detail: "length #{length} within bounds")
        end
      end

      private

      # Coerces a configured bound to an Integer, or nil when it is unset.
      #
      # A blank string counts as unset: "".to_i is 0, so a blank "max" would
      # otherwise fail every non-empty target with "above max 0".
      def bound(raw)
        return nil unless raw
        return nil if raw.to_s.strip.empty?

        raw.to_i
      end
    end
  end
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
module CompletionKit
  module Checks
    # Deterministic check that fails when the target text contains common
    # refusal boilerplate ("I'm sorry", "I can't help", "as an AI", ...).
    class NoRefusal
      # Case-insensitive refusal phrases. Each apostrophe position accepts
      # the ASCII apostrophe, the typographic one (U+2019), or none, so
      # "I’m sorry" is caught as well as "I'm sorry" and "Im sorry".
      # The "can't" and "cannot" variants share one pattern covering the
      # union of the verbs they previously listed separately.
      PATTERNS = [
        /\bi['\u2019]?m sorry\b/i,
        /\bi (?:cannot|can['\u2019]?t) (?:help|assist|fulfill|comply|do that|provide)/i,
        /\bi['\u2019]?m (?:unable|not able) to\b/i,
        /\bi (?:won['\u2019]?t|will not) (?:be able|help|assist)\b/i,
        /\bas an ai\b/i
      ].freeze

      # @param target [#to_s] text under test
      # @param _config [Hash, nil] unused; this check takes no options
      # @return [Result] passed is false when any refusal pattern matches
      def call(target, _config)
        text = utf8(target.to_s)
        if PATTERNS.any? { |pattern| pattern.match?(text) }
          Result.new(passed: false, detail: "refusal detected")
        else
          Result.new(passed: true, detail: "no refusal detected")
        end
      end

      private

      # The patterns contain a non-ASCII character, which makes them UTF-8
      # regexps; transcode other encodings (replacing unconvertible bytes)
      # so matching does not raise Encoding::CompatibilityError.
      def utf8(text)
        return text if text.encoding == Encoding::UTF_8

        text.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
      end
    end
  end
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
module CompletionKit
  module Checks
    # Deterministic check that passes when the target text does NOT include
    # a configured substring.
    class NotContains
      # @param target [#to_s] value under test; coerced with +to_s+
      # @param config [Hash] reads "value" (the substring) and "case_sensitive"
      # @return [Result] passed is true when the substring is absent
      def call(target, config)
        needle = config["value"].to_s
        text = target.to_s
        # Matching ignores case unless case_sensitive is exactly true.
        exact = config["case_sensitive"] == true
        found = exact ? text.include?(needle) : text.downcase.include?(needle.downcase)

        verdict = found ? "contains" : "does not contain"
        Result.new(passed: !found, detail: "#{verdict} #{needle.inspect}")
      end
    end
  end
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
module CompletionKit
  module Checks
    # Deterministic check that passes when the target text matches a
    # configured regular expression.
    class Regex
      # @param target [#to_s] text under test
      # @param config [Hash] reads "pattern", "case_sensitive" and "multiline"
      # @return [Result] an uncompilable pattern yields a failed result
      #   carrying the RegexpError message rather than raising
      def call(target, config)
        source = config["pattern"]
        # IGNORECASE is applied only when case_sensitive is exactly false,
        # MULTILINE only when multiline is exactly true.
        flags = [
          [Regexp::IGNORECASE, config["case_sensitive"] == false],
          [Regexp::MULTILINE, config["multiline"] == true]
        ].sum { |flag, enabled| enabled ? flag : 0 }
        matcher = Regexp.new(source.to_s, flags)

        return Result.new(passed: true, detail: "matched /#{source}/") if matcher.match?(target.to_s)

        Result.new(passed: false, detail: "no match for /#{source}/")
      rescue RegexpError => e
        Result.new(passed: false, detail: "invalid pattern: #{e.message}")
      end
    end
  end
end
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
module CompletionKit
  module Checks
    # Lookup table from a check kind name to the class implementing it, plus
    # the config keys each kind requires.
    module Registry
      # check_kind name => implementing class.
      CHECKS = {
        "contains" => Contains,
        "not_contains" => NotContains,
        "equals" => Equals,
        "regex" => Regex,
        "valid_json" => ValidJson,
        "json_path_equals" => JsonPathEquals,
        "length_bounds" => LengthBounds,
        "no_refusal" => NoRefusal
      }.freeze

      # check_kind name => config keys that must be supplied for that kind.
      REQUIRED_KEYS = {
        "contains" => %w[value],
        "not_contains" => %w[value],
        "equals" => %w[value],
        "regex" => %w[pattern],
        "valid_json" => [],
        "json_path_equals" => %w[json_path expected],
        "length_bounds" => [],
        "no_refusal" => []
      }.freeze

      # Every registered check_kind name.
      KINDS = CHECKS.keys.freeze

      class << self
        # @return [Array<String>] the registered kind names
        def kinds
          KINDS
        end

        # @return [Hash{String => Array<String>}] required config keys per kind
        def required_keys
          REQUIRED_KEYS
        end

        # Builds a fresh instance of the check registered under +kind+.
        #
        # @raise [KeyError] when +kind+ is not registered
        def fetch(kind)
          CHECKS.fetch(kind).new
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
module CompletionKit
  module Checks
    # Picks the piece of a response that a check should be run against.
    module TargetResolver
      # Accepted values for a check config's "target".
      TARGETS = %w[response_text input_data json_path].freeze
      # Sentinel returned when a json_path target cannot be resolved.
      UNRESOLVED = Object.new.freeze

      # @param response [#response_text, #input_data]
      # @param config [Hash] reads "target" and, for "json_path", "target_path"
      # @return [Object] the response's input_data, a value extracted from its
      #   JSON response_text, or (for any other target) the response_text
      def self.call(response, config)
        case config["target"]
        when "input_data"
          response.input_data
        when "json_path"
          resolve_json_path(response.response_text, config["target_path"].to_s)
        else
          response.response_text
        end
      end

      # Parses +text+ as JSON and follows the dot-separated +path+ through
      # nested objects.
      #
      # @return [String, Object] the located value as text, or UNRESOLVED when
      #   the text is not JSON or the path cannot be followed
      def self.resolve_json_path(text, path)
        parsed = JSON.parse(text.to_s)
        value = path.split(".").reduce(parsed) do |node, key|
          return UNRESOLVED unless node.is_a?(Hash) && node.key?(key)

          node[key]
        end
        stringify(value)
      rescue JSON::ParserError
        UNRESOLVED
      end

      # Renders a located value as text. Objects and arrays are re-serialised
      # as JSON: Hash#to_s / Array#to_s produce Ruby inspect syntax
      # (e.g. {"a"=>1}, [nil]), which downstream JSON checks cannot parse.
      # Scalars keep their plain +to_s+ form (nil becomes "").
      def self.stringify(value)
        case value
        when Hash, Array then JSON.generate(value)
        else value.to_s
        end
      end
      private_class_method :stringify
    end
  end
end
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
module CompletionKit
  module Checks
    # Deterministic check that passes when the target text parses as JSON.
    class ValidJson
      # @param target [#to_s] text under test
      # @param _config [Hash, nil] unused; this check takes no options
      # @return [Result]
      def call(target, _config)
        parseable =
          begin
            JSON.parse(target.to_s)
            true
          rescue JSON::ParserError
            false
          end

        Result.new(passed: parseable, detail: parseable ? "valid JSON" : "not valid JSON")
      end
    end
  end
end
|
|
@@ -50,6 +50,8 @@ module CompletionKit
|
|
|
50
50
|
run = CompletionKit::Run.find(args["run_id"])
|
|
51
51
|
response = run.responses.find(args["response_id"])
|
|
52
52
|
metric = CompletionKit::Metric.find(args["metric_id"])
|
|
53
|
+
return error_result("Checks have nothing to calibrate; agreements are only for llm_judge metrics.") if metric.check?
|
|
54
|
+
|
|
53
55
|
created_by = args["created_by"].presence || "mcp"
|
|
54
56
|
|
|
55
57
|
agreement = CompletionKit::Agreement.find_or_initialize_by(
|
|
@@ -53,6 +53,8 @@ module CompletionKit
|
|
|
53
53
|
|
|
54
54
|
def self.compare(args)
|
|
55
55
|
metric = CompletionKit::Metric.find(args["metric_id"])
|
|
56
|
+
return error_result("judges_compare is unavailable for check metrics.") if metric.check?
|
|
57
|
+
|
|
56
58
|
a = CompletionKit::MetricVersion.where(metric_id: metric.id).find(args["metric_version_a_id"])
|
|
57
59
|
b = CompletionKit::MetricVersion.where(metric_id: metric.id).find(args["metric_version_b_id"])
|
|
58
60
|
stats_a = CompletionKit::MetricAgreementStats.for(metric, metric_version: a)
|
|
@@ -3,6 +3,28 @@ module CompletionKit
|
|
|
3
3
|
module Metrics
|
|
4
4
|
extend Base
|
|
5
5
|
|
|
6
|
+
CHECK_CONFIG_SCHEMA = {
|
|
7
|
+
type: "object",
|
|
8
|
+
properties: {
|
|
9
|
+
check_kind: {type: "string", enum: CompletionKit::Checks::Registry.kinds},
|
|
10
|
+
target: {type: "string", enum: CompletionKit::Checks::TargetResolver::TARGETS},
|
|
11
|
+
target_path: {type: "string"},
|
|
12
|
+
value: {type: "string"},
|
|
13
|
+
pattern: {type: "string"},
|
|
14
|
+
json_path: {type: "string"},
|
|
15
|
+
expected: {},
|
|
16
|
+
min: {type: "integer"},
|
|
17
|
+
max: {type: "integer"},
|
|
18
|
+
case_sensitive: {type: "boolean"},
|
|
19
|
+
multiline: {type: "boolean"},
|
|
20
|
+
trim: {type: "boolean"}
|
|
21
|
+
}
|
|
22
|
+
}.freeze
|
|
23
|
+
|
|
24
|
+
CHECK_CONFIG_HINT = "For a deterministic check set metric_type:\"check\" and check_config. Per-kind required keys: " \
|
|
25
|
+
"value (contains/not_contains/equals), pattern (regex), json_path+expected (json_path_equals), " \
|
|
26
|
+
"min and/or max (length_bounds); valid_json and no_refusal take no extra keys. target_path is required when target is json_path."
|
|
27
|
+
|
|
6
28
|
TOOLS = {
|
|
7
29
|
"metrics_list" => {
|
|
8
30
|
description: "List all metrics",
|
|
@@ -15,12 +37,14 @@ module CompletionKit
|
|
|
15
37
|
handler: :get
|
|
16
38
|
},
|
|
17
39
|
"metrics_create" => {
|
|
18
|
-
description: "Create a metric with evaluation criteria",
|
|
40
|
+
description: "Create a metric with evaluation criteria. #{CHECK_CONFIG_HINT}",
|
|
19
41
|
inputSchema: {
|
|
20
42
|
type: "object",
|
|
21
43
|
properties: {
|
|
22
44
|
name: {type: "string"}, instruction: {type: "string"},
|
|
45
|
+
metric_type: {type: "string", enum: CompletionKit::Metric::METRIC_TYPES},
|
|
23
46
|
rubric_bands: {type: "array", items: {type: "object", properties: {stars: {type: "integer"}, description: {type: "string"}}}},
|
|
47
|
+
check_config: CHECK_CONFIG_SCHEMA,
|
|
24
48
|
tag_names: {type: "array", items: {type: "string"}}
|
|
25
49
|
},
|
|
26
50
|
required: ["name"]
|
|
@@ -28,12 +52,14 @@ module CompletionKit
|
|
|
28
52
|
handler: :create
|
|
29
53
|
},
|
|
30
54
|
"metrics_update" => {
|
|
31
|
-
description: "Update a metric",
|
|
55
|
+
description: "Update a metric. #{CHECK_CONFIG_HINT}",
|
|
32
56
|
inputSchema: {
|
|
33
57
|
type: "object",
|
|
34
58
|
properties: {
|
|
35
59
|
id: {type: "integer"}, name: {type: "string"}, instruction: {type: "string"},
|
|
60
|
+
metric_type: {type: "string", enum: CompletionKit::Metric::METRIC_TYPES},
|
|
36
61
|
rubric_bands: {type: "array", items: {type: "object", properties: {stars: {type: "integer"}, description: {type: "string"}}}},
|
|
62
|
+
check_config: CHECK_CONFIG_SCHEMA,
|
|
37
63
|
tag_names: {type: "array", items: {type: "string"}}
|
|
38
64
|
},
|
|
39
65
|
required: ["id"]
|
|
@@ -69,7 +95,7 @@ module CompletionKit
|
|
|
69
95
|
end
|
|
70
96
|
|
|
71
97
|
def self.create(args)
|
|
72
|
-
metric = Metric.new(args.slice("name", "instruction", "rubric_bands"))
|
|
98
|
+
metric = Metric.new(args.slice("name", "instruction", "rubric_bands", "metric_type", "check_config"))
|
|
73
99
|
metric.tag_names = args["tag_names"] if args.key?("tag_names")
|
|
74
100
|
if metric.save
|
|
75
101
|
text_result(metric.reload.as_json)
|
|
@@ -80,7 +106,7 @@ module CompletionKit
|
|
|
80
106
|
|
|
81
107
|
def self.update(args)
|
|
82
108
|
metric = Metric.find(args["id"])
|
|
83
|
-
if metric.update(args.except("id").slice("name", "instruction", "rubric_bands"))
|
|
109
|
+
if metric.update(args.except("id").slice("name", "instruction", "rubric_bands", "metric_type", "check_config"))
|
|
84
110
|
metric.update!(tag_names: args["tag_names"]) if args.key?("tag_names")
|
|
85
111
|
text_result(metric.reload.as_json)
|
|
86
112
|
else
|
|
@@ -95,6 +121,8 @@ module CompletionKit
|
|
|
95
121
|
|
|
96
122
|
def self.suggest_variants(args)
|
|
97
123
|
metric = Metric.find(args["metric_id"])
|
|
124
|
+
return error_result("Metric ##{metric.id} is a check; checks are exact and have no variants to suggest.") if metric.check?
|
|
125
|
+
|
|
98
126
|
generator = MetricVariantGenerator.new(metric, count: args["count"].to_i, model: args["model"])
|
|
99
127
|
variants = generator.call
|
|
100
128
|
return error_result("Variant generator returned no parseable variants. Try again or change the model.") if variants.empty?
|
|
@@ -17,6 +17,8 @@ module CompletionKit
|
|
|
17
17
|
end
|
|
18
18
|
|
|
19
19
|
def judge_examples_for(metric, exclude_response_id: nil, limit: DEFAULT_JUDGE_EXAMPLE_LIMIT)
|
|
20
|
+
return [] if metric.check?
|
|
21
|
+
|
|
20
22
|
current_version = MetricVersion.current.find_by(metric_id: metric.id)
|
|
21
23
|
return [] unless current_version
|
|
22
24
|
|
|
@@ -14,6 +14,7 @@ module CompletionKit
|
|
|
14
14
|
end
|
|
15
15
|
|
|
16
16
|
def call
|
|
17
|
+
return [] if @metric.check?
|
|
17
18
|
raise CompletionKit::ConfigurationError, "No judging model available; set CompletionKit.config.judge_model or add a provider with a judging model" if @model.blank?
|
|
18
19
|
|
|
19
20
|
client = LlmClient.for_model(@model, ApiConfig.for_model(@model))
|
|
@@ -24,7 +24,7 @@ module CompletionKit
|
|
|
24
24
|
},
|
|
25
25
|
metric: {
|
|
26
26
|
name: "Metric",
|
|
27
|
-
definition: "An evaluation dimension
|
|
27
|
+
definition: "An evaluation dimension. An LLM judge scores each response on a 1-5 rubric, or a deterministic check passes or fails it with no model call."
|
|
28
28
|
}
|
|
29
29
|
}.freeze
|
|
30
30
|
end
|
|
@@ -37,7 +37,11 @@ module CompletionKit
|
|
|
37
37
|
sections << "Expected: #{resp.expected_output.truncate(200)}"
|
|
38
38
|
end
|
|
39
39
|
resp.reviews.each do |review|
|
|
40
|
-
|
|
40
|
+
if review.check?
|
|
41
|
+
sections << " #{review.metric_name}: #{review.passed ? "PASS" : "FAIL"}"
|
|
42
|
+
else
|
|
43
|
+
sections << " #{review.metric_name}: #{review.ai_score}/5 — #{review.ai_feedback}"
|
|
44
|
+
end
|
|
41
45
|
end
|
|
42
46
|
sections << ""
|
|
43
47
|
end
|
|
@@ -45,10 +49,10 @@ module CompletionKit
|
|
|
45
49
|
avg = @run.avg_score
|
|
46
50
|
sections << "## Overall Score: #{avg}/5" if avg
|
|
47
51
|
|
|
48
|
-
|
|
49
|
-
if
|
|
52
|
+
rubric_avgs = @run.metric_averages.select { |m| m.key?(:avg) }
|
|
53
|
+
if rubric_avgs.any?
|
|
50
54
|
sections << "## Metric Averages"
|
|
51
|
-
|
|
55
|
+
rubric_avgs.each { |m| sections << " #{m[:name]}: #{m[:avg]}/5" }
|
|
52
56
|
sections << ""
|
|
53
57
|
end
|
|
54
58
|
|
|
@@ -86,7 +86,7 @@ module CompletionKit
|
|
|
86
86
|
def judge_score(response, new_text)
|
|
87
87
|
config = ApiConfig.for_model(@run.judge_model).merge(judge_model: @run.judge_model)
|
|
88
88
|
judge = JudgeService.new(config)
|
|
89
|
-
scores = @run.metrics.filter_map do |metric|
|
|
89
|
+
scores = @run.metrics.select(&:llm_judge?).filter_map do |metric|
|
|
90
90
|
judge.evaluate(
|
|
91
91
|
new_text, response.expected_output, @candidate,
|
|
92
92
|
criteria: metric.instruction.to_s,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
2
|
module StarterMetrics
|
|
3
|
-
Starter = Struct.new(:key, :name, :description, :catches, :instruction, :rubric_bands, keyword_init: true)
|
|
3
|
+
Starter = Struct.new(:key, :name, :description, :catches, :instruction, :rubric_bands, :metric_type, :check_config, keyword_init: true)
|
|
4
4
|
|
|
5
5
|
ALL = [
|
|
6
6
|
Starter.new(
|
|
@@ -72,6 +72,30 @@ module CompletionKit
|
|
|
72
72
|
{ "stars" => 2, "description" => "Noticeable filler or visible gaps." },
|
|
73
73
|
{ "stars" => 1, "description" => "Padded, repetitive, or so short it loses information." }
|
|
74
74
|
]
|
|
75
|
+
),
|
|
76
|
+
Starter.new(
|
|
77
|
+
key: "valid_json",
|
|
78
|
+
name: "Valid JSON",
|
|
79
|
+
description: "Does the output parse as JSON?",
|
|
80
|
+
catches: "Broken or partial JSON, prose wrapped around a structured response, trailing commas. A deterministic pass/fail with no LLM judgement.",
|
|
81
|
+
metric_type: "check",
|
|
82
|
+
check_config: { "check_kind" => "valid_json", "target" => "response_text" }
|
|
83
|
+
),
|
|
84
|
+
Starter.new(
|
|
85
|
+
key: "no_refusal",
|
|
86
|
+
name: "No refusal",
|
|
87
|
+
description: "Did the model answer instead of refusing?",
|
|
88
|
+
catches: "\"I'm sorry, I can't help with that\" and other refusal boilerplate when a real answer was expected. Deterministic, no judge call.",
|
|
89
|
+
metric_type: "check",
|
|
90
|
+
check_config: { "check_kind" => "no_refusal", "target" => "response_text" }
|
|
91
|
+
),
|
|
92
|
+
Starter.new(
|
|
93
|
+
key: "contains_token",
|
|
94
|
+
name: "Contains required token",
|
|
95
|
+
description: "Does the output contain a required substring?",
|
|
96
|
+
catches: "A required marker, citation, or keyword the output must always include. Set the value to the token you require.",
|
|
97
|
+
metric_type: "check",
|
|
98
|
+
check_config: { "check_kind" => "contains", "target" => "response_text", "value" => "REQUIRED" }
|
|
75
99
|
)
|
|
76
100
|
].freeze
|
|
77
101
|
|