completion-kit 0.17.1 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of completion-kit might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/app/controllers/completion_kit/agreements_controller.rb +5 -0
- data/app/controllers/completion_kit/api/v1/agreements_controller.rb +5 -0
- data/app/controllers/completion_kit/api/v1/metrics_controller.rb +9 -2
- data/app/controllers/completion_kit/api/v1/runs_controller.rb +1 -1
- data/app/controllers/completion_kit/metrics_controller.rb +97 -36
- data/app/controllers/completion_kit/runs_controller.rb +1 -1
- data/app/jobs/completion_kit/check_review_job.rb +66 -0
- data/app/jobs/completion_kit/generate_row_job.rb +5 -2
- data/app/jobs/completion_kit/metric_suggestion_job.rb +1 -0
- data/app/models/completion_kit/metric.rb +91 -5
- data/app/models/completion_kit/metric_version.rb +34 -7
- data/app/models/completion_kit/response.rb +18 -2
- data/app/models/completion_kit/review.rb +5 -1
- data/app/models/completion_kit/run.rb +70 -14
- data/app/services/completion_kit/checks/contains.rb +21 -0
- data/app/services/completion_kit/checks/equals.rb +26 -0
- data/app/services/completion_kit/checks/json_path_equals.rb +32 -0
- data/app/services/completion_kit/checks/length_bounds.rb +19 -0
- data/app/services/completion_kit/checks/no_refusal.rb +23 -0
- data/app/services/completion_kit/checks/not_contains.rb +21 -0
- data/app/services/completion_kit/checks/regex.rb +20 -0
- data/app/services/completion_kit/checks/registry.rb +41 -0
- data/app/services/completion_kit/checks/result.rb +5 -0
- data/app/services/completion_kit/checks/target_resolver.rb +31 -0
- data/app/services/completion_kit/checks/valid_json.rb +12 -0
- data/app/services/completion_kit/mcp_tools/agreements.rb +2 -0
- data/app/services/completion_kit/mcp_tools/judges.rb +2 -0
- data/app/services/completion_kit/mcp_tools/metrics.rb +32 -4
- data/app/services/completion_kit/metric_agreement_examples.rb +2 -0
- data/app/services/completion_kit/metric_improvement_validator.rb +2 -0
- data/app/services/completion_kit/metric_variant_generator.rb +1 -0
- data/app/services/completion_kit/onboarding/concepts.rb +1 -1
- data/app/services/completion_kit/prompt_improvement_service.rb +8 -4
- data/app/services/completion_kit/prompt_improvement_validator.rb +1 -1
- data/app/services/completion_kit/starter_metrics.rb +25 -1
- data/app/views/completion_kit/api_reference/_body.html.erb +4 -4
- data/app/views/completion_kit/metrics/_check_spec.html.erb +17 -0
- data/app/views/completion_kit/metrics/_form.html.erb +104 -1
- data/app/views/completion_kit/metrics/index.html.erb +4 -3
- data/app/views/completion_kit/metrics/show.html.erb +26 -14
- data/app/views/completion_kit/metrics/starter_preview.html.erb +8 -0
- data/app/views/completion_kit/responses/show.html.erb +1 -1
- data/db/migrate/20260629000001_add_check_type_to_completion_kit_metrics.rb +6 -0
- data/db/migrate/20260629000002_add_check_type_to_completion_kit_metric_versions.rb +6 -0
- data/db/migrate/20260629000003_add_passed_to_completion_kit_reviews.rb +5 -0
- data/lib/completion_kit/version.rb +1 -1
- metadata +17 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 84ae9459a26b612bb68a2f875a83274dc1bd711659b62d230aa7315fb3e7ce66
|
|
4
|
+
data.tar.gz: ebbe020a987228e1c1f5e2c0c1d6be4caa3c17cd7448141d69e331fc9a207eb3
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 95ee6ba7cd0db74ea2e27629f3a8d3b83de56b1171f3f26419c6cce72fcdf9cf01dafefb7942bcd06519db2b1b304e1e108156e7b22c9b3a54230cb2b620ac50
|
|
7
|
+
data.tar.gz: f9883c98aa3e6e4ec4cfaf2dbe7f9f9c3261959de217b20813efd53551ecbe69a41d871d3fc9179771413a5f214c9ffd337ea67775d9d558c8d2ea21dc2e336a
|
|
@@ -2,6 +2,7 @@ module CompletionKit
|
|
|
2
2
|
class AgreementsController < ApplicationController
|
|
3
3
|
before_action :ensure_agreement_enabled
|
|
4
4
|
before_action :set_scope
|
|
5
|
+
before_action :reject_check_metric, only: [:create]
|
|
5
6
|
|
|
6
7
|
def create
|
|
7
8
|
created_by = agreement_creator
|
|
@@ -60,6 +61,10 @@ module CompletionKit
|
|
|
60
61
|
head :not_found unless CompletionKit.config.judge_agreement_enabled
|
|
61
62
|
end
|
|
62
63
|
|
|
64
|
+
# before_action guard for :create. Responds 422 (unprocessable entity) with an
# empty body when the metric in scope is a check metric.
# NOTE(review): relies on @metric being assigned by an earlier callback
# (presumably set_scope) - confirm.
def reject_check_metric
  return unless @metric.check?

  head :unprocessable_entity
end
|
|
67
|
+
|
|
63
68
|
def set_scope
|
|
64
69
|
@run = Run.find(params[:run_id])
|
|
65
70
|
@response = @run.responses.find(params[:response_id])
|
|
@@ -4,6 +4,7 @@ module CompletionKit
|
|
|
4
4
|
class AgreementsController < BaseController
|
|
5
5
|
before_action :ensure_agreement_enabled
|
|
6
6
|
before_action :set_nested_scope, only: [:create]
|
|
7
|
+
before_action :reject_check_metric, only: [:create]
|
|
7
8
|
before_action :load_agreement, only: [:destroy]
|
|
8
9
|
|
|
9
10
|
def index
|
|
@@ -53,6 +54,10 @@ module CompletionKit
|
|
|
53
54
|
not_found
|
|
54
55
|
end
|
|
55
56
|
|
|
57
|
+
# before_action guard for :create. Renders a 422 error payload when the metric
# in scope is a check metric.
# NOTE(review): relies on @metric being assigned by an earlier callback
# (presumably set_nested_scope) - confirm.
def reject_check_metric
  return unless @metric.check?

  render_error("Checks have nothing to calibrate", status: :unprocessable_entity)
end
|
|
60
|
+
|
|
56
61
|
def load_agreement
|
|
57
62
|
@agreement = Agreement.find(params[:id])
|
|
58
63
|
rescue ActiveRecord::RecordNotFound
|
|
@@ -37,6 +37,11 @@ module CompletionKit
|
|
|
37
37
|
end
|
|
38
38
|
|
|
39
39
|
def suggest_variants
|
|
40
|
+
if @metric.check?
|
|
41
|
+
render_error("Checks are exact; no variants to suggest.", status: :unprocessable_entity)
|
|
42
|
+
return
|
|
43
|
+
end
|
|
44
|
+
|
|
40
45
|
disagreement_count = Agreement.where(metric_id: @metric.id, verdict: "disagree").count
|
|
41
46
|
if disagreement_count.zero?
|
|
42
47
|
render_error("Mark at least one case as Disagree before asking the model to suggest a change.", status: :unprocessable_entity)
|
|
@@ -63,8 +68,10 @@ module CompletionKit
|
|
|
63
68
|
end
|
|
64
69
|
|
|
65
70
|
def metric_params
|
|
66
|
-
params.permit(:name, :instruction,
|
|
67
|
-
rubric_bands: [:stars, :description],
|
|
71
|
+
params.permit(:name, :instruction, :metric_type,
|
|
72
|
+
rubric_bands: [:stars, :description],
|
|
73
|
+
check_config: %i[check_kind target target_path value pattern json_path expected min max case_sensitive multiline trim],
|
|
74
|
+
tag_names: [])
|
|
68
75
|
end
|
|
69
76
|
end
|
|
70
77
|
end
|
|
@@ -62,7 +62,7 @@ module CompletionKit
|
|
|
62
62
|
CompletionKit::Review.where(response_id: failed_response_ids, status: "failed").update_all(
|
|
63
63
|
status: "pending", attempts: 0,
|
|
64
64
|
error_provider: nil, error_class: nil, error_status: nil, error_message: nil,
|
|
65
|
-
ai_score: nil, ai_feedback: nil
|
|
65
|
+
ai_score: nil, passed: nil, ai_feedback: nil
|
|
66
66
|
)
|
|
67
67
|
scope.update_all(
|
|
68
68
|
status: "pending", attempts: 0,
|
|
@@ -24,7 +24,9 @@ module CompletionKit
|
|
|
24
24
|
metric = Metric.create!(
|
|
25
25
|
name: starter.name,
|
|
26
26
|
instruction: starter.instruction,
|
|
27
|
-
rubric_bands: starter.rubric_bands
|
|
27
|
+
rubric_bands: starter.rubric_bands,
|
|
28
|
+
metric_type: starter.metric_type || "llm_judge",
|
|
29
|
+
check_config: starter.check_config
|
|
28
30
|
)
|
|
29
31
|
redirect_to metric_path(metric), notice: "Added the \"#{starter.name}\" starter. Tweak any band before you run a judge against it."
|
|
30
32
|
end
|
|
@@ -39,9 +41,14 @@ module CompletionKit
|
|
|
39
41
|
def show
|
|
40
42
|
@edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
|
|
41
43
|
@suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
|
|
42
|
-
@improve_disagreement_count = Agreement.where(metric_id: @metric.id, verdict: "disagree").count
|
|
43
44
|
@versions = MetricVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
|
|
44
|
-
@
|
|
45
|
+
if @metric.check?
|
|
46
|
+
@improve_disagreement_count = 0
|
|
47
|
+
@guiding_examples = []
|
|
48
|
+
else
|
|
49
|
+
@improve_disagreement_count = Agreement.where(metric_id: @metric.id, verdict: "disagree").count
|
|
50
|
+
@guiding_examples = CompletionKit.config.judge_examples_from_reviews ? MetricAgreementExamples.judge_examples_for(@metric) : []
|
|
51
|
+
end
|
|
45
52
|
end
|
|
46
53
|
|
|
47
54
|
def new
|
|
@@ -52,7 +59,7 @@ module CompletionKit
|
|
|
52
59
|
@suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
|
|
53
60
|
@edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
|
|
54
61
|
@published_metric_version = MetricVersion.published.where(metric_id: @metric.id, current: true).first
|
|
55
|
-
@improve_disagreement_count = Agreement.where(metric_id: @metric.id, verdict: "disagree").count
|
|
62
|
+
@improve_disagreement_count = @metric.check? ? 0 : Agreement.where(metric_id: @metric.id, verdict: "disagree").count
|
|
56
63
|
|
|
57
64
|
if @edit_draft
|
|
58
65
|
@metric.instruction = @edit_draft.instruction
|
|
@@ -71,42 +78,16 @@ module CompletionKit
|
|
|
71
78
|
end
|
|
72
79
|
|
|
73
80
|
def update
|
|
74
|
-
|
|
75
|
-
meta_attrs = metric_params.except(*judge_keys)
|
|
76
|
-
proposed_instruction = metric_params[:instruction]
|
|
77
|
-
proposed_rubric = metric_params[:rubric_bands]
|
|
81
|
+
meta_attrs = metric_params.except(:instruction, :rubric_bands, :check_config)
|
|
78
82
|
|
|
79
83
|
unless @metric.update(meta_attrs)
|
|
80
84
|
return render(:edit, status: :unprocessable_entity)
|
|
81
85
|
end
|
|
82
86
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
normalized_proposed_rubric = normalize_rubric_bands_for_update(proposed_rubric)
|
|
86
|
-
|
|
87
|
-
instruction_changed = !proposed_instruction.nil? && proposed_instruction.to_s != current_instruction
|
|
88
|
-
rubric_changed = !normalized_proposed_rubric.nil? && normalized_proposed_rubric != current_rubric
|
|
89
|
-
|
|
90
|
-
unless instruction_changed || rubric_changed
|
|
91
|
-
return redirect_to(metric_path(@metric), notice: "Metric was successfully updated.")
|
|
92
|
-
end
|
|
93
|
-
|
|
94
|
-
new_instruction = instruction_changed ? proposed_instruction.to_s : current_instruction
|
|
95
|
-
new_rubric = rubric_changed ? normalized_proposed_rubric : current_rubric
|
|
96
|
-
|
|
97
|
-
if @metric.reviews.exists?
|
|
98
|
-
MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").destroy_all
|
|
99
|
-
draft = MetricVersion.create!(
|
|
100
|
-
metric: @metric, instruction: new_instruction, rubric_bands: new_rubric,
|
|
101
|
-
state: "draft", source: "edit", current: false
|
|
102
|
-
)
|
|
103
|
-
redirect_to edit_metric_path(@metric),
|
|
104
|
-
notice: "Saved as draft #{draft.version_label}. Publish to make these changes the metric's live version."
|
|
87
|
+
if @metric.check?
|
|
88
|
+
update_check_definition
|
|
105
89
|
else
|
|
106
|
-
|
|
107
|
-
current_pub = MetricVersion.published.where(metric_id: @metric.id, current: true).first
|
|
108
|
-
current_pub&.update!(instruction: @metric.instruction, rubric_bands: @metric.rubric_bands)
|
|
109
|
-
redirect_to metric_path(@metric), notice: "Metric was successfully updated."
|
|
90
|
+
update_judge_definition
|
|
110
91
|
end
|
|
111
92
|
end
|
|
112
93
|
|
|
@@ -116,6 +97,11 @@ module CompletionKit
|
|
|
116
97
|
end
|
|
117
98
|
|
|
118
99
|
def suggest_variants
|
|
100
|
+
if @metric.check?
|
|
101
|
+
redirect_to metric_path(@metric), alert: "Checks are exact, so there is nothing to suggest."
|
|
102
|
+
return
|
|
103
|
+
end
|
|
104
|
+
|
|
119
105
|
target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
|
|
120
106
|
counts = Agreement.where(metric_id: @metric.id, verdict: %w[agree disagree]).group(:verdict).count
|
|
121
107
|
if counts["disagree"].to_i.zero?
|
|
@@ -188,13 +174,88 @@ module CompletionKit
|
|
|
188
174
|
head :not_found unless CompletionKit.config.judge_examples_from_reviews
|
|
189
175
|
end
|
|
190
176
|
|
|
177
|
+
# Applies a submitted instruction and/or rubric to an LLM-judge metric.
#
# * Nothing changed            -> redirect to the metric page.
# * Metric already has reviews -> replace any existing "edit" draft with a new
#   draft MetricVersion and redirect back to the edit page.
# * No reviews yet             -> update the metric in place, mirror the values
#   onto the current published MetricVersion (if one exists) and redirect to
#   the metric page.
def update_judge_definition
  submitted_instruction = metric_params[:instruction]
  submitted_rubric = normalize_rubric_bands_for_update(metric_params[:rubric_bands])
  live_instruction = @metric.instruction.to_s
  live_rubric = @metric.rubric_bands || []

  # A nil submission means "field not sent", which never counts as a change.
  instruction_differs = !submitted_instruction.nil? && submitted_instruction.to_s != live_instruction
  rubric_differs = !submitted_rubric.nil? && submitted_rubric != live_rubric

  if !instruction_differs && !rubric_differs
    redirect_to(metric_path(@metric), notice: "Metric was successfully updated.")
    return
  end

  next_instruction = instruction_differs ? submitted_instruction.to_s : live_instruction
  next_rubric = rubric_differs ? submitted_rubric : live_rubric

  unless @metric.reviews.exists?
    @metric.update!(instruction: next_instruction, rubric_bands: next_rubric)
    published = MetricVersion.published.where(metric_id: @metric.id, current: true).first
    published&.update!(instruction: @metric.instruction, rubric_bands: @metric.rubric_bands)
    redirect_to metric_path(@metric), notice: "Metric was successfully updated."
    return
  end

  # Only one pending "edit" draft is kept per metric.
  MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").destroy_all
  draft = MetricVersion.create!(
    metric: @metric,
    instruction: next_instruction,
    rubric_bands: next_rubric,
    state: "draft",
    source: "edit",
    current: false
  )
  redirect_to edit_metric_path(@metric),
    notice: "Saved as draft #{draft.version_label}. Publish to make these changes the metric's live version."
end
|
|
209
|
+
|
|
210
|
+
# Applies a submitted check configuration to a check metric.
#
# metric_params already returns check_config normalized (integer bounds, cast
# booleans, JSON-decoded "expected"), so the value is only unwrapped into a
# plain Hash here. Normalizing it a second time would JSON-decode "expected"
# twice: a quoted string such as "\"5\"" would silently become the number 5,
# and an already-decoded number or boolean would make JSON.parse raise
# TypeError.
#
# * No config sent, or unchanged -> redirect to the metric page.
# * Metric already has reviews   -> replace any existing "edit" draft with a
#   new draft MetricVersion and redirect back to the edit page.
# * No reviews yet               -> update the metric in place, mirror the
#   config onto the current published MetricVersion (if one exists) and
#   redirect to the metric page.
def update_check_definition
  raw = metric_params[:check_config]
  # Reading the key back from the params object may hand us a parameters
  # wrapper rather than a Hash; unwrap it so it compares equal to the stored
  # (plain Hash) configuration.
  proposed = raw.respond_to?(:to_unsafe_h) ? raw.to_unsafe_h.to_hash : raw

  if proposed.nil? || proposed == @metric.check_config
    return redirect_to(metric_path(@metric), notice: "Metric was successfully updated.")
  end

  if @metric.reviews.exists?
    # Only one pending "edit" draft is kept per metric.
    MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").destroy_all
    draft = MetricVersion.create!(
      metric: @metric, metric_type: "check", check_config: proposed,
      state: "draft", source: "edit", current: false
    )
    redirect_to edit_metric_path(@metric),
      notice: "Saved as draft #{draft.version_label}. Publish to make these changes the metric's live version."
  else
    @metric.update!(check_config: proposed)
    current_pub = MetricVersion.published.where(metric_id: @metric.id, current: true).first
    current_pub&.update!(metric_type: "check", check_config: proposed)
    redirect_to metric_path(@metric), notice: "Metric was successfully updated."
  end
end
|
|
233
|
+
|
|
191
234
|
# Loads the metric identified by params[:id] into @metric.
def set_metric
  requested_id = params[:id]
  @metric = Metric.find(requested_id)
end
|
|
194
237
|
|
|
195
238
|
def metric_params
|
|
196
|
-
params.require(:metric).permit(:name, :instruction,
|
|
197
|
-
rubric_bands: [:stars, :description],
|
|
239
|
+
permitted = params.require(:metric).permit(:name, :instruction, :metric_type,
|
|
240
|
+
rubric_bands: [:stars, :description],
|
|
241
|
+
check_config: %i[check_kind target target_path value pattern json_path expected min max case_sensitive multiline trim],
|
|
242
|
+
tag_names: [])
|
|
243
|
+
permitted[:check_config] = normalize_check_config(permitted[:check_config]) if permitted.key?(:check_config)
|
|
244
|
+
permitted
|
|
245
|
+
end
|
|
246
|
+
|
|
247
|
+
# Turns a submitted check_config parameters object into a string-keyed hash
# with typed values:
#   * "min" / "max" are converted with to_i when present,
#   * "case_sensitive" / "multiline" / "trim" are cast to booleans when the key
#     was sent,
#   * "expected" is passed through coerce_scalar when present,
#   * entries whose value is nil or "" are dropped.
#
# @param config [#to_unsafe_h] the permitted check_config parameters
# @return [Hash] normalized configuration
def normalize_check_config(config)
  normalized = config.to_unsafe_h.stringify_keys
  boolean_type = ActiveModel::Type::Boolean.new

  %w[min max].each do |bound|
    next unless normalized[bound].present?

    normalized[bound] = normalized[bound].to_i
  end

  %w[case_sensitive multiline trim].each do |flag|
    next unless normalized.key?(flag)

    normalized[flag] = boolean_type.cast(normalized[flag])
  end

  expected = normalized["expected"]
  normalized["expected"] = coerce_scalar(expected) if expected.present?

  normalized.reject { |_key, value| value.nil? || value == "" }
end
|
|
254
|
+
|
|
255
|
+
# Interprets a submitted string as JSON when possible, so "5" becomes 5,
# "true" becomes true and "\"hi\"" becomes "hi". Text that is not valid JSON is
# returned unchanged.
#
# Values that are not strings (for example a number or boolean that has already
# been decoded) are returned as-is: JSON.parse raises TypeError for them, which
# the JSON::ParserError rescue below would not catch.
#
# @param value [Object] raw value from the form or API payload
# @return [Object] the decoded JSON value, or +value+ itself
def coerce_scalar(value)
  return value unless value.is_a?(String)

  JSON.parse(value)
rescue JSON::ParserError
  value
end
|
|
199
260
|
|
|
200
261
|
def normalize_rubric_bands_for_update(bands)
|
|
@@ -164,7 +164,7 @@ module CompletionKit
|
|
|
164
164
|
status: "pending",
|
|
165
165
|
attempts: 0,
|
|
166
166
|
error_provider: nil, error_class: nil, error_status: nil, error_message: nil,
|
|
167
|
-
ai_score: nil, ai_feedback: nil
|
|
167
|
+
ai_score: nil, passed: nil, ai_feedback: nil
|
|
168
168
|
)
|
|
169
169
|
scope.update_all(
|
|
170
170
|
status: "pending",
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
module CompletionKit
  # Evaluates one check metric against one response and stores the outcome on
  # the Review row for that response/metric pair. No judge model is involved:
  # the verdict comes from the check implementation looked up in
  # Checks::Registry.
  class CheckReviewJob < ApplicationJob
    queue_as :default

    # Any StandardError raised while performing is reported, written to the
    # review as a terminal failure, and followed by a run completion check.
    rescue_from(StandardError) do |error|
      Rails.error.report(error, handled: true, context: { job: self.class.name, run_id: @run_id, response_id: @response_id, metric_id: @metric_id })
      record_terminal_failure!(error)
      enqueue_completion_check
    end

    # @param response_id [Integer] id of the Response to check
    # @param metric_id [Integer] id of the check Metric to apply
    # @param run_id [Integer, nil] only used as context in the error report
    def perform(response_id, metric_id, run_id = nil)
      # Kept in ivars so the rescue_from handler can identify the review.
      @response_id = response_id
      @metric_id = metric_id
      @run_id = run_id

      response = Response.find(response_id)
      metric = Metric.find(metric_id)
      result = evaluate(response, metric.check_config || {})

      review = response.reviews.find_or_initialize_by(metric_id: metric.id)
      current_metric_version = MetricVersion.ensure_current_for(metric)
      review.assign_attributes(
        metric_name: metric.name,
        metric_version_id: current_metric_version.id,
        status: "succeeded",
        passed: result.passed,
        ai_score: nil,
        ai_feedback: result.detail,
        error_provider: nil, error_class: nil, error_status: nil, error_message: nil
      )
      review.save!

      enqueue_completion_check
    end

    private

    # Resolves the value the check targets, then runs the configured check on
    # it. An unresolvable target counts as a failed check rather than an error.
    #
    # @return [#passed, #detail] the check outcome
    def evaluate(response, config)
      target_value = Checks::TargetResolver.call(response, config)
      if target_value.equal?(Checks::TargetResolver::UNRESOLVED)
        return Checks::Result.new(passed: false, detail: "could not resolve target")
      end

      Checks::Registry.fetch(config["check_kind"]).call(target_value, config)
    end

    # Marks the review as failed and stores the error class and message. Does
    # nothing when the response no longer exists.
    def record_terminal_failure!(error)
      response = Response.find_by(id: @response_id)
      return unless response

      review = response.reviews.find_or_initialize_by(metric_id: @metric_id)
      review.assign_attributes(
        metric_name: review.metric_name || Metric.find_by(id: @metric_id)&.name || "(deleted metric)",
        status: "failed",
        # Drop any verdict left over from an earlier successful evaluation so a
        # failed review is never counted as a passed or failed check.
        ai_score: nil, passed: nil, ai_feedback: nil,
        error_class: error.class.name,
        error_message: error.message.to_s.truncate(2000)
      )
      # Validations are skipped so the failure is always persisted.
      review.save!(validate: false)
    end

    # Queues RunCompletionCheckJob for the response's run, if the response
    # still exists.
    def enqueue_completion_check
      response = Response.find_by(id: @response_id)
      RunCompletionCheckJob.perform_later(response.run_id) if response
    end
  end
end
|
|
@@ -61,11 +61,14 @@ module CompletionKit
|
|
|
61
61
|
error_provider: nil, error_class: nil, error_status: nil, error_message: nil
|
|
62
62
|
)
|
|
63
63
|
|
|
64
|
-
if run.
|
|
65
|
-
run.
|
|
64
|
+
if run.llm_judge_configured?
|
|
65
|
+
run.llm_metrics.each do |metric|
|
|
66
66
|
JudgeReviewJob.perform_later(response.id, metric.id, run.id)
|
|
67
67
|
end
|
|
68
68
|
end
|
|
69
|
+
run.check_metrics.each do |metric|
|
|
70
|
+
CheckReviewJob.perform_later(response.id, metric.id, run.id)
|
|
71
|
+
end
|
|
69
72
|
|
|
70
73
|
enqueue_completion_check
|
|
71
74
|
end
|
|
@@ -16,14 +16,20 @@ module CompletionKit
|
|
|
16
16
|
has_many :reviews, dependent: :nullify
|
|
17
17
|
has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
|
|
18
18
|
|
|
19
|
+
METRIC_TYPES = %w[llm_judge check].freeze
|
|
20
|
+
|
|
19
21
|
serialize :rubric_bands, coder: JSON
|
|
22
|
+
serialize :check_config, coder: JSON
|
|
20
23
|
|
|
21
24
|
validates :name, presence: true
|
|
22
25
|
validates :key, tenant_scoped_uniqueness: { allow_nil: true }
|
|
26
|
+
validates :metric_type, inclusion: { in: METRIC_TYPES }
|
|
27
|
+
validate :validate_check_config, if: :check?
|
|
28
|
+
validate :metric_type_immutable_once_in_use, on: :update
|
|
23
29
|
|
|
24
30
|
before_validation :generate_key
|
|
25
|
-
before_validation :normalize_rubric_bands
|
|
26
|
-
before_validation :set_defaults
|
|
31
|
+
before_validation :normalize_rubric_bands, if: :llm_judge?
|
|
32
|
+
before_validation :set_defaults, if: :llm_judge?
|
|
27
33
|
|
|
28
34
|
def self.default_rubric_bands
|
|
29
35
|
DEFAULT_RUBRIC_BANDS.map(&:dup)
|
|
@@ -74,13 +80,29 @@ module CompletionKit
|
|
|
74
80
|
self.class.rubric_text_for(rubric_bands_for_form)
|
|
75
81
|
end
|
|
76
82
|
|
|
83
|
+
# True when this metric is a check metric, i.e. metric_type is "check".
def check?
  "check" == metric_type
end
|
|
86
|
+
|
|
87
|
+
# True for every metric that is not a check metric.
def llm_judge?
  check? ? false : true
end
|
|
90
|
+
|
|
91
|
+
# True once the metric is referenced by a RunMetric row, a review, or a metric
# version. The lookups run in that order and stop at the first hit.
def in_use?
  return true if RunMetric.exists?(metric_id: id)
  return true if reviews.exists?

  metric_versions.exists?
end
|
|
94
|
+
|
|
77
95
|
def as_json(options = {})
|
|
78
|
-
{
|
|
79
|
-
id: id, name: name, key: key,
|
|
80
|
-
rubric_bands: rubric_bands,
|
|
96
|
+
base = {
|
|
97
|
+
id: id, name: name, key: key, metric_type: metric_type,
|
|
81
98
|
created_at: created_at, updated_at: updated_at,
|
|
82
99
|
tags: tags.as_json
|
|
83
100
|
}
|
|
101
|
+
if check?
|
|
102
|
+
base.merge(check_config: check_config)
|
|
103
|
+
else
|
|
104
|
+
base.merge(instruction: instruction, rubric_bands: rubric_bands)
|
|
105
|
+
end
|
|
84
106
|
end
|
|
85
107
|
|
|
86
108
|
private
|
|
@@ -89,6 +111,70 @@ module CompletionKit
|
|
|
89
111
|
self.key ||= name.parameterize if name.present?
|
|
90
112
|
end
|
|
91
113
|
|
|
114
|
+
# Update-time validation: adds an error when metric_type is being changed on a
# metric that is already in use (see #in_use?).
def metric_type_immutable_once_in_use
  if metric_type_changed? && in_use?
    errors.add(:metric_type, "cannot change once the metric has been used in a run")
  end
end
|
|
120
|
+
|
|
121
|
+
# Validates check_config. The config must be a Hash whose "check_kind" is one
# of the registered kinds; only then are the target, required-key and
# kind-specific validations run.
def validate_check_config
  settings = check_config
  unless settings.is_a?(Hash)
    errors.add(:check_config, "must be a configuration object")
    return
  end

  check_kind = settings["check_kind"]
  known_kinds = CompletionKit::Checks::Registry.kinds

  if known_kinds.include?(check_kind)
    validate_check_target(settings)
    validate_check_required_keys(settings, check_kind)
    validate_check_kind_rules(settings, check_kind)
  else
    errors.add(:check_config, "check_kind must be one of #{known_kinds.join(", ")}")
  end
end
|
|
138
|
+
|
|
139
|
+
# Validates the "target" entry of a check config. A blank target is treated as
# "response_text". A "json_path" target additionally needs a non-blank
# "target_path".
#
# @param config [Hash] string-keyed check configuration
def validate_check_target(config)
  allowed_targets = CompletionKit::Checks::TargetResolver::TARGETS
  chosen = config["target"].presence || "response_text"

  unless allowed_targets.include?(chosen)
    errors.add(:check_config, "target must be one of #{allowed_targets.join(", ")}")
  end

  return unless chosen == "json_path"
  return unless config["target_path"].to_s.strip.empty?

  errors.add(:check_config, "target_path is required when target is json_path")
end
|
|
148
|
+
|
|
149
|
+
# Adds an error for every key the given check kind requires but the config
# lacks. "expected" only has to be present as a key (its value is not
# inspected); every other required key must have a non-blank value.
#
# @param config [Hash] string-keyed check configuration
# @param kind [String] a registered check kind
def validate_check_required_keys(config, kind)
  needed = CompletionKit::Checks::Registry.required_keys.fetch(kind)

  needed.each do |name|
    if name == "expected"
      next if config.key?("expected")

      errors.add(:check_config, "expected is required")
    else
      next unless config[name].to_s.strip.empty?

      errors.add(:check_config, "#{name} is required")
    end
  end
end
|
|
158
|
+
|
|
159
|
+
# Kind-specific validation rules:
#   * "regex"         - the pattern must compile as a Regexp.
#   * "length_bounds" - at least one of min / max must be given, and min must
#                       not exceed max when both are given.
# Other kinds have no extra rules here.
#
# @param config [Hash] string-keyed check configuration
# @param kind [String] a registered check kind
def validate_check_kind_rules(config, kind)
  if kind == "regex"
    begin
      Regexp.new(config["pattern"].to_s)
    rescue RegexpError
      errors.add(:check_config, "pattern is not a valid regular expression")
    end
    return
  end

  return unless kind == "length_bounds"

  lower = config["min"]
  upper = config["max"]

  if lower.nil? && upper.nil?
    errors.add(:check_config, "length_bounds requires at least one of min or max")
    return
  end

  return unless lower && upper

  errors.add(:check_config, "min must be less than or equal to max") if lower.to_i > upper.to_i
end
|
|
177
|
+
|
|
92
178
|
# Fills in the class-level default rubric bands when none are set.
def set_defaults
  return unless rubric_bands.blank?

  self.rubric_bands = self.class.default_rubric_bands
end
|
|
@@ -6,6 +6,7 @@ module CompletionKit
|
|
|
6
6
|
has_many :agreements, dependent: :destroy
|
|
7
7
|
|
|
8
8
|
serialize :rubric_bands, coder: JSON
|
|
9
|
+
serialize :check_config, coder: JSON
|
|
9
10
|
serialize :validation_summary, coder: JSON
|
|
10
11
|
|
|
11
12
|
before_validation :assign_version_number, on: :create
|
|
@@ -23,12 +24,22 @@ module CompletionKit
|
|
|
23
24
|
metric: metric,
|
|
24
25
|
instruction: metric.instruction,
|
|
25
26
|
rubric_bands: metric.rubric_bands,
|
|
27
|
+
metric_type: metric.metric_type,
|
|
28
|
+
check_config: metric.check_config,
|
|
26
29
|
current: true,
|
|
27
30
|
state: "published",
|
|
28
31
|
published_at: Time.current
|
|
29
32
|
)
|
|
30
33
|
end
|
|
31
34
|
|
|
35
|
+
# True when this version belongs to a check metric, i.e. metric_type is "check".
def check?
  "check" == metric_type
end
|
|
38
|
+
|
|
39
|
+
# True for every version that is not a check version.
def llm_judge?
  check? ? false : true
end
|
|
42
|
+
|
|
32
43
|
# True while this version's state is "draft".
def draft?
  "draft" == state
end
|
|
@@ -43,6 +54,7 @@ module CompletionKit
|
|
|
43
54
|
|
|
44
55
|
def change_summary_against(previous)
|
|
45
56
|
return nil if previous.nil?
|
|
57
|
+
return check_change_summary_against(previous) if check?
|
|
46
58
|
|
|
47
59
|
instruction_changed = previous.instruction.to_s.strip != instruction.to_s.strip
|
|
48
60
|
rubric_changes = rubric_band_change_count(previous)
|
|
@@ -75,31 +87,46 @@ module CompletionKit
|
|
|
75
87
|
self.class.where(metric_id: metric_id).where.not(id: id).update_all(current: false)
|
|
76
88
|
reload
|
|
77
89
|
update!(state: "published", current: true, published_at: published_at || Time.current)
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
90
|
+
if check?
|
|
91
|
+
metric.update_columns(metric_type: "check", check_config: check_config)
|
|
92
|
+
else
|
|
93
|
+
metric.update_columns(
|
|
94
|
+
metric_type: "llm_judge",
|
|
95
|
+
instruction: instruction,
|
|
96
|
+
rubric_bands: Array(rubric_bands).to_json
|
|
97
|
+
)
|
|
98
|
+
end
|
|
82
99
|
end
|
|
83
100
|
self
|
|
84
101
|
end
|
|
85
102
|
|
|
86
103
|
def as_json(options = {})
|
|
87
|
-
{
|
|
104
|
+
base = {
|
|
88
105
|
id: id,
|
|
89
106
|
metric_id: metric_id,
|
|
90
107
|
version_number: version_number,
|
|
91
|
-
|
|
92
|
-
rubric_bands: rubric_bands,
|
|
108
|
+
metric_type: metric_type,
|
|
93
109
|
current: current,
|
|
94
110
|
state: state,
|
|
95
111
|
source: source,
|
|
96
112
|
published_at: published_at,
|
|
97
113
|
created_at: created_at
|
|
98
114
|
}
|
|
115
|
+
if check?
|
|
116
|
+
base.merge(check_config: check_config)
|
|
117
|
+
else
|
|
118
|
+
base.merge(instruction: instruction, rubric_bands: rubric_bands)
|
|
119
|
+
end
|
|
99
120
|
end
|
|
100
121
|
|
|
101
122
|
private
|
|
102
123
|
|
|
124
|
+
# Change summary for check versions: nil when the configuration is identical
# to the previous version's, otherwise a fixed "minor" summary.
#
# @param previous [#check_config] the version to compare against
# @return [Hash, nil]
def check_change_summary_against(previous)
  unchanged = check_config == previous.check_config
  unchanged ? nil : { magnitude: :minor, label: "Check configuration changes" }
end
|
|
129
|
+
|
|
103
130
|
def rubric_band_change_count(previous)
|
|
104
131
|
prev = Metric.normalize_rubric_bands(previous.rubric_bands)
|
|
105
132
|
curr = Metric.normalize_rubric_bands(rubric_bands)
|
|
@@ -8,7 +8,7 @@ module CompletionKit
|
|
|
8
8
|
|
|
9
9
|
delegate :prompt, to: :run
|
|
10
10
|
|
|
11
|
-
validates :response_text, presence: true, if: :
|
|
11
|
+
validates :response_text, presence: true, if: :requires_response_text?
|
|
12
12
|
|
|
13
13
|
before_validation :set_default_status, on: :create
|
|
14
14
|
|
|
@@ -34,7 +34,19 @@ module CompletionKit
|
|
|
34
34
|
end
|
|
35
35
|
|
|
36
36
|
def reviewed?
|
|
37
|
-
reviews.any? { |r| r.ai_score.present? }
|
|
37
|
+
reviews.any? { |r| r.ai_score.present? || !r.passed.nil? }
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
# Number of reviews that carry a check verdict (passed is true or false).
def checks_total
  reviews.reject { |review| review.passed.nil? }.size
end
|
|
43
|
+
|
|
44
|
+
# Number of reviews whose check verdict is a pass.
def checks_passed
  reviews.select { |review| review.passed == true }.size
end
|
|
47
|
+
|
|
48
|
+
# Number of reviews whose check verdict is a fail.
def checks_failed
  reviews.select { |review| review.passed == false }.size
end
|
|
39
51
|
|
|
40
52
|
def fully_reviewed?
|
|
@@ -46,6 +58,10 @@ module CompletionKit
|
|
|
46
58
|
|
|
47
59
|
private
|
|
48
60
|
|
|
61
|
+
# Condition for the response_text presence validation: the response must have
# succeeded, and its run (when present) must not report
# judge_only_input_data_checks?.
def requires_response_text?
  finished = succeeded?
  return finished unless finished

  !run&.judge_only_input_data_checks?
end
|
|
64
|
+
|
|
49
65
|
# Asks the owning run to broadcast an update for this response.
def broadcast_row_update
  owning_run = run
  owning_run.broadcast_response_update(self)
end
|
|
@@ -16,6 +16,10 @@ module CompletionKit
|
|
|
16
16
|
after_save_commit :broadcast_parent_row_update, unless: :destroyed?
|
|
17
17
|
after_save_commit :broadcast_run_progress, if: :should_broadcast_progress?
|
|
18
18
|
|
|
19
|
+
# True when the metric version this review was produced against is a check
# version. A review without a metric version is never a check.
def check?
  version = metric_version
  return false if version.nil?

  version.metric_type == "check"
end
|
|
22
|
+
|
|
19
23
|
def stale_against_current_judge?
|
|
20
24
|
return false unless metric_id && metric_version_id
|
|
21
25
|
current_id = MetricVersion.current.where(metric_id: metric_id).limit(1).pick(:id)
|
|
@@ -27,7 +31,7 @@ module CompletionKit
|
|
|
27
31
|
{
|
|
28
32
|
id: id, response_id: response_id, metric_id: metric_id,
|
|
29
33
|
metric_version_id: metric_version_id,
|
|
30
|
-
metric_name: metric_name, ai_score: ai_score,
|
|
34
|
+
metric_name: metric_name, ai_score: ai_score, passed: passed,
|
|
31
35
|
ai_feedback: ai_feedback, status: status, attempts: attempts,
|
|
32
36
|
error: error_payload
|
|
33
37
|
}
|