completion-kit 0.17.1 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of completion-kit might be problematic. Click here for more details.

Files changed (48)
  1. checksums.yaml +4 -4
  2. data/app/controllers/completion_kit/agreements_controller.rb +5 -0
  3. data/app/controllers/completion_kit/api/v1/agreements_controller.rb +5 -0
  4. data/app/controllers/completion_kit/api/v1/metrics_controller.rb +9 -2
  5. data/app/controllers/completion_kit/api/v1/runs_controller.rb +1 -1
  6. data/app/controllers/completion_kit/metrics_controller.rb +97 -36
  7. data/app/controllers/completion_kit/runs_controller.rb +1 -1
  8. data/app/jobs/completion_kit/check_review_job.rb +66 -0
  9. data/app/jobs/completion_kit/generate_row_job.rb +5 -2
  10. data/app/jobs/completion_kit/metric_suggestion_job.rb +1 -0
  11. data/app/models/completion_kit/metric.rb +91 -5
  12. data/app/models/completion_kit/metric_version.rb +34 -7
  13. data/app/models/completion_kit/response.rb +18 -2
  14. data/app/models/completion_kit/review.rb +5 -1
  15. data/app/models/completion_kit/run.rb +70 -14
  16. data/app/services/completion_kit/checks/contains.rb +21 -0
  17. data/app/services/completion_kit/checks/equals.rb +26 -0
  18. data/app/services/completion_kit/checks/json_path_equals.rb +32 -0
  19. data/app/services/completion_kit/checks/length_bounds.rb +19 -0
  20. data/app/services/completion_kit/checks/no_refusal.rb +23 -0
  21. data/app/services/completion_kit/checks/not_contains.rb +21 -0
  22. data/app/services/completion_kit/checks/regex.rb +20 -0
  23. data/app/services/completion_kit/checks/registry.rb +41 -0
  24. data/app/services/completion_kit/checks/result.rb +5 -0
  25. data/app/services/completion_kit/checks/target_resolver.rb +31 -0
  26. data/app/services/completion_kit/checks/valid_json.rb +12 -0
  27. data/app/services/completion_kit/mcp_tools/agreements.rb +2 -0
  28. data/app/services/completion_kit/mcp_tools/judges.rb +2 -0
  29. data/app/services/completion_kit/mcp_tools/metrics.rb +32 -4
  30. data/app/services/completion_kit/metric_agreement_examples.rb +2 -0
  31. data/app/services/completion_kit/metric_improvement_validator.rb +2 -0
  32. data/app/services/completion_kit/metric_variant_generator.rb +1 -0
  33. data/app/services/completion_kit/onboarding/concepts.rb +1 -1
  34. data/app/services/completion_kit/prompt_improvement_service.rb +8 -4
  35. data/app/services/completion_kit/prompt_improvement_validator.rb +1 -1
  36. data/app/services/completion_kit/starter_metrics.rb +25 -1
  37. data/app/views/completion_kit/api_reference/_body.html.erb +4 -4
  38. data/app/views/completion_kit/metrics/_check_spec.html.erb +17 -0
  39. data/app/views/completion_kit/metrics/_form.html.erb +104 -1
  40. data/app/views/completion_kit/metrics/index.html.erb +4 -3
  41. data/app/views/completion_kit/metrics/show.html.erb +26 -14
  42. data/app/views/completion_kit/metrics/starter_preview.html.erb +8 -0
  43. data/app/views/completion_kit/responses/show.html.erb +1 -1
  44. data/db/migrate/20260629000001_add_check_type_to_completion_kit_metrics.rb +6 -0
  45. data/db/migrate/20260629000002_add_check_type_to_completion_kit_metric_versions.rb +6 -0
  46. data/db/migrate/20260629000003_add_passed_to_completion_kit_reviews.rb +5 -0
  47. data/lib/completion_kit/version.rb +1 -1
  48. metadata +17 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e35865036a3c24bd7cea9332a734af90d031b8c9d1b8d6b23b0694322fc248fe
4
- data.tar.gz: 16202eef750b6b0e233456eedff06429d23634f325de5bde2a2ff8d9598108e0
3
+ metadata.gz: 84ae9459a26b612bb68a2f875a83274dc1bd711659b62d230aa7315fb3e7ce66
4
+ data.tar.gz: ebbe020a987228e1c1f5e2c0c1d6be4caa3c17cd7448141d69e331fc9a207eb3
5
5
  SHA512:
6
- metadata.gz: a9112f5dcc7419ac0f6f6cc2f375b40bcd2d4d40e9d7fb4ca3b45899a51ff611234ef1ecfa6f64c4eecefbb43ac5e5bb4e2ab9edace0b531d1b3cf963adb2fdc
7
- data.tar.gz: 9a3f51f1754b0d60475c7f5b2c72c6bc0f8cdd2bfa00240aca7d2c5afdac91acf105cd2fddd103b29f4d0c0da9c846ddc752f068f409a4b257f1e9f9016199d0
6
+ metadata.gz: 95ee6ba7cd0db74ea2e27629f3a8d3b83de56b1171f3f26419c6cce72fcdf9cf01dafefb7942bcd06519db2b1b304e1e108156e7b22c9b3a54230cb2b620ac50
7
+ data.tar.gz: f9883c98aa3e6e4ec4cfaf2dbe7f9f9c3261959de217b20813efd53551ecbe69a41d871d3fc9179771413a5f214c9ffd337ea67775d9d558c8d2ea21dc2e336a
@@ -2,6 +2,7 @@ module CompletionKit
2
2
  class AgreementsController < ApplicationController
3
3
  before_action :ensure_agreement_enabled
4
4
  before_action :set_scope
5
+ before_action :reject_check_metric, only: [:create]
5
6
 
6
7
  def create
7
8
  created_by = agreement_creator
@@ -60,6 +61,10 @@ module CompletionKit
60
61
  head :not_found unless CompletionKit.config.judge_agreement_enabled
61
62
  end
62
63
 
64
# Guard registered as a before_action for +create+: when the metric in scope
# is a check, reply 422 Unprocessable Entity with no body.
# NOTE(review): @metric is presumably assigned by an earlier callback
# (set_scope) — confirm against the full controller.
def reject_check_metric
  return unless @metric.check?

  head :unprocessable_entity
end
67
+
63
68
  def set_scope
64
69
  @run = Run.find(params[:run_id])
65
70
  @response = @run.responses.find(params[:response_id])
@@ -4,6 +4,7 @@ module CompletionKit
4
4
  class AgreementsController < BaseController
5
5
  before_action :ensure_agreement_enabled
6
6
  before_action :set_nested_scope, only: [:create]
7
+ before_action :reject_check_metric, only: [:create]
7
8
  before_action :load_agreement, only: [:destroy]
8
9
 
9
10
  def index
@@ -53,6 +54,10 @@ module CompletionKit
53
54
  not_found
54
55
  end
55
56
 
57
# Guard registered as a before_action for +create+: when the metric in scope
# is a check, render a 422 error payload instead of recording an agreement.
# NOTE(review): @metric is presumably assigned by set_nested_scope — confirm
# against the full controller.
def reject_check_metric
  return unless @metric.check?

  render_error("Checks have nothing to calibrate", status: :unprocessable_entity)
end
60
+
56
61
  def load_agreement
57
62
  @agreement = Agreement.find(params[:id])
58
63
  rescue ActiveRecord::RecordNotFound
@@ -37,6 +37,11 @@ module CompletionKit
37
37
  end
38
38
 
39
39
  def suggest_variants
40
+ if @metric.check?
41
+ render_error("Checks are exact; no variants to suggest.", status: :unprocessable_entity)
42
+ return
43
+ end
44
+
40
45
  disagreement_count = Agreement.where(metric_id: @metric.id, verdict: "disagree").count
41
46
  if disagreement_count.zero?
42
47
  render_error("Mark at least one case as Disagree before asking the model to suggest a change.", status: :unprocessable_entity)
@@ -63,8 +68,10 @@ module CompletionKit
63
68
  end
64
69
 
65
70
  def metric_params
66
- params.permit(:name, :instruction,
67
- rubric_bands: [:stars, :description], tag_names: [])
71
+ params.permit(:name, :instruction, :metric_type,
72
+ rubric_bands: [:stars, :description],
73
+ check_config: %i[check_kind target target_path value pattern json_path expected min max case_sensitive multiline trim],
74
+ tag_names: [])
68
75
  end
69
76
  end
70
77
  end
@@ -62,7 +62,7 @@ module CompletionKit
62
62
  CompletionKit::Review.where(response_id: failed_response_ids, status: "failed").update_all(
63
63
  status: "pending", attempts: 0,
64
64
  error_provider: nil, error_class: nil, error_status: nil, error_message: nil,
65
- ai_score: nil, ai_feedback: nil
65
+ ai_score: nil, passed: nil, ai_feedback: nil
66
66
  )
67
67
  scope.update_all(
68
68
  status: "pending", attempts: 0,
@@ -24,7 +24,9 @@ module CompletionKit
24
24
  metric = Metric.create!(
25
25
  name: starter.name,
26
26
  instruction: starter.instruction,
27
- rubric_bands: starter.rubric_bands
27
+ rubric_bands: starter.rubric_bands,
28
+ metric_type: starter.metric_type || "llm_judge",
29
+ check_config: starter.check_config
28
30
  )
29
31
  redirect_to metric_path(metric), notice: "Added the \"#{starter.name}\" starter. Tweak any band before you run a judge against it."
30
32
  end
@@ -39,9 +41,14 @@ module CompletionKit
39
41
  def show
40
42
  @edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
41
43
  @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
42
- @improve_disagreement_count = Agreement.where(metric_id: @metric.id, verdict: "disagree").count
43
44
  @versions = MetricVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
44
- @guiding_examples = CompletionKit.config.judge_examples_from_reviews ? MetricAgreementExamples.judge_examples_for(@metric) : []
45
+ if @metric.check?
46
+ @improve_disagreement_count = 0
47
+ @guiding_examples = []
48
+ else
49
+ @improve_disagreement_count = Agreement.where(metric_id: @metric.id, verdict: "disagree").count
50
+ @guiding_examples = CompletionKit.config.judge_examples_from_reviews ? MetricAgreementExamples.judge_examples_for(@metric) : []
51
+ end
45
52
  end
46
53
 
47
54
  def new
@@ -52,7 +59,7 @@ module CompletionKit
52
59
  @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
53
60
  @edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
54
61
  @published_metric_version = MetricVersion.published.where(metric_id: @metric.id, current: true).first
55
- @improve_disagreement_count = Agreement.where(metric_id: @metric.id, verdict: "disagree").count
62
+ @improve_disagreement_count = @metric.check? ? 0 : Agreement.where(metric_id: @metric.id, verdict: "disagree").count
56
63
 
57
64
  if @edit_draft
58
65
  @metric.instruction = @edit_draft.instruction
@@ -71,42 +78,16 @@ module CompletionKit
71
78
  end
72
79
 
73
80
  def update
74
- judge_keys = %i[instruction rubric_bands]
75
- meta_attrs = metric_params.except(*judge_keys)
76
- proposed_instruction = metric_params[:instruction]
77
- proposed_rubric = metric_params[:rubric_bands]
81
+ meta_attrs = metric_params.except(:instruction, :rubric_bands, :check_config)
78
82
 
79
83
  unless @metric.update(meta_attrs)
80
84
  return render(:edit, status: :unprocessable_entity)
81
85
  end
82
86
 
83
- current_instruction = @metric.instruction.to_s
84
- current_rubric = @metric.rubric_bands || []
85
- normalized_proposed_rubric = normalize_rubric_bands_for_update(proposed_rubric)
86
-
87
- instruction_changed = !proposed_instruction.nil? && proposed_instruction.to_s != current_instruction
88
- rubric_changed = !normalized_proposed_rubric.nil? && normalized_proposed_rubric != current_rubric
89
-
90
- unless instruction_changed || rubric_changed
91
- return redirect_to(metric_path(@metric), notice: "Metric was successfully updated.")
92
- end
93
-
94
- new_instruction = instruction_changed ? proposed_instruction.to_s : current_instruction
95
- new_rubric = rubric_changed ? normalized_proposed_rubric : current_rubric
96
-
97
- if @metric.reviews.exists?
98
- MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").destroy_all
99
- draft = MetricVersion.create!(
100
- metric: @metric, instruction: new_instruction, rubric_bands: new_rubric,
101
- state: "draft", source: "edit", current: false
102
- )
103
- redirect_to edit_metric_path(@metric),
104
- notice: "Saved as draft #{draft.version_label}. Publish to make these changes the metric's live version."
87
+ if @metric.check?
88
+ update_check_definition
105
89
  else
106
- @metric.update!(instruction: new_instruction, rubric_bands: new_rubric)
107
- current_pub = MetricVersion.published.where(metric_id: @metric.id, current: true).first
108
- current_pub&.update!(instruction: @metric.instruction, rubric_bands: @metric.rubric_bands)
109
- redirect_to metric_path(@metric), notice: "Metric was successfully updated."
90
+ update_judge_definition
110
91
  end
111
92
  end
112
93
 
@@ -116,6 +97,11 @@ module CompletionKit
116
97
  end
117
98
 
118
99
  def suggest_variants
100
+ if @metric.check?
101
+ redirect_to metric_path(@metric), alert: "Checks are exact, so there is nothing to suggest."
102
+ return
103
+ end
104
+
119
105
  target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
120
106
  counts = Agreement.where(metric_id: @metric.id, verdict: %w[agree disagree]).group(:verdict).count
121
107
  if counts["disagree"].to_i.zero?
@@ -188,13 +174,88 @@ module CompletionKit
188
174
  head :not_found unless CompletionKit.config.judge_examples_from_reviews
189
175
  end
190
176
 
177
# Applies an edit to an LLM-judge metric's instruction and rubric bands.
#
# * Nothing changed          -> redirect to the metric page.
# * Metric already has reviews -> replace any existing "edit" draft with a new
#   draft MetricVersion and redirect back to the edit page.
# * Otherwise                -> update the metric, mirror the values onto the
#   current published MetricVersion (if any), and redirect to the metric page.
def update_judge_definition
  live_instruction = @metric.instruction.to_s
  live_rubric = @metric.rubric_bands || []

  submitted_instruction = metric_params[:instruction]
  submitted_rubric = normalize_rubric_bands_for_update(metric_params[:rubric_bands])

  # A nil submission means "field not provided" and never counts as a change.
  instruction_changed = !(submitted_instruction.nil? || submitted_instruction.to_s == live_instruction)
  rubric_changed = !(submitted_rubric.nil? || submitted_rubric == live_rubric)

  if !instruction_changed && !rubric_changed
    return redirect_to(metric_path(@metric), notice: "Metric was successfully updated.")
  end

  next_instruction = instruction_changed ? submitted_instruction.to_s : live_instruction
  next_rubric = rubric_changed ? submitted_rubric : live_rubric

  unless @metric.reviews.exists?
    @metric.update!(instruction: next_instruction, rubric_bands: next_rubric)
    published = MetricVersion.published.where(metric_id: @metric.id, current: true).first
    published&.update!(instruction: @metric.instruction, rubric_bands: @metric.rubric_bands)
    return redirect_to(metric_path(@metric), notice: "Metric was successfully updated.")
  end

  # Only one pending "edit" draft is kept per metric.
  MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").destroy_all
  draft = MetricVersion.create!(
    metric: @metric, instruction: next_instruction, rubric_bands: next_rubric,
    state: "draft", source: "edit", current: false
  )
  redirect_to edit_metric_path(@metric),
              notice: "Saved as draft #{draft.version_label}. Publish to make these changes the metric's live version."
end
209
+
210
# Applies an edit to a check metric's configuration.
#
# * No config submitted, or config unchanged -> redirect to the metric page.
# * Metric already has reviews -> replace any existing "edit" draft with a new
#   draft MetricVersion and redirect back to the edit page.
# * Otherwise -> update the metric and the current published MetricVersion
#   (if any) in place; when the new config fails validation, re-render the
#   edit form with 422, as the +update+ action does for the other attributes.
def update_check_definition
  raw = metric_params[:check_config]
  proposed = raw.nil? ? nil : normalize_check_config(raw)

  if proposed.nil? || proposed == @metric.check_config
    return redirect_to(metric_path(@metric), notice: "Metric was successfully updated.")
  end

  if @metric.reviews.exists?
    # Only one pending "edit" draft is kept per metric.
    MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").destroy_all
    draft = MetricVersion.create!(
      metric: @metric, metric_type: "check", check_config: proposed,
      state: "draft", source: "edit", current: false
    )
    redirect_to edit_metric_path(@metric),
                notice: "Saved as draft #{draft.version_label}. Publish to make these changes the metric's live version."
  elsif @metric.update(check_config: proposed)
    current_pub = MetricVersion.published.where(metric_id: @metric.id, current: true).first
    current_pub&.update!(metric_type: "check", check_config: proposed)
    redirect_to metric_path(@metric), notice: "Metric was successfully updated."
  else
    # Non-bang update: an invalid check config (bad regex, missing key, ...)
    # is a user error to show on the form, not an exception.
    render(:edit, status: :unprocessable_entity)
  end
end
233
+
191
234
  def set_metric
192
235
  @metric = Metric.find(params[:id])
193
236
  end
194
237
 
195
238
  def metric_params
196
- params.require(:metric).permit(:name, :instruction,
197
- rubric_bands: [:stars, :description], tag_names: [])
239
+ permitted = params.require(:metric).permit(:name, :instruction, :metric_type,
240
+ rubric_bands: [:stars, :description],
241
+ check_config: %i[check_kind target target_path value pattern json_path expected min max case_sensitive multiline trim],
242
+ tag_names: [])
243
+ permitted[:check_config] = normalize_check_config(permitted[:check_config]) if permitted.key?(:check_config)
244
+ permitted
245
+ end
246
+
247
# Turns a submitted check_config parameter object into a string-keyed hash:
# present min/max values become Integers, the three flag keys (when supplied)
# are cast with ActiveModel's boolean type, a present "expected" value is
# passed through coerce_scalar, and nil / empty-string entries are dropped.
def normalize_check_config(config)
  normalized = config.to_unsafe_h.stringify_keys

  %w[min max].each do |bound|
    next if normalized[bound].blank?

    normalized[bound] = normalized[bound].to_i
  end

  boolean_type = ActiveModel::Type::Boolean.new
  %w[case_sensitive multiline trim].each do |flag|
    next unless normalized.key?(flag)

    normalized[flag] = boolean_type.cast(normalized[flag])
  end

  expected = normalized["expected"]
  normalized["expected"] = coerce_scalar(expected) if expected.present?

  normalized.reject { |_key, entry| entry.nil? || entry == "" }
end
254
+
255
# Interprets a submitted "expected" value as JSON when possible.
#
# A String that parses as JSON yields the parsed value ("5" -> 5,
# "true" -> true); a String that does not parse is returned unchanged.
# Non-String values are returned as-is: they have already been coerced
# (the config can pass through normalization more than once), and
# JSON.parse would raise TypeError on them, which the rescue below does
# not cover.
def coerce_scalar(value)
  return value unless value.is_a?(String)

  JSON.parse(value)
rescue JSON::ParserError
  value
end
199
260
 
200
261
  def normalize_rubric_bands_for_update(bands)
@@ -164,7 +164,7 @@ module CompletionKit
164
164
  status: "pending",
165
165
  attempts: 0,
166
166
  error_provider: nil, error_class: nil, error_status: nil, error_message: nil,
167
- ai_score: nil, ai_feedback: nil
167
+ ai_score: nil, passed: nil, ai_feedback: nil
168
168
  )
169
169
  scope.update_all(
170
170
  status: "pending",
@@ -0,0 +1,66 @@
1
module CompletionKit
  # Evaluates one check metric against one response and stores the verdict on
  # the review for that (response, metric) pair.
  class CheckReviewJob < ApplicationJob
    queue_as :default

    # Any StandardError raised while performing is reported, recorded on the
    # review as a failure, and followed by a run completion check.
    rescue_from(StandardError) do |error|
      Rails.error.report(error, handled: true, context: { job: self.class.name, run_id: @run_id, response_id: @response_id, metric_id: @metric_id })
      record_terminal_failure!(error)
      enqueue_completion_check
    end

    # @param response_id [Integer] id of the Response to evaluate
    # @param metric_id [Integer] id of the check Metric to apply
    # @param run_id [Integer, nil] only used as error-report context
    def perform(response_id, metric_id, run_id = nil)
      # Kept on the instance so the rescue_from handler can reach them.
      @response_id = response_id
      @metric_id = metric_id
      @run_id = run_id

      response = Response.find(response_id)
      metric = Metric.find(metric_id)
      result = evaluate(response, metric.check_config || {})

      review = response.reviews.find_or_initialize_by(metric_id: metric.id)
      current_metric_version = MetricVersion.ensure_current_for(metric)
      review.assign_attributes(
        metric_name: metric.name,
        metric_version_id: current_metric_version.id,
        status: "succeeded",
        passed: result.passed,
        ai_score: nil,
        ai_feedback: result.detail,
        error_provider: nil, error_class: nil, error_status: nil, error_message: nil
      )
      review.save!

      enqueue_completion_check
    end

    private

    # Resolves the configured target on the response and runs the configured
    # check kind against it. An unresolvable target is a failed check, not an
    # error.
    def evaluate(response, config)
      target_value = Checks::TargetResolver.call(response, config)
      if target_value.equal?(Checks::TargetResolver::UNRESOLVED)
        return Checks::Result.new(passed: false, detail: "could not resolve target")
      end

      Checks::Registry.fetch(config["check_kind"]).call(target_value, config)
    end

    # Marks the review as failed with the error's class and (truncated)
    # message. Does nothing when the response no longer exists.
    def record_terminal_failure!(error)
      response = Response.find_by(id: @response_id)
      return unless response

      review = response.reviews.find_or_initialize_by(metric_id: @metric_id)
      review.assign_attributes(
        metric_name: review.metric_name || Metric.find_by(id: @metric_id)&.name || "(deleted metric)",
        status: "failed",
        # Drop any verdict left by an earlier successful evaluation so a failed
        # review is not still counted as a passed or failed check.
        passed: nil,
        error_class: error.class.name,
        error_message: error.message.to_s.truncate(2000)
      )
      # Validations are skipped so the failure is persisted regardless.
      review.save!(validate: false)
    end

    # Queues a completion check for the response's run, if the response exists.
    def enqueue_completion_check
      response = Response.find_by(id: @response_id)
      RunCompletionCheckJob.perform_later(response.run_id) if response
    end
  end
end
@@ -61,11 +61,14 @@ module CompletionKit
61
61
  error_provider: nil, error_class: nil, error_status: nil, error_message: nil
62
62
  )
63
63
 
64
- if run.judge_configured?
65
- run.metrics.each do |metric|
64
+ if run.llm_judge_configured?
65
+ run.llm_metrics.each do |metric|
66
66
  JudgeReviewJob.perform_later(response.id, metric.id, run.id)
67
67
  end
68
68
  end
69
+ run.check_metrics.each do |metric|
70
+ CheckReviewJob.perform_later(response.id, metric.id, run.id)
71
+ end
69
72
 
70
73
  enqueue_completion_check
71
74
  end
@@ -15,6 +15,7 @@ module CompletionKit
15
15
  def perform(metric_id)
16
16
  @metric = Metric.find_by(id: metric_id)
17
17
  return unless @metric
18
+ return if @metric.check?
18
19
 
19
20
  MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
20
21
 
@@ -16,14 +16,20 @@ module CompletionKit
16
16
  has_many :reviews, dependent: :nullify
17
17
  has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
18
18
 
19
+ METRIC_TYPES = %w[llm_judge check].freeze
20
+
19
21
  serialize :rubric_bands, coder: JSON
22
+ serialize :check_config, coder: JSON
20
23
 
21
24
  validates :name, presence: true
22
25
  validates :key, tenant_scoped_uniqueness: { allow_nil: true }
26
+ validates :metric_type, inclusion: { in: METRIC_TYPES }
27
+ validate :validate_check_config, if: :check?
28
+ validate :metric_type_immutable_once_in_use, on: :update
23
29
 
24
30
  before_validation :generate_key
25
- before_validation :normalize_rubric_bands
26
- before_validation :set_defaults
31
+ before_validation :normalize_rubric_bands, if: :llm_judge?
32
+ before_validation :set_defaults, if: :llm_judge?
27
33
 
28
34
  def self.default_rubric_bands
29
35
  DEFAULT_RUBRIC_BANDS.map(&:dup)
@@ -74,13 +80,29 @@ module CompletionKit
74
80
  self.class.rubric_text_for(rubric_bands_for_form)
75
81
  end
76
82
 
83
# True when this metric's metric_type is "check".
def check?
  case metric_type
  when "check" then true
  else false
  end
end
86
+
87
# True for every metric that is not a check.
def llm_judge?
  check? ? false : true
end
90
+
91
# True when any RunMetric, review, or metric version references this metric.
def in_use?
  return true if RunMetric.exists?(metric_id: id)
  return true if reviews.exists?

  metric_versions.exists?
end
94
+
77
95
  def as_json(options = {})
78
- {
79
- id: id, name: name, key: key, instruction: instruction,
80
- rubric_bands: rubric_bands,
96
+ base = {
97
+ id: id, name: name, key: key, metric_type: metric_type,
81
98
  created_at: created_at, updated_at: updated_at,
82
99
  tags: tags.as_json
83
100
  }
101
+ if check?
102
+ base.merge(check_config: check_config)
103
+ else
104
+ base.merge(instruction: instruction, rubric_bands: rubric_bands)
105
+ end
84
106
  end
85
107
 
86
108
  private
@@ -89,6 +111,70 @@ module CompletionKit
89
111
  self.key ||= name.parameterize if name.present?
90
112
  end
91
113
 
114
# Update-time validation: adds an error when metric_type is being changed on
# a metric that is already in use.
def metric_type_immutable_once_in_use
  return unless metric_type_changed? && in_use?

  errors.add(:metric_type, "cannot change once the metric has been used in a run")
end
120
+
121
# Validates check_config for check metrics. The config must be a Hash with a
# check_kind known to the registry; only then are the target, required-key,
# and kind-specific rules applied.
def validate_check_config
  settings = check_config
  registry = CompletionKit::Checks::Registry

  problem =
    if !settings.is_a?(Hash)
      "must be a configuration object"
    elsif !registry.kinds.include?(settings["check_kind"])
      "check_kind must be one of #{registry.kinds.join(", ")}"
    end

  if problem
    errors.add(:check_config, problem)
    return
  end

  check_kind = settings["check_kind"]
  validate_check_target(settings)
  validate_check_required_keys(settings, check_kind)
  validate_check_kind_rules(settings, check_kind)
end
138
+
139
# Checks the config's target (defaulting to "response_text" when blank)
# against the resolver's TARGETS list, and requires a non-blank target_path
# when the target is "json_path".
def validate_check_target(config)
  allowed_targets = CompletionKit::Checks::TargetResolver::TARGETS
  target = config["target"].presence || "response_text"

  unless allowed_targets.include?(target)
    errors.add(:check_config, "target must be one of #{allowed_targets.join(", ")}")
  end

  return unless target == "json_path"
  return unless config["target_path"].to_s.strip.empty?

  errors.add(:check_config, "target_path is required when target is json_path")
end
148
+
149
# Adds an error for each key the registry marks as required for +kind+ that
# the config lacks. "expected" only has to exist as a key (any value is
# accepted); every other key must have a non-blank value.
def validate_check_required_keys(config, kind)
  CompletionKit::Checks::Registry.required_keys.fetch(kind).each do |name|
    missing = name == "expected" ? !config.key?("expected") : config[name].to_s.strip.empty?
    errors.add(:check_config, "#{name} is required") if missing
  end
end
158
+
159
# Kind-specific validation: a "regex" pattern must compile, and
# "length_bounds" needs at least one of min/max with min <= max when both
# are given. Other kinds have no extra rules here.
def validate_check_kind_rules(config, kind)
  if kind == "regex"
    begin
      Regexp.new(config["pattern"].to_s)
    rescue RegexpError
      errors.add(:check_config, "pattern is not a valid regular expression")
    end
  elsif kind == "length_bounds"
    lower = config["min"]
    upper = config["max"]
    if lower.nil? && upper.nil?
      errors.add(:check_config, "length_bounds requires at least one of min or max")
    elsif lower && upper && lower.to_i > upper.to_i
      errors.add(:check_config, "min must be less than or equal to max")
    end
  end
end
177
+
92
178
  def set_defaults
93
179
  self.rubric_bands = self.class.default_rubric_bands if rubric_bands.blank?
94
180
  end
@@ -6,6 +6,7 @@ module CompletionKit
6
6
  has_many :agreements, dependent: :destroy
7
7
 
8
8
  serialize :rubric_bands, coder: JSON
9
+ serialize :check_config, coder: JSON
9
10
  serialize :validation_summary, coder: JSON
10
11
 
11
12
  before_validation :assign_version_number, on: :create
@@ -23,12 +24,22 @@ module CompletionKit
23
24
  metric: metric,
24
25
  instruction: metric.instruction,
25
26
  rubric_bands: metric.rubric_bands,
27
+ metric_type: metric.metric_type,
28
+ check_config: metric.check_config,
26
29
  current: true,
27
30
  state: "published",
28
31
  published_at: Time.current
29
32
  )
30
33
  end
31
34
 
35
# True when this version's metric_type is "check".
def check?
  version_type = metric_type
  version_type == "check"
end
38
+
39
# True for every version that is not a check.
def llm_judge?
  check? ? false : true
end
42
+
32
43
  def draft?
33
44
  state == "draft"
34
45
  end
@@ -43,6 +54,7 @@ module CompletionKit
43
54
 
44
55
  def change_summary_against(previous)
45
56
  return nil if previous.nil?
57
+ return check_change_summary_against(previous) if check?
46
58
 
47
59
  instruction_changed = previous.instruction.to_s.strip != instruction.to_s.strip
48
60
  rubric_changes = rubric_band_change_count(previous)
@@ -75,31 +87,46 @@ module CompletionKit
75
87
  self.class.where(metric_id: metric_id).where.not(id: id).update_all(current: false)
76
88
  reload
77
89
  update!(state: "published", current: true, published_at: published_at || Time.current)
78
- metric.update_columns(
79
- instruction: instruction,
80
- rubric_bands: Array(rubric_bands).to_json
81
- )
90
+ if check?
91
+ metric.update_columns(metric_type: "check", check_config: check_config)
92
+ else
93
+ metric.update_columns(
94
+ metric_type: "llm_judge",
95
+ instruction: instruction,
96
+ rubric_bands: Array(rubric_bands).to_json
97
+ )
98
+ end
82
99
  end
83
100
  self
84
101
  end
85
102
 
86
103
  def as_json(options = {})
87
- {
104
+ base = {
88
105
  id: id,
89
106
  metric_id: metric_id,
90
107
  version_number: version_number,
91
- instruction: instruction,
92
- rubric_bands: rubric_bands,
108
+ metric_type: metric_type,
93
109
  current: current,
94
110
  state: state,
95
111
  source: source,
96
112
  published_at: published_at,
97
113
  created_at: created_at
98
114
  }
115
+ if check?
116
+ base.merge(check_config: check_config)
117
+ else
118
+ base.merge(instruction: instruction, rubric_bands: rubric_bands)
119
+ end
99
120
  end
100
121
 
101
122
  private
102
123
 
124
# Change summary for check versions: nil when the check_config equals the
# previous version's, otherwise a minor "Check configuration changes" entry.
def check_change_summary_against(previous)
  if check_config == previous.check_config
    nil
  else
    { magnitude: :minor, label: "Check configuration changes" }
  end
end
129
+
103
130
  def rubric_band_change_count(previous)
104
131
  prev = Metric.normalize_rubric_bands(previous.rubric_bands)
105
132
  curr = Metric.normalize_rubric_bands(rubric_bands)
@@ -8,7 +8,7 @@ module CompletionKit
8
8
 
9
9
  delegate :prompt, to: :run
10
10
 
11
- validates :response_text, presence: true, if: :succeeded?
11
+ validates :response_text, presence: true, if: :requires_response_text?
12
12
 
13
13
  before_validation :set_default_status, on: :create
14
14
 
@@ -34,7 +34,19 @@ module CompletionKit
34
34
  end
35
35
 
36
36
  def reviewed?
37
- reviews.any? { |r| r.ai_score.present? }
37
+ reviews.any? { |r| r.ai_score.present? || !r.passed.nil? }
38
+ end
39
+
40
# Number of this response's reviews that carry a pass/fail verdict
# (passed is not nil).
def checks_total
  reviews.count do |review|
    verdict = review.passed
    !verdict.nil?
  end
end
43
+
44
# Number of this response's reviews whose passed value is true.
def checks_passed
  reviews.count do |review|
    verdict = review.passed
    verdict == true
  end
end
47
+
48
# Number of this response's reviews whose passed value is false.
def checks_failed
  reviews.count do |review|
    verdict = review.passed
    verdict == false
  end
end
39
51
 
40
52
  def fully_reviewed?
@@ -46,6 +58,10 @@ module CompletionKit
46
58
 
47
59
  private
48
60
 
61
# Condition for the response_text presence validation: the response must
# have succeeded and its run (if any) must not report
# judge_only_input_data_checks?.
def requires_response_text?
  completed = succeeded?
  return completed unless completed

  text_optional = run&.judge_only_input_data_checks?
  !text_optional
end
64
+
49
65
  def broadcast_row_update
50
66
  run.broadcast_response_update(self)
51
67
  end
@@ -16,6 +16,10 @@ module CompletionKit
16
16
  after_save_commit :broadcast_parent_row_update, unless: :destroyed?
17
17
  after_save_commit :broadcast_run_progress, if: :should_broadcast_progress?
18
18
 
19
# True when the review's metric version exists and is of type "check".
def check?
  version = metric_version
  return false if version.nil?

  version.metric_type == "check"
end
22
+
19
23
  def stale_against_current_judge?
20
24
  return false unless metric_id && metric_version_id
21
25
  current_id = MetricVersion.current.where(metric_id: metric_id).limit(1).pick(:id)
@@ -27,7 +31,7 @@ module CompletionKit
27
31
  {
28
32
  id: id, response_id: response_id, metric_id: metric_id,
29
33
  metric_version_id: metric_version_id,
30
- metric_name: metric_name, ai_score: ai_score,
34
+ metric_name: metric_name, ai_score: ai_score, passed: passed,
31
35
  ai_feedback: ai_feedback, status: status, attempts: attempts,
32
36
  error: error_payload
33
37
  }