completion-kit 0.10.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/app/assets/stylesheets/completion_kit/application.css +118 -55
  3. data/app/controllers/completion_kit/{calibrations_controller.rb → agreements_controller.rb} +19 -19
  4. data/app/controllers/completion_kit/api/v1/{calibrations_controller.rb → agreements_controller.rb} +18 -18
  5. data/app/controllers/completion_kit/api/v1/metric_versions_controller.rb +2 -7
  6. data/app/controllers/completion_kit/api/v1/metrics_controller.rb +1 -1
  7. data/app/controllers/completion_kit/metrics_controller.rb +18 -23
  8. data/app/jobs/completion_kit/judge_review_job.rb +2 -2
  9. data/app/jobs/completion_kit/metric_suggestion_job.rb +46 -0
  10. data/app/models/completion_kit/{calibration.rb → agreement.rb} +1 -1
  11. data/app/models/completion_kit/metric_version.rb +2 -17
  12. data/app/models/completion_kit/review.rb +1 -0
  13. data/app/services/completion_kit/{calibration_math.rb → agreement_math.rb} +1 -1
  14. data/app/services/completion_kit/mcp_dispatcher.rb +2 -2
  15. data/app/services/completion_kit/mcp_tools/{calibrations.rb → agreements.rb} +11 -11
  16. data/app/services/completion_kit/mcp_tools/judges.rb +3 -3
  17. data/app/services/completion_kit/mcp_tools/metric_versions.rb +2 -7
  18. data/app/services/completion_kit/{metric_calibration_examples.rb → metric_agreement_examples.rb} +6 -6
  19. data/app/services/completion_kit/{metric_calibration_stats.rb → metric_agreement_stats.rb} +6 -6
  20. data/app/services/completion_kit/metric_improvement_validator.rb +101 -0
  21. data/app/services/completion_kit/metric_variant_generator.rb +2 -2
  22. data/app/views/completion_kit/{calibrations → agreements}/_buttons.html.erb +33 -33
  23. data/app/views/completion_kit/{calibrations → agreements}/_trust_panel.html.erb +6 -9
  24. data/app/views/completion_kit/api_reference/_body.html.erb +15 -15
  25. data/app/views/completion_kit/metrics/_guiding_examples.html.erb +1 -1
  26. data/app/views/completion_kit/metrics/_suggestion_failed.html.erb +3 -0
  27. data/app/views/completion_kit/metrics/_suggestion_pending.html.erb +3 -0
  28. data/app/views/completion_kit/metrics/_suggestion_ready.html.erb +4 -0
  29. data/app/views/completion_kit/metrics/_validation_scoreboard.html.erb +12 -0
  30. data/app/views/completion_kit/metrics/edit.html.erb +1 -1
  31. data/app/views/completion_kit/metrics/show.html.erb +25 -11
  32. data/app/views/completion_kit/responses/show.html.erb +4 -4
  33. data/app/views/completion_kit/runs/show.html.erb +1 -1
  34. data/config/routes.rb +3 -3
  35. data/db/migrate/20260531000001_add_validation_summary_to_completion_kit_metric_versions.rb +5 -0
  36. data/db/migrate/20260531000002_backfill_review_metric_versions.rb +33 -0
  37. data/db/migrate/20260531000003_add_metric_version_fk_to_reviews.rb +6 -0
  38. data/db/migrate/20260531000004_rename_calibrations_to_agreements.rb +19 -0
  39. data/lib/completion_kit/version.rb +1 -1
  40. data/lib/completion_kit.rb +2 -2
  41. metadata +20 -10
@@ -0,0 +1,46 @@
1
+ require "faraday"
2
+
3
module CompletionKit
  # Background job that drafts a single suggested rewrite of a metric,
  # validates it with MetricImprovementValidator, stores the resulting summary
  # on the draft, and broadcasts the outcome to the metric's suggestion
  # Turbo Stream.
  class MetricSuggestionJob < ApplicationJob
    queue_as :llm

    # Declared FIRST on purpose: rescue handlers are searched last-declared
    # first, so a StandardError catch-all declared after the retry_on lines
    # would intercept the transient errors below and they would never be
    # retried.
    rescue_from(StandardError) { |error| report_failure(error) }

    # Transient network / rate-limit errors are retried; once the attempts are
    # used up the failure is reported and broadcast instead of bubbling up.
    # The exhaustion block is yielded (job, error) rather than being evaluated
    # on the job instance, hence the explicit `send` to the private helper.
    retry_on Faraday::TimeoutError, Faraday::ConnectionFailed, wait: :polynomially_longer, attempts: 5 do |job, error|
      job.send(:report_failure, error)
    end
    retry_on CompletionKit::RateLimitError, wait: :polynomially_longer, attempts: 5 do |job, error|
      job.send(:report_failure, error)
    end

    # @param metric_id [Integer] id of the Metric to generate a suggestion for;
    #   the job is a no-op when no such metric exists.
    def perform(metric_id)
      @metric = Metric.find_by(id: metric_id)
      return unless @metric

      # Drop earlier suggestion drafts so only the newest one remains.
      MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all

      generator = MetricVariantGenerator.new(@metric, count: 1)
      variants = generator.call
      if variants.empty?
        broadcast_failure
        return
      end

      draft = generator.persist!(variants).max_by(&:version_number)
      summary = MetricImprovementValidator.new(@metric, draft).call
      draft.update!(validation_summary: summary)

      broadcast_status(@metric, partial: "completion_kit/metrics/suggestion_ready", locals: { metric: @metric, draft: draft })
    end

    private

    # Reports the error and, when the metric was loaded, swaps the status
    # element to the "failed" partial. @metric can still be nil here if the
    # lookup itself raised, in which case there is nothing to broadcast to.
    def report_failure(error)
      Rails.error.report(error, handled: true, context: { job: self.class.name })
      broadcast_failure if @metric
    end

    # Replaces the status element with the "suggestion failed" partial.
    def broadcast_failure
      broadcast_status(@metric, partial: "completion_kit/metrics/suggestion_failed", locals: { metric: @metric })
    end

    # Renders +partial+ with +locals+ and replaces the metric's suggestion
    # status element on the "metric_<id>_suggestion" stream.
    def broadcast_status(metric, partial:, locals:)
      html = CompletionKit::ApplicationController.render(partial: partial, locals: locals)
      Turbo::StreamsChannel.broadcast_replace_to(
        "metric_#{metric.id}_suggestion",
        target: "ck-suggestion-status-#{metric.id}",
        html: html
      )
    end
  end
end
@@ -1,5 +1,5 @@
1
1
  module CompletionKit
2
- class Calibration < ApplicationRecord
2
+ class Agreement < ApplicationRecord
3
3
  VERDICTS = %w[agree disagree borderline].freeze
4
4
 
5
5
  belongs_to :run
@@ -3,9 +3,10 @@ module CompletionKit
3
3
  STATES = %w[draft published].freeze
4
4
 
5
5
  belongs_to :metric
6
- has_many :calibrations, dependent: :destroy
6
+ has_many :agreements, dependent: :destroy
7
7
 
8
8
  serialize :rubric_bands, coder: JSON
9
+ serialize :validation_summary, coder: JSON
9
10
 
10
11
  before_validation :assign_version_number, on: :create
11
12
 
@@ -82,22 +83,6 @@ module CompletionKit
82
83
  self
83
84
  end
84
85
 
85
- def revert!
86
- raise ArgumentError, "only a published version can be reverted to" unless published?
87
- audit = nil
88
- MetricVersion.transaction do
89
- audit = self.class.create!(
90
- metric: metric,
91
- instruction: instruction,
92
- rubric_bands: rubric_bands,
93
- state: "draft",
94
- source: "revert"
95
- )
96
- audit.publish!
97
- end
98
- audit
99
- end
100
-
101
86
  def as_json(options = {})
102
87
  {
103
88
  id: id,
@@ -8,6 +8,7 @@ module CompletionKit
8
8
  has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
9
9
 
10
10
  validates :metric_name, presence: true
11
+ validates :metric_version, presence: true
11
12
  validates :ai_score, numericality: { greater_than_or_equal_to: 1, less_than_or_equal_to: 5 }, allow_nil: true
12
13
 
13
14
  before_validation :set_default_status
@@ -1,5 +1,5 @@
1
1
  module CompletionKit
2
- module CalibrationMath
2
+ module AgreementMath
3
3
  Z_95 = 1.959963984540054
4
4
 
5
5
  module_function
@@ -35,7 +35,7 @@ module CompletionKit
35
35
  McpTools::MetricVersions.definitions +
36
36
  McpTools::ProviderCredentials.definitions +
37
37
  McpTools::Tags.definitions +
38
- McpTools::Calibrations.definitions +
38
+ McpTools::Agreements.definitions +
39
39
  McpTools::Judges.definitions
40
40
  end
41
41
 
@@ -50,7 +50,7 @@ module CompletionKit
50
50
  when /\Ametrics_/ then McpTools::Metrics.call(name, arguments)
51
51
  when /\Aprovider_credentials_/ then McpTools::ProviderCredentials.call(name, arguments)
52
52
  when /\Atags_/ then McpTools::Tags.call(name, arguments)
53
- when /\Acalibrations_/ then McpTools::Calibrations.call(name, arguments)
53
+ when /\Aagreements_/ then McpTools::Agreements.call(name, arguments)
54
54
  when /\Ajudges_/ then McpTools::Judges.call(name, arguments)
55
55
  else raise MethodNotFound, "Unknown tool: #{name}"
56
56
  end
@@ -1,11 +1,11 @@
1
1
  module CompletionKit
2
2
  module McpTools
3
- module Calibrations
3
+ module Agreements
4
4
  extend Base
5
5
 
6
6
  TOOLS = {
7
- "calibrations_list" => {
8
- description: "List calibrations. Filter by run_id, response_id, metric_id, or created_by.",
7
+ "agreements_list" => {
8
+ description: "List agreements. Filter by run_id, response_id, metric_id, or created_by.",
9
9
  inputSchema: {
10
10
  type: "object",
11
11
  properties: {
@@ -18,8 +18,8 @@ module CompletionKit
18
18
  },
19
19
  handler: :list
20
20
  },
21
- "calibrations_create" => {
22
- description: "Upsert a calibration for (run, response, metric, created_by). Verdict is one of agree, disagree, borderline. corrected_score (1..5) is required when verdict is 'disagree'.",
21
+ "agreements_create" => {
22
+ description: "Upsert an agreement for (run, response, metric, created_by). Verdict is one of agree, disagree, borderline. corrected_score (1..5) is required when verdict is 'disagree'.",
23
23
  inputSchema: {
24
24
  type: "object",
25
25
  properties: {
@@ -38,7 +38,7 @@ module CompletionKit
38
38
  }.freeze
39
39
 
40
40
  def self.list(args)
41
- scope = CompletionKit::Calibration.all
41
+ scope = CompletionKit::Agreement.all
42
42
  scope = scope.where(run_id: args["run_id"]) if args["run_id"]
43
43
  scope = scope.where(response_id: args["response_id"]) if args["response_id"]
44
44
  scope = scope.where(metric_id: args["metric_id"]) if args["metric_id"]
@@ -52,20 +52,20 @@ module CompletionKit
52
52
  metric = CompletionKit::Metric.find(args["metric_id"])
53
53
  created_by = args["created_by"].presence || "mcp"
54
54
 
55
- calibration = CompletionKit::Calibration.find_or_initialize_by(
55
+ agreement = CompletionKit::Agreement.find_or_initialize_by(
56
56
  run_id: run.id, response_id: response.id, metric_id: metric.id, created_by: created_by
57
57
  )
58
- calibration.assign_attributes(
58
+ agreement.assign_attributes(
59
59
  metric_version: CompletionKit::MetricVersion.ensure_current_for(metric),
60
60
  verdict: args["verdict"],
61
61
  corrected_score: args["corrected_score"],
62
62
  note: args["note"]
63
63
  )
64
64
 
65
- if calibration.save
66
- text_result(calibration.as_json)
65
+ if agreement.save
66
+ text_result(agreement.as_json)
67
67
  else
68
- error_result(calibration.errors.full_messages.join(", "))
68
+ error_result(agreement.errors.full_messages.join(", "))
69
69
  end
70
70
  end
71
71
  end
@@ -33,7 +33,7 @@ module CompletionKit
33
33
  handler: :replay
34
34
  },
35
35
  "judges_compare" => {
36
- description: "Compare two metric versions' calibration stats side by side. Pass either two metric_version_ids or one metric_id with metric_version_a_id / metric_version_b_id.",
36
+ description: "Compare two metric versions' agreement stats side by side. Pass either two metric_version_ids or one metric_id with metric_version_a_id / metric_version_b_id.",
37
37
  inputSchema: {
38
38
  type: "object",
39
39
  properties: {
@@ -77,8 +77,8 @@ module CompletionKit
77
77
  metric = CompletionKit::Metric.find(args["metric_id"])
78
78
  a = CompletionKit::MetricVersion.where(metric_id: metric.id).find(args["metric_version_a_id"])
79
79
  b = CompletionKit::MetricVersion.where(metric_id: metric.id).find(args["metric_version_b_id"])
80
- stats_a = CompletionKit::MetricCalibrationStats.for(metric, metric_version: a)
81
- stats_b = CompletionKit::MetricCalibrationStats.for(metric, metric_version: b)
80
+ stats_a = CompletionKit::MetricAgreementStats.for(metric, metric_version: a)
81
+ stats_b = CompletionKit::MetricAgreementStats.for(metric, metric_version: b)
82
82
  text_result({
83
83
  metric_id: metric.id,
84
84
  a: metric_version_payload(a, stats_a),
@@ -47,13 +47,8 @@ module CompletionKit
47
47
 
48
48
  def self.publish(args)
49
49
  version = CompletionKit::MetricVersion.find(args["metric_version_id"])
50
- if version.published? && !version.current?
51
- audit = version.revert!
52
- text_result(audit.as_json)
53
- else
54
- version.publish!
55
- text_result(version.reload.as_json)
56
- end
50
+ version.publish!
51
+ text_result(version.reload.as_json)
57
52
  end
58
53
 
59
54
  def self.dismiss(args)
@@ -1,5 +1,5 @@
1
1
  module CompletionKit
2
- module MetricCalibrationExamples
2
+ module MetricAgreementExamples
3
3
  DEFAULT_JUDGE_EXAMPLE_LIMIT = 5
4
4
 
5
5
  module_function
@@ -9,18 +9,18 @@ module CompletionKit
9
9
  end
10
10
 
11
11
  def disagreements_for(metric, limit: 8)
12
- calibrations_for(metric, verdict: "disagree", limit: limit)
12
+ agreements_for(metric, verdict: "disagree", limit: limit)
13
13
  end
14
14
 
15
15
  def borderlines_for(metric, limit: 6)
16
- calibrations_for(metric, verdict: "borderline", limit: limit)
16
+ agreements_for(metric, verdict: "borderline", limit: limit)
17
17
  end
18
18
 
19
19
  def judge_examples_for(metric, exclude_response_id: nil, limit: DEFAULT_JUDGE_EXAMPLE_LIMIT)
20
20
  current_version = MetricVersion.current.find_by(metric_id: metric.id)
21
21
  return [] unless current_version
22
22
 
23
- relation = Calibration
23
+ relation = Agreement
24
24
  .where(metric_id: metric.id, metric_version_id: current_version.id, excluded_from_examples: false)
25
25
  .where.not(corrected_score: nil)
26
26
  relation = relation.where.not(response_id: exclude_response_id) if exclude_response_id
@@ -28,8 +28,8 @@ module CompletionKit
28
28
  .reject { |example| example[:judge_score].nil? }
29
29
  end
30
30
 
31
- def calibrations_for(metric, verdict:, limit:)
32
- base = Calibration.where(metric_id: metric.id, verdict: verdict)
31
+ def agreements_for(metric, verdict:, limit:)
32
+ base = Agreement.where(metric_id: metric.id, verdict: verdict)
33
33
  current_version = MetricVersion.current.find_by(metric_id: metric.id)
34
34
  scoped = current_version ? base.where(metric_version_id: current_version.id) : base
35
35
  effective = scoped.exists? ? scoped : base
@@ -1,5 +1,5 @@
1
1
  module CompletionKit
2
- class MetricCalibrationStats
2
+ class MetricAgreementStats
3
3
  PROVISIONAL_MIN = 10
4
4
  FIRM_MIN = 30
5
5
 
@@ -49,7 +49,7 @@ module CompletionKit
49
49
  end
50
50
 
51
51
  def call
52
- scope = Calibration.where(metric_id: @metric.id)
52
+ scope = Agreement.where(metric_id: @metric.id)
53
53
  if @metric_version
54
54
  scope = scope.where(metric_version_id: @metric_version.id)
55
55
  elsif !@all_versions
@@ -62,12 +62,12 @@ module CompletionKit
62
62
  disagrees = verdicts.count { |v, _, _| v == "disagree" }
63
63
  borderlines = verdicts.count { |v, _, _| v == "borderline" }
64
64
 
65
- ci = CalibrationMath.wilson_interval(successes: agrees, n: n)
65
+ ci = AgreementMath.wilson_interval(successes: agrees, n: n)
66
66
 
67
67
  pairs = score_pairs(verdicts)
68
- mae_value = CalibrationMath.mae(pairs)
69
- pearson_value = CalibrationMath.pearson(pairs)
70
- kappa_value = CalibrationMath.quadratic_weighted_kappa(pairs, categories: 1..5)
68
+ mae_value = AgreementMath.mae(pairs)
69
+ pearson_value = AgreementMath.pearson(pairs)
70
+ kappa_value = AgreementMath.quadratic_weighted_kappa(pairs, categories: 1..5)
71
71
 
72
72
  Result.new(
73
73
  sample_size: n,
@@ -0,0 +1,101 @@
1
module CompletionKit
  # Re-scores a sample of human-reviewed responses with a candidate metric
  # version and summarizes how the candidate's scores compare with the
  # positions recorded in those reviews.
  class MetricImprovementValidator
    # Maximum number of agreements pulled into the answer key.
    ANSWER_KEY_LIMIT = 30

    # @param metric the metric whose agreements form the answer key
    # @param candidate the metric version being evaluated (its instruction and
    #   rubric_bands are used when re-scoring)
    # @param scorer optional callable invoked as scorer.call(response, candidate)
    #   that returns a score; defaults to the built-in judge re-score.
    def initialize(metric, candidate, scorer: nil)
      @metric = metric
      @candidate = candidate
      @scorer = scorer || method(:rescore)
    end

    # Scores every answer-key entry with the candidate and returns the summary
    # hash built by #summarize. Entries whose scorer call raises or returns
    # nil are left out of "tested" rather than being counted against the
    # candidate.
    def call
      key = answer_key
      rows = key.filter_map do |entry|
        score = score_for(entry)
        # nil.to_i would be 0, which can never equal a position and would be
        # recorded as a "break"/"still_off" the candidate did not cause.
        classify(entry, score.to_i) unless score.nil?
      end
      summarize(rows, key.size, key_capped?)
    end

    private

    # Runs the scorer for one entry; a raised StandardError yields nil so the
    # remaining entries can still be scored (best-effort).
    def score_for(entry)
      @scorer.call(entry[:response], @candidate)
    rescue StandardError
      nil
    end

    # Builds the answer key from the newest ANSWER_KEY_LIMIT agree/disagree
    # agreements recorded against the metric's current version. Each entry's
    # :position is the corrected score for a "disagree" verdict, otherwise the
    # ai_score of the response's review for this metric. Entries without
    # response text or without a position are dropped.
    def answer_key
      current = MetricVersion.current.find_by(metric_id: @metric.id)
      return [] unless current

      base = Agreement.where(metric_id: @metric.id, metric_version_id: current.id, verdict: %w[agree disagree])
      # Remembered so #key_capped? can tell whether the limit cut anything off.
      @key_size_before_cap = base.count
      base.includes(response: :reviews)
          .order(created_at: :desc)
          .limit(ANSWER_KEY_LIMIT)
          .filter_map do |cal|
        response = cal.response
        next unless response.response_text.present?
        review = response.reviews.find { |r| r.metric_id == @metric.id }
        position = cal.verdict == "disagree" ? cal.corrected_score : review&.ai_score
        next if position.nil?
        { response: response, verdict: cal.verdict, position: position }
      end
    end

    # True when more matching agreements existed than ANSWER_KEY_LIMIT allows.
    def key_capped?
      @key_size_before_cap.to_i > ANSWER_KEY_LIMIT
    end

    # Labels one re-scored entry:
    #   disagree verdict: "fix" when the candidate matches the position, else "still_off"
    #   agree verdict:    "keep" when the candidate matches the position, else "break"
    def classify(entry, candidate_score)
      matched = candidate_score == entry[:position].to_i
      outcome = if entry[:verdict] == "disagree"
                  matched ? "fix" : "still_off"
                else
                  matched ? "keep" : "break"
                end
      {
        "response_id" => entry[:response].id,
        "verdict" => entry[:verdict],
        "position" => entry[:position].to_i,
        "candidate_score" => candidate_score,
        "outcome" => outcome
      }
    end

    # Aggregates classified rows into the stored summary. "before" is the
    # number of tested rows whose verdict was "agree"; "after" is the number of
    # tested rows the candidate matched (fixes + keeps).
    def summarize(rows, total, capped)
      outcomes = rows.map { |r| r["outcome"] }.tally
      fixes = outcomes.fetch("fix", 0)
      keeps = outcomes.fetch("keep", 0)
      {
        "total" => total,
        "tested" => rows.size,
        "capped" => capped,
        "fixes" => fixes,
        "keeps" => keeps,
        "breaks" => outcomes.fetch("break", 0),
        "still_off" => outcomes.fetch("still_off", 0),
        "before" => rows.count { |r| r["verdict"] == "agree" },
        "after" => fixes + keeps,
        "rows" => rows
      }
    end

    # Default scorer: evaluates the response with the judge model of the
    # response's run, using the candidate's instruction and rubric.
    def rescore(response, candidate)
      run = response.run
      config = ApiConfig.for_model(run.judge_model).merge(judge_model: run.judge_model)
      rubric_text = Metric.rubric_text_for(Metric.normalize_rubric_bands(candidate.rubric_bands))
      result = JudgeService.new(config).evaluate(
        response.response_text,
        response.expected_output,
        run.prompt&.template,
        criteria: candidate.instruction.to_s,
        rubric_text: rubric_text,
        input_data: response.input_data
      )
      result[:score]
    end
  end
end
@@ -41,8 +41,8 @@ module CompletionKit
41
41
  private
42
42
 
43
43
  def build_meta_prompt
44
- disagreements = MetricCalibrationExamples.disagreements_for(@metric)
45
- borderlines = MetricCalibrationExamples.borderlines_for(@metric)
44
+ disagreements = MetricAgreementExamples.disagreements_for(@metric)
45
+ borderlines = MetricAgreementExamples.borderlines_for(@metric)
46
46
  sections = []
47
47
  sections << "You are an expert evaluator. The judge below is misaligned with humans. Propose #{@count == 1 ? "a single" : "#{@count}"} concrete rewrite that closes the gap."
48
48
  sections << ""
@@ -1,34 +1,34 @@
1
- <div id="calibration_<%= response_row.id %>_<%= metric.id %>" class="ck-calibration">
2
- <% current_verdict = calibration&.verdict %>
1
+ <div id="agreement_<%= response_row.id %>_<%= metric.id %>" class="ck-agreement">
2
+ <% current_verdict = agreement&.verdict %>
3
3
  <% pending_verdict = local_assigns[:pending_verdict] %>
4
4
  <% active_verdict = pending_verdict || current_verdict %>
5
5
  <% error = local_assigns[:error] %>
6
6
  <% me = CompletionKit.config.username.presence || "operator" %>
7
- <% other_calibrations = CompletionKit::Calibration
7
+ <% other_agreements = CompletionKit::Agreement
8
8
  .where(response_id: response_row.id, metric_id: metric.id)
9
9
  .where.not(created_by: me)
10
10
  .order(created_at: :asc).to_a %>
11
11
  <% verdict_icons = { "agree" => "hand-thumb-up", "disagree" => "hand-thumb-down", "borderline" => "scale" } %>
12
- <p class="ck-calibration__prompt">
13
- <span class="ck-calibration__label">Your verdict</span>
14
- <% if other_calibrations.any? %>
15
- <span class="ck-calibration__meta"><%= pluralize(other_calibrations.size, "other verdict") %> on this score</span>
16
- <span class="ck-calibration__sep">·</span>
12
+ <p class="ck-agreement__prompt">
13
+ <span class="ck-agreement__label">Your verdict</span>
14
+ <% if other_agreements.any? %>
15
+ <span class="ck-agreement__meta"><%= pluralize(other_agreements.size, "other verdict") %> on this score</span>
16
+ <span class="ck-agreement__sep">·</span>
17
17
  <% end %>
18
- <%= link_to metric_path(metric), class: "ck-calibration__meta-link" do %><%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true" %>Calibration →<% end %>
18
+ <%= link_to metric_path(metric, anchor: "agreement"), class: "ck-agreement__meta-link" do %><%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true" %>Agreement →<% end %>
19
19
  </p>
20
- <div class="ck-calibration__buttons">
20
+ <div class="ck-agreement__buttons">
21
21
  <% verdict_icons = { "agree" => "hand-thumb-up", "disagree" => "hand-thumb-down", "borderline" => "scale" } %>
22
22
  <% verdict_hints = {
23
23
  "agree" => "The score looks right.",
24
24
  "disagree" => "The score is wrong — you'll pick the right one.",
25
25
  "borderline" => "The rubric is unclear here; either score could be defensible."
26
26
  } %>
27
- <% CompletionKit::Calibration::VERDICTS.each do |verdict| %>
28
- <%= button_to run_response_calibrations_path(run, response_row, metric_id: metric.id, verdict: verdict),
27
+ <% CompletionKit::Agreement::VERDICTS.each do |verdict| %>
28
+ <%= button_to run_response_agreements_path(run, response_row, metric_id: metric.id, verdict: verdict),
29
29
  method: :post,
30
30
  form: { data: { turbo: "true" } },
31
- class: "ck-calibration__pill ck-calibration__pill--#{verdict}#{' is-active' if verdict == active_verdict}",
31
+ class: "ck-agreement__pill ck-agreement__pill--#{verdict}#{' is-active' if verdict == active_verdict}",
32
32
  "aria-pressed": (verdict == active_verdict).to_s,
33
33
  title: verdict_hints[verdict] do %>
34
34
  <%= heroicon_tag verdict_icons[verdict], variant: :outline, size: 14, "aria-hidden": "true" %>
@@ -38,26 +38,26 @@
38
38
  </div>
39
39
 
40
40
  <% if error.present? %>
41
- <p class="ck-calibration__error" role="alert"><%= error %></p>
41
+ <p class="ck-agreement__error" role="alert"><%= error %></p>
42
42
  <% end %>
43
43
 
44
- <% if other_calibrations.any? %>
45
- <details class="ck-calibration__others">
46
- <summary class="ck-calibration__others-summary">
44
+ <% if other_agreements.any? %>
45
+ <details class="ck-agreement__others">
46
+ <summary class="ck-agreement__others-summary">
47
47
  <%= heroicon_tag "chevron-right", variant: :outline, size: 14, "aria-hidden": "true" %>
48
- <span>What others said (<%= other_calibrations.size %>)</span>
48
+ <span>What others said (<%= other_agreements.size %>)</span>
49
49
  </summary>
50
- <ul class="ck-calibration__others-list">
51
- <% other_calibrations.each do |other| %>
52
- <li class="ck-calibration__others-item ck-calibration__others-item--<%= other.verdict %>">
53
- <div class="ck-calibration__others-row">
54
- <span class="ck-calibration__others-verdict">
50
+ <ul class="ck-agreement__others-list">
51
+ <% other_agreements.each do |other| %>
52
+ <li class="ck-agreement__others-item ck-agreement__others-item--<%= other.verdict %>">
53
+ <div class="ck-agreement__others-row">
54
+ <span class="ck-agreement__others-verdict">
55
55
  <%= heroicon_tag verdict_icons[other.verdict], variant: :outline, size: 14, "aria-hidden": "true" %>
56
56
  <%= other.verdict %>
57
57
  </span>
58
- <span class="ck-calibration__others-by"><%= other.created_by %></span>
58
+ <span class="ck-agreement__others-by"><%= other.created_by %></span>
59
59
  <% if other.corrected_score %>
60
- <span class="ck-calibration__others-stars" aria-label="<%= pluralize(other.corrected_score.to_i, 'star') %>" title="<%= pluralize(other.corrected_score.to_i, 'star') %>">
60
+ <span class="ck-agreement__others-stars" aria-label="<%= pluralize(other.corrected_score.to_i, 'star') %>" title="<%= pluralize(other.corrected_score.to_i, 'star') %>">
61
61
  <% 5.times do |i| %>
62
62
  <svg viewBox="0 0 24 24" width="12" height="12" stroke-width="1.75" class="ck-star <%= i < other.corrected_score.to_i ? "ck-star--filled" : "ck-star--empty" %>" aria-hidden="true"><polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2"/></svg>
63
63
  <% end %>
@@ -65,7 +65,7 @@
65
65
  <% end %>
66
66
  </div>
67
67
  <% if other.note.to_s.present? %>
68
- <p class="ck-calibration__others-note">"<%= other.note.to_s.truncate(140) %>"</p>
68
+ <p class="ck-agreement__others-note">"<%= other.note.to_s.truncate(140) %>"</p>
69
69
  <% end %>
70
70
  </li>
71
71
  <% end %>
@@ -74,10 +74,10 @@
74
74
  <% end %>
75
75
 
76
76
  <% if active_verdict == "disagree" %>
77
- <% existing_score = (calibration&.corrected_score || review&.ai_score)&.round %>
78
- <%= form_with url: run_response_calibrations_path(run, response_row),
77
+ <% existing_score = (agreement&.corrected_score || review&.ai_score)&.round %>
78
+ <%= form_with url: run_response_agreements_path(run, response_row),
79
79
  method: :post, local: false,
80
- class: "ck-calibration__detail" do |f| %>
80
+ class: "ck-agreement__detail" do |f| %>
81
81
  <%= hidden_field_tag :metric_id, metric.id %>
82
82
  <%= hidden_field_tag :verdict, "disagree" %>
83
83
  <p class="ck-label">What should the score have been?</p>
@@ -93,16 +93,16 @@
93
93
  <% end %>
94
94
  </div>
95
95
  </fieldset>
96
- <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="Why? (optional)"><%= calibration&.note %></textarea>
96
+ <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="Why? (optional)"><%= agreement&.note %></textarea>
97
97
  <%= f.submit "Save", class: "#{ck_button_classes(:dark)}#{' ck-button--just-saved' if local_assigns[:just_saved]}" %>
98
98
  <% end %>
99
99
  <% elsif active_verdict == "borderline" %>
100
- <%= form_with url: run_response_calibrations_path(run, response_row),
100
+ <%= form_with url: run_response_agreements_path(run, response_row),
101
101
  method: :post, local: false,
102
- class: "ck-calibration__detail" do |f| %>
102
+ class: "ck-agreement__detail" do |f| %>
103
103
  <%= hidden_field_tag :metric_id, metric.id %>
104
104
  <%= hidden_field_tag :verdict, "borderline" %>
105
- <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="What made this borderline? (optional)"><%= calibration&.note %></textarea>
105
+ <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="What made this borderline? (optional)"><%= agreement&.note %></textarea>
106
106
  <%= f.submit "Save", class: "#{ck_button_classes(:dark)}#{' ck-button--just-saved' if local_assigns[:just_saved]}" %>
107
107
  <% end %>
108
108
  <% end %>
@@ -4,7 +4,7 @@
4
4
  <% current_metric_version = metric && CompletionKit::MetricVersion.current.find_by(metric_id: metric.id) %>
5
5
  <% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric && current_metric_version
6
6
  created_by = CompletionKit.config.username.presence || "operator"
7
- verdicted_ids = CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
7
+ verdicted_ids = CompletionKit::Agreement.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
8
8
  CompletionKit::Response.joins(:reviews)
9
9
  .where(reviews: { metric_id: metric.id, metric_version_id: current_metric_version.id })
10
10
  .where.not(reviews: { ai_score: nil })
@@ -12,29 +12,26 @@
12
12
  .order(created_at: :desc).first
13
13
  end %>
14
14
  <% prior_version_verdicts = if stats.sample_size.zero? && metric && current_metric_version
15
- CompletionKit::Calibration.where(metric_id: metric.id).where.not(metric_version_id: current_metric_version.id).count
15
+ CompletionKit::Agreement.where(metric_id: metric.id).where.not(metric_version_id: current_metric_version.id).count
16
16
  else
17
17
  0
18
18
  end %>
19
19
 
20
- <p class="ck-trust-line ck-trust-line--<%= stats.gate %>">
20
+ <p id="agreement" class="ck-trust-line ck-trust-line--<%= stats.gate %>">
21
21
  <% if stats.sample_size.zero? %>
22
22
  <span class="ck-trust-line__lead">Not measured yet.</span>
23
- <span class="ck-trust-line__hint"><%= current_metric_version ? "#{current_metric_version.version_label} needs" : "Needs" %> <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> human reviews of the judge's scores.</span>
23
+ <span class="ck-trust-line__hint"><%= current_metric_version ? "#{current_metric_version.version_label} needs" : "Needs" %> <%= CompletionKit::MetricAgreementStats::PROVISIONAL_MIN %> human reviews of the judge's scores.</span>
24
24
  <% if target_response %>
25
25
  <%= link_to "Review a judge's score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
26
26
  <% end %>
27
27
  <% elsif stats.counter_only? %>
28
- <span class="ck-cal-stat"><span class="ck-cal-stat__label">Human reviews</span> <strong><%= stats.sample_size %> / <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></strong></span>
28
+ <span class="ck-cal-stat"><span class="ck-cal-stat__label">Human reviews</span> <strong><%= stats.sample_size %> / <%= CompletionKit::MetricAgreementStats::PROVISIONAL_MIN %></strong></span>
29
29
  <% if stats.short_to_target > 0 %><span class="ck-trust-line__hint"><%= stats.short_to_target %> more to report a rate</span><% end %>
30
30
  <% if target_response %>
31
31
  <%= link_to "Review another score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
32
32
  <% end %>
33
33
  <% else %>
34
- <span class="ck-cal-stat"><span class="ck-cal-stat__label">Agreement</span> <strong class="ck-trust-line__figure">~<%= (stats.agreement_point * 100).round %>%</strong></span>
35
- <span class="ck-cal-stat"><span class="ck-cal-stat__label">Margin</span> ±<%= (stats.margin * 100).round %> pt</span>
36
- <span class="ck-cal-stat"><span class="ck-cal-stat__label">Read</span> <%= stats.firm? ? "settled" : "early" %></span>
37
- <span class="ck-cal-stat"><span class="ck-cal-stat__label">Sample</span> <%= stats.sample_size %></span>
34
+ <span class="ck-cal-stat"><span class="ck-cal-stat__label">Agrees with you</span> <strong class="ck-trust-line__figure">~<%= (stats.agreement_point * 100).round %>%</strong> of <%= stats.sample_size %> reviews</span>
38
35
  <% if stats.borderline_rate && stats.borderline_rate > 0 %>
39
36
  <% level = stats.borderline_rate > 0.30 ? "danger" : stats.borderline_rate > 0.15 ? "warning" : "ok" %>
40
37
  <span class="ck-cal-stat"><span class="ck-cal-stat__label">Unclear</span> <span class="ck-trust-line__borderline ck-trust-line__borderline--<%= level %>"><%= (stats.borderline_rate * 100).round %>%</span></span>