completion-kit 0.11.0 → 0.12.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. checksums.yaml +4 -4
  2. data/app/assets/javascripts/completion_kit/application.js +32 -0
  3. data/app/assets/stylesheets/completion_kit/application.css +51 -51
  4. data/app/controllers/completion_kit/{calibrations_controller.rb → agreements_controller.rb} +19 -19
  5. data/app/controllers/completion_kit/api/v1/{calibrations_controller.rb → agreements_controller.rb} +18 -18
  6. data/app/controllers/completion_kit/api/v1/metric_versions_controller.rb +2 -7
  7. data/app/controllers/completion_kit/api/v1/metrics_controller.rb +1 -1
  8. data/app/controllers/completion_kit/metrics_controller.rb +10 -11
  9. data/app/controllers/completion_kit/provider_credentials_controller.rb +7 -2
  10. data/app/jobs/completion_kit/judge_review_job.rb +2 -2
  11. data/app/jobs/completion_kit/model_discovery_job.rb +2 -2
  12. data/app/models/completion_kit/{calibration.rb → agreement.rb} +1 -1
  13. data/app/models/completion_kit/metric_version.rb +1 -17
  14. data/app/models/completion_kit/provider_credential.rb +22 -12
  15. data/app/models/completion_kit/review.rb +1 -0
  16. data/app/services/completion_kit/{calibration_math.rb → agreement_math.rb} +1 -1
  17. data/app/services/completion_kit/mcp_dispatcher.rb +2 -2
  18. data/app/services/completion_kit/mcp_tools/{calibrations.rb → agreements.rb} +11 -11
  19. data/app/services/completion_kit/mcp_tools/judges.rb +3 -3
  20. data/app/services/completion_kit/mcp_tools/metric_versions.rb +2 -7
  21. data/app/services/completion_kit/{metric_calibration_examples.rb → metric_agreement_examples.rb} +6 -6
  22. data/app/services/completion_kit/{metric_calibration_stats.rb → metric_agreement_stats.rb} +6 -6
  23. data/app/services/completion_kit/metric_improvement_validator.rb +1 -1
  24. data/app/services/completion_kit/metric_variant_generator.rb +2 -2
  25. data/app/services/completion_kit/model_discovery_service.rb +16 -3
  26. data/app/views/completion_kit/{calibrations → agreements}/_buttons.html.erb +33 -33
  27. data/app/views/completion_kit/{calibrations → agreements}/_trust_panel.html.erb +5 -5
  28. data/app/views/completion_kit/api_reference/_body.html.erb +15 -15
  29. data/app/views/completion_kit/metrics/_guiding_examples.html.erb +1 -1
  30. data/app/views/completion_kit/metrics/edit.html.erb +1 -1
  31. data/app/views/completion_kit/metrics/show.html.erb +6 -6
  32. data/app/views/completion_kit/provider_credentials/_models_card.html.erb +1 -1
  33. data/app/views/completion_kit/provider_credentials/statuses.turbo_stream.erb +4 -0
  34. data/app/views/completion_kit/responses/show.html.erb +4 -4
  35. data/app/views/completion_kit/runs/show.html.erb +1 -1
  36. data/config/routes.rb +4 -3
  37. data/db/migrate/20260531000002_backfill_review_metric_versions.rb +33 -0
  38. data/db/migrate/20260531000003_add_metric_version_fk_to_reviews.rb +6 -0
  39. data/db/migrate/20260531000004_rename_calibrations_to_agreements.rb +19 -0
  40. data/lib/completion_kit/version.rb +1 -1
  41. data/lib/completion_kit.rb +2 -2
  42. metadata +14 -10
@@ -3,7 +3,7 @@ module CompletionKit
3
3
  STATES = %w[draft published].freeze
4
4
 
5
5
  belongs_to :metric
6
- has_many :calibrations, dependent: :destroy
6
+ has_many :agreements, dependent: :destroy
7
7
 
8
8
  serialize :rubric_bands, coder: JSON
9
9
  serialize :validation_summary, coder: JSON
@@ -83,22 +83,6 @@ module CompletionKit
83
83
  self
84
84
  end
85
85
 
86
- def revert!
87
- raise ArgumentError, "only a published version can be reverted to" unless published?
88
- audit = nil
89
- MetricVersion.transaction do
90
- audit = self.class.create!(
91
- metric: metric,
92
- instruction: instruction,
93
- rubric_bands: rubric_bands,
94
- state: "draft",
95
- source: "revert"
96
- )
97
- audit.publish!
98
- end
99
- audit
100
- end
101
-
102
86
  def as_json(options = {})
103
87
  {
104
88
  id: id,
@@ -79,11 +79,13 @@ module CompletionKit
79
79
  end
80
80
 
81
81
  def broadcast_discovery_progress
82
- broadcast_replace_to(
83
- "completion_kit_provider_#{id}",
84
- target: "discovery_status_#{id}",
85
- html: render_partial("completion_kit/provider_credentials/discovery_status", provider_credential: self)
86
- )
82
+ safely_broadcast do
83
+ broadcast_replace_to(
84
+ "completion_kit_provider_#{id}",
85
+ target: "discovery_status_#{id}",
86
+ html: render_partial("completion_kit/provider_credentials/discovery_status", provider_credential: self)
87
+ )
88
+ end
87
89
  broadcast_provider_models
88
90
  end
89
91
 
@@ -93,13 +95,15 @@ module CompletionKit
93
95
  end
94
96
 
95
97
  def broadcast_provider_models
96
- Turbo::StreamsChannel.broadcast_action_to(
97
- "completion_kit_provider_#{id}",
98
- action: "replace",
99
- target: "provider_models_#{id}",
100
- method: "morph",
101
- html: render_partial("completion_kit/provider_credentials/models_card", provider_credential: self)
102
- )
98
+ safely_broadcast do
99
+ Turbo::StreamsChannel.broadcast_action_to(
100
+ "completion_kit_provider_#{id}",
101
+ action: "replace",
102
+ target: "provider_models_#{id}",
103
+ method: "morph",
104
+ html: render_partial("completion_kit/provider_credentials/models_card", provider_credential: self)
105
+ )
106
+ end
103
107
  end
104
108
 
105
109
  private
@@ -133,6 +137,12 @@ module CompletionKit
133
137
  CompletionKit::ApplicationController.render(partial: partial, locals: locals)
134
138
  end
135
139
 
140
+ def safely_broadcast
141
+ yield
142
+ rescue StandardError => e
143
+ Rails.logger.error("[CompletionKit] discovery broadcast render failed: #{e.class}: #{e.message}")
144
+ end
145
+
136
146
  def api_endpoint_not_internal
137
147
  return if api_endpoint.blank?
138
148
 
@@ -8,6 +8,7 @@ module CompletionKit
8
8
  has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
9
9
 
10
10
  validates :metric_name, presence: true
11
+ validates :metric_version, presence: true
11
12
  validates :ai_score, numericality: { greater_than_or_equal_to: 1, less_than_or_equal_to: 5 }, allow_nil: true
12
13
 
13
14
  before_validation :set_default_status
@@ -1,5 +1,5 @@
1
1
  module CompletionKit
2
- module CalibrationMath
2
+ module AgreementMath
3
3
  Z_95 = 1.959963984540054
4
4
 
5
5
  module_function
@@ -35,7 +35,7 @@ module CompletionKit
35
35
  McpTools::MetricVersions.definitions +
36
36
  McpTools::ProviderCredentials.definitions +
37
37
  McpTools::Tags.definitions +
38
- McpTools::Calibrations.definitions +
38
+ McpTools::Agreements.definitions +
39
39
  McpTools::Judges.definitions
40
40
  end
41
41
 
@@ -50,7 +50,7 @@ module CompletionKit
50
50
  when /\Ametrics_/ then McpTools::Metrics.call(name, arguments)
51
51
  when /\Aprovider_credentials_/ then McpTools::ProviderCredentials.call(name, arguments)
52
52
  when /\Atags_/ then McpTools::Tags.call(name, arguments)
53
- when /\Acalibrations_/ then McpTools::Calibrations.call(name, arguments)
53
+ when /\Aagreements_/ then McpTools::Agreements.call(name, arguments)
54
54
  when /\Ajudges_/ then McpTools::Judges.call(name, arguments)
55
55
  else raise MethodNotFound, "Unknown tool: #{name}"
56
56
  end
@@ -1,11 +1,11 @@
1
1
  module CompletionKit
2
2
  module McpTools
3
- module Calibrations
3
+ module Agreements
4
4
  extend Base
5
5
 
6
6
  TOOLS = {
7
- "calibrations_list" => {
8
- description: "List calibrations. Filter by run_id, response_id, metric_id, or created_by.",
7
+ "agreements_list" => {
8
+ description: "List agreements. Filter by run_id, response_id, metric_id, or created_by.",
9
9
  inputSchema: {
10
10
  type: "object",
11
11
  properties: {
@@ -18,8 +18,8 @@ module CompletionKit
18
18
  },
19
19
  handler: :list
20
20
  },
21
- "calibrations_create" => {
22
- description: "Upsert a calibration for (run, response, metric, created_by). Verdict is one of agree, disagree, borderline. corrected_score (1..5) is required when verdict is 'disagree'.",
21
+ "agreements_create" => {
22
+ description: "Upsert an agreement for (run, response, metric, created_by). Verdict is one of agree, disagree, borderline. corrected_score (1..5) is required when verdict is 'disagree'.",
23
23
  inputSchema: {
24
24
  type: "object",
25
25
  properties: {
@@ -38,7 +38,7 @@ module CompletionKit
38
38
  }.freeze
39
39
 
40
40
  def self.list(args)
41
- scope = CompletionKit::Calibration.all
41
+ scope = CompletionKit::Agreement.all
42
42
  scope = scope.where(run_id: args["run_id"]) if args["run_id"]
43
43
  scope = scope.where(response_id: args["response_id"]) if args["response_id"]
44
44
  scope = scope.where(metric_id: args["metric_id"]) if args["metric_id"]
@@ -52,20 +52,20 @@ module CompletionKit
52
52
  metric = CompletionKit::Metric.find(args["metric_id"])
53
53
  created_by = args["created_by"].presence || "mcp"
54
54
 
55
- calibration = CompletionKit::Calibration.find_or_initialize_by(
55
+ agreement = CompletionKit::Agreement.find_or_initialize_by(
56
56
  run_id: run.id, response_id: response.id, metric_id: metric.id, created_by: created_by
57
57
  )
58
- calibration.assign_attributes(
58
+ agreement.assign_attributes(
59
59
  metric_version: CompletionKit::MetricVersion.ensure_current_for(metric),
60
60
  verdict: args["verdict"],
61
61
  corrected_score: args["corrected_score"],
62
62
  note: args["note"]
63
63
  )
64
64
 
65
- if calibration.save
66
- text_result(calibration.as_json)
65
+ if agreement.save
66
+ text_result(agreement.as_json)
67
67
  else
68
- error_result(calibration.errors.full_messages.join(", "))
68
+ error_result(agreement.errors.full_messages.join(", "))
69
69
  end
70
70
  end
71
71
  end
@@ -33,7 +33,7 @@ module CompletionKit
33
33
  handler: :replay
34
34
  },
35
35
  "judges_compare" => {
36
- description: "Compare two metric versions' calibration stats side by side. Pass either two metric_version_ids or one metric_id with metric_version_a_id / metric_version_b_id.",
36
+ description: "Compare two metric versions' agreement stats side by side. Pass either two metric_version_ids or one metric_id with metric_version_a_id / metric_version_b_id.",
37
37
  inputSchema: {
38
38
  type: "object",
39
39
  properties: {
@@ -77,8 +77,8 @@ module CompletionKit
77
77
  metric = CompletionKit::Metric.find(args["metric_id"])
78
78
  a = CompletionKit::MetricVersion.where(metric_id: metric.id).find(args["metric_version_a_id"])
79
79
  b = CompletionKit::MetricVersion.where(metric_id: metric.id).find(args["metric_version_b_id"])
80
- stats_a = CompletionKit::MetricCalibrationStats.for(metric, metric_version: a)
81
- stats_b = CompletionKit::MetricCalibrationStats.for(metric, metric_version: b)
80
+ stats_a = CompletionKit::MetricAgreementStats.for(metric, metric_version: a)
81
+ stats_b = CompletionKit::MetricAgreementStats.for(metric, metric_version: b)
82
82
  text_result({
83
83
  metric_id: metric.id,
84
84
  a: metric_version_payload(a, stats_a),
@@ -47,13 +47,8 @@ module CompletionKit
47
47
 
48
48
  def self.publish(args)
49
49
  version = CompletionKit::MetricVersion.find(args["metric_version_id"])
50
- if version.published? && !version.current?
51
- audit = version.revert!
52
- text_result(audit.as_json)
53
- else
54
- version.publish!
55
- text_result(version.reload.as_json)
56
- end
50
+ version.publish!
51
+ text_result(version.reload.as_json)
57
52
  end
58
53
 
59
54
  def self.dismiss(args)
@@ -1,5 +1,5 @@
1
1
  module CompletionKit
2
- module MetricCalibrationExamples
2
+ module MetricAgreementExamples
3
3
  DEFAULT_JUDGE_EXAMPLE_LIMIT = 5
4
4
 
5
5
  module_function
@@ -9,18 +9,18 @@ module CompletionKit
9
9
  end
10
10
 
11
11
  def disagreements_for(metric, limit: 8)
12
- calibrations_for(metric, verdict: "disagree", limit: limit)
12
+ agreements_for(metric, verdict: "disagree", limit: limit)
13
13
  end
14
14
 
15
15
  def borderlines_for(metric, limit: 6)
16
- calibrations_for(metric, verdict: "borderline", limit: limit)
16
+ agreements_for(metric, verdict: "borderline", limit: limit)
17
17
  end
18
18
 
19
19
  def judge_examples_for(metric, exclude_response_id: nil, limit: DEFAULT_JUDGE_EXAMPLE_LIMIT)
20
20
  current_version = MetricVersion.current.find_by(metric_id: metric.id)
21
21
  return [] unless current_version
22
22
 
23
- relation = Calibration
23
+ relation = Agreement
24
24
  .where(metric_id: metric.id, metric_version_id: current_version.id, excluded_from_examples: false)
25
25
  .where.not(corrected_score: nil)
26
26
  relation = relation.where.not(response_id: exclude_response_id) if exclude_response_id
@@ -28,8 +28,8 @@ module CompletionKit
28
28
  .reject { |example| example[:judge_score].nil? }
29
29
  end
30
30
 
31
- def calibrations_for(metric, verdict:, limit:)
32
- base = Calibration.where(metric_id: metric.id, verdict: verdict)
31
+ def agreements_for(metric, verdict:, limit:)
32
+ base = Agreement.where(metric_id: metric.id, verdict: verdict)
33
33
  current_version = MetricVersion.current.find_by(metric_id: metric.id)
34
34
  scoped = current_version ? base.where(metric_version_id: current_version.id) : base
35
35
  effective = scoped.exists? ? scoped : base
@@ -1,5 +1,5 @@
1
1
  module CompletionKit
2
- class MetricCalibrationStats
2
+ class MetricAgreementStats
3
3
  PROVISIONAL_MIN = 10
4
4
  FIRM_MIN = 30
5
5
 
@@ -49,7 +49,7 @@ module CompletionKit
49
49
  end
50
50
 
51
51
  def call
52
- scope = Calibration.where(metric_id: @metric.id)
52
+ scope = Agreement.where(metric_id: @metric.id)
53
53
  if @metric_version
54
54
  scope = scope.where(metric_version_id: @metric_version.id)
55
55
  elsif !@all_versions
@@ -62,12 +62,12 @@ module CompletionKit
62
62
  disagrees = verdicts.count { |v, _, _| v == "disagree" }
63
63
  borderlines = verdicts.count { |v, _, _| v == "borderline" }
64
64
 
65
- ci = CalibrationMath.wilson_interval(successes: agrees, n: n)
65
+ ci = AgreementMath.wilson_interval(successes: agrees, n: n)
66
66
 
67
67
  pairs = score_pairs(verdicts)
68
- mae_value = CalibrationMath.mae(pairs)
69
- pearson_value = CalibrationMath.pearson(pairs)
70
- kappa_value = CalibrationMath.quadratic_weighted_kappa(pairs, categories: 1..5)
68
+ mae_value = AgreementMath.mae(pairs)
69
+ pearson_value = AgreementMath.pearson(pairs)
70
+ kappa_value = AgreementMath.quadratic_weighted_kappa(pairs, categories: 1..5)
71
71
 
72
72
  Result.new(
73
73
  sample_size: n,
@@ -28,7 +28,7 @@ module CompletionKit
28
28
  current = MetricVersion.current.find_by(metric_id: @metric.id)
29
29
  return [] unless current
30
30
 
31
- base = Calibration.where(metric_id: @metric.id, metric_version_id: current.id, verdict: %w[agree disagree])
31
+ base = Agreement.where(metric_id: @metric.id, metric_version_id: current.id, verdict: %w[agree disagree])
32
32
  @key_size_before_cap = base.count
33
33
  base.includes(response: :reviews)
34
34
  .order(created_at: :desc)
@@ -41,8 +41,8 @@ module CompletionKit
41
41
  private
42
42
 
43
43
  def build_meta_prompt
44
- disagreements = MetricCalibrationExamples.disagreements_for(@metric)
45
- borderlines = MetricCalibrationExamples.borderlines_for(@metric)
44
+ disagreements = MetricAgreementExamples.disagreements_for(@metric)
45
+ borderlines = MetricAgreementExamples.borderlines_for(@metric)
46
46
  sections = []
47
47
  sections << "You are an expert evaluator. The judge below is misaligned with humans. Propose #{@count == 1 ? "a single" : "#{@count}"} concrete rewrite that closes the gap."
48
48
  sections << ""
@@ -12,7 +12,7 @@ module CompletionKit
12
12
  @api_endpoint = config[:api_endpoint]
13
13
  end
14
14
 
15
- def refresh!(&on_progress)
15
+ def refresh!(force: false, &on_progress)
16
16
  discovered = fetch_models
17
17
  reconcile(discovered)
18
18
  # OpenRouter publishes capability metadata (output modalities, etc.), so we
@@ -20,6 +20,7 @@ module CompletionKit
20
20
  # Judging stays unknown ("?") until a real run proves it.
21
21
  return if @provider == "openrouter"
22
22
 
23
+ reset_failed_generation if force
23
24
  probe_new_models(&on_progress)
24
25
  end
25
26
 
@@ -181,6 +182,11 @@ module CompletionKit
181
182
  end
182
183
  end
183
184
 
185
+ def reset_failed_generation
186
+ Model.where(provider: @provider, status: %w[active failed], supports_generation: false)
187
+ .update_all(supports_generation: nil, generation_error: nil)
188
+ end
189
+
184
190
  def probe_new_models(&on_progress)
185
191
  candidates = Model.where(provider: @provider, status: %w[active failed])
186
192
  .where("supports_generation IS NULL OR supports_judging IS NULL OR (generation_error IS NOT NULL AND #{retryable_error_sql('generation_error')}) OR (judging_error IS NOT NULL AND #{retryable_error_sql('judging_error')})")
@@ -220,7 +226,7 @@ module CompletionKit
220
226
 
221
227
  def probe_generation(model)
222
228
  probe_input = "Reply with exactly this token and nothing else: PING-OK"
223
- response = send_probe(model.model_id, probe_input, 65536)
229
+ response = send_probe(model.model_id, probe_input, probe_max_output_tokens)
224
230
  if response.success?
225
231
  text = extract_text(response).to_s
226
232
  if text.blank?
@@ -251,7 +257,7 @@ module CompletionKit
251
257
  AI output to evaluate: The sky is blue.
252
258
  PROMPT
253
259
 
254
- response = send_probe(model.model_id, judge_input, 65536)
260
+ response = send_probe(model.model_id, judge_input, probe_max_output_tokens)
255
261
  if response.success?
256
262
  text = extract_text(response).to_s
257
263
  if text.match?(/Score:\s*\d/i)
@@ -269,6 +275,13 @@ module CompletionKit
269
275
  model.judging_error = e.message
270
276
  end
271
277
 
278
+ OPENAI_REASONING_PROBE_BUDGET = 65_536
279
+ CHAT_PROBE_BUDGET = 1_024
280
+
281
+ def probe_max_output_tokens
282
+ @provider == "openai" ? OPENAI_REASONING_PROBE_BUDGET : CHAT_PROBE_BUDGET
283
+ end
284
+
272
285
  def send_probe(model_id, input, max_tokens)
273
286
  case @provider
274
287
  when "openai" then openai_probe(model_id, input, max_tokens)
@@ -1,34 +1,34 @@
1
- <div id="calibration_<%= response_row.id %>_<%= metric.id %>" class="ck-calibration">
2
- <% current_verdict = calibration&.verdict %>
1
+ <div id="agreement_<%= response_row.id %>_<%= metric.id %>" class="ck-agreement">
2
+ <% current_verdict = agreement&.verdict %>
3
3
  <% pending_verdict = local_assigns[:pending_verdict] %>
4
4
  <% active_verdict = pending_verdict || current_verdict %>
5
5
  <% error = local_assigns[:error] %>
6
6
  <% me = CompletionKit.config.username.presence || "operator" %>
7
- <% other_calibrations = CompletionKit::Calibration
7
+ <% other_agreements = CompletionKit::Agreement
8
8
  .where(response_id: response_row.id, metric_id: metric.id)
9
9
  .where.not(created_by: me)
10
10
  .order(created_at: :asc).to_a %>
11
11
  <% verdict_icons = { "agree" => "hand-thumb-up", "disagree" => "hand-thumb-down", "borderline" => "scale" } %>
12
- <p class="ck-calibration__prompt">
13
- <span class="ck-calibration__label">Your verdict</span>
14
- <% if other_calibrations.any? %>
15
- <span class="ck-calibration__meta"><%= pluralize(other_calibrations.size, "other verdict") %> on this score</span>
16
- <span class="ck-calibration__sep">·</span>
12
+ <p class="ck-agreement__prompt">
13
+ <span class="ck-agreement__label">Your verdict</span>
14
+ <% if other_agreements.any? %>
15
+ <span class="ck-agreement__meta"><%= pluralize(other_agreements.size, "other verdict") %> on this score</span>
16
+ <span class="ck-agreement__sep">·</span>
17
17
  <% end %>
18
- <%= link_to metric_path(metric), class: "ck-calibration__meta-link" do %><%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true" %>Calibration →<% end %>
18
+ <%= link_to metric_path(metric, anchor: "agreement"), class: "ck-agreement__meta-link" do %><%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true" %>Agreement →<% end %>
19
19
  </p>
20
- <div class="ck-calibration__buttons">
20
+ <div class="ck-agreement__buttons">
21
21
  <% verdict_icons = { "agree" => "hand-thumb-up", "disagree" => "hand-thumb-down", "borderline" => "scale" } %>
22
22
  <% verdict_hints = {
23
23
  "agree" => "The score looks right.",
24
24
  "disagree" => "The score is wrong — you'll pick the right one.",
25
25
  "borderline" => "The rubric is unclear here; either score could be defensible."
26
26
  } %>
27
- <% CompletionKit::Calibration::VERDICTS.each do |verdict| %>
28
- <%= button_to run_response_calibrations_path(run, response_row, metric_id: metric.id, verdict: verdict),
27
+ <% CompletionKit::Agreement::VERDICTS.each do |verdict| %>
28
+ <%= button_to run_response_agreements_path(run, response_row, metric_id: metric.id, verdict: verdict),
29
29
  method: :post,
30
30
  form: { data: { turbo: "true" } },
31
- class: "ck-calibration__pill ck-calibration__pill--#{verdict}#{' is-active' if verdict == active_verdict}",
31
+ class: "ck-agreement__pill ck-agreement__pill--#{verdict}#{' is-active' if verdict == active_verdict}",
32
32
  "aria-pressed": (verdict == active_verdict).to_s,
33
33
  title: verdict_hints[verdict] do %>
34
34
  <%= heroicon_tag verdict_icons[verdict], variant: :outline, size: 14, "aria-hidden": "true" %>
@@ -38,26 +38,26 @@
38
38
  </div>
39
39
 
40
40
  <% if error.present? %>
41
- <p class="ck-calibration__error" role="alert"><%= error %></p>
41
+ <p class="ck-agreement__error" role="alert"><%= error %></p>
42
42
  <% end %>
43
43
 
44
- <% if other_calibrations.any? %>
45
- <details class="ck-calibration__others">
46
- <summary class="ck-calibration__others-summary">
44
+ <% if other_agreements.any? %>
45
+ <details class="ck-agreement__others">
46
+ <summary class="ck-agreement__others-summary">
47
47
  <%= heroicon_tag "chevron-right", variant: :outline, size: 14, "aria-hidden": "true" %>
48
- <span>What others said (<%= other_calibrations.size %>)</span>
48
+ <span>What others said (<%= other_agreements.size %>)</span>
49
49
  </summary>
50
- <ul class="ck-calibration__others-list">
51
- <% other_calibrations.each do |other| %>
52
- <li class="ck-calibration__others-item ck-calibration__others-item--<%= other.verdict %>">
53
- <div class="ck-calibration__others-row">
54
- <span class="ck-calibration__others-verdict">
50
+ <ul class="ck-agreement__others-list">
51
+ <% other_agreements.each do |other| %>
52
+ <li class="ck-agreement__others-item ck-agreement__others-item--<%= other.verdict %>">
53
+ <div class="ck-agreement__others-row">
54
+ <span class="ck-agreement__others-verdict">
55
55
  <%= heroicon_tag verdict_icons[other.verdict], variant: :outline, size: 14, "aria-hidden": "true" %>
56
56
  <%= other.verdict %>
57
57
  </span>
58
- <span class="ck-calibration__others-by"><%= other.created_by %></span>
58
+ <span class="ck-agreement__others-by"><%= other.created_by %></span>
59
59
  <% if other.corrected_score %>
60
- <span class="ck-calibration__others-stars" aria-label="<%= pluralize(other.corrected_score.to_i, 'star') %>" title="<%= pluralize(other.corrected_score.to_i, 'star') %>">
60
+ <span class="ck-agreement__others-stars" aria-label="<%= pluralize(other.corrected_score.to_i, 'star') %>" title="<%= pluralize(other.corrected_score.to_i, 'star') %>">
61
61
  <% 5.times do |i| %>
62
62
  <svg viewBox="0 0 24 24" width="12" height="12" stroke-width="1.75" class="ck-star <%= i < other.corrected_score.to_i ? "ck-star--filled" : "ck-star--empty" %>" aria-hidden="true"><polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2"/></svg>
63
63
  <% end %>
@@ -65,7 +65,7 @@
65
65
  <% end %>
66
66
  </div>
67
67
  <% if other.note.to_s.present? %>
68
- <p class="ck-calibration__others-note">"<%= other.note.to_s.truncate(140) %>"</p>
68
+ <p class="ck-agreement__others-note">"<%= other.note.to_s.truncate(140) %>"</p>
69
69
  <% end %>
70
70
  </li>
71
71
  <% end %>
@@ -74,10 +74,10 @@
74
74
  <% end %>
75
75
 
76
76
  <% if active_verdict == "disagree" %>
77
- <% existing_score = (calibration&.corrected_score || review&.ai_score)&.round %>
78
- <%= form_with url: run_response_calibrations_path(run, response_row),
77
+ <% existing_score = (agreement&.corrected_score || review&.ai_score)&.round %>
78
+ <%= form_with url: run_response_agreements_path(run, response_row),
79
79
  method: :post, local: false,
80
- class: "ck-calibration__detail" do |f| %>
80
+ class: "ck-agreement__detail" do |f| %>
81
81
  <%= hidden_field_tag :metric_id, metric.id %>
82
82
  <%= hidden_field_tag :verdict, "disagree" %>
83
83
  <p class="ck-label">What should the score have been?</p>
@@ -93,16 +93,16 @@
93
93
  <% end %>
94
94
  </div>
95
95
  </fieldset>
96
- <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="Why? (optional)"><%= calibration&.note %></textarea>
96
+ <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="Why? (optional)"><%= agreement&.note %></textarea>
97
97
  <%= f.submit "Save", class: "#{ck_button_classes(:dark)}#{' ck-button--just-saved' if local_assigns[:just_saved]}" %>
98
98
  <% end %>
99
99
  <% elsif active_verdict == "borderline" %>
100
- <%= form_with url: run_response_calibrations_path(run, response_row),
100
+ <%= form_with url: run_response_agreements_path(run, response_row),
101
101
  method: :post, local: false,
102
- class: "ck-calibration__detail" do |f| %>
102
+ class: "ck-agreement__detail" do |f| %>
103
103
  <%= hidden_field_tag :metric_id, metric.id %>
104
104
  <%= hidden_field_tag :verdict, "borderline" %>
105
- <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="What made this borderline? (optional)"><%= calibration&.note %></textarea>
105
+ <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="What made this borderline? (optional)"><%= agreement&.note %></textarea>
106
106
  <%= f.submit "Save", class: "#{ck_button_classes(:dark)}#{' ck-button--just-saved' if local_assigns[:just_saved]}" %>
107
107
  <% end %>
108
108
  <% end %>
@@ -4,7 +4,7 @@
4
4
  <% current_metric_version = metric && CompletionKit::MetricVersion.current.find_by(metric_id: metric.id) %>
5
5
  <% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric && current_metric_version
6
6
  created_by = CompletionKit.config.username.presence || "operator"
7
- verdicted_ids = CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
7
+ verdicted_ids = CompletionKit::Agreement.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
8
8
  CompletionKit::Response.joins(:reviews)
9
9
  .where(reviews: { metric_id: metric.id, metric_version_id: current_metric_version.id })
10
10
  .where.not(reviews: { ai_score: nil })
@@ -12,20 +12,20 @@
12
12
  .order(created_at: :desc).first
13
13
  end %>
14
14
  <% prior_version_verdicts = if stats.sample_size.zero? && metric && current_metric_version
15
- CompletionKit::Calibration.where(metric_id: metric.id).where.not(metric_version_id: current_metric_version.id).count
15
+ CompletionKit::Agreement.where(metric_id: metric.id).where.not(metric_version_id: current_metric_version.id).count
16
16
  else
17
17
  0
18
18
  end %>
19
19
 
20
- <p class="ck-trust-line ck-trust-line--<%= stats.gate %>">
20
+ <p id="agreement" class="ck-trust-line ck-trust-line--<%= stats.gate %>">
21
21
  <% if stats.sample_size.zero? %>
22
22
  <span class="ck-trust-line__lead">Not measured yet.</span>
23
- <span class="ck-trust-line__hint"><%= current_metric_version ? "#{current_metric_version.version_label} needs" : "Needs" %> <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> human reviews of the judge's scores.</span>
23
+ <span class="ck-trust-line__hint"><%= current_metric_version ? "#{current_metric_version.version_label} needs" : "Needs" %> <%= CompletionKit::MetricAgreementStats::PROVISIONAL_MIN %> human reviews of the judge's scores.</span>
24
24
  <% if target_response %>
25
25
  <%= link_to "Review a judge's score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
26
26
  <% end %>
27
27
  <% elsif stats.counter_only? %>
28
- <span class="ck-cal-stat"><span class="ck-cal-stat__label">Human reviews</span> <strong><%= stats.sample_size %> / <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></strong></span>
28
+ <span class="ck-cal-stat"><span class="ck-cal-stat__label">Human reviews</span> <strong><%= stats.sample_size %> / <%= CompletionKit::MetricAgreementStats::PROVISIONAL_MIN %></strong></span>
29
29
  <% if stats.short_to_target > 0 %><span class="ck-trust-line__hint"><%= stats.short_to_target %> more to report a rate</span><% end %>
30
30
  <% if target_response %>
31
31
  <%= link_to "Review another score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
@@ -17,7 +17,7 @@
17
17
  <input type="radio" name="ck-api-tab" id="ck-tab-datasets" class="ck-api-tabs__radio">
18
18
  <input type="radio" name="ck-api-tab" id="ck-tab-metrics" class="ck-api-tabs__radio">
19
19
  <input type="radio" name="ck-api-tab" id="ck-tab-metric-groups" class="ck-api-tabs__radio">
20
- <input type="radio" name="ck-api-tab" id="ck-tab-calibrations" class="ck-api-tabs__radio">
20
+ <input type="radio" name="ck-api-tab" id="ck-tab-agreements" class="ck-api-tabs__radio">
21
21
  <input type="radio" name="ck-api-tab" id="ck-tab-tags" class="ck-api-tabs__radio">
22
22
  <input type="radio" name="ck-api-tab" id="ck-tab-providers" class="ck-api-tabs__radio">
23
23
 
@@ -29,7 +29,7 @@
29
29
  <label for="ck-tab-datasets" class="ck-api-tabs__label">Datasets <span class="ck-api-tabs__count">5</span></label>
30
30
  <label for="ck-tab-metrics" class="ck-api-tabs__label">Metrics <span class="ck-api-tabs__count">10</span></label>
31
31
  <label for="ck-tab-metric-groups" class="ck-api-tabs__label">Metric Groups <span class="ck-api-tabs__count">5</span></label>
32
- <label for="ck-tab-calibrations" class="ck-api-tabs__label">Calibrations <span class="ck-api-tabs__count">3</span></label>
32
+ <label for="ck-tab-agreements" class="ck-api-tabs__label">Agreements <span class="ck-api-tabs__count">3</span></label>
33
33
  <label for="ck-tab-tags" class="ck-api-tabs__label">Tags <span class="ck-api-tabs__count">5</span></label>
34
34
  <label for="ck-tab-providers" class="ck-api-tabs__label">Providers <span class="ck-api-tabs__count">5</span></label>
35
35
  </nav>
@@ -238,8 +238,8 @@
238
238
  } %>
239
239
 
240
240
  <div class="ck-api-endpoint" style="padding-top: 1.5rem;">
241
- <p class="ck-kicker" style="margin-bottom: 0.5rem;">Calibration loop</p>
242
- <p class="ck-meta-copy">Drive metric improvement from disagree-flagged calibrations: ask the model to rewrite the instruction and rubric into a new draft version.</p>
241
+ <p class="ck-kicker" style="margin-bottom: 0.5rem;">Agreement loop</p>
242
+ <p class="ck-meta-copy">Drive metric improvement from disagree-flagged agreements: ask the model to rewrite the instruction and rubric into a new draft version.</p>
243
243
  </div>
244
244
  <div class="ck-api-endpoint">
245
245
  <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/metrics/:id/suggest_variants</p>
@@ -250,7 +250,7 @@
250
250
 
251
251
  <div class="ck-api-endpoint" style="padding-top: 1.5rem;">
252
252
  <p class="ck-kicker" style="margin-bottom: 0.5rem;">Metric versions</p>
253
- <p class="ck-meta-copy">Every metric carries a history of versions (the current published one, prior published ones, and any draft suggestions). Reviews and calibrations record the version they ran against, so the API can surface stale state and let you revert.</p>
253
+ <p class="ck-meta-copy">Every metric carries a history of versions (the current published one, prior published ones, and any draft suggestions). Reviews and agreements record the version they ran against, so the API can surface stale state and let you revert.</p>
254
254
  </div>
255
255
  <div class="ck-api-endpoint">
256
256
  <p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> /api/v1/metrics/:metric_id/metric_versions</p>
@@ -294,23 +294,23 @@
294
294
  </div>
295
295
 
296
296
  <div class="ck-api-tabs__panel">
297
- <h2 class="ck-section-title">Calibrations</h2>
298
- <p class="ck-copy">Per-verdict feedback events on a response/metric pair: agree, disagree (with a corrected score and note), or borderline. Calibrations capture the metric version that was current when the verdict was cast, which is what drives the trust signal and the "stale" indicators across the rest of the API.</p>
297
+ <h2 class="ck-section-title">Agreements</h2>
298
+ <p class="ck-copy">Per-verdict feedback events on a response/metric pair: agree, disagree (with a corrected score and note), or borderline. Agreements capture the metric version that was current when the verdict was cast, which is what drives the trust signal and the "stale" indicators across the rest of the API.</p>
299
299
  <div class="ck-api-endpoint">
300
- <p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> /api/v1/calibrations</p>
301
- <p class="ck-meta-copy">List calibrations across all runs. Supports filtering by any combination of the query params below.</p>
300
+ <p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> /api/v1/agreements</p>
301
+ <p class="ck-meta-copy">List agreements across all runs. Supports filtering by any combination of the query params below.</p>
302
302
  <p class="ck-api-params"><strong>Optional filters:</strong>&ensp;<code>run_id</code>, <code>response_id</code>, <code>metric_id</code>, <code>metric_version_id</code>, <code>created_by</code>, <code>verdict</code> (<code>agree</code>, <code>disagree</code>, or <code>borderline</code>)</p>
303
- <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl \"#{base_url}/api/v1/calibrations?metric_id=1&verdict=disagree\" \\\n -H \"Authorization: Bearer #{token}\"" %>
303
+ <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl \"#{base_url}/api/v1/agreements?metric_id=1&verdict=disagree\" \\\n -H \"Authorization: Bearer #{token}\"" %>
304
304
  </div>
305
305
  <div class="ck-api-endpoint">
306
- <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/runs/:run_id/responses/:response_id/metrics/:metric_id/calibrations</p>
307
- <p class="ck-meta-copy">Cast a calibration on a specific response/metric pair. The metric version on the record is set automatically from the run's review.</p>
306
+ <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/runs/:run_id/responses/:response_id/metrics/:metric_id/agreements</p>
307
+ <p class="ck-meta-copy">Cast an agreement on a specific response/metric pair. The metric version on the record is set automatically from the run's review.</p>
308
308
  <p class="ck-api-params"><strong>Required:</strong>&ensp;<code>verdict</code>, <code>created_by</code>&emsp;<strong>Optional:</strong>&ensp;<code>corrected_score</code>, <code>note</code></p>
309
- <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/runs/1/responses/42/metrics/3/calibrations \\\n -H \"Authorization: Bearer #{token}\" \\\n -H \"Content-Type: application/json\" \\\n -d '{\"verdict\": \"disagree\", \"corrected_score\": 3, \"note\": \"too generous\", \"created_by\": \"alice\"}'" %>
309
+ <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/runs/1/responses/42/metrics/3/agreements \\\n -H \"Authorization: Bearer #{token}\" \\\n -H \"Content-Type: application/json\" \\\n -d '{\"verdict\": \"disagree\", \"corrected_score\": 3, \"note\": \"too generous\", \"created_by\": \"alice\"}'" %>
310
310
  </div>
311
311
  <div class="ck-api-endpoint">
312
- <p class="ck-api-method"><span class="ck-chip" style="color: var(--ck-danger);">DELETE</span> /api/v1/calibrations/:id</p>
313
- <p class="ck-meta-copy">Delete a calibration. Returns 204 No Content.</p>
312
+ <p class="ck-api-method"><span class="ck-chip" style="color: var(--ck-danger);">DELETE</span> /api/v1/agreements/:id</p>
313
+ <p class="ck-meta-copy">Delete an agreement. Returns 204 No Content.</p>
314
314
  </div>
315
315
  </div>
316
316