completion-kit 0.11.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. checksums.yaml +4 -4
  2. data/app/assets/stylesheets/completion_kit/application.css +51 -51
  3. data/app/controllers/completion_kit/{calibrations_controller.rb → agreements_controller.rb} +19 -19
  4. data/app/controllers/completion_kit/api/v1/{calibrations_controller.rb → agreements_controller.rb} +18 -18
  5. data/app/controllers/completion_kit/api/v1/metric_versions_controller.rb +2 -7
  6. data/app/controllers/completion_kit/api/v1/metrics_controller.rb +1 -1
  7. data/app/controllers/completion_kit/metrics_controller.rb +10 -11
  8. data/app/jobs/completion_kit/judge_review_job.rb +2 -2
  9. data/app/models/completion_kit/{calibration.rb → agreement.rb} +1 -1
  10. data/app/models/completion_kit/metric_version.rb +1 -17
  11. data/app/models/completion_kit/review.rb +1 -0
  12. data/app/services/completion_kit/{calibration_math.rb → agreement_math.rb} +1 -1
  13. data/app/services/completion_kit/mcp_dispatcher.rb +2 -2
  14. data/app/services/completion_kit/mcp_tools/{calibrations.rb → agreements.rb} +11 -11
  15. data/app/services/completion_kit/mcp_tools/judges.rb +3 -3
  16. data/app/services/completion_kit/mcp_tools/metric_versions.rb +2 -7
  17. data/app/services/completion_kit/{metric_calibration_examples.rb → metric_agreement_examples.rb} +6 -6
  18. data/app/services/completion_kit/{metric_calibration_stats.rb → metric_agreement_stats.rb} +6 -6
  19. data/app/services/completion_kit/metric_improvement_validator.rb +1 -1
  20. data/app/services/completion_kit/metric_variant_generator.rb +2 -2
  21. data/app/views/completion_kit/{calibrations → agreements}/_buttons.html.erb +33 -33
  22. data/app/views/completion_kit/{calibrations → agreements}/_trust_panel.html.erb +5 -5
  23. data/app/views/completion_kit/api_reference/_body.html.erb +15 -15
  24. data/app/views/completion_kit/metrics/_guiding_examples.html.erb +1 -1
  25. data/app/views/completion_kit/metrics/edit.html.erb +1 -1
  26. data/app/views/completion_kit/metrics/show.html.erb +6 -6
  27. data/app/views/completion_kit/responses/show.html.erb +4 -4
  28. data/app/views/completion_kit/runs/show.html.erb +1 -1
  29. data/config/routes.rb +3 -3
  30. data/db/migrate/20260531000002_backfill_review_metric_versions.rb +33 -0
  31. data/db/migrate/20260531000003_add_metric_version_fk_to_reviews.rb +6 -0
  32. data/db/migrate/20260531000004_rename_calibrations_to_agreements.rb +19 -0
  33. data/lib/completion_kit/version.rb +1 -1
  34. data/lib/completion_kit.rb +2 -2
  35. metadata +13 -10
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0b32ec77fb60d07f40e4b83827c2510aaeb695c96c9c6df86e4b42a7ec57516b
4
- data.tar.gz: ade912039e4942c87d73c13443bd405533eec2988478e02cd1ccb87550de2783
3
+ metadata.gz: cdcc8d4cdaf4b7aa4b3cff7cb0dd3fe65ce213bbce7b8ab1ba52cba304bff19a
4
+ data.tar.gz: d2b25e3b12b187b3df15e9b8347668a3dcf529b765fd5427a1dc2579665ee664
5
5
  SHA512:
6
- metadata.gz: bb8664ea804d59e3761ab385d1af98ecf7d110dd7e68e7003e1a4b2c059c5e377a5e42d350a46310f58c6d8c41c0e31a6aa0cdaf8a6b50b4d9f419e6fa60e474
7
- data.tar.gz: 3bbe72cf7e99a4ae899765829ee8bee83703885ebdfa5b6b9f7253f25f2373b1dd22aadfeb071f4b340c1b7d33dfc4e590e4f8a2df2239afbbc305de009af2cf
6
+ metadata.gz: 81921860b28c13076623a462e9cc82569a3721f035f5eb8dbe236c177fa02f9277aa7425659ae583da6713dfcc9467fc03683b3305f65fd8eaf29525cc93e143
7
+ data.tar.gz: c03aa7c4ad395228e4268ee166a908779a6c50e614c1a0591cf52569712236ffb4ccd9b6d8dc918d40e2ebb253fc48760fdd4e95d3e374e4cf68fad7b1b7ee19
@@ -3158,7 +3158,7 @@ select.ck-input {
3158
3158
  #ck-tab-datasets:checked ~ .ck-api-tabs__nav label[for="ck-tab-datasets"],
3159
3159
  #ck-tab-metrics:checked ~ .ck-api-tabs__nav label[for="ck-tab-metrics"],
3160
3160
  #ck-tab-metric-groups:checked ~ .ck-api-tabs__nav label[for="ck-tab-metric-groups"],
3161
- #ck-tab-calibrations:checked ~ .ck-api-tabs__nav label[for="ck-tab-calibrations"],
3161
+ #ck-tab-agreements:checked ~ .ck-api-tabs__nav label[for="ck-tab-agreements"],
3162
3162
  #ck-tab-tags:checked ~ .ck-api-tabs__nav label[for="ck-tab-tags"],
3163
3163
  #ck-tab-providers:checked ~ .ck-api-tabs__nav label[for="ck-tab-providers"] {
3164
3164
  color: var(--ck-accent);
@@ -3173,7 +3173,7 @@ select.ck-input {
3173
3173
  #ck-tab-datasets:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(5),
3174
3174
  #ck-tab-metrics:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(6),
3175
3175
  #ck-tab-metric-groups:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(7),
3176
- #ck-tab-calibrations:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(8),
3176
+ #ck-tab-agreements:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(8),
3177
3177
  #ck-tab-tags:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(9),
3178
3178
  #ck-tab-providers:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(10) {
3179
3179
  display: block;
@@ -3215,7 +3215,7 @@ select.ck-input {
3215
3215
  #ck-tab-datasets:checked ~ .ck-api-tabs__nav label[for="ck-tab-datasets"],
3216
3216
  #ck-tab-metrics:checked ~ .ck-api-tabs__nav label[for="ck-tab-metrics"],
3217
3217
  #ck-tab-metric-groups:checked ~ .ck-api-tabs__nav label[for="ck-tab-metric-groups"],
3218
- #ck-tab-calibrations:checked ~ .ck-api-tabs__nav label[for="ck-tab-calibrations"],
3218
+ #ck-tab-agreements:checked ~ .ck-api-tabs__nav label[for="ck-tab-agreements"],
3219
3219
  #ck-tab-tags:checked ~ .ck-api-tabs__nav label[for="ck-tab-tags"],
3220
3220
  #ck-tab-providers:checked ~ .ck-api-tabs__nav label[for="ck-tab-providers"] {
3221
3221
  border-left-color: transparent;
@@ -5377,33 +5377,33 @@ a.tag-mark {
5377
5377
  outline-offset: 2px;
5378
5378
  }
5379
5379
 
5380
- .ck-calibration {
5380
+ .ck-agreement {
5381
5381
  margin-top: 12px;
5382
5382
  padding-top: 12px;
5383
5383
  border-top: 1px dashed var(--ck-line);
5384
5384
  }
5385
- .ck-calibration__prompt {
5385
+ .ck-agreement__prompt {
5386
5386
  margin: 0 0 10px;
5387
5387
  font-family: var(--ck-mono);
5388
5388
  font-size: 0.72rem;
5389
5389
  letter-spacing: 0.04em;
5390
5390
  color: var(--ck-dim);
5391
5391
  }
5392
- .ck-calibration__prompt > * + * {
5392
+ .ck-agreement__prompt > * + * {
5393
5393
  margin-left: 8px;
5394
5394
  }
5395
- .ck-calibration__label {
5395
+ .ck-agreement__label {
5396
5396
  letter-spacing: 0.08em;
5397
5397
  text-transform: uppercase;
5398
5398
  color: var(--ck-dim);
5399
5399
  }
5400
- .ck-calibration__meta {
5400
+ .ck-agreement__meta {
5401
5401
  color: var(--ck-muted);
5402
5402
  }
5403
- .ck-calibration__sep {
5403
+ .ck-agreement__sep {
5404
5404
  color: var(--ck-line-strong);
5405
5405
  }
5406
- .ck-calibration__meta-link {
5406
+ .ck-agreement__meta-link {
5407
5407
  color: var(--ck-accent);
5408
5408
  text-decoration: none;
5409
5409
  white-space: nowrap;
@@ -5411,12 +5411,12 @@ a.tag-mark {
5411
5411
  letter-spacing: 0.08em;
5412
5412
  }
5413
5413
 
5414
- .ck-calibration__others {
5414
+ .ck-agreement__others {
5415
5415
  margin: 10px 0 0;
5416
5416
  font-family: var(--ck-mono);
5417
5417
  font-size: 0.78rem;
5418
5418
  }
5419
- .ck-calibration__others-summary {
5419
+ .ck-agreement__others-summary {
5420
5420
  display: inline-flex;
5421
5421
  align-items: center;
5422
5422
  gap: 6px;
@@ -5426,20 +5426,20 @@ a.tag-mark {
5426
5426
  user-select: none;
5427
5427
  list-style: none;
5428
5428
  }
5429
- .ck-calibration__others-summary:hover,
5430
- .ck-calibration__others-summary:focus-visible {
5429
+ .ck-agreement__others-summary:hover,
5430
+ .ck-agreement__others-summary:focus-visible {
5431
5431
  color: var(--ck-accent-hover);
5432
5432
  }
5433
- .ck-calibration__others-summary::-webkit-details-marker { display: none; }
5434
- .ck-calibration__others-summary svg {
5433
+ .ck-agreement__others-summary::-webkit-details-marker { display: none; }
5434
+ .ck-agreement__others-summary svg {
5435
5435
  width: 12px;
5436
5436
  height: 12px;
5437
5437
  transition: transform 0.15s;
5438
5438
  }
5439
- .ck-calibration__others[open] .ck-calibration__others-summary svg {
5439
+ .ck-agreement__others[open] .ck-agreement__others-summary svg {
5440
5440
  transform: rotate(90deg);
5441
5441
  }
5442
- .ck-calibration__others-list {
5442
+ .ck-agreement__others-list {
5443
5443
  list-style: none;
5444
5444
  padding: 8px 0 0;
5445
5445
  margin: 0;
@@ -5447,24 +5447,24 @@ a.tag-mark {
5447
5447
  flex-direction: column;
5448
5448
  gap: 6px;
5449
5449
  }
5450
- .ck-calibration__others-item {
5450
+ .ck-agreement__others-item {
5451
5451
  padding: 8px 10px;
5452
5452
  background: var(--ck-surface-soft);
5453
5453
  border: 1px solid var(--ck-line);
5454
5454
  border-radius: 4px;
5455
5455
  color: var(--ck-dim);
5456
5456
  }
5457
- .ck-calibration__others-item--agree { border-left: 2px solid var(--ck-success); }
5458
- .ck-calibration__others-item--disagree { border-left: 2px solid var(--ck-danger); }
5459
- .ck-calibration__others-item--borderline { border-left: 2px solid var(--ck-warning); }
5460
- .ck-calibration__others-row {
5457
+ .ck-agreement__others-item--agree { border-left: 2px solid var(--ck-success); }
5458
+ .ck-agreement__others-item--disagree { border-left: 2px solid var(--ck-danger); }
5459
+ .ck-agreement__others-item--borderline { border-left: 2px solid var(--ck-warning); }
5460
+ .ck-agreement__others-row {
5461
5461
  display: flex;
5462
5462
  flex-wrap: wrap;
5463
5463
  align-items: center;
5464
5464
  gap: 10px;
5465
5465
  line-height: 1;
5466
5466
  }
5467
- .ck-calibration__others-verdict {
5467
+ .ck-agreement__others-verdict {
5468
5468
  display: inline-flex;
5469
5469
  align-items: center;
5470
5470
  gap: 4px;
@@ -5473,24 +5473,24 @@ a.tag-mark {
5473
5473
  font-weight: 500;
5474
5474
  color: var(--ck-text);
5475
5475
  }
5476
- .ck-calibration__others-item--agree .ck-calibration__others-verdict { color: var(--ck-success); }
5477
- .ck-calibration__others-item--disagree .ck-calibration__others-verdict { color: var(--ck-danger); }
5478
- .ck-calibration__others-item--borderline .ck-calibration__others-verdict { color: var(--ck-warning); }
5479
- .ck-calibration__others-by {
5476
+ .ck-agreement__others-item--agree .ck-agreement__others-verdict { color: var(--ck-success); }
5477
+ .ck-agreement__others-item--disagree .ck-agreement__others-verdict { color: var(--ck-danger); }
5478
+ .ck-agreement__others-item--borderline .ck-agreement__others-verdict { color: var(--ck-warning); }
5479
+ .ck-agreement__others-by {
5480
5480
  color: var(--ck-muted);
5481
5481
  }
5482
- .ck-calibration__others-stars {
5482
+ .ck-agreement__others-stars {
5483
5483
  display: inline-flex;
5484
5484
  align-items: center;
5485
5485
  gap: 2px;
5486
5486
  }
5487
- .ck-calibration__others-stars svg { display: block; }
5488
- .ck-calibration__others-note {
5487
+ .ck-agreement__others-stars svg { display: block; }
5488
+ .ck-agreement__others-note {
5489
5489
  margin: 6px 0 0;
5490
5490
  color: var(--ck-dim);
5491
5491
  line-height: 1.5;
5492
5492
  }
5493
- .ck-calibration__meta-link svg {
5493
+ .ck-agreement__meta-link svg {
5494
5494
  display: inline-block;
5495
5495
  width: 12px;
5496
5496
  height: 12px;
@@ -5499,16 +5499,16 @@ a.tag-mark {
5499
5499
  position: relative;
5500
5500
  top: -1px;
5501
5501
  }
5502
- .ck-calibration__meta-link:hover,
5503
- .ck-calibration__meta-link:focus-visible {
5502
+ .ck-agreement__meta-link:hover,
5503
+ .ck-agreement__meta-link:focus-visible {
5504
5504
  color: var(--ck-accent-hover);
5505
5505
  }
5506
- .ck-calibration__buttons {
5506
+ .ck-agreement__buttons {
5507
5507
  display: flex;
5508
5508
  gap: 6px;
5509
5509
  flex-wrap: wrap;
5510
5510
  }
5511
- .ck-calibration__pill {
5511
+ .ck-agreement__pill {
5512
5512
  display: inline-flex;
5513
5513
  align-items: center;
5514
5514
  gap: 0.4rem;
@@ -5525,50 +5525,50 @@ a.tag-mark {
5525
5525
  cursor: pointer;
5526
5526
  transition: background 0.12s, border-color 0.12s, color 0.12s;
5527
5527
  }
5528
- .ck-calibration__pill svg {
5528
+ .ck-agreement__pill svg {
5529
5529
  width: 14px;
5530
5530
  height: 14px;
5531
5531
  }
5532
- .ck-calibration__pill:hover,
5533
- .ck-calibration__pill:focus-visible {
5532
+ .ck-agreement__pill:hover,
5533
+ .ck-agreement__pill:focus-visible {
5534
5534
  color: var(--ck-text);
5535
5535
  border-color: var(--ck-dim);
5536
5536
  }
5537
- .ck-calibration__pill--agree.is-active {
5537
+ .ck-agreement__pill--agree.is-active {
5538
5538
  background: var(--ck-success-soft);
5539
5539
  border-color: rgba(45, 212, 168, 0.35);
5540
5540
  color: var(--ck-success);
5541
5541
  }
5542
- .ck-calibration__pill--disagree.is-active {
5542
+ .ck-agreement__pill--disagree.is-active {
5543
5543
  background: var(--ck-danger-soft);
5544
5544
  border-color: rgba(248, 113, 113, 0.35);
5545
5545
  color: var(--ck-danger);
5546
5546
  }
5547
- .ck-calibration__pill--borderline.is-active {
5547
+ .ck-agreement__pill--borderline.is-active {
5548
5548
  background: var(--ck-warning-soft);
5549
5549
  border-color: rgba(224, 164, 88, 0.35);
5550
5550
  color: var(--ck-warning);
5551
5551
  }
5552
- .ck-calibration__pill--agree:hover { border-color: rgba(45, 212, 168, 0.45); color: var(--ck-success); }
5553
- .ck-calibration__pill--disagree:hover { border-color: rgba(248, 113, 113, 0.45); color: var(--ck-danger); }
5554
- .ck-calibration__pill--borderline:hover { border-color: rgba(224, 164, 88, 0.45); color: var(--ck-warning); }
5555
- .ck-calibration__detail {
5552
+ .ck-agreement__pill--agree:hover { border-color: rgba(45, 212, 168, 0.45); color: var(--ck-success); }
5553
+ .ck-agreement__pill--disagree:hover { border-color: rgba(248, 113, 113, 0.45); color: var(--ck-danger); }
5554
+ .ck-agreement__pill--borderline:hover { border-color: rgba(224, 164, 88, 0.45); color: var(--ck-warning); }
5555
+ .ck-agreement__detail {
5556
5556
  margin-top: 12px;
5557
5557
  display: flex;
5558
5558
  flex-direction: column;
5559
5559
  gap: 12px;
5560
5560
  }
5561
- .ck-calibration__detail > * {
5561
+ .ck-agreement__detail > * {
5562
5562
  margin: 0;
5563
5563
  }
5564
- .ck-calibration__detail .ck-button {
5564
+ .ck-agreement__detail .ck-button {
5565
5565
  align-self: flex-start;
5566
5566
  }
5567
- .ck-calibration__detail textarea {
5567
+ .ck-agreement__detail textarea {
5568
5568
  font-family: var(--ck-mono);
5569
5569
  font-size: 0.82rem;
5570
5570
  }
5571
- .ck-calibration__value {
5571
+ .ck-agreement__value {
5572
5572
  color: var(--ck-accent);
5573
5573
  font-family: var(--ck-mono);
5574
5574
  font-weight: 600;
@@ -5660,7 +5660,7 @@ a.tag-mark {
5660
5660
  background: linear-gradient(180deg, var(--ck-accent-soft), var(--ck-surface));
5661
5661
  }
5662
5662
 
5663
- .ck-calibration__error {
5663
+ .ck-agreement__error {
5664
5664
  margin: 8px 0 0;
5665
5665
  padding: 8px 10px;
5666
5666
  background: var(--ck-danger-soft);
@@ -1,36 +1,36 @@
1
1
  module CompletionKit
2
- class CalibrationsController < ApplicationController
3
- before_action :ensure_calibration_enabled
2
+ class AgreementsController < ApplicationController
3
+ before_action :ensure_agreement_enabled
4
4
  before_action :set_scope
5
5
 
6
6
  def create
7
- created_by = calibration_creator
8
- existing = Calibration.find_by(
7
+ created_by = agreement_creator
8
+ existing = Agreement.find_by(
9
9
  run_id: @run.id, response_id: @response.id, metric_id: @metric.id, created_by: created_by
10
10
  )
11
11
 
12
12
  if params[:verdict] == "disagree" && params[:corrected_score].blank?
13
- render_calibration(calibration: existing, pending_verdict: "disagree")
13
+ render_agreement(agreement: existing, pending_verdict: "disagree")
14
14
  return
15
15
  end
16
16
 
17
- calibration = existing || Calibration.new(
17
+ agreement = existing || Agreement.new(
18
18
  run: @run, response: @response, metric: @metric, created_by: created_by
19
19
  )
20
- calibration.assign_attributes(
20
+ agreement.assign_attributes(
21
21
  metric_version: MetricVersion.ensure_current_for(@metric),
22
22
  verdict: params[:verdict],
23
23
  corrected_score: params[:corrected_score].presence,
24
24
  note: params[:note].presence
25
25
  )
26
26
 
27
- if calibration.save
28
- render_calibration(calibration: calibration, just_saved: true)
27
+ if agreement.save
28
+ render_agreement(agreement: agreement, just_saved: true)
29
29
  else
30
- render_calibration(
31
- calibration: existing,
30
+ render_agreement(
31
+ agreement: existing,
32
32
  pending_verdict: params[:verdict],
33
- error: calibration.errors.full_messages.to_sentence,
33
+ error: agreement.errors.full_messages.to_sentence,
34
34
  status: :unprocessable_entity
35
35
  )
36
36
  end
@@ -38,10 +38,10 @@ module CompletionKit
38
38
 
39
39
  private
40
40
 
41
- def render_calibration(calibration:, pending_verdict: nil, error: nil, just_saved: false, status: :ok)
41
+ def render_agreement(agreement:, pending_verdict: nil, error: nil, just_saved: false, status: :ok)
42
42
  locals = {
43
43
  review: review_for_metric,
44
- calibration: calibration,
44
+ agreement: agreement,
45
45
  run: @run,
46
46
  response_row: @response,
47
47
  metric: @metric,
@@ -50,14 +50,14 @@ module CompletionKit
50
50
  just_saved: just_saved
51
51
  }
52
52
  render turbo_stream: turbo_stream.replace(
53
- "calibration_#{@response.id}_#{@metric.id}",
54
- partial: "completion_kit/calibrations/buttons",
53
+ "agreement_#{@response.id}_#{@metric.id}",
54
+ partial: "completion_kit/agreements/buttons",
55
55
  locals: locals
56
56
  ), status: status
57
57
  end
58
58
 
59
- def ensure_calibration_enabled
60
- head :not_found unless CompletionKit.config.judge_calibration_enabled
59
+ def ensure_agreement_enabled
60
+ head :not_found unless CompletionKit.config.judge_agreement_enabled
61
61
  end
62
62
 
63
63
  def set_scope
@@ -70,7 +70,7 @@ module CompletionKit
70
70
  @response.reviews.find_by(metric_id: @metric.id)
71
71
  end
72
72
 
73
- def calibration_creator
73
+ def agreement_creator
74
74
  request.env["HTTP_X_REMOTE_USER"].presence || CompletionKit.config.username.presence || "operator"
75
75
  end
76
76
  end
@@ -1,13 +1,13 @@
1
1
  module CompletionKit
2
2
  module Api
3
3
  module V1
4
- class CalibrationsController < BaseController
5
- before_action :ensure_calibration_enabled
4
+ class AgreementsController < BaseController
5
+ before_action :ensure_agreement_enabled
6
6
  before_action :set_nested_scope, only: [:create]
7
- before_action :load_calibration, only: [:destroy]
7
+ before_action :load_agreement, only: [:destroy]
8
8
 
9
9
  def index
10
- scope = Calibration.all
10
+ scope = Agreement.all
11
11
  scope = scope.where(run_id: params[:run_id]) if params[:run_id].present?
12
12
  scope = scope.where(response_id: params[:response_id]) if params[:response_id].present?
13
13
  scope = scope.where(metric_id: params[:metric_id]) if params[:metric_id].present?
@@ -18,31 +18,31 @@ module CompletionKit
18
18
  end
19
19
 
20
20
  def create
21
- calibration = scope_calibrations.find_or_initialize_by(created_by: created_by_param)
22
- calibration.assign_attributes(
21
+ agreement = scope_agreements.find_or_initialize_by(created_by: created_by_param)
22
+ agreement.assign_attributes(
23
23
  run: @run,
24
24
  response: @response,
25
25
  metric: @metric,
26
26
  metric_version: MetricVersion.ensure_current_for(@metric),
27
- **calibration_params
27
+ **agreement_params
28
28
  )
29
29
 
30
- if calibration.save
31
- render json: calibration, status: calibration.previously_new_record? ? :created : :ok
30
+ if agreement.save
31
+ render json: agreement, status: agreement.previously_new_record? ? :created : :ok
32
32
  else
33
- render_validation_errors(calibration)
33
+ render_validation_errors(agreement)
34
34
  end
35
35
  end
36
36
 
37
37
  def destroy
38
- @calibration.destroy!
38
+ @agreement.destroy!
39
39
  head :no_content
40
40
  end
41
41
 
42
42
  private
43
43
 
44
- def ensure_calibration_enabled
45
- render_error("Calibration disabled", status: :not_found) unless CompletionKit.config.judge_calibration_enabled
44
+ def ensure_agreement_enabled
45
+ render_error("Agreement disabled", status: :not_found) unless CompletionKit.config.judge_agreement_enabled
46
46
  end
47
47
 
48
48
  def set_nested_scope
@@ -53,17 +53,17 @@ module CompletionKit
53
53
  not_found
54
54
  end
55
55
 
56
- def load_calibration
57
- @calibration = Calibration.find(params[:id])
56
+ def load_agreement
57
+ @agreement = Agreement.find(params[:id])
58
58
  rescue ActiveRecord::RecordNotFound
59
59
  not_found
60
60
  end
61
61
 
62
- def scope_calibrations
63
- Calibration.where(run_id: @run.id, response_id: @response.id, metric_id: @metric.id)
62
+ def scope_agreements
63
+ Agreement.where(run_id: @run.id, response_id: @response.id, metric_id: @metric.id)
64
64
  end
65
65
 
66
- def calibration_params
66
+ def agreement_params
67
67
  params.permit(:verdict, :corrected_score, :note).to_h.symbolize_keys
68
68
  end
69
69
 
@@ -14,13 +14,8 @@ module CompletionKit
14
14
  end
15
15
 
16
16
  def publish
17
- if @version.published? && !@version.current?
18
- audit = @version.revert!
19
- render json: audit
20
- else
21
- @version.publish!
22
- render json: @version.reload
23
- end
17
+ @version.publish!
18
+ render json: @version.reload
24
19
  end
25
20
 
26
21
  def destroy
@@ -37,7 +37,7 @@ module CompletionKit
37
37
  end
38
38
 
39
39
  def suggest_variants
40
- disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
40
+ disagreement_count = Agreement.where(metric_id: @metric.id, verdict: "disagree").count
41
41
  if disagreement_count.zero?
42
42
  render_error("Mark at least one case as Disagree before asking the model to suggest a change.", status: :unprocessable_entity)
43
43
  return
@@ -39,9 +39,9 @@ module CompletionKit
39
39
  def show
40
40
  @edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
41
41
  @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
42
- @improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
42
+ @improve_disagreement_count = Agreement.where(metric_id: @metric.id, verdict: "disagree").count
43
43
  @versions = MetricVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
44
- @guiding_examples = CompletionKit.config.judge_examples_from_reviews ? MetricCalibrationExamples.judge_examples_for(@metric) : []
44
+ @guiding_examples = CompletionKit.config.judge_examples_from_reviews ? MetricAgreementExamples.judge_examples_for(@metric) : []
45
45
  end
46
46
 
47
47
  def new
@@ -52,7 +52,7 @@ module CompletionKit
52
52
  @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
53
53
  @edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
54
54
  @published_metric_version = MetricVersion.published.where(metric_id: @metric.id, current: true).first
55
- @improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
55
+ @improve_disagreement_count = Agreement.where(metric_id: @metric.id, verdict: "disagree").count
56
56
 
57
57
  if @edit_draft
58
58
  @metric.instruction = @edit_draft.instruction
@@ -117,7 +117,7 @@ module CompletionKit
117
117
 
118
118
  def suggest_variants
119
119
  target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
120
- counts = Calibration.where(metric_id: @metric.id, verdict: %w[agree disagree]).group(:verdict).count
120
+ counts = Agreement.where(metric_id: @metric.id, verdict: %w[agree disagree]).group(:verdict).count
121
121
  if counts["disagree"].to_i.zero?
122
122
  redirect_to target, alert: "Mark at least one case as Disagree before asking the model to suggest a change."
123
123
  return
@@ -145,12 +145,12 @@ module CompletionKit
145
145
  end
146
146
 
147
147
  def exclude_example
148
- calibration = Calibration.where(metric_id: @metric.id).find(params[:calibration_id])
149
- calibration.update!(excluded_from_examples: true)
148
+ agreement = Agreement.where(metric_id: @metric.id).find(params[:agreement_id])
149
+ agreement.update!(excluded_from_examples: true)
150
150
  render turbo_stream: turbo_stream.replace(
151
151
  "ck-guiding-#{@metric.id}",
152
152
  partial: "completion_kit/metrics/guiding_examples",
153
- locals: { metric: @metric, examples: MetricCalibrationExamples.judge_examples_for(@metric) }
153
+ locals: { metric: @metric, examples: MetricAgreementExamples.judge_examples_for(@metric) }
154
154
  )
155
155
  end
156
156
 
@@ -171,13 +171,12 @@ module CompletionKit
171
171
  reverting = was_published_already && !version.current?
172
172
  previously_current = MetricVersion.current.find_by(metric_id: @metric.id)
173
173
 
174
+ version.publish!
175
+
174
176
  if reverting
175
- audit = version.revert!
176
- prior_label = previously_current.version_label
177
177
  redirect_to metric_path(@metric),
178
- notice: "Reverted #{@metric.name} to #{version.version_label} (logged as #{audit.version_label}). Human reviews collected against #{prior_label} stay tied to it."
178
+ notice: "#{@metric.name} is back on #{version.version_label}. Its reviews count again; the ones you gave on #{previously_current.version_label} stay with that version."
179
179
  else
180
- version.publish!
181
180
  redirect_to metric_path(@metric),
182
181
  notice: "#{@metric.name} #{version.version_label} is now the published version."
183
182
  end
@@ -82,10 +82,10 @@ module CompletionKit
82
82
  private
83
83
 
84
84
  def review_examples_for(metric, response)
85
- return nil unless CompletionKit.config.judge_calibration_enabled
85
+ return nil unless CompletionKit.config.judge_agreement_enabled
86
86
  return nil unless CompletionKit.config.judge_examples_from_reviews
87
87
 
88
- MetricCalibrationExamples.judge_examples_for(metric, exclude_response_id: response.id)
88
+ MetricAgreementExamples.judge_examples_for(metric, exclude_response_id: response.id)
89
89
  end
90
90
 
91
91
  def confirm_judging_capability(judge_model_id)
@@ -1,5 +1,5 @@
1
1
  module CompletionKit
2
- class Calibration < ApplicationRecord
2
+ class Agreement < ApplicationRecord
3
3
  VERDICTS = %w[agree disagree borderline].freeze
4
4
 
5
5
  belongs_to :run
@@ -3,7 +3,7 @@ module CompletionKit
3
3
  STATES = %w[draft published].freeze
4
4
 
5
5
  belongs_to :metric
6
- has_many :calibrations, dependent: :destroy
6
+ has_many :agreements, dependent: :destroy
7
7
 
8
8
  serialize :rubric_bands, coder: JSON
9
9
  serialize :validation_summary, coder: JSON
@@ -83,22 +83,6 @@ module CompletionKit
83
83
  self
84
84
  end
85
85
 
86
- def revert!
87
- raise ArgumentError, "only a published version can be reverted to" unless published?
88
- audit = nil
89
- MetricVersion.transaction do
90
- audit = self.class.create!(
91
- metric: metric,
92
- instruction: instruction,
93
- rubric_bands: rubric_bands,
94
- state: "draft",
95
- source: "revert"
96
- )
97
- audit.publish!
98
- end
99
- audit
100
- end
101
-
102
86
  def as_json(options = {})
103
87
  {
104
88
  id: id,
@@ -8,6 +8,7 @@ module CompletionKit
8
8
  has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
9
9
 
10
10
  validates :metric_name, presence: true
11
+ validates :metric_version, presence: true
11
12
  validates :ai_score, numericality: { greater_than_or_equal_to: 1, less_than_or_equal_to: 5 }, allow_nil: true
12
13
 
13
14
  before_validation :set_default_status
@@ -1,5 +1,5 @@
1
1
  module CompletionKit
2
- module CalibrationMath
2
+ module AgreementMath
3
3
  Z_95 = 1.959963984540054
4
4
 
5
5
  module_function
@@ -35,7 +35,7 @@ module CompletionKit
35
35
  McpTools::MetricVersions.definitions +
36
36
  McpTools::ProviderCredentials.definitions +
37
37
  McpTools::Tags.definitions +
38
- McpTools::Calibrations.definitions +
38
+ McpTools::Agreements.definitions +
39
39
  McpTools::Judges.definitions
40
40
  end
41
41
 
@@ -50,7 +50,7 @@ module CompletionKit
50
50
  when /\Ametrics_/ then McpTools::Metrics.call(name, arguments)
51
51
  when /\Aprovider_credentials_/ then McpTools::ProviderCredentials.call(name, arguments)
52
52
  when /\Atags_/ then McpTools::Tags.call(name, arguments)
53
- when /\Acalibrations_/ then McpTools::Calibrations.call(name, arguments)
53
+ when /\Aagreements_/ then McpTools::Agreements.call(name, arguments)
54
54
  when /\Ajudges_/ then McpTools::Judges.call(name, arguments)
55
55
  else raise MethodNotFound, "Unknown tool: #{name}"
56
56
  end