completion-kit 0.10.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. checksums.yaml +4 -4
  2. data/app/assets/stylesheets/completion_kit/application.css +118 -55
  3. data/app/controllers/completion_kit/{calibrations_controller.rb → agreements_controller.rb} +19 -19
  4. data/app/controllers/completion_kit/api/v1/{calibrations_controller.rb → agreements_controller.rb} +18 -18
  5. data/app/controllers/completion_kit/api/v1/metric_versions_controller.rb +2 -7
  6. data/app/controllers/completion_kit/api/v1/metrics_controller.rb +1 -1
  7. data/app/controllers/completion_kit/metrics_controller.rb +18 -23
  8. data/app/jobs/completion_kit/judge_review_job.rb +2 -2
  9. data/app/jobs/completion_kit/metric_suggestion_job.rb +46 -0
  10. data/app/models/completion_kit/{calibration.rb → agreement.rb} +1 -1
  11. data/app/models/completion_kit/metric_version.rb +2 -17
  12. data/app/models/completion_kit/review.rb +1 -0
  13. data/app/services/completion_kit/{calibration_math.rb → agreement_math.rb} +1 -1
  14. data/app/services/completion_kit/mcp_dispatcher.rb +2 -2
  15. data/app/services/completion_kit/mcp_tools/{calibrations.rb → agreements.rb} +11 -11
  16. data/app/services/completion_kit/mcp_tools/judges.rb +3 -3
  17. data/app/services/completion_kit/mcp_tools/metric_versions.rb +2 -7
  18. data/app/services/completion_kit/{metric_calibration_examples.rb → metric_agreement_examples.rb} +6 -6
  19. data/app/services/completion_kit/{metric_calibration_stats.rb → metric_agreement_stats.rb} +6 -6
  20. data/app/services/completion_kit/metric_improvement_validator.rb +101 -0
  21. data/app/services/completion_kit/metric_variant_generator.rb +2 -2
  22. data/app/views/completion_kit/{calibrations → agreements}/_buttons.html.erb +33 -33
  23. data/app/views/completion_kit/{calibrations → agreements}/_trust_panel.html.erb +6 -9
  24. data/app/views/completion_kit/api_reference/_body.html.erb +15 -15
  25. data/app/views/completion_kit/metrics/_guiding_examples.html.erb +1 -1
  26. data/app/views/completion_kit/metrics/_suggestion_failed.html.erb +3 -0
  27. data/app/views/completion_kit/metrics/_suggestion_pending.html.erb +3 -0
  28. data/app/views/completion_kit/metrics/_suggestion_ready.html.erb +4 -0
  29. data/app/views/completion_kit/metrics/_validation_scoreboard.html.erb +12 -0
  30. data/app/views/completion_kit/metrics/edit.html.erb +1 -1
  31. data/app/views/completion_kit/metrics/show.html.erb +25 -11
  32. data/app/views/completion_kit/responses/show.html.erb +4 -4
  33. data/app/views/completion_kit/runs/show.html.erb +1 -1
  34. data/config/routes.rb +3 -3
  35. data/db/migrate/20260531000001_add_validation_summary_to_completion_kit_metric_versions.rb +5 -0
  36. data/db/migrate/20260531000002_backfill_review_metric_versions.rb +33 -0
  37. data/db/migrate/20260531000003_add_metric_version_fk_to_reviews.rb +6 -0
  38. data/db/migrate/20260531000004_rename_calibrations_to_agreements.rb +19 -0
  39. data/lib/completion_kit/version.rb +1 -1
  40. data/lib/completion_kit.rb +2 -2
  41. metadata +20 -10
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4772b264a668a86e004f78c7bc2397d93f5266d8dc287b422728d951ab24fbcc
4
- data.tar.gz: 63b221e144e930df9978607a78533d354f659ca2bf6141046291168af50d3cd7
3
+ metadata.gz: cdcc8d4cdaf4b7aa4b3cff7cb0dd3fe65ce213bbce7b8ab1ba52cba304bff19a
4
+ data.tar.gz: d2b25e3b12b187b3df15e9b8347668a3dcf529b765fd5427a1dc2579665ee664
5
5
  SHA512:
6
- metadata.gz: a3249ae1c734dcee0c6f9410baf0400f4b16e091b220d86e8417dec91ff9943a165bbc6c8368629cc14054c3af7946bdb693f006a5756de40a809c41db5bbe3a
7
- data.tar.gz: 541323c93b08f08f32c2f024e3709ee5f3e4e48144cd8e9cb2ba8891312fd9b5712e4ca39cbcc739b64ba61ff0ac890be3528e5bfd9c238e073d640b37b47e90
6
+ metadata.gz: 81921860b28c13076623a462e9cc82569a3721f035f5eb8dbe236c177fa02f9277aa7425659ae583da6713dfcc9467fc03683b3305f65fd8eaf29525cc93e143
7
+ data.tar.gz: c03aa7c4ad395228e4268ee166a908779a6c50e614c1a0591cf52569712236ffb4ccd9b6d8dc918d40e2ebb253fc48760fdd4e95d3e374e4cf68fad7b1b7ee19
data/app/assets/stylesheets/completion_kit/application.css CHANGED
@@ -3158,7 +3158,7 @@ select.ck-input {
3158
3158
  #ck-tab-datasets:checked ~ .ck-api-tabs__nav label[for="ck-tab-datasets"],
3159
3159
  #ck-tab-metrics:checked ~ .ck-api-tabs__nav label[for="ck-tab-metrics"],
3160
3160
  #ck-tab-metric-groups:checked ~ .ck-api-tabs__nav label[for="ck-tab-metric-groups"],
3161
- #ck-tab-calibrations:checked ~ .ck-api-tabs__nav label[for="ck-tab-calibrations"],
3161
+ #ck-tab-agreements:checked ~ .ck-api-tabs__nav label[for="ck-tab-agreements"],
3162
3162
  #ck-tab-tags:checked ~ .ck-api-tabs__nav label[for="ck-tab-tags"],
3163
3163
  #ck-tab-providers:checked ~ .ck-api-tabs__nav label[for="ck-tab-providers"] {
3164
3164
  color: var(--ck-accent);
@@ -3173,7 +3173,7 @@ select.ck-input {
3173
3173
  #ck-tab-datasets:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(5),
3174
3174
  #ck-tab-metrics:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(6),
3175
3175
  #ck-tab-metric-groups:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(7),
3176
- #ck-tab-calibrations:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(8),
3176
+ #ck-tab-agreements:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(8),
3177
3177
  #ck-tab-tags:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(9),
3178
3178
  #ck-tab-providers:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(10) {
3179
3179
  display: block;
@@ -3215,7 +3215,7 @@ select.ck-input {
3215
3215
  #ck-tab-datasets:checked ~ .ck-api-tabs__nav label[for="ck-tab-datasets"],
3216
3216
  #ck-tab-metrics:checked ~ .ck-api-tabs__nav label[for="ck-tab-metrics"],
3217
3217
  #ck-tab-metric-groups:checked ~ .ck-api-tabs__nav label[for="ck-tab-metric-groups"],
3218
- #ck-tab-calibrations:checked ~ .ck-api-tabs__nav label[for="ck-tab-calibrations"],
3218
+ #ck-tab-agreements:checked ~ .ck-api-tabs__nav label[for="ck-tab-agreements"],
3219
3219
  #ck-tab-tags:checked ~ .ck-api-tabs__nav label[for="ck-tab-tags"],
3220
3220
  #ck-tab-providers:checked ~ .ck-api-tabs__nav label[for="ck-tab-providers"] {
3221
3221
  border-left-color: transparent;
@@ -3619,10 +3619,9 @@ select.ck-input {
3619
3619
  .ck-prompt-versions-table th:nth-child(3), .ck-prompt-versions-table td:nth-child(3) { width: 8rem; white-space: nowrap; }
3620
3620
  .ck-prompt-versions-table th:nth-child(4), .ck-prompt-versions-table td:nth-child(4) { width: auto; }
3621
3621
 
3622
- .ck-metric-versions-table th:nth-child(1), .ck-metric-versions-table td:nth-child(1) { width: 34%; }
3623
- .ck-metric-versions-table th:nth-child(2), .ck-metric-versions-table td:nth-child(2) { width: 33%; white-space: nowrap; }
3624
- .ck-metric-versions-table th:nth-child(3), .ck-metric-versions-table td:nth-child(3) { width: 33%; white-space: nowrap; }
3625
- .ck-metric-versions-table .ck-version-cell { justify-content: flex-start; gap: 0.75rem; }
3622
+ .ck-metric-versions-table th:nth-child(1), .ck-metric-versions-table td:nth-child(1) { width: 18rem; }
3623
+ .ck-metric-versions-table th:nth-child(2), .ck-metric-versions-table td:nth-child(2) { width: 16rem; white-space: nowrap; }
3624
+ .ck-metric-versions-table th:nth-child(3), .ck-metric-versions-table td:nth-child(3) { width: auto; white-space: nowrap; }
3626
3625
 
3627
3626
 
3628
3627
  .ck-source-chip {
@@ -5378,33 +5377,33 @@ a.tag-mark {
5378
5377
  outline-offset: 2px;
5379
5378
  }
5380
5379
 
5381
- .ck-calibration {
5380
+ .ck-agreement {
5382
5381
  margin-top: 12px;
5383
5382
  padding-top: 12px;
5384
5383
  border-top: 1px dashed var(--ck-line);
5385
5384
  }
5386
- .ck-calibration__prompt {
5385
+ .ck-agreement__prompt {
5387
5386
  margin: 0 0 10px;
5388
5387
  font-family: var(--ck-mono);
5389
5388
  font-size: 0.72rem;
5390
5389
  letter-spacing: 0.04em;
5391
5390
  color: var(--ck-dim);
5392
5391
  }
5393
- .ck-calibration__prompt > * + * {
5392
+ .ck-agreement__prompt > * + * {
5394
5393
  margin-left: 8px;
5395
5394
  }
5396
- .ck-calibration__label {
5395
+ .ck-agreement__label {
5397
5396
  letter-spacing: 0.08em;
5398
5397
  text-transform: uppercase;
5399
5398
  color: var(--ck-dim);
5400
5399
  }
5401
- .ck-calibration__meta {
5400
+ .ck-agreement__meta {
5402
5401
  color: var(--ck-muted);
5403
5402
  }
5404
- .ck-calibration__sep {
5403
+ .ck-agreement__sep {
5405
5404
  color: var(--ck-line-strong);
5406
5405
  }
5407
- .ck-calibration__meta-link {
5406
+ .ck-agreement__meta-link {
5408
5407
  color: var(--ck-accent);
5409
5408
  text-decoration: none;
5410
5409
  white-space: nowrap;
@@ -5412,12 +5411,12 @@ a.tag-mark {
5412
5411
  letter-spacing: 0.08em;
5413
5412
  }
5414
5413
 
5415
- .ck-calibration__others {
5414
+ .ck-agreement__others {
5416
5415
  margin: 10px 0 0;
5417
5416
  font-family: var(--ck-mono);
5418
5417
  font-size: 0.78rem;
5419
5418
  }
5420
- .ck-calibration__others-summary {
5419
+ .ck-agreement__others-summary {
5421
5420
  display: inline-flex;
5422
5421
  align-items: center;
5423
5422
  gap: 6px;
@@ -5427,20 +5426,20 @@ a.tag-mark {
5427
5426
  user-select: none;
5428
5427
  list-style: none;
5429
5428
  }
5430
- .ck-calibration__others-summary:hover,
5431
- .ck-calibration__others-summary:focus-visible {
5429
+ .ck-agreement__others-summary:hover,
5430
+ .ck-agreement__others-summary:focus-visible {
5432
5431
  color: var(--ck-accent-hover);
5433
5432
  }
5434
- .ck-calibration__others-summary::-webkit-details-marker { display: none; }
5435
- .ck-calibration__others-summary svg {
5433
+ .ck-agreement__others-summary::-webkit-details-marker { display: none; }
5434
+ .ck-agreement__others-summary svg {
5436
5435
  width: 12px;
5437
5436
  height: 12px;
5438
5437
  transition: transform 0.15s;
5439
5438
  }
5440
- .ck-calibration__others[open] .ck-calibration__others-summary svg {
5439
+ .ck-agreement__others[open] .ck-agreement__others-summary svg {
5441
5440
  transform: rotate(90deg);
5442
5441
  }
5443
- .ck-calibration__others-list {
5442
+ .ck-agreement__others-list {
5444
5443
  list-style: none;
5445
5444
  padding: 8px 0 0;
5446
5445
  margin: 0;
@@ -5448,24 +5447,24 @@ a.tag-mark {
5448
5447
  flex-direction: column;
5449
5448
  gap: 6px;
5450
5449
  }
5451
- .ck-calibration__others-item {
5450
+ .ck-agreement__others-item {
5452
5451
  padding: 8px 10px;
5453
5452
  background: var(--ck-surface-soft);
5454
5453
  border: 1px solid var(--ck-line);
5455
5454
  border-radius: 4px;
5456
5455
  color: var(--ck-dim);
5457
5456
  }
5458
- .ck-calibration__others-item--agree { border-left: 2px solid var(--ck-success); }
5459
- .ck-calibration__others-item--disagree { border-left: 2px solid var(--ck-danger); }
5460
- .ck-calibration__others-item--borderline { border-left: 2px solid var(--ck-warning); }
5461
- .ck-calibration__others-row {
5457
+ .ck-agreement__others-item--agree { border-left: 2px solid var(--ck-success); }
5458
+ .ck-agreement__others-item--disagree { border-left: 2px solid var(--ck-danger); }
5459
+ .ck-agreement__others-item--borderline { border-left: 2px solid var(--ck-warning); }
5460
+ .ck-agreement__others-row {
5462
5461
  display: flex;
5463
5462
  flex-wrap: wrap;
5464
5463
  align-items: center;
5465
5464
  gap: 10px;
5466
5465
  line-height: 1;
5467
5466
  }
5468
- .ck-calibration__others-verdict {
5467
+ .ck-agreement__others-verdict {
5469
5468
  display: inline-flex;
5470
5469
  align-items: center;
5471
5470
  gap: 4px;
@@ -5474,24 +5473,24 @@ a.tag-mark {
5474
5473
  font-weight: 500;
5475
5474
  color: var(--ck-text);
5476
5475
  }
5477
- .ck-calibration__others-item--agree .ck-calibration__others-verdict { color: var(--ck-success); }
5478
- .ck-calibration__others-item--disagree .ck-calibration__others-verdict { color: var(--ck-danger); }
5479
- .ck-calibration__others-item--borderline .ck-calibration__others-verdict { color: var(--ck-warning); }
5480
- .ck-calibration__others-by {
5476
+ .ck-agreement__others-item--agree .ck-agreement__others-verdict { color: var(--ck-success); }
5477
+ .ck-agreement__others-item--disagree .ck-agreement__others-verdict { color: var(--ck-danger); }
5478
+ .ck-agreement__others-item--borderline .ck-agreement__others-verdict { color: var(--ck-warning); }
5479
+ .ck-agreement__others-by {
5481
5480
  color: var(--ck-muted);
5482
5481
  }
5483
- .ck-calibration__others-stars {
5482
+ .ck-agreement__others-stars {
5484
5483
  display: inline-flex;
5485
5484
  align-items: center;
5486
5485
  gap: 2px;
5487
5486
  }
5488
- .ck-calibration__others-stars svg { display: block; }
5489
- .ck-calibration__others-note {
5487
+ .ck-agreement__others-stars svg { display: block; }
5488
+ .ck-agreement__others-note {
5490
5489
  margin: 6px 0 0;
5491
5490
  color: var(--ck-dim);
5492
5491
  line-height: 1.5;
5493
5492
  }
5494
- .ck-calibration__meta-link svg {
5493
+ .ck-agreement__meta-link svg {
5495
5494
  display: inline-block;
5496
5495
  width: 12px;
5497
5496
  height: 12px;
@@ -5500,16 +5499,16 @@ a.tag-mark {
5500
5499
  position: relative;
5501
5500
  top: -1px;
5502
5501
  }
5503
- .ck-calibration__meta-link:hover,
5504
- .ck-calibration__meta-link:focus-visible {
5502
+ .ck-agreement__meta-link:hover,
5503
+ .ck-agreement__meta-link:focus-visible {
5505
5504
  color: var(--ck-accent-hover);
5506
5505
  }
5507
- .ck-calibration__buttons {
5506
+ .ck-agreement__buttons {
5508
5507
  display: flex;
5509
5508
  gap: 6px;
5510
5509
  flex-wrap: wrap;
5511
5510
  }
5512
- .ck-calibration__pill {
5511
+ .ck-agreement__pill {
5513
5512
  display: inline-flex;
5514
5513
  align-items: center;
5515
5514
  gap: 0.4rem;
@@ -5526,50 +5525,50 @@ a.tag-mark {
5526
5525
  cursor: pointer;
5527
5526
  transition: background 0.12s, border-color 0.12s, color 0.12s;
5528
5527
  }
5529
- .ck-calibration__pill svg {
5528
+ .ck-agreement__pill svg {
5530
5529
  width: 14px;
5531
5530
  height: 14px;
5532
5531
  }
5533
- .ck-calibration__pill:hover,
5534
- .ck-calibration__pill:focus-visible {
5532
+ .ck-agreement__pill:hover,
5533
+ .ck-agreement__pill:focus-visible {
5535
5534
  color: var(--ck-text);
5536
5535
  border-color: var(--ck-dim);
5537
5536
  }
5538
- .ck-calibration__pill--agree.is-active {
5537
+ .ck-agreement__pill--agree.is-active {
5539
5538
  background: var(--ck-success-soft);
5540
5539
  border-color: rgba(45, 212, 168, 0.35);
5541
5540
  color: var(--ck-success);
5542
5541
  }
5543
- .ck-calibration__pill--disagree.is-active {
5542
+ .ck-agreement__pill--disagree.is-active {
5544
5543
  background: var(--ck-danger-soft);
5545
5544
  border-color: rgba(248, 113, 113, 0.35);
5546
5545
  color: var(--ck-danger);
5547
5546
  }
5548
- .ck-calibration__pill--borderline.is-active {
5547
+ .ck-agreement__pill--borderline.is-active {
5549
5548
  background: var(--ck-warning-soft);
5550
5549
  border-color: rgba(224, 164, 88, 0.35);
5551
5550
  color: var(--ck-warning);
5552
5551
  }
5553
- .ck-calibration__pill--agree:hover { border-color: rgba(45, 212, 168, 0.45); color: var(--ck-success); }
5554
- .ck-calibration__pill--disagree:hover { border-color: rgba(248, 113, 113, 0.45); color: var(--ck-danger); }
5555
- .ck-calibration__pill--borderline:hover { border-color: rgba(224, 164, 88, 0.45); color: var(--ck-warning); }
5556
- .ck-calibration__detail {
5552
+ .ck-agreement__pill--agree:hover { border-color: rgba(45, 212, 168, 0.45); color: var(--ck-success); }
5553
+ .ck-agreement__pill--disagree:hover { border-color: rgba(248, 113, 113, 0.45); color: var(--ck-danger); }
5554
+ .ck-agreement__pill--borderline:hover { border-color: rgba(224, 164, 88, 0.45); color: var(--ck-warning); }
5555
+ .ck-agreement__detail {
5557
5556
  margin-top: 12px;
5558
5557
  display: flex;
5559
5558
  flex-direction: column;
5560
5559
  gap: 12px;
5561
5560
  }
5562
- .ck-calibration__detail > * {
5561
+ .ck-agreement__detail > * {
5563
5562
  margin: 0;
5564
5563
  }
5565
- .ck-calibration__detail .ck-button {
5564
+ .ck-agreement__detail .ck-button {
5566
5565
  align-self: flex-start;
5567
5566
  }
5568
- .ck-calibration__detail textarea {
5567
+ .ck-agreement__detail textarea {
5569
5568
  font-family: var(--ck-mono);
5570
5569
  font-size: 0.82rem;
5571
5570
  }
5572
- .ck-calibration__value {
5571
+ .ck-agreement__value {
5573
5572
  color: var(--ck-accent);
5574
5573
  font-family: var(--ck-mono);
5575
5574
  font-weight: 600;
@@ -5661,7 +5660,7 @@ a.tag-mark {
5661
5660
  background: linear-gradient(180deg, var(--ck-accent-soft), var(--ck-surface));
5662
5661
  }
5663
5662
 
5664
- .ck-calibration__error {
5663
+ .ck-agreement__error {
5665
5664
  margin: 8px 0 0;
5666
5665
  padding: 8px 10px;
5667
5666
  background: var(--ck-danger-soft);
@@ -6001,3 +6000,67 @@ a.tag-mark {
6001
6000
  width: 2rem;
6002
6001
  height: 2rem;
6003
6002
  }
6003
+
6004
+ .ck-suggestion-status:empty { display: none; }
6005
+ .ck-suggestion-status {
6006
+ margin-top: 10px;
6007
+ display: flex;
6008
+ align-items: baseline;
6009
+ gap: 10px;
6010
+ flex-wrap: wrap;
6011
+ }
6012
+
6013
+ .ck-scoreboard {
6014
+ margin-bottom: 16px;
6015
+ padding-bottom: 14px;
6016
+ border-bottom: 1px solid var(--ck-line);
6017
+ }
6018
+ .ck-scoreboard__headline {
6019
+ margin: 0 0 8px;
6020
+ font-size: 0.95rem;
6021
+ color: var(--ck-text);
6022
+ }
6023
+ .ck-scoreboard__was {
6024
+ font-family: var(--ck-mono);
6025
+ font-size: 0.74rem;
6026
+ color: var(--ck-muted);
6027
+ margin-left: 6px;
6028
+ }
6029
+ .ck-scoreboard__tally {
6030
+ list-style: none;
6031
+ margin: 0;
6032
+ padding: 0;
6033
+ display: flex;
6034
+ gap: 18px;
6035
+ }
6036
+ .ck-scoreboard__stat {
6037
+ font-family: var(--ck-mono);
6038
+ font-size: 0.72rem;
6039
+ letter-spacing: 0.06em;
6040
+ text-transform: uppercase;
6041
+ color: var(--ck-muted);
6042
+ }
6043
+ .ck-scoreboard__stat strong { color: var(--ck-text); }
6044
+ .ck-scoreboard__stat--break strong { color: var(--ck-warning); }
6045
+ .ck-scoreboard__note {
6046
+ margin: 8px 0 0;
6047
+ font-size: 0.78rem;
6048
+ color: var(--ck-muted);
6049
+ }
6050
+ .ck-version-change {
6051
+ display: inline-flex;
6052
+ align-items: baseline;
6053
+ gap: 0.6rem;
6054
+ }
6055
+ .ck-version-score {
6056
+ font-family: var(--ck-mono);
6057
+ font-size: 0.74rem;
6058
+ color: var(--ck-dim);
6059
+ }
6060
+ .ck-version-score__label {
6061
+ font-size: 0.6rem;
6062
+ letter-spacing: 0.08em;
6063
+ text-transform: uppercase;
6064
+ color: var(--ck-muted);
6065
+ margin-right: 0.2rem;
6066
+ }
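data/app/controllers/completion_kit/{calibrations_controller.rb → agreements_controller.rb} CHANGED
@@ -1,36 +1,36 @@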
1
1
  module CompletionKit
2
- class CalibrationsController < ApplicationController
3
- before_action :ensure_calibration_enabled
2
+ class AgreementsController < ApplicationController
3
+ before_action :ensure_agreement_enabled
4
4
  before_action :set_scope
5
5
 
6
6
  def create
7
- created_by = calibration_creator
8
- existing = Calibration.find_by(
7
+ created_by = agreement_creator
8
+ existing = Agreement.find_by(
9
9
  run_id: @run.id, response_id: @response.id, metric_id: @metric.id, created_by: created_by
10
10
  )
11
11
 
12
12
  if params[:verdict] == "disagree" && params[:corrected_score].blank?
13
- render_calibration(calibration: existing, pending_verdict: "disagree")
13
+ render_agreement(agreement: existing, pending_verdict: "disagree")
14
14
  return
15
15
  end
16
16
 
17
- calibration = existing || Calibration.new(
17
+ agreement = existing || Agreement.new(
18
18
  run: @run, response: @response, metric: @metric, created_by: created_by
19
19
  )
20
- calibration.assign_attributes(
20
+ agreement.assign_attributes(
21
21
  metric_version: MetricVersion.ensure_current_for(@metric),
22
22
  verdict: params[:verdict],
23
23
  corrected_score: params[:corrected_score].presence,
24
24
  note: params[:note].presence
25
25
  )
26
26
 
27
- if calibration.save
28
- render_calibration(calibration: calibration, just_saved: true)
27
+ if agreement.save
28
+ render_agreement(agreement: agreement, just_saved: true)
29
29
  else
30
- render_calibration(
31
- calibration: existing,
30
+ render_agreement(
31
+ agreement: existing,
32
32
  pending_verdict: params[:verdict],
33
- error: calibration.errors.full_messages.to_sentence,
33
+ error: agreement.errors.full_messages.to_sentence,
34
34
  status: :unprocessable_entity
35
35
  )
36
36
  end
@@ -38,10 +38,10 @@ module CompletionKit
38
38
 
39
39
  private
40
40
 
41
- def render_calibration(calibration:, pending_verdict: nil, error: nil, just_saved: false, status: :ok)
41
+ def render_agreement(agreement:, pending_verdict: nil, error: nil, just_saved: false, status: :ok)
42
42
  locals = {
43
43
  review: review_for_metric,
44
- calibration: calibration,
44
+ agreement: agreement,
45
45
  run: @run,
46
46
  response_row: @response,
47
47
  metric: @metric,
@@ -50,14 +50,14 @@ module CompletionKit
50
50
  just_saved: just_saved
51
51
  }
52
52
  render turbo_stream: turbo_stream.replace(
53
- "calibration_#{@response.id}_#{@metric.id}",
54
- partial: "completion_kit/calibrations/buttons",
53
+ "agreement_#{@response.id}_#{@metric.id}",
54
+ partial: "completion_kit/agreements/buttons",
55
55
  locals: locals
56
56
  ), status: status
57
57
  end
58
58
 
59
- def ensure_calibration_enabled
60
- head :not_found unless CompletionKit.config.judge_calibration_enabled
59
+ def ensure_agreement_enabled
60
+ head :not_found unless CompletionKit.config.judge_agreement_enabled
61
61
  end
62
62
 
63
63
  def set_scope
@@ -70,7 +70,7 @@ module CompletionKit
70
70
  @response.reviews.find_by(metric_id: @metric.id)
71
71
  end
72
72
 
73
- def calibration_creator
73
+ def agreement_creator
74
74
  request.env["HTTP_X_REMOTE_USER"].presence || CompletionKit.config.username.presence || "operator"
75
75
  end
76
76
  end
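data/app/controllers/completion_kit/api/v1/{calibrations_controller.rb → agreements_controller.rb} CHANGED
@@ -1,13 +1,13 @@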
1
1
  module CompletionKit
2
2
  module Api
3
3
  module V1
4
- class CalibrationsController < BaseController
5
- before_action :ensure_calibration_enabled
4
+ class AgreementsController < BaseController
5
+ before_action :ensure_agreement_enabled
6
6
  before_action :set_nested_scope, only: [:create]
7
- before_action :load_calibration, only: [:destroy]
7
+ before_action :load_agreement, only: [:destroy]
8
8
 
9
9
  def index
10
- scope = Calibration.all
10
+ scope = Agreement.all
11
11
  scope = scope.where(run_id: params[:run_id]) if params[:run_id].present?
12
12
  scope = scope.where(response_id: params[:response_id]) if params[:response_id].present?
13
13
  scope = scope.where(metric_id: params[:metric_id]) if params[:metric_id].present?
@@ -18,31 +18,31 @@ module CompletionKit
18
18
  end
19
19
 
20
20
  def create
21
- calibration = scope_calibrations.find_or_initialize_by(created_by: created_by_param)
22
- calibration.assign_attributes(
21
+ agreement = scope_agreements.find_or_initialize_by(created_by: created_by_param)
22
+ agreement.assign_attributes(
23
23
  run: @run,
24
24
  response: @response,
25
25
  metric: @metric,
26
26
  metric_version: MetricVersion.ensure_current_for(@metric),
27
- **calibration_params
27
+ **agreement_params
28
28
  )
29
29
 
30
- if calibration.save
31
- render json: calibration, status: calibration.previously_new_record? ? :created : :ok
30
+ if agreement.save
31
+ render json: agreement, status: agreement.previously_new_record? ? :created : :ok
32
32
  else
33
- render_validation_errors(calibration)
33
+ render_validation_errors(agreement)
34
34
  end
35
35
  end
36
36
 
37
37
  def destroy
38
- @calibration.destroy!
38
+ @agreement.destroy!
39
39
  head :no_content
40
40
  end
41
41
 
42
42
  private
43
43
 
44
- def ensure_calibration_enabled
45
- render_error("Calibration disabled", status: :not_found) unless CompletionKit.config.judge_calibration_enabled
44
+ def ensure_agreement_enabled
45
+ render_error("Agreement disabled", status: :not_found) unless CompletionKit.config.judge_agreement_enabled
46
46
  end
47
47
 
48
48
  def set_nested_scope
@@ -53,17 +53,17 @@ module CompletionKit
53
53
  not_found
54
54
  end
55
55
 
56
- def load_calibration
57
- @calibration = Calibration.find(params[:id])
56
+ def load_agreement
57
+ @agreement = Agreement.find(params[:id])
58
58
  rescue ActiveRecord::RecordNotFound
59
59
  not_found
60
60
  end
61
61
 
62
- def scope_calibrations
63
- Calibration.where(run_id: @run.id, response_id: @response.id, metric_id: @metric.id)
62
+ def scope_agreements
63
+ Agreement.where(run_id: @run.id, response_id: @response.id, metric_id: @metric.id)
64
64
  end
65
65
 
66
- def calibration_params
66
+ def agreement_params
67
67
  params.permit(:verdict, :corrected_score, :note).to_h.symbolize_keys
68
68
  end
69
69
 
data/app/controllers/completion_kit/api/v1/metric_versions_controller.rb CHANGED
@@ -14,13 +14,8 @@ module CompletionKit
14
14
  end
15
15
 
16
16
  def publish
17
- if @version.published? && !@version.current?
18
- audit = @version.revert!
19
- render json: audit
20
- else
21
- @version.publish!
22
- render json: @version.reload
23
- end
17
+ @version.publish!
18
+ render json: @version.reload
24
19
  end
25
20
 
26
21
  def destroy
data/app/controllers/completion_kit/api/v1/metrics_controller.rb CHANGED
@@ -37,7 +37,7 @@ module CompletionKit
37
37
  end
38
38
 
39
39
  def suggest_variants
40
- disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
40
+ disagreement_count = Agreement.where(metric_id: @metric.id, verdict: "disagree").count
41
41
  if disagreement_count.zero?
42
42
  render_error("Mark at least one case as Disagree before asking the model to suggest a change.", status: :unprocessable_entity)
43
43
  return
data/app/controllers/completion_kit/metrics_controller.rb CHANGED
@@ -39,9 +39,9 @@ module CompletionKit
39
39
  def show
40
40
  @edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
41
41
  @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
42
- @improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
42
+ @improve_disagreement_count = Agreement.where(metric_id: @metric.id, verdict: "disagree").count
43
43
  @versions = MetricVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
44
- @guiding_examples = CompletionKit.config.judge_examples_from_reviews ? MetricCalibrationExamples.judge_examples_for(@metric) : []
44
+ @guiding_examples = CompletionKit.config.judge_examples_from_reviews ? MetricAgreementExamples.judge_examples_for(@metric) : []
45
45
  end
46
46
 
47
47
  def new
@@ -52,7 +52,7 @@ module CompletionKit
52
52
  @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
53
53
  @edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
54
54
  @published_metric_version = MetricVersion.published.where(metric_id: @metric.id, current: true).first
55
- @improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
55
+ @improve_disagreement_count = Agreement.where(metric_id: @metric.id, verdict: "disagree").count
56
56
 
57
57
  if @edit_draft
58
58
  @metric.instruction = @edit_draft.instruction
@@ -117,26 +117,22 @@ module CompletionKit
117
117
 
118
118
  def suggest_variants
119
119
  target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
120
- disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
121
- if disagreement_count.zero?
120
+ counts = Agreement.where(metric_id: @metric.id, verdict: %w[agree disagree]).group(:verdict).count
121
+ if counts["disagree"].to_i.zero?
122
122
  redirect_to target, alert: "Mark at least one case as Disagree before asking the model to suggest a change."
123
123
  return
124
124
  end
125
125
 
126
- MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
126
+ MetricSuggestionJob.perform_later(@metric.id)
127
127
 
128
- generator = MetricVariantGenerator.new(@metric, count: 1)
129
- variants = generator.call
130
- if variants.empty?
131
- redirect_to target, alert: "The model returned no usable variants. Try again with a different model."
132
- return
133
- end
134
- versions = generator.persist!(variants)
135
- new_version = versions.max_by(&:version_number)
136
128
  if params[:back_to] == "edit"
137
- redirect_to edit_metric_path(@metric), notice: "Drafted #{new_version.version_label} from your reviews. Review the proposed changes below, then Publish to use it."
129
+ redirect_to metric_path(@metric), notice: "Drafting a change from your reviews. It will appear here once it's tested."
138
130
  else
139
- redirect_to metric_path(@metric, show_change: new_version.id), notice: "Drafted #{new_version.version_label} from your reviews."
131
+ render turbo_stream: turbo_stream.replace(
132
+ "ck-suggestion-status-#{@metric.id}",
133
+ partial: "completion_kit/metrics/suggestion_pending",
134
+ locals: { metric: @metric, count: counts.values.sum }
135
+ )
140
136
  end
141
137
  end
142
138
 
@@ -149,12 +145,12 @@ module CompletionKit
149
145
  end
150
146
 
151
147
  def exclude_example
152
- calibration = Calibration.where(metric_id: @metric.id).find(params[:calibration_id])
153
- calibration.update!(excluded_from_examples: true)
148
+ agreement = Agreement.where(metric_id: @metric.id).find(params[:agreement_id])
149
+ agreement.update!(excluded_from_examples: true)
154
150
  render turbo_stream: turbo_stream.replace(
155
151
  "ck-guiding-#{@metric.id}",
156
152
  partial: "completion_kit/metrics/guiding_examples",
157
- locals: { metric: @metric, examples: MetricCalibrationExamples.judge_examples_for(@metric) }
153
+ locals: { metric: @metric, examples: MetricAgreementExamples.judge_examples_for(@metric) }
158
154
  )
159
155
  end
160
156
 
@@ -175,13 +171,12 @@ module CompletionKit
175
171
  reverting = was_published_already && !version.current?
176
172
  previously_current = MetricVersion.current.find_by(metric_id: @metric.id)
177
173
 
174
+ version.publish!
175
+
178
176
  if reverting
179
- audit = version.revert!
180
- prior_label = previously_current.version_label
181
177
  redirect_to metric_path(@metric),
182
- notice: "Reverted #{@metric.name} to #{version.version_label} (logged as #{audit.version_label}). Human reviews collected against #{prior_label} stay tied to it."
178
+ notice: "#{@metric.name} is back on #{version.version_label}. Its reviews count again; the ones you gave on #{previously_current.version_label} stay with that version."
183
179
  else
184
- version.publish!
185
180
  redirect_to metric_path(@metric),
186
181
  notice: "#{@metric.name} #{version.version_label} is now the published version."
187
182
  end
data/app/jobs/completion_kit/judge_review_job.rb CHANGED
@@ -82,10 +82,10 @@ module CompletionKit
82
82
  private
83
83
 
84
84
  def review_examples_for(metric, response)
85
- return nil unless CompletionKit.config.judge_calibration_enabled
85
+ return nil unless CompletionKit.config.judge_agreement_enabled
86
86
  return nil unless CompletionKit.config.judge_examples_from_reviews
87
87
 
88
- MetricCalibrationExamples.judge_examples_for(metric, exclude_response_id: response.id)
88
+ MetricAgreementExamples.judge_examples_for(metric, exclude_response_id: response.id)
89
89
  end
90
90
 
91
91
  def confirm_judging_capability(judge_model_id)