completion-kit 0.5.36 → 0.5.37
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/stylesheets/completion_kit/application.css.erb +207 -19
- data/app/controllers/completion_kit/metrics_controller.rb +52 -1
- data/app/models/completion_kit/judge_version.rb +17 -1
- data/app/models/completion_kit/metric.rb +19 -0
- data/app/services/completion_kit/calibration_math.rb +84 -0
- data/app/services/completion_kit/judge_variant_generator.rb +108 -0
- data/app/services/completion_kit/mcp_dispatcher.rb +3 -1
- data/app/services/completion_kit/mcp_tools/judges.rb +138 -0
- data/app/services/completion_kit/metric_calibration_stats.rb +99 -0
- data/app/views/completion_kit/calibrations/_buttons.html.erb +5 -4
- data/app/views/completion_kit/calibrations/_trust_panel.html.erb +34 -0
- data/app/views/completion_kit/metrics/show.html.erb +141 -0
- data/config/routes.rb +7 -1
- data/db/migrate/20260523000001_add_few_shot_examples_to_completion_kit_metrics.rb +5 -0
- data/db/migrate/20260523000002_add_state_to_completion_kit_judge_versions.rb +15 -0
- data/lib/completion_kit/version.rb +1 -1
- metadata +8 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 1ccc7d1feb86aed6af17569a642d8b8e81fe522f0a7c68ca4ebb34abc113dbce
|
|
4
|
+
data.tar.gz: 88793eabe6b04c3497c761cde5b61511623c5a9844ce7e101560f8eb3b492e18
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: d133a9d0db55ee41eb07e290b9657e044c8a0836806bbd055d0b7b6d1cf8b981056b40e3a8795a951e36e7d7dbcc7626e249c2a2f4cba9492fad38aa931b6bfc
|
|
7
|
+
data.tar.gz: 71bbbe827f33648b12f121c949af74fe8d02702d44c70fd39beb4795c8f95d2b9941aa755d631a78029e0b412cdec7ca9e2bea107d1395ea001abe46dcfddf3f
|
|
@@ -5123,53 +5123,241 @@ a.tag-mark {
|
|
|
5123
5123
|
.ck-calibration {
|
|
5124
5124
|
margin-top: 12px;
|
|
5125
5125
|
padding-top: 12px;
|
|
5126
|
-
border-top: 1px dashed
|
|
5126
|
+
border-top: 1px dashed var(--ck-line);
|
|
5127
5127
|
}
|
|
5128
5128
|
.ck-calibration__prompt {
|
|
5129
|
-
font-
|
|
5129
|
+
font-family: var(--ck-mono);
|
|
5130
|
+
font-size: 0.72rem;
|
|
5131
|
+
letter-spacing: 0.06em;
|
|
5132
|
+
text-transform: uppercase;
|
|
5130
5133
|
color: var(--ck-dim);
|
|
5131
|
-
margin: 0 0
|
|
5134
|
+
margin: 0 0 10px;
|
|
5132
5135
|
display: flex;
|
|
5133
5136
|
align-items: center;
|
|
5134
|
-
gap:
|
|
5137
|
+
gap: 10px;
|
|
5135
5138
|
}
|
|
5136
5139
|
.ck-calibration__count {
|
|
5137
|
-
font-
|
|
5140
|
+
font-family: var(--ck-mono);
|
|
5141
|
+
font-size: 0.72rem;
|
|
5142
|
+
letter-spacing: 0.03em;
|
|
5138
5143
|
color: var(--ck-accent);
|
|
5144
|
+
text-transform: none;
|
|
5139
5145
|
}
|
|
5140
5146
|
.ck-calibration__buttons {
|
|
5141
5147
|
display: flex;
|
|
5142
|
-
gap:
|
|
5148
|
+
gap: 6px;
|
|
5143
5149
|
flex-wrap: wrap;
|
|
5144
5150
|
}
|
|
5145
5151
|
.ck-calibration__pill {
|
|
5146
5152
|
display: inline-flex;
|
|
5147
5153
|
align-items: center;
|
|
5148
|
-
gap:
|
|
5149
|
-
padding:
|
|
5150
|
-
border-radius:
|
|
5151
|
-
font-
|
|
5152
|
-
|
|
5153
|
-
|
|
5154
|
-
|
|
5154
|
+
gap: 0.4rem;
|
|
5155
|
+
padding: 0.32rem 0.65rem;
|
|
5156
|
+
border-radius: 4px;
|
|
5157
|
+
font-family: var(--ck-mono);
|
|
5158
|
+
font-size: 0.78rem;
|
|
5159
|
+
font-weight: 500;
|
|
5160
|
+
letter-spacing: 0.04em;
|
|
5161
|
+
text-transform: uppercase;
|
|
5162
|
+
background: var(--ck-surface-soft);
|
|
5163
|
+
border: 1px solid var(--ck-line);
|
|
5164
|
+
color: var(--ck-dim);
|
|
5155
5165
|
cursor: pointer;
|
|
5166
|
+
transition: background 0.12s, border-color 0.12s, color 0.12s;
|
|
5167
|
+
}
|
|
5168
|
+
.ck-calibration__pill svg {
|
|
5169
|
+
width: 14px;
|
|
5170
|
+
height: 14px;
|
|
5156
5171
|
}
|
|
5157
5172
|
.ck-calibration__pill:hover,
|
|
5158
5173
|
.ck-calibration__pill:focus-visible {
|
|
5159
|
-
|
|
5174
|
+
color: var(--ck-text);
|
|
5175
|
+
border-color: var(--ck-dim);
|
|
5160
5176
|
}
|
|
5161
|
-
.ck-calibration__pill.is-active {
|
|
5162
|
-
background: var(--ck-
|
|
5163
|
-
color:
|
|
5164
|
-
|
|
5177
|
+
.ck-calibration__pill--agree.is-active {
|
|
5178
|
+
background: var(--ck-success-soft);
|
|
5179
|
+
border-color: rgba(45, 212, 168, 0.35);
|
|
5180
|
+
color: var(--ck-success);
|
|
5165
5181
|
}
|
|
5182
|
+
.ck-calibration__pill--disagree.is-active {
|
|
5183
|
+
background: var(--ck-danger-soft);
|
|
5184
|
+
border-color: rgba(248, 113, 113, 0.35);
|
|
5185
|
+
color: var(--ck-danger);
|
|
5186
|
+
}
|
|
5187
|
+
.ck-calibration__pill--borderline.is-active {
|
|
5188
|
+
background: var(--ck-warning-soft);
|
|
5189
|
+
border-color: rgba(224, 164, 88, 0.35);
|
|
5190
|
+
color: var(--ck-warning);
|
|
5191
|
+
}
|
|
5192
|
+
.ck-calibration__pill--agree:hover { border-color: rgba(45, 212, 168, 0.45); color: var(--ck-success); }
|
|
5193
|
+
.ck-calibration__pill--disagree:hover { border-color: rgba(248, 113, 113, 0.45); color: var(--ck-danger); }
|
|
5194
|
+
.ck-calibration__pill--borderline:hover { border-color: rgba(224, 164, 88, 0.45); color: var(--ck-warning); }
|
|
5166
5195
|
.ck-calibration__detail {
|
|
5167
|
-
margin-top:
|
|
5196
|
+
margin-top: 12px;
|
|
5168
5197
|
display: flex;
|
|
5169
5198
|
flex-direction: column;
|
|
5170
5199
|
gap: 8px;
|
|
5200
|
+
padding: 12px;
|
|
5201
|
+
background: var(--ck-surface-soft);
|
|
5202
|
+
border: 1px solid var(--ck-line);
|
|
5203
|
+
border-radius: 6px;
|
|
5171
5204
|
}
|
|
5172
5205
|
.ck-calibration__value {
|
|
5173
5206
|
color: var(--ck-accent);
|
|
5207
|
+
font-family: var(--ck-mono);
|
|
5208
|
+
font-weight: 600;
|
|
5209
|
+
}
|
|
5210
|
+
|
|
5211
|
+
.ck-trust-panel {
|
|
5212
|
+
display: inline-flex;
|
|
5213
|
+
flex-direction: column;
|
|
5214
|
+
gap: 6px;
|
|
5215
|
+
margin-top: 12px;
|
|
5216
|
+
padding: 10px 14px;
|
|
5217
|
+
background: var(--ck-surface-soft);
|
|
5218
|
+
border: 1px solid var(--ck-line);
|
|
5219
|
+
border-radius: 6px;
|
|
5220
|
+
}
|
|
5221
|
+
.ck-trust-panel__label {
|
|
5222
|
+
margin: 0;
|
|
5223
|
+
font-family: var(--ck-mono);
|
|
5224
|
+
font-size: 0.7rem;
|
|
5225
|
+
letter-spacing: 0.08em;
|
|
5226
|
+
text-transform: uppercase;
|
|
5227
|
+
color: var(--ck-dim);
|
|
5228
|
+
}
|
|
5229
|
+
.ck-trust-panel__body {
|
|
5230
|
+
display: flex;
|
|
5231
|
+
align-items: baseline;
|
|
5232
|
+
gap: 10px;
|
|
5233
|
+
}
|
|
5234
|
+
.ck-trust-panel__counter {
|
|
5235
|
+
font-family: var(--ck-mono);
|
|
5236
|
+
font-size: 1.6rem;
|
|
5237
|
+
font-weight: 600;
|
|
5238
|
+
color: var(--ck-accent);
|
|
5239
|
+
}
|
|
5240
|
+
.ck-trust-panel__counter-of {
|
|
5241
|
+
font-size: 0.9rem;
|
|
5242
|
+
color: var(--ck-dim);
|
|
5243
|
+
margin-left: 4px;
|
|
5244
|
+
}
|
|
5245
|
+
.ck-trust-panel__hint {
|
|
5246
|
+
font-family: var(--ck-mono);
|
|
5247
|
+
font-size: 0.72rem;
|
|
5248
|
+
color: var(--ck-dim);
|
|
5249
|
+
letter-spacing: 0.04em;
|
|
5250
|
+
}
|
|
5251
|
+
.ck-trust-panel__score {
|
|
5252
|
+
font-family: var(--ck-mono);
|
|
5253
|
+
font-size: 1.6rem;
|
|
5174
5254
|
font-weight: 600;
|
|
5255
|
+
color: var(--ck-success);
|
|
5256
|
+
}
|
|
5257
|
+
.ck-trust-panel__score-pct {
|
|
5258
|
+
font-size: 0.9rem;
|
|
5259
|
+
color: var(--ck-dim);
|
|
5260
|
+
margin-left: 2px;
|
|
5261
|
+
}
|
|
5262
|
+
.ck-trust-panel__margin {
|
|
5263
|
+
font-family: var(--ck-mono);
|
|
5264
|
+
font-size: 0.8rem;
|
|
5265
|
+
color: var(--ck-dim);
|
|
5266
|
+
}
|
|
5267
|
+
.ck-trust-panel__gate {
|
|
5268
|
+
font-family: var(--ck-mono);
|
|
5269
|
+
font-size: 0.66rem;
|
|
5270
|
+
letter-spacing: 0.08em;
|
|
5271
|
+
text-transform: uppercase;
|
|
5272
|
+
padding: 2px 6px;
|
|
5273
|
+
border-radius: 3px;
|
|
5274
|
+
background: var(--ck-surface);
|
|
5275
|
+
border: 1px solid var(--ck-line);
|
|
5276
|
+
color: var(--ck-dim);
|
|
5277
|
+
}
|
|
5278
|
+
.ck-trust-panel--firm .ck-trust-panel__gate {
|
|
5279
|
+
color: var(--ck-success);
|
|
5280
|
+
border-color: rgba(45, 212, 168, 0.35);
|
|
5281
|
+
}
|
|
5282
|
+
.ck-trust-panel__details {
|
|
5283
|
+
display: flex;
|
|
5284
|
+
flex-wrap: wrap;
|
|
5285
|
+
gap: 14px;
|
|
5286
|
+
font-family: var(--ck-mono);
|
|
5287
|
+
font-size: 0.72rem;
|
|
5288
|
+
color: var(--ck-dim);
|
|
5289
|
+
}
|
|
5290
|
+
.ck-trust-panel__borderline {
|
|
5291
|
+
color: var(--ck-warning);
|
|
5292
|
+
}
|
|
5293
|
+
|
|
5294
|
+
.ck-trust-panel__borderline--ok { color: var(--ck-dim); }
|
|
5295
|
+
.ck-trust-panel__borderline--warning { color: var(--ck-warning); }
|
|
5296
|
+
.ck-trust-panel__borderline--danger { color: var(--ck-danger); }
|
|
5297
|
+
|
|
5298
|
+
.ck-disagreements-table td .ck-meta-copy {
|
|
5299
|
+
font-size: 0.78rem;
|
|
5300
|
+
}
|
|
5301
|
+
.ck-few-shot-list {
|
|
5302
|
+
list-style: decimal;
|
|
5303
|
+
padding-left: 1.4rem;
|
|
5304
|
+
margin: 0;
|
|
5305
|
+
display: flex;
|
|
5306
|
+
flex-direction: column;
|
|
5307
|
+
gap: 12px;
|
|
5308
|
+
}
|
|
5309
|
+
.ck-few-shot-item {
|
|
5310
|
+
padding: 10px 12px;
|
|
5311
|
+
background: var(--ck-surface-soft);
|
|
5312
|
+
border: 1px solid var(--ck-line);
|
|
5313
|
+
border-radius: 6px;
|
|
5314
|
+
}
|
|
5315
|
+
.ck-few-shot-item__scores {
|
|
5316
|
+
display: flex;
|
|
5317
|
+
align-items: center;
|
|
5318
|
+
gap: 8px;
|
|
5319
|
+
font-family: var(--ck-mono);
|
|
5320
|
+
font-size: 0.75rem;
|
|
5321
|
+
letter-spacing: 0.04em;
|
|
5322
|
+
text-transform: uppercase;
|
|
5323
|
+
}
|
|
5324
|
+
|
|
5325
|
+
.ck-draft-banner {
|
|
5326
|
+
display: inline-flex;
|
|
5327
|
+
align-items: center;
|
|
5328
|
+
gap: 10px;
|
|
5329
|
+
margin-top: 10px;
|
|
5330
|
+
padding: 8px 12px;
|
|
5331
|
+
background: var(--ck-accent-soft);
|
|
5332
|
+
border: 1px dashed rgba(6, 182, 212, 0.4);
|
|
5333
|
+
border-radius: 6px;
|
|
5334
|
+
}
|
|
5335
|
+
|
|
5336
|
+
.ck-suggestion-list {
|
|
5337
|
+
display: flex;
|
|
5338
|
+
flex-direction: column;
|
|
5339
|
+
gap: 12px;
|
|
5340
|
+
}
|
|
5341
|
+
.ck-suggestion-card {
|
|
5342
|
+
padding: 12px 14px;
|
|
5343
|
+
background: var(--ck-surface-soft);
|
|
5344
|
+
border: 1px solid var(--ck-line);
|
|
5345
|
+
border-radius: 6px;
|
|
5346
|
+
display: flex;
|
|
5347
|
+
flex-direction: column;
|
|
5348
|
+
gap: 10px;
|
|
5349
|
+
}
|
|
5350
|
+
.ck-suggestion-card__header {
|
|
5351
|
+
display: flex;
|
|
5352
|
+
align-items: center;
|
|
5353
|
+
gap: 10px;
|
|
5354
|
+
}
|
|
5355
|
+
.ck-suggestion-card__instruction {
|
|
5356
|
+
margin: 0;
|
|
5357
|
+
white-space: pre-wrap;
|
|
5358
|
+
font-size: 0.85rem;
|
|
5359
|
+
background: var(--ck-bg-strong);
|
|
5360
|
+
padding: 10px 12px;
|
|
5361
|
+
border-radius: 4px;
|
|
5362
|
+
border: 1px solid var(--ck-line);
|
|
5175
5363
|
}
|
|
@@ -1,13 +1,19 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
2
|
class MetricsController < ApplicationController
|
|
3
3
|
include CompletionKit::TagFiltering
|
|
4
|
-
before_action :set_metric, only: [:show, :edit, :update, :destroy]
|
|
4
|
+
before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :publish_draft, :suggest_variants]
|
|
5
5
|
|
|
6
6
|
# Lists metrics ordered by name, narrowed by the tag filter from TagFiltering.
def index
  ordered_metrics = Metric.includes(:metric_groups, :tags).order(:name)
  @metrics = apply_tag_filter(ordered_metrics)
end
|
|
9
9
|
|
|
10
10
|
# Loads the data for the metric page: the 50 most recent "disagree"
# calibrations (with their response, reviews and run preloaded), the newest
# draft judge version, and every draft whose source is "suggestion".
def show
  disagreeing = Calibration.where(metric_id: @metric.id, verdict: "disagree")
  @disagreements = disagreeing
    .includes(response: [:reviews, :run])
    .order(created_at: :desc)
    .limit(50)

  metric_drafts = JudgeVersion.drafts.where(metric_id: @metric.id)
  @latest_draft = metric_drafts.order(created_at: :desc).first
  @suggestion_drafts = metric_drafts.where(source: "suggestion").order(created_at: :desc)
end
|
|
12
18
|
|
|
13
19
|
def new
|
|
@@ -40,6 +46,51 @@ module CompletionKit
|
|
|
40
46
|
redirect_to metrics_path, notice: "Metric was successfully destroyed."
|
|
41
47
|
end
|
|
42
48
|
|
|
49
|
+
# Asks JudgeVariantGenerator for rewritten judge instructions and stores them
# as drafts. Redirects back to the metric page with an alert when nothing
# usable came back, otherwise with a notice stating how many were generated.
def suggest_variants
  generator = JudgeVariantGenerator.new(@metric)
  variants = generator.call

  if variants.empty?
    redirect_to metric_path(@metric), alert: "The model returned no usable variants. Try again with a different model."
  else
    generator.persist!(variants)
    noun = variants.length == 1 ? "judge variant" : "judge variants"
    redirect_to metric_path(@metric), notice: "Generated #{variants.length} #{noun} as drafts. Pick one to publish."
  end
end
|
|
60
|
+
|
|
61
|
+
# Promotes a draft judge version of this metric to the published, current one.
#
# When params[:judge_version_id] is present, that specific draft (scoped to
# this metric) is published; otherwise the most recently created draft is
# used, as before. suggest_variants tells the user to "Pick one to publish",
# which requires being able to name the draft.
# NOTE(review): the param name judge_version_id is introduced here; confirm
# the publish form/route sends it for per-draft buttons.
#
# Redirects with an alert when no matching draft exists.
def publish_draft
  drafts = JudgeVersion.drafts.where(metric_id: @metric.id)
  draft =
    if params[:judge_version_id].present?
      drafts.find_by(id: params[:judge_version_id])
    else
      # id breaks ties between drafts created in the same instant.
      drafts.order(created_at: :desc, id: :desc).first
    end

  if draft.nil?
    redirect_to metric_path(@metric), alert: "No draft to publish."
    return
  end

  # Demote the previously published versions and promote the draft atomically.
  JudgeVersion.transaction do
    JudgeVersion.where(metric_id: @metric.id, state: "published").update_all(current: false)
    draft.update!(state: "published", current: true)
  end

  redirect_to metric_path(@metric), notice: "Draft published as the current judge version."
end
|
|
75
|
+
|
|
76
|
+
# Appends a "disagree" calibration to the metric's few-shot examples.
#
# The calibration is looked up among this metric's disagreements, so an id
# belonging to another metric or verdict raises ActiveRecord::RecordNotFound.
# Each stored example carries its calibration_id; a calibration that is
# already present is not appended a second time, so repeating the request
# cannot duplicate an example.
def add_few_shot
  calibration = Calibration.where(metric_id: @metric.id, verdict: "disagree").find(params[:calibration_id])
  examples = Array(@metric.few_shot_examples)

  already_added = examples.any? { |ex| ex.is_a?(Hash) && ex["calibration_id"] == calibration.id }
  if already_added
    redirect_to metric_path(@metric), notice: "Already added as a judge few-shot."
    return
  end

  review = calibration.response.reviews.find_by(metric_id: @metric.id)
  example = {
    "input" => calibration.response.input_data.to_s.truncate(2000),
    "response" => calibration.response.response_text.to_s.truncate(2000),
    "judge_score" => review&.ai_score&.to_f,
    "judge_feedback" => review&.ai_feedback.to_s.truncate(1000),
    "human_score" => calibration.corrected_score&.to_f,
    "human_note" => calibration.note.to_s.truncate(1000),
    "calibration_id" => calibration.id,
    "added_at" => Time.current.utc.iso8601
  }

  # Assign a new array rather than mutating the loaded attribute in place.
  @metric.update!(few_shot_examples: examples + [example])
  redirect_to metric_path(@metric), notice: "Added as a judge few-shot."
end
|
|
93
|
+
|
|
43
94
|
private
|
|
44
95
|
|
|
45
96
|
def set_metric
|
|
@@ -1,23 +1,37 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
2
|
class JudgeVersion < ApplicationRecord
|
|
3
|
+
STATES = %w[draft published].freeze
|
|
4
|
+
|
|
3
5
|
belongs_to :metric
|
|
4
6
|
has_many :calibrations, dependent: :destroy
|
|
5
7
|
|
|
6
8
|
serialize :rubric_bands, coder: JSON
|
|
7
9
|
|
|
8
10
|
validates :metric_id, presence: true
|
|
11
|
+
validates :state, inclusion: { in: STATES }
|
|
9
12
|
|
|
10
13
|
scope :current, -> { where(current: true) }
|
|
14
|
+
scope :published, -> { where(state: "published") }
|
|
15
|
+
scope :drafts, -> { where(state: "draft") }
|
|
11
16
|
|
|
12
17
|
def self.ensure_current_for(metric)
|
|
13
18
|
current.find_by(metric_id: metric.id) || create!(
|
|
14
19
|
metric: metric,
|
|
15
20
|
instruction: metric.instruction,
|
|
16
21
|
rubric_bands: metric.rubric_bands,
|
|
17
|
-
current: true
|
|
22
|
+
current: true,
|
|
23
|
+
state: "published"
|
|
18
24
|
)
|
|
19
25
|
end
|
|
20
26
|
|
|
27
|
+
# True when this version's state is "draft".
def draft?
  "draft" == state
end
|
|
30
|
+
|
|
31
|
+
# True when this version's state is "published".
def published?
  "published" == state
end
|
|
34
|
+
|
|
21
35
|
def as_json(options = {})
|
|
22
36
|
{
|
|
23
37
|
id: id,
|
|
@@ -25,6 +39,8 @@ module CompletionKit
|
|
|
25
39
|
instruction: instruction,
|
|
26
40
|
rubric_bands: rubric_bands,
|
|
27
41
|
current: current,
|
|
42
|
+
state: state,
|
|
43
|
+
source: source,
|
|
28
44
|
created_at: created_at
|
|
29
45
|
}
|
|
30
46
|
end
|
|
@@ -16,6 +16,7 @@ module CompletionKit
|
|
|
16
16
|
has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
|
|
17
17
|
|
|
18
18
|
serialize :rubric_bands, coder: JSON
|
|
19
|
+
serialize :few_shot_examples, coder: JSON, type: Array
|
|
19
20
|
|
|
20
21
|
validates :name, presence: true
|
|
21
22
|
validates :key, tenant_scoped_uniqueness: { allow_nil: true }
|
|
@@ -23,6 +24,7 @@ module CompletionKit
|
|
|
23
24
|
before_validation :generate_key
|
|
24
25
|
before_validation :normalize_rubric_bands
|
|
25
26
|
before_validation :set_defaults
|
|
27
|
+
after_update :fork_draft_judge_version, if: :judge_relevant_changes?
|
|
26
28
|
|
|
27
29
|
def self.default_rubric_bands
|
|
28
30
|
DEFAULT_RUBRIC_BANDS.map(&:dup)
|
|
@@ -95,5 +97,22 @@ module CompletionKit
|
|
|
95
97
|
# before_validation hook: runs present rubric bands through the class-level
# normalizer; blank bands are left untouched.
def normalize_rubric_bands
  return unless rubric_bands.present?

  self.rubric_bands = self.class.normalize_rubric_bands(rubric_bands)
end
|
|
100
|
+
|
|
101
|
+
# Whether the last save changed a field that is copied into judge versions
# (the instruction or the rubric bands).
def judge_relevant_changes?
  return true if saved_change_to_instruction?

  saved_change_to_rubric_bands?
end
|
|
104
|
+
|
|
105
|
+
# after_update hook (guarded by judge_relevant_changes?): makes sure a current
# judge version exists, clears the current flag on this metric's drafts, then
# records the edited instruction and rubric as a new non-current draft with
# source "edit".
def fork_draft_judge_version
  JudgeVersion.ensure_current_for(self)
  JudgeVersion.where(metric_id: id, state: "draft").update_all(current: false)

  draft_attributes = {
    metric: self,
    instruction: instruction,
    rubric_bands: rubric_bands,
    current: false,
    state: "draft",
    source: "edit"
  }
  JudgeVersion.create!(draft_attributes)
end
|
|
98
117
|
end
|
|
99
118
|
end
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
module CompletionKit
|
|
2
|
+
# Pure statistics helpers for comparing judge (AI) scores with human scores.
# Every method is a module function and works on plain numbers / arrays of
# [ai, human] pairs.
module CalibrationMath
  # Default z-score for wilson_interval (two-sided 95% normal quantile).
  Z_95 = 1.959963984540054

  module_function

  # Wilson score interval for a binomial proportion.
  #
  # successes - number of successes; clamped into 0..n so an inconsistent
  #             count cannot push the proportion outside [0, 1] (which would
  #             make the square root below raise Math::DomainError).
  # n         - number of trials; zero or negative yields all-nil values.
  # z         - z-score for the desired confidence level.
  #
  # Returns { point:, low:, high: } with low/high clipped to [0.0, 1.0].
  def wilson_interval(successes:, n:, z: Z_95)
    return { point: nil, low: nil, high: nil } if n.to_i <= 0

    hits = successes.to_f.clamp(0.0, n.to_f)
    p_hat = hits / n
    denom = 1.0 + (z * z) / n
    center = (p_hat + (z * z) / (2.0 * n)) / denom
    margin = z * Math.sqrt((p_hat * (1 - p_hat) / n) + ((z * z) / (4.0 * n * n))) / denom

    { point: p_hat, low: [center - margin, 0.0].max, high: [center + margin, 1.0].min }
  end

  # Mean absolute error over [ai, human] pairs; nil for an empty list.
  # Values are coerced with to_f, so a nil score counts as 0.0.
  def mae(pairs)
    return nil if pairs.empty?
    sum = pairs.sum { |ai, human| (ai.to_f - human.to_f).abs }
    sum / pairs.length
  end

  # Pearson correlation over [ai, human] pairs.
  # Returns nil with fewer than two pairs or when either side has zero
  # variance (the correlation is undefined there).
  def pearson(pairs)
    return nil if pairs.length < 2
    xs = pairs.map { |a, _| a.to_f }
    ys = pairs.map { |_, h| h.to_f }
    mx = xs.sum / xs.length
    my = ys.sum / ys.length
    num = xs.zip(ys).sum { |x, y| (x - mx) * (y - my) }
    dx2 = xs.sum { |x| (x - mx)**2 }
    dy2 = ys.sum { |y| (y - my)**2 }
    denom = Math.sqrt(dx2 * dy2)
    return nil if denom.zero?
    num / denom
  end

  # Quadratic-weighted Cohen's kappa over [ai, human] pairs.
  #
  # categories - the ordered rating scale (e.g. 1..5). Scores are rounded and
  #              clamped onto the scale by score_bucket; a pair whose bucket
  #              is not one of the categories is skipped.
  #
  # Returns nil for no pairs, fewer than two categories, or no usable pairs,
  # and 1.0 when the expected weighted disagreement is zero.
  def quadratic_weighted_kappa(pairs, categories:)
    return nil if pairs.empty?

    ratings = categories.to_a
    k = ratings.length
    return nil if k < 2

    index = ratings.each_with_index.to_h
    observed = Array.new(k) { Array.new(k, 0) }
    row_totals = Array.new(k, 0)
    col_totals = Array.new(k, 0)
    n = 0

    # Build the k x k confusion matrix and its marginals.
    pairs.each do |ai, human|
      i = index[score_bucket(ai, ratings)]
      j = index[score_bucket(human, ratings)]
      next if i.nil? || j.nil?
      observed[i][j] += 1
      row_totals[i] += 1
      col_totals[j] += 1
      n += 1
    end
    return nil if n.zero?

    # Weight each cell by squared category distance, normalised to [0, 1].
    max_dist_sq = (k - 1.0)**2
    numerator = 0.0
    denominator = 0.0
    (0...k).each do |i|
      (0...k).each do |j|
        weight = ((i - j)**2) / max_dist_sq
        expected = (row_totals[i] * col_totals[j]).to_f / n
        numerator += weight * observed[i][j]
        denominator += weight * expected
      end
    end
    return 1.0 if denominator.zero?
    1.0 - (numerator / denominator)
  end

  # Rounds a score to the nearest integer and clamps it to the first/last
  # rating of the scale.
  def score_bucket(value, ratings)
    rounded = value.to_f.round
    return ratings.first if rounded <= ratings.first
    return ratings.last if rounded >= ratings.last
    rounded
  end
end
|
|
84
|
+
end
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
module CompletionKit
|
|
2
|
+
# Asks an LLM to propose rewritten judge instructions for a metric, based on
# recent human/judge disagreements, and stores the proposals as draft
# JudgeVersion rows with source "suggestion".
class JudgeVariantGenerator
  DEFAULT_VARIANT_COUNT = 3
  DEFAULT_TEMPERATURE = 0.4

  # One parsed proposal: a short rationale plus the rewritten instruction.
  Variant = Struct.new(:reasoning, :instruction, keyword_init: true)

  # metric - the metric whose judge instruction should be rewritten.
  # count  - maximum number of variants to request and return.
  # model  - model override; nil or a blank string falls back to
  #          CompletionKit.config.judge_model.
  def initialize(metric, count: DEFAULT_VARIANT_COUNT, model: nil)
    @metric = metric
    @count = count
    @model = model.presence || CompletionKit.config.judge_model
  end

  # Sends the meta prompt to the model and returns up to @count Variants.
  # Returns an empty array when the reply has no parseable variant.
  def call
    client = LlmClient.for_model(@model, ApiConfig.for_model(@model))
    raw = client.generate_completion(build_meta_prompt, model: @model, max_tokens: 2500, temperature: DEFAULT_TEMPERATURE)
    parse(raw).first(@count)
  end

  # Saves each variant as a non-current draft JudgeVersion and returns the
  # created records. The writes run in one transaction so a failing create!
  # cannot leave only some of the variants persisted. The
  # "completion_kit.judge_suggestion.generated" notification is emitted after
  # the transaction block has finished.
  def persist!(variants)
    versions = JudgeVersion.transaction do
      JudgeVersion.where(metric_id: @metric.id, state: "draft", source: "suggestion").update_all(current: false)
      variants.map do |variant|
        JudgeVersion.create!(
          metric: @metric,
          instruction: variant.instruction,
          rubric_bands: @metric.rubric_bands,
          state: "draft",
          source: "suggestion",
          current: false
        )
      end
    end
    ActiveSupport::Notifications.instrument("completion_kit.judge_suggestion.generated",
      metric_id: @metric.id,
      count: versions.length,
      model: @model)
    versions
  end

  private

  # Builds the prompt: current instruction, rubric, the recent disagreement
  # cases (each field truncated), and the exact reply format parse expects.
  def build_meta_prompt
    examples = JudgeCalibrationExamples.for(@metric)
    sections = []
    sections << "You are an expert evaluator. Rewrite a judge's grading instruction so it agrees better with humans on the cases below."
    sections << ""
    sections << "## Current instruction"
    sections << "```"
    sections << @metric.instruction.to_s
    sections << "```"
    sections << ""
    sections << "## Rubric (unchanged across variants — only rewrite the instruction)"
    sections << @metric.display_rubric_text
    sections << ""
    sections << "## Recent disagreements (judge vs human)"
    examples.each_with_index do |ex, i|
      sections << "### Case #{i + 1}"
      sections << "Input: #{ex[:input].to_s.truncate(200)}"
      sections << "Output: #{ex[:output].to_s.truncate(200)}"
      sections << "Judge said #{ex[:judge_score]}/5: #{ex[:judge_feedback].to_s.truncate(160)}"
      sections << "Human said #{ex[:human_score]}/5: #{ex[:human_note].to_s.truncate(160)}"
      sections << ""
    end
    sections << "## Task"
    sections << "Propose #{@count} alternative instructions. Each should be a focused rewrite — not a wholesale rewrite of the rubric. Aim to close the disagreement gap."
    sections << ""
    sections << "Respond in EXACTLY this format, repeated #{@count} times:"
    sections << ""
    sections << "VARIANT:"
    sections << "REASONING: <one sentence explaining what this variant changes>"
    sections << "INSTRUCTION:"
    sections << "<the rewritten instruction>"
    sections << "END_VARIANT"
    sections.join("\n")
  end

  # Extracts every VARIANT: ... END_VARIANT block from the reply and turns it
  # into a Variant. Blocks without a non-empty INSTRUCTION are dropped.
  def parse(text)
    blocks = text.to_s.scan(/VARIANT:(.*?)END_VARIANT/m).flatten
    blocks.filter_map do |raw|
      reasoning = raw[/REASONING:\s*(.*?)(?=INSTRUCTION:|\z)/m, 1].to_s.strip
      instruction = raw[/INSTRUCTION:\s*(.*)/m, 1].to_s.strip
      next if instruction.empty?
      Variant.new(reasoning: reasoning, instruction: instruction)
    end
  end
end
|
|
86
|
+
|
|
87
|
+
# Collects recent human/judge disagreements for a metric as plain hashes.
module JudgeCalibrationExamples
  module_function

  # Returns up to `limit` of the newest "disagree" calibrations for `metric`.
  # Each entry holds the response input/output, the judge's score and feedback
  # (taken from the response's review for this metric, nil when there is
  # none) and the human's corrected score and note.
  def for(metric, limit: 8)
    recent = Calibration
      .where(metric_id: metric.id, verdict: "disagree")
      .includes(response: :reviews)
      .order(created_at: :desc)
      .limit(limit)

    recent.map do |calibration|
      response = calibration.response
      judge_review = response.reviews.find { |candidate| candidate.metric_id == metric.id }
      {
        input: response.input_data,
        output: response.response_text,
        judge_score: judge_review&.ai_score,
        judge_feedback: judge_review&.ai_feedback,
        human_score: calibration.corrected_score,
        human_note: calibration.note
      }
    end
  end
end
|
|
108
|
+
end
|
|
@@ -34,7 +34,8 @@ module CompletionKit
|
|
|
34
34
|
McpTools::MetricGroups.definitions +
|
|
35
35
|
McpTools::ProviderCredentials.definitions +
|
|
36
36
|
McpTools::Tags.definitions +
|
|
37
|
-
McpTools::Calibrations.definitions
|
|
37
|
+
McpTools::Calibrations.definitions +
|
|
38
|
+
McpTools::Judges.definitions
|
|
38
39
|
end
|
|
39
40
|
|
|
40
41
|
def self.call_tool(name, arguments)
|
|
@@ -48,6 +49,7 @@ module CompletionKit
|
|
|
48
49
|
when /\Aprovider_credentials_/ then McpTools::ProviderCredentials.call(name, arguments)
|
|
49
50
|
when /\Atags_/ then McpTools::Tags.call(name, arguments)
|
|
50
51
|
when /\Acalibrations_/ then McpTools::Calibrations.call(name, arguments)
|
|
52
|
+
when /\Ajudges_/ then McpTools::Judges.call(name, arguments)
|
|
51
53
|
else raise MethodNotFound, "Unknown tool: #{name}"
|
|
52
54
|
end
|
|
53
55
|
end
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
module CompletionKit
|
|
2
|
+
module McpTools
|
|
3
|
+
# MCP tools for working with judge versions: generating instruction variants,
# re-running the judge over a dataset, and comparing two versions' calibration
# statistics.
module Judges
  extend Base

  # Tool name => description, JSON input schema and handler method.
  TOOLS = {
    "judges_suggest" => {
      description: "Ask the model to rewrite the metric's judge instruction in N variants targeted at the recent disagreements. Each variant is saved as a draft JudgeVersion with source=\"suggestion\". Returns the persisted drafts. Stripe-metering hooks fire via ActiveSupport::Notifications under completion_kit.judge_suggestion.generated.",
      inputSchema: {
        type: "object",
        properties: {
          metric_id: { type: "integer" },
          count: { type: "integer", description: "How many variants to request (default 3, max 5)." },
          model: { type: "string", description: "Override the model used to generate variants. Defaults to CompletionKit.config.judge_model." }
        },
        required: ["metric_id"]
      },
      handler: :suggest
    },
    "judges_replay" => {
      description: "Run the current judge against a dataset (judge-only run). Wraps runs_create with prompt_id omitted and output_column supplied. Re-judges existing dataset outputs so you can compare against human verdicts.",
      inputSchema: {
        type: "object",
        properties: {
          name: { type: "string" },
          metric_id: { type: "integer" },
          dataset_id: { type: "integer" },
          judge_model: { type: "string" },
          output_column: { type: "string", description: "Dataset column with the existing outputs to grade. Defaults to actual_output." }
        },
        required: ["name", "metric_id", "dataset_id", "judge_model"]
      },
      handler: :replay
    },
    "judges_compare" => {
      # The schema requires all three ids and the handler looks both versions
      # up within the metric, so the description states exactly that.
      description: "Compare two judge versions' calibration stats side by side. Pass metric_id together with judge_version_a_id and judge_version_b_id; both versions must belong to that metric.",
      inputSchema: {
        type: "object",
        properties: {
          metric_id: { type: "integer" },
          judge_version_a_id: { type: "integer" },
          judge_version_b_id: { type: "integer" }
        },
        required: ["metric_id", "judge_version_a_id", "judge_version_b_id"]
      },
      handler: :compare
    }
  }.freeze

  # Generates and persists instruction variants for a metric.
  # "count" is capped at 5; a missing or non-positive count uses the
  # generator default. A blank "model" is treated as not given, so an empty
  # string is never forwarded as the model name.
  def self.suggest(args)
    metric = CompletionKit::Metric.find(args["metric_id"])
    count = [args["count"].to_i, 5].min
    count = CompletionKit::JudgeVariantGenerator::DEFAULT_VARIANT_COUNT if count <= 0
    generator = CompletionKit::JudgeVariantGenerator.new(metric, count: count, model: args["model"].presence)
    variants = generator.call
    return error_result("Variant generator returned no parseable variants. Try again or change the model.") if variants.empty?
    versions = generator.persist!(variants)
    text_result(versions.map(&:as_json))
  end

  # Creates a run over a dataset's existing outputs and attaches the metric.
  # Returns the run's JSON, or its validation errors when it cannot be saved.
  def self.replay(args)
    metric = CompletionKit::Metric.find(args["metric_id"])
    dataset = CompletionKit::Dataset.find(args["dataset_id"])
    run = CompletionKit::Run.new(
      name: args["name"],
      dataset: dataset,
      judge_model: args["judge_model"],
      output_column: args["output_column"].presence || "actual_output"
    )
    if run.save
      run.replace_metrics!([metric.id])
      text_result(run.reload.as_json)
    else
      error_result(run.errors.full_messages.join(", "))
    end
  end

  # Returns calibration stats for versions A and B of a metric, their deltas
  # (B minus A) and a recommendation. Both versions are looked up within the
  # metric, so an id from another metric raises RecordNotFound.
  def self.compare(args)
    metric = CompletionKit::Metric.find(args["metric_id"])
    a = CompletionKit::JudgeVersion.where(metric_id: metric.id).find(args["judge_version_a_id"])
    b = CompletionKit::JudgeVersion.where(metric_id: metric.id).find(args["judge_version_b_id"])
    stats_a = CompletionKit::MetricCalibrationStats.for(metric, judge_version: a)
    stats_b = CompletionKit::MetricCalibrationStats.for(metric, judge_version: b)
    text_result({
      metric_id: metric.id,
      a: judge_version_payload(a, stats_a),
      b: judge_version_payload(b, stats_b),
      delta: delta_payload(stats_a, stats_b),
      recommendation: recommendation_for(stats_a, stats_b)
    })
  end

  # Flattens one judge version and its stats into a response hash.
  def self.judge_version_payload(version, stats)
    {
      id: version.id, state: version.state, current: version.current,
      source: version.source, created_at: version.created_at,
      sample_size: stats.sample_size,
      agreement_point: stats.agreement_point,
      agreement_low: stats.agreement_low,
      agreement_high: stats.agreement_high,
      borderline_rate: stats.borderline_rate,
      mae: stats.mae, kappa: stats.kappa
    }
  end

  # Per-statistic A/B values with their difference, plus both sample sizes.
  def self.delta_payload(a, b)
    {
      agreement: pair_delta(a.agreement_point, b.agreement_point),
      mae: pair_delta(a.mae, b.mae),
      kappa: pair_delta(a.kappa, b.kappa),
      sample_size: { a: a.sample_size, b: b.sample_size }
    }
  end

  # { a:, b:, delta: } where delta is b - a, or nil when either side is nil.
  def self.pair_delta(a, b)
    { a: a, b: b, delta: (a.nil? || b.nil?) ? nil : (b - a) }
  end

  # Turns two stats objects into a recommendation hash.
  # Fewer than 30 verdicts combined gives "need_more_data"; a missing
  # agreement on either side gives "no_change"; otherwise B's agreement lift
  # over A decides: above +0.03 "recommend", below -0.03 "hold", else
  # "no_change".
  # NOTE(review): the gate uses the combined sample size, so one version may
  # still have very few verdicts when a lift is reported — confirm intended.
  def self.recommendation_for(a, b)
    total = a.sample_size + b.sample_size
    if total < 30
      { state: "need_more_data", reason: "Combined n=#{total}; need 30+ to make a call." }
    elsif a.agreement_point.nil? || b.agreement_point.nil?
      { state: "no_change", reason: "Not enough verdicts on one of the versions to compare." }
    else
      lift = b.agreement_point - a.agreement_point
      if lift > 0.03
        { state: "recommend", reason: "B agreement +#{(lift * 100).round}pt over A." }
      elsif lift < -0.03
        { state: "hold", reason: "B agreement #{(lift * 100).round}pt vs A." }
      else
        { state: "no_change", reason: "Agreement within noise (#{(lift * 100).round}pt)." }
      end
    end
  end
end
|
|
137
|
+
end
|
|
138
|
+
end
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
module CompletionKit
  # Summarises how human calibration verdicts compare with the AI judge's
  # scores for one metric, optionally restricted to a single judge version.
  #
  # Usage: MetricCalibrationStats.for(metric, judge_version: version) => Result
  class MetricCalibrationStats
    # Sample sizes at which the gate moves from :counter to :provisional and
    # from :provisional to :firm (see #gate_for).
    PROVISIONAL_MIN = 10
    FIRM_MIN = 30

    # Value object returned by #call. All figures may be nil when there is
    # not enough data to compute them.
    Result = Struct.new(
      :sample_size, :agree_count, :disagree_count, :borderline_count,
      :agreement_point, :agreement_low, :agreement_high,
      :borderline_rate, :mae, :pearson, :kappa, :gate,
      keyword_init: true
    ) do
      # True while the sample is below PROVISIONAL_MIN.
      def counter_only?
        :counter == gate
      end

      # True between PROVISIONAL_MIN (inclusive) and FIRM_MIN (exclusive).
      def provisional?
        :provisional == gate
      end

      # True once the sample has reached FIRM_MIN.
      def firm?
        :firm == gate
      end

      # Number of verdicts still missing to reach PROVISIONAL_MIN; never negative.
      def short_to_target
        missing = PROVISIONAL_MIN - sample_size
        missing.positive? ? missing : 0
      end

      # Half-width of the agreement interval, or nil when either bound is missing.
      def margin
        return if [agreement_low, agreement_high].any?(&:nil?)

        (agreement_high - agreement_low) / 2.0
      end
    end

    # Convenience entry point: builds the service and runs it.
    def self.for(metric, judge_version: nil)
      new(metric: metric, judge_version: judge_version).call
    end

    # metric        - record whose id selects the calibrations and reviews.
    # judge_version - optional record; when given, only calibrations tied to
    #                 its id are counted.
    def initialize(metric:, judge_version: nil)
      @metric = metric
      @judge_version = judge_version
    end

    # Loads the matching calibrations and computes every figure in Result.
    def call
      rows = calibration_scope.pluck(:verdict, :corrected_score, :response_id)
      total = rows.length
      per_verdict = rows.map(&:first).tally
      agree_total = per_verdict.fetch("agree", 0)
      disagree_total = per_verdict.fetch("disagree", 0)
      borderline_total = per_verdict.fetch("borderline", 0)

      # Agreement rate is agrees over ALL verdicts (borderlines included in n).
      interval = CalibrationMath.wilson_interval(successes: agree_total, n: total)

      scored = score_pairs(rows)
      mean_abs_error = CalibrationMath.mae(scored)
      correlation = CalibrationMath.pearson(scored)
      weighted_kappa = CalibrationMath.quadratic_weighted_kappa(scored, categories: 1..5)

      Result.new(
        sample_size: total,
        agree_count: agree_total,
        disagree_count: disagree_total,
        borderline_count: borderline_total,
        agreement_point: interval[:point],
        agreement_low: interval[:low],
        agreement_high: interval[:high],
        borderline_rate: total.positive? ? borderline_total.to_f / total : nil,
        mae: mean_abs_error,
        pearson: correlation,
        kappa: weighted_kappa,
        gate: gate_for(total)
      )
    end

    private

    # Calibrations for this metric, narrowed to the judge version when one was given.
    def calibration_scope
      relation = Calibration.where(metric_id: @metric.id)
      return relation unless @judge_version

      relation.where(judge_version_id: @judge_version.id)
    end

    # Turns verdict rows into [judge_score, human_score] float pairs.
    #
    # verdicts - Array of [verdict, corrected_score, response_id] rows.
    #
    # Borderline rows are dropped, as are rows with no judge score on record
    # or no human score. An "agree" verdict means the human score equals the
    # judge's; any other verdict uses the corrected score.
    def score_pairs(verdicts)
      wanted_ids = verdicts.map(&:last).uniq
      judge_scores = Review
        .where(response_id: wanted_ids, metric_id: @metric.id)
        .pluck(:response_id, :ai_score)
        .to_h

      verdicts.each_with_object([]) do |(verdict, corrected, response_id), pairs|
        next if verdict == "borderline"

        judge = judge_scores[response_id]
        next if judge.nil?

        human =
          if verdict == "agree"
            judge
          else
            corrected
          end
        next if human.nil?

        pairs << [judge.to_f, human.to_f]
      end
    end

    # Maps a sample size onto :counter, :provisional or :firm.
    def gate_for(n)
      if n >= FIRM_MIN
        :firm
      elsif n >= PROVISIONAL_MIN
        :provisional
      else
        :counter
      end
    end
  end
end
|
|
@@ -2,20 +2,21 @@
|
|
|
2
2
|
<% current_verdict = calibration&.verdict %>
|
|
3
3
|
<% verdict_count = CompletionKit::Calibration.where(response_id: response_row.id, metric_id: metric.id).count %>
|
|
4
4
|
<p class="ck-calibration__prompt">
|
|
5
|
-
|
|
5
|
+
Your verdict
|
|
6
6
|
<% if verdict_count > 0 %>
|
|
7
|
-
<span class="ck-calibration__count"><%= pluralize(verdict_count, "verdict") %>
|
|
7
|
+
<span class="ck-calibration__count"><%= pluralize(verdict_count, "verdict") %> on this score</span>
|
|
8
8
|
<% end %>
|
|
9
9
|
</p>
|
|
10
10
|
<div class="ck-calibration__buttons">
|
|
11
|
+
<% verdict_icons = { "agree" => "hand-thumb-up", "disagree" => "hand-thumb-down", "borderline" => "scale" } %>
|
|
11
12
|
<% CompletionKit::Calibration::VERDICTS.each do |verdict| %>
|
|
12
13
|
<%= button_to run_response_calibrations_path(run, response_row, metric_id: metric.id, verdict: verdict),
|
|
13
14
|
method: :post,
|
|
14
15
|
form: { data: { turbo: "true" } },
|
|
15
16
|
class: "ck-calibration__pill ck-calibration__pill--#{verdict}#{' is-active' if verdict == current_verdict}",
|
|
16
17
|
"aria-pressed": (verdict == current_verdict).to_s do %>
|
|
17
|
-
|
|
18
|
-
|
|
18
|
+
<%= heroicon_tag verdict_icons[verdict], variant: :outline, size: 14, "aria-hidden": "true" %>
|
|
19
|
+
<span><%= verdict %></span>
|
|
19
20
|
<% end %>
|
|
20
21
|
<% end %>
|
|
21
22
|
</div>
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
<%# Judge-trust summary for one metric. Expects a `stats` local exposing the readers used below. %>
<% stats = local_assigns[:stats] %>
<div class="ck-trust-panel ck-trust-panel--<%= stats.gate %>">
  <p class="ck-trust-panel__label">Judge trust</p>
  <% if stats.counter_only? %>
    <%# Too few verdicts for a score: show progress towards the first threshold instead. %>
    <div class="ck-trust-panel__body">
      <span class="ck-trust-panel__counter"><%= stats.sample_size %><span class="ck-trust-panel__counter-of">/ <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></span></span>
      <%# "more" is not a countable noun: pluralize would inflect it to "mores", so print the number directly. %>
      <span class="ck-trust-panel__hint">verdicts<% if stats.short_to_target > 0 %> · <%= stats.short_to_target %> more to score<% end %></span>
    </div>
  <% else %>
    <div class="ck-trust-panel__body">
      <span class="ck-trust-panel__score">~<%= (stats.agreement_point * 100).round %><span class="ck-trust-panel__score-pct">%</span></span>
      <span class="ck-trust-panel__margin">±<%= (stats.margin * 100).round %> pt</span>
      <span class="ck-trust-panel__gate"><%= stats.firm? ? "settled" : "provisional" %></span>
    </div>
    <div class="ck-trust-panel__details">
      <span><%= stats.sample_size %> verdicts</span>
      <% if stats.borderline_rate && stats.borderline_rate > 0 %>
        <%# Severity of the borderline share: above 30% danger, above 15% warning, otherwise ok. %>
        <% level = if stats.borderline_rate > 0.30 then "danger"
                   elsif stats.borderline_rate > 0.15 then "warning"
                   else "ok" end %>
        <span class="ck-trust-panel__borderline ck-trust-panel__borderline--<%= level %>"
              title="<%= level == 'ok' ? '' : 'Rubric ambiguous. Consider splitting the metric or clarifying the rubric.' %>">
          <%= (stats.borderline_rate * 100).round %>% borderline
        </span>
      <% end %>
      <% if stats.mae %>
        <span>MAE <%= stats.mae.round(2) %></span>
      <% end %>
      <% if stats.kappa %>
        <span>κ <%= stats.kappa.round(2) %></span>
      <% end %>
    </div>
  <% end %>
</div>
|
|
@@ -6,8 +6,27 @@
|
|
|
6
6
|
<section class="ck-page-header">
|
|
7
7
|
<div>
|
|
8
8
|
<h1 class="ck-title"><%= @metric.name %></h1>
|
|
9
|
+
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
10
|
+
<%= render "completion_kit/calibrations/trust_panel",
|
|
11
|
+
stats: CompletionKit::MetricCalibrationStats.for(@metric) %>
|
|
12
|
+
<% if @latest_draft %>
|
|
13
|
+
<div class="ck-draft-banner">
|
|
14
|
+
<span class="ck-chip ck-chip--soft">Draft pending</span>
|
|
15
|
+
<span class="ck-meta-copy">An edit forked a draft judge version. Publish it to make this the current judge.</span>
|
|
16
|
+
<%= button_to "Publish draft", publish_draft_metric_path(@metric),
|
|
17
|
+
method: :post, form_class: "inline-block",
|
|
18
|
+
class: ck_button_classes(:dark) %>
|
|
19
|
+
</div>
|
|
20
|
+
<% end %>
|
|
21
|
+
<% end %>
|
|
9
22
|
</div>
|
|
10
23
|
<div class="ck-actions">
|
|
24
|
+
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
25
|
+
<%= button_to "Suggest improvements", suggest_variants_metric_path(@metric),
|
|
26
|
+
method: :post, form_class: "inline-block",
|
|
27
|
+
class: ck_button_classes(:light, variant: :outline),
|
|
28
|
+
data: { turbo_confirm: "Ask the model to propose new judge instructions based on the disagreements collected so far?" } %>
|
|
29
|
+
<% end %>
|
|
11
30
|
<%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
|
|
12
31
|
</div>
|
|
13
32
|
</section>
|
|
@@ -42,3 +61,125 @@
|
|
|
42
61
|
<% end %>
|
|
43
62
|
</div>
|
|
44
63
|
</section>
|
|
64
|
+
|
|
65
|
+
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
66
|
+
<section class="ck-card ck-card--spaced">
|
|
67
|
+
<div class="ck-prompt-preview__header">
|
|
68
|
+
<p class="ck-kicker">Disagreements</p>
|
|
69
|
+
<% if @disagreements.any? %>
|
|
70
|
+
<span class="ck-chip"><%= pluralize(@disagreements.size, "row") %></span>
|
|
71
|
+
<% end %>
|
|
72
|
+
</div>
|
|
73
|
+
<% if @disagreements.empty? %>
|
|
74
|
+
<p class="ck-meta-copy">No disagreements yet. As humans give the verdict "disagree" on individual rows, the judge's misses will show up here for review.</p>
|
|
75
|
+
<% else %>
|
|
76
|
+
<table class="ck-results-table ck-disagreements-table">
|
|
77
|
+
<thead>
|
|
78
|
+
<tr>
|
|
79
|
+
<th scope="col">Run · row</th>
|
|
80
|
+
<th scope="col">Judge</th>
|
|
81
|
+
<th scope="col">Human</th>
|
|
82
|
+
<th scope="col">Note</th>
|
|
83
|
+
<th scope="col"></th>
|
|
84
|
+
</tr>
|
|
85
|
+
</thead>
|
|
86
|
+
<tbody>
|
|
87
|
+
<% existing_ids = Array(@metric.few_shot_examples).map { |fs| fs["calibration_id"] } %>
|
|
88
|
+
<% @disagreements.each do |cal| %>
|
|
89
|
+
<% review = cal.response.reviews.find { |r| r.metric_id == @metric.id } %>
|
|
90
|
+
<% already = existing_ids.include?(cal.id) %>
|
|
91
|
+
<tr>
|
|
92
|
+
<td>
|
|
93
|
+
<%= link_to ck_run_path(cal.response.run), class: "ck-record-name" do %>
|
|
94
|
+
<strong><%= cal.response.run.name.to_s.truncate(40) %></strong>
|
|
95
|
+
<% end %>
|
|
96
|
+
<span class="ck-meta-copy">· <%= link_to "row ##{cal.response.id}", run_response_path(cal.response.run, cal.response), class: "ck-link" %></span>
|
|
97
|
+
</td>
|
|
98
|
+
<td>
|
|
99
|
+
<% if review&.ai_score %>
|
|
100
|
+
<span class="<%= ck_badge_classes(ck_score_kind(review.ai_score.to_f)) %>"><%= review.ai_score %></span>
|
|
101
|
+
<% else %>
|
|
102
|
+
<span class="ck-meta-copy">—</span>
|
|
103
|
+
<% end %>
|
|
104
|
+
</td>
|
|
105
|
+
<td>
|
|
106
|
+
<% if cal.corrected_score %>
|
|
107
|
+
<span class="<%= ck_badge_classes(ck_score_kind(cal.corrected_score.to_f)) %>"><%= cal.corrected_score %></span>
|
|
108
|
+
<% else %>
|
|
109
|
+
<span class="ck-meta-copy">—</span>
|
|
110
|
+
<% end %>
|
|
111
|
+
</td>
|
|
112
|
+
<td class="ck-meta-copy"><%= cal.note.to_s.truncate(120) %></td>
|
|
113
|
+
<td>
|
|
114
|
+
<% if already %>
|
|
115
|
+
<span class="ck-chip ck-chip--done">Added</span>
|
|
116
|
+
<% else %>
|
|
117
|
+
<%= button_to "Add as judge few-shot",
|
|
118
|
+
add_few_shot_metric_path(@metric, calibration_id: cal.id),
|
|
119
|
+
method: :post,
|
|
120
|
+
form_class: "inline-block",
|
|
121
|
+
class: ck_button_classes(:light, variant: :outline) %>
|
|
122
|
+
<% end %>
|
|
123
|
+
</td>
|
|
124
|
+
</tr>
|
|
125
|
+
<% end %>
|
|
126
|
+
</tbody>
|
|
127
|
+
</table>
|
|
128
|
+
<% end %>
|
|
129
|
+
</section>
|
|
130
|
+
|
|
131
|
+
<% if @suggestion_drafts.any? %>
  <%# One card per suggested draft judge version; each card can be published on its own. %>
  <section class="ck-card ck-card--spaced">
    <div class="ck-prompt-preview__header">
      <p class="ck-kicker">Suggested judge variants</p>
      <span class="ck-chip"><%= pluralize(@suggestion_drafts.size, "draft") %></span>
    </div>
    <p class="ck-meta-copy">Pick one and publish it to make it the current judge. The previous published version stays in history.</p>
    <div class="ck-suggestion-list">
      <% @suggestion_drafts.each do |draft| %>
        <article class="ck-suggestion-card">
          <header class="ck-suggestion-card__header">
            <span class="ck-chip ck-chip--soft">Draft #<%= draft.id %></span>
            <time class="ck-meta-copy" data-relative-time datetime="<%= draft.created_at.utc.iso8601 %>"><%= time_ago_in_words(draft.created_at) %> ago</time>
          </header>
          <pre class="ck-code ck-suggestion-card__instruction"><%= draft.instruction %></pre>
          <div class="ck-actions">
            <%# Identify which draft was picked; without an id every card posts the identical request. %>
            <%# NOTE(review): the param name judge_version_id is an assumption; confirm the publish_draft action reads it. %>
            <%= button_to "Publish this draft", publish_draft_metric_path(@metric, judge_version_id: draft.id),
                  method: :post, form_class: "inline-block",
                  class: ck_button_classes(:dark) %>
          </div>
        </article>
      <% end %>
    </div>
  </section>
<% end %>
|
|
156
|
+
|
|
157
|
+
<% if Array(@metric.few_shot_examples).any? %>
|
|
158
|
+
<section class="ck-card ck-card--spaced">
|
|
159
|
+
<div class="ck-prompt-preview__header">
|
|
160
|
+
<p class="ck-kicker">Judge few-shot examples</p>
|
|
161
|
+
<span class="ck-chip"><%= pluralize(Array(@metric.few_shot_examples).size, "example") %></span>
|
|
162
|
+
</div>
|
|
163
|
+
<p class="ck-meta-copy">Disagreements added here will be injected as worked examples when the judge runs on this metric. Used by Phase 4 / 5 to retrain the judge.</p>
|
|
164
|
+
<ol class="ck-few-shot-list">
|
|
165
|
+
<% Array(@metric.few_shot_examples).each do |fs| %>
|
|
166
|
+
<li class="ck-few-shot-item">
|
|
167
|
+
<div class="ck-few-shot-item__scores">
|
|
168
|
+
<span class="ck-meta-copy">judge said</span>
|
|
169
|
+
<% if fs["judge_score"] %>
|
|
170
|
+
<span class="<%= ck_badge_classes(ck_score_kind(fs["judge_score"].to_f)) %>"><%= fs["judge_score"] %></span>
|
|
171
|
+
<% end %>
|
|
172
|
+
<span class="ck-meta-copy">human said</span>
|
|
173
|
+
<% if fs["human_score"] %>
|
|
174
|
+
<span class="<%= ck_badge_classes(ck_score_kind(fs["human_score"].to_f)) %>"><%= fs["human_score"] %></span>
|
|
175
|
+
<% end %>
|
|
176
|
+
</div>
|
|
177
|
+
<% if fs["human_note"].to_s.present? %>
|
|
178
|
+
<p class="ck-copy"><%= fs["human_note"] %></p>
|
|
179
|
+
<% end %>
|
|
180
|
+
</li>
|
|
181
|
+
<% end %>
|
|
182
|
+
</ol>
|
|
183
|
+
</section>
|
|
184
|
+
<% end %>
|
|
185
|
+
<% end %>
|
data/config/routes.rb
CHANGED
|
@@ -12,7 +12,13 @@ CompletionKit::Engine.routes.draw do
|
|
|
12
12
|
end
|
|
13
13
|
|
|
14
14
|
resources :datasets
|
|
15
|
-
resources :metrics
|
|
15
|
+
resources :metrics do
|
|
16
|
+
member do
|
|
17
|
+
post :add_few_shot
|
|
18
|
+
post :publish_draft
|
|
19
|
+
post :suggest_variants
|
|
20
|
+
end
|
|
21
|
+
end
|
|
16
22
|
resources :metric_groups
|
|
17
23
|
resources :tags
|
|
18
24
|
resources :dashboard_dismissals, only: [:create, :destroy]
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# Adds a lifecycle `state` and a free-form `source` to judge versions, plus a
# composite index for looking versions up by metric and state.
class AddStateToCompletionKitJudgeVersions < ActiveRecord::Migration[8.1]
  def change
    # NOT NULL with a default: the database assigns "published" to every
    # pre-existing row when the column is added, so no separate backfill
    # UPDATE (which would rewrite the whole table) is required.
    add_column :completion_kit_judge_versions, :state, :string, null: false, default: "published"
    add_column :completion_kit_judge_versions, :source, :string

    add_index :completion_kit_judge_versions, [:metric_id, :state],
              name: "index_ck_judge_versions_on_metric_state"
  end
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: completion-kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.5.
|
|
4
|
+
version: 0.5.37
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Damien Bastin
|
|
@@ -290,14 +290,17 @@ files:
|
|
|
290
290
|
- app/models/concerns/completion_kit/taggable.rb
|
|
291
291
|
- app/services/completion_kit/anthropic_client.rb
|
|
292
292
|
- app/services/completion_kit/api_config.rb
|
|
293
|
+
- app/services/completion_kit/calibration_math.rb
|
|
293
294
|
- app/services/completion_kit/csv_processor.rb
|
|
294
295
|
- app/services/completion_kit/dashboard_stats.rb
|
|
295
296
|
- app/services/completion_kit/judge_service.rb
|
|
297
|
+
- app/services/completion_kit/judge_variant_generator.rb
|
|
296
298
|
- app/services/completion_kit/llm_client.rb
|
|
297
299
|
- app/services/completion_kit/mcp_dispatcher.rb
|
|
298
300
|
- app/services/completion_kit/mcp_tools/base.rb
|
|
299
301
|
- app/services/completion_kit/mcp_tools/calibrations.rb
|
|
300
302
|
- app/services/completion_kit/mcp_tools/datasets.rb
|
|
303
|
+
- app/services/completion_kit/mcp_tools/judges.rb
|
|
301
304
|
- app/services/completion_kit/mcp_tools/metric_groups.rb
|
|
302
305
|
- app/services/completion_kit/mcp_tools/metrics.rb
|
|
303
306
|
- app/services/completion_kit/mcp_tools/prompts.rb
|
|
@@ -305,6 +308,7 @@ files:
|
|
|
305
308
|
- app/services/completion_kit/mcp_tools/responses.rb
|
|
306
309
|
- app/services/completion_kit/mcp_tools/runs.rb
|
|
307
310
|
- app/services/completion_kit/mcp_tools/tags.rb
|
|
311
|
+
- app/services/completion_kit/metric_calibration_stats.rb
|
|
308
312
|
- app/services/completion_kit/model_discovery_service.rb
|
|
309
313
|
- app/services/completion_kit/ollama_client.rb
|
|
310
314
|
- app/services/completion_kit/onboarding/checklist.rb
|
|
@@ -323,6 +327,7 @@ files:
|
|
|
323
327
|
- app/views/completion_kit/api_reference/_resource_list.html.erb
|
|
324
328
|
- app/views/completion_kit/api_reference/index.html.erb
|
|
325
329
|
- app/views/completion_kit/calibrations/_buttons.html.erb
|
|
330
|
+
- app/views/completion_kit/calibrations/_trust_panel.html.erb
|
|
326
331
|
- app/views/completion_kit/dashboard/_eye_icon.html.erb
|
|
327
332
|
- app/views/completion_kit/dashboard/_eye_off_icon.html.erb
|
|
328
333
|
- app/views/completion_kit/dashboard/_failures_card.html.erb
|
|
@@ -407,6 +412,8 @@ files:
|
|
|
407
412
|
- db/migrate/20260516000001_create_completion_kit_dashboard_dismissals.rb
|
|
408
413
|
- db/migrate/20260522000001_create_completion_kit_judge_versions.rb
|
|
409
414
|
- db/migrate/20260522000002_create_completion_kit_calibrations.rb
|
|
415
|
+
- db/migrate/20260523000001_add_few_shot_examples_to_completion_kit_metrics.rb
|
|
416
|
+
- db/migrate/20260523000002_add_state_to_completion_kit_judge_versions.rb
|
|
410
417
|
- lib/completion-kit.rb
|
|
411
418
|
- lib/completion_kit.rb
|
|
412
419
|
- lib/completion_kit/concurrency_check.rb
|