completion-kit 0.5.36 → 0.5.38
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/stylesheets/completion_kit/application.css.erb +218 -19
- data/app/controllers/completion_kit/metrics_controller.rb +58 -1
- data/app/models/completion_kit/judge_version.rb +17 -1
- data/app/models/completion_kit/metric.rb +19 -0
- data/app/services/completion_kit/calibration_math.rb +84 -0
- data/app/services/completion_kit/judge_variant_generator.rb +108 -0
- data/app/services/completion_kit/mcp_dispatcher.rb +3 -1
- data/app/services/completion_kit/mcp_tools/judges.rb +138 -0
- data/app/services/completion_kit/metric_calibration_stats.rb +99 -0
- data/app/views/completion_kit/calibrations/_buttons.html.erb +15 -6
- data/app/views/completion_kit/calibrations/_trust_panel.html.erb +31 -0
- data/app/views/completion_kit/metrics/index.html.erb +18 -0
- data/app/views/completion_kit/metrics/show.html.erb +144 -0
- data/config/routes.rb +7 -1
- data/db/migrate/20260523000001_add_few_shot_examples_to_completion_kit_metrics.rb +5 -0
- data/db/migrate/20260523000002_add_state_to_completion_kit_judge_versions.rb +15 -0
- data/lib/completion_kit/version.rb +1 -1
- metadata +8 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: fc7d527828189c2993060b315dca634fb958d2da11fd7fae63c4790179c46701
|
|
4
|
+
data.tar.gz: f0323b980bdfb35d36742b548ddd3629e66d39e587775521678dc80b4cd2f068
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: '020946bdac698194bb5246cfbe21fdf45c56006c80c15d1d7bcfda4d3494d95cde45645e090df14a411b172c83dcde42be777d74811b25a340e5710dba6ae7ce'
|
|
7
|
+
data.tar.gz: 1b4f0ea8cf4e613df783ac428404ef1ae19b285db04f8a6768119760c65fb81d9ee72d52905ab2ef30e077ae910eb00e622925c1dcc43aee0c9e3a0f748718b1
|
|
@@ -5123,53 +5123,252 @@ a.tag-mark {
|
|
|
5123
5123
|
.ck-calibration {
|
|
5124
5124
|
margin-top: 12px;
|
|
5125
5125
|
padding-top: 12px;
|
|
5126
|
-
border-top: 1px dashed
|
|
5126
|
+
border-top: 1px dashed var(--ck-line);
|
|
5127
5127
|
}
|
|
5128
5128
|
.ck-calibration__prompt {
|
|
5129
|
-
font-
|
|
5129
|
+
font-family: var(--ck-mono);
|
|
5130
|
+
font-size: 0.72rem;
|
|
5131
|
+
letter-spacing: 0.06em;
|
|
5132
|
+
text-transform: uppercase;
|
|
5130
5133
|
color: var(--ck-dim);
|
|
5131
|
-
margin: 0 0
|
|
5134
|
+
margin: 0 0 10px;
|
|
5132
5135
|
display: flex;
|
|
5133
5136
|
align-items: center;
|
|
5134
|
-
gap:
|
|
5137
|
+
gap: 10px;
|
|
5135
5138
|
}
|
|
5136
5139
|
.ck-calibration__count {
|
|
5137
|
-
font-
|
|
5140
|
+
font-family: var(--ck-mono);
|
|
5141
|
+
font-size: 0.72rem;
|
|
5142
|
+
letter-spacing: 0.03em;
|
|
5138
5143
|
color: var(--ck-accent);
|
|
5144
|
+
text-transform: none;
|
|
5139
5145
|
}
|
|
5140
5146
|
.ck-calibration__buttons {
|
|
5141
5147
|
display: flex;
|
|
5142
|
-
gap:
|
|
5148
|
+
gap: 6px;
|
|
5143
5149
|
flex-wrap: wrap;
|
|
5144
5150
|
}
|
|
5145
5151
|
.ck-calibration__pill {
|
|
5146
5152
|
display: inline-flex;
|
|
5147
5153
|
align-items: center;
|
|
5148
|
-
gap:
|
|
5149
|
-
padding:
|
|
5150
|
-
border-radius:
|
|
5151
|
-
font-
|
|
5152
|
-
|
|
5153
|
-
|
|
5154
|
-
|
|
5154
|
+
gap: 0.4rem;
|
|
5155
|
+
padding: 0.32rem 0.65rem;
|
|
5156
|
+
border-radius: 4px;
|
|
5157
|
+
font-family: var(--ck-mono);
|
|
5158
|
+
font-size: 0.78rem;
|
|
5159
|
+
font-weight: 500;
|
|
5160
|
+
letter-spacing: 0.04em;
|
|
5161
|
+
text-transform: uppercase;
|
|
5162
|
+
background: var(--ck-surface-soft);
|
|
5163
|
+
border: 1px solid var(--ck-line);
|
|
5164
|
+
color: var(--ck-dim);
|
|
5155
5165
|
cursor: pointer;
|
|
5166
|
+
transition: background 0.12s, border-color 0.12s, color 0.12s;
|
|
5167
|
+
}
|
|
5168
|
+
.ck-calibration__pill svg {
|
|
5169
|
+
width: 14px;
|
|
5170
|
+
height: 14px;
|
|
5156
5171
|
}
|
|
5157
5172
|
.ck-calibration__pill:hover,
|
|
5158
5173
|
.ck-calibration__pill:focus-visible {
|
|
5159
|
-
|
|
5174
|
+
color: var(--ck-text);
|
|
5175
|
+
border-color: var(--ck-dim);
|
|
5160
5176
|
}
|
|
5161
|
-
.ck-calibration__pill.is-active {
|
|
5162
|
-
background: var(--ck-
|
|
5163
|
-
color:
|
|
5164
|
-
|
|
5177
|
+
.ck-calibration__pill--agree.is-active {
|
|
5178
|
+
background: var(--ck-success-soft);
|
|
5179
|
+
border-color: rgba(45, 212, 168, 0.35);
|
|
5180
|
+
color: var(--ck-success);
|
|
5165
5181
|
}
|
|
5182
|
+
.ck-calibration__pill--disagree.is-active {
|
|
5183
|
+
background: var(--ck-danger-soft);
|
|
5184
|
+
border-color: rgba(248, 113, 113, 0.35);
|
|
5185
|
+
color: var(--ck-danger);
|
|
5186
|
+
}
|
|
5187
|
+
.ck-calibration__pill--borderline.is-active {
|
|
5188
|
+
background: var(--ck-warning-soft);
|
|
5189
|
+
border-color: rgba(224, 164, 88, 0.35);
|
|
5190
|
+
color: var(--ck-warning);
|
|
5191
|
+
}
|
|
5192
|
+
.ck-calibration__pill--agree:hover { border-color: rgba(45, 212, 168, 0.45); color: var(--ck-success); }
|
|
5193
|
+
.ck-calibration__pill--disagree:hover { border-color: rgba(248, 113, 113, 0.45); color: var(--ck-danger); }
|
|
5194
|
+
.ck-calibration__pill--borderline:hover { border-color: rgba(224, 164, 88, 0.45); color: var(--ck-warning); }
|
|
5166
5195
|
.ck-calibration__detail {
|
|
5167
|
-
margin-top:
|
|
5196
|
+
margin-top: 12px;
|
|
5168
5197
|
display: flex;
|
|
5169
5198
|
flex-direction: column;
|
|
5170
5199
|
gap: 8px;
|
|
5200
|
+
padding: 12px;
|
|
5201
|
+
background: var(--ck-surface-soft);
|
|
5202
|
+
border: 1px solid var(--ck-line);
|
|
5203
|
+
border-radius: 6px;
|
|
5171
5204
|
}
|
|
5172
5205
|
.ck-calibration__value {
|
|
5173
5206
|
color: var(--ck-accent);
|
|
5207
|
+
font-family: var(--ck-mono);
|
|
5208
|
+
font-weight: 600;
|
|
5209
|
+
}
|
|
5210
|
+
|
|
5211
|
+
.ck-trust-panel {
|
|
5212
|
+
display: inline-flex;
|
|
5213
|
+
flex-direction: column;
|
|
5214
|
+
gap: 6px;
|
|
5215
|
+
margin-top: 12px;
|
|
5216
|
+
padding: 10px 14px;
|
|
5217
|
+
background: var(--ck-surface-soft);
|
|
5218
|
+
border: 1px solid var(--ck-line);
|
|
5219
|
+
border-radius: 6px;
|
|
5220
|
+
}
|
|
5221
|
+
.ck-trust-panel__label {
|
|
5222
|
+
margin: 0;
|
|
5223
|
+
font-family: var(--ck-mono);
|
|
5224
|
+
font-size: 0.7rem;
|
|
5225
|
+
letter-spacing: 0.08em;
|
|
5226
|
+
text-transform: uppercase;
|
|
5227
|
+
color: var(--ck-dim);
|
|
5228
|
+
}
|
|
5229
|
+
.ck-trust-panel__body {
|
|
5230
|
+
display: flex;
|
|
5231
|
+
align-items: baseline;
|
|
5232
|
+
gap: 10px;
|
|
5233
|
+
}
|
|
5234
|
+
.ck-trust-panel__counter {
|
|
5235
|
+
font-family: var(--ck-mono);
|
|
5236
|
+
font-size: 1.6rem;
|
|
5237
|
+
font-weight: 600;
|
|
5238
|
+
color: var(--ck-accent);
|
|
5239
|
+
}
|
|
5240
|
+
.ck-trust-panel__counter-of {
|
|
5241
|
+
font-size: 0.9rem;
|
|
5242
|
+
color: var(--ck-dim);
|
|
5243
|
+
margin-left: 4px;
|
|
5244
|
+
}
|
|
5245
|
+
.ck-trust-panel__hint {
|
|
5246
|
+
font-family: var(--ck-mono);
|
|
5247
|
+
font-size: 0.72rem;
|
|
5248
|
+
color: var(--ck-dim);
|
|
5249
|
+
letter-spacing: 0.04em;
|
|
5250
|
+
}
|
|
5251
|
+
.ck-trust-panel__score {
|
|
5252
|
+
font-family: var(--ck-mono);
|
|
5253
|
+
font-size: 1.6rem;
|
|
5174
5254
|
font-weight: 600;
|
|
5255
|
+
color: var(--ck-success);
|
|
5256
|
+
}
|
|
5257
|
+
.ck-trust-panel__score-pct {
|
|
5258
|
+
font-size: 0.9rem;
|
|
5259
|
+
color: var(--ck-dim);
|
|
5260
|
+
margin-left: 2px;
|
|
5261
|
+
}
|
|
5262
|
+
.ck-trust-panel__margin {
|
|
5263
|
+
font-family: var(--ck-mono);
|
|
5264
|
+
font-size: 0.8rem;
|
|
5265
|
+
color: var(--ck-dim);
|
|
5266
|
+
}
|
|
5267
|
+
.ck-trust-panel__gate {
|
|
5268
|
+
font-family: var(--ck-mono);
|
|
5269
|
+
font-size: 0.66rem;
|
|
5270
|
+
letter-spacing: 0.08em;
|
|
5271
|
+
text-transform: uppercase;
|
|
5272
|
+
padding: 2px 6px;
|
|
5273
|
+
border-radius: 3px;
|
|
5274
|
+
background: var(--ck-surface);
|
|
5275
|
+
border: 1px solid var(--ck-line);
|
|
5276
|
+
color: var(--ck-dim);
|
|
5277
|
+
}
|
|
5278
|
+
.ck-trust-panel--firm .ck-trust-panel__gate {
|
|
5279
|
+
color: var(--ck-success);
|
|
5280
|
+
border-color: rgba(45, 212, 168, 0.35);
|
|
5281
|
+
}
|
|
5282
|
+
.ck-trust-panel__details {
|
|
5283
|
+
display: flex;
|
|
5284
|
+
flex-wrap: wrap;
|
|
5285
|
+
gap: 14px;
|
|
5286
|
+
font-family: var(--ck-mono);
|
|
5287
|
+
font-size: 0.72rem;
|
|
5288
|
+
color: var(--ck-dim);
|
|
5289
|
+
}
|
|
5290
|
+
.ck-trust-panel__borderline {
|
|
5291
|
+
color: var(--ck-warning);
|
|
5292
|
+
}
|
|
5293
|
+
|
|
5294
|
+
.ck-trust-panel__borderline--ok { color: var(--ck-dim); }
|
|
5295
|
+
.ck-trust-panel__borderline--warning { color: var(--ck-warning); }
|
|
5296
|
+
.ck-trust-panel__borderline--danger { color: var(--ck-danger); }
|
|
5297
|
+
|
|
5298
|
+
.ck-disagreements-table td .ck-meta-copy {
|
|
5299
|
+
font-size: 0.78rem;
|
|
5300
|
+
}
|
|
5301
|
+
.ck-few-shot-list {
|
|
5302
|
+
list-style: decimal;
|
|
5303
|
+
padding-left: 1.4rem;
|
|
5304
|
+
margin: 0;
|
|
5305
|
+
display: flex;
|
|
5306
|
+
flex-direction: column;
|
|
5307
|
+
gap: 12px;
|
|
5308
|
+
}
|
|
5309
|
+
.ck-few-shot-item {
|
|
5310
|
+
padding: 10px 12px;
|
|
5311
|
+
background: var(--ck-surface-soft);
|
|
5312
|
+
border: 1px solid var(--ck-line);
|
|
5313
|
+
border-radius: 6px;
|
|
5314
|
+
}
|
|
5315
|
+
.ck-few-shot-item__scores {
|
|
5316
|
+
display: flex;
|
|
5317
|
+
align-items: center;
|
|
5318
|
+
gap: 8px;
|
|
5319
|
+
font-family: var(--ck-mono);
|
|
5320
|
+
font-size: 0.75rem;
|
|
5321
|
+
letter-spacing: 0.04em;
|
|
5322
|
+
text-transform: uppercase;
|
|
5323
|
+
}
|
|
5324
|
+
|
|
5325
|
+
.ck-draft-banner {
|
|
5326
|
+
display: inline-flex;
|
|
5327
|
+
align-items: center;
|
|
5328
|
+
gap: 10px;
|
|
5329
|
+
margin-top: 10px;
|
|
5330
|
+
padding: 8px 12px;
|
|
5331
|
+
background: var(--ck-accent-soft);
|
|
5332
|
+
border: 1px dashed rgba(6, 182, 212, 0.4);
|
|
5333
|
+
border-radius: 6px;
|
|
5334
|
+
}
|
|
5335
|
+
|
|
5336
|
+
.ck-suggestion-list {
|
|
5337
|
+
display: flex;
|
|
5338
|
+
flex-direction: column;
|
|
5339
|
+
gap: 12px;
|
|
5340
|
+
}
|
|
5341
|
+
.ck-suggestion-card {
|
|
5342
|
+
padding: 12px 14px;
|
|
5343
|
+
background: var(--ck-surface-soft);
|
|
5344
|
+
border: 1px solid var(--ck-line);
|
|
5345
|
+
border-radius: 6px;
|
|
5346
|
+
display: flex;
|
|
5347
|
+
flex-direction: column;
|
|
5348
|
+
gap: 10px;
|
|
5349
|
+
}
|
|
5350
|
+
.ck-suggestion-card__header {
|
|
5351
|
+
display: flex;
|
|
5352
|
+
align-items: center;
|
|
5353
|
+
gap: 10px;
|
|
5354
|
+
}
|
|
5355
|
+
.ck-suggestion-card__instruction {
|
|
5356
|
+
margin: 0;
|
|
5357
|
+
white-space: pre-wrap;
|
|
5358
|
+
font-size: 0.85rem;
|
|
5359
|
+
background: var(--ck-bg-strong);
|
|
5360
|
+
padding: 10px 12px;
|
|
5361
|
+
border-radius: 4px;
|
|
5362
|
+
border: 1px solid var(--ck-line);
|
|
5363
|
+
}
|
|
5364
|
+
|
|
5365
|
+
.ck-metrics-table__trust {
|
|
5366
|
+
font-family: var(--ck-mono);
|
|
5367
|
+
font-size: 0.78rem;
|
|
5368
|
+
letter-spacing: 0.03em;
|
|
5369
|
+
}
|
|
5370
|
+
.ck-metrics-table__trust-rate {
|
|
5371
|
+
font-weight: 600;
|
|
5372
|
+
color: var(--ck-success);
|
|
5373
|
+
margin-right: 6px;
|
|
5175
5374
|
}
|
|
@@ -1,13 +1,19 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
2
|
class MetricsController < ApplicationController
|
|
3
3
|
include CompletionKit::TagFiltering
|
|
4
|
-
before_action :set_metric, only: [:show, :edit, :update, :destroy]
|
|
4
|
+
before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :publish_draft, :suggest_variants]
|
|
5
5
|
|
|
6
6
|
def index
|
|
7
7
|
@metrics = apply_tag_filter(Metric.includes(:metric_groups, :tags).order(:name))
|
|
8
8
|
end
|
|
9
9
|
|
|
10
10
|
def show
|
|
11
|
+
@disagreements = Calibration.where(metric_id: @metric.id, verdict: "disagree")
|
|
12
|
+
.includes(response: [:reviews, :run])
|
|
13
|
+
.order(created_at: :desc)
|
|
14
|
+
.limit(50)
|
|
15
|
+
@latest_draft = JudgeVersion.drafts.where(metric_id: @metric.id).order(created_at: :desc).first
|
|
16
|
+
@suggestion_drafts = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc)
|
|
11
17
|
end
|
|
12
18
|
|
|
13
19
|
def new
|
|
@@ -40,6 +46,57 @@ module CompletionKit
|
|
|
40
46
|
redirect_to metrics_path, notice: "Metric was successfully destroyed."
|
|
41
47
|
end
|
|
42
48
|
|
|
49
|
+
def suggest_variants
|
|
50
|
+
generator = JudgeVariantGenerator.new(@metric)
|
|
51
|
+
variants = generator.call
|
|
52
|
+
if variants.empty?
|
|
53
|
+
redirect_to metric_path(@metric), alert: "The model returned no usable variants. Try again with a different model."
|
|
54
|
+
return
|
|
55
|
+
end
|
|
56
|
+
generator.persist!(variants)
|
|
57
|
+
label = variants.length == 1 ? "alternative" : "alternatives"
|
|
58
|
+
redirect_to metric_path(@metric), notice: "Wrote #{variants.length} #{label} for the judge instruction. Pick one to make it live."
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
def publish_draft
|
|
62
|
+
scope = JudgeVersion.drafts.where(metric_id: @metric.id)
|
|
63
|
+
draft = params[:draft_id].present? ? scope.find_by(id: params[:draft_id]) : scope.order(created_at: :desc).first
|
|
64
|
+
|
|
65
|
+
if draft.nil?
|
|
66
|
+
redirect_to metric_path(@metric), alert: "No draft to publish."
|
|
67
|
+
return
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
JudgeVersion.transaction do
|
|
71
|
+
JudgeVersion.where(metric_id: @metric.id, state: "published").update_all(current: false)
|
|
72
|
+
draft.update!(state: "published", current: true)
|
|
73
|
+
@metric.update_columns(
|
|
74
|
+
instruction: draft.instruction,
|
|
75
|
+
rubric_bands: Array(draft.rubric_bands).to_json
|
|
76
|
+
)
|
|
77
|
+
end
|
|
78
|
+
|
|
79
|
+
redirect_to metric_path(@metric), notice: "This judge version is now live."
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
def add_few_shot
|
|
83
|
+
calibration = Calibration.where(metric_id: @metric.id, verdict: "disagree").find(params[:calibration_id])
|
|
84
|
+
review = calibration.response.reviews.find_by(metric_id: @metric.id)
|
|
85
|
+
examples = Array(@metric.few_shot_examples)
|
|
86
|
+
examples << {
|
|
87
|
+
"input" => calibration.response.input_data.to_s.truncate(2000),
|
|
88
|
+
"response" => calibration.response.response_text.to_s.truncate(2000),
|
|
89
|
+
"judge_score" => review&.ai_score&.to_f,
|
|
90
|
+
"judge_feedback" => review&.ai_feedback.to_s.truncate(1000),
|
|
91
|
+
"human_score" => calibration.corrected_score&.to_f,
|
|
92
|
+
"human_note" => calibration.note.to_s.truncate(1000),
|
|
93
|
+
"calibration_id" => calibration.id,
|
|
94
|
+
"added_at" => Time.current.utc.iso8601
|
|
95
|
+
}
|
|
96
|
+
@metric.update!(few_shot_examples: examples)
|
|
97
|
+
redirect_to metric_path(@metric), notice: "Saved as a teaching example. The judge will see it next time it grades."
|
|
98
|
+
end
|
|
99
|
+
|
|
43
100
|
private
|
|
44
101
|
|
|
45
102
|
def set_metric
|
|
@@ -1,23 +1,37 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
2
|
class JudgeVersion < ApplicationRecord
|
|
3
|
+
STATES = %w[draft published].freeze
|
|
4
|
+
|
|
3
5
|
belongs_to :metric
|
|
4
6
|
has_many :calibrations, dependent: :destroy
|
|
5
7
|
|
|
6
8
|
serialize :rubric_bands, coder: JSON
|
|
7
9
|
|
|
8
10
|
validates :metric_id, presence: true
|
|
11
|
+
validates :state, inclusion: { in: STATES }
|
|
9
12
|
|
|
10
13
|
scope :current, -> { where(current: true) }
|
|
14
|
+
scope :published, -> { where(state: "published") }
|
|
15
|
+
scope :drafts, -> { where(state: "draft") }
|
|
11
16
|
|
|
12
17
|
def self.ensure_current_for(metric)
|
|
13
18
|
current.find_by(metric_id: metric.id) || create!(
|
|
14
19
|
metric: metric,
|
|
15
20
|
instruction: metric.instruction,
|
|
16
21
|
rubric_bands: metric.rubric_bands,
|
|
17
|
-
current: true
|
|
22
|
+
current: true,
|
|
23
|
+
state: "published"
|
|
18
24
|
)
|
|
19
25
|
end
|
|
20
26
|
|
|
27
|
+
def draft?
|
|
28
|
+
state == "draft"
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
def published?
|
|
32
|
+
state == "published"
|
|
33
|
+
end
|
|
34
|
+
|
|
21
35
|
def as_json(options = {})
|
|
22
36
|
{
|
|
23
37
|
id: id,
|
|
@@ -25,6 +39,8 @@ module CompletionKit
|
|
|
25
39
|
instruction: instruction,
|
|
26
40
|
rubric_bands: rubric_bands,
|
|
27
41
|
current: current,
|
|
42
|
+
state: state,
|
|
43
|
+
source: source,
|
|
28
44
|
created_at: created_at
|
|
29
45
|
}
|
|
30
46
|
end
|
|
@@ -16,6 +16,7 @@ module CompletionKit
|
|
|
16
16
|
has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
|
|
17
17
|
|
|
18
18
|
serialize :rubric_bands, coder: JSON
|
|
19
|
+
serialize :few_shot_examples, coder: JSON, type: Array
|
|
19
20
|
|
|
20
21
|
validates :name, presence: true
|
|
21
22
|
validates :key, tenant_scoped_uniqueness: { allow_nil: true }
|
|
@@ -23,6 +24,7 @@ module CompletionKit
|
|
|
23
24
|
before_validation :generate_key
|
|
24
25
|
before_validation :normalize_rubric_bands
|
|
25
26
|
before_validation :set_defaults
|
|
27
|
+
after_update :fork_draft_judge_version, if: :judge_relevant_changes?
|
|
26
28
|
|
|
27
29
|
def self.default_rubric_bands
|
|
28
30
|
DEFAULT_RUBRIC_BANDS.map(&:dup)
|
|
@@ -95,5 +97,22 @@ module CompletionKit
|
|
|
95
97
|
def normalize_rubric_bands
|
|
96
98
|
self.rubric_bands = self.class.normalize_rubric_bands(rubric_bands) if rubric_bands.present?
|
|
97
99
|
end
|
|
100
|
+
|
|
101
|
+
def judge_relevant_changes?
|
|
102
|
+
saved_change_to_instruction? || saved_change_to_rubric_bands?
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
def fork_draft_judge_version
|
|
106
|
+
JudgeVersion.ensure_current_for(self)
|
|
107
|
+
JudgeVersion.where(metric_id: id, state: "draft").update_all(current: false)
|
|
108
|
+
JudgeVersion.create!(
|
|
109
|
+
metric: self,
|
|
110
|
+
instruction: instruction,
|
|
111
|
+
rubric_bands: rubric_bands,
|
|
112
|
+
current: false,
|
|
113
|
+
state: "draft",
|
|
114
|
+
source: "edit"
|
|
115
|
+
)
|
|
116
|
+
end
|
|
98
117
|
end
|
|
99
118
|
end
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
module CompletionKit
|
|
2
|
+
module CalibrationMath
|
|
3
|
+
Z_95 = 1.959963984540054
|
|
4
|
+
|
|
5
|
+
module_function
|
|
6
|
+
|
|
7
|
+
def wilson_interval(successes:, n:, z: Z_95)
|
|
8
|
+
return { point: nil, low: nil, high: nil } if n.to_i.zero?
|
|
9
|
+
|
|
10
|
+
p_hat = successes.to_f / n
|
|
11
|
+
denom = 1.0 + (z * z) / n
|
|
12
|
+
center = (p_hat + (z * z) / (2.0 * n)) / denom
|
|
13
|
+
margin = z * Math.sqrt((p_hat * (1 - p_hat) / n) + ((z * z) / (4.0 * n * n))) / denom
|
|
14
|
+
|
|
15
|
+
{ point: p_hat, low: [center - margin, 0.0].max, high: [center + margin, 1.0].min }
|
|
16
|
+
end
|
|
17
|
+
|
|
18
|
+
def mae(pairs)
|
|
19
|
+
return nil if pairs.empty?
|
|
20
|
+
sum = pairs.sum { |ai, human| (ai.to_f - human.to_f).abs }
|
|
21
|
+
sum / pairs.length
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
def pearson(pairs)
|
|
25
|
+
return nil if pairs.length < 2
|
|
26
|
+
xs = pairs.map { |a, _| a.to_f }
|
|
27
|
+
ys = pairs.map { |_, h| h.to_f }
|
|
28
|
+
mx = xs.sum / xs.length
|
|
29
|
+
my = ys.sum / ys.length
|
|
30
|
+
num = xs.zip(ys).sum { |x, y| (x - mx) * (y - my) }
|
|
31
|
+
dx2 = xs.sum { |x| (x - mx)**2 }
|
|
32
|
+
dy2 = ys.sum { |y| (y - my)**2 }
|
|
33
|
+
denom = Math.sqrt(dx2 * dy2)
|
|
34
|
+
return nil if denom.zero?
|
|
35
|
+
num / denom
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
def quadratic_weighted_kappa(pairs, categories:)
|
|
39
|
+
return nil if pairs.empty?
|
|
40
|
+
|
|
41
|
+
ratings = categories.to_a
|
|
42
|
+
k = ratings.length
|
|
43
|
+
return nil if k < 2
|
|
44
|
+
|
|
45
|
+
index = ratings.each_with_index.to_h
|
|
46
|
+
observed = Array.new(k) { Array.new(k, 0) }
|
|
47
|
+
row_totals = Array.new(k, 0)
|
|
48
|
+
col_totals = Array.new(k, 0)
|
|
49
|
+
n = 0
|
|
50
|
+
|
|
51
|
+
pairs.each do |ai, human|
|
|
52
|
+
i = index[score_bucket(ai, ratings)]
|
|
53
|
+
j = index[score_bucket(human, ratings)]
|
|
54
|
+
next if i.nil? || j.nil?
|
|
55
|
+
observed[i][j] += 1
|
|
56
|
+
row_totals[i] += 1
|
|
57
|
+
col_totals[j] += 1
|
|
58
|
+
n += 1
|
|
59
|
+
end
|
|
60
|
+
return nil if n.zero?
|
|
61
|
+
|
|
62
|
+
max_dist_sq = (k - 1.0)**2
|
|
63
|
+
numerator = 0.0
|
|
64
|
+
denominator = 0.0
|
|
65
|
+
(0...k).each do |i|
|
|
66
|
+
(0...k).each do |j|
|
|
67
|
+
weight = ((i - j)**2) / max_dist_sq
|
|
68
|
+
expected = (row_totals[i] * col_totals[j]).to_f / n
|
|
69
|
+
numerator += weight * observed[i][j]
|
|
70
|
+
denominator += weight * expected
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
return 1.0 if denominator.zero?
|
|
74
|
+
1.0 - (numerator / denominator)
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
def score_bucket(value, ratings)
|
|
78
|
+
rounded = value.to_f.round
|
|
79
|
+
return ratings.first if rounded <= ratings.first
|
|
80
|
+
return ratings.last if rounded >= ratings.last
|
|
81
|
+
rounded
|
|
82
|
+
end
|
|
83
|
+
end
|
|
84
|
+
end
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
module CompletionKit
|
|
2
|
+
class JudgeVariantGenerator
|
|
3
|
+
DEFAULT_VARIANT_COUNT = 3
|
|
4
|
+
DEFAULT_TEMPERATURE = 0.4
|
|
5
|
+
|
|
6
|
+
Variant = Struct.new(:reasoning, :instruction, keyword_init: true)
|
|
7
|
+
|
|
8
|
+
def initialize(metric, count: DEFAULT_VARIANT_COUNT, model: nil)
|
|
9
|
+
@metric = metric
|
|
10
|
+
@count = count
|
|
11
|
+
@model = model || CompletionKit.config.judge_model
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
def call
|
|
15
|
+
client = LlmClient.for_model(@model, ApiConfig.for_model(@model))
|
|
16
|
+
raw = client.generate_completion(build_meta_prompt, model: @model, max_tokens: 2500, temperature: DEFAULT_TEMPERATURE)
|
|
17
|
+
parse(raw).first(@count)
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
def persist!(variants)
|
|
21
|
+
JudgeVersion.where(metric_id: @metric.id, state: "draft", source: "suggestion").update_all(current: false)
|
|
22
|
+
versions = variants.map do |variant|
|
|
23
|
+
JudgeVersion.create!(
|
|
24
|
+
metric: @metric,
|
|
25
|
+
instruction: variant.instruction,
|
|
26
|
+
rubric_bands: @metric.rubric_bands,
|
|
27
|
+
state: "draft",
|
|
28
|
+
source: "suggestion",
|
|
29
|
+
current: false
|
|
30
|
+
)
|
|
31
|
+
end
|
|
32
|
+
ActiveSupport::Notifications.instrument("completion_kit.judge_suggestion.generated",
|
|
33
|
+
metric_id: @metric.id,
|
|
34
|
+
count: versions.length,
|
|
35
|
+
model: @model)
|
|
36
|
+
versions
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
private
|
|
40
|
+
|
|
41
|
+
def build_meta_prompt
|
|
42
|
+
examples = JudgeCalibrationExamples.for(@metric)
|
|
43
|
+
sections = []
|
|
44
|
+
sections << "You are an expert evaluator. Rewrite a judge's grading instruction so it agrees better with humans on the cases below."
|
|
45
|
+
sections << ""
|
|
46
|
+
sections << "## Current instruction"
|
|
47
|
+
sections << "```"
|
|
48
|
+
sections << @metric.instruction.to_s
|
|
49
|
+
sections << "```"
|
|
50
|
+
sections << ""
|
|
51
|
+
sections << "## Rubric (unchanged across variants — only rewrite the instruction)"
|
|
52
|
+
sections << @metric.display_rubric_text
|
|
53
|
+
sections << ""
|
|
54
|
+
sections << "## Recent disagreements (judge vs human)"
|
|
55
|
+
examples.each_with_index do |ex, i|
|
|
56
|
+
sections << "### Case #{i + 1}"
|
|
57
|
+
sections << "Input: #{ex[:input].to_s.truncate(200)}"
|
|
58
|
+
sections << "Output: #{ex[:output].to_s.truncate(200)}"
|
|
59
|
+
sections << "Judge said #{ex[:judge_score]}/5: #{ex[:judge_feedback].to_s.truncate(160)}"
|
|
60
|
+
sections << "Human said #{ex[:human_score]}/5: #{ex[:human_note].to_s.truncate(160)}"
|
|
61
|
+
sections << ""
|
|
62
|
+
end
|
|
63
|
+
sections << "## Task"
|
|
64
|
+
sections << "Propose #{@count} alternative instructions. Each should be a focused rewrite — not a wholesale rewrite of the rubric. Aim to close the disagreement gap."
|
|
65
|
+
sections << ""
|
|
66
|
+
sections << "Respond in EXACTLY this format, repeated #{@count} times:"
|
|
67
|
+
sections << ""
|
|
68
|
+
sections << "VARIANT:"
|
|
69
|
+
sections << "REASONING: <one sentence explaining what this variant changes>"
|
|
70
|
+
sections << "INSTRUCTION:"
|
|
71
|
+
sections << "<the rewritten instruction>"
|
|
72
|
+
sections << "END_VARIANT"
|
|
73
|
+
sections.join("\n")
|
|
74
|
+
end
|
|
75
|
+
|
|
76
|
+
def parse(text)
|
|
77
|
+
blocks = text.to_s.scan(/VARIANT:(.*?)END_VARIANT/m).flatten
|
|
78
|
+
blocks.filter_map do |raw|
|
|
79
|
+
reasoning = raw[/REASONING:\s*(.*?)(?=INSTRUCTION:|\z)/m, 1].to_s.strip
|
|
80
|
+
instruction = raw[/INSTRUCTION:\s*(.*)/m, 1].to_s.strip
|
|
81
|
+
next if instruction.empty?
|
|
82
|
+
Variant.new(reasoning: reasoning, instruction: instruction)
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
module JudgeCalibrationExamples
|
|
88
|
+
module_function
|
|
89
|
+
|
|
90
|
+
def for(metric, limit: 8)
|
|
91
|
+
disagreements = Calibration.where(metric_id: metric.id, verdict: "disagree")
|
|
92
|
+
.includes(response: :reviews)
|
|
93
|
+
.order(created_at: :desc)
|
|
94
|
+
.limit(limit)
|
|
95
|
+
disagreements.map do |cal|
|
|
96
|
+
review = cal.response.reviews.find { |r| r.metric_id == metric.id }
|
|
97
|
+
{
|
|
98
|
+
input: cal.response.input_data,
|
|
99
|
+
output: cal.response.response_text,
|
|
100
|
+
judge_score: review&.ai_score,
|
|
101
|
+
judge_feedback: review&.ai_feedback,
|
|
102
|
+
human_score: cal.corrected_score,
|
|
103
|
+
human_note: cal.note
|
|
104
|
+
}
|
|
105
|
+
end
|
|
106
|
+
end
|
|
107
|
+
end
|
|
108
|
+
end
|
|
@@ -34,7 +34,8 @@ module CompletionKit
|
|
|
34
34
|
McpTools::MetricGroups.definitions +
|
|
35
35
|
McpTools::ProviderCredentials.definitions +
|
|
36
36
|
McpTools::Tags.definitions +
|
|
37
|
-
McpTools::Calibrations.definitions
|
|
37
|
+
McpTools::Calibrations.definitions +
|
|
38
|
+
McpTools::Judges.definitions
|
|
38
39
|
end
|
|
39
40
|
|
|
40
41
|
def self.call_tool(name, arguments)
|
|
@@ -48,6 +49,7 @@ module CompletionKit
|
|
|
48
49
|
when /\Aprovider_credentials_/ then McpTools::ProviderCredentials.call(name, arguments)
|
|
49
50
|
when /\Atags_/ then McpTools::Tags.call(name, arguments)
|
|
50
51
|
when /\Acalibrations_/ then McpTools::Calibrations.call(name, arguments)
|
|
52
|
+
when /\Ajudges_/ then McpTools::Judges.call(name, arguments)
|
|
51
53
|
else raise MethodNotFound, "Unknown tool: #{name}"
|
|
52
54
|
end
|
|
53
55
|
end
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
module CompletionKit
|
|
2
|
+
module McpTools
|
|
3
|
+
module Judges
|
|
4
|
+
extend Base
|
|
5
|
+
|
|
6
|
+
TOOLS = {
|
|
7
|
+
"judges_suggest" => {
|
|
8
|
+
description: "Ask the model to rewrite the metric's judge instruction in N variants targeted at the recent disagreements. Each variant is saved as a draft JudgeVersion with source=\"suggestion\". Returns the persisted drafts. Stripe-metering hooks fire via ActiveSupport::Notifications under completion_kit.judge_suggestion.generated.",
|
|
9
|
+
inputSchema: {
|
|
10
|
+
type: "object",
|
|
11
|
+
properties: {
|
|
12
|
+
metric_id: { type: "integer" },
|
|
13
|
+
count: { type: "integer", description: "How many variants to request (default 3, max 5)." },
|
|
14
|
+
model: { type: "string", description: "Override the model used to generate variants. Defaults to CompletionKit.config.judge_model." }
|
|
15
|
+
},
|
|
16
|
+
required: ["metric_id"]
|
|
17
|
+
},
|
|
18
|
+
handler: :suggest
|
|
19
|
+
},
|
|
20
|
+
"judges_replay" => {
|
|
21
|
+
description: "Run the current judge against a dataset (judge-only run). Wraps runs_create with prompt_id omitted and output_column supplied. Re-judges existing dataset outputs so you can compare against human verdicts.",
|
|
22
|
+
inputSchema: {
|
|
23
|
+
type: "object",
|
|
24
|
+
properties: {
|
|
25
|
+
name: { type: "string" },
|
|
26
|
+
metric_id: { type: "integer" },
|
|
27
|
+
dataset_id: { type: "integer" },
|
|
28
|
+
judge_model: { type: "string" },
|
|
29
|
+
output_column: { type: "string", description: "Dataset column with the existing outputs to grade. Defaults to actual_output." }
|
|
30
|
+
},
|
|
31
|
+
required: ["name", "metric_id", "dataset_id", "judge_model"]
|
|
32
|
+
},
|
|
33
|
+
handler: :replay
|
|
34
|
+
},
|
|
35
|
+
"judges_compare" => {
|
|
36
|
+
description: "Compare two judge versions' calibration stats side by side. Pass either two judge_version_ids or one metric_id with judge_version_a_id / judge_version_b_id.",
|
|
37
|
+
inputSchema: {
|
|
38
|
+
type: "object",
|
|
39
|
+
properties: {
|
|
40
|
+
metric_id: { type: "integer" },
|
|
41
|
+
judge_version_a_id: { type: "integer" },
|
|
42
|
+
judge_version_b_id: { type: "integer" }
|
|
43
|
+
},
|
|
44
|
+
required: ["metric_id", "judge_version_a_id", "judge_version_b_id"]
|
|
45
|
+
},
|
|
46
|
+
handler: :compare
|
|
47
|
+
}
|
|
48
|
+
}.freeze
|
|
49
|
+
|
|
50
|
+
def self.suggest(args)
|
|
51
|
+
metric = CompletionKit::Metric.find(args["metric_id"])
|
|
52
|
+
count = [args["count"].to_i, 5].min
|
|
53
|
+
count = CompletionKit::JudgeVariantGenerator::DEFAULT_VARIANT_COUNT if count <= 0
|
|
54
|
+
generator = CompletionKit::JudgeVariantGenerator.new(metric, count: count, model: args["model"])
|
|
55
|
+
variants = generator.call
|
|
56
|
+
return error_result("Variant generator returned no parseable variants. Try again or change the model.") if variants.empty?
|
|
57
|
+
versions = generator.persist!(variants)
|
|
58
|
+
text_result(versions.map(&:as_json))
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
def self.replay(args)
|
|
62
|
+
metric = CompletionKit::Metric.find(args["metric_id"])
|
|
63
|
+
dataset = CompletionKit::Dataset.find(args["dataset_id"])
|
|
64
|
+
run = CompletionKit::Run.new(
|
|
65
|
+
name: args["name"],
|
|
66
|
+
dataset: dataset,
|
|
67
|
+
judge_model: args["judge_model"],
|
|
68
|
+
output_column: args["output_column"].presence || "actual_output"
|
|
69
|
+
)
|
|
70
|
+
if run.save
|
|
71
|
+
run.replace_metrics!([metric.id])
|
|
72
|
+
text_result(run.reload.as_json)
|
|
73
|
+
else
|
|
74
|
+
error_result(run.errors.full_messages.join(", "))
|
|
75
|
+
end
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
def self.compare(args)
|
|
79
|
+
metric = CompletionKit::Metric.find(args["metric_id"])
|
|
80
|
+
a = CompletionKit::JudgeVersion.where(metric_id: metric.id).find(args["judge_version_a_id"])
|
|
81
|
+
b = CompletionKit::JudgeVersion.where(metric_id: metric.id).find(args["judge_version_b_id"])
|
|
82
|
+
stats_a = CompletionKit::MetricCalibrationStats.for(metric, judge_version: a)
|
|
83
|
+
stats_b = CompletionKit::MetricCalibrationStats.for(metric, judge_version: b)
|
|
84
|
+
text_result({
|
|
85
|
+
metric_id: metric.id,
|
|
86
|
+
a: judge_version_payload(a, stats_a),
|
|
87
|
+
b: judge_version_payload(b, stats_b),
|
|
88
|
+
delta: delta_payload(stats_a, stats_b),
|
|
89
|
+
recommendation: recommendation_for(stats_a, stats_b)
|
|
90
|
+
})
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
def self.judge_version_payload(version, stats)
|
|
94
|
+
{
|
|
95
|
+
id: version.id, state: version.state, current: version.current,
|
|
96
|
+
source: version.source, created_at: version.created_at,
|
|
97
|
+
sample_size: stats.sample_size,
|
|
98
|
+
agreement_point: stats.agreement_point,
|
|
99
|
+
agreement_low: stats.agreement_low,
|
|
100
|
+
agreement_high: stats.agreement_high,
|
|
101
|
+
borderline_rate: stats.borderline_rate,
|
|
102
|
+
mae: stats.mae, kappa: stats.kappa
|
|
103
|
+
}
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
def self.delta_payload(a, b)
|
|
107
|
+
{
|
|
108
|
+
agreement: pair_delta(a.agreement_point, b.agreement_point),
|
|
109
|
+
mae: pair_delta(a.mae, b.mae),
|
|
110
|
+
kappa: pair_delta(a.kappa, b.kappa),
|
|
111
|
+
sample_size: { a: a.sample_size, b: b.sample_size }
|
|
112
|
+
}
|
|
113
|
+
end
|
|
114
|
+
|
|
115
|
+
# Pairs two values with their difference.
#
# Returns { a:, b:, delta: } where delta is `b - a`, or nil when either side
# is nil (nothing to subtract).
def self.pair_delta(a, b)
  difference =
    if a.nil? || b.nil?
      nil
    else
      b - a
    end

  { a: a, b: b, delta: difference }
end
|
|
118
|
+
|
|
119
|
+
# Turns two stats objects into a recommendation Hash ({ state:, reason: }).
#
# a, b - stats objects responding to sample_size and agreement_point
#        (a is the baseline, b the candidate).
#
# Decision order:
#   1. fewer than 30 verdicts across both sides -> "need_more_data"
#   2. either agreement point missing           -> "no_change"
#   3. lift (b - a) above 0.03                  -> "recommend"
#   4. lift below -0.03                         -> "hold"
#   5. otherwise                                -> "no_change"
def self.recommendation_for(a, b)
  combined = a.sample_size + b.sample_size
  if combined < 30
    return { state: "need_more_data", reason: "Combined n=#{combined}; need 30+ to make a call." }
  end

  if a.agreement_point.nil? || b.agreement_point.nil?
    return { state: "no_change", reason: "Not enough verdicts on one of the versions to compare." }
  end

  lift = b.agreement_point - a.agreement_point
  # Lift expressed in whole percentage points for the human-readable reason.
  points = (lift * 100).round

  return { state: "recommend", reason: "B agreement +#{points}pt over A." } if lift > 0.03
  return { state: "hold", reason: "B agreement #{points}pt vs A." } if lift < -0.03

  { state: "no_change", reason: "Agreement within noise (#{points}pt)." }
end
|
|
136
|
+
end
|
|
137
|
+
end
|
|
138
|
+
end
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
module CompletionKit
  # Summarises the human calibration verdicts recorded for a metric, optionally
  # narrowed to a single judge version.
  #
  # Usage: MetricCalibrationStats.for(metric, judge_version: version) returns a
  # Result struct with verdict counts, the agreement interval reported by
  # CalibrationMath.wilson_interval, score-based measures (mae, pearson, kappa)
  # and a gate describing how much data backs the numbers.
  class MetricCalibrationStats
    # Sample size from which an agreement figure is reported (gate :provisional).
    PROVISIONAL_MIN = 10
    # Sample size from which the read is treated as :firm.
    FIRM_MIN = 30

    Result = Struct.new(
      :sample_size, :agree_count, :disagree_count, :borderline_count,
      :agreement_point, :agreement_low, :agreement_high,
      :borderline_rate, :mae, :pearson, :kappa, :gate,
      keyword_init: true
    ) do
      # True while there are too few verdicts to report more than a counter.
      def counter_only?
        gate == :counter
      end

      # True between PROVISIONAL_MIN and FIRM_MIN verdicts.
      def provisional?
        gate == :provisional
      end

      # True once FIRM_MIN verdicts have been collected.
      def firm?
        gate == :firm
      end

      # Verdicts still missing before PROVISIONAL_MIN is reached (never negative).
      def short_to_target
        remaining = PROVISIONAL_MIN - sample_size
        remaining.positive? ? remaining : 0
      end

      # Half the width of the agreement interval, or nil when a bound is missing.
      def margin
        bounds = [agreement_low, agreement_high]
        return nil if bounds.any?(&:nil?)

        (bounds.last - bounds.first) / 2.0
      end
    end

    # Convenience entry point: builds the service and runs it.
    def self.for(metric, judge_version: nil)
      new(metric: metric, judge_version: judge_version).call
    end

    # metric        - record whose id selects the calibrations and reviews.
    # judge_version - optional record; when given, only its calibrations count.
    def initialize(metric:, judge_version: nil)
      @metric = metric
      @judge_version = judge_version
    end

    # Loads the verdict rows and folds them into a Result.
    def call
      rows = calibration_scope.pluck(:verdict, :corrected_score, :response_id)
      total = rows.length

      verdict_counts = rows.map(&:first).tally
      agree_total = verdict_counts.fetch("agree", 0)
      disagree_total = verdict_counts.fetch("disagree", 0)
      borderline_total = verdict_counts.fetch("borderline", 0)

      interval = CalibrationMath.wilson_interval(successes: agree_total, n: total)

      judge_human_pairs = score_pairs(rows)
      mean_abs_error = CalibrationMath.mae(judge_human_pairs)
      correlation = CalibrationMath.pearson(judge_human_pairs)
      weighted_kappa = CalibrationMath.quadratic_weighted_kappa(judge_human_pairs, categories: 1..5)

      # No rate can be stated without any verdicts.
      unclear_share = total.zero? ? nil : borderline_total.fdiv(total)

      Result.new(
        sample_size: total,
        agree_count: agree_total,
        disagree_count: disagree_total,
        borderline_count: borderline_total,
        agreement_point: interval[:point],
        agreement_low: interval[:low],
        agreement_high: interval[:high],
        borderline_rate: unclear_share,
        mae: mean_abs_error,
        pearson: correlation,
        kappa: weighted_kappa,
        gate: gate_for(total)
      )
    end

    private

    # Calibrations for the metric, narrowed to the judge version when one is set.
    def calibration_scope
      base = Calibration.where(metric_id: @metric.id)
      @judge_version ? base.where(judge_version_id: @judge_version.id) : base
    end

    # Builds [judge_score, human_score] float pairs from the verdict rows.
    #
    # Borderline verdicts are skipped, as are rows without a judge score or
    # without a human score. For "agree" the human score is taken to be the
    # judge's own score; otherwise the reviewer's corrected score is used.
    def score_pairs(verdicts)
      response_ids = verdicts.map(&:last).uniq
      judge_score_by_response =
        Review.where(response_id: response_ids, metric_id: @metric.id)
              .pluck(:response_id, :ai_score)
              .to_h

      pairs = []
      verdicts.each do |verdict, corrected, response_id|
        next if verdict == "borderline"

        judge_score = judge_score_by_response[response_id]
        next if judge_score.nil?

        human_score = verdict == "agree" ? judge_score : corrected
        next if human_score.nil?

        pairs << [judge_score.to_f, human_score.to_f]
      end
      pairs
    end

    # Maps a sample size onto :counter, :provisional or :firm.
    def gate_for(n)
      if n >= FIRM_MIN
        :firm
      elsif n >= PROVISIONAL_MIN
        :provisional
      else
        :counter
      end
    end
  end
end
|
|
@@ -2,20 +2,29 @@
|
|
|
2
2
|
<% current_verdict = calibration&.verdict %>
|
|
3
3
|
<% verdict_count = CompletionKit::Calibration.where(response_id: response_row.id, metric_id: metric.id).count %>
|
|
4
4
|
<p class="ck-calibration__prompt">
|
|
5
|
-
|
|
5
|
+
Your verdict
|
|
6
6
|
<% if verdict_count > 0 %>
|
|
7
|
-
<span class="ck-calibration__count"><%= pluralize(verdict_count, "verdict") %>
|
|
7
|
+
<span class="ck-calibration__count"><%= pluralize(verdict_count, "verdict") %> on this score · <%= link_to "judge trust →", metric_path(metric), class: "ck-link" %></span>
|
|
8
|
+
<% else %>
|
|
9
|
+
<span class="ck-calibration__count">Tell us what you think — was the score right? Verdicts roll up into <%= link_to "judge trust", metric_path(metric), class: "ck-link" %>.</span>
|
|
8
10
|
<% end %>
|
|
9
11
|
</p>
|
|
10
12
|
<div class="ck-calibration__buttons">
|
|
13
|
+
<% verdict_icons = { "agree" => "hand-thumb-up", "disagree" => "hand-thumb-down", "borderline" => "scale" } %>
|
|
14
|
+
<% verdict_hints = {
|
|
15
|
+
"agree" => "The score looks right.",
|
|
16
|
+
"disagree" => "The score is wrong — you'll pick the right one.",
|
|
17
|
+
"borderline" => "The rubric is unclear here; either score could be defensible."
|
|
18
|
+
} %>
|
|
11
19
|
<% CompletionKit::Calibration::VERDICTS.each do |verdict| %>
|
|
12
20
|
<%= button_to run_response_calibrations_path(run, response_row, metric_id: metric.id, verdict: verdict),
|
|
13
21
|
method: :post,
|
|
14
22
|
form: { data: { turbo: "true" } },
|
|
15
23
|
class: "ck-calibration__pill ck-calibration__pill--#{verdict}#{' is-active' if verdict == current_verdict}",
|
|
16
|
-
"aria-pressed": (verdict == current_verdict).to_s
|
|
17
|
-
|
|
18
|
-
|
|
24
|
+
"aria-pressed": (verdict == current_verdict).to_s,
|
|
25
|
+
title: verdict_hints[verdict] do %>
|
|
26
|
+
<%= heroicon_tag verdict_icons[verdict], variant: :outline, size: 14, "aria-hidden": "true" %>
|
|
27
|
+
<span><%= verdict %></span>
|
|
19
28
|
<% end %>
|
|
20
29
|
<% end %>
|
|
21
30
|
</div>
|
|
@@ -27,7 +36,7 @@
|
|
|
27
36
|
<%= hidden_field_tag :metric_id, metric.id %>
|
|
28
37
|
<%= hidden_field_tag :verdict, "disagree" %>
|
|
29
38
|
<label class="ck-label">
|
|
30
|
-
|
|
39
|
+
What should the score have been?
|
|
31
40
|
<span class="ck-calibration__value" data-calibration-value><%= calibration.corrected_score || review&.ai_score || 3 %></span>
|
|
32
41
|
</label>
|
|
33
42
|
<input type="range" name="corrected_score" min="1" max="5" step="0.5"
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
<%# "Judge trust" panel.
    Expects a `stats` local that responds to gate, counter_only?, firm?,
    sample_size, short_to_target, agreement_point, margin and borderline_rate
    (the metric page passes CompletionKit::MetricCalibrationStats.for(metric)).
    Renders a plain verdict counter while stats.counter_only? is true, and the
    agreement percentage with its margin otherwise. %>
<% stats = local_assigns[:stats] %>
<div class="ck-trust-panel ck-trust-panel--<%= stats.gate %>">
  <p class="ck-trust-panel__label" title="How often the judge agrees with the humans who reviewed its scores.">Judge trust</p>
  <% if stats.counter_only? %>
    <%# Counter mode: show progress towards PROVISIONAL_MIN instead of a score. %>
    <div class="ck-trust-panel__body">
      <span class="ck-trust-panel__counter"><%= stats.sample_size %><span class="ck-trust-panel__counter-of">/ <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></span></span>
      <span class="ck-trust-panel__hint">verdicts so far<% if stats.short_to_target > 0 %> · <%= stats.short_to_target %> more before we can score the judge<% end %></span>
    </div>
  <% else %>
    <%# Score mode: agreement point and margin are fractions, shown as whole percentage points. %>
    <div class="ck-trust-panel__body">
      <span class="ck-trust-panel__score"
            title="Roughly how often the judge and the humans agreed.">~<%= (stats.agreement_point * 100).round %><span class="ck-trust-panel__score-pct">%</span></span>
      <span class="ck-trust-panel__margin"
            title="The range we're confident the true rate sits in, given how few verdicts we have so far.">±<%= (stats.margin * 100).round %> pt</span>
      <span class="ck-trust-panel__gate"
            title="<%= stats.firm? ? 'Enough verdicts for a reliable read.' : 'Early read. Keep giving verdicts to tighten the margin.' %>"><%= stats.firm? ? "settled" : "early read" %></span>
    </div>
    <div class="ck-trust-panel__details">
      <span><%= pluralize(stats.sample_size, "verdict") %></span>
      <% if stats.borderline_rate && stats.borderline_rate > 0 %>
        <%# Severity of the "unclear rubric" share: above 30% danger, above 15% warning, otherwise ok. %>
        <% level = if stats.borderline_rate > 0.30 then "danger"
                   elsif stats.borderline_rate > 0.15 then "warning"
                   else "ok" end %>
        <span class="ck-trust-panel__borderline ck-trust-panel__borderline--<%= level %>"
              title="<%= level == 'ok' ? 'Some reviewers said the rubric was unclear here.' : 'A lot of reviewers say the rubric is unclear here. Consider splitting the metric or rewriting the rubric.' %>">
          <%= (stats.borderline_rate * 100).round %>% said "unclear"
        </span>
      <% end %>
    </div>
  <% end %>
</div>
|
|
@@ -19,6 +19,7 @@
|
|
|
19
19
|
<tr>
|
|
20
20
|
<th scope="col">Name</th>
|
|
21
21
|
<th scope="col">Instruction</th>
|
|
22
|
+
<th scope="col" title="How often the judge agrees with the humans who reviewed its scores.">Judge trust</th>
|
|
22
23
|
<th scope="col">In groups</th>
|
|
23
24
|
<th scope="col"></th>
|
|
24
25
|
</tr>
|
|
@@ -35,6 +36,23 @@
|
|
|
35
36
|
<% end %>
|
|
36
37
|
</td>
|
|
37
38
|
<td data-label="Instruction" class="ck-meta-copy"><div class="ck-clamp-2"><%= metric.instruction.presence || "—" %></div></td>
|
|
39
|
+
<td data-label="Judge trust" class="ck-metrics-table__trust">
|
|
40
|
+
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
41
|
+
<% s = CompletionKit::MetricCalibrationStats.for(metric) %>
|
|
42
|
+
<% if s.counter_only? %>
|
|
43
|
+
<% if s.sample_size.zero? %>
|
|
44
|
+
<span class="ck-meta-copy">No verdicts yet</span>
|
|
45
|
+
<% else %>
|
|
46
|
+
<span class="ck-meta-copy"><%= s.sample_size %> / <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts</span>
|
|
47
|
+
<% end %>
|
|
48
|
+
<% else %>
|
|
49
|
+
<span class="ck-metrics-table__trust-rate" title="<%= s.firm? ? 'Settled read.' : 'Early read — keep giving verdicts.' %>">~<%= (s.agreement_point * 100).round %>%</span>
|
|
50
|
+
<span class="ck-meta-copy">±<%= (s.margin * 100).round %> pt · <%= s.firm? ? "settled" : "early" %></span>
|
|
51
|
+
<% end %>
|
|
52
|
+
<% else %>
|
|
53
|
+
<span class="ck-meta-copy">—</span>
|
|
54
|
+
<% end %>
|
|
55
|
+
</td>
|
|
38
56
|
<td data-label="In groups">
|
|
39
57
|
<% groups = metric.metric_groups %>
|
|
40
58
|
<% if groups.any? %>
|
|
@@ -6,8 +6,28 @@
|
|
|
6
6
|
<section class="ck-page-header">
|
|
7
7
|
<div>
|
|
8
8
|
<h1 class="ck-title"><%= @metric.name %></h1>
|
|
9
|
+
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
10
|
+
<%= render "completion_kit/calibrations/trust_panel",
|
|
11
|
+
stats: CompletionKit::MetricCalibrationStats.for(@metric) %>
|
|
12
|
+
<% if @latest_draft %>
|
|
13
|
+
<div class="ck-draft-banner">
|
|
14
|
+
<span class="ck-chip ck-chip--soft">Draft pending</span>
|
|
15
|
+
<span class="ck-meta-copy">A draft version of this judge is saved. Publishing it replaces the live instruction and rubric.</span>
|
|
16
|
+
<%= button_to "Publish this version", publish_draft_metric_path(@metric, draft_id: @latest_draft.id),
|
|
17
|
+
method: :post, form_class: "inline-block",
|
|
18
|
+
class: ck_button_classes(:dark) %>
|
|
19
|
+
</div>
|
|
20
|
+
<% end %>
|
|
21
|
+
<% end %>
|
|
9
22
|
</div>
|
|
10
23
|
<div class="ck-actions">
|
|
24
|
+
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
25
|
+
<%= button_to "Suggest rewrites", suggest_variants_metric_path(@metric),
|
|
26
|
+
method: :post, form_class: "inline-block",
|
|
27
|
+
class: ck_button_classes(:light, variant: :outline),
|
|
28
|
+
title: "Ask the model to rewrite this judge instruction based on the disagreements collected so far.",
|
|
29
|
+
data: { turbo_confirm: "Ask the model to rewrite this judge instruction based on the disagreements collected so far?" } %>
|
|
30
|
+
<% end %>
|
|
11
31
|
<%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
|
|
12
32
|
</div>
|
|
13
33
|
</section>
|
|
@@ -42,3 +62,127 @@
|
|
|
42
62
|
<% end %>
|
|
43
63
|
</div>
|
|
44
64
|
</section>
|
|
65
|
+
|
|
66
|
+
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
67
|
+
<section class="ck-card ck-card--spaced">
|
|
68
|
+
<div class="ck-prompt-preview__header">
|
|
69
|
+
<p class="ck-kicker">Where the judge got it wrong</p>
|
|
70
|
+
<% if @disagreements.any? %>
|
|
71
|
+
<span class="ck-chip"><%= pluralize(@disagreements.size, "row") %></span>
|
|
72
|
+
<% end %>
|
|
73
|
+
</div>
|
|
74
|
+
<% if @disagreements.empty? %>
|
|
75
|
+
<p class="ck-meta-copy">Nothing here yet. As people give a "disagree" verdict on response rows, those rows show up below so you can review the judge's misses and turn them into teaching examples.</p>
|
|
76
|
+
<% else %>
|
|
77
|
+
<p class="ck-meta-copy">Rows where a reviewer said the judge got it wrong. Save the best ones as teaching examples — the judge will see them next time it grades.</p>
|
|
78
|
+
<table class="ck-results-table ck-disagreements-table">
|
|
79
|
+
<thead>
|
|
80
|
+
<tr>
|
|
81
|
+
<th scope="col">Run · row</th>
|
|
82
|
+
<th scope="col">Judge</th>
|
|
83
|
+
<th scope="col">Human</th>
|
|
84
|
+
<th scope="col">Note</th>
|
|
85
|
+
<th scope="col"></th>
|
|
86
|
+
</tr>
|
|
87
|
+
</thead>
|
|
88
|
+
<tbody>
|
|
89
|
+
<% existing_ids = Array(@metric.few_shot_examples).map { |fs| fs["calibration_id"] } %>
|
|
90
|
+
<% @disagreements.each do |cal| %>
|
|
91
|
+
<% review = cal.response.reviews.find { |r| r.metric_id == @metric.id } %>
|
|
92
|
+
<% already = existing_ids.include?(cal.id) %>
|
|
93
|
+
<tr>
|
|
94
|
+
<td>
|
|
95
|
+
<%= link_to ck_run_path(cal.response.run), class: "ck-record-name" do %>
|
|
96
|
+
<strong><%= cal.response.run.name.to_s.truncate(40) %></strong>
|
|
97
|
+
<% end %>
|
|
98
|
+
<span class="ck-meta-copy">· <%= link_to "row ##{cal.response.id}", run_response_path(cal.response.run, cal.response), class: "ck-link" %></span>
|
|
99
|
+
</td>
|
|
100
|
+
<td>
|
|
101
|
+
<% if review&.ai_score %>
|
|
102
|
+
<span class="<%= ck_badge_classes(ck_score_kind(review.ai_score.to_f)) %>"><%= review.ai_score %></span>
|
|
103
|
+
<% else %>
|
|
104
|
+
<span class="ck-meta-copy">—</span>
|
|
105
|
+
<% end %>
|
|
106
|
+
</td>
|
|
107
|
+
<td>
|
|
108
|
+
<% if cal.corrected_score %>
|
|
109
|
+
<span class="<%= ck_badge_classes(ck_score_kind(cal.corrected_score.to_f)) %>"><%= cal.corrected_score %></span>
|
|
110
|
+
<% else %>
|
|
111
|
+
<span class="ck-meta-copy">—</span>
|
|
112
|
+
<% end %>
|
|
113
|
+
</td>
|
|
114
|
+
<td class="ck-meta-copy"><%= cal.note.to_s.truncate(120) %></td>
|
|
115
|
+
<td>
|
|
116
|
+
<% if already %>
|
|
117
|
+
<span class="ck-chip ck-chip--done">Saved as example</span>
|
|
118
|
+
<% else %>
|
|
119
|
+
<%= button_to "Teach the judge",
|
|
120
|
+
add_few_shot_metric_path(@metric, calibration_id: cal.id),
|
|
121
|
+
method: :post,
|
|
122
|
+
form_class: "inline-block",
|
|
123
|
+
class: ck_button_classes(:light, variant: :outline),
|
|
124
|
+
title: "Save this row as a teaching example. The judge will see it next time it grades." %>
|
|
125
|
+
<% end %>
|
|
126
|
+
</td>
|
|
127
|
+
</tr>
|
|
128
|
+
<% end %>
|
|
129
|
+
</tbody>
|
|
130
|
+
</table>
|
|
131
|
+
<% end %>
|
|
132
|
+
</section>
|
|
133
|
+
|
|
134
|
+
<% if @suggestion_drafts.any? %>
|
|
135
|
+
<section class="ck-card ck-card--spaced">
|
|
136
|
+
<div class="ck-prompt-preview__header">
|
|
137
|
+
<p class="ck-kicker">Suggested rewrites</p>
|
|
138
|
+
<span class="ck-chip"><%= @suggestion_drafts.size %> option<%= @suggestion_drafts.size == 1 ? "" : "s" %></span>
|
|
139
|
+
</div>
|
|
140
|
+
<p class="ck-meta-copy">The model wrote these alternate instructions based on the disagreements above. Pick one to make it the live judge — the previous version is kept in history.</p>
|
|
141
|
+
<div class="ck-suggestion-list">
|
|
142
|
+
<% @suggestion_drafts.each do |draft| %>
|
|
143
|
+
<article class="ck-suggestion-card">
|
|
144
|
+
<header class="ck-suggestion-card__header">
|
|
145
|
+
<span class="ck-chip ck-chip--soft">Option #<%= draft.id %></span>
|
|
146
|
+
<time class="ck-meta-copy" data-relative-time datetime="<%= draft.created_at.utc.iso8601 %>"><%= time_ago_in_words(draft.created_at) %> ago</time>
|
|
147
|
+
</header>
|
|
148
|
+
<pre class="ck-code ck-suggestion-card__instruction"><%= draft.instruction %></pre>
|
|
149
|
+
<div class="ck-actions">
|
|
150
|
+
<%= button_to "Use this version", publish_draft_metric_path(@metric, draft_id: draft.id),
|
|
151
|
+
method: :post, form_class: "inline-block",
|
|
152
|
+
class: ck_button_classes(:dark) %>
|
|
153
|
+
</div>
|
|
154
|
+
</article>
|
|
155
|
+
<% end %>
|
|
156
|
+
</div>
|
|
157
|
+
</section>
|
|
158
|
+
<% end %>
|
|
159
|
+
|
|
160
|
+
<% if Array(@metric.few_shot_examples).any? %>
|
|
161
|
+
<section class="ck-card ck-card--spaced">
|
|
162
|
+
<div class="ck-prompt-preview__header">
|
|
163
|
+
<p class="ck-kicker">Teaching examples</p>
|
|
164
|
+
<span class="ck-chip"><%= pluralize(Array(@metric.few_shot_examples).size, "example") %></span>
|
|
165
|
+
</div>
|
|
166
|
+
<p class="ck-meta-copy">The judge sees these worked examples whenever it grades for this metric. Each shows what the judge gave and what a human said it should have been.</p>
|
|
167
|
+
<ol class="ck-few-shot-list">
|
|
168
|
+
<% Array(@metric.few_shot_examples).each do |fs| %>
|
|
169
|
+
<li class="ck-few-shot-item">
|
|
170
|
+
<div class="ck-few-shot-item__scores">
|
|
171
|
+
<span class="ck-meta-copy">judge said</span>
|
|
172
|
+
<% if fs["judge_score"] %>
|
|
173
|
+
<span class="<%= ck_badge_classes(ck_score_kind(fs["judge_score"].to_f)) %>"><%= fs["judge_score"] %></span>
|
|
174
|
+
<% end %>
|
|
175
|
+
<span class="ck-meta-copy">human said</span>
|
|
176
|
+
<% if fs["human_score"] %>
|
|
177
|
+
<span class="<%= ck_badge_classes(ck_score_kind(fs["human_score"].to_f)) %>"><%= fs["human_score"] %></span>
|
|
178
|
+
<% end %>
|
|
179
|
+
</div>
|
|
180
|
+
<% if fs["human_note"].to_s.present? %>
|
|
181
|
+
<p class="ck-copy"><%= fs["human_note"] %></p>
|
|
182
|
+
<% end %>
|
|
183
|
+
</li>
|
|
184
|
+
<% end %>
|
|
185
|
+
</ol>
|
|
186
|
+
</section>
|
|
187
|
+
<% end %>
|
|
188
|
+
<% end %>
|
data/config/routes.rb
CHANGED
|
@@ -12,7 +12,13 @@ CompletionKit::Engine.routes.draw do
|
|
|
12
12
|
end
|
|
13
13
|
|
|
14
14
|
resources :datasets
|
|
15
|
-
resources :metrics
|
|
15
|
+
resources :metrics do
|
|
16
|
+
member do
|
|
17
|
+
post :add_few_shot
|
|
18
|
+
post :publish_draft
|
|
19
|
+
post :suggest_variants
|
|
20
|
+
end
|
|
21
|
+
end
|
|
16
22
|
resources :metric_groups
|
|
17
23
|
resources :tags
|
|
18
24
|
resources :dashboard_dismissals, only: [:create, :destroy]
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# Adds `state` and `source` columns to completion_kit_judge_versions, plus a
# composite index for looking versions up by metric and state.
#
# `state` is NOT NULL with a default of "published".
# `source` is a nullable string; NOTE(review): presumably records where a
# version came from (e.g. a model-suggested draft) — confirm against the model.
class AddStateToCompletionKitJudgeVersions < ActiveRecord::Migration[8.1]
  def change
    # Adding a NOT NULL column with a constant default makes the database
    # populate every pre-existing row with "published", so no separate
    # backfill UPDATE is required and the whole migration stays a plain,
    # automatically reversible `change`.
    add_column :completion_kit_judge_versions, :state, :string, null: false, default: "published"
    add_column :completion_kit_judge_versions, :source, :string

    add_index :completion_kit_judge_versions, [:metric_id, :state],
              name: "index_ck_judge_versions_on_metric_state"
  end
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: completion-kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.5.
|
|
4
|
+
version: 0.5.38
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Damien Bastin
|
|
@@ -290,14 +290,17 @@ files:
|
|
|
290
290
|
- app/models/concerns/completion_kit/taggable.rb
|
|
291
291
|
- app/services/completion_kit/anthropic_client.rb
|
|
292
292
|
- app/services/completion_kit/api_config.rb
|
|
293
|
+
- app/services/completion_kit/calibration_math.rb
|
|
293
294
|
- app/services/completion_kit/csv_processor.rb
|
|
294
295
|
- app/services/completion_kit/dashboard_stats.rb
|
|
295
296
|
- app/services/completion_kit/judge_service.rb
|
|
297
|
+
- app/services/completion_kit/judge_variant_generator.rb
|
|
296
298
|
- app/services/completion_kit/llm_client.rb
|
|
297
299
|
- app/services/completion_kit/mcp_dispatcher.rb
|
|
298
300
|
- app/services/completion_kit/mcp_tools/base.rb
|
|
299
301
|
- app/services/completion_kit/mcp_tools/calibrations.rb
|
|
300
302
|
- app/services/completion_kit/mcp_tools/datasets.rb
|
|
303
|
+
- app/services/completion_kit/mcp_tools/judges.rb
|
|
301
304
|
- app/services/completion_kit/mcp_tools/metric_groups.rb
|
|
302
305
|
- app/services/completion_kit/mcp_tools/metrics.rb
|
|
303
306
|
- app/services/completion_kit/mcp_tools/prompts.rb
|
|
@@ -305,6 +308,7 @@ files:
|
|
|
305
308
|
- app/services/completion_kit/mcp_tools/responses.rb
|
|
306
309
|
- app/services/completion_kit/mcp_tools/runs.rb
|
|
307
310
|
- app/services/completion_kit/mcp_tools/tags.rb
|
|
311
|
+
- app/services/completion_kit/metric_calibration_stats.rb
|
|
308
312
|
- app/services/completion_kit/model_discovery_service.rb
|
|
309
313
|
- app/services/completion_kit/ollama_client.rb
|
|
310
314
|
- app/services/completion_kit/onboarding/checklist.rb
|
|
@@ -323,6 +327,7 @@ files:
|
|
|
323
327
|
- app/views/completion_kit/api_reference/_resource_list.html.erb
|
|
324
328
|
- app/views/completion_kit/api_reference/index.html.erb
|
|
325
329
|
- app/views/completion_kit/calibrations/_buttons.html.erb
|
|
330
|
+
- app/views/completion_kit/calibrations/_trust_panel.html.erb
|
|
326
331
|
- app/views/completion_kit/dashboard/_eye_icon.html.erb
|
|
327
332
|
- app/views/completion_kit/dashboard/_eye_off_icon.html.erb
|
|
328
333
|
- app/views/completion_kit/dashboard/_failures_card.html.erb
|
|
@@ -407,6 +412,8 @@ files:
|
|
|
407
412
|
- db/migrate/20260516000001_create_completion_kit_dashboard_dismissals.rb
|
|
408
413
|
- db/migrate/20260522000001_create_completion_kit_judge_versions.rb
|
|
409
414
|
- db/migrate/20260522000002_create_completion_kit_calibrations.rb
|
|
415
|
+
- db/migrate/20260523000001_add_few_shot_examples_to_completion_kit_metrics.rb
|
|
416
|
+
- db/migrate/20260523000002_add_state_to_completion_kit_judge_versions.rb
|
|
410
417
|
- lib/completion-kit.rb
|
|
411
418
|
- lib/completion_kit.rb
|
|
412
419
|
- lib/completion_kit/concurrency_check.rb
|