completion-kit 0.5.36 → 0.5.37

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f3189fdba715e4750befcadf517462d0523ae7366611964388309031ad6a4d4f
4
- data.tar.gz: 322e2dd4847c5e8bd3b8af83b4b09873efcd4238c945d928fdac7c7ffbbaf7de
3
+ metadata.gz: 1ccc7d1feb86aed6af17569a642d8b8e81fe522f0a7c68ca4ebb34abc113dbce
4
+ data.tar.gz: 88793eabe6b04c3497c761cde5b61511623c5a9844ce7e101560f8eb3b492e18
5
5
  SHA512:
6
- metadata.gz: f611cfbc07196fd75eb16962bb1acf4a271d759f42473af9b94d049860839a12657223c72bb12aaeaa80a14adfbb29a20c4cb3c4c3176193ce44966bb876b011
7
- data.tar.gz: 2806d017ce92c625e6c7f83e789d13afe53065107a5a803f0d309cb1eaee65d752a8b1e18734c42e9106fc6c38c2cbbfcc803f76477571d7e2330130ebe8eee1
6
+ metadata.gz: d133a9d0db55ee41eb07e290b9657e044c8a0836806bbd055d0b7b6d1cf8b981056b40e3a8795a951e36e7d7dbcc7626e249c2a2f4cba9492fad38aa931b6bfc
7
+ data.tar.gz: 71bbbe827f33648b12f121c949af74fe8d02702d44c70fd39beb4795c8f95d2b9941aa755d631a78029e0b412cdec7ca9e2bea107d1395ea001abe46dcfddf3f
@@ -5123,53 +5123,241 @@ a.tag-mark {
5123
5123
  .ck-calibration {
5124
5124
  margin-top: 12px;
5125
5125
  padding-top: 12px;
5126
- border-top: 1px dashed rgba(255, 255, 255, 0.08);
5126
+ border-top: 1px dashed var(--ck-line);
5127
5127
  }
5128
5128
  .ck-calibration__prompt {
5129
- font-size: 0.8rem;
5129
+ font-family: var(--ck-mono);
5130
+ font-size: 0.72rem;
5131
+ letter-spacing: 0.06em;
5132
+ text-transform: uppercase;
5130
5133
  color: var(--ck-dim);
5131
- margin: 0 0 8px;
5134
+ margin: 0 0 10px;
5132
5135
  display: flex;
5133
5136
  align-items: center;
5134
- gap: 8px;
5137
+ gap: 10px;
5135
5138
  }
5136
5139
  .ck-calibration__count {
5137
- font-size: 0.75rem;
5140
+ font-family: var(--ck-mono);
5141
+ font-size: 0.72rem;
5142
+ letter-spacing: 0.03em;
5138
5143
  color: var(--ck-accent);
5144
+ text-transform: none;
5139
5145
  }
5140
5146
  .ck-calibration__buttons {
5141
5147
  display: flex;
5142
- gap: 8px;
5148
+ gap: 6px;
5143
5149
  flex-wrap: wrap;
5144
5150
  }
5145
5151
  .ck-calibration__pill {
5146
5152
  display: inline-flex;
5147
5153
  align-items: center;
5148
- gap: 6px;
5149
- padding: 6px 12px;
5150
- border-radius: 999px;
5151
- font-size: 0.85rem;
5152
- background: transparent;
5153
- border: 1px solid rgba(255, 255, 255, 0.18);
5154
- color: inherit;
5154
+ gap: 0.4rem;
5155
+ padding: 0.32rem 0.65rem;
5156
+ border-radius: 4px;
5157
+ font-family: var(--ck-mono);
5158
+ font-size: 0.78rem;
5159
+ font-weight: 500;
5160
+ letter-spacing: 0.04em;
5161
+ text-transform: uppercase;
5162
+ background: var(--ck-surface-soft);
5163
+ border: 1px solid var(--ck-line);
5164
+ color: var(--ck-dim);
5155
5165
  cursor: pointer;
5166
+ transition: background 0.12s, border-color 0.12s, color 0.12s;
5167
+ }
5168
+ .ck-calibration__pill svg {
5169
+ width: 14px;
5170
+ height: 14px;
5156
5171
  }
5157
5172
  .ck-calibration__pill:hover,
5158
5173
  .ck-calibration__pill:focus-visible {
5159
- border-color: var(--ck-accent);
5174
+ color: var(--ck-text);
5175
+ border-color: var(--ck-dim);
5160
5176
  }
5161
- .ck-calibration__pill.is-active {
5162
- background: var(--ck-accent);
5163
- color: #0b1320;
5164
- border-color: var(--ck-accent);
5177
+ .ck-calibration__pill--agree.is-active {
5178
+ background: var(--ck-success-soft);
5179
+ border-color: rgba(45, 212, 168, 0.35);
5180
+ color: var(--ck-success);
5165
5181
  }
5182
+ .ck-calibration__pill--disagree.is-active {
5183
+ background: var(--ck-danger-soft);
5184
+ border-color: rgba(248, 113, 113, 0.35);
5185
+ color: var(--ck-danger);
5186
+ }
5187
+ .ck-calibration__pill--borderline.is-active {
5188
+ background: var(--ck-warning-soft);
5189
+ border-color: rgba(224, 164, 88, 0.35);
5190
+ color: var(--ck-warning);
5191
+ }
5192
+ .ck-calibration__pill--agree:hover { border-color: rgba(45, 212, 168, 0.45); color: var(--ck-success); }
5193
+ .ck-calibration__pill--disagree:hover { border-color: rgba(248, 113, 113, 0.45); color: var(--ck-danger); }
5194
+ .ck-calibration__pill--borderline:hover { border-color: rgba(224, 164, 88, 0.45); color: var(--ck-warning); }
5166
5195
  .ck-calibration__detail {
5167
- margin-top: 10px;
5196
+ margin-top: 12px;
5168
5197
  display: flex;
5169
5198
  flex-direction: column;
5170
5199
  gap: 8px;
5200
+ padding: 12px;
5201
+ background: var(--ck-surface-soft);
5202
+ border: 1px solid var(--ck-line);
5203
+ border-radius: 6px;
5171
5204
  }
5172
5205
  .ck-calibration__value {
5173
5206
  color: var(--ck-accent);
5207
+ font-family: var(--ck-mono);
5208
+ font-weight: 600;
5209
+ }
5210
+
5211
+ .ck-trust-panel {
5212
+ display: inline-flex;
5213
+ flex-direction: column;
5214
+ gap: 6px;
5215
+ margin-top: 12px;
5216
+ padding: 10px 14px;
5217
+ background: var(--ck-surface-soft);
5218
+ border: 1px solid var(--ck-line);
5219
+ border-radius: 6px;
5220
+ }
5221
+ .ck-trust-panel__label {
5222
+ margin: 0;
5223
+ font-family: var(--ck-mono);
5224
+ font-size: 0.7rem;
5225
+ letter-spacing: 0.08em;
5226
+ text-transform: uppercase;
5227
+ color: var(--ck-dim);
5228
+ }
5229
+ .ck-trust-panel__body {
5230
+ display: flex;
5231
+ align-items: baseline;
5232
+ gap: 10px;
5233
+ }
5234
+ .ck-trust-panel__counter {
5235
+ font-family: var(--ck-mono);
5236
+ font-size: 1.6rem;
5237
+ font-weight: 600;
5238
+ color: var(--ck-accent);
5239
+ }
5240
+ .ck-trust-panel__counter-of {
5241
+ font-size: 0.9rem;
5242
+ color: var(--ck-dim);
5243
+ margin-left: 4px;
5244
+ }
5245
+ .ck-trust-panel__hint {
5246
+ font-family: var(--ck-mono);
5247
+ font-size: 0.72rem;
5248
+ color: var(--ck-dim);
5249
+ letter-spacing: 0.04em;
5250
+ }
5251
+ .ck-trust-panel__score {
5252
+ font-family: var(--ck-mono);
5253
+ font-size: 1.6rem;
5174
5254
  font-weight: 600;
5255
+ color: var(--ck-success);
5256
+ }
5257
+ .ck-trust-panel__score-pct {
5258
+ font-size: 0.9rem;
5259
+ color: var(--ck-dim);
5260
+ margin-left: 2px;
5261
+ }
5262
+ .ck-trust-panel__margin {
5263
+ font-family: var(--ck-mono);
5264
+ font-size: 0.8rem;
5265
+ color: var(--ck-dim);
5266
+ }
5267
+ .ck-trust-panel__gate {
5268
+ font-family: var(--ck-mono);
5269
+ font-size: 0.66rem;
5270
+ letter-spacing: 0.08em;
5271
+ text-transform: uppercase;
5272
+ padding: 2px 6px;
5273
+ border-radius: 3px;
5274
+ background: var(--ck-surface);
5275
+ border: 1px solid var(--ck-line);
5276
+ color: var(--ck-dim);
5277
+ }
5278
+ .ck-trust-panel--firm .ck-trust-panel__gate {
5279
+ color: var(--ck-success);
5280
+ border-color: rgba(45, 212, 168, 0.35);
5281
+ }
5282
+ .ck-trust-panel__details {
5283
+ display: flex;
5284
+ flex-wrap: wrap;
5285
+ gap: 14px;
5286
+ font-family: var(--ck-mono);
5287
+ font-size: 0.72rem;
5288
+ color: var(--ck-dim);
5289
+ }
5290
+ .ck-trust-panel__borderline {
5291
+ color: var(--ck-warning);
5292
+ }
5293
+
5294
+ .ck-trust-panel__borderline--ok { color: var(--ck-dim); }
5295
+ .ck-trust-panel__borderline--warning { color: var(--ck-warning); }
5296
+ .ck-trust-panel__borderline--danger { color: var(--ck-danger); }
5297
+
5298
+ .ck-disagreements-table td .ck-meta-copy {
5299
+ font-size: 0.78rem;
5300
+ }
5301
+ .ck-few-shot-list {
5302
+ list-style: decimal;
5303
+ padding-left: 1.4rem;
5304
+ margin: 0;
5305
+ display: flex;
5306
+ flex-direction: column;
5307
+ gap: 12px;
5308
+ }
5309
+ .ck-few-shot-item {
5310
+ padding: 10px 12px;
5311
+ background: var(--ck-surface-soft);
5312
+ border: 1px solid var(--ck-line);
5313
+ border-radius: 6px;
5314
+ }
5315
+ .ck-few-shot-item__scores {
5316
+ display: flex;
5317
+ align-items: center;
5318
+ gap: 8px;
5319
+ font-family: var(--ck-mono);
5320
+ font-size: 0.75rem;
5321
+ letter-spacing: 0.04em;
5322
+ text-transform: uppercase;
5323
+ }
5324
+
5325
+ .ck-draft-banner {
5326
+ display: inline-flex;
5327
+ align-items: center;
5328
+ gap: 10px;
5329
+ margin-top: 10px;
5330
+ padding: 8px 12px;
5331
+ background: var(--ck-accent-soft);
5332
+ border: 1px dashed rgba(6, 182, 212, 0.4);
5333
+ border-radius: 6px;
5334
+ }
5335
+
5336
+ .ck-suggestion-list {
5337
+ display: flex;
5338
+ flex-direction: column;
5339
+ gap: 12px;
5340
+ }
5341
+ .ck-suggestion-card {
5342
+ padding: 12px 14px;
5343
+ background: var(--ck-surface-soft);
5344
+ border: 1px solid var(--ck-line);
5345
+ border-radius: 6px;
5346
+ display: flex;
5347
+ flex-direction: column;
5348
+ gap: 10px;
5349
+ }
5350
+ .ck-suggestion-card__header {
5351
+ display: flex;
5352
+ align-items: center;
5353
+ gap: 10px;
5354
+ }
5355
+ .ck-suggestion-card__instruction {
5356
+ margin: 0;
5357
+ white-space: pre-wrap;
5358
+ font-size: 0.85rem;
5359
+ background: var(--ck-bg-strong);
5360
+ padding: 10px 12px;
5361
+ border-radius: 4px;
5362
+ border: 1px solid var(--ck-line);
5175
5363
  }
@@ -1,13 +1,19 @@
1
1
  module CompletionKit
2
2
  class MetricsController < ApplicationController
3
3
  include CompletionKit::TagFiltering
4
- before_action :set_metric, only: [:show, :edit, :update, :destroy]
4
+ before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :publish_draft, :suggest_variants]
5
5
 
6
6
# Lists metrics ordered by name with their metric groups and tags
# eager-loaded, passed through apply_tag_filter (from TagFiltering).
def index
  ordered_metrics = Metric.includes(:metric_groups, :tags).order(:name)
  @metrics = apply_tag_filter(ordered_metrics)
end
9
9
 
10
10
# Loads the data the metric page needs besides @metric itself:
# - @disagreements: the 50 newest "disagree" calibrations for this metric,
#   with their response's reviews and run eager-loaded
# - @latest_draft: the newest draft judge version (any source), or nil
# - @suggestion_drafts: draft judge versions with source "suggestion", newest first
def show
  newest_drafts = JudgeVersion.drafts.where(metric_id: @metric.id).order(created_at: :desc)

  @disagreements = Calibration
    .where(metric_id: @metric.id, verdict: "disagree")
    .includes(response: [:reviews, :run])
    .order(created_at: :desc)
    .limit(50)
  @latest_draft = newest_drafts.first
  @suggestion_drafts = newest_drafts.where(source: "suggestion")
end
12
18
 
13
19
  def new
@@ -40,6 +46,51 @@ module CompletionKit
40
46
  redirect_to metrics_path, notice: "Metric was successfully destroyed."
41
47
  end
42
48
 
49
# Asks JudgeVariantGenerator for rewritten judge instructions for @metric and
# stores them as drafts. Redirects back to the metric page with an alert when
# nothing usable came back, otherwise with a notice stating how many drafts
# were generated.
def suggest_variants
  generator = JudgeVariantGenerator.new(@metric)
  variants = generator.call

  if variants.empty?
    redirect_to metric_path(@metric), alert: "The model returned no usable variants. Try again with a different model."
  else
    generator.persist!(variants)
    noun = variants.length == 1 ? "judge variant" : "judge variants"
    redirect_to metric_path(@metric), notice: "Generated #{variants.length} #{noun} as drafts. Pick one to publish."
  end
end
60
+
61
# Promotes the most recently created draft judge version of @metric to the
# published, current version. Redirects with an alert when no draft exists.
#
# NOTE(review): the newest draft is always chosen; no request parameter
# selects a specific draft here -- confirm this matches the
# "Pick one to publish" flow.
def publish_draft
  newest_draft = JudgeVersion.drafts.where(metric_id: @metric.id).order(created_at: :desc).first
  return redirect_to(metric_path(@metric), alert: "No draft to publish.") if newest_draft.nil?

  # Demote existing published versions and promote the draft atomically.
  JudgeVersion.transaction do
    JudgeVersion.where(metric_id: @metric.id, state: "published").update_all(current: false)
    newest_draft.update!(state: "published", current: true)
  end

  redirect_to metric_path(@metric), notice: "Draft published as the current judge version."
end
75
+
76
# Appends a "disagree" calibration of @metric to the metric's few-shot
# examples, capturing the input, response, judge score/feedback and the
# human's corrected score/note (long texts are truncated).
#
# params[:calibration_id] must identify a "disagree" calibration of this
# metric; otherwise `find` raises ActiveRecord::RecordNotFound.
#
# Submitting the same calibration again is a no-op, so a double click or a
# browser retry does not store the same example twice.
def add_few_shot
  calibration = Calibration.where(metric_id: @metric.id, verdict: "disagree").find(params[:calibration_id])
  existing = Array(@metric.few_shot_examples)

  already_added = existing.any? { |example| example.is_a?(Hash) && example["calibration_id"] == calibration.id }
  if already_added
    redirect_to metric_path(@metric), notice: "Already added as a judge few-shot."
    return
  end

  review = calibration.response.reviews.find_by(metric_id: @metric.id)
  example = {
    "input" => calibration.response.input_data.to_s.truncate(2000),
    "response" => calibration.response.response_text.to_s.truncate(2000),
    "judge_score" => review&.ai_score&.to_f,
    "judge_feedback" => review&.ai_feedback.to_s.truncate(1000),
    "human_score" => calibration.corrected_score&.to_f,
    "human_note" => calibration.note.to_s.truncate(1000),
    "calibration_id" => calibration.id,
    "added_at" => Time.current.utc.iso8601
  }

  # Build a new array instead of mutating the attribute's array in place.
  @metric.update!(few_shot_examples: existing + [example])
  redirect_to metric_path(@metric), notice: "Added as a judge few-shot."
end
93
+
43
94
  private
44
95
 
45
96
  def set_metric
@@ -1,23 +1,37 @@
1
1
  module CompletionKit
2
2
  class JudgeVersion < ApplicationRecord
3
+ STATES = %w[draft published].freeze
4
+
3
5
  belongs_to :metric
4
6
  has_many :calibrations, dependent: :destroy
5
7
 
6
8
  serialize :rubric_bands, coder: JSON
7
9
 
8
10
  validates :metric_id, presence: true
11
+ validates :state, inclusion: { in: STATES }
9
12
 
10
13
  scope :current, -> { where(current: true) }
14
+ scope :published, -> { where(state: "published") }
15
+ scope :drafts, -> { where(state: "draft") }
11
16
 
12
17
  def self.ensure_current_for(metric)
13
18
  current.find_by(metric_id: metric.id) || create!(
14
19
  metric: metric,
15
20
  instruction: metric.instruction,
16
21
  rubric_bands: metric.rubric_bands,
17
- current: true
22
+ current: true,
23
+ state: "published"
18
24
  )
19
25
  end
20
26
 
27
# True when this version's state is "draft".
def draft?
  "draft" == state
end
30
+
31
# True when this version's state is "published".
def published?
  "published" == state
end
34
+
21
35
  def as_json(options = {})
22
36
  {
23
37
  id: id,
@@ -25,6 +39,8 @@ module CompletionKit
25
39
  instruction: instruction,
26
40
  rubric_bands: rubric_bands,
27
41
  current: current,
42
+ state: state,
43
+ source: source,
28
44
  created_at: created_at
29
45
  }
30
46
  end
@@ -16,6 +16,7 @@ module CompletionKit
16
16
  has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
17
17
 
18
18
  serialize :rubric_bands, coder: JSON
19
+ serialize :few_shot_examples, coder: JSON, type: Array
19
20
 
20
21
  validates :name, presence: true
21
22
  validates :key, tenant_scoped_uniqueness: { allow_nil: true }
@@ -23,6 +24,7 @@ module CompletionKit
23
24
  before_validation :generate_key
24
25
  before_validation :normalize_rubric_bands
25
26
  before_validation :set_defaults
27
+ after_update :fork_draft_judge_version, if: :judge_relevant_changes?
26
28
 
27
29
  def self.default_rubric_bands
28
30
  DEFAULT_RUBRIC_BANDS.map(&:dup)
@@ -95,5 +97,22 @@ module CompletionKit
95
97
  def normalize_rubric_bands
96
98
  self.rubric_bands = self.class.normalize_rubric_bands(rubric_bands) if rubric_bands.present?
97
99
  end
100
+
101
# True when the last save changed the instruction or the rubric bands,
# i.e. the attributes copied into a JudgeVersion.
def judge_relevant_changes?
  [saved_change_to_instruction?, saved_change_to_rubric_bands?].any?
end
104
+
105
# Records the metric's edited instruction/rubric as a new draft judge version
# (source "edit"). Steps, in order:
#   1. make sure a current version exists for this metric,
#   2. clear the `current` flag on this metric's existing drafts,
#   3. create the new, non-current draft.
def fork_draft_judge_version
  JudgeVersion.ensure_current_for(self)
  JudgeVersion.where(metric_id: id, state: "draft").update_all(current: false)

  draft_attributes = {
    metric: self,
    instruction: instruction,
    rubric_bands: rubric_bands,
    current: false,
    state: "draft",
    source: "edit"
  }
  JudgeVersion.create!(draft_attributes)
end
98
117
  end
99
118
  end
@@ -0,0 +1,84 @@
1
module CompletionKit
  # Stateless numeric helpers for comparing judge (AI) scores with human
  # scores. Every method is exposed as a module function, e.g.
  # CalibrationMath.mae(pairs). "pairs" is always a list of
  # [ai_score, human_score] two-element arrays.
  module CalibrationMath
    # Default z value for wilson_interval (named for the 95% level).
    Z_95 = 1.959963984540054

    module_function

    # Wilson score interval for a proportion.
    #
    # successes - number of positive outcomes
    # n         - number of trials
    # z         - z value controlling the interval width
    #
    # Returns a hash with :point (raw proportion), :low and :high (bounds
    # clamped to 0.0..1.0). All three are nil when n is zero or nil.
    def wilson_interval(successes:, n:, z: Z_95)
      if n.to_i.zero?
        return { point: nil, low: nil, high: nil }
      end

      z_sq = z * z
      p_hat = successes.to_f / n
      denom = 1.0 + z_sq / n
      center = (p_hat + z_sq / (2.0 * n)) / denom
      margin = z * Math.sqrt((p_hat * (1 - p_hat) / n) + (z_sq / (4.0 * n * n))) / denom

      lower = [center - margin, 0.0].max
      upper = [center + margin, 1.0].min
      { point: p_hat, low: lower, high: upper }
    end

    # Mean absolute error between the two scores of each pair.
    # Returns nil for an empty list.
    def mae(pairs)
      return nil if pairs.empty?

      total_error = pairs.sum { |ai, human| (ai.to_f - human.to_f).abs }
      total_error / pairs.length
    end

    # Pearson correlation coefficient between the AI and human scores.
    # Returns nil with fewer than two pairs or when either side has zero
    # variance (the coefficient is undefined there).
    def pearson(pairs)
      return nil if pairs.length < 2

      xs, ys = pairs.map { |a, h| [a.to_f, h.to_f] }.transpose
      mx = xs.sum / xs.length
      my = ys.sum / ys.length

      covariance = xs.zip(ys).sum { |x, y| (x - mx) * (y - my) }
      spread_x = xs.sum { |x| (x - mx)**2 }
      spread_y = ys.sum { |y| (y - my)**2 }

      scale = Math.sqrt(spread_x * spread_y)
      return nil if scale.zero?

      covariance / scale
    end

    # Quadratic weighted kappa between AI and human scores.
    #
    # categories - ordered rating values (anything responding to to_a,
    #              e.g. 1..5); scores are bucketed with score_bucket.
    #
    # Returns nil when there are no pairs, fewer than two categories, or no
    # pair could be placed in the confusion matrix. Returns 1.0 when the
    # expected weighted disagreement is zero.
    def quadratic_weighted_kappa(pairs, categories:)
      return nil if pairs.empty?

      ratings = categories.to_a
      size = ratings.length
      return nil if size < 2

      position = ratings.each_with_index.to_h
      confusion = Array.new(size) { Array.new(size, 0) }
      counted = 0

      pairs.each do |ai, human|
        row = position[score_bucket(ai, ratings)]
        col = position[score_bucket(human, ratings)]
        # A bucket that is not one of the ratings cannot be tabulated.
        next if row.nil? || col.nil?

        confusion[row][col] += 1
        counted += 1
      end
      return nil if counted.zero?

      row_totals = confusion.map(&:sum)
      col_totals = confusion.transpose.map(&:sum)

      max_dist_sq = (size - 1.0)**2
      observed_disagreement = 0.0
      expected_disagreement = 0.0
      size.times do |i|
        size.times do |j|
          weight = ((i - j)**2) / max_dist_sq
          expected = (row_totals[i] * col_totals[j]).to_f / counted
          observed_disagreement += weight * confusion[i][j]
          expected_disagreement += weight * expected
        end
      end

      return 1.0 if expected_disagreement.zero?

      1.0 - (observed_disagreement / expected_disagreement)
    end

    # Rounds value to the nearest integer and clamps it to the first/last
    # entry of ratings.
    def score_bucket(value, ratings)
      nearest = value.to_f.round
      lowest = ratings.first
      highest = ratings.last

      if nearest <= lowest
        lowest
      elsif nearest >= highest
        highest
      else
        nearest
      end
    end
  end
end
@@ -0,0 +1,108 @@
1
module CompletionKit
  # Asks an LLM to propose rewritten judge instructions for a metric, based
  # on the metric's recent judge-vs-human disagreements, and stores the
  # proposals as draft JudgeVersions.
  #
  # Typical use:
  #   generator = JudgeVariantGenerator.new(metric)
  #   variants  = generator.call          # Array of Variant (may be empty)
  #   generator.persist!(variants)        # Array of JudgeVersion drafts
  class JudgeVariantGenerator
    DEFAULT_VARIANT_COUNT = 3
    DEFAULT_TEMPERATURE = 0.4

    # One parsed proposal: a short rationale and the rewritten instruction.
    Variant = Struct.new(:reasoning, :instruction, keyword_init: true)

    # metric - the metric whose judge instruction should be rewritten
    # count  - maximum number of variants requested and returned
    # model  - model identifier; falls back to CompletionKit.config.judge_model
    #          when nil
    def initialize(metric, count: DEFAULT_VARIANT_COUNT, model: nil)
      @metric = metric
      @count = count
      @model = model || CompletionKit.config.judge_model
    end

    # Sends the meta prompt to the model and returns at most @count parsed
    # Variants. Returns an empty array when the reply contains no parseable
    # variant block.
    def call
      client = LlmClient.for_model(@model, ApiConfig.for_model(@model))
      raw = client.generate_completion(build_meta_prompt, model: @model, max_tokens: 2500, temperature: DEFAULT_TEMPERATURE)
      parse(raw).first(@count)
    end

    # Stores each variant as a draft JudgeVersion with source "suggestion"
    # (keeping the metric's rubric bands) and emits the
    # "completion_kit.judge_suggestion.generated" notification.
    #
    # All writes happen in a single transaction: if any create! raises, no
    # partial set of drafts is left behind. The notification is only sent
    # after the transaction block has finished without raising.
    #
    # Returns the created JudgeVersion records.
    def persist!(variants)
      versions = JudgeVersion.transaction do
        JudgeVersion.where(metric_id: @metric.id, state: "draft", source: "suggestion").update_all(current: false)
        variants.map do |variant|
          JudgeVersion.create!(
            metric: @metric,
            instruction: variant.instruction,
            rubric_bands: @metric.rubric_bands,
            state: "draft",
            source: "suggestion",
            current: false
          )
        end
      end

      ActiveSupport::Notifications.instrument("completion_kit.judge_suggestion.generated",
                                              metric_id: @metric.id,
                                              count: versions.length,
                                              model: @model)
      versions
    end

    private

    # Builds the prompt: current instruction, rubric text, the recent
    # disagreement cases (each field truncated), and the exact reply format
    # that `parse` expects.
    def build_meta_prompt
      examples = JudgeCalibrationExamples.for(@metric)
      sections = []
      sections << "You are an expert evaluator. Rewrite a judge's grading instruction so it agrees better with humans on the cases below."
      sections << ""
      sections << "## Current instruction"
      sections << "```"
      sections << @metric.instruction.to_s
      sections << "```"
      sections << ""
      sections << "## Rubric (unchanged across variants — only rewrite the instruction)"
      sections << @metric.display_rubric_text
      sections << ""
      sections << "## Recent disagreements (judge vs human)"
      examples.each_with_index do |ex, i|
        sections << "### Case #{i + 1}"
        sections << "Input: #{ex[:input].to_s.truncate(200)}"
        sections << "Output: #{ex[:output].to_s.truncate(200)}"
        sections << "Judge said #{ex[:judge_score]}/5: #{ex[:judge_feedback].to_s.truncate(160)}"
        sections << "Human said #{ex[:human_score]}/5: #{ex[:human_note].to_s.truncate(160)}"
        sections << ""
      end
      sections << "## Task"
      sections << "Propose #{@count} alternative instructions. Each should be a focused rewrite — not a wholesale rewrite of the rubric. Aim to close the disagreement gap."
      sections << ""
      sections << "Respond in EXACTLY this format, repeated #{@count} times:"
      sections << ""
      sections << "VARIANT:"
      sections << "REASONING: <one sentence explaining what this variant changes>"
      sections << "INSTRUCTION:"
      sections << "<the rewritten instruction>"
      sections << "END_VARIANT"
      sections.join("\n")
    end

    # Extracts every VARIANT: ... END_VARIANT block from the model reply.
    # Blocks whose INSTRUCTION part is blank are dropped; a missing
    # REASONING part yields an empty reasoning string.
    def parse(text)
      blocks = text.to_s.scan(/VARIANT:(.*?)END_VARIANT/m).flatten
      blocks.filter_map do |raw|
        reasoning = raw[/REASONING:\s*(.*?)(?=INSTRUCTION:|\z)/m, 1].to_s.strip
        instruction = raw[/INSTRUCTION:\s*(.*)/m, 1].to_s.strip
        next if instruction.empty?
        Variant.new(reasoning: reasoning, instruction: instruction)
      end
    end
  end

  # Collects the most recent "disagree" calibrations of a metric as plain
  # hashes for use in the meta prompt.
  module JudgeCalibrationExamples
    module_function

    # metric - the metric whose disagreements are wanted
    # limit  - maximum number of calibrations, newest first
    #
    # Returns an array of hashes with :input, :output, :judge_score,
    # :judge_feedback, :human_score and :human_note. The judge fields are nil
    # when the response has no review for this metric.
    def for(metric, limit: 8)
      disagreements = Calibration.where(metric_id: metric.id, verdict: "disagree")
                                 .includes(response: :reviews)
                                 .order(created_at: :desc)
                                 .limit(limit)
      disagreements.map do |cal|
        # Reviews are eager-loaded, so search in memory rather than re-querying.
        review = cal.response.reviews.find { |r| r.metric_id == metric.id }
        {
          input: cal.response.input_data,
          output: cal.response.response_text,
          judge_score: review&.ai_score,
          judge_feedback: review&.ai_feedback,
          human_score: cal.corrected_score,
          human_note: cal.note
        }
      end
    end
  end
end
@@ -34,7 +34,8 @@ module CompletionKit
34
34
  McpTools::MetricGroups.definitions +
35
35
  McpTools::ProviderCredentials.definitions +
36
36
  McpTools::Tags.definitions +
37
- McpTools::Calibrations.definitions
37
+ McpTools::Calibrations.definitions +
38
+ McpTools::Judges.definitions
38
39
  end
39
40
 
40
41
  def self.call_tool(name, arguments)
@@ -48,6 +49,7 @@ module CompletionKit
48
49
  when /\Aprovider_credentials_/ then McpTools::ProviderCredentials.call(name, arguments)
49
50
  when /\Atags_/ then McpTools::Tags.call(name, arguments)
50
51
  when /\Acalibrations_/ then McpTools::Calibrations.call(name, arguments)
52
+ when /\Ajudges_/ then McpTools::Judges.call(name, arguments)
51
53
  else raise MethodNotFound, "Unknown tool: #{name}"
52
54
  end
53
55
  end
@@ -0,0 +1,138 @@
1
module CompletionKit
  module McpTools
    # MCP tools for iterating on a metric's judge: generating instruction
    # variants, creating a judge-only run, and comparing two judge versions.
    # Each TOOLS entry names its handler method in :handler.
    # text_result / error_result are not defined in this module; presumably
    # they come from Base -- verify there.
    module Judges
      extend Base

      TOOLS = {
        "judges_suggest" => {
          description: "Ask the model to rewrite the metric's judge instruction in N variants targeted at the recent disagreements. Each variant is saved as a draft JudgeVersion with source=\"suggestion\". Returns the persisted drafts. Stripe-metering hooks fire via ActiveSupport::Notifications under completion_kit.judge_suggestion.generated.",
          inputSchema: {
            type: "object",
            properties: {
              metric_id: { type: "integer" },
              count: { type: "integer", description: "How many variants to request (default 3, max 5)." },
              model: { type: "string", description: "Override the model used to generate variants. Defaults to CompletionKit.config.judge_model." }
            },
            required: ["metric_id"]
          },
          handler: :suggest
        },
        "judges_replay" => {
          description: "Run the current judge against a dataset (judge-only run). Wraps runs_create with prompt_id omitted and output_column supplied. Re-judges existing dataset outputs so you can compare against human verdicts.",
          inputSchema: {
            type: "object",
            properties: {
              name: { type: "string" },
              metric_id: { type: "integer" },
              dataset_id: { type: "integer" },
              judge_model: { type: "string" },
              output_column: { type: "string", description: "Dataset column with the existing outputs to grade. Defaults to actual_output." }
            },
            required: ["name", "metric_id", "dataset_id", "judge_model"]
          },
          handler: :replay
        },
        "judges_compare" => {
          # The description matches the schema: all three ids are required.
          description: "Compare two judge versions' calibration stats side by side. Requires metric_id plus judge_version_a_id and judge_version_b_id; both versions must belong to that metric.",
          inputSchema: {
            type: "object",
            properties: {
              metric_id: { type: "integer" },
              judge_version_a_id: { type: "integer" },
              judge_version_b_id: { type: "integer" }
            },
            required: ["metric_id", "judge_version_a_id", "judge_version_b_id"]
          },
          handler: :compare
        }
      }.freeze

      # Generates and persists instruction variants for a metric.
      #
      # args["count"] is capped at 5; missing, zero or negative values fall
      # back to JudgeVariantGenerator::DEFAULT_VARIANT_COUNT.
      # args["model"] is optional; a blank string is treated like an absent
      # value so the generator's default model applies.
      #
      # Returns the persisted drafts as JSON, or an error result when no
      # variant could be parsed from the model reply.
      def self.suggest(args)
        metric = CompletionKit::Metric.find(args["metric_id"])
        count = [args["count"].to_i, 5].min
        count = CompletionKit::JudgeVariantGenerator::DEFAULT_VARIANT_COUNT if count <= 0
        generator = CompletionKit::JudgeVariantGenerator.new(metric, count: count, model: args["model"].presence)
        variants = generator.call
        return error_result("Variant generator returned no parseable variants. Try again or change the model.") if variants.empty?
        versions = generator.persist!(variants)
        text_result(versions.map(&:as_json))
      end

      # Creates a Run without a prompt for the given dataset and judge model,
      # attaches the metric to it, and returns the reloaded run as JSON.
      # output_column defaults to "actual_output" when blank. Validation
      # failures are returned as an error result.
      def self.replay(args)
        metric = CompletionKit::Metric.find(args["metric_id"])
        dataset = CompletionKit::Dataset.find(args["dataset_id"])
        run = CompletionKit::Run.new(
          name: args["name"],
          dataset: dataset,
          judge_model: args["judge_model"],
          output_column: args["output_column"].presence || "actual_output"
        )
        if run.save
          run.replace_metrics!([metric.id])
          text_result(run.reload.as_json)
        else
          error_result(run.errors.full_messages.join(", "))
        end
      end

      # Computes calibration stats for two judge versions of the same metric
      # and returns both payloads, their deltas (b minus a) and a
      # recommendation. Both versions are looked up scoped to the metric, so
      # an id from another metric raises ActiveRecord::RecordNotFound.
      def self.compare(args)
        metric = CompletionKit::Metric.find(args["metric_id"])
        a = CompletionKit::JudgeVersion.where(metric_id: metric.id).find(args["judge_version_a_id"])
        b = CompletionKit::JudgeVersion.where(metric_id: metric.id).find(args["judge_version_b_id"])
        stats_a = CompletionKit::MetricCalibrationStats.for(metric, judge_version: a)
        stats_b = CompletionKit::MetricCalibrationStats.for(metric, judge_version: b)
        text_result({
          metric_id: metric.id,
          a: judge_version_payload(a, stats_a),
          b: judge_version_payload(b, stats_b),
          delta: delta_payload(stats_a, stats_b),
          recommendation: recommendation_for(stats_a, stats_b)
        })
      end

      # Flattens a judge version and its stats into one hash for the response.
      def self.judge_version_payload(version, stats)
        {
          id: version.id, state: version.state, current: version.current,
          source: version.source, created_at: version.created_at,
          sample_size: stats.sample_size,
          agreement_point: stats.agreement_point,
          agreement_low: stats.agreement_low,
          agreement_high: stats.agreement_high,
          borderline_rate: stats.borderline_rate,
          mae: stats.mae, kappa: stats.kappa
        }
      end

      # Per-statistic deltas between two stats results, plus both sample sizes.
      def self.delta_payload(a, b)
        {
          agreement: pair_delta(a.agreement_point, b.agreement_point),
          mae: pair_delta(a.mae, b.mae),
          kappa: pair_delta(a.kappa, b.kappa),
          sample_size: { a: a.sample_size, b: b.sample_size }
        }
      end

      # Returns both values and b - a; the delta is nil when either is nil.
      def self.pair_delta(a, b)
        { a: a, b: b, delta: (a.nil? || b.nil?) ? nil : (b - a) }
      end

      # Turns two stats results into a recommendation hash:
      #   need_more_data - fewer than 30 verdicts across both versions
      #   no_change      - an agreement value is missing, or the difference is
      #                    within +/- 3 percentage points
      #   recommend      - B's agreement exceeds A's by more than 3 points
      #   hold           - B's agreement is more than 3 points below A's
      def self.recommendation_for(a, b)
        total = a.sample_size + b.sample_size
        if total < 30
          { state: "need_more_data", reason: "Combined n=#{total}; need 30+ to make a call." }
        elsif a.agreement_point.nil? || b.agreement_point.nil?
          { state: "no_change", reason: "Not enough verdicts on one of the versions to compare." }
        else
          lift = b.agreement_point - a.agreement_point
          if lift > 0.03
            { state: "recommend", reason: "B agreement +#{(lift * 100).round}pt over A." }
          elsif lift < -0.03
            { state: "hold", reason: "B agreement #{(lift * 100).round}pt vs A." }
          else
            { state: "no_change", reason: "Agreement within noise (#{(lift * 100).round}pt)." }
          end
        end
      end
    end
  end
end
@@ -0,0 +1,99 @@
1
module CompletionKit
  # Summarises the human verdicts (calibrations) collected for a metric,
  # optionally restricted to one judge version, into a Result: counts,
  # agreement rate with a confidence interval, borderline rate, score-level
  # error statistics, and a "gate" describing how much data backs them.
  class MetricCalibrationStats
    # Below this many verdicts the gate is :counter.
    PROVISIONAL_MIN = 10
    # From this many verdicts on the gate is :firm.
    FIRM_MIN = 30

    Result = Struct.new(
      :sample_size, :agree_count, :disagree_count, :borderline_count,
      :agreement_point, :agreement_low, :agreement_high,
      :borderline_rate, :mae, :pearson, :kappa, :gate,
      keyword_init: true
    ) do
      # Gate predicates.
      def counter_only?
        gate == :counter
      end

      def provisional?
        gate == :provisional
      end

      def firm?
        gate == :firm
      end

      # Verdicts still missing to reach PROVISIONAL_MIN (never negative).
      def short_to_target
        missing = PROVISIONAL_MIN - sample_size
        missing.negative? ? 0 : missing
      end

      # Half the width of the agreement interval, or nil when a bound is missing.
      def margin
        return nil if [agreement_low, agreement_high].any?(&:nil?)

        (agreement_high - agreement_low) / 2.0
      end
    end

    # Convenience wrapper: build and compute in one call.
    def self.for(metric, judge_version: nil)
      new(metric: metric, judge_version: judge_version).call
    end

    # metric        - the metric whose calibrations are summarised
    # judge_version - when given, only calibrations recorded against this
    #                 judge version are counted
    def initialize(metric:, judge_version: nil)
      @metric = metric
      @judge_version = judge_version
    end

    # Loads the matching calibrations and returns a Result.
    def call
      calibrations = Calibration.where(metric_id: @metric.id)
      calibrations = calibrations.where(judge_version_id: @judge_version.id) if @judge_version

      rows = calibrations.pluck(:verdict, :corrected_score, :response_id)
      total = rows.length
      per_verdict = rows.each_with_object(Hash.new(0)) { |(verdict, _, _), tally| tally[verdict] += 1 }
      agrees = per_verdict["agree"]
      borderlines = per_verdict["borderline"]

      interval = CalibrationMath.wilson_interval(successes: agrees, n: total)
      pairs = score_pairs(rows)

      Result.new(
        sample_size: total,
        agree_count: agrees,
        disagree_count: per_verdict["disagree"],
        borderline_count: borderlines,
        agreement_point: interval[:point],
        agreement_low: interval[:low],
        agreement_high: interval[:high],
        borderline_rate: (borderlines.to_f / total unless total.zero?),
        mae: CalibrationMath.mae(pairs),
        pearson: CalibrationMath.pearson(pairs),
        kappa: CalibrationMath.quadratic_weighted_kappa(pairs, categories: 1..5),
        gate: gate_for(total)
      )
    end

    private

    # Builds [ai_score, human_score] float pairs from the verdict rows.
    # Borderline verdicts are skipped, as are rows without a stored AI score
    # or (for disagreements) without a corrected score. An "agree" verdict
    # uses the AI score as the human score.
    def score_pairs(verdicts)
      response_ids = verdicts.map { |_, _, rid| rid }.uniq
      ai_by_response = Review.where(response_id: response_ids, metric_id: @metric.id)
                             .pluck(:response_id, :ai_score).to_h

      verdicts.each_with_object([]) do |(verdict, corrected, response_id), pairs|
        next if verdict == "borderline"

        ai = ai_by_response[response_id]
        next if ai.nil?

        human = verdict == "agree" ? ai : corrected
        next if human.nil?

        pairs << [ai.to_f, human.to_f]
      end
    end

    # Maps a sample size to :counter, :provisional or :firm.
    def gate_for(n)
      if n < PROVISIONAL_MIN
        :counter
      elsif n >= FIRM_MIN
        :firm
      else
        :provisional
      end
    end
  end
end
@@ -2,20 +2,21 @@
2
2
  <% current_verdict = calibration&.verdict %>
3
3
  <% verdict_count = CompletionKit::Calibration.where(response_id: response_row.id, metric_id: metric.id).count %>
4
4
  <p class="ck-calibration__prompt">
5
- How does this score feel?
5
+ Your verdict
6
6
  <% if verdict_count > 0 %>
7
- <span class="ck-calibration__count"><%= pluralize(verdict_count, "verdict") %> collected</span>
7
+ <span class="ck-calibration__count"><%= pluralize(verdict_count, "verdict") %> on this score</span>
8
8
  <% end %>
9
9
  </p>
10
10
  <div class="ck-calibration__buttons">
11
+ <% verdict_icons = { "agree" => "hand-thumb-up", "disagree" => "hand-thumb-down", "borderline" => "scale" } %>
11
12
  <% CompletionKit::Calibration::VERDICTS.each do |verdict| %>
12
13
  <%= button_to run_response_calibrations_path(run, response_row, metric_id: metric.id, verdict: verdict),
13
14
  method: :post,
14
15
  form: { data: { turbo: "true" } },
15
16
  class: "ck-calibration__pill ck-calibration__pill--#{verdict}#{' is-active' if verdict == current_verdict}",
16
17
  "aria-pressed": (verdict == current_verdict).to_s do %>
17
- <% case verdict
18
- when "agree" %>👍 Agree<% when "disagree" %>👎 Disagree<% else %>🤔 Borderline<% end %>
18
+ <%= heroicon_tag verdict_icons[verdict], variant: :outline, size: 14, "aria-hidden": "true" %>
19
+ <span><%= verdict %></span>
19
20
  <% end %>
20
21
  <% end %>
21
22
  </div>
@@ -0,0 +1,34 @@
1
<%# Judge-trust summary panel. Expects a `stats` local; the metric page passes CompletionKit::MetricCalibrationStats.for(@metric). %>
<% stats = local_assigns[:stats] %>
<div class="ck-trust-panel ck-trust-panel--<%= stats.gate %>">
  <p class="ck-trust-panel__label">Judge trust</p>
  <% if stats.counter_only? %>
    <%# Counter-only state: verdicts collected so far against the provisional minimum. %>
    <div class="ck-trust-panel__body">
      <span class="ck-trust-panel__counter"><%= stats.sample_size %><span class="ck-trust-panel__counter-of">/ <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></span></span>
      <%# Interpolate the count directly: pluralize(n, "more") inflects the word and would print "3 mores". %>
      <span class="ck-trust-panel__hint">verdicts<% if stats.short_to_target > 0 %> · <%= stats.short_to_target %> more to score<% end %></span>
    </div>
  <% else %>
    <%# Scored state: agreement estimate, its margin, and whether the gate is firm yet. %>
    <div class="ck-trust-panel__body">
      <span class="ck-trust-panel__score">~<%= (stats.agreement_point * 100).round %><span class="ck-trust-panel__score-pct">%</span></span>
      <span class="ck-trust-panel__margin">±<%= (stats.margin * 100).round %> pt</span>
      <span class="ck-trust-panel__gate"><%= stats.firm? ? "settled" : "provisional" %></span>
    </div>
    <div class="ck-trust-panel__details">
      <span><%= stats.sample_size %> verdicts</span>
      <% if stats.borderline_rate && stats.borderline_rate > 0 %>
        <%# Severity bands for the borderline share: above 30% danger, above 15% warning, otherwise ok. %>
        <% level = if stats.borderline_rate > 0.30 then "danger"
                   elsif stats.borderline_rate > 0.15 then "warning"
                   else "ok" end %>
        <span class="ck-trust-panel__borderline ck-trust-panel__borderline--<%= level %>"
              title="<%= level == 'ok' ? '' : 'Rubric ambiguous. Consider splitting the metric or clarifying the rubric.' %>">
          <%= (stats.borderline_rate * 100).round %>% borderline
        </span>
      <% end %>
      <%# mae and kappa are optional; each is shown only when the stats object provides it. %>
      <% if stats.mae %>
        <span>MAE <%= stats.mae.round(2) %></span>
      <% end %>
      <% if stats.kappa %>
        <span>κ <%= stats.kappa.round(2) %></span>
      <% end %>
    </div>
  <% end %>
</div>
@@ -6,8 +6,27 @@
6
6
<%# Page header: metric title, plus the trust panel, draft banner and "Suggest improvements" action when judge calibration is enabled. %>
<% judge_calibration_on = CompletionKit.config.judge_calibration_enabled %>
<section class="ck-page-header">
  <div>
    <h1 class="ck-title"><%= @metric.name %></h1>
    <% if judge_calibration_on %>
      <%= render partial: "completion_kit/calibrations/trust_panel",
                 locals: { stats: CompletionKit::MetricCalibrationStats.for(@metric) } %>
    <% end %>
    <% if judge_calibration_on && @latest_draft %>
      <div class="ck-draft-banner">
        <span class="ck-chip ck-chip--soft">Draft pending</span>
        <span class="ck-meta-copy">An edit forked a draft judge version. Publish it to make this the current judge.</span>
        <%= button_to "Publish draft", publish_draft_metric_path(@metric),
              method: :post,
              form_class: "inline-block",
              class: ck_button_classes(:dark) %>
      </div>
    <% end %>
  </div>
  <div class="ck-actions">
    <% if judge_calibration_on %>
      <% suggest_confirm = "Ask the model to propose new judge instructions based on the disagreements collected so far?" %>
      <%= button_to "Suggest improvements", suggest_variants_metric_path(@metric),
            method: :post,
            form_class: "inline-block",
            class: ck_button_classes(:light, variant: :outline),
            data: { turbo_confirm: suggest_confirm } %>
    <% end %>
    <%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
  </div>
</section>
@@ -42,3 +61,125 @@
42
61
  <% end %>
43
62
  </div>
44
63
  </section>
64
+
65
+ <% if CompletionKit.config.judge_calibration_enabled %>
66
<%# Rows where a human disagreed with the judge; each can be added to the metric's judge few-shot examples. %>
<section class="ck-card ck-card--spaced">
  <div class="ck-prompt-preview__header">
    <p class="ck-kicker">Disagreements</p>
    <% if @disagreements.any? %>
      <span class="ck-chip"><%= pluralize(@disagreements.size, "row") %></span>
    <% end %>
  </div>
  <% if @disagreements.empty? %>
    <p class="ck-meta-copy">No disagreements yet. As humans give the verdict "disagree" on individual rows, the judge's misses will show up here for review.</p>
  <% else %>
    <table class="ck-results-table ck-disagreements-table">
      <thead>
        <tr>
          <th scope="col">Run · row</th>
          <th scope="col">Judge</th>
          <th scope="col">Human</th>
          <th scope="col">Note</th>
          <th scope="col"></th>
        </tr>
      </thead>
      <tbody>
        <%# Calibration ids already stored as few-shot examples, used to show "Added" instead of the button. %>
        <% existing_ids = Array(@metric.few_shot_examples).map { |fs| fs["calibration_id"] } %>
        <% @disagreements.each do |cal| %>
          <% row = cal.response %>
          <% source_run = row.run %>
          <% review = row.reviews.find { |r| r.metric_id == @metric.id } %>
          <% judge_score = review&.ai_score %>
          <% human_score = cal.corrected_score %>
          <tr>
            <td>
              <%= link_to ck_run_path(source_run), class: "ck-record-name" do %>
                <strong><%= source_run.name.to_s.truncate(40) %></strong>
              <% end %>
              <span class="ck-meta-copy">· <%= link_to "row ##{row.id}", run_response_path(source_run, row), class: "ck-link" %></span>
            </td>
            <td>
              <% if judge_score %>
                <span class="<%= ck_badge_classes(ck_score_kind(judge_score.to_f)) %>"><%= judge_score %></span>
              <% else %>
                <span class="ck-meta-copy">—</span>
              <% end %>
            </td>
            <td>
              <% if human_score %>
                <span class="<%= ck_badge_classes(ck_score_kind(human_score.to_f)) %>"><%= human_score %></span>
              <% else %>
                <span class="ck-meta-copy">—</span>
              <% end %>
            </td>
            <td class="ck-meta-copy"><%= cal.note.to_s.truncate(120) %></td>
            <td>
              <% if existing_ids.include?(cal.id) %>
                <span class="ck-chip ck-chip--done">Added</span>
              <% else %>
                <%= button_to "Add as judge few-shot",
                      add_few_shot_metric_path(@metric, calibration_id: cal.id),
                      method: :post,
                      form_class: "inline-block",
                      class: ck_button_classes(:light, variant: :outline) %>
              <% end %>
            </td>
          </tr>
        <% end %>
      </tbody>
    </table>
  <% end %>
</section>
130
+
131
<%# Draft judge versions in @suggestion_drafts, one card each with a publish action. %>
<% if @suggestion_drafts.any? %>
  <section class="ck-card ck-card--spaced">
    <div class="ck-prompt-preview__header">
      <p class="ck-kicker">Suggested judge variants</p>
      <%# pluralize replaces the hand-rolled "s" suffix, matching the other count chips on this page. %>
      <span class="ck-chip"><%= pluralize(@suggestion_drafts.size, "draft") %></span>
    </div>
    <p class="ck-meta-copy">Pick one and publish it to make it the current judge. The previous published version stays in history.</p>
    <div class="ck-suggestion-list">
      <% @suggestion_drafts.each do |draft| %>
        <article class="ck-suggestion-card">
          <header class="ck-suggestion-card__header">
            <span class="ck-chip ck-chip--soft">Draft #<%= draft.id %></span>
            <time class="ck-meta-copy" data-relative-time datetime="<%= draft.created_at.utc.iso8601 %>"><%= time_ago_in_words(draft.created_at) %> ago</time>
          </header>
          <pre class="ck-code ck-suggestion-card__instruction"><%= draft.instruction %></pre>
          <div class="ck-actions">
            <%# NOTE(review): every card posts to the same path with no draft id; confirm publish_draft can tell which draft was chosen. %>
            <%= button_to "Publish this draft", publish_draft_metric_path(@metric),
                  method: :post, form_class: "inline-block",
                  class: ck_button_classes(:dark) %>
          </div>
        </article>
      <% end %>
    </div>
  </section>
<% end %>
156
+
157
<%# Few-shot examples stored on the metric, listed with judge vs. human scores and the human note. %>
<%# Normalise once instead of rebuilding the array for the guard, the count chip and the loop. %>
<% few_shots = Array(@metric.few_shot_examples) %>
<% if few_shots.any? %>
  <section class="ck-card ck-card--spaced">
    <div class="ck-prompt-preview__header">
      <p class="ck-kicker">Judge few-shot examples</p>
      <span class="ck-chip"><%= pluralize(few_shots.size, "example") %></span>
    </div>
    <p class="ck-meta-copy">Disagreements added here will be injected as worked examples when the judge runs on this metric. Used by Phase 4 / 5 to retrain the judge.</p>
    <ol class="ck-few-shot-list">
      <% few_shots.each do |fs| %>
        <li class="ck-few-shot-item">
          <div class="ck-few-shot-item__scores">
            <% [["judge said", fs["judge_score"]], ["human said", fs["human_score"]]].each do |label, score| %>
              <span class="ck-meta-copy"><%= label %></span>
              <% if score %>
                <span class="<%= ck_badge_classes(ck_score_kind(score.to_f)) %>"><%= score %></span>
              <% else %>
                <%# A missing score gets a dash so the label is never left dangling. %>
                <span class="ck-meta-copy">—</span>
              <% end %>
            <% end %>
          </div>
          <% if fs["human_note"].to_s.present? %>
            <p class="ck-copy"><%= fs["human_note"] %></p>
          <% end %>
        </li>
      <% end %>
    </ol>
  </section>
<% end %>
185
+ <% end %>
data/config/routes.rb CHANGED
@@ -12,7 +12,13 @@ CompletionKit::Engine.routes.draw do
12
12
  end
13
13
 
14
14
  resources :datasets
15
- resources :metrics
15
# Standard metric routes plus three member-level POST actions
# (add_few_shot_metric, publish_draft_metric, suggest_variants_metric).
resources :metrics do
  post :add_few_shot,     on: :member
  post :publish_draft,    on: :member
  post :suggest_variants, on: :member
end
16
22
  resources :metric_groups
17
23
  resources :tags
18
24
  resources :dashboard_dismissals, only: [:create, :destroy]
@@ -0,0 +1,5 @@
1
# Adds the nullable text column `few_shot_examples` to completion_kit_metrics.
class AddFewShotExamplesToCompletionKitMetrics < ActiveRecord::Migration[8.1]
  def change
    change_table :completion_kit_metrics do |t|
      t.text :few_shot_examples
    end
  end
end
@@ -0,0 +1,15 @@
1
# Adds `state` (NOT NULL, default "published") and a nullable `source` column to
# completion_kit_judge_versions, plus a composite index on (metric_id, state).
class AddStateToCompletionKitJudgeVersions < ActiveRecord::Migration[8.1]
  def change
    # The NOT NULL default is applied to existing rows as part of ADD COLUMN, so
    # every pre-existing judge version already reads "published"; no separate
    # full-table UPDATE backfill is required.
    add_column :completion_kit_judge_versions, :state, :string, null: false, default: "published"
    add_column :completion_kit_judge_versions, :source, :string

    add_index :completion_kit_judge_versions, [:metric_id, :state],
              name: "index_ck_judge_versions_on_metric_state"
  end
end
@@ -1,3 +1,3 @@
1
1
  module CompletionKit
2
- VERSION = "0.5.36"
2
+ VERSION = "0.5.37"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: completion-kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.36
4
+ version: 0.5.37
5
5
  platform: ruby
6
6
  authors:
7
7
  - Damien Bastin
@@ -290,14 +290,17 @@ files:
290
290
  - app/models/concerns/completion_kit/taggable.rb
291
291
  - app/services/completion_kit/anthropic_client.rb
292
292
  - app/services/completion_kit/api_config.rb
293
+ - app/services/completion_kit/calibration_math.rb
293
294
  - app/services/completion_kit/csv_processor.rb
294
295
  - app/services/completion_kit/dashboard_stats.rb
295
296
  - app/services/completion_kit/judge_service.rb
297
+ - app/services/completion_kit/judge_variant_generator.rb
296
298
  - app/services/completion_kit/llm_client.rb
297
299
  - app/services/completion_kit/mcp_dispatcher.rb
298
300
  - app/services/completion_kit/mcp_tools/base.rb
299
301
  - app/services/completion_kit/mcp_tools/calibrations.rb
300
302
  - app/services/completion_kit/mcp_tools/datasets.rb
303
+ - app/services/completion_kit/mcp_tools/judges.rb
301
304
  - app/services/completion_kit/mcp_tools/metric_groups.rb
302
305
  - app/services/completion_kit/mcp_tools/metrics.rb
303
306
  - app/services/completion_kit/mcp_tools/prompts.rb
@@ -305,6 +308,7 @@ files:
305
308
  - app/services/completion_kit/mcp_tools/responses.rb
306
309
  - app/services/completion_kit/mcp_tools/runs.rb
307
310
  - app/services/completion_kit/mcp_tools/tags.rb
311
+ - app/services/completion_kit/metric_calibration_stats.rb
308
312
  - app/services/completion_kit/model_discovery_service.rb
309
313
  - app/services/completion_kit/ollama_client.rb
310
314
  - app/services/completion_kit/onboarding/checklist.rb
@@ -323,6 +327,7 @@ files:
323
327
  - app/views/completion_kit/api_reference/_resource_list.html.erb
324
328
  - app/views/completion_kit/api_reference/index.html.erb
325
329
  - app/views/completion_kit/calibrations/_buttons.html.erb
330
+ - app/views/completion_kit/calibrations/_trust_panel.html.erb
326
331
  - app/views/completion_kit/dashboard/_eye_icon.html.erb
327
332
  - app/views/completion_kit/dashboard/_eye_off_icon.html.erb
328
333
  - app/views/completion_kit/dashboard/_failures_card.html.erb
@@ -407,6 +412,8 @@ files:
407
412
  - db/migrate/20260516000001_create_completion_kit_dashboard_dismissals.rb
408
413
  - db/migrate/20260522000001_create_completion_kit_judge_versions.rb
409
414
  - db/migrate/20260522000002_create_completion_kit_calibrations.rb
415
+ - db/migrate/20260523000001_add_few_shot_examples_to_completion_kit_metrics.rb
416
+ - db/migrate/20260523000002_add_state_to_completion_kit_judge_versions.rb
410
417
  - lib/completion-kit.rb
411
418
  - lib/completion_kit.rb
412
419
  - lib/completion_kit/concurrency_check.rb