completion-kit 0.5.38 → 0.5.40

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fc7d527828189c2993060b315dca634fb958d2da11fd7fae63c4790179c46701
4
- data.tar.gz: f0323b980bdfb35d36742b548ddd3629e66d39e587775521678dc80b4cd2f068
3
+ metadata.gz: d8875fa22de9c8626e401706818b271f3cb26bffa7faae1f583b29503228cead
4
+ data.tar.gz: 5bf39fad883b2eed2f505b11403ab60fddf9efc5735073ff61c290d53f59a36d
5
5
  SHA512:
6
- metadata.gz: '020946bdac698194bb5246cfbe21fdf45c56006c80c15d1d7bcfda4d3494d95cde45645e090df14a411b172c83dcde42be777d74811b25a340e5710dba6ae7ce'
7
- data.tar.gz: 1b4f0ea8cf4e613df783ac428404ef1ae19b285db04f8a6768119760c65fb81d9ee72d52905ab2ef30e077ae910eb00e622925c1dcc43aee0c9e3a0f748718b1
6
+ metadata.gz: a8f2a7f14235c1214b567b891defaf523a645a21a2409ed81df964893a260cccb1fc9bf63903794f952c91bc9c91f3c1e3850db751a08ce0edc49b360ad9642d
7
+ data.tar.gz: e148b500e498a00dc370bf203fea3e2618b9f1c8fccd1dc5f220ae77a4988a8934198ad32bbf25a29bc84f948c687fab9a0ae16b9c03fe575fe9fca4cf98a0ea
@@ -1,12 +1,26 @@
1
- <% %w[400 500 700].each do |weight| %>
2
1
  @font-face {
3
2
  font-family: 'JetBrains Mono';
4
3
  font-style: normal;
5
- font-weight: <%= weight %>;
4
+ font-weight: 400;
5
+ font-display: swap;
6
+ src: url('completion_kit/jetbrains-mono-400.woff2') format('woff2');
7
+ }
8
+
9
+ @font-face {
10
+ font-family: 'JetBrains Mono';
11
+ font-style: normal;
12
+ font-weight: 500;
13
+ font-display: swap;
14
+ src: url('completion_kit/jetbrains-mono-500.woff2') format('woff2');
15
+ }
16
+
17
+ @font-face {
18
+ font-family: 'JetBrains Mono';
19
+ font-style: normal;
20
+ font-weight: 700;
6
21
  font-display: swap;
7
- src: url('<%= asset_path("completion_kit/jetbrains-mono-#{weight}.woff2") %>') format('woff2');
22
+ src: url('completion_kit/jetbrains-mono-700.woff2') format('woff2');
8
23
  }
9
- <% end %>
10
24
 
11
25
  .turbo-progress-bar {
12
26
  background-color: var(--ck-accent);
@@ -1527,6 +1541,25 @@ tr:hover .ck-chip--publish {
1527
1541
  border: 0;
1528
1542
  border-radius: 0;
1529
1543
  background: transparent;
1544
+ white-space: pre;
1545
+ color: #93c5fd;
1546
+ font-size: 0.86rem;
1547
+ line-height: 1.55;
1548
+ }
1549
+ .ck-code-scroll-wrap > .ck-code .ck-json-key {
1550
+ color: #c4b5fd;
1551
+ }
1552
+ .ck-code-scroll-wrap > .ck-code .ck-json-string {
1553
+ color: #93c5fd;
1554
+ }
1555
+ .ck-code-scroll-wrap > .ck-code .ck-json-number {
1556
+ color: #fcd34d;
1557
+ }
1558
+ .ck-code-scroll-wrap > .ck-code .ck-json-keyword {
1559
+ color: #f9a8d4;
1560
+ }
1561
+ .ck-code-scroll-wrap > .ck-code .ck-json-punct {
1562
+ color: var(--ck-dim);
1530
1563
  }
1531
1564
 
1532
1565
  .ck-note-box {
@@ -2741,7 +2774,7 @@ select.ck-input {
2741
2774
  border: 1px solid var(--ck-line);
2742
2775
  border-radius: var(--ck-radius-lg);
2743
2776
  background: var(--ck-surface);
2744
- padding: 1.5rem;
2777
+ padding: 1.25rem;
2745
2778
  }
2746
2779
 
2747
2780
  .ck-review-card__header {
@@ -2751,6 +2784,11 @@ select.ck-input {
2751
2784
  gap: 1rem;
2752
2785
  }
2753
2786
 
2787
+ .ck-review-card__header .ck-inline {
2788
+ flex-wrap: nowrap;
2789
+ flex-shrink: 0;
2790
+ }
2791
+
2754
2792
  .ck-review-card__metric {
2755
2793
  font-family: var(--ck-mono);
2756
2794
  font-size: 0.95rem;
@@ -2759,11 +2797,9 @@ select.ck-input {
2759
2797
  }
2760
2798
 
2761
2799
  .ck-review-card__feedback {
2762
- margin-top: 0.75rem;
2763
- }
2764
-
2765
- .ck-review-card__feedback .ck-note-box {
2766
- margin-top: 0;
2800
+ margin: 0.6rem 0 0;
2801
+ color: var(--ck-muted);
2802
+ line-height: 1.55;
2767
2803
  }
2768
2804
 
2769
2805
  @media (max-width: 900px) {
@@ -2800,6 +2836,14 @@ select.ck-input {
2800
2836
  width: 100%;
2801
2837
  }
2802
2838
 
2839
+ /* button_to renders a form.inline-block wrapping the button. When the inner
2840
+ button is a full .ck-button (not an icon-button or chip), the form should
2841
+ stretch with it. */
2842
+ form.inline-block:has(> .ck-button) {
2843
+ width: 100%;
2844
+ display: block;
2845
+ }
2846
+
2803
2847
  /* Page header stacks: title, then lead text full-width, then action. */
2804
2848
  .ck-page-header {
2805
2849
  flex-direction: column;
@@ -2827,6 +2871,12 @@ select.ck-input {
2827
2871
  padding: 1rem;
2828
2872
  }
2829
2873
 
2874
+ .ck-review-card__header {
2875
+ flex-direction: column;
2876
+ align-items: flex-start;
2877
+ gap: 0.5rem;
2878
+ }
2879
+
2830
2880
  /* Topbar nav collapses behind the hamburger trigger. */
2831
2881
  .ck-nav-menu__trigger {
2832
2882
  display: inline-flex;
@@ -4559,9 +4609,8 @@ a.tag-mark {
4559
4609
  }
4560
4610
 
4561
4611
  .ck-launch__progress {
4562
- padding-bottom: 1.5rem;
4563
- margin-bottom: 0.5rem;
4564
- border-bottom: 1px solid var(--ck-line);
4612
+ padding-bottom: 0;
4613
+ margin-bottom: 1.25rem;
4565
4614
  }
4566
4615
  .ck-launch__progress-head {
4567
4616
  display: flex;
@@ -5126,22 +5175,37 @@ a.tag-mark {
5126
5175
  border-top: 1px dashed var(--ck-line);
5127
5176
  }
5128
5177
  .ck-calibration__prompt {
5178
+ margin: 0 0 10px;
5179
+ display: flex;
5180
+ align-items: baseline;
5181
+ flex-wrap: wrap;
5182
+ gap: 8px 12px;
5183
+ }
5184
+ .ck-calibration__label {
5129
5185
  font-family: var(--ck-mono);
5130
5186
  font-size: 0.72rem;
5131
5187
  letter-spacing: 0.06em;
5132
5188
  text-transform: uppercase;
5133
5189
  color: var(--ck-dim);
5134
- margin: 0 0 10px;
5135
- display: flex;
5136
- align-items: center;
5137
- gap: 10px;
5190
+ flex-shrink: 0;
5138
5191
  }
5139
5192
  .ck-calibration__count {
5140
5193
  font-family: var(--ck-mono);
5141
5194
  font-size: 0.72rem;
5142
5195
  letter-spacing: 0.03em;
5143
5196
  color: var(--ck-accent);
5144
- text-transform: none;
5197
+ }
5198
+ .ck-calibration__hint {
5199
+ font-size: 0.82rem;
5200
+ color: var(--ck-dim);
5201
+ line-height: 1.4;
5202
+ }
5203
+ @media (max-width: 640px) {
5204
+ .ck-calibration__prompt {
5205
+ flex-direction: column;
5206
+ align-items: flex-start;
5207
+ gap: 4px;
5208
+ }
5145
5209
  }
5146
5210
  .ck-calibration__buttons {
5147
5211
  display: flex;
@@ -5196,11 +5260,13 @@ a.tag-mark {
5196
5260
  margin-top: 12px;
5197
5261
  display: flex;
5198
5262
  flex-direction: column;
5199
- gap: 8px;
5200
- padding: 12px;
5201
- background: var(--ck-surface-soft);
5202
- border: 1px solid var(--ck-line);
5203
- border-radius: 6px;
5263
+ gap: 12px;
5264
+ }
5265
+ .ck-calibration__detail > * {
5266
+ margin: 0;
5267
+ }
5268
+ .ck-calibration__detail .ck-button {
5269
+ align-self: flex-start;
5204
5270
  }
5205
5271
  .ck-calibration__value {
5206
5272
  color: var(--ck-accent);
@@ -5322,44 +5388,28 @@ a.tag-mark {
5322
5388
  text-transform: uppercase;
5323
5389
  }
5324
5390
 
5325
- .ck-draft-banner {
5391
+ .ck-draft-pending {
5392
+ border-color: rgba(6, 182, 212, 0.45);
5393
+ background: linear-gradient(180deg, var(--ck-accent-soft), var(--ck-surface));
5394
+ }
5395
+
5396
+ .ck-suggestion-banner {
5326
5397
  display: inline-flex;
5327
5398
  align-items: center;
5328
5399
  gap: 10px;
5329
5400
  margin-top: 10px;
5330
- padding: 8px 12px;
5401
+ padding: 8px 14px;
5331
5402
  background: var(--ck-accent-soft);
5332
- border: 1px dashed rgba(6, 182, 212, 0.4);
5333
- border-radius: 6px;
5334
- }
5335
-
5336
- .ck-suggestion-list {
5337
- display: flex;
5338
- flex-direction: column;
5339
- gap: 12px;
5340
- }
5341
- .ck-suggestion-card {
5342
- padding: 12px 14px;
5343
- background: var(--ck-surface-soft);
5344
- border: 1px solid var(--ck-line);
5403
+ border: 1px solid rgba(6, 182, 212, 0.35);
5345
5404
  border-radius: 6px;
5346
- display: flex;
5347
- flex-direction: column;
5348
- gap: 10px;
5349
- }
5350
- .ck-suggestion-card__header {
5351
- display: flex;
5352
- align-items: center;
5353
- gap: 10px;
5405
+ color: var(--ck-accent);
5406
+ font-family: var(--ck-mono);
5407
+ font-size: 0.82rem;
5408
+ text-decoration: none;
5354
5409
  }
5355
- .ck-suggestion-card__instruction {
5356
- margin: 0;
5357
- white-space: pre-wrap;
5358
- font-size: 0.85rem;
5359
- background: var(--ck-bg-strong);
5360
- padding: 10px 12px;
5361
- border-radius: 4px;
5362
- border: 1px solid var(--ck-line);
5410
+ .ck-suggestion-banner:hover,
5411
+ .ck-suggestion-banner:focus-visible {
5412
+ border-color: var(--ck-accent);
5363
5413
  }
5364
5414
 
5365
5415
  .ck-metrics-table__trust {
@@ -5372,3 +5422,66 @@ a.tag-mark {
5372
5422
  color: var(--ck-success);
5373
5423
  margin-right: 6px;
5374
5424
  }
5425
+
5426
+ .ck-calibration__error {
5427
+ margin: 8px 0 0;
5428
+ padding: 8px 10px;
5429
+ background: var(--ck-danger-soft);
5430
+ border: 1px solid rgba(248, 113, 113, 0.3);
5431
+ border-radius: 4px;
5432
+ color: var(--ck-danger);
5433
+ font-size: 0.82rem;
5434
+ }
5435
+
5436
+ .ck-star-picker {
5437
+ border: 0;
5438
+ padding: 0;
5439
+ margin: 0;
5440
+ }
5441
+ .ck-star-picker__row {
5442
+ display: inline-flex;
5443
+ flex-direction: row-reverse;
5444
+ gap: 2px;
5445
+ }
5446
+ .ck-star-picker input {
5447
+ position: absolute;
5448
+ width: 1px;
5449
+ height: 1px;
5450
+ opacity: 0;
5451
+ pointer-events: none;
5452
+ }
5453
+ .ck-star-picker label {
5454
+ cursor: pointer;
5455
+ display: inline-flex;
5456
+ padding: 4px;
5457
+ border-radius: 4px;
5458
+ }
5459
+ .ck-star-picker label svg {
5460
+ fill: transparent;
5461
+ stroke: var(--ck-line-strong);
5462
+ transition: fill 0.08s, stroke 0.08s;
5463
+ }
5464
+ .ck-star-picker input:checked ~ label svg {
5465
+ fill: var(--ck-warning);
5466
+ stroke: var(--ck-warning);
5467
+ }
5468
+ .ck-star-picker__row:hover label svg {
5469
+ fill: transparent;
5470
+ stroke: var(--ck-line-strong);
5471
+ }
5472
+ .ck-star-picker__row:hover label:hover svg,
5473
+ .ck-star-picker__row:hover label:hover ~ label svg {
5474
+ fill: var(--ck-warning);
5475
+ stroke: var(--ck-warning);
5476
+ }
5477
+ .ck-star-picker input:focus-visible + label {
5478
+ outline: 2px solid var(--ck-accent);
5479
+ outline-offset: 2px;
5480
+ }
5481
+
5482
+ .ck-button--just-saved {
5483
+ animation: ck-saved-flash 1.4s ease-out;
5484
+ }
5485
+ @keyframes ck-saved-flash {
5486
+ 0% { background: var(--ck-success); border-color: var(--ck-success); }
5487
+ }
@@ -5,9 +5,18 @@ module CompletionKit
5
5
 
6
6
  def create
7
7
  created_by = calibration_creator
8
- calibration = Calibration.find_or_initialize_by(
8
+ existing = Calibration.find_by(
9
9
  run_id: @run.id, response_id: @response.id, metric_id: @metric.id, created_by: created_by
10
10
  )
11
+
12
+ if params[:verdict] == "disagree" && params[:corrected_score].blank?
13
+ render_calibration(calibration: existing, pending_verdict: "disagree")
14
+ return
15
+ end
16
+
17
+ calibration = existing || Calibration.new(
18
+ run: @run, response: @response, metric: @metric, created_by: created_by
19
+ )
11
20
  calibration.assign_attributes(
12
21
  judge_version: JudgeVersion.ensure_current_for(@metric),
13
22
  verdict: params[:verdict],
@@ -16,19 +25,37 @@ module CompletionKit
16
25
  )
17
26
 
18
27
  if calibration.save
19
- render turbo_stream: turbo_stream.replace(
20
- "calibration_#{@response.id}_#{@metric.id}",
21
- partial: "completion_kit/calibrations/buttons",
22
- locals: { review: review_for_metric, calibration: calibration, run: @run, response_row: @response, metric: @metric }
23
- )
28
+ render_calibration(calibration: calibration, just_saved: true)
24
29
  else
25
- flash[:alert] = calibration.errors.full_messages.to_sentence
26
- redirect_to run_response_path(@run, @response)
30
+ render_calibration(
31
+ calibration: existing,
32
+ pending_verdict: params[:verdict],
33
+ error: calibration.errors.full_messages.to_sentence,
34
+ status: :unprocessable_entity
35
+ )
27
36
  end
28
37
  end
29
38
 
30
39
  private
31
40
 
41
+ def render_calibration(calibration:, pending_verdict: nil, error: nil, just_saved: false, status: :ok)
42
+ locals = {
43
+ review: review_for_metric,
44
+ calibration: calibration,
45
+ run: @run,
46
+ response_row: @response,
47
+ metric: @metric,
48
+ pending_verdict: pending_verdict,
49
+ error: error,
50
+ just_saved: just_saved
51
+ }
52
+ render turbo_stream: turbo_stream.replace(
53
+ "calibration_#{@response.id}_#{@metric.id}",
54
+ partial: "completion_kit/calibrations/buttons",
55
+ locals: locals
56
+ ), status: status
57
+ end
58
+
32
59
  def ensure_calibration_enabled
33
60
  head :not_found unless CompletionKit.config.judge_calibration_enabled
34
61
  end
@@ -1,7 +1,7 @@
1
1
  module CompletionKit
2
2
  class MetricsController < ApplicationController
3
3
  include CompletionKit::TagFiltering
4
- before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :publish_draft, :suggest_variants]
4
+ before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :publish_draft, :suggest_variants, :dismiss_suggestion]
5
5
 
6
6
  def index
7
7
  @metrics = apply_tag_filter(Metric.includes(:metric_groups, :tags).order(:name))
@@ -12,8 +12,10 @@ module CompletionKit
12
12
  .includes(response: [:reviews, :run])
13
13
  .order(created_at: :desc)
14
14
  .limit(50)
15
- @latest_draft = JudgeVersion.drafts.where(metric_id: @metric.id).order(created_at: :desc).first
16
- @suggestion_drafts = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc)
15
+ @edit_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
16
+ @published_judge_version = JudgeVersion.published.where(metric_id: @metric.id, current: true).first
17
+ @suggestion_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
18
+ @improve_disagreement_count = @disagreements.size
17
19
  end
18
20
 
19
21
  def new
@@ -47,15 +49,28 @@ module CompletionKit
47
49
  end
48
50
 
49
51
  def suggest_variants
50
- generator = JudgeVariantGenerator.new(@metric)
52
+ disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
53
+ if disagreement_count.zero?
54
+ redirect_to metric_path(@metric), alert: "Mark at least one row as Disagree before asking the model to suggest a change."
55
+ return
56
+ end
57
+
58
+ JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
59
+
60
+ generator = JudgeVariantGenerator.new(@metric, count: 1)
51
61
  variants = generator.call
52
62
  if variants.empty?
53
63
  redirect_to metric_path(@metric), alert: "The model returned no usable variants. Try again with a different model."
54
64
  return
55
65
  end
56
66
  generator.persist!(variants)
57
- label = variants.length == 1 ? "alternative" : "alternatives"
58
- redirect_to metric_path(@metric), notice: "Wrote #{variants.length} #{label} for the judge instruction. Pick one to make it live."
67
+ redirect_to metric_path(@metric), notice: "Drafted a new version. Review it below."
68
+ end
69
+
70
+ def dismiss_suggestion
71
+ draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").find_by(id: params[:draft_id])
72
+ draft&.destroy
73
+ redirect_to metric_path(@metric), notice: "Dismissed."
59
74
  end
60
75
 
61
76
  def publish_draft
@@ -202,15 +202,86 @@ module CompletionKit
202
202
  def ck_format_maybe_json(text)
203
203
  s = text.to_s
204
204
  return s if s.strip.empty?
205
- first = s.strip[0]
205
+ payload = ck_unwrap_json_fence(s.strip)
206
+ first = payload[0]
206
207
  return s unless first == "{" || first == "["
207
208
  begin
208
- JSON.pretty_generate(JSON.parse(s))
209
+ ck_highlight_json(JSON.pretty_generate(JSON.parse(payload)))
209
210
  rescue JSON::ParserError
210
211
  s
211
212
  end
212
213
  end
213
214
 
215
+ def ck_unwrap_json_fence(text)
216
+ m = text.match(/\A```(?:json|JSON)?\s*\n(.*?)\n?```\s*\z/m)
217
+ m ? m[1].strip : text
218
+ end
219
+
220
+ def ck_highlight_json(text)
221
+ tokens = ck_tokenize_json(text)
222
+ is_key = ck_mark_json_keys(tokens)
223
+ parts = tokens.each_with_index.map do |(type, value), idx|
224
+ escaped = ERB::Util.html_escape(value)
225
+ case type
226
+ when :punct then %(<span class="ck-json-punct">#{escaped}</span>)
227
+ when :string
228
+ %(<span class="#{is_key[idx] ? "ck-json-key" : "ck-json-string"}">#{escaped}</span>)
229
+ when :number then %(<span class="ck-json-number">#{escaped}</span>)
230
+ when :keyword then %(<span class="ck-json-keyword">#{escaped}</span>)
231
+ else escaped
232
+ end
233
+ end
234
+ parts.join.html_safe
235
+ end
236
+
237
+ def ck_tokenize_json(text)
238
+ tokens = []
239
+ i = 0
240
+ len = text.length
241
+ while i < len
242
+ ch = text[i]
243
+ if ch == " " || ch == "\n" || ch == "\t"
244
+ tokens << [:ws, ch]
245
+ i += 1
246
+ elsif "{}[]:,".include?(ch)
247
+ tokens << [:punct, ch]
248
+ i += 1
249
+ elsif ch == '"'
250
+ j = i + 1
251
+ while j < len && text[j] != '"'
252
+ j += text[j] == "\\" ? 2 : 1
253
+ end
254
+ j = len - 1 if j >= len
255
+ tokens << [:string, text[i..j]]
256
+ i = j + 1
257
+ elsif ch == "-" || (ch >= "0" && ch <= "9")
258
+ j = i + 1
259
+ j += 1 while j < len && "0123456789.eE+-".include?(text[j])
260
+ tokens << [:number, text[i...j]]
261
+ i = j
262
+ elsif text[i, 4] == "true" || text[i, 4] == "null"
263
+ tokens << [:keyword, text[i, 4]]
264
+ i += 4
265
+ elsif text[i, 5] == "false"
266
+ tokens << [:keyword, "false"]
267
+ i += 5
268
+ else
269
+ tokens << [:other, ch]
270
+ i += 1
271
+ end
272
+ end
273
+ tokens
274
+ end
275
+
276
+ def ck_mark_json_keys(tokens)
277
+ tokens.each_with_index.map do |(type, _), idx|
278
+ next false unless type == :string
279
+ j = idx + 1
280
+ j += 1 while j < tokens.length && tokens[j][0] == :ws
281
+ j < tokens.length && tokens[j] == [:punct, ":"]
282
+ end
283
+ end
284
+
214
285
  def tag_filter_url(base_path, selected, toggling)
215
286
  remaining = selected.reject { |t| t.id == toggling.id }
216
287
  next_set = selected.include?(toggling) ? remaining : remaining + [toggling]
@@ -1,13 +1,15 @@
1
1
  module CompletionKit
2
2
  class JudgeVariantGenerator
3
- DEFAULT_VARIANT_COUNT = 3
3
+ DEFAULT_VARIANT_COUNT = 1
4
+ MAX_VARIANT_COUNT = 3
4
5
  DEFAULT_TEMPERATURE = 0.4
5
6
 
6
- Variant = Struct.new(:reasoning, :instruction, keyword_init: true)
7
+ Variant = Struct.new(:reasoning, :instruction, :rubric_bands, keyword_init: true)
7
8
 
8
9
  def initialize(metric, count: DEFAULT_VARIANT_COUNT, model: nil)
9
10
  @metric = metric
10
- @count = count
11
+ n = count.to_i
12
+ @count = n < 1 ? DEFAULT_VARIANT_COUNT : [n, MAX_VARIANT_COUNT].min
11
13
  @model = model || CompletionKit.config.judge_model
12
14
  end
13
15
 
@@ -23,7 +25,7 @@ module CompletionKit
23
25
  JudgeVersion.create!(
24
26
  metric: @metric,
25
27
  instruction: variant.instruction,
26
- rubric_bands: @metric.rubric_bands,
28
+ rubric_bands: variant.rubric_bands.presence || @metric.rubric_bands,
27
29
  state: "draft",
28
30
  source: "suggestion",
29
31
  current: false
@@ -39,36 +41,57 @@ module CompletionKit
39
41
  private
40
42
 
41
43
  def build_meta_prompt
42
- examples = JudgeCalibrationExamples.for(@metric)
44
+ disagreements = JudgeCalibrationExamples.disagreements_for(@metric)
45
+ borderlines = JudgeCalibrationExamples.borderlines_for(@metric)
43
46
  sections = []
44
- sections << "You are an expert evaluator. Rewrite a judge's grading instruction so it agrees better with humans on the cases below."
47
+ sections << "You are an expert evaluator. The judge below is misaligned with humans. Propose #{@count == 1 ? "a single" : "#{@count}"} concrete rewrite that closes the gap."
45
48
  sections << ""
46
49
  sections << "## Current instruction"
47
50
  sections << "```"
48
51
  sections << @metric.instruction.to_s
49
52
  sections << "```"
50
53
  sections << ""
51
- sections << "## Rubric (unchanged across variants — only rewrite the instruction)"
54
+ sections << "## Current rubric (5 to 1)"
52
55
  sections << @metric.display_rubric_text
53
56
  sections << ""
54
- sections << "## Recent disagreements (judge vs human)"
55
- examples.each_with_index do |ex, i|
56
- sections << "### Case #{i + 1}"
57
- sections << "Input: #{ex[:input].to_s.truncate(200)}"
58
- sections << "Output: #{ex[:output].to_s.truncate(200)}"
59
- sections << "Judge said #{ex[:judge_score]}/5: #{ex[:judge_feedback].to_s.truncate(160)}"
60
- sections << "Human said #{ex[:human_score]}/5: #{ex[:human_note].to_s.truncate(160)}"
61
- sections << ""
57
+ if disagreements.any?
58
+ sections << "## Recent disagreements (judge vs human)"
59
+ disagreements.each_with_index do |ex, i|
60
+ sections << "### Case #{i + 1}"
61
+ sections << "Input: #{ex[:input].to_s.truncate(200)}"
62
+ sections << "Output: #{ex[:output].to_s.truncate(200)}"
63
+ sections << "Judge said #{ex[:judge_score]}/5: #{ex[:judge_feedback].to_s.truncate(160)}"
64
+ sections << "Human said #{ex[:human_score]}/5: #{ex[:human_note].to_s.truncate(160)}"
65
+ sections << ""
66
+ end
67
+ end
68
+ if borderlines.any?
69
+ sections << "## Rubric-ambiguous cases (humans marked these borderline)"
70
+ sections << "These are cases where a human said the rubric itself was unclear. If the rubric needs sharpening, rewrite it."
71
+ borderlines.each_with_index do |ex, i|
72
+ sections << "### Borderline #{i + 1}"
73
+ sections << "Input: #{ex[:input].to_s.truncate(200)}"
74
+ sections << "Output: #{ex[:output].to_s.truncate(200)}"
75
+ sections << "Judge said #{ex[:judge_score]}/5: #{ex[:judge_feedback].to_s.truncate(160)}"
76
+ sections << "Human note: #{ex[:human_note].to_s.truncate(200)}" if ex[:human_note].to_s.present?
77
+ sections << ""
78
+ end
62
79
  end
63
80
  sections << "## Task"
64
- sections << "Propose #{@count} alternative instructions. Each should be a focused rewrite not a wholesale rewrite of the rubric. Aim to close the disagreement gap."
81
+ sections << "Make one substantive change. Don't just reword. If the disagreements look like instruction problems, rewrite the instruction. If they look like rubric problems (overlapping bands, undefined edge cases), rewrite the rubric. Rewrite both if both are wrong."
65
82
  sections << ""
66
- sections << "Respond in EXACTLY this format, repeated #{@count} times:"
83
+ sections << "Respond in EXACTLY this format, repeated #{@count} time#{@count == 1 ? "" : "s"}:"
67
84
  sections << ""
68
85
  sections << "VARIANT:"
69
- sections << "REASONING: <one sentence explaining what this variant changes>"
86
+ sections << "REASONING: <one short sentence: what changes and why>"
70
87
  sections << "INSTRUCTION:"
71
88
  sections << "<the rewritten instruction>"
89
+ sections << "RUBRIC: # optional — omit this block if the rubric is unchanged"
90
+ sections << "5: <description for 5 stars>"
91
+ sections << "4: <description for 4 stars>"
92
+ sections << "3: <description for 3 stars>"
93
+ sections << "2: <description for 2 stars>"
94
+ sections << "1: <description for 1 star>"
72
95
  sections << "END_VARIANT"
73
96
  sections.join("\n")
74
97
  end
@@ -77,22 +100,44 @@ module CompletionKit
77
100
  blocks = text.to_s.scan(/VARIANT:(.*?)END_VARIANT/m).flatten
78
101
  blocks.filter_map do |raw|
79
102
  reasoning = raw[/REASONING:\s*(.*?)(?=INSTRUCTION:|\z)/m, 1].to_s.strip
80
- instruction = raw[/INSTRUCTION:\s*(.*)/m, 1].to_s.strip
103
+ instruction = raw[/INSTRUCTION:\s*(.*?)(?=RUBRIC:|\z)/m, 1].to_s.strip
81
104
  next if instruction.empty?
82
- Variant.new(reasoning: reasoning, instruction: instruction)
105
+ rubric_block = raw[/RUBRIC:\s*(.*)/m, 1].to_s
106
+ Variant.new(reasoning: reasoning, instruction: instruction, rubric_bands: parse_rubric(rubric_block))
83
107
  end
84
108
  end
109
+
110
+ def parse_rubric(block)
111
+ return nil if block.strip.empty?
112
+ bands = block.scan(/^\s*([1-5])\s*[:\-]\s*(.+?)\s*$/).map do |stars, description|
113
+ { "stars" => stars.to_i, "description" => description.strip }
114
+ end
115
+ return nil if bands.length != 5
116
+ bands.sort_by { |b| -b["stars"] }
117
+ end
85
118
  end
86
119
 
87
120
  module JudgeCalibrationExamples
88
121
  module_function
89
122
 
90
123
  def for(metric, limit: 8)
91
- disagreements = Calibration.where(metric_id: metric.id, verdict: "disagree")
92
- .includes(response: :reviews)
93
- .order(created_at: :desc)
94
- .limit(limit)
95
- disagreements.map do |cal|
124
+ disagreements_for(metric, limit: limit)
125
+ end
126
+
127
+ def disagreements_for(metric, limit: 8)
128
+ calibrations_for(metric, verdict: "disagree", limit: limit)
129
+ end
130
+
131
+ def borderlines_for(metric, limit: 6)
132
+ calibrations_for(metric, verdict: "borderline", limit: limit)
133
+ end
134
+
135
+ def calibrations_for(metric, verdict:, limit:)
136
+ Calibration.where(metric_id: metric.id, verdict: verdict)
137
+ .includes(response: :reviews)
138
+ .order(created_at: :desc)
139
+ .limit(limit)
140
+ .map do |cal|
96
141
  review = cal.response.reviews.find { |r| r.metric_id == metric.id }
97
142
  {
98
143
  input: cal.response.input_data,
@@ -10,7 +10,7 @@ module CompletionKit
10
10
  type: "object",
11
11
  properties: {
12
12
  metric_id: { type: "integer" },
13
- count: { type: "integer", description: "How many variants to request (default 3, max 5)." },
13
+ count: { type: "integer", description: "How many variants to request (default 1, max 3). One focused rewrite beats five reworded copies." },
14
14
  model: { type: "string", description: "Override the model used to generate variants. Defaults to CompletionKit.config.judge_model." }
15
15
  },
16
16
  required: ["metric_id"]
@@ -49,9 +49,7 @@ module CompletionKit
49
49
 
50
50
  def self.suggest(args)
51
51
  metric = CompletionKit::Metric.find(args["metric_id"])
52
- count = [args["count"].to_i, 5].min
53
- count = CompletionKit::JudgeVariantGenerator::DEFAULT_VARIANT_COUNT if count <= 0
54
- generator = CompletionKit::JudgeVariantGenerator.new(metric, count: count, model: args["model"])
52
+ generator = CompletionKit::JudgeVariantGenerator.new(metric, count: args["count"].to_i, model: args["model"])
55
53
  variants = generator.call
56
54
  return error_result("Variant generator returned no parseable variants. Try again or change the model.") if variants.empty?
57
55
  versions = generator.persist!(variants)
@@ -1,20 +1,20 @@
1
1
  module CompletionKit
2
2
  module Onboarding
3
3
  # Opt-in starter data for the onboarding page: one dataset + one prompt so a
4
- # brand-new install has something to poke at. Idempotent a no-op once the
4
+ # brand-new install has something to poke at. Idempotent. A no-op once the
5
5
  # workspace already has any prompt or dataset. Deliberately does NOT create a
6
6
  # provider credential (needs a real API key) or a run (user-initiated).
7
7
  module SampleData
8
8
  SAMPLE_CSV = <<~CSV.freeze
9
9
  ticket
10
10
  "My order #4827 arrived with a dented panel. I emailed photos 11 days ago and heard nothing. Today I was told the return window 'closed'. I paid $749. I want a refund or replacement, not store credit."
11
- "Tracking says delivered to my porch Tuesday 3:47pm. I was home all day, nothing arrived, neighbours' cameras show no van. Order #5102 a $315 mixer, wedding gift, wedding is Saturday. Can someone look today?"
11
+ "Tracking says delivered to my porch Tuesday 3:47pm. I was home all day, nothing arrived, neighbours' cameras show no van. Order #5102. A $315 mixer, wedding gift, wedding is Saturday. Can someone look today?"
12
12
  "WELCOME20 says 'invalid' at checkout but the promo email says it's good through May 31. Same email I'm signed in with. Tried Chrome and Safari. Cart is $186 waiting on you."
13
13
  CSV
14
14
 
15
15
  SAMPLE_PROMPT = {
16
16
  name: "Sample: Support reply",
17
- description: "A starter prompt drafts a warm, professional reply to a customer support ticket. Edit it or delete it; it's just here to get you going.",
17
+ description: "A starter prompt. Drafts a warm, professional reply to a customer support ticket. Edit it or delete it; it's just here to get you going.",
18
18
  template: "You are a senior customer-support specialist. Write a warm, professional reply to this ticket. Acknowledge the customer's situation, be specific about next steps, and don't be defensive.\n\nTicket:\n{{ticket}}",
19
19
  llm_model: "gpt-4o-mini"
20
20
  }.freeze
@@ -1,12 +1,15 @@
1
1
  <div id="calibration_<%= response_row.id %>_<%= metric.id %>" class="ck-calibration">
2
2
  <% current_verdict = calibration&.verdict %>
3
+ <% pending_verdict = local_assigns[:pending_verdict] %>
4
+ <% active_verdict = pending_verdict || current_verdict %>
5
+ <% error = local_assigns[:error] %>
3
6
  <% verdict_count = CompletionKit::Calibration.where(response_id: response_row.id, metric_id: metric.id).count %>
4
7
  <p class="ck-calibration__prompt">
5
- Your verdict
8
+ <span class="ck-calibration__label">Your verdict</span>
6
9
  <% if verdict_count > 0 %>
7
- <span class="ck-calibration__count"><%= pluralize(verdict_count, "verdict") %> on this score · <%= link_to "judge trust →", metric_path(metric), class: "ck-link" %></span>
10
+ <span class="ck-calibration__count"><%= pluralize(verdict_count, "verdict") %> on this score · <%= link_to "trust level →", metric_path(metric), class: "ck-link" %></span>
8
11
  <% else %>
9
- <span class="ck-calibration__count">Tell us what you think — was the score right? Verdicts roll up into <%= link_to "judge trust", metric_path(metric), class: "ck-link" %>.</span>
12
+ <span class="ck-calibration__hint">Tell us what you think — was the score right? Verdicts roll up into the metric's <%= link_to "trust level", metric_path(metric), class: "ck-link" %>.</span>
10
13
  <% end %>
11
14
  </p>
12
15
  <div class="ck-calibration__buttons">
@@ -20,8 +23,8 @@
20
23
  <%= button_to run_response_calibrations_path(run, response_row, metric_id: metric.id, verdict: verdict),
21
24
  method: :post,
22
25
  form: { data: { turbo: "true" } },
23
- class: "ck-calibration__pill ck-calibration__pill--#{verdict}#{' is-active' if verdict == current_verdict}",
24
- "aria-pressed": (verdict == current_verdict).to_s,
26
+ class: "ck-calibration__pill ck-calibration__pill--#{verdict}#{' is-active' if verdict == active_verdict}",
27
+ "aria-pressed": (verdict == active_verdict).to_s,
25
28
  title: verdict_hints[verdict] do %>
26
29
  <%= heroicon_tag verdict_icons[verdict], variant: :outline, size: 14, "aria-hidden": "true" %>
27
30
  <span><%= verdict %></span>
@@ -29,31 +32,41 @@
29
32
  <% end %>
30
33
  </div>
31
34
 
32
- <% if current_verdict == "disagree" %>
35
+ <% if error.present? %>
36
+ <p class="ck-calibration__error" role="alert"><%= error %></p>
37
+ <% end %>
38
+
39
+ <% if active_verdict == "disagree" %>
40
+ <% existing_score = (calibration&.corrected_score || review&.ai_score)&.round %>
33
41
  <%= form_with url: run_response_calibrations_path(run, response_row),
34
42
  method: :post, local: false,
35
43
  class: "ck-calibration__detail" do |f| %>
36
44
  <%= hidden_field_tag :metric_id, metric.id %>
37
45
  <%= hidden_field_tag :verdict, "disagree" %>
38
- <label class="ck-label">
39
- What should the score have been?
40
- <span class="ck-calibration__value" data-calibration-value><%= calibration.corrected_score || review&.ai_score || 3 %></span>
41
- </label>
42
- <input type="range" name="corrected_score" min="1" max="5" step="0.5"
43
- value="<%= calibration.corrected_score || review&.ai_score || 3 %>"
44
- oninput="this.closest('.ck-calibration__detail').querySelector('[data-calibration-value]').textContent = this.value"
45
- class="ck-slider">
46
- <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="Why? (optional)"><%= calibration.note %></textarea>
47
- <%= f.submit "Save", class: ck_button_classes(:dark) %>
46
+ <p class="ck-label">What should the score have been?</p>
47
+ <fieldset class="ck-star-picker">
48
+ <legend class="ck-visually-hidden">Pick a score from 1 to 5 stars</legend>
49
+ <div class="ck-star-picker__row">
50
+ <% [5, 4, 3, 2, 1].each do |n| %>
51
+ <% radio_id = "ck-star-#{response_row.id}-#{metric.id}-#{n}" %>
52
+ <input type="radio" name="corrected_score" id="<%= radio_id %>" value="<%= n %>" <%= "checked" if existing_score == n %> required>
53
+ <label for="<%= radio_id %>" title="<%= pluralize(n, 'star') %>" aria-label="<%= pluralize(n, 'star') %>">
54
+ <svg viewBox="0 0 24 24" width="28" height="28" stroke-width="1.5" aria-hidden="true"><polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2"/></svg>
55
+ </label>
56
+ <% end %>
57
+ </div>
58
+ </fieldset>
59
+ <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="Why? (optional)"><%= calibration&.note %></textarea>
60
+ <%= f.submit "Save", class: "#{ck_button_classes(:dark)}#{' ck-button--just-saved' if local_assigns[:just_saved]}" %>
48
61
  <% end %>
49
- <% elsif current_verdict == "borderline" %>
62
+ <% elsif active_verdict == "borderline" %>
50
63
  <%= form_with url: run_response_calibrations_path(run, response_row),
51
64
  method: :post, local: false,
52
65
  class: "ck-calibration__detail" do |f| %>
53
66
  <%= hidden_field_tag :metric_id, metric.id %>
54
67
  <%= hidden_field_tag :verdict, "borderline" %>
55
- <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="What made this borderline? (optional)"><%= calibration.note %></textarea>
56
- <%= f.submit "Save", class: ck_button_classes(:dark) %>
68
+ <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="What made this borderline? (optional)"><%= calibration&.note %></textarea>
69
+ <%= f.submit "Save", class: "#{ck_button_classes(:dark)}#{' ck-button--just-saved' if local_assigns[:just_saved]}" %>
57
70
  <% end %>
58
71
  <% end %>
59
72
  </div>
@@ -1,6 +1,6 @@
1
1
  <% stats = local_assigns[:stats] %>
2
2
  <div class="ck-trust-panel ck-trust-panel--<%= stats.gate %>">
3
- <p class="ck-trust-panel__label" title="How often the judge agrees with the humans who reviewed its scores.">Judge trust</p>
3
+ <p class="ck-trust-panel__label" title="How often this metric's scores match the humans who reviewed them.">Trust level</p>
4
4
  <% if stats.counter_only? %>
5
5
  <div class="ck-trust-panel__body">
6
6
  <span class="ck-trust-panel__counter"><%= stats.sample_size %><span class="ck-trust-panel__counter-of">/ <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></span></span>
@@ -19,7 +19,7 @@
19
19
  <tr>
20
20
  <th scope="col">Name</th>
21
21
  <th scope="col">Instruction</th>
22
- <th scope="col" title="How often the judge agrees with the humans who reviewed its scores.">Judge trust</th>
22
+ <th scope="col" title="How often this metric's scores match the humans who reviewed them.">Trust level</th>
23
23
  <th scope="col">In groups</th>
24
24
  <th scope="col"></th>
25
25
  </tr>
@@ -36,7 +36,7 @@
36
36
  <% end %>
37
37
  </td>
38
38
  <td data-label="Instruction" class="ck-meta-copy"><div class="ck-clamp-2"><%= metric.instruction.presence || "—" %></div></td>
39
- <td data-label="Judge trust" class="ck-metrics-table__trust">
39
+ <td data-label="Trust level" class="ck-metrics-table__trust">
40
40
  <% if CompletionKit.config.judge_calibration_enabled %>
41
41
  <% s = CompletionKit::MetricCalibrationStats.for(metric) %>
42
42
  <% if s.counter_only? %>
@@ -9,24 +9,58 @@
9
9
  <% if CompletionKit.config.judge_calibration_enabled %>
10
10
  <%= render "completion_kit/calibrations/trust_panel",
11
11
  stats: CompletionKit::MetricCalibrationStats.for(@metric) %>
12
- <% if @latest_draft %>
13
- <div class="ck-draft-banner">
14
- <span class="ck-chip ck-chip--soft">Draft pending</span>
15
- <span class="ck-meta-copy">A draft version of this judge is saved. Publishing it replaces the live instruction and rubric.</span>
16
- <%= button_to "Publish this version", publish_draft_metric_path(@metric, draft_id: @latest_draft.id),
17
- method: :post, form_class: "inline-block",
18
- class: ck_button_classes(:dark) %>
19
- </div>
12
+ <% if @edit_draft %>
13
+ <% pub_instr = @published_judge_version&.instruction.to_s %>
14
+ <% draft_instr = @edit_draft.instruction.to_s %>
15
+ <% instruction_changed = pub_instr != draft_instr %>
16
+ <% rubric_changed = @published_judge_version && @published_judge_version.rubric_bands != @edit_draft.rubric_bands %>
17
+ <section class="ck-card ck-card--spaced ck-draft-pending">
18
+ <div class="ck-prompt-preview__header">
19
+ <p class="ck-kicker">Draft pending</p>
20
+ <%= button_to "Publish this version", publish_draft_metric_path(@metric, draft_id: @edit_draft.id),
21
+ method: :post, form_class: "inline-block",
22
+ class: ck_button_classes(:dark) %>
23
+ </div>
24
+ <p class="ck-meta-copy">A draft of this metric is saved. Publishing it replaces the live instruction<%= ", rubric," if rubric_changed %> for future runs. Here's what changes.</p>
25
+
26
+ <% if instruction_changed %>
27
+ <div class="ck-suggest-diff">
28
+ <div class="ck-suggest-diff__pane">
29
+ <div class="ck-suggest-diff__header">
30
+ <span class="ck-suggest-diff__label ck-suggest-diff__label--before">Currently live</span>
31
+ </div>
32
+ <pre class="ck-suggest-diff__code"><%= ck_word_diff_old(pub_instr, draft_instr) %></pre>
33
+ </div>
34
+ <div class="ck-suggest-diff__pane">
35
+ <div class="ck-suggest-diff__header">
36
+ <span class="ck-suggest-diff__label ck-suggest-diff__label--after">Draft</span>
37
+ </div>
38
+ <pre class="ck-suggest-diff__code"><%= ck_word_diff_new(pub_instr, draft_instr) %></pre>
39
+ </div>
40
+ </div>
41
+ <% else %>
42
+ <p class="ck-meta-copy">The instruction is unchanged.</p>
43
+ <% end %>
44
+
45
+ <% if rubric_changed %>
46
+ <p class="ck-meta-copy"><strong>Rubric also changed.</strong> Edit the metric to inspect each band, or publish to apply the new wording.</p>
47
+ <% end %>
48
+ </section>
20
49
  <% end %>
21
50
  <% end %>
22
51
  </div>
23
52
  <div class="ck-actions">
24
53
  <% if CompletionKit.config.judge_calibration_enabled %>
25
- <%= button_to "Suggest rewrites", suggest_variants_metric_path(@metric),
26
- method: :post, form_class: "inline-block",
27
- class: ck_button_classes(:light, variant: :outline),
28
- title: "Ask the model to rewrite this judge instruction based on the disagreements collected so far.",
29
- data: { turbo_confirm: "Ask the model to rewrite this judge instruction based on the disagreements collected so far?" } %>
54
+ <% if @improve_disagreement_count.positive? %>
55
+ <%= button_to "Improve the metric", suggest_variants_metric_path(@metric),
56
+ method: :post, form_class: "inline-block",
57
+ class: ck_button_classes(:light, variant: :outline),
58
+ title: "Rewrite this metric based on the disagreements collected so far.",
59
+ data: { turbo_confirm: "Rewrite this metric based on the disagreements collected so far?" } %>
60
+ <% else %>
61
+ <button type="button" class="<%= ck_button_classes(:light, variant: :outline) %>" disabled
62
+ title="Mark at least one row as Disagree before the model can suggest a change.">Improve the metric</button>
63
+ <% end %>
30
64
  <% end %>
31
65
  <%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
32
66
  </div>
@@ -41,7 +75,7 @@
41
75
  <% if @metric.instruction.present? %>
42
76
  <section class="ck-card">
43
77
  <p class="ck-kicker">Instruction</p>
44
- <div class="ck-note-box"><%= simple_format(@metric.instruction) %></div>
78
+ <%= simple_format(@metric.instruction, {}, class: "ck-copy") %>
45
79
  </section>
46
80
  <% end %>
47
81
 
@@ -63,6 +97,49 @@
63
97
  </div>
64
98
  </section>
65
99
 
100
+ <% if CompletionKit.config.judge_calibration_enabled && @suggestion_draft %>
101
+ <% sd_current_instr = @published_judge_version&.instruction.to_s %>
102
+ <% sd_draft_instr = @suggestion_draft.instruction.to_s %>
103
+ <% sd_current_rubric = @published_judge_version&.rubric_bands || [] %>
104
+ <% sd_rubric_changed = @suggestion_draft.rubric_bands != sd_current_rubric %>
105
+ <section class="ck-card ck-card--spaced ck-draft-pending">
106
+ <div class="ck-prompt-preview__header">
107
+ <p class="ck-kicker">Suggested change</p>
108
+ <time class="ck-meta-copy" data-relative-time datetime="<%= @suggestion_draft.created_at.utc.iso8601 %>"><%= time_ago_in_words(@suggestion_draft.created_at) %> ago</time>
109
+ </div>
110
+ <p class="ck-meta-copy">Based on your disagreements, the model proposed this rewrite. Use it to replace the live version, or discard.</p>
111
+
112
+ <div class="ck-suggest-diff">
113
+ <div class="ck-suggest-diff__pane">
114
+ <div class="ck-suggest-diff__header">
115
+ <span class="ck-suggest-diff__label ck-suggest-diff__label--before">Currently live</span>
116
+ </div>
117
+ <pre class="ck-suggest-diff__code"><%= ck_word_diff_old(sd_current_instr, sd_draft_instr) %></pre>
118
+ </div>
119
+ <div class="ck-suggest-diff__pane">
120
+ <div class="ck-suggest-diff__header">
121
+ <span class="ck-suggest-diff__label ck-suggest-diff__label--after">Proposed</span>
122
+ </div>
123
+ <pre class="ck-suggest-diff__code"><%= ck_word_diff_new(sd_current_instr, sd_draft_instr) %></pre>
124
+ </div>
125
+ </div>
126
+
127
+ <% if sd_rubric_changed %>
128
+ <p class="ck-meta-copy"><strong>Rubric also changed.</strong> Publishing applies the new rubric too.</p>
129
+ <% end %>
130
+
131
+ <div class="ck-actions">
132
+ <%= button_to "Discard", dismiss_suggestion_metric_path(@metric, draft_id: @suggestion_draft.id),
133
+ method: :delete, form_class: "inline-block",
134
+ class: ck_button_classes(:light, variant: :outline),
135
+ data: { turbo_confirm: "Drop this suggestion?" } %>
136
+ <%= button_to "Use this version", publish_draft_metric_path(@metric, draft_id: @suggestion_draft.id),
137
+ method: :post, form_class: "inline-block",
138
+ class: ck_button_classes(:dark) %>
139
+ </div>
140
+ </section>
141
+ <% end %>
142
+
66
143
  <% if CompletionKit.config.judge_calibration_enabled %>
67
144
  <section class="ck-card ck-card--spaced">
68
145
  <div class="ck-prompt-preview__header">
@@ -131,31 +208,6 @@
131
208
  <% end %>
132
209
  </section>
133
210
 
134
- <% if @suggestion_drafts.any? %>
135
- <section class="ck-card ck-card--spaced">
136
- <div class="ck-prompt-preview__header">
137
- <p class="ck-kicker">Suggested rewrites</p>
138
- <span class="ck-chip"><%= @suggestion_drafts.size %> option<%= @suggestion_drafts.size == 1 ? "" : "s" %></span>
139
- </div>
140
- <p class="ck-meta-copy">The model wrote these alternate instructions based on the disagreements above. Pick one to make it the live judge — the previous version is kept in history.</p>
141
- <div class="ck-suggestion-list">
142
- <% @suggestion_drafts.each do |draft| %>
143
- <article class="ck-suggestion-card">
144
- <header class="ck-suggestion-card__header">
145
- <span class="ck-chip ck-chip--soft">Option #<%= draft.id %></span>
146
- <time class="ck-meta-copy" data-relative-time datetime="<%= draft.created_at.utc.iso8601 %>"><%= time_ago_in_words(draft.created_at) %> ago</time>
147
- </header>
148
- <pre class="ck-code ck-suggestion-card__instruction"><%= draft.instruction %></pre>
149
- <div class="ck-actions">
150
- <%= button_to "Use this version", publish_draft_metric_path(@metric, draft_id: draft.id),
151
- method: :post, form_class: "inline-block",
152
- class: ck_button_classes(:dark) %>
153
- </div>
154
- </article>
155
- <% end %>
156
- </div>
157
- </section>
158
- <% end %>
159
211
 
160
212
  <% if Array(@metric.few_shot_examples).any? %>
161
213
  <section class="ck-card ck-card--spaced">
@@ -112,9 +112,7 @@
112
112
  </div>
113
113
  </div>
114
114
  <% if review.ai_feedback.present? %>
115
- <div class="ck-review-card__feedback">
116
- <div class="ck-note-box"><%= review.ai_feedback %></div>
117
- </div>
115
+ <p class="ck-review-card__feedback"><%= review.ai_feedback %></p>
118
116
  <% end %>
119
117
  <% if CompletionKit.config.judge_calibration_enabled && review.metric && review.ai_score %>
120
118
  <% existing = CompletionKit::Calibration.find_by(
data/config/routes.rb CHANGED
@@ -17,6 +17,7 @@ CompletionKit::Engine.routes.draw do
17
17
  post :add_few_shot
18
18
  post :publish_draft
19
19
  post :suggest_variants
20
+ delete :dismiss_suggestion
20
21
  end
21
22
  end
22
23
  resources :metric_groups
@@ -1,3 +1,3 @@
1
1
  module CompletionKit
2
- VERSION = "0.5.38"
2
+ VERSION = "0.5.40"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: completion-kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.38
4
+ version: 0.5.40
5
5
  platform: ruby
6
6
  authors:
7
7
  - Damien Bastin
@@ -233,7 +233,7 @@ files:
233
233
  - app/assets/images/completion_kit/favicon.ico
234
234
  - app/assets/images/completion_kit/logo.png
235
235
  - app/assets/javascripts/completion_kit/application.js
236
- - app/assets/stylesheets/completion_kit/application.css.erb
236
+ - app/assets/stylesheets/completion_kit/application.css
237
237
  - app/controllers/completion_kit/api/v1/base_controller.rb
238
238
  - app/controllers/completion_kit/api/v1/calibrations_controller.rb
239
239
  - app/controllers/completion_kit/api/v1/datasets_controller.rb