completion-kit 0.5.39 → 0.5.40

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d8d052b5ce9253412be890b820439248547d767575969e4260566a63426ac612
4
- data.tar.gz: 8e2f73e59c977c1923b90c9b36fae7dd8eadd35d0c499ae04cea1d63113e7655
3
+ metadata.gz: d8875fa22de9c8626e401706818b271f3cb26bffa7faae1f583b29503228cead
4
+ data.tar.gz: 5bf39fad883b2eed2f505b11403ab60fddf9efc5735073ff61c290d53f59a36d
5
5
  SHA512:
6
- metadata.gz: 54dd9bd2a4b2e64f929865508649ca2ada6972840715552b920b2bcc156b74cc76fe957b8ac58ec2f9ad7d8594dbe2ef15c600efb10304963b66b226cdee959b
7
- data.tar.gz: 2db1e93c654e7d0de826a9f9c0ffadae292cf57d1dfff1df71763c5a98da4fc6d547560808bee9c8364f64c08c747661546a91f330d0effcda7b3587547d35e8
6
+ metadata.gz: a8f2a7f14235c1214b567b891defaf523a645a21a2409ed81df964893a260cccb1fc9bf63903794f952c91bc9c91f3c1e3850db751a08ce0edc49b360ad9642d
7
+ data.tar.gz: e148b500e498a00dc370bf203fea3e2618b9f1c8fccd1dc5f220ae77a4988a8934198ad32bbf25a29bc84f948c687fab9a0ae16b9c03fe575fe9fca4cf98a0ea
@@ -1541,6 +1541,25 @@ tr:hover .ck-chip--publish {
1541
1541
  border: 0;
1542
1542
  border-radius: 0;
1543
1543
  background: transparent;
1544
+ white-space: pre;
1545
+ color: #93c5fd;
1546
+ font-size: 0.86rem;
1547
+ line-height: 1.55;
1548
+ }
1549
+ .ck-code-scroll-wrap > .ck-code .ck-json-key {
1550
+ color: #c4b5fd;
1551
+ }
1552
+ .ck-code-scroll-wrap > .ck-code .ck-json-string {
1553
+ color: #93c5fd;
1554
+ }
1555
+ .ck-code-scroll-wrap > .ck-code .ck-json-number {
1556
+ color: #fcd34d;
1557
+ }
1558
+ .ck-code-scroll-wrap > .ck-code .ck-json-keyword {
1559
+ color: #f9a8d4;
1560
+ }
1561
+ .ck-code-scroll-wrap > .ck-code .ck-json-punct {
1562
+ color: var(--ck-dim);
1544
1563
  }
1545
1564
 
1546
1565
  .ck-note-box {
@@ -2755,7 +2774,7 @@ select.ck-input {
2755
2774
  border: 1px solid var(--ck-line);
2756
2775
  border-radius: var(--ck-radius-lg);
2757
2776
  background: var(--ck-surface);
2758
- padding: 1.5rem;
2777
+ padding: 1.25rem;
2759
2778
  }
2760
2779
 
2761
2780
  .ck-review-card__header {
@@ -2778,11 +2797,9 @@ select.ck-input {
2778
2797
  }
2779
2798
 
2780
2799
  .ck-review-card__feedback {
2781
- margin-top: 0.75rem;
2782
- }
2783
-
2784
- .ck-review-card__feedback .ck-note-box {
2785
- margin-top: 0;
2800
+ margin: 0.6rem 0 0;
2801
+ color: var(--ck-muted);
2802
+ line-height: 1.55;
2786
2803
  }
2787
2804
 
2788
2805
  @media (max-width: 900px) {
@@ -2819,6 +2836,14 @@ select.ck-input {
2819
2836
  width: 100%;
2820
2837
  }
2821
2838
 
2839
+ /* button_to renders a form.inline-block wrapping the button. When the inner
2840
+ button is a full .ck-button (not an icon-button or chip), the form should
2841
+ stretch with it. */
2842
+ form.inline-block:has(> .ck-button) {
2843
+ width: 100%;
2844
+ display: block;
2845
+ }
2846
+
2822
2847
  /* Page header stacks: title, then lead text full-width, then action. */
2823
2848
  .ck-page-header {
2824
2849
  flex-direction: column;
@@ -4584,9 +4609,8 @@ a.tag-mark {
4584
4609
  }
4585
4610
 
4586
4611
  .ck-launch__progress {
4587
- padding-bottom: 1.5rem;
4588
- margin-bottom: 0.5rem;
4589
- border-bottom: 1px solid var(--ck-line);
4612
+ padding-bottom: 0;
4613
+ margin-bottom: 1.25rem;
4590
4614
  }
4591
4615
  .ck-launch__progress-head {
4592
4616
  display: flex;
@@ -5151,22 +5175,37 @@ a.tag-mark {
5151
5175
  border-top: 1px dashed var(--ck-line);
5152
5176
  }
5153
5177
  .ck-calibration__prompt {
5178
+ margin: 0 0 10px;
5179
+ display: flex;
5180
+ align-items: baseline;
5181
+ flex-wrap: wrap;
5182
+ gap: 8px 12px;
5183
+ }
5184
+ .ck-calibration__label {
5154
5185
  font-family: var(--ck-mono);
5155
5186
  font-size: 0.72rem;
5156
5187
  letter-spacing: 0.06em;
5157
5188
  text-transform: uppercase;
5158
5189
  color: var(--ck-dim);
5159
- margin: 0 0 10px;
5160
- display: flex;
5161
- align-items: center;
5162
- gap: 10px;
5190
+ flex-shrink: 0;
5163
5191
  }
5164
5192
  .ck-calibration__count {
5165
5193
  font-family: var(--ck-mono);
5166
5194
  font-size: 0.72rem;
5167
5195
  letter-spacing: 0.03em;
5168
5196
  color: var(--ck-accent);
5169
- text-transform: none;
5197
+ }
5198
+ .ck-calibration__hint {
5199
+ font-size: 0.82rem;
5200
+ color: var(--ck-dim);
5201
+ line-height: 1.4;
5202
+ }
5203
+ @media (max-width: 640px) {
5204
+ .ck-calibration__prompt {
5205
+ flex-direction: column;
5206
+ align-items: flex-start;
5207
+ gap: 4px;
5208
+ }
5170
5209
  }
5171
5210
  .ck-calibration__buttons {
5172
5211
  display: flex;
@@ -5221,11 +5260,13 @@ a.tag-mark {
5221
5260
  margin-top: 12px;
5222
5261
  display: flex;
5223
5262
  flex-direction: column;
5224
- gap: 8px;
5225
- padding: 12px;
5226
- background: var(--ck-surface-soft);
5227
- border: 1px solid var(--ck-line);
5228
- border-radius: 6px;
5263
+ gap: 12px;
5264
+ }
5265
+ .ck-calibration__detail > * {
5266
+ margin: 0;
5267
+ }
5268
+ .ck-calibration__detail .ck-button {
5269
+ align-self: flex-start;
5229
5270
  }
5230
5271
  .ck-calibration__value {
5231
5272
  color: var(--ck-accent);
@@ -5347,44 +5388,28 @@ a.tag-mark {
5347
5388
  text-transform: uppercase;
5348
5389
  }
5349
5390
 
5350
- .ck-draft-banner {
5391
+ .ck-draft-pending {
5392
+ border-color: rgba(6, 182, 212, 0.45);
5393
+ background: linear-gradient(180deg, var(--ck-accent-soft), var(--ck-surface));
5394
+ }
5395
+
5396
+ .ck-suggestion-banner {
5351
5397
  display: inline-flex;
5352
5398
  align-items: center;
5353
5399
  gap: 10px;
5354
5400
  margin-top: 10px;
5355
- padding: 8px 12px;
5401
+ padding: 8px 14px;
5356
5402
  background: var(--ck-accent-soft);
5357
- border: 1px dashed rgba(6, 182, 212, 0.4);
5358
- border-radius: 6px;
5359
- }
5360
-
5361
- .ck-suggestion-list {
5362
- display: flex;
5363
- flex-direction: column;
5364
- gap: 12px;
5365
- }
5366
- .ck-suggestion-card {
5367
- padding: 12px 14px;
5368
- background: var(--ck-surface-soft);
5369
- border: 1px solid var(--ck-line);
5403
+ border: 1px solid rgba(6, 182, 212, 0.35);
5370
5404
  border-radius: 6px;
5371
- display: flex;
5372
- flex-direction: column;
5373
- gap: 10px;
5374
- }
5375
- .ck-suggestion-card__header {
5376
- display: flex;
5377
- align-items: center;
5378
- gap: 10px;
5405
+ color: var(--ck-accent);
5406
+ font-family: var(--ck-mono);
5407
+ font-size: 0.82rem;
5408
+ text-decoration: none;
5379
5409
  }
5380
- .ck-suggestion-card__instruction {
5381
- margin: 0;
5382
- white-space: pre-wrap;
5383
- font-size: 0.85rem;
5384
- background: var(--ck-bg-strong);
5385
- padding: 10px 12px;
5386
- border-radius: 4px;
5387
- border: 1px solid var(--ck-line);
5410
+ .ck-suggestion-banner:hover,
5411
+ .ck-suggestion-banner:focus-visible {
5412
+ border-color: var(--ck-accent);
5388
5413
  }
5389
5414
 
5390
5415
  .ck-metrics-table__trust {
@@ -5407,3 +5432,56 @@ a.tag-mark {
5407
5432
  color: var(--ck-danger);
5408
5433
  font-size: 0.82rem;
5409
5434
  }
5435
+
5436
+ .ck-star-picker {
5437
+ border: 0;
5438
+ padding: 0;
5439
+ margin: 0;
5440
+ }
5441
+ .ck-star-picker__row {
5442
+ display: inline-flex;
5443
+ flex-direction: row-reverse;
5444
+ gap: 2px;
5445
+ }
5446
+ .ck-star-picker input {
5447
+ position: absolute;
5448
+ width: 1px;
5449
+ height: 1px;
5450
+ opacity: 0;
5451
+ pointer-events: none;
5452
+ }
5453
+ .ck-star-picker label {
5454
+ cursor: pointer;
5455
+ display: inline-flex;
5456
+ padding: 4px;
5457
+ border-radius: 4px;
5458
+ }
5459
+ .ck-star-picker label svg {
5460
+ fill: transparent;
5461
+ stroke: var(--ck-line-strong);
5462
+ transition: fill 0.08s, stroke 0.08s;
5463
+ }
5464
+ .ck-star-picker input:checked ~ label svg {
5465
+ fill: var(--ck-warning);
5466
+ stroke: var(--ck-warning);
5467
+ }
5468
+ .ck-star-picker__row:hover label svg {
5469
+ fill: transparent;
5470
+ stroke: var(--ck-line-strong);
5471
+ }
5472
+ .ck-star-picker__row:hover label:hover svg,
5473
+ .ck-star-picker__row:hover label:hover ~ label svg {
5474
+ fill: var(--ck-warning);
5475
+ stroke: var(--ck-warning);
5476
+ }
5477
+ .ck-star-picker input:focus-visible + label {
5478
+ outline: 2px solid var(--ck-accent);
5479
+ outline-offset: 2px;
5480
+ }
5481
+
5482
+ .ck-button--just-saved {
5483
+ animation: ck-saved-flash 1.4s ease-out;
5484
+ }
5485
+ @keyframes ck-saved-flash {
5486
+ 0% { background: var(--ck-success); border-color: var(--ck-success); }
5487
+ }
@@ -25,7 +25,7 @@ module CompletionKit
25
25
  )
26
26
 
27
27
  if calibration.save
28
- render_calibration(calibration: calibration)
28
+ render_calibration(calibration: calibration, just_saved: true)
29
29
  else
30
30
  render_calibration(
31
31
  calibration: existing,
@@ -38,7 +38,7 @@ module CompletionKit
38
38
 
39
39
  private
40
40
 
41
- def render_calibration(calibration:, pending_verdict: nil, error: nil, status: :ok)
41
+ def render_calibration(calibration:, pending_verdict: nil, error: nil, just_saved: false, status: :ok)
42
42
  locals = {
43
43
  review: review_for_metric,
44
44
  calibration: calibration,
@@ -46,7 +46,8 @@ module CompletionKit
46
46
  response_row: @response,
47
47
  metric: @metric,
48
48
  pending_verdict: pending_verdict,
49
- error: error
49
+ error: error,
50
+ just_saved: just_saved
50
51
  }
51
52
  render turbo_stream: turbo_stream.replace(
52
53
  "calibration_#{@response.id}_#{@metric.id}",
@@ -1,7 +1,7 @@
1
1
  module CompletionKit
2
2
  class MetricsController < ApplicationController
3
3
  include CompletionKit::TagFiltering
4
- before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :publish_draft, :suggest_variants]
4
+ before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :publish_draft, :suggest_variants, :dismiss_suggestion]
5
5
 
6
6
  def index
7
7
  @metrics = apply_tag_filter(Metric.includes(:metric_groups, :tags).order(:name))
@@ -12,8 +12,10 @@ module CompletionKit
12
12
  .includes(response: [:reviews, :run])
13
13
  .order(created_at: :desc)
14
14
  .limit(50)
15
- @latest_draft = JudgeVersion.drafts.where(metric_id: @metric.id).order(created_at: :desc).first
16
- @suggestion_drafts = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc)
15
+ @edit_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
16
+ @published_judge_version = JudgeVersion.published.where(metric_id: @metric.id, current: true).first
17
+ @suggestion_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
18
+ @improve_disagreement_count = @disagreements.size
17
19
  end
18
20
 
19
21
  def new
@@ -47,15 +49,28 @@ module CompletionKit
47
49
  end
48
50
 
49
51
  def suggest_variants
50
- generator = JudgeVariantGenerator.new(@metric)
52
+ disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
53
+ if disagreement_count.zero?
54
+ redirect_to metric_path(@metric), alert: "Mark at least one row as Disagree before asking the model to suggest a change."
55
+ return
56
+ end
57
+
58
+ JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
59
+
60
+ generator = JudgeVariantGenerator.new(@metric, count: 1)
51
61
  variants = generator.call
52
62
  if variants.empty?
53
63
  redirect_to metric_path(@metric), alert: "The model returned no usable variants. Try again with a different model."
54
64
  return
55
65
  end
56
66
  generator.persist!(variants)
57
- label = variants.length == 1 ? "alternative" : "alternatives"
58
- redirect_to metric_path(@metric), notice: "Wrote #{variants.length} #{label} for this metric. Pick one to make it live."
67
+ redirect_to metric_path(@metric), notice: "Drafted a new version. Review it below."
68
+ end
69
+
70
# Discards the suggestion draft identified by params[:draft_id], if it belongs
# to this metric, then returns to the metric page. A missing or foreign id is
# ignored and the same notice is shown.
def dismiss_suggestion
  suggestions = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion")
  suggestions.find_by(id: params[:draft_id])&.destroy
  redirect_to metric_path(@metric), notice: "Dismissed."
end
60
75
 
61
76
  def publish_draft
@@ -202,15 +202,86 @@ module CompletionKit
202
202
  def ck_format_maybe_json(text)
203
203
  s = text.to_s
204
204
  return s if s.strip.empty?
205
- first = s.strip[0]
205
+ payload = ck_unwrap_json_fence(s.strip)
206
+ first = payload[0]
206
207
  return s unless first == "{" || first == "["
207
208
  begin
208
- JSON.pretty_generate(JSON.parse(s))
209
+ ck_highlight_json(JSON.pretty_generate(JSON.parse(payload)))
209
210
  rescue JSON::ParserError
210
211
  s
211
212
  end
212
213
  end
213
214
 
215
# Removes a surrounding Markdown code fence (``` or ```json, in any letter
# case) and returns the stripped inner text. Text that is not wrapped in such
# a fence is returned unchanged.
def ck_unwrap_json_fence(text)
  fenced = text.match(/\A```(?:json)?\s*\n(.*?)\n?```\s*\z/mi)
  fenced ? fenced[1].strip : text
end
219
+
220
# Renders JSON text as HTML: every token is HTML-escaped, and punctuation,
# strings, numbers and keywords are wrapped in a <span> with a ck-json-* class.
# Strings that act as object keys get "ck-json-key" instead of
# "ck-json-string". Whitespace and unrecognised characters are emitted escaped
# but unwrapped. The joined result is marked html_safe.
def ck_highlight_json(text)
  tokens = ck_tokenize_json(text)
  key_flags = ck_mark_json_keys(tokens)
  fixed_classes = {
    punct: "ck-json-punct",
    number: "ck-json-number",
    keyword: "ck-json-keyword"
  }

  fragments = tokens.zip(key_flags).map do |(type, value), key_flag|
    escaped = ERB::Util.html_escape(value)
    css_class =
      if type == :string
        key_flag ? "ck-json-key" : "ck-json-string"
      else
        fixed_classes[type]
      end
    css_class ? %(<span class="#{css_class}">#{escaped}</span>) : escaped
  end

  fragments.join.html_safe
end
236
+
237
# Splits JSON text into an ordered list of [type, value] pairs whose values
# concatenate back to the input. Types:
#   :ws      - a single space, newline or tab
#   :punct   - one of { } [ ] : ,
#   :string  - a double-quoted string including its quotes; backslash escapes
#              are skipped as pairs, and an unterminated string runs to the
#              end of the text
#   :number  - "-" or a digit followed by any run of digits . e E + -
#   :keyword - true, null or false
#   :other   - any other single character
#
# The text is scanned once with a regexp instead of being indexed character by
# character, which avoids repeated positional String#[] lookups that get slower
# as the offset grows on multibyte text. Expects validly encoded text.
def ck_tokenize_json(text)
  token_types = %i[ws punct string number keyword other]
  # One capture group per token type, in the same priority order as token_types.
  pattern = /([ \n\t])|([{}\[\]:,])|("(?:\\.|[^"])*"?)|([-0-9][0-9.eE+-]*)|(true|null|false)|(.)/m

  text.scan(pattern).map do |captures|
    slot = captures.index { |capture| !capture.nil? }
    [token_types[slot], captures[slot]]
  end
end
275
+
276
# Returns one boolean per token: true for a :string token whose next
# non-whitespace token is the ":" punctuation (i.e. an object key), false for
# everything else.
def ck_mark_json_keys(tokens)
  colon = [:punct, ":"]
  # Walk backwards so the "next significant token" is already known when a
  # string is reached.
  followed_by_colon = false
  flags = tokens.reverse_each.map do |token|
    kind = token[0]
    is_key = kind == :string && followed_by_colon
    followed_by_colon = (token == colon) unless kind == :ws
    is_key
  end
  flags.reverse
end
284
+
214
285
  def tag_filter_url(base_path, selected, toggling)
215
286
  remaining = selected.reject { |t| t.id == toggling.id }
216
287
  next_set = selected.include?(toggling) ? remaining : remaining + [toggling]
@@ -1,13 +1,15 @@
1
1
  module CompletionKit
2
2
  class JudgeVariantGenerator
3
- DEFAULT_VARIANT_COUNT = 3
3
+ DEFAULT_VARIANT_COUNT = 1
4
+ MAX_VARIANT_COUNT = 3
4
5
  DEFAULT_TEMPERATURE = 0.4
5
6
 
6
- Variant = Struct.new(:reasoning, :instruction, keyword_init: true)
7
+ Variant = Struct.new(:reasoning, :instruction, :rubric_bands, keyword_init: true)
7
8
 
8
9
  def initialize(metric, count: DEFAULT_VARIANT_COUNT, model: nil)
9
10
  @metric = metric
10
- @count = count
11
+ n = count.to_i
12
+ @count = n < 1 ? DEFAULT_VARIANT_COUNT : [n, MAX_VARIANT_COUNT].min
11
13
  @model = model || CompletionKit.config.judge_model
12
14
  end
13
15
 
@@ -23,7 +25,7 @@ module CompletionKit
23
25
  JudgeVersion.create!(
24
26
  metric: @metric,
25
27
  instruction: variant.instruction,
26
- rubric_bands: @metric.rubric_bands,
28
+ rubric_bands: variant.rubric_bands.presence || @metric.rubric_bands,
27
29
  state: "draft",
28
30
  source: "suggestion",
29
31
  current: false
@@ -42,14 +44,14 @@ module CompletionKit
42
44
  disagreements = JudgeCalibrationExamples.disagreements_for(@metric)
43
45
  borderlines = JudgeCalibrationExamples.borderlines_for(@metric)
44
46
  sections = []
45
- sections << "You are an expert evaluator. Rewrite a judge's grading instruction so it agrees better with humans on the cases below."
47
+ sections << "You are an expert evaluator. The judge below is misaligned with humans. Propose #{@count == 1 ? "a single" : "#{@count}"} concrete rewrite that closes the gap."
46
48
  sections << ""
47
49
  sections << "## Current instruction"
48
50
  sections << "```"
49
51
  sections << @metric.instruction.to_s
50
52
  sections << "```"
51
53
  sections << ""
52
- sections << "## Rubric (unchanged across variants — only rewrite the instruction)"
54
+ sections << "## Current rubric (5 to 1)"
53
55
  sections << @metric.display_rubric_text
54
56
  sections << ""
55
57
  if disagreements.any?
@@ -65,7 +67,7 @@ module CompletionKit
65
67
  end
66
68
  if borderlines.any?
67
69
  sections << "## Rubric-ambiguous cases (humans marked these borderline)"
68
- sections << "Each case below is one where a human said the rubric was unclear. Use these to sharpen language, split overlapping bands, or call out edge cases explicitly."
70
+ sections << "These are cases where a human said the rubric itself was unclear. If the rubric needs sharpening, rewrite it."
69
71
  borderlines.each_with_index do |ex, i|
70
72
  sections << "### Borderline #{i + 1}"
71
73
  sections << "Input: #{ex[:input].to_s.truncate(200)}"
@@ -76,14 +78,20 @@ module CompletionKit
76
78
  end
77
79
  end
78
80
  sections << "## Task"
79
- sections << "Propose #{@count} alternative instructions. Each should be a focused rewrite not a wholesale rewrite of the rubric. Close the disagreement gap and disambiguate the borderline cases."
81
+ sections << "Make one substantive change. Don't just reword. If the disagreements look like instruction problems, rewrite the instruction. If they look like rubric problems (overlapping bands, undefined edge cases), rewrite the rubric. Rewrite both if both are wrong."
80
82
  sections << ""
81
- sections << "Respond in EXACTLY this format, repeated #{@count} times:"
83
+ sections << "Respond in EXACTLY this format, repeated #{@count} time#{@count == 1 ? "" : "s"}:"
82
84
  sections << ""
83
85
  sections << "VARIANT:"
84
- sections << "REASONING: <one sentence explaining what this variant changes>"
86
+ sections << "REASONING: <one short sentence: what changes and why>"
85
87
  sections << "INSTRUCTION:"
86
88
  sections << "<the rewritten instruction>"
89
+ sections << "RUBRIC: # optional — omit this block if the rubric is unchanged"
90
+ sections << "5: <description for 5 stars>"
91
+ sections << "4: <description for 4 stars>"
92
+ sections << "3: <description for 3 stars>"
93
+ sections << "2: <description for 2 stars>"
94
+ sections << "1: <description for 1 star>"
87
95
  sections << "END_VARIANT"
88
96
  sections.join("\n")
89
97
  end
@@ -92,11 +100,21 @@ module CompletionKit
92
100
  blocks = text.to_s.scan(/VARIANT:(.*?)END_VARIANT/m).flatten
93
101
  blocks.filter_map do |raw|
94
102
  reasoning = raw[/REASONING:\s*(.*?)(?=INSTRUCTION:|\z)/m, 1].to_s.strip
95
- instruction = raw[/INSTRUCTION:\s*(.*)/m, 1].to_s.strip
103
+ instruction = raw[/INSTRUCTION:\s*(.*?)(?=RUBRIC:|\z)/m, 1].to_s.strip
96
104
  next if instruction.empty?
97
- Variant.new(reasoning: reasoning, instruction: instruction)
105
+ rubric_block = raw[/RUBRIC:\s*(.*)/m, 1].to_s
106
+ Variant.new(reasoning: reasoning, instruction: instruction, rubric_bands: parse_rubric(rubric_block))
98
107
  end
99
108
  end
109
+
110
# Parses a "N: description" (or "N - description") list into rubric bands.
#
# Returns an array of { "stars" => Integer, "description" => String } hashes
# ordered from 5 stars down to 1, or nil when the block is blank or does not
# define each of the star levels 1..5 exactly once. Requiring every level once
# (not merely five matching lines) rejects rubrics with a duplicated level and
# a missing one.
def parse_rubric(block)
  return nil if block.strip.empty?

  bands = block.scan(/^\s*([1-5])\s*[:\-]\s*(.+?)\s*$/).map do |stars, description|
    { "stars" => stars.to_i, "description" => description.strip }
  end
  return nil unless bands.map { |band| band["stars"] }.sort == [1, 2, 3, 4, 5]

  bands.sort_by { |band| -band["stars"] }
end
100
118
  end
101
119
 
102
120
  module JudgeCalibrationExamples
@@ -10,7 +10,7 @@ module CompletionKit
10
10
  type: "object",
11
11
  properties: {
12
12
  metric_id: { type: "integer" },
13
- count: { type: "integer", description: "How many variants to request (default 3, max 5)." },
13
+ count: { type: "integer", description: "How many variants to request (default 1, max 3). One focused rewrite beats five reworded copies." },
14
14
  model: { type: "string", description: "Override the model used to generate variants. Defaults to CompletionKit.config.judge_model." }
15
15
  },
16
16
  required: ["metric_id"]
@@ -49,9 +49,7 @@ module CompletionKit
49
49
 
50
50
  def self.suggest(args)
51
51
  metric = CompletionKit::Metric.find(args["metric_id"])
52
- count = [args["count"].to_i, 5].min
53
- count = CompletionKit::JudgeVariantGenerator::DEFAULT_VARIANT_COUNT if count <= 0
54
- generator = CompletionKit::JudgeVariantGenerator.new(metric, count: count, model: args["model"])
52
+ generator = CompletionKit::JudgeVariantGenerator.new(metric, count: args["count"].to_i, model: args["model"])
55
53
  variants = generator.call
56
54
  return error_result("Variant generator returned no parseable variants. Try again or change the model.") if variants.empty?
57
55
  versions = generator.persist!(variants)
@@ -1,20 +1,20 @@
1
1
  module CompletionKit
2
2
  module Onboarding
3
3
  # Opt-in starter data for the onboarding page: one dataset + one prompt so a
4
- # brand-new install has something to poke at. Idempotent a no-op once the
4
+ # brand-new install has something to poke at. Idempotent. A no-op once the
5
5
  # workspace already has any prompt or dataset. Deliberately does NOT create a
6
6
  # provider credential (needs a real API key) or a run (user-initiated).
7
7
  module SampleData
8
8
  SAMPLE_CSV = <<~CSV.freeze
9
9
  ticket
10
10
  "My order #4827 arrived with a dented panel. I emailed photos 11 days ago and heard nothing. Today I was told the return window 'closed'. I paid $749. I want a refund or replacement, not store credit."
11
- "Tracking says delivered to my porch Tuesday 3:47pm. I was home all day, nothing arrived, neighbours' cameras show no van. Order #5102 a $315 mixer, wedding gift, wedding is Saturday. Can someone look today?"
11
+ "Tracking says delivered to my porch Tuesday 3:47pm. I was home all day, nothing arrived, neighbours' cameras show no van. Order #5102. A $315 mixer, wedding gift, wedding is Saturday. Can someone look today?"
12
12
  "WELCOME20 says 'invalid' at checkout but the promo email says it's good through May 31. Same email I'm signed in with. Tried Chrome and Safari. Cart is $186 waiting on you."
13
13
  CSV
14
14
 
15
15
  SAMPLE_PROMPT = {
16
16
  name: "Sample: Support reply",
17
- description: "A starter prompt drafts a warm, professional reply to a customer support ticket. Edit it or delete it; it's just here to get you going.",
17
+ description: "A starter prompt. Drafts a warm, professional reply to a customer support ticket. Edit it or delete it; it's just here to get you going.",
18
18
  template: "You are a senior customer-support specialist. Write a warm, professional reply to this ticket. Acknowledge the customer's situation, be specific about next steps, and don't be defensive.\n\nTicket:\n{{ticket}}",
19
19
  llm_model: "gpt-4o-mini"
20
20
  }.freeze
@@ -5,11 +5,11 @@
5
5
  <% error = local_assigns[:error] %>
6
6
  <% verdict_count = CompletionKit::Calibration.where(response_id: response_row.id, metric_id: metric.id).count %>
7
7
  <p class="ck-calibration__prompt">
8
- Your verdict
8
+ <span class="ck-calibration__label">Your verdict</span>
9
9
  <% if verdict_count > 0 %>
10
- <span class="ck-calibration__count"><%= pluralize(verdict_count, "verdict") %> on this score · <%= link_to "trust score →", metric_path(metric), class: "ck-link" %></span>
10
+ <span class="ck-calibration__count"><%= pluralize(verdict_count, "verdict") %> on this score · <%= link_to "trust level →", metric_path(metric), class: "ck-link" %></span>
11
11
  <% else %>
12
- <span class="ck-calibration__count">Tell us what you think — was the score right? Verdicts roll up into the metric's <%= link_to "trust score", metric_path(metric), class: "ck-link" %>.</span>
12
+ <span class="ck-calibration__hint">Tell us what you think — was the score right? Verdicts roll up into the metric's <%= link_to "trust level", metric_path(metric), class: "ck-link" %>.</span>
13
13
  <% end %>
14
14
  </p>
15
15
  <div class="ck-calibration__buttons">
@@ -37,22 +37,27 @@
37
37
  <% end %>
38
38
 
39
39
  <% if active_verdict == "disagree" %>
40
+ <% existing_score = (calibration&.corrected_score || review&.ai_score)&.round %>
40
41
  <%= form_with url: run_response_calibrations_path(run, response_row),
41
42
  method: :post, local: false,
42
43
  class: "ck-calibration__detail" do |f| %>
43
44
  <%= hidden_field_tag :metric_id, metric.id %>
44
45
  <%= hidden_field_tag :verdict, "disagree" %>
45
- <label class="ck-label">
46
- What should the score have been?
47
- <span class="ck-calibration__value" data-calibration-value><%= calibration&.corrected_score || review&.ai_score || 3 %></span>
48
- </label>
49
- <input type="range" name="corrected_score" min="1" max="5" step="0.5"
50
- value="<%= calibration&.corrected_score || review&.ai_score || 3 %>"
51
- oninput="this.closest('.ck-calibration__detail').querySelector('[data-calibration-value]').textContent = this.value"
52
- class="ck-slider"
53
- required>
46
+ <p class="ck-label">What should the score have been?</p>
47
+ <fieldset class="ck-star-picker">
48
+ <legend class="ck-visually-hidden">Pick a score from 1 to 5 stars</legend>
49
+ <div class="ck-star-picker__row">
50
+ <% [5, 4, 3, 2, 1].each do |n| %>
51
+ <% radio_id = "ck-star-#{response_row.id}-#{metric.id}-#{n}" %>
52
+ <input type="radio" name="corrected_score" id="<%= radio_id %>" value="<%= n %>" <%= "checked" if existing_score == n %> required>
53
+ <label for="<%= radio_id %>" title="<%= pluralize(n, 'star') %>" aria-label="<%= pluralize(n, 'star') %>">
54
+ <svg viewBox="0 0 24 24" width="28" height="28" stroke-width="1.5" aria-hidden="true"><polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2"/></svg>
55
+ </label>
56
+ <% end %>
57
+ </div>
58
+ </fieldset>
54
59
  <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="Why? (optional)"><%= calibration&.note %></textarea>
55
- <%= f.submit (current_verdict == "disagree" ? "Update" : "Save disagree"), class: ck_button_classes(:dark) %>
60
+ <%= f.submit "Save", class: "#{ck_button_classes(:dark)}#{' ck-button--just-saved' if local_assigns[:just_saved]}" %>
56
61
  <% end %>
57
62
  <% elsif active_verdict == "borderline" %>
58
63
  <%= form_with url: run_response_calibrations_path(run, response_row),
@@ -61,7 +66,7 @@
61
66
  <%= hidden_field_tag :metric_id, metric.id %>
62
67
  <%= hidden_field_tag :verdict, "borderline" %>
63
68
  <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="What made this borderline? (optional)"><%= calibration&.note %></textarea>
64
- <%= f.submit (current_verdict == "borderline" ? "Update" : "Save"), class: ck_button_classes(:dark) %>
69
+ <%= f.submit "Save", class: "#{ck_button_classes(:dark)}#{' ck-button--just-saved' if local_assigns[:just_saved]}" %>
65
70
  <% end %>
66
71
  <% end %>
67
72
  </div>
@@ -1,6 +1,6 @@
1
1
  <% stats = local_assigns[:stats] %>
2
2
  <div class="ck-trust-panel ck-trust-panel--<%= stats.gate %>">
3
- <p class="ck-trust-panel__label" title="How often this metric's scores match the humans who reviewed them.">Trust score</p>
3
+ <p class="ck-trust-panel__label" title="How often this metric's scores match the humans who reviewed them.">Trust level</p>
4
4
  <% if stats.counter_only? %>
5
5
  <div class="ck-trust-panel__body">
6
6
  <span class="ck-trust-panel__counter"><%= stats.sample_size %><span class="ck-trust-panel__counter-of">/ <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></span></span>
@@ -19,7 +19,7 @@
19
19
  <tr>
20
20
  <th scope="col">Name</th>
21
21
  <th scope="col">Instruction</th>
22
- <th scope="col" title="How often this metric's scores match the humans who reviewed them.">Trust score</th>
22
+ <th scope="col" title="How often this metric's scores match the humans who reviewed them.">Trust level</th>
23
23
  <th scope="col">In groups</th>
24
24
  <th scope="col"></th>
25
25
  </tr>
@@ -36,7 +36,7 @@
36
36
  <% end %>
37
37
  </td>
38
38
  <td data-label="Instruction" class="ck-meta-copy"><div class="ck-clamp-2"><%= metric.instruction.presence || "—" %></div></td>
39
- <td data-label="Trust score" class="ck-metrics-table__trust">
39
+ <td data-label="Trust level" class="ck-metrics-table__trust">
40
40
  <% if CompletionKit.config.judge_calibration_enabled %>
41
41
  <% s = CompletionKit::MetricCalibrationStats.for(metric) %>
42
42
  <% if s.counter_only? %>
@@ -9,24 +9,58 @@
9
9
  <% if CompletionKit.config.judge_calibration_enabled %>
10
10
  <%= render "completion_kit/calibrations/trust_panel",
11
11
  stats: CompletionKit::MetricCalibrationStats.for(@metric) %>
12
- <% if @latest_draft %>
13
- <div class="ck-draft-banner">
14
- <span class="ck-chip ck-chip--soft">Draft pending</span>
15
- <span class="ck-meta-copy">A draft version of this judge is saved. Publishing it replaces the live instruction and rubric.</span>
16
- <%= button_to "Publish this version", publish_draft_metric_path(@metric, draft_id: @latest_draft.id),
17
- method: :post, form_class: "inline-block",
18
- class: ck_button_classes(:dark) %>
19
- </div>
12
+ <% if @edit_draft %>
13
+ <% pub_instr = @published_judge_version&.instruction.to_s %>
14
+ <% draft_instr = @edit_draft.instruction.to_s %>
15
+ <% instruction_changed = pub_instr != draft_instr %>
16
+ <% rubric_changed = @published_judge_version && @published_judge_version.rubric_bands != @edit_draft.rubric_bands %>
17
+ <section class="ck-card ck-card--spaced ck-draft-pending">
18
+ <div class="ck-prompt-preview__header">
19
+ <p class="ck-kicker">Draft pending</p>
20
+ <%= button_to "Publish this version", publish_draft_metric_path(@metric, draft_id: @edit_draft.id),
21
+ method: :post, form_class: "inline-block",
22
+ class: ck_button_classes(:dark) %>
23
+ </div>
24
+ <p class="ck-meta-copy">A draft of this metric is saved. Publishing it replaces the live instruction<%= ", rubric," if rubric_changed %> for future runs. Here's what changes.</p>
25
+
26
+ <% if instruction_changed %>
27
+ <div class="ck-suggest-diff">
28
+ <div class="ck-suggest-diff__pane">
29
+ <div class="ck-suggest-diff__header">
30
+ <span class="ck-suggest-diff__label ck-suggest-diff__label--before">Currently live</span>
31
+ </div>
32
+ <pre class="ck-suggest-diff__code"><%= ck_word_diff_old(pub_instr, draft_instr) %></pre>
33
+ </div>
34
+ <div class="ck-suggest-diff__pane">
35
+ <div class="ck-suggest-diff__header">
36
+ <span class="ck-suggest-diff__label ck-suggest-diff__label--after">Draft</span>
37
+ </div>
38
+ <pre class="ck-suggest-diff__code"><%= ck_word_diff_new(pub_instr, draft_instr) %></pre>
39
+ </div>
40
+ </div>
41
+ <% else %>
42
+ <p class="ck-meta-copy">The instruction is unchanged.</p>
43
+ <% end %>
44
+
45
+ <% if rubric_changed %>
46
+ <p class="ck-meta-copy"><strong>Rubric also changed.</strong> Edit the metric to inspect each band, or publish to apply the new wording.</p>
47
+ <% end %>
48
+ </section>
20
49
  <% end %>
21
50
  <% end %>
22
51
  </div>
23
52
  <div class="ck-actions">
24
53
  <% if CompletionKit.config.judge_calibration_enabled %>
25
- <%= button_to "Improve the metric", suggest_variants_metric_path(@metric),
26
- method: :post, form_class: "inline-block",
27
- class: ck_button_classes(:light, variant: :outline),
28
- title: "Ask the model to rewrite this metric's instruction based on the disagreements and rubric-ambiguous cases collected so far.",
29
- data: { turbo_confirm: "Ask the model to rewrite this metric's instruction based on the disagreements and rubric-ambiguous cases collected so far?" } %>
54
+ <% if @improve_disagreement_count.positive? %>
55
+ <%= button_to "Improve the metric", suggest_variants_metric_path(@metric),
56
+ method: :post, form_class: "inline-block",
57
+ class: ck_button_classes(:light, variant: :outline),
58
+ title: "Rewrite this metric based on the disagreements collected so far.",
59
+ data: { turbo_confirm: "Rewrite this metric based on the disagreements collected so far?" } %>
60
+ <% else %>
61
+ <button type="button" class="<%= ck_button_classes(:light, variant: :outline) %>" disabled
62
+ title="Mark at least one row as Disagree before the model can suggest a change.">Improve the metric</button>
63
+ <% end %>
30
64
  <% end %>
31
65
  <%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
32
66
  </div>
@@ -63,6 +97,49 @@
63
97
  </div>
64
98
  </section>
65
99
 
100
+ <% if CompletionKit.config.judge_calibration_enabled && @suggestion_draft %>
101
+ <% sd_current_instr = @published_judge_version&.instruction.to_s %>
102
+ <% sd_draft_instr = @suggestion_draft.instruction.to_s %>
103
+ <% sd_current_rubric = @published_judge_version&.rubric_bands || [] %>
104
+ <% sd_rubric_changed = @suggestion_draft.rubric_bands != sd_current_rubric %>
105
+ <section class="ck-card ck-card--spaced ck-draft-pending">
106
+ <div class="ck-prompt-preview__header">
107
+ <p class="ck-kicker">Suggested change</p>
108
+ <time class="ck-meta-copy" data-relative-time datetime="<%= @suggestion_draft.created_at.utc.iso8601 %>"><%= time_ago_in_words(@suggestion_draft.created_at) %> ago</time>
109
+ </div>
110
+ <p class="ck-meta-copy">Based on your disagreements, the model proposed this rewrite. Use it to replace the live version, or discard.</p>
111
+
112
+ <div class="ck-suggest-diff">
113
+ <div class="ck-suggest-diff__pane">
114
+ <div class="ck-suggest-diff__header">
115
+ <span class="ck-suggest-diff__label ck-suggest-diff__label--before">Currently live</span>
116
+ </div>
117
+ <pre class="ck-suggest-diff__code"><%= ck_word_diff_old(sd_current_instr, sd_draft_instr) %></pre>
118
+ </div>
119
+ <div class="ck-suggest-diff__pane">
120
+ <div class="ck-suggest-diff__header">
121
+ <span class="ck-suggest-diff__label ck-suggest-diff__label--after">Proposed</span>
122
+ </div>
123
+ <pre class="ck-suggest-diff__code"><%= ck_word_diff_new(sd_current_instr, sd_draft_instr) %></pre>
124
+ </div>
125
+ </div>
126
+
127
+ <% if sd_rubric_changed %>
128
+ <p class="ck-meta-copy"><strong>Rubric also changed.</strong> Publishing applies the new rubric too.</p>
129
+ <% end %>
130
+
131
+ <div class="ck-actions">
132
+ <%= button_to "Discard", dismiss_suggestion_metric_path(@metric, draft_id: @suggestion_draft.id),
133
+ method: :delete, form_class: "inline-block",
134
+ class: ck_button_classes(:light, variant: :outline),
135
+ data: { turbo_confirm: "Drop this suggestion?" } %>
136
+ <%= button_to "Use this version", publish_draft_metric_path(@metric, draft_id: @suggestion_draft.id),
137
+ method: :post, form_class: "inline-block",
138
+ class: ck_button_classes(:dark) %>
139
+ </div>
140
+ </section>
141
+ <% end %>
142
+
66
143
  <% if CompletionKit.config.judge_calibration_enabled %>
67
144
  <section class="ck-card ck-card--spaced">
68
145
  <div class="ck-prompt-preview__header">
@@ -131,31 +208,6 @@
131
208
  <% end %>
132
209
  </section>
133
210
 
134
- <% if @suggestion_drafts.any? %>
135
- <section class="ck-card ck-card--spaced">
136
- <div class="ck-prompt-preview__header">
137
- <p class="ck-kicker">Suggested improvements</p>
138
- <span class="ck-chip"><%= @suggestion_drafts.size %> option<%= @suggestion_drafts.size == 1 ? "" : "s" %></span>
139
- </div>
140
- <p class="ck-meta-copy">Based on your verdicts, the model proposed these alternative instructions for this metric. Pick one to make it live — the previous version stays in history.</p>
141
- <div class="ck-suggestion-list">
142
- <% @suggestion_drafts.each do |draft| %>
143
- <article class="ck-suggestion-card">
144
- <header class="ck-suggestion-card__header">
145
- <span class="ck-chip ck-chip--soft">Option #<%= draft.id %></span>
146
- <time class="ck-meta-copy" data-relative-time datetime="<%= draft.created_at.utc.iso8601 %>"><%= time_ago_in_words(draft.created_at) %> ago</time>
147
- </header>
148
- <pre class="ck-code ck-suggestion-card__instruction"><%= draft.instruction %></pre>
149
- <div class="ck-actions">
150
- <%= button_to "Use this version", publish_draft_metric_path(@metric, draft_id: draft.id),
151
- method: :post, form_class: "inline-block",
152
- class: ck_button_classes(:dark) %>
153
- </div>
154
- </article>
155
- <% end %>
156
- </div>
157
- </section>
158
- <% end %>
159
211
 
160
212
  <% if Array(@metric.few_shot_examples).any? %>
161
213
  <section class="ck-card ck-card--spaced">
@@ -112,9 +112,7 @@
112
112
  </div>
113
113
  </div>
114
114
  <% if review.ai_feedback.present? %>
115
- <div class="ck-review-card__feedback">
116
- <div class="ck-note-box"><%= review.ai_feedback %></div>
117
- </div>
115
+ <p class="ck-review-card__feedback"><%= review.ai_feedback %></p>
118
116
  <% end %>
119
117
  <% if CompletionKit.config.judge_calibration_enabled && review.metric && review.ai_score %>
120
118
  <% existing = CompletionKit::Calibration.find_by(
data/config/routes.rb CHANGED
@@ -17,6 +17,7 @@ CompletionKit::Engine.routes.draw do
17
17
  post :add_few_shot
18
18
  post :publish_draft
19
19
  post :suggest_variants
20
+ delete :dismiss_suggestion
20
21
  end
21
22
  end
22
23
  resources :metric_groups
@@ -1,3 +1,3 @@
1
1
  module CompletionKit
2
- VERSION = "0.5.39"
2
+ VERSION = "0.5.40"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: completion-kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.39
4
+ version: 0.5.40
5
5
  platform: ruby
6
6
  authors:
7
7
  - Damien Bastin