completion-kit 0.5.40 → 0.5.42
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/stylesheets/completion_kit/application.css +122 -1
- data/app/controllers/completion_kit/metrics_controller.rb +28 -1
- data/app/models/completion_kit/starter_metric_dismissal.rb +5 -0
- data/app/services/completion_kit/starter_metrics.rb +94 -0
- data/app/views/completion_kit/metrics/_starter_card.html.erb +11 -0
- data/app/views/completion_kit/metrics/index.html.erb +42 -21
- data/app/views/completion_kit/metrics/show.html.erb +52 -65
- data/app/views/completion_kit/metrics/starter_preview.html.erb +45 -0
- data/config/routes.rb +5 -0
- data/db/migrate/20260524000001_create_completion_kit_starter_metric_dismissals.rb +12 -0
- data/lib/completion_kit/version.rb +1 -1
- metadata +6 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: cb4e02cd47c184c237c34642f1ece8c44bef6a2e79cfeeb3da913a536940216e
|
|
4
|
+
data.tar.gz: 516720138ec5dd2eeecc2d28cd9e7830c8c2ab2b4a183733afe44a939ca4d9af
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 0f3403ecf234b58f659f49406e0264384a5ccbda7206e3673c3b4d96704658b1efe5c2e74943fd6110477a8acffeca3600af655980e76a139319e6dc4376779f
|
|
7
|
+
data.tar.gz: da16ea0344f7a00f8366faabe4cd9d53ef487fc67a06f415b7772ff3631fcb3edca6f30be9a59dd3e4cc9e40795cdd4977028c5fd880ab9d2d49d3563babe5aa
|
|
data/app/assets/stylesheets/completion_kit/application.css
CHANGED
|
@@ -5413,9 +5413,11 @@ a.tag-mark {
|
|
|
5413
5413
|
}
|
|
5414
5414
|
|
|
5415
5415
|
.ck-metrics-table__trust {
|
|
5416
|
+
margin: 4px 0 0;
|
|
5416
5417
|
font-family: var(--ck-mono);
|
|
5417
|
-
font-size: 0.
|
|
5418
|
+
font-size: 0.72rem;
|
|
5418
5419
|
letter-spacing: 0.03em;
|
|
5420
|
+
color: var(--ck-dim);
|
|
5419
5421
|
}
|
|
5420
5422
|
.ck-metrics-table__trust-rate {
|
|
5421
5423
|
font-weight: 600;
|
|
@@ -5485,3 +5487,122 @@ a.tag-mark {
|
|
|
5485
5487
|
@keyframes ck-saved-flash {
|
|
5486
5488
|
0% { background: var(--ck-success); border-color: var(--ck-success); }
|
|
5487
5489
|
}
|
|
5490
|
+
|
|
5491
|
+
.ck-disagreement-list {
|
|
5492
|
+
list-style: none;
|
|
5493
|
+
padding: 0;
|
|
5494
|
+
margin: 12px 0 0;
|
|
5495
|
+
display: flex;
|
|
5496
|
+
flex-direction: column;
|
|
5497
|
+
gap: 12px;
|
|
5498
|
+
}
|
|
5499
|
+
.ck-disagreement {
|
|
5500
|
+
padding: 14px;
|
|
5501
|
+
background: var(--ck-surface-soft);
|
|
5502
|
+
border: 1px solid var(--ck-line);
|
|
5503
|
+
border-radius: 6px;
|
|
5504
|
+
display: flex;
|
|
5505
|
+
flex-direction: column;
|
|
5506
|
+
gap: 8px;
|
|
5507
|
+
}
|
|
5508
|
+
.ck-disagreement__head {
|
|
5509
|
+
display: flex;
|
|
5510
|
+
align-items: center;
|
|
5511
|
+
justify-content: space-between;
|
|
5512
|
+
gap: 12px;
|
|
5513
|
+
flex-wrap: wrap;
|
|
5514
|
+
}
|
|
5515
|
+
.ck-disagreement__scores {
|
|
5516
|
+
display: inline-flex;
|
|
5517
|
+
align-items: center;
|
|
5518
|
+
gap: 8px;
|
|
5519
|
+
flex-wrap: wrap;
|
|
5520
|
+
}
|
|
5521
|
+
.ck-disagreement__scores-label {
|
|
5522
|
+
font-family: var(--ck-mono);
|
|
5523
|
+
font-size: 0.7rem;
|
|
5524
|
+
letter-spacing: 0.08em;
|
|
5525
|
+
text-transform: uppercase;
|
|
5526
|
+
color: var(--ck-dim);
|
|
5527
|
+
}
|
|
5528
|
+
.ck-disagreement__scores-arrow {
|
|
5529
|
+
color: var(--ck-dim);
|
|
5530
|
+
}
|
|
5531
|
+
.ck-disagreement__note {
|
|
5532
|
+
margin: 0;
|
|
5533
|
+
color: var(--ck-text);
|
|
5534
|
+
font-size: 0.92rem;
|
|
5535
|
+
line-height: 1.45;
|
|
5536
|
+
}
|
|
5537
|
+
.ck-disagreement__source {
|
|
5538
|
+
margin: 0;
|
|
5539
|
+
font-size: 0.78rem;
|
|
5540
|
+
}
|
|
5541
|
+
|
|
5542
|
+
.ck-starter-row {
|
|
5543
|
+
margin-top: 2rem;
|
|
5544
|
+
padding-top: 1.5rem;
|
|
5545
|
+
border-top: 1px solid var(--ck-line);
|
|
5546
|
+
}
|
|
5547
|
+
.ck-starter-row--empty-state {
|
|
5548
|
+
margin-top: 0;
|
|
5549
|
+
padding-top: 0;
|
|
5550
|
+
border-top: 0;
|
|
5551
|
+
}
|
|
5552
|
+
.ck-starter-row .ck-kicker,
|
|
5553
|
+
.ck-starter-row .ck-title {
|
|
5554
|
+
margin-bottom: 0.5rem;
|
|
5555
|
+
}
|
|
5556
|
+
.ck-starter-row .ck-lead,
|
|
5557
|
+
.ck-starter-row .ck-meta-copy {
|
|
5558
|
+
margin-bottom: 1.25rem;
|
|
5559
|
+
}
|
|
5560
|
+
.ck-starter-grid {
|
|
5561
|
+
display: grid;
|
|
5562
|
+
grid-template-columns: repeat(auto-fill, minmax(260px, 1fr));
|
|
5563
|
+
gap: 12px;
|
|
5564
|
+
}
|
|
5565
|
+
.ck-starter-card {
|
|
5566
|
+
display: flex;
|
|
5567
|
+
flex-direction: column;
|
|
5568
|
+
gap: 8px;
|
|
5569
|
+
padding: 14px;
|
|
5570
|
+
background: var(--ck-surface);
|
|
5571
|
+
border: 1px solid var(--ck-line);
|
|
5572
|
+
border-radius: var(--ck-radius-lg);
|
|
5573
|
+
transition: border-color 0.12s, background 0.12s;
|
|
5574
|
+
}
|
|
5575
|
+
.ck-starter-card:hover {
|
|
5576
|
+
border-color: rgba(6, 182, 212, 0.35);
|
|
5577
|
+
background: var(--ck-surface-hover);
|
|
5578
|
+
}
|
|
5579
|
+
.ck-starter-card__name {
|
|
5580
|
+
margin: 0;
|
|
5581
|
+
font-size: 1rem;
|
|
5582
|
+
font-weight: 600;
|
|
5583
|
+
}
|
|
5584
|
+
.ck-starter-card__desc {
|
|
5585
|
+
margin: 0;
|
|
5586
|
+
font-size: 0.85rem;
|
|
5587
|
+
color: var(--ck-muted);
|
|
5588
|
+
line-height: 1.45;
|
|
5589
|
+
flex: 1;
|
|
5590
|
+
}
|
|
5591
|
+
.ck-starter-card__actions {
|
|
5592
|
+
display: flex;
|
|
5593
|
+
align-items: center;
|
|
5594
|
+
justify-content: space-between;
|
|
5595
|
+
gap: 10px;
|
|
5596
|
+
margin-top: 4px;
|
|
5597
|
+
}
|
|
5598
|
+
.ck-starter-card__dismiss {
|
|
5599
|
+
font-size: 0.75rem;
|
|
5600
|
+
color: var(--ck-dim);
|
|
5601
|
+
}
|
|
5602
|
+
.ck-starter-card__dismiss:hover,
|
|
5603
|
+
.ck-starter-card__dismiss:focus-visible {
|
|
5604
|
+
color: var(--ck-text);
|
|
5605
|
+
}
|
|
5606
|
+
.ck-actions--right {
|
|
5607
|
+
justify-content: flex-end;
|
|
5608
|
+
}
|
|
data/app/controllers/completion_kit/metrics_controller.rb
CHANGED
|
@@ -5,6 +5,33 @@ module CompletionKit
|
|
|
5
5
|
|
|
6
6
|
def index
|
|
7
7
|
@metrics = apply_tag_filter(Metric.includes(:metric_groups, :tags).order(:name))
|
|
8
|
+
@available_starters = StarterMetrics.available
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def starter_preview
|
|
12
|
+
@starter = StarterMetrics.find(params[:key])
|
|
13
|
+
return redirect_to(metrics_path, alert: "Unknown starter metric.") unless @starter
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
def adopt_starter
|
|
17
|
+
starter = StarterMetrics.find(params[:key])
|
|
18
|
+
return redirect_to(metrics_path, alert: "Unknown starter metric.") unless starter
|
|
19
|
+
if Metric.exists?(name: starter.name)
|
|
20
|
+
return redirect_to(metrics_path, alert: "A metric named \"#{starter.name}\" already exists.")
|
|
21
|
+
end
|
|
22
|
+
metric = Metric.create!(
|
|
23
|
+
name: starter.name,
|
|
24
|
+
instruction: starter.instruction,
|
|
25
|
+
rubric_bands: starter.rubric_bands
|
|
26
|
+
)
|
|
27
|
+
redirect_to metric_path(metric), notice: "Added the \"#{starter.name}\" starter. Tweak any band before you run a judge against it."
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
def dismiss_starter
|
|
31
|
+
starter = StarterMetrics.find(params[:key])
|
|
32
|
+
return redirect_to(metrics_path, alert: "Unknown starter metric.") unless starter
|
|
33
|
+
StarterMetricDismissal.find_or_create_by(starter_key: starter.key)
|
|
34
|
+
redirect_to metrics_path, notice: "Dismissed \"#{starter.name}\". It won't appear here again."
|
|
8
35
|
end
|
|
9
36
|
|
|
10
37
|
def show
|
|
@@ -109,7 +136,7 @@ module CompletionKit
|
|
|
109
136
|
"added_at" => Time.current.utc.iso8601
|
|
110
137
|
}
|
|
111
138
|
@metric.update!(few_shot_examples: examples)
|
|
112
|
-
redirect_to metric_path(@metric), notice: "
|
|
139
|
+
redirect_to metric_path(@metric), notice: "Got it. The judge will remember this next time it grades."
|
|
113
140
|
end
|
|
114
141
|
|
|
115
142
|
private
|
|
data/app/services/completion_kit/starter_metrics.rb
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
module CompletionKit
|
|
2
|
+
module StarterMetrics
|
|
3
|
+
Starter = Struct.new(:key, :name, :description, :catches, :instruction, :rubric_bands, keyword_init: true)
|
|
4
|
+
|
|
5
|
+
ALL = [
|
|
6
|
+
Starter.new(
|
|
7
|
+
key: "correctness",
|
|
8
|
+
name: "Correctness",
|
|
9
|
+
description: "Is the output factually right and free of made-up information?",
|
|
10
|
+
catches: "Hallucinations, wrong facts, subtle distortions. The most universally-asked question about an LLM's output.",
|
|
11
|
+
instruction: "Is the output factually right and free of made-up information? Penalise hallucinations and subtle factual distortions; reward outputs whose every claim checks out.",
|
|
12
|
+
rubric_bands: [
|
|
13
|
+
{ "stars" => 5, "description" => "Every fact in the output checks out." },
|
|
14
|
+
{ "stars" => 4, "description" => "Right in substance; minor imprecision or omission." },
|
|
15
|
+
{ "stars" => 3, "description" => "Mostly right, one or two facts are off." },
|
|
16
|
+
{ "stars" => 2, "description" => "Mostly wrong with a few right details." },
|
|
17
|
+
{ "stars" => 1, "description" => "Wrong, misleading, or contains fabricated facts." }
|
|
18
|
+
]
|
|
19
|
+
),
|
|
20
|
+
Starter.new(
|
|
21
|
+
key: "instruction_following",
|
|
22
|
+
name: "Instruction following",
|
|
23
|
+
description: "Did the model do everything that was asked?",
|
|
24
|
+
catches: "The response is factually right but ignores \"answer in two sentences\", \"use bullet points\", \"do not include X\". Distinct from Correctness — a response can be right and still fail this.",
|
|
25
|
+
instruction: "Did the model do every concrete thing the prompt asked for? Score against the explicit requirements in the prompt (format constraints, count limits, exclusions, audience cues). Factual accuracy is a different dimension — score that elsewhere.",
|
|
26
|
+
rubric_bands: [
|
|
27
|
+
{ "stars" => 5, "description" => "Followed every requirement in the prompt exactly." },
|
|
28
|
+
{ "stars" => 4, "description" => "Followed every requirement with a small slip." },
|
|
29
|
+
{ "stars" => 3, "description" => "Did the main thing, missed at least one explicit requirement." },
|
|
30
|
+
{ "stars" => 2, "description" => "Did some of what was asked, missed the main requirement." },
|
|
31
|
+
{ "stars" => 1, "description" => "Ignored the instructions or did something different." }
|
|
32
|
+
]
|
|
33
|
+
),
|
|
34
|
+
Starter.new(
|
|
35
|
+
key: "format_compliance",
|
|
36
|
+
name: "Format compliance",
|
|
37
|
+
description: "Does the output follow the required structure?",
|
|
38
|
+
catches: "Invalid JSON, missing schema fields, extra prose around a structured response, wrong casing on keys. Critical for any LLM wired into an API.",
|
|
39
|
+
instruction: "Does the output match the format the prompt asked for — JSON shape, schema, keys, casing, no stray prose? Score on whether a downstream parser would accept it without massaging.",
|
|
40
|
+
rubric_bands: [
|
|
41
|
+
{ "stars" => 5, "description" => "Exact spec, ready to consume programmatically." },
|
|
42
|
+
{ "stars" => 4, "description" => "Spec-compliant with one cosmetic issue." },
|
|
43
|
+
{ "stars" => 3, "description" => "Right shape, minor deviations (extra commentary, casing, ordering)." },
|
|
44
|
+
{ "stars" => 2, "description" => "Right format with substantive deviations (missing required fields, wrong types)." },
|
|
45
|
+
{ "stars" => 1, "description" => "Wrong format or unparseable." }
|
|
46
|
+
]
|
|
47
|
+
),
|
|
48
|
+
Starter.new(
|
|
49
|
+
key: "tone",
|
|
50
|
+
name: "Tone",
|
|
51
|
+
description: "Does the voice fit the audience the prompt asked for?",
|
|
52
|
+
catches: "Rude, robotic, off-brand, too casual, too formal. The dimension hardest to eyeball at scale and the one most user-facing surfaces care about.",
|
|
53
|
+
instruction: "Does the voice match the audience and brand the prompt called for? Reward outputs that sound like the persona the prompt asked for. Penalise rude, robotic, off-brand, or wrong-register replies.",
|
|
54
|
+
rubric_bands: [
|
|
55
|
+
{ "stars" => 5, "description" => "Sounds like the brand or persona the prompt asked for." },
|
|
56
|
+
{ "stars" => 4, "description" => "Right tone with a slip or two." },
|
|
57
|
+
{ "stars" => 3, "description" => "Acceptable, generic, no personality." },
|
|
58
|
+
{ "stars" => 2, "description" => "Mismatched tone; sounds like a different audience." },
|
|
59
|
+
{ "stars" => 1, "description" => "Off-tone in a way a user would notice (rude, condescending, jarring)." }
|
|
60
|
+
]
|
|
61
|
+
),
|
|
62
|
+
Starter.new(
|
|
63
|
+
key: "conciseness",
|
|
64
|
+
name: "Conciseness",
|
|
65
|
+
description: "Is it the right length — no padding, no missing detail?",
|
|
66
|
+
catches: "Rambling responses, repetitive caveats, over-hedging. LLMs default to verbose. Conciseness is the dimension where users most often see scores move after tuning.",
|
|
67
|
+
instruction: "Is the output the right length for the task — no padding, no missing detail, no hedging filler? Penalise rambling, repetition, over-caveating, and unnecessary preamble. Penalise too-short outputs that drop information.",
|
|
68
|
+
rubric_bands: [
|
|
69
|
+
{ "stars" => 5, "description" => "Exactly as long as the task needs, no more, no less." },
|
|
70
|
+
{ "stars" => 4, "description" => "Right length with a small redundancy." },
|
|
71
|
+
{ "stars" => 3, "description" => "Acceptable; trims could happen or detail could be added." },
|
|
72
|
+
{ "stars" => 2, "description" => "Noticeable filler or visible gaps." },
|
|
73
|
+
{ "stars" => 1, "description" => "Padded, repetitive, or so short it loses information." }
|
|
74
|
+
]
|
|
75
|
+
)
|
|
76
|
+
].freeze
|
|
77
|
+
|
|
78
|
+
module_function
|
|
79
|
+
|
|
80
|
+
def find(key)
|
|
81
|
+
ALL.find { |s| s.key == key }
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
def available
|
|
85
|
+
adopted_names = Metric.where(name: ALL.map(&:name)).pluck(:name).to_set
|
|
86
|
+
dismissed_keys = StarterMetricDismissal.pluck(:starter_key).to_set
|
|
87
|
+
ALL.reject { |s| adopted_names.include?(s.name) || dismissed_keys.include?(s.key) }
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
def adopted?(starter)
|
|
91
|
+
Metric.exists?(name: starter.name)
|
|
92
|
+
end
|
|
93
|
+
end
|
|
94
|
+
end
|
|
data/app/views/completion_kit/metrics/_starter_card.html.erb
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
<article class="ck-starter-card">
|
|
2
|
+
<h3 class="ck-starter-card__name"><%= link_to starter.name, starter_preview_metrics_path(key: starter.key), class: "ck-link" %></h3>
|
|
3
|
+
<p class="ck-starter-card__desc"><%= starter.description %></p>
|
|
4
|
+
<div class="ck-starter-card__actions">
|
|
5
|
+
<%= link_to "Preview", starter_preview_metrics_path(key: starter.key), class: ck_button_classes(:dark) + " ck-button--sm" %>
|
|
6
|
+
<%= button_to "Don't show this one", dismiss_starter_metrics_path(key: starter.key),
|
|
7
|
+
method: :post, form_class: "inline-block",
|
|
8
|
+
class: "ck-link ck-starter-card__dismiss",
|
|
9
|
+
data: { turbo_confirm: "Hide \"#{starter.name}\" from this list?" } %>
|
|
10
|
+
</div>
|
|
11
|
+
</article>
|
|
data/app/views/completion_kit/metrics/index.html.erb
CHANGED
|
@@ -19,7 +19,6 @@
|
|
|
19
19
|
<tr>
|
|
20
20
|
<th scope="col">Name</th>
|
|
21
21
|
<th scope="col">Instruction</th>
|
|
22
|
-
<th scope="col" title="How often this metric's scores match the humans who reviewed them.">Trust level</th>
|
|
23
22
|
<th scope="col">In groups</th>
|
|
24
23
|
<th scope="col"></th>
|
|
25
24
|
</tr>
|
|
@@ -29,6 +28,21 @@
|
|
|
29
28
|
<tr onclick="window.location='<%= metric_path(metric) %>'" style="cursor: pointer;">
|
|
30
29
|
<td>
|
|
31
30
|
<%= link_to metric_path(metric), class: "ck-record-name" do %><strong><%= metric.name %></strong><% end %>
|
|
31
|
+
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
32
|
+
<% s = CompletionKit::MetricCalibrationStats.for(metric) %>
|
|
33
|
+
<p class="ck-metrics-table__trust">
|
|
34
|
+
<% if s.counter_only? %>
|
|
35
|
+
<% if s.sample_size.zero? %>
|
|
36
|
+
No verdicts yet
|
|
37
|
+
<% else %>
|
|
38
|
+
<%= s.sample_size %> / <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts
|
|
39
|
+
<% end %>
|
|
40
|
+
<% else %>
|
|
41
|
+
<span class="ck-metrics-table__trust-rate" title="<%= s.firm? ? 'Settled read.' : 'Early read — keep giving verdicts.' %>">~<%= (s.agreement_point * 100).round %>%</span>
|
|
42
|
+
±<%= (s.margin * 100).round %> pt · <%= s.firm? ? "settled" : "early" %>
|
|
43
|
+
<% end %>
|
|
44
|
+
</p>
|
|
45
|
+
<% end %>
|
|
32
46
|
<% if metric.tags.any? %>
|
|
33
47
|
<div class="tag-marks-row">
|
|
34
48
|
<%= render "completion_kit/tags/marks", tags: metric.tags %>
|
|
@@ -36,23 +50,6 @@
|
|
|
36
50
|
<% end %>
|
|
37
51
|
</td>
|
|
38
52
|
<td data-label="Instruction" class="ck-meta-copy"><div class="ck-clamp-2"><%= metric.instruction.presence || "—" %></div></td>
|
|
39
|
-
<td data-label="Trust level" class="ck-metrics-table__trust">
|
|
40
|
-
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
41
|
-
<% s = CompletionKit::MetricCalibrationStats.for(metric) %>
|
|
42
|
-
<% if s.counter_only? %>
|
|
43
|
-
<% if s.sample_size.zero? %>
|
|
44
|
-
<span class="ck-meta-copy">No verdicts yet</span>
|
|
45
|
-
<% else %>
|
|
46
|
-
<span class="ck-meta-copy"><%= s.sample_size %> / <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts</span>
|
|
47
|
-
<% end %>
|
|
48
|
-
<% else %>
|
|
49
|
-
<span class="ck-metrics-table__trust-rate" title="<%= s.firm? ? 'Settled read.' : 'Early read — keep giving verdicts.' %>">~<%= (s.agreement_point * 100).round %>%</span>
|
|
50
|
-
<span class="ck-meta-copy">±<%= (s.margin * 100).round %> pt · <%= s.firm? ? "settled" : "early" %></span>
|
|
51
|
-
<% end %>
|
|
52
|
-
<% else %>
|
|
53
|
-
<span class="ck-meta-copy">—</span>
|
|
54
|
-
<% end %>
|
|
55
|
-
</td>
|
|
56
53
|
<td data-label="In groups">
|
|
57
54
|
<% groups = metric.metric_groups %>
|
|
58
55
|
<% if groups.any? %>
|
|
@@ -78,12 +75,36 @@
|
|
|
78
75
|
Use the same metrics on multiple runs? <%= link_to "Group them →", metric_groups_path, class: "ck-link" %>
|
|
79
76
|
</p>
|
|
80
77
|
<% end %>
|
|
78
|
+
|
|
79
|
+
<% if @available_starters.any? %>
|
|
80
|
+
<section class="ck-starter-row">
|
|
81
|
+
<p class="ck-kicker">Add a starter metric</p>
|
|
82
|
+
<p class="ck-meta-copy">Pre-written rubrics for the dimensions most teams score against. Click a card to preview before it's created.</p>
|
|
83
|
+
<div class="ck-starter-grid">
|
|
84
|
+
<% @available_starters.each do |starter| %>
|
|
85
|
+
<%= render "starter_card", starter: starter %>
|
|
86
|
+
<% end %>
|
|
87
|
+
</div>
|
|
88
|
+
</section>
|
|
89
|
+
<% end %>
|
|
81
90
|
<% elsif @selected_tags.any? %>
|
|
82
91
|
<div class="ck-empty">
|
|
83
92
|
<p>No metrics match these tags. <%= link_to "Clear filters", metrics_path, class: "ck-link" %>.</p>
|
|
84
93
|
</div>
|
|
85
94
|
<% else %>
|
|
86
|
-
|
|
87
|
-
<
|
|
88
|
-
|
|
95
|
+
<% if @available_starters.any? %>
|
|
96
|
+
<section class="ck-starter-row ck-starter-row--empty-state">
|
|
97
|
+
<h2 class="ck-title ck-title--sm">Start with a ready-made rubric</h2>
|
|
98
|
+
<p class="ck-lead">Pick one of the dimensions below to drop in a pre-written 1–5 rubric. You can edit anything after adding it. Or <%= link_to "write your own from scratch", new_metric_path, class: "ck-link" %>.</p>
|
|
99
|
+
<div class="ck-starter-grid">
|
|
100
|
+
<% @available_starters.each do |starter| %>
|
|
101
|
+
<%= render "starter_card", starter: starter %>
|
|
102
|
+
<% end %>
|
|
103
|
+
</div>
|
|
104
|
+
</section>
|
|
105
|
+
<% else %>
|
|
106
|
+
<div class="ck-empty">
|
|
107
|
+
<p>No metrics yet. <%= link_to "Create your first metric", new_metric_path, class: "ck-link" %> to start scoring prompt outputs.</p>
|
|
108
|
+
</div>
|
|
109
|
+
<% end %>
|
|
89
110
|
<% end %>
|
|
data/app/views/completion_kit/metrics/show.html.erb
CHANGED
|
@@ -140,82 +140,69 @@
|
|
|
140
140
|
</section>
|
|
141
141
|
<% end %>
|
|
142
142
|
|
|
143
|
-
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
143
|
+
<% if CompletionKit.config.judge_calibration_enabled && @disagreements.any? %>
|
|
144
144
|
<section class="ck-card ck-card--spaced">
|
|
145
145
|
<div class="ck-prompt-preview__header">
|
|
146
|
-
<p class="ck-kicker">
|
|
147
|
-
|
|
148
|
-
<span class="ck-chip"><%= pluralize(@disagreements.size, "row") %></span>
|
|
149
|
-
<% end %>
|
|
146
|
+
<p class="ck-kicker">Cases to learn from</p>
|
|
147
|
+
<span class="ck-chip"><%= pluralize(@disagreements.size, "row") %></span>
|
|
150
148
|
</div>
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
<
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
<% end %>
|
|
190
|
-
</td>
|
|
191
|
-
<td class="ck-meta-copy"><%= cal.note.to_s.truncate(120) %></td>
|
|
192
|
-
<td>
|
|
193
|
-
<% if already %>
|
|
194
|
-
<span class="ck-chip ck-chip--done">Saved as example</span>
|
|
195
|
-
<% else %>
|
|
196
|
-
<%= button_to "Teach the judge",
|
|
197
|
-
add_few_shot_metric_path(@metric, calibration_id: cal.id),
|
|
198
|
-
method: :post,
|
|
199
|
-
form_class: "inline-block",
|
|
200
|
-
class: ck_button_classes(:light, variant: :outline),
|
|
201
|
-
title: "Save this row as a teaching example. The judge will see it next time it grades." %>
|
|
202
|
-
<% end %>
|
|
203
|
-
</td>
|
|
204
|
-
</tr>
|
|
149
|
+
<p class="ck-meta-copy">Rows where a reviewer's score didn't match the judge's. Pin useful ones with <strong>Remember this</strong> — the judge sees them next time it grades.</p>
|
|
150
|
+
<% existing_ids = Array(@metric.few_shot_examples).map { |fs| fs["calibration_id"] } %>
|
|
151
|
+
<ul class="ck-disagreement-list">
|
|
152
|
+
<% @disagreements.each do |cal| %>
|
|
153
|
+
<% review = cal.response.reviews.find { |r| r.metric_id == @metric.id } %>
|
|
154
|
+
<% already = existing_ids.include?(cal.id) %>
|
|
155
|
+
<li class="ck-disagreement">
|
|
156
|
+
<div class="ck-disagreement__head">
|
|
157
|
+
<div class="ck-disagreement__scores">
|
|
158
|
+
<span class="ck-disagreement__scores-label">Judge</span>
|
|
159
|
+
<% if review&.ai_score %>
|
|
160
|
+
<span class="<%= ck_badge_classes(ck_score_kind(review.ai_score.to_f)) %>"><%= review.ai_score %></span>
|
|
161
|
+
<% else %>
|
|
162
|
+
<span class="ck-meta-copy">—</span>
|
|
163
|
+
<% end %>
|
|
164
|
+
<span class="ck-disagreement__scores-arrow">→</span>
|
|
165
|
+
<span class="ck-disagreement__scores-label">Human</span>
|
|
166
|
+
<% if cal.corrected_score %>
|
|
167
|
+
<span class="<%= ck_badge_classes(ck_score_kind(cal.corrected_score.to_f)) %>"><%= cal.corrected_score %></span>
|
|
168
|
+
<% else %>
|
|
169
|
+
<span class="ck-meta-copy">—</span>
|
|
170
|
+
<% end %>
|
|
171
|
+
</div>
|
|
172
|
+
<div class="ck-disagreement__action">
|
|
173
|
+
<% if already %>
|
|
174
|
+
<span class="ck-chip ck-chip--done">Remembered</span>
|
|
175
|
+
<% else %>
|
|
176
|
+
<%= button_to "Remember this",
|
|
177
|
+
add_few_shot_metric_path(@metric, calibration_id: cal.id),
|
|
178
|
+
method: :post,
|
|
179
|
+
form_class: "inline-block",
|
|
180
|
+
class: ck_button_classes(:light, variant: :outline),
|
|
181
|
+
title: "Pin this row so the judge sees it next time it grades for this metric." %>
|
|
182
|
+
<% end %>
|
|
183
|
+
</div>
|
|
184
|
+
</div>
|
|
185
|
+
<% if cal.note.to_s.present? %>
|
|
186
|
+
<p class="ck-disagreement__note"><%= cal.note %></p>
|
|
205
187
|
<% end %>
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
188
|
+
<p class="ck-disagreement__source ck-meta-copy">
|
|
189
|
+
<%= link_to cal.response.run.name.to_s.truncate(50), ck_run_path(cal.response.run), class: "ck-link" %>
|
|
190
|
+
·
|
|
191
|
+
<%= link_to "row ##{cal.response.id}", run_response_path(cal.response.run, cal.response), class: "ck-link" %>
|
|
192
|
+
</p>
|
|
193
|
+
</li>
|
|
194
|
+
<% end %>
|
|
195
|
+
</ul>
|
|
209
196
|
</section>
|
|
210
197
|
|
|
211
198
|
|
|
212
199
|
<% if Array(@metric.few_shot_examples).any? %>
|
|
213
200
|
<section class="ck-card ck-card--spaced">
|
|
214
201
|
<div class="ck-prompt-preview__header">
|
|
215
|
-
<p class="ck-kicker">
|
|
216
|
-
<span class="ck-chip"><%= pluralize(Array(@metric.few_shot_examples).size, "
|
|
202
|
+
<p class="ck-kicker">What the judge remembers</p>
|
|
203
|
+
<span class="ck-chip"><%= pluralize(Array(@metric.few_shot_examples).size, "case") %></span>
|
|
217
204
|
</div>
|
|
218
|
-
<p class="ck-meta-copy">
|
|
205
|
+
<p class="ck-meta-copy">Rows you've pinned so the judge sees them next time it grades. Each one shows what the judge gave and what a human said it should have been.</p>
|
|
219
206
|
<ol class="ck-few-shot-list">
|
|
220
207
|
<% Array(@metric.few_shot_examples).each do |fs| %>
|
|
221
208
|
<li class="ck-few-shot-item">
|
|
data/app/views/completion_kit/metrics/starter_preview.html.erb
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
<section class="ck-page-header">
|
|
2
|
+
<div>
|
|
3
|
+
<p class="ck-kicker">Starter metric</p>
|
|
4
|
+
<h1 class="ck-title"><%= @starter.name %></h1>
|
|
5
|
+
<p class="ck-lead"><%= @starter.description %></p>
|
|
6
|
+
</div>
|
|
7
|
+
<div class="ck-actions">
|
|
8
|
+
<%= link_to "← Back to metrics", metrics_path, class: ck_button_classes(:light, variant: :outline) %>
|
|
9
|
+
</div>
|
|
10
|
+
</section>
|
|
11
|
+
|
|
12
|
+
<section class="ck-card ck-card--spaced">
|
|
13
|
+
<p class="ck-kicker">What this catches</p>
|
|
14
|
+
<p class="ck-copy"><%= @starter.catches %></p>
|
|
15
|
+
</section>
|
|
16
|
+
|
|
17
|
+
<section class="ck-card ck-card--spaced">
|
|
18
|
+
<p class="ck-kicker">Instruction the judge will see</p>
|
|
19
|
+
<p class="ck-copy"><%= @starter.instruction %></p>
|
|
20
|
+
</section>
|
|
21
|
+
|
|
22
|
+
<section class="ck-card ck-card--spaced">
|
|
23
|
+
<p class="ck-kicker">Rubric</p>
|
|
24
|
+
<div class="ck-rubric-display">
|
|
25
|
+
<% @starter.rubric_bands.sort_by { |b| -b["stars"] }.each do |band| %>
|
|
26
|
+
<div class="ck-rubric-row ck-rubric-row--display">
|
|
27
|
+
<div class="ck-rubric-row__stars">
|
|
28
|
+
<% 5.times do |i| %>
|
|
29
|
+
<svg viewBox="0 0 24 24" width="16" height="16" stroke-width="1.75" class="ck-star <%= i < band["stars"] ? "ck-star--filled" : "ck-star--empty" %>"><polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2"/></svg>
|
|
30
|
+
<% end %>
|
|
31
|
+
</div>
|
|
32
|
+
<div class="ck-rubric-row__fields">
|
|
33
|
+
<p class="ck-copy"><%= band["description"] %></p>
|
|
34
|
+
</div>
|
|
35
|
+
</div>
|
|
36
|
+
<% end %>
|
|
37
|
+
</div>
|
|
38
|
+
</section>
|
|
39
|
+
|
|
40
|
+
<div class="ck-actions ck-actions--right">
|
|
41
|
+
<%= link_to "Cancel", metrics_path, class: ck_button_classes(:light, variant: :outline) %>
|
|
42
|
+
<%= button_to "Add this metric", adopt_starter_metrics_path(key: @starter.key),
|
|
43
|
+
method: :post, form_class: "inline-block",
|
|
44
|
+
class: ck_button_classes(:dark) %>
|
|
45
|
+
</div>
|
data/config/routes.rb
CHANGED
|
@@ -13,6 +13,11 @@ CompletionKit::Engine.routes.draw do
|
|
|
13
13
|
|
|
14
14
|
resources :datasets
|
|
15
15
|
resources :metrics do
|
|
16
|
+
collection do
|
|
17
|
+
get "starters/:key", to: "metrics#starter_preview", as: :starter_preview
|
|
18
|
+
post "starters/:key", to: "metrics#adopt_starter", as: :adopt_starter
|
|
19
|
+
post "starters/:key/dismiss", to: "metrics#dismiss_starter", as: :dismiss_starter
|
|
20
|
+
end
|
|
16
21
|
member do
|
|
17
22
|
post :add_few_shot
|
|
18
23
|
post :publish_draft
|
|
data/db/migrate/20260524000001_create_completion_kit_starter_metric_dismissals.rb
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
class CreateCompletionKitStarterMetricDismissals < ActiveRecord::Migration[8.1]
|
|
2
|
+
def change
|
|
3
|
+
create_table :completion_kit_starter_metric_dismissals do |t|
|
|
4
|
+
t.string :starter_key, null: false
|
|
5
|
+
t.timestamps
|
|
6
|
+
end
|
|
7
|
+
|
|
8
|
+
add_index :completion_kit_starter_metric_dismissals, :starter_key,
|
|
9
|
+
unique: true,
|
|
10
|
+
name: "index_ck_starter_dismissals_on_key"
|
|
11
|
+
end
|
|
12
|
+
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: completion-kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.5.40
|
|
4
|
+
version: 0.5.42
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Damien Bastin
|
|
@@ -284,6 +284,7 @@ files:
|
|
|
284
284
|
- app/models/completion_kit/review.rb
|
|
285
285
|
- app/models/completion_kit/run.rb
|
|
286
286
|
- app/models/completion_kit/run_metric.rb
|
|
287
|
+
- app/models/completion_kit/starter_metric_dismissal.rb
|
|
287
288
|
- app/models/completion_kit/suggestion.rb
|
|
288
289
|
- app/models/completion_kit/tag.rb
|
|
289
290
|
- app/models/completion_kit/tagging.rb
|
|
@@ -318,6 +319,7 @@ files:
|
|
|
318
319
|
- app/services/completion_kit/open_router_client.rb
|
|
319
320
|
- app/services/completion_kit/prompt_improvement_service.rb
|
|
320
321
|
- app/services/completion_kit/provider_endpoint.rb
|
|
322
|
+
- app/services/completion_kit/starter_metrics.rb
|
|
321
323
|
- app/services/completion_kit/worker_health.rb
|
|
322
324
|
- app/validators/completion_kit/tenant_scoped_uniqueness_validator.rb
|
|
323
325
|
- app/views/completion_kit/api_reference/_authentication.html.erb
|
|
@@ -345,10 +347,12 @@ files:
|
|
|
345
347
|
- app/views/completion_kit/metric_groups/new.html.erb
|
|
346
348
|
- app/views/completion_kit/metric_groups/show.html.erb
|
|
347
349
|
- app/views/completion_kit/metrics/_form.html.erb
|
|
350
|
+
- app/views/completion_kit/metrics/_starter_card.html.erb
|
|
348
351
|
- app/views/completion_kit/metrics/edit.html.erb
|
|
349
352
|
- app/views/completion_kit/metrics/index.html.erb
|
|
350
353
|
- app/views/completion_kit/metrics/new.html.erb
|
|
351
354
|
- app/views/completion_kit/metrics/show.html.erb
|
|
355
|
+
- app/views/completion_kit/metrics/starter_preview.html.erb
|
|
352
356
|
- app/views/completion_kit/onboarding/_concept.html.erb
|
|
353
357
|
- app/views/completion_kit/onboarding/show.html.erb
|
|
354
358
|
- app/views/completion_kit/prompts/_form.html.erb
|
|
@@ -414,6 +418,7 @@ files:
|
|
|
414
418
|
- db/migrate/20260522000002_create_completion_kit_calibrations.rb
|
|
415
419
|
- db/migrate/20260523000001_add_few_shot_examples_to_completion_kit_metrics.rb
|
|
416
420
|
- db/migrate/20260523000002_add_state_to_completion_kit_judge_versions.rb
|
|
421
|
+
- db/migrate/20260524000001_create_completion_kit_starter_metric_dismissals.rb
|
|
417
422
|
- lib/completion-kit.rb
|
|
418
423
|
- lib/completion_kit.rb
|
|
419
424
|
- lib/completion_kit/concurrency_check.rb
|