completion-kit 0.5.38 → 0.5.40
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/stylesheets/completion_kit/{application.css.erb → application.css} +167 -54
- data/app/controllers/completion_kit/calibrations_controller.rb +35 -8
- data/app/controllers/completion_kit/metrics_controller.rb +21 -6
- data/app/helpers/completion_kit/application_helper.rb +73 -2
- data/app/services/completion_kit/judge_variant_generator.rb +70 -25
- data/app/services/completion_kit/mcp_tools/judges.rb +2 -4
- data/app/services/completion_kit/onboarding/sample_data.rb +3 -3
- data/app/views/completion_kit/calibrations/_buttons.html.erb +32 -19
- data/app/views/completion_kit/calibrations/_trust_panel.html.erb +1 -1
- data/app/views/completion_kit/metrics/index.html.erb +2 -2
- data/app/views/completion_kit/metrics/show.html.erb +91 -39
- data/app/views/completion_kit/responses/show.html.erb +1 -3
- data/config/routes.rb +1 -0
- data/lib/completion_kit/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: d8875fa22de9c8626e401706818b271f3cb26bffa7faae1f583b29503228cead
|
|
4
|
+
data.tar.gz: 5bf39fad883b2eed2f505b11403ab60fddf9efc5735073ff61c290d53f59a36d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: a8f2a7f14235c1214b567b891defaf523a645a21a2409ed81df964893a260cccb1fc9bf63903794f952c91bc9c91f3c1e3850db751a08ce0edc49b360ad9642d
|
|
7
|
+
data.tar.gz: e148b500e498a00dc370bf203fea3e2618b9f1c8fccd1dc5f220ae77a4988a8934198ad32bbf25a29bc84f948c687fab9a0ae16b9c03fe575fe9fca4cf98a0ea
|
|
@@ -1,12 +1,26 @@
|
|
|
1
|
-
<% %w[400 500 700].each do |weight| %>
|
|
2
1
|
@font-face {
|
|
3
2
|
font-family: 'JetBrains Mono';
|
|
4
3
|
font-style: normal;
|
|
5
|
-
font-weight:
|
|
4
|
+
font-weight: 400;
|
|
5
|
+
font-display: swap;
|
|
6
|
+
src: url('completion_kit/jetbrains-mono-400.woff2') format('woff2');
|
|
7
|
+
}
|
|
8
|
+
|
|
9
|
+
@font-face {
|
|
10
|
+
font-family: 'JetBrains Mono';
|
|
11
|
+
font-style: normal;
|
|
12
|
+
font-weight: 500;
|
|
13
|
+
font-display: swap;
|
|
14
|
+
src: url('completion_kit/jetbrains-mono-500.woff2') format('woff2');
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
@font-face {
|
|
18
|
+
font-family: 'JetBrains Mono';
|
|
19
|
+
font-style: normal;
|
|
20
|
+
font-weight: 700;
|
|
6
21
|
font-display: swap;
|
|
7
|
-
src: url('
|
|
22
|
+
src: url('completion_kit/jetbrains-mono-700.woff2') format('woff2');
|
|
8
23
|
}
|
|
9
|
-
<% end %>
|
|
10
24
|
|
|
11
25
|
.turbo-progress-bar {
|
|
12
26
|
background-color: var(--ck-accent);
|
|
@@ -1527,6 +1541,25 @@ tr:hover .ck-chip--publish {
|
|
|
1527
1541
|
border: 0;
|
|
1528
1542
|
border-radius: 0;
|
|
1529
1543
|
background: transparent;
|
|
1544
|
+
white-space: pre;
|
|
1545
|
+
color: #93c5fd;
|
|
1546
|
+
font-size: 0.86rem;
|
|
1547
|
+
line-height: 1.55;
|
|
1548
|
+
}
|
|
1549
|
+
.ck-code-scroll-wrap > .ck-code .ck-json-key {
|
|
1550
|
+
color: #c4b5fd;
|
|
1551
|
+
}
|
|
1552
|
+
.ck-code-scroll-wrap > .ck-code .ck-json-string {
|
|
1553
|
+
color: #93c5fd;
|
|
1554
|
+
}
|
|
1555
|
+
.ck-code-scroll-wrap > .ck-code .ck-json-number {
|
|
1556
|
+
color: #fcd34d;
|
|
1557
|
+
}
|
|
1558
|
+
.ck-code-scroll-wrap > .ck-code .ck-json-keyword {
|
|
1559
|
+
color: #f9a8d4;
|
|
1560
|
+
}
|
|
1561
|
+
.ck-code-scroll-wrap > .ck-code .ck-json-punct {
|
|
1562
|
+
color: var(--ck-dim);
|
|
1530
1563
|
}
|
|
1531
1564
|
|
|
1532
1565
|
.ck-note-box {
|
|
@@ -2741,7 +2774,7 @@ select.ck-input {
|
|
|
2741
2774
|
border: 1px solid var(--ck-line);
|
|
2742
2775
|
border-radius: var(--ck-radius-lg);
|
|
2743
2776
|
background: var(--ck-surface);
|
|
2744
|
-
padding: 1.
|
|
2777
|
+
padding: 1.25rem;
|
|
2745
2778
|
}
|
|
2746
2779
|
|
|
2747
2780
|
.ck-review-card__header {
|
|
@@ -2751,6 +2784,11 @@ select.ck-input {
|
|
|
2751
2784
|
gap: 1rem;
|
|
2752
2785
|
}
|
|
2753
2786
|
|
|
2787
|
+
.ck-review-card__header .ck-inline {
|
|
2788
|
+
flex-wrap: nowrap;
|
|
2789
|
+
flex-shrink: 0;
|
|
2790
|
+
}
|
|
2791
|
+
|
|
2754
2792
|
.ck-review-card__metric {
|
|
2755
2793
|
font-family: var(--ck-mono);
|
|
2756
2794
|
font-size: 0.95rem;
|
|
@@ -2759,11 +2797,9 @@ select.ck-input {
|
|
|
2759
2797
|
}
|
|
2760
2798
|
|
|
2761
2799
|
.ck-review-card__feedback {
|
|
2762
|
-
margin
|
|
2763
|
-
|
|
2764
|
-
|
|
2765
|
-
.ck-review-card__feedback .ck-note-box {
|
|
2766
|
-
margin-top: 0;
|
|
2800
|
+
margin: 0.6rem 0 0;
|
|
2801
|
+
color: var(--ck-muted);
|
|
2802
|
+
line-height: 1.55;
|
|
2767
2803
|
}
|
|
2768
2804
|
|
|
2769
2805
|
@media (max-width: 900px) {
|
|
@@ -2800,6 +2836,14 @@ select.ck-input {
|
|
|
2800
2836
|
width: 100%;
|
|
2801
2837
|
}
|
|
2802
2838
|
|
|
2839
|
+
/* button_to renders a form.inline-block wrapping the button. When the inner
|
|
2840
|
+
button is a full .ck-button (not an icon-button or chip), the form should
|
|
2841
|
+
stretch with it. */
|
|
2842
|
+
form.inline-block:has(> .ck-button) {
|
|
2843
|
+
width: 100%;
|
|
2844
|
+
display: block;
|
|
2845
|
+
}
|
|
2846
|
+
|
|
2803
2847
|
/* Page header stacks: title, then lead text full-width, then action. */
|
|
2804
2848
|
.ck-page-header {
|
|
2805
2849
|
flex-direction: column;
|
|
@@ -2827,6 +2871,12 @@ select.ck-input {
|
|
|
2827
2871
|
padding: 1rem;
|
|
2828
2872
|
}
|
|
2829
2873
|
|
|
2874
|
+
.ck-review-card__header {
|
|
2875
|
+
flex-direction: column;
|
|
2876
|
+
align-items: flex-start;
|
|
2877
|
+
gap: 0.5rem;
|
|
2878
|
+
}
|
|
2879
|
+
|
|
2830
2880
|
/* Topbar nav collapses behind the hamburger trigger. */
|
|
2831
2881
|
.ck-nav-menu__trigger {
|
|
2832
2882
|
display: inline-flex;
|
|
@@ -4559,9 +4609,8 @@ a.tag-mark {
|
|
|
4559
4609
|
}
|
|
4560
4610
|
|
|
4561
4611
|
.ck-launch__progress {
|
|
4562
|
-
padding-bottom:
|
|
4563
|
-
margin-bottom:
|
|
4564
|
-
border-bottom: 1px solid var(--ck-line);
|
|
4612
|
+
padding-bottom: 0;
|
|
4613
|
+
margin-bottom: 1.25rem;
|
|
4565
4614
|
}
|
|
4566
4615
|
.ck-launch__progress-head {
|
|
4567
4616
|
display: flex;
|
|
@@ -5126,22 +5175,37 @@ a.tag-mark {
|
|
|
5126
5175
|
border-top: 1px dashed var(--ck-line);
|
|
5127
5176
|
}
|
|
5128
5177
|
.ck-calibration__prompt {
|
|
5178
|
+
margin: 0 0 10px;
|
|
5179
|
+
display: flex;
|
|
5180
|
+
align-items: baseline;
|
|
5181
|
+
flex-wrap: wrap;
|
|
5182
|
+
gap: 8px 12px;
|
|
5183
|
+
}
|
|
5184
|
+
.ck-calibration__label {
|
|
5129
5185
|
font-family: var(--ck-mono);
|
|
5130
5186
|
font-size: 0.72rem;
|
|
5131
5187
|
letter-spacing: 0.06em;
|
|
5132
5188
|
text-transform: uppercase;
|
|
5133
5189
|
color: var(--ck-dim);
|
|
5134
|
-
|
|
5135
|
-
display: flex;
|
|
5136
|
-
align-items: center;
|
|
5137
|
-
gap: 10px;
|
|
5190
|
+
flex-shrink: 0;
|
|
5138
5191
|
}
|
|
5139
5192
|
.ck-calibration__count {
|
|
5140
5193
|
font-family: var(--ck-mono);
|
|
5141
5194
|
font-size: 0.72rem;
|
|
5142
5195
|
letter-spacing: 0.03em;
|
|
5143
5196
|
color: var(--ck-accent);
|
|
5144
|
-
|
|
5197
|
+
}
|
|
5198
|
+
.ck-calibration__hint {
|
|
5199
|
+
font-size: 0.82rem;
|
|
5200
|
+
color: var(--ck-dim);
|
|
5201
|
+
line-height: 1.4;
|
|
5202
|
+
}
|
|
5203
|
+
@media (max-width: 640px) {
|
|
5204
|
+
.ck-calibration__prompt {
|
|
5205
|
+
flex-direction: column;
|
|
5206
|
+
align-items: flex-start;
|
|
5207
|
+
gap: 4px;
|
|
5208
|
+
}
|
|
5145
5209
|
}
|
|
5146
5210
|
.ck-calibration__buttons {
|
|
5147
5211
|
display: flex;
|
|
@@ -5196,11 +5260,13 @@ a.tag-mark {
|
|
|
5196
5260
|
margin-top: 12px;
|
|
5197
5261
|
display: flex;
|
|
5198
5262
|
flex-direction: column;
|
|
5199
|
-
gap:
|
|
5200
|
-
|
|
5201
|
-
|
|
5202
|
-
|
|
5203
|
-
|
|
5263
|
+
gap: 12px;
|
|
5264
|
+
}
|
|
5265
|
+
.ck-calibration__detail > * {
|
|
5266
|
+
margin: 0;
|
|
5267
|
+
}
|
|
5268
|
+
.ck-calibration__detail .ck-button {
|
|
5269
|
+
align-self: flex-start;
|
|
5204
5270
|
}
|
|
5205
5271
|
.ck-calibration__value {
|
|
5206
5272
|
color: var(--ck-accent);
|
|
@@ -5322,44 +5388,28 @@ a.tag-mark {
|
|
|
5322
5388
|
text-transform: uppercase;
|
|
5323
5389
|
}
|
|
5324
5390
|
|
|
5325
|
-
.ck-draft-
|
|
5391
|
+
.ck-draft-pending {
|
|
5392
|
+
border-color: rgba(6, 182, 212, 0.45);
|
|
5393
|
+
background: linear-gradient(180deg, var(--ck-accent-soft), var(--ck-surface));
|
|
5394
|
+
}
|
|
5395
|
+
|
|
5396
|
+
.ck-suggestion-banner {
|
|
5326
5397
|
display: inline-flex;
|
|
5327
5398
|
align-items: center;
|
|
5328
5399
|
gap: 10px;
|
|
5329
5400
|
margin-top: 10px;
|
|
5330
|
-
padding: 8px
|
|
5401
|
+
padding: 8px 14px;
|
|
5331
5402
|
background: var(--ck-accent-soft);
|
|
5332
|
-
border: 1px
|
|
5333
|
-
border-radius: 6px;
|
|
5334
|
-
}
|
|
5335
|
-
|
|
5336
|
-
.ck-suggestion-list {
|
|
5337
|
-
display: flex;
|
|
5338
|
-
flex-direction: column;
|
|
5339
|
-
gap: 12px;
|
|
5340
|
-
}
|
|
5341
|
-
.ck-suggestion-card {
|
|
5342
|
-
padding: 12px 14px;
|
|
5343
|
-
background: var(--ck-surface-soft);
|
|
5344
|
-
border: 1px solid var(--ck-line);
|
|
5403
|
+
border: 1px solid rgba(6, 182, 212, 0.35);
|
|
5345
5404
|
border-radius: 6px;
|
|
5346
|
-
|
|
5347
|
-
|
|
5348
|
-
|
|
5349
|
-
|
|
5350
|
-
.ck-suggestion-card__header {
|
|
5351
|
-
display: flex;
|
|
5352
|
-
align-items: center;
|
|
5353
|
-
gap: 10px;
|
|
5405
|
+
color: var(--ck-accent);
|
|
5406
|
+
font-family: var(--ck-mono);
|
|
5407
|
+
font-size: 0.82rem;
|
|
5408
|
+
text-decoration: none;
|
|
5354
5409
|
}
|
|
5355
|
-
.ck-suggestion-
|
|
5356
|
-
|
|
5357
|
-
|
|
5358
|
-
font-size: 0.85rem;
|
|
5359
|
-
background: var(--ck-bg-strong);
|
|
5360
|
-
padding: 10px 12px;
|
|
5361
|
-
border-radius: 4px;
|
|
5362
|
-
border: 1px solid var(--ck-line);
|
|
5410
|
+
.ck-suggestion-banner:hover,
|
|
5411
|
+
.ck-suggestion-banner:focus-visible {
|
|
5412
|
+
border-color: var(--ck-accent);
|
|
5363
5413
|
}
|
|
5364
5414
|
|
|
5365
5415
|
.ck-metrics-table__trust {
|
|
@@ -5372,3 +5422,66 @@ a.tag-mark {
|
|
|
5372
5422
|
color: var(--ck-success);
|
|
5373
5423
|
margin-right: 6px;
|
|
5374
5424
|
}
|
|
5425
|
+
|
|
5426
|
+
.ck-calibration__error {
|
|
5427
|
+
margin: 8px 0 0;
|
|
5428
|
+
padding: 8px 10px;
|
|
5429
|
+
background: var(--ck-danger-soft);
|
|
5430
|
+
border: 1px solid rgba(248, 113, 113, 0.3);
|
|
5431
|
+
border-radius: 4px;
|
|
5432
|
+
color: var(--ck-danger);
|
|
5433
|
+
font-size: 0.82rem;
|
|
5434
|
+
}
|
|
5435
|
+
|
|
5436
|
+
.ck-star-picker {
|
|
5437
|
+
border: 0;
|
|
5438
|
+
padding: 0;
|
|
5439
|
+
margin: 0;
|
|
5440
|
+
}
|
|
5441
|
+
.ck-star-picker__row {
|
|
5442
|
+
display: inline-flex;
|
|
5443
|
+
flex-direction: row-reverse;
|
|
5444
|
+
gap: 2px;
|
|
5445
|
+
}
|
|
5446
|
+
.ck-star-picker input {
|
|
5447
|
+
position: absolute;
|
|
5448
|
+
width: 1px;
|
|
5449
|
+
height: 1px;
|
|
5450
|
+
opacity: 0;
|
|
5451
|
+
pointer-events: none;
|
|
5452
|
+
}
|
|
5453
|
+
.ck-star-picker label {
|
|
5454
|
+
cursor: pointer;
|
|
5455
|
+
display: inline-flex;
|
|
5456
|
+
padding: 4px;
|
|
5457
|
+
border-radius: 4px;
|
|
5458
|
+
}
|
|
5459
|
+
.ck-star-picker label svg {
|
|
5460
|
+
fill: transparent;
|
|
5461
|
+
stroke: var(--ck-line-strong);
|
|
5462
|
+
transition: fill 0.08s, stroke 0.08s;
|
|
5463
|
+
}
|
|
5464
|
+
.ck-star-picker input:checked ~ label svg {
|
|
5465
|
+
fill: var(--ck-warning);
|
|
5466
|
+
stroke: var(--ck-warning);
|
|
5467
|
+
}
|
|
5468
|
+
.ck-star-picker__row:hover label svg {
|
|
5469
|
+
fill: transparent;
|
|
5470
|
+
stroke: var(--ck-line-strong);
|
|
5471
|
+
}
|
|
5472
|
+
.ck-star-picker__row:hover label:hover svg,
|
|
5473
|
+
.ck-star-picker__row:hover label:hover ~ label svg {
|
|
5474
|
+
fill: var(--ck-warning);
|
|
5475
|
+
stroke: var(--ck-warning);
|
|
5476
|
+
}
|
|
5477
|
+
.ck-star-picker input:focus-visible + label {
|
|
5478
|
+
outline: 2px solid var(--ck-accent);
|
|
5479
|
+
outline-offset: 2px;
|
|
5480
|
+
}
|
|
5481
|
+
|
|
5482
|
+
.ck-button--just-saved {
|
|
5483
|
+
animation: ck-saved-flash 1.4s ease-out;
|
|
5484
|
+
}
|
|
5485
|
+
@keyframes ck-saved-flash {
|
|
5486
|
+
0% { background: var(--ck-success); border-color: var(--ck-success); }
|
|
5487
|
+
}
|
|
@@ -5,9 +5,18 @@ module CompletionKit
|
|
|
5
5
|
|
|
6
6
|
def create
|
|
7
7
|
created_by = calibration_creator
|
|
8
|
-
|
|
8
|
+
existing = Calibration.find_by(
|
|
9
9
|
run_id: @run.id, response_id: @response.id, metric_id: @metric.id, created_by: created_by
|
|
10
10
|
)
|
|
11
|
+
|
|
12
|
+
if params[:verdict] == "disagree" && params[:corrected_score].blank?
|
|
13
|
+
render_calibration(calibration: existing, pending_verdict: "disagree")
|
|
14
|
+
return
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
calibration = existing || Calibration.new(
|
|
18
|
+
run: @run, response: @response, metric: @metric, created_by: created_by
|
|
19
|
+
)
|
|
11
20
|
calibration.assign_attributes(
|
|
12
21
|
judge_version: JudgeVersion.ensure_current_for(@metric),
|
|
13
22
|
verdict: params[:verdict],
|
|
@@ -16,19 +25,37 @@ module CompletionKit
|
|
|
16
25
|
)
|
|
17
26
|
|
|
18
27
|
if calibration.save
|
|
19
|
-
|
|
20
|
-
"calibration_#{@response.id}_#{@metric.id}",
|
|
21
|
-
partial: "completion_kit/calibrations/buttons",
|
|
22
|
-
locals: { review: review_for_metric, calibration: calibration, run: @run, response_row: @response, metric: @metric }
|
|
23
|
-
)
|
|
28
|
+
render_calibration(calibration: calibration, just_saved: true)
|
|
24
29
|
else
|
|
25
|
-
|
|
26
|
-
|
|
30
|
+
render_calibration(
|
|
31
|
+
calibration: existing,
|
|
32
|
+
pending_verdict: params[:verdict],
|
|
33
|
+
error: calibration.errors.full_messages.to_sentence,
|
|
34
|
+
status: :unprocessable_entity
|
|
35
|
+
)
|
|
27
36
|
end
|
|
28
37
|
end
|
|
29
38
|
|
|
30
39
|
private
|
|
31
40
|
|
|
41
|
+
def render_calibration(calibration:, pending_verdict: nil, error: nil, just_saved: false, status: :ok)
|
|
42
|
+
locals = {
|
|
43
|
+
review: review_for_metric,
|
|
44
|
+
calibration: calibration,
|
|
45
|
+
run: @run,
|
|
46
|
+
response_row: @response,
|
|
47
|
+
metric: @metric,
|
|
48
|
+
pending_verdict: pending_verdict,
|
|
49
|
+
error: error,
|
|
50
|
+
just_saved: just_saved
|
|
51
|
+
}
|
|
52
|
+
render turbo_stream: turbo_stream.replace(
|
|
53
|
+
"calibration_#{@response.id}_#{@metric.id}",
|
|
54
|
+
partial: "completion_kit/calibrations/buttons",
|
|
55
|
+
locals: locals
|
|
56
|
+
), status: status
|
|
57
|
+
end
|
|
58
|
+
|
|
32
59
|
def ensure_calibration_enabled
|
|
33
60
|
head :not_found unless CompletionKit.config.judge_calibration_enabled
|
|
34
61
|
end
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
2
|
class MetricsController < ApplicationController
|
|
3
3
|
include CompletionKit::TagFiltering
|
|
4
|
-
before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :publish_draft, :suggest_variants]
|
|
4
|
+
before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :publish_draft, :suggest_variants, :dismiss_suggestion]
|
|
5
5
|
|
|
6
6
|
def index
|
|
7
7
|
@metrics = apply_tag_filter(Metric.includes(:metric_groups, :tags).order(:name))
|
|
@@ -12,8 +12,10 @@ module CompletionKit
|
|
|
12
12
|
.includes(response: [:reviews, :run])
|
|
13
13
|
.order(created_at: :desc)
|
|
14
14
|
.limit(50)
|
|
15
|
-
@
|
|
16
|
-
@
|
|
15
|
+
@edit_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
|
|
16
|
+
@published_judge_version = JudgeVersion.published.where(metric_id: @metric.id, current: true).first
|
|
17
|
+
@suggestion_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
|
|
18
|
+
@improve_disagreement_count = @disagreements.size
|
|
17
19
|
end
|
|
18
20
|
|
|
19
21
|
def new
|
|
@@ -47,15 +49,28 @@ module CompletionKit
|
|
|
47
49
|
end
|
|
48
50
|
|
|
49
51
|
def suggest_variants
|
|
50
|
-
|
|
52
|
+
disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
|
|
53
|
+
if disagreement_count.zero?
|
|
54
|
+
redirect_to metric_path(@metric), alert: "Mark at least one row as Disagree before asking the model to suggest a change."
|
|
55
|
+
return
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
|
|
59
|
+
|
|
60
|
+
generator = JudgeVariantGenerator.new(@metric, count: 1)
|
|
51
61
|
variants = generator.call
|
|
52
62
|
if variants.empty?
|
|
53
63
|
redirect_to metric_path(@metric), alert: "The model returned no usable variants. Try again with a different model."
|
|
54
64
|
return
|
|
55
65
|
end
|
|
56
66
|
generator.persist!(variants)
|
|
57
|
-
|
|
58
|
-
|
|
67
|
+
redirect_to metric_path(@metric), notice: "Drafted a new version. Review it below."
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
def dismiss_suggestion
|
|
71
|
+
draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").find_by(id: params[:draft_id])
|
|
72
|
+
draft&.destroy
|
|
73
|
+
redirect_to metric_path(@metric), notice: "Dismissed."
|
|
59
74
|
end
|
|
60
75
|
|
|
61
76
|
def publish_draft
|
|
@@ -202,15 +202,86 @@ module CompletionKit
|
|
|
202
202
|
def ck_format_maybe_json(text)
|
|
203
203
|
s = text.to_s
|
|
204
204
|
return s if s.strip.empty?
|
|
205
|
-
|
|
205
|
+
payload = ck_unwrap_json_fence(s.strip)
|
|
206
|
+
first = payload[0]
|
|
206
207
|
return s unless first == "{" || first == "["
|
|
207
208
|
begin
|
|
208
|
-
JSON.pretty_generate(JSON.parse(
|
|
209
|
+
ck_highlight_json(JSON.pretty_generate(JSON.parse(payload)))
|
|
209
210
|
rescue JSON::ParserError
|
|
210
211
|
s
|
|
211
212
|
end
|
|
212
213
|
end
|
|
213
214
|
|
|
215
|
+
def ck_unwrap_json_fence(text)
|
|
216
|
+
m = text.match(/\A```(?:json|JSON)?\s*\n(.*?)\n?```\s*\z/m)
|
|
217
|
+
m ? m[1].strip : text
|
|
218
|
+
end
|
|
219
|
+
|
|
220
|
+
def ck_highlight_json(text)
|
|
221
|
+
tokens = ck_tokenize_json(text)
|
|
222
|
+
is_key = ck_mark_json_keys(tokens)
|
|
223
|
+
parts = tokens.each_with_index.map do |(type, value), idx|
|
|
224
|
+
escaped = ERB::Util.html_escape(value)
|
|
225
|
+
case type
|
|
226
|
+
when :punct then %(<span class="ck-json-punct">#{escaped}</span>)
|
|
227
|
+
when :string
|
|
228
|
+
%(<span class="#{is_key[idx] ? "ck-json-key" : "ck-json-string"}">#{escaped}</span>)
|
|
229
|
+
when :number then %(<span class="ck-json-number">#{escaped}</span>)
|
|
230
|
+
when :keyword then %(<span class="ck-json-keyword">#{escaped}</span>)
|
|
231
|
+
else escaped
|
|
232
|
+
end
|
|
233
|
+
end
|
|
234
|
+
parts.join.html_safe
|
|
235
|
+
end
|
|
236
|
+
|
|
237
|
+
def ck_tokenize_json(text)
|
|
238
|
+
tokens = []
|
|
239
|
+
i = 0
|
|
240
|
+
len = text.length
|
|
241
|
+
while i < len
|
|
242
|
+
ch = text[i]
|
|
243
|
+
if ch == " " || ch == "\n" || ch == "\t"
|
|
244
|
+
tokens << [:ws, ch]
|
|
245
|
+
i += 1
|
|
246
|
+
elsif "{}[]:,".include?(ch)
|
|
247
|
+
tokens << [:punct, ch]
|
|
248
|
+
i += 1
|
|
249
|
+
elsif ch == '"'
|
|
250
|
+
j = i + 1
|
|
251
|
+
while j < len && text[j] != '"'
|
|
252
|
+
j += text[j] == "\\" ? 2 : 1
|
|
253
|
+
end
|
|
254
|
+
j = len - 1 if j >= len
|
|
255
|
+
tokens << [:string, text[i..j]]
|
|
256
|
+
i = j + 1
|
|
257
|
+
elsif ch == "-" || (ch >= "0" && ch <= "9")
|
|
258
|
+
j = i + 1
|
|
259
|
+
j += 1 while j < len && "0123456789.eE+-".include?(text[j])
|
|
260
|
+
tokens << [:number, text[i...j]]
|
|
261
|
+
i = j
|
|
262
|
+
elsif text[i, 4] == "true" || text[i, 4] == "null"
|
|
263
|
+
tokens << [:keyword, text[i, 4]]
|
|
264
|
+
i += 4
|
|
265
|
+
elsif text[i, 5] == "false"
|
|
266
|
+
tokens << [:keyword, "false"]
|
|
267
|
+
i += 5
|
|
268
|
+
else
|
|
269
|
+
tokens << [:other, ch]
|
|
270
|
+
i += 1
|
|
271
|
+
end
|
|
272
|
+
end
|
|
273
|
+
tokens
|
|
274
|
+
end
|
|
275
|
+
|
|
276
|
+
def ck_mark_json_keys(tokens)
|
|
277
|
+
tokens.each_with_index.map do |(type, _), idx|
|
|
278
|
+
next false unless type == :string
|
|
279
|
+
j = idx + 1
|
|
280
|
+
j += 1 while j < tokens.length && tokens[j][0] == :ws
|
|
281
|
+
j < tokens.length && tokens[j] == [:punct, ":"]
|
|
282
|
+
end
|
|
283
|
+
end
|
|
284
|
+
|
|
214
285
|
def tag_filter_url(base_path, selected, toggling)
|
|
215
286
|
remaining = selected.reject { |t| t.id == toggling.id }
|
|
216
287
|
next_set = selected.include?(toggling) ? remaining : remaining + [toggling]
|
|
@@ -1,13 +1,15 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
2
|
class JudgeVariantGenerator
|
|
3
|
-
DEFAULT_VARIANT_COUNT =
|
|
3
|
+
DEFAULT_VARIANT_COUNT = 1
|
|
4
|
+
MAX_VARIANT_COUNT = 3
|
|
4
5
|
DEFAULT_TEMPERATURE = 0.4
|
|
5
6
|
|
|
6
|
-
Variant = Struct.new(:reasoning, :instruction, keyword_init: true)
|
|
7
|
+
Variant = Struct.new(:reasoning, :instruction, :rubric_bands, keyword_init: true)
|
|
7
8
|
|
|
8
9
|
def initialize(metric, count: DEFAULT_VARIANT_COUNT, model: nil)
|
|
9
10
|
@metric = metric
|
|
10
|
-
|
|
11
|
+
n = count.to_i
|
|
12
|
+
@count = n < 1 ? DEFAULT_VARIANT_COUNT : [n, MAX_VARIANT_COUNT].min
|
|
11
13
|
@model = model || CompletionKit.config.judge_model
|
|
12
14
|
end
|
|
13
15
|
|
|
@@ -23,7 +25,7 @@ module CompletionKit
|
|
|
23
25
|
JudgeVersion.create!(
|
|
24
26
|
metric: @metric,
|
|
25
27
|
instruction: variant.instruction,
|
|
26
|
-
rubric_bands: @metric.rubric_bands,
|
|
28
|
+
rubric_bands: variant.rubric_bands.presence || @metric.rubric_bands,
|
|
27
29
|
state: "draft",
|
|
28
30
|
source: "suggestion",
|
|
29
31
|
current: false
|
|
@@ -39,36 +41,57 @@ module CompletionKit
|
|
|
39
41
|
private
|
|
40
42
|
|
|
41
43
|
def build_meta_prompt
|
|
42
|
-
|
|
44
|
+
disagreements = JudgeCalibrationExamples.disagreements_for(@metric)
|
|
45
|
+
borderlines = JudgeCalibrationExamples.borderlines_for(@metric)
|
|
43
46
|
sections = []
|
|
44
|
-
sections << "You are an expert evaluator.
|
|
47
|
+
sections << "You are an expert evaluator. The judge below is misaligned with humans. Propose #{@count == 1 ? "a single" : "#{@count}"} concrete rewrite that closes the gap."
|
|
45
48
|
sections << ""
|
|
46
49
|
sections << "## Current instruction"
|
|
47
50
|
sections << "```"
|
|
48
51
|
sections << @metric.instruction.to_s
|
|
49
52
|
sections << "```"
|
|
50
53
|
sections << ""
|
|
51
|
-
sections << "##
|
|
54
|
+
sections << "## Current rubric (5 to 1)"
|
|
52
55
|
sections << @metric.display_rubric_text
|
|
53
56
|
sections << ""
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
57
|
+
if disagreements.any?
|
|
58
|
+
sections << "## Recent disagreements (judge vs human)"
|
|
59
|
+
disagreements.each_with_index do |ex, i|
|
|
60
|
+
sections << "### Case #{i + 1}"
|
|
61
|
+
sections << "Input: #{ex[:input].to_s.truncate(200)}"
|
|
62
|
+
sections << "Output: #{ex[:output].to_s.truncate(200)}"
|
|
63
|
+
sections << "Judge said #{ex[:judge_score]}/5: #{ex[:judge_feedback].to_s.truncate(160)}"
|
|
64
|
+
sections << "Human said #{ex[:human_score]}/5: #{ex[:human_note].to_s.truncate(160)}"
|
|
65
|
+
sections << ""
|
|
66
|
+
end
|
|
67
|
+
end
|
|
68
|
+
if borderlines.any?
|
|
69
|
+
sections << "## Rubric-ambiguous cases (humans marked these borderline)"
|
|
70
|
+
sections << "These are cases where a human said the rubric itself was unclear. If the rubric needs sharpening, rewrite it."
|
|
71
|
+
borderlines.each_with_index do |ex, i|
|
|
72
|
+
sections << "### Borderline #{i + 1}"
|
|
73
|
+
sections << "Input: #{ex[:input].to_s.truncate(200)}"
|
|
74
|
+
sections << "Output: #{ex[:output].to_s.truncate(200)}"
|
|
75
|
+
sections << "Judge said #{ex[:judge_score]}/5: #{ex[:judge_feedback].to_s.truncate(160)}"
|
|
76
|
+
sections << "Human note: #{ex[:human_note].to_s.truncate(200)}" if ex[:human_note].to_s.present?
|
|
77
|
+
sections << ""
|
|
78
|
+
end
|
|
62
79
|
end
|
|
63
80
|
sections << "## Task"
|
|
64
|
-
sections << "
|
|
81
|
+
sections << "Make one substantive change. Don't just reword. If the disagreements look like instruction problems, rewrite the instruction. If they look like rubric problems (overlapping bands, undefined edge cases), rewrite the rubric. Rewrite both if both are wrong."
|
|
65
82
|
sections << ""
|
|
66
|
-
sections << "Respond in EXACTLY this format, repeated #{@count}
|
|
83
|
+
sections << "Respond in EXACTLY this format, repeated #{@count} time#{@count == 1 ? "" : "s"}:"
|
|
67
84
|
sections << ""
|
|
68
85
|
sections << "VARIANT:"
|
|
69
|
-
sections << "REASONING: <one sentence
|
|
86
|
+
sections << "REASONING: <one short sentence: what changes and why>"
|
|
70
87
|
sections << "INSTRUCTION:"
|
|
71
88
|
sections << "<the rewritten instruction>"
|
|
89
|
+
sections << "RUBRIC: # optional — omit this block if the rubric is unchanged"
|
|
90
|
+
sections << "5: <description for 5 stars>"
|
|
91
|
+
sections << "4: <description for 4 stars>"
|
|
92
|
+
sections << "3: <description for 3 stars>"
|
|
93
|
+
sections << "2: <description for 2 stars>"
|
|
94
|
+
sections << "1: <description for 1 star>"
|
|
72
95
|
sections << "END_VARIANT"
|
|
73
96
|
sections.join("\n")
|
|
74
97
|
end
|
|
@@ -77,22 +100,44 @@ module CompletionKit
|
|
|
77
100
|
blocks = text.to_s.scan(/VARIANT:(.*?)END_VARIANT/m).flatten
|
|
78
101
|
blocks.filter_map do |raw|
|
|
79
102
|
reasoning = raw[/REASONING:\s*(.*?)(?=INSTRUCTION:|\z)/m, 1].to_s.strip
|
|
80
|
-
instruction = raw[/INSTRUCTION:\s*(
|
|
103
|
+
instruction = raw[/INSTRUCTION:\s*(.*?)(?=RUBRIC:|\z)/m, 1].to_s.strip
|
|
81
104
|
next if instruction.empty?
|
|
82
|
-
|
|
105
|
+
rubric_block = raw[/RUBRIC:\s*(.*)/m, 1].to_s
|
|
106
|
+
Variant.new(reasoning: reasoning, instruction: instruction, rubric_bands: parse_rubric(rubric_block))
|
|
83
107
|
end
|
|
84
108
|
end
|
|
109
|
+
|
|
110
|
+
def parse_rubric(block)
|
|
111
|
+
return nil if block.strip.empty?
|
|
112
|
+
bands = block.scan(/^\s*([1-5])\s*[:\-]\s*(.+?)\s*$/).map do |stars, description|
|
|
113
|
+
{ "stars" => stars.to_i, "description" => description.strip }
|
|
114
|
+
end
|
|
115
|
+
return nil if bands.length != 5
|
|
116
|
+
bands.sort_by { |b| -b["stars"] }
|
|
117
|
+
end
|
|
85
118
|
end
|
|
86
119
|
|
|
87
120
|
module JudgeCalibrationExamples
|
|
88
121
|
module_function
|
|
89
122
|
|
|
90
123
|
def for(metric, limit: 8)
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
124
|
+
disagreements_for(metric, limit: limit)
|
|
125
|
+
end
|
|
126
|
+
|
|
127
|
+
def disagreements_for(metric, limit: 8)
|
|
128
|
+
calibrations_for(metric, verdict: "disagree", limit: limit)
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
def borderlines_for(metric, limit: 6)
|
|
132
|
+
calibrations_for(metric, verdict: "borderline", limit: limit)
|
|
133
|
+
end
|
|
134
|
+
|
|
135
|
+
def calibrations_for(metric, verdict:, limit:)
|
|
136
|
+
Calibration.where(metric_id: metric.id, verdict: verdict)
|
|
137
|
+
.includes(response: :reviews)
|
|
138
|
+
.order(created_at: :desc)
|
|
139
|
+
.limit(limit)
|
|
140
|
+
.map do |cal|
|
|
96
141
|
review = cal.response.reviews.find { |r| r.metric_id == metric.id }
|
|
97
142
|
{
|
|
98
143
|
input: cal.response.input_data,
|
|
@@ -10,7 +10,7 @@ module CompletionKit
|
|
|
10
10
|
type: "object",
|
|
11
11
|
properties: {
|
|
12
12
|
metric_id: { type: "integer" },
|
|
13
|
-
count: { type: "integer", description: "How many variants to request (default
|
|
13
|
+
count: { type: "integer", description: "How many variants to request (default 1, max 3). One focused rewrite beats five reworded copies." },
|
|
14
14
|
model: { type: "string", description: "Override the model used to generate variants. Defaults to CompletionKit.config.judge_model." }
|
|
15
15
|
},
|
|
16
16
|
required: ["metric_id"]
|
|
@@ -49,9 +49,7 @@ module CompletionKit
|
|
|
49
49
|
|
|
50
50
|
def self.suggest(args)
|
|
51
51
|
metric = CompletionKit::Metric.find(args["metric_id"])
|
|
52
|
-
|
|
53
|
-
count = CompletionKit::JudgeVariantGenerator::DEFAULT_VARIANT_COUNT if count <= 0
|
|
54
|
-
generator = CompletionKit::JudgeVariantGenerator.new(metric, count: count, model: args["model"])
|
|
52
|
+
generator = CompletionKit::JudgeVariantGenerator.new(metric, count: args["count"].to_i, model: args["model"])
|
|
55
53
|
variants = generator.call
|
|
56
54
|
return error_result("Variant generator returned no parseable variants. Try again or change the model.") if variants.empty?
|
|
57
55
|
versions = generator.persist!(variants)
|
|
@@ -1,20 +1,20 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
2
|
module Onboarding
|
|
3
3
|
# Opt-in starter data for the onboarding page: one dataset + one prompt so a
|
|
4
|
-
# brand-new install has something to poke at. Idempotent
|
|
4
|
+
# brand-new install has something to poke at. Idempotent. A no-op once the
|
|
5
5
|
# workspace already has any prompt or dataset. Deliberately does NOT create a
|
|
6
6
|
# provider credential (needs a real API key) or a run (user-initiated).
|
|
7
7
|
module SampleData
|
|
8
8
|
SAMPLE_CSV = <<~CSV.freeze
|
|
9
9
|
ticket
|
|
10
10
|
"My order #4827 arrived with a dented panel. I emailed photos 11 days ago and heard nothing. Today I was told the return window 'closed'. I paid $749. I want a refund or replacement, not store credit."
|
|
11
|
-
"Tracking says delivered to my porch Tuesday 3:47pm. I was home all day, nothing arrived, neighbours' cameras show no van. Order #5102
|
|
11
|
+
"Tracking says delivered to my porch Tuesday 3:47pm. I was home all day, nothing arrived, neighbours' cameras show no van. Order #5102. A $315 mixer, wedding gift, wedding is Saturday. Can someone look today?"
|
|
12
12
|
"WELCOME20 says 'invalid' at checkout but the promo email says it's good through May 31. Same email I'm signed in with. Tried Chrome and Safari. Cart is $186 waiting on you."
|
|
13
13
|
CSV
|
|
14
14
|
|
|
15
15
|
SAMPLE_PROMPT = {
|
|
16
16
|
name: "Sample: Support reply",
|
|
17
|
-
description: "A starter prompt
|
|
17
|
+
description: "A starter prompt. Drafts a warm, professional reply to a customer support ticket. Edit it or delete it; it's just here to get you going.",
|
|
18
18
|
template: "You are a senior customer-support specialist. Write a warm, professional reply to this ticket. Acknowledge the customer's situation, be specific about next steps, and don't be defensive.\n\nTicket:\n{{ticket}}",
|
|
19
19
|
llm_model: "gpt-4o-mini"
|
|
20
20
|
}.freeze
|
|
@@ -1,12 +1,15 @@
|
|
|
1
1
|
<div id="calibration_<%= response_row.id %>_<%= metric.id %>" class="ck-calibration">
|
|
2
2
|
<% current_verdict = calibration&.verdict %>
|
|
3
|
+
<% pending_verdict = local_assigns[:pending_verdict] %>
|
|
4
|
+
<% active_verdict = pending_verdict || current_verdict %>
|
|
5
|
+
<% error = local_assigns[:error] %>
|
|
3
6
|
<% verdict_count = CompletionKit::Calibration.where(response_id: response_row.id, metric_id: metric.id).count %>
|
|
4
7
|
<p class="ck-calibration__prompt">
|
|
5
|
-
Your verdict
|
|
8
|
+
<span class="ck-calibration__label">Your verdict</span>
|
|
6
9
|
<% if verdict_count > 0 %>
|
|
7
|
-
<span class="ck-calibration__count"><%= pluralize(verdict_count, "verdict") %> on this score · <%= link_to "
|
|
10
|
+
<span class="ck-calibration__count"><%= pluralize(verdict_count, "verdict") %> on this score · <%= link_to "trust level →", metric_path(metric), class: "ck-link" %></span>
|
|
8
11
|
<% else %>
|
|
9
|
-
<span class="ck-
|
|
12
|
+
<span class="ck-calibration__hint">Tell us what you think — was the score right? Verdicts roll up into the metric's <%= link_to "trust level", metric_path(metric), class: "ck-link" %>.</span>
|
|
10
13
|
<% end %>
|
|
11
14
|
</p>
|
|
12
15
|
<div class="ck-calibration__buttons">
|
|
@@ -20,8 +23,8 @@
|
|
|
20
23
|
<%= button_to run_response_calibrations_path(run, response_row, metric_id: metric.id, verdict: verdict),
|
|
21
24
|
method: :post,
|
|
22
25
|
form: { data: { turbo: "true" } },
|
|
23
|
-
class: "ck-calibration__pill ck-calibration__pill--#{verdict}#{' is-active' if verdict ==
|
|
24
|
-
"aria-pressed": (verdict ==
|
|
26
|
+
class: "ck-calibration__pill ck-calibration__pill--#{verdict}#{' is-active' if verdict == active_verdict}",
|
|
27
|
+
"aria-pressed": (verdict == active_verdict).to_s,
|
|
25
28
|
title: verdict_hints[verdict] do %>
|
|
26
29
|
<%= heroicon_tag verdict_icons[verdict], variant: :outline, size: 14, "aria-hidden": "true" %>
|
|
27
30
|
<span><%= verdict %></span>
|
|
@@ -29,31 +32,41 @@
|
|
|
29
32
|
<% end %>
|
|
30
33
|
</div>
|
|
31
34
|
|
|
32
|
-
<% if
|
|
35
|
+
<% if error.present? %>
|
|
36
|
+
<p class="ck-calibration__error" role="alert"><%= error %></p>
|
|
37
|
+
<% end %>
|
|
38
|
+
|
|
39
|
+
<% if active_verdict == "disagree" %>
|
|
40
|
+
<% existing_score = (calibration&.corrected_score || review&.ai_score)&.round %>
|
|
33
41
|
<%= form_with url: run_response_calibrations_path(run, response_row),
|
|
34
42
|
method: :post, local: false,
|
|
35
43
|
class: "ck-calibration__detail" do |f| %>
|
|
36
44
|
<%= hidden_field_tag :metric_id, metric.id %>
|
|
37
45
|
<%= hidden_field_tag :verdict, "disagree" %>
|
|
38
|
-
<
|
|
39
|
-
|
|
40
|
-
<
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
46
|
+
<p class="ck-label">What should the score have been?</p>
|
|
47
|
+
<fieldset class="ck-star-picker">
|
|
48
|
+
<legend class="ck-visually-hidden">Pick a score from 1 to 5 stars</legend>
|
|
49
|
+
<div class="ck-star-picker__row">
|
|
50
|
+
<% [5, 4, 3, 2, 1].each do |n| %>
|
|
51
|
+
<% radio_id = "ck-star-#{response_row.id}-#{metric.id}-#{n}" %>
|
|
52
|
+
<input type="radio" name="corrected_score" id="<%= radio_id %>" value="<%= n %>" <%= "checked" if existing_score == n %> required>
|
|
53
|
+
<label for="<%= radio_id %>" title="<%= pluralize(n, 'star') %>" aria-label="<%= pluralize(n, 'star') %>">
|
|
54
|
+
<svg viewBox="0 0 24 24" width="28" height="28" stroke-width="1.5" aria-hidden="true"><polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2"/></svg>
|
|
55
|
+
</label>
|
|
56
|
+
<% end %>
|
|
57
|
+
</div>
|
|
58
|
+
</fieldset>
|
|
59
|
+
<textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="Why? (optional)"><%= calibration&.note %></textarea>
|
|
60
|
+
<%= f.submit "Save", class: "#{ck_button_classes(:dark)}#{' ck-button--just-saved' if local_assigns[:just_saved]}" %>
|
|
48
61
|
<% end %>
|
|
49
|
-
<% elsif
|
|
62
|
+
<% elsif active_verdict == "borderline" %>
|
|
50
63
|
<%= form_with url: run_response_calibrations_path(run, response_row),
|
|
51
64
|
method: :post, local: false,
|
|
52
65
|
class: "ck-calibration__detail" do |f| %>
|
|
53
66
|
<%= hidden_field_tag :metric_id, metric.id %>
|
|
54
67
|
<%= hidden_field_tag :verdict, "borderline" %>
|
|
55
|
-
<textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="What made this borderline? (optional)"><%= calibration
|
|
56
|
-
<%= f.submit "Save", class: ck_button_classes(:dark) %>
|
|
68
|
+
<textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="What made this borderline? (optional)"><%= calibration&.note %></textarea>
|
|
69
|
+
<%= f.submit "Save", class: "#{ck_button_classes(:dark)}#{' ck-button--just-saved' if local_assigns[:just_saved]}" %>
|
|
57
70
|
<% end %>
|
|
58
71
|
<% end %>
|
|
59
72
|
</div>
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
<% stats = local_assigns[:stats] %>
|
|
2
2
|
<div class="ck-trust-panel ck-trust-panel--<%= stats.gate %>">
|
|
3
|
-
<p class="ck-trust-panel__label" title="How often
|
|
3
|
+
<p class="ck-trust-panel__label" title="How often this metric's scores match the humans who reviewed them.">Trust level</p>
|
|
4
4
|
<% if stats.counter_only? %>
|
|
5
5
|
<div class="ck-trust-panel__body">
|
|
6
6
|
<span class="ck-trust-panel__counter"><%= stats.sample_size %><span class="ck-trust-panel__counter-of">/ <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></span></span>
|
|
@@ -19,7 +19,7 @@
|
|
|
19
19
|
<tr>
|
|
20
20
|
<th scope="col">Name</th>
|
|
21
21
|
<th scope="col">Instruction</th>
|
|
22
|
-
<th scope="col" title="How often
|
|
22
|
+
<th scope="col" title="How often this metric's scores match the humans who reviewed them.">Trust level</th>
|
|
23
23
|
<th scope="col">In groups</th>
|
|
24
24
|
<th scope="col"></th>
|
|
25
25
|
</tr>
|
|
@@ -36,7 +36,7 @@
|
|
|
36
36
|
<% end %>
|
|
37
37
|
</td>
|
|
38
38
|
<td data-label="Instruction" class="ck-meta-copy"><div class="ck-clamp-2"><%= metric.instruction.presence || "—" %></div></td>
|
|
39
|
-
<td data-label="
|
|
39
|
+
<td data-label="Trust level" class="ck-metrics-table__trust">
|
|
40
40
|
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
41
41
|
<% s = CompletionKit::MetricCalibrationStats.for(metric) %>
|
|
42
42
|
<% if s.counter_only? %>
|
|
@@ -9,24 +9,58 @@
|
|
|
9
9
|
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
10
10
|
<%= render "completion_kit/calibrations/trust_panel",
|
|
11
11
|
stats: CompletionKit::MetricCalibrationStats.for(@metric) %>
|
|
12
|
-
<% if @
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
12
|
+
<% if @edit_draft %>
|
|
13
|
+
<% pub_instr = @published_judge_version&.instruction.to_s %>
|
|
14
|
+
<% draft_instr = @edit_draft.instruction.to_s %>
|
|
15
|
+
<% instruction_changed = pub_instr != draft_instr %>
|
|
16
|
+
<% rubric_changed = @published_judge_version && @published_judge_version.rubric_bands != @edit_draft.rubric_bands %>
|
|
17
|
+
<section class="ck-card ck-card--spaced ck-draft-pending">
|
|
18
|
+
<div class="ck-prompt-preview__header">
|
|
19
|
+
<p class="ck-kicker">Draft pending</p>
|
|
20
|
+
<%= button_to "Publish this version", publish_draft_metric_path(@metric, draft_id: @edit_draft.id),
|
|
21
|
+
method: :post, form_class: "inline-block",
|
|
22
|
+
class: ck_button_classes(:dark) %>
|
|
23
|
+
</div>
|
|
24
|
+
<p class="ck-meta-copy">A draft of this metric is saved. Publishing it replaces the live instruction<%= ", rubric," if rubric_changed %> for future runs. Here's what changes.</p>
|
|
25
|
+
|
|
26
|
+
<% if instruction_changed %>
|
|
27
|
+
<div class="ck-suggest-diff">
|
|
28
|
+
<div class="ck-suggest-diff__pane">
|
|
29
|
+
<div class="ck-suggest-diff__header">
|
|
30
|
+
<span class="ck-suggest-diff__label ck-suggest-diff__label--before">Currently live</span>
|
|
31
|
+
</div>
|
|
32
|
+
<pre class="ck-suggest-diff__code"><%= ck_word_diff_old(pub_instr, draft_instr) %></pre>
|
|
33
|
+
</div>
|
|
34
|
+
<div class="ck-suggest-diff__pane">
|
|
35
|
+
<div class="ck-suggest-diff__header">
|
|
36
|
+
<span class="ck-suggest-diff__label ck-suggest-diff__label--after">Draft</span>
|
|
37
|
+
</div>
|
|
38
|
+
<pre class="ck-suggest-diff__code"><%= ck_word_diff_new(pub_instr, draft_instr) %></pre>
|
|
39
|
+
</div>
|
|
40
|
+
</div>
|
|
41
|
+
<% else %>
|
|
42
|
+
<p class="ck-meta-copy">The instruction is unchanged.</p>
|
|
43
|
+
<% end %>
|
|
44
|
+
|
|
45
|
+
<% if rubric_changed %>
|
|
46
|
+
<p class="ck-meta-copy"><strong>Rubric also changed.</strong> Edit the metric to inspect each band, or publish to apply the new wording.</p>
|
|
47
|
+
<% end %>
|
|
48
|
+
</section>
|
|
20
49
|
<% end %>
|
|
21
50
|
<% end %>
|
|
22
51
|
</div>
|
|
23
52
|
<div class="ck-actions">
|
|
24
53
|
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
54
|
+
<% if @improve_disagreement_count.positive? %>
|
|
55
|
+
<%= button_to "Improve the metric", suggest_variants_metric_path(@metric),
|
|
56
|
+
method: :post, form_class: "inline-block",
|
|
57
|
+
class: ck_button_classes(:light, variant: :outline),
|
|
58
|
+
title: "Rewrite this metric based on the disagreements collected so far.",
|
|
59
|
+
data: { turbo_confirm: "Rewrite this metric based on the disagreements collected so far?" } %>
|
|
60
|
+
<% else %>
|
|
61
|
+
<button type="button" class="<%= ck_button_classes(:light, variant: :outline) %>" disabled
|
|
62
|
+
title="Mark at least one row as Disagree before the model can suggest a change.">Improve the metric</button>
|
|
63
|
+
<% end %>
|
|
30
64
|
<% end %>
|
|
31
65
|
<%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
|
|
32
66
|
</div>
|
|
@@ -41,7 +75,7 @@
|
|
|
41
75
|
<% if @metric.instruction.present? %>
|
|
42
76
|
<section class="ck-card">
|
|
43
77
|
<p class="ck-kicker">Instruction</p>
|
|
44
|
-
|
|
78
|
+
<%= simple_format(@metric.instruction, {}, class: "ck-copy") %>
|
|
45
79
|
</section>
|
|
46
80
|
<% end %>
|
|
47
81
|
|
|
@@ -63,6 +97,49 @@
|
|
|
63
97
|
</div>
|
|
64
98
|
</section>
|
|
65
99
|
|
|
100
|
+
<% if CompletionKit.config.judge_calibration_enabled && @suggestion_draft %>
|
|
101
|
+
<% sd_current_instr = @published_judge_version&.instruction.to_s %>
|
|
102
|
+
<% sd_draft_instr = @suggestion_draft.instruction.to_s %>
|
|
103
|
+
<% sd_current_rubric = @published_judge_version&.rubric_bands || [] %>
|
|
104
|
+
<% sd_rubric_changed = @suggestion_draft.rubric_bands != sd_current_rubric %>
|
|
105
|
+
<section class="ck-card ck-card--spaced ck-draft-pending">
|
|
106
|
+
<div class="ck-prompt-preview__header">
|
|
107
|
+
<p class="ck-kicker">Suggested change</p>
|
|
108
|
+
<time class="ck-meta-copy" data-relative-time datetime="<%= @suggestion_draft.created_at.utc.iso8601 %>"><%= time_ago_in_words(@suggestion_draft.created_at) %> ago</time>
|
|
109
|
+
</div>
|
|
110
|
+
<p class="ck-meta-copy">Based on your disagreements, the model proposed this rewrite. Use it to replace the live version, or discard.</p>
|
|
111
|
+
|
|
112
|
+
<div class="ck-suggest-diff">
|
|
113
|
+
<div class="ck-suggest-diff__pane">
|
|
114
|
+
<div class="ck-suggest-diff__header">
|
|
115
|
+
<span class="ck-suggest-diff__label ck-suggest-diff__label--before">Currently live</span>
|
|
116
|
+
</div>
|
|
117
|
+
<pre class="ck-suggest-diff__code"><%= ck_word_diff_old(sd_current_instr, sd_draft_instr) %></pre>
|
|
118
|
+
</div>
|
|
119
|
+
<div class="ck-suggest-diff__pane">
|
|
120
|
+
<div class="ck-suggest-diff__header">
|
|
121
|
+
<span class="ck-suggest-diff__label ck-suggest-diff__label--after">Proposed</span>
|
|
122
|
+
</div>
|
|
123
|
+
<pre class="ck-suggest-diff__code"><%= ck_word_diff_new(sd_current_instr, sd_draft_instr) %></pre>
|
|
124
|
+
</div>
|
|
125
|
+
</div>
|
|
126
|
+
|
|
127
|
+
<% if sd_rubric_changed %>
|
|
128
|
+
<p class="ck-meta-copy"><strong>Rubric also changed.</strong> Publishing applies the new rubric too.</p>
|
|
129
|
+
<% end %>
|
|
130
|
+
|
|
131
|
+
<div class="ck-actions">
|
|
132
|
+
<%= button_to "Discard", dismiss_suggestion_metric_path(@metric, draft_id: @suggestion_draft.id),
|
|
133
|
+
method: :delete, form_class: "inline-block",
|
|
134
|
+
class: ck_button_classes(:light, variant: :outline),
|
|
135
|
+
data: { turbo_confirm: "Drop this suggestion?" } %>
|
|
136
|
+
<%= button_to "Use this version", publish_draft_metric_path(@metric, draft_id: @suggestion_draft.id),
|
|
137
|
+
method: :post, form_class: "inline-block",
|
|
138
|
+
class: ck_button_classes(:dark) %>
|
|
139
|
+
</div>
|
|
140
|
+
</section>
|
|
141
|
+
<% end %>
|
|
142
|
+
|
|
66
143
|
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
67
144
|
<section class="ck-card ck-card--spaced">
|
|
68
145
|
<div class="ck-prompt-preview__header">
|
|
@@ -131,31 +208,6 @@
|
|
|
131
208
|
<% end %>
|
|
132
209
|
</section>
|
|
133
210
|
|
|
134
|
-
<% if @suggestion_drafts.any? %>
|
|
135
|
-
<section class="ck-card ck-card--spaced">
|
|
136
|
-
<div class="ck-prompt-preview__header">
|
|
137
|
-
<p class="ck-kicker">Suggested rewrites</p>
|
|
138
|
-
<span class="ck-chip"><%= @suggestion_drafts.size %> option<%= @suggestion_drafts.size == 1 ? "" : "s" %></span>
|
|
139
|
-
</div>
|
|
140
|
-
<p class="ck-meta-copy">The model wrote these alternate instructions based on the disagreements above. Pick one to make it the live judge — the previous version is kept in history.</p>
|
|
141
|
-
<div class="ck-suggestion-list">
|
|
142
|
-
<% @suggestion_drafts.each do |draft| %>
|
|
143
|
-
<article class="ck-suggestion-card">
|
|
144
|
-
<header class="ck-suggestion-card__header">
|
|
145
|
-
<span class="ck-chip ck-chip--soft">Option #<%= draft.id %></span>
|
|
146
|
-
<time class="ck-meta-copy" data-relative-time datetime="<%= draft.created_at.utc.iso8601 %>"><%= time_ago_in_words(draft.created_at) %> ago</time>
|
|
147
|
-
</header>
|
|
148
|
-
<pre class="ck-code ck-suggestion-card__instruction"><%= draft.instruction %></pre>
|
|
149
|
-
<div class="ck-actions">
|
|
150
|
-
<%= button_to "Use this version", publish_draft_metric_path(@metric, draft_id: draft.id),
|
|
151
|
-
method: :post, form_class: "inline-block",
|
|
152
|
-
class: ck_button_classes(:dark) %>
|
|
153
|
-
</div>
|
|
154
|
-
</article>
|
|
155
|
-
<% end %>
|
|
156
|
-
</div>
|
|
157
|
-
</section>
|
|
158
|
-
<% end %>
|
|
159
211
|
|
|
160
212
|
<% if Array(@metric.few_shot_examples).any? %>
|
|
161
213
|
<section class="ck-card ck-card--spaced">
|
|
@@ -112,9 +112,7 @@
|
|
|
112
112
|
</div>
|
|
113
113
|
</div>
|
|
114
114
|
<% if review.ai_feedback.present? %>
|
|
115
|
-
<
|
|
116
|
-
<div class="ck-note-box"><%= review.ai_feedback %></div>
|
|
117
|
-
</div>
|
|
115
|
+
<p class="ck-review-card__feedback"><%= review.ai_feedback %></p>
|
|
118
116
|
<% end %>
|
|
119
117
|
<% if CompletionKit.config.judge_calibration_enabled && review.metric && review.ai_score %>
|
|
120
118
|
<% existing = CompletionKit::Calibration.find_by(
|
data/config/routes.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: completion-kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.5.38
|
|
4
|
+
version: 0.5.40
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Damien Bastin
|
|
@@ -233,7 +233,7 @@ files:
|
|
|
233
233
|
- app/assets/images/completion_kit/favicon.ico
|
|
234
234
|
- app/assets/images/completion_kit/logo.png
|
|
235
235
|
- app/assets/javascripts/completion_kit/application.js
|
|
236
|
-
- app/assets/stylesheets/completion_kit/application.css.erb
|
|
236
|
+
- app/assets/stylesheets/completion_kit/application.css
|
|
237
237
|
- app/controllers/completion_kit/api/v1/base_controller.rb
|
|
238
238
|
- app/controllers/completion_kit/api/v1/calibrations_controller.rb
|
|
239
239
|
- app/controllers/completion_kit/api/v1/datasets_controller.rb
|