completion-kit 0.9.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/stylesheets/completion_kit/application.css +167 -48
- data/app/controllers/completion_kit/metrics_controller.rb +27 -14
- data/app/jobs/completion_kit/judge_review_job.rb +9 -4
- data/app/jobs/completion_kit/metric_suggestion_job.rb +46 -0
- data/app/models/completion_kit/metric_version.rb +1 -0
- data/app/services/completion_kit/judge_service.rb +19 -3
- data/app/services/completion_kit/metric_calibration_examples.rb +56 -0
- data/app/services/completion_kit/metric_improvement_validator.rb +101 -0
- data/app/services/completion_kit/metric_variant_generator.rb +0 -36
- data/app/views/completion_kit/calibrations/_trust_panel.html.erb +8 -12
- data/app/views/completion_kit/metrics/_guiding_examples.html.erb +23 -0
- data/app/views/completion_kit/metrics/_suggestion_failed.html.erb +3 -0
- data/app/views/completion_kit/metrics/_suggestion_pending.html.erb +3 -0
- data/app/views/completion_kit/metrics/_suggestion_ready.html.erb +4 -0
- data/app/views/completion_kit/metrics/_validation_scoreboard.html.erb +12 -0
- data/app/views/completion_kit/metrics/index.html.erb +5 -0
- data/app/views/completion_kit/metrics/show.html.erb +38 -30
- data/app/views/completion_kit/responses/show.html.erb +7 -5
- data/config/routes.rb +1 -0
- data/db/migrate/20260530000001_add_excluded_from_examples_to_completion_kit_calibrations.rb +5 -0
- data/db/migrate/20260531000001_add_validation_summary_to_completion_kit_metric_versions.rb +5 -0
- data/lib/completion_kit/version.rb +1 -1
- data/lib/completion_kit.rb +2 -0
- metadata +11 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 0b32ec77fb60d07f40e4b83827c2510aaeb695c96c9c6df86e4b42a7ec57516b
|
|
4
|
+
data.tar.gz: ade912039e4942c87d73c13443bd405533eec2988478e02cd1ccb87550de2783
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: bb8664ea804d59e3761ab385d1af98ecf7d110dd7e68e7003e1a4b2c059c5e377a5e42d350a46310f58c6d8c41c0e31a6aa0cdaf8a6b50b4d9f419e6fa60e474
|
|
7
|
+
data.tar.gz: 3bbe72cf7e99a4ae899765829ee8bee83703885ebdfa5b6b9f7253f25f2373b1dd22aadfeb071f4b340c1b7d33dfc4e590e4f8a2df2239afbbc305de009af2cf
|
|
@@ -686,16 +686,6 @@ tr:hover .ck-chip--publish {
|
|
|
686
686
|
justify-content: space-between;
|
|
687
687
|
gap: 10px;
|
|
688
688
|
}
|
|
689
|
-
.ck-version-state {
|
|
690
|
-
font-family: var(--ck-mono);
|
|
691
|
-
font-size: 0.66rem;
|
|
692
|
-
letter-spacing: 0.07em;
|
|
693
|
-
text-transform: uppercase;
|
|
694
|
-
color: var(--ck-dim);
|
|
695
|
-
}
|
|
696
|
-
.ck-version-state--live {
|
|
697
|
-
color: var(--ck-text);
|
|
698
|
-
}
|
|
699
689
|
|
|
700
690
|
.ck-chip--soft {
|
|
701
691
|
background: var(--ck-accent-soft);
|
|
@@ -2877,10 +2867,6 @@ select.ck-input {
|
|
|
2877
2867
|
line-height: 1.55;
|
|
2878
2868
|
}
|
|
2879
2869
|
|
|
2880
|
-
.ck-review-card--stale {
|
|
2881
|
-
border-left: 2px solid rgba(224, 164, 88, 0.45);
|
|
2882
|
-
}
|
|
2883
|
-
|
|
2884
2870
|
.ck-stale-versions-banner {
|
|
2885
2871
|
margin: 0 0 1rem;
|
|
2886
2872
|
padding: 0.9rem 1rem;
|
|
@@ -2908,12 +2894,6 @@ select.ck-input {
|
|
|
2908
2894
|
.ck-delta--zero { color: var(--ck-dim); }
|
|
2909
2895
|
|
|
2910
2896
|
.ck-run-compare-table td { vertical-align: middle; }
|
|
2911
|
-
.ck-review-card__stale-note {
|
|
2912
|
-
margin: 0.4rem 0 0;
|
|
2913
|
-
font-family: var(--ck-mono);
|
|
2914
|
-
font-size: 0.78rem;
|
|
2915
|
-
color: var(--ck-warning);
|
|
2916
|
-
}
|
|
2917
2897
|
|
|
2918
2898
|
@media (max-width: 900px) {
|
|
2919
2899
|
.ck-grid--sidebar,
|
|
@@ -3617,9 +3597,10 @@ select.ck-input {
|
|
|
3617
3597
|
}
|
|
3618
3598
|
|
|
3619
3599
|
.ck-metrics-table th:nth-child(1), .ck-metrics-table td:nth-child(1) { width: 18rem; white-space: normal; }
|
|
3620
|
-
.ck-metrics-table th:nth-child(2), .ck-metrics-table td:nth-child(2) { width:
|
|
3621
|
-
.ck-metrics-table th:nth-child(3), .ck-metrics-table td:nth-child(3) { width:
|
|
3622
|
-
.ck-metrics-table th:nth-child(4), .ck-metrics-table td:nth-child(4) { width:
|
|
3600
|
+
.ck-metrics-table th:nth-child(2), .ck-metrics-table td:nth-child(2) { width: 6rem; }
|
|
3601
|
+
.ck-metrics-table th:nth-child(3), .ck-metrics-table td:nth-child(3) { width: auto; }
|
|
3602
|
+
.ck-metrics-table th:nth-child(4), .ck-metrics-table td:nth-child(4) { width: 16rem; }
|
|
3603
|
+
.ck-metrics-table th:nth-child(5), .ck-metrics-table td:nth-child(5) { width: 3rem; }
|
|
3623
3604
|
.ck-metrics-table td:nth-child(1) strong { overflow-wrap: anywhere; }
|
|
3624
3605
|
|
|
3625
3606
|
.ck-datasets-table th:nth-child(1), .ck-datasets-table td:nth-child(1) { width: auto; }
|
|
@@ -3638,32 +3619,10 @@ select.ck-input {
|
|
|
3638
3619
|
.ck-prompt-versions-table th:nth-child(3), .ck-prompt-versions-table td:nth-child(3) { width: 8rem; white-space: nowrap; }
|
|
3639
3620
|
.ck-prompt-versions-table th:nth-child(4), .ck-prompt-versions-table td:nth-child(4) { width: auto; }
|
|
3640
3621
|
|
|
3641
|
-
.ck-metric-versions-table th:nth-child(1), .ck-metric-versions-table td:nth-child(1) { width:
|
|
3642
|
-
.ck-metric-versions-table th:nth-child(2), .ck-metric-versions-table td:nth-child(2) { width:
|
|
3643
|
-
.ck-metric-versions-table th:nth-child(3), .ck-metric-versions-table td:nth-child(3) { width:
|
|
3644
|
-
.ck-metric-versions-table th:nth-child(4), .ck-metric-versions-table td:nth-child(4) { width: 9rem; white-space: nowrap; }
|
|
3622
|
+
.ck-metric-versions-table th:nth-child(1), .ck-metric-versions-table td:nth-child(1) { width: 18rem; }
|
|
3623
|
+
.ck-metric-versions-table th:nth-child(2), .ck-metric-versions-table td:nth-child(2) { width: 16rem; white-space: nowrap; }
|
|
3624
|
+
.ck-metric-versions-table th:nth-child(3), .ck-metric-versions-table td:nth-child(3) { width: auto; white-space: nowrap; }
|
|
3645
3625
|
|
|
3646
|
-
.ck-change-link {
|
|
3647
|
-
background: none;
|
|
3648
|
-
border: 0;
|
|
3649
|
-
padding: 0;
|
|
3650
|
-
cursor: pointer;
|
|
3651
|
-
font-family: inherit;
|
|
3652
|
-
font-size: 0.86rem;
|
|
3653
|
-
text-align: left;
|
|
3654
|
-
color: var(--ck-text);
|
|
3655
|
-
}
|
|
3656
|
-
.ck-change-link:hover,
|
|
3657
|
-
.ck-change-link:focus-visible {
|
|
3658
|
-
color: var(--ck-accent);
|
|
3659
|
-
text-decoration: underline;
|
|
3660
|
-
}
|
|
3661
|
-
.ck-change-link--trivial {
|
|
3662
|
-
color: var(--ck-dim);
|
|
3663
|
-
}
|
|
3664
|
-
.ck-change-link--major {
|
|
3665
|
-
color: rgb(217, 119, 6);
|
|
3666
|
-
}
|
|
3667
3626
|
|
|
3668
3627
|
.ck-source-chip {
|
|
3669
3628
|
display: inline-block;
|
|
@@ -5632,6 +5591,11 @@ a.tag-mark {
|
|
|
5632
5591
|
.ck-trust-line__hint {
|
|
5633
5592
|
color: var(--ck-dim);
|
|
5634
5593
|
}
|
|
5594
|
+
.ck-trust-line__aside {
|
|
5595
|
+
margin: 4px 0 0;
|
|
5596
|
+
font-size: 0.78rem;
|
|
5597
|
+
color: var(--ck-muted);
|
|
5598
|
+
}
|
|
5635
5599
|
.ck-cal-stat {
|
|
5636
5600
|
display: inline-flex;
|
|
5637
5601
|
align-items: baseline;
|
|
@@ -5945,3 +5909,158 @@ a.tag-mark {
|
|
|
5945
5909
|
.ck-starter-actions .ck-button {
|
|
5946
5910
|
line-height: 1;
|
|
5947
5911
|
}
|
|
5912
|
+
|
|
5913
|
+
.ck-guiding {
|
|
5914
|
+
margin-top: 14px;
|
|
5915
|
+
padding-top: 12px;
|
|
5916
|
+
border-top: 1px solid var(--ck-line);
|
|
5917
|
+
}
|
|
5918
|
+
|
|
5919
|
+
.ck-guiding__head {
|
|
5920
|
+
display: flex;
|
|
5921
|
+
align-items: baseline;
|
|
5922
|
+
justify-content: space-between;
|
|
5923
|
+
gap: 12px;
|
|
5924
|
+
}
|
|
5925
|
+
|
|
5926
|
+
.ck-guiding__head .ck-kicker--inset {
|
|
5927
|
+
margin-top: 0;
|
|
5928
|
+
}
|
|
5929
|
+
|
|
5930
|
+
.ck-guiding__legend {
|
|
5931
|
+
font-family: var(--ck-mono);
|
|
5932
|
+
font-size: 0.64rem;
|
|
5933
|
+
letter-spacing: 0.09em;
|
|
5934
|
+
text-transform: uppercase;
|
|
5935
|
+
color: var(--ck-muted);
|
|
5936
|
+
}
|
|
5937
|
+
|
|
5938
|
+
.ck-guiding__list {
|
|
5939
|
+
list-style: none;
|
|
5940
|
+
margin: 8px -8px 0;
|
|
5941
|
+
padding: 0;
|
|
5942
|
+
display: flex;
|
|
5943
|
+
flex-direction: column;
|
|
5944
|
+
}
|
|
5945
|
+
|
|
5946
|
+
.ck-guiding__item {
|
|
5947
|
+
display: flex;
|
|
5948
|
+
align-items: center;
|
|
5949
|
+
gap: 12px;
|
|
5950
|
+
padding: 5px 8px;
|
|
5951
|
+
border-radius: 7px;
|
|
5952
|
+
transition: background 0.15s;
|
|
5953
|
+
}
|
|
5954
|
+
|
|
5955
|
+
.ck-guiding__item:hover {
|
|
5956
|
+
background: var(--ck-surface-hover);
|
|
5957
|
+
}
|
|
5958
|
+
|
|
5959
|
+
.ck-guiding__item:hover .ck-guiding__output {
|
|
5960
|
+
color: var(--ck-text);
|
|
5961
|
+
}
|
|
5962
|
+
|
|
5963
|
+
.ck-guiding__link {
|
|
5964
|
+
flex: 1;
|
|
5965
|
+
min-width: 0;
|
|
5966
|
+
display: flex;
|
|
5967
|
+
align-items: center;
|
|
5968
|
+
gap: 12px;
|
|
5969
|
+
text-decoration: none;
|
|
5970
|
+
color: inherit;
|
|
5971
|
+
}
|
|
5972
|
+
|
|
5973
|
+
.ck-guiding__output {
|
|
5974
|
+
flex: 1;
|
|
5975
|
+
min-width: 0;
|
|
5976
|
+
overflow: hidden;
|
|
5977
|
+
text-overflow: ellipsis;
|
|
5978
|
+
white-space: nowrap;
|
|
5979
|
+
color: var(--ck-dim);
|
|
5980
|
+
font-size: 0.86rem;
|
|
5981
|
+
}
|
|
5982
|
+
|
|
5983
|
+
.ck-guiding__scores {
|
|
5984
|
+
font-family: var(--ck-mono);
|
|
5985
|
+
font-size: 0.78rem;
|
|
5986
|
+
color: var(--ck-text);
|
|
5987
|
+
white-space: nowrap;
|
|
5988
|
+
}
|
|
5989
|
+
|
|
5990
|
+
.ck-guiding__judge {
|
|
5991
|
+
color: var(--ck-dim);
|
|
5992
|
+
}
|
|
5993
|
+
|
|
5994
|
+
.ck-guiding__human {
|
|
5995
|
+
color: var(--ck-text);
|
|
5996
|
+
font-weight: 600;
|
|
5997
|
+
}
|
|
5998
|
+
|
|
5999
|
+
.ck-guiding__item .ck-icon-btn {
|
|
6000
|
+
width: 2rem;
|
|
6001
|
+
height: 2rem;
|
|
6002
|
+
}
|
|
6003
|
+
|
|
6004
|
+
.ck-suggestion-status:empty { display: none; }
|
|
6005
|
+
.ck-suggestion-status {
|
|
6006
|
+
margin-top: 10px;
|
|
6007
|
+
display: flex;
|
|
6008
|
+
align-items: baseline;
|
|
6009
|
+
gap: 10px;
|
|
6010
|
+
flex-wrap: wrap;
|
|
6011
|
+
}
|
|
6012
|
+
|
|
6013
|
+
.ck-scoreboard {
|
|
6014
|
+
margin-bottom: 16px;
|
|
6015
|
+
padding-bottom: 14px;
|
|
6016
|
+
border-bottom: 1px solid var(--ck-line);
|
|
6017
|
+
}
|
|
6018
|
+
.ck-scoreboard__headline {
|
|
6019
|
+
margin: 0 0 8px;
|
|
6020
|
+
font-size: 0.95rem;
|
|
6021
|
+
color: var(--ck-text);
|
|
6022
|
+
}
|
|
6023
|
+
.ck-scoreboard__was {
|
|
6024
|
+
font-family: var(--ck-mono);
|
|
6025
|
+
font-size: 0.74rem;
|
|
6026
|
+
color: var(--ck-muted);
|
|
6027
|
+
margin-left: 6px;
|
|
6028
|
+
}
|
|
6029
|
+
.ck-scoreboard__tally {
|
|
6030
|
+
list-style: none;
|
|
6031
|
+
margin: 0;
|
|
6032
|
+
padding: 0;
|
|
6033
|
+
display: flex;
|
|
6034
|
+
gap: 18px;
|
|
6035
|
+
}
|
|
6036
|
+
.ck-scoreboard__stat {
|
|
6037
|
+
font-family: var(--ck-mono);
|
|
6038
|
+
font-size: 0.72rem;
|
|
6039
|
+
letter-spacing: 0.06em;
|
|
6040
|
+
text-transform: uppercase;
|
|
6041
|
+
color: var(--ck-muted);
|
|
6042
|
+
}
|
|
6043
|
+
.ck-scoreboard__stat strong { color: var(--ck-text); }
|
|
6044
|
+
.ck-scoreboard__stat--break strong { color: var(--ck-warning); }
|
|
6045
|
+
.ck-scoreboard__note {
|
|
6046
|
+
margin: 8px 0 0;
|
|
6047
|
+
font-size: 0.78rem;
|
|
6048
|
+
color: var(--ck-muted);
|
|
6049
|
+
}
|
|
6050
|
+
.ck-version-change {
|
|
6051
|
+
display: inline-flex;
|
|
6052
|
+
align-items: baseline;
|
|
6053
|
+
gap: 0.6rem;
|
|
6054
|
+
}
|
|
6055
|
+
.ck-version-score {
|
|
6056
|
+
font-family: var(--ck-mono);
|
|
6057
|
+
font-size: 0.74rem;
|
|
6058
|
+
color: var(--ck-dim);
|
|
6059
|
+
}
|
|
6060
|
+
.ck-version-score__label {
|
|
6061
|
+
font-size: 0.6rem;
|
|
6062
|
+
letter-spacing: 0.08em;
|
|
6063
|
+
text-transform: uppercase;
|
|
6064
|
+
color: var(--ck-muted);
|
|
6065
|
+
margin-right: 0.2rem;
|
|
6066
|
+
}
|
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
2
|
class MetricsController < ApplicationController
|
|
3
3
|
include CompletionKit::TagFiltering
|
|
4
|
-
before_action :set_metric, only: [:show, :edit, :update, :destroy, :publish_draft, :suggest_variants, :dismiss_suggestion]
|
|
4
|
+
before_action :set_metric, only: [:show, :edit, :update, :destroy, :publish_draft, :suggest_variants, :dismiss_suggestion, :exclude_example]
|
|
5
|
+
before_action :ensure_examples_from_reviews_enabled, only: [:exclude_example]
|
|
5
6
|
|
|
6
7
|
def index
|
|
7
8
|
@metrics = apply_tag_filter(Metric.includes(:metric_groups, :tags).order(:name))
|
|
8
9
|
@available_starters = StarterMetrics.available
|
|
10
|
+
@current_versions = MetricVersion.published.current.where(metric_id: @metrics.map(&:id)).index_by(&:metric_id)
|
|
9
11
|
end
|
|
10
12
|
|
|
11
13
|
def starter_preview
|
|
@@ -39,6 +41,7 @@ module CompletionKit
|
|
|
39
41
|
@suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
|
|
40
42
|
@improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
|
|
41
43
|
@versions = MetricVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
|
|
44
|
+
@guiding_examples = CompletionKit.config.judge_examples_from_reviews ? MetricCalibrationExamples.judge_examples_for(@metric) : []
|
|
42
45
|
end
|
|
43
46
|
|
|
44
47
|
def new
|
|
@@ -114,26 +117,22 @@ module CompletionKit
|
|
|
114
117
|
|
|
115
118
|
def suggest_variants
|
|
116
119
|
target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
|
|
117
|
-
|
|
118
|
-
if
|
|
120
|
+
counts = Calibration.where(metric_id: @metric.id, verdict: %w[agree disagree]).group(:verdict).count
|
|
121
|
+
if counts["disagree"].to_i.zero?
|
|
119
122
|
redirect_to target, alert: "Mark at least one case as Disagree before asking the model to suggest a change."
|
|
120
123
|
return
|
|
121
124
|
end
|
|
122
125
|
|
|
123
|
-
|
|
126
|
+
MetricSuggestionJob.perform_later(@metric.id)
|
|
124
127
|
|
|
125
|
-
generator = MetricVariantGenerator.new(@metric, count: 1)
|
|
126
|
-
variants = generator.call
|
|
127
|
-
if variants.empty?
|
|
128
|
-
redirect_to target, alert: "The model returned no usable variants. Try again with a different model."
|
|
129
|
-
return
|
|
130
|
-
end
|
|
131
|
-
versions = generator.persist!(variants)
|
|
132
|
-
new_version = versions.max_by(&:version_number)
|
|
133
128
|
if params[:back_to] == "edit"
|
|
134
|
-
redirect_to
|
|
129
|
+
redirect_to metric_path(@metric), notice: "Drafting a change from your reviews. It will appear here once it's tested."
|
|
135
130
|
else
|
|
136
|
-
|
|
131
|
+
render turbo_stream: turbo_stream.replace(
|
|
132
|
+
"ck-suggestion-status-#{@metric.id}",
|
|
133
|
+
partial: "completion_kit/metrics/suggestion_pending",
|
|
134
|
+
locals: { metric: @metric, count: counts.values.sum }
|
|
135
|
+
)
|
|
137
136
|
end
|
|
138
137
|
end
|
|
139
138
|
|
|
@@ -145,6 +144,16 @@ module CompletionKit
|
|
|
145
144
|
redirect_to target, notice: label ? "Discarded draft #{label}." : "Draft already gone."
|
|
146
145
|
end
|
|
147
146
|
|
|
147
|
+
def exclude_example
|
|
148
|
+
calibration = Calibration.where(metric_id: @metric.id).find(params[:calibration_id])
|
|
149
|
+
calibration.update!(excluded_from_examples: true)
|
|
150
|
+
render turbo_stream: turbo_stream.replace(
|
|
151
|
+
"ck-guiding-#{@metric.id}",
|
|
152
|
+
partial: "completion_kit/metrics/guiding_examples",
|
|
153
|
+
locals: { metric: @metric, examples: MetricCalibrationExamples.judge_examples_for(@metric) }
|
|
154
|
+
)
|
|
155
|
+
end
|
|
156
|
+
|
|
148
157
|
def publish_draft
|
|
149
158
|
scope = MetricVersion.where(metric_id: @metric.id)
|
|
150
159
|
version = if params[:draft_id].present?
|
|
@@ -176,6 +185,10 @@ module CompletionKit
|
|
|
176
185
|
|
|
177
186
|
private
|
|
178
187
|
|
|
188
|
+
def ensure_examples_from_reviews_enabled
|
|
189
|
+
head :not_found unless CompletionKit.config.judge_examples_from_reviews
|
|
190
|
+
end
|
|
191
|
+
|
|
179
192
|
def set_metric
|
|
180
193
|
@metric = Metric.find(params[:id])
|
|
181
194
|
end
|
|
@@ -58,7 +58,8 @@ module CompletionKit
|
|
|
58
58
|
run.prompt&.template,
|
|
59
59
|
criteria: metric.instruction.to_s,
|
|
60
60
|
rubric_text: metric.display_rubric_text,
|
|
61
|
-
input_data: response.input_data
|
|
61
|
+
input_data: response.input_data,
|
|
62
|
+
human_examples: review_examples_for(metric, response)
|
|
62
63
|
)
|
|
63
64
|
|
|
64
65
|
review = response.reviews.find_or_initialize_by(metric_id: metric.id)
|
|
@@ -80,9 +81,13 @@ module CompletionKit
|
|
|
80
81
|
|
|
81
82
|
private
|
|
82
83
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
84
|
+
def review_examples_for(metric, response)
|
|
85
|
+
return nil unless CompletionKit.config.judge_calibration_enabled
|
|
86
|
+
return nil unless CompletionKit.config.judge_examples_from_reviews
|
|
87
|
+
|
|
88
|
+
MetricCalibrationExamples.judge_examples_for(metric, exclude_response_id: response.id)
|
|
89
|
+
end
|
|
90
|
+
|
|
86
91
|
def confirm_judging_capability(judge_model_id)
|
|
87
92
|
model = Model.find_by(provider: ApiConfig.provider_for_model(judge_model_id), model_id: judge_model_id)
|
|
88
93
|
return unless model && model.supports_judging.nil?
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
require "faraday"
|
|
2
|
+
|
|
3
|
+
module CompletionKit
|
|
4
|
+
class MetricSuggestionJob < ApplicationJob
|
|
5
|
+
queue_as :llm
|
|
6
|
+
|
|
7
|
+
retry_on Faraday::TimeoutError, Faraday::ConnectionFailed, wait: :polynomially_longer, attempts: 5
|
|
8
|
+
retry_on CompletionKit::RateLimitError, wait: :polynomially_longer, attempts: 5
|
|
9
|
+
|
|
10
|
+
rescue_from(StandardError) do |error|
|
|
11
|
+
Rails.error.report(error, handled: true, context: { job: self.class.name })
|
|
12
|
+
broadcast_status(@metric, partial: "completion_kit/metrics/suggestion_failed", locals: { metric: @metric })
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
def perform(metric_id)
|
|
16
|
+
@metric = Metric.find_by(id: metric_id)
|
|
17
|
+
return unless @metric
|
|
18
|
+
|
|
19
|
+
MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
|
|
20
|
+
|
|
21
|
+
generator = MetricVariantGenerator.new(@metric, count: 1)
|
|
22
|
+
variants = generator.call
|
|
23
|
+
if variants.empty?
|
|
24
|
+
broadcast_status(@metric, partial: "completion_kit/metrics/suggestion_failed", locals: { metric: @metric })
|
|
25
|
+
return
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
draft = generator.persist!(variants).max_by(&:version_number)
|
|
29
|
+
summary = MetricImprovementValidator.new(@metric, draft).call
|
|
30
|
+
draft.update!(validation_summary: summary)
|
|
31
|
+
|
|
32
|
+
broadcast_status(@metric, partial: "completion_kit/metrics/suggestion_ready", locals: { metric: @metric, draft: draft })
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
private
|
|
36
|
+
|
|
37
|
+
def broadcast_status(metric, partial:, locals:)
|
|
38
|
+
html = CompletionKit::ApplicationController.render(partial: partial, locals: locals)
|
|
39
|
+
Turbo::StreamsChannel.broadcast_replace_to(
|
|
40
|
+
"metric_#{metric.id}_suggestion",
|
|
41
|
+
target: "ck-suggestion-status-#{metric.id}",
|
|
42
|
+
html: html
|
|
43
|
+
)
|
|
44
|
+
end
|
|
45
|
+
end
|
|
46
|
+
end
|
|
@@ -10,13 +10,14 @@ module CompletionKit
|
|
|
10
10
|
@judge_client = LlmClient.for_model(@judge_model, ApiConfig.for_model(@judge_model))
|
|
11
11
|
end
|
|
12
12
|
|
|
13
|
-
def evaluate(output, expected_output = nil, prompt = nil, criteria: nil, rubric_text: nil, input_data: nil, **_extras)
|
|
13
|
+
def evaluate(output, expected_output = nil, prompt = nil, criteria: nil, rubric_text: nil, input_data: nil, human_examples: nil, **_extras)
|
|
14
14
|
raise CompletionKit::ConfigurationError, "Judge not configured" unless @judge_client.configured?
|
|
15
15
|
|
|
16
16
|
judge_prompt = build_judge_prompt(output, expected_output, prompt,
|
|
17
17
|
criteria: criteria,
|
|
18
18
|
rubric_text: rubric_text,
|
|
19
|
-
input_data: input_data
|
|
19
|
+
input_data: input_data,
|
|
20
|
+
human_examples: human_examples)
|
|
20
21
|
|
|
21
22
|
response = @judge_client.generate_completion(judge_prompt, model: @judge_model)
|
|
22
23
|
raise StandardError, response if response.start_with?("Error:")
|
|
@@ -25,7 +26,7 @@ module CompletionKit
|
|
|
25
26
|
|
|
26
27
|
private
|
|
27
28
|
|
|
28
|
-
def build_judge_prompt(output, expected_output, prompt, criteria: nil, rubric_text: nil, input_data: nil)
|
|
29
|
+
def build_judge_prompt(output, expected_output, prompt, criteria: nil, rubric_text: nil, input_data: nil, human_examples: nil)
|
|
29
30
|
judge_prompt = <<~PROMPT
|
|
30
31
|
You are an expert evaluator. You MUST respond with ONLY two lines in this exact format, nothing else:
|
|
31
32
|
|
|
@@ -42,6 +43,8 @@ module CompletionKit
|
|
|
42
43
|
judge_prompt += "\nCriteria: #{criteria}\n"
|
|
43
44
|
end
|
|
44
45
|
|
|
46
|
+
judge_prompt += human_examples_block(human_examples)
|
|
47
|
+
|
|
45
48
|
judge_prompt += <<~PROMPT
|
|
46
49
|
|
|
47
50
|
Original prompt: #{prompt || "Not provided"}
|
|
@@ -53,6 +56,19 @@ module CompletionKit
|
|
|
53
56
|
judge_prompt
|
|
54
57
|
end
|
|
55
58
|
|
|
59
|
+
def human_examples_block(examples)
|
|
60
|
+
return "" if examples.blank?
|
|
61
|
+
|
|
62
|
+
lines = ["", "Reviewed examples where a human corrected the judge on this metric. Weigh them when scoring:"]
|
|
63
|
+
examples.each_with_index do |example, index|
|
|
64
|
+
note = example[:human_note].to_s
|
|
65
|
+
line = "Example #{index + 1}: Output: #{example[:output].to_s.truncate(200)}. The judge scored this #{example[:judge_score].to_i}/5. A reviewer corrected it to #{example[:human_score].to_i}/5"
|
|
66
|
+
line += note.present? ? ": #{note.truncate(160)}" : "."
|
|
67
|
+
lines << line
|
|
68
|
+
end
|
|
69
|
+
lines.join("\n") + "\n"
|
|
70
|
+
end
|
|
71
|
+
|
|
56
72
|
def parse_judge_response(response)
|
|
57
73
|
score_match = response.match(/\*{0,2}Score:?\*{0,2}\s*(\d+(?:\.\d+)?)/i)
|
|
58
74
|
feedback_match = response.match(/\*{0,2}Feedback:?\*{0,2}\s*(.+)/mi)
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
module CompletionKit
|
|
2
|
+
module MetricCalibrationExamples
|
|
3
|
+
DEFAULT_JUDGE_EXAMPLE_LIMIT = 5
|
|
4
|
+
|
|
5
|
+
module_function
|
|
6
|
+
|
|
7
|
+
def for(metric, limit: 8)
|
|
8
|
+
disagreements_for(metric, limit: limit)
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def disagreements_for(metric, limit: 8)
|
|
12
|
+
calibrations_for(metric, verdict: "disagree", limit: limit)
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
def borderlines_for(metric, limit: 6)
|
|
16
|
+
calibrations_for(metric, verdict: "borderline", limit: limit)
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
def judge_examples_for(metric, exclude_response_id: nil, limit: DEFAULT_JUDGE_EXAMPLE_LIMIT)
|
|
20
|
+
current_version = MetricVersion.current.find_by(metric_id: metric.id)
|
|
21
|
+
return [] unless current_version
|
|
22
|
+
|
|
23
|
+
relation = Calibration
|
|
24
|
+
.where(metric_id: metric.id, metric_version_id: current_version.id, excluded_from_examples: false)
|
|
25
|
+
.where.not(corrected_score: nil)
|
|
26
|
+
relation = relation.where.not(response_id: exclude_response_id) if exclude_response_id
|
|
27
|
+
map_examples(relation.includes(response: :reviews).order(created_at: :desc).limit(limit), metric)
|
|
28
|
+
.reject { |example| example[:judge_score].nil? }
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
def calibrations_for(metric, verdict:, limit:)
|
|
32
|
+
base = Calibration.where(metric_id: metric.id, verdict: verdict)
|
|
33
|
+
current_version = MetricVersion.current.find_by(metric_id: metric.id)
|
|
34
|
+
scoped = current_version ? base.where(metric_version_id: current_version.id) : base
|
|
35
|
+
effective = scoped.exists? ? scoped : base
|
|
36
|
+
map_examples(effective.includes(response: :reviews).order(created_at: :desc).limit(limit), metric)
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
def map_examples(relation, metric)
|
|
40
|
+
relation.map do |cal|
|
|
41
|
+
review = cal.response.reviews.find { |r| r.metric_id == metric.id }
|
|
42
|
+
{
|
|
43
|
+
id: cal.id,
|
|
44
|
+
run_id: cal.run_id,
|
|
45
|
+
response_id: cal.response_id,
|
|
46
|
+
input: cal.response.input_data,
|
|
47
|
+
output: cal.response.response_text,
|
|
48
|
+
judge_score: review&.ai_score,
|
|
49
|
+
judge_feedback: review&.ai_feedback,
|
|
50
|
+
human_score: cal.corrected_score,
|
|
51
|
+
human_note: cal.note
|
|
52
|
+
}
|
|
53
|
+
end
|
|
54
|
+
end
|
|
55
|
+
end
|
|
56
|
+
end
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
module CompletionKit
|
|
2
|
+
class MetricImprovementValidator
|
|
3
|
+
ANSWER_KEY_LIMIT = 30
|
|
4
|
+
|
|
5
|
+
def initialize(metric, candidate, scorer: nil)
|
|
6
|
+
@metric = metric
|
|
7
|
+
@candidate = candidate
|
|
8
|
+
@scorer = scorer || method(:rescore)
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def call
|
|
12
|
+
key = answer_key
|
|
13
|
+
rows = []
|
|
14
|
+
key.each do |entry|
|
|
15
|
+
begin
|
|
16
|
+
score = @scorer.call(entry[:response], @candidate)
|
|
17
|
+
rescue StandardError
|
|
18
|
+
next
|
|
19
|
+
end
|
|
20
|
+
rows << classify(entry, score.to_i)
|
|
21
|
+
end
|
|
22
|
+
summarize(rows, key.size, key_capped?)
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
private
|
|
26
|
+
|
|
27
|
+
def answer_key
|
|
28
|
+
current = MetricVersion.current.find_by(metric_id: @metric.id)
|
|
29
|
+
return [] unless current
|
|
30
|
+
|
|
31
|
+
base = Calibration.where(metric_id: @metric.id, metric_version_id: current.id, verdict: %w[agree disagree])
|
|
32
|
+
@key_size_before_cap = base.count
|
|
33
|
+
base.includes(response: :reviews)
|
|
34
|
+
.order(created_at: :desc)
|
|
35
|
+
.limit(ANSWER_KEY_LIMIT)
|
|
36
|
+
.filter_map do |cal|
|
|
37
|
+
response = cal.response
|
|
38
|
+
next unless response.response_text.present?
|
|
39
|
+
review = response.reviews.find { |r| r.metric_id == @metric.id }
|
|
40
|
+
position = cal.verdict == "disagree" ? cal.corrected_score : review&.ai_score
|
|
41
|
+
next if position.nil?
|
|
42
|
+
{ response: response, verdict: cal.verdict, position: position }
|
|
43
|
+
end
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
def key_capped?
|
|
47
|
+
@key_size_before_cap.to_i > ANSWER_KEY_LIMIT
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
def classify(entry, candidate_score)
|
|
51
|
+
matched = candidate_score == entry[:position].to_i
|
|
52
|
+
outcome = if entry[:verdict] == "disagree"
|
|
53
|
+
matched ? "fix" : "still_off"
|
|
54
|
+
else
|
|
55
|
+
matched ? "keep" : "break"
|
|
56
|
+
end
|
|
57
|
+
{
|
|
58
|
+
"response_id" => entry[:response].id,
|
|
59
|
+
"verdict" => entry[:verdict],
|
|
60
|
+
"position" => entry[:position].to_i,
|
|
61
|
+
"candidate_score" => candidate_score,
|
|
62
|
+
"outcome" => outcome
|
|
63
|
+
}
|
|
64
|
+
end
|
|
65
|
+
|
|
66
|
+
def summarize(rows, total, capped)
|
|
67
|
+
fixes = rows.count { |r| r["outcome"] == "fix" }
|
|
68
|
+
keeps = rows.count { |r| r["outcome"] == "keep" }
|
|
69
|
+
breaks = rows.count { |r| r["outcome"] == "break" }
|
|
70
|
+
still_off = rows.count { |r| r["outcome"] == "still_off" }
|
|
71
|
+
agreements = rows.count { |r| r["verdict"] == "agree" }
|
|
72
|
+
{
|
|
73
|
+
"total" => total,
|
|
74
|
+
"tested" => rows.size,
|
|
75
|
+
"capped" => capped,
|
|
76
|
+
"fixes" => fixes,
|
|
77
|
+
"keeps" => keeps,
|
|
78
|
+
"breaks" => breaks,
|
|
79
|
+
"still_off" => still_off,
|
|
80
|
+
"before" => agreements,
|
|
81
|
+
"after" => fixes + keeps,
|
|
82
|
+
"rows" => rows
|
|
83
|
+
}
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
def rescore(response, candidate)
|
|
87
|
+
run = response.run
|
|
88
|
+
config = ApiConfig.for_model(run.judge_model).merge(judge_model: run.judge_model)
|
|
89
|
+
rubric_text = Metric.rubric_text_for(Metric.normalize_rubric_bands(candidate.rubric_bands))
|
|
90
|
+
result = JudgeService.new(config).evaluate(
|
|
91
|
+
response.response_text,
|
|
92
|
+
response.expected_output,
|
|
93
|
+
run.prompt&.template,
|
|
94
|
+
criteria: candidate.instruction.to_s,
|
|
95
|
+
rubric_text: rubric_text,
|
|
96
|
+
input_data: response.input_data
|
|
97
|
+
)
|
|
98
|
+
result[:score]
|
|
99
|
+
end
|
|
100
|
+
end
|
|
101
|
+
end
|
|
@@ -117,40 +117,4 @@ module CompletionKit
|
|
|
117
117
|
end
|
|
118
118
|
end
|
|
119
119
|
|
|
120
|
-
module MetricCalibrationExamples
|
|
121
|
-
module_function
|
|
122
|
-
|
|
123
|
-
def for(metric, limit: 8)
|
|
124
|
-
disagreements_for(metric, limit: limit)
|
|
125
|
-
end
|
|
126
|
-
|
|
127
|
-
def disagreements_for(metric, limit: 8)
|
|
128
|
-
calibrations_for(metric, verdict: "disagree", limit: limit)
|
|
129
|
-
end
|
|
130
|
-
|
|
131
|
-
def borderlines_for(metric, limit: 6)
|
|
132
|
-
calibrations_for(metric, verdict: "borderline", limit: limit)
|
|
133
|
-
end
|
|
134
|
-
|
|
135
|
-
def calibrations_for(metric, verdict:, limit:)
|
|
136
|
-
base = Calibration.where(metric_id: metric.id, verdict: verdict)
|
|
137
|
-
current_version = MetricVersion.current.find_by(metric_id: metric.id)
|
|
138
|
-
scoped = current_version ? base.where(metric_version_id: current_version.id) : base
|
|
139
|
-
effective = scoped.exists? ? scoped : base
|
|
140
|
-
effective.includes(response: :reviews)
|
|
141
|
-
.order(created_at: :desc)
|
|
142
|
-
.limit(limit)
|
|
143
|
-
.map do |cal|
|
|
144
|
-
review = cal.response.reviews.find { |r| r.metric_id == metric.id }
|
|
145
|
-
{
|
|
146
|
-
input: cal.response.input_data,
|
|
147
|
-
output: cal.response.response_text,
|
|
148
|
-
judge_score: review&.ai_score,
|
|
149
|
-
judge_feedback: review&.ai_feedback,
|
|
150
|
-
human_score: cal.corrected_score,
|
|
151
|
-
human_note: cal.note
|
|
152
|
-
}
|
|
153
|
-
end
|
|
154
|
-
end
|
|
155
|
-
end
|
|
156
120
|
end
|
|
@@ -2,15 +2,11 @@
|
|
|
2
2
|
<% metric = local_assigns[:metric] %>
|
|
3
3
|
<% anchor = metric&.name&.parameterize %>
|
|
4
4
|
<% current_metric_version = metric && CompletionKit::MetricVersion.current.find_by(metric_id: metric.id) %>
|
|
5
|
-
<% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric
|
|
5
|
+
<% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric && current_metric_version
|
|
6
6
|
created_by = CompletionKit.config.username.presence || "operator"
|
|
7
|
-
verdicted_ids =
|
|
8
|
-
CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
|
|
9
|
-
else
|
|
10
|
-
[]
|
|
11
|
-
end
|
|
7
|
+
verdicted_ids = CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
|
|
12
8
|
CompletionKit::Response.joins(:reviews)
|
|
13
|
-
.where(reviews: { metric_id: metric.id })
|
|
9
|
+
.where(reviews: { metric_id: metric.id, metric_version_id: current_metric_version.id })
|
|
14
10
|
.where.not(reviews: { ai_score: nil })
|
|
15
11
|
.where.not(id: verdicted_ids)
|
|
16
12
|
.order(created_at: :desc).first
|
|
@@ -24,7 +20,7 @@
|
|
|
24
20
|
<p class="ck-trust-line ck-trust-line--<%= stats.gate %>">
|
|
25
21
|
<% if stats.sample_size.zero? %>
|
|
26
22
|
<span class="ck-trust-line__lead">Not measured yet.</span>
|
|
27
|
-
<span class="ck-trust-line__hint"
|
|
23
|
+
<span class="ck-trust-line__hint"><%= current_metric_version ? "#{current_metric_version.version_label} needs" : "Needs" %> <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> human reviews of the judge's scores.</span>
|
|
28
24
|
<% if target_response %>
|
|
29
25
|
<%= link_to "Review a judge's score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
|
|
30
26
|
<% end %>
|
|
@@ -35,13 +31,13 @@
|
|
|
35
31
|
<%= link_to "Review another score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
|
|
36
32
|
<% end %>
|
|
37
33
|
<% else %>
|
|
38
|
-
<span class="ck-cal-stat"><span class="ck-cal-stat__label">
|
|
39
|
-
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Margin</span> ±<%= (stats.margin * 100).round %> pt</span>
|
|
40
|
-
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Read</span> <%= stats.firm? ? "settled" : "early" %></span>
|
|
41
|
-
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Sample</span> <%= stats.sample_size %></span>
|
|
34
|
+
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Agrees with you</span> <strong class="ck-trust-line__figure">~<%= (stats.agreement_point * 100).round %>%</strong> of <%= stats.sample_size %> reviews</span>
|
|
42
35
|
<% if stats.borderline_rate && stats.borderline_rate > 0 %>
|
|
43
36
|
<% level = stats.borderline_rate > 0.30 ? "danger" : stats.borderline_rate > 0.15 ? "warning" : "ok" %>
|
|
44
37
|
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Unclear</span> <span class="ck-trust-line__borderline ck-trust-line__borderline--<%= level %>"><%= (stats.borderline_rate * 100).round %>%</span></span>
|
|
45
38
|
<% end %>
|
|
46
39
|
<% end %>
|
|
47
40
|
</p>
|
|
41
|
+
<% if stats.sample_size.zero? && prior_version_verdicts > 0 %>
|
|
42
|
+
<p class="ck-trust-line__aside"><%= pluralize(prior_version_verdicts, "review") %> from an earlier version <%= prior_version_verdicts == 1 ? "doesn't" : "don't" %> count toward this version.</p>
|
|
43
|
+
<% end %>
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
<div id="ck-guiding-<%= metric.id %>" class="ck-guiding">
|
|
2
|
+
<% if examples.any? %>
|
|
3
|
+
<div class="ck-guiding__head">
|
|
4
|
+
<p class="ck-kicker ck-kicker--inset">Guiding the judge</p>
|
|
5
|
+
<span class="ck-guiding__legend">Judge → Human</span>
|
|
6
|
+
</div>
|
|
7
|
+
<ul class="ck-guiding__list">
|
|
8
|
+
<% examples.each do |example| %>
|
|
9
|
+
<li class="ck-guiding__item">
|
|
10
|
+
<%= link_to run_response_path(example[:run_id], example[:response_id], anchor: metric.name.parameterize),
|
|
11
|
+
class: "ck-guiding__link", title: "Open this review" do %>
|
|
12
|
+
<span class="ck-guiding__output"><%= truncate(example[:output].to_s, length: 90) %></span>
|
|
13
|
+
<span class="ck-guiding__scores"><span class="ck-guiding__judge"><%= example[:judge_score].to_i %></span> → <span class="ck-guiding__human"><%= example[:human_score].to_i %></span></span>
|
|
14
|
+
<% end %>
|
|
15
|
+
<%= button_to exclude_example_metric_path(metric, calibration_id: example[:id]),
|
|
16
|
+
method: :post, form_class: "inline-block", class: "ck-icon-btn",
|
|
17
|
+
title: "Stop using this case", "aria-label": "Stop using this case",
|
|
18
|
+
data: { turbo_confirm: "Stop using this corrected case to guide the judge?" } do %><%= heroicon_tag "x-mark", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
|
|
19
|
+
</li>
|
|
20
|
+
<% end %>
|
|
21
|
+
</ul>
|
|
22
|
+
<% end %>
|
|
23
|
+
</div>
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
<div id="ck-suggestion-status-<%= metric.id %>" class="ck-suggestion-status ck-suggestion-status--ready">
|
|
2
|
+
<span class="ck-cal-foot__note">Drafted <%= draft.version_label %> and tested it against your reviews.</span>
|
|
3
|
+
<%= link_to "Compare and publish →", CompletionKit::Engine.routes.url_helpers.metric_path(metric, show_change: draft.id), class: "ck-cal-link" %>
|
|
4
|
+
</div>
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
<% s = summary %>
|
|
2
|
+
<div class="ck-scoreboard">
|
|
3
|
+
<p class="ck-scoreboard__headline">Matches you on <strong><%= s["after"] %> of <%= s["tested"] %></strong> of your reviews <span class="ck-scoreboard__was">was <%= s["before"] %> of <%= s["tested"] %></span></p>
|
|
4
|
+
<ul class="ck-scoreboard__tally">
|
|
5
|
+
<li class="ck-scoreboard__stat ck-scoreboard__stat--fix">Fixes <strong><%= s["fixes"] %></strong></li>
|
|
6
|
+
<li class="ck-scoreboard__stat ck-scoreboard__stat--keep">Keeps <strong><%= s["keeps"] %></strong></li>
|
|
7
|
+
<li class="ck-scoreboard__stat ck-scoreboard__stat--break">Breaks <strong><%= s["breaks"] %></strong></li>
|
|
8
|
+
</ul>
|
|
9
|
+
<% if s["capped"] %>
|
|
10
|
+
<p class="ck-scoreboard__note">Tested against your 30 most recent reviews.</p>
|
|
11
|
+
<% end %>
|
|
12
|
+
</div>
|
|
@@ -18,6 +18,7 @@
|
|
|
18
18
|
<thead>
|
|
19
19
|
<tr>
|
|
20
20
|
<th scope="col">Name</th>
|
|
21
|
+
<th scope="col">Version</th>
|
|
21
22
|
<th scope="col">Instruction</th>
|
|
22
23
|
<th scope="col">In groups</th>
|
|
23
24
|
<th scope="col"></th>
|
|
@@ -34,6 +35,10 @@
|
|
|
34
35
|
</div>
|
|
35
36
|
<% end %>
|
|
36
37
|
</td>
|
|
38
|
+
<td data-label="Version">
|
|
39
|
+
<% v = @current_versions[metric.id] %>
|
|
40
|
+
<span class="ck-chip ck-chip--soft"><%= v ? v.version_label : "v1" %></span>
|
|
41
|
+
</td>
|
|
37
42
|
<td data-label="Instruction" class="ck-meta-copy"><div class="ck-clamp-2"><%= metric.instruction.presence || "—" %></div></td>
|
|
38
43
|
<td data-label="In groups">
|
|
39
44
|
<% groups = metric.metric_groups %>
|
|
@@ -50,7 +50,6 @@
|
|
|
50
50
|
<thead>
|
|
51
51
|
<tr>
|
|
52
52
|
<th scope="col">Version</th>
|
|
53
|
-
<th scope="col">Δ Change</th>
|
|
54
53
|
<th scope="col">Source</th>
|
|
55
54
|
<th scope="col">Created</th>
|
|
56
55
|
</tr>
|
|
@@ -60,39 +59,37 @@
|
|
|
60
59
|
<% pred = predecessor_of[v] %>
|
|
61
60
|
<tr>
|
|
62
61
|
<td>
|
|
62
|
+
<% summary = v.change_summary_against(pred) %>
|
|
63
63
|
<div class="ck-version-cell">
|
|
64
64
|
<div class="ck-version-cell__label">
|
|
65
65
|
<strong><%= v.version_label %></strong>
|
|
66
66
|
<% if v.current? %>
|
|
67
|
-
<span class="ck-
|
|
67
|
+
<span class="ck-chip">Published</span>
|
|
68
68
|
<% elsif v.draft? %>
|
|
69
|
-
<span class="ck-version-state">Draft</span>
|
|
70
69
|
<%= button_to "Publish", publish_draft_metric_path(@metric, draft_id: v.id),
|
|
71
70
|
method: :post, form_class: "inline-block",
|
|
72
|
-
class: "ck-chip ck-chip--
|
|
71
|
+
class: "ck-chip ck-chip--publish" %>
|
|
73
72
|
<% else %>
|
|
74
|
-
<span class="ck-version-state">Past</span>
|
|
75
73
|
<%= button_to "Make current", publish_draft_metric_path(@metric, draft_id: v.id),
|
|
76
74
|
method: :post, form_class: "inline-block",
|
|
77
75
|
class: "ck-chip ck-chip--publish",
|
|
78
76
|
data: { turbo_confirm: "Make #{v.version_label} the version to use? #{v.version_label} will be used in test runs using this metric now. Reviews you have already given stay with the version they were made against." } %>
|
|
79
77
|
<% end %>
|
|
80
78
|
</div>
|
|
79
|
+
<% vs = v.validation_summary %>
|
|
80
|
+
<% if summary %>
|
|
81
|
+
<div class="ck-version-change">
|
|
82
|
+
<% if v.draft? && vs.present? %>
|
|
83
|
+
<span class="ck-version-score"><span class="ck-version-score__label">Match</span> <%= vs["after"] %>/<%= vs["tested"] %></span>
|
|
84
|
+
<% end %>
|
|
85
|
+
<button type="button" class="ck-cell-link ck-cell-link--delta" title="What changed from <%= pred.version_label %>" onclick="document.getElementById('ck-mvdiff-<%= v.id %>').showModal()">Δ</button>
|
|
86
|
+
</div>
|
|
87
|
+
<% end %>
|
|
81
88
|
</div>
|
|
82
89
|
</td>
|
|
83
|
-
<td>
|
|
84
|
-
<% summary = v.change_summary_against(pred) %>
|
|
85
|
-
<% if summary %>
|
|
86
|
-
<button type="button" class="ck-change-link ck-change-link--<%= summary[:magnitude] %>"
|
|
87
|
-
title="Compare with <%= pred.version_label %>"
|
|
88
|
-
onclick="document.getElementById('ck-mvdiff-<%= v.id %>').showModal()"><%= summary[:label] %></button>
|
|
89
|
-
<% else %>
|
|
90
|
-
<span class="ck-meta-copy">—</span>
|
|
91
|
-
<% end %>
|
|
92
|
-
</td>
|
|
93
90
|
<td>
|
|
94
91
|
<% source_label, source_class = case v.source
|
|
95
|
-
when "suggestion" then ["AI
|
|
92
|
+
when "suggestion" then ["AI suggestion", "ck-source-chip ck-source-chip--ai"]
|
|
96
93
|
when "edit" then ["Manual edit", "ck-source-chip ck-source-chip--manual"]
|
|
97
94
|
when "revert" then ["Reverted", "ck-source-chip ck-source-chip--revert"]
|
|
98
95
|
else ["Original", "ck-source-chip ck-source-chip--initial"]
|
|
@@ -119,6 +116,7 @@
|
|
|
119
116
|
<% @versions.each do |v| %>
|
|
120
117
|
<% pred = predecessor_of[v] %>
|
|
121
118
|
<% next unless v.change_summary_against(pred) %>
|
|
119
|
+
<% vs = v.validation_summary %>
|
|
122
120
|
<dialog id="ck-mvdiff-<%= v.id %>" class="ck-modal" onclick="if(event.target===this)this.close()">
|
|
123
121
|
<article class="ck-modal__panel" tabindex="-1" onclick="event.stopPropagation()">
|
|
124
122
|
<header class="ck-modal__header">
|
|
@@ -129,6 +127,9 @@
|
|
|
129
127
|
<button type="button" class="ck-modal__close" aria-label="Close" onclick="this.closest('dialog').close()">×</button>
|
|
130
128
|
</header>
|
|
131
129
|
<div class="ck-modal__body">
|
|
130
|
+
<% if v.draft? && vs.present? %>
|
|
131
|
+
<%= render "completion_kit/metrics/validation_scoreboard", summary: vs %>
|
|
132
|
+
<% end %>
|
|
132
133
|
<% if pred.instruction.to_s != v.instruction.to_s %>
|
|
133
134
|
<div class="ck-suggest-diff">
|
|
134
135
|
<div class="ck-suggest-diff__pane">
|
|
@@ -161,8 +162,10 @@
|
|
|
161
162
|
title: "Discard draft #{v.version_label}", "aria-label": "Discard draft #{v.version_label}",
|
|
162
163
|
data: { turbo_confirm: "Discard draft #{v.version_label}? This can't be undone." } do %><%= heroicon_tag "trash", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
|
|
163
164
|
<%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
|
|
165
|
+
<% net_negative = vs.present? && (vs["after"].to_i < vs["before"].to_i || vs["breaks"].to_i > vs["fixes"].to_i) %>
|
|
164
166
|
<%= button_to "Publish #{v.version_label} →", publish_draft_metric_path(@metric, draft_id: v.id),
|
|
165
|
-
method: :post, form_class: "inline-block", class: ck_button_classes(:dark)
|
|
167
|
+
method: :post, form_class: "inline-block", class: ck_button_classes(:dark),
|
|
168
|
+
data: net_negative ? { turbo_confirm: "This agrees with you less than the current version. Publish anyway?" } : {} %>
|
|
166
169
|
</span>
|
|
167
170
|
<% else %>
|
|
168
171
|
<span class="ck-modal__foot-note">Roll this metric back to this version.</span>
|
|
@@ -177,20 +180,11 @@
|
|
|
177
180
|
<% end %>
|
|
178
181
|
|
|
179
182
|
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
183
|
+
<% draft = @suggestion_draft || @edit_draft %>
|
|
180
184
|
<section class="ck-card ck-card--spaced">
|
|
181
|
-
<
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
stats: CompletionKit::MetricCalibrationStats.for(@metric),
|
|
185
|
-
metric: @metric %>
|
|
186
|
-
<% draft = @suggestion_draft || @edit_draft %>
|
|
187
|
-
<% if draft %>
|
|
188
|
-
<div class="ck-cal-foot">
|
|
189
|
-
<span class="ck-cal-foot__note">A draft improvement (<%= draft.version_label %>) is waiting in the Versions table above. Open its change to compare, then Publish to use it.</span>
|
|
190
|
-
</div>
|
|
191
|
-
<% elsif @improve_disagreement_count.positive? %>
|
|
192
|
-
<div class="ck-cal-foot">
|
|
193
|
-
<span class="ck-cal-foot__note"><%= pluralize(@improve_disagreement_count, "case") %> where a reviewer's score didn't match the judge.</span>
|
|
185
|
+
<div class="ck-prompt-preview__header">
|
|
186
|
+
<p class="ck-kicker">Agreement</p>
|
|
187
|
+
<% if draft.nil? && @improve_disagreement_count.positive? %>
|
|
194
188
|
<%= button_to suggest_variants_metric_path(@metric),
|
|
195
189
|
method: :post, form_class: "inline-block",
|
|
196
190
|
class: ck_button_classes(:light, variant: :outline) + " ck-button--sm",
|
|
@@ -198,6 +192,20 @@
|
|
|
198
192
|
<%= heroicon_tag "sparkles", variant: :outline, class: "ck-magic-icon", "aria-hidden": "true" %>
|
|
199
193
|
Suggest improvements
|
|
200
194
|
<% end %>
|
|
195
|
+
<% end %>
|
|
196
|
+
</div>
|
|
197
|
+
<%= turbo_stream_from "metric_#{@metric.id}_suggestion" %>
|
|
198
|
+
<div id="ck-suggestion-status-<%= @metric.id %>" class="ck-suggestion-status"></div>
|
|
199
|
+
<p class="ck-meta-copy">How often the judge lands on the same score you would. Review its scores to build that signal, and improve the metric to raise it.</p>
|
|
200
|
+
<%= render "completion_kit/calibrations/trust_panel",
|
|
201
|
+
stats: CompletionKit::MetricCalibrationStats.for(@metric),
|
|
202
|
+
metric: @metric %>
|
|
203
|
+
<% if CompletionKit.config.judge_examples_from_reviews %>
|
|
204
|
+
<%= render "completion_kit/metrics/guiding_examples", metric: @metric, examples: @guiding_examples %>
|
|
205
|
+
<% end %>
|
|
206
|
+
<% if draft %>
|
|
207
|
+
<div class="ck-cal-foot">
|
|
208
|
+
<span class="ck-cal-foot__note">A draft improvement (<%= draft.version_label %>) is waiting in the Versions table above. Open its change to compare, then Publish to use it.</span>
|
|
201
209
|
</div>
|
|
202
210
|
<% end %>
|
|
203
211
|
</section>
|
|
@@ -100,12 +100,17 @@
|
|
|
100
100
|
<% @reviews.each do |review| %>
|
|
101
101
|
<% review_version = review.metric_version %>
|
|
102
102
|
<% stale = review.stale_against_current_judge? %>
|
|
103
|
-
<div class="ck-review-card
|
|
103
|
+
<div class="ck-review-card" id="<%= review.metric&.name&.parameterize || "review-#{review.id}" %>">
|
|
104
104
|
<div class="ck-review-card__header">
|
|
105
105
|
<span class="ck-review-card__metric"><% if review.metric %><%= link_to review.metric_name, metric_path(review.metric), class: "ck-link" %><% else %><%= review.metric_name %><% end %></span>
|
|
106
106
|
<div class="ck-inline">
|
|
107
107
|
<% if review_version %>
|
|
108
|
-
|
|
108
|
+
<% if stale %>
|
|
109
|
+
<% current_version = CompletionKit::MetricVersion.current.find_by(metric_id: review.metric_id) %>
|
|
110
|
+
<span class="ck-source-chip ck-source-chip--past" title="Scored on <%= review_version.version_label %>; the metric is now on <%= current_version.version_label %>, which may score this differently."><%= review_version.version_label %> → <%= current_version.version_label %></span>
|
|
111
|
+
<% else %>
|
|
112
|
+
<span class="ck-source-chip ck-source-chip--current" title="Scored on the metric's current version (<%= review_version.version_label %>)."><%= review_version.version_label %></span>
|
|
113
|
+
<% end %>
|
|
109
114
|
<% end %>
|
|
110
115
|
<% if review.ai_score %>
|
|
111
116
|
<% 5.times do |i| %>
|
|
@@ -116,9 +121,6 @@
|
|
|
116
121
|
<% end %>
|
|
117
122
|
</div>
|
|
118
123
|
</div>
|
|
119
|
-
<% if stale %>
|
|
120
|
-
<p class="ck-review-card__stale-note">Scored against a superseded version of this metric. Its current version may score this differently.</p>
|
|
121
|
-
<% end %>
|
|
122
124
|
<% if review.ai_feedback.present? %>
|
|
123
125
|
<p class="ck-review-card__feedback"><%= review.ai_feedback %></p>
|
|
124
126
|
<% end %>
|
data/config/routes.rb
CHANGED
data/lib/completion_kit.rb
CHANGED
|
@@ -13,6 +13,7 @@ module CompletionKit
|
|
|
13
13
|
attr_accessor :api_rate_limit, :web_rate_limit
|
|
14
14
|
attr_accessor :allow_loopback_endpoints
|
|
15
15
|
attr_accessor :judge_calibration_enabled
|
|
16
|
+
attr_accessor :judge_examples_from_reviews
|
|
16
17
|
|
|
17
18
|
def initialize
|
|
18
19
|
@openai_api_key = ENV['OPENAI_API_KEY']
|
|
@@ -29,6 +30,7 @@ module CompletionKit
|
|
|
29
30
|
|
|
30
31
|
@allow_loopback_endpoints = true
|
|
31
32
|
@judge_calibration_enabled = true
|
|
33
|
+
@judge_examples_from_reviews = false
|
|
32
34
|
|
|
33
35
|
@api_reference_authentication_partial = "completion_kit/api_reference/authentication"
|
|
34
36
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: completion-kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.9.0
|
|
4
|
+
version: 0.11.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Damien Bastin
|
|
@@ -266,6 +266,7 @@ files:
|
|
|
266
266
|
- app/jobs/completion_kit/application_job.rb
|
|
267
267
|
- app/jobs/completion_kit/generate_row_job.rb
|
|
268
268
|
- app/jobs/completion_kit/judge_review_job.rb
|
|
269
|
+
- app/jobs/completion_kit/metric_suggestion_job.rb
|
|
269
270
|
- app/jobs/completion_kit/model_discovery_job.rb
|
|
270
271
|
- app/jobs/completion_kit/run_completion_check_job.rb
|
|
271
272
|
- app/mailers/completion_kit/application_mailer.rb
|
|
@@ -311,7 +312,9 @@ files:
|
|
|
311
312
|
- app/services/completion_kit/mcp_tools/responses.rb
|
|
312
313
|
- app/services/completion_kit/mcp_tools/runs.rb
|
|
313
314
|
- app/services/completion_kit/mcp_tools/tags.rb
|
|
315
|
+
- app/services/completion_kit/metric_calibration_examples.rb
|
|
314
316
|
- app/services/completion_kit/metric_calibration_stats.rb
|
|
317
|
+
- app/services/completion_kit/metric_improvement_validator.rb
|
|
315
318
|
- app/services/completion_kit/metric_variant_generator.rb
|
|
316
319
|
- app/services/completion_kit/model_discovery_service.rb
|
|
317
320
|
- app/services/completion_kit/ollama_client.rb
|
|
@@ -350,9 +353,14 @@ files:
|
|
|
350
353
|
- app/views/completion_kit/metric_groups/new.html.erb
|
|
351
354
|
- app/views/completion_kit/metric_groups/show.html.erb
|
|
352
355
|
- app/views/completion_kit/metrics/_form.html.erb
|
|
356
|
+
- app/views/completion_kit/metrics/_guiding_examples.html.erb
|
|
353
357
|
- app/views/completion_kit/metrics/_rubric_diff.html.erb
|
|
354
358
|
- app/views/completion_kit/metrics/_rubric_hint.html.erb
|
|
355
359
|
- app/views/completion_kit/metrics/_starter_card.html.erb
|
|
360
|
+
- app/views/completion_kit/metrics/_suggestion_failed.html.erb
|
|
361
|
+
- app/views/completion_kit/metrics/_suggestion_pending.html.erb
|
|
362
|
+
- app/views/completion_kit/metrics/_suggestion_ready.html.erb
|
|
363
|
+
- app/views/completion_kit/metrics/_validation_scoreboard.html.erb
|
|
356
364
|
- app/views/completion_kit/metrics/edit.html.erb
|
|
357
365
|
- app/views/completion_kit/metrics/index.html.erb
|
|
358
366
|
- app/views/completion_kit/metrics/new.html.erb
|
|
@@ -430,6 +438,8 @@ files:
|
|
|
430
438
|
- db/migrate/20260528000001_rename_judge_version_to_metric_version.rb
|
|
431
439
|
- db/migrate/20260528000002_add_metric_version_to_reviews.rb
|
|
432
440
|
- db/migrate/20260529000001_remove_few_shot_examples_from_completion_kit_metrics.rb
|
|
441
|
+
- db/migrate/20260530000001_add_excluded_from_examples_to_completion_kit_calibrations.rb
|
|
442
|
+
- db/migrate/20260531000001_add_validation_summary_to_completion_kit_metric_versions.rb
|
|
433
443
|
- lib/completion-kit.rb
|
|
434
444
|
- lib/completion_kit.rb
|
|
435
445
|
- lib/completion_kit/concurrency_check.rb
|