completion-kit 0.9.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/stylesheets/completion_kit/application.css +104 -48
- data/app/controllers/completion_kit/metrics_controller.rb +18 -1
- data/app/jobs/completion_kit/judge_review_job.rb +9 -4
- data/app/services/completion_kit/judge_service.rb +19 -3
- data/app/services/completion_kit/metric_calibration_examples.rb +56 -0
- data/app/services/completion_kit/metric_variant_generator.rb +0 -36
- data/app/views/completion_kit/calibrations/_trust_panel.html.erb +7 -8
- data/app/views/completion_kit/metrics/_guiding_examples.html.erb +23 -0
- data/app/views/completion_kit/metrics/index.html.erb +5 -0
- data/app/views/completion_kit/metrics/show.html.erb +22 -28
- data/app/views/completion_kit/responses/show.html.erb +7 -5
- data/config/routes.rb +1 -0
- data/db/migrate/20260530000001_add_excluded_from_examples_to_completion_kit_calibrations.rb +5 -0
- data/lib/completion_kit/version.rb +1 -1
- data/lib/completion_kit.rb +2 -0
- metadata +4 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 4772b264a668a86e004f78c7bc2397d93f5266d8dc287b422728d951ab24fbcc
|
|
4
|
+
data.tar.gz: 63b221e144e930df9978607a78533d354f659ca2bf6141046291168af50d3cd7
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: a3249ae1c734dcee0c6f9410baf0400f4b16e091b220d86e8417dec91ff9943a165bbc6c8368629cc14054c3af7946bdb693f006a5756de40a809c41db5bbe3a
|
|
7
|
+
data.tar.gz: 541323c93b08f08f32c2f024e3709ee5f3e4e48144cd8e9cb2ba8891312fd9b5712e4ca39cbcc739b64ba61ff0ac890be3528e5bfd9c238e073d640b37b47e90
|
|
@@ -686,16 +686,6 @@ tr:hover .ck-chip--publish {
|
|
|
686
686
|
justify-content: space-between;
|
|
687
687
|
gap: 10px;
|
|
688
688
|
}
|
|
689
|
-
.ck-version-state {
|
|
690
|
-
font-family: var(--ck-mono);
|
|
691
|
-
font-size: 0.66rem;
|
|
692
|
-
letter-spacing: 0.07em;
|
|
693
|
-
text-transform: uppercase;
|
|
694
|
-
color: var(--ck-dim);
|
|
695
|
-
}
|
|
696
|
-
.ck-version-state--live {
|
|
697
|
-
color: var(--ck-text);
|
|
698
|
-
}
|
|
699
689
|
|
|
700
690
|
.ck-chip--soft {
|
|
701
691
|
background: var(--ck-accent-soft);
|
|
@@ -2877,10 +2867,6 @@ select.ck-input {
|
|
|
2877
2867
|
line-height: 1.55;
|
|
2878
2868
|
}
|
|
2879
2869
|
|
|
2880
|
-
.ck-review-card--stale {
|
|
2881
|
-
border-left: 2px solid rgba(224, 164, 88, 0.45);
|
|
2882
|
-
}
|
|
2883
|
-
|
|
2884
2870
|
.ck-stale-versions-banner {
|
|
2885
2871
|
margin: 0 0 1rem;
|
|
2886
2872
|
padding: 0.9rem 1rem;
|
|
@@ -2908,12 +2894,6 @@ select.ck-input {
|
|
|
2908
2894
|
.ck-delta--zero { color: var(--ck-dim); }
|
|
2909
2895
|
|
|
2910
2896
|
.ck-run-compare-table td { vertical-align: middle; }
|
|
2911
|
-
.ck-review-card__stale-note {
|
|
2912
|
-
margin: 0.4rem 0 0;
|
|
2913
|
-
font-family: var(--ck-mono);
|
|
2914
|
-
font-size: 0.78rem;
|
|
2915
|
-
color: var(--ck-warning);
|
|
2916
|
-
}
|
|
2917
2897
|
|
|
2918
2898
|
@media (max-width: 900px) {
|
|
2919
2899
|
.ck-grid--sidebar,
|
|
@@ -3617,9 +3597,10 @@ select.ck-input {
|
|
|
3617
3597
|
}
|
|
3618
3598
|
|
|
3619
3599
|
.ck-metrics-table th:nth-child(1), .ck-metrics-table td:nth-child(1) { width: 18rem; white-space: normal; }
|
|
3620
|
-
.ck-metrics-table th:nth-child(2), .ck-metrics-table td:nth-child(2) { width:
|
|
3621
|
-
.ck-metrics-table th:nth-child(3), .ck-metrics-table td:nth-child(3) { width:
|
|
3622
|
-
.ck-metrics-table th:nth-child(4), .ck-metrics-table td:nth-child(4) { width:
|
|
3600
|
+
.ck-metrics-table th:nth-child(2), .ck-metrics-table td:nth-child(2) { width: 6rem; }
|
|
3601
|
+
.ck-metrics-table th:nth-child(3), .ck-metrics-table td:nth-child(3) { width: auto; }
|
|
3602
|
+
.ck-metrics-table th:nth-child(4), .ck-metrics-table td:nth-child(4) { width: 16rem; }
|
|
3603
|
+
.ck-metrics-table th:nth-child(5), .ck-metrics-table td:nth-child(5) { width: 3rem; }
|
|
3623
3604
|
.ck-metrics-table td:nth-child(1) strong { overflow-wrap: anywhere; }
|
|
3624
3605
|
|
|
3625
3606
|
.ck-datasets-table th:nth-child(1), .ck-datasets-table td:nth-child(1) { width: auto; }
|
|
@@ -3638,32 +3619,11 @@ select.ck-input {
|
|
|
3638
3619
|
.ck-prompt-versions-table th:nth-child(3), .ck-prompt-versions-table td:nth-child(3) { width: 8rem; white-space: nowrap; }
|
|
3639
3620
|
.ck-prompt-versions-table th:nth-child(4), .ck-prompt-versions-table td:nth-child(4) { width: auto; }
|
|
3640
3621
|
|
|
3641
|
-
.ck-metric-versions-table th:nth-child(1), .ck-metric-versions-table td:nth-child(1) { width:
|
|
3642
|
-
.ck-metric-versions-table th:nth-child(2), .ck-metric-versions-table td:nth-child(2) { width:
|
|
3643
|
-
.ck-metric-versions-table th:nth-child(3), .ck-metric-versions-table td:nth-child(3) { width:
|
|
3644
|
-
.ck-metric-versions-table
|
|
3622
|
+
.ck-metric-versions-table th:nth-child(1), .ck-metric-versions-table td:nth-child(1) { width: 34%; }
|
|
3623
|
+
.ck-metric-versions-table th:nth-child(2), .ck-metric-versions-table td:nth-child(2) { width: 33%; white-space: nowrap; }
|
|
3624
|
+
.ck-metric-versions-table th:nth-child(3), .ck-metric-versions-table td:nth-child(3) { width: 33%; white-space: nowrap; }
|
|
3625
|
+
.ck-metric-versions-table .ck-version-cell { justify-content: flex-start; gap: 0.75rem; }
|
|
3645
3626
|
|
|
3646
|
-
.ck-change-link {
|
|
3647
|
-
background: none;
|
|
3648
|
-
border: 0;
|
|
3649
|
-
padding: 0;
|
|
3650
|
-
cursor: pointer;
|
|
3651
|
-
font-family: inherit;
|
|
3652
|
-
font-size: 0.86rem;
|
|
3653
|
-
text-align: left;
|
|
3654
|
-
color: var(--ck-text);
|
|
3655
|
-
}
|
|
3656
|
-
.ck-change-link:hover,
|
|
3657
|
-
.ck-change-link:focus-visible {
|
|
3658
|
-
color: var(--ck-accent);
|
|
3659
|
-
text-decoration: underline;
|
|
3660
|
-
}
|
|
3661
|
-
.ck-change-link--trivial {
|
|
3662
|
-
color: var(--ck-dim);
|
|
3663
|
-
}
|
|
3664
|
-
.ck-change-link--major {
|
|
3665
|
-
color: rgb(217, 119, 6);
|
|
3666
|
-
}
|
|
3667
3627
|
|
|
3668
3628
|
.ck-source-chip {
|
|
3669
3629
|
display: inline-block;
|
|
@@ -5632,6 +5592,11 @@ a.tag-mark {
|
|
|
5632
5592
|
.ck-trust-line__hint {
|
|
5633
5593
|
color: var(--ck-dim);
|
|
5634
5594
|
}
|
|
5595
|
+
.ck-trust-line__aside {
|
|
5596
|
+
margin: 4px 0 0;
|
|
5597
|
+
font-size: 0.78rem;
|
|
5598
|
+
color: var(--ck-muted);
|
|
5599
|
+
}
|
|
5635
5600
|
.ck-cal-stat {
|
|
5636
5601
|
display: inline-flex;
|
|
5637
5602
|
align-items: baseline;
|
|
@@ -5945,3 +5910,94 @@ a.tag-mark {
|
|
|
5945
5910
|
.ck-starter-actions .ck-button {
|
|
5946
5911
|
line-height: 1;
|
|
5947
5912
|
}
|
|
5913
|
+
|
|
5914
|
+
.ck-guiding {
|
|
5915
|
+
margin-top: 14px;
|
|
5916
|
+
padding-top: 12px;
|
|
5917
|
+
border-top: 1px solid var(--ck-line);
|
|
5918
|
+
}
|
|
5919
|
+
|
|
5920
|
+
.ck-guiding__head {
|
|
5921
|
+
display: flex;
|
|
5922
|
+
align-items: baseline;
|
|
5923
|
+
justify-content: space-between;
|
|
5924
|
+
gap: 12px;
|
|
5925
|
+
}
|
|
5926
|
+
|
|
5927
|
+
.ck-guiding__head .ck-kicker--inset {
|
|
5928
|
+
margin-top: 0;
|
|
5929
|
+
}
|
|
5930
|
+
|
|
5931
|
+
.ck-guiding__legend {
|
|
5932
|
+
font-family: var(--ck-mono);
|
|
5933
|
+
font-size: 0.64rem;
|
|
5934
|
+
letter-spacing: 0.09em;
|
|
5935
|
+
text-transform: uppercase;
|
|
5936
|
+
color: var(--ck-muted);
|
|
5937
|
+
}
|
|
5938
|
+
|
|
5939
|
+
.ck-guiding__list {
|
|
5940
|
+
list-style: none;
|
|
5941
|
+
margin: 8px -8px 0;
|
|
5942
|
+
padding: 0;
|
|
5943
|
+
display: flex;
|
|
5944
|
+
flex-direction: column;
|
|
5945
|
+
}
|
|
5946
|
+
|
|
5947
|
+
.ck-guiding__item {
|
|
5948
|
+
display: flex;
|
|
5949
|
+
align-items: center;
|
|
5950
|
+
gap: 12px;
|
|
5951
|
+
padding: 5px 8px;
|
|
5952
|
+
border-radius: 7px;
|
|
5953
|
+
transition: background 0.15s;
|
|
5954
|
+
}
|
|
5955
|
+
|
|
5956
|
+
.ck-guiding__item:hover {
|
|
5957
|
+
background: var(--ck-surface-hover);
|
|
5958
|
+
}
|
|
5959
|
+
|
|
5960
|
+
.ck-guiding__item:hover .ck-guiding__output {
|
|
5961
|
+
color: var(--ck-text);
|
|
5962
|
+
}
|
|
5963
|
+
|
|
5964
|
+
.ck-guiding__link {
|
|
5965
|
+
flex: 1;
|
|
5966
|
+
min-width: 0;
|
|
5967
|
+
display: flex;
|
|
5968
|
+
align-items: center;
|
|
5969
|
+
gap: 12px;
|
|
5970
|
+
text-decoration: none;
|
|
5971
|
+
color: inherit;
|
|
5972
|
+
}
|
|
5973
|
+
|
|
5974
|
+
.ck-guiding__output {
|
|
5975
|
+
flex: 1;
|
|
5976
|
+
min-width: 0;
|
|
5977
|
+
overflow: hidden;
|
|
5978
|
+
text-overflow: ellipsis;
|
|
5979
|
+
white-space: nowrap;
|
|
5980
|
+
color: var(--ck-dim);
|
|
5981
|
+
font-size: 0.86rem;
|
|
5982
|
+
}
|
|
5983
|
+
|
|
5984
|
+
.ck-guiding__scores {
|
|
5985
|
+
font-family: var(--ck-mono);
|
|
5986
|
+
font-size: 0.78rem;
|
|
5987
|
+
color: var(--ck-text);
|
|
5988
|
+
white-space: nowrap;
|
|
5989
|
+
}
|
|
5990
|
+
|
|
5991
|
+
.ck-guiding__judge {
|
|
5992
|
+
color: var(--ck-dim);
|
|
5993
|
+
}
|
|
5994
|
+
|
|
5995
|
+
.ck-guiding__human {
|
|
5996
|
+
color: var(--ck-text);
|
|
5997
|
+
font-weight: 600;
|
|
5998
|
+
}
|
|
5999
|
+
|
|
6000
|
+
.ck-guiding__item .ck-icon-btn {
|
|
6001
|
+
width: 2rem;
|
|
6002
|
+
height: 2rem;
|
|
6003
|
+
}
|
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
2
|
class MetricsController < ApplicationController
|
|
3
3
|
include CompletionKit::TagFiltering
|
|
4
|
-
before_action :set_metric, only: [:show, :edit, :update, :destroy, :publish_draft, :suggest_variants, :dismiss_suggestion]
|
|
4
|
+
before_action :set_metric, only: [:show, :edit, :update, :destroy, :publish_draft, :suggest_variants, :dismiss_suggestion, :exclude_example]
|
|
5
|
+
before_action :ensure_examples_from_reviews_enabled, only: [:exclude_example]
|
|
5
6
|
|
|
6
7
|
def index
|
|
7
8
|
@metrics = apply_tag_filter(Metric.includes(:metric_groups, :tags).order(:name))
|
|
8
9
|
@available_starters = StarterMetrics.available
|
|
10
|
+
@current_versions = MetricVersion.published.current.where(metric_id: @metrics.map(&:id)).index_by(&:metric_id)
|
|
9
11
|
end
|
|
10
12
|
|
|
11
13
|
def starter_preview
|
|
@@ -39,6 +41,7 @@ module CompletionKit
|
|
|
39
41
|
@suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
|
|
40
42
|
@improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
|
|
41
43
|
@versions = MetricVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
|
|
44
|
+
@guiding_examples = CompletionKit.config.judge_examples_from_reviews ? MetricCalibrationExamples.judge_examples_for(@metric) : []
|
|
42
45
|
end
|
|
43
46
|
|
|
44
47
|
def new
|
|
@@ -145,6 +148,16 @@ module CompletionKit
|
|
|
145
148
|
redirect_to target, notice: label ? "Discarded draft #{label}." : "Draft already gone."
|
|
146
149
|
end
|
|
147
150
|
|
|
151
|
+
def exclude_example
|
|
152
|
+
calibration = Calibration.where(metric_id: @metric.id).find(params[:calibration_id])
|
|
153
|
+
calibration.update!(excluded_from_examples: true)
|
|
154
|
+
render turbo_stream: turbo_stream.replace(
|
|
155
|
+
"ck-guiding-#{@metric.id}",
|
|
156
|
+
partial: "completion_kit/metrics/guiding_examples",
|
|
157
|
+
locals: { metric: @metric, examples: MetricCalibrationExamples.judge_examples_for(@metric) }
|
|
158
|
+
)
|
|
159
|
+
end
|
|
160
|
+
|
|
148
161
|
def publish_draft
|
|
149
162
|
scope = MetricVersion.where(metric_id: @metric.id)
|
|
150
163
|
version = if params[:draft_id].present?
|
|
@@ -176,6 +189,10 @@ module CompletionKit
|
|
|
176
189
|
|
|
177
190
|
private
|
|
178
191
|
|
|
192
|
+
def ensure_examples_from_reviews_enabled
|
|
193
|
+
head :not_found unless CompletionKit.config.judge_examples_from_reviews
|
|
194
|
+
end
|
|
195
|
+
|
|
179
196
|
def set_metric
|
|
180
197
|
@metric = Metric.find(params[:id])
|
|
181
198
|
end
|
|
@@ -58,7 +58,8 @@ module CompletionKit
|
|
|
58
58
|
run.prompt&.template,
|
|
59
59
|
criteria: metric.instruction.to_s,
|
|
60
60
|
rubric_text: metric.display_rubric_text,
|
|
61
|
-
input_data: response.input_data
|
|
61
|
+
input_data: response.input_data,
|
|
62
|
+
human_examples: review_examples_for(metric, response)
|
|
62
63
|
)
|
|
63
64
|
|
|
64
65
|
review = response.reviews.find_or_initialize_by(metric_id: metric.id)
|
|
@@ -80,9 +81,13 @@ module CompletionKit
|
|
|
80
81
|
|
|
81
82
|
private
|
|
82
83
|
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
84
|
+
def review_examples_for(metric, response)
|
|
85
|
+
return nil unless CompletionKit.config.judge_calibration_enabled
|
|
86
|
+
return nil unless CompletionKit.config.judge_examples_from_reviews
|
|
87
|
+
|
|
88
|
+
MetricCalibrationExamples.judge_examples_for(metric, exclude_response_id: response.id)
|
|
89
|
+
end
|
|
90
|
+
|
|
86
91
|
def confirm_judging_capability(judge_model_id)
|
|
87
92
|
model = Model.find_by(provider: ApiConfig.provider_for_model(judge_model_id), model_id: judge_model_id)
|
|
88
93
|
return unless model && model.supports_judging.nil?
|
|
@@ -10,13 +10,14 @@ module CompletionKit
|
|
|
10
10
|
@judge_client = LlmClient.for_model(@judge_model, ApiConfig.for_model(@judge_model))
|
|
11
11
|
end
|
|
12
12
|
|
|
13
|
-
def evaluate(output, expected_output = nil, prompt = nil, criteria: nil, rubric_text: nil, input_data: nil, **_extras)
|
|
13
|
+
def evaluate(output, expected_output = nil, prompt = nil, criteria: nil, rubric_text: nil, input_data: nil, human_examples: nil, **_extras)
|
|
14
14
|
raise CompletionKit::ConfigurationError, "Judge not configured" unless @judge_client.configured?
|
|
15
15
|
|
|
16
16
|
judge_prompt = build_judge_prompt(output, expected_output, prompt,
|
|
17
17
|
criteria: criteria,
|
|
18
18
|
rubric_text: rubric_text,
|
|
19
|
-
input_data: input_data
|
|
19
|
+
input_data: input_data,
|
|
20
|
+
human_examples: human_examples)
|
|
20
21
|
|
|
21
22
|
response = @judge_client.generate_completion(judge_prompt, model: @judge_model)
|
|
22
23
|
raise StandardError, response if response.start_with?("Error:")
|
|
@@ -25,7 +26,7 @@ module CompletionKit
|
|
|
25
26
|
|
|
26
27
|
private
|
|
27
28
|
|
|
28
|
-
def build_judge_prompt(output, expected_output, prompt, criteria: nil, rubric_text: nil, input_data: nil)
|
|
29
|
+
def build_judge_prompt(output, expected_output, prompt, criteria: nil, rubric_text: nil, input_data: nil, human_examples: nil)
|
|
29
30
|
judge_prompt = <<~PROMPT
|
|
30
31
|
You are an expert evaluator. You MUST respond with ONLY two lines in this exact format, nothing else:
|
|
31
32
|
|
|
@@ -42,6 +43,8 @@ module CompletionKit
|
|
|
42
43
|
judge_prompt += "\nCriteria: #{criteria}\n"
|
|
43
44
|
end
|
|
44
45
|
|
|
46
|
+
judge_prompt += human_examples_block(human_examples)
|
|
47
|
+
|
|
45
48
|
judge_prompt += <<~PROMPT
|
|
46
49
|
|
|
47
50
|
Original prompt: #{prompt || "Not provided"}
|
|
@@ -53,6 +56,19 @@ module CompletionKit
|
|
|
53
56
|
judge_prompt
|
|
54
57
|
end
|
|
55
58
|
|
|
59
|
+
def human_examples_block(examples)
|
|
60
|
+
return "" if examples.blank?
|
|
61
|
+
|
|
62
|
+
lines = ["", "Reviewed examples where a human corrected the judge on this metric. Weigh them when scoring:"]
|
|
63
|
+
examples.each_with_index do |example, index|
|
|
64
|
+
note = example[:human_note].to_s
|
|
65
|
+
line = "Example #{index + 1}: Output: #{example[:output].to_s.truncate(200)}. The judge scored this #{example[:judge_score].to_i}/5. A reviewer corrected it to #{example[:human_score].to_i}/5"
|
|
66
|
+
line += note.present? ? ": #{note.truncate(160)}" : "."
|
|
67
|
+
lines << line
|
|
68
|
+
end
|
|
69
|
+
lines.join("\n") + "\n"
|
|
70
|
+
end
|
|
71
|
+
|
|
56
72
|
def parse_judge_response(response)
|
|
57
73
|
score_match = response.match(/\*{0,2}Score:?\*{0,2}\s*(\d+(?:\.\d+)?)/i)
|
|
58
74
|
feedback_match = response.match(/\*{0,2}Feedback:?\*{0,2}\s*(.+)/mi)
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
module CompletionKit
|
|
2
|
+
module MetricCalibrationExamples
|
|
3
|
+
DEFAULT_JUDGE_EXAMPLE_LIMIT = 5
|
|
4
|
+
|
|
5
|
+
module_function
|
|
6
|
+
|
|
7
|
+
def for(metric, limit: 8)
|
|
8
|
+
disagreements_for(metric, limit: limit)
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def disagreements_for(metric, limit: 8)
|
|
12
|
+
calibrations_for(metric, verdict: "disagree", limit: limit)
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
def borderlines_for(metric, limit: 6)
|
|
16
|
+
calibrations_for(metric, verdict: "borderline", limit: limit)
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
def judge_examples_for(metric, exclude_response_id: nil, limit: DEFAULT_JUDGE_EXAMPLE_LIMIT)
|
|
20
|
+
current_version = MetricVersion.current.find_by(metric_id: metric.id)
|
|
21
|
+
return [] unless current_version
|
|
22
|
+
|
|
23
|
+
relation = Calibration
|
|
24
|
+
.where(metric_id: metric.id, metric_version_id: current_version.id, excluded_from_examples: false)
|
|
25
|
+
.where.not(corrected_score: nil)
|
|
26
|
+
relation = relation.where.not(response_id: exclude_response_id) if exclude_response_id
|
|
27
|
+
map_examples(relation.includes(response: :reviews).order(created_at: :desc).limit(limit), metric)
|
|
28
|
+
.reject { |example| example[:judge_score].nil? }
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
def calibrations_for(metric, verdict:, limit:)
|
|
32
|
+
base = Calibration.where(metric_id: metric.id, verdict: verdict)
|
|
33
|
+
current_version = MetricVersion.current.find_by(metric_id: metric.id)
|
|
34
|
+
scoped = current_version ? base.where(metric_version_id: current_version.id) : base
|
|
35
|
+
effective = scoped.exists? ? scoped : base
|
|
36
|
+
map_examples(effective.includes(response: :reviews).order(created_at: :desc).limit(limit), metric)
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
def map_examples(relation, metric)
|
|
40
|
+
relation.map do |cal|
|
|
41
|
+
review = cal.response.reviews.find { |r| r.metric_id == metric.id }
|
|
42
|
+
{
|
|
43
|
+
id: cal.id,
|
|
44
|
+
run_id: cal.run_id,
|
|
45
|
+
response_id: cal.response_id,
|
|
46
|
+
input: cal.response.input_data,
|
|
47
|
+
output: cal.response.response_text,
|
|
48
|
+
judge_score: review&.ai_score,
|
|
49
|
+
judge_feedback: review&.ai_feedback,
|
|
50
|
+
human_score: cal.corrected_score,
|
|
51
|
+
human_note: cal.note
|
|
52
|
+
}
|
|
53
|
+
end
|
|
54
|
+
end
|
|
55
|
+
end
|
|
56
|
+
end
|
|
@@ -117,40 +117,4 @@ module CompletionKit
|
|
|
117
117
|
end
|
|
118
118
|
end
|
|
119
119
|
|
|
120
|
-
module MetricCalibrationExamples
|
|
121
|
-
module_function
|
|
122
|
-
|
|
123
|
-
def for(metric, limit: 8)
|
|
124
|
-
disagreements_for(metric, limit: limit)
|
|
125
|
-
end
|
|
126
|
-
|
|
127
|
-
def disagreements_for(metric, limit: 8)
|
|
128
|
-
calibrations_for(metric, verdict: "disagree", limit: limit)
|
|
129
|
-
end
|
|
130
|
-
|
|
131
|
-
def borderlines_for(metric, limit: 6)
|
|
132
|
-
calibrations_for(metric, verdict: "borderline", limit: limit)
|
|
133
|
-
end
|
|
134
|
-
|
|
135
|
-
def calibrations_for(metric, verdict:, limit:)
|
|
136
|
-
base = Calibration.where(metric_id: metric.id, verdict: verdict)
|
|
137
|
-
current_version = MetricVersion.current.find_by(metric_id: metric.id)
|
|
138
|
-
scoped = current_version ? base.where(metric_version_id: current_version.id) : base
|
|
139
|
-
effective = scoped.exists? ? scoped : base
|
|
140
|
-
effective.includes(response: :reviews)
|
|
141
|
-
.order(created_at: :desc)
|
|
142
|
-
.limit(limit)
|
|
143
|
-
.map do |cal|
|
|
144
|
-
review = cal.response.reviews.find { |r| r.metric_id == metric.id }
|
|
145
|
-
{
|
|
146
|
-
input: cal.response.input_data,
|
|
147
|
-
output: cal.response.response_text,
|
|
148
|
-
judge_score: review&.ai_score,
|
|
149
|
-
judge_feedback: review&.ai_feedback,
|
|
150
|
-
human_score: cal.corrected_score,
|
|
151
|
-
human_note: cal.note
|
|
152
|
-
}
|
|
153
|
-
end
|
|
154
|
-
end
|
|
155
|
-
end
|
|
156
120
|
end
|
|
@@ -2,15 +2,11 @@
|
|
|
2
2
|
<% metric = local_assigns[:metric] %>
|
|
3
3
|
<% anchor = metric&.name&.parameterize %>
|
|
4
4
|
<% current_metric_version = metric && CompletionKit::MetricVersion.current.find_by(metric_id: metric.id) %>
|
|
5
|
-
<% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric
|
|
5
|
+
<% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric && current_metric_version
|
|
6
6
|
created_by = CompletionKit.config.username.presence || "operator"
|
|
7
|
-
verdicted_ids =
|
|
8
|
-
CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
|
|
9
|
-
else
|
|
10
|
-
[]
|
|
11
|
-
end
|
|
7
|
+
verdicted_ids = CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
|
|
12
8
|
CompletionKit::Response.joins(:reviews)
|
|
13
|
-
.where(reviews: { metric_id: metric.id })
|
|
9
|
+
.where(reviews: { metric_id: metric.id, metric_version_id: current_metric_version.id })
|
|
14
10
|
.where.not(reviews: { ai_score: nil })
|
|
15
11
|
.where.not(id: verdicted_ids)
|
|
16
12
|
.order(created_at: :desc).first
|
|
@@ -24,7 +20,7 @@
|
|
|
24
20
|
<p class="ck-trust-line ck-trust-line--<%= stats.gate %>">
|
|
25
21
|
<% if stats.sample_size.zero? %>
|
|
26
22
|
<span class="ck-trust-line__lead">Not measured yet.</span>
|
|
27
|
-
<span class="ck-trust-line__hint"
|
|
23
|
+
<span class="ck-trust-line__hint"><%= current_metric_version ? "#{current_metric_version.version_label} needs" : "Needs" %> <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> human reviews of the judge's scores.</span>
|
|
28
24
|
<% if target_response %>
|
|
29
25
|
<%= link_to "Review a judge's score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
|
|
30
26
|
<% end %>
|
|
@@ -45,3 +41,6 @@
|
|
|
45
41
|
<% end %>
|
|
46
42
|
<% end %>
|
|
47
43
|
</p>
|
|
44
|
+
<% if stats.sample_size.zero? && prior_version_verdicts > 0 %>
|
|
45
|
+
<p class="ck-trust-line__aside"><%= pluralize(prior_version_verdicts, "review") %> from an earlier version <%= prior_version_verdicts == 1 ? "doesn't" : "don't" %> count toward this version.</p>
|
|
46
|
+
<% end %>
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
<div id="ck-guiding-<%= metric.id %>" class="ck-guiding">
|
|
2
|
+
<% if examples.any? %>
|
|
3
|
+
<div class="ck-guiding__head">
|
|
4
|
+
<p class="ck-kicker ck-kicker--inset">Guiding the judge</p>
|
|
5
|
+
<span class="ck-guiding__legend">Judge → Human</span>
|
|
6
|
+
</div>
|
|
7
|
+
<ul class="ck-guiding__list">
|
|
8
|
+
<% examples.each do |example| %>
|
|
9
|
+
<li class="ck-guiding__item">
|
|
10
|
+
<%= link_to run_response_path(example[:run_id], example[:response_id], anchor: metric.name.parameterize),
|
|
11
|
+
class: "ck-guiding__link", title: "Open this review" do %>
|
|
12
|
+
<span class="ck-guiding__output"><%= truncate(example[:output].to_s, length: 90) %></span>
|
|
13
|
+
<span class="ck-guiding__scores"><span class="ck-guiding__judge"><%= example[:judge_score].to_i %></span> → <span class="ck-guiding__human"><%= example[:human_score].to_i %></span></span>
|
|
14
|
+
<% end %>
|
|
15
|
+
<%= button_to exclude_example_metric_path(metric, calibration_id: example[:id]),
|
|
16
|
+
method: :post, form_class: "inline-block", class: "ck-icon-btn",
|
|
17
|
+
title: "Stop using this case", "aria-label": "Stop using this case",
|
|
18
|
+
data: { turbo_confirm: "Stop using this corrected case to guide the judge?" } do %><%= heroicon_tag "x-mark", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
|
|
19
|
+
</li>
|
|
20
|
+
<% end %>
|
|
21
|
+
</ul>
|
|
22
|
+
<% end %>
|
|
23
|
+
</div>
|
|
@@ -18,6 +18,7 @@
|
|
|
18
18
|
<thead>
|
|
19
19
|
<tr>
|
|
20
20
|
<th scope="col">Name</th>
|
|
21
|
+
<th scope="col">Version</th>
|
|
21
22
|
<th scope="col">Instruction</th>
|
|
22
23
|
<th scope="col">In groups</th>
|
|
23
24
|
<th scope="col"></th>
|
|
@@ -34,6 +35,10 @@
|
|
|
34
35
|
</div>
|
|
35
36
|
<% end %>
|
|
36
37
|
</td>
|
|
38
|
+
<td data-label="Version">
|
|
39
|
+
<% v = @current_versions[metric.id] %>
|
|
40
|
+
<span class="ck-chip ck-chip--soft"><%= v ? v.version_label : "v1" %></span>
|
|
41
|
+
</td>
|
|
37
42
|
<td data-label="Instruction" class="ck-meta-copy"><div class="ck-clamp-2"><%= metric.instruction.presence || "—" %></div></td>
|
|
38
43
|
<td data-label="In groups">
|
|
39
44
|
<% groups = metric.metric_groups %>
|
|
@@ -50,7 +50,6 @@
|
|
|
50
50
|
<thead>
|
|
51
51
|
<tr>
|
|
52
52
|
<th scope="col">Version</th>
|
|
53
|
-
<th scope="col">Δ Change</th>
|
|
54
53
|
<th scope="col">Source</th>
|
|
55
54
|
<th scope="col">Created</th>
|
|
56
55
|
</tr>
|
|
@@ -60,36 +59,28 @@
|
|
|
60
59
|
<% pred = predecessor_of[v] %>
|
|
61
60
|
<tr>
|
|
62
61
|
<td>
|
|
62
|
+
<% summary = v.change_summary_against(pred) %>
|
|
63
63
|
<div class="ck-version-cell">
|
|
64
64
|
<div class="ck-version-cell__label">
|
|
65
65
|
<strong><%= v.version_label %></strong>
|
|
66
66
|
<% if v.current? %>
|
|
67
|
-
<span class="ck-
|
|
67
|
+
<span class="ck-chip">Published</span>
|
|
68
68
|
<% elsif v.draft? %>
|
|
69
|
-
<span class="ck-version-state">Draft</span>
|
|
70
69
|
<%= button_to "Publish", publish_draft_metric_path(@metric, draft_id: v.id),
|
|
71
70
|
method: :post, form_class: "inline-block",
|
|
72
|
-
class: "ck-chip ck-chip--
|
|
71
|
+
class: "ck-chip ck-chip--publish" %>
|
|
73
72
|
<% else %>
|
|
74
|
-
<span class="ck-version-state">Past</span>
|
|
75
73
|
<%= button_to "Make current", publish_draft_metric_path(@metric, draft_id: v.id),
|
|
76
74
|
method: :post, form_class: "inline-block",
|
|
77
75
|
class: "ck-chip ck-chip--publish",
|
|
78
76
|
data: { turbo_confirm: "Make #{v.version_label} the version to use? #{v.version_label} will be used in test runs using this metric now. Reviews you have already given stay with the version they were made against." } %>
|
|
79
77
|
<% end %>
|
|
80
78
|
</div>
|
|
79
|
+
<% if summary %>
|
|
80
|
+
<button type="button" class="ck-cell-link ck-cell-link--delta" title="What changed from <%= pred.version_label %>" onclick="document.getElementById('ck-mvdiff-<%= v.id %>').showModal()">Δ</button>
|
|
81
|
+
<% end %>
|
|
81
82
|
</div>
|
|
82
83
|
</td>
|
|
83
|
-
<td>
|
|
84
|
-
<% summary = v.change_summary_against(pred) %>
|
|
85
|
-
<% if summary %>
|
|
86
|
-
<button type="button" class="ck-change-link ck-change-link--<%= summary[:magnitude] %>"
|
|
87
|
-
title="Compare with <%= pred.version_label %>"
|
|
88
|
-
onclick="document.getElementById('ck-mvdiff-<%= v.id %>').showModal()"><%= summary[:label] %></button>
|
|
89
|
-
<% else %>
|
|
90
|
-
<span class="ck-meta-copy">—</span>
|
|
91
|
-
<% end %>
|
|
92
|
-
</td>
|
|
93
84
|
<td>
|
|
94
85
|
<% source_label, source_class = case v.source
|
|
95
86
|
when "suggestion" then ["AI calibration", "ck-source-chip ck-source-chip--ai"]
|
|
@@ -177,20 +168,11 @@
|
|
|
177
168
|
<% end %>
|
|
178
169
|
|
|
179
170
|
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
171
|
+
<% draft = @suggestion_draft || @edit_draft %>
|
|
180
172
|
<section class="ck-card ck-card--spaced">
|
|
181
|
-
<
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
stats: CompletionKit::MetricCalibrationStats.for(@metric),
|
|
185
|
-
metric: @metric %>
|
|
186
|
-
<% draft = @suggestion_draft || @edit_draft %>
|
|
187
|
-
<% if draft %>
|
|
188
|
-
<div class="ck-cal-foot">
|
|
189
|
-
<span class="ck-cal-foot__note">A draft improvement (<%= draft.version_label %>) is waiting in the Versions table above. Open its change to compare, then Publish to use it.</span>
|
|
190
|
-
</div>
|
|
191
|
-
<% elsif @improve_disagreement_count.positive? %>
|
|
192
|
-
<div class="ck-cal-foot">
|
|
193
|
-
<span class="ck-cal-foot__note"><%= pluralize(@improve_disagreement_count, "case") %> where a reviewer's score didn't match the judge.</span>
|
|
173
|
+
<div class="ck-prompt-preview__header">
|
|
174
|
+
<p class="ck-kicker">Calibration</p>
|
|
175
|
+
<% if draft.nil? && @improve_disagreement_count.positive? %>
|
|
194
176
|
<%= button_to suggest_variants_metric_path(@metric),
|
|
195
177
|
method: :post, form_class: "inline-block",
|
|
196
178
|
class: ck_button_classes(:light, variant: :outline) + " ck-button--sm",
|
|
@@ -198,6 +180,18 @@
|
|
|
198
180
|
<%= heroicon_tag "sparkles", variant: :outline, class: "ck-magic-icon", "aria-hidden": "true" %>
|
|
199
181
|
Suggest improvements
|
|
200
182
|
<% end %>
|
|
183
|
+
<% end %>
|
|
184
|
+
</div>
|
|
185
|
+
<p class="ck-meta-copy">This is a measure of how often the judge's scores match a human reviewer. Review its scores to build that signal; the scores you disagree with become the cases the model learns from when you improve the metric.</p>
|
|
186
|
+
<%= render "completion_kit/calibrations/trust_panel",
|
|
187
|
+
stats: CompletionKit::MetricCalibrationStats.for(@metric),
|
|
188
|
+
metric: @metric %>
|
|
189
|
+
<% if CompletionKit.config.judge_examples_from_reviews %>
|
|
190
|
+
<%= render "completion_kit/metrics/guiding_examples", metric: @metric, examples: @guiding_examples %>
|
|
191
|
+
<% end %>
|
|
192
|
+
<% if draft %>
|
|
193
|
+
<div class="ck-cal-foot">
|
|
194
|
+
<span class="ck-cal-foot__note">A draft improvement (<%= draft.version_label %>) is waiting in the Versions table above. Open its change to compare, then Publish to use it.</span>
|
|
201
195
|
</div>
|
|
202
196
|
<% end %>
|
|
203
197
|
</section>
|
|
@@ -100,12 +100,17 @@
|
|
|
100
100
|
<% @reviews.each do |review| %>
|
|
101
101
|
<% review_version = review.metric_version %>
|
|
102
102
|
<% stale = review.stale_against_current_judge? %>
|
|
103
|
-
<div class="ck-review-card
|
|
103
|
+
<div class="ck-review-card" id="<%= review.metric&.name&.parameterize || "review-#{review.id}" %>">
|
|
104
104
|
<div class="ck-review-card__header">
|
|
105
105
|
<span class="ck-review-card__metric"><% if review.metric %><%= link_to review.metric_name, metric_path(review.metric), class: "ck-link" %><% else %><%= review.metric_name %><% end %></span>
|
|
106
106
|
<div class="ck-inline">
|
|
107
107
|
<% if review_version %>
|
|
108
|
-
|
|
108
|
+
<% if stale %>
|
|
109
|
+
<% current_version = CompletionKit::MetricVersion.current.find_by(metric_id: review.metric_id) %>
|
|
110
|
+
<span class="ck-source-chip ck-source-chip--past" title="Scored on <%= review_version.version_label %>; the metric is now on <%= current_version.version_label %>, which may score this differently."><%= review_version.version_label %> → <%= current_version.version_label %></span>
|
|
111
|
+
<% else %>
|
|
112
|
+
<span class="ck-source-chip ck-source-chip--current" title="Scored on the metric's current version (<%= review_version.version_label %>)."><%= review_version.version_label %></span>
|
|
113
|
+
<% end %>
|
|
109
114
|
<% end %>
|
|
110
115
|
<% if review.ai_score %>
|
|
111
116
|
<% 5.times do |i| %>
|
|
@@ -116,9 +121,6 @@
|
|
|
116
121
|
<% end %>
|
|
117
122
|
</div>
|
|
118
123
|
</div>
|
|
119
|
-
<% if stale %>
|
|
120
|
-
<p class="ck-review-card__stale-note">Scored against a superseded version of this metric. Its current version may score this differently.</p>
|
|
121
|
-
<% end %>
|
|
122
124
|
<% if review.ai_feedback.present? %>
|
|
123
125
|
<p class="ck-review-card__feedback"><%= review.ai_feedback %></p>
|
|
124
126
|
<% end %>
|
data/config/routes.rb
CHANGED
data/lib/completion_kit.rb
CHANGED
|
@@ -13,6 +13,7 @@ module CompletionKit
|
|
|
13
13
|
attr_accessor :api_rate_limit, :web_rate_limit
|
|
14
14
|
attr_accessor :allow_loopback_endpoints
|
|
15
15
|
attr_accessor :judge_calibration_enabled
|
|
16
|
+
attr_accessor :judge_examples_from_reviews
|
|
16
17
|
|
|
17
18
|
def initialize
|
|
18
19
|
@openai_api_key = ENV['OPENAI_API_KEY']
|
|
@@ -29,6 +30,7 @@ module CompletionKit
|
|
|
29
30
|
|
|
30
31
|
@allow_loopback_endpoints = true
|
|
31
32
|
@judge_calibration_enabled = true
|
|
33
|
+
@judge_examples_from_reviews = false
|
|
32
34
|
|
|
33
35
|
@api_reference_authentication_partial = "completion_kit/api_reference/authentication"
|
|
34
36
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: completion-kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.9.0
|
|
4
|
+
version: 0.10.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Damien Bastin
|
|
@@ -311,6 +311,7 @@ files:
|
|
|
311
311
|
- app/services/completion_kit/mcp_tools/responses.rb
|
|
312
312
|
- app/services/completion_kit/mcp_tools/runs.rb
|
|
313
313
|
- app/services/completion_kit/mcp_tools/tags.rb
|
|
314
|
+
- app/services/completion_kit/metric_calibration_examples.rb
|
|
314
315
|
- app/services/completion_kit/metric_calibration_stats.rb
|
|
315
316
|
- app/services/completion_kit/metric_variant_generator.rb
|
|
316
317
|
- app/services/completion_kit/model_discovery_service.rb
|
|
@@ -350,6 +351,7 @@ files:
|
|
|
350
351
|
- app/views/completion_kit/metric_groups/new.html.erb
|
|
351
352
|
- app/views/completion_kit/metric_groups/show.html.erb
|
|
352
353
|
- app/views/completion_kit/metrics/_form.html.erb
|
|
354
|
+
- app/views/completion_kit/metrics/_guiding_examples.html.erb
|
|
353
355
|
- app/views/completion_kit/metrics/_rubric_diff.html.erb
|
|
354
356
|
- app/views/completion_kit/metrics/_rubric_hint.html.erb
|
|
355
357
|
- app/views/completion_kit/metrics/_starter_card.html.erb
|
|
@@ -430,6 +432,7 @@ files:
|
|
|
430
432
|
- db/migrate/20260528000001_rename_judge_version_to_metric_version.rb
|
|
431
433
|
- db/migrate/20260528000002_add_metric_version_to_reviews.rb
|
|
432
434
|
- db/migrate/20260529000001_remove_few_shot_examples_from_completion_kit_metrics.rb
|
|
435
|
+
- db/migrate/20260530000001_add_excluded_from_examples_to_completion_kit_calibrations.rb
|
|
433
436
|
- lib/completion-kit.rb
|
|
434
437
|
- lib/completion_kit.rb
|
|
435
438
|
- lib/completion_kit/concurrency_check.rb
|