completion-kit 0.10.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/stylesheets/completion_kit/application.css +67 -4
- data/app/controllers/completion_kit/metrics_controller.rb +9 -13
- data/app/jobs/completion_kit/metric_suggestion_job.rb +46 -0
- data/app/models/completion_kit/metric_version.rb +1 -0
- data/app/services/completion_kit/metric_improvement_validator.rb +101 -0
- data/app/views/completion_kit/calibrations/_trust_panel.html.erb +1 -4
- data/app/views/completion_kit/metrics/_suggestion_failed.html.erb +3 -0
- data/app/views/completion_kit/metrics/_suggestion_pending.html.erb +3 -0
- data/app/views/completion_kit/metrics/_suggestion_ready.html.erb +4 -0
- data/app/views/completion_kit/metrics/_validation_scoreboard.html.erb +12 -0
- data/app/views/completion_kit/metrics/show.html.erb +19 -5
- data/db/migrate/20260531000001_add_validation_summary_to_completion_kit_metric_versions.rb +5 -0
- data/lib/completion_kit/version.rb +1 -1
- metadata +8 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 0b32ec77fb60d07f40e4b83827c2510aaeb695c96c9c6df86e4b42a7ec57516b
|
|
4
|
+
data.tar.gz: ade912039e4942c87d73c13443bd405533eec2988478e02cd1ccb87550de2783
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: bb8664ea804d59e3761ab385d1af98ecf7d110dd7e68e7003e1a4b2c059c5e377a5e42d350a46310f58c6d8c41c0e31a6aa0cdaf8a6b50b4d9f419e6fa60e474
|
|
7
|
+
data.tar.gz: 3bbe72cf7e99a4ae899765829ee8bee83703885ebdfa5b6b9f7253f25f2373b1dd22aadfeb071f4b340c1b7d33dfc4e590e4f8a2df2239afbbc305de009af2cf
|
|
@@ -3619,10 +3619,9 @@ select.ck-input {
|
|
|
3619
3619
|
.ck-prompt-versions-table th:nth-child(3), .ck-prompt-versions-table td:nth-child(3) { width: 8rem; white-space: nowrap; }
|
|
3620
3620
|
.ck-prompt-versions-table th:nth-child(4), .ck-prompt-versions-table td:nth-child(4) { width: auto; }
|
|
3621
3621
|
|
|
3622
|
-
.ck-metric-versions-table th:nth-child(1), .ck-metric-versions-table td:nth-child(1) { width:
|
|
3623
|
-
.ck-metric-versions-table th:nth-child(2), .ck-metric-versions-table td:nth-child(2) { width:
|
|
3624
|
-
.ck-metric-versions-table th:nth-child(3), .ck-metric-versions-table td:nth-child(3) { width:
|
|
3625
|
-
.ck-metric-versions-table .ck-version-cell { justify-content: flex-start; gap: 0.75rem; }
|
|
3622
|
+
.ck-metric-versions-table th:nth-child(1), .ck-metric-versions-table td:nth-child(1) { width: 18rem; }
|
|
3623
|
+
.ck-metric-versions-table th:nth-child(2), .ck-metric-versions-table td:nth-child(2) { width: 16rem; white-space: nowrap; }
|
|
3624
|
+
.ck-metric-versions-table th:nth-child(3), .ck-metric-versions-table td:nth-child(3) { width: auto; white-space: nowrap; }
|
|
3626
3625
|
|
|
3627
3626
|
|
|
3628
3627
|
.ck-source-chip {
|
|
@@ -6001,3 +6000,67 @@ a.tag-mark {
|
|
|
6001
6000
|
width: 2rem;
|
|
6002
6001
|
height: 2rem;
|
|
6003
6002
|
}
|
|
6003
|
+
|
|
6004
|
+
.ck-suggestion-status:empty { display: none; }
|
|
6005
|
+
.ck-suggestion-status {
|
|
6006
|
+
margin-top: 10px;
|
|
6007
|
+
display: flex;
|
|
6008
|
+
align-items: baseline;
|
|
6009
|
+
gap: 10px;
|
|
6010
|
+
flex-wrap: wrap;
|
|
6011
|
+
}
|
|
6012
|
+
|
|
6013
|
+
.ck-scoreboard {
|
|
6014
|
+
margin-bottom: 16px;
|
|
6015
|
+
padding-bottom: 14px;
|
|
6016
|
+
border-bottom: 1px solid var(--ck-line);
|
|
6017
|
+
}
|
|
6018
|
+
.ck-scoreboard__headline {
|
|
6019
|
+
margin: 0 0 8px;
|
|
6020
|
+
font-size: 0.95rem;
|
|
6021
|
+
color: var(--ck-text);
|
|
6022
|
+
}
|
|
6023
|
+
.ck-scoreboard__was {
|
|
6024
|
+
font-family: var(--ck-mono);
|
|
6025
|
+
font-size: 0.74rem;
|
|
6026
|
+
color: var(--ck-muted);
|
|
6027
|
+
margin-left: 6px;
|
|
6028
|
+
}
|
|
6029
|
+
.ck-scoreboard__tally {
|
|
6030
|
+
list-style: none;
|
|
6031
|
+
margin: 0;
|
|
6032
|
+
padding: 0;
|
|
6033
|
+
display: flex;
|
|
6034
|
+
gap: 18px;
|
|
6035
|
+
}
|
|
6036
|
+
.ck-scoreboard__stat {
|
|
6037
|
+
font-family: var(--ck-mono);
|
|
6038
|
+
font-size: 0.72rem;
|
|
6039
|
+
letter-spacing: 0.06em;
|
|
6040
|
+
text-transform: uppercase;
|
|
6041
|
+
color: var(--ck-muted);
|
|
6042
|
+
}
|
|
6043
|
+
.ck-scoreboard__stat strong { color: var(--ck-text); }
|
|
6044
|
+
.ck-scoreboard__stat--break strong { color: var(--ck-warning); }
|
|
6045
|
+
.ck-scoreboard__note {
|
|
6046
|
+
margin: 8px 0 0;
|
|
6047
|
+
font-size: 0.78rem;
|
|
6048
|
+
color: var(--ck-muted);
|
|
6049
|
+
}
|
|
6050
|
+
.ck-version-change {
|
|
6051
|
+
display: inline-flex;
|
|
6052
|
+
align-items: baseline;
|
|
6053
|
+
gap: 0.6rem;
|
|
6054
|
+
}
|
|
6055
|
+
.ck-version-score {
|
|
6056
|
+
font-family: var(--ck-mono);
|
|
6057
|
+
font-size: 0.74rem;
|
|
6058
|
+
color: var(--ck-dim);
|
|
6059
|
+
}
|
|
6060
|
+
.ck-version-score__label {
|
|
6061
|
+
font-size: 0.6rem;
|
|
6062
|
+
letter-spacing: 0.08em;
|
|
6063
|
+
text-transform: uppercase;
|
|
6064
|
+
color: var(--ck-muted);
|
|
6065
|
+
margin-right: 0.2rem;
|
|
6066
|
+
}
|
|
@@ -117,26 +117,22 @@ module CompletionKit
|
|
|
117
117
|
|
|
118
118
|
def suggest_variants
|
|
119
119
|
target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
|
|
120
|
-
|
|
121
|
-
if
|
|
120
|
+
counts = Calibration.where(metric_id: @metric.id, verdict: %w[agree disagree]).group(:verdict).count
|
|
121
|
+
if counts["disagree"].to_i.zero?
|
|
122
122
|
redirect_to target, alert: "Mark at least one case as Disagree before asking the model to suggest a change."
|
|
123
123
|
return
|
|
124
124
|
end
|
|
125
125
|
|
|
126
|
-
|
|
126
|
+
MetricSuggestionJob.perform_later(@metric.id)
|
|
127
127
|
|
|
128
|
-
generator = MetricVariantGenerator.new(@metric, count: 1)
|
|
129
|
-
variants = generator.call
|
|
130
|
-
if variants.empty?
|
|
131
|
-
redirect_to target, alert: "The model returned no usable variants. Try again with a different model."
|
|
132
|
-
return
|
|
133
|
-
end
|
|
134
|
-
versions = generator.persist!(variants)
|
|
135
|
-
new_version = versions.max_by(&:version_number)
|
|
136
128
|
if params[:back_to] == "edit"
|
|
137
|
-
redirect_to
|
|
129
|
+
redirect_to metric_path(@metric), notice: "Drafting a change from your reviews. It will appear here once it's tested."
|
|
138
130
|
else
|
|
139
|
-
|
|
131
|
+
render turbo_stream: turbo_stream.replace(
|
|
132
|
+
"ck-suggestion-status-#{@metric.id}",
|
|
133
|
+
partial: "completion_kit/metrics/suggestion_pending",
|
|
134
|
+
locals: { metric: @metric, count: counts.values.sum }
|
|
135
|
+
)
|
|
140
136
|
end
|
|
141
137
|
end
|
|
142
138
|
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
require "faraday"
|
|
2
|
+
|
|
3
|
+
module CompletionKit
  # Background job that drafts a suggested metric change, validates the draft
  # against the reviewer's calibrations and broadcasts the outcome to the
  # metric page over a Turbo Stream.
  #
  # Broadcasts replace the element "ck-suggestion-status-<metric id>" on the
  # stream "metric_<metric id>_suggestion" with one of the suggestion partials.
  class MetricSuggestionJob < ApplicationJob
    FAILED_PARTIAL = "completion_kit/metrics/suggestion_failed"
    READY_PARTIAL = "completion_kit/metrics/suggestion_ready"

    queue_as :llm

    # Handler order matters: ActiveSupport::Rescuable searches handlers from the
    # most recently declared back to the earliest, and retry_on is built on
    # rescue_from. The catch-all therefore has to be declared BEFORE the
    # retry_on lines; declared after them it would match every error first and
    # the retries would never run.
    rescue_from(StandardError) { |error| report_failure(error) }

    # Transient transport errors are retried. Once the attempts are exhausted
    # the block runs, so the page still receives the "failed" status instead of
    # staying on "pending".
    retry_on Faraday::TimeoutError, Faraday::ConnectionFailed,
             wait: :polynomially_longer, attempts: 5 do |job, error|
      job.send(:report_failure, error)
    end

    retry_on CompletionKit::RateLimitError,
             wait: :polynomially_longer, attempts: 5 do |job, error|
      job.send(:report_failure, error)
    end

    # Generates one suggested variant for the metric, stores it as a draft,
    # attaches the validation summary and broadcasts the "ready" partial.
    # Does nothing when the metric no longer exists.
    #
    # @param metric_id [Integer] id of the Metric to draft a suggestion for
    # @return [void]
    def perform(metric_id)
      @metric = Metric.find_by(id: metric_id)
      return unless @metric

      # Drop earlier suggestion drafts for this metric before generating a new one.
      MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all

      generator = MetricVariantGenerator.new(@metric, count: 1)
      variants = generator.call
      if variants.empty?
        broadcast_status(@metric, partial: FAILED_PARTIAL, locals: { metric: @metric })
        return
      end

      draft = generator.persist!(variants).max_by(&:version_number)
      summary = MetricImprovementValidator.new(@metric, draft).call
      draft.update!(validation_summary: summary)

      broadcast_status(@metric, partial: READY_PARTIAL, locals: { metric: @metric, draft: draft })
    end

    private

    # Reports the error and, when the metric was loaded, tells the page that the
    # suggestion failed. The nil guard covers failures raised before @metric is
    # assigned; without it the handler itself would raise on `metric.id`.
    def report_failure(error)
      Rails.error.report(error, handled: true, context: { job: self.class.name })
      return unless @metric

      broadcast_status(@metric, partial: FAILED_PARTIAL, locals: { metric: @metric })
    end

    # Renders the partial and replaces the metric's suggestion status element
    # for every subscriber of the metric's suggestion stream.
    def broadcast_status(metric, partial:, locals:)
      html = CompletionKit::ApplicationController.render(partial: partial, locals: locals)
      Turbo::StreamsChannel.broadcast_replace_to(
        "metric_#{metric.id}_suggestion",
        target: "ck-suggestion-status-#{metric.id}",
        html: html
      )
    end
  end
end
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
module CompletionKit
  # Re-scores already reviewed responses with a candidate metric version and
  # summarizes how often the candidate lands on the score the reviewer accepts.
  #
  # Each tested review ends up with one outcome:
  #   "fix"       - reviewer disagreed before, candidate now matches the corrected score
  #   "still_off" - reviewer disagreed before, candidate still does not match
  #   "keep"      - reviewer agreed before, candidate still matches the judge's score
  #   "break"     - reviewer agreed before, candidate no longer matches
  class MetricImprovementValidator
    # Maximum number of (most recent) calibrations used as the answer key.
    ANSWER_KEY_LIMIT = 30

    # @param metric the metric being improved (only `id` is read here)
    # @param candidate the candidate version handed to the scorer
    # @param scorer [#call, nil] optional callable `(response, candidate) -> score`;
    #   defaults to re-scoring through JudgeService
    def initialize(metric, candidate, scorer: nil)
      @metric = metric
      @candidate = candidate
      @scorer = scorer || method(:rescore)
    end

    # Scores every answer-key entry with the candidate and returns the summary.
    #
    # Entries whose scoring raises, or yields no score at all, are left out of
    # "tested" instead of being counted as mismatches: a missing score says
    # nothing about the candidate, and `nil.to_i` would otherwise turn it into
    # a score of 0.
    #
    # @return [Hash{String => Object}] keys "total", "tested", "capped", "fixes",
    #   "keeps", "breaks", "still_off", "before", "after" and "rows"
    def call
      key = answer_key
      rows = key.filter_map do |entry|
        score = score_for(entry)
        classify(entry, score.to_i) unless score.nil?
      end
      summarize(rows, key.size, key_capped?)
    end

    private

    # Best-effort scoring: one failing judge call must not abort the whole
    # validation, so any StandardError marks the entry as untested (nil).
    def score_for(entry)
      @scorer.call(entry[:response], @candidate)
    rescue StandardError
      nil
    end

    # Builds the answer key from the agree/disagree calibrations recorded
    # against the metric's current version, newest first, capped at
    # ANSWER_KEY_LIMIT. Also records the uncapped count for #key_capped?.
    #
    # "position" is the score the candidate has to hit: the corrected score for
    # a disagreement, otherwise the ai_score of this metric's review.
    def answer_key
      current = MetricVersion.current.find_by(metric_id: @metric.id)
      return [] unless current

      base = Calibration.where(metric_id: @metric.id, metric_version_id: current.id, verdict: %w[agree disagree])
      @key_size_before_cap = base.count
      base.includes(response: :reviews)
          .order(created_at: :desc)
          .limit(ANSWER_KEY_LIMIT)
          .filter_map do |cal|
            response = cal.response
            # A calibration without a response (or without text) cannot be re-scored.
            next if response.nil? || !response.response_text.present?

            review = response.reviews.find { |r| r.metric_id == @metric.id }
            position = cal.verdict == "disagree" ? cal.corrected_score : review&.ai_score
            next if position.nil?

            { response: response, verdict: cal.verdict, position: position }
          end
    end

    # True when more calibrations existed than the answer key was allowed to use.
    def key_capped?
      @key_size_before_cap.to_i > ANSWER_KEY_LIMIT
    end

    # Turns one answer-key entry plus the candidate's score into a result row.
    def classify(entry, candidate_score)
      matched = candidate_score == entry[:position].to_i
      outcome = if entry[:verdict] == "disagree"
                  matched ? "fix" : "still_off"
                else
                  matched ? "keep" : "break"
                end
      {
        "response_id" => entry[:response].id,
        "verdict" => entry[:verdict],
        "position" => entry[:position].to_i,
        "candidate_score" => candidate_score,
        "outcome" => outcome
      }
    end

    # Aggregates the rows. "before" is the number of tested reviews the reviewer
    # agreed with; "after" is the number the candidate matches (fixes + keeps).
    def summarize(rows, total, capped)
      outcomes = rows.map { |row| row["outcome"] }.tally
      fixes = outcomes.fetch("fix", 0)
      keeps = outcomes.fetch("keep", 0)
      {
        "total" => total,
        "tested" => rows.size,
        "capped" => capped,
        "fixes" => fixes,
        "keeps" => keeps,
        "breaks" => outcomes.fetch("break", 0),
        "still_off" => outcomes.fetch("still_off", 0),
        "before" => rows.count { |row| row["verdict"] == "agree" },
        "after" => fixes + keeps,
        "rows" => rows
      }
    end

    # Default scorer: asks JudgeService to evaluate the response with the
    # candidate's instruction and rubric, using the judge model of the
    # response's run, and returns the :score entry of the result.
    def rescore(response, candidate)
      run = response.run
      config = ApiConfig.for_model(run.judge_model).merge(judge_model: run.judge_model)
      rubric_text = Metric.rubric_text_for(Metric.normalize_rubric_bands(candidate.rubric_bands))
      result = JudgeService.new(config).evaluate(
        response.response_text,
        response.expected_output,
        run.prompt&.template,
        criteria: candidate.instruction.to_s,
        rubric_text: rubric_text,
        input_data: response.input_data
      )
      result[:score]
    end
  end
end
|
|
@@ -31,10 +31,7 @@
|
|
|
31
31
|
<%= link_to "Review another score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
|
|
32
32
|
<% end %>
|
|
33
33
|
<% else %>
|
|
34
|
-
<span class="ck-cal-stat"><span class="ck-cal-stat__label">
|
|
35
|
-
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Margin</span> ±<%= (stats.margin * 100).round %> pt</span>
|
|
36
|
-
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Read</span> <%= stats.firm? ? "settled" : "early" %></span>
|
|
37
|
-
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Sample</span> <%= stats.sample_size %></span>
|
|
34
|
+
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Agrees with you</span> <strong class="ck-trust-line__figure">~<%= (stats.agreement_point * 100).round %>%</strong> of <%= stats.sample_size %> reviews</span>
|
|
38
35
|
<% if stats.borderline_rate && stats.borderline_rate > 0 %>
|
|
39
36
|
<% level = stats.borderline_rate > 0.30 ? "danger" : stats.borderline_rate > 0.15 ? "warning" : "ok" %>
|
|
40
37
|
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Unclear</span> <span class="ck-trust-line__borderline ck-trust-line__borderline--<%= level %>"><%= (stats.borderline_rate * 100).round %>%</span></span>
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
<div id="ck-suggestion-status-<%= metric.id %>" class="ck-suggestion-status ck-suggestion-status--ready">
|
|
2
|
+
<span class="ck-cal-foot__note">Drafted <%= draft.version_label %> and tested it against your reviews.</span>
|
|
3
|
+
<%= link_to "Compare and publish →", CompletionKit::Engine.routes.url_helpers.metric_path(metric, show_change: draft.id), class: "ck-cal-link" %>
|
|
4
|
+
</div>
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
<% s = summary %>
|
|
2
|
+
<div class="ck-scoreboard">
|
|
3
|
+
<p class="ck-scoreboard__headline">Matches you on <strong><%= s["after"] %> of <%= s["tested"] %></strong> of your reviews <span class="ck-scoreboard__was">was <%= s["before"] %> of <%= s["tested"] %></span></p>
|
|
4
|
+
<ul class="ck-scoreboard__tally">
|
|
5
|
+
<li class="ck-scoreboard__stat ck-scoreboard__stat--fix">Fixes <strong><%= s["fixes"] %></strong></li>
|
|
6
|
+
<li class="ck-scoreboard__stat ck-scoreboard__stat--keep">Keeps <strong><%= s["keeps"] %></strong></li>
|
|
7
|
+
<li class="ck-scoreboard__stat ck-scoreboard__stat--break">Breaks <strong><%= s["breaks"] %></strong></li>
|
|
8
|
+
</ul>
|
|
9
|
+
<% if s["capped"] %>
|
|
10
|
+
<p class="ck-scoreboard__note">Tested against your 30 most recent reviews.</p>
|
|
11
|
+
<% end %>
|
|
12
|
+
</div>
|
|
@@ -76,14 +76,20 @@
|
|
|
76
76
|
data: { turbo_confirm: "Make #{v.version_label} the version to use? #{v.version_label} will be used in test runs using this metric now. Reviews you have already given stay with the version they were made against." } %>
|
|
77
77
|
<% end %>
|
|
78
78
|
</div>
|
|
79
|
+
<% vs = v.validation_summary %>
|
|
79
80
|
<% if summary %>
|
|
80
|
-
<
|
|
81
|
+
<div class="ck-version-change">
|
|
82
|
+
<% if v.draft? && vs.present? %>
|
|
83
|
+
<span class="ck-version-score"><span class="ck-version-score__label">Match</span> <%= vs["after"] %>/<%= vs["tested"] %></span>
|
|
84
|
+
<% end %>
|
|
85
|
+
<button type="button" class="ck-cell-link ck-cell-link--delta" title="What changed from <%= pred.version_label %>" onclick="document.getElementById('ck-mvdiff-<%= v.id %>').showModal()">Δ</button>
|
|
86
|
+
</div>
|
|
81
87
|
<% end %>
|
|
82
88
|
</div>
|
|
83
89
|
</td>
|
|
84
90
|
<td>
|
|
85
91
|
<% source_label, source_class = case v.source
|
|
86
|
-
when "suggestion" then ["AI
|
|
92
|
+
when "suggestion" then ["AI suggestion", "ck-source-chip ck-source-chip--ai"]
|
|
87
93
|
when "edit" then ["Manual edit", "ck-source-chip ck-source-chip--manual"]
|
|
88
94
|
when "revert" then ["Reverted", "ck-source-chip ck-source-chip--revert"]
|
|
89
95
|
else ["Original", "ck-source-chip ck-source-chip--initial"]
|
|
@@ -110,6 +116,7 @@
|
|
|
110
116
|
<% @versions.each do |v| %>
|
|
111
117
|
<% pred = predecessor_of[v] %>
|
|
112
118
|
<% next unless v.change_summary_against(pred) %>
|
|
119
|
+
<% vs = v.validation_summary %>
|
|
113
120
|
<dialog id="ck-mvdiff-<%= v.id %>" class="ck-modal" onclick="if(event.target===this)this.close()">
|
|
114
121
|
<article class="ck-modal__panel" tabindex="-1" onclick="event.stopPropagation()">
|
|
115
122
|
<header class="ck-modal__header">
|
|
@@ -120,6 +127,9 @@
|
|
|
120
127
|
<button type="button" class="ck-modal__close" aria-label="Close" onclick="this.closest('dialog').close()">×</button>
|
|
121
128
|
</header>
|
|
122
129
|
<div class="ck-modal__body">
|
|
130
|
+
<% if v.draft? && vs.present? %>
|
|
131
|
+
<%= render "completion_kit/metrics/validation_scoreboard", summary: vs %>
|
|
132
|
+
<% end %>
|
|
123
133
|
<% if pred.instruction.to_s != v.instruction.to_s %>
|
|
124
134
|
<div class="ck-suggest-diff">
|
|
125
135
|
<div class="ck-suggest-diff__pane">
|
|
@@ -152,8 +162,10 @@
|
|
|
152
162
|
title: "Discard draft #{v.version_label}", "aria-label": "Discard draft #{v.version_label}",
|
|
153
163
|
data: { turbo_confirm: "Discard draft #{v.version_label}? This can't be undone." } do %><%= heroicon_tag "trash", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
|
|
154
164
|
<%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
|
|
165
|
+
<% net_negative = vs.present? && (vs["after"].to_i < vs["before"].to_i || vs["breaks"].to_i > vs["fixes"].to_i) %>
|
|
155
166
|
<%= button_to "Publish #{v.version_label} →", publish_draft_metric_path(@metric, draft_id: v.id),
|
|
156
|
-
method: :post, form_class: "inline-block", class: ck_button_classes(:dark)
|
|
167
|
+
method: :post, form_class: "inline-block", class: ck_button_classes(:dark),
|
|
168
|
+
data: net_negative ? { turbo_confirm: "This agrees with you less than the current version. Publish anyway?" } : {} %>
|
|
157
169
|
</span>
|
|
158
170
|
<% else %>
|
|
159
171
|
<span class="ck-modal__foot-note">Roll this metric back to this version.</span>
|
|
@@ -171,7 +183,7 @@
|
|
|
171
183
|
<% draft = @suggestion_draft || @edit_draft %>
|
|
172
184
|
<section class="ck-card ck-card--spaced">
|
|
173
185
|
<div class="ck-prompt-preview__header">
|
|
174
|
-
<p class="ck-kicker">
|
|
186
|
+
<p class="ck-kicker">Agreement</p>
|
|
175
187
|
<% if draft.nil? && @improve_disagreement_count.positive? %>
|
|
176
188
|
<%= button_to suggest_variants_metric_path(@metric),
|
|
177
189
|
method: :post, form_class: "inline-block",
|
|
@@ -182,7 +194,9 @@
|
|
|
182
194
|
<% end %>
|
|
183
195
|
<% end %>
|
|
184
196
|
</div>
|
|
185
|
-
|
|
197
|
+
<%= turbo_stream_from "metric_#{@metric.id}_suggestion" %>
|
|
198
|
+
<div id="ck-suggestion-status-<%= @metric.id %>" class="ck-suggestion-status"></div>
|
|
199
|
+
<p class="ck-meta-copy">How often the judge lands on the same score you would. Review its scores to build that signal, and improve the metric to raise it.</p>
|
|
186
200
|
<%= render "completion_kit/calibrations/trust_panel",
|
|
187
201
|
stats: CompletionKit::MetricCalibrationStats.for(@metric),
|
|
188
202
|
metric: @metric %>
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: completion-kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.10.0
|
|
4
|
+
version: 0.11.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Damien Bastin
|
|
@@ -266,6 +266,7 @@ files:
|
|
|
266
266
|
- app/jobs/completion_kit/application_job.rb
|
|
267
267
|
- app/jobs/completion_kit/generate_row_job.rb
|
|
268
268
|
- app/jobs/completion_kit/judge_review_job.rb
|
|
269
|
+
- app/jobs/completion_kit/metric_suggestion_job.rb
|
|
269
270
|
- app/jobs/completion_kit/model_discovery_job.rb
|
|
270
271
|
- app/jobs/completion_kit/run_completion_check_job.rb
|
|
271
272
|
- app/mailers/completion_kit/application_mailer.rb
|
|
@@ -313,6 +314,7 @@ files:
|
|
|
313
314
|
- app/services/completion_kit/mcp_tools/tags.rb
|
|
314
315
|
- app/services/completion_kit/metric_calibration_examples.rb
|
|
315
316
|
- app/services/completion_kit/metric_calibration_stats.rb
|
|
317
|
+
- app/services/completion_kit/metric_improvement_validator.rb
|
|
316
318
|
- app/services/completion_kit/metric_variant_generator.rb
|
|
317
319
|
- app/services/completion_kit/model_discovery_service.rb
|
|
318
320
|
- app/services/completion_kit/ollama_client.rb
|
|
@@ -355,6 +357,10 @@ files:
|
|
|
355
357
|
- app/views/completion_kit/metrics/_rubric_diff.html.erb
|
|
356
358
|
- app/views/completion_kit/metrics/_rubric_hint.html.erb
|
|
357
359
|
- app/views/completion_kit/metrics/_starter_card.html.erb
|
|
360
|
+
- app/views/completion_kit/metrics/_suggestion_failed.html.erb
|
|
361
|
+
- app/views/completion_kit/metrics/_suggestion_pending.html.erb
|
|
362
|
+
- app/views/completion_kit/metrics/_suggestion_ready.html.erb
|
|
363
|
+
- app/views/completion_kit/metrics/_validation_scoreboard.html.erb
|
|
358
364
|
- app/views/completion_kit/metrics/edit.html.erb
|
|
359
365
|
- app/views/completion_kit/metrics/index.html.erb
|
|
360
366
|
- app/views/completion_kit/metrics/new.html.erb
|
|
@@ -433,6 +439,7 @@ files:
|
|
|
433
439
|
- db/migrate/20260528000002_add_metric_version_to_reviews.rb
|
|
434
440
|
- db/migrate/20260529000001_remove_few_shot_examples_from_completion_kit_metrics.rb
|
|
435
441
|
- db/migrate/20260530000001_add_excluded_from_examples_to_completion_kit_calibrations.rb
|
|
442
|
+
- db/migrate/20260531000001_add_validation_summary_to_completion_kit_metric_versions.rb
|
|
436
443
|
- lib/completion-kit.rb
|
|
437
444
|
- lib/completion_kit.rb
|
|
438
445
|
- lib/completion_kit/concurrency_check.rb
|