completion-kit 0.5.44 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/stylesheets/completion_kit/application.css +13 -0
- data/app/controllers/completion_kit/metrics_controller.rb +14 -4
- data/app/controllers/completion_kit/runs_controller.rb +63 -1
- data/app/models/completion_kit/calibration.rb +0 -4
- data/app/models/completion_kit/metric_version.rb +0 -1
- data/app/models/completion_kit/run.rb +32 -0
- data/app/services/completion_kit/mcp_tools/judges.rb +2 -4
- data/app/services/completion_kit/metric_variant_generator.rb +20 -6
- data/app/views/completion_kit/calibrations/_trust_panel.html.erb +6 -1
- data/app/views/completion_kit/metrics/show.html.erb +1 -1
- data/app/views/completion_kit/runs/_actions.html.erb +1 -0
- data/app/views/completion_kit/runs/compare.html.erb +85 -0
- data/app/views/completion_kit/runs/compare_picker.html.erb +39 -0
- data/app/views/completion_kit/runs/show.html.erb +8 -2
- data/config/routes.rb +2 -0
- data/lib/completion_kit/version.rb +1 -1
- metadata +3 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: d8454bbb11d5064ca0c6d4355c780425a28198280dffe7dd424d266fbeef6a09
|
|
4
|
+
data.tar.gz: 24c1da76e1e9118d5e2a732e8e45b684f588f553ad4d8bac89e239bc22c953c3
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 6fbc5b8047a20240897e19c389bb3f6104d3e2a219794d190183b5433e14d524bb692eb0a27b36ab6471e596c2b9b8af2d70a4f56ae81aa327726fe92f092eb9
|
|
7
|
+
data.tar.gz: a3399003a48836fd457a8c8b488305fad6d006596c6f940a82b232e2a731dfbc3df5ded4ba8bc16b94690a88d56baaaff6edc6801f47f2bbf422ca8fb74270df
|
|
@@ -2834,6 +2834,19 @@ select.ck-input {
|
|
|
2834
2834
|
}
|
|
2835
2835
|
.ck-stale-versions-banner__body { min-width: 0; flex: 1 1 320px; }
|
|
2836
2836
|
.ck-stale-versions-banner .ck-kicker { color: var(--ck-warning); }
|
|
2837
|
+
|
|
2838
|
+
.ck-delta {
|
|
2839
|
+
font-family: var(--ck-mono);
|
|
2840
|
+
font-size: 0.78rem;
|
|
2841
|
+
letter-spacing: 0.04em;
|
|
2842
|
+
padding: 2px 6px;
|
|
2843
|
+
border-radius: 4px;
|
|
2844
|
+
}
|
|
2845
|
+
.ck-delta--positive { color: var(--ck-success); background: var(--ck-success-soft); }
|
|
2846
|
+
.ck-delta--negative { color: var(--ck-danger); background: var(--ck-danger-soft); }
|
|
2847
|
+
.ck-delta--zero { color: var(--ck-dim); }
|
|
2848
|
+
|
|
2849
|
+
.ck-run-compare-table td { vertical-align: middle; }
|
|
2837
2850
|
.ck-review-card__stale-note {
|
|
2838
2851
|
margin: 0.4rem 0 0;
|
|
2839
2852
|
font-family: var(--ck-mono);
|
|
@@ -42,8 +42,7 @@ module CompletionKit
|
|
|
42
42
|
.limit(50)
|
|
43
43
|
@edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
|
|
44
44
|
@suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
|
|
45
|
-
@improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree",
|
|
46
|
-
metric_version_id: @published_metric_version.id).count
|
|
45
|
+
@improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
|
|
47
46
|
@versions = MetricVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
|
|
48
47
|
end
|
|
49
48
|
|
|
@@ -157,9 +156,20 @@ module CompletionKit
|
|
|
157
156
|
return
|
|
158
157
|
end
|
|
159
158
|
|
|
159
|
+
was_published_already = version.published?
|
|
160
|
+
reverting = was_published_already && !version.current?
|
|
161
|
+
previously_current = MetricVersion.current.find_by(metric_id: @metric.id)
|
|
162
|
+
|
|
160
163
|
version.publish!
|
|
161
|
-
|
|
162
|
-
|
|
164
|
+
|
|
165
|
+
if reverting
|
|
166
|
+
prior_label = previously_current.version_label
|
|
167
|
+
redirect_to metric_path(@metric),
|
|
168
|
+
notice: "Reverted to #{@metric.name} #{version.version_label}. Pinned cases still flow to the judge, and calibration verdicts collected against #{prior_label} stay tied to it."
|
|
169
|
+
else
|
|
170
|
+
redirect_to metric_path(@metric),
|
|
171
|
+
notice: "#{@metric.name} #{version.version_label} is now the published version."
|
|
172
|
+
end
|
|
163
173
|
end
|
|
164
174
|
|
|
165
175
|
def add_few_shot
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
2
|
class RunsController < ApplicationController
|
|
3
3
|
include CompletionKit::TagFiltering
|
|
4
|
-
before_action :set_run, only: [:show, :edit, :update, :destroy, :generate, :suggest, :retry_failures, :rerun, :refresh_status]
|
|
4
|
+
before_action :set_run, only: [:show, :edit, :update, :destroy, :generate, :suggest, :retry_failures, :rerun, :regrade, :refresh_status, :compare]
|
|
5
5
|
before_action :load_form_collections, only: [:new, :edit, :create, :update]
|
|
6
6
|
|
|
7
7
|
def index
|
|
@@ -78,6 +78,29 @@ module CompletionKit
|
|
|
78
78
|
end
|
|
79
79
|
end
|
|
80
80
|
|
|
81
|
+
def compare
|
|
82
|
+
other_id = params[:with]
|
|
83
|
+
if other_id.blank?
|
|
84
|
+
@other_runs = Run.where(dataset_id: @run.dataset_id, prompt_id: @run.prompt_id)
|
|
85
|
+
.where.not(id: @run.id)
|
|
86
|
+
.order(created_at: :desc)
|
|
87
|
+
.limit(50)
|
|
88
|
+
return render(:compare_picker)
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
@other_run = Run.find(other_id)
|
|
92
|
+
@comparison = build_run_comparison(@run, @other_run)
|
|
93
|
+
render(:compare)
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
def regrade
|
|
97
|
+
if @run.regrade!
|
|
98
|
+
redirect_to run_path(@run), notice: "Re-grading existing responses with the current judge."
|
|
99
|
+
else
|
|
100
|
+
redirect_to run_path(@run), alert: "Nothing to re-grade. The run has no succeeded responses or no metrics attached."
|
|
101
|
+
end
|
|
102
|
+
end
|
|
103
|
+
|
|
81
104
|
def rerun
|
|
82
105
|
new_run = Run.create!(
|
|
83
106
|
prompt_id: @run.prompt_id,
|
|
@@ -163,6 +186,45 @@ module CompletionKit
|
|
|
163
186
|
@run = Run.find(params[:id])
|
|
164
187
|
end
|
|
165
188
|
|
|
189
|
+
def build_run_comparison(left, right)
|
|
190
|
+
left_responses = left.responses.includes(:reviews).order(:row_index, :id)
|
|
191
|
+
right_responses = right.responses.includes(:reviews).order(:row_index, :id)
|
|
192
|
+
right_by_input = right_responses.each_with_object({}) { |r, h| h[r.input_data.to_s] ||= r }
|
|
193
|
+
|
|
194
|
+
all_reviews = left_responses.flat_map(&:reviews) + right_responses.flat_map(&:reviews)
|
|
195
|
+
metric_ids = all_reviews.map(&:metric_id).compact.uniq
|
|
196
|
+
metric_versions = MetricVersion.where(id: all_reviews.map(&:metric_version_id).compact.uniq).index_by(&:id)
|
|
197
|
+
|
|
198
|
+
rows = left_responses.map do |lr|
|
|
199
|
+
rr = right_by_input[lr.input_data.to_s]
|
|
200
|
+
{
|
|
201
|
+
left_response: lr,
|
|
202
|
+
right_response: rr,
|
|
203
|
+
per_metric: metric_ids.map do |mid|
|
|
204
|
+
l_review = lr.reviews.find { |r| r.metric_id == mid }
|
|
205
|
+
r_review = rr && rr.reviews.find { |r| r.metric_id == mid }
|
|
206
|
+
next nil if l_review.nil? && r_review.nil?
|
|
207
|
+
anchor = l_review || r_review
|
|
208
|
+
{
|
|
209
|
+
metric_id: mid,
|
|
210
|
+
metric_name: anchor.metric_name,
|
|
211
|
+
left_score: l_review ? l_review.ai_score : nil,
|
|
212
|
+
right_score: r_review ? r_review.ai_score : nil,
|
|
213
|
+
left_version_label: version_label_for(l_review, metric_versions),
|
|
214
|
+
right_version_label: version_label_for(r_review, metric_versions),
|
|
215
|
+
delta: (l_review&.ai_score && r_review&.ai_score) ? (r_review.ai_score.to_f - l_review.ai_score.to_f).round(2) : nil
|
|
216
|
+
}
|
|
217
|
+
end.compact
|
|
218
|
+
}
|
|
219
|
+
end
|
|
220
|
+
{ rows: rows, metric_ids: metric_ids }
|
|
221
|
+
end
|
|
222
|
+
|
|
223
|
+
def version_label_for(review, metric_versions)
|
|
224
|
+
return nil if review.nil? || review.metric_version_id.nil?
|
|
225
|
+
metric_versions[review.metric_version_id]&.version_label
|
|
226
|
+
end
|
|
227
|
+
|
|
166
228
|
def load_form_collections
|
|
167
229
|
@prompts = Prompt.order(:name)
|
|
168
230
|
@datasets = Dataset.order(:name)
|
|
@@ -7,10 +7,6 @@ module CompletionKit
|
|
|
7
7
|
belongs_to :metric
|
|
8
8
|
belongs_to :metric_version
|
|
9
9
|
|
|
10
|
-
alias_attribute :judge_version_id, :metric_version_id
|
|
11
|
-
alias_method :judge_version, :metric_version
|
|
12
|
-
alias_method :judge_version=, :metric_version=
|
|
13
|
-
|
|
14
10
|
validates :verdict, presence: true, inclusion: { in: VERDICTS }
|
|
15
11
|
validates :response_id,
|
|
16
12
|
uniqueness: { scope: [:metric_id, :created_by] }
|
|
@@ -179,6 +179,38 @@ module CompletionKit
|
|
|
179
179
|
start!
|
|
180
180
|
end
|
|
181
181
|
|
|
182
|
+
def regrade!
|
|
183
|
+
grading_metrics = metrics
|
|
184
|
+
return false if grading_metrics.empty? || !judge_configured?
|
|
185
|
+
|
|
186
|
+
eligible_responses = responses.where(status: "succeeded").where.not(response_text: nil)
|
|
187
|
+
response_ids = eligible_responses.pluck(:id)
|
|
188
|
+
return false if response_ids.empty?
|
|
189
|
+
|
|
190
|
+
transaction do
|
|
191
|
+
Review.where(response_id: response_ids).update_all(
|
|
192
|
+
status: "pending",
|
|
193
|
+
attempts: 0,
|
|
194
|
+
metric_version_id: nil,
|
|
195
|
+
ai_score: nil,
|
|
196
|
+
ai_feedback: nil,
|
|
197
|
+
error_provider: nil,
|
|
198
|
+
error_class: nil,
|
|
199
|
+
error_status: nil,
|
|
200
|
+
error_message: nil
|
|
201
|
+
)
|
|
202
|
+
update!(status: "running", failure_summary: nil, error_message: nil)
|
|
203
|
+
|
|
204
|
+
response_ids.each do |rid|
|
|
205
|
+
grading_metrics.each { |m| JudgeReviewJob.perform_later(rid, m.id) }
|
|
206
|
+
end
|
|
207
|
+
RunCompletionCheckJob.perform_later(id)
|
|
208
|
+
end
|
|
209
|
+
|
|
210
|
+
broadcast_ui
|
|
211
|
+
true
|
|
212
|
+
end
|
|
213
|
+
|
|
182
214
|
def progress_snapshot
|
|
183
215
|
generated_done = responses.where(status: "succeeded").count
|
|
184
216
|
generated_failed = responses.where(status: "failed").count
|
|
@@ -75,10 +75,8 @@ module CompletionKit
|
|
|
75
75
|
|
|
76
76
|
def self.compare(args)
|
|
77
77
|
metric = CompletionKit::Metric.find(args["metric_id"])
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
a = CompletionKit::MetricVersion.where(metric_id: metric.id).find(a_id)
|
|
81
|
-
b = CompletionKit::MetricVersion.where(metric_id: metric.id).find(b_id)
|
|
78
|
+
a = CompletionKit::MetricVersion.where(metric_id: metric.id).find(args["metric_version_a_id"])
|
|
79
|
+
b = CompletionKit::MetricVersion.where(metric_id: metric.id).find(args["metric_version_b_id"])
|
|
82
80
|
stats_a = CompletionKit::MetricCalibrationStats.for(metric, metric_version: a)
|
|
83
81
|
stats_b = CompletionKit::MetricCalibrationStats.for(metric, metric_version: b)
|
|
84
82
|
text_result({
|
|
@@ -43,6 +43,7 @@ module CompletionKit
|
|
|
43
43
|
def build_meta_prompt
|
|
44
44
|
disagreements = MetricCalibrationExamples.disagreements_for(@metric)
|
|
45
45
|
borderlines = MetricCalibrationExamples.borderlines_for(@metric)
|
|
46
|
+
pinned_examples = Array(@metric.few_shot_examples)
|
|
46
47
|
sections = []
|
|
47
48
|
sections << "You are an expert evaluator. The judge below is misaligned with humans. Propose #{@count == 1 ? "a single" : "#{@count}"} concrete rewrite that closes the gap."
|
|
48
49
|
sections << ""
|
|
@@ -77,6 +78,18 @@ module CompletionKit
|
|
|
77
78
|
sections << ""
|
|
78
79
|
end
|
|
79
80
|
end
|
|
81
|
+
if pinned_examples.any?
|
|
82
|
+
sections << "## Pinned cases the judge already references"
|
|
83
|
+
sections << "These are cases the operator pinned for the judge to remember. The improved rubric must remain consistent with these — that is, the new instruction + rubric should produce roughly the human_score on these inputs, not the judge_score."
|
|
84
|
+
pinned_examples.each_with_index do |ex, i|
|
|
85
|
+
sections << "### Pinned #{i + 1}"
|
|
86
|
+
sections << "Input: #{ex["input"].to_s.truncate(200)}"
|
|
87
|
+
sections << "Output: #{ex["response"].to_s.truncate(200)}"
|
|
88
|
+
sections << "Judge previously said #{ex["judge_score"]}/5: #{ex["judge_feedback"].to_s.truncate(160)}"
|
|
89
|
+
sections << "Human said #{ex["human_score"]}/5: #{ex["human_note"].to_s.truncate(160)}"
|
|
90
|
+
sections << ""
|
|
91
|
+
end
|
|
92
|
+
end
|
|
80
93
|
sections << "## Task"
|
|
81
94
|
sections << "Make one substantive change. Don't just reword. If the disagreements look like instruction problems, rewrite the instruction. If they look like rubric problems (overlapping bands, undefined edge cases), rewrite the rubric. Rewrite both if both are wrong."
|
|
82
95
|
sections << ""
|
|
@@ -133,13 +146,14 @@ module CompletionKit
|
|
|
133
146
|
end
|
|
134
147
|
|
|
135
148
|
def calibrations_for(metric, verdict:, limit:)
|
|
136
|
-
|
|
149
|
+
base = Calibration.where(metric_id: metric.id, verdict: verdict)
|
|
137
150
|
current_version = MetricVersion.current.find_by(metric_id: metric.id)
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
151
|
+
scoped = current_version ? base.where(metric_version_id: current_version.id) : base
|
|
152
|
+
effective = scoped.exists? ? scoped : base
|
|
153
|
+
effective.includes(response: :reviews)
|
|
154
|
+
.order(created_at: :desc)
|
|
155
|
+
.limit(limit)
|
|
156
|
+
.map do |cal|
|
|
143
157
|
review = cal.response.reviews.find { |r| r.metric_id == metric.id }
|
|
144
158
|
{
|
|
145
159
|
input: cal.response.input_data,
|
|
@@ -15,12 +15,17 @@
|
|
|
15
15
|
.where.not(id: verdicted_ids)
|
|
16
16
|
.order(created_at: :desc).first
|
|
17
17
|
end %>
|
|
18
|
+
<% prior_version_verdicts = if stats.sample_size.zero? && metric && current_metric_version
|
|
19
|
+
CompletionKit::Calibration.where(metric_id: metric.id).where.not(metric_version_id: current_metric_version.id).count
|
|
20
|
+
else
|
|
21
|
+
0
|
|
22
|
+
end %>
|
|
18
23
|
|
|
19
24
|
<p class="ck-trust-line ck-trust-line--<%= stats.gate %>">
|
|
20
25
|
<span class="ck-trust-line__label"><%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true" %>Calibration</span>
|
|
21
26
|
<% if stats.sample_size.zero? %>
|
|
22
27
|
<span class="ck-trust-line__state">Not measured yet.</span>
|
|
23
|
-
<span class="ck-trust-line__hint">Needs <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts on the judge's scores.<% if target_response %>
|
|
28
|
+
<span class="ck-trust-line__hint">Needs <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts on the judge's scores.<% if prior_version_verdicts > 0 %> (<%= pluralize(prior_version_verdicts, "verdict") %> on prior versions, tied to that version's history.)<% end %><% if target_response %>
|
|
24
29
|
<%= link_to "Give a verdict →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-trust-line__link" %>
|
|
25
30
|
<% else %> Run this metric on a dataset, then give a verdict.<% end %></span>
|
|
26
31
|
<% elsif stats.counter_only? %>
|
|
@@ -165,7 +165,7 @@
|
|
|
165
165
|
<p class="ck-kicker">Cases to learn from</p>
|
|
166
166
|
<span class="ck-chip"><%= pluralize(@disagreements.size, "case") %></span>
|
|
167
167
|
</div>
|
|
168
|
-
<% mixed_versions = @disagreements.
|
|
168
|
+
<% mixed_versions = @disagreements.any? { |c| c.metric_version_id != @published_metric_version.id } %>
|
|
169
169
|
<p class="ck-meta-copy">Cases where a reviewer's score didn't match the judge's. Pin useful ones with <strong>Remember this</strong> so the judge sees them next time it grades<%= " (pins flow into the current judge regardless of which version produced the verdict)" if mixed_versions %>.</p>
|
|
170
170
|
<% existing_ids = Array(@metric.few_shot_examples).map { |fs| fs["calibration_id"] } %>
|
|
171
171
|
<ul class="ck-disagreement-list">
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
<%= button_to "Retry", generate_run_path(run), method: :post, class: ck_button_classes(:light, variant: :outline), form_class: "inline-block" %>
|
|
12
12
|
<%= button_to "Re-run as new", rerun_run_path(run), method: :post, class: ck_button_classes(:dark), form_class: "inline-block" %>
|
|
13
13
|
<% elsif run.status == "completed" %>
|
|
14
|
+
<%= link_to "Compare", compare_run_path(run), class: ck_button_classes(:light, variant: :outline) %>
|
|
14
15
|
<%= button_to "Re-run", rerun_run_path(run), method: :post, class: ck_button_classes(:dark), form_class: "inline-block" %>
|
|
15
16
|
<% end %>
|
|
16
17
|
<% end %>
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
<ol class="ck-breadcrumb">
|
|
2
|
+
<li><%= link_to "Runs", runs_path %></li>
|
|
3
|
+
<li><%= link_to @run.name, run_path(@run) %></li>
|
|
4
|
+
<li>vs <%= @other_run.name %></li>
|
|
5
|
+
</ol>
|
|
6
|
+
|
|
7
|
+
<section class="ck-page-header">
|
|
8
|
+
<div>
|
|
9
|
+
<h1 class="ck-title">Comparing runs</h1>
|
|
10
|
+
<p class="ck-meta-copy"><strong>A</strong>: <%= link_to @run.name, run_path(@run), class: "ck-link" %> · <strong>B</strong>: <%= link_to @other_run.name, run_path(@other_run), class: "ck-link" %></p>
|
|
11
|
+
</div>
|
|
12
|
+
<div class="ck-actions">
|
|
13
|
+
<%= link_to "Pick another", compare_run_path(@run), class: ck_button_classes(:light, variant: :outline) %>
|
|
14
|
+
</div>
|
|
15
|
+
</section>
|
|
16
|
+
|
|
17
|
+
<% if @comparison[:rows].empty? %>
|
|
18
|
+
<div class="ck-empty">
|
|
19
|
+
<p>No responses to compare yet.</p>
|
|
20
|
+
</div>
|
|
21
|
+
<% else %>
|
|
22
|
+
<table class="ck-results-table ck-run-compare-table">
|
|
23
|
+
<thead>
|
|
24
|
+
<tr>
|
|
25
|
+
<th scope="col">Case</th>
|
|
26
|
+
<th scope="col">Metric</th>
|
|
27
|
+
<th scope="col">A score</th>
|
|
28
|
+
<th scope="col">B score</th>
|
|
29
|
+
<th scope="col">Δ</th>
|
|
30
|
+
<th scope="col">A version</th>
|
|
31
|
+
<th scope="col">B version</th>
|
|
32
|
+
</tr>
|
|
33
|
+
</thead>
|
|
34
|
+
<tbody>
|
|
35
|
+
<% @comparison[:rows].each do |row| %>
|
|
36
|
+
<% case_label = ((row[:left_response].row_index || 0) + 1).to_s %>
|
|
37
|
+
<% row[:per_metric].each_with_index do |pm, idx| %>
|
|
38
|
+
<tr>
|
|
39
|
+
<% if idx == 0 %>
|
|
40
|
+
<td rowspan="<%= row[:per_metric].size %>">
|
|
41
|
+
<%= link_to case_label, run_response_path(@run, row[:left_response]), class: "ck-link" %>
|
|
42
|
+
<% if row[:right_response] %>
|
|
43
|
+
<span class="ck-meta-copy">/ <%= link_to "B", run_response_path(@other_run, row[:right_response]), class: "ck-link" %></span>
|
|
44
|
+
<% end %>
|
|
45
|
+
</td>
|
|
46
|
+
<% end %>
|
|
47
|
+
<td><%= pm[:metric_name] %></td>
|
|
48
|
+
<td>
|
|
49
|
+
<% if pm[:left_score] %>
|
|
50
|
+
<span class="<%= ck_badge_classes(ck_score_kind(pm[:left_score].to_f)) %>"><%= pm[:left_score] %></span>
|
|
51
|
+
<% else %>
|
|
52
|
+
<span class="ck-meta-copy">—</span>
|
|
53
|
+
<% end %>
|
|
54
|
+
</td>
|
|
55
|
+
<td>
|
|
56
|
+
<% if pm[:right_score] %>
|
|
57
|
+
<span class="<%= ck_badge_classes(ck_score_kind(pm[:right_score].to_f)) %>"><%= pm[:right_score] %></span>
|
|
58
|
+
<% else %>
|
|
59
|
+
<span class="ck-meta-copy">—</span>
|
|
60
|
+
<% end %>
|
|
61
|
+
</td>
|
|
62
|
+
<td>
|
|
63
|
+
<% if pm[:delta] %>
|
|
64
|
+
<% delta_class = pm[:delta] > 0 ? "ck-delta--positive" : pm[:delta] < 0 ? "ck-delta--negative" : "ck-delta--zero" %>
|
|
65
|
+
<span class="ck-delta <%= delta_class %>"><%= pm[:delta].positive? ? "+#{pm[:delta]}" : pm[:delta].to_s %></span>
|
|
66
|
+
<% else %>
|
|
67
|
+
<span class="ck-meta-copy">—</span>
|
|
68
|
+
<% end %>
|
|
69
|
+
</td>
|
|
70
|
+
<td>
|
|
71
|
+
<% if pm[:left_version_label] %>
|
|
72
|
+
<span class="ck-source-chip ck-source-chip--current"><%= pm[:left_version_label] %></span>
|
|
73
|
+
<% end %>
|
|
74
|
+
</td>
|
|
75
|
+
<td>
|
|
76
|
+
<% if pm[:right_version_label] %>
|
|
77
|
+
<span class="ck-source-chip ck-source-chip--current"><%= pm[:right_version_label] %></span>
|
|
78
|
+
<% end %>
|
|
79
|
+
</td>
|
|
80
|
+
</tr>
|
|
81
|
+
<% end %>
|
|
82
|
+
<% end %>
|
|
83
|
+
</tbody>
|
|
84
|
+
</table>
|
|
85
|
+
<% end %>
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
<ol class="ck-breadcrumb">
|
|
2
|
+
<li><%= link_to "Runs", runs_path %></li>
|
|
3
|
+
<li><%= link_to @run.name, run_path(@run) %></li>
|
|
4
|
+
<li>Compare</li>
|
|
5
|
+
</ol>
|
|
6
|
+
|
|
7
|
+
<section class="ck-page-header">
|
|
8
|
+
<div>
|
|
9
|
+
<h1 class="ck-title">Compare with another run</h1>
|
|
10
|
+
<p class="ck-lead">Pick a run on the same dataset and prompt to see per-case score deltas side by side.</p>
|
|
11
|
+
</div>
|
|
12
|
+
</section>
|
|
13
|
+
|
|
14
|
+
<% if @other_runs.any? %>
|
|
15
|
+
<table class="ck-results-table">
|
|
16
|
+
<thead>
|
|
17
|
+
<tr>
|
|
18
|
+
<th scope="col">Run</th>
|
|
19
|
+
<th scope="col">Judge</th>
|
|
20
|
+
<th scope="col">Created</th>
|
|
21
|
+
<th scope="col"></th>
|
|
22
|
+
</tr>
|
|
23
|
+
</thead>
|
|
24
|
+
<tbody>
|
|
25
|
+
<% @other_runs.each do |other| %>
|
|
26
|
+
<tr>
|
|
27
|
+
<td><%= link_to other.name, run_path(other), class: "ck-link" %></td>
|
|
28
|
+
<td class="ck-meta-copy"><%= other.judge_model %></td>
|
|
29
|
+
<td class="ck-meta-copy"><time datetime="<%= other.created_at.utc.iso8601 %>"><%= time_ago_in_words(other.created_at) %> ago</time></td>
|
|
30
|
+
<td class="ck-results-table__arrow"><%= link_to "Compare →", compare_run_path(@run, with: other.id), class: "ck-link" %></td>
|
|
31
|
+
</tr>
|
|
32
|
+
<% end %>
|
|
33
|
+
</tbody>
|
|
34
|
+
</table>
|
|
35
|
+
<% else %>
|
|
36
|
+
<div class="ck-empty">
|
|
37
|
+
<p>No other runs on this dataset + prompt combination yet. <%= link_to "Re-run from this one", rerun_run_path(@run), method: :post, class: "ck-link" %> to create one.</p>
|
|
38
|
+
</div>
|
|
39
|
+
<% end %>
|
|
@@ -32,10 +32,16 @@
|
|
|
32
32
|
</p>
|
|
33
33
|
</div>
|
|
34
34
|
<% if @run.status == "completed" %>
|
|
35
|
-
<%= button_to "Re-run
|
|
35
|
+
<%= button_to "Re-run from scratch",
|
|
36
36
|
rerun_run_path(@run), method: :post,
|
|
37
|
+
class: ck_button_classes(:light, variant: :outline), form_class: "inline-block",
|
|
38
|
+
title: "Create a new run that regenerates responses and grades them with the current judge.",
|
|
39
|
+
data: { turbo_confirm: "Create a new run with fresh responses and the current judge? The original run stays as a record." } %>
|
|
40
|
+
<%= button_to "Re-grade with current judge",
|
|
41
|
+
regrade_run_path(@run), method: :post,
|
|
37
42
|
class: ck_button_classes(:dark), form_class: "inline-block",
|
|
38
|
-
|
|
43
|
+
title: "Re-judge this run's existing responses against the current judge. Faster and cheaper than re-running.",
|
|
44
|
+
data: { turbo_confirm: "Re-judge this run's existing responses against the current judge?" } %>
|
|
39
45
|
<% end %>
|
|
40
46
|
</div>
|
|
41
47
|
<% end %>
|
data/config/routes.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: completion-kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.5.44
|
|
4
|
+
version: 0.6.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Damien Bastin
|
|
@@ -377,6 +377,8 @@ files:
|
|
|
377
377
|
- app/views/completion_kit/runs/_status_header.html.erb
|
|
378
378
|
- app/views/completion_kit/runs/_status_panel.html.erb
|
|
379
379
|
- app/views/completion_kit/runs/_table.html.erb
|
|
380
|
+
- app/views/completion_kit/runs/compare.html.erb
|
|
381
|
+
- app/views/completion_kit/runs/compare_picker.html.erb
|
|
380
382
|
- app/views/completion_kit/runs/edit.html.erb
|
|
381
383
|
- app/views/completion_kit/runs/index.html.erb
|
|
382
384
|
- app/views/completion_kit/runs/new.html.erb
|