completion-kit 0.18.1 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Rakefile +7 -0
- data/app/assets/stylesheets/completion_kit/application.css +6 -1
- data/app/controllers/completion_kit/api/v1/runs_controller.rb +5 -1
- data/app/controllers/completion_kit/dashboard_controller.rb +1 -0
- data/app/controllers/completion_kit/dashboard_dismissals_controller.rb +1 -0
- data/app/controllers/completion_kit/responses_controller.rb +2 -15
- data/app/controllers/completion_kit/runs_controller.rb +7 -16
- data/app/controllers/concerns/completion_kit/response_ordering.rb +25 -0
- data/app/helpers/completion_kit/application_helper.rb +28 -0
- data/app/services/completion_kit/dashboard_stats.rb +22 -0
- data/app/services/completion_kit/run_comparison.rb +13 -0
- data/app/views/completion_kit/dashboard/_failing_checks_card.html.erb +19 -0
- data/app/views/completion_kit/dashboard/show.html.erb +4 -1
- data/app/views/completion_kit/prompts/index.html.erb +6 -1
- data/app/views/completion_kit/prompts/show.html.erb +8 -2
- data/app/views/completion_kit/responses/show.html.erb +2 -0
- data/app/views/completion_kit/runs/_response_row.html.erb +20 -8
- data/app/views/completion_kit/runs/_row.html.erb +19 -6
- data/app/views/completion_kit/runs/_sort_toolbar.html.erb +1 -1
- data/app/views/completion_kit/runs/_status_panel.html.erb +35 -13
- data/app/views/completion_kit/runs/compare.html.erb +17 -3
- data/app/views/completion_kit/suggestions/show.html.erb +3 -0
- data/lib/completion_kit/version.rb +1 -1
- metadata +4 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 38312b44903cb1fa31fc64cb00d76713e4bc7c85eb51c9b17ebae6c988778509
|
|
4
|
+
data.tar.gz: 3bef91185c33760cbb9be5711e59a591115d53dde31b606f3eb92b334979d7ac
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 718185f7ad5d9644f32f068f1172abd4cb11b4de0a3c28099d3549d4678a437c9a98048c480e50005308e242671fe6d4746828ddee2b8a95e0efb561756d88c6
|
|
7
|
+
data.tar.gz: 54eccc031ee3b7d02bf543f06a86d9db6e00883925c61a5a85afb261734c999382a2c77076cbdf42bb9481a89cf4074b871f5cb347e60f21cb65e06772f17b8e
|
data/Rakefile
CHANGED
|
@@ -7,3 +7,10 @@ RSpec::Core::RakeTask.new(:spec) do |t|
|
|
|
7
7
|
end
|
|
8
8
|
|
|
9
9
|
task default: :spec
|
|
10
|
+
|
|
11
|
+
desc "Run the full suite with judge API keys cleared, matching CI's keyless environment"
|
|
12
|
+
task :release_guard do
|
|
13
|
+
sh "OPENAI_API_KEY= ANTHROPIC_API_KEY= OLLAMA_API_KEY= bundle exec rspec"
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
Rake::Task["release:guard_clean"].enhance([:release_guard])
|
|
@@ -413,6 +413,10 @@ form.button_to {
|
|
|
413
413
|
grid-template-columns: repeat(3, minmax(0, 1fr));
|
|
414
414
|
}
|
|
415
415
|
|
|
416
|
+
.ck-grid--cards-4 {
|
|
417
|
+
grid-template-columns: repeat(4, minmax(0, 1fr));
|
|
418
|
+
}
|
|
419
|
+
|
|
416
420
|
/* ── Dashboard: workspace stat ribbon ─────────────────────────────────
|
|
417
421
|
Replaces the old oversized count cards. One thin instrument strip,
|
|
418
422
|
four navigable segments split by hairline dividers. */
|
|
@@ -3698,7 +3702,7 @@ table.ck-runs-table {
|
|
|
3698
3702
|
}
|
|
3699
3703
|
|
|
3700
3704
|
.ck-runs-table td {
|
|
3701
|
-
vertical-align:
|
|
3705
|
+
vertical-align: top;
|
|
3702
3706
|
padding-top: 0.7rem;
|
|
3703
3707
|
padding-bottom: 0.7rem;
|
|
3704
3708
|
}
|
|
@@ -5163,6 +5167,7 @@ a.tag-mark {
|
|
|
5163
5167
|
.ck-failure-list__surface--run { color: var(--ck-warning); }
|
|
5164
5168
|
.ck-failure-list__surface--generation { color: var(--ck-danger); }
|
|
5165
5169
|
.ck-failure-list__surface--judge { color: var(--ck-info); }
|
|
5170
|
+
.ck-failure-list__surface--check { color: var(--ck-danger); }
|
|
5166
5171
|
.ck-failure-list__cause {
|
|
5167
5172
|
overflow: hidden;
|
|
5168
5173
|
text-overflow: ellipsis;
|
|
@@ -134,11 +134,15 @@ module CompletionKit
|
|
|
134
134
|
{
|
|
135
135
|
metric_id: mid,
|
|
136
136
|
metric_name: anchor.metric_name,
|
|
137
|
+
kind: anchor.check? ? "check" : "llm_judge",
|
|
137
138
|
left_score: l_review ? l_review.ai_score : nil,
|
|
138
139
|
right_score: r_review ? r_review.ai_score : nil,
|
|
140
|
+
left_passed: l_review&.passed,
|
|
141
|
+
right_passed: r_review&.passed,
|
|
139
142
|
left_metric_version_id: l_review&.metric_version_id,
|
|
140
143
|
right_metric_version_id: r_review&.metric_version_id,
|
|
141
|
-
delta: (l_review&.ai_score && r_review&.ai_score) ? (r_review.ai_score.to_f - l_review.ai_score.to_f).round(2) : nil
|
|
144
|
+
delta: (l_review&.ai_score && r_review&.ai_score) ? (r_review.ai_score.to_f - l_review.ai_score.to_f).round(2) : nil,
|
|
145
|
+
result_change: CompletionKit::RunComparison.result_change(l_review&.passed, r_review&.passed)
|
|
142
146
|
}
|
|
143
147
|
end.compact
|
|
144
148
|
}
|
|
@@ -14,6 +14,7 @@ module CompletionKit
|
|
|
14
14
|
@activity = DashboardStats.activity
|
|
15
15
|
@worst_metric = DashboardStats.worst_metric(since: 7.days.ago)
|
|
16
16
|
@failures = DashboardStats.failures(since: 7.days.ago)
|
|
17
|
+
@failing_checks = DashboardStats.failing_checks(since: 7.days.ago)
|
|
17
18
|
@ignored_metrics = DashboardDismissal.metrics
|
|
18
19
|
@ignored_failures = DashboardDismissal.failures
|
|
19
20
|
@prompt_changes = DashboardStats.prompt_changes
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
2
|
class ResponsesController < ApplicationController
|
|
3
|
+
include CompletionKit::ResponseOrdering
|
|
3
4
|
before_action :set_run
|
|
4
5
|
before_action :set_response
|
|
5
6
|
|
|
@@ -24,21 +25,7 @@ module CompletionKit
|
|
|
24
25
|
end
|
|
25
26
|
|
|
26
27
|
def ordered_response_ids
|
|
27
|
-
|
|
28
|
-
@run.responses
|
|
29
|
-
.left_joins(:reviews)
|
|
30
|
-
.group("completion_kit_responses.id")
|
|
31
|
-
.order(Arel.sql("AVG(completion_kit_reviews.ai_score) ASC NULLS LAST"))
|
|
32
|
-
.pluck(:id)
|
|
33
|
-
elsif @run.judge_configured? && @sort != "none"
|
|
34
|
-
@run.responses
|
|
35
|
-
.left_joins(:reviews)
|
|
36
|
-
.group("completion_kit_responses.id")
|
|
37
|
-
.order(Arel.sql("AVG(completion_kit_reviews.ai_score) DESC NULLS LAST"))
|
|
38
|
-
.pluck(:id)
|
|
39
|
-
else
|
|
40
|
-
@run.responses.order(:id).pluck(:id)
|
|
41
|
-
end
|
|
28
|
+
ordered_responses_relation(@run, @sort).pluck(:id)
|
|
42
29
|
end
|
|
43
30
|
end
|
|
44
31
|
end
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
2
|
class RunsController < ApplicationController
|
|
3
3
|
include CompletionKit::TagFiltering
|
|
4
|
+
include CompletionKit::ResponseOrdering
|
|
4
5
|
before_action :set_run, only: [:show, :edit, :update, :destroy, :generate, :suggest, :retry_failures, :rerun, :regrade, :refresh_status, :compare]
|
|
5
6
|
before_action :load_form_collections, only: [:new, :edit, :create, :update]
|
|
6
7
|
|
|
@@ -10,21 +11,7 @@ module CompletionKit
|
|
|
10
11
|
end
|
|
11
12
|
|
|
12
13
|
def show
|
|
13
|
-
@responses =
|
|
14
|
-
@run.responses
|
|
15
|
-
.left_joins(:reviews)
|
|
16
|
-
.includes(:reviews)
|
|
17
|
-
.group("completion_kit_responses.id")
|
|
18
|
-
.order(Arel.sql("AVG(completion_kit_reviews.ai_score) ASC NULLS LAST"))
|
|
19
|
-
elsif @run.judge_configured?
|
|
20
|
-
@run.responses
|
|
21
|
-
.left_joins(:reviews)
|
|
22
|
-
.includes(:reviews)
|
|
23
|
-
.group("completion_kit_responses.id")
|
|
24
|
-
.order(Arel.sql("AVG(completion_kit_reviews.ai_score) DESC NULLS LAST"))
|
|
25
|
-
else
|
|
26
|
-
@run.responses.includes(:reviews).order(:id)
|
|
27
|
-
end
|
|
14
|
+
@responses = ordered_responses_relation(@run, params[:sort]).includes(:reviews)
|
|
28
15
|
end
|
|
29
16
|
|
|
30
17
|
def new
|
|
@@ -208,11 +195,15 @@ module CompletionKit
|
|
|
208
195
|
{
|
|
209
196
|
metric_id: mid,
|
|
210
197
|
metric_name: anchor.metric_name,
|
|
198
|
+
kind: anchor.check? ? "check" : "llm_judge",
|
|
211
199
|
left_score: l_review ? l_review.ai_score : nil,
|
|
212
200
|
right_score: r_review ? r_review.ai_score : nil,
|
|
201
|
+
left_passed: l_review&.passed,
|
|
202
|
+
right_passed: r_review&.passed,
|
|
213
203
|
left_version_label: version_label_for(l_review, metric_versions),
|
|
214
204
|
right_version_label: version_label_for(r_review, metric_versions),
|
|
215
|
-
delta: (l_review&.ai_score && r_review&.ai_score) ? (r_review.ai_score.to_f - l_review.ai_score.to_f).round(2) : nil
|
|
205
|
+
delta: (l_review&.ai_score && r_review&.ai_score) ? (r_review.ai_score.to_f - l_review.ai_score.to_f).round(2) : nil,
|
|
206
|
+
result_change: RunComparison.result_change(l_review&.passed, r_review&.passed)
|
|
216
207
|
}
|
|
217
208
|
end.compact
|
|
218
209
|
}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
module CompletionKit
|
|
2
|
+
module ResponseOrdering
|
|
3
|
+
extend ActiveSupport::Concern
|
|
4
|
+
|
|
5
|
+
private
|
|
6
|
+
|
|
7
|
+
FAILED_CHECKS_SQL = "SUM(CASE WHEN completion_kit_reviews.passed IS FALSE THEN 1 ELSE 0 END)".freeze
|
|
8
|
+
RUBRIC_AVG_SQL = "AVG(completion_kit_reviews.ai_score)".freeze
|
|
9
|
+
|
|
10
|
+
def ordered_responses_relation(run, sort)
|
|
11
|
+
return run.responses.order(:id) unless run.gradable?
|
|
12
|
+
|
|
13
|
+
composite = if sort == "score_asc"
|
|
14
|
+
"#{FAILED_CHECKS_SQL} DESC, #{RUBRIC_AVG_SQL} ASC NULLS LAST"
|
|
15
|
+
else
|
|
16
|
+
"#{FAILED_CHECKS_SQL} ASC, #{RUBRIC_AVG_SQL} DESC NULLS LAST"
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
run.responses
|
|
20
|
+
.left_joins(:reviews)
|
|
21
|
+
.group("completion_kit_responses.id")
|
|
22
|
+
.order(Arel.sql("#{composite}, completion_kit_responses.id ASC"))
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
end
|
|
@@ -159,6 +159,34 @@ module CompletionKit
|
|
|
159
159
|
:low
|
|
160
160
|
end
|
|
161
161
|
|
|
162
|
+
def ck_check_badge(passed)
|
|
163
|
+
if passed == true
|
|
164
|
+
content_tag(:span, "Pass", class: ck_badge_classes(:high))
|
|
165
|
+
elsif passed == false
|
|
166
|
+
content_tag(:span, "Fail", class: ck_badge_classes(:low))
|
|
167
|
+
else
|
|
168
|
+
content_tag(:span, "Pending", class: ck_badge_classes(:pending))
|
|
169
|
+
end
|
|
170
|
+
end
|
|
171
|
+
|
|
172
|
+
def ck_result_change_badge(change)
|
|
173
|
+
case change
|
|
174
|
+
when "broke"
|
|
175
|
+
content_tag(:span, "Broke", class: "ck-delta ck-delta--negative")
|
|
176
|
+
when "fixed"
|
|
177
|
+
content_tag(:span, "Fixed", class: "ck-delta ck-delta--positive")
|
|
178
|
+
when "same"
|
|
179
|
+
content_tag(:span, "Same", class: "ck-delta ck-delta--zero")
|
|
180
|
+
end
|
|
181
|
+
end
|
|
182
|
+
|
|
183
|
+
def ck_pass_rate_kind(rate)
|
|
184
|
+
return :high if rate >= 0.9
|
|
185
|
+
return :medium if rate >= 0.7
|
|
186
|
+
|
|
187
|
+
:low
|
|
188
|
+
end
|
|
189
|
+
|
|
162
190
|
def ck_word_diff_old(old_text, new_text)
|
|
163
191
|
diff_tokens(old_text, new_text, :old)
|
|
164
192
|
end
|
|
@@ -58,6 +58,28 @@ module CompletionKit
|
|
|
58
58
|
scored_reviews_since(since).where(metric_id: metric_id).average(:ai_score)&.to_f&.round(2)
|
|
59
59
|
end
|
|
60
60
|
|
|
61
|
+
def self.metric_pass_rate(metric_id, since:)
|
|
62
|
+
resolved = Review.joins(:response)
|
|
63
|
+
.where(metric_id: metric_id)
|
|
64
|
+
.where("completion_kit_reviews.created_at >= ?", since)
|
|
65
|
+
.where.not(passed: nil)
|
|
66
|
+
total = resolved.count
|
|
67
|
+
return nil if total.zero?
|
|
68
|
+
|
|
69
|
+
(resolved.where(passed: true).count.to_f / total).round(2)
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
def self.failing_checks(since:)
|
|
73
|
+
reviews = Review.where(passed: false)
|
|
74
|
+
.where("completion_kit_reviews.created_at >= ?", since)
|
|
75
|
+
.includes(response: :run)
|
|
76
|
+
.order(updated_at: :desc)
|
|
77
|
+
items = reviews.map do |review|
|
|
78
|
+
{ metric_name: review.metric_name, response: review.response, run: review.response.run }
|
|
79
|
+
end
|
|
80
|
+
{ count: items.size, items: items }
|
|
81
|
+
end
|
|
82
|
+
|
|
61
83
|
# Everything that terminally failed in the window across all three
|
|
62
84
|
# surfaces — failed runs, failed generations, failed judge reviews —
|
|
63
85
|
# excluding any the user has dismissed. Returns a count and an items list
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
module CompletionKit
|
|
2
|
+
module RunComparison
|
|
3
|
+
module_function
|
|
4
|
+
|
|
5
|
+
def result_change(left_passed, right_passed)
|
|
6
|
+
return nil if left_passed.nil? || right_passed.nil?
|
|
7
|
+
return "broke" if left_passed && !right_passed
|
|
8
|
+
return "fixed" if !left_passed && right_passed
|
|
9
|
+
|
|
10
|
+
"same"
|
|
11
|
+
end
|
|
12
|
+
end
|
|
13
|
+
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
<div class="ck-card ck-stat-card ck-rise" id="ck-failing-checks-card" style="--rise-delay: 200ms;">
|
|
2
|
+
<p class="ck-kicker">Failing checks · last 7 days</p>
|
|
3
|
+
<div class="ck-stat-card__body">
|
|
4
|
+
<span class="ck-stat-card__count<%= failing_checks[:count].positive? ? ' is-danger' : ' is-clean' %>"><%= failing_checks[:count] %></span>
|
|
5
|
+
</div>
|
|
6
|
+
|
|
7
|
+
<% if failing_checks[:items].any? %>
|
|
8
|
+
<ul class="ck-failure-list">
|
|
9
|
+
<% failing_checks[:items].first(5).each do |item| %>
|
|
10
|
+
<li class="ck-failure-list__item">
|
|
11
|
+
<span class="ck-failure-list__surface ck-failure-list__surface--check">check</span>
|
|
12
|
+
<%= link_to item[:metric_name], completion_kit.run_path(item[:run]), class: "ck-link ck-failure-list__cause" %>
|
|
13
|
+
</li>
|
|
14
|
+
<% end %>
|
|
15
|
+
</ul>
|
|
16
|
+
<% else %>
|
|
17
|
+
<div class="ck-stat-card__foot"><span>No failing checks this week.</span></div>
|
|
18
|
+
<% end %>
|
|
19
|
+
</div>
|
|
@@ -28,7 +28,7 @@
|
|
|
28
28
|
</nav>
|
|
29
29
|
|
|
30
30
|
<% if @activity %>
|
|
31
|
-
<div class="ck-grid ck-grid--cards ck-grid--cards-
|
|
31
|
+
<div class="ck-grid ck-grid--cards ck-grid--cards-4 ck-pulse-grid">
|
|
32
32
|
<div class="ck-card ck-stat-card ck-rise" style="--rise-delay: 60ms;">
|
|
33
33
|
<p class="ck-kicker">Activity · last 14 days</p>
|
|
34
34
|
<% activity_max = @activity.map { |d| d[:count] }.max %>
|
|
@@ -49,6 +49,9 @@
|
|
|
49
49
|
|
|
50
50
|
<%= render "completion_kit/dashboard/failures_card",
|
|
51
51
|
failures: @failures, ignored_failures: @ignored_failures %>
|
|
52
|
+
|
|
53
|
+
<%= render "completion_kit/dashboard/failing_checks_card",
|
|
54
|
+
failing_checks: @failing_checks %>
|
|
52
55
|
</div>
|
|
53
56
|
|
|
54
57
|
<div class="ck-card ck-card--spaced ck-rise" style="--rise-delay: 240ms;">
|
|
@@ -54,10 +54,15 @@
|
|
|
54
54
|
<% family_runs = CompletionKit::Run.where(prompt_id: prompt.family_versions.select(:id)).display_scoped %>
|
|
55
55
|
<% current_version_runs = prompt.runs.display_scoped.includes(responses: :reviews) %>
|
|
56
56
|
<% best_score = current_version_runs.map(&:avg_score).compact.max %>
|
|
57
|
+
<% best_pass_rate = current_version_runs.map(&:check_pass_rate).compact.max %>
|
|
57
58
|
<td>
|
|
58
59
|
<% if best_score %>
|
|
59
60
|
<span class="<%= ck_badge_classes(ck_score_kind(best_score)) %>"><%= best_score %></span>
|
|
60
|
-
<%
|
|
61
|
+
<% end %>
|
|
62
|
+
<% if best_pass_rate %>
|
|
63
|
+
<span class="<%= ck_badge_classes(ck_pass_rate_kind(best_pass_rate)) %>"><%= (best_pass_rate * 100).round %>%</span>
|
|
64
|
+
<% end %>
|
|
65
|
+
<% unless best_score || best_pass_rate %>
|
|
61
66
|
<span class="ck-prompts-table__dim">—</span>
|
|
62
67
|
<% end %>
|
|
63
68
|
</td>
|
|
@@ -64,7 +64,9 @@
|
|
|
64
64
|
</thead>
|
|
65
65
|
<tbody>
|
|
66
66
|
<% versions.each do |v| %>
|
|
67
|
-
<%
|
|
67
|
+
<% scoped_runs = v.runs.display_scoped %>
|
|
68
|
+
<% best_score = scoped_runs.map(&:avg_score).compact.max %>
|
|
69
|
+
<% best_pass_rate = scoped_runs.map(&:check_pass_rate).compact.max %>
|
|
68
70
|
<% pred = predecessor_of[v] %>
|
|
69
71
|
<tr class="<%= "ck-results-table__row--active" if v.id == @prompt.id %>" onclick="window.location='<%= prompt_path(v) %>'" style="cursor: pointer;">
|
|
70
72
|
<td>
|
|
@@ -86,7 +88,11 @@
|
|
|
86
88
|
<td>
|
|
87
89
|
<% if best_score %>
|
|
88
90
|
<span class="<%= ck_badge_classes(ck_score_kind(best_score)) %>"><%= best_score %></span>
|
|
89
|
-
<%
|
|
91
|
+
<% end %>
|
|
92
|
+
<% if best_pass_rate %>
|
|
93
|
+
<span class="<%= ck_badge_classes(ck_pass_rate_kind(best_pass_rate)) %>"><%= (best_pass_rate * 100).round %>%</span>
|
|
94
|
+
<% end %>
|
|
95
|
+
<% unless best_score || best_pass_rate %>
|
|
90
96
|
<span class="ck-prompts-table__dim">—</span>
|
|
91
97
|
<% end %>
|
|
92
98
|
</td>
|
|
@@ -116,6 +116,8 @@
|
|
|
116
116
|
<% 5.times do |i| %>
|
|
117
117
|
<svg viewBox="0 0 24 24" width="16" height="16" stroke-width="1.75" class="ck-star <%= i < review.ai_score.to_i ? "ck-star--filled" : "ck-star--empty" %>"><polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2"/></svg>
|
|
118
118
|
<% end %>
|
|
119
|
+
<% elsif review.check? && !review.passed.nil? %>
|
|
120
|
+
<%= ck_check_badge(review.passed) %>
|
|
119
121
|
<% else %>
|
|
120
122
|
<span class="<%= ck_badge_classes(:pending) %>">Pending</span>
|
|
121
123
|
<% end %>
|
|
@@ -10,14 +10,21 @@
|
|
|
10
10
|
<% end %>
|
|
11
11
|
</td>
|
|
12
12
|
<td data-label="Metrics">
|
|
13
|
-
<%
|
|
14
|
-
<% if
|
|
13
|
+
<% pip_reviews = response.reviews.select { |r| r.ai_score.present? || !r.passed.nil? }.sort_by { |r| r.metric_name.to_s.downcase } %>
|
|
14
|
+
<% if pip_reviews.any? %>
|
|
15
15
|
<span class="ck-metric-bar ck-metric-bar--compact">
|
|
16
|
-
<%
|
|
17
|
-
|
|
18
|
-
<span class="ck-metric-
|
|
19
|
-
|
|
20
|
-
|
|
16
|
+
<% pip_reviews.each do |r| %>
|
|
17
|
+
<% if r.ai_score.present? %>
|
|
18
|
+
<span class="ck-metric-pip ck-metric-pip--<%= ck_score_kind(r.ai_score.to_f) %>">
|
|
19
|
+
<span class="ck-metric-pip__bar"></span>
|
|
20
|
+
<span class="ck-metric-pip__label"><%= r.metric_name %> <strong><%= r.ai_score %></strong></span>
|
|
21
|
+
</span>
|
|
22
|
+
<% else %>
|
|
23
|
+
<span class="ck-metric-pip ck-metric-pip--<%= ck_pass_rate_kind(r.passed ? 1.0 : 0.0) %>">
|
|
24
|
+
<span class="ck-metric-pip__bar"></span>
|
|
25
|
+
<span class="ck-metric-pip__label"><%= r.metric_name %> <strong><%= r.passed ? "Pass" : "Fail" %></strong></span>
|
|
26
|
+
</span>
|
|
27
|
+
<% end %>
|
|
21
28
|
<% end %>
|
|
22
29
|
</span>
|
|
23
30
|
<% else %>
|
|
@@ -26,7 +33,12 @@
|
|
|
26
33
|
</td>
|
|
27
34
|
<td data-label="Avg score">
|
|
28
35
|
<% if response.reviewed? %>
|
|
29
|
-
|
|
36
|
+
<% if response.score %>
|
|
37
|
+
<span class="<%= ck_badge_classes(ck_score_kind(response.score.to_f)) %>"><%= response.score %></span>
|
|
38
|
+
<% end %>
|
|
39
|
+
<% if response.checks_total.positive? %>
|
|
40
|
+
<span class="<%= ck_badge_classes(ck_pass_rate_kind(response.checks_passed.to_f / response.checks_total)) %>"><%= response.checks_passed %>/<%= response.checks_total %></span>
|
|
41
|
+
<% end %>
|
|
30
42
|
<% else %>
|
|
31
43
|
<span class="ck-response-cell__dim">—</span>
|
|
32
44
|
<% end %>
|
|
@@ -34,10 +34,17 @@
|
|
|
34
34
|
<% if metrics.any? %>
|
|
35
35
|
<div class="ck-metric-bar ck-metric-bar--compact">
|
|
36
36
|
<% metrics.each do |m| %>
|
|
37
|
-
|
|
38
|
-
<span class="ck-metric-
|
|
39
|
-
|
|
40
|
-
|
|
37
|
+
<% if m[:kind] == "check" %>
|
|
38
|
+
<span class="ck-metric-pip ck-metric-pip--<%= ck_pass_rate_kind(m[:pass_rate]) %>">
|
|
39
|
+
<span class="ck-metric-pip__bar"></span>
|
|
40
|
+
<span class="ck-metric-pip__label"><%= m[:name] %> <strong><%= (m[:pass_rate] * 100).round %>%</strong></span>
|
|
41
|
+
</span>
|
|
42
|
+
<% else %>
|
|
43
|
+
<span class="ck-metric-pip ck-metric-pip--<%= ck_score_kind(m[:avg]) %>">
|
|
44
|
+
<span class="ck-metric-pip__bar"></span>
|
|
45
|
+
<span class="ck-metric-pip__label"><%= m[:name] %> <strong><%= m[:avg] %></strong></span>
|
|
46
|
+
</span>
|
|
47
|
+
<% end %>
|
|
41
48
|
<% end %>
|
|
42
49
|
</div>
|
|
43
50
|
<% else %>
|
|
@@ -46,8 +53,14 @@
|
|
|
46
53
|
</td>
|
|
47
54
|
<td>
|
|
48
55
|
<% avg = run.avg_score %>
|
|
49
|
-
<%
|
|
50
|
-
|
|
56
|
+
<% pass_rate = run.check_pass_rate %>
|
|
57
|
+
<% if avg || pass_rate %>
|
|
58
|
+
<% if avg %>
|
|
59
|
+
<span class="<%= ck_badge_classes(ck_score_kind(avg)) %>"><%= avg %></span>
|
|
60
|
+
<% end %>
|
|
61
|
+
<% if pass_rate %>
|
|
62
|
+
<span class="<%= ck_badge_classes(ck_pass_rate_kind(pass_rate)) %>"><%= (pass_rate * 100).round %>%</span>
|
|
63
|
+
<% end %>
|
|
51
64
|
<% else %>
|
|
52
65
|
<span class="ck-runs-table__dim">—</span>
|
|
53
66
|
<% end %>
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
<div id="run_sort_toolbar">
|
|
2
|
-
<% if run.
|
|
2
|
+
<% if run.gradable? %>
|
|
3
3
|
<% active = run.status == "completed" && run.responses.joins(:reviews).exists? %>
|
|
4
4
|
<div class="ck-toolbar" style="margin-top: 1.5rem;<%= ' visibility: hidden;' unless active %>" aria-hidden="<%= !active %>">
|
|
5
5
|
<%= link_to "Best first", run_path(run, sort: "score_desc"), class: params[:sort].blank? || params[:sort] == "score_desc" ? ck_button_classes(:dark) : ck_button_classes(:light, variant: :outline), tabindex: active ? nil : -1 %>
|
|
@@ -2,7 +2,9 @@
|
|
|
2
2
|
<div id="run_status_panel" aria-live="polite" aria-atomic="true">
|
|
3
3
|
<% if run.status.in?(%w[running completed]) && snap[:generated_total] > 0 %>
|
|
4
4
|
<% failed_count = snap[:generated_failed] + snap[:judged_failed] %>
|
|
5
|
-
<%
|
|
5
|
+
<% show_grading = snap[:judged_total] > 0 || run.gradable? %>
|
|
6
|
+
<% has_llm_metric = run.llm_metrics.any? %>
|
|
7
|
+
<% has_checks = run.check_metrics.any? %>
|
|
6
8
|
<% metric_avgs = run.metric_averages.sort_by { |m| m[:name].to_s.downcase } %>
|
|
7
9
|
<% metric_lookup = metric_avgs.index_by { |m| m[:name].to_s.downcase } %>
|
|
8
10
|
<section class="ck-run-status ck-run-status--<%= run.status %>">
|
|
@@ -18,7 +20,7 @@
|
|
|
18
20
|
<p class="ck-run-status__cell-value ck-run-status__summary-line">
|
|
19
21
|
<span class="ck-run-status__summary-num"><%= snap[:generated_done] %></span>
|
|
20
22
|
<span class="ck-run-status__summary-text">of <%= snap[:generated_total] %> responses</span>
|
|
21
|
-
<% if
|
|
23
|
+
<% if show_grading %>
|
|
22
24
|
<span class="ck-run-status__summary-sep">·</span>
|
|
23
25
|
<span class="ck-run-status__summary-num"><%= snap[:judged_done] %></span>
|
|
24
26
|
<span class="ck-run-status__summary-text">of <%= snap[:judged_total] %> judged</span>
|
|
@@ -30,7 +32,7 @@
|
|
|
30
32
|
</p>
|
|
31
33
|
</div>
|
|
32
34
|
|
|
33
|
-
<% if
|
|
35
|
+
<% if show_grading %>
|
|
34
36
|
<div class="ck-run-status__cell">
|
|
35
37
|
<p class="ck-run-status__metric-label">Metrics</p>
|
|
36
38
|
<div class="ck-run-status__cell-value">
|
|
@@ -38,7 +40,12 @@
|
|
|
38
40
|
<div class="ck-metric-bar ck-metric-bar--compact">
|
|
39
41
|
<% run.metrics.order(:name).each do |metric| %>
|
|
40
42
|
<% avg_for_metric = metric_lookup[metric.name.to_s.downcase] %>
|
|
41
|
-
<% if avg_for_metric %>
|
|
43
|
+
<% if avg_for_metric && avg_for_metric[:kind] == "check" %>
|
|
44
|
+
<span class="ck-metric-pip ck-metric-pip--<%= ck_pass_rate_kind(avg_for_metric[:pass_rate]) %>">
|
|
45
|
+
<span class="ck-metric-pip__bar"></span>
|
|
46
|
+
<span class="ck-metric-pip__label"><%= metric.name %> <strong><%= (avg_for_metric[:pass_rate] * 100).round %>%</strong></span>
|
|
47
|
+
</span>
|
|
48
|
+
<% elsif avg_for_metric %>
|
|
42
49
|
<span class="ck-metric-pip ck-metric-pip--<%= ck_score_kind(avg_for_metric[:avg]) %>">
|
|
43
50
|
<span class="ck-metric-pip__bar"></span>
|
|
44
51
|
<span class="ck-metric-pip__label"><%= metric.name %> <strong><%= avg_for_metric[:avg] %></strong></span>
|
|
@@ -57,16 +64,31 @@
|
|
|
57
64
|
</div>
|
|
58
65
|
</div>
|
|
59
66
|
|
|
60
|
-
|
|
61
|
-
<
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
67
|
+
<% if has_llm_metric %>
|
|
68
|
+
<div class="ck-run-status__cell">
|
|
69
|
+
<p class="ck-run-status__metric-label">Avg score</p>
|
|
70
|
+
<div class="ck-run-status__cell-value">
|
|
71
|
+
<% if run.avg_score %>
|
|
72
|
+
<span class="<%= ck_badge_classes(ck_score_kind(run.avg_score)) %> ck-badge--lg"><%= run.avg_score %></span>
|
|
73
|
+
<% else %>
|
|
74
|
+
<span class="ck-run-status__cell-empty">—</span>
|
|
75
|
+
<% end %>
|
|
76
|
+
</div>
|
|
68
77
|
</div>
|
|
69
|
-
|
|
78
|
+
<% end %>
|
|
79
|
+
|
|
80
|
+
<% if has_checks %>
|
|
81
|
+
<div class="ck-run-status__cell">
|
|
82
|
+
<p class="ck-run-status__metric-label">Checks passed</p>
|
|
83
|
+
<div class="ck-run-status__cell-value">
|
|
84
|
+
<% if run.check_pass_rate %>
|
|
85
|
+
<span class="<%= ck_badge_classes(ck_pass_rate_kind(run.check_pass_rate)) %> ck-badge--lg"><%= (run.check_pass_rate * 100).round %>%</span>
|
|
86
|
+
<% else %>
|
|
87
|
+
<span class="ck-run-status__cell-empty">—</span>
|
|
88
|
+
<% end %>
|
|
89
|
+
</div>
|
|
90
|
+
</div>
|
|
91
|
+
<% end %>
|
|
70
92
|
<% end %>
|
|
71
93
|
</div>
|
|
72
94
|
|
|
@@ -46,21 +46,35 @@
|
|
|
46
46
|
<% end %>
|
|
47
47
|
<td><%= pm[:metric_name] %></td>
|
|
48
48
|
<td>
|
|
49
|
-
<% if pm[:
|
|
49
|
+
<% if pm[:kind] == "check" %>
|
|
50
|
+
<% if pm[:left_passed].nil? %>
|
|
51
|
+
<span class="ck-meta-copy">—</span>
|
|
52
|
+
<% else %>
|
|
53
|
+
<%= ck_check_badge(pm[:left_passed]) %>
|
|
54
|
+
<% end %>
|
|
55
|
+
<% elsif pm[:left_score] %>
|
|
50
56
|
<span class="<%= ck_badge_classes(ck_score_kind(pm[:left_score].to_f)) %>"><%= pm[:left_score] %></span>
|
|
51
57
|
<% else %>
|
|
52
58
|
<span class="ck-meta-copy">—</span>
|
|
53
59
|
<% end %>
|
|
54
60
|
</td>
|
|
55
61
|
<td>
|
|
56
|
-
<% if pm[:
|
|
62
|
+
<% if pm[:kind] == "check" %>
|
|
63
|
+
<% if pm[:right_passed].nil? %>
|
|
64
|
+
<span class="ck-meta-copy">—</span>
|
|
65
|
+
<% else %>
|
|
66
|
+
<%= ck_check_badge(pm[:right_passed]) %>
|
|
67
|
+
<% end %>
|
|
68
|
+
<% elsif pm[:right_score] %>
|
|
57
69
|
<span class="<%= ck_badge_classes(ck_score_kind(pm[:right_score].to_f)) %>"><%= pm[:right_score] %></span>
|
|
58
70
|
<% else %>
|
|
59
71
|
<span class="ck-meta-copy">—</span>
|
|
60
72
|
<% end %>
|
|
61
73
|
</td>
|
|
62
74
|
<td>
|
|
63
|
-
<% if pm[:
|
|
75
|
+
<% if pm[:kind] == "check" %>
|
|
76
|
+
<%= ck_result_change_badge(pm[:result_change]) || content_tag(:span, "—", class: "ck-meta-copy") %>
|
|
77
|
+
<% elsif pm[:delta] %>
|
|
64
78
|
<% delta_class = pm[:delta] > 0 ? "ck-delta--positive" : pm[:delta] < 0 ? "ck-delta--negative" : "ck-delta--zero" %>
|
|
65
79
|
<span class="ck-delta <%= delta_class %>"><%= pm[:delta].positive? ? "+#{pm[:delta]}" : pm[:delta].to_s %></span>
|
|
66
80
|
<% else %>
|
|
@@ -20,6 +20,9 @@
|
|
|
20
20
|
<% if @run.avg_score %>
|
|
21
21
|
<span class="<%= ck_badge_classes(ck_score_kind(@run.avg_score)) %>"><%= @run.avg_score %></span>
|
|
22
22
|
<% end %>
|
|
23
|
+
<% if @run.check_pass_rate %>
|
|
24
|
+
<span class="<%= ck_badge_classes(ck_pass_rate_kind(@run.check_pass_rate)) %>"><%= (@run.check_pass_rate * 100).round %>%</span>
|
|
25
|
+
<% end %>
|
|
23
26
|
</p>
|
|
24
27
|
</div>
|
|
25
28
|
<div class="ck-actions">
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: completion-kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.19.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Damien Bastin
|
|
@@ -261,6 +261,7 @@ files:
|
|
|
261
261
|
- app/controllers/completion_kit/runs_controller.rb
|
|
262
262
|
- app/controllers/completion_kit/suggestions_controller.rb
|
|
263
263
|
- app/controllers/completion_kit/tags_controller.rb
|
|
264
|
+
- app/controllers/concerns/completion_kit/response_ordering.rb
|
|
264
265
|
- app/controllers/concerns/completion_kit/tag_filtering.rb
|
|
265
266
|
- app/helpers/completion_kit/application_helper.rb
|
|
266
267
|
- app/jobs/completion_kit/application_job.rb
|
|
@@ -339,6 +340,7 @@ files:
|
|
|
339
340
|
- app/services/completion_kit/prompt_improvement_service.rb
|
|
340
341
|
- app/services/completion_kit/prompt_improvement_validator.rb
|
|
341
342
|
- app/services/completion_kit/provider_endpoint.rb
|
|
343
|
+
- app/services/completion_kit/run_comparison.rb
|
|
342
344
|
- app/services/completion_kit/starter_metrics.rb
|
|
343
345
|
- app/services/completion_kit/worker_health.rb
|
|
344
346
|
- app/validators/completion_kit/tenant_scoped_uniqueness_validator.rb
|
|
@@ -352,6 +354,7 @@ files:
|
|
|
352
354
|
- app/views/completion_kit/api_reference/index.html.erb
|
|
353
355
|
- app/views/completion_kit/dashboard/_eye_icon.html.erb
|
|
354
356
|
- app/views/completion_kit/dashboard/_eye_off_icon.html.erb
|
|
357
|
+
- app/views/completion_kit/dashboard/_failing_checks_card.html.erb
|
|
355
358
|
- app/views/completion_kit/dashboard/_failures_card.html.erb
|
|
356
359
|
- app/views/completion_kit/dashboard/_worst_metric_card.html.erb
|
|
357
360
|
- app/views/completion_kit/dashboard/show.html.erb
|