completion-kit 0.18.1 → 0.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 620084f7c112f139684433d2d6f2d8ee407bfd5c7aaf1612ac4a5334c404e490
4
- data.tar.gz: 7e93349d581b10eaa993250d678f80550492a0075c426b8970a97c498b277ce6
3
+ metadata.gz: 38312b44903cb1fa31fc64cb00d76713e4bc7c85eb51c9b17ebae6c988778509
4
+ data.tar.gz: 3bef91185c33760cbb9be5711e59a591115d53dde31b606f3eb92b334979d7ac
5
5
  SHA512:
6
- metadata.gz: e94ead519660a768f88f7981128f338f39448837ad12cf45ca26409a7b574bc9d6a2e6980f2fef839c5fbd44d17ad465b4cce21937068a98db8312e06f437dde
7
- data.tar.gz: 4f31db3df58c983066fbd19b71f8f978215755f592f8637e0116a1b3f0fb33c213079517503077cda2622c52895531c0226f0407f0c32ba33d0788c134bae64f
6
+ metadata.gz: 718185f7ad5d9644f32f068f1172abd4cb11b4de0a3c28099d3549d4678a437c9a98048c480e50005308e242671fe6d4746828ddee2b8a95e0efb561756d88c6
7
+ data.tar.gz: 54eccc031ee3b7d02bf543f06a86d9db6e00883925c61a5a85afb261734c999382a2c77076cbdf42bb9481a89cf4074b871f5cb347e60f21cb65e06772f17b8e
data/Rakefile CHANGED
@@ -7,3 +7,10 @@ RSpec::Core::RakeTask.new(:spec) do |t|
7
7
  end
8
8
 
9
9
  task default: :spec
10
+
11
desc "Run the full suite with judge API keys cleared, matching CI's keyless environment"
task :release_guard do
  # Hand the blanked keys to `sh` as an environment hash instead of a
  # `VAR= command` prefix, so the task does not rely on POSIX shell syntax.
  # Each key is set to the empty string, exactly as the prefix form did.
  keyless_env = {
    "OPENAI_API_KEY" => "",
    "ANTHROPIC_API_KEY" => "",
    "OLLAMA_API_KEY" => ""
  }
  sh keyless_env, "bundle exec rspec"
end
15
+
16
+ Rake::Task["release:guard_clean"].enhance([:release_guard])
@@ -413,6 +413,10 @@ form.button_to {
413
413
  grid-template-columns: repeat(3, minmax(0, 1fr));
414
414
  }
415
415
 
416
+ .ck-grid--cards-4 {
417
+ grid-template-columns: repeat(4, minmax(0, 1fr));
418
+ }
419
+
416
420
  /* ── Dashboard: workspace stat ribbon ─────────────────────────────────
417
421
  Replaces the old oversized count cards. One thin instrument strip,
418
422
  four navigable segments split by hairline dividers. */
@@ -3698,7 +3702,7 @@ table.ck-runs-table {
3698
3702
  }
3699
3703
 
3700
3704
  .ck-runs-table td {
3701
- vertical-align: middle;
3705
+ vertical-align: top;
3702
3706
  padding-top: 0.7rem;
3703
3707
  padding-bottom: 0.7rem;
3704
3708
  }
@@ -5163,6 +5167,7 @@ a.tag-mark {
5163
5167
  .ck-failure-list__surface--run { color: var(--ck-warning); }
5164
5168
  .ck-failure-list__surface--generation { color: var(--ck-danger); }
5165
5169
  .ck-failure-list__surface--judge { color: var(--ck-info); }
5170
+ .ck-failure-list__surface--check { color: var(--ck-danger); }
5166
5171
  .ck-failure-list__cause {
5167
5172
  overflow: hidden;
5168
5173
  text-overflow: ellipsis;
@@ -134,11 +134,15 @@ module CompletionKit
134
134
  {
135
135
  metric_id: mid,
136
136
  metric_name: anchor.metric_name,
137
+ kind: anchor.check? ? "check" : "llm_judge",
137
138
  left_score: l_review ? l_review.ai_score : nil,
138
139
  right_score: r_review ? r_review.ai_score : nil,
140
+ left_passed: l_review&.passed,
141
+ right_passed: r_review&.passed,
139
142
  left_metric_version_id: l_review&.metric_version_id,
140
143
  right_metric_version_id: r_review&.metric_version_id,
141
- delta: (l_review&.ai_score && r_review&.ai_score) ? (r_review.ai_score.to_f - l_review.ai_score.to_f).round(2) : nil
144
+ delta: (l_review&.ai_score && r_review&.ai_score) ? (r_review.ai_score.to_f - l_review.ai_score.to_f).round(2) : nil,
145
+ result_change: CompletionKit::RunComparison.result_change(l_review&.passed, r_review&.passed)
142
146
  }
143
147
  end.compact
144
148
  }
@@ -14,6 +14,7 @@ module CompletionKit
14
14
  @activity = DashboardStats.activity
15
15
  @worst_metric = DashboardStats.worst_metric(since: 7.days.ago)
16
16
  @failures = DashboardStats.failures(since: 7.days.ago)
17
+ @failing_checks = DashboardStats.failing_checks(since: 7.days.ago)
17
18
  @ignored_metrics = DashboardDismissal.metrics
18
19
  @ignored_failures = DashboardDismissal.failures
19
20
  @prompt_changes = DashboardStats.prompt_changes
@@ -27,6 +27,7 @@ module CompletionKit
27
27
 
28
28
  def baseline_for(record)
29
29
  return nil unless record.is_a?(Metric)
30
+ return DashboardStats.metric_pass_rate(record.id, since: WINDOW.ago) if record.check?
30
31
  DashboardStats.metric_average(record.id, since: WINDOW.ago)
31
32
  end
32
33
 
@@ -1,5 +1,6 @@
1
1
  module CompletionKit
2
2
  class ResponsesController < ApplicationController
3
+ include CompletionKit::ResponseOrdering
3
4
  before_action :set_run
4
5
  before_action :set_response
5
6
 
@@ -24,21 +25,7 @@ module CompletionKit
24
25
  end
25
26
 
26
27
  def ordered_response_ids
27
- if @run.judge_configured? && @sort == "score_asc"
28
- @run.responses
29
- .left_joins(:reviews)
30
- .group("completion_kit_responses.id")
31
- .order(Arel.sql("AVG(completion_kit_reviews.ai_score) ASC NULLS LAST"))
32
- .pluck(:id)
33
- elsif @run.judge_configured? && @sort != "none"
34
- @run.responses
35
- .left_joins(:reviews)
36
- .group("completion_kit_responses.id")
37
- .order(Arel.sql("AVG(completion_kit_reviews.ai_score) DESC NULLS LAST"))
38
- .pluck(:id)
39
- else
40
- @run.responses.order(:id).pluck(:id)
41
- end
28
+ ordered_responses_relation(@run, @sort).pluck(:id)
42
29
  end
43
30
  end
44
31
  end
@@ -1,6 +1,7 @@
1
1
  module CompletionKit
2
2
  class RunsController < ApplicationController
3
3
  include CompletionKit::TagFiltering
4
+ include CompletionKit::ResponseOrdering
4
5
  before_action :set_run, only: [:show, :edit, :update, :destroy, :generate, :suggest, :retry_failures, :rerun, :regrade, :refresh_status, :compare]
5
6
  before_action :load_form_collections, only: [:new, :edit, :create, :update]
6
7
 
@@ -10,21 +11,7 @@ module CompletionKit
10
11
  end
11
12
 
12
13
  def show
13
- @responses = if @run.judge_configured? && params[:sort] == "score_asc"
14
- @run.responses
15
- .left_joins(:reviews)
16
- .includes(:reviews)
17
- .group("completion_kit_responses.id")
18
- .order(Arel.sql("AVG(completion_kit_reviews.ai_score) ASC NULLS LAST"))
19
- elsif @run.judge_configured?
20
- @run.responses
21
- .left_joins(:reviews)
22
- .includes(:reviews)
23
- .group("completion_kit_responses.id")
24
- .order(Arel.sql("AVG(completion_kit_reviews.ai_score) DESC NULLS LAST"))
25
- else
26
- @run.responses.includes(:reviews).order(:id)
27
- end
14
+ @responses = ordered_responses_relation(@run, params[:sort]).includes(:reviews)
28
15
  end
29
16
 
30
17
  def new
@@ -208,11 +195,15 @@ module CompletionKit
208
195
  {
209
196
  metric_id: mid,
210
197
  metric_name: anchor.metric_name,
198
+ kind: anchor.check? ? "check" : "llm_judge",
211
199
  left_score: l_review ? l_review.ai_score : nil,
212
200
  right_score: r_review ? r_review.ai_score : nil,
201
+ left_passed: l_review&.passed,
202
+ right_passed: r_review&.passed,
213
203
  left_version_label: version_label_for(l_review, metric_versions),
214
204
  right_version_label: version_label_for(r_review, metric_versions),
215
- delta: (l_review&.ai_score && r_review&.ai_score) ? (r_review.ai_score.to_f - l_review.ai_score.to_f).round(2) : nil
205
+ delta: (l_review&.ai_score && r_review&.ai_score) ? (r_review.ai_score.to_f - l_review.ai_score.to_f).round(2) : nil,
206
+ result_change: RunComparison.result_change(l_review&.passed, r_review&.passed)
216
207
  }
217
208
  end.compact
218
209
  }
@@ -0,0 +1,25 @@
1
module CompletionKit
  # Controller concern that builds the ordered relation of a run's responses.
  module ResponseOrdering
    extend ActiveSupport::Concern

    # Per-response count of reviews whose `passed` column is FALSE.
    FAILED_CHECKS_SQL = "SUM(CASE WHEN completion_kit_reviews.passed IS FALSE THEN 1 ELSE 0 END)".freeze
    # Per-response average of the reviews' `ai_score`.
    RUBRIC_AVG_SQL = "AVG(completion_kit_reviews.ai_score)".freeze

    private

    # Returns `run.responses` in display order.
    #
    # When `run.gradable?` is false the responses are simply ordered by id.
    # Otherwise reviews are left-joined and grouped per response, and rows are
    # ordered by failed-check count, then average ai_score (NULLs last), then
    # response id as a final tie-breaker.
    #
    # sort - "score_asc" puts the most failed checks / lowest averages first;
    #        any other value puts the fewest failed checks / highest averages
    #        first.
    def ordered_responses_relation(run, sort)
      return run.responses.order(:id) unless run.gradable?

      worst_first = sort == "score_asc"
      failed_direction = worst_first ? "DESC" : "ASC"
      score_direction = worst_first ? "ASC" : "DESC"
      ordering = "#{FAILED_CHECKS_SQL} #{failed_direction}, " \
                 "#{RUBRIC_AVG_SQL} #{score_direction} NULLS LAST, " \
                 "completion_kit_responses.id ASC"

      grouped = run.responses.left_joins(:reviews).group("completion_kit_responses.id")
      grouped.order(Arel.sql(ordering))
    end
  end
end
@@ -159,6 +159,34 @@ module CompletionKit
159
159
  :low
160
160
  end
161
161
 
162
# Renders a badge <span> for a check result: "Pass" for true, "Fail" for
# false, and "Pending" for anything else (e.g. nil).
def ck_check_badge(passed)
  label, kind =
    case passed
    when true then ["Pass", :high]
    when false then ["Fail", :low]
    else ["Pending", :pending]
    end

  content_tag(:span, label, class: ck_badge_classes(kind))
end
171
+
172
# Renders a delta <span> for a result-change string ("broke", "fixed" or
# "same"). Returns nil for any other value, including nil.
def ck_result_change_badge(change)
  label, css_class = {
    "broke" => ["Broke", "ck-delta ck-delta--negative"],
    "fixed" => ["Fixed", "ck-delta ck-delta--positive"],
    "same" => ["Same", "ck-delta ck-delta--zero"]
  }[change]
  return unless label

  content_tag(:span, label, class: css_class)
end
182
+
183
# Maps a pass rate to a badge kind: :high at 0.9 and above, :medium at 0.7
# and above, :low otherwise. `rate` must respond to `>=` with a number.
def ck_pass_rate_kind(rate)
  if rate >= 0.9
    :high
  elsif rate >= 0.7
    :medium
  else
    :low
  end
end
189
+
162
190
  def ck_word_diff_old(old_text, new_text)
163
191
  diff_tokens(old_text, new_text, :old)
164
192
  end
@@ -58,6 +58,28 @@ module CompletionKit
58
58
  scored_reviews_since(since).where(metric_id: metric_id).average(:ai_score)&.to_f&.round(2)
59
59
  end
60
60
 
61
# Fraction of a metric's reviews that passed, rounded to two decimals.
#
# Only reviews created at or after `since` with a non-NULL `passed` value are
# counted. Returns nil when there are no such reviews.
def self.metric_pass_rate(metric_id, since:)
  decided = Review.joins(:response)
                  .where(metric_id: metric_id)
                  .where("completion_kit_reviews.created_at >= ?", since)
                  .where.not(passed: nil)
  decided_count = decided.count
  return nil if decided_count.zero?

  passed_count = decided.where(passed: true).count
  (passed_count.to_f / decided_count).round(2)
end
71
+
72
# Reviews with `passed = false` created at or after `since`, most recently
# updated first. Returns `{ count:, items: }`, where each item carries the
# review's metric name, its response, and that response's run.
def self.failing_checks(since:)
  failed_reviews = Review.where(passed: false)
                         .where("completion_kit_reviews.created_at >= ?", since)
                         .includes(response: :run)
                         .order(updated_at: :desc)

  items = failed_reviews.map do |review|
    response = review.response
    { metric_name: review.metric_name, response: response, run: response.run }
  end

  { count: items.size, items: items }
end
82
+
61
83
  # Everything that terminally failed in the window across all three
62
84
  # surfaces — failed runs, failed generations, failed judge reviews —
63
85
  # excluding any the user has dismissed. Returns a count and an items list
@@ -0,0 +1,13 @@
1
module CompletionKit
  # Stateless helpers for comparing the results of two runs.
  module RunComparison
    module_function

    # Classifies how a pass/fail result changed from the left side to the
    # right side.
    #
    # Returns nil when either side is nil, "broke" for pass -> fail,
    # "fixed" for fail -> pass, and "same" when both sides agree.
    def result_change(left_passed, right_passed)
      return nil if [left_passed, right_passed].any?(&:nil?)

      # Collapse both sides to strict booleans so the pair can be matched.
      case [!!left_passed, !!right_passed]
      when [true, false] then "broke"
      when [false, true] then "fixed"
      else "same"
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
<%# Dashboard stat card: shows the failing-check count and links up to five failing checks to their runs. %>
<% failing_count = failing_checks[:count] %>
<% failing_items = failing_checks[:items] %>
<div class="ck-card ck-stat-card ck-rise" id="ck-failing-checks-card" style="--rise-delay: 200ms;">
  <p class="ck-kicker">Failing checks · last 7 days</p>
  <div class="ck-stat-card__body">
    <span class="ck-stat-card__count<%= failing_count.positive? ? ' is-danger' : ' is-clean' %>"><%= failing_count %></span>
  </div>

  <% if failing_items.empty? %>
    <div class="ck-stat-card__foot"><span>No failing checks this week.</span></div>
  <% else %>
    <ul class="ck-failure-list">
      <% failing_items.take(5).each do |item| %>
        <li class="ck-failure-list__item">
          <span class="ck-failure-list__surface ck-failure-list__surface--check">check</span>
          <%= link_to item[:metric_name], completion_kit.run_path(item[:run]), class: "ck-link ck-failure-list__cause" %>
        </li>
      <% end %>
    </ul>
  <% end %>
</div>
@@ -28,7 +28,7 @@
28
28
  </nav>
29
29
 
30
30
  <% if @activity %>
31
- <div class="ck-grid ck-grid--cards ck-grid--cards-3 ck-pulse-grid">
31
+ <div class="ck-grid ck-grid--cards ck-grid--cards-4 ck-pulse-grid">
32
32
  <div class="ck-card ck-stat-card ck-rise" style="--rise-delay: 60ms;">
33
33
  <p class="ck-kicker">Activity · last 14 days</p>
34
34
  <% activity_max = @activity.map { |d| d[:count] }.max %>
@@ -49,6 +49,9 @@
49
49
 
50
50
  <%= render "completion_kit/dashboard/failures_card",
51
51
  failures: @failures, ignored_failures: @ignored_failures %>
52
+
53
+ <%= render "completion_kit/dashboard/failing_checks_card",
54
+ failing_checks: @failing_checks %>
52
55
  </div>
53
56
 
54
57
  <div class="ck-card ck-card--spaced ck-rise" style="--rise-delay: 240ms;">
@@ -54,10 +54,15 @@
54
54
  <% family_runs = CompletionKit::Run.where(prompt_id: prompt.family_versions.select(:id)).display_scoped %>
55
55
  <% current_version_runs = prompt.runs.display_scoped.includes(responses: :reviews) %>
56
56
  <% best_score = current_version_runs.map(&:avg_score).compact.max %>
57
+ <% best_pass_rate = current_version_runs.map(&:check_pass_rate).compact.max %>
57
58
  <td>
58
59
  <% if best_score %>
59
60
  <span class="<%= ck_badge_classes(ck_score_kind(best_score)) %>"><%= best_score %></span>
60
- <% else %>
61
+ <% end %>
62
+ <% if best_pass_rate %>
63
+ <span class="<%= ck_badge_classes(ck_pass_rate_kind(best_pass_rate)) %>"><%= (best_pass_rate * 100).round %>%</span>
64
+ <% end %>
65
+ <% unless best_score || best_pass_rate %>
61
66
  <span class="ck-prompts-table__dim">—</span>
62
67
  <% end %>
63
68
  </td>
@@ -64,7 +64,9 @@
64
64
  </thead>
65
65
  <tbody>
66
66
  <% versions.each do |v| %>
67
- <% best_score = v.runs.display_scoped.map(&:avg_score).compact.max %>
67
+ <% scoped_runs = v.runs.display_scoped %>
68
+ <% best_score = scoped_runs.map(&:avg_score).compact.max %>
69
+ <% best_pass_rate = scoped_runs.map(&:check_pass_rate).compact.max %>
68
70
  <% pred = predecessor_of[v] %>
69
71
  <tr class="<%= "ck-results-table__row--active" if v.id == @prompt.id %>" onclick="window.location='<%= prompt_path(v) %>'" style="cursor: pointer;">
70
72
  <td>
@@ -86,7 +88,11 @@
86
88
  <td>
87
89
  <% if best_score %>
88
90
  <span class="<%= ck_badge_classes(ck_score_kind(best_score)) %>"><%= best_score %></span>
89
- <% else %>
91
+ <% end %>
92
+ <% if best_pass_rate %>
93
+ <span class="<%= ck_badge_classes(ck_pass_rate_kind(best_pass_rate)) %>"><%= (best_pass_rate * 100).round %>%</span>
94
+ <% end %>
95
+ <% unless best_score || best_pass_rate %>
90
96
  <span class="ck-prompts-table__dim">—</span>
91
97
  <% end %>
92
98
  </td>
@@ -116,6 +116,8 @@
116
116
  <% 5.times do |i| %>
117
117
  <svg viewBox="0 0 24 24" width="16" height="16" stroke-width="1.75" class="ck-star <%= i < review.ai_score.to_i ? "ck-star--filled" : "ck-star--empty" %>"><polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2"/></svg>
118
118
  <% end %>
119
+ <% elsif review.check? && !review.passed.nil? %>
120
+ <%= ck_check_badge(review.passed) %>
119
121
  <% else %>
120
122
  <span class="<%= ck_badge_classes(:pending) %>">Pending</span>
121
123
  <% end %>
@@ -10,14 +10,21 @@
10
10
  <% end %>
11
11
  </td>
12
12
  <td data-label="Metrics">
13
- <% scored_reviews = response.reviews.select { |r| r.ai_score.present? }.sort_by { |r| r.metric_name.to_s.downcase } %>
14
- <% if scored_reviews.any? %>
13
+ <% pip_reviews = response.reviews.select { |r| r.ai_score.present? || !r.passed.nil? }.sort_by { |r| r.metric_name.to_s.downcase } %>
14
+ <% if pip_reviews.any? %>
15
15
  <span class="ck-metric-bar ck-metric-bar--compact">
16
- <% scored_reviews.each do |r| %>
17
- <span class="ck-metric-pip ck-metric-pip--<%= ck_score_kind(r.ai_score.to_f) %>">
18
- <span class="ck-metric-pip__bar"></span>
19
- <span class="ck-metric-pip__label"><%= r.metric_name %> <strong><%= r.ai_score %></strong></span>
20
- </span>
16
+ <% pip_reviews.each do |r| %>
17
+ <% if r.ai_score.present? %>
18
+ <span class="ck-metric-pip ck-metric-pip--<%= ck_score_kind(r.ai_score.to_f) %>">
19
+ <span class="ck-metric-pip__bar"></span>
20
+ <span class="ck-metric-pip__label"><%= r.metric_name %> <strong><%= r.ai_score %></strong></span>
21
+ </span>
22
+ <% else %>
23
+ <span class="ck-metric-pip ck-metric-pip--<%= ck_pass_rate_kind(r.passed ? 1.0 : 0.0) %>">
24
+ <span class="ck-metric-pip__bar"></span>
25
+ <span class="ck-metric-pip__label"><%= r.metric_name %> <strong><%= r.passed ? "Pass" : "Fail" %></strong></span>
26
+ </span>
27
+ <% end %>
21
28
  <% end %>
22
29
  </span>
23
30
  <% else %>
@@ -26,7 +33,12 @@
26
33
  </td>
27
34
  <td data-label="Avg score">
28
35
  <% if response.reviewed? %>
29
- <span class="<%= ck_badge_classes(ck_score_kind(response.score.to_f)) %>"><%= response.score %></span>
36
+ <% if response.score %>
37
+ <span class="<%= ck_badge_classes(ck_score_kind(response.score.to_f)) %>"><%= response.score %></span>
38
+ <% end %>
39
+ <% if response.checks_total.positive? %>
40
+ <span class="<%= ck_badge_classes(ck_pass_rate_kind(response.checks_passed.to_f / response.checks_total)) %>"><%= response.checks_passed %>/<%= response.checks_total %></span>
41
+ <% end %>
30
42
  <% else %>
31
43
  <span class="ck-response-cell__dim">—</span>
32
44
  <% end %>
@@ -34,10 +34,17 @@
34
34
  <% if metrics.any? %>
35
35
  <div class="ck-metric-bar ck-metric-bar--compact">
36
36
  <% metrics.each do |m| %>
37
- <span class="ck-metric-pip ck-metric-pip--<%= ck_score_kind(m[:avg]) %>">
38
- <span class="ck-metric-pip__bar"></span>
39
- <span class="ck-metric-pip__label"><%= m[:name] %> <strong><%= m[:avg] %></strong></span>
40
- </span>
37
+ <% if m[:kind] == "check" %>
38
+ <span class="ck-metric-pip ck-metric-pip--<%= ck_pass_rate_kind(m[:pass_rate]) %>">
39
+ <span class="ck-metric-pip__bar"></span>
40
+ <span class="ck-metric-pip__label"><%= m[:name] %> <strong><%= (m[:pass_rate] * 100).round %>%</strong></span>
41
+ </span>
42
+ <% else %>
43
+ <span class="ck-metric-pip ck-metric-pip--<%= ck_score_kind(m[:avg]) %>">
44
+ <span class="ck-metric-pip__bar"></span>
45
+ <span class="ck-metric-pip__label"><%= m[:name] %> <strong><%= m[:avg] %></strong></span>
46
+ </span>
47
+ <% end %>
41
48
  <% end %>
42
49
  </div>
43
50
  <% else %>
@@ -46,8 +53,14 @@
46
53
  </td>
47
54
  <td>
48
55
  <% avg = run.avg_score %>
49
- <% if avg %>
50
- <span class="<%= ck_badge_classes(ck_score_kind(avg)) %>"><%= avg %></span>
56
+ <% pass_rate = run.check_pass_rate %>
57
+ <% if avg || pass_rate %>
58
+ <% if avg %>
59
+ <span class="<%= ck_badge_classes(ck_score_kind(avg)) %>"><%= avg %></span>
60
+ <% end %>
61
+ <% if pass_rate %>
62
+ <span class="<%= ck_badge_classes(ck_pass_rate_kind(pass_rate)) %>"><%= (pass_rate * 100).round %>%</span>
63
+ <% end %>
51
64
  <% else %>
52
65
  <span class="ck-runs-table__dim">—</span>
53
66
  <% end %>
@@ -1,5 +1,5 @@
1
1
  <div id="run_sort_toolbar">
2
- <% if run.judge_configured? %>
2
+ <% if run.gradable? %>
3
3
  <% active = run.status == "completed" && run.responses.joins(:reviews).exists? %>
4
4
  <div class="ck-toolbar" style="margin-top: 1.5rem;<%= ' visibility: hidden;' unless active %>" aria-hidden="<%= !active %>">
5
5
  <%= link_to "Best first", run_path(run, sort: "score_desc"), class: params[:sort].blank? || params[:sort] == "score_desc" ? ck_button_classes(:dark) : ck_button_classes(:light, variant: :outline), tabindex: active ? nil : -1 %>
@@ -2,7 +2,9 @@
2
2
  <div id="run_status_panel" aria-live="polite" aria-atomic="true">
3
3
  <% if run.status.in?(%w[running completed]) && snap[:generated_total] > 0 %>
4
4
  <% failed_count = snap[:generated_failed] + snap[:judged_failed] %>
5
- <% has_judge = snap[:judged_total] > 0 || run.judge_configured? %>
5
+ <% show_grading = snap[:judged_total] > 0 || run.gradable? %>
6
+ <% has_llm_metric = run.llm_metrics.any? %>
7
+ <% has_checks = run.check_metrics.any? %>
6
8
  <% metric_avgs = run.metric_averages.sort_by { |m| m[:name].to_s.downcase } %>
7
9
  <% metric_lookup = metric_avgs.index_by { |m| m[:name].to_s.downcase } %>
8
10
  <section class="ck-run-status ck-run-status--<%= run.status %>">
@@ -18,7 +20,7 @@
18
20
  <p class="ck-run-status__cell-value ck-run-status__summary-line">
19
21
  <span class="ck-run-status__summary-num"><%= snap[:generated_done] %></span>
20
22
  <span class="ck-run-status__summary-text">of <%= snap[:generated_total] %> responses</span>
21
- <% if has_judge %>
23
+ <% if show_grading %>
22
24
  <span class="ck-run-status__summary-sep">·</span>
23
25
  <span class="ck-run-status__summary-num"><%= snap[:judged_done] %></span>
24
26
  <span class="ck-run-status__summary-text">of <%= snap[:judged_total] %> judged</span>
@@ -30,7 +32,7 @@
30
32
  </p>
31
33
  </div>
32
34
 
33
- <% if has_judge %>
35
+ <% if show_grading %>
34
36
  <div class="ck-run-status__cell">
35
37
  <p class="ck-run-status__metric-label">Metrics</p>
36
38
  <div class="ck-run-status__cell-value">
@@ -38,7 +40,12 @@
38
40
  <div class="ck-metric-bar ck-metric-bar--compact">
39
41
  <% run.metrics.order(:name).each do |metric| %>
40
42
  <% avg_for_metric = metric_lookup[metric.name.to_s.downcase] %>
41
- <% if avg_for_metric %>
43
+ <% if avg_for_metric && avg_for_metric[:kind] == "check" %>
44
+ <span class="ck-metric-pip ck-metric-pip--<%= ck_pass_rate_kind(avg_for_metric[:pass_rate]) %>">
45
+ <span class="ck-metric-pip__bar"></span>
46
+ <span class="ck-metric-pip__label"><%= metric.name %> <strong><%= (avg_for_metric[:pass_rate] * 100).round %>%</strong></span>
47
+ </span>
48
+ <% elsif avg_for_metric %>
42
49
  <span class="ck-metric-pip ck-metric-pip--<%= ck_score_kind(avg_for_metric[:avg]) %>">
43
50
  <span class="ck-metric-pip__bar"></span>
44
51
  <span class="ck-metric-pip__label"><%= metric.name %> <strong><%= avg_for_metric[:avg] %></strong></span>
@@ -57,16 +64,31 @@
57
64
  </div>
58
65
  </div>
59
66
 
60
- <div class="ck-run-status__cell">
61
- <p class="ck-run-status__metric-label">Avg score</p>
62
- <div class="ck-run-status__cell-value">
63
- <% if run.avg_score %>
64
- <span class="<%= ck_badge_classes(ck_score_kind(run.avg_score)) %> ck-badge--lg"><%= run.avg_score %></span>
65
- <% else %>
66
- <span class="ck-run-status__cell-empty">—</span>
67
- <% end %>
67
+ <% if has_llm_metric %>
68
+ <div class="ck-run-status__cell">
69
+ <p class="ck-run-status__metric-label">Avg score</p>
70
+ <div class="ck-run-status__cell-value">
71
+ <% if run.avg_score %>
72
+ <span class="<%= ck_badge_classes(ck_score_kind(run.avg_score)) %> ck-badge--lg"><%= run.avg_score %></span>
73
+ <% else %>
74
+ <span class="ck-run-status__cell-empty">—</span>
75
+ <% end %>
76
+ </div>
68
77
  </div>
69
- </div>
78
+ <% end %>
79
+
80
+ <% if has_checks %>
81
+ <div class="ck-run-status__cell">
82
+ <p class="ck-run-status__metric-label">Checks passed</p>
83
+ <div class="ck-run-status__cell-value">
84
+ <% if run.check_pass_rate %>
85
+ <span class="<%= ck_badge_classes(ck_pass_rate_kind(run.check_pass_rate)) %> ck-badge--lg"><%= (run.check_pass_rate * 100).round %>%</span>
86
+ <% else %>
87
+ <span class="ck-run-status__cell-empty">—</span>
88
+ <% end %>
89
+ </div>
90
+ </div>
91
+ <% end %>
70
92
  <% end %>
71
93
  </div>
72
94
 
@@ -46,21 +46,35 @@
46
46
  <% end %>
47
47
  <td><%= pm[:metric_name] %></td>
48
48
  <td>
49
- <% if pm[:left_score] %>
49
+ <% if pm[:kind] == "check" %>
50
+ <% if pm[:left_passed].nil? %>
51
+ <span class="ck-meta-copy">—</span>
52
+ <% else %>
53
+ <%= ck_check_badge(pm[:left_passed]) %>
54
+ <% end %>
55
+ <% elsif pm[:left_score] %>
50
56
  <span class="<%= ck_badge_classes(ck_score_kind(pm[:left_score].to_f)) %>"><%= pm[:left_score] %></span>
51
57
  <% else %>
52
58
  <span class="ck-meta-copy">—</span>
53
59
  <% end %>
54
60
  </td>
55
61
  <td>
56
- <% if pm[:right_score] %>
62
+ <% if pm[:kind] == "check" %>
63
+ <% if pm[:right_passed].nil? %>
64
+ <span class="ck-meta-copy">—</span>
65
+ <% else %>
66
+ <%= ck_check_badge(pm[:right_passed]) %>
67
+ <% end %>
68
+ <% elsif pm[:right_score] %>
57
69
  <span class="<%= ck_badge_classes(ck_score_kind(pm[:right_score].to_f)) %>"><%= pm[:right_score] %></span>
58
70
  <% else %>
59
71
  <span class="ck-meta-copy">—</span>
60
72
  <% end %>
61
73
  </td>
62
74
  <td>
63
- <% if pm[:delta] %>
75
+ <% if pm[:kind] == "check" %>
76
+ <%= ck_result_change_badge(pm[:result_change]) || content_tag(:span, "—", class: "ck-meta-copy") %>
77
+ <% elsif pm[:delta] %>
64
78
  <% delta_class = pm[:delta] > 0 ? "ck-delta--positive" : pm[:delta] < 0 ? "ck-delta--negative" : "ck-delta--zero" %>
65
79
  <span class="ck-delta <%= delta_class %>"><%= pm[:delta].positive? ? "+#{pm[:delta]}" : pm[:delta].to_s %></span>
66
80
  <% else %>
@@ -20,6 +20,9 @@
20
20
  <% if @run.avg_score %>
21
21
  <span class="<%= ck_badge_classes(ck_score_kind(@run.avg_score)) %>"><%= @run.avg_score %></span>
22
22
  <% end %>
23
+ <% if @run.check_pass_rate %>
24
+ <span class="<%= ck_badge_classes(ck_pass_rate_kind(@run.check_pass_rate)) %>"><%= (@run.check_pass_rate * 100).round %>%</span>
25
+ <% end %>
23
26
  </p>
24
27
  </div>
25
28
  <div class="ck-actions">
@@ -1,3 +1,3 @@
1
1
  module CompletionKit
2
- VERSION = "0.18.1"
2
+ VERSION = "0.19.0"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: completion-kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.18.1
4
+ version: 0.19.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Damien Bastin
@@ -261,6 +261,7 @@ files:
261
261
  - app/controllers/completion_kit/runs_controller.rb
262
262
  - app/controllers/completion_kit/suggestions_controller.rb
263
263
  - app/controllers/completion_kit/tags_controller.rb
264
+ - app/controllers/concerns/completion_kit/response_ordering.rb
264
265
  - app/controllers/concerns/completion_kit/tag_filtering.rb
265
266
  - app/helpers/completion_kit/application_helper.rb
266
267
  - app/jobs/completion_kit/application_job.rb
@@ -339,6 +340,7 @@ files:
339
340
  - app/services/completion_kit/prompt_improvement_service.rb
340
341
  - app/services/completion_kit/prompt_improvement_validator.rb
341
342
  - app/services/completion_kit/provider_endpoint.rb
343
+ - app/services/completion_kit/run_comparison.rb
342
344
  - app/services/completion_kit/starter_metrics.rb
343
345
  - app/services/completion_kit/worker_health.rb
344
346
  - app/validators/completion_kit/tenant_scoped_uniqueness_validator.rb
@@ -352,6 +354,7 @@ files:
352
354
  - app/views/completion_kit/api_reference/index.html.erb
353
355
  - app/views/completion_kit/dashboard/_eye_icon.html.erb
354
356
  - app/views/completion_kit/dashboard/_eye_off_icon.html.erb
357
+ - app/views/completion_kit/dashboard/_failing_checks_card.html.erb
355
358
  - app/views/completion_kit/dashboard/_failures_card.html.erb
356
359
  - app/views/completion_kit/dashboard/_worst_metric_card.html.erb
357
360
  - app/views/completion_kit/dashboard/show.html.erb