completion-kit 0.10.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4772b264a668a86e004f78c7bc2397d93f5266d8dc287b422728d951ab24fbcc
4
- data.tar.gz: 63b221e144e930df9978607a78533d354f659ca2bf6141046291168af50d3cd7
3
+ metadata.gz: 0b32ec77fb60d07f40e4b83827c2510aaeb695c96c9c6df86e4b42a7ec57516b
4
+ data.tar.gz: ade912039e4942c87d73c13443bd405533eec2988478e02cd1ccb87550de2783
5
5
  SHA512:
6
- metadata.gz: a3249ae1c734dcee0c6f9410baf0400f4b16e091b220d86e8417dec91ff9943a165bbc6c8368629cc14054c3af7946bdb693f006a5756de40a809c41db5bbe3a
7
- data.tar.gz: 541323c93b08f08f32c2f024e3709ee5f3e4e48144cd8e9cb2ba8891312fd9b5712e4ca39cbcc739b64ba61ff0ac890be3528e5bfd9c238e073d640b37b47e90
6
+ metadata.gz: bb8664ea804d59e3761ab385d1af98ecf7d110dd7e68e7003e1a4b2c059c5e377a5e42d350a46310f58c6d8c41c0e31a6aa0cdaf8a6b50b4d9f419e6fa60e474
7
+ data.tar.gz: 3bbe72cf7e99a4ae899765829ee8bee83703885ebdfa5b6b9f7253f25f2373b1dd22aadfeb071f4b340c1b7d33dfc4e590e4f8a2df2239afbbc305de009af2cf
@@ -3619,10 +3619,9 @@ select.ck-input {
3619
3619
  .ck-prompt-versions-table th:nth-child(3), .ck-prompt-versions-table td:nth-child(3) { width: 8rem; white-space: nowrap; }
3620
3620
  .ck-prompt-versions-table th:nth-child(4), .ck-prompt-versions-table td:nth-child(4) { width: auto; }
3621
3621
 
3622
- .ck-metric-versions-table th:nth-child(1), .ck-metric-versions-table td:nth-child(1) { width: 34%; }
3623
- .ck-metric-versions-table th:nth-child(2), .ck-metric-versions-table td:nth-child(2) { width: 33%; white-space: nowrap; }
3624
- .ck-metric-versions-table th:nth-child(3), .ck-metric-versions-table td:nth-child(3) { width: 33%; white-space: nowrap; }
3625
- .ck-metric-versions-table .ck-version-cell { justify-content: flex-start; gap: 0.75rem; }
3622
+ .ck-metric-versions-table th:nth-child(1), .ck-metric-versions-table td:nth-child(1) { width: 18rem; }
3623
+ .ck-metric-versions-table th:nth-child(2), .ck-metric-versions-table td:nth-child(2) { width: 16rem; white-space: nowrap; }
3624
+ .ck-metric-versions-table th:nth-child(3), .ck-metric-versions-table td:nth-child(3) { width: auto; white-space: nowrap; }
3626
3625
 
3627
3626
 
3628
3627
  .ck-source-chip {
@@ -6001,3 +6000,67 @@ a.tag-mark {
6001
6000
  width: 2rem;
6002
6001
  height: 2rem;
6003
6002
  }
6003
+
6004
+ .ck-suggestion-status:empty { display: none; }
6005
+ .ck-suggestion-status {
6006
+ margin-top: 10px;
6007
+ display: flex;
6008
+ align-items: baseline;
6009
+ gap: 10px;
6010
+ flex-wrap: wrap;
6011
+ }
6012
+
6013
+ .ck-scoreboard {
6014
+ margin-bottom: 16px;
6015
+ padding-bottom: 14px;
6016
+ border-bottom: 1px solid var(--ck-line);
6017
+ }
6018
+ .ck-scoreboard__headline {
6019
+ margin: 0 0 8px;
6020
+ font-size: 0.95rem;
6021
+ color: var(--ck-text);
6022
+ }
6023
+ .ck-scoreboard__was {
6024
+ font-family: var(--ck-mono);
6025
+ font-size: 0.74rem;
6026
+ color: var(--ck-muted);
6027
+ margin-left: 6px;
6028
+ }
6029
+ .ck-scoreboard__tally {
6030
+ list-style: none;
6031
+ margin: 0;
6032
+ padding: 0;
6033
+ display: flex;
6034
+ gap: 18px;
6035
+ }
6036
+ .ck-scoreboard__stat {
6037
+ font-family: var(--ck-mono);
6038
+ font-size: 0.72rem;
6039
+ letter-spacing: 0.06em;
6040
+ text-transform: uppercase;
6041
+ color: var(--ck-muted);
6042
+ }
6043
+ .ck-scoreboard__stat strong { color: var(--ck-text); }
6044
+ .ck-scoreboard__stat--break strong { color: var(--ck-warning); }
6045
+ .ck-scoreboard__note {
6046
+ margin: 8px 0 0;
6047
+ font-size: 0.78rem;
6048
+ color: var(--ck-muted);
6049
+ }
6050
+ .ck-version-change {
6051
+ display: inline-flex;
6052
+ align-items: baseline;
6053
+ gap: 0.6rem;
6054
+ }
6055
+ .ck-version-score {
6056
+ font-family: var(--ck-mono);
6057
+ font-size: 0.74rem;
6058
+ color: var(--ck-dim);
6059
+ }
6060
+ .ck-version-score__label {
6061
+ font-size: 0.6rem;
6062
+ letter-spacing: 0.08em;
6063
+ text-transform: uppercase;
6064
+ color: var(--ck-muted);
6065
+ margin-right: 0.2rem;
6066
+ }
@@ -117,26 +117,22 @@ module CompletionKit
117
117
 
118
118
  def suggest_variants
119
119
  target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
120
- disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
121
- if disagreement_count.zero?
120
+ counts = Calibration.where(metric_id: @metric.id, verdict: %w[agree disagree]).group(:verdict).count
121
+ if counts["disagree"].to_i.zero?
122
122
  redirect_to target, alert: "Mark at least one case as Disagree before asking the model to suggest a change."
123
123
  return
124
124
  end
125
125
 
126
- MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
126
+ MetricSuggestionJob.perform_later(@metric.id)
127
127
 
128
- generator = MetricVariantGenerator.new(@metric, count: 1)
129
- variants = generator.call
130
- if variants.empty?
131
- redirect_to target, alert: "The model returned no usable variants. Try again with a different model."
132
- return
133
- end
134
- versions = generator.persist!(variants)
135
- new_version = versions.max_by(&:version_number)
136
128
  if params[:back_to] == "edit"
137
- redirect_to edit_metric_path(@metric), notice: "Drafted #{new_version.version_label} from your reviews. Review the proposed changes below, then Publish to use it."
129
+ redirect_to metric_path(@metric), notice: "Drafting a change from your reviews. It will appear here once it's tested."
138
130
  else
139
- redirect_to metric_path(@metric, show_change: new_version.id), notice: "Drafted #{new_version.version_label} from your reviews."
131
+ render turbo_stream: turbo_stream.replace(
132
+ "ck-suggestion-status-#{@metric.id}",
133
+ partial: "completion_kit/metrics/suggestion_pending",
134
+ locals: { metric: @metric, count: counts.values.sum }
135
+ )
140
136
  end
141
137
  end
142
138
 
@@ -0,0 +1,46 @@
1
+ require "faraday"
2
+
3
module CompletionKit
  # Drafts an AI-suggested version of a metric in the background, validates the
  # draft with MetricImprovementValidator, and broadcasts the outcome to the
  # metric's suggestion-status element over Turbo Streams.
  class MetricSuggestionJob < ApplicationJob
    queue_as :llm

    # Handler order matters. ActiveSupport::Rescuable searches handlers from the
    # most recently declared to the earliest, and retry_on is implemented on top
    # of rescue_from. The catch-all therefore has to be declared BEFORE the
    # retry_on lines; declared after them it would match the transient errors
    # first and the retries would never run.
    rescue_from(StandardError) { |error| report_failure(error) }

    # Transient network / rate-limit failures are retried. Once the attempts are
    # exhausted the block runs, so the page stops showing "pending" forever.
    retry_on Faraday::TimeoutError, Faraday::ConnectionFailed, wait: :polynomially_longer, attempts: 5 do |job, error|
      job.send(:report_failure, error)
    end
    retry_on CompletionKit::RateLimitError, wait: :polynomially_longer, attempts: 5 do |job, error|
      job.send(:report_failure, error)
    end

    # @param metric_id [Integer] id of the metric to draft a suggestion for;
    #   the job is a no-op when no such metric exists.
    def perform(metric_id)
      @metric = Metric.find_by(id: metric_id)
      return unless @metric

      # Only one suggestion draft is kept per metric: clear earlier ones first.
      MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all

      generator = MetricVariantGenerator.new(@metric, count: 1)
      variants = generator.call
      if variants.empty?
        broadcast_status(@metric, partial: "completion_kit/metrics/suggestion_failed", locals: { metric: @metric })
        return
      end

      draft = generator.persist!(variants).max_by(&:version_number)
      summary = MetricImprovementValidator.new(@metric, draft).call
      draft.update!(validation_summary: summary)

      broadcast_status(@metric, partial: "completion_kit/metrics/suggestion_ready", locals: { metric: @metric, draft: draft })
    end

    private

    # Reports the error and, when the metric was loaded, swaps the status
    # element to the "failed" partial. The nil guard keeps the handler itself
    # from raising when the failure happened before @metric was assigned.
    def report_failure(error)
      Rails.error.report(error, handled: true, context: { job: self.class.name })
      return unless @metric

      broadcast_status(@metric, partial: "completion_kit/metrics/suggestion_failed", locals: { metric: @metric })
    end

    # Renders +partial+ with +locals+ and replaces the metric's status element
    # for every subscriber of the "metric_<id>_suggestion" stream.
    def broadcast_status(metric, partial:, locals:)
      html = CompletionKit::ApplicationController.render(partial: partial, locals: locals)
      Turbo::StreamsChannel.broadcast_replace_to(
        "metric_#{metric.id}_suggestion",
        target: "ck-suggestion-status-#{metric.id}",
        html: html
      )
    end
  end
end
@@ -6,6 +6,7 @@ module CompletionKit
6
6
  has_many :calibrations, dependent: :destroy
7
7
 
8
8
  serialize :rubric_bands, coder: JSON
9
+ serialize :validation_summary, coder: JSON
9
10
 
10
11
  before_validation :assign_version_number, on: :create
11
12
 
@@ -0,0 +1,101 @@
1
module CompletionKit
  # Re-scores the reviewer's most recent calibrations with a candidate metric
  # version and tallies how often the candidate lands on the reviewer's position.
  #
  # Outcome recorded per calibration:
  #   "fix"       - reviewer disagreed; candidate matches the corrected score
  #   "still_off" - reviewer disagreed; candidate still misses the corrected score
  #   "keep"      - reviewer agreed; candidate reproduces the agreed score
  #   "break"     - reviewer agreed; candidate lands on a different score
  class MetricImprovementValidator
    # Maximum number of calibrations (most recent first) that get re-scored.
    ANSWER_KEY_LIMIT = 30

    # @param metric the metric whose calibrations form the answer key
    # @param candidate the version under test; the default scorer reads its
    #   rubric_bands and instruction
    # @param scorer [#call, nil] callable taking (response, candidate) and
    #   returning a score; defaults to re-scoring through JudgeService
    def initialize(metric, candidate, scorer: nil)
      @metric = metric
      @candidate = candidate
      @scorer = scorer || method(:rescore)
    end

    # Scores every answer-key entry and returns the summary hash.
    #
    # Entries whose scoring raised, or produced no score at all, are left out
    # of the rows. Coercing a missing score with to_i would turn it into 0 and
    # record a judge failure as a genuine "break" / "still_off".
    #
    # @return [Hash] string-keyed summary: "total", "tested", "capped",
    #   "fixes", "keeps", "breaks", "still_off", "before", "after", "rows"
    def call
      key = answer_key
      rows = key.filter_map do |entry|
        score = score_for(entry[:response])
        classify(entry, score.to_i) unless score.nil?
      end
      summarize(rows, key.size, key_capped?)
    end

    private

    # Runs the scorer for one response. Validation is best-effort, so a failure
    # is reported (rather than silently dropped) and treated as "no score".
    def score_for(response)
      @scorer.call(response, @candidate)
    rescue StandardError => e
      Rails.error.report(e, handled: true, context: { validator: self.class.name, response_id: response.id })
      nil
    end

    # Builds the answer key from agree/disagree calibrations made against the
    # metric's current version, newest first, capped at ANSWER_KEY_LIMIT.
    # The reviewer's position is the corrected score for a disagreement and the
    # reviewed AI score for an agreement; entries without a position or without
    # response text are dropped.
    def answer_key
      current = MetricVersion.current.find_by(metric_id: @metric.id)
      return [] unless current

      base = Calibration.where(metric_id: @metric.id, metric_version_id: current.id, verdict: %w[agree disagree])
      # Remembered so key_capped? can tell whether the limit cut anything off.
      @key_size_before_cap = base.count
      base.includes(response: :reviews)
          .order(created_at: :desc)
          .limit(ANSWER_KEY_LIMIT)
          .filter_map do |cal|
            response = cal.response
            # Guard against a calibration whose response is gone.
            next if response.nil?
            next unless response.response_text.present?

            review = response.reviews.find { |r| r.metric_id == @metric.id }
            position = cal.verdict == "disagree" ? cal.corrected_score : review&.ai_score
            next if position.nil?

            { response: response, verdict: cal.verdict, position: position }
          end
    end

    # True when more calibrations existed than the limit allowed into the key.
    def key_capped?
      @key_size_before_cap.to_i > ANSWER_KEY_LIMIT
    end

    # Turns one answer-key entry plus the candidate's score into a result row.
    def classify(entry, candidate_score)
      matched = candidate_score == entry[:position].to_i
      outcome = if entry[:verdict] == "disagree"
                  matched ? "fix" : "still_off"
                else
                  matched ? "keep" : "break"
                end
      {
        "response_id" => entry[:response].id,
        "verdict" => entry[:verdict],
        "position" => entry[:position].to_i,
        "candidate_score" => candidate_score,
        "outcome" => outcome
      }
    end

    # Aggregates rows into the summary. "before" counts tested rows the
    # reviewer agreed with; "after" counts rows the candidate matches.
    def summarize(rows, total, capped)
      outcomes = rows.map { |row| row["outcome"] }.tally
      fixes = outcomes.fetch("fix", 0)
      keeps = outcomes.fetch("keep", 0)
      {
        "total" => total,
        "tested" => rows.size,
        "capped" => capped,
        "fixes" => fixes,
        "keeps" => keeps,
        "breaks" => outcomes.fetch("break", 0),
        "still_off" => outcomes.fetch("still_off", 0),
        "before" => rows.count { |row| row["verdict"] == "agree" },
        "after" => fixes + keeps,
        "rows" => rows
      }
    end

    # Default scorer: asks JudgeService to evaluate the stored response using
    # the candidate's instruction and rubric, with the judge model of the run
    # that produced the response.
    def rescore(response, candidate)
      run = response.run
      config = ApiConfig.for_model(run.judge_model).merge(judge_model: run.judge_model)
      rubric_text = Metric.rubric_text_for(Metric.normalize_rubric_bands(candidate.rubric_bands))
      result = JudgeService.new(config).evaluate(
        response.response_text,
        response.expected_output,
        run.prompt&.template,
        criteria: candidate.instruction.to_s,
        rubric_text: rubric_text,
        input_data: response.input_data
      )
      result[:score]
    end
  end
end
@@ -31,10 +31,7 @@
31
31
  <%= link_to "Review another score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
32
32
  <% end %>
33
33
  <% else %>
34
- <span class="ck-cal-stat"><span class="ck-cal-stat__label">Agreement</span> <strong class="ck-trust-line__figure">~<%= (stats.agreement_point * 100).round %>%</strong></span>
35
- <span class="ck-cal-stat"><span class="ck-cal-stat__label">Margin</span> ±<%= (stats.margin * 100).round %> pt</span>
36
- <span class="ck-cal-stat"><span class="ck-cal-stat__label">Read</span> <%= stats.firm? ? "settled" : "early" %></span>
37
- <span class="ck-cal-stat"><span class="ck-cal-stat__label">Sample</span> <%= stats.sample_size %></span>
34
+ <span class="ck-cal-stat"><span class="ck-cal-stat__label">Agrees with you</span> <strong class="ck-trust-line__figure">~<%= (stats.agreement_point * 100).round %>%</strong> of <%= stats.sample_size %> reviews</span>
38
35
  <% if stats.borderline_rate && stats.borderline_rate > 0 %>
39
36
  <% level = stats.borderline_rate > 0.30 ? "danger" : stats.borderline_rate > 0.15 ? "warning" : "ok" %>
40
37
  <span class="ck-cal-stat"><span class="ck-cal-stat__label">Unclear</span> <span class="ck-trust-line__borderline ck-trust-line__borderline--<%= level %>"><%= (stats.borderline_rate * 100).round %>%</span></span>
@@ -0,0 +1,3 @@
1
+ <div id="ck-suggestion-status-<%= metric.id %>" class="ck-suggestion-status">
2
+ <span class="ck-cal-foot__note">The model returned no usable change. Try again, or review a few more scores first.</span>
3
+ </div>
@@ -0,0 +1,3 @@
1
+ <div id="ck-suggestion-status-<%= metric.id %>" class="ck-suggestion-status ck-suggestion-status--pending">
2
+ <span class="ck-cal-foot__note">Drafting a change and testing it against your <%= pluralize(count, "review") %>…</span>
3
+ </div>
@@ -0,0 +1,4 @@
1
+ <div id="ck-suggestion-status-<%= metric.id %>" class="ck-suggestion-status ck-suggestion-status--ready">
2
+ <span class="ck-cal-foot__note">Drafted <%= draft.version_label %> and tested it against your reviews.</span>
3
+ <%= link_to "Compare and publish →", CompletionKit::Engine.routes.url_helpers.metric_path(metric, show_change: draft.id), class: "ck-cal-link" %>
4
+ </div>
@@ -0,0 +1,12 @@
1
+ <% s = summary %>
2
+ <div class="ck-scoreboard">
3
+ <p class="ck-scoreboard__headline">Matches you on <strong><%= s["after"] %> of <%= s["tested"] %></strong> of your reviews <span class="ck-scoreboard__was">was <%= s["before"] %> of <%= s["tested"] %></span></p>
4
+ <ul class="ck-scoreboard__tally">
5
+ <li class="ck-scoreboard__stat ck-scoreboard__stat--fix">Fixes <strong><%= s["fixes"] %></strong></li>
6
+ <li class="ck-scoreboard__stat ck-scoreboard__stat--keep">Keeps <strong><%= s["keeps"] %></strong></li>
7
+ <li class="ck-scoreboard__stat ck-scoreboard__stat--break">Breaks <strong><%= s["breaks"] %></strong></li>
8
+ </ul>
9
+ <% if s["capped"] %>
10
+ <p class="ck-scoreboard__note">Tested against your 30 most recent reviews.</p>
11
+ <% end %>
12
+ </div>
@@ -76,14 +76,20 @@
76
76
  data: { turbo_confirm: "Make #{v.version_label} the version to use? #{v.version_label} will be used in test runs using this metric now. Reviews you have already given stay with the version they were made against." } %>
77
77
  <% end %>
78
78
  </div>
79
+ <% vs = v.validation_summary %>
79
80
  <% if summary %>
80
- <button type="button" class="ck-cell-link ck-cell-link--delta" title="What changed from <%= pred.version_label %>" onclick="document.getElementById('ck-mvdiff-<%= v.id %>').showModal()">&Delta;</button>
81
+ <div class="ck-version-change">
82
+ <% if v.draft? && vs.present? %>
83
+ <span class="ck-version-score"><span class="ck-version-score__label">Match</span> <%= vs["after"] %>/<%= vs["tested"] %></span>
84
+ <% end %>
85
+ <button type="button" class="ck-cell-link ck-cell-link--delta" title="What changed from <%= pred.version_label %>" onclick="document.getElementById('ck-mvdiff-<%= v.id %>').showModal()">&Delta;</button>
86
+ </div>
81
87
  <% end %>
82
88
  </div>
83
89
  </td>
84
90
  <td>
85
91
  <% source_label, source_class = case v.source
86
- when "suggestion" then ["AI calibration", "ck-source-chip ck-source-chip--ai"]
92
+ when "suggestion" then ["AI suggestion", "ck-source-chip ck-source-chip--ai"]
87
93
  when "edit" then ["Manual edit", "ck-source-chip ck-source-chip--manual"]
88
94
  when "revert" then ["Reverted", "ck-source-chip ck-source-chip--revert"]
89
95
  else ["Original", "ck-source-chip ck-source-chip--initial"]
@@ -110,6 +116,7 @@
110
116
  <% @versions.each do |v| %>
111
117
  <% pred = predecessor_of[v] %>
112
118
  <% next unless v.change_summary_against(pred) %>
119
+ <% vs = v.validation_summary %>
113
120
  <dialog id="ck-mvdiff-<%= v.id %>" class="ck-modal" onclick="if(event.target===this)this.close()">
114
121
  <article class="ck-modal__panel" tabindex="-1" onclick="event.stopPropagation()">
115
122
  <header class="ck-modal__header">
@@ -120,6 +127,9 @@
120
127
  <button type="button" class="ck-modal__close" aria-label="Close" onclick="this.closest('dialog').close()">&times;</button>
121
128
  </header>
122
129
  <div class="ck-modal__body">
130
+ <% if v.draft? && vs.present? %>
131
+ <%= render "completion_kit/metrics/validation_scoreboard", summary: vs %>
132
+ <% end %>
123
133
  <% if pred.instruction.to_s != v.instruction.to_s %>
124
134
  <div class="ck-suggest-diff">
125
135
  <div class="ck-suggest-diff__pane">
@@ -152,8 +162,10 @@
152
162
  title: "Discard draft #{v.version_label}", "aria-label": "Discard draft #{v.version_label}",
153
163
  data: { turbo_confirm: "Discard draft #{v.version_label}? This can't be undone." } do %><%= heroicon_tag "trash", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
154
164
  <%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
165
+ <% net_negative = vs.present? && (vs["after"].to_i < vs["before"].to_i || vs["breaks"].to_i > vs["fixes"].to_i) %>
155
166
  <%= button_to "Publish #{v.version_label} →", publish_draft_metric_path(@metric, draft_id: v.id),
156
- method: :post, form_class: "inline-block", class: ck_button_classes(:dark) %>
167
+ method: :post, form_class: "inline-block", class: ck_button_classes(:dark),
168
+ data: net_negative ? { turbo_confirm: "This agrees with you less than the current version. Publish anyway?" } : {} %>
157
169
  </span>
158
170
  <% else %>
159
171
  <span class="ck-modal__foot-note">Roll this metric back to this version.</span>
@@ -171,7 +183,7 @@
171
183
  <% draft = @suggestion_draft || @edit_draft %>
172
184
  <section class="ck-card ck-card--spaced">
173
185
  <div class="ck-prompt-preview__header">
174
- <p class="ck-kicker">Calibration</p>
186
+ <p class="ck-kicker">Agreement</p>
175
187
  <% if draft.nil? && @improve_disagreement_count.positive? %>
176
188
  <%= button_to suggest_variants_metric_path(@metric),
177
189
  method: :post, form_class: "inline-block",
@@ -182,7 +194,9 @@
182
194
  <% end %>
183
195
  <% end %>
184
196
  </div>
185
- <p class="ck-meta-copy">This is a measure of how often the judge's scores match a human reviewer. Review its scores to build that signal; the scores you disagree with become the cases the model learns from when you improve the metric.</p>
197
+ <%= turbo_stream_from "metric_#{@metric.id}_suggestion" %>
198
+ <div id="ck-suggestion-status-<%= @metric.id %>" class="ck-suggestion-status"></div>
199
+ <p class="ck-meta-copy">How often the judge lands on the same score you would. Review its scores to build that signal, and improve the metric to raise it.</p>
186
200
  <%= render "completion_kit/calibrations/trust_panel",
187
201
  stats: CompletionKit::MetricCalibrationStats.for(@metric),
188
202
  metric: @metric %>
@@ -0,0 +1,5 @@
1
# Adds a nullable text column, validation_summary, to the
# completion_kit_metric_versions table.
class AddValidationSummaryToCompletionKitMetricVersions < ActiveRecord::Migration[8.1]
  def change
    change_table :completion_kit_metric_versions do |table|
      table.text :validation_summary
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module CompletionKit
2
- VERSION = "0.10.0"
2
+ VERSION = "0.11.0"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: completion-kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.0
4
+ version: 0.11.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Damien Bastin
@@ -266,6 +266,7 @@ files:
266
266
  - app/jobs/completion_kit/application_job.rb
267
267
  - app/jobs/completion_kit/generate_row_job.rb
268
268
  - app/jobs/completion_kit/judge_review_job.rb
269
+ - app/jobs/completion_kit/metric_suggestion_job.rb
269
270
  - app/jobs/completion_kit/model_discovery_job.rb
270
271
  - app/jobs/completion_kit/run_completion_check_job.rb
271
272
  - app/mailers/completion_kit/application_mailer.rb
@@ -313,6 +314,7 @@ files:
313
314
  - app/services/completion_kit/mcp_tools/tags.rb
314
315
  - app/services/completion_kit/metric_calibration_examples.rb
315
316
  - app/services/completion_kit/metric_calibration_stats.rb
317
+ - app/services/completion_kit/metric_improvement_validator.rb
316
318
  - app/services/completion_kit/metric_variant_generator.rb
317
319
  - app/services/completion_kit/model_discovery_service.rb
318
320
  - app/services/completion_kit/ollama_client.rb
@@ -355,6 +357,10 @@ files:
355
357
  - app/views/completion_kit/metrics/_rubric_diff.html.erb
356
358
  - app/views/completion_kit/metrics/_rubric_hint.html.erb
357
359
  - app/views/completion_kit/metrics/_starter_card.html.erb
360
+ - app/views/completion_kit/metrics/_suggestion_failed.html.erb
361
+ - app/views/completion_kit/metrics/_suggestion_pending.html.erb
362
+ - app/views/completion_kit/metrics/_suggestion_ready.html.erb
363
+ - app/views/completion_kit/metrics/_validation_scoreboard.html.erb
358
364
  - app/views/completion_kit/metrics/edit.html.erb
359
365
  - app/views/completion_kit/metrics/index.html.erb
360
366
  - app/views/completion_kit/metrics/new.html.erb
@@ -433,6 +439,7 @@ files:
433
439
  - db/migrate/20260528000002_add_metric_version_to_reviews.rb
434
440
  - db/migrate/20260529000001_remove_few_shot_examples_from_completion_kit_metrics.rb
435
441
  - db/migrate/20260530000001_add_excluded_from_examples_to_completion_kit_calibrations.rb
442
+ - db/migrate/20260531000001_add_validation_summary_to_completion_kit_metric_versions.rb
436
443
  - lib/completion-kit.rb
437
444
  - lib/completion_kit.rb
438
445
  - lib/completion_kit/concurrency_check.rb