completion-kit 0.9.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25) hide show
  1. checksums.yaml +4 -4
  2. data/app/assets/stylesheets/completion_kit/application.css +167 -48
  3. data/app/controllers/completion_kit/metrics_controller.rb +27 -14
  4. data/app/jobs/completion_kit/judge_review_job.rb +9 -4
  5. data/app/jobs/completion_kit/metric_suggestion_job.rb +46 -0
  6. data/app/models/completion_kit/metric_version.rb +1 -0
  7. data/app/services/completion_kit/judge_service.rb +19 -3
  8. data/app/services/completion_kit/metric_calibration_examples.rb +56 -0
  9. data/app/services/completion_kit/metric_improvement_validator.rb +101 -0
  10. data/app/services/completion_kit/metric_variant_generator.rb +0 -36
  11. data/app/views/completion_kit/calibrations/_trust_panel.html.erb +8 -12
  12. data/app/views/completion_kit/metrics/_guiding_examples.html.erb +23 -0
  13. data/app/views/completion_kit/metrics/_suggestion_failed.html.erb +3 -0
  14. data/app/views/completion_kit/metrics/_suggestion_pending.html.erb +3 -0
  15. data/app/views/completion_kit/metrics/_suggestion_ready.html.erb +4 -0
  16. data/app/views/completion_kit/metrics/_validation_scoreboard.html.erb +12 -0
  17. data/app/views/completion_kit/metrics/index.html.erb +5 -0
  18. data/app/views/completion_kit/metrics/show.html.erb +38 -30
  19. data/app/views/completion_kit/responses/show.html.erb +7 -5
  20. data/config/routes.rb +1 -0
  21. data/db/migrate/20260530000001_add_excluded_from_examples_to_completion_kit_calibrations.rb +5 -0
  22. data/db/migrate/20260531000001_add_validation_summary_to_completion_kit_metric_versions.rb +5 -0
  23. data/lib/completion_kit/version.rb +1 -1
  24. data/lib/completion_kit.rb +2 -0
  25. metadata +11 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ddf80d4e74705494435d5ae2d9f0ed5ce0dd927f32bffcb1a13819076f94bced
4
- data.tar.gz: 74eadf6abc0f173d0047c961502aeaaab9b5b3de7dd155a00d5d054fb5b8f6e6
3
+ metadata.gz: 0b32ec77fb60d07f40e4b83827c2510aaeb695c96c9c6df86e4b42a7ec57516b
4
+ data.tar.gz: ade912039e4942c87d73c13443bd405533eec2988478e02cd1ccb87550de2783
5
5
  SHA512:
6
- metadata.gz: fa0c962d8282310584ff52a849eeb7efc3c66debe9246d8231e5e24e55c45e8566b4edf83a19d3a021dc4a41b6241c042e0af3059c1b47ba709412220628ed96
7
- data.tar.gz: 20cdeabe363e212a572cbe6b1f08128a7aae88f5a2ef50a3b3d012e5fbef2a64c93571bc6ebb28e971dc819ac57d060a11a3006f1b7a60ebeb265957be221eab
6
+ metadata.gz: bb8664ea804d59e3761ab385d1af98ecf7d110dd7e68e7003e1a4b2c059c5e377a5e42d350a46310f58c6d8c41c0e31a6aa0cdaf8a6b50b4d9f419e6fa60e474
7
+ data.tar.gz: 3bbe72cf7e99a4ae899765829ee8bee83703885ebdfa5b6b9f7253f25f2373b1dd22aadfeb071f4b340c1b7d33dfc4e590e4f8a2df2239afbbc305de009af2cf
@@ -686,16 +686,6 @@ tr:hover .ck-chip--publish {
686
686
  justify-content: space-between;
687
687
  gap: 10px;
688
688
  }
689
- .ck-version-state {
690
- font-family: var(--ck-mono);
691
- font-size: 0.66rem;
692
- letter-spacing: 0.07em;
693
- text-transform: uppercase;
694
- color: var(--ck-dim);
695
- }
696
- .ck-version-state--live {
697
- color: var(--ck-text);
698
- }
699
689
 
700
690
  .ck-chip--soft {
701
691
  background: var(--ck-accent-soft);
@@ -2877,10 +2867,6 @@ select.ck-input {
2877
2867
  line-height: 1.55;
2878
2868
  }
2879
2869
 
2880
- .ck-review-card--stale {
2881
- border-left: 2px solid rgba(224, 164, 88, 0.45);
2882
- }
2883
-
2884
2870
  .ck-stale-versions-banner {
2885
2871
  margin: 0 0 1rem;
2886
2872
  padding: 0.9rem 1rem;
@@ -2908,12 +2894,6 @@ select.ck-input {
2908
2894
  .ck-delta--zero { color: var(--ck-dim); }
2909
2895
 
2910
2896
  .ck-run-compare-table td { vertical-align: middle; }
2911
- .ck-review-card__stale-note {
2912
- margin: 0.4rem 0 0;
2913
- font-family: var(--ck-mono);
2914
- font-size: 0.78rem;
2915
- color: var(--ck-warning);
2916
- }
2917
2897
 
2918
2898
  @media (max-width: 900px) {
2919
2899
  .ck-grid--sidebar,
@@ -3617,9 +3597,10 @@ select.ck-input {
3617
3597
  }
3618
3598
 
3619
3599
  .ck-metrics-table th:nth-child(1), .ck-metrics-table td:nth-child(1) { width: 18rem; white-space: normal; }
3620
- .ck-metrics-table th:nth-child(2), .ck-metrics-table td:nth-child(2) { width: auto; }
3621
- .ck-metrics-table th:nth-child(3), .ck-metrics-table td:nth-child(3) { width: 16rem; }
3622
- .ck-metrics-table th:nth-child(4), .ck-metrics-table td:nth-child(4) { width: 3rem; }
3600
+ .ck-metrics-table th:nth-child(2), .ck-metrics-table td:nth-child(2) { width: 6rem; }
3601
+ .ck-metrics-table th:nth-child(3), .ck-metrics-table td:nth-child(3) { width: auto; }
3602
+ .ck-metrics-table th:nth-child(4), .ck-metrics-table td:nth-child(4) { width: 16rem; }
3603
+ .ck-metrics-table th:nth-child(5), .ck-metrics-table td:nth-child(5) { width: 3rem; }
3623
3604
  .ck-metrics-table td:nth-child(1) strong { overflow-wrap: anywhere; }
3624
3605
 
3625
3606
  .ck-datasets-table th:nth-child(1), .ck-datasets-table td:nth-child(1) { width: auto; }
@@ -3638,32 +3619,10 @@ select.ck-input {
3638
3619
  .ck-prompt-versions-table th:nth-child(3), .ck-prompt-versions-table td:nth-child(3) { width: 8rem; white-space: nowrap; }
3639
3620
  .ck-prompt-versions-table th:nth-child(4), .ck-prompt-versions-table td:nth-child(4) { width: auto; }
3640
3621
 
3641
- .ck-metric-versions-table th:nth-child(1), .ck-metric-versions-table td:nth-child(1) { width: 14rem; }
3642
- .ck-metric-versions-table th:nth-child(2), .ck-metric-versions-table td:nth-child(2) { width: auto; }
3643
- .ck-metric-versions-table th:nth-child(3), .ck-metric-versions-table td:nth-child(3) { width: 9rem; white-space: nowrap; }
3644
- .ck-metric-versions-table th:nth-child(4), .ck-metric-versions-table td:nth-child(4) { width: 9rem; white-space: nowrap; }
3622
+ .ck-metric-versions-table th:nth-child(1), .ck-metric-versions-table td:nth-child(1) { width: 18rem; }
3623
+ .ck-metric-versions-table th:nth-child(2), .ck-metric-versions-table td:nth-child(2) { width: 16rem; white-space: nowrap; }
3624
+ .ck-metric-versions-table th:nth-child(3), .ck-metric-versions-table td:nth-child(3) { width: auto; white-space: nowrap; }
3645
3625
 
3646
- .ck-change-link {
3647
- background: none;
3648
- border: 0;
3649
- padding: 0;
3650
- cursor: pointer;
3651
- font-family: inherit;
3652
- font-size: 0.86rem;
3653
- text-align: left;
3654
- color: var(--ck-text);
3655
- }
3656
- .ck-change-link:hover,
3657
- .ck-change-link:focus-visible {
3658
- color: var(--ck-accent);
3659
- text-decoration: underline;
3660
- }
3661
- .ck-change-link--trivial {
3662
- color: var(--ck-dim);
3663
- }
3664
- .ck-change-link--major {
3665
- color: rgb(217, 119, 6);
3666
- }
3667
3626
 
3668
3627
  .ck-source-chip {
3669
3628
  display: inline-block;
@@ -5632,6 +5591,11 @@ a.tag-mark {
5632
5591
  .ck-trust-line__hint {
5633
5592
  color: var(--ck-dim);
5634
5593
  }
5594
+ .ck-trust-line__aside {
5595
+ margin: 4px 0 0;
5596
+ font-size: 0.78rem;
5597
+ color: var(--ck-muted);
5598
+ }
5635
5599
  .ck-cal-stat {
5636
5600
  display: inline-flex;
5637
5601
  align-items: baseline;
@@ -5945,3 +5909,158 @@ a.tag-mark {
5945
5909
  .ck-starter-actions .ck-button {
5946
5910
  line-height: 1;
5947
5911
  }
5912
+
5913
+ .ck-guiding {
5914
+ margin-top: 14px;
5915
+ padding-top: 12px;
5916
+ border-top: 1px solid var(--ck-line);
5917
+ }
5918
+
5919
+ .ck-guiding__head {
5920
+ display: flex;
5921
+ align-items: baseline;
5922
+ justify-content: space-between;
5923
+ gap: 12px;
5924
+ }
5925
+
5926
+ .ck-guiding__head .ck-kicker--inset {
5927
+ margin-top: 0;
5928
+ }
5929
+
5930
+ .ck-guiding__legend {
5931
+ font-family: var(--ck-mono);
5932
+ font-size: 0.64rem;
5933
+ letter-spacing: 0.09em;
5934
+ text-transform: uppercase;
5935
+ color: var(--ck-muted);
5936
+ }
5937
+
5938
+ .ck-guiding__list {
5939
+ list-style: none;
5940
+ margin: 8px -8px 0;
5941
+ padding: 0;
5942
+ display: flex;
5943
+ flex-direction: column;
5944
+ }
5945
+
5946
+ .ck-guiding__item {
5947
+ display: flex;
5948
+ align-items: center;
5949
+ gap: 12px;
5950
+ padding: 5px 8px;
5951
+ border-radius: 7px;
5952
+ transition: background 0.15s;
5953
+ }
5954
+
5955
+ .ck-guiding__item:hover {
5956
+ background: var(--ck-surface-hover);
5957
+ }
5958
+
5959
+ .ck-guiding__item:hover .ck-guiding__output {
5960
+ color: var(--ck-text);
5961
+ }
5962
+
5963
+ .ck-guiding__link {
5964
+ flex: 1;
5965
+ min-width: 0;
5966
+ display: flex;
5967
+ align-items: center;
5968
+ gap: 12px;
5969
+ text-decoration: none;
5970
+ color: inherit;
5971
+ }
5972
+
5973
+ .ck-guiding__output {
5974
+ flex: 1;
5975
+ min-width: 0;
5976
+ overflow: hidden;
5977
+ text-overflow: ellipsis;
5978
+ white-space: nowrap;
5979
+ color: var(--ck-dim);
5980
+ font-size: 0.86rem;
5981
+ }
5982
+
5983
+ .ck-guiding__scores {
5984
+ font-family: var(--ck-mono);
5985
+ font-size: 0.78rem;
5986
+ color: var(--ck-text);
5987
+ white-space: nowrap;
5988
+ }
5989
+
5990
+ .ck-guiding__judge {
5991
+ color: var(--ck-dim);
5992
+ }
5993
+
5994
+ .ck-guiding__human {
5995
+ color: var(--ck-text);
5996
+ font-weight: 600;
5997
+ }
5998
+
5999
+ .ck-guiding__item .ck-icon-btn {
6000
+ width: 2rem;
6001
+ height: 2rem;
6002
+ }
6003
+
6004
+ .ck-suggestion-status:empty { display: none; }
6005
+ .ck-suggestion-status {
6006
+ margin-top: 10px;
6007
+ display: flex;
6008
+ align-items: baseline;
6009
+ gap: 10px;
6010
+ flex-wrap: wrap;
6011
+ }
6012
+
6013
+ .ck-scoreboard {
6014
+ margin-bottom: 16px;
6015
+ padding-bottom: 14px;
6016
+ border-bottom: 1px solid var(--ck-line);
6017
+ }
6018
+ .ck-scoreboard__headline {
6019
+ margin: 0 0 8px;
6020
+ font-size: 0.95rem;
6021
+ color: var(--ck-text);
6022
+ }
6023
+ .ck-scoreboard__was {
6024
+ font-family: var(--ck-mono);
6025
+ font-size: 0.74rem;
6026
+ color: var(--ck-muted);
6027
+ margin-left: 6px;
6028
+ }
6029
+ .ck-scoreboard__tally {
6030
+ list-style: none;
6031
+ margin: 0;
6032
+ padding: 0;
6033
+ display: flex;
6034
+ gap: 18px;
6035
+ }
6036
+ .ck-scoreboard__stat {
6037
+ font-family: var(--ck-mono);
6038
+ font-size: 0.72rem;
6039
+ letter-spacing: 0.06em;
6040
+ text-transform: uppercase;
6041
+ color: var(--ck-muted);
6042
+ }
6043
+ .ck-scoreboard__stat strong { color: var(--ck-text); }
6044
+ .ck-scoreboard__stat--break strong { color: var(--ck-warning); }
6045
+ .ck-scoreboard__note {
6046
+ margin: 8px 0 0;
6047
+ font-size: 0.78rem;
6048
+ color: var(--ck-muted);
6049
+ }
6050
+ .ck-version-change {
6051
+ display: inline-flex;
6052
+ align-items: baseline;
6053
+ gap: 0.6rem;
6054
+ }
6055
+ .ck-version-score {
6056
+ font-family: var(--ck-mono);
6057
+ font-size: 0.74rem;
6058
+ color: var(--ck-dim);
6059
+ }
6060
+ .ck-version-score__label {
6061
+ font-size: 0.6rem;
6062
+ letter-spacing: 0.08em;
6063
+ text-transform: uppercase;
6064
+ color: var(--ck-muted);
6065
+ margin-right: 0.2rem;
6066
+ }
@@ -1,11 +1,13 @@
1
1
  module CompletionKit
2
2
  class MetricsController < ApplicationController
3
3
  include CompletionKit::TagFiltering
4
- before_action :set_metric, only: [:show, :edit, :update, :destroy, :publish_draft, :suggest_variants, :dismiss_suggestion]
4
+ before_action :set_metric, only: [:show, :edit, :update, :destroy, :publish_draft, :suggest_variants, :dismiss_suggestion, :exclude_example]
5
+ before_action :ensure_examples_from_reviews_enabled, only: [:exclude_example]
5
6
 
6
7
  def index
7
8
  @metrics = apply_tag_filter(Metric.includes(:metric_groups, :tags).order(:name))
8
9
  @available_starters = StarterMetrics.available
10
+ @current_versions = MetricVersion.published.current.where(metric_id: @metrics.map(&:id)).index_by(&:metric_id)
9
11
  end
10
12
 
11
13
  def starter_preview
@@ -39,6 +41,7 @@ module CompletionKit
39
41
  @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
40
42
  @improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
41
43
  @versions = MetricVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
44
+ @guiding_examples = CompletionKit.config.judge_examples_from_reviews ? MetricCalibrationExamples.judge_examples_for(@metric) : []
42
45
  end
43
46
 
44
47
  def new
@@ -114,26 +117,22 @@ module CompletionKit
114
117
 
115
118
  def suggest_variants
116
119
  target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
117
- disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
118
- if disagreement_count.zero?
120
+ counts = Calibration.where(metric_id: @metric.id, verdict: %w[agree disagree]).group(:verdict).count
121
+ if counts["disagree"].to_i.zero?
119
122
  redirect_to target, alert: "Mark at least one case as Disagree before asking the model to suggest a change."
120
123
  return
121
124
  end
122
125
 
123
- MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
126
+ MetricSuggestionJob.perform_later(@metric.id)
124
127
 
125
- generator = MetricVariantGenerator.new(@metric, count: 1)
126
- variants = generator.call
127
- if variants.empty?
128
- redirect_to target, alert: "The model returned no usable variants. Try again with a different model."
129
- return
130
- end
131
- versions = generator.persist!(variants)
132
- new_version = versions.max_by(&:version_number)
133
128
  if params[:back_to] == "edit"
134
- redirect_to edit_metric_path(@metric), notice: "Drafted #{new_version.version_label} from your reviews. Review the proposed changes below, then Publish to use it."
129
+ redirect_to metric_path(@metric), notice: "Drafting a change from your reviews. It will appear here once it's tested."
135
130
  else
136
- redirect_to metric_path(@metric, show_change: new_version.id), notice: "Drafted #{new_version.version_label} from your reviews."
131
+ render turbo_stream: turbo_stream.replace(
132
+ "ck-suggestion-status-#{@metric.id}",
133
+ partial: "completion_kit/metrics/suggestion_pending",
134
+ locals: { metric: @metric, count: counts.values.sum }
135
+ )
137
136
  end
138
137
  end
139
138
 
@@ -145,6 +144,16 @@ module CompletionKit
145
144
  redirect_to target, notice: label ? "Discarded draft #{label}." : "Draft already gone."
146
145
  end
147
146
 
147
+ def exclude_example
148
+ calibration = Calibration.where(metric_id: @metric.id).find(params[:calibration_id])
149
+ calibration.update!(excluded_from_examples: true)
150
+ render turbo_stream: turbo_stream.replace(
151
+ "ck-guiding-#{@metric.id}",
152
+ partial: "completion_kit/metrics/guiding_examples",
153
+ locals: { metric: @metric, examples: MetricCalibrationExamples.judge_examples_for(@metric) }
154
+ )
155
+ end
156
+
148
157
  def publish_draft
149
158
  scope = MetricVersion.where(metric_id: @metric.id)
150
159
  version = if params[:draft_id].present?
@@ -176,6 +185,10 @@ module CompletionKit
176
185
 
177
186
  private
178
187
 
188
+ def ensure_examples_from_reviews_enabled
189
+ head :not_found unless CompletionKit.config.judge_examples_from_reviews
190
+ end
191
+
179
192
  def set_metric
180
193
  @metric = Metric.find(params[:id])
181
194
  end
@@ -58,7 +58,8 @@ module CompletionKit
58
58
  run.prompt&.template,
59
59
  criteria: metric.instruction.to_s,
60
60
  rubric_text: metric.display_rubric_text,
61
- input_data: response.input_data
61
+ input_data: response.input_data,
62
+ human_examples: review_examples_for(metric, response)
62
63
  )
63
64
 
64
65
  review = response.reviews.find_or_initialize_by(metric_id: metric.id)
@@ -80,9 +81,13 @@ module CompletionKit
80
81
 
81
82
  private
82
83
 
83
- # A model with supports_judging == nil ("untested") just produced a valid
84
- # review promote it to confirmed. No-op once confirmed (so repeated runs
85
- # don't churn the row), and a model already flagged as a bad judge stays so.
84
+ def review_examples_for(metric, response)
85
+ return nil unless CompletionKit.config.judge_calibration_enabled
86
+ return nil unless CompletionKit.config.judge_examples_from_reviews
87
+
88
+ MetricCalibrationExamples.judge_examples_for(metric, exclude_response_id: response.id)
89
+ end
90
+
86
91
  def confirm_judging_capability(judge_model_id)
87
92
  model = Model.find_by(provider: ApiConfig.provider_for_model(judge_model_id), model_id: judge_model_id)
88
93
  return unless model && model.supports_judging.nil?
@@ -0,0 +1,46 @@
1
+ require "faraday"
2
+
3
+ module CompletionKit
4
+ class MetricSuggestionJob < ApplicationJob
5
+ queue_as :llm
6
+
7
+ retry_on Faraday::TimeoutError, Faraday::ConnectionFailed, wait: :polynomially_longer, attempts: 5
8
+ retry_on CompletionKit::RateLimitError, wait: :polynomially_longer, attempts: 5
9
+
10
+ rescue_from(StandardError) do |error|
11
+ Rails.error.report(error, handled: true, context: { job: self.class.name })
12
+ broadcast_status(@metric, partial: "completion_kit/metrics/suggestion_failed", locals: { metric: @metric })
13
+ end
14
+
15
+ def perform(metric_id)
16
+ @metric = Metric.find_by(id: metric_id)
17
+ return unless @metric
18
+
19
+ MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
20
+
21
+ generator = MetricVariantGenerator.new(@metric, count: 1)
22
+ variants = generator.call
23
+ if variants.empty?
24
+ broadcast_status(@metric, partial: "completion_kit/metrics/suggestion_failed", locals: { metric: @metric })
25
+ return
26
+ end
27
+
28
+ draft = generator.persist!(variants).max_by(&:version_number)
29
+ summary = MetricImprovementValidator.new(@metric, draft).call
30
+ draft.update!(validation_summary: summary)
31
+
32
+ broadcast_status(@metric, partial: "completion_kit/metrics/suggestion_ready", locals: { metric: @metric, draft: draft })
33
+ end
34
+
35
+ private
36
+
37
+ def broadcast_status(metric, partial:, locals:)
38
+ html = CompletionKit::ApplicationController.render(partial: partial, locals: locals)
39
+ Turbo::StreamsChannel.broadcast_replace_to(
40
+ "metric_#{metric.id}_suggestion",
41
+ target: "ck-suggestion-status-#{metric.id}",
42
+ html: html
43
+ )
44
+ end
45
+ end
46
+ end
@@ -6,6 +6,7 @@ module CompletionKit
6
6
  has_many :calibrations, dependent: :destroy
7
7
 
8
8
  serialize :rubric_bands, coder: JSON
9
+ serialize :validation_summary, coder: JSON
9
10
 
10
11
  before_validation :assign_version_number, on: :create
11
12
 
@@ -10,13 +10,14 @@ module CompletionKit
10
10
  @judge_client = LlmClient.for_model(@judge_model, ApiConfig.for_model(@judge_model))
11
11
  end
12
12
 
13
- def evaluate(output, expected_output = nil, prompt = nil, criteria: nil, rubric_text: nil, input_data: nil, **_extras)
13
+ def evaluate(output, expected_output = nil, prompt = nil, criteria: nil, rubric_text: nil, input_data: nil, human_examples: nil, **_extras)
14
14
  raise CompletionKit::ConfigurationError, "Judge not configured" unless @judge_client.configured?
15
15
 
16
16
  judge_prompt = build_judge_prompt(output, expected_output, prompt,
17
17
  criteria: criteria,
18
18
  rubric_text: rubric_text,
19
- input_data: input_data)
19
+ input_data: input_data,
20
+ human_examples: human_examples)
20
21
 
21
22
  response = @judge_client.generate_completion(judge_prompt, model: @judge_model)
22
23
  raise StandardError, response if response.start_with?("Error:")
@@ -25,7 +26,7 @@ module CompletionKit
25
26
 
26
27
  private
27
28
 
28
- def build_judge_prompt(output, expected_output, prompt, criteria: nil, rubric_text: nil, input_data: nil)
29
+ def build_judge_prompt(output, expected_output, prompt, criteria: nil, rubric_text: nil, input_data: nil, human_examples: nil)
29
30
  judge_prompt = <<~PROMPT
30
31
  You are an expert evaluator. You MUST respond with ONLY two lines in this exact format, nothing else:
31
32
 
@@ -42,6 +43,8 @@ module CompletionKit
42
43
  judge_prompt += "\nCriteria: #{criteria}\n"
43
44
  end
44
45
 
46
+ judge_prompt += human_examples_block(human_examples)
47
+
45
48
  judge_prompt += <<~PROMPT
46
49
 
47
50
  Original prompt: #{prompt || "Not provided"}
@@ -53,6 +56,19 @@ module CompletionKit
53
56
  judge_prompt
54
57
  end
55
58
 
59
+ def human_examples_block(examples)
60
+ return "" if examples.blank?
61
+
62
+ lines = ["", "Reviewed examples where a human corrected the judge on this metric. Weigh them when scoring:"]
63
+ examples.each_with_index do |example, index|
64
+ note = example[:human_note].to_s
65
+ line = "Example #{index + 1}: Output: #{example[:output].to_s.truncate(200)}. The judge scored this #{example[:judge_score].to_i}/5. A reviewer corrected it to #{example[:human_score].to_i}/5"
66
+ line += note.present? ? ": #{note.truncate(160)}" : "."
67
+ lines << line
68
+ end
69
+ lines.join("\n") + "\n"
70
+ end
71
+
56
72
  def parse_judge_response(response)
57
73
  score_match = response.match(/\*{0,2}Score:?\*{0,2}\s*(\d+(?:\.\d+)?)/i)
58
74
  feedback_match = response.match(/\*{0,2}Feedback:?\*{0,2}\s*(.+)/mi)
@@ -0,0 +1,56 @@
1
+ module CompletionKit
2
+ module MetricCalibrationExamples
3
+ DEFAULT_JUDGE_EXAMPLE_LIMIT = 5
4
+
5
+ module_function
6
+
7
+ def for(metric, limit: 8)
8
+ disagreements_for(metric, limit: limit)
9
+ end
10
+
11
+ def disagreements_for(metric, limit: 8)
12
+ calibrations_for(metric, verdict: "disagree", limit: limit)
13
+ end
14
+
15
+ def borderlines_for(metric, limit: 6)
16
+ calibrations_for(metric, verdict: "borderline", limit: limit)
17
+ end
18
+
19
+ def judge_examples_for(metric, exclude_response_id: nil, limit: DEFAULT_JUDGE_EXAMPLE_LIMIT)
20
+ current_version = MetricVersion.current.find_by(metric_id: metric.id)
21
+ return [] unless current_version
22
+
23
+ relation = Calibration
24
+ .where(metric_id: metric.id, metric_version_id: current_version.id, excluded_from_examples: false)
25
+ .where.not(corrected_score: nil)
26
+ relation = relation.where.not(response_id: exclude_response_id) if exclude_response_id
27
+ map_examples(relation.includes(response: :reviews).order(created_at: :desc).limit(limit), metric)
28
+ .reject { |example| example[:judge_score].nil? }
29
+ end
30
+
31
+ def calibrations_for(metric, verdict:, limit:)
32
+ base = Calibration.where(metric_id: metric.id, verdict: verdict)
33
+ current_version = MetricVersion.current.find_by(metric_id: metric.id)
34
+ scoped = current_version ? base.where(metric_version_id: current_version.id) : base
35
+ effective = scoped.exists? ? scoped : base
36
+ map_examples(effective.includes(response: :reviews).order(created_at: :desc).limit(limit), metric)
37
+ end
38
+
39
+ def map_examples(relation, metric)
40
+ relation.map do |cal|
41
+ review = cal.response.reviews.find { |r| r.metric_id == metric.id }
42
+ {
43
+ id: cal.id,
44
+ run_id: cal.run_id,
45
+ response_id: cal.response_id,
46
+ input: cal.response.input_data,
47
+ output: cal.response.response_text,
48
+ judge_score: review&.ai_score,
49
+ judge_feedback: review&.ai_feedback,
50
+ human_score: cal.corrected_score,
51
+ human_note: cal.note
52
+ }
53
+ end
54
+ end
55
+ end
56
+ end
@@ -0,0 +1,101 @@
1
+ module CompletionKit
2
+ class MetricImprovementValidator
3
+ ANSWER_KEY_LIMIT = 30
4
+
5
+ def initialize(metric, candidate, scorer: nil)
6
+ @metric = metric
7
+ @candidate = candidate
8
+ @scorer = scorer || method(:rescore)
9
+ end
10
+
11
+ def call
12
+ key = answer_key
13
+ rows = []
14
+ key.each do |entry|
15
+ begin
16
+ score = @scorer.call(entry[:response], @candidate)
17
+ rescue StandardError
18
+ next
19
+ end
20
+ rows << classify(entry, score.to_i)
21
+ end
22
+ summarize(rows, key.size, key_capped?)
23
+ end
24
+
25
+ private
26
+
27
+ def answer_key
28
+ current = MetricVersion.current.find_by(metric_id: @metric.id)
29
+ return [] unless current
30
+
31
+ base = Calibration.where(metric_id: @metric.id, metric_version_id: current.id, verdict: %w[agree disagree])
32
+ @key_size_before_cap = base.count
33
+ base.includes(response: :reviews)
34
+ .order(created_at: :desc)
35
+ .limit(ANSWER_KEY_LIMIT)
36
+ .filter_map do |cal|
37
+ response = cal.response
38
+ next unless response.response_text.present?
39
+ review = response.reviews.find { |r| r.metric_id == @metric.id }
40
+ position = cal.verdict == "disagree" ? cal.corrected_score : review&.ai_score
41
+ next if position.nil?
42
+ { response: response, verdict: cal.verdict, position: position }
43
+ end
44
+ end
45
+
46
+ def key_capped?
47
+ @key_size_before_cap.to_i > ANSWER_KEY_LIMIT
48
+ end
49
+
50
+ def classify(entry, candidate_score)
51
+ matched = candidate_score == entry[:position].to_i
52
+ outcome = if entry[:verdict] == "disagree"
53
+ matched ? "fix" : "still_off"
54
+ else
55
+ matched ? "keep" : "break"
56
+ end
57
+ {
58
+ "response_id" => entry[:response].id,
59
+ "verdict" => entry[:verdict],
60
+ "position" => entry[:position].to_i,
61
+ "candidate_score" => candidate_score,
62
+ "outcome" => outcome
63
+ }
64
+ end
65
+
66
+ def summarize(rows, total, capped)
67
+ fixes = rows.count { |r| r["outcome"] == "fix" }
68
+ keeps = rows.count { |r| r["outcome"] == "keep" }
69
+ breaks = rows.count { |r| r["outcome"] == "break" }
70
+ still_off = rows.count { |r| r["outcome"] == "still_off" }
71
+ agreements = rows.count { |r| r["verdict"] == "agree" }
72
+ {
73
+ "total" => total,
74
+ "tested" => rows.size,
75
+ "capped" => capped,
76
+ "fixes" => fixes,
77
+ "keeps" => keeps,
78
+ "breaks" => breaks,
79
+ "still_off" => still_off,
80
+ "before" => agreements,
81
+ "after" => fixes + keeps,
82
+ "rows" => rows
83
+ }
84
+ end
85
+
86
+ def rescore(response, candidate)
87
+ run = response.run
88
+ config = ApiConfig.for_model(run.judge_model).merge(judge_model: run.judge_model)
89
+ rubric_text = Metric.rubric_text_for(Metric.normalize_rubric_bands(candidate.rubric_bands))
90
+ result = JudgeService.new(config).evaluate(
91
+ response.response_text,
92
+ response.expected_output,
93
+ run.prompt&.template,
94
+ criteria: candidate.instruction.to_s,
95
+ rubric_text: rubric_text,
96
+ input_data: response.input_data
97
+ )
98
+ result[:score]
99
+ end
100
+ end
101
+ end
@@ -117,40 +117,4 @@ module CompletionKit
117
117
  end
118
118
  end
119
119
 
120
- module MetricCalibrationExamples
121
- module_function
122
-
123
- def for(metric, limit: 8)
124
- disagreements_for(metric, limit: limit)
125
- end
126
-
127
- def disagreements_for(metric, limit: 8)
128
- calibrations_for(metric, verdict: "disagree", limit: limit)
129
- end
130
-
131
- def borderlines_for(metric, limit: 6)
132
- calibrations_for(metric, verdict: "borderline", limit: limit)
133
- end
134
-
135
- def calibrations_for(metric, verdict:, limit:)
136
- base = Calibration.where(metric_id: metric.id, verdict: verdict)
137
- current_version = MetricVersion.current.find_by(metric_id: metric.id)
138
- scoped = current_version ? base.where(metric_version_id: current_version.id) : base
139
- effective = scoped.exists? ? scoped : base
140
- effective.includes(response: :reviews)
141
- .order(created_at: :desc)
142
- .limit(limit)
143
- .map do |cal|
144
- review = cal.response.reviews.find { |r| r.metric_id == metric.id }
145
- {
146
- input: cal.response.input_data,
147
- output: cal.response.response_text,
148
- judge_score: review&.ai_score,
149
- judge_feedback: review&.ai_feedback,
150
- human_score: cal.corrected_score,
151
- human_note: cal.note
152
- }
153
- end
154
- end
155
- end
156
120
  end
@@ -2,15 +2,11 @@
2
2
  <% metric = local_assigns[:metric] %>
3
3
  <% anchor = metric&.name&.parameterize %>
4
4
  <% current_metric_version = metric && CompletionKit::MetricVersion.current.find_by(metric_id: metric.id) %>
5
- <% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric
5
+ <% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric && current_metric_version
6
6
  created_by = CompletionKit.config.username.presence || "operator"
7
- verdicted_ids = if current_metric_version
8
- CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
9
- else
10
- []
11
- end
7
+ verdicted_ids = CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
12
8
  CompletionKit::Response.joins(:reviews)
13
- .where(reviews: { metric_id: metric.id })
9
+ .where(reviews: { metric_id: metric.id, metric_version_id: current_metric_version.id })
14
10
  .where.not(reviews: { ai_score: nil })
15
11
  .where.not(id: verdicted_ids)
16
12
  .order(created_at: :desc).first
@@ -24,7 +20,7 @@
24
20
  <p class="ck-trust-line ck-trust-line--<%= stats.gate %>">
25
21
  <% if stats.sample_size.zero? %>
26
22
  <span class="ck-trust-line__lead">Not measured yet.</span>
27
- <span class="ck-trust-line__hint">Needs <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> human reviews of the judge's scores.<% if prior_version_verdicts > 0 %> (<%= pluralize(prior_version_verdicts, "earlier-version review") %> kept on file.)<% end %></span>
23
+ <span class="ck-trust-line__hint"><%= current_metric_version ? "#{current_metric_version.version_label} needs" : "Needs" %> <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> human reviews of the judge's scores.</span>
28
24
  <% if target_response %>
29
25
  <%= link_to "Review a judge's score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
30
26
  <% end %>
@@ -35,13 +31,13 @@
35
31
  <%= link_to "Review another score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
36
32
  <% end %>
37
33
  <% else %>
38
- <span class="ck-cal-stat"><span class="ck-cal-stat__label">Agreement</span> <strong class="ck-trust-line__figure">~<%= (stats.agreement_point * 100).round %>%</strong></span>
39
- <span class="ck-cal-stat"><span class="ck-cal-stat__label">Margin</span> ±<%= (stats.margin * 100).round %> pt</span>
40
- <span class="ck-cal-stat"><span class="ck-cal-stat__label">Read</span> <%= stats.firm? ? "settled" : "early" %></span>
41
- <span class="ck-cal-stat"><span class="ck-cal-stat__label">Sample</span> <%= stats.sample_size %></span>
34
+ <span class="ck-cal-stat"><span class="ck-cal-stat__label">Agrees with you</span> <strong class="ck-trust-line__figure">~<%= (stats.agreement_point * 100).round %>%</strong> of <%= stats.sample_size %> reviews</span>
42
35
  <% if stats.borderline_rate && stats.borderline_rate > 0 %>
43
36
  <% level = stats.borderline_rate > 0.30 ? "danger" : stats.borderline_rate > 0.15 ? "warning" : "ok" %>
44
37
  <span class="ck-cal-stat"><span class="ck-cal-stat__label">Unclear</span> <span class="ck-trust-line__borderline ck-trust-line__borderline--<%= level %>"><%= (stats.borderline_rate * 100).round %>%</span></span>
45
38
  <% end %>
46
39
  <% end %>
47
40
  </p>
41
+ <% if stats.sample_size.zero? && prior_version_verdicts > 0 %>
42
+ <p class="ck-trust-line__aside"><%= pluralize(prior_version_verdicts, "review") %> from an earlier version <%= prior_version_verdicts == 1 ? "doesn't" : "don't" %> count toward this version.</p>
43
+ <% end %>
@@ -0,0 +1,23 @@
1
+ <div id="ck-guiding-<%= metric.id %>" class="ck-guiding">
2
+ <% if examples.any? %>
3
+ <div class="ck-guiding__head">
4
+ <p class="ck-kicker ck-kicker--inset">Guiding the judge</p>
5
+ <span class="ck-guiding__legend">Judge &rarr; Human</span>
6
+ </div>
7
+ <ul class="ck-guiding__list">
8
+ <% examples.each do |example| %>
9
+ <li class="ck-guiding__item">
10
+ <%= link_to run_response_path(example[:run_id], example[:response_id], anchor: metric.name.parameterize),
11
+ class: "ck-guiding__link", title: "Open this review" do %>
12
+ <span class="ck-guiding__output"><%= truncate(example[:output].to_s, length: 90) %></span>
13
+ <span class="ck-guiding__scores"><span class="ck-guiding__judge"><%= example[:judge_score].to_i %></span> &rarr; <span class="ck-guiding__human"><%= example[:human_score].to_i %></span></span>
14
+ <% end %>
15
+ <%= button_to exclude_example_metric_path(metric, calibration_id: example[:id]),
16
+ method: :post, form_class: "inline-block", class: "ck-icon-btn",
17
+ title: "Stop using this case", "aria-label": "Stop using this case",
18
+ data: { turbo_confirm: "Stop using this corrected case to guide the judge?" } do %><%= heroicon_tag "x-mark", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
19
+ </li>
20
+ <% end %>
21
+ </ul>
22
+ <% end %>
23
+ </div>
@@ -0,0 +1,3 @@
1
+ <div id="ck-suggestion-status-<%= metric.id %>" class="ck-suggestion-status">
2
+ <span class="ck-cal-foot__note">The model returned no usable change. Try again, or review a few more scores first.</span>
3
+ </div>
@@ -0,0 +1,3 @@
1
+ <div id="ck-suggestion-status-<%= metric.id %>" class="ck-suggestion-status ck-suggestion-status--pending">
2
+ <span class="ck-cal-foot__note">Drafting a change and testing it against your <%= pluralize(count, "review") %>…</span>
3
+ </div>
@@ -0,0 +1,4 @@
1
+ <div id="ck-suggestion-status-<%= metric.id %>" class="ck-suggestion-status ck-suggestion-status--ready">
2
+ <span class="ck-cal-foot__note">Drafted <%= draft.version_label %> and tested it against your reviews.</span>
3
+ <%= link_to "Compare and publish →", CompletionKit::Engine.routes.url_helpers.metric_path(metric, show_change: draft.id), class: "ck-cal-link" %>
4
+ </div>
@@ -0,0 +1,12 @@
1
+ <% s = summary %>
2
+ <div class="ck-scoreboard">
3
+ <p class="ck-scoreboard__headline">Matches you on <strong><%= s["after"] %> of <%= s["tested"] %></strong> of your reviews <span class="ck-scoreboard__was">was <%= s["before"] %> of <%= s["tested"] %></span></p>
4
+ <ul class="ck-scoreboard__tally">
5
+ <li class="ck-scoreboard__stat ck-scoreboard__stat--fix">Fixes <strong><%= s["fixes"] %></strong></li>
6
+ <li class="ck-scoreboard__stat ck-scoreboard__stat--keep">Keeps <strong><%= s["keeps"] %></strong></li>
7
+ <li class="ck-scoreboard__stat ck-scoreboard__stat--break">Breaks <strong><%= s["breaks"] %></strong></li>
8
+ </ul>
9
+ <% if s["capped"] %>
10
+ <p class="ck-scoreboard__note">Tested against your 30 most recent reviews.</p>
11
+ <% end %>
12
+ </div>
@@ -18,6 +18,7 @@
18
18
  <thead>
19
19
  <tr>
20
20
  <th scope="col">Name</th>
21
+ <th scope="col">Version</th>
21
22
  <th scope="col">Instruction</th>
22
23
  <th scope="col">In groups</th>
23
24
  <th scope="col"></th>
@@ -34,6 +35,10 @@
34
35
  </div>
35
36
  <% end %>
36
37
  </td>
38
+ <td data-label="Version">
39
+ <% v = @current_versions[metric.id] %>
40
+ <span class="ck-chip ck-chip--soft"><%= v ? v.version_label : "v1" %></span>
41
+ </td>
37
42
  <td data-label="Instruction" class="ck-meta-copy"><div class="ck-clamp-2"><%= metric.instruction.presence || "—" %></div></td>
38
43
  <td data-label="In groups">
39
44
  <% groups = metric.metric_groups %>
@@ -50,7 +50,6 @@
50
50
  <thead>
51
51
  <tr>
52
52
  <th scope="col">Version</th>
53
- <th scope="col">&Delta; Change</th>
54
53
  <th scope="col">Source</th>
55
54
  <th scope="col">Created</th>
56
55
  </tr>
@@ -60,39 +59,37 @@
60
59
  <% pred = predecessor_of[v] %>
61
60
  <tr>
62
61
  <td>
62
+ <% summary = v.change_summary_against(pred) %>
63
63
  <div class="ck-version-cell">
64
64
  <div class="ck-version-cell__label">
65
65
  <strong><%= v.version_label %></strong>
66
66
  <% if v.current? %>
67
- <span class="ck-version-state ck-version-state--live">Published</span>
67
+ <span class="ck-chip">Published</span>
68
68
  <% elsif v.draft? %>
69
- <span class="ck-version-state">Draft</span>
70
69
  <%= button_to "Publish", publish_draft_metric_path(@metric, draft_id: v.id),
71
70
  method: :post, form_class: "inline-block",
72
- class: "ck-chip ck-chip--cta" %>
71
+ class: "ck-chip ck-chip--publish" %>
73
72
  <% else %>
74
- <span class="ck-version-state">Past</span>
75
73
  <%= button_to "Make current", publish_draft_metric_path(@metric, draft_id: v.id),
76
74
  method: :post, form_class: "inline-block",
77
75
  class: "ck-chip ck-chip--publish",
78
76
  data: { turbo_confirm: "Make #{v.version_label} the version to use? #{v.version_label} will be used in test runs using this metric now. Reviews you have already given stay with the version they were made against." } %>
79
77
  <% end %>
80
78
  </div>
79
+ <% vs = v.validation_summary %>
80
+ <% if summary %>
81
+ <div class="ck-version-change">
82
+ <% if v.draft? && vs.present? %>
83
+ <span class="ck-version-score"><span class="ck-version-score__label">Match</span> <%= vs["after"] %>/<%= vs["tested"] %></span>
84
+ <% end %>
85
+ <button type="button" class="ck-cell-link ck-cell-link--delta" title="What changed from <%= pred.version_label %>" onclick="document.getElementById('ck-mvdiff-<%= v.id %>').showModal()">&Delta;</button>
86
+ </div>
87
+ <% end %>
81
88
  </div>
82
89
  </td>
83
- <td>
84
- <% summary = v.change_summary_against(pred) %>
85
- <% if summary %>
86
- <button type="button" class="ck-change-link ck-change-link--<%= summary[:magnitude] %>"
87
- title="Compare with <%= pred.version_label %>"
88
- onclick="document.getElementById('ck-mvdiff-<%= v.id %>').showModal()"><%= summary[:label] %></button>
89
- <% else %>
90
- <span class="ck-meta-copy">—</span>
91
- <% end %>
92
- </td>
93
90
  <td>
94
91
  <% source_label, source_class = case v.source
95
- when "suggestion" then ["AI calibration", "ck-source-chip ck-source-chip--ai"]
92
+ when "suggestion" then ["AI suggestion", "ck-source-chip ck-source-chip--ai"]
96
93
  when "edit" then ["Manual edit", "ck-source-chip ck-source-chip--manual"]
97
94
  when "revert" then ["Reverted", "ck-source-chip ck-source-chip--revert"]
98
95
  else ["Original", "ck-source-chip ck-source-chip--initial"]
@@ -119,6 +116,7 @@
119
116
  <% @versions.each do |v| %>
120
117
  <% pred = predecessor_of[v] %>
121
118
  <% next unless v.change_summary_against(pred) %>
119
+ <% vs = v.validation_summary %>
122
120
  <dialog id="ck-mvdiff-<%= v.id %>" class="ck-modal" onclick="if(event.target===this)this.close()">
123
121
  <article class="ck-modal__panel" tabindex="-1" onclick="event.stopPropagation()">
124
122
  <header class="ck-modal__header">
@@ -129,6 +127,9 @@
129
127
  <button type="button" class="ck-modal__close" aria-label="Close" onclick="this.closest('dialog').close()">&times;</button>
130
128
  </header>
131
129
  <div class="ck-modal__body">
130
+ <% if v.draft? && vs.present? %>
131
+ <%= render "completion_kit/metrics/validation_scoreboard", summary: vs %>
132
+ <% end %>
132
133
  <% if pred.instruction.to_s != v.instruction.to_s %>
133
134
  <div class="ck-suggest-diff">
134
135
  <div class="ck-suggest-diff__pane">
@@ -161,8 +162,10 @@
161
162
  title: "Discard draft #{v.version_label}", "aria-label": "Discard draft #{v.version_label}",
162
163
  data: { turbo_confirm: "Discard draft #{v.version_label}? This can't be undone." } do %><%= heroicon_tag "trash", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
163
164
  <%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
165
+ <% net_negative = vs.present? && (vs["after"].to_i < vs["before"].to_i || vs["breaks"].to_i > vs["fixes"].to_i) %>
164
166
  <%= button_to "Publish #{v.version_label} →", publish_draft_metric_path(@metric, draft_id: v.id),
165
- method: :post, form_class: "inline-block", class: ck_button_classes(:dark) %>
167
+ method: :post, form_class: "inline-block", class: ck_button_classes(:dark),
168
+ data: net_negative ? { turbo_confirm: "This agrees with you less than the current version. Publish anyway?" } : {} %>
166
169
  </span>
167
170
  <% else %>
168
171
  <span class="ck-modal__foot-note">Roll this metric back to this version.</span>
@@ -177,20 +180,11 @@
177
180
  <% end %>
178
181
 
179
182
  <% if CompletionKit.config.judge_calibration_enabled %>
183
+ <% draft = @suggestion_draft || @edit_draft %>
180
184
  <section class="ck-card ck-card--spaced">
181
- <p class="ck-kicker">Calibration</p>
182
- <p class="ck-meta-copy">This is a measure of how often the judge's scores match a human reviewer. Review its scores to build that signal; the scores you disagree with become the cases the model learns from when you improve the metric.</p>
183
- <%= render "completion_kit/calibrations/trust_panel",
184
- stats: CompletionKit::MetricCalibrationStats.for(@metric),
185
- metric: @metric %>
186
- <% draft = @suggestion_draft || @edit_draft %>
187
- <% if draft %>
188
- <div class="ck-cal-foot">
189
- <span class="ck-cal-foot__note">A draft improvement (<%= draft.version_label %>) is waiting in the Versions table above. Open its change to compare, then Publish to use it.</span>
190
- </div>
191
- <% elsif @improve_disagreement_count.positive? %>
192
- <div class="ck-cal-foot">
193
- <span class="ck-cal-foot__note"><%= pluralize(@improve_disagreement_count, "case") %> where a reviewer's score didn't match the judge.</span>
185
+ <div class="ck-prompt-preview__header">
186
+ <p class="ck-kicker">Agreement</p>
187
+ <% if draft.nil? && @improve_disagreement_count.positive? %>
194
188
  <%= button_to suggest_variants_metric_path(@metric),
195
189
  method: :post, form_class: "inline-block",
196
190
  class: ck_button_classes(:light, variant: :outline) + " ck-button--sm",
@@ -198,6 +192,20 @@
198
192
  <%= heroicon_tag "sparkles", variant: :outline, class: "ck-magic-icon", "aria-hidden": "true" %>
199
193
  Suggest improvements
200
194
  <% end %>
195
+ <% end %>
196
+ </div>
197
+ <%= turbo_stream_from "metric_#{@metric.id}_suggestion" %>
198
+ <div id="ck-suggestion-status-<%= @metric.id %>" class="ck-suggestion-status"></div>
199
+ <p class="ck-meta-copy">How often the judge lands on the same score you would. Review its scores to build that signal, and improve the metric to raise it.</p>
200
+ <%= render "completion_kit/calibrations/trust_panel",
201
+ stats: CompletionKit::MetricCalibrationStats.for(@metric),
202
+ metric: @metric %>
203
+ <% if CompletionKit.config.judge_examples_from_reviews %>
204
+ <%= render "completion_kit/metrics/guiding_examples", metric: @metric, examples: @guiding_examples %>
205
+ <% end %>
206
+ <% if draft %>
207
+ <div class="ck-cal-foot">
208
+ <span class="ck-cal-foot__note">A draft improvement (<%= draft.version_label %>) is waiting in the Versions table above. Open its change to compare, then Publish to use it.</span>
201
209
  </div>
202
210
  <% end %>
203
211
  </section>
@@ -100,12 +100,17 @@
100
100
  <% @reviews.each do |review| %>
101
101
  <% review_version = review.metric_version %>
102
102
  <% stale = review.stale_against_current_judge? %>
103
- <div class="ck-review-card<%= " ck-review-card--stale" if stale %>" id="<%= review.metric&.name&.parameterize || "review-#{review.id}" %>">
103
+ <div class="ck-review-card" id="<%= review.metric&.name&.parameterize || "review-#{review.id}" %>">
104
104
  <div class="ck-review-card__header">
105
105
  <span class="ck-review-card__metric"><% if review.metric %><%= link_to review.metric_name, metric_path(review.metric), class: "ck-link" %><% else %><%= review.metric_name %><% end %></span>
106
106
  <div class="ck-inline">
107
107
  <% if review_version %>
108
- <span class="ck-source-chip <%= stale ? "ck-source-chip--past" : "ck-source-chip--current" %>" title="<%= stale ? "Scored against #{review_version.version_label} of this metric. The metric has been republished since." : "Scored against the metric's current version (#{review_version.version_label})." %>"><%= review_version.version_label %></span>
108
+ <% if stale %>
109
+ <% current_version = CompletionKit::MetricVersion.current.find_by(metric_id: review.metric_id) %>
110
+ <span class="ck-source-chip ck-source-chip--past" title="Scored on <%= review_version.version_label %>; the metric is now on <%= current_version.version_label %>, which may score this differently."><%= review_version.version_label %> &rarr; <%= current_version.version_label %></span>
111
+ <% else %>
112
+ <span class="ck-source-chip ck-source-chip--current" title="Scored on the metric's current version (<%= review_version.version_label %>)."><%= review_version.version_label %></span>
113
+ <% end %>
109
114
  <% end %>
110
115
  <% if review.ai_score %>
111
116
  <% 5.times do |i| %>
@@ -116,9 +121,6 @@
116
121
  <% end %>
117
122
  </div>
118
123
  </div>
119
- <% if stale %>
120
- <p class="ck-review-card__stale-note">Scored against a superseded version of this metric. Its current version may score this differently.</p>
121
- <% end %>
122
124
  <% if review.ai_feedback.present? %>
123
125
  <p class="ck-review-card__feedback"><%= review.ai_feedback %></p>
124
126
  <% end %>
data/config/routes.rb CHANGED
@@ -22,6 +22,7 @@ CompletionKit::Engine.routes.draw do
22
22
  post :publish_draft
23
23
  post :suggest_variants
24
24
  delete :dismiss_suggestion
25
+ post :exclude_example
25
26
  end
26
27
  end
27
28
  resources :metric_groups
@@ -0,0 +1,5 @@
1
+ class AddExcludedFromExamplesToCompletionKitCalibrations < ActiveRecord::Migration[8.1]
2
+ def change
3
+ add_column :completion_kit_calibrations, :excluded_from_examples, :boolean, null: false, default: false
4
+ end
5
+ end
@@ -0,0 +1,5 @@
1
+ class AddValidationSummaryToCompletionKitMetricVersions < ActiveRecord::Migration[8.1]
2
+ def change
3
+ add_column :completion_kit_metric_versions, :validation_summary, :text
4
+ end
5
+ end
@@ -1,3 +1,3 @@
1
1
  module CompletionKit
2
- VERSION = "0.9.0"
2
+ VERSION = "0.11.0"
3
3
  end
@@ -13,6 +13,7 @@ module CompletionKit
13
13
  attr_accessor :api_rate_limit, :web_rate_limit
14
14
  attr_accessor :allow_loopback_endpoints
15
15
  attr_accessor :judge_calibration_enabled
16
+ attr_accessor :judge_examples_from_reviews
16
17
 
17
18
  def initialize
18
19
  @openai_api_key = ENV['OPENAI_API_KEY']
@@ -29,6 +30,7 @@ module CompletionKit
29
30
 
30
31
  @allow_loopback_endpoints = true
31
32
  @judge_calibration_enabled = true
33
+ @judge_examples_from_reviews = false
32
34
 
33
35
  @api_reference_authentication_partial = "completion_kit/api_reference/authentication"
34
36
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: completion-kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.0
4
+ version: 0.11.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Damien Bastin
@@ -266,6 +266,7 @@ files:
266
266
  - app/jobs/completion_kit/application_job.rb
267
267
  - app/jobs/completion_kit/generate_row_job.rb
268
268
  - app/jobs/completion_kit/judge_review_job.rb
269
+ - app/jobs/completion_kit/metric_suggestion_job.rb
269
270
  - app/jobs/completion_kit/model_discovery_job.rb
270
271
  - app/jobs/completion_kit/run_completion_check_job.rb
271
272
  - app/mailers/completion_kit/application_mailer.rb
@@ -311,7 +312,9 @@ files:
311
312
  - app/services/completion_kit/mcp_tools/responses.rb
312
313
  - app/services/completion_kit/mcp_tools/runs.rb
313
314
  - app/services/completion_kit/mcp_tools/tags.rb
315
+ - app/services/completion_kit/metric_calibration_examples.rb
314
316
  - app/services/completion_kit/metric_calibration_stats.rb
317
+ - app/services/completion_kit/metric_improvement_validator.rb
315
318
  - app/services/completion_kit/metric_variant_generator.rb
316
319
  - app/services/completion_kit/model_discovery_service.rb
317
320
  - app/services/completion_kit/ollama_client.rb
@@ -350,9 +353,14 @@ files:
350
353
  - app/views/completion_kit/metric_groups/new.html.erb
351
354
  - app/views/completion_kit/metric_groups/show.html.erb
352
355
  - app/views/completion_kit/metrics/_form.html.erb
356
+ - app/views/completion_kit/metrics/_guiding_examples.html.erb
353
357
  - app/views/completion_kit/metrics/_rubric_diff.html.erb
354
358
  - app/views/completion_kit/metrics/_rubric_hint.html.erb
355
359
  - app/views/completion_kit/metrics/_starter_card.html.erb
360
+ - app/views/completion_kit/metrics/_suggestion_failed.html.erb
361
+ - app/views/completion_kit/metrics/_suggestion_pending.html.erb
362
+ - app/views/completion_kit/metrics/_suggestion_ready.html.erb
363
+ - app/views/completion_kit/metrics/_validation_scoreboard.html.erb
356
364
  - app/views/completion_kit/metrics/edit.html.erb
357
365
  - app/views/completion_kit/metrics/index.html.erb
358
366
  - app/views/completion_kit/metrics/new.html.erb
@@ -430,6 +438,8 @@ files:
430
438
  - db/migrate/20260528000001_rename_judge_version_to_metric_version.rb
431
439
  - db/migrate/20260528000002_add_metric_version_to_reviews.rb
432
440
  - db/migrate/20260529000001_remove_few_shot_examples_from_completion_kit_metrics.rb
441
+ - db/migrate/20260530000001_add_excluded_from_examples_to_completion_kit_calibrations.rb
442
+ - db/migrate/20260531000001_add_validation_summary_to_completion_kit_metric_versions.rb
433
443
  - lib/completion-kit.rb
434
444
  - lib/completion_kit.rb
435
445
  - lib/completion_kit/concurrency_check.rb