completion-kit 0.9.0 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ddf80d4e74705494435d5ae2d9f0ed5ce0dd927f32bffcb1a13819076f94bced
4
- data.tar.gz: 74eadf6abc0f173d0047c961502aeaaab9b5b3de7dd155a00d5d054fb5b8f6e6
3
+ metadata.gz: 4772b264a668a86e004f78c7bc2397d93f5266d8dc287b422728d951ab24fbcc
4
+ data.tar.gz: 63b221e144e930df9978607a78533d354f659ca2bf6141046291168af50d3cd7
5
5
  SHA512:
6
- metadata.gz: fa0c962d8282310584ff52a849eeb7efc3c66debe9246d8231e5e24e55c45e8566b4edf83a19d3a021dc4a41b6241c042e0af3059c1b47ba709412220628ed96
7
- data.tar.gz: 20cdeabe363e212a572cbe6b1f08128a7aae88f5a2ef50a3b3d012e5fbef2a64c93571bc6ebb28e971dc819ac57d060a11a3006f1b7a60ebeb265957be221eab
6
+ metadata.gz: a3249ae1c734dcee0c6f9410baf0400f4b16e091b220d86e8417dec91ff9943a165bbc6c8368629cc14054c3af7946bdb693f006a5756de40a809c41db5bbe3a
7
+ data.tar.gz: 541323c93b08f08f32c2f024e3709ee5f3e4e48144cd8e9cb2ba8891312fd9b5712e4ca39cbcc739b64ba61ff0ac890be3528e5bfd9c238e073d640b37b47e90
@@ -686,16 +686,6 @@ tr:hover .ck-chip--publish {
686
686
  justify-content: space-between;
687
687
  gap: 10px;
688
688
  }
689
- .ck-version-state {
690
- font-family: var(--ck-mono);
691
- font-size: 0.66rem;
692
- letter-spacing: 0.07em;
693
- text-transform: uppercase;
694
- color: var(--ck-dim);
695
- }
696
- .ck-version-state--live {
697
- color: var(--ck-text);
698
- }
699
689
 
700
690
  .ck-chip--soft {
701
691
  background: var(--ck-accent-soft);
@@ -2877,10 +2867,6 @@ select.ck-input {
2877
2867
  line-height: 1.55;
2878
2868
  }
2879
2869
 
2880
- .ck-review-card--stale {
2881
- border-left: 2px solid rgba(224, 164, 88, 0.45);
2882
- }
2883
-
2884
2870
  .ck-stale-versions-banner {
2885
2871
  margin: 0 0 1rem;
2886
2872
  padding: 0.9rem 1rem;
@@ -2908,12 +2894,6 @@ select.ck-input {
2908
2894
  .ck-delta--zero { color: var(--ck-dim); }
2909
2895
 
2910
2896
  .ck-run-compare-table td { vertical-align: middle; }
2911
- .ck-review-card__stale-note {
2912
- margin: 0.4rem 0 0;
2913
- font-family: var(--ck-mono);
2914
- font-size: 0.78rem;
2915
- color: var(--ck-warning);
2916
- }
2917
2897
 
2918
2898
  @media (max-width: 900px) {
2919
2899
  .ck-grid--sidebar,
@@ -3617,9 +3597,10 @@ select.ck-input {
3617
3597
  }
3618
3598
 
3619
3599
  .ck-metrics-table th:nth-child(1), .ck-metrics-table td:nth-child(1) { width: 18rem; white-space: normal; }
3620
- .ck-metrics-table th:nth-child(2), .ck-metrics-table td:nth-child(2) { width: auto; }
3621
- .ck-metrics-table th:nth-child(3), .ck-metrics-table td:nth-child(3) { width: 16rem; }
3622
- .ck-metrics-table th:nth-child(4), .ck-metrics-table td:nth-child(4) { width: 3rem; }
3600
+ .ck-metrics-table th:nth-child(2), .ck-metrics-table td:nth-child(2) { width: 6rem; }
3601
+ .ck-metrics-table th:nth-child(3), .ck-metrics-table td:nth-child(3) { width: auto; }
3602
+ .ck-metrics-table th:nth-child(4), .ck-metrics-table td:nth-child(4) { width: 16rem; }
3603
+ .ck-metrics-table th:nth-child(5), .ck-metrics-table td:nth-child(5) { width: 3rem; }
3623
3604
  .ck-metrics-table td:nth-child(1) strong { overflow-wrap: anywhere; }
3624
3605
 
3625
3606
  .ck-datasets-table th:nth-child(1), .ck-datasets-table td:nth-child(1) { width: auto; }
@@ -3638,32 +3619,11 @@ select.ck-input {
3638
3619
  .ck-prompt-versions-table th:nth-child(3), .ck-prompt-versions-table td:nth-child(3) { width: 8rem; white-space: nowrap; }
3639
3620
  .ck-prompt-versions-table th:nth-child(4), .ck-prompt-versions-table td:nth-child(4) { width: auto; }
3640
3621
 
3641
- .ck-metric-versions-table th:nth-child(1), .ck-metric-versions-table td:nth-child(1) { width: 14rem; }
3642
- .ck-metric-versions-table th:nth-child(2), .ck-metric-versions-table td:nth-child(2) { width: auto; }
3643
- .ck-metric-versions-table th:nth-child(3), .ck-metric-versions-table td:nth-child(3) { width: 9rem; white-space: nowrap; }
3644
- .ck-metric-versions-table th:nth-child(4), .ck-metric-versions-table td:nth-child(4) { width: 9rem; white-space: nowrap; }
3622
+ .ck-metric-versions-table th:nth-child(1), .ck-metric-versions-table td:nth-child(1) { width: 34%; }
3623
+ .ck-metric-versions-table th:nth-child(2), .ck-metric-versions-table td:nth-child(2) { width: 33%; white-space: nowrap; }
3624
+ .ck-metric-versions-table th:nth-child(3), .ck-metric-versions-table td:nth-child(3) { width: 33%; white-space: nowrap; }
3625
+ .ck-metric-versions-table .ck-version-cell { justify-content: flex-start; gap: 0.75rem; }
3645
3626
 
3646
- .ck-change-link {
3647
- background: none;
3648
- border: 0;
3649
- padding: 0;
3650
- cursor: pointer;
3651
- font-family: inherit;
3652
- font-size: 0.86rem;
3653
- text-align: left;
3654
- color: var(--ck-text);
3655
- }
3656
- .ck-change-link:hover,
3657
- .ck-change-link:focus-visible {
3658
- color: var(--ck-accent);
3659
- text-decoration: underline;
3660
- }
3661
- .ck-change-link--trivial {
3662
- color: var(--ck-dim);
3663
- }
3664
- .ck-change-link--major {
3665
- color: rgb(217, 119, 6);
3666
- }
3667
3627
 
3668
3628
  .ck-source-chip {
3669
3629
  display: inline-block;
@@ -5632,6 +5592,11 @@ a.tag-mark {
5632
5592
  .ck-trust-line__hint {
5633
5593
  color: var(--ck-dim);
5634
5594
  }
5595
+ .ck-trust-line__aside { /* Small muted note paragraph; the trust panel uses it for the earlier-version review count. */
5596
+ margin: 4px 0 0;
5597
+ font-size: 0.78rem;
5598
+ color: var(--ck-muted);
5599
+ }
5635
5600
  .ck-cal-stat {
5636
5601
  display: inline-flex;
5637
5602
  align-items: baseline;
@@ -5945,3 +5910,94 @@ a.tag-mark {
5945
5910
  .ck-starter-actions .ck-button {
5946
5911
  line-height: 1;
5947
5912
  }
5913
+
5914
+ .ck-guiding { /* Wrapper of the "Guiding the judge" list; the top border separates it from the content above. */
5915
+ margin-top: 14px;
5916
+ padding-top: 12px;
5917
+ border-top: 1px solid var(--ck-line);
5918
+ }
5919
+
5920
+ .ck-guiding__head { /* Heading row: kicker on the left, legend on the right. */
5921
+ display: flex;
5922
+ align-items: baseline;
5923
+ justify-content: space-between;
5924
+ gap: 12px;
5925
+ }
5926
+
5927
+ .ck-guiding__head .ck-kicker--inset {
5928
+ margin-top: 0;
5929
+ }
5930
+
5931
+ .ck-guiding__legend {
5932
+ font-family: var(--ck-mono);
5933
+ font-size: 0.64rem;
5934
+ letter-spacing: 0.09em;
5935
+ text-transform: uppercase;
5936
+ color: var(--ck-muted);
5937
+ }
5938
+
5939
+ .ck-guiding__list {
5940
+ list-style: none;
5941
+ margin: 8px -8px 0; /* -8px sides mirror the 8px horizontal padding of each item; presumably keeps row text aligned with the heading */
5942
+ padding: 0;
5943
+ display: flex;
5944
+ flex-direction: column;
5945
+ }
5946
+
5947
+ .ck-guiding__item {
5948
+ display: flex;
5949
+ align-items: center;
5950
+ gap: 12px;
5951
+ padding: 5px 8px;
5952
+ border-radius: 7px;
5953
+ transition: background 0.15s;
5954
+ }
5955
+
5956
+ .ck-guiding__item:hover {
5957
+ background: var(--ck-surface-hover);
5958
+ }
5959
+
5960
+ .ck-guiding__item:hover .ck-guiding__output { /* un-dim the output preview while its row is hovered */
5961
+ color: var(--ck-text);
5962
+ }
5963
+
5964
+ .ck-guiding__link {
5965
+ flex: 1;
5966
+ min-width: 0;
5967
+ display: flex;
5968
+ align-items: center;
5969
+ gap: 12px;
5970
+ text-decoration: none;
5971
+ color: inherit;
5972
+ }
5973
+
5974
+ .ck-guiding__output { /* Single-line preview of the output, cut off with an ellipsis. */
5975
+ flex: 1;
5976
+ min-width: 0; /* lets this flex child shrink below its content width so the ellipsis can take effect */
5977
+ overflow: hidden;
5978
+ text-overflow: ellipsis;
5979
+ white-space: nowrap;
5980
+ color: var(--ck-dim);
5981
+ font-size: 0.86rem;
5982
+ }
5983
+
5984
+ .ck-guiding__scores {
5985
+ font-family: var(--ck-mono);
5986
+ font-size: 0.78rem;
5987
+ color: var(--ck-text);
5988
+ white-space: nowrap;
5989
+ }
5990
+
5991
+ .ck-guiding__judge { /* judge score is dimmed ... */
5992
+ color: var(--ck-dim);
5993
+ }
5994
+
5995
+ .ck-guiding__human { /* ... while the human score is emphasised */
5996
+ color: var(--ck-text);
5997
+ font-weight: 600;
5998
+ }
5999
+
6000
+ .ck-guiding__item .ck-icon-btn {
6001
+ width: 2rem;
6002
+ height: 2rem;
6003
+ }
@@ -1,11 +1,13 @@
1
1
  module CompletionKit
2
2
  class MetricsController < ApplicationController
3
3
  include CompletionKit::TagFiltering
4
- before_action :set_metric, only: [:show, :edit, :update, :destroy, :publish_draft, :suggest_variants, :dismiss_suggestion]
4
+ before_action :set_metric, only: [:show, :edit, :update, :destroy, :publish_draft, :suggest_variants, :dismiss_suggestion, :exclude_example]
5
+ before_action :ensure_examples_from_reviews_enabled, only: [:exclude_example]
5
6
 
6
7
  def index
7
8
  @metrics = apply_tag_filter(Metric.includes(:metric_groups, :tags).order(:name))
8
9
  @available_starters = StarterMetrics.available
10
+ @current_versions = MetricVersion.published.current.where(metric_id: @metrics.map(&:id)).index_by(&:metric_id)
9
11
  end
10
12
 
11
13
  def starter_preview
@@ -39,6 +41,7 @@ module CompletionKit
39
41
  @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
40
42
  @improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
41
43
  @versions = MetricVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
44
+ @guiding_examples = CompletionKit.config.judge_examples_from_reviews ? MetricCalibrationExamples.judge_examples_for(@metric) : []
42
45
  end
43
46
 
44
47
  def new
@@ -145,6 +148,16 @@ module CompletionKit
145
148
  redirect_to target, notice: label ? "Discarded draft #{label}." : "Draft already gone."
146
149
  end
147
150
 
151
+ def exclude_example
152
+ calibration = Calibration.where(metric_id: @metric.id).find(params[:calibration_id])
153
+ calibration.update!(excluded_from_examples: true)
154
+ render turbo_stream: turbo_stream.replace(
155
+ "ck-guiding-#{@metric.id}",
156
+ partial: "completion_kit/metrics/guiding_examples",
157
+ locals: { metric: @metric, examples: MetricCalibrationExamples.judge_examples_for(@metric) }
158
+ )
159
+ end
160
+
148
161
  def publish_draft
149
162
  scope = MetricVersion.where(metric_id: @metric.id)
150
163
  version = if params[:draft_id].present?
@@ -176,6 +189,10 @@ module CompletionKit
176
189
 
177
190
  private
178
191
 
192
+ def ensure_examples_from_reviews_enabled # before_action guard registered for exclude_example only
193
+ head :not_found unless CompletionKit.config.judge_examples_from_reviews # answer 404 while the setting is off
194
+ end
195
+
179
196
  def set_metric
180
197
  @metric = Metric.find(params[:id])
181
198
  end
@@ -58,7 +58,8 @@ module CompletionKit
58
58
  run.prompt&.template,
59
59
  criteria: metric.instruction.to_s,
60
60
  rubric_text: metric.display_rubric_text,
61
- input_data: response.input_data
61
+ input_data: response.input_data,
62
+ human_examples: review_examples_for(metric, response)
62
63
  )
63
64
 
64
65
  review = response.reviews.find_or_initialize_by(metric_id: metric.id)
@@ -80,9 +81,13 @@ module CompletionKit
80
81
 
81
82
  private
82
83
 
83
- # A model with supports_judging == nil ("untested") just produced a valid
84
- review — promote it to confirmed. No-op once confirmed (so repeated runs
85
- # don't churn the row), and a model already flagged as a bad judge stays so.
84
+ def review_examples_for(metric, response)
85
+ return nil unless CompletionKit.config.judge_calibration_enabled
86
+ return nil unless CompletionKit.config.judge_examples_from_reviews
87
+
88
+ MetricCalibrationExamples.judge_examples_for(metric, exclude_response_id: response.id)
89
+ end
90
+
86
91
  def confirm_judging_capability(judge_model_id)
87
92
  model = Model.find_by(provider: ApiConfig.provider_for_model(judge_model_id), model_id: judge_model_id)
88
93
  return unless model && model.supports_judging.nil?
@@ -10,13 +10,14 @@ module CompletionKit
10
10
  @judge_client = LlmClient.for_model(@judge_model, ApiConfig.for_model(@judge_model))
11
11
  end
12
12
 
13
- def evaluate(output, expected_output = nil, prompt = nil, criteria: nil, rubric_text: nil, input_data: nil, **_extras)
13
+ def evaluate(output, expected_output = nil, prompt = nil, criteria: nil, rubric_text: nil, input_data: nil, human_examples: nil, **_extras)
14
14
  raise CompletionKit::ConfigurationError, "Judge not configured" unless @judge_client.configured?
15
15
 
16
16
  judge_prompt = build_judge_prompt(output, expected_output, prompt,
17
17
  criteria: criteria,
18
18
  rubric_text: rubric_text,
19
- input_data: input_data)
19
+ input_data: input_data,
20
+ human_examples: human_examples)
20
21
 
21
22
  response = @judge_client.generate_completion(judge_prompt, model: @judge_model)
22
23
  raise StandardError, response if response.start_with?("Error:")
@@ -25,7 +26,7 @@ module CompletionKit
25
26
 
26
27
  private
27
28
 
28
- def build_judge_prompt(output, expected_output, prompt, criteria: nil, rubric_text: nil, input_data: nil)
29
+ def build_judge_prompt(output, expected_output, prompt, criteria: nil, rubric_text: nil, input_data: nil, human_examples: nil)
29
30
  judge_prompt = <<~PROMPT
30
31
  You are an expert evaluator. You MUST respond with ONLY two lines in this exact format, nothing else:
31
32
 
@@ -42,6 +43,8 @@ module CompletionKit
42
43
  judge_prompt += "\nCriteria: #{criteria}\n"
43
44
  end
44
45
 
46
+ judge_prompt += human_examples_block(human_examples)
47
+
45
48
  judge_prompt += <<~PROMPT
46
49
 
47
50
  Original prompt: #{prompt || "Not provided"}
@@ -53,6 +56,19 @@ module CompletionKit
53
56
  judge_prompt
54
57
  end
55
58
 
59
+ def human_examples_block(examples)
60
+ return "" if examples.blank?
61
+
62
+ lines = ["", "Reviewed examples where a human corrected the judge on this metric. Weigh them when scoring:"]
63
+ examples.each_with_index do |example, index|
64
+ note = example[:human_note].to_s
65
+ line = "Example #{index + 1}: Output: #{example[:output].to_s.truncate(200)}. The judge scored this #{example[:judge_score].to_i}/5. A reviewer corrected it to #{example[:human_score].to_i}/5"
66
+ line += note.present? ? ": #{note.truncate(160)}" : "."
67
+ lines << line
68
+ end
69
+ lines.join("\n") + "\n"
70
+ end
71
+
56
72
  def parse_judge_response(response)
57
73
  score_match = response.match(/\*{0,2}Score:?\*{0,2}\s*(\d+(?:\.\d+)?)/i)
58
74
  feedback_match = response.match(/\*{0,2}Feedback:?\*{0,2}\s*(.+)/mi)
@@ -0,0 +1,56 @@
1
+ module CompletionKit
2
+ module MetricCalibrationExamples
3
+ DEFAULT_JUDGE_EXAMPLE_LIMIT = 5
4
+
5
+ module_function
6
+
7
+ def for(metric, limit: 8)
8
+ disagreements_for(metric, limit: limit)
9
+ end
10
+
11
+ def disagreements_for(metric, limit: 8)
12
+ calibrations_for(metric, verdict: "disagree", limit: limit)
13
+ end
14
+
15
+ def borderlines_for(metric, limit: 6)
16
+ calibrations_for(metric, verdict: "borderline", limit: limit)
17
+ end
18
+
19
+ def judge_examples_for(metric, exclude_response_id: nil, limit: DEFAULT_JUDGE_EXAMPLE_LIMIT)
20
+ current_version = MetricVersion.current.find_by(metric_id: metric.id)
21
+ return [] unless current_version
22
+
23
+ relation = Calibration
24
+ .where(metric_id: metric.id, metric_version_id: current_version.id, excluded_from_examples: false)
25
+ .where.not(corrected_score: nil)
26
+ relation = relation.where.not(response_id: exclude_response_id) if exclude_response_id
27
+ map_examples(relation.includes(response: :reviews).order(created_at: :desc).limit(limit), metric)
28
+ .reject { |example| example[:judge_score].nil? }
29
+ end
30
+
31
+ def calibrations_for(metric, verdict:, limit:)
32
+ base = Calibration.where(metric_id: metric.id, verdict: verdict)
33
+ current_version = MetricVersion.current.find_by(metric_id: metric.id)
34
+ scoped = current_version ? base.where(metric_version_id: current_version.id) : base
35
+ effective = scoped.exists? ? scoped : base
36
+ map_examples(effective.includes(response: :reviews).order(created_at: :desc).limit(limit), metric)
37
+ end
38
+
39
+ def map_examples(relation, metric)
40
+ relation.map do |cal|
41
+ review = cal.response.reviews.find { |r| r.metric_id == metric.id }
42
+ {
43
+ id: cal.id,
44
+ run_id: cal.run_id,
45
+ response_id: cal.response_id,
46
+ input: cal.response.input_data,
47
+ output: cal.response.response_text,
48
+ judge_score: review&.ai_score,
49
+ judge_feedback: review&.ai_feedback,
50
+ human_score: cal.corrected_score,
51
+ human_note: cal.note
52
+ }
53
+ end
54
+ end
55
+ end
56
+ end
@@ -117,40 +117,4 @@ module CompletionKit
117
117
  end
118
118
  end
119
119
 
120
- module MetricCalibrationExamples
121
- module_function
122
-
123
- def for(metric, limit: 8)
124
- disagreements_for(metric, limit: limit)
125
- end
126
-
127
- def disagreements_for(metric, limit: 8)
128
- calibrations_for(metric, verdict: "disagree", limit: limit)
129
- end
130
-
131
- def borderlines_for(metric, limit: 6)
132
- calibrations_for(metric, verdict: "borderline", limit: limit)
133
- end
134
-
135
- def calibrations_for(metric, verdict:, limit:)
136
- base = Calibration.where(metric_id: metric.id, verdict: verdict)
137
- current_version = MetricVersion.current.find_by(metric_id: metric.id)
138
- scoped = current_version ? base.where(metric_version_id: current_version.id) : base
139
- effective = scoped.exists? ? scoped : base
140
- effective.includes(response: :reviews)
141
- .order(created_at: :desc)
142
- .limit(limit)
143
- .map do |cal|
144
- review = cal.response.reviews.find { |r| r.metric_id == metric.id }
145
- {
146
- input: cal.response.input_data,
147
- output: cal.response.response_text,
148
- judge_score: review&.ai_score,
149
- judge_feedback: review&.ai_feedback,
150
- human_score: cal.corrected_score,
151
- human_note: cal.note
152
- }
153
- end
154
- end
155
- end
156
120
  end
@@ -2,15 +2,11 @@
2
2
  <% metric = local_assigns[:metric] %>
3
3
  <% anchor = metric&.name&.parameterize %>
4
4
  <% current_metric_version = metric && CompletionKit::MetricVersion.current.find_by(metric_id: metric.id) %>
5
- <% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric
5
+ <% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric && current_metric_version
6
6
  created_by = CompletionKit.config.username.presence || "operator"
7
- verdicted_ids = if current_metric_version
8
- CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
9
- else
10
- []
11
- end
7
+ verdicted_ids = CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
12
8
  CompletionKit::Response.joins(:reviews)
13
- .where(reviews: { metric_id: metric.id })
9
+ .where(reviews: { metric_id: metric.id, metric_version_id: current_metric_version.id })
14
10
  .where.not(reviews: { ai_score: nil })
15
11
  .where.not(id: verdicted_ids)
16
12
  .order(created_at: :desc).first
@@ -24,7 +20,7 @@
24
20
  <p class="ck-trust-line ck-trust-line--<%= stats.gate %>">
25
21
  <% if stats.sample_size.zero? %>
26
22
  <span class="ck-trust-line__lead">Not measured yet.</span>
27
- <span class="ck-trust-line__hint">Needs <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> human reviews of the judge's scores.<% if prior_version_verdicts > 0 %> (<%= pluralize(prior_version_verdicts, "earlier-version review") %> kept on file.)<% end %></span>
23
+ <span class="ck-trust-line__hint"><%= current_metric_version ? "#{current_metric_version.version_label} needs" : "Needs" %> <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> human reviews of the judge's scores.</span>
28
24
  <% if target_response %>
29
25
  <%= link_to "Review a judge's score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
30
26
  <% end %>
@@ -45,3 +41,6 @@
45
41
  <% end %>
46
42
  <% end %>
47
43
  </p>
44
+ <% if stats.sample_size.zero? && prior_version_verdicts > 0 %>
45
+ <p class="ck-trust-line__aside"><%= pluralize(prior_version_verdicts, "review") %> from an earlier version <%= prior_version_verdicts == 1 ? "doesn't" : "don't" %> count toward this version.</p>
46
+ <% end %>
@@ -0,0 +1,23 @@
1
+ <div id="ck-guiding-<%= metric.id %>" class="ck-guiding"> <%# Wrapper is rendered even with no examples; its id is the element exclude_example replaces via Turbo Stream. %>
2
+ <% if examples.any? %>
3
+ <div class="ck-guiding__head">
4
+ <p class="ck-kicker ck-kicker--inset">Guiding the judge</p>
5
+ <span class="ck-guiding__legend">Judge &rarr; Human</span>
6
+ </div>
7
+ <ul class="ck-guiding__list">
8
+ <% examples.each do |example| %> <%# each example is a hash; keys read here: :id, :run_id, :response_id, :output, :judge_score, :human_score %>
9
+ <li class="ck-guiding__item"> <%# row = link to the reviewed response, followed by a button that stops using the case %>
10
+ <%= link_to run_response_path(example[:run_id], example[:response_id], anchor: metric.name.parameterize),
11
+ class: "ck-guiding__link", title: "Open this review" do %>
12
+ <span class="ck-guiding__output"><%= truncate(example[:output].to_s, length: 90) %></span>
13
+ <span class="ck-guiding__scores"><span class="ck-guiding__judge"><%= example[:judge_score].to_i %></span> &rarr; <span class="ck-guiding__human"><%= example[:human_score].to_i %></span></span>
14
+ <% end %>
15
+ <%= button_to exclude_example_metric_path(metric, calibration_id: example[:id]),
16
+ method: :post, form_class: "inline-block", class: "ck-icon-btn",
17
+ title: "Stop using this case", "aria-label": "Stop using this case",
18
+ data: { turbo_confirm: "Stop using this corrected case to guide the judge?" } do %><%= heroicon_tag "x-mark", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
19
+ </li>
20
+ <% end %>
21
+ </ul>
22
+ <% end %>
23
+ </div>
@@ -18,6 +18,7 @@
18
18
  <thead>
19
19
  <tr>
20
20
  <th scope="col">Name</th>
21
+ <th scope="col">Version</th>
21
22
  <th scope="col">Instruction</th>
22
23
  <th scope="col">In groups</th>
23
24
  <th scope="col"></th>
@@ -34,6 +35,10 @@
34
35
  </div>
35
36
  <% end %>
36
37
  </td>
38
+ <td data-label="Version">
39
+ <% v = @current_versions[metric.id] %>
40
+ <span class="ck-chip ck-chip--soft"><%= v ? v.version_label : "v1" %></span>
41
+ </td>
37
42
  <td data-label="Instruction" class="ck-meta-copy"><div class="ck-clamp-2"><%= metric.instruction.presence || "—" %></div></td>
38
43
  <td data-label="In groups">
39
44
  <% groups = metric.metric_groups %>
@@ -50,7 +50,6 @@
50
50
  <thead>
51
51
  <tr>
52
52
  <th scope="col">Version</th>
53
- <th scope="col">&Delta; Change</th>
54
53
  <th scope="col">Source</th>
55
54
  <th scope="col">Created</th>
56
55
  </tr>
@@ -60,36 +59,28 @@
60
59
  <% pred = predecessor_of[v] %>
61
60
  <tr>
62
61
  <td>
62
+ <% summary = v.change_summary_against(pred) %>
63
63
  <div class="ck-version-cell">
64
64
  <div class="ck-version-cell__label">
65
65
  <strong><%= v.version_label %></strong>
66
66
  <% if v.current? %>
67
- <span class="ck-version-state ck-version-state--live">Published</span>
67
+ <span class="ck-chip">Published</span>
68
68
  <% elsif v.draft? %>
69
- <span class="ck-version-state">Draft</span>
70
69
  <%= button_to "Publish", publish_draft_metric_path(@metric, draft_id: v.id),
71
70
  method: :post, form_class: "inline-block",
72
- class: "ck-chip ck-chip--cta" %>
71
+ class: "ck-chip ck-chip--publish" %>
73
72
  <% else %>
74
- <span class="ck-version-state">Past</span>
75
73
  <%= button_to "Make current", publish_draft_metric_path(@metric, draft_id: v.id),
76
74
  method: :post, form_class: "inline-block",
77
75
  class: "ck-chip ck-chip--publish",
78
76
  data: { turbo_confirm: "Make #{v.version_label} the version to use? #{v.version_label} will be used in test runs using this metric now. Reviews you have already given stay with the version they were made against." } %>
79
77
  <% end %>
80
78
  </div>
79
+ <% if summary %>
80
+ <button type="button" class="ck-cell-link ck-cell-link--delta" title="What changed from <%= pred.version_label %>" onclick="document.getElementById('ck-mvdiff-<%= v.id %>').showModal()">&Delta;</button>
81
+ <% end %>
81
82
  </div>
82
83
  </td>
83
- <td>
84
- <% summary = v.change_summary_against(pred) %>
85
- <% if summary %>
86
- <button type="button" class="ck-change-link ck-change-link--<%= summary[:magnitude] %>"
87
- title="Compare with <%= pred.version_label %>"
88
- onclick="document.getElementById('ck-mvdiff-<%= v.id %>').showModal()"><%= summary[:label] %></button>
89
- <% else %>
90
- <span class="ck-meta-copy">—</span>
91
- <% end %>
92
- </td>
93
84
  <td>
94
85
  <% source_label, source_class = case v.source
95
86
  when "suggestion" then ["AI calibration", "ck-source-chip ck-source-chip--ai"]
@@ -177,20 +168,11 @@
177
168
  <% end %>
178
169
 
179
170
  <% if CompletionKit.config.judge_calibration_enabled %>
171
+ <% draft = @suggestion_draft || @edit_draft %>
180
172
  <section class="ck-card ck-card--spaced">
181
- <p class="ck-kicker">Calibration</p>
182
- <p class="ck-meta-copy">This is a measure of how often the judge's scores match a human reviewer. Review its scores to build that signal; the scores you disagree with become the cases the model learns from when you improve the metric.</p>
183
- <%= render "completion_kit/calibrations/trust_panel",
184
- stats: CompletionKit::MetricCalibrationStats.for(@metric),
185
- metric: @metric %>
186
- <% draft = @suggestion_draft || @edit_draft %>
187
- <% if draft %>
188
- <div class="ck-cal-foot">
189
- <span class="ck-cal-foot__note">A draft improvement (<%= draft.version_label %>) is waiting in the Versions table above. Open its change to compare, then Publish to use it.</span>
190
- </div>
191
- <% elsif @improve_disagreement_count.positive? %>
192
- <div class="ck-cal-foot">
193
- <span class="ck-cal-foot__note"><%= pluralize(@improve_disagreement_count, "case") %> where a reviewer's score didn't match the judge.</span>
173
+ <div class="ck-prompt-preview__header">
174
+ <p class="ck-kicker">Calibration</p>
175
+ <% if draft.nil? && @improve_disagreement_count.positive? %>
194
176
  <%= button_to suggest_variants_metric_path(@metric),
195
177
  method: :post, form_class: "inline-block",
196
178
  class: ck_button_classes(:light, variant: :outline) + " ck-button--sm",
@@ -198,6 +180,18 @@
198
180
  <%= heroicon_tag "sparkles", variant: :outline, class: "ck-magic-icon", "aria-hidden": "true" %>
199
181
  Suggest improvements
200
182
  <% end %>
183
+ <% end %>
184
+ </div>
185
+ <p class="ck-meta-copy">This is a measure of how often the judge's scores match a human reviewer. Review its scores to build that signal; the scores you disagree with become the cases the model learns from when you improve the metric.</p>
186
+ <%= render "completion_kit/calibrations/trust_panel",
187
+ stats: CompletionKit::MetricCalibrationStats.for(@metric),
188
+ metric: @metric %>
189
+ <% if CompletionKit.config.judge_examples_from_reviews %>
190
+ <%= render "completion_kit/metrics/guiding_examples", metric: @metric, examples: @guiding_examples %>
191
+ <% end %>
192
+ <% if draft %>
193
+ <div class="ck-cal-foot">
194
+ <span class="ck-cal-foot__note">A draft improvement (<%= draft.version_label %>) is waiting in the Versions table above. Open its change to compare, then Publish to use it.</span>
201
195
  </div>
202
196
  <% end %>
203
197
  </section>
@@ -100,12 +100,17 @@
100
100
  <% @reviews.each do |review| %>
101
101
  <% review_version = review.metric_version %>
102
102
  <% stale = review.stale_against_current_judge? %>
103
- <div class="ck-review-card<%= " ck-review-card--stale" if stale %>" id="<%= review.metric&.name&.parameterize || "review-#{review.id}" %>">
103
+ <div class="ck-review-card" id="<%= review.metric&.name&.parameterize || "review-#{review.id}" %>">
104
104
  <div class="ck-review-card__header">
105
105
  <span class="ck-review-card__metric"><% if review.metric %><%= link_to review.metric_name, metric_path(review.metric), class: "ck-link" %><% else %><%= review.metric_name %><% end %></span>
106
106
  <div class="ck-inline">
107
107
  <% if review_version %>
108
- <span class="ck-source-chip <%= stale ? "ck-source-chip--past" : "ck-source-chip--current" %>" title="<%= stale ? "Scored against #{review_version.version_label} of this metric. The metric has been republished since." : "Scored against the metric's current version (#{review_version.version_label})." %>"><%= review_version.version_label %></span>
108
+ <% if stale %>
109
+ <% current_version = CompletionKit::MetricVersion.current.find_by(metric_id: review.metric_id) %>
110
+ <span class="ck-source-chip ck-source-chip--past" title="Scored on <%= review_version.version_label %>; the metric is now on <%= current_version.version_label %>, which may score this differently."><%= review_version.version_label %> &rarr; <%= current_version.version_label %></span>
111
+ <% else %>
112
+ <span class="ck-source-chip ck-source-chip--current" title="Scored on the metric's current version (<%= review_version.version_label %>)."><%= review_version.version_label %></span>
113
+ <% end %>
109
114
  <% end %>
110
115
  <% if review.ai_score %>
111
116
  <% 5.times do |i| %>
@@ -116,9 +121,6 @@
116
121
  <% end %>
117
122
  </div>
118
123
  </div>
119
- <% if stale %>
120
- <p class="ck-review-card__stale-note">Scored against a superseded version of this metric. Its current version may score this differently.</p>
121
- <% end %>
122
124
  <% if review.ai_feedback.present? %>
123
125
  <p class="ck-review-card__feedback"><%= review.ai_feedback %></p>
124
126
  <% end %>
data/config/routes.rb CHANGED
@@ -22,6 +22,7 @@ CompletionKit::Engine.routes.draw do
22
22
  post :publish_draft
23
23
  post :suggest_variants
24
24
  delete :dismiss_suggestion
25
+ post :exclude_example
25
26
  end
26
27
  end
27
28
  resources :metric_groups
@@ -0,0 +1,5 @@
1
+ class AddExcludedFromExamplesToCompletionKitCalibrations < ActiveRecord::Migration[8.1] # Adds the per-calibration flag that keeps a reviewed case out of the judge's examples.
2
+ def change
3
+ add_column :completion_kit_calibrations, :excluded_from_examples, :boolean, null: false, default: false # non-null with default false, so existing calibrations stay eligible
4
+ end
5
+ end
@@ -1,3 +1,3 @@
1
1
  module CompletionKit
2
- VERSION = "0.9.0"
2
+ VERSION = "0.10.0"
3
3
  end
@@ -13,6 +13,7 @@ module CompletionKit
13
13
  attr_accessor :api_rate_limit, :web_rate_limit
14
14
  attr_accessor :allow_loopback_endpoints
15
15
  attr_accessor :judge_calibration_enabled
16
+ attr_accessor :judge_examples_from_reviews
16
17
 
17
18
  def initialize
18
19
  @openai_api_key = ENV['OPENAI_API_KEY']
@@ -29,6 +30,7 @@ module CompletionKit
29
30
 
30
31
  @allow_loopback_endpoints = true
31
32
  @judge_calibration_enabled = true
33
+ @judge_examples_from_reviews = false
32
34
 
33
35
  @api_reference_authentication_partial = "completion_kit/api_reference/authentication"
34
36
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: completion-kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.9.0
4
+ version: 0.10.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Damien Bastin
@@ -311,6 +311,7 @@ files:
311
311
  - app/services/completion_kit/mcp_tools/responses.rb
312
312
  - app/services/completion_kit/mcp_tools/runs.rb
313
313
  - app/services/completion_kit/mcp_tools/tags.rb
314
+ - app/services/completion_kit/metric_calibration_examples.rb
314
315
  - app/services/completion_kit/metric_calibration_stats.rb
315
316
  - app/services/completion_kit/metric_variant_generator.rb
316
317
  - app/services/completion_kit/model_discovery_service.rb
@@ -350,6 +351,7 @@ files:
350
351
  - app/views/completion_kit/metric_groups/new.html.erb
351
352
  - app/views/completion_kit/metric_groups/show.html.erb
352
353
  - app/views/completion_kit/metrics/_form.html.erb
354
+ - app/views/completion_kit/metrics/_guiding_examples.html.erb
353
355
  - app/views/completion_kit/metrics/_rubric_diff.html.erb
354
356
  - app/views/completion_kit/metrics/_rubric_hint.html.erb
355
357
  - app/views/completion_kit/metrics/_starter_card.html.erb
@@ -430,6 +432,7 @@ files:
430
432
  - db/migrate/20260528000001_rename_judge_version_to_metric_version.rb
431
433
  - db/migrate/20260528000002_add_metric_version_to_reviews.rb
432
434
  - db/migrate/20260529000001_remove_few_shot_examples_from_completion_kit_metrics.rb
435
+ - db/migrate/20260530000001_add_excluded_from_examples_to_completion_kit_calibrations.rb
433
436
  - lib/completion-kit.rb
434
437
  - lib/completion_kit.rb
435
438
  - lib/completion_kit/concurrency_check.rb