completion-kit 0.7.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/stylesheets/completion_kit/application.css +146 -325
- data/app/controllers/completion_kit/api/v1/base_controller.rb +14 -4
- data/app/controllers/completion_kit/api/v1/calibrations_controller.rb +2 -2
- data/app/controllers/completion_kit/api/v1/datasets_controller.rb +2 -2
- data/app/controllers/completion_kit/api/v1/metric_groups_controller.rb +2 -2
- data/app/controllers/completion_kit/api/v1/metric_versions_controller.rb +1 -1
- data/app/controllers/completion_kit/api/v1/metrics_controller.rb +5 -32
- data/app/controllers/completion_kit/api/v1/prompts_controller.rb +2 -2
- data/app/controllers/completion_kit/api/v1/provider_credentials_controller.rb +2 -2
- data/app/controllers/completion_kit/api/v1/runs_controller.rb +7 -7
- data/app/controllers/completion_kit/api/v1/tags_controller.rb +2 -2
- data/app/controllers/completion_kit/metrics_controller.rb +14 -37
- data/app/controllers/completion_kit/runs_controller.rb +2 -2
- data/app/jobs/completion_kit/generate_row_job.rb +2 -4
- data/app/jobs/completion_kit/judge_review_job.rb +4 -19
- data/app/models/completion_kit/metric.rb +0 -1
- data/app/models/completion_kit/metric_version.rb +35 -0
- data/app/models/completion_kit/run.rb +0 -1
- data/app/services/completion_kit/judge_service.rb +3 -10
- data/app/services/completion_kit/mcp_tools/metric_versions.rb +1 -1
- data/app/services/completion_kit/metric_variant_generator.rb +0 -13
- data/app/views/completion_kit/api_reference/_body.html.erb +2 -12
- data/app/views/completion_kit/api_reference/index.html.erb +4 -0
- data/app/views/completion_kit/calibrations/_trust_panel.html.erb +19 -12
- data/app/views/completion_kit/metrics/_form.html.erb +11 -12
- data/app/views/completion_kit/metrics/edit.html.erb +18 -0
- data/app/views/completion_kit/metrics/index.html.erb +0 -17
- data/app/views/completion_kit/metrics/show.html.erb +87 -105
- data/app/views/completion_kit/responses/show.html.erb +2 -2
- data/app/views/completion_kit/runs/show.html.erb +7 -7
- data/config/routes.rb +0 -4
- data/db/migrate/20260529000001_remove_few_shot_examples_from_completion_kit_metrics.rb +5 -0
- data/lib/completion_kit/version.rb +1 -1
- metadata +2 -1
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
<% stats = local_assigns[:stats] %>
|
|
2
2
|
<% metric = local_assigns[:metric] %>
|
|
3
3
|
<% anchor = metric&.name&.parameterize %>
|
|
4
|
+
<% current_metric_version = metric && CompletionKit::MetricVersion.current.find_by(metric_id: metric.id) %>
|
|
4
5
|
<% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric
|
|
5
6
|
created_by = CompletionKit.config.username.presence || "operator"
|
|
6
|
-
current_metric_version = CompletionKit::MetricVersion.current.find_by(metric_id: metric.id)
|
|
7
7
|
verdicted_ids = if current_metric_version
|
|
8
8
|
CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
|
|
9
9
|
else
|
|
@@ -22,19 +22,26 @@
|
|
|
22
22
|
end %>
|
|
23
23
|
|
|
24
24
|
<p class="ck-trust-line ck-trust-line--<%= stats.gate %>">
|
|
25
|
-
<span class="ck-trust-line__label"><%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true" %>Calibration</span>
|
|
26
25
|
<% if stats.sample_size.zero? %>
|
|
27
|
-
<span class="ck-trust-
|
|
28
|
-
<span class="ck-trust-line__hint">Needs <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %>
|
|
29
|
-
|
|
30
|
-
|
|
26
|
+
<span class="ck-trust-line__lead">Not measured yet.</span>
|
|
27
|
+
<span class="ck-trust-line__hint">Needs <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> human reviews of the judge's scores.<% if prior_version_verdicts > 0 %> (<%= pluralize(prior_version_verdicts, "earlier-version review") %> kept on file.)<% end %></span>
|
|
28
|
+
<% if target_response %>
|
|
29
|
+
<%= link_to "Review a judge's score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
|
|
30
|
+
<% end %>
|
|
31
31
|
<% elsif stats.counter_only? %>
|
|
32
|
-
<span class="ck-
|
|
33
|
-
|
|
32
|
+
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Human reviews</span> <strong><%= stats.sample_size %> / <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></strong></span>
|
|
33
|
+
<% if stats.short_to_target > 0 %><span class="ck-trust-line__hint"><%= stats.short_to_target %> more to report a rate</span><% end %>
|
|
34
|
+
<% if target_response %>
|
|
35
|
+
<%= link_to "Review another score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
|
|
36
|
+
<% end %>
|
|
34
37
|
<% else %>
|
|
35
|
-
<span class="ck-
|
|
36
|
-
<span class="ck-
|
|
37
|
-
<span class="ck-
|
|
38
|
-
<span class="ck-
|
|
38
|
+
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Agreement</span> <strong class="ck-trust-line__figure">~<%= (stats.agreement_point * 100).round %>%</strong></span>
|
|
39
|
+
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Margin</span> ±<%= (stats.margin * 100).round %> pt</span>
|
|
40
|
+
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Read</span> <%= stats.firm? ? "settled" : "early" %></span>
|
|
41
|
+
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Sample</span> <%= stats.sample_size %></span>
|
|
42
|
+
<% if stats.borderline_rate && stats.borderline_rate > 0 %>
|
|
43
|
+
<% level = stats.borderline_rate > 0.30 ? "danger" : stats.borderline_rate > 0.15 ? "warning" : "ok" %>
|
|
44
|
+
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Unclear</span> <span class="ck-trust-line__borderline ck-trust-line__borderline--<%= level %>"><%= (stats.borderline_rate * 100).round %>%</span></span>
|
|
45
|
+
<% end %>
|
|
39
46
|
<% end %>
|
|
40
47
|
</p>
|
|
@@ -40,20 +40,19 @@
|
|
|
40
40
|
<% if suggestion %>
|
|
41
41
|
<div class="ck-suggestion-banner" role="status">
|
|
42
42
|
<div class="ck-suggestion-banner__body">
|
|
43
|
-
<p class="ck-kicker"
|
|
44
|
-
<p class="ck-meta-copy">Based on
|
|
43
|
+
<p class="ck-kicker ck-kicker--icon"><%= heroicon_tag "sparkles", variant: :outline, class: "ck-magic-icon", "aria-hidden": "true" %>Proposed changes</p>
|
|
44
|
+
<p class="ck-meta-copy">Based on human reviews, here are some proposed changes to the metric.</p>
|
|
45
45
|
</div>
|
|
46
46
|
<div class="ck-suggestion-banner__actions">
|
|
47
|
-
<%= button_to
|
|
48
|
-
method: :post, form_class: "inline-block",
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
<%= button_to "Take everything", publish_draft_metric_path(metric, draft_id: suggestion.id),
|
|
47
|
+
<%= button_to suggest_variants_metric_path(metric, back_to: "edit"),
|
|
48
|
+
method: :post, form_class: "inline-block", class: "ck-icon-btn",
|
|
49
|
+
title: "Try again", "aria-label": "Try again",
|
|
50
|
+
data: { turbo_confirm: "Replace these changes with fresh ones from the model?" } do %><%= heroicon_tag "arrow-path", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
|
|
51
|
+
<%= button_to dismiss_suggestion_metric_path(metric, draft_id: suggestion.id, back_to: "edit"),
|
|
52
|
+
method: :delete, form_class: "inline-block", class: "ck-icon-btn",
|
|
53
|
+
title: "Discard these changes", "aria-label": "Discard",
|
|
54
|
+
data: { turbo_confirm: "Drop these changes?" } do %><%= heroicon_tag "trash", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
|
|
55
|
+
<%= button_to "Apply all", publish_draft_metric_path(metric, draft_id: suggestion.id),
|
|
57
56
|
method: :post, form_class: "inline-block",
|
|
58
57
|
class: ck_button_classes(:dark) %>
|
|
59
58
|
</div>
|
|
@@ -10,6 +10,24 @@
|
|
|
10
10
|
</div>
|
|
11
11
|
</section>
|
|
12
12
|
|
|
13
|
+
<% if CompletionKit.config.judge_calibration_enabled && @suggestion_draft.nil? && @edit_draft.nil? && @improve_disagreement_count.to_i.positive? %>
|
|
14
|
+
<div class="ck-suggestion-banner" role="status">
|
|
15
|
+
<div class="ck-suggestion-banner__body">
|
|
16
|
+
<p class="ck-kicker">Improve from reviews</p>
|
|
17
|
+
<p class="ck-meta-copy">Based on human reviews, the model can propose changes to this metric.</p>
|
|
18
|
+
</div>
|
|
19
|
+
<div class="ck-suggestion-banner__actions">
|
|
20
|
+
<%= button_to suggest_variants_metric_path(@metric, back_to: "edit"),
|
|
21
|
+
method: :post, form_class: "inline-block",
|
|
22
|
+
class: ck_button_classes(:light, variant: :outline) + " ck-button--sm",
|
|
23
|
+
data: { turbo_confirm: "Draft improvements to this metric from your human reviews? You can edit or apply them here before publishing." } do %>
|
|
24
|
+
<%= heroicon_tag "sparkles", variant: :outline, class: "ck-magic-icon", "aria-hidden": "true" %>
|
|
25
|
+
Suggest improvements
|
|
26
|
+
<% end %>
|
|
27
|
+
</div>
|
|
28
|
+
</div>
|
|
29
|
+
<% end %>
|
|
30
|
+
|
|
13
31
|
<%= render "form",
|
|
14
32
|
metric: @metric,
|
|
15
33
|
suggestion_draft: @suggestion_draft,
|
|
@@ -28,23 +28,6 @@
|
|
|
28
28
|
<tr onclick="window.location='<%= metric_path(metric) %>'" style="cursor: pointer;">
|
|
29
29
|
<td>
|
|
30
30
|
<%= link_to metric_path(metric), class: "ck-record-name" do %><strong><%= metric.name %></strong><% end %>
|
|
31
|
-
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
32
|
-
<% s = CompletionKit::MetricCalibrationStats.for(metric) %>
|
|
33
|
-
<p class="ck-metrics-table__trust" title="Calibration: how often this metric's scores match the humans who reviewed them.">
|
|
34
|
-
<%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true", class: "ck-trust-icon" %>
|
|
35
|
-
<span class="ck-metrics-table__trust-label">Calibration</span>
|
|
36
|
-
<% if s.counter_only? %>
|
|
37
|
-
<% if s.sample_size.zero? %>
|
|
38
|
-
<span class="ck-metrics-table__trust-state">Not measured yet</span>
|
|
39
|
-
<% else %>
|
|
40
|
-
<%= s.sample_size %> / <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts
|
|
41
|
-
<% end %>
|
|
42
|
-
<% else %>
|
|
43
|
-
<span class="ck-metrics-table__trust-rate" title="<%= s.firm? ? 'Settled read.' : 'Early read. Keep giving verdicts.' %>">~<%= (s.agreement_point * 100).round %>%</span>
|
|
44
|
-
±<%= (s.margin * 100).round %> pt · <%= s.firm? ? "settled" : "early" %>
|
|
45
|
-
<% end %>
|
|
46
|
-
</p>
|
|
47
|
-
<% end %>
|
|
48
31
|
<% if metric.tags.any? %>
|
|
49
32
|
<div class="tag-marks-row">
|
|
50
33
|
<%= render "completion_kit/tags/marks", tags: metric.tags %>
|
|
@@ -6,11 +6,6 @@
|
|
|
6
6
|
<section class="ck-page-header">
|
|
7
7
|
<div>
|
|
8
8
|
<h1 class="ck-title"><%= @metric.name %></h1>
|
|
9
|
-
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
10
|
-
<%= render "completion_kit/calibrations/trust_panel",
|
|
11
|
-
stats: CompletionKit::MetricCalibrationStats.for(@metric),
|
|
12
|
-
metric: @metric %>
|
|
13
|
-
<% end %>
|
|
14
9
|
<% if @metric.tags.any? %>
|
|
15
10
|
<div class="tag-marks-row tag-marks-row--header">
|
|
16
11
|
<%= render "completion_kit/tags/marks", tags: @metric.tags %>
|
|
@@ -18,23 +13,6 @@
|
|
|
18
13
|
<% end %>
|
|
19
14
|
</div>
|
|
20
15
|
<div class="ck-actions">
|
|
21
|
-
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
22
|
-
<% if @suggestion_draft || @edit_draft %>
|
|
23
|
-
<% review_title = @suggestion_draft ? "The model proposed improvements based on your disagreements. Review and apply what you want." : "An unpublished draft of this metric is saved. Review and publish, or keep editing." %>
|
|
24
|
-
<%= link_to "Review changes →", edit_metric_path(@metric),
|
|
25
|
-
class: ck_button_classes(:dark),
|
|
26
|
-
title: review_title %>
|
|
27
|
-
<% elsif @improve_disagreement_count.positive? %>
|
|
28
|
-
<%= button_to "Improve the metric", suggest_variants_metric_path(@metric),
|
|
29
|
-
method: :post, form_class: "inline-block",
|
|
30
|
-
class: ck_button_classes(:light, variant: :outline),
|
|
31
|
-
title: "Ask the model to suggest improvements to this metric's instruction and rubric based on the disagreements collected so far.",
|
|
32
|
-
data: { turbo_confirm: "Ask the model for suggested improvements based on the disagreements collected so far?" } %>
|
|
33
|
-
<% else %>
|
|
34
|
-
<button type="button" class="<%= ck_button_classes(:light, variant: :outline) %>" disabled
|
|
35
|
-
title="Mark at least one case as Disagree before the model can suggest a change.">Improve the metric</button>
|
|
36
|
-
<% end %>
|
|
37
|
-
<% end %>
|
|
38
16
|
<%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
|
|
39
17
|
</div>
|
|
40
18
|
</section>
|
|
@@ -64,15 +42,15 @@
|
|
|
64
42
|
</div>
|
|
65
43
|
</section>
|
|
66
44
|
|
|
67
|
-
<% if CompletionKit.config.judge_calibration_enabled && @versions.
|
|
45
|
+
<% if CompletionKit.config.judge_calibration_enabled && @versions.any? %>
|
|
68
46
|
<% predecessor_of = @versions.index_with { |v| @versions.detect { |o| o.version_number < v.version_number } } %>
|
|
69
|
-
<% version_changed = ->(v, pred) { pred && (pred.instruction.to_s != v.instruction.to_s || pred.rubric_bands != v.rubric_bands) } %>
|
|
70
47
|
<section class="ck-card ck-card--spaced">
|
|
71
48
|
<p class="ck-kicker">Versions</p>
|
|
72
49
|
<table class="ck-results-table ck-metric-versions-table">
|
|
73
50
|
<thead>
|
|
74
51
|
<tr>
|
|
75
52
|
<th scope="col">Version</th>
|
|
53
|
+
<th scope="col">Δ Change</th>
|
|
76
54
|
<th scope="col">Source</th>
|
|
77
55
|
<th scope="col">Created</th>
|
|
78
56
|
</tr>
|
|
@@ -86,28 +64,35 @@
|
|
|
86
64
|
<div class="ck-version-cell__label">
|
|
87
65
|
<strong><%= v.version_label %></strong>
|
|
88
66
|
<% if v.current? %>
|
|
89
|
-
<span class="ck-
|
|
67
|
+
<span class="ck-version-state ck-version-state--live">Published</span>
|
|
90
68
|
<% elsif v.draft? %>
|
|
69
|
+
<span class="ck-version-state">Draft</span>
|
|
91
70
|
<%= button_to "Publish", publish_draft_metric_path(@metric, draft_id: v.id),
|
|
92
71
|
method: :post, form_class: "inline-block",
|
|
93
|
-
class: "ck-chip ck-chip--
|
|
72
|
+
class: "ck-chip ck-chip--cta" %>
|
|
94
73
|
<% else %>
|
|
74
|
+
<span class="ck-version-state">Past</span>
|
|
95
75
|
<%= button_to "Make current", publish_draft_metric_path(@metric, draft_id: v.id),
|
|
96
76
|
method: :post, form_class: "inline-block",
|
|
97
77
|
class: "ck-chip ck-chip--publish",
|
|
98
|
-
data: { turbo_confirm: "
|
|
78
|
+
data: { turbo_confirm: "Make #{v.version_label} the version to use? #{v.version_label} will be used in test runs using this metric now. Reviews you have already given stay with the version they were made against." } %>
|
|
99
79
|
<% end %>
|
|
100
80
|
</div>
|
|
101
|
-
<% if version_changed.call(v, pred) %>
|
|
102
|
-
<button type="button" class="ck-cell-link ck-cell-link--delta"
|
|
103
|
-
title="What changed from #{pred.version_label}"
|
|
104
|
-
onclick="document.getElementById('ck-mvdiff-<%= v.id %>').showModal()">Δ</button>
|
|
105
|
-
<% end %>
|
|
106
81
|
</div>
|
|
107
82
|
</td>
|
|
83
|
+
<td>
|
|
84
|
+
<% summary = v.change_summary_against(pred) %>
|
|
85
|
+
<% if summary %>
|
|
86
|
+
<button type="button" class="ck-change-link ck-change-link--<%= summary[:magnitude] %>"
|
|
87
|
+
title="Compare with <%= pred.version_label %>"
|
|
88
|
+
onclick="document.getElementById('ck-mvdiff-<%= v.id %>').showModal()"><%= summary[:label] %></button>
|
|
89
|
+
<% else %>
|
|
90
|
+
<span class="ck-meta-copy">—</span>
|
|
91
|
+
<% end %>
|
|
92
|
+
</td>
|
|
108
93
|
<td>
|
|
109
94
|
<% source_label, source_class = case v.source
|
|
110
|
-
when "suggestion" then ["AI
|
|
95
|
+
when "suggestion" then ["AI calibration", "ck-source-chip ck-source-chip--ai"]
|
|
111
96
|
when "edit" then ["Manual edit", "ck-source-chip ck-source-chip--manual"]
|
|
112
97
|
when "revert" then ["Reverted", "ck-source-chip ck-source-chip--revert"]
|
|
113
98
|
else ["Original", "ck-source-chip ck-source-chip--initial"]
|
|
@@ -115,7 +100,15 @@
|
|
|
115
100
|
<span class="<%= source_class %>"><%= source_label %></span>
|
|
116
101
|
</td>
|
|
117
102
|
<td class="ck-meta-copy">
|
|
118
|
-
<
|
|
103
|
+
<div class="ck-version-created">
|
|
104
|
+
<time datetime="<%= v.created_at.utc.iso8601 %>" data-relative-time><%= time_ago_in_words(v.created_at) %> ago</time>
|
|
105
|
+
<% if v.draft? %>
|
|
106
|
+
<%= button_to dismiss_suggestion_metric_path(@metric, draft_id: v.id),
|
|
107
|
+
method: :delete, form_class: "inline-block", class: "ck-icon-btn",
|
|
108
|
+
title: "Discard draft #{v.version_label}", "aria-label": "Discard draft #{v.version_label}",
|
|
109
|
+
data: { turbo_confirm: "Discard draft #{v.version_label}? This can't be undone." } do %><%= heroicon_tag "trash", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
|
|
110
|
+
<% end %>
|
|
111
|
+
</div>
|
|
119
112
|
</td>
|
|
120
113
|
</tr>
|
|
121
114
|
<% end %>
|
|
@@ -125,7 +118,7 @@
|
|
|
125
118
|
|
|
126
119
|
<% @versions.each do |v| %>
|
|
127
120
|
<% pred = predecessor_of[v] %>
|
|
128
|
-
<% next unless
|
|
121
|
+
<% next unless v.change_summary_against(pred) %>
|
|
129
122
|
<dialog id="ck-mvdiff-<%= v.id %>" class="ck-modal" onclick="if(event.target===this)this.close()">
|
|
130
123
|
<article class="ck-modal__panel" tabindex="-1" onclick="event.stopPropagation()">
|
|
131
124
|
<header class="ck-modal__header">
|
|
@@ -148,85 +141,74 @@
|
|
|
148
141
|
</div>
|
|
149
142
|
</div>
|
|
150
143
|
<% end %>
|
|
151
|
-
<%
|
|
144
|
+
<% pred_bands = CompletionKit::Metric.normalize_rubric_bands(pred.rubric_bands) %>
|
|
145
|
+
<% v_bands = CompletionKit::Metric.normalize_rubric_bands(v.rubric_bands) %>
|
|
146
|
+
<% if pred_bands != v_bands %>
|
|
152
147
|
<p class="ck-kicker ck-kicker--inset">Rubric changes</p>
|
|
153
148
|
<%= render "completion_kit/metrics/rubric_diff",
|
|
154
|
-
current_bands:
|
|
155
|
-
draft_bands:
|
|
149
|
+
current_bands: pred_bands,
|
|
150
|
+
draft_bands: v_bands %>
|
|
156
151
|
<% end %>
|
|
157
152
|
</div>
|
|
153
|
+
<footer class="ck-modal__footer ck-modal__footer--split">
|
|
154
|
+
<% if v.current? %>
|
|
155
|
+
<span class="ck-modal__foot-note">This is the metric's published version.</span>
|
|
156
|
+
<% elsif v.draft? %>
|
|
157
|
+
<span class="ck-modal__foot-note">Happy with it? Publish to use <%= v.version_label %> for this metric from now on. Tweak it with Edit.</span>
|
|
158
|
+
<span class="ck-modal__foot-actions">
|
|
159
|
+
<%= button_to dismiss_suggestion_metric_path(@metric, draft_id: v.id),
|
|
160
|
+
method: :delete, form_class: "inline-block", class: "ck-icon-btn",
|
|
161
|
+
title: "Discard draft #{v.version_label}", "aria-label": "Discard draft #{v.version_label}",
|
|
162
|
+
data: { turbo_confirm: "Discard draft #{v.version_label}? This can't be undone." } do %><%= heroicon_tag "trash", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
|
|
163
|
+
<%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
|
|
164
|
+
<%= button_to "Publish #{v.version_label} →", publish_draft_metric_path(@metric, draft_id: v.id),
|
|
165
|
+
method: :post, form_class: "inline-block", class: ck_button_classes(:dark) %>
|
|
166
|
+
</span>
|
|
167
|
+
<% else %>
|
|
168
|
+
<span class="ck-modal__foot-note">Roll this metric back to this version.</span>
|
|
169
|
+
<%= button_to "Make #{v.version_label} current →", publish_draft_metric_path(@metric, draft_id: v.id),
|
|
170
|
+
method: :post, form_class: "inline-block", class: ck_button_classes(:dark),
|
|
171
|
+
data: { turbo_confirm: "Make #{v.version_label} the version to use? #{v.version_label} will be used in test runs using this metric now. Reviews you have already given stay with the version they were made against." } %>
|
|
172
|
+
<% end %>
|
|
173
|
+
</footer>
|
|
158
174
|
</article>
|
|
159
175
|
</dialog>
|
|
160
176
|
<% end %>
|
|
161
177
|
<% end %>
|
|
162
178
|
|
|
163
|
-
<% if CompletionKit.config.judge_calibration_enabled
|
|
179
|
+
<% if CompletionKit.config.judge_calibration_enabled %>
|
|
164
180
|
<section class="ck-card ck-card--spaced">
|
|
165
|
-
<
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
<%
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
<% else %>
|
|
188
|
-
<span class="ck-meta-copy">—</span>
|
|
189
|
-
<% end %>
|
|
190
|
-
<span class="ck-disagreement__scores-arrow">→</span>
|
|
191
|
-
<span class="ck-disagreement__scores-label">Human</span>
|
|
192
|
-
<% if cal.corrected_score %>
|
|
193
|
-
<span class="<%= ck_badge_classes(ck_score_kind(cal.corrected_score.to_f)) %>"><%= cal.corrected_score %></span>
|
|
194
|
-
<% else %>
|
|
195
|
-
<span class="ck-meta-copy">—</span>
|
|
196
|
-
<% end %>
|
|
197
|
-
</div>
|
|
198
|
-
<div class="ck-disagreement__action">
|
|
199
|
-
<% if already %>
|
|
200
|
-
<%= button_to "Forget",
|
|
201
|
-
remove_few_shot_metric_path(@metric, calibration_id: cal.id),
|
|
202
|
-
method: :delete,
|
|
203
|
-
form_class: "inline-block",
|
|
204
|
-
class: ck_button_classes(:light, variant: :outline),
|
|
205
|
-
title: "Stop showing this case to the judge.",
|
|
206
|
-
data: { turbo_confirm: "Stop showing this case to the judge?" } %>
|
|
207
|
-
<span class="ck-chip ck-chip--done" title="The judge sees this row when it grades for this metric.">Remembered</span>
|
|
208
|
-
<% else %>
|
|
209
|
-
<%= button_to "Remember this",
|
|
210
|
-
add_few_shot_metric_path(@metric, calibration_id: cal.id),
|
|
211
|
-
method: :post,
|
|
212
|
-
form_class: "inline-block",
|
|
213
|
-
class: ck_button_classes(:light, variant: :outline),
|
|
214
|
-
title: "Pin this case so the judge sees it next time it grades for this metric." %>
|
|
215
|
-
<% end %>
|
|
216
|
-
</div>
|
|
217
|
-
</div>
|
|
218
|
-
<% if cal.note.to_s.present? %>
|
|
219
|
-
<p class="ck-disagreement__note"><%= cal.note %></p>
|
|
220
|
-
<% end %>
|
|
221
|
-
<p class="ck-disagreement__source ck-meta-copy">
|
|
222
|
-
<%= link_to run_response_path(cal.response.run, cal.response, anchor: @metric.name.parameterize),
|
|
223
|
-
class: "ck-disagreement__source-link" do %>
|
|
224
|
-
<% case_display = cal.response.row_index.nil? ? "##{cal.response.id}" : (cal.response.row_index + 1).to_s %>
|
|
225
|
-
View case <%= case_display %> in <%= cal.response.run.name.to_s.truncate(50) %> →
|
|
226
|
-
<% end %>
|
|
227
|
-
</p>
|
|
228
|
-
</li>
|
|
229
|
-
<% end %>
|
|
230
|
-
</ul>
|
|
181
|
+
<p class="ck-kicker">Calibration</p>
|
|
182
|
+
<p class="ck-meta-copy">This is a measure of how often the judge's scores match a human reviewer. Review its scores to build that signal; the scores you disagree with become the cases the model learns from when you improve the metric.</p>
|
|
183
|
+
<%= render "completion_kit/calibrations/trust_panel",
|
|
184
|
+
stats: CompletionKit::MetricCalibrationStats.for(@metric),
|
|
185
|
+
metric: @metric %>
|
|
186
|
+
<% draft = @suggestion_draft || @edit_draft %>
|
|
187
|
+
<% if draft %>
|
|
188
|
+
<div class="ck-cal-foot">
|
|
189
|
+
<span class="ck-cal-foot__note">A draft improvement (<%= draft.version_label %>) is waiting in the Versions table above. Open its change to compare, then Publish to use it.</span>
|
|
190
|
+
</div>
|
|
191
|
+
<% elsif @improve_disagreement_count.positive? %>
|
|
192
|
+
<div class="ck-cal-foot">
|
|
193
|
+
<span class="ck-cal-foot__note"><%= pluralize(@improve_disagreement_count, "case") %> where a reviewer's score didn't match the judge.</span>
|
|
194
|
+
<%= button_to suggest_variants_metric_path(@metric),
|
|
195
|
+
method: :post, form_class: "inline-block",
|
|
196
|
+
class: ck_button_classes(:light, variant: :outline) + " ck-button--sm",
|
|
197
|
+
data: { turbo_confirm: "Draft improvements to this metric from your human reviews? It stays a draft until you compare it and publish." } do %>
|
|
198
|
+
<%= heroicon_tag "sparkles", variant: :outline, class: "ck-magic-icon", "aria-hidden": "true" %>
|
|
199
|
+
Suggest improvements
|
|
200
|
+
<% end %>
|
|
201
|
+
</div>
|
|
202
|
+
<% end %>
|
|
231
203
|
</section>
|
|
232
204
|
<% end %>
|
|
205
|
+
|
|
206
|
+
<% if params[:show_change].present? %>
|
|
207
|
+
<script>
|
|
208
|
+
(function () {
|
|
209
|
+
var dialog = document.getElementById("ck-mvdiff-<%= params[:show_change].to_i %>");
|
|
210
|
+
if (dialog && typeof dialog.showModal === "function") dialog.showModal();
|
|
211
|
+
})();
|
|
212
|
+
</script>
|
|
213
|
+
<% end %>
|
|
214
|
+
|
|
@@ -105,7 +105,7 @@
|
|
|
105
105
|
<span class="ck-review-card__metric"><% if review.metric %><%= link_to review.metric_name, metric_path(review.metric), class: "ck-link" %><% else %><%= review.metric_name %><% end %></span>
|
|
106
106
|
<div class="ck-inline">
|
|
107
107
|
<% if review_version %>
|
|
108
|
-
<span class="ck-source-chip <%= stale ? "ck-source-chip--past" : "ck-source-chip--current" %>" title="<%= stale ? "
|
|
108
|
+
<span class="ck-source-chip <%= stale ? "ck-source-chip--past" : "ck-source-chip--current" %>" title="<%= stale ? "Scored against #{review_version.version_label} of this metric. The metric has been republished since." : "Scored against the metric's current version (#{review_version.version_label})." %>"><%= review_version.version_label %></span>
|
|
109
109
|
<% end %>
|
|
110
110
|
<% if review.ai_score %>
|
|
111
111
|
<% 5.times do |i| %>
|
|
@@ -117,7 +117,7 @@
|
|
|
117
117
|
</div>
|
|
118
118
|
</div>
|
|
119
119
|
<% if stale %>
|
|
120
|
-
<p class="ck-review-card__stale-note">Scored against a superseded version of this metric.
|
|
120
|
+
<p class="ck-review-card__stale-note">Scored against a superseded version of this metric. Its current version may score this differently.</p>
|
|
121
121
|
<% end %>
|
|
122
122
|
<% if review.ai_feedback.present? %>
|
|
123
123
|
<p class="ck-review-card__feedback"><%= review.ai_feedback %></p>
|
|
@@ -23,25 +23,25 @@
|
|
|
23
23
|
<% if stale_summary.any? %>
|
|
24
24
|
<div class="ck-stale-versions-banner" role="status">
|
|
25
25
|
<div class="ck-stale-versions-banner__body">
|
|
26
|
-
<p class="ck-kicker">Stale
|
|
26
|
+
<p class="ck-kicker">Stale metric versions</p>
|
|
27
27
|
<p class="ck-meta-copy">
|
|
28
28
|
This run was scored against metric versions that are no longer live.
|
|
29
29
|
<% stale_summary.values.each_with_index do |s, i| %>
|
|
30
30
|
<%= ", " if i > 0 %><strong><%= s[:metric_name] %></strong> (scored by <%= s[:scored_labels].join(", ") %>; live is <%= s[:current_label] %>)<% end %>.
|
|
31
|
-
Re-run to refresh the scores with the current
|
|
31
|
+
Re-run to refresh the scores with the current metrics.
|
|
32
32
|
</p>
|
|
33
33
|
</div>
|
|
34
34
|
<% if @run.status == "completed" %>
|
|
35
35
|
<%= button_to "Re-run from scratch",
|
|
36
36
|
rerun_run_path(@run), method: :post,
|
|
37
37
|
class: ck_button_classes(:light, variant: :outline), form_class: "inline-block",
|
|
38
|
-
title: "Create a new run that regenerates responses and grades them with the current
|
|
39
|
-
data: { turbo_confirm: "Create a new run with fresh responses and the current
|
|
40
|
-
<%= button_to "Re-grade with current
|
|
38
|
+
title: "Create a new run that regenerates responses and grades them with the current metrics.",
|
|
39
|
+
data: { turbo_confirm: "Create a new run with fresh responses and the current metrics? The original run stays as a record." } %>
|
|
40
|
+
<%= button_to "Re-grade with current metrics",
|
|
41
41
|
regrade_run_path(@run), method: :post,
|
|
42
42
|
class: ck_button_classes(:dark), form_class: "inline-block",
|
|
43
|
-
title: "Re-
|
|
44
|
-
data: { turbo_confirm: "Re-
|
|
43
|
+
title: "Re-grade this run's existing responses against the current metrics. Faster and cheaper than re-running.",
|
|
44
|
+
data: { turbo_confirm: "Re-grade this run's existing responses against the current metrics?" } %>
|
|
45
45
|
<% end %>
|
|
46
46
|
</div>
|
|
47
47
|
<% end %>
|
data/config/routes.rb
CHANGED
|
@@ -19,8 +19,6 @@ CompletionKit::Engine.routes.draw do
|
|
|
19
19
|
post "starters/:key/dismiss", to: "metrics#dismiss_starter", as: :dismiss_starter
|
|
20
20
|
end
|
|
21
21
|
member do
|
|
22
|
-
post :add_few_shot
|
|
23
|
-
delete :remove_few_shot
|
|
24
22
|
post :publish_draft
|
|
25
23
|
post :suggest_variants
|
|
26
24
|
delete :dismiss_suggestion
|
|
@@ -89,8 +87,6 @@ CompletionKit::Engine.routes.draw do
|
|
|
89
87
|
end
|
|
90
88
|
member do
|
|
91
89
|
post :suggest_variants
|
|
92
|
-
post :add_few_shot
|
|
93
|
-
delete :remove_few_shot
|
|
94
90
|
end
|
|
95
91
|
end
|
|
96
92
|
resources :metric_groups
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: completion-kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.9.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Damien Bastin
|
|
@@ -429,6 +429,7 @@ files:
|
|
|
429
429
|
- db/migrate/20260525000001_add_version_number_and_published_at_to_judge_versions.rb
|
|
430
430
|
- db/migrate/20260528000001_rename_judge_version_to_metric_version.rb
|
|
431
431
|
- db/migrate/20260528000002_add_metric_version_to_reviews.rb
|
|
432
|
+
- db/migrate/20260529000001_remove_few_shot_examples_from_completion_kit_metrics.rb
|
|
432
433
|
- lib/completion-kit.rb
|
|
433
434
|
- lib/completion_kit.rb
|
|
434
435
|
- lib/completion_kit/concurrency_check.rb
|