completion-kit 0.10.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/stylesheets/completion_kit/application.css +118 -55
- data/app/controllers/completion_kit/{calibrations_controller.rb → agreements_controller.rb} +19 -19
- data/app/controllers/completion_kit/api/v1/{calibrations_controller.rb → agreements_controller.rb} +18 -18
- data/app/controllers/completion_kit/api/v1/metric_versions_controller.rb +2 -7
- data/app/controllers/completion_kit/api/v1/metrics_controller.rb +1 -1
- data/app/controllers/completion_kit/metrics_controller.rb +18 -23
- data/app/jobs/completion_kit/judge_review_job.rb +2 -2
- data/app/jobs/completion_kit/metric_suggestion_job.rb +46 -0
- data/app/models/completion_kit/{calibration.rb → agreement.rb} +1 -1
- data/app/models/completion_kit/metric_version.rb +2 -17
- data/app/models/completion_kit/review.rb +1 -0
- data/app/services/completion_kit/{calibration_math.rb → agreement_math.rb} +1 -1
- data/app/services/completion_kit/mcp_dispatcher.rb +2 -2
- data/app/services/completion_kit/mcp_tools/{calibrations.rb → agreements.rb} +11 -11
- data/app/services/completion_kit/mcp_tools/judges.rb +3 -3
- data/app/services/completion_kit/mcp_tools/metric_versions.rb +2 -7
- data/app/services/completion_kit/{metric_calibration_examples.rb → metric_agreement_examples.rb} +6 -6
- data/app/services/completion_kit/{metric_calibration_stats.rb → metric_agreement_stats.rb} +6 -6
- data/app/services/completion_kit/metric_improvement_validator.rb +101 -0
- data/app/services/completion_kit/metric_variant_generator.rb +2 -2
- data/app/views/completion_kit/{calibrations → agreements}/_buttons.html.erb +33 -33
- data/app/views/completion_kit/{calibrations → agreements}/_trust_panel.html.erb +6 -9
- data/app/views/completion_kit/api_reference/_body.html.erb +15 -15
- data/app/views/completion_kit/metrics/_guiding_examples.html.erb +1 -1
- data/app/views/completion_kit/metrics/_suggestion_failed.html.erb +3 -0
- data/app/views/completion_kit/metrics/_suggestion_pending.html.erb +3 -0
- data/app/views/completion_kit/metrics/_suggestion_ready.html.erb +4 -0
- data/app/views/completion_kit/metrics/_validation_scoreboard.html.erb +12 -0
- data/app/views/completion_kit/metrics/edit.html.erb +1 -1
- data/app/views/completion_kit/metrics/show.html.erb +25 -11
- data/app/views/completion_kit/responses/show.html.erb +4 -4
- data/app/views/completion_kit/runs/show.html.erb +1 -1
- data/config/routes.rb +3 -3
- data/db/migrate/20260531000001_add_validation_summary_to_completion_kit_metric_versions.rb +5 -0
- data/db/migrate/20260531000002_backfill_review_metric_versions.rb +33 -0
- data/db/migrate/20260531000003_add_metric_version_fk_to_reviews.rb +6 -0
- data/db/migrate/20260531000004_rename_calibrations_to_agreements.rb +19 -0
- data/lib/completion_kit/version.rb +1 -1
- data/lib/completion_kit.rb +2 -2
- metadata +20 -10
|
@@ -17,7 +17,7 @@
|
|
|
17
17
|
<input type="radio" name="ck-api-tab" id="ck-tab-datasets" class="ck-api-tabs__radio">
|
|
18
18
|
<input type="radio" name="ck-api-tab" id="ck-tab-metrics" class="ck-api-tabs__radio">
|
|
19
19
|
<input type="radio" name="ck-api-tab" id="ck-tab-metric-groups" class="ck-api-tabs__radio">
|
|
20
|
-
<input type="radio" name="ck-api-tab" id="ck-tab-
|
|
20
|
+
<input type="radio" name="ck-api-tab" id="ck-tab-agreements" class="ck-api-tabs__radio">
|
|
21
21
|
<input type="radio" name="ck-api-tab" id="ck-tab-tags" class="ck-api-tabs__radio">
|
|
22
22
|
<input type="radio" name="ck-api-tab" id="ck-tab-providers" class="ck-api-tabs__radio">
|
|
23
23
|
|
|
@@ -29,7 +29,7 @@
|
|
|
29
29
|
<label for="ck-tab-datasets" class="ck-api-tabs__label">Datasets <span class="ck-api-tabs__count">5</span></label>
|
|
30
30
|
<label for="ck-tab-metrics" class="ck-api-tabs__label">Metrics <span class="ck-api-tabs__count">10</span></label>
|
|
31
31
|
<label for="ck-tab-metric-groups" class="ck-api-tabs__label">Metric Groups <span class="ck-api-tabs__count">5</span></label>
|
|
32
|
-
<label for="ck-tab-
|
|
32
|
+
<label for="ck-tab-agreements" class="ck-api-tabs__label">Agreements <span class="ck-api-tabs__count">3</span></label>
|
|
33
33
|
<label for="ck-tab-tags" class="ck-api-tabs__label">Tags <span class="ck-api-tabs__count">5</span></label>
|
|
34
34
|
<label for="ck-tab-providers" class="ck-api-tabs__label">Providers <span class="ck-api-tabs__count">5</span></label>
|
|
35
35
|
</nav>
|
|
@@ -238,8 +238,8 @@
|
|
|
238
238
|
} %>
|
|
239
239
|
|
|
240
240
|
<div class="ck-api-endpoint" style="padding-top: 1.5rem;">
|
|
241
|
-
<p class="ck-kicker" style="margin-bottom: 0.5rem;">
|
|
242
|
-
<p class="ck-meta-copy">Drive metric improvement from disagree-flagged
|
|
241
|
+
<p class="ck-kicker" style="margin-bottom: 0.5rem;">Agreement loop</p>
|
|
242
|
+
<p class="ck-meta-copy">Drive metric improvement from disagree-flagged agreements: ask the model to rewrite the instruction and rubric into a new draft version.</p>
|
|
243
243
|
</div>
|
|
244
244
|
<div class="ck-api-endpoint">
|
|
245
245
|
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/metrics/:id/suggest_variants</p>
|
|
@@ -250,7 +250,7 @@
|
|
|
250
250
|
|
|
251
251
|
<div class="ck-api-endpoint" style="padding-top: 1.5rem;">
|
|
252
252
|
<p class="ck-kicker" style="margin-bottom: 0.5rem;">Metric versions</p>
|
|
253
|
-
<p class="ck-meta-copy">Every metric carries a history of versions (the current published one, prior published ones, and any draft suggestions). Reviews and
|
|
253
|
+
<p class="ck-meta-copy">Every metric carries a history of versions (the current published one, prior published ones, and any draft suggestions). Reviews and agreements record the version they ran against, so the API can surface stale state and let you revert.</p>
|
|
254
254
|
</div>
|
|
255
255
|
<div class="ck-api-endpoint">
|
|
256
256
|
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> /api/v1/metrics/:metric_id/metric_versions</p>
|
|
@@ -294,23 +294,23 @@
|
|
|
294
294
|
</div>
|
|
295
295
|
|
|
296
296
|
<div class="ck-api-tabs__panel">
|
|
297
|
-
<h2 class="ck-section-title">
|
|
298
|
-
<p class="ck-copy">Per-verdict feedback events on a response/metric pair: agree, disagree (with a corrected score and note), or borderline.
|
|
297
|
+
<h2 class="ck-section-title">Agreements</h2>
|
|
298
|
+
<p class="ck-copy">Per-verdict feedback events on a response/metric pair: agree, disagree (with a corrected score and note), or borderline. Agreements capture the metric version that was current when the verdict was cast, which is what drives the trust signal and the "stale" indicators across the rest of the API.</p>
|
|
299
299
|
<div class="ck-api-endpoint">
|
|
300
|
-
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> /api/v1/
|
|
301
|
-
<p class="ck-meta-copy">List
|
|
300
|
+
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> /api/v1/agreements</p>
|
|
301
|
+
<p class="ck-meta-copy">List agreements across all runs. Supports filtering by any combination of the query params below.</p>
|
|
302
302
|
<p class="ck-api-params"><strong>Optional filters:</strong> <code>run_id</code>, <code>response_id</code>, <code>metric_id</code>, <code>metric_version_id</code>, <code>created_by</code>, <code>verdict</code> (<code>agree</code>, <code>disagree</code>, or <code>borderline</code>)</p>
|
|
303
|
-
<%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl \"#{base_url}/api/v1/
|
|
303
|
+
<%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl \"#{base_url}/api/v1/agreements?metric_id=1&verdict=disagree\" \\\n -H \"Authorization: Bearer #{token}\"" %>
|
|
304
304
|
</div>
|
|
305
305
|
<div class="ck-api-endpoint">
|
|
306
|
-
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/runs/:run_id/responses/:response_id/metrics/:metric_id/
|
|
307
|
-
<p class="ck-meta-copy">Cast
|
|
306
|
+
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/runs/:run_id/responses/:response_id/metrics/:metric_id/agreements</p>
|
|
307
|
+
<p class="ck-meta-copy">Cast an agreement on a specific response/metric pair. The metric version on the record is set automatically from the run's review.</p>
|
|
308
308
|
<p class="ck-api-params"><strong>Required:</strong> <code>verdict</code>, <code>created_by</code> <strong>Optional:</strong> <code>corrected_score</code>, <code>note</code></p>
|
|
309
|
-
<%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/runs/1/responses/42/metrics/3/
|
|
309
|
+
<%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/runs/1/responses/42/metrics/3/agreements \\\n -H \"Authorization: Bearer #{token}\" \\\n -H \"Content-Type: application/json\" \\\n -d '{\"verdict\": \"disagree\", \"corrected_score\": 3, \"note\": \"too generous\", \"created_by\": \"alice\"}'" %>
|
|
310
310
|
</div>
|
|
311
311
|
<div class="ck-api-endpoint">
|
|
312
|
-
<p class="ck-api-method"><span class="ck-chip" style="color: var(--ck-danger);">DELETE</span> /api/v1/
|
|
313
|
-
<p class="ck-meta-copy">Delete
|
|
312
|
+
<p class="ck-api-method"><span class="ck-chip" style="color: var(--ck-danger);">DELETE</span> /api/v1/agreements/:id</p>
|
|
313
|
+
<p class="ck-meta-copy">Delete an agreement. Returns 204 No Content.</p>
|
|
314
314
|
</div>
|
|
315
315
|
</div>
|
|
316
316
|
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
<span class="ck-guiding__output"><%= truncate(example[:output].to_s, length: 90) %></span>
|
|
13
13
|
<span class="ck-guiding__scores"><span class="ck-guiding__judge"><%= example[:judge_score].to_i %></span> → <span class="ck-guiding__human"><%= example[:human_score].to_i %></span></span>
|
|
14
14
|
<% end %>
|
|
15
|
-
<%= button_to exclude_example_metric_path(metric,
|
|
15
|
+
<%= button_to exclude_example_metric_path(metric, agreement_id: example[:id]),
|
|
16
16
|
method: :post, form_class: "inline-block", class: "ck-icon-btn",
|
|
17
17
|
title: "Stop using this case", "aria-label": "Stop using this case",
|
|
18
18
|
data: { turbo_confirm: "Stop using this corrected case to guide the judge?" } do %><%= heroicon_tag "x-mark", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
<div id="ck-suggestion-status-<%= metric.id %>" class="ck-suggestion-status ck-suggestion-status--ready">
|
|
2
|
+
<span class="ck-cal-foot__note">Drafted <%= draft.version_label %> and tested it against your reviews.</span>
|
|
3
|
+
<%= link_to "Compare and publish →", CompletionKit::Engine.routes.url_helpers.metric_path(metric, show_change: draft.id), class: "ck-cal-link" %>
|
|
4
|
+
</div>
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
<% s = summary %>
|
|
2
|
+
<div class="ck-scoreboard">
|
|
3
|
+
<p class="ck-scoreboard__headline">Matches you on <strong><%= s["after"] %> of <%= s["tested"] %></strong> of your reviews <span class="ck-scoreboard__was">was <%= s["before"] %> of <%= s["tested"] %></span></p>
|
|
4
|
+
<ul class="ck-scoreboard__tally">
|
|
5
|
+
<li class="ck-scoreboard__stat ck-scoreboard__stat--fix">Fixes <strong><%= s["fixes"] %></strong></li>
|
|
6
|
+
<li class="ck-scoreboard__stat ck-scoreboard__stat--keep">Keeps <strong><%= s["keeps"] %></strong></li>
|
|
7
|
+
<li class="ck-scoreboard__stat ck-scoreboard__stat--break">Breaks <strong><%= s["breaks"] %></strong></li>
|
|
8
|
+
</ul>
|
|
9
|
+
<% if s["capped"] %>
|
|
10
|
+
<p class="ck-scoreboard__note">Tested against your 30 most recent reviews.</p>
|
|
11
|
+
<% end %>
|
|
12
|
+
</div>
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
</div>
|
|
11
11
|
</section>
|
|
12
12
|
|
|
13
|
-
<% if CompletionKit.config.
|
|
13
|
+
<% if CompletionKit.config.judge_agreement_enabled && @suggestion_draft.nil? && @edit_draft.nil? && @improve_disagreement_count.to_i.positive? %>
|
|
14
14
|
<div class="ck-suggestion-banner" role="status">
|
|
15
15
|
<div class="ck-suggestion-banner__body">
|
|
16
16
|
<p class="ck-kicker">Improve from reviews</p>
|
|
@@ -42,7 +42,7 @@
|
|
|
42
42
|
</div>
|
|
43
43
|
</section>
|
|
44
44
|
|
|
45
|
-
<% if CompletionKit.config.
|
|
45
|
+
<% if CompletionKit.config.judge_agreement_enabled && @versions.any? %>
|
|
46
46
|
<% predecessor_of = @versions.index_with { |v| @versions.detect { |o| o.version_number < v.version_number } } %>
|
|
47
47
|
<section class="ck-card ck-card--spaced">
|
|
48
48
|
<p class="ck-kicker">Versions</p>
|
|
@@ -73,17 +73,23 @@
|
|
|
73
73
|
<%= button_to "Make current", publish_draft_metric_path(@metric, draft_id: v.id),
|
|
74
74
|
method: :post, form_class: "inline-block",
|
|
75
75
|
class: "ck-chip ck-chip--publish",
|
|
76
|
-
data: { turbo_confirm: "Make #{v.version_label} the version to use?
|
|
76
|
+
data: { turbo_confirm: "Make #{v.version_label} the version to use? It becomes the version used in test runs, and the reviews you gave on it count again. Reviews on the version you're leaving stay with it." } %>
|
|
77
77
|
<% end %>
|
|
78
78
|
</div>
|
|
79
|
+
<% vs = v.validation_summary %>
|
|
79
80
|
<% if summary %>
|
|
80
|
-
<
|
|
81
|
+
<div class="ck-version-change">
|
|
82
|
+
<% if v.draft? && vs.present? %>
|
|
83
|
+
<span class="ck-version-score"><span class="ck-version-score__label">Match</span> <%= vs["after"] %>/<%= vs["tested"] %></span>
|
|
84
|
+
<% end %>
|
|
85
|
+
<button type="button" class="ck-cell-link ck-cell-link--delta" title="What changed from <%= pred.version_label %>" onclick="document.getElementById('ck-mvdiff-<%= v.id %>').showModal()">Δ</button>
|
|
86
|
+
</div>
|
|
81
87
|
<% end %>
|
|
82
88
|
</div>
|
|
83
89
|
</td>
|
|
84
90
|
<td>
|
|
85
91
|
<% source_label, source_class = case v.source
|
|
86
|
-
when "suggestion" then ["AI
|
|
92
|
+
when "suggestion" then ["AI suggestion", "ck-source-chip ck-source-chip--ai"]
|
|
87
93
|
when "edit" then ["Manual edit", "ck-source-chip ck-source-chip--manual"]
|
|
88
94
|
when "revert" then ["Reverted", "ck-source-chip ck-source-chip--revert"]
|
|
89
95
|
else ["Original", "ck-source-chip ck-source-chip--initial"]
|
|
@@ -110,6 +116,7 @@
|
|
|
110
116
|
<% @versions.each do |v| %>
|
|
111
117
|
<% pred = predecessor_of[v] %>
|
|
112
118
|
<% next unless v.change_summary_against(pred) %>
|
|
119
|
+
<% vs = v.validation_summary %>
|
|
113
120
|
<dialog id="ck-mvdiff-<%= v.id %>" class="ck-modal" onclick="if(event.target===this)this.close()">
|
|
114
121
|
<article class="ck-modal__panel" tabindex="-1" onclick="event.stopPropagation()">
|
|
115
122
|
<header class="ck-modal__header">
|
|
@@ -120,6 +127,9 @@
|
|
|
120
127
|
<button type="button" class="ck-modal__close" aria-label="Close" onclick="this.closest('dialog').close()">×</button>
|
|
121
128
|
</header>
|
|
122
129
|
<div class="ck-modal__body">
|
|
130
|
+
<% if v.draft? && vs.present? %>
|
|
131
|
+
<%= render "completion_kit/metrics/validation_scoreboard", summary: vs %>
|
|
132
|
+
<% end %>
|
|
123
133
|
<% if pred.instruction.to_s != v.instruction.to_s %>
|
|
124
134
|
<div class="ck-suggest-diff">
|
|
125
135
|
<div class="ck-suggest-diff__pane">
|
|
@@ -152,14 +162,16 @@
|
|
|
152
162
|
title: "Discard draft #{v.version_label}", "aria-label": "Discard draft #{v.version_label}",
|
|
153
163
|
data: { turbo_confirm: "Discard draft #{v.version_label}? This can't be undone." } do %><%= heroicon_tag "trash", variant: :outline, size: 16, "aria-hidden": "true" %><% end %>
|
|
154
164
|
<%= link_to "Edit", edit_metric_path(@metric), class: ck_button_classes(:light, variant: :outline) %>
|
|
165
|
+
<% net_negative = vs.present? && (vs["after"].to_i < vs["before"].to_i || vs["breaks"].to_i > vs["fixes"].to_i) %>
|
|
155
166
|
<%= button_to "Publish #{v.version_label} →", publish_draft_metric_path(@metric, draft_id: v.id),
|
|
156
|
-
method: :post, form_class: "inline-block", class: ck_button_classes(:dark)
|
|
167
|
+
method: :post, form_class: "inline-block", class: ck_button_classes(:dark),
|
|
168
|
+
data: net_negative ? { turbo_confirm: "This agrees with you less than the current version. Publish anyway?" } : {} %>
|
|
157
169
|
</span>
|
|
158
170
|
<% else %>
|
|
159
171
|
<span class="ck-modal__foot-note">Roll this metric back to this version.</span>
|
|
160
172
|
<%= button_to "Make #{v.version_label} current →", publish_draft_metric_path(@metric, draft_id: v.id),
|
|
161
173
|
method: :post, form_class: "inline-block", class: ck_button_classes(:dark),
|
|
162
|
-
data: { turbo_confirm: "Make #{v.version_label} the version to use?
|
|
174
|
+
data: { turbo_confirm: "Make #{v.version_label} the version to use? It becomes the version used in test runs, and the reviews you gave on it count again. Reviews on the version you're leaving stay with it." } %>
|
|
163
175
|
<% end %>
|
|
164
176
|
</footer>
|
|
165
177
|
</article>
|
|
@@ -167,11 +179,11 @@
|
|
|
167
179
|
<% end %>
|
|
168
180
|
<% end %>
|
|
169
181
|
|
|
170
|
-
<% if CompletionKit.config.
|
|
182
|
+
<% if CompletionKit.config.judge_agreement_enabled %>
|
|
171
183
|
<% draft = @suggestion_draft || @edit_draft %>
|
|
172
184
|
<section class="ck-card ck-card--spaced">
|
|
173
185
|
<div class="ck-prompt-preview__header">
|
|
174
|
-
<p class="ck-kicker">
|
|
186
|
+
<p class="ck-kicker">Agreement</p>
|
|
175
187
|
<% if draft.nil? && @improve_disagreement_count.positive? %>
|
|
176
188
|
<%= button_to suggest_variants_metric_path(@metric),
|
|
177
189
|
method: :post, form_class: "inline-block",
|
|
@@ -182,9 +194,11 @@
|
|
|
182
194
|
<% end %>
|
|
183
195
|
<% end %>
|
|
184
196
|
</div>
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
197
|
+
<%= turbo_stream_from "metric_#{@metric.id}_suggestion" %>
|
|
198
|
+
<div id="ck-suggestion-status-<%= @metric.id %>" class="ck-suggestion-status"></div>
|
|
199
|
+
<p class="ck-meta-copy">How often the judge lands on the same score you would. Review its scores to build that signal, and improve the metric to raise it.</p>
|
|
200
|
+
<%= render "completion_kit/agreements/trust_panel",
|
|
201
|
+
stats: CompletionKit::MetricAgreementStats.for(@metric),
|
|
188
202
|
metric: @metric %>
|
|
189
203
|
<% if CompletionKit.config.judge_examples_from_reviews %>
|
|
190
204
|
<%= render "completion_kit/metrics/guiding_examples", metric: @metric, examples: @guiding_examples %>
|
|
@@ -124,13 +124,13 @@
|
|
|
124
124
|
<% if review.ai_feedback.present? %>
|
|
125
125
|
<p class="ck-review-card__feedback"><%= review.ai_feedback %></p>
|
|
126
126
|
<% end %>
|
|
127
|
-
<% if CompletionKit.config.
|
|
128
|
-
<% existing = CompletionKit::
|
|
127
|
+
<% if CompletionKit.config.judge_agreement_enabled && review.metric && review.ai_score %>
|
|
128
|
+
<% existing = CompletionKit::Agreement.find_by(
|
|
129
129
|
response_id: @response.id, metric_id: review.metric_id,
|
|
130
130
|
created_by: CompletionKit.config.username.presence || "operator"
|
|
131
131
|
) %>
|
|
132
|
-
<%= render "completion_kit/
|
|
133
|
-
review: review,
|
|
132
|
+
<%= render "completion_kit/agreements/buttons",
|
|
133
|
+
review: review, agreement: existing, run: @run,
|
|
134
134
|
response_row: @response, metric: review.metric %>
|
|
135
135
|
<% end %>
|
|
136
136
|
</div>
|
|
@@ -18,7 +18,7 @@
|
|
|
18
18
|
<% dataset_preview_lines = dataset_lines.first(50) %>
|
|
19
19
|
<% end %>
|
|
20
20
|
|
|
21
|
-
<% if CompletionKit.config.
|
|
21
|
+
<% if CompletionKit.config.judge_agreement_enabled %>
|
|
22
22
|
<% stale_summary = @run.stale_review_summary %>
|
|
23
23
|
<% if stale_summary.any? %>
|
|
24
24
|
<div class="ck-stale-versions-banner" role="status">
|
data/config/routes.rb
CHANGED
|
@@ -41,7 +41,7 @@ CompletionKit::Engine.routes.draw do
|
|
|
41
41
|
get :compare
|
|
42
42
|
end
|
|
43
43
|
resources :responses, only: [:show] do
|
|
44
|
-
resources :
|
|
44
|
+
resources :agreements, only: [:create]
|
|
45
45
|
end
|
|
46
46
|
end
|
|
47
47
|
|
|
@@ -75,7 +75,7 @@ CompletionKit::Engine.routes.draw do
|
|
|
75
75
|
end
|
|
76
76
|
resources :responses, only: [:index, :show] do
|
|
77
77
|
resources :metrics, only: [] do
|
|
78
|
-
resources :
|
|
78
|
+
resources :agreements, only: [:index, :create]
|
|
79
79
|
end
|
|
80
80
|
end
|
|
81
81
|
end
|
|
@@ -93,7 +93,7 @@ CompletionKit::Engine.routes.draw do
|
|
|
93
93
|
resources :metric_groups
|
|
94
94
|
resources :tags
|
|
95
95
|
resources :provider_credentials
|
|
96
|
-
resources :
|
|
96
|
+
resources :agreements, only: [:index, :destroy]
|
|
97
97
|
end
|
|
98
98
|
end
|
|
99
99
|
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
class BackfillReviewMetricVersions < ActiveRecord::Migration[8.1]
|
|
2
|
+
def up
|
|
3
|
+
quoted_true = ActiveRecord::Base.connection.quote(true)
|
|
4
|
+
now = ActiveRecord::Base.connection.quote(Time.current)
|
|
5
|
+
|
|
6
|
+
execute <<~SQL
|
|
7
|
+
INSERT INTO completion_kit_metric_versions
|
|
8
|
+
(metric_id, instruction, rubric_bands, current, state, version_number, published_at, created_at, updated_at)
|
|
9
|
+
SELECT m.id, m.instruction, m.rubric_bands, #{quoted_true}, 'published', 1, #{now}, #{now}, #{now}
|
|
10
|
+
FROM completion_kit_metrics m
|
|
11
|
+
WHERE NOT EXISTS (
|
|
12
|
+
SELECT 1 FROM completion_kit_metric_versions mv WHERE mv.metric_id = m.id
|
|
13
|
+
)
|
|
14
|
+
SQL
|
|
15
|
+
|
|
16
|
+
execute <<~SQL
|
|
17
|
+
UPDATE completion_kit_reviews
|
|
18
|
+
SET metric_version_id = (
|
|
19
|
+
SELECT mv.id FROM completion_kit_metric_versions mv
|
|
20
|
+
WHERE mv.metric_id = completion_kit_reviews.metric_id AND mv.current = #{quoted_true}
|
|
21
|
+
LIMIT 1
|
|
22
|
+
)
|
|
23
|
+
WHERE metric_id IS NOT NULL
|
|
24
|
+
AND (
|
|
25
|
+
metric_version_id IS NULL
|
|
26
|
+
OR metric_version_id NOT IN (SELECT id FROM completion_kit_metric_versions)
|
|
27
|
+
)
|
|
28
|
+
SQL
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
def down
|
|
32
|
+
end
|
|
33
|
+
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
class RenameCalibrationsToAgreements < ActiveRecord::Migration[8.1]
|
|
2
|
+
CALIBRATION_INDEXES = {
|
|
3
|
+
"index_ck_calibrations_on_metric_id" => "index_ck_agreements_on_metric_id",
|
|
4
|
+
"index_ck_calibrations_on_metric_version_id" => "index_ck_agreements_on_metric_version_id",
|
|
5
|
+
"index_ck_calibrations_on_response_id" => "index_ck_agreements_on_response_id",
|
|
6
|
+
"index_ck_calibrations_on_run_id" => "index_ck_agreements_on_run_id",
|
|
7
|
+
"index_ck_calibrations_on_response_metric_user" => "index_ck_agreements_on_response_metric_user"
|
|
8
|
+
}.freeze
|
|
9
|
+
|
|
10
|
+
def up
|
|
11
|
+
rename_table :completion_kit_calibrations, :completion_kit_agreements
|
|
12
|
+
CALIBRATION_INDEXES.each { |old_name, new_name| rename_index :completion_kit_agreements, old_name, new_name }
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
def down
|
|
16
|
+
CALIBRATION_INDEXES.each { |old_name, new_name| rename_index :completion_kit_agreements, new_name, old_name }
|
|
17
|
+
rename_table :completion_kit_agreements, :completion_kit_calibrations
|
|
18
|
+
end
|
|
19
|
+
end
|
data/lib/completion_kit.rb
CHANGED
|
@@ -12,7 +12,7 @@ module CompletionKit
|
|
|
12
12
|
attr_accessor :api_reference_authentication_partial
|
|
13
13
|
attr_accessor :api_rate_limit, :web_rate_limit
|
|
14
14
|
attr_accessor :allow_loopback_endpoints
|
|
15
|
-
attr_accessor :
|
|
15
|
+
attr_accessor :judge_agreement_enabled
|
|
16
16
|
attr_accessor :judge_examples_from_reviews
|
|
17
17
|
|
|
18
18
|
def initialize
|
|
@@ -29,7 +29,7 @@ module CompletionKit
|
|
|
29
29
|
@web_rate_limit = 300
|
|
30
30
|
|
|
31
31
|
@allow_loopback_endpoints = true
|
|
32
|
-
@
|
|
32
|
+
@judge_agreement_enabled = true
|
|
33
33
|
@judge_examples_from_reviews = false
|
|
34
34
|
|
|
35
35
|
@api_reference_authentication_partial = "completion_kit/api_reference/authentication"
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: completion-kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.10.0
|
|
4
|
+
version: 0.12.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Damien Bastin
|
|
@@ -234,8 +234,9 @@ files:
|
|
|
234
234
|
- app/assets/images/completion_kit/logo.png
|
|
235
235
|
- app/assets/javascripts/completion_kit/application.js
|
|
236
236
|
- app/assets/stylesheets/completion_kit/application.css
|
|
237
|
+
- app/controllers/completion_kit/agreements_controller.rb
|
|
238
|
+
- app/controllers/completion_kit/api/v1/agreements_controller.rb
|
|
237
239
|
- app/controllers/completion_kit/api/v1/base_controller.rb
|
|
238
|
-
- app/controllers/completion_kit/api/v1/calibrations_controller.rb
|
|
239
240
|
- app/controllers/completion_kit/api/v1/datasets_controller.rb
|
|
240
241
|
- app/controllers/completion_kit/api/v1/metric_groups_controller.rb
|
|
241
242
|
- app/controllers/completion_kit/api/v1/metric_versions_controller.rb
|
|
@@ -247,7 +248,6 @@ files:
|
|
|
247
248
|
- app/controllers/completion_kit/api/v1/tags_controller.rb
|
|
248
249
|
- app/controllers/completion_kit/api_reference_controller.rb
|
|
249
250
|
- app/controllers/completion_kit/application_controller.rb
|
|
250
|
-
- app/controllers/completion_kit/calibrations_controller.rb
|
|
251
251
|
- app/controllers/completion_kit/dashboard_controller.rb
|
|
252
252
|
- app/controllers/completion_kit/dashboard_dismissals_controller.rb
|
|
253
253
|
- app/controllers/completion_kit/datasets_controller.rb
|
|
@@ -266,11 +266,12 @@ files:
|
|
|
266
266
|
- app/jobs/completion_kit/application_job.rb
|
|
267
267
|
- app/jobs/completion_kit/generate_row_job.rb
|
|
268
268
|
- app/jobs/completion_kit/judge_review_job.rb
|
|
269
|
+
- app/jobs/completion_kit/metric_suggestion_job.rb
|
|
269
270
|
- app/jobs/completion_kit/model_discovery_job.rb
|
|
270
271
|
- app/jobs/completion_kit/run_completion_check_job.rb
|
|
271
272
|
- app/mailers/completion_kit/application_mailer.rb
|
|
273
|
+
- app/models/completion_kit/agreement.rb
|
|
272
274
|
- app/models/completion_kit/application_record.rb
|
|
273
|
-
- app/models/completion_kit/calibration.rb
|
|
274
275
|
- app/models/completion_kit/dashboard_dismissal.rb
|
|
275
276
|
- app/models/completion_kit/dataset.rb
|
|
276
277
|
- app/models/completion_kit/mcp_session.rb
|
|
@@ -291,16 +292,16 @@ files:
|
|
|
291
292
|
- app/models/completion_kit/tagging.rb
|
|
292
293
|
- app/models/concerns/completion_kit/has_job_status.rb
|
|
293
294
|
- app/models/concerns/completion_kit/taggable.rb
|
|
295
|
+
- app/services/completion_kit/agreement_math.rb
|
|
294
296
|
- app/services/completion_kit/anthropic_client.rb
|
|
295
297
|
- app/services/completion_kit/api_config.rb
|
|
296
|
-
- app/services/completion_kit/calibration_math.rb
|
|
297
298
|
- app/services/completion_kit/csv_processor.rb
|
|
298
299
|
- app/services/completion_kit/dashboard_stats.rb
|
|
299
300
|
- app/services/completion_kit/judge_service.rb
|
|
300
301
|
- app/services/completion_kit/llm_client.rb
|
|
301
302
|
- app/services/completion_kit/mcp_dispatcher.rb
|
|
303
|
+
- app/services/completion_kit/mcp_tools/agreements.rb
|
|
302
304
|
- app/services/completion_kit/mcp_tools/base.rb
|
|
303
|
-
- app/services/completion_kit/mcp_tools/calibrations.rb
|
|
304
305
|
- app/services/completion_kit/mcp_tools/datasets.rb
|
|
305
306
|
- app/services/completion_kit/mcp_tools/judges.rb
|
|
306
307
|
- app/services/completion_kit/mcp_tools/metric_groups.rb
|
|
@@ -311,8 +312,9 @@ files:
|
|
|
311
312
|
- app/services/completion_kit/mcp_tools/responses.rb
|
|
312
313
|
- app/services/completion_kit/mcp_tools/runs.rb
|
|
313
314
|
- app/services/completion_kit/mcp_tools/tags.rb
|
|
314
|
-
- app/services/completion_kit/metric_calibration_examples.rb
|
|
315
|
-
- app/services/completion_kit/metric_calibration_stats.rb
|
|
315
|
+
- app/services/completion_kit/metric_agreement_examples.rb
|
|
316
|
+
- app/services/completion_kit/metric_agreement_stats.rb
|
|
317
|
+
- app/services/completion_kit/metric_improvement_validator.rb
|
|
316
318
|
- app/services/completion_kit/metric_variant_generator.rb
|
|
317
319
|
- app/services/completion_kit/model_discovery_service.rb
|
|
318
320
|
- app/services/completion_kit/ollama_client.rb
|
|
@@ -326,14 +328,14 @@ files:
|
|
|
326
328
|
- app/services/completion_kit/starter_metrics.rb
|
|
327
329
|
- app/services/completion_kit/worker_health.rb
|
|
328
330
|
- app/validators/completion_kit/tenant_scoped_uniqueness_validator.rb
|
|
331
|
+
- app/views/completion_kit/agreements/_buttons.html.erb
|
|
332
|
+
- app/views/completion_kit/agreements/_trust_panel.html.erb
|
|
329
333
|
- app/views/completion_kit/api_reference/_authentication.html.erb
|
|
330
334
|
- app/views/completion_kit/api_reference/_body.html.erb
|
|
331
335
|
- app/views/completion_kit/api_reference/_example.html.erb
|
|
332
336
|
- app/views/completion_kit/api_reference/_resource_card.html.erb
|
|
333
337
|
- app/views/completion_kit/api_reference/_resource_list.html.erb
|
|
334
338
|
- app/views/completion_kit/api_reference/index.html.erb
|
|
335
|
-
- app/views/completion_kit/calibrations/_buttons.html.erb
|
|
336
|
-
- app/views/completion_kit/calibrations/_trust_panel.html.erb
|
|
337
339
|
- app/views/completion_kit/dashboard/_eye_icon.html.erb
|
|
338
340
|
- app/views/completion_kit/dashboard/_eye_off_icon.html.erb
|
|
339
341
|
- app/views/completion_kit/dashboard/_failures_card.html.erb
|
|
@@ -355,6 +357,10 @@ files:
|
|
|
355
357
|
- app/views/completion_kit/metrics/_rubric_diff.html.erb
|
|
356
358
|
- app/views/completion_kit/metrics/_rubric_hint.html.erb
|
|
357
359
|
- app/views/completion_kit/metrics/_starter_card.html.erb
|
|
360
|
+
- app/views/completion_kit/metrics/_suggestion_failed.html.erb
|
|
361
|
+
- app/views/completion_kit/metrics/_suggestion_pending.html.erb
|
|
362
|
+
- app/views/completion_kit/metrics/_suggestion_ready.html.erb
|
|
363
|
+
- app/views/completion_kit/metrics/_validation_scoreboard.html.erb
|
|
358
364
|
- app/views/completion_kit/metrics/edit.html.erb
|
|
359
365
|
- app/views/completion_kit/metrics/index.html.erb
|
|
360
366
|
- app/views/completion_kit/metrics/new.html.erb
|
|
@@ -433,6 +439,10 @@ files:
|
|
|
433
439
|
- db/migrate/20260528000002_add_metric_version_to_reviews.rb
|
|
434
440
|
- db/migrate/20260529000001_remove_few_shot_examples_from_completion_kit_metrics.rb
|
|
435
441
|
- db/migrate/20260530000001_add_excluded_from_examples_to_completion_kit_calibrations.rb
|
|
442
|
+
- db/migrate/20260531000001_add_validation_summary_to_completion_kit_metric_versions.rb
|
|
443
|
+
- db/migrate/20260531000002_backfill_review_metric_versions.rb
|
|
444
|
+
- db/migrate/20260531000003_add_metric_version_fk_to_reviews.rb
|
|
445
|
+
- db/migrate/20260531000004_rename_calibrations_to_agreements.rb
|
|
436
446
|
- lib/completion-kit.rb
|
|
437
447
|
- lib/completion_kit.rb
|
|
438
448
|
- lib/completion_kit/concurrency_check.rb
|