completion-kit 0.5.44 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/stylesheets/completion_kit/application.css +31 -4
- data/app/controllers/completion_kit/api/v1/base_controller.rb +22 -0
- data/app/controllers/completion_kit/api/v1/calibrations_controller.rb +22 -3
- data/app/controllers/completion_kit/api/v1/datasets_controller.rb +3 -1
- data/app/controllers/completion_kit/api/v1/metric_groups_controller.rb +3 -1
- data/app/controllers/completion_kit/api/v1/metric_versions_controller.rb +51 -0
- data/app/controllers/completion_kit/api/v1/metrics_controller.rb +49 -2
- data/app/controllers/completion_kit/api/v1/prompts_controller.rb +3 -1
- data/app/controllers/completion_kit/api/v1/provider_credentials_controller.rb +1 -1
- data/app/controllers/completion_kit/api/v1/responses_controller.rb +3 -1
- data/app/controllers/completion_kit/api/v1/runs_controller.rb +75 -2
- data/app/controllers/completion_kit/api/v1/tags_controller.rb +1 -1
- data/app/controllers/completion_kit/metrics_controller.rb +15 -5
- data/app/controllers/completion_kit/runs_controller.rb +64 -2
- data/app/helpers/completion_kit/application_helper.rb +0 -14
- data/app/jobs/completion_kit/generate_row_job.rb +3 -8
- data/app/jobs/completion_kit/judge_review_job.rb +6 -9
- data/app/models/completion_kit/calibration.rb +0 -4
- data/app/models/completion_kit/metric.rb +1 -0
- data/app/models/completion_kit/metric_version.rb +16 -1
- data/app/models/completion_kit/response.rb +13 -17
- data/app/models/completion_kit/review.rb +18 -22
- data/app/models/completion_kit/run.rb +58 -22
- data/app/models/concerns/completion_kit/has_job_status.rb +31 -0
- data/app/services/completion_kit/mcp_dispatcher.rb +3 -1
- data/app/services/completion_kit/mcp_tools/judges.rb +2 -4
- data/app/services/completion_kit/mcp_tools/metric_versions.rb +67 -0
- data/app/services/completion_kit/metric_variant_generator.rb +20 -6
- data/app/services/completion_kit/starter_metrics.rb +5 -5
- data/app/views/completion_kit/api_reference/_body.html.erb +91 -6
- data/app/views/completion_kit/api_reference/index.html.erb +8 -0
- data/app/views/completion_kit/calibrations/_trust_panel.html.erb +6 -1
- data/app/views/completion_kit/metrics/index.html.erb +3 -3
- data/app/views/completion_kit/metrics/show.html.erb +2 -1
- data/app/views/completion_kit/runs/_actions.html.erb +1 -0
- data/app/views/completion_kit/runs/compare.html.erb +85 -0
- data/app/views/completion_kit/runs/compare_picker.html.erb +39 -0
- data/app/views/completion_kit/runs/show.html.erb +8 -2
- data/config/routes.rb +18 -1
- data/lib/completion_kit/version.rb +1 -1
- metadata +6 -1
|
@@ -17,17 +17,19 @@
|
|
|
17
17
|
<input type="radio" name="ck-api-tab" id="ck-tab-datasets" class="ck-api-tabs__radio">
|
|
18
18
|
<input type="radio" name="ck-api-tab" id="ck-tab-metrics" class="ck-api-tabs__radio">
|
|
19
19
|
<input type="radio" name="ck-api-tab" id="ck-tab-metric-groups" class="ck-api-tabs__radio">
|
|
20
|
+
<input type="radio" name="ck-api-tab" id="ck-tab-calibrations" class="ck-api-tabs__radio">
|
|
20
21
|
<input type="radio" name="ck-api-tab" id="ck-tab-tags" class="ck-api-tabs__radio">
|
|
21
22
|
<input type="radio" name="ck-api-tab" id="ck-tab-providers" class="ck-api-tabs__radio">
|
|
22
23
|
|
|
23
24
|
<nav class="ck-api-tabs__nav">
|
|
24
|
-
<label for="ck-tab-mcp" class="ck-api-tabs__label">MCP <span class="ck-api-tabs__count"
|
|
25
|
+
<label for="ck-tab-mcp" class="ck-api-tabs__label">MCP <span class="ck-api-tabs__count"><%= CompletionKit::McpDispatcher.tool_definitions.size %></span></label>
|
|
25
26
|
<label for="ck-tab-prompts" class="ck-api-tabs__label">Prompts <span class="ck-api-tabs__count">6</span></label>
|
|
26
|
-
<label for="ck-tab-runs" class="ck-api-tabs__label">Runs <span class="ck-api-tabs__count">
|
|
27
|
+
<label for="ck-tab-runs" class="ck-api-tabs__label">Runs <span class="ck-api-tabs__count">10</span></label>
|
|
27
28
|
<label for="ck-tab-responses" class="ck-api-tabs__label">Responses <span class="ck-api-tabs__count">2</span></label>
|
|
28
29
|
<label for="ck-tab-datasets" class="ck-api-tabs__label">Datasets <span class="ck-api-tabs__count">5</span></label>
|
|
29
|
-
<label for="ck-tab-metrics" class="ck-api-tabs__label">Metrics <span class="ck-api-tabs__count">
|
|
30
|
+
<label for="ck-tab-metrics" class="ck-api-tabs__label">Metrics <span class="ck-api-tabs__count">12</span></label>
|
|
30
31
|
<label for="ck-tab-metric-groups" class="ck-api-tabs__label">Metric Groups <span class="ck-api-tabs__count">5</span></label>
|
|
32
|
+
<label for="ck-tab-calibrations" class="ck-api-tabs__label">Calibrations <span class="ck-api-tabs__count">3</span></label>
|
|
31
33
|
<label for="ck-tab-tags" class="ck-api-tabs__label">Tags <span class="ck-api-tabs__count">5</span></label>
|
|
32
34
|
<label for="ck-tab-providers" class="ck-api-tabs__label">Providers <span class="ck-api-tabs__count">5</span></label>
|
|
33
35
|
</nav>
|
|
@@ -36,7 +38,7 @@
|
|
|
36
38
|
|
|
37
39
|
<div class="ck-api-tabs__panel">
|
|
38
40
|
<h2 class="ck-section-title">MCP Server</h2>
|
|
39
|
-
<p class="ck-copy">Connect Claude Code, Cursor, or any <a href="https://modelcontextprotocol.io" class="ck-link">MCP</a> client to manage prompts, runs, datasets, and metrics conversationally.
|
|
41
|
+
<p class="ck-copy">Connect Claude Code, Cursor, or any <a href="https://modelcontextprotocol.io" class="ck-link">MCP</a> client to manage prompts, runs, datasets, and metrics conversationally. <%= CompletionKit::McpDispatcher.tool_definitions.size %> tools over streamable HTTP.</p>
|
|
40
42
|
|
|
41
43
|
<div class="ck-mcp-install-grid">
|
|
42
44
|
<div class="ck-mcp-install-card">
|
|
@@ -116,7 +118,8 @@
|
|
|
116
118
|
<p class="ck-copy">Create runs, generate LLM responses, and judge them with metrics.</p>
|
|
117
119
|
<div class="ck-api-endpoint">
|
|
118
120
|
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> /api/v1/runs</p>
|
|
119
|
-
<p class="ck-meta-copy">List
|
|
121
|
+
<p class="ck-meta-copy">List runs with response counts and average scores. Supports pagination (<code>limit</code>, <code>offset</code>) and the following filters.</p>
|
|
122
|
+
<p class="ck-api-params"><strong>Optional filters:</strong> <code>status</code> (<code>pending</code>, <code>running</code>, <code>completed</code>, <code>failed</code>), <code>prompt_id</code>, <code>dataset_id</code>, <code>tag[]</code></p>
|
|
120
123
|
</div>
|
|
121
124
|
<div class="ck-api-endpoint">
|
|
122
125
|
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/runs</p>
|
|
@@ -139,6 +142,24 @@
|
|
|
139
142
|
<p class="ck-meta-copy">Start generating responses. Returns 202 Accepted. Poll the run to check progress.</p>
|
|
140
143
|
<%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/runs/1/generate \\\n -H \"Authorization: Bearer #{token}\"" %>
|
|
141
144
|
</div>
|
|
145
|
+
<div class="ck-api-endpoint">
|
|
146
|
+
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/runs/:id/retry_failures</p>
|
|
147
|
+
<p class="ck-meta-copy">Re-queue any responses that failed during generation. Returns 202 Accepted.</p>
|
|
148
|
+
</div>
|
|
149
|
+
<div class="ck-api-endpoint">
|
|
150
|
+
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/runs/:id/rerun</p>
|
|
151
|
+
<p class="ck-meta-copy">Clone the run and start generating responses on the copy against the current prompt and metric versions. Returns the new run with 201 Created. Useful for capturing a fresh baseline after metric edits.</p>
|
|
152
|
+
<%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/runs/1/rerun \\\n -H \"Authorization: Bearer #{token}\"" %>
|
|
153
|
+
</div>
|
|
154
|
+
<div class="ck-api-endpoint">
|
|
155
|
+
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/runs/:id/regrade</p>
|
|
156
|
+
<p class="ck-meta-copy">Re-judge the existing successful responses against the current metric versions without regenerating model output. Returns 202 Accepted, or 422 if no responses are eligible.</p>
|
|
157
|
+
</div>
|
|
158
|
+
<div class="ck-api-endpoint">
|
|
159
|
+
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> /api/v1/runs/:id/compare?with=:other_id</p>
|
|
160
|
+
<p class="ck-meta-copy">Side-by-side comparison against another run. Returns <code>{rows: [...], metric_ids: [...]}</code> with one row per input case, per-metric scores on both sides, and the delta. Cases that exist on only one side are still returned with the missing side nulled out.</p>
|
|
161
|
+
<%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl \"#{base_url}/api/v1/runs/1/compare?with=2\" \\\n -H \"Authorization: Bearer #{token}\"" %>
|
|
162
|
+
</div>
|
|
142
163
|
<div class="ck-api-endpoint">
|
|
143
164
|
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">PATCH</span> /api/v1/runs/:id</p>
|
|
144
165
|
<p class="ck-meta-copy">Update a run. Accepts same params as create.</p>
|
|
@@ -154,7 +175,8 @@
|
|
|
154
175
|
<p class="ck-copy">Read-only access to generated responses and their review scores. Nested under runs.</p>
|
|
155
176
|
<div class="ck-api-endpoint">
|
|
156
177
|
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> /api/v1/runs/:run_id/responses</p>
|
|
157
|
-
<p class="ck-meta-copy">List
|
|
178
|
+
<p class="ck-meta-copy">List responses for a run, including nested review scores.</p>
|
|
179
|
+
<p class="ck-api-params"><strong>Optional filters:</strong> <code>status</code> (<code>pending</code>, <code>succeeded</code>, <code>failed</code>), plus <code>limit</code> and <code>offset</code></p>
|
|
158
180
|
<%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl #{base_url}/api/v1/runs/1/responses \\\n -H \"Authorization: Bearer #{token}\"" %>
|
|
159
181
|
</div>
|
|
160
182
|
<div class="ck-api-endpoint">
|
|
@@ -214,6 +236,48 @@
|
|
|
214
236
|
{ name: m.name, subtitle: m.instruction.presence&.truncate(100),
|
|
215
237
|
url: "#{base_url}/api/v1/metrics/#{m.id}", dom_id: "metric_ep_#{m.id}" }
|
|
216
238
|
} %>
|
|
239
|
+
|
|
240
|
+
<div class="ck-api-endpoint" style="padding-top: 1.5rem;">
|
|
241
|
+
<p class="ck-kicker" style="margin-bottom: 0.5rem;">Calibration loop</p>
|
|
242
|
+
<p class="ck-meta-copy">Drive metric improvement from disagree-flagged calibrations: ask the model for variants, then pin individual cases as few-shot examples on the metric.</p>
|
|
243
|
+
</div>
|
|
244
|
+
<div class="ck-api-endpoint">
|
|
245
|
+
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/metrics/:id/suggest_variants</p>
|
|
246
|
+
<p class="ck-meta-copy">Generate draft metric versions from the current disagreements. Returns 201 with the new draft versions, 422 if no disagreements exist or the model produced nothing usable.</p>
|
|
247
|
+
<p class="ck-api-params"><strong>Optional:</strong> <code>count</code>, <code>model</code></p>
|
|
248
|
+
<%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/metrics/1/suggest_variants \\\n -H \"Authorization: Bearer #{token}\"" %>
|
|
249
|
+
</div>
|
|
250
|
+
<div class="ck-api-endpoint">
|
|
251
|
+
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/metrics/:id/add_few_shot</p>
|
|
252
|
+
<p class="ck-meta-copy">Pin a disagree calibration as a few-shot example on the metric. Returns the updated metric, or 404 if the calibration is not a disagree on this metric.</p>
|
|
253
|
+
<p class="ck-api-params"><strong>Required:</strong> <code>calibration_id</code></p>
|
|
254
|
+
</div>
|
|
255
|
+
<div class="ck-api-endpoint">
|
|
256
|
+
<p class="ck-api-method"><span class="ck-chip" style="color: var(--ck-danger);">DELETE</span> /api/v1/metrics/:id/remove_few_shot</p>
|
|
257
|
+
<p class="ck-meta-copy">Drop the pinned few-shot example by calibration ID.</p>
|
|
258
|
+
<p class="ck-api-params"><strong>Required:</strong> <code>calibration_id</code></p>
|
|
259
|
+
</div>
|
|
260
|
+
|
|
261
|
+
<div class="ck-api-endpoint" style="padding-top: 1.5rem;">
|
|
262
|
+
<p class="ck-kicker" style="margin-bottom: 0.5rem;">Metric versions</p>
|
|
263
|
+
<p class="ck-meta-copy">Every metric carries a history of versions (the current published one, prior published ones, and any draft suggestions). Reviews and calibrations record the version they ran against, so the API can surface stale state and let you revert.</p>
|
|
264
|
+
</div>
|
|
265
|
+
<div class="ck-api-endpoint">
|
|
266
|
+
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> /api/v1/metrics/:metric_id/metric_versions</p>
|
|
267
|
+
<p class="ck-meta-copy">List every version for the metric, newest version_number first.</p>
|
|
268
|
+
</div>
|
|
269
|
+
<div class="ck-api-endpoint">
|
|
270
|
+
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> /api/v1/metrics/:metric_id/metric_versions/:id</p>
|
|
271
|
+
<p class="ck-meta-copy">Get a single version with its instruction, rubric bands, state, and source.</p>
|
|
272
|
+
</div>
|
|
273
|
+
<div class="ck-api-endpoint">
|
|
274
|
+
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/metrics/:metric_id/metric_versions/:id/publish</p>
|
|
275
|
+
<p class="ck-meta-copy">Publish the version as current. Works for a draft (promote) or a superseded published version (revert). Copies the version's instruction and rubric back onto the metric.</p>
|
|
276
|
+
</div>
|
|
277
|
+
<div class="ck-api-endpoint">
|
|
278
|
+
<p class="ck-api-method"><span class="ck-chip" style="color: var(--ck-danger);">DELETE</span> /api/v1/metrics/:metric_id/metric_versions/:id</p>
|
|
279
|
+
<p class="ck-meta-copy">Dismiss a draft version. Returns 204 No Content, or 409 Conflict if the version is published (published versions are immutable history).</p>
|
|
280
|
+
</div>
|
|
217
281
|
</div>
|
|
218
282
|
|
|
219
283
|
<div class="ck-api-tabs__panel">
|
|
@@ -239,6 +303,27 @@
|
|
|
239
303
|
} %>
|
|
240
304
|
</div>
|
|
241
305
|
|
|
306
|
+
<div class="ck-api-tabs__panel">
|
|
307
|
+
<h2 class="ck-section-title">Calibrations</h2>
|
|
308
|
+
<p class="ck-copy">Per-verdict feedback events on a response/metric pair: agree, disagree (with a corrected score and note), or borderline. Calibrations capture the metric version that was current when the verdict was cast, which is what drives the trust signal and the "stale" indicators across the rest of the API.</p>
|
|
309
|
+
<div class="ck-api-endpoint">
|
|
310
|
+
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> /api/v1/calibrations</p>
|
|
311
|
+
<p class="ck-meta-copy">List calibrations across all runs. Supports filtering by any combination of the query params below.</p>
|
|
312
|
+
<p class="ck-api-params"><strong>Optional filters:</strong> <code>run_id</code>, <code>response_id</code>, <code>metric_id</code>, <code>metric_version_id</code>, <code>created_by</code>, <code>verdict</code> (<code>agree</code>, <code>disagree</code>, or <code>borderline</code>)</p>
|
|
313
|
+
<%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl \"#{base_url}/api/v1/calibrations?metric_id=1&verdict=disagree\" \\\n -H \"Authorization: Bearer #{token}\"" %>
|
|
314
|
+
</div>
|
|
315
|
+
<div class="ck-api-endpoint">
|
|
316
|
+
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/runs/:run_id/responses/:response_id/metrics/:metric_id/calibrations</p>
|
|
317
|
+
<p class="ck-meta-copy">Cast a calibration on a specific response/metric pair. The metric version on the record is set automatically from the run's review.</p>
|
|
318
|
+
<p class="ck-api-params"><strong>Required:</strong> <code>verdict</code>, <code>created_by</code> <strong>Optional:</strong> <code>corrected_score</code>, <code>note</code></p>
|
|
319
|
+
<%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/runs/1/responses/42/metrics/3/calibrations \\\n -H \"Authorization: Bearer #{token}\" \\\n -H \"Content-Type: application/json\" \\\n -d '{\"verdict\": \"disagree\", \"corrected_score\": 3, \"note\": \"too generous\", \"created_by\": \"alice\"}'" %>
|
|
320
|
+
</div>
|
|
321
|
+
<div class="ck-api-endpoint">
|
|
322
|
+
<p class="ck-api-method"><span class="ck-chip" style="color: var(--ck-danger);">DELETE</span> /api/v1/calibrations/:id</p>
|
|
323
|
+
<p class="ck-meta-copy">Delete a calibration. Returns 204 No Content.</p>
|
|
324
|
+
</div>
|
|
325
|
+
</div>
|
|
326
|
+
|
|
242
327
|
<div class="ck-api-tabs__panel">
|
|
243
328
|
<h2 class="ck-section-title">Tags</h2>
|
|
244
329
|
<p class="ck-copy">Domain labels you can attach to metrics, prompts, runs, and datasets. Tags are auto-assigned a color from a 10-color palette. Each index page can be filtered by one or more tags using <code>?tag[]=name</code> query params (OR semantics).</p>
|
|
@@ -12,6 +12,14 @@
|
|
|
12
12
|
<p class="ck-kicker">Authentication</p>
|
|
13
13
|
<%= render CompletionKit.config.api_reference_authentication_partial, token: @token %>
|
|
14
14
|
</div>
|
|
15
|
+
<div>
|
|
16
|
+
<p class="ck-kicker">Pagination</p>
|
|
17
|
+
<p class="ck-meta-copy">Every index endpoint accepts <code>?limit=</code> and <code>?offset=</code> (default limit 50, max 500). The server returns <code>X-Total-Count</code>, <code>X-Limit</code>, and <code>X-Offset</code> headers so the caller can build cursors without re-counting.</p>
|
|
18
|
+
</div>
|
|
19
|
+
<div>
|
|
20
|
+
<p class="ck-kicker">Tag filtering</p>
|
|
21
|
+
<p class="ck-meta-copy">Prompts, runs, metrics, datasets, and metric groups accept <code>?tag[]=name</code> (repeat for OR semantics).</p>
|
|
22
|
+
</div>
|
|
15
23
|
</div>
|
|
16
24
|
</div>
|
|
17
25
|
</div>
|
|
@@ -15,12 +15,17 @@
|
|
|
15
15
|
.where.not(id: verdicted_ids)
|
|
16
16
|
.order(created_at: :desc).first
|
|
17
17
|
end %>
|
|
18
|
+
<% prior_version_verdicts = if stats.sample_size.zero? && metric && current_metric_version
|
|
19
|
+
CompletionKit::Calibration.where(metric_id: metric.id).where.not(metric_version_id: current_metric_version.id).count
|
|
20
|
+
else
|
|
21
|
+
0
|
|
22
|
+
end %>
|
|
18
23
|
|
|
19
24
|
<p class="ck-trust-line ck-trust-line--<%= stats.gate %>">
|
|
20
25
|
<span class="ck-trust-line__label"><%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true" %>Calibration</span>
|
|
21
26
|
<% if stats.sample_size.zero? %>
|
|
22
27
|
<span class="ck-trust-line__state">Not measured yet.</span>
|
|
23
|
-
<span class="ck-trust-line__hint">Needs <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts on the judge's scores.<% if target_response %>
|
|
28
|
+
<span class="ck-trust-line__hint">Needs <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts on the judge's scores.<% if prior_version_verdicts > 0 %> (<%= pluralize(prior_version_verdicts, "verdict") %> on prior versions, tied to that version's history.)<% end %><% if target_response %>
|
|
24
29
|
<%= link_to "Give a verdict →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-trust-line__link" %>
|
|
25
30
|
<% else %> Run this metric on a dataset, then give a verdict.<% end %></span>
|
|
26
31
|
<% elsif stats.counter_only? %>
|
|
@@ -80,7 +80,7 @@
|
|
|
80
80
|
|
|
81
81
|
<% if @available_starters.any? %>
|
|
82
82
|
<section class="ck-starter-row">
|
|
83
|
-
<p class="ck-kicker">
|
|
83
|
+
<p class="ck-kicker">Skip the blank page</p>
|
|
84
84
|
<p class="ck-meta-copy">Pre-written rubrics for the dimensions most teams score against. Click a card to preview before it's created.</p>
|
|
85
85
|
<div class="ck-starter-grid">
|
|
86
86
|
<% @available_starters.each do |starter| %>
|
|
@@ -96,8 +96,8 @@
|
|
|
96
96
|
<% else %>
|
|
97
97
|
<% if @available_starters.any? %>
|
|
98
98
|
<section class="ck-starter-row ck-starter-row--empty-state">
|
|
99
|
-
<h2 class="ck-title ck-title--sm">
|
|
100
|
-
<p class="ck-lead">
|
|
99
|
+
<h2 class="ck-title ck-title--sm">Skip the blank page</h2>
|
|
100
|
+
<p class="ck-lead">Five rubrics we've worked through for common evaluation dimensions. Adopt one to drop in a pre-written 1–5 scale, edit anything after. Or <%= link_to "write your own from scratch", new_metric_path, class: "ck-link" %>.</p>
|
|
101
101
|
<div class="ck-starter-grid">
|
|
102
102
|
<% @available_starters.each do |starter| %>
|
|
103
103
|
<%= render "starter_card", starter: starter %>
|
|
@@ -109,6 +109,7 @@
|
|
|
109
109
|
<% source_label, source_class = case v.source
|
|
110
110
|
when "suggestion" then ["AI suggestion", "ck-source-chip ck-source-chip--ai"]
|
|
111
111
|
when "edit" then ["Manual edit", "ck-source-chip ck-source-chip--manual"]
|
|
112
|
+
when "revert" then ["Reverted", "ck-source-chip ck-source-chip--revert"]
|
|
112
113
|
else ["Original", "ck-source-chip ck-source-chip--initial"]
|
|
113
114
|
end %>
|
|
114
115
|
<span class="<%= source_class %>"><%= source_label %></span>
|
|
@@ -165,7 +166,7 @@
|
|
|
165
166
|
<p class="ck-kicker">Cases to learn from</p>
|
|
166
167
|
<span class="ck-chip"><%= pluralize(@disagreements.size, "case") %></span>
|
|
167
168
|
</div>
|
|
168
|
-
<% mixed_versions = @disagreements.
|
|
169
|
+
<%# Safe navigation: a metric with no published version must not raise while rendering the disagreement list. %><% mixed_versions = @disagreements.any? { |c| c.metric_version_id != @published_metric_version&.id } %>
|
|
169
170
|
<p class="ck-meta-copy">Cases where a reviewer's score didn't match the judge's. Pin useful ones with <strong>Remember this</strong> so the judge sees them next time it grades<%= " (pins flow into the current judge regardless of which version produced the verdict)" if mixed_versions %>.</p>
|
|
170
171
|
<% existing_ids = Array(@metric.few_shot_examples).map { |fs| fs["calibration_id"] } %>
|
|
171
172
|
<ul class="ck-disagreement-list">
|
|
@@ -11,6 +11,7 @@
|
|
|
11
11
|
<%= button_to "Retry", generate_run_path(run), method: :post, class: ck_button_classes(:light, variant: :outline), form_class: "inline-block" %>
|
|
12
12
|
<%= button_to "Re-run as new", rerun_run_path(run), method: :post, class: ck_button_classes(:dark), form_class: "inline-block" %>
|
|
13
13
|
<% elsif run.status == "completed" %>
|
|
14
|
+
<%= link_to "Compare", compare_run_path(run), class: ck_button_classes(:light, variant: :outline) %>
|
|
14
15
|
<%= button_to "Re-run", rerun_run_path(run), method: :post, class: ck_button_classes(:dark), form_class: "inline-block" %>
|
|
15
16
|
<% end %>
|
|
16
17
|
<% end %>
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
<ol class="ck-breadcrumb">
|
|
2
|
+
<li><%= link_to "Runs", runs_path %></li>
|
|
3
|
+
<li><%= link_to @run.name, run_path(@run) %></li>
|
|
4
|
+
<li>vs <%= @other_run.name %></li>
|
|
5
|
+
</ol>
|
|
6
|
+
|
|
7
|
+
<section class="ck-page-header">
|
|
8
|
+
<div>
|
|
9
|
+
<h1 class="ck-title">Comparing runs</h1>
|
|
10
|
+
<p class="ck-meta-copy"><strong>A</strong>: <%= link_to @run.name, run_path(@run), class: "ck-link" %> · <strong>B</strong>: <%= link_to @other_run.name, run_path(@other_run), class: "ck-link" %></p>
|
|
11
|
+
</div>
|
|
12
|
+
<div class="ck-actions">
|
|
13
|
+
<%= link_to "Pick another", compare_run_path(@run), class: ck_button_classes(:light, variant: :outline) %>
|
|
14
|
+
</div>
|
|
15
|
+
</section>
|
|
16
|
+
|
|
17
|
+
<% if @comparison[:rows].empty? %>
|
|
18
|
+
<div class="ck-empty">
|
|
19
|
+
<p>No responses to compare yet.</p>
|
|
20
|
+
</div>
|
|
21
|
+
<% else %>
|
|
22
|
+
<table class="ck-results-table ck-run-compare-table">
|
|
23
|
+
<thead>
|
|
24
|
+
<tr>
|
|
25
|
+
<th scope="col">Case</th>
|
|
26
|
+
<th scope="col">Metric</th>
|
|
27
|
+
<th scope="col">A score</th>
|
|
28
|
+
<th scope="col">B score</th>
|
|
29
|
+
<th scope="col">Δ</th>
|
|
30
|
+
<th scope="col">A version</th>
|
|
31
|
+
<th scope="col">B version</th>
|
|
32
|
+
</tr>
|
|
33
|
+
</thead>
|
|
34
|
+
<tbody>
|
|
35
|
+
<% @comparison[:rows].each do |row| %>
|
|
36
|
+
<%# Label the case from whichever side exists so a case missing on side A does not raise on nil. %><% label_source = row[:left_response] || row[:right_response] %><% case_label = ((label_source&.row_index || 0) + 1).to_s %>
|
|
37
|
+
<% row[:per_metric].each_with_index do |pm, idx| %>
|
|
38
|
+
<tr>
|
|
39
|
+
<% if idx == 0 %>
|
|
40
|
+
<td rowspan="<%= row[:per_metric].size %>">
|
|
41
|
+
<%= row[:left_response] ? link_to(case_label, run_response_path(@run, row[:left_response]), class: "ck-link") : case_label %>
|
|
42
|
+
<% if row[:right_response] %>
|
|
43
|
+
<span class="ck-meta-copy">/ <%= link_to "B", run_response_path(@other_run, row[:right_response]), class: "ck-link" %></span>
|
|
44
|
+
<% end %>
|
|
45
|
+
</td>
|
|
46
|
+
<% end %>
|
|
47
|
+
<td><%= pm[:metric_name] %></td>
|
|
48
|
+
<td>
|
|
49
|
+
<% if pm[:left_score] %>
|
|
50
|
+
<span class="<%= ck_badge_classes(ck_score_kind(pm[:left_score].to_f)) %>"><%= pm[:left_score] %></span>
|
|
51
|
+
<% else %>
|
|
52
|
+
<span class="ck-meta-copy">—</span>
|
|
53
|
+
<% end %>
|
|
54
|
+
</td>
|
|
55
|
+
<td>
|
|
56
|
+
<% if pm[:right_score] %>
|
|
57
|
+
<span class="<%= ck_badge_classes(ck_score_kind(pm[:right_score].to_f)) %>"><%= pm[:right_score] %></span>
|
|
58
|
+
<% else %>
|
|
59
|
+
<span class="ck-meta-copy">—</span>
|
|
60
|
+
<% end %>
|
|
61
|
+
</td>
|
|
62
|
+
<td>
|
|
63
|
+
<% if pm[:delta] %>
|
|
64
|
+
<% delta_class = pm[:delta] > 0 ? "ck-delta--positive" : pm[:delta] < 0 ? "ck-delta--negative" : "ck-delta--zero" %>
|
|
65
|
+
<span class="ck-delta <%= delta_class %>"><%= pm[:delta].positive? ? "+#{pm[:delta]}" : pm[:delta].to_s %></span>
|
|
66
|
+
<% else %>
|
|
67
|
+
<span class="ck-meta-copy">—</span>
|
|
68
|
+
<% end %>
|
|
69
|
+
</td>
|
|
70
|
+
<td>
|
|
71
|
+
<% if pm[:left_version_label] %>
|
|
72
|
+
<span class="ck-source-chip ck-source-chip--current"><%= pm[:left_version_label] %></span>
|
|
73
|
+
<% end %>
|
|
74
|
+
</td>
|
|
75
|
+
<td>
|
|
76
|
+
<% if pm[:right_version_label] %>
|
|
77
|
+
<span class="ck-source-chip ck-source-chip--current"><%= pm[:right_version_label] %></span>
|
|
78
|
+
<% end %>
|
|
79
|
+
</td>
|
|
80
|
+
</tr>
|
|
81
|
+
<% end %>
|
|
82
|
+
<% end %>
|
|
83
|
+
</tbody>
|
|
84
|
+
</table>
|
|
85
|
+
<% end %>
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
<ol class="ck-breadcrumb">
|
|
2
|
+
<li><%= link_to "Runs", runs_path %></li>
|
|
3
|
+
<li><%= link_to @run.name, run_path(@run) %></li>
|
|
4
|
+
<li>Compare</li>
|
|
5
|
+
</ol>
|
|
6
|
+
|
|
7
|
+
<section class="ck-page-header">
|
|
8
|
+
<div>
|
|
9
|
+
<h1 class="ck-title">Compare with another run</h1>
|
|
10
|
+
<p class="ck-lead">Pick a run on the same dataset and prompt to see per-case score deltas side by side.</p>
|
|
11
|
+
</div>
|
|
12
|
+
</section>
|
|
13
|
+
|
|
14
|
+
<% if @other_runs.any? %>
|
|
15
|
+
<table class="ck-results-table">
|
|
16
|
+
<thead>
|
|
17
|
+
<tr>
|
|
18
|
+
<th scope="col">Run</th>
|
|
19
|
+
<th scope="col">Judge</th>
|
|
20
|
+
<th scope="col">Created</th>
|
|
21
|
+
<th scope="col"></th>
|
|
22
|
+
</tr>
|
|
23
|
+
</thead>
|
|
24
|
+
<tbody>
|
|
25
|
+
<% @other_runs.each do |other| %>
|
|
26
|
+
<tr>
|
|
27
|
+
<td><%= link_to other.name, run_path(other), class: "ck-link" %></td>
|
|
28
|
+
<td class="ck-meta-copy"><%= other.judge_model %></td>
|
|
29
|
+
<td class="ck-meta-copy"><time datetime="<%= other.created_at.utc.iso8601 %>"><%= time_ago_in_words(other.created_at) %> ago</time></td>
|
|
30
|
+
<td class="ck-results-table__arrow"><%= link_to "Compare →", compare_run_path(@run, with: other.id), class: "ck-link" %></td>
|
|
31
|
+
</tr>
|
|
32
|
+
<% end %>
|
|
33
|
+
</tbody>
|
|
34
|
+
</table>
|
|
35
|
+
<% else %>
|
|
36
|
+
<div class="ck-empty">
|
|
37
|
+
<%# rerun is a POST-only member route: send both the rails-ujs (method:) and Turbo (data-turbo-method) hints so the link posts under either JS driver instead of falling back to GET. %><p>No other runs on this dataset + prompt combination yet. <%= link_to "Re-run from this one", rerun_run_path(@run), method: :post, data: { turbo_method: :post }, class: "ck-link" %> to create one.</p>
|
|
38
|
+
</div>
|
|
39
|
+
<% end %>
|
|
@@ -32,10 +32,16 @@
|
|
|
32
32
|
</p>
|
|
33
33
|
</div>
|
|
34
34
|
<% if @run.status == "completed" %>
|
|
35
|
-
<%= button_to "Re-run
|
|
35
|
+
<%= button_to "Re-run from scratch",
|
|
36
36
|
rerun_run_path(@run), method: :post,
|
|
37
|
+
class: ck_button_classes(:light, variant: :outline), form_class: "inline-block",
|
|
38
|
+
title: "Create a new run that regenerates responses and grades them with the current judge.",
|
|
39
|
+
data: { turbo_confirm: "Create a new run with fresh responses and the current judge? The original run stays as a record." } %>
|
|
40
|
+
<%= button_to "Re-grade with current judge",
|
|
41
|
+
regrade_run_path(@run), method: :post,
|
|
37
42
|
class: ck_button_classes(:dark), form_class: "inline-block",
|
|
38
|
-
|
|
43
|
+
title: "Re-judge this run's existing responses against the current judge. Faster and cheaper than re-running.",
|
|
44
|
+
data: { turbo_confirm: "Re-judge this run's existing responses against the current judge?" } %>
|
|
39
45
|
<% end %>
|
|
40
46
|
</div>
|
|
41
47
|
<% end %>
|
data/config/routes.rb
CHANGED
|
@@ -37,7 +37,9 @@ CompletionKit::Engine.routes.draw do
|
|
|
37
37
|
post :suggest
|
|
38
38
|
post :retry_failures
|
|
39
39
|
post :rerun
|
|
40
|
+
post :regrade
|
|
40
41
|
get :refresh_status
|
|
42
|
+
get :compare
|
|
41
43
|
end
|
|
42
44
|
resources :responses, only: [:show] do
|
|
43
45
|
resources :calibrations, only: [:create]
|
|
@@ -68,6 +70,9 @@ CompletionKit::Engine.routes.draw do
|
|
|
68
70
|
member do
|
|
69
71
|
post :generate
|
|
70
72
|
post :retry_failures
|
|
73
|
+
post :rerun
|
|
74
|
+
post :regrade
|
|
75
|
+
get :compare
|
|
71
76
|
end
|
|
72
77
|
resources :responses, only: [:index, :show] do
|
|
73
78
|
resources :metrics, only: [] do
|
|
@@ -76,10 +81,22 @@ CompletionKit::Engine.routes.draw do
|
|
|
76
81
|
end
|
|
77
82
|
end
|
|
78
83
|
resources :datasets
|
|
79
|
-
resources :metrics
|
|
84
|
+
resources :metrics do
|
|
85
|
+
resources :metric_versions, only: [:index, :show, :destroy] do
|
|
86
|
+
member do
|
|
87
|
+
post :publish
|
|
88
|
+
end
|
|
89
|
+
end
|
|
90
|
+
member do
|
|
91
|
+
post :suggest_variants
|
|
92
|
+
post :add_few_shot
|
|
93
|
+
delete :remove_few_shot
|
|
94
|
+
end
|
|
95
|
+
end
|
|
80
96
|
resources :metric_groups
|
|
81
97
|
resources :tags
|
|
82
98
|
resources :provider_credentials
|
|
99
|
+
resources :calibrations, only: [:index, :destroy]
|
|
83
100
|
end
|
|
84
101
|
end
|
|
85
102
|
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: completion-kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.7.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Damien Bastin
|
|
@@ -238,6 +238,7 @@ files:
|
|
|
238
238
|
- app/controllers/completion_kit/api/v1/calibrations_controller.rb
|
|
239
239
|
- app/controllers/completion_kit/api/v1/datasets_controller.rb
|
|
240
240
|
- app/controllers/completion_kit/api/v1/metric_groups_controller.rb
|
|
241
|
+
- app/controllers/completion_kit/api/v1/metric_versions_controller.rb
|
|
241
242
|
- app/controllers/completion_kit/api/v1/metrics_controller.rb
|
|
242
243
|
- app/controllers/completion_kit/api/v1/prompts_controller.rb
|
|
243
244
|
- app/controllers/completion_kit/api/v1/provider_credentials_controller.rb
|
|
@@ -288,6 +289,7 @@ files:
|
|
|
288
289
|
- app/models/completion_kit/suggestion.rb
|
|
289
290
|
- app/models/completion_kit/tag.rb
|
|
290
291
|
- app/models/completion_kit/tagging.rb
|
|
292
|
+
- app/models/concerns/completion_kit/has_job_status.rb
|
|
291
293
|
- app/models/concerns/completion_kit/taggable.rb
|
|
292
294
|
- app/services/completion_kit/anthropic_client.rb
|
|
293
295
|
- app/services/completion_kit/api_config.rb
|
|
@@ -302,6 +304,7 @@ files:
|
|
|
302
304
|
- app/services/completion_kit/mcp_tools/datasets.rb
|
|
303
305
|
- app/services/completion_kit/mcp_tools/judges.rb
|
|
304
306
|
- app/services/completion_kit/mcp_tools/metric_groups.rb
|
|
307
|
+
- app/services/completion_kit/mcp_tools/metric_versions.rb
|
|
305
308
|
- app/services/completion_kit/mcp_tools/metrics.rb
|
|
306
309
|
- app/services/completion_kit/mcp_tools/prompts.rb
|
|
307
310
|
- app/services/completion_kit/mcp_tools/provider_credentials.rb
|
|
@@ -377,6 +380,8 @@ files:
|
|
|
377
380
|
- app/views/completion_kit/runs/_status_header.html.erb
|
|
378
381
|
- app/views/completion_kit/runs/_status_panel.html.erb
|
|
379
382
|
- app/views/completion_kit/runs/_table.html.erb
|
|
383
|
+
- app/views/completion_kit/runs/compare.html.erb
|
|
384
|
+
- app/views/completion_kit/runs/compare_picker.html.erb
|
|
380
385
|
- app/views/completion_kit/runs/edit.html.erb
|
|
381
386
|
- app/views/completion_kit/runs/index.html.erb
|
|
382
387
|
- app/views/completion_kit/runs/new.html.erb
|