completion-kit 0.6.0 → 0.7.0

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
Files changed (34)
  1. checksums.yaml +4 -4
  2. data/app/assets/stylesheets/completion_kit/application.css +18 -4
  3. data/app/controllers/completion_kit/api/v1/base_controller.rb +22 -0
  4. data/app/controllers/completion_kit/api/v1/calibrations_controller.rb +22 -3
  5. data/app/controllers/completion_kit/api/v1/datasets_controller.rb +3 -1
  6. data/app/controllers/completion_kit/api/v1/metric_groups_controller.rb +3 -1
  7. data/app/controllers/completion_kit/api/v1/metric_versions_controller.rb +51 -0
  8. data/app/controllers/completion_kit/api/v1/metrics_controller.rb +49 -2
  9. data/app/controllers/completion_kit/api/v1/prompts_controller.rb +3 -1
  10. data/app/controllers/completion_kit/api/v1/provider_credentials_controller.rb +1 -1
  11. data/app/controllers/completion_kit/api/v1/responses_controller.rb +3 -1
  12. data/app/controllers/completion_kit/api/v1/runs_controller.rb +75 -2
  13. data/app/controllers/completion_kit/api/v1/tags_controller.rb +1 -1
  14. data/app/controllers/completion_kit/metrics_controller.rb +3 -3
  15. data/app/controllers/completion_kit/runs_controller.rb +1 -1
  16. data/app/helpers/completion_kit/application_helper.rb +0 -14
  17. data/app/jobs/completion_kit/generate_row_job.rb +3 -8
  18. data/app/jobs/completion_kit/judge_review_job.rb +6 -9
  19. data/app/models/completion_kit/metric.rb +1 -0
  20. data/app/models/completion_kit/metric_version.rb +16 -0
  21. data/app/models/completion_kit/response.rb +13 -17
  22. data/app/models/completion_kit/review.rb +18 -22
  23. data/app/models/completion_kit/run.rb +27 -23
  24. data/app/models/concerns/completion_kit/has_job_status.rb +31 -0
  25. data/app/services/completion_kit/mcp_dispatcher.rb +3 -1
  26. data/app/services/completion_kit/mcp_tools/metric_versions.rb +67 -0
  27. data/app/services/completion_kit/starter_metrics.rb +5 -5
  28. data/app/views/completion_kit/api_reference/_body.html.erb +91 -6
  29. data/app/views/completion_kit/api_reference/index.html.erb +8 -0
  30. data/app/views/completion_kit/metrics/index.html.erb +3 -3
  31. data/app/views/completion_kit/metrics/show.html.erb +1 -0
  32. data/config/routes.rb +16 -1
  33. data/lib/completion_kit/version.rb +1 -1
  34. metadata +4 -1
@@ -1,37 +1,25 @@
1
1
  module CompletionKit
2
2
  class Review < ApplicationRecord
3
- STATUSES = %w[pending retrying succeeded failed].freeze
4
- TERMINAL_STATUSES = %w[succeeded failed].freeze
3
+ include HasJobStatus
5
4
 
6
5
  belongs_to :response
7
6
  belongs_to :metric, optional: true
8
7
  belongs_to :metric_version, optional: true
9
8
  has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
10
9
 
11
- def stale_against_current_judge?
12
- return false unless metric_id && metric_version_id
13
- current_id = MetricVersion.current.where(metric_id: metric_id).limit(1).pick(:id)
14
- return false if current_id.nil?
15
- metric_version_id != current_id
16
- end
17
-
18
10
  validates :metric_name, presence: true
19
- validates :status, inclusion: { in: STATUSES }
20
11
  validates :ai_score, numericality: { greater_than_or_equal_to: 1, less_than_or_equal_to: 5 }, allow_nil: true
21
12
 
22
13
  before_validation :set_default_status
23
14
 
24
- def terminal?
25
- TERMINAL_STATUSES.include?(status)
26
- end
27
-
28
- def succeeded?
29
- status == "succeeded"
30
- end
15
+ after_save_commit :broadcast_parent_row_update, unless: :destroyed?
16
+ after_save_commit :broadcast_run_progress, if: :should_broadcast_progress?
31
17
 
32
- def error_payload
33
- return nil if error_class.blank?
34
- { provider: error_provider, class: error_class, status: error_status, message: error_message }
18
+ def stale_against_current_judge?
19
+ return false unless metric_id && metric_version_id
20
+ current_id = MetricVersion.current.where(metric_id: metric_id).limit(1).pick(:id)
21
+ return false if current_id.nil?
22
+ metric_version_id != current_id
35
23
  end
36
24
 
37
25
  def as_json(options = {})
@@ -46,8 +34,16 @@ module CompletionKit
46
34
 
47
35
  private
48
36
 
49
- def set_default_status
50
- self.status ||= "pending"
37
+ def broadcast_parent_row_update
38
+ response.run.broadcast_response_update(response)
39
+ end
40
+
41
+ def broadcast_run_progress
42
+ response.run.broadcast_progress
43
+ end
44
+
45
+ def should_broadcast_progress?
46
+ saved_change_to_status? && terminal?
51
47
  end
52
48
  end
53
49
  end
@@ -43,7 +43,7 @@ module CompletionKit
43
43
  end
44
44
 
45
45
  def outstanding_work_zero?
46
- return false if responses.where.not(status: Response::TERMINAL_STATUSES).exists?
46
+ return false if responses.where.not(status: HasJobStatus::TERMINAL_STATUSES).exists?
47
47
 
48
48
  metric_ids = metrics.pluck(:id)
49
49
  return true if metric_ids.empty?
@@ -55,7 +55,7 @@ module CompletionKit
55
55
  terminal_review_count = Review.where(
56
56
  response_id: succeeded_response_ids,
57
57
  metric_id: metric_ids,
58
- status: Review::TERMINAL_STATUSES
58
+ status: HasJobStatus::TERMINAL_STATUSES
59
59
  ).count
60
60
 
61
61
  terminal_review_count >= expected_reviews
@@ -118,6 +118,10 @@ module CompletionKit
118
118
  end
119
119
 
120
120
  def start!
121
+ unless %w[pending failed].include?(status)
122
+ return fail_with_summary!("Cannot start a run in state \"#{status}\". Use rerun to create a fresh copy, or retry_failures / regrade to work with the existing responses.")
123
+ end
124
+
121
125
  rows = if dataset
122
126
  CsvProcessor.process_self(self)
123
127
  else
@@ -161,7 +165,7 @@ module CompletionKit
161
165
  response = responses.create!(attrs)
162
166
 
163
167
  if judge_only?
164
- metrics.each { |m| JudgeReviewJob.perform_later(response.id, m.id) } if judge_configured?
168
+ metrics.each { |m| JudgeReviewJob.perform_later(response.id, m.id, id) } if judge_configured?
165
169
  else
166
170
  GenerateRowJob.perform_later(id, response.id)
167
171
  end
@@ -202,7 +206,7 @@ module CompletionKit
202
206
  update!(status: "running", failure_summary: nil, error_message: nil)
203
207
 
204
208
  response_ids.each do |rid|
205
- grading_metrics.each { |m| JudgeReviewJob.perform_later(rid, m.id) }
209
+ grading_metrics.each { |m| JudgeReviewJob.perform_later(rid, m.id, id) }
206
210
  end
207
211
  RunCompletionCheckJob.perform_later(id)
208
212
  end
@@ -272,17 +276,6 @@ module CompletionKit
272
276
  }
273
277
  end
274
278
 
275
- private
276
-
277
- def fail_with_summary!(message)
278
- errors.add(:base, message)
279
- if persisted?
280
- update_columns(status: "failed", failure_summary: message, error_message: message)
281
- broadcast_ui
282
- end
283
- false
284
- end
285
-
286
279
  def broadcast_ui
287
280
  broadcast_progress
288
281
  broadcast_status_header
@@ -290,14 +283,6 @@ module CompletionKit
290
283
  broadcast_sort_toolbar
291
284
  end
292
285
 
293
- def render_engine_partial(partial, locals)
294
- CompletionKit::Engine.warm_routes!
295
- CompletionKit::ApplicationController.render(
296
- partial: partial,
297
- locals: locals
298
- )
299
- end
300
-
301
286
  def broadcast_progress
302
287
  reload
303
288
  broadcast_replace_to(
@@ -356,6 +341,25 @@ module CompletionKit
356
341
  )
357
342
  end
358
343
 
344
+ private
345
+
346
+ def fail_with_summary!(message)
347
+ errors.add(:base, message)
348
+ if persisted?
349
+ update_columns(status: "failed", failure_summary: message, error_message: message)
350
+ broadcast_ui
351
+ end
352
+ false
353
+ end
354
+
355
+ def render_engine_partial(partial, locals)
356
+ CompletionKit::Engine.warm_routes!
357
+ CompletionKit::ApplicationController.render(
358
+ partial: partial,
359
+ locals: locals
360
+ )
361
+ end
362
+
359
363
  def set_default_status
360
364
  self.status ||= "pending"
361
365
  end
@@ -0,0 +1,31 @@
1
+ module CompletionKit
2
+ module HasJobStatus
3
+ extend ActiveSupport::Concern
4
+
5
+ STATUSES = %w[pending retrying succeeded failed].freeze
6
+ TERMINAL_STATUSES = %w[succeeded failed].freeze
7
+
8
+ included do
9
+ validates :status, inclusion: { in: STATUSES }
10
+ end
11
+
12
+ def terminal?
13
+ TERMINAL_STATUSES.include?(status)
14
+ end
15
+
16
+ def succeeded?
17
+ status == "succeeded"
18
+ end
19
+
20
+ def error_payload
21
+ return nil if error_class.blank?
22
+ { provider: error_provider, class: error_class, status: error_status, message: error_message }
23
+ end
24
+
25
+ private
26
+
27
+ def set_default_status
28
+ self.status ||= "pending"
29
+ end
30
+ end
31
+ end
@@ -32,6 +32,7 @@ module CompletionKit
32
32
  McpTools::Datasets.definitions +
33
33
  McpTools::Metrics.definitions +
34
34
  McpTools::MetricGroups.definitions +
35
+ McpTools::MetricVersions.definitions +
35
36
  McpTools::ProviderCredentials.definitions +
36
37
  McpTools::Tags.definitions +
37
38
  McpTools::Calibrations.definitions +
@@ -44,8 +45,9 @@ module CompletionKit
44
45
  when /\Aruns_/ then McpTools::Runs.call(name, arguments)
45
46
  when /\Aresponses_/ then McpTools::Responses.call(name, arguments)
46
47
  when /\Adatasets_/ then McpTools::Datasets.call(name, arguments)
47
- when /\Ametrics_/ then McpTools::Metrics.call(name, arguments)
48
+ when /\Ametric_versions_/ then McpTools::MetricVersions.call(name, arguments)
48
49
  when /\Ametric_groups_/ then McpTools::MetricGroups.call(name, arguments)
50
+ when /\Ametrics_/ then McpTools::Metrics.call(name, arguments)
49
51
  when /\Aprovider_credentials_/ then McpTools::ProviderCredentials.call(name, arguments)
50
52
  when /\Atags_/ then McpTools::Tags.call(name, arguments)
51
53
  when /\Acalibrations_/ then McpTools::Calibrations.call(name, arguments)
@@ -0,0 +1,67 @@
1
+ module CompletionKit
2
+ module McpTools
3
+ module MetricVersions
4
+ extend Base
5
+
6
+ TOOLS = {
7
+ "metric_versions_list" => {
8
+ description: "List every MetricVersion (drafts + published) for a metric, newest first. Each row carries version_number, state, source, current flag, and timestamps.",
9
+ inputSchema: {
10
+ type: "object",
11
+ properties: {
12
+ metric_id: { type: "integer" }
13
+ },
14
+ required: ["metric_id"]
15
+ },
16
+ handler: :list
17
+ },
18
+ "metric_versions_publish" => {
19
+ description: "Publish a MetricVersion as the live judge for its metric. Works for both 'draft → published' and 'revert to an older published version → current'. Transactionally flips current, demotes peers, and writes the version's instruction + rubric_bands back onto the metric so the judge actually uses them.",
20
+ inputSchema: {
21
+ type: "object",
22
+ properties: {
23
+ metric_version_id: { type: "integer" }
24
+ },
25
+ required: ["metric_version_id"]
26
+ },
27
+ handler: :publish
28
+ },
29
+ "metric_versions_dismiss" => {
30
+ description: "Destroy a draft MetricVersion (use for either source: 'edit' or source: 'suggestion'). Published versions are refused — to demote a published version, publish a different one as current instead.",
31
+ inputSchema: {
32
+ type: "object",
33
+ properties: {
34
+ metric_version_id: { type: "integer" }
35
+ },
36
+ required: ["metric_version_id"]
37
+ },
38
+ handler: :dismiss
39
+ }
40
+ }.freeze
41
+
42
+ def self.list(args)
43
+ metric = CompletionKit::Metric.find(args["metric_id"])
44
+ versions = CompletionKit::MetricVersion.where(metric_id: metric.id).order(version_number: :desc)
45
+ text_result(versions.map(&:as_json))
46
+ end
47
+
48
+ def self.publish(args)
49
+ version = CompletionKit::MetricVersion.find(args["metric_version_id"])
50
+ if version.published? && !version.current?
51
+ audit = version.revert!
52
+ text_result(audit.as_json)
53
+ else
54
+ version.publish!
55
+ text_result(version.reload.as_json)
56
+ end
57
+ end
58
+
59
+ def self.dismiss(args)
60
+ version = CompletionKit::MetricVersion.find(args["metric_version_id"])
61
+ return error_result("Cannot dismiss a published version. Publish a different version as current instead.") if version.published?
62
+ version.destroy!
63
+ text_result({id: version.id, destroyed: true})
64
+ end
65
+ end
66
+ end
67
+ end
@@ -21,8 +21,8 @@ module CompletionKit
21
21
  key: "instruction_following",
22
22
  name: "Instruction following",
23
23
  description: "Did the model do everything that was asked?",
24
- catches: "The response is factually right but ignores \"answer in two sentences\", \"use bullet points\", \"do not include X\". Distinct from Correctness a response can be right and still fail this.",
25
- instruction: "Did the model do every concrete thing the prompt asked for? Score against the explicit requirements in the prompt (format constraints, count limits, exclusions, audience cues). Factual accuracy is a different dimension score that elsewhere.",
24
+ catches: "The response is factually right but ignores \"answer in two sentences\", \"use bullet points\", \"do not include X\". Distinct from Correctness; a response can be right and still fail this.",
25
+ instruction: "Did the model do every concrete thing the prompt asked for? Score against the explicit requirements in the prompt (format constraints, count limits, exclusions, audience cues). Factual accuracy is a different dimension; score that elsewhere.",
26
26
  rubric_bands: [
27
27
  { "stars" => 5, "description" => "Followed every requirement in the prompt exactly." },
28
28
  { "stars" => 4, "description" => "Followed every requirement with a small slip." },
@@ -36,7 +36,7 @@ module CompletionKit
36
36
  name: "Format compliance",
37
37
  description: "Does the output follow the required structure?",
38
38
  catches: "Invalid JSON, missing schema fields, extra prose around a structured response, wrong casing on keys. Critical for any LLM wired into an API.",
39
- instruction: "Does the output match the format the prompt asked for JSON shape, schema, keys, casing, no stray prose? Score on whether a downstream parser would accept it without massaging.",
39
+ instruction: "Does the output match the format the prompt asked for: JSON shape, schema, keys, casing, no stray prose? Score on whether a downstream parser would accept it without massaging.",
40
40
  rubric_bands: [
41
41
  { "stars" => 5, "description" => "Exact spec, ready to consume programmatically." },
42
42
  { "stars" => 4, "description" => "Spec-compliant with one cosmetic issue." },
@@ -62,9 +62,9 @@ module CompletionKit
62
62
  Starter.new(
63
63
  key: "conciseness",
64
64
  name: "Conciseness",
65
- description: "Is it the right length no padding, no missing detail?",
65
+ description: "Is it the right length, no padding, no missing detail?",
66
66
  catches: "Rambling responses, repetitive caveats, over-hedging. LLMs default to verbose. Conciseness is the dimension where users most often see scores move after tuning.",
67
- instruction: "Is the output the right length for the task no padding, no missing detail, no hedging filler? Penalise rambling, repetition, over-caveating, and unnecessary preamble. Penalise too-short outputs that drop information.",
67
+ instruction: "Is the output the right length for the task: no padding, no missing detail, no hedging filler? Penalise rambling, repetition, over-caveating, and unnecessary preamble. Penalise too-short outputs that drop information.",
68
68
  rubric_bands: [
69
69
  { "stars" => 5, "description" => "Exactly as long as the task needs, no more, no less." },
70
70
  { "stars" => 4, "description" => "Right length with a small redundancy." },
@@ -17,17 +17,19 @@
17
17
  <input type="radio" name="ck-api-tab" id="ck-tab-datasets" class="ck-api-tabs__radio">
18
18
  <input type="radio" name="ck-api-tab" id="ck-tab-metrics" class="ck-api-tabs__radio">
19
19
  <input type="radio" name="ck-api-tab" id="ck-tab-metric-groups" class="ck-api-tabs__radio">
20
+ <input type="radio" name="ck-api-tab" id="ck-tab-calibrations" class="ck-api-tabs__radio">
20
21
  <input type="radio" name="ck-api-tab" id="ck-tab-tags" class="ck-api-tabs__radio">
21
22
  <input type="radio" name="ck-api-tab" id="ck-tab-providers" class="ck-api-tabs__radio">
22
23
 
23
24
  <nav class="ck-api-tabs__nav">
24
- <label for="ck-tab-mcp" class="ck-api-tabs__label">MCP <span class="ck-api-tabs__count">35</span></label>
25
+ <label for="ck-tab-mcp" class="ck-api-tabs__label">MCP <span class="ck-api-tabs__count"><%= CompletionKit::McpDispatcher.tool_definitions.size %></span></label>
25
26
  <label for="ck-tab-prompts" class="ck-api-tabs__label">Prompts <span class="ck-api-tabs__count">6</span></label>
26
- <label for="ck-tab-runs" class="ck-api-tabs__label">Runs <span class="ck-api-tabs__count">7</span></label>
27
+ <label for="ck-tab-runs" class="ck-api-tabs__label">Runs <span class="ck-api-tabs__count">10</span></label>
27
28
  <label for="ck-tab-responses" class="ck-api-tabs__label">Responses <span class="ck-api-tabs__count">2</span></label>
28
29
  <label for="ck-tab-datasets" class="ck-api-tabs__label">Datasets <span class="ck-api-tabs__count">5</span></label>
29
- <label for="ck-tab-metrics" class="ck-api-tabs__label">Metrics <span class="ck-api-tabs__count">5</span></label>
30
+ <label for="ck-tab-metrics" class="ck-api-tabs__label">Metrics <span class="ck-api-tabs__count">12</span></label>
30
31
  <label for="ck-tab-metric-groups" class="ck-api-tabs__label">Metric Groups <span class="ck-api-tabs__count">5</span></label>
32
+ <label for="ck-tab-calibrations" class="ck-api-tabs__label">Calibrations <span class="ck-api-tabs__count">3</span></label>
31
33
  <label for="ck-tab-tags" class="ck-api-tabs__label">Tags <span class="ck-api-tabs__count">5</span></label>
32
34
  <label for="ck-tab-providers" class="ck-api-tabs__label">Providers <span class="ck-api-tabs__count">5</span></label>
33
35
  </nav>
@@ -36,7 +38,7 @@
36
38
 
37
39
  <div class="ck-api-tabs__panel">
38
40
  <h2 class="ck-section-title">MCP Server</h2>
39
- <p class="ck-copy">Connect Claude Code, Cursor, or any <a href="https://modelcontextprotocol.io" class="ck-link">MCP</a> client to manage prompts, runs, datasets, and metrics conversationally. 35 tools over streamable HTTP.</p>
41
+ <p class="ck-copy">Connect Claude Code, Cursor, or any <a href="https://modelcontextprotocol.io" class="ck-link">MCP</a> client to manage prompts, runs, datasets, and metrics conversationally. <%= CompletionKit::McpDispatcher.tool_definitions.size %> tools over streamable HTTP.</p>
40
42
 
41
43
  <div class="ck-mcp-install-grid">
42
44
  <div class="ck-mcp-install-card">
@@ -116,7 +118,8 @@
116
118
  <p class="ck-copy">Create runs, generate LLM responses, and judge them with metrics.</p>
117
119
  <div class="ck-api-endpoint">
118
120
  <p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> /api/v1/runs</p>
119
- <p class="ck-meta-copy">List all runs with response counts and average scores.</p>
121
+ <p class="ck-meta-copy">List runs with response counts and average scores. Supports pagination (<code>limit</code>, <code>offset</code>) and the following filters.</p>
122
+ <p class="ck-api-params"><strong>Optional filters:</strong>&ensp;<code>status</code> (<code>pending</code>, <code>running</code>, <code>completed</code>, <code>failed</code>), <code>prompt_id</code>, <code>dataset_id</code>, <code>tag[]</code></p>
120
123
  </div>
121
124
  <div class="ck-api-endpoint">
122
125
  <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/runs</p>
@@ -139,6 +142,24 @@
139
142
  <p class="ck-meta-copy">Start generating responses. Returns 202 Accepted. Poll the run to check progress.</p>
140
143
  <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/runs/1/generate \\\n -H \"Authorization: Bearer #{token}\"" %>
141
144
  </div>
145
+ <div class="ck-api-endpoint">
146
+ <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/runs/:id/retry_failures</p>
147
+ <p class="ck-meta-copy">Re-queue any responses that failed during generation. Returns 202 Accepted.</p>
148
+ </div>
149
+ <div class="ck-api-endpoint">
150
+ <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/runs/:id/rerun</p>
151
+ <p class="ck-meta-copy">Clone the run and start generating responses on the copy against the current prompt and metric versions. Returns the new run with 201 Created. Useful for capturing a fresh baseline after metric edits.</p>
152
+ <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/runs/1/rerun \\\n -H \"Authorization: Bearer #{token}\"" %>
153
+ </div>
154
+ <div class="ck-api-endpoint">
155
+ <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/runs/:id/regrade</p>
156
+ <p class="ck-meta-copy">Re-judge the existing successful responses against the current metric versions without regenerating model output. Returns 202 Accepted, or 422 if no responses are eligible.</p>
157
+ </div>
158
+ <div class="ck-api-endpoint">
159
+ <p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> /api/v1/runs/:id/compare?with=:other_id</p>
160
+ <p class="ck-meta-copy">Side-by-side comparison against another run. Returns <code>{rows: [...], metric_ids: [...]}</code> with one row per input case, per-metric scores on both sides, and the delta. Cases that exist on only one side are still returned with the missing side nulled out.</p>
161
+ <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl \"#{base_url}/api/v1/runs/1/compare?with=2\" \\\n -H \"Authorization: Bearer #{token}\"" %>
162
+ </div>
142
163
  <div class="ck-api-endpoint">
143
164
  <p class="ck-api-method"><span class="ck-chip ck-chip--soft">PATCH</span> /api/v1/runs/:id</p>
144
165
  <p class="ck-meta-copy">Update a run. Accepts same params as create.</p>
@@ -154,7 +175,8 @@
154
175
  <p class="ck-copy">Read-only access to generated responses and their review scores. Nested under runs.</p>
155
176
  <div class="ck-api-endpoint">
156
177
  <p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> /api/v1/runs/:run_id/responses</p>
157
- <p class="ck-meta-copy">List all responses for a run, including nested review scores.</p>
178
+ <p class="ck-meta-copy">List responses for a run, including nested review scores.</p>
179
+ <p class="ck-api-params"><strong>Optional filters:</strong>&ensp;<code>status</code> (<code>pending</code>, <code>succeeded</code>, <code>failed</code>), plus <code>limit</code> and <code>offset</code></p>
158
180
  <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl #{base_url}/api/v1/runs/1/responses \\\n -H \"Authorization: Bearer #{token}\"" %>
159
181
  </div>
160
182
  <div class="ck-api-endpoint">
@@ -214,6 +236,48 @@
214
236
  { name: m.name, subtitle: m.instruction.presence&.truncate(100),
215
237
  url: "#{base_url}/api/v1/metrics/#{m.id}", dom_id: "metric_ep_#{m.id}" }
216
238
  } %>
239
+
240
+ <div class="ck-api-endpoint" style="padding-top: 1.5rem;">
241
+ <p class="ck-kicker" style="margin-bottom: 0.5rem;">Calibration loop</p>
242
+ <p class="ck-meta-copy">Drive metric improvement from disagree-flagged calibrations: ask the model for variants, then pin individual cases as few-shot examples on the metric.</p>
243
+ </div>
244
+ <div class="ck-api-endpoint">
245
+ <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/metrics/:id/suggest_variants</p>
246
+ <p class="ck-meta-copy">Generate draft metric versions from the current disagreements. Returns 201 with the new draft versions, 422 if no disagreements exist or the model produced nothing usable.</p>
247
+ <p class="ck-api-params"><strong>Optional:</strong>&ensp;<code>count</code>, <code>model</code></p>
248
+ <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/metrics/1/suggest_variants \\\n -H \"Authorization: Bearer #{token}\"" %>
249
+ </div>
250
+ <div class="ck-api-endpoint">
251
+ <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/metrics/:id/add_few_shot</p>
252
+ <p class="ck-meta-copy">Pin a disagree calibration as a few-shot example on the metric. Returns the updated metric, or 404 if the calibration is not a disagree on this metric.</p>
253
+ <p class="ck-api-params"><strong>Required:</strong>&ensp;<code>calibration_id</code></p>
254
+ </div>
255
+ <div class="ck-api-endpoint">
256
+ <p class="ck-api-method"><span class="ck-chip" style="color: var(--ck-danger);">DELETE</span> /api/v1/metrics/:id/remove_few_shot</p>
257
+ <p class="ck-meta-copy">Drop the pinned few-shot example by calibration ID.</p>
258
+ <p class="ck-api-params"><strong>Required:</strong>&ensp;<code>calibration_id</code></p>
259
+ </div>
260
+
261
+ <div class="ck-api-endpoint" style="padding-top: 1.5rem;">
262
+ <p class="ck-kicker" style="margin-bottom: 0.5rem;">Metric versions</p>
263
+ <p class="ck-meta-copy">Every metric carries a history of versions (the current published one, prior published ones, and any draft suggestions). Reviews and calibrations record the version they ran against, so the API can surface stale state and let you revert.</p>
264
+ </div>
265
+ <div class="ck-api-endpoint">
266
+ <p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> /api/v1/metrics/:metric_id/metric_versions</p>
267
+ <p class="ck-meta-copy">List every version for the metric, newest version_number first.</p>
268
+ </div>
269
+ <div class="ck-api-endpoint">
270
+ <p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> /api/v1/metrics/:metric_id/metric_versions/:id</p>
271
+ <p class="ck-meta-copy">Get a single version with its instruction, rubric bands, state, and source.</p>
272
+ </div>
273
+ <div class="ck-api-endpoint">
274
+ <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/metrics/:metric_id/metric_versions/:id/publish</p>
275
+ <p class="ck-meta-copy">Publish the version as current. Works for a draft (promote) or a superseded published version (revert). Copies the version's instruction and rubric back onto the metric.</p>
276
+ </div>
277
+ <div class="ck-api-endpoint">
278
+ <p class="ck-api-method"><span class="ck-chip" style="color: var(--ck-danger);">DELETE</span> /api/v1/metrics/:metric_id/metric_versions/:id</p>
279
+ <p class="ck-meta-copy">Dismiss a draft version. Returns 204 No Content, or 409 Conflict if the version is published (published versions are immutable history).</p>
280
+ </div>
217
281
  </div>
218
282
 
219
283
  <div class="ck-api-tabs__panel">
@@ -239,6 +303,27 @@
239
303
  } %>
240
304
  </div>
241
305
 
306
+ <div class="ck-api-tabs__panel">
307
+ <h2 class="ck-section-title">Calibrations</h2>
308
+ <p class="ck-copy">Per-verdict feedback events on a response/metric pair: agree, disagree (with a corrected score and note), or borderline. Calibrations capture the metric version that was current when the verdict was cast, which is what drives the trust signal and the "stale" indicators across the rest of the API.</p>
309
+ <div class="ck-api-endpoint">
310
+ <p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> /api/v1/calibrations</p>
311
+ <p class="ck-meta-copy">List calibrations across all runs. Supports filtering by any combination of the query params below.</p>
312
+ <p class="ck-api-params"><strong>Optional filters:</strong>&ensp;<code>run_id</code>, <code>response_id</code>, <code>metric_id</code>, <code>metric_version_id</code>, <code>created_by</code>, <code>verdict</code> (<code>agree</code>, <code>disagree</code>, or <code>borderline</code>)</p>
313
+ <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl \"#{base_url}/api/v1/calibrations?metric_id=1&verdict=disagree\" \\\n -H \"Authorization: Bearer #{token}\"" %>
314
+ </div>
315
+ <div class="ck-api-endpoint">
316
+ <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/runs/:run_id/responses/:response_id/metrics/:metric_id/calibrations</p>
317
+ <p class="ck-meta-copy">Cast a calibration on a specific response/metric pair. The metric version on the record is set automatically from the run's review.</p>
318
+ <p class="ck-api-params"><strong>Required:</strong>&ensp;<code>verdict</code>, <code>created_by</code>&emsp;<strong>Optional:</strong>&ensp;<code>corrected_score</code>, <code>note</code></p>
319
+ <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/runs/1/responses/42/metrics/3/calibrations \\\n -H \"Authorization: Bearer #{token}\" \\\n -H \"Content-Type: application/json\" \\\n -d '{\"verdict\": \"disagree\", \"corrected_score\": 3, \"note\": \"too generous\", \"created_by\": \"alice\"}'" %>
320
+ </div>
321
+ <div class="ck-api-endpoint">
322
+ <p class="ck-api-method"><span class="ck-chip" style="color: var(--ck-danger);">DELETE</span> /api/v1/calibrations/:id</p>
323
+ <p class="ck-meta-copy">Delete a calibration. Returns 204 No Content.</p>
324
+ </div>
325
+ </div>
326
+
242
327
  <div class="ck-api-tabs__panel">
243
328
  <h2 class="ck-section-title">Tags</h2>
244
329
  <p class="ck-copy">Domain labels you can attach to metrics, prompts, runs, and datasets. Tags are auto-assigned a color from a 10-color palette. Each index page can be filtered by one or more tags using <code>?tag[]=name</code> query params (OR semantics).</p>
@@ -12,6 +12,14 @@
12
12
  <p class="ck-kicker">Authentication</p>
13
13
  <%= render CompletionKit.config.api_reference_authentication_partial, token: @token %>
14
14
  </div>
15
+ <div>
16
+ <p class="ck-kicker">Pagination</p>
17
+ <p class="ck-meta-copy">Every index endpoint accepts <code>?limit=</code> and <code>?offset=</code> (default limit 50, max 500). The server returns <code>X-Total-Count</code>, <code>X-Limit</code>, and <code>X-Offset</code> headers so the caller can build cursors without re-counting.</p>
18
+ </div>
19
+ <div>
20
+ <p class="ck-kicker">Tag filtering</p>
21
+ <p class="ck-meta-copy">Prompts, runs, metrics, datasets, and metric groups accept <code>?tag[]=name</code> (repeat for OR semantics).</p>
22
+ </div>
15
23
  </div>
16
24
  </div>
17
25
  </div>
@@ -80,7 +80,7 @@
80
80
 
81
81
  <% if @available_starters.any? %>
82
82
  <section class="ck-starter-row">
83
- <p class="ck-kicker">Add a starter metric</p>
83
+ <p class="ck-kicker">Skip the blank page</p>
84
84
  <p class="ck-meta-copy">Pre-written rubrics for the dimensions most teams score against. Click a card to preview before it's created.</p>
85
85
  <div class="ck-starter-grid">
86
86
  <% @available_starters.each do |starter| %>
@@ -96,8 +96,8 @@
96
96
  <% else %>
97
97
  <% if @available_starters.any? %>
98
98
  <section class="ck-starter-row ck-starter-row--empty-state">
99
- <h2 class="ck-title ck-title--sm">Start with a ready-made rubric</h2>
100
- <p class="ck-lead">Pick one of the dimensions below to drop in a pre-written 15 rubric. You can edit anything after adding it. Or <%= link_to "write your own from scratch", new_metric_path, class: "ck-link" %>.</p>
99
+ <h2 class="ck-title ck-title--sm">Skip the blank page</h2>
100
+ <p class="ck-lead">Five rubrics we've worked through for common evaluation dimensions. Adopt one to drop in a pre-written 1&ndash;5 scale, edit anything after. Or <%= link_to "write your own from scratch", new_metric_path, class: "ck-link" %>.</p>
101
101
  <div class="ck-starter-grid">
102
102
  <% @available_starters.each do |starter| %>
103
103
  <%= render "starter_card", starter: starter %>
@@ -109,6 +109,7 @@
109
109
  <% source_label, source_class = case v.source
110
110
  when "suggestion" then ["AI suggestion", "ck-source-chip ck-source-chip--ai"]
111
111
  when "edit" then ["Manual edit", "ck-source-chip ck-source-chip--manual"]
112
+ when "revert" then ["Reverted", "ck-source-chip ck-source-chip--revert"]
112
113
  else ["Original", "ck-source-chip ck-source-chip--initial"]
113
114
  end %>
114
115
  <span class="<%= source_class %>"><%= source_label %></span>
data/config/routes.rb CHANGED
@@ -70,6 +70,9 @@ CompletionKit::Engine.routes.draw do
70
70
  member do
71
71
  post :generate
72
72
  post :retry_failures
73
+ post :rerun
74
+ post :regrade
75
+ get :compare
73
76
  end
74
77
  resources :responses, only: [:index, :show] do
75
78
  resources :metrics, only: [] do
@@ -78,10 +81,22 @@ CompletionKit::Engine.routes.draw do
78
81
  end
79
82
  end
80
83
  resources :datasets
81
- resources :metrics
84
+ resources :metrics do
85
+ resources :metric_versions, only: [:index, :show, :destroy] do
86
+ member do
87
+ post :publish
88
+ end
89
+ end
90
+ member do
91
+ post :suggest_variants
92
+ post :add_few_shot
93
+ delete :remove_few_shot
94
+ end
95
+ end
82
96
  resources :metric_groups
83
97
  resources :tags
84
98
  resources :provider_credentials
99
+ resources :calibrations, only: [:index, :destroy]
85
100
  end
86
101
  end
87
102
 
@@ -1,3 +1,3 @@
1
1
  module CompletionKit
2
- VERSION = "0.6.0"
2
+ VERSION = "0.7.0"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: completion-kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Damien Bastin
@@ -238,6 +238,7 @@ files:
238
238
  - app/controllers/completion_kit/api/v1/calibrations_controller.rb
239
239
  - app/controllers/completion_kit/api/v1/datasets_controller.rb
240
240
  - app/controllers/completion_kit/api/v1/metric_groups_controller.rb
241
+ - app/controllers/completion_kit/api/v1/metric_versions_controller.rb
241
242
  - app/controllers/completion_kit/api/v1/metrics_controller.rb
242
243
  - app/controllers/completion_kit/api/v1/prompts_controller.rb
243
244
  - app/controllers/completion_kit/api/v1/provider_credentials_controller.rb
@@ -288,6 +289,7 @@ files:
288
289
  - app/models/completion_kit/suggestion.rb
289
290
  - app/models/completion_kit/tag.rb
290
291
  - app/models/completion_kit/tagging.rb
292
+ - app/models/concerns/completion_kit/has_job_status.rb
291
293
  - app/models/concerns/completion_kit/taggable.rb
292
294
  - app/services/completion_kit/anthropic_client.rb
293
295
  - app/services/completion_kit/api_config.rb
@@ -302,6 +304,7 @@ files:
302
304
  - app/services/completion_kit/mcp_tools/datasets.rb
303
305
  - app/services/completion_kit/mcp_tools/judges.rb
304
306
  - app/services/completion_kit/mcp_tools/metric_groups.rb
307
+ - app/services/completion_kit/mcp_tools/metric_versions.rb
305
308
  - app/services/completion_kit/mcp_tools/metrics.rb
306
309
  - app/services/completion_kit/mcp_tools/prompts.rb
307
310
  - app/services/completion_kit/mcp_tools/provider_credentials.rb