completion-kit 0.15.1 → 0.16.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: de445b87c636d825b79ee4a743b250ef0e7f8f16fdbde0562feb8c37f96f44d4
4
- data.tar.gz: c24bd4f6cdc431bed4017acd6cbbd818090248cd54872ba9247db68a248dfbc2
3
+ metadata.gz: c07d6c47ea3beb045a95e9ef2aa91a4e2afbe24668810826d7e50b1e8202b5d0
4
+ data.tar.gz: b60493ca5889e0a90c5168f3fbdf6ceab3c621cf7790152334c51b988c564aca
5
5
  SHA512:
6
- metadata.gz: 6e3381b4664127dd65fd3306dfbb51e8ec2102737bbc7c3b3a07d6b4281a147c9242bc929b10c9b1fa82cccb7cfec42abdb75092c8c88710294a75520b82cc41
7
- data.tar.gz: 3cfe75ecd0f396f100c1b61ce177284b8afe930e1e07ed386ab575a06e986f9f837cfc796b7a77ef01265486e848f0d1b55a67333ae5e05651bd48b98ad0ab9e
6
+ metadata.gz: af5b8ff0082999d04cc348a75b3fd91f2b4b5fab71424bccdfccb2f0431d18f9998175b50f89676f31144d3d8524a2da14c258c7306ce0c7aedbdd40ce9b89ea
7
+ data.tar.gz: e999cad7d73effda025db4d70b3c1d854e406158cd6308f2624a7bfb82b28abcd62af073dbc3bef6705dec4ecce1d45d1d832e89478837abbe91b220a407e489
@@ -4116,6 +4116,23 @@ table.ck-runs-table {
4116
4116
  height: 14px;
4117
4117
  }
4118
4118
 
4119
+ .ck-suggest-progress {
4120
+ margin: 1.5rem 0;
4121
+ padding: 1.5rem;
4122
+ border: 1px solid var(--ck-line);
4123
+ border-radius: var(--ck-radius-lg);
4124
+ background: var(--ck-surface);
4125
+ }
4126
+
4127
+ .ck-suggest-progress .ck-meta-copy {
4128
+ margin: 0.5rem 0 0;
4129
+ max-width: 62ch;
4130
+ }
4131
+
4132
+ .ck-suggest-progress form {
4133
+ margin-top: 1rem;
4134
+ }
4135
+
4119
4136
  .ck-suggest-reasoning {
4120
4137
  margin: 1.5rem 0;
4121
4138
  padding: 1.25rem 1.5rem;
@@ -137,14 +137,12 @@ module CompletionKit
137
137
  return
138
138
  end
139
139
 
140
- service = PromptImprovementService.new(@run)
141
- result = service.suggest
142
140
  suggestion = @run.suggestions.create!(
143
141
  prompt: @run.prompt,
144
- reasoning: result["reasoning"],
145
- suggested_template: result["suggested_template"],
146
- original_template: result["original_template"]
142
+ original_template: @run.prompt.template,
143
+ status: "pending"
147
144
  )
145
+ PromptSuggestionJob.perform_later(suggestion.id)
148
146
  redirect_to suggestion_path(suggestion, from: "run")
149
147
  end
150
148
 
@@ -8,6 +8,16 @@ module CompletionKit
8
8
  end
9
9
 
10
10
  def apply
11
+ if @suggestion.applied_at?
12
+ redirect_to suggestion_path(@suggestion), notice: "Suggestion already applied."
13
+ return
14
+ end
15
+
16
+ unless @suggestion.ready?
17
+ redirect_to suggestion_path(@suggestion), alert: "This suggestion isn't ready to apply yet."
18
+ return
19
+ end
20
+
11
21
  run = @suggestion.run
12
22
  new_prompt = run.prompt.clone_as_new_version(template: @suggestion.suggested_template)
13
23
  new_prompt.publish!
@@ -0,0 +1,55 @@
1
+ require "faraday"
2
+
3
+ module CompletionKit
4
+ class PromptSuggestionJob < ApplicationJob
5
+ queue_as :llm
6
+
7
+ retry_on Faraday::TimeoutError, Faraday::ConnectionFailed, wait: :polynomially_longer, attempts: 5
8
+ retry_on CompletionKit::RateLimitError, wait: :polynomially_longer, attempts: 5
9
+
10
+ rescue_from(StandardError) do |error|
11
+ Rails.error.report(error, handled: true, context: { job: self.class.name })
12
+ if @suggestion
13
+ @suggestion.update_columns(status: "failed")
14
+ broadcast(@suggestion)
15
+ end
16
+ end
17
+
18
+ def perform(suggestion_id)
19
+ @suggestion = Suggestion.find_by(id: suggestion_id)
20
+ return unless @suggestion
21
+
22
+ run = @suggestion.run
23
+ result = PromptImprovementService.new(run).suggest
24
+
25
+ if result["suggested_template"].blank?
26
+ @suggestion.update!(status: "failed")
27
+ broadcast(@suggestion)
28
+ return
29
+ end
30
+
31
+ summary = PromptImprovementValidator.new(run, result["suggested_template"]).call
32
+ @suggestion.update!(
33
+ reasoning: result["reasoning"],
34
+ suggested_template: result["suggested_template"],
35
+ validation_summary: summary,
36
+ status: "ready"
37
+ )
38
+ broadcast(@suggestion)
39
+ end
40
+
41
+ private
42
+
43
+ def broadcast(suggestion)
44
+ html = CompletionKit::ApplicationController.render(
45
+ partial: "completion_kit/suggestions/state",
46
+ locals: { suggestion: suggestion, run: suggestion.run }
47
+ )
48
+ Turbo::StreamsChannel.broadcast_replace_to(
49
+ "completion_kit_suggestion_#{suggestion.id}",
50
+ target: "ck-suggestion-status-#{suggestion.id}",
51
+ html: html
52
+ )
53
+ end
54
+ end
55
+ end
@@ -3,6 +3,32 @@ module CompletionKit
3
3
  belongs_to :run
4
4
  belongs_to :prompt
5
5
 
6
- validates :suggested_template, presence: true
6
+ serialize :validation_summary, coder: JSON
7
+
8
+ validates :suggested_template, presence: true, if: :ready?
9
+
10
+ def pending?
11
+ status == "pending"
12
+ end
13
+
14
+ def failed?
15
+ status == "failed"
16
+ end
17
+
18
+ def ready?
19
+ !pending? && !failed?
20
+ end
21
+
22
+ def validated?
23
+ vs = validation_summary
24
+ vs.present? && vs["after_avg"].present?
25
+ end
26
+
27
+ def net_negative?
28
+ return false unless validated?
29
+
30
+ vs = validation_summary
31
+ vs["after_avg"].to_f < vs["before_avg"].to_f || vs["regressed"].to_i > vs["improved"].to_i
32
+ end
7
33
  end
8
34
  end
@@ -39,14 +39,20 @@ module CompletionKit
39
39
  available_match = available_models.find { |model| model[:id] == model_name.to_s }
40
40
  return available_match[:provider] if available_match
41
41
 
42
- case model_name.to_s
43
- when /\Agpt-/
44
- "openai"
45
- when /\Aclaude-/
46
- "anthropic"
47
- else
48
- nil
49
- end
42
+ guess = case model_name.to_s
43
+ when /\Agpt-/ then "openai"
44
+ when /\Aclaude-/ then "anthropic"
45
+ end
46
+ configured = ProviderCredential.distinct.pluck(:provider)
47
+ return guess if configured.empty?
48
+
49
+ guess if guess && configured.include?(guess)
50
+ end
51
+
52
+ def self.default_judge_model
53
+ configured = CompletionKit.config.judge_model
54
+ configured = configured.call if configured.respond_to?(:call)
55
+ configured.presence || Model.for_judging.order(:provider, :display_name).first&.model_id
50
56
  end
51
57
 
52
58
  def self.valid_for_model?(model_name)
@@ -6,7 +6,7 @@ module CompletionKit
6
6
  class JudgeService
7
7
  def initialize(config = {})
8
8
  @config = config
9
- @judge_model = config[:judge_model] || CompletionKit.config.judge_model
9
+ @judge_model = config[:judge_model].presence || ApiConfig.default_judge_model
10
10
  @judge_client = LlmClient.for_model(@judge_model, ApiConfig.for_model(@judge_model))
11
11
  end
12
12
 
@@ -43,11 +43,23 @@ module CompletionKit
43
43
  judge_prompt += "\nCriteria: #{criteria}\n"
44
44
  end
45
45
 
46
+ judge_prompt += "\nScore strictly on the dimension described above. Do not raise or lower the score for qualities the rubric and criteria do not mention.\n"
47
+
46
48
  judge_prompt += human_examples_block(human_examples)
47
49
 
50
+ if prompt.present?
51
+ judge_prompt += <<~PROMPT
52
+
53
+ The prompt that generated the output is shown below for reference. Weigh it only when the dimension you are scoring is about adherence to what was asked: following instructions, matching a required format or schema, or hitting a requested tone or persona. For dimensions about the output's intrinsic quality, such as factual correctness or conciseness, judge the output on its own and ignore the prompt's specific rules. If the output breaks a prompt rule that is unrelated to the dimension you are scoring, such as a content restriction, a banned topic, or a length limit, do not lower the score for breaking it.
54
+
55
+ Original prompt: #{prompt}
56
+
57
+ Reminder: score only the dimension named in the criteria above.
58
+ PROMPT
59
+ end
60
+
48
61
  judge_prompt += <<~PROMPT
49
62
 
50
- Original prompt: #{prompt || "Not provided"}
51
63
  #{input_data.present? ? "Input data: #{input_data}" : ""}
52
64
  #{expected_output.present? ? "Expected output: #{expected_output}" : ""}
53
65
  AI output to evaluate: #{output}
@@ -52,7 +52,7 @@ module CompletionKit
52
52
  properties: {
53
53
  metric_id: {type: "integer"},
54
54
  count: {type: "integer", description: "How many variants to request (default 1, max 3). One focused rewrite beats five reworded copies."},
55
- model: {type: "string", description: "Override the model used to generate variants. Defaults to CompletionKit.config.judge_model."}
55
+ model: {type: "string", description: "Override the model used to generate variants. Defaults to the configured judge model or an available judging model."}
56
56
  },
57
57
  required: ["metric_id"]
58
58
  },
@@ -110,18 +110,25 @@ module CompletionKit
110
110
  return error_result("Judge-only runs don't have a prompt to improve.") if run.prompt.nil?
111
111
 
112
112
  result = PromptImprovementService.new(run).suggest
113
+ return error_result("The model didn't return a usable rewrite.") if result["suggested_template"].blank?
114
+
115
+ validation = PromptImprovementValidator.new(run, result["suggested_template"]).call
113
116
  suggestion = run.suggestions.create!(
114
117
  prompt: run.prompt,
115
118
  reasoning: result["reasoning"],
116
119
  suggested_template: result["suggested_template"],
117
- original_template: result["original_template"]
120
+ original_template: result["original_template"],
121
+ validation_summary: validation,
122
+ status: "ready"
118
123
  )
119
124
  text_result(
120
125
  suggestion_id: suggestion.id,
121
126
  prompt_id: run.prompt.id,
122
127
  reasoning: suggestion.reasoning,
123
128
  suggested_template: suggestion.suggested_template,
124
- original_template: suggestion.original_template
129
+ original_template: suggestion.original_template,
130
+ validation: validation,
131
+ net_negative: suggestion.net_negative?
125
132
  )
126
133
  end
127
134
  end
@@ -10,10 +10,12 @@ module CompletionKit
10
10
  @metric = metric
11
11
  n = count.to_i
12
12
  @count = n < 1 ? DEFAULT_VARIANT_COUNT : [n, MAX_VARIANT_COUNT].min
13
- @model = model || CompletionKit.config.judge_model
13
+ @model = model.presence || ApiConfig.default_judge_model
14
14
  end
15
15
 
16
16
  def call
17
+ raise CompletionKit::ConfigurationError, "No judging model available; set CompletionKit.config.judge_model or add a provider with a judging model" if @model.blank?
18
+
17
19
  client = LlmClient.for_model(@model, ApiConfig.for_model(@model))
18
20
  raw = client.generate_completion(build_meta_prompt, model: @model, max_tokens: 2500, temperature: DEFAULT_TEMPERATURE)
19
21
  parse(raw).first(@count)
@@ -15,8 +15,7 @@ module CompletionKit
15
15
  SAMPLE_PROMPT = {
16
16
  name: "Sample: Support reply",
17
17
  description: "A starter prompt. Drafts a warm, professional reply to a customer support ticket. Edit it or delete it; it's just here to get you going.",
18
- template: "You are a senior customer-support specialist. Write a warm, professional reply to this ticket. Acknowledge the customer's situation, be specific about next steps, and don't be defensive.\n\nTicket:\n{{ticket}}",
19
- llm_model: "gpt-4o-mini"
18
+ template: "You are a senior customer-support specialist. Write a warm, professional reply to this ticket. Acknowledge the customer's situation, be specific about next steps, and don't be defensive.\n\nTicket:\n{{ticket}}"
20
19
  }.freeze
21
20
 
22
21
  module_function
@@ -25,11 +24,15 @@ module CompletionKit
25
24
  return if CompletionKit::Prompt.exists? || CompletionKit::Dataset.exists?
26
25
 
27
26
  CompletionKit::Dataset.create!(name: "Sample: Customer tickets", csv_data: SAMPLE_CSV)
27
+
28
+ model = CompletionKit::Model.for_generation.order(:provider, :display_name).first&.model_id
29
+ return unless model
30
+
28
31
  CompletionKit::Prompt.create!(
29
32
  name: SAMPLE_PROMPT[:name],
30
33
  description: SAMPLE_PROMPT[:description],
31
34
  template: SAMPLE_PROMPT[:template],
32
- llm_model: SAMPLE_PROMPT[:llm_model]
35
+ llm_model: model
33
36
  )
34
37
  end
35
38
  end
@@ -0,0 +1,106 @@
1
+ require "json"
2
+
3
+ module CompletionKit
4
+ class PromptImprovementValidator
5
+ HELD_OUT_LIMIT = 30
6
+
7
+ Candidate = Struct.new(:template)
8
+
9
+ def initialize(run, candidate_template, generator: nil, judge: nil)
10
+ @run = run
11
+ @candidate = candidate_template
12
+ @generator = generator || method(:generate)
13
+ @judge = judge || method(:judge_score)
14
+ end
15
+
16
+ def call
17
+ rows = held_out.filter_map do |response|
18
+ new_text = @generator.call(response)
19
+ next if new_text.blank?
20
+
21
+ after = @judge.call(response, new_text)
22
+ next if after.nil?
23
+
24
+ row_for(response, after)
25
+ rescue StandardError
26
+ next
27
+ end
28
+ summarize(rows, @total.to_i, @total.to_i > HELD_OUT_LIMIT)
29
+ end
30
+
31
+ private
32
+
33
+ def held_out
34
+ scope = @run.responses
35
+ .where.not(response_text: [nil, ""])
36
+ .where.not(input_data: [nil, ""])
37
+ .where(id: Review.where.not(ai_score: nil).select(:response_id))
38
+ @total = scope.count
39
+ scope.order(:row_index).limit(HELD_OUT_LIMIT).to_a
40
+ end
41
+
42
+ def row_for(response, after)
43
+ before = response.score
44
+ {
45
+ "response_id" => response.id,
46
+ "before" => before.round(2),
47
+ "after" => after.to_f.round(2),
48
+ "delta" => (after.to_f - before).round(2)
49
+ }
50
+ end
51
+
52
+ def summarize(rows, total, capped)
53
+ improved = rows.count { |r| r["after"] > r["before"] }
54
+ regressed = rows.count { |r| r["after"] < r["before"] }
55
+ {
56
+ "total" => total,
57
+ "tested" => rows.size,
58
+ "capped" => capped,
59
+ "before_avg" => avg(rows.map { |r| r["before"] }),
60
+ "after_avg" => avg(rows.map { |r| r["after"] }),
61
+ "improved" => improved,
62
+ "regressed" => regressed,
63
+ "unchanged" => rows.size - improved - regressed,
64
+ "rows" => rows
65
+ }
66
+ end
67
+
68
+ def avg(values)
69
+ return nil if values.empty?
70
+
71
+ (values.sum / values.size).round(2)
72
+ end
73
+
74
+ def generate(response)
75
+ rendered = CsvProcessor.apply_variables(Candidate.new(@candidate), parse_input(response.input_data))
76
+ model = @run.prompt.llm_model
77
+ client = LlmClient.for_model(model, ApiConfig.for_model(model))
78
+ raise CompletionKit::ConfigurationError, client.configuration_errors.join(", ") unless client.configured?
79
+
80
+ text = client.generate_completion(rendered, model: model, temperature: @run.temperature)
81
+ raise StandardError, text if text.to_s.start_with?("Error:")
82
+
83
+ text
84
+ end
85
+
86
+ def judge_score(response, new_text)
87
+ config = ApiConfig.for_model(@run.judge_model).merge(judge_model: @run.judge_model)
88
+ judge = JudgeService.new(config)
89
+ scores = @run.metrics.filter_map do |metric|
90
+ judge.evaluate(
91
+ new_text, response.expected_output, @candidate,
92
+ criteria: metric.instruction.to_s,
93
+ rubric_text: metric.display_rubric_text,
94
+ input_data: response.input_data
95
+ )[:score]
96
+ end
97
+ avg(scores)
98
+ end
99
+
100
+ def parse_input(raw)
101
+ JSON.parse(raw)
102
+ rescue JSON::ParserError
103
+ {}
104
+ end
105
+ end
106
+ end
@@ -46,7 +46,7 @@
46
46
  <span class="ck-mcp-install-card__icon">&#9654;</span>
47
47
  Claude Code
48
48
  </div>
49
- <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "claude mcp add completion-kit \\\n --transport http \\\n --url #{base_url}/mcp \\\n --header \"Authorization: Bearer #{token}\"" %>
49
+ <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "claude mcp add --transport http completion-kit \\\n #{base_url}/mcp \\\n --header \"Authorization: Bearer #{token}\"" %>
50
50
  </div>
51
51
  <div class="ck-mcp-install-card">
52
52
  <div class="ck-mcp-install-card__header">
@@ -199,9 +199,10 @@
199
199
  </div>
200
200
  <div class="ck-api-endpoint">
201
201
  <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/datasets</p>
202
- <p class="ck-meta-copy">Create a dataset.</p>
203
- <p class="ck-api-params"><strong>Required:</strong>&ensp;<code>name</code>, <code>csv_data</code></p>
202
+ <p class="ck-meta-copy">Create a dataset from inline CSV or an uploaded CSV file.</p>
203
+ <p class="ck-api-params"><strong>Required:</strong>&ensp;<code>name</code>, and either <code>csv_data</code> (inline CSV) or a multipart <code>file</code> (CSV upload, preferred for large datasets)</p>
204
204
  <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/datasets \\\n -H \"Authorization: Bearer #{token}\" \\\n -H \"Content-Type: application/json\" \\\n -d '{\"name\": \"tickets\", \"csv_data\": \"text,expected_output\\\\nHello,Hi\"}'" %>
205
+ <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/datasets \\\n -H \"Authorization: Bearer #{token}\" \\\n -F \"name=tickets\" \\\n -F \"file=@tickets.csv\"" %>
205
206
  </div>
206
207
  <div class="ck-api-endpoint">
207
208
  <p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span>&ensp;<span class="ck-chip ck-chip--soft">PATCH</span>&ensp;<span class="ck-chip" style="color: var(--ck-danger);">DELETE</span> /api/v1/datasets/:id</p>
@@ -0,0 +1,16 @@
1
+ <% s = summary %>
2
+ <div class="ck-scoreboard">
3
+ <% if s["after_avg"] && s["before_avg"] %>
4
+ <p class="ck-scoreboard__headline">Scored <strong><%= s["after_avg"] %></strong> across <%= pluralize(s["tested"], "held-out response") %> <span class="ck-scoreboard__was">was <%= s["before_avg"] %></span></p>
5
+ <ul class="ck-scoreboard__tally">
6
+ <li class="ck-scoreboard__stat ck-scoreboard__stat--fix">Improved <strong><%= s["improved"] %></strong></li>
7
+ <li class="ck-scoreboard__stat ck-scoreboard__stat--keep">Held <strong><%= s["unchanged"] %></strong></li>
8
+ <li class="ck-scoreboard__stat ck-scoreboard__stat--break">Regressed <strong><%= s["regressed"] %></strong></li>
9
+ </ul>
10
+ <% else %>
11
+ <p class="ck-scoreboard__headline">Couldn't re-score this rewrite against the run's responses.</p>
12
+ <% end %>
13
+ <% if s["capped"] %>
14
+ <p class="ck-scoreboard__note">Tested against this run's 30 most recent responses.</p>
15
+ <% end %>
16
+ </div>
@@ -0,0 +1,56 @@
1
+ <div id="ck-suggestion-status-<%= suggestion.id %>" class="ck-suggestion-state">
2
+ <% if suggestion.pending? %>
3
+ <div class="ck-suggest-progress">
4
+ <p class="ck-kicker">Validating</p>
5
+ <p class="ck-meta-copy">Drafting a stronger prompt and re-scoring it against this run's responses. This page updates on its own when it finishes.</p>
6
+ </div>
7
+ <% elsif suggestion.failed? %>
8
+ <div class="ck-suggest-progress">
9
+ <p class="ck-kicker">Try again</p>
10
+ <p class="ck-meta-copy">We couldn't produce a validated rewrite this time. Review a few more responses, then try again.</p>
11
+ <%= button_to "Try again", suggest_run_path(run), method: :post, class: ck_button_classes(:light, variant: :outline), form_class: "inline-block" %>
12
+ </div>
13
+ <% else %>
14
+ <% if suggestion.validation_summary.present? %>
15
+ <%= render "completion_kit/suggestions/scoreboard", summary: suggestion.validation_summary %>
16
+ <% end %>
17
+
18
+ <div class="ck-suggest-reasoning">
19
+ <p class="ck-kicker">Why these changes</p>
20
+ <div class="ck-suggest-reasoning__body"><%= simple_format(suggestion.reasoning) %></div>
21
+ </div>
22
+
23
+ <div class="ck-suggest-diff">
24
+ <div class="ck-suggest-diff__pane">
25
+ <div class="ck-suggest-diff__header">
26
+ <span class="ck-suggest-diff__label ck-suggest-diff__label--before">Original prompt</span>
27
+ <span class="ck-suggest-diff__version"><%= suggestion.prompt.version_label %></span>
28
+ </div>
29
+ <pre class="ck-suggest-diff__code"><%= ck_word_diff_old(suggestion.original_template, suggestion.suggested_template) %></pre>
30
+ </div>
31
+ <div class="ck-suggest-diff__pane">
32
+ <div class="ck-suggest-diff__header">
33
+ <span class="ck-suggest-diff__label ck-suggest-diff__label--after">Suggested prompt</span>
34
+ </div>
35
+ <pre class="ck-suggest-diff__code"><%= ck_word_diff_new(suggestion.original_template, suggestion.suggested_template) %></pre>
36
+ </div>
37
+ </div>
38
+
39
+ <div class="ck-suggest-full">
40
+ <p class="ck-kicker">Full suggested prompt</p>
41
+ <pre class="ck-code ck-code--dark"><%= suggestion.suggested_template %></pre>
42
+ </div>
43
+
44
+ <div class="ck-actions">
45
+ <% if suggestion.applied_at? %>
46
+ <span class="ck-chip" style="background: var(--ck-success-soft); color: var(--ck-success);">Applied</span>
47
+ <% elsif !suggestion.validated? %>
48
+ <%= button_to "Apply anyway", apply_suggestion_path(suggestion), method: :post, class: ck_button_classes(:light, variant: :outline), form: { class: "inline-block", data: { turbo_confirm: "This rewrite couldn't be re-scored against the run's responses. Apply it anyway?" } } %>
49
+ <% elsif suggestion.net_negative? %>
50
+ <%= button_to "Apply anyway", apply_suggestion_path(suggestion), method: :post, class: ck_button_classes(:light, variant: :outline), form: { class: "inline-block", data: { turbo_confirm: "This rewrite scored lower than the original on the held-out responses. Apply it anyway?" } } %>
51
+ <% else %>
52
+ <%= button_to "Apply suggestion", apply_suggestion_path(suggestion), method: :post, class: ck_button_classes(:dark), form_class: "inline-block" %>
53
+ <% end %>
54
+ </div>
55
+ <% end %>
56
+ </div>
@@ -19,8 +19,6 @@
19
19
  &middot; <%= @run.responses.count %> responses scored
20
20
  <% if @run.avg_score %>
21
21
  <span class="<%= ck_badge_classes(ck_score_kind(@run.avg_score)) %>"><%= @run.avg_score %></span>
22
- <% else %>
23
- &mdash;
24
22
  <% end %>
25
23
  </p>
26
24
  </div>
@@ -30,36 +28,8 @@
30
28
  <% else %>
31
29
  <%= link_to "Back to prompt", prompt_path(@run.prompt), class: ck_button_classes(:light, variant: :outline) %>
32
30
  <% end %>
33
- <% if @suggestion.applied_at? %>
34
- <span class="ck-chip" style="background: var(--ck-success-soft); color: var(--ck-success);">Applied</span>
35
- <% else %>
36
- <%= button_to "Apply suggestion", apply_suggestion_path(@suggestion), method: :post, class: ck_button_classes(:dark), form_class: "inline-block" %>
37
- <% end %>
38
31
  </div>
39
32
  </section>
40
33
 
41
- <div class="ck-suggest-reasoning">
42
- <p class="ck-kicker">Why these changes</p>
43
- <div class="ck-suggest-reasoning__body"><%= simple_format(@suggestion.reasoning) %></div>
44
- </div>
45
-
46
- <div class="ck-suggest-diff">
47
- <div class="ck-suggest-diff__pane">
48
- <div class="ck-suggest-diff__header">
49
- <span class="ck-suggest-diff__label ck-suggest-diff__label--before">Original prompt</span>
50
- <span class="ck-suggest-diff__version"><%= @suggestion.prompt.version_label %></span>
51
- </div>
52
- <pre class="ck-suggest-diff__code"><%= ck_word_diff_old(@suggestion.original_template, @suggestion.suggested_template) %></pre>
53
- </div>
54
- <div class="ck-suggest-diff__pane">
55
- <div class="ck-suggest-diff__header">
56
- <span class="ck-suggest-diff__label ck-suggest-diff__label--after">Suggested prompt</span>
57
- </div>
58
- <pre class="ck-suggest-diff__code"><%= ck_word_diff_new(@suggestion.original_template, @suggestion.suggested_template) %></pre>
59
- </div>
60
- </div>
61
-
62
- <div class="ck-suggest-full">
63
- <p class="ck-kicker">Full suggested prompt</p>
64
- <pre class="ck-code ck-code--dark"><%= @suggestion.suggested_template %></pre>
65
- </div>
34
+ <%= turbo_stream_from "completion_kit_suggestion_#{@suggestion.id}" %>
35
+ <%= render "completion_kit/suggestions/state", suggestion: @suggestion, run: @run %>
@@ -0,0 +1,6 @@
1
+ class AddValidationToCompletionKitSuggestions < ActiveRecord::Migration[7.1]
2
+ def change
3
+ add_column :completion_kit_suggestions, :validation_summary, :text
4
+ add_column :completion_kit_suggestions, :status, :string, default: "ready", null: false
5
+ end
6
+ end
@@ -1,3 +1,3 @@
1
1
  module CompletionKit
2
- VERSION = "0.15.1"
2
+ VERSION = "0.16.0"
3
3
  end
@@ -21,7 +21,7 @@ module CompletionKit
21
21
  @ollama_api_key = ENV['OLLAMA_API_KEY']
22
22
  @ollama_api_endpoint = ENV['OLLAMA_API_ENDPOINT']
23
23
 
24
- @judge_model = "gpt-4.1"
24
+ @judge_model = nil
25
25
  @high_quality_threshold = 4
26
26
  @medium_quality_threshold = 3
27
27
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: completion-kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.15.1
4
+ version: 0.16.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Damien Bastin
@@ -268,6 +268,7 @@ files:
268
268
  - app/jobs/completion_kit/judge_review_job.rb
269
269
  - app/jobs/completion_kit/metric_suggestion_job.rb
270
270
  - app/jobs/completion_kit/model_discovery_job.rb
271
+ - app/jobs/completion_kit/prompt_suggestion_job.rb
271
272
  - app/jobs/completion_kit/run_completion_check_job.rb
272
273
  - app/mailers/completion_kit/application_mailer.rb
273
274
  - app/models/completion_kit/agreement.rb
@@ -324,6 +325,7 @@ files:
324
325
  - app/services/completion_kit/open_ai_client.rb
325
326
  - app/services/completion_kit/open_router_client.rb
326
327
  - app/services/completion_kit/prompt_improvement_service.rb
328
+ - app/services/completion_kit/prompt_improvement_validator.rb
327
329
  - app/services/completion_kit/provider_endpoint.rb
328
330
  - app/services/completion_kit/starter_metrics.rb
329
331
  - app/services/completion_kit/worker_health.rb
@@ -396,6 +398,8 @@ files:
396
398
  - app/views/completion_kit/runs/new.html.erb
397
399
  - app/views/completion_kit/runs/show.html.erb
398
400
  - app/views/completion_kit/shared/_settings_nav.html.erb
401
+ - app/views/completion_kit/suggestions/_scoreboard.html.erb
402
+ - app/views/completion_kit/suggestions/_state.html.erb
399
403
  - app/views/completion_kit/suggestions/show.html.erb
400
404
  - app/views/completion_kit/tags/_filter_bar.html.erb
401
405
  - app/views/completion_kit/tags/_form.html.erb
@@ -444,6 +448,7 @@ files:
444
448
  - db/migrate/20260531000002_backfill_review_metric_versions.rb
445
449
  - db/migrate/20260531000003_add_metric_version_fk_to_reviews.rb
446
450
  - db/migrate/20260531000004_rename_calibrations_to_agreements.rb
451
+ - db/migrate/20260611000001_add_validation_to_completion_kit_suggestions.rb
447
452
  - lib/completion-kit.rb
448
453
  - lib/completion_kit.rb
449
454
  - lib/completion_kit/concurrency_check.rb