RubyGems - completion-kit - Versions diffs - 0.15.1 → 0.16.0 - Mend

completion-kit 0.15.1 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: de445b87c636d825b79ee4a743b250ef0e7f8f16fdbde0562feb8c37f96f44d4
-  data.tar.gz: c24bd4f6cdc431bed4017acd6cbbd818090248cd54872ba9247db68a248dfbc2
+  metadata.gz: c07d6c47ea3beb045a95e9ef2aa91a4e2afbe24668810826d7e50b1e8202b5d0
+  data.tar.gz: b60493ca5889e0a90c5168f3fbdf6ceab3c621cf7790152334c51b988c564aca
 SHA512:
-  metadata.gz: 6e3381b4664127dd65fd3306dfbb51e8ec2102737bbc7c3b3a07d6b4281a147c9242bc929b10c9b1fa82cccb7cfec42abdb75092c8c88710294a75520b82cc41
-  data.tar.gz: 3cfe75ecd0f396f100c1b61ce177284b8afe930e1e07ed386ab575a06e986f9f837cfc796b7a77ef01265486e848f0d1b55a67333ae5e05651bd48b98ad0ab9e
+  metadata.gz: af5b8ff0082999d04cc348a75b3fd91f2b4b5fab71424bccdfccb2f0431d18f9998175b50f89676f31144d3d8524a2da14c258c7306ce0c7aedbdd40ce9b89ea
+  data.tar.gz: e999cad7d73effda025db4d70b3c1d854e406158cd6308f2624a7bfb82b28abcd62af073dbc3bef6705dec4ecce1d45d1d832e89478837abbe91b220a407e489

data/app/assets/stylesheets/completion_kit/application.css CHANGED Viewed

@@ -4116,6 +4116,23 @@ table.ck-runs-table {
   height: 14px;
 }
+/* Bordered panel wrapping the suggestion page's pending and failed states. */
+.ck-suggest-progress {
+  margin: 1.5rem 0;
+  padding: 1.5rem;
+  border: 1px solid var(--ck-line);
+  border-radius: var(--ck-radius-lg);
+  background: var(--ck-surface);
+}
+/* Explanatory copy inside the panel: capped width keeps the line length readable. */
+.ck-suggest-progress .ck-meta-copy {
+  margin: 0.5rem 0 0;
+  max-width: 62ch;
+}
+/* Spacing above any form rendered inside the panel (e.g. a retry button). */
+.ck-suggest-progress form {
+  margin-top: 1rem;
+}
 .ck-suggest-reasoning {
   margin: 1.5rem 0;
   padding: 1.25rem 1.5rem;

data/app/controllers/completion_kit/runs_controller.rb CHANGED Viewed

@@ -137,14 +137,12 @@ module CompletionKit
         return
       end
-      service = PromptImprovementService.new(@run)
-      result = service.suggest
       suggestion = @run.suggestions.create!(
         prompt: @run.prompt,
-        reasoning: result["reasoning"],
-        suggested_template: result["suggested_template"],
-        original_template: result["original_template"]
+        original_template: @run.prompt.template,
+        status: "pending"
       )
+      PromptSuggestionJob.perform_later(suggestion.id)
       redirect_to suggestion_path(suggestion, from: "run")
     end

data/app/controllers/completion_kit/suggestions_controller.rb CHANGED Viewed

@@ -8,6 +8,16 @@ module CompletionKit
     end
     def apply
+      if @suggestion.applied_at?
+        redirect_to suggestion_path(@suggestion), notice: "Suggestion already applied."
+        return
+      end
+      unless @suggestion.ready?
+        redirect_to suggestion_path(@suggestion), alert: "This suggestion isn't ready to apply yet."
+        return
+      end
       run = @suggestion.run
       new_prompt = run.prompt.clone_as_new_version(template: @suggestion.suggested_template)
       new_prompt.publish!

data/app/jobs/completion_kit/prompt_suggestion_job.rb ADDED Viewed

@@ -0,0 +1,55 @@
+require "faraday"
+module CompletionKit
+  class PromptSuggestionJob < ApplicationJob
+    queue_as :llm
+    retry_on Faraday::TimeoutError, Faraday::ConnectionFailed, wait: :polynomially_longer, attempts: 5
+    retry_on CompletionKit::RateLimitError, wait: :polynomially_longer, attempts: 5
+    rescue_from(StandardError) do |error|
+      Rails.error.report(error, handled: true, context: { job: self.class.name })
+      if @suggestion
+        @suggestion.update_columns(status: "failed")
+        broadcast(@suggestion)
+      end
+    end
+    def perform(suggestion_id)
+      @suggestion = Suggestion.find_by(id: suggestion_id)
+      return unless @suggestion
+      run = @suggestion.run
+      result = PromptImprovementService.new(run).suggest
+      if result["suggested_template"].blank?
+        @suggestion.update!(status: "failed")
+        broadcast(@suggestion)
+        return
+      end
+      summary = PromptImprovementValidator.new(run, result["suggested_template"]).call
+      @suggestion.update!(
+        reasoning: result["reasoning"],
+        suggested_template: result["suggested_template"],
+        validation_summary: summary,
+        status: "ready"
+      )
+      broadcast(@suggestion)
+    end
+    private
+    def broadcast(suggestion)
+      html = CompletionKit::ApplicationController.render(
+        partial: "completion_kit/suggestions/state",
+        locals: { suggestion: suggestion, run: suggestion.run }
+      )
+      Turbo::StreamsChannel.broadcast_replace_to(
+        "completion_kit_suggestion_#{suggestion.id}",
+        target: "ck-suggestion-status-#{suggestion.id}",
+        html: html
+      )
+    end
+  end
+end

data/app/models/completion_kit/suggestion.rb CHANGED Viewed

@@ -3,6 +3,32 @@ module CompletionKit
     belongs_to :run
     belongs_to :prompt
-    validates :suggested_template, presence: true
+    serialize :validation_summary, coder: JSON
+    validates :suggested_template, presence: true, if: :ready?
+    def pending?
+      status == "pending"
+    end
+    def failed?
+      status == "failed"
+    end
+    def ready?
+      !pending? && !failed?
+    end
+    def validated?
+      vs = validation_summary
+      vs.present? && vs["after_avg"].present?
+    end
+    def net_negative?
+      return false unless validated?
+      vs = validation_summary
+      vs["after_avg"].to_f < vs["before_avg"].to_f || vs["regressed"].to_i > vs["improved"].to_i
+    end
   end
 end

data/app/services/completion_kit/api_config.rb CHANGED Viewed

@@ -39,14 +39,20 @@ module CompletionKit
       available_match = available_models.find { |model| model[:id] == model_name.to_s }
       return available_match[:provider] if available_match
-      case model_name.to_s
-      when /\Agpt-/
-        "openai"
-      when /\Aclaude-/
-        "anthropic"
-      else
-        nil
-      end
+      guess = case model_name.to_s
+              when /\Agpt-/ then "openai"
+              when /\Aclaude-/ then "anthropic"
+              end
+      configured = ProviderCredential.distinct.pluck(:provider)
+      return guess if configured.empty?
+      guess if guess && configured.include?(guess)
+    end
+    def self.default_judge_model
+      configured = CompletionKit.config.judge_model
+      configured = configured.call if configured.respond_to?(:call)
+      configured.presence || Model.for_judging.order(:provider, :display_name).first&.model_id
     end
     def self.valid_for_model?(model_name)

data/app/services/completion_kit/judge_service.rb CHANGED Viewed

@@ -6,7 +6,7 @@ module CompletionKit
   class JudgeService
     def initialize(config = {})
       @config = config
-      @judge_model = config[:judge_model] || CompletionKit.config.judge_model
+      @judge_model = config[:judge_model].presence || ApiConfig.default_judge_model
       @judge_client = LlmClient.for_model(@judge_model, ApiConfig.for_model(@judge_model))
     end
@@ -43,11 +43,23 @@ module CompletionKit
         judge_prompt += "\nCriteria: #{criteria}\n"
       end
+      judge_prompt += "\nScore strictly on the dimension described above. Do not raise or lower the score for qualities the rubric and criteria do not mention.\n"
       judge_prompt += human_examples_block(human_examples)
+      if prompt.present?
+        judge_prompt += <<~PROMPT
+          The prompt that generated the output is shown below for reference. Weigh it only when the dimension you are scoring is about adherence to what was asked: following instructions, matching a required format or schema, or hitting a requested tone or persona. For dimensions about the output's intrinsic quality, such as factual correctness or conciseness, judge the output on its own and ignore the prompt's specific rules. If the output breaks a prompt rule that is unrelated to the dimension you are scoring, such as a content restriction, a banned topic, or a length limit, do not lower the score for breaking it.
+          Original prompt: #{prompt}
+          Reminder: score only the dimension named in the criteria above.
+        PROMPT
+      end
       judge_prompt += <<~PROMPT
-        Original prompt: #{prompt || "Not provided"}
         #{input_data.present? ? "Input data: #{input_data}" : ""}
         #{expected_output.present? ? "Expected output: #{expected_output}" : ""}
         AI output to evaluate: #{output}

data/app/services/completion_kit/mcp_tools/metrics.rb CHANGED Viewed

@@ -52,7 +52,7 @@ module CompletionKit
             properties: {
               metric_id: {type: "integer"},
               count: {type: "integer", description: "How many variants to request (default 1, max 3). One focused rewrite beats five reworded copies."},
-              model: {type: "string", description: "Override the model used to generate variants. Defaults to CompletionKit.config.judge_model."}
+              model: {type: "string", description: "Override the model used to generate variants. Defaults to the configured judge model or an available judging model."}
             },
             required: ["metric_id"]
           },

data/app/services/completion_kit/mcp_tools/prompts.rb CHANGED Viewed

@@ -110,18 +110,25 @@ module CompletionKit
         return error_result("Judge-only runs don't have a prompt to improve.") if run.prompt.nil?
         result = PromptImprovementService.new(run).suggest
+        return error_result("The model didn't return a usable rewrite.") if result["suggested_template"].blank?
+        validation = PromptImprovementValidator.new(run, result["suggested_template"]).call
         suggestion = run.suggestions.create!(
           prompt: run.prompt,
           reasoning: result["reasoning"],
           suggested_template: result["suggested_template"],
-          original_template: result["original_template"]
+          original_template: result["original_template"],
+          validation_summary: validation,
+          status: "ready"
         )
         text_result(
           suggestion_id: suggestion.id,
           prompt_id: run.prompt.id,
           reasoning: suggestion.reasoning,
           suggested_template: suggestion.suggested_template,
-          original_template: suggestion.original_template
+          original_template: suggestion.original_template,
+          validation: validation,
+          net_negative: suggestion.net_negative?
         )
       end
     end

data/app/services/completion_kit/metric_variant_generator.rb CHANGED Viewed

@@ -10,10 +10,12 @@ module CompletionKit
       @metric = metric
       n = count.to_i
       @count = n < 1 ? DEFAULT_VARIANT_COUNT : [n, MAX_VARIANT_COUNT].min
-      @model = model || CompletionKit.config.judge_model
+      @model = model.presence || ApiConfig.default_judge_model
     end
     def call
+      raise CompletionKit::ConfigurationError, "No judging model available; set CompletionKit.config.judge_model or add a provider with a judging model" if @model.blank?
       client = LlmClient.for_model(@model, ApiConfig.for_model(@model))
       raw = client.generate_completion(build_meta_prompt, model: @model, max_tokens: 2500, temperature: DEFAULT_TEMPERATURE)
       parse(raw).first(@count)

data/app/services/completion_kit/onboarding/sample_data.rb CHANGED Viewed

@@ -15,8 +15,7 @@ module CompletionKit
       SAMPLE_PROMPT = {
         name: "Sample: Support reply",
         description: "A starter prompt. Drafts a warm, professional reply to a customer support ticket. Edit it or delete it; it's just here to get you going.",
-        template: "You are a senior customer-support specialist. Write a warm, professional reply to this ticket. Acknowledge the customer's situation, be specific about next steps, and don't be defensive.\n\nTicket:\n{{ticket}}",
-        llm_model: "gpt-4o-mini"
+        template: "You are a senior customer-support specialist. Write a warm, professional reply to this ticket. Acknowledge the customer's situation, be specific about next steps, and don't be defensive.\n\nTicket:\n{{ticket}}"
       }.freeze
       module_function
@@ -25,11 +24,15 @@ module CompletionKit
         return if CompletionKit::Prompt.exists? || CompletionKit::Dataset.exists?
         CompletionKit::Dataset.create!(name: "Sample: Customer tickets", csv_data: SAMPLE_CSV)
+        model = CompletionKit::Model.for_generation.order(:provider, :display_name).first&.model_id
+        return unless model
         CompletionKit::Prompt.create!(
           name: SAMPLE_PROMPT[:name],
           description: SAMPLE_PROMPT[:description],
           template: SAMPLE_PROMPT[:template],
-          llm_model: SAMPLE_PROMPT[:llm_model]
+          llm_model: model
         )
       end
     end

data/app/services/completion_kit/prompt_improvement_validator.rb ADDED Viewed

@@ -0,0 +1,106 @@
+require "json"
+module CompletionKit
+  class PromptImprovementValidator
+    HELD_OUT_LIMIT = 30
+    Candidate = Struct.new(:template)
+    def initialize(run, candidate_template, generator: nil, judge: nil)
+      @run = run
+      @candidate = candidate_template
+      @generator = generator || method(:generate)
+      @judge = judge || method(:judge_score)
+    end
+    def call
+      rows = held_out.filter_map do |response|
+        new_text = @generator.call(response)
+        next if new_text.blank?
+        after = @judge.call(response, new_text)
+        next if after.nil?
+        row_for(response, after)
+      rescue StandardError
+        next
+      end
+      summarize(rows, @total.to_i, @total.to_i > HELD_OUT_LIMIT)
+    end
+    private
+    def held_out
+      scope = @run.responses
+                  .where.not(response_text: [nil, ""])
+                  .where.not(input_data: [nil, ""])
+                  .where(id: Review.where.not(ai_score: nil).select(:response_id))
+      @total = scope.count
+      scope.order(:row_index).limit(HELD_OUT_LIMIT).to_a
+    end
+    def row_for(response, after)
+      before = response.score
+      {
+        "response_id" => response.id,
+        "before" => before.round(2),
+        "after" => after.to_f.round(2),
+        "delta" => (after.to_f - before).round(2)
+      }
+    end
+    def summarize(rows, total, capped)
+      improved = rows.count { |r| r["after"] > r["before"] }
+      regressed = rows.count { |r| r["after"] < r["before"] }
+      {
+        "total" => total,
+        "tested" => rows.size,
+        "capped" => capped,
+        "before_avg" => avg(rows.map { |r| r["before"] }),
+        "after_avg" => avg(rows.map { |r| r["after"] }),
+        "improved" => improved,
+        "regressed" => regressed,
+        "unchanged" => rows.size - improved - regressed,
+        "rows" => rows
+      }
+    end
+    def avg(values)
+      return nil if values.empty?
+      (values.sum / values.size).round(2)
+    end
+    def generate(response)
+      rendered = CsvProcessor.apply_variables(Candidate.new(@candidate), parse_input(response.input_data))
+      model = @run.prompt.llm_model
+      client = LlmClient.for_model(model, ApiConfig.for_model(model))
+      raise CompletionKit::ConfigurationError, client.configuration_errors.join(", ") unless client.configured?
+      text = client.generate_completion(rendered, model: model, temperature: @run.temperature)
+      raise StandardError, text if text.to_s.start_with?("Error:")
+      text
+    end
+    def judge_score(response, new_text)
+      config = ApiConfig.for_model(@run.judge_model).merge(judge_model: @run.judge_model)
+      judge = JudgeService.new(config)
+      scores = @run.metrics.filter_map do |metric|
+        judge.evaluate(
+          new_text, response.expected_output, @candidate,
+          criteria: metric.instruction.to_s,
+          rubric_text: metric.display_rubric_text,
+          input_data: response.input_data
+        )[:score]
+      end
+      avg(scores)
+    end
+    def parse_input(raw)
+      JSON.parse(raw)
+    rescue JSON::ParserError
+      {}
+    end
+  end
+end

data/app/views/completion_kit/api_reference/_body.html.erb CHANGED Viewed

@@ -46,7 +46,7 @@
             <span class="ck-mcp-install-card__icon">&#9654;</span>
             Claude Code
           </div>
-          <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "claude mcp add completion-kit \\\n  --transport http \\\n  --url #{base_url}/mcp \\\n  --header \"Authorization: Bearer #{token}\"" %>
+          <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "claude mcp add --transport http completion-kit \\\n  #{base_url}/mcp \\\n  --header \"Authorization: Bearer #{token}\"" %>
         </div>
         <div class="ck-mcp-install-card">
           <div class="ck-mcp-install-card__header">
@@ -199,9 +199,10 @@
       </div>
       <div class="ck-api-endpoint">
         <p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/datasets</p>
-        <p class="ck-meta-copy">Create a dataset.</p>
-        <p class="ck-api-params"><strong>Required:</strong>&ensp;<code>name</code>, <code>csv_data</code></p>
+        <p class="ck-meta-copy">Create a dataset from inline CSV or an uploaded CSV file.</p>
+        <p class="ck-api-params"><strong>Required:</strong>&ensp;<code>name</code>, and either <code>csv_data</code> (inline CSV) or a multipart <code>file</code> (CSV upload, preferred for large datasets)</p>
         <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/datasets \\\n  -H \"Authorization: Bearer #{token}\" \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\"name\": \"tickets\", \"csv_data\": \"text,expected_output\\\\nHello,Hi\"}'" %>
+        <%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/datasets \\\n  -H \"Authorization: Bearer #{token}\" \\\n  -F \"name=tickets\" \\\n  -F \"file=@tickets.csv\"" %>
       </div>
       <div class="ck-api-endpoint">
         <p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span>&ensp;<span class="ck-chip ck-chip--soft">PATCH</span>&ensp;<span class="ck-chip" style="color: var(--ck-danger);">DELETE</span> /api/v1/datasets/:id</p>

data/app/views/completion_kit/suggestions/_scoreboard.html.erb ADDED Viewed

@@ -0,0 +1,16 @@
+<%# Validation scoreboard. `summary` is the string-keyed hash stored in the suggestion's validation_summary. %>
+<% s = summary %>
+<div class="ck-scoreboard">
+  <% if s["after_avg"] && s["before_avg"] %>
+    <p class="ck-scoreboard__headline">Scored <strong><%= s["after_avg"] %></strong> across <%= pluralize(s["tested"], "held-out response") %> <span class="ck-scoreboard__was">was <%= s["before_avg"] %></span></p>
+    <ul class="ck-scoreboard__tally">
+      <li class="ck-scoreboard__stat ck-scoreboard__stat--fix">Improved <strong><%= s["improved"] %></strong></li>
+      <li class="ck-scoreboard__stat ck-scoreboard__stat--keep">Held <strong><%= s["unchanged"] %></strong></li>
+      <li class="ck-scoreboard__stat ck-scoreboard__stat--break">Regressed <strong><%= s["regressed"] %></strong></li>
+    </ul>
+  <% else %>
+    <p class="ck-scoreboard__headline">Couldn't re-score this rewrite against the run's responses.</p>
+  <% end %>
+  <% if s["capped"] %>
+    <%# The validator samples in row_index order, not by recency, and the cap comes from its HELD_OUT_LIMIT constant. %>
+    <p class="ck-scoreboard__note">Tested against the first <%= CompletionKit::PromptImprovementValidator::HELD_OUT_LIMIT %> of this run's <%= s["total"] %> scored responses.</p>
+  <% end %>
+</div>

data/app/views/completion_kit/suggestions/_state.html.erb ADDED Viewed

@@ -0,0 +1,56 @@
+<%# Suggestion state partial. Locals: `suggestion` and `run`. The wrapper id is the target that PromptSuggestionJob replaces when it broadcasts an update. %>
+<div id="ck-suggestion-status-<%= suggestion.id %>" class="ck-suggestion-state">
+  <% if suggestion.pending? %>
+    <%# Pending: the background job has not finished yet. %>
+    <div class="ck-suggest-progress">
+      <p class="ck-kicker">Validating</p>
+      <p class="ck-meta-copy">Drafting a stronger prompt and re-scoring it against this run's responses. This page updates on its own when it finishes.</p>
+    </div>
+  <% elsif suggestion.failed? %>
+    <%# Failed: offer a retry that posts to the run's suggest action again. %>
+    <div class="ck-suggest-progress">
+      <p class="ck-kicker">Try again</p>
+      <p class="ck-meta-copy">We couldn't produce a validated rewrite this time. Review a few more responses, then try again.</p>
+      <%= button_to "Try again", suggest_run_path(run), method: :post, class: ck_button_classes(:light, variant: :outline), form_class: "inline-block" %>
+    </div>
+  <% else %>
+    <%# Ready: scoreboard (when a validation summary exists), reasoning, word diff, full template and apply action. %>
+    <% if suggestion.validation_summary.present? %>
+      <%= render "completion_kit/suggestions/scoreboard", summary: suggestion.validation_summary %>
+    <% end %>
+    <div class="ck-suggest-reasoning">
+      <p class="ck-kicker">Why these changes</p>
+      <div class="ck-suggest-reasoning__body"><%= simple_format(suggestion.reasoning) %></div>
+    </div>
+    <div class="ck-suggest-diff">
+      <div class="ck-suggest-diff__pane">
+        <div class="ck-suggest-diff__header">
+          <span class="ck-suggest-diff__label ck-suggest-diff__label--before">Original prompt</span>
+          <span class="ck-suggest-diff__version"><%= suggestion.prompt.version_label %></span>
+        </div>
+        <pre class="ck-suggest-diff__code"><%= ck_word_diff_old(suggestion.original_template, suggestion.suggested_template) %></pre>
+      </div>
+      <div class="ck-suggest-diff__pane">
+        <div class="ck-suggest-diff__header">
+          <span class="ck-suggest-diff__label ck-suggest-diff__label--after">Suggested prompt</span>
+        </div>
+        <pre class="ck-suggest-diff__code"><%= ck_word_diff_new(suggestion.original_template, suggestion.suggested_template) %></pre>
+      </div>
+    </div>
+    <div class="ck-suggest-full">
+      <p class="ck-kicker">Full suggested prompt</p>
+      <pre class="ck-code ck-code--dark"><%= suggestion.suggested_template %></pre>
+    </div>
+    <div class="ck-actions">
+      <%# Apply action: already-applied chip, a confirm-gated "Apply anyway" for unvalidated or net-negative rewrites, otherwise a plain apply button. %>
+      <% if suggestion.applied_at? %>
+        <span class="ck-chip" style="background: var(--ck-success-soft); color: var(--ck-success);">Applied</span>
+      <% elsif !suggestion.validated? %>
+        <%= button_to "Apply anyway", apply_suggestion_path(suggestion), method: :post, class: ck_button_classes(:light, variant: :outline), form: { class: "inline-block", data: { turbo_confirm: "This rewrite couldn't be re-scored against the run's responses. Apply it anyway?" } } %>
+      <% elsif suggestion.net_negative? %>
+        <%= button_to "Apply anyway", apply_suggestion_path(suggestion), method: :post, class: ck_button_classes(:light, variant: :outline), form: { class: "inline-block", data: { turbo_confirm: "This rewrite scored lower than the original on the held-out responses. Apply it anyway?" } } %>
+      <% else %>
+        <%= button_to "Apply suggestion", apply_suggestion_path(suggestion), method: :post, class: ck_button_classes(:dark), form_class: "inline-block" %>
+      <% end %>
+    </div>
+  <% end %>
+</div>

data/app/views/completion_kit/suggestions/show.html.erb CHANGED Viewed

@@ -19,8 +19,6 @@
       &middot; <%= @run.responses.count %> responses scored
       <% if @run.avg_score %>
         <span class="<%= ck_badge_classes(ck_score_kind(@run.avg_score)) %>"><%= @run.avg_score %></span>
-      <% else %>
-        &mdash;
       <% end %>
     </p>
   </div>
@@ -30,36 +28,8 @@
     <% else %>
       <%= link_to "Back to prompt", prompt_path(@run.prompt), class: ck_button_classes(:light, variant: :outline) %>
     <% end %>
-    <% if @suggestion.applied_at? %>
-      <span class="ck-chip" style="background: var(--ck-success-soft); color: var(--ck-success);">Applied</span>
-    <% else %>
-      <%= button_to "Apply suggestion", apply_suggestion_path(@suggestion), method: :post, class: ck_button_classes(:dark), form_class: "inline-block" %>
-    <% end %>
   </div>
 </section>
-<div class="ck-suggest-reasoning">
-  <p class="ck-kicker">Why these changes</p>
-  <div class="ck-suggest-reasoning__body"><%= simple_format(@suggestion.reasoning) %></div>
-</div>
-<div class="ck-suggest-diff">
-  <div class="ck-suggest-diff__pane">
-    <div class="ck-suggest-diff__header">
-      <span class="ck-suggest-diff__label ck-suggest-diff__label--before">Original prompt</span>
-      <span class="ck-suggest-diff__version"><%= @suggestion.prompt.version_label %></span>
-    </div>
-    <pre class="ck-suggest-diff__code"><%= ck_word_diff_old(@suggestion.original_template, @suggestion.suggested_template) %></pre>
-  </div>
-  <div class="ck-suggest-diff__pane">
-    <div class="ck-suggest-diff__header">
-      <span class="ck-suggest-diff__label ck-suggest-diff__label--after">Suggested prompt</span>
-    </div>
-    <pre class="ck-suggest-diff__code"><%= ck_word_diff_new(@suggestion.original_template, @suggestion.suggested_template) %></pre>
-  </div>
-</div>
-<div class="ck-suggest-full">
-  <p class="ck-kicker">Full suggested prompt</p>
-  <pre class="ck-code ck-code--dark"><%= @suggestion.suggested_template %></pre>
-</div>
+<%= turbo_stream_from "completion_kit_suggestion_#{@suggestion.id}" %>
+<%= render "completion_kit/suggestions/state", suggestion: @suggestion, run: @run %>

data/db/migrate/20260611000001_add_validation_to_completion_kit_suggestions.rb ADDED Viewed

@@ -0,0 +1,6 @@
+class AddValidationToCompletionKitSuggestions < ActiveRecord::Migration[7.1] # Adds validation bookkeeping columns to completion_kit_suggestions.
+  def change
+    add_column :completion_kit_suggestions, :validation_summary, :text # text column; the Suggestion model serializes it with the JSON coder
+    add_column :completion_kit_suggestions, :status, :string, default: "ready", null: false # column default is "ready"; the code in this release also writes "pending" and "failed"
+  end
+end

data/lib/completion_kit/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module CompletionKit
-  VERSION = "0.15.1"
+  VERSION = "0.16.0"
 end

data/lib/completion_kit.rb CHANGED Viewed

@@ -21,7 +21,7 @@ module CompletionKit
       @ollama_api_key = ENV['OLLAMA_API_KEY']
       @ollama_api_endpoint = ENV['OLLAMA_API_ENDPOINT']
-      @judge_model = "gpt-4.1"
+      @judge_model = nil
       @high_quality_threshold = 4
       @medium_quality_threshold = 3

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: completion-kit
 version: !ruby/object:Gem::Version
-  version: 0.15.1
+  version: 0.16.0
 platform: ruby
 authors:
 - Damien Bastin
@@ -268,6 +268,7 @@ files:
 - app/jobs/completion_kit/judge_review_job.rb
 - app/jobs/completion_kit/metric_suggestion_job.rb
 - app/jobs/completion_kit/model_discovery_job.rb
+- app/jobs/completion_kit/prompt_suggestion_job.rb
 - app/jobs/completion_kit/run_completion_check_job.rb
 - app/mailers/completion_kit/application_mailer.rb
 - app/models/completion_kit/agreement.rb
@@ -324,6 +325,7 @@ files:
 - app/services/completion_kit/open_ai_client.rb
 - app/services/completion_kit/open_router_client.rb
 - app/services/completion_kit/prompt_improvement_service.rb
+- app/services/completion_kit/prompt_improvement_validator.rb
 - app/services/completion_kit/provider_endpoint.rb
 - app/services/completion_kit/starter_metrics.rb
 - app/services/completion_kit/worker_health.rb
@@ -396,6 +398,8 @@ files:
 - app/views/completion_kit/runs/new.html.erb
 - app/views/completion_kit/runs/show.html.erb
 - app/views/completion_kit/shared/_settings_nav.html.erb
+- app/views/completion_kit/suggestions/_scoreboard.html.erb
+- app/views/completion_kit/suggestions/_state.html.erb
 - app/views/completion_kit/suggestions/show.html.erb
 - app/views/completion_kit/tags/_filter_bar.html.erb
 - app/views/completion_kit/tags/_form.html.erb
@@ -444,6 +448,7 @@ files:
 - db/migrate/20260531000002_backfill_review_metric_versions.rb
 - db/migrate/20260531000003_add_metric_version_fk_to_reviews.rb
 - db/migrate/20260531000004_rename_calibrations_to_agreements.rb
+- db/migrate/20260611000001_add_validation_to_completion_kit_suggestions.rb
 - lib/completion-kit.rb
 - lib/completion_kit.rb
 - lib/completion_kit/concurrency_check.rb