RubyGems - completion-kit - Versions diffs - 0.5.44 → 0.7.0 - Mend

completion-kit 0.5.44 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

data/app/jobs/completion_kit/judge_review_job.rb CHANGED Viewed

@@ -5,7 +5,9 @@ module CompletionKit
     queue_as :llm
     limits_concurrency to: ENV.fetch("COMPLETION_KIT_PER_RUN_CONCURRENCY", 5).to_i,
-                       key: ->(response_id, _) { "run:#{Response.find_by(id: response_id)&.run_id}" },
+                       key: ->(response_id, _metric_id, run_id = nil) {
+                         "run:#{run_id || Response.where(id: response_id).pick(:run_id)}"
+                       },
                        duration: 10.minutes
     def self.rate_limit_wait(executions)
@@ -29,7 +31,7 @@ module CompletionKit
     end
     before_perform do |job|
-      response_id, metric_id = job.arguments
+      response_id, metric_id, _run_id = job.arguments
       response = Response.find_by(id: response_id)
       next unless response
       review = response.reviews.find_or_initialize_by(metric_id: metric_id)
@@ -37,10 +39,9 @@ module CompletionKit
       review.attempts = (review.attempts || 0) + 1
       review.status = "retrying"
       review.save!(validate: false)
-      response.run.send(:broadcast_response_update, response) if response.run
     end
-    def perform(response_id, metric_id)
+    def perform(response_id, metric_id, _run_id = nil)
       @response_id = response_id
       @metric_id = metric_id
@@ -75,8 +76,6 @@ module CompletionKit
       review.save!
       confirm_judging_capability(run.judge_model)
-      run.send(:broadcast_response_update, response)
-      run.send(:broadcast_progress)
       enqueue_completion_check
     end
@@ -107,13 +106,11 @@ module CompletionKit
         error_message: error.message.to_s.truncate(2000)
       )
       review.save!(validate: false)
-      response.run&.send(:broadcast_response_update, response)
-      response.run&.send(:broadcast_progress)
     end
     def provider_for(response)
       run = response.run
-      return nil unless run&.judge_model
+      return nil unless run.judge_model
       ApiConfig.provider_for_model(run.judge_model)
     end

data/app/models/completion_kit/calibration.rb CHANGED Viewed

@@ -7,10 +7,6 @@ module CompletionKit
     belongs_to :metric
     belongs_to :metric_version
-    alias_attribute :judge_version_id, :metric_version_id
-    alias_method :judge_version, :metric_version
-    alias_method :judge_version=, :metric_version=
     validates :verdict, presence: true, inclusion: { in: VERDICTS }
     validates :response_id,
               uniqueness: { scope: [:metric_id, :created_by] }

data/app/models/completion_kit/metric.rb CHANGED Viewed

@@ -12,6 +12,7 @@ module CompletionKit
     has_many :metric_group_memberships, dependent: :destroy
     has_many :metric_groups, through: :metric_group_memberships, source: :metric_group
+    has_many :metric_versions, dependent: :destroy
     has_many :reviews, dependent: :nullify
     has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy

data/app/models/completion_kit/metric_version.rb CHANGED Viewed

@@ -53,6 +53,22 @@ module CompletionKit
       self
     end
+    def revert!
+      raise ArgumentError, "only a published version can be reverted to" unless published?
+      audit = nil
+      MetricVersion.transaction do
+        audit = self.class.create!(
+          metric: metric,
+          instruction: instruction,
+          rubric_bands: rubric_bands,
+          state: "draft",
+          source: "revert"
+        )
+        audit.publish!
+      end
+      audit
+    end
     def as_json(options = {})
       {
         id: id,
@@ -77,5 +93,4 @@ module CompletionKit
     end
   end
-  JudgeVersion = MetricVersion
 end

data/app/models/completion_kit/response.rb CHANGED Viewed

@@ -1,7 +1,6 @@
 module CompletionKit
   class Response < ApplicationRecord
-    STATUSES = %w[pending retrying succeeded failed].freeze
-    TERMINAL_STATUSES = %w[succeeded failed].freeze
+    include HasJobStatus
     belongs_to :run
     has_many :reviews, dependent: :destroy
@@ -10,17 +9,11 @@ module CompletionKit
     delegate :prompt, to: :run
     validates :response_text, presence: true, if: :succeeded?
-    validates :status, inclusion: { in: STATUSES }
     before_validation :set_default_status, on: :create
-    def terminal?
-      TERMINAL_STATUSES.include?(status)
-    end
-    def succeeded?
-      status == "succeeded"
-    end
+    after_save_commit :broadcast_row_update, unless: :destroyed?
+    after_save_commit :broadcast_run_progress, if: :should_broadcast_progress?
     def as_json(options = {})
       {
@@ -47,19 +40,22 @@ module CompletionKit
     def fully_reviewed?
       metric_ids = run.metric_ids
       return true if metric_ids.empty?
-      reviewed_metric_ids = reviews.where(status: Review::TERMINAL_STATUSES).pluck(:metric_id).uniq
+      reviewed_metric_ids = reviews.where(status: HasJobStatus::TERMINAL_STATUSES).pluck(:metric_id).uniq
       (metric_ids - reviewed_metric_ids).empty?
     end
-    def error_payload
-      return nil if error_class.blank?
-      { provider: error_provider, class: error_class, status: error_status, message: error_message }
+    private
+    def broadcast_row_update
+      run.broadcast_response_update(self)
     end
-    private
+    def broadcast_run_progress
+      run.broadcast_progress
+    end
-    def set_default_status
-      self.status ||= "pending"
+    def should_broadcast_progress?
+      saved_change_to_status? && terminal?
     end
   end
 end

data/app/models/completion_kit/review.rb CHANGED Viewed

@@ -1,37 +1,25 @@
 module CompletionKit
   class Review < ApplicationRecord
-    STATUSES = %w[pending retrying succeeded failed].freeze
-    TERMINAL_STATUSES = %w[succeeded failed].freeze
+    include HasJobStatus
     belongs_to :response
     belongs_to :metric, optional: true
     belongs_to :metric_version, optional: true
     has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
-    def stale_against_current_judge?
-      return false unless metric_id && metric_version_id
-      current_id = MetricVersion.current.where(metric_id: metric_id).limit(1).pick(:id)
-      return false if current_id.nil?
-      metric_version_id != current_id
-    end
     validates :metric_name, presence: true
-    validates :status, inclusion: { in: STATUSES }
     validates :ai_score, numericality: { greater_than_or_equal_to: 1, less_than_or_equal_to: 5 }, allow_nil: true
     before_validation :set_default_status
-    def terminal?
-      TERMINAL_STATUSES.include?(status)
-    end
-    def succeeded?
-      status == "succeeded"
-    end
+    after_save_commit :broadcast_parent_row_update, unless: :destroyed?
+    after_save_commit :broadcast_run_progress, if: :should_broadcast_progress?
-    def error_payload
-      return nil if error_class.blank?
-      { provider: error_provider, class: error_class, status: error_status, message: error_message }
+    def stale_against_current_judge?
+      return false unless metric_id && metric_version_id
+      current_id = MetricVersion.current.where(metric_id: metric_id).limit(1).pick(:id)
+      return false if current_id.nil?
+      metric_version_id != current_id
     end
     def as_json(options = {})
@@ -46,8 +34,16 @@ module CompletionKit
     private
-    def set_default_status
-      self.status ||= "pending"
+    def broadcast_parent_row_update
+      response.run.broadcast_response_update(response)
+    end
+    def broadcast_run_progress
+      response.run.broadcast_progress
+    end
+    def should_broadcast_progress?
+      saved_change_to_status? && terminal?
     end
   end
 end

data/app/models/completion_kit/run.rb CHANGED Viewed

@@ -43,7 +43,7 @@ module CompletionKit
     end
     def outstanding_work_zero?
-      return false if responses.where.not(status: Response::TERMINAL_STATUSES).exists?
+      return false if responses.where.not(status: HasJobStatus::TERMINAL_STATUSES).exists?
       metric_ids = metrics.pluck(:id)
       return true if metric_ids.empty?
@@ -55,7 +55,7 @@ module CompletionKit
       terminal_review_count = Review.where(
         response_id: succeeded_response_ids,
         metric_id: metric_ids,
-        status: Review::TERMINAL_STATUSES
+        status: HasJobStatus::TERMINAL_STATUSES
       ).count
       terminal_review_count >= expected_reviews
@@ -118,6 +118,10 @@ module CompletionKit
     end
     def start!
+      unless %w[pending failed].include?(status)
+        return fail_with_summary!("Cannot start a run in state \"#{status}\". Use rerun to create a fresh copy, or retry_failures / regrade to work with the existing responses.")
+      end
       rows = if dataset
                CsvProcessor.process_self(self)
              else
@@ -161,7 +165,7 @@ module CompletionKit
           response = responses.create!(attrs)
           if judge_only?
-            metrics.each { |m| JudgeReviewJob.perform_later(response.id, m.id) } if judge_configured?
+            metrics.each { |m| JudgeReviewJob.perform_later(response.id, m.id, id) } if judge_configured?
           else
             GenerateRowJob.perform_later(id, response.id)
           end
@@ -179,6 +183,38 @@ module CompletionKit
       start!
     end
+    def regrade!
+      grading_metrics = metrics
+      return false if grading_metrics.empty? || !judge_configured?
+      eligible_responses = responses.where(status: "succeeded").where.not(response_text: nil)
+      response_ids = eligible_responses.pluck(:id)
+      return false if response_ids.empty?
+      transaction do
+        Review.where(response_id: response_ids).update_all(
+          status: "pending",
+          attempts: 0,
+          metric_version_id: nil,
+          ai_score: nil,
+          ai_feedback: nil,
+          error_provider: nil,
+          error_class: nil,
+          error_status: nil,
+          error_message: nil
+        )
+        update!(status: "running", failure_summary: nil, error_message: nil)
+        response_ids.each do |rid|
+          grading_metrics.each { |m| JudgeReviewJob.perform_later(rid, m.id, id) }
+        end
+        RunCompletionCheckJob.perform_later(id)
+      end
+      broadcast_ui
+      true
+    end
     def progress_snapshot
       generated_done = responses.where(status: "succeeded").count
       generated_failed = responses.where(status: "failed").count
@@ -240,17 +276,6 @@ module CompletionKit
       }
     end
-    private
-    def fail_with_summary!(message)
-      errors.add(:base, message)
-      if persisted?
-        update_columns(status: "failed", failure_summary: message, error_message: message)
-        broadcast_ui
-      end
-      false
-    end
     def broadcast_ui
       broadcast_progress
       broadcast_status_header
@@ -258,14 +283,6 @@ module CompletionKit
       broadcast_sort_toolbar
     end
-    def render_engine_partial(partial, locals)
-      CompletionKit::Engine.warm_routes!
-      CompletionKit::ApplicationController.render(
-        partial: partial,
-        locals: locals
-      )
-    end
     def broadcast_progress
       reload
       broadcast_replace_to(
@@ -324,6 +341,25 @@ module CompletionKit
       )
     end
+    private
+    def fail_with_summary!(message)
+      errors.add(:base, message)
+      if persisted?
+        update_columns(status: "failed", failure_summary: message, error_message: message)
+        broadcast_ui
+      end
+      false
+    end
+    def render_engine_partial(partial, locals)
+      CompletionKit::Engine.warm_routes!
+      CompletionKit::ApplicationController.render(
+        partial: partial,
+        locals: locals
+      )
+    end
     def set_default_status
       self.status ||= "pending"
     end

data/app/models/concerns/completion_kit/has_job_status.rb ADDED Viewed

@@ -0,0 +1,31 @@
+module CompletionKit
+  module HasJobStatus
+    extend ActiveSupport::Concern
+    STATUSES = %w[pending retrying succeeded failed].freeze
+    TERMINAL_STATUSES = %w[succeeded failed].freeze
+    included do
+      validates :status, inclusion: { in: STATUSES }
+    end
+    def terminal?
+      TERMINAL_STATUSES.include?(status)
+    end
+    def succeeded?
+      status == "succeeded"
+    end
+    def error_payload
+      return nil if error_class.blank?
+      { provider: error_provider, class: error_class, status: error_status, message: error_message }
+    end
+    private
+    def set_default_status
+      self.status ||= "pending"
+    end
+  end
+end

data/app/services/completion_kit/mcp_dispatcher.rb CHANGED Viewed

@@ -32,6 +32,7 @@ module CompletionKit
         McpTools::Datasets.definitions +
         McpTools::Metrics.definitions +
         McpTools::MetricGroups.definitions +
+        McpTools::MetricVersions.definitions +
         McpTools::ProviderCredentials.definitions +
         McpTools::Tags.definitions +
         McpTools::Calibrations.definitions +
@@ -44,8 +45,9 @@ module CompletionKit
       when /\Aruns_/                 then McpTools::Runs.call(name, arguments)
       when /\Aresponses_/            then McpTools::Responses.call(name, arguments)
       when /\Adatasets_/             then McpTools::Datasets.call(name, arguments)
-      when /\Ametrics_/              then McpTools::Metrics.call(name, arguments)
+      when /\Ametric_versions_/      then McpTools::MetricVersions.call(name, arguments)
       when /\Ametric_groups_/        then McpTools::MetricGroups.call(name, arguments)
+      when /\Ametrics_/              then McpTools::Metrics.call(name, arguments)
       when /\Aprovider_credentials_/ then McpTools::ProviderCredentials.call(name, arguments)
       when /\Atags_/                 then McpTools::Tags.call(name, arguments)
       when /\Acalibrations_/         then McpTools::Calibrations.call(name, arguments)

data/app/services/completion_kit/mcp_tools/judges.rb CHANGED Viewed

@@ -75,10 +75,8 @@ module CompletionKit
       def self.compare(args)
         metric = CompletionKit::Metric.find(args["metric_id"])
-        a_id = args["metric_version_a_id"] || args["judge_version_a_id"]
-        b_id = args["metric_version_b_id"] || args["judge_version_b_id"]
-        a = CompletionKit::MetricVersion.where(metric_id: metric.id).find(a_id)
-        b = CompletionKit::MetricVersion.where(metric_id: metric.id).find(b_id)
+        a = CompletionKit::MetricVersion.where(metric_id: metric.id).find(args["metric_version_a_id"])
+        b = CompletionKit::MetricVersion.where(metric_id: metric.id).find(args["metric_version_b_id"])
         stats_a = CompletionKit::MetricCalibrationStats.for(metric, metric_version: a)
         stats_b = CompletionKit::MetricCalibrationStats.for(metric, metric_version: b)
         text_result({

data/app/services/completion_kit/mcp_tools/metric_versions.rb ADDED Viewed

@@ -0,0 +1,67 @@
+module CompletionKit
+  module McpTools
+    module MetricVersions
+      extend Base
+      TOOLS = {
+        "metric_versions_list" => {
+          description: "List every MetricVersion (drafts + published) for a metric, newest first. Each row carries version_number, state, source, current flag, and timestamps.",
+          inputSchema: {
+            type: "object",
+            properties: {
+              metric_id: { type: "integer" }
+            },
+            required: ["metric_id"]
+          },
+          handler: :list
+        },
+        "metric_versions_publish" => {
+          description: "Publish a MetricVersion as the live judge for its metric. Works for both 'draft → published' and 'revert to an older published version → current'. Transactionally flips current, demotes peers, and writes the version's instruction + rubric_bands back onto the metric so the judge actually uses them.",
+          inputSchema: {
+            type: "object",
+            properties: {
+              metric_version_id: { type: "integer" }
+            },
+            required: ["metric_version_id"]
+          },
+          handler: :publish
+        },
+        "metric_versions_dismiss" => {
+          description: "Destroy a draft MetricVersion (use for either source: 'edit' or source: 'suggestion'). Published versions are refused — to demote a published version, publish a different one as current instead.",
+          inputSchema: {
+            type: "object",
+            properties: {
+              metric_version_id: { type: "integer" }
+            },
+            required: ["metric_version_id"]
+          },
+          handler: :dismiss
+        }
+      }.freeze
+      def self.list(args)
+        metric = CompletionKit::Metric.find(args["metric_id"])
+        versions = CompletionKit::MetricVersion.where(metric_id: metric.id).order(version_number: :desc)
+        text_result(versions.map(&:as_json))
+      end
+      def self.publish(args)
+        version = CompletionKit::MetricVersion.find(args["metric_version_id"])
+        if version.published? && !version.current?
+          audit = version.revert!
+          text_result(audit.as_json)
+        else
+          version.publish!
+          text_result(version.reload.as_json)
+        end
+      end
+      def self.dismiss(args)
+        version = CompletionKit::MetricVersion.find(args["metric_version_id"])
+        return error_result("Cannot dismiss a published version. Publish a different version as current instead.") if version.published?
+        version.destroy!
+        text_result({id: version.id, destroyed: true})
+      end
+    end
+  end
+end

data/app/services/completion_kit/metric_variant_generator.rb CHANGED Viewed

@@ -43,6 +43,7 @@ module CompletionKit
     def build_meta_prompt
       disagreements = MetricCalibrationExamples.disagreements_for(@metric)
       borderlines = MetricCalibrationExamples.borderlines_for(@metric)
+      pinned_examples = Array(@metric.few_shot_examples)
       sections = []
       sections << "You are an expert evaluator. The judge below is misaligned with humans. Propose #{@count == 1 ? "a single" : "#{@count}"} concrete rewrite that closes the gap."
       sections << ""
@@ -77,6 +78,18 @@ module CompletionKit
           sections << ""
         end
       end
+      if pinned_examples.any?
+        sections << "## Pinned cases the judge already references"
+        sections << "These are cases the operator pinned for the judge to remember. The improved rubric must remain consistent with these — that is, the new instruction + rubric should produce roughly the human_score on these inputs, not the judge_score."
+        pinned_examples.each_with_index do |ex, i|
+          sections << "### Pinned #{i + 1}"
+          sections << "Input: #{ex["input"].to_s.truncate(200)}"
+          sections << "Output: #{ex["response"].to_s.truncate(200)}"
+          sections << "Judge previously said #{ex["judge_score"]}/5: #{ex["judge_feedback"].to_s.truncate(160)}"
+          sections << "Human said #{ex["human_score"]}/5: #{ex["human_note"].to_s.truncate(160)}"
+          sections << ""
+        end
+      end
       sections << "## Task"
       sections << "Make one substantive change. Don't just reword. If the disagreements look like instruction problems, rewrite the instruction. If they look like rubric problems (overlapping bands, undefined edge cases), rewrite the rubric. Rewrite both if both are wrong."
       sections << ""
@@ -133,13 +146,14 @@ module CompletionKit
     end
     def calibrations_for(metric, verdict:, limit:)
-      scope = Calibration.where(metric_id: metric.id, verdict: verdict)
+      base = Calibration.where(metric_id: metric.id, verdict: verdict)
       current_version = MetricVersion.current.find_by(metric_id: metric.id)
-      scope = scope.where(metric_version_id: current_version.id) if current_version
-      scope.includes(response: :reviews)
-           .order(created_at: :desc)
-           .limit(limit)
-           .map do |cal|
+      scoped = current_version ? base.where(metric_version_id: current_version.id) : base
+      effective = scoped.exists? ? scoped : base
+      effective.includes(response: :reviews)
+               .order(created_at: :desc)
+               .limit(limit)
+               .map do |cal|
         review = cal.response.reviews.find { |r| r.metric_id == metric.id }
         {
           input: cal.response.input_data,

data/app/services/completion_kit/starter_metrics.rb CHANGED Viewed

@@ -21,8 +21,8 @@ module CompletionKit
         key: "instruction_following",
         name: "Instruction following",
         description: "Did the model do everything that was asked?",
-        catches: "The response is factually right but ignores \"answer in two sentences\", \"use bullet points\", \"do not include X\". Distinct from Correctness — a response can be right and still fail this.",
-        instruction: "Did the model do every concrete thing the prompt asked for? Score against the explicit requirements in the prompt (format constraints, count limits, exclusions, audience cues). Factual accuracy is a different dimension — score that elsewhere.",
+        catches: "The response is factually right but ignores \"answer in two sentences\", \"use bullet points\", \"do not include X\". Distinct from Correctness; a response can be right and still fail this.",
+        instruction: "Did the model do every concrete thing the prompt asked for? Score against the explicit requirements in the prompt (format constraints, count limits, exclusions, audience cues). Factual accuracy is a different dimension; score that elsewhere.",
         rubric_bands: [
           { "stars" => 5, "description" => "Followed every requirement in the prompt exactly." },
           { "stars" => 4, "description" => "Followed every requirement with a small slip." },
@@ -36,7 +36,7 @@ module CompletionKit
         name: "Format compliance",
         description: "Does the output follow the required structure?",
         catches: "Invalid JSON, missing schema fields, extra prose around a structured response, wrong casing on keys. Critical for any LLM wired into an API.",
-        instruction: "Does the output match the format the prompt asked for — JSON shape, schema, keys, casing, no stray prose? Score on whether a downstream parser would accept it without massaging.",
+        instruction: "Does the output match the format the prompt asked for: JSON shape, schema, keys, casing, no stray prose? Score on whether a downstream parser would accept it without massaging.",
         rubric_bands: [
           { "stars" => 5, "description" => "Exact spec, ready to consume programmatically." },
           { "stars" => 4, "description" => "Spec-compliant with one cosmetic issue." },
@@ -62,9 +62,9 @@ module CompletionKit
       Starter.new(
         key: "conciseness",
         name: "Conciseness",
-        description: "Is it the right length — no padding, no missing detail?",
+        description: "Is it the right length, no padding, no missing detail?",
         catches: "Rambling responses, repetitive caveats, over-hedging. LLMs default to verbose. Conciseness is the dimension where users most often see scores move after tuning.",
-        instruction: "Is the output the right length for the task — no padding, no missing detail, no hedging filler? Penalise rambling, repetition, over-caveating, and unnecessary preamble. Penalise too-short outputs that drop information.",
+        instruction: "Is the output the right length for the task: no padding, no missing detail, no hedging filler? Penalise rambling, repetition, over-caveating, and unnecessary preamble. Penalise too-short outputs that drop information.",
         rubric_bands: [
           { "stars" => 5, "description" => "Exactly as long as the task needs, no more, no less." },
           { "stars" => 4, "description" => "Right length with a small redundancy." },