completion-kit 0.5.44 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. checksums.yaml +4 -4
  2. data/app/assets/stylesheets/completion_kit/application.css +31 -4
  3. data/app/controllers/completion_kit/api/v1/base_controller.rb +22 -0
  4. data/app/controllers/completion_kit/api/v1/calibrations_controller.rb +22 -3
  5. data/app/controllers/completion_kit/api/v1/datasets_controller.rb +3 -1
  6. data/app/controllers/completion_kit/api/v1/metric_groups_controller.rb +3 -1
  7. data/app/controllers/completion_kit/api/v1/metric_versions_controller.rb +51 -0
  8. data/app/controllers/completion_kit/api/v1/metrics_controller.rb +49 -2
  9. data/app/controllers/completion_kit/api/v1/prompts_controller.rb +3 -1
  10. data/app/controllers/completion_kit/api/v1/provider_credentials_controller.rb +1 -1
  11. data/app/controllers/completion_kit/api/v1/responses_controller.rb +3 -1
  12. data/app/controllers/completion_kit/api/v1/runs_controller.rb +75 -2
  13. data/app/controllers/completion_kit/api/v1/tags_controller.rb +1 -1
  14. data/app/controllers/completion_kit/metrics_controller.rb +15 -5
  15. data/app/controllers/completion_kit/runs_controller.rb +64 -2
  16. data/app/helpers/completion_kit/application_helper.rb +0 -14
  17. data/app/jobs/completion_kit/generate_row_job.rb +3 -8
  18. data/app/jobs/completion_kit/judge_review_job.rb +6 -9
  19. data/app/models/completion_kit/calibration.rb +0 -4
  20. data/app/models/completion_kit/metric.rb +1 -0
  21. data/app/models/completion_kit/metric_version.rb +16 -1
  22. data/app/models/completion_kit/response.rb +13 -17
  23. data/app/models/completion_kit/review.rb +18 -22
  24. data/app/models/completion_kit/run.rb +58 -22
  25. data/app/models/concerns/completion_kit/has_job_status.rb +31 -0
  26. data/app/services/completion_kit/mcp_dispatcher.rb +3 -1
  27. data/app/services/completion_kit/mcp_tools/judges.rb +2 -4
  28. data/app/services/completion_kit/mcp_tools/metric_versions.rb +67 -0
  29. data/app/services/completion_kit/metric_variant_generator.rb +20 -6
  30. data/app/services/completion_kit/starter_metrics.rb +5 -5
  31. data/app/views/completion_kit/api_reference/_body.html.erb +91 -6
  32. data/app/views/completion_kit/api_reference/index.html.erb +8 -0
  33. data/app/views/completion_kit/calibrations/_trust_panel.html.erb +6 -1
  34. data/app/views/completion_kit/metrics/index.html.erb +3 -3
  35. data/app/views/completion_kit/metrics/show.html.erb +2 -1
  36. data/app/views/completion_kit/runs/_actions.html.erb +1 -0
  37. data/app/views/completion_kit/runs/compare.html.erb +85 -0
  38. data/app/views/completion_kit/runs/compare_picker.html.erb +39 -0
  39. data/app/views/completion_kit/runs/show.html.erb +8 -2
  40. data/config/routes.rb +18 -1
  41. data/lib/completion_kit/version.rb +1 -1
  42. metadata +6 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d81df0996441d12c0fb540b9f29bb514813adcdbea3ceefb515d318f28947731
4
- data.tar.gz: 606764f41e74cec3284f1155d7ef86e77a61af708af2320d5b02640827741f7a
3
+ metadata.gz: 285fba79d665c4fe077b42f0e8ce888ce84a314b95698a561eea9fb48c75b045
4
+ data.tar.gz: a984971e294bda341824e3696cab11662f82fcee2c50974798356754ba55126c
5
5
  SHA512:
6
- metadata.gz: 9e468cd12eb143f4b5eb64333339199420db4c9d0c78ec548965972eee5e326d574a80c6c3092d63f4d99d88901ce3470ac688468d2813f5370e589568fba669
7
- data.tar.gz: 7377f00a31d539297f9e79059083aa7bfef782d18d1ecfcb9f7da1ff648ce1eaf6f8a94bc55d56fcca22a47e09c7fcb1bc89981563aa351e4293c47f8d886570
6
+ metadata.gz: 1a12beaf77a9d8071949bede78336910eb8da43598609ac6307e09493d1c088a82cc3056ccaf63b3ae4eeb4ea022d19c182a05c0f037bc8a31ba21246cc5bd56
7
+ data.tar.gz: e007e6eeb9f7e89f3aa5ba8397338ce19778b5040f184b8cb6c012faf3f6ea464f6c3d5423fd7f7b36882494fbb90d70e068da7fe15bb76e67e9992585ba80c6
@@ -2834,6 +2834,19 @@ select.ck-input {
2834
2834
  }
2835
2835
  .ck-stale-versions-banner__body { min-width: 0; flex: 1 1 320px; }
2836
2836
  .ck-stale-versions-banner .ck-kicker { color: var(--ck-warning); }
2837
+
2838
+ .ck-delta {
2839
+ font-family: var(--ck-mono);
2840
+ font-size: 0.78rem;
2841
+ letter-spacing: 0.04em;
2842
+ padding: 2px 6px;
2843
+ border-radius: 4px;
2844
+ }
2845
+ .ck-delta--positive { color: var(--ck-success); background: var(--ck-success-soft); }
2846
+ .ck-delta--negative { color: var(--ck-danger); background: var(--ck-danger-soft); }
2847
+ .ck-delta--zero { color: var(--ck-dim); }
2848
+
2849
+ .ck-run-compare-table td { vertical-align: middle; }
2837
2850
  .ck-review-card__stale-note {
2838
2851
  margin: 0.4rem 0 0;
2839
2852
  font-family: var(--ck-mono);
@@ -3104,6 +3117,7 @@ select.ck-input {
3104
3117
  #ck-tab-datasets:checked ~ .ck-api-tabs__nav label[for="ck-tab-datasets"],
3105
3118
  #ck-tab-metrics:checked ~ .ck-api-tabs__nav label[for="ck-tab-metrics"],
3106
3119
  #ck-tab-metric-groups:checked ~ .ck-api-tabs__nav label[for="ck-tab-metric-groups"],
3120
+ #ck-tab-calibrations:checked ~ .ck-api-tabs__nav label[for="ck-tab-calibrations"],
3107
3121
  #ck-tab-tags:checked ~ .ck-api-tabs__nav label[for="ck-tab-tags"],
3108
3122
  #ck-tab-providers:checked ~ .ck-api-tabs__nav label[for="ck-tab-providers"] {
3109
3123
  color: var(--ck-accent);
@@ -3118,8 +3132,9 @@ select.ck-input {
3118
3132
  #ck-tab-datasets:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(5),
3119
3133
  #ck-tab-metrics:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(6),
3120
3134
  #ck-tab-metric-groups:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(7),
3121
- #ck-tab-tags:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(8),
3122
- #ck-tab-providers:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(9) {
3135
+ #ck-tab-calibrations:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(8),
3136
+ #ck-tab-tags:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(9),
3137
+ #ck-tab-providers:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(10) {
3123
3138
  display: block;
3124
3139
  }
3125
3140
 
@@ -3159,6 +3174,7 @@ select.ck-input {
3159
3174
  #ck-tab-datasets:checked ~ .ck-api-tabs__nav label[for="ck-tab-datasets"],
3160
3175
  #ck-tab-metrics:checked ~ .ck-api-tabs__nav label[for="ck-tab-metrics"],
3161
3176
  #ck-tab-metric-groups:checked ~ .ck-api-tabs__nav label[for="ck-tab-metric-groups"],
3177
+ #ck-tab-calibrations:checked ~ .ck-api-tabs__nav label[for="ck-tab-calibrations"],
3162
3178
  #ck-tab-tags:checked ~ .ck-api-tabs__nav label[for="ck-tab-tags"],
3163
3179
  #ck-tab-providers:checked ~ .ck-api-tabs__nav label[for="ck-tab-providers"] {
3164
3180
  border-left-color: transparent;
@@ -3590,6 +3606,11 @@ select.ck-input {
3590
3606
  border-color: var(--ck-line);
3591
3607
  color: var(--ck-dim);
3592
3608
  }
3609
+ .ck-source-chip--revert {
3610
+ border-color: rgba(245, 158, 11, 0.35);
3611
+ background: rgba(245, 158, 11, 0.08);
3612
+ color: rgb(217, 119, 6);
3613
+ }
3593
3614
  .ck-source-chip--current {
3594
3615
  border-color: var(--ck-line-strong);
3595
3616
  color: var(--ck-text);
@@ -6008,8 +6029,14 @@ a.tag-mark {
6008
6029
  }
6009
6030
  .ck-starter-grid {
6010
6031
  display: grid;
6011
- grid-template-columns: repeat(auto-fill, minmax(240px, 1fr));
6012
- gap: 10px;
6032
+ grid-template-columns: repeat(4, 1fr);
6033
+ gap: 12px;
6034
+ }
6035
+ @media (max-width: 1000px) {
6036
+ .ck-starter-grid { grid-template-columns: repeat(2, 1fr); }
6037
+ }
6038
+ @media (max-width: 600px) {
6039
+ .ck-starter-grid { grid-template-columns: 1fr; }
6013
6040
  }
6014
6041
  .ck-starter-card {
6015
6042
  display: flex;
@@ -25,6 +25,28 @@ module CompletionKit
25
25
  render json: {error: "Record not found"}, status: :not_found
26
26
  end
27
27
 
28
+ PAGINATION_DEFAULT_LIMIT = 50
29
+ PAGINATION_MAX_LIMIT = 500
30
+
31
# Applies limit/offset pagination to +scope+ and reports the window in
# response headers.
#
# params[:limit] falls back to PAGINATION_DEFAULT_LIMIT when blank or not
# positive, and is capped at PAGINATION_MAX_LIMIT. params[:offset] is
# floored at 0. X-Total-Count carries the count of the unpaginated scope;
# X-Limit and X-Offset carry the effective window.
#
# Returns scope.limit(limit).offset(offset).
def paginate(scope)
  total = scope.count

  requested = (params[:limit].presence || PAGINATION_DEFAULT_LIMIT).to_i
  limit = requested.positive? ? [requested, PAGINATION_MAX_LIMIT].min : PAGINATION_DEFAULT_LIMIT
  offset = [params[:offset].to_i, 0].max

  { "X-Total-Count" => total, "X-Limit" => limit, "X-Offset" => offset }.each do |header, value|
    response.set_header(header, value.to_s)
  end

  scope.limit(limit).offset(offset)
end
43
+
44
# Narrows +scope+ to records joined to a tag whose name is one of the values
# in params[:tag] (a single value or a list). Blank names are dropped; when
# nothing usable remains the scope is returned unchanged.
def filter_by_tags(scope)
  tag_names = Array(params[:tag]).map(&:to_s).reject(&:blank?)

  if tag_names.any?
    # distinct: a record matching several requested tags must appear once.
    scope.joins(:tags).where(completion_kit_tags: { name: tag_names }).distinct
  else
    scope
  end
end
49
+
28
50
  end
29
51
  end
30
52
  end
@@ -3,10 +3,18 @@ module CompletionKit
3
3
  module V1
4
4
  class CalibrationsController < BaseController
5
5
  before_action :ensure_calibration_enabled
6
- before_action :set_scope
6
+ before_action :set_nested_scope, only: [:create]
7
+ before_action :load_calibration, only: [:destroy]
7
8
 
8
9
  def index
9
- render json: scope_calibrations
10
+ scope = Calibration.all
11
+ scope = scope.where(run_id: params[:run_id]) if params[:run_id].present?
12
+ scope = scope.where(response_id: params[:response_id]) if params[:response_id].present?
13
+ scope = scope.where(metric_id: params[:metric_id]) if params[:metric_id].present?
14
+ scope = scope.where(metric_version_id: params[:metric_version_id]) if params[:metric_version_id].present?
15
+ scope = scope.where(created_by: params[:created_by]) if params[:created_by].present?
16
+ scope = scope.where(verdict: params[:verdict]) if params[:verdict].present?
17
+ render json: paginate(scope.order(:created_at))
10
18
  end
11
19
 
12
20
  def create
@@ -26,13 +34,18 @@ module CompletionKit
26
34
  end
27
35
  end
28
36
 
37
# Destroys the calibration held in @calibration and answers with an empty
# 204 No Content response.
def destroy
  @calibration.destroy!

  head :no_content
end
41
+
29
42
  private
30
43
 
31
44
  def ensure_calibration_enabled
32
45
  render(json: { error: "Calibration disabled" }, status: :not_found) unless CompletionKit.config.judge_calibration_enabled
33
46
  end
34
47
 
35
- def set_scope
48
+ def set_nested_scope
36
49
  @run = Run.find(params[:run_id])
37
50
  @response = @run.responses.find(params[:response_id])
38
51
  @metric = Metric.find(params[:metric_id])
@@ -40,6 +53,12 @@ module CompletionKit
40
53
  not_found
41
54
  end
42
55
 
56
# Loads the calibration identified by params[:id] into @calibration.
# A missing record is handed to not_found instead of raising.
def load_calibration
  begin
    @calibration = Calibration.find(params[:id])
  rescue ActiveRecord::RecordNotFound
    not_found
  end
end
61
+
43
62
  def scope_calibrations
44
63
  Calibration.where(run_id: @run.id, response_id: @response.id, metric_id: @metric.id)
45
64
  end
@@ -5,7 +5,9 @@ module CompletionKit
5
5
  before_action :set_dataset, only: [:show, :update, :destroy]
6
6
 
7
7
  def index
8
- render json: Dataset.includes(:tags).order(created_at: :desc)
8
+ scope = Dataset.includes(:tags)
9
+ scope = filter_by_tags(scope)
10
+ render json: paginate(scope.order(created_at: :desc))
9
11
  end
10
12
 
11
13
  def show
@@ -5,7 +5,9 @@ module CompletionKit
5
5
  before_action :set_metric_group, only: [:show, :update, :destroy]
6
6
 
7
7
  def index
8
- render json: MetricGroup.includes(:tags).order(created_at: :desc)
8
+ scope = MetricGroup.includes(:tags)
9
+ scope = filter_by_tags(scope)
10
+ render json: paginate(scope.order(created_at: :desc))
9
11
  end
10
12
 
11
13
  def show
@@ -0,0 +1,51 @@
1
module CompletionKit
  module Api
    module V1
      # JSON endpoints for the versions of one metric. Every action first
      # resolves the parent metric from params[:metric_id].
      class MetricVersionsController < BaseController
        before_action :set_metric
        before_action :set_version, only: %i[show publish destroy]

        # Lists the metric's versions, highest version_number first, paginated.
        def index
          versions = @metric.metric_versions.order(version_number: :desc)
          render json: paginate(versions)
        end

        # Renders a single version.
        def show
          render json: @version
        end

        # Publishes the version. A version that is already published but not
        # current goes through revert! and the value revert! returns is
        # rendered; every other version goes through publish! and is rendered
        # after a reload.
        def publish
          if !@version.published? || @version.current?
            @version.publish!
            render json: @version.reload
          else
            render json: @version.revert!
          end
        end

        # Destroys an unpublished version; published versions are refused
        # with 409 Conflict.
        def destroy
          if @version.published?
            render json: { error: "Cannot dismiss a published version. Publish a different version as current instead." }, status: :conflict
          else
            @version.destroy!
            head :no_content
          end
        end

        private

        # Resolves @metric from params[:metric_id]; missing records go to not_found.
        def set_metric
          begin
            @metric = Metric.find(params[:metric_id])
          rescue ActiveRecord::RecordNotFound
            not_found
          end
        end

        # Resolves @version among the metric's own versions, so an id that
        # belongs to another metric is treated as missing.
        def set_version
          begin
            @version = @metric.metric_versions.find(params[:id])
          rescue ActiveRecord::RecordNotFound
            not_found
          end
        end
      end
    end
  end
end
@@ -2,10 +2,12 @@ module CompletionKit
2
2
  module Api
3
3
  module V1
4
4
  class MetricsController < BaseController
5
- before_action :set_metric, only: [:show, :update, :destroy]
5
+ before_action :set_metric, only: [:show, :update, :destroy, :suggest_variants, :add_few_shot, :remove_few_shot]
6
6
 
7
7
  def index
8
- render json: Metric.includes(:tags).order(created_at: :desc)
8
+ scope = Metric.includes(:tags)
9
+ scope = filter_by_tags(scope)
10
+ render json: paginate(scope.order(created_at: :desc))
9
11
  end
10
12
 
11
13
  def show
@@ -34,6 +36,51 @@ module CompletionKit
34
36
  head :no_content
35
37
  end
36
38
 
39
# Generates suggested variants for @metric through MetricVariantGenerator and
# renders the persisted versions with 201 Created.
#
# Answers 422 when the metric has no "disagree" calibrations, and again when
# the generator comes back empty. Existing draft versions with source
# "suggestion" are destroyed before the generator runs.
def suggest_variants
  disagreements = Calibration.where(metric_id: @metric.id, verdict: "disagree")
  if disagreements.count.zero?
    return render(json: { error: "Mark at least one case as Disagree before asking the model to suggest a change." }, status: :unprocessable_entity)
  end

  MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all

  generator = MetricVariantGenerator.new(@metric, count: params[:count].to_i, model: params[:model])
  variants = generator.call
  if variants.empty?
    return render(json: { error: "The model returned no usable variants. Try again with a different model." }, status: :unprocessable_entity)
  end

  render json: generator.persist!(variants), status: :created
end
56
+
57
# Appends a few-shot example to @metric, built from one of its "disagree"
# calibrations (params[:calibration_id]), then renders the reloaded metric.
#
# The example records the response input/output, the judge's score and
# feedback from the response's review for this metric (when one exists), and
# the human's corrected score and note. Long text fields are truncated.
# Answers 404 when the calibration is not a disagree on this metric.
def add_few_shot
  calibration = Calibration.where(metric_id: @metric.id, verdict: "disagree").find(params[:calibration_id])
  judged_response = calibration.response
  review = judged_response.reviews.find_by(metric_id: @metric.id)

  example = {
    "input" => judged_response.input_data.to_s.truncate(2000),
    "response" => judged_response.response_text.to_s.truncate(2000),
    "judge_score" => review&.ai_score&.to_f,
    "judge_feedback" => review&.ai_feedback.to_s.truncate(1000),
    "human_score" => calibration.corrected_score&.to_f,
    "human_note" => calibration.note.to_s.truncate(1000),
    "calibration_id" => calibration.id,
    "added_at" => Time.current.utc.iso8601
  }

  @metric.update!(few_shot_examples: Array(@metric.few_shot_examples) + [example])
  render json: @metric.reload
rescue ActiveRecord::RecordNotFound
  render json: { error: "Calibration not found or not a disagree on this metric." }, status: :not_found
end
76
+
77
# Drops every few-shot example whose "calibration_id" equals
# params[:calibration_id] (compared as integers) and renders the reloaded metric.
def remove_few_shot
  target_id = params[:calibration_id].to_i
  kept = Array(@metric.few_shot_examples).select do |example|
    example["calibration_id"].to_i != target_id
  end

  @metric.update!(few_shot_examples: kept)
  render json: @metric.reload
end
83
+
37
84
  private
38
85
 
39
86
  def set_metric
@@ -5,7 +5,9 @@ module CompletionKit
5
5
  before_action :set_prompt, only: [:show, :update, :destroy, :publish]
6
6
 
7
7
  def index
8
- render json: Prompt.includes(:tags).order(created_at: :desc)
8
+ scope = Prompt.includes(:tags)
9
+ scope = filter_by_tags(scope)
10
+ render json: paginate(scope.order(created_at: :desc))
9
11
  end
10
12
 
11
13
  def show
@@ -5,7 +5,7 @@ module CompletionKit
5
5
  before_action :set_credential, only: [:show, :update, :destroy]
6
6
 
7
7
  def index
8
- render json: ProviderCredential.order(created_at: :desc)
8
+ render json: paginate(ProviderCredential.order(created_at: :desc))
9
9
  end
10
10
 
11
11
  def show
@@ -6,7 +6,9 @@ module CompletionKit
6
6
  before_action :set_response, only: [:show]
7
7
 
8
8
  def index
9
- render json: @run.responses.includes(:reviews)
9
+ scope = @run.responses.includes(:reviews)
10
+ scope = scope.where(status: params[:status]) if params[:status].present?
11
+ render json: paginate(scope.order(:id))
10
12
  end
11
13
 
12
14
  def show
@@ -2,10 +2,15 @@ module CompletionKit
2
2
  module Api
3
3
  module V1
4
4
  class RunsController < BaseController
5
- before_action :set_run, only: [:show, :update, :destroy, :generate, :retry_failures]
5
+ before_action :set_run, only: [:show, :update, :destroy, :generate, :retry_failures, :rerun, :regrade, :compare]
6
6
 
7
7
  def index
8
- render json: Run.includes(:tags).order(created_at: :desc)
8
+ scope = Run.includes(:tags)
9
+ scope = scope.where(status: params[:status]) if params[:status].present?
10
+ scope = scope.where(prompt_id: params[:prompt_id]) if params[:prompt_id].present?
11
+ scope = scope.where(dataset_id: params[:dataset_id]) if params[:dataset_id].present?
12
+ scope = filter_by_tags(scope)
13
+ render json: paginate(scope.order(created_at: :desc))
9
14
  end
10
15
 
11
16
  def show
@@ -71,8 +76,76 @@ module CompletionKit
71
76
  render json: @run.reload, status: :accepted
72
77
  end
73
78
 
79
# Creates a new pending run that copies @run's prompt, dataset, judge model,
# temperature, output column and tag names, attaches the same metrics, and
# starts it. Renders the new run with 202 Accepted when start! succeeds,
# otherwise 422 with the run's failure summary (or a generic message).
def rerun
  carried_over = %i[prompt_id dataset_id judge_model temperature output_column tag_names]
  attributes = carried_over.map { |attr| [attr, @run.public_send(attr)] }.to_h

  new_run = Run.create!(attributes.merge(status: "pending"))
  new_run.replace_metrics!(@run.metric_ids)

  unless new_run.start!
    message = new_run.failure_summary || "Could not start the new run."
    return render(json: { errors: [message] }, status: :unprocessable_entity)
  end

  render json: new_run.reload, status: :accepted
end
96
+
97
# Calls regrade! on @run. A truthy result renders the reloaded run with
# 202 Accepted; otherwise the request is answered with 422.
def regrade
  return render(json: @run.reload, status: :accepted) if @run.regrade!

  render json: { error: "Nothing to re-grade. The run has no succeeded responses or no metrics attached." }, status: :unprocessable_entity
end
104
+
105
# Compares @run (left) against the run named by params[:with] (right) and
# renders both run ids together with the comparison's metric ids and rows.
# Answers 404 when the other run cannot be found.
def compare
  other = Run.find(params[:with])
  comparison = build_run_comparison(@run, other)

  payload = { left_run_id: @run.id, right_run_id: other.id }
  payload[:metric_ids] = comparison[:metric_ids]
  payload[:rows] = comparison[:rows]
  render json: payload
rescue ActiveRecord::RecordNotFound
  render json: { error: "Other run not found. Pass ?with=<run_id>." }, status: :not_found
end
112
+
74
113
  private
75
114
 
115
# Builds the JSON-ready, row-by-row comparison of two runs.
#
# Each response of +left+ (ordered by row_index, then id) is paired with the
# first response of +right+ (same ordering) whose input_data has the same
# string form; unmatched left rows get a nil right side.
#
# Returns a hash with:
#   :metric_ids - unique, non-nil metric ids seen on any review of either run
#   :rows       - one hash per left response with :left_response_id,
#                 :right_response_id, :row_index and :per_metric. :per_metric
#                 holds one entry per metric that has a review on at least
#                 one side: both scores, both metric_version_ids, and :delta
#                 (right minus left, rounded to 2 decimals; nil unless both
#                 scores are present).
#
# No MetricVersion lookup happens here: the payload exposes version ids only,
# so loading the version records would be an unused query.
def build_run_comparison(left, right)
  left_responses = left.responses.includes(:reviews).order(:row_index, :id)
  right_responses = right.responses.includes(:reviews).order(:row_index, :id)

  # ||= keeps the first right-hand response when several share an input.
  right_by_input = right_responses.each_with_object({}) { |r, h| h[r.input_data.to_s] ||= r }

  all_reviews = left_responses.flat_map(&:reviews) + right_responses.flat_map(&:reviews)
  metric_ids = all_reviews.map(&:metric_id).compact.uniq

  rows = left_responses.map do |lr|
    rr = right_by_input[lr.input_data.to_s]

    per_metric = metric_ids.map do |mid|
      l_review = lr.reviews.find { |r| r.metric_id == mid }
      r_review = rr && rr.reviews.find { |r| r.metric_id == mid }
      anchor = l_review || r_review
      next if anchor.nil? # metric not reviewed on either side of this row

      left_score = l_review&.ai_score
      right_score = r_review&.ai_score
      {
        metric_id: mid,
        metric_name: anchor.metric_name,
        left_score: left_score,
        right_score: right_score,
        left_metric_version_id: l_review&.metric_version_id,
        right_metric_version_id: r_review&.metric_version_id,
        delta: (left_score && right_score) ? (right_score.to_f - left_score.to_f).round(2) : nil
      }
    end.compact

    {
      left_response_id: lr.id,
      right_response_id: rr&.id,
      row_index: lr.row_index,
      per_metric: per_metric
    }
  end

  { rows: rows, metric_ids: metric_ids }
end
148
+
76
149
  def set_run
77
150
  @run = Run.find(params[:id])
78
151
  rescue ActiveRecord::RecordNotFound
@@ -5,7 +5,7 @@ module CompletionKit
5
5
  before_action :set_tag, only: [:show, :update, :destroy]
6
6
 
7
7
  def index
8
- render json: Tag.order(:name)
8
+ render json: paginate(Tag.order(:name))
9
9
  end
10
10
 
11
11
  def show
@@ -42,8 +42,7 @@ module CompletionKit
42
42
  .limit(50)
43
43
  @edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
44
44
  @suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
45
- @improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree",
46
- metric_version_id: @published_metric_version.id).count
45
+ @improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
47
46
  @versions = MetricVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
48
47
  end
49
48
 
@@ -157,9 +156,20 @@ module CompletionKit
157
156
  return
158
157
  end
159
158
 
160
- version.publish!
161
- redirect_to metric_path(@metric),
162
- notice: "#{@metric.name} #{version.version_label} is now the published version."
159
+ was_published_already = version.published?
160
+ reverting = was_published_already && !version.current?
161
+ previously_current = MetricVersion.current.find_by(metric_id: @metric.id)
162
+
163
+ if reverting
164
+ audit = version.revert!
165
+ prior_label = previously_current.version_label
166
+ redirect_to metric_path(@metric),
167
+ notice: "Reverted to #{@metric.name} #{version.version_label} (now logged as #{audit.version_label}). Pinned cases still flow to the judge, and calibration verdicts collected against #{prior_label} stay tied to it."
168
+ else
169
+ version.publish!
170
+ redirect_to metric_path(@metric),
171
+ notice: "#{@metric.name} #{version.version_label} is now the published version."
172
+ end
163
173
  end
164
174
 
165
175
  def add_few_shot
@@ -1,7 +1,7 @@
1
1
  module CompletionKit
2
2
  class RunsController < ApplicationController
3
3
  include CompletionKit::TagFiltering
4
- before_action :set_run, only: [:show, :edit, :update, :destroy, :generate, :suggest, :retry_failures, :rerun, :refresh_status]
4
+ before_action :set_run, only: [:show, :edit, :update, :destroy, :generate, :suggest, :retry_failures, :rerun, :regrade, :refresh_status, :compare]
5
5
  before_action :load_form_collections, only: [:new, :edit, :create, :update]
6
6
 
7
7
  def index
@@ -78,6 +78,29 @@ module CompletionKit
78
78
  end
79
79
  end
80
80
 
81
# Side-by-side comparison page for @run.
#
# With params[:with] present, loads that run, builds the comparison and
# renders :compare. Without it, renders :compare_picker offering up to 50 of
# the most recently created other runs sharing @run's dataset and prompt.
def compare
  if params[:with].present?
    @other_run = Run.find(params[:with])
    @comparison = build_run_comparison(@run, @other_run)
    render(:compare)
  else
    candidates = Run.where(dataset_id: @run.dataset_id, prompt_id: @run.prompt_id)
    @other_runs = candidates.where.not(id: @run.id).order(created_at: :desc).limit(50)
    render(:compare_picker)
  end
end
95
+
96
# Calls regrade! on @run and redirects back to the run page, with a notice
# when regrade! returns truthy and an alert otherwise.
def regrade
  flash_options =
    if @run.regrade!
      { notice: "Re-grading existing responses with the current judge." }
    else
      { alert: "Nothing to re-grade. The run has no succeeded responses or no metrics attached." }
    end

  redirect_to run_path(@run), flash_options
end
103
+
81
104
  def rerun
82
105
  new_run = Run.create!(
83
106
  prompt_id: @run.prompt_id,
@@ -153,7 +176,7 @@ module CompletionKit
153
176
  failed_response_ids.each { |rid| GenerateRowJob.perform_later(@run.id, rid) }
154
177
  end
155
178
 
156
- @run.send(:broadcast_ui)
179
+ @run.broadcast_ui
157
180
  redirect_to run_path(@run)
158
181
  end
159
182
 
@@ -163,6 +186,45 @@ module CompletionKit
163
186
  @run = Run.find(params[:id])
164
187
  end
165
188
 
189
# Builds the view model for comparing two runs row by row.
#
# Each response of +left+ (ordered by row_index, then id) is paired with the
# first response of +right+ whose input_data has the same string form.
# Returns { rows:, metric_ids: }; every row carries both response records and
# a :per_metric list with, per metric reviewed on either side, both scores,
# both metric-version labels and the score delta (right minus left, rounded
# to 2 decimals, nil unless both scores are present).
def build_run_comparison(left, right)
  left_responses = left.responses.includes(:reviews).order(:row_index, :id)
  right_responses = right.responses.includes(:reviews).order(:row_index, :id)

  # ||= keeps the first right-hand response when several share an input.
  right_by_input = {}
  right_responses.each { |candidate| right_by_input[candidate.input_data.to_s] ||= candidate }

  all_reviews = [left_responses, right_responses].flat_map { |responses| responses.flat_map(&:reviews) }
  metric_ids = all_reviews.map(&:metric_id).compact.uniq
  version_ids = all_reviews.map(&:metric_version_id).compact.uniq
  metric_versions = MetricVersion.where(id: version_ids).index_by(&:id)

  rows = left_responses.map do |left_response|
    right_response = right_by_input[left_response.input_data.to_s]

    per_metric = metric_ids.map do |metric_id|
      left_review = left_response.reviews.find { |review| review.metric_id == metric_id }
      right_review = right_response && right_response.reviews.find { |review| review.metric_id == metric_id }
      anchor = left_review || right_review
      next if anchor.nil? # metric not reviewed on either side of this row

      left_score = left_review&.ai_score
      right_score = right_review&.ai_score
      {
        metric_id: metric_id,
        metric_name: anchor.metric_name,
        left_score: left_score,
        right_score: right_score,
        left_version_label: version_label_for(left_review, metric_versions),
        right_version_label: version_label_for(right_review, metric_versions),
        delta: (left_score && right_score) ? (right_score.to_f - left_score.to_f).round(2) : nil
      }
    end.compact

    { left_response: left_response, right_response: right_response, per_metric: per_metric }
  end

  { rows: rows, metric_ids: metric_ids }
end
222
+
223
# Returns the version_label of the metric version a review was graded with,
# looked up in +metric_versions+ (keyed by id). Returns nil when the review
# is nil, has no metric_version_id, or the id is missing from the lookup.
def version_label_for(review, metric_versions)
  version_id = review&.metric_version_id
  return nil if version_id.nil?

  metric_versions[version_id]&.version_label
end
227
+
166
228
  def load_form_collections
167
229
  @prompts = Prompt.order(:name)
168
230
  @datasets = Dataset.order(:name)
@@ -53,20 +53,6 @@ module CompletionKit
53
53
  end
54
54
  end
55
55
 
56
- def ck_run_status_label(run)
57
- case run.status
58
- when "pending" then "Ready to run"
59
- when "running"
60
- if run.progress_total.to_i > 0
61
- "Running (#{run.progress_current}/#{run.progress_total})"
62
- else
63
- "Running…"
64
- end
65
- when "completed" then "Completed"
66
- when "failed" then "Failed"
67
- else run.status.capitalize
68
- end
69
- end
70
56
 
71
57
  def ck_provider_label(provider)
72
58
  CompletionKit::ProviderCredential::PROVIDER_LABELS[provider.to_s] || provider.to_s.titleize
@@ -31,8 +31,7 @@ module CompletionKit
31
31
  before_perform do |job|
32
32
  response = Response.find_by(id: job.arguments.last)
33
33
  next unless response
34
- response.update_columns(status: "retrying", attempts: response.attempts + 1)
35
- response.run.send(:broadcast_response_update, response) if response.run
34
+ response.update!(status: "retrying", attempts: response.attempts + 1)
36
35
  end
37
36
 
38
37
  def perform(run_id, response_id)
@@ -61,12 +60,10 @@ module CompletionKit
61
60
  response_text: text,
62
61
  error_provider: nil, error_class: nil, error_status: nil, error_message: nil
63
62
  )
64
- run.send(:broadcast_response_update, response)
65
- run.send(:broadcast_progress)
66
63
 
67
64
  if run.judge_configured?
68
65
  run.metrics.each do |metric|
69
- JudgeReviewJob.perform_later(response.id, metric.id)
66
+ JudgeReviewJob.perform_later(response.id, metric.id, run.id)
70
67
  end
71
68
  end
72
69
 
@@ -87,15 +84,13 @@ module CompletionKit
87
84
  response = Response.find_by(id: response_id)
88
85
  return unless response
89
86
 
90
- response.update_columns(
87
+ response.update!(
91
88
  status: "failed",
92
89
  error_provider: provider_for(response),
93
90
  error_class: error.class.name,
94
91
  error_status: error.respond_to?(:status) ? error.status : nil,
95
92
  error_message: error.message.to_s.truncate(2000)
96
93
  )
97
- response.run&.send(:broadcast_response_update, response)
98
- response.run&.send(:broadcast_progress)
99
94
  end
100
95
 
101
96
  def provider_for(response)