completion-kit 0.5.44 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/stylesheets/completion_kit/application.css +31 -4
- data/app/controllers/completion_kit/api/v1/base_controller.rb +22 -0
- data/app/controllers/completion_kit/api/v1/calibrations_controller.rb +22 -3
- data/app/controllers/completion_kit/api/v1/datasets_controller.rb +3 -1
- data/app/controllers/completion_kit/api/v1/metric_groups_controller.rb +3 -1
- data/app/controllers/completion_kit/api/v1/metric_versions_controller.rb +51 -0
- data/app/controllers/completion_kit/api/v1/metrics_controller.rb +49 -2
- data/app/controllers/completion_kit/api/v1/prompts_controller.rb +3 -1
- data/app/controllers/completion_kit/api/v1/provider_credentials_controller.rb +1 -1
- data/app/controllers/completion_kit/api/v1/responses_controller.rb +3 -1
- data/app/controllers/completion_kit/api/v1/runs_controller.rb +75 -2
- data/app/controllers/completion_kit/api/v1/tags_controller.rb +1 -1
- data/app/controllers/completion_kit/metrics_controller.rb +15 -5
- data/app/controllers/completion_kit/runs_controller.rb +64 -2
- data/app/helpers/completion_kit/application_helper.rb +0 -14
- data/app/jobs/completion_kit/generate_row_job.rb +3 -8
- data/app/jobs/completion_kit/judge_review_job.rb +6 -9
- data/app/models/completion_kit/calibration.rb +0 -4
- data/app/models/completion_kit/metric.rb +1 -0
- data/app/models/completion_kit/metric_version.rb +16 -1
- data/app/models/completion_kit/response.rb +13 -17
- data/app/models/completion_kit/review.rb +18 -22
- data/app/models/completion_kit/run.rb +58 -22
- data/app/models/concerns/completion_kit/has_job_status.rb +31 -0
- data/app/services/completion_kit/mcp_dispatcher.rb +3 -1
- data/app/services/completion_kit/mcp_tools/judges.rb +2 -4
- data/app/services/completion_kit/mcp_tools/metric_versions.rb +67 -0
- data/app/services/completion_kit/metric_variant_generator.rb +20 -6
- data/app/services/completion_kit/starter_metrics.rb +5 -5
- data/app/views/completion_kit/api_reference/_body.html.erb +91 -6
- data/app/views/completion_kit/api_reference/index.html.erb +8 -0
- data/app/views/completion_kit/calibrations/_trust_panel.html.erb +6 -1
- data/app/views/completion_kit/metrics/index.html.erb +3 -3
- data/app/views/completion_kit/metrics/show.html.erb +2 -1
- data/app/views/completion_kit/runs/_actions.html.erb +1 -0
- data/app/views/completion_kit/runs/compare.html.erb +85 -0
- data/app/views/completion_kit/runs/compare_picker.html.erb +39 -0
- data/app/views/completion_kit/runs/show.html.erb +8 -2
- data/config/routes.rb +18 -1
- data/lib/completion_kit/version.rb +1 -1
- metadata +6 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 285fba79d665c4fe077b42f0e8ce888ce84a314b95698a561eea9fb48c75b045
|
|
4
|
+
data.tar.gz: a984971e294bda341824e3696cab11662f82fcee2c50974798356754ba55126c
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 1a12beaf77a9d8071949bede78336910eb8da43598609ac6307e09493d1c088a82cc3056ccaf63b3ae4eeb4ea022d19c182a05c0f037bc8a31ba21246cc5bd56
|
|
7
|
+
data.tar.gz: e007e6eeb9f7e89f3aa5ba8397338ce19778b5040f184b8cb6c012faf3f6ea464f6c3d5423fd7f7b36882494fbb90d70e068da7fe15bb76e67e9992585ba80c6
|
|
@@ -2834,6 +2834,19 @@ select.ck-input {
|
|
|
2834
2834
|
}
|
|
2835
2835
|
.ck-stale-versions-banner__body { min-width: 0; flex: 1 1 320px; }
|
|
2836
2836
|
.ck-stale-versions-banner .ck-kicker { color: var(--ck-warning); }
|
|
2837
|
+
|
|
2838
|
+
.ck-delta {
|
|
2839
|
+
font-family: var(--ck-mono);
|
|
2840
|
+
font-size: 0.78rem;
|
|
2841
|
+
letter-spacing: 0.04em;
|
|
2842
|
+
padding: 2px 6px;
|
|
2843
|
+
border-radius: 4px;
|
|
2844
|
+
}
|
|
2845
|
+
.ck-delta--positive { color: var(--ck-success); background: var(--ck-success-soft); }
|
|
2846
|
+
.ck-delta--negative { color: var(--ck-danger); background: var(--ck-danger-soft); }
|
|
2847
|
+
.ck-delta--zero { color: var(--ck-dim); }
|
|
2848
|
+
|
|
2849
|
+
.ck-run-compare-table td { vertical-align: middle; }
|
|
2837
2850
|
.ck-review-card__stale-note {
|
|
2838
2851
|
margin: 0.4rem 0 0;
|
|
2839
2852
|
font-family: var(--ck-mono);
|
|
@@ -3104,6 +3117,7 @@ select.ck-input {
|
|
|
3104
3117
|
#ck-tab-datasets:checked ~ .ck-api-tabs__nav label[for="ck-tab-datasets"],
|
|
3105
3118
|
#ck-tab-metrics:checked ~ .ck-api-tabs__nav label[for="ck-tab-metrics"],
|
|
3106
3119
|
#ck-tab-metric-groups:checked ~ .ck-api-tabs__nav label[for="ck-tab-metric-groups"],
|
|
3120
|
+
#ck-tab-calibrations:checked ~ .ck-api-tabs__nav label[for="ck-tab-calibrations"],
|
|
3107
3121
|
#ck-tab-tags:checked ~ .ck-api-tabs__nav label[for="ck-tab-tags"],
|
|
3108
3122
|
#ck-tab-providers:checked ~ .ck-api-tabs__nav label[for="ck-tab-providers"] {
|
|
3109
3123
|
color: var(--ck-accent);
|
|
@@ -3118,8 +3132,9 @@ select.ck-input {
|
|
|
3118
3132
|
#ck-tab-datasets:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(5),
|
|
3119
3133
|
#ck-tab-metrics:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(6),
|
|
3120
3134
|
#ck-tab-metric-groups:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(7),
|
|
3121
|
-
#ck-tab-
|
|
3122
|
-
#ck-tab-
|
|
3135
|
+
#ck-tab-calibrations:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(8),
|
|
3136
|
+
#ck-tab-tags:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(9),
|
|
3137
|
+
#ck-tab-providers:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(10) {
|
|
3123
3138
|
display: block;
|
|
3124
3139
|
}
|
|
3125
3140
|
|
|
@@ -3159,6 +3174,7 @@ select.ck-input {
|
|
|
3159
3174
|
#ck-tab-datasets:checked ~ .ck-api-tabs__nav label[for="ck-tab-datasets"],
|
|
3160
3175
|
#ck-tab-metrics:checked ~ .ck-api-tabs__nav label[for="ck-tab-metrics"],
|
|
3161
3176
|
#ck-tab-metric-groups:checked ~ .ck-api-tabs__nav label[for="ck-tab-metric-groups"],
|
|
3177
|
+
#ck-tab-calibrations:checked ~ .ck-api-tabs__nav label[for="ck-tab-calibrations"],
|
|
3162
3178
|
#ck-tab-tags:checked ~ .ck-api-tabs__nav label[for="ck-tab-tags"],
|
|
3163
3179
|
#ck-tab-providers:checked ~ .ck-api-tabs__nav label[for="ck-tab-providers"] {
|
|
3164
3180
|
border-left-color: transparent;
|
|
@@ -3590,6 +3606,11 @@ select.ck-input {
|
|
|
3590
3606
|
border-color: var(--ck-line);
|
|
3591
3607
|
color: var(--ck-dim);
|
|
3592
3608
|
}
|
|
3609
|
+
.ck-source-chip--revert {
|
|
3610
|
+
border-color: rgba(245, 158, 11, 0.35);
|
|
3611
|
+
background: rgba(245, 158, 11, 0.08);
|
|
3612
|
+
color: rgb(217, 119, 6);
|
|
3613
|
+
}
|
|
3593
3614
|
.ck-source-chip--current {
|
|
3594
3615
|
border-color: var(--ck-line-strong);
|
|
3595
3616
|
color: var(--ck-text);
|
|
@@ -6008,8 +6029,14 @@ a.tag-mark {
|
|
|
6008
6029
|
}
|
|
6009
6030
|
.ck-starter-grid {
|
|
6010
6031
|
display: grid;
|
|
6011
|
-
grid-template-columns: repeat(
|
|
6012
|
-
gap:
|
|
6032
|
+
grid-template-columns: repeat(4, 1fr);
|
|
6033
|
+
gap: 12px;
|
|
6034
|
+
}
|
|
6035
|
+
@media (max-width: 1000px) {
|
|
6036
|
+
.ck-starter-grid { grid-template-columns: repeat(2, 1fr); }
|
|
6037
|
+
}
|
|
6038
|
+
@media (max-width: 600px) {
|
|
6039
|
+
.ck-starter-grid { grid-template-columns: 1fr; }
|
|
6013
6040
|
}
|
|
6014
6041
|
.ck-starter-card {
|
|
6015
6042
|
display: flex;
|
|
@@ -25,6 +25,28 @@ module CompletionKit
|
|
|
25
25
|
render json: {error: "Record not found"}, status: :not_found
|
|
26
26
|
end
|
|
27
27
|
|
|
28
|
+
PAGINATION_DEFAULT_LIMIT = 50
|
|
29
|
+
PAGINATION_MAX_LIMIT = 500
|
|
30
|
+
|
|
31
|
+
def paginate(scope)
|
|
32
|
+
total = scope.count
|
|
33
|
+
limit = (params[:limit].presence || PAGINATION_DEFAULT_LIMIT).to_i
|
|
34
|
+
limit = PAGINATION_DEFAULT_LIMIT if limit <= 0
|
|
35
|
+
limit = PAGINATION_MAX_LIMIT if limit > PAGINATION_MAX_LIMIT
|
|
36
|
+
offset = params[:offset].to_i
|
|
37
|
+
offset = 0 if offset < 0
|
|
38
|
+
response.set_header("X-Total-Count", total.to_s)
|
|
39
|
+
response.set_header("X-Limit", limit.to_s)
|
|
40
|
+
response.set_header("X-Offset", offset.to_s)
|
|
41
|
+
scope.limit(limit).offset(offset)
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
def filter_by_tags(scope)
|
|
45
|
+
names = Array(params[:tag]).map(&:to_s).reject(&:blank?)
|
|
46
|
+
return scope if names.empty?
|
|
47
|
+
scope.joins(:tags).where(completion_kit_tags: { name: names }).distinct
|
|
48
|
+
end
|
|
49
|
+
|
|
28
50
|
end
|
|
29
51
|
end
|
|
30
52
|
end
|
|
@@ -3,10 +3,18 @@ module CompletionKit
|
|
|
3
3
|
module V1
|
|
4
4
|
class CalibrationsController < BaseController
|
|
5
5
|
before_action :ensure_calibration_enabled
|
|
6
|
-
before_action :
|
|
6
|
+
before_action :set_nested_scope, only: [:create]
|
|
7
|
+
before_action :load_calibration, only: [:destroy]
|
|
7
8
|
|
|
8
9
|
def index
|
|
9
|
-
|
|
10
|
+
scope = Calibration.all
|
|
11
|
+
scope = scope.where(run_id: params[:run_id]) if params[:run_id].present?
|
|
12
|
+
scope = scope.where(response_id: params[:response_id]) if params[:response_id].present?
|
|
13
|
+
scope = scope.where(metric_id: params[:metric_id]) if params[:metric_id].present?
|
|
14
|
+
scope = scope.where(metric_version_id: params[:metric_version_id]) if params[:metric_version_id].present?
|
|
15
|
+
scope = scope.where(created_by: params[:created_by]) if params[:created_by].present?
|
|
16
|
+
scope = scope.where(verdict: params[:verdict]) if params[:verdict].present?
|
|
17
|
+
render json: paginate(scope.order(:created_at))
|
|
10
18
|
end
|
|
11
19
|
|
|
12
20
|
def create
|
|
@@ -26,13 +34,18 @@ module CompletionKit
|
|
|
26
34
|
end
|
|
27
35
|
end
|
|
28
36
|
|
|
37
|
+
def destroy
|
|
38
|
+
@calibration.destroy!
|
|
39
|
+
head :no_content
|
|
40
|
+
end
|
|
41
|
+
|
|
29
42
|
private
|
|
30
43
|
|
|
31
44
|
def ensure_calibration_enabled
|
|
32
45
|
render(json: { error: "Calibration disabled" }, status: :not_found) unless CompletionKit.config.judge_calibration_enabled
|
|
33
46
|
end
|
|
34
47
|
|
|
35
|
-
def
|
|
48
|
+
def set_nested_scope
|
|
36
49
|
@run = Run.find(params[:run_id])
|
|
37
50
|
@response = @run.responses.find(params[:response_id])
|
|
38
51
|
@metric = Metric.find(params[:metric_id])
|
|
@@ -40,6 +53,12 @@ module CompletionKit
|
|
|
40
53
|
not_found
|
|
41
54
|
end
|
|
42
55
|
|
|
56
|
+
def load_calibration
|
|
57
|
+
@calibration = Calibration.find(params[:id])
|
|
58
|
+
rescue ActiveRecord::RecordNotFound
|
|
59
|
+
not_found
|
|
60
|
+
end
|
|
61
|
+
|
|
43
62
|
def scope_calibrations
|
|
44
63
|
Calibration.where(run_id: @run.id, response_id: @response.id, metric_id: @metric.id)
|
|
45
64
|
end
|
|
@@ -5,7 +5,9 @@ module CompletionKit
|
|
|
5
5
|
before_action :set_dataset, only: [:show, :update, :destroy]
|
|
6
6
|
|
|
7
7
|
def index
|
|
8
|
-
|
|
8
|
+
scope = Dataset.includes(:tags)
|
|
9
|
+
scope = filter_by_tags(scope)
|
|
10
|
+
render json: paginate(scope.order(created_at: :desc))
|
|
9
11
|
end
|
|
10
12
|
|
|
11
13
|
def show
|
|
@@ -5,7 +5,9 @@ module CompletionKit
|
|
|
5
5
|
before_action :set_metric_group, only: [:show, :update, :destroy]
|
|
6
6
|
|
|
7
7
|
def index
|
|
8
|
-
|
|
8
|
+
scope = MetricGroup.includes(:tags)
|
|
9
|
+
scope = filter_by_tags(scope)
|
|
10
|
+
render json: paginate(scope.order(created_at: :desc))
|
|
9
11
|
end
|
|
10
12
|
|
|
11
13
|
def show
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
module CompletionKit
|
|
2
|
+
module Api
|
|
3
|
+
module V1
|
|
4
|
+
class MetricVersionsController < BaseController
|
|
5
|
+
before_action :set_metric
|
|
6
|
+
before_action :set_version, only: [:show, :publish, :destroy]
|
|
7
|
+
|
|
8
|
+
def index
|
|
9
|
+
render json: paginate(@metric.metric_versions.order(version_number: :desc))
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
def show
|
|
13
|
+
render json: @version
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
def publish
|
|
17
|
+
if @version.published? && !@version.current?
|
|
18
|
+
audit = @version.revert!
|
|
19
|
+
render json: audit
|
|
20
|
+
else
|
|
21
|
+
@version.publish!
|
|
22
|
+
render json: @version.reload
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
def destroy
|
|
27
|
+
if @version.published?
|
|
28
|
+
render json: { error: "Cannot dismiss a published version. Publish a different version as current instead." }, status: :conflict
|
|
29
|
+
return
|
|
30
|
+
end
|
|
31
|
+
@version.destroy!
|
|
32
|
+
head :no_content
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
private
|
|
36
|
+
|
|
37
|
+
def set_metric
|
|
38
|
+
@metric = Metric.find(params[:metric_id])
|
|
39
|
+
rescue ActiveRecord::RecordNotFound
|
|
40
|
+
not_found
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
def set_version
|
|
44
|
+
@version = @metric.metric_versions.find(params[:id])
|
|
45
|
+
rescue ActiveRecord::RecordNotFound
|
|
46
|
+
not_found
|
|
47
|
+
end
|
|
48
|
+
end
|
|
49
|
+
end
|
|
50
|
+
end
|
|
51
|
+
end
|
|
@@ -2,10 +2,12 @@ module CompletionKit
|
|
|
2
2
|
module Api
|
|
3
3
|
module V1
|
|
4
4
|
class MetricsController < BaseController
|
|
5
|
-
before_action :set_metric, only: [:show, :update, :destroy]
|
|
5
|
+
before_action :set_metric, only: [:show, :update, :destroy, :suggest_variants, :add_few_shot, :remove_few_shot]
|
|
6
6
|
|
|
7
7
|
def index
|
|
8
|
-
|
|
8
|
+
scope = Metric.includes(:tags)
|
|
9
|
+
scope = filter_by_tags(scope)
|
|
10
|
+
render json: paginate(scope.order(created_at: :desc))
|
|
9
11
|
end
|
|
10
12
|
|
|
11
13
|
def show
|
|
@@ -34,6 +36,51 @@ module CompletionKit
|
|
|
34
36
|
head :no_content
|
|
35
37
|
end
|
|
36
38
|
|
|
39
|
+
def suggest_variants
|
|
40
|
+
disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
|
|
41
|
+
if disagreement_count.zero?
|
|
42
|
+
render json: { error: "Mark at least one case as Disagree before asking the model to suggest a change." }, status: :unprocessable_entity
|
|
43
|
+
return
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
|
|
47
|
+
generator = MetricVariantGenerator.new(@metric, count: params[:count].to_i, model: params[:model])
|
|
48
|
+
variants = generator.call
|
|
49
|
+
if variants.empty?
|
|
50
|
+
render json: { error: "The model returned no usable variants. Try again with a different model." }, status: :unprocessable_entity
|
|
51
|
+
return
|
|
52
|
+
end
|
|
53
|
+
versions = generator.persist!(variants)
|
|
54
|
+
render json: versions, status: :created
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
def add_few_shot
|
|
58
|
+
calibration = Calibration.where(metric_id: @metric.id, verdict: "disagree").find(params[:calibration_id])
|
|
59
|
+
review = calibration.response.reviews.find_by(metric_id: @metric.id)
|
|
60
|
+
examples = Array(@metric.few_shot_examples)
|
|
61
|
+
examples << {
|
|
62
|
+
"input" => calibration.response.input_data.to_s.truncate(2000),
|
|
63
|
+
"response" => calibration.response.response_text.to_s.truncate(2000),
|
|
64
|
+
"judge_score" => review&.ai_score&.to_f,
|
|
65
|
+
"judge_feedback" => review&.ai_feedback.to_s.truncate(1000),
|
|
66
|
+
"human_score" => calibration.corrected_score&.to_f,
|
|
67
|
+
"human_note" => calibration.note.to_s.truncate(1000),
|
|
68
|
+
"calibration_id" => calibration.id,
|
|
69
|
+
"added_at" => Time.current.utc.iso8601
|
|
70
|
+
}
|
|
71
|
+
@metric.update!(few_shot_examples: examples)
|
|
72
|
+
render json: @metric.reload
|
|
73
|
+
rescue ActiveRecord::RecordNotFound
|
|
74
|
+
render json: { error: "Calibration not found or not a disagree on this metric." }, status: :not_found
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
def remove_few_shot
|
|
78
|
+
cal_id = params[:calibration_id].to_i
|
|
79
|
+
remaining = Array(@metric.few_shot_examples).reject { |fs| fs["calibration_id"].to_i == cal_id }
|
|
80
|
+
@metric.update!(few_shot_examples: remaining)
|
|
81
|
+
render json: @metric.reload
|
|
82
|
+
end
|
|
83
|
+
|
|
37
84
|
private
|
|
38
85
|
|
|
39
86
|
def set_metric
|
|
@@ -5,7 +5,9 @@ module CompletionKit
|
|
|
5
5
|
before_action :set_prompt, only: [:show, :update, :destroy, :publish]
|
|
6
6
|
|
|
7
7
|
def index
|
|
8
|
-
|
|
8
|
+
scope = Prompt.includes(:tags)
|
|
9
|
+
scope = filter_by_tags(scope)
|
|
10
|
+
render json: paginate(scope.order(created_at: :desc))
|
|
9
11
|
end
|
|
10
12
|
|
|
11
13
|
def show
|
|
@@ -6,7 +6,9 @@ module CompletionKit
|
|
|
6
6
|
before_action :set_response, only: [:show]
|
|
7
7
|
|
|
8
8
|
def index
|
|
9
|
-
|
|
9
|
+
scope = @run.responses.includes(:reviews)
|
|
10
|
+
scope = scope.where(status: params[:status]) if params[:status].present?
|
|
11
|
+
render json: paginate(scope.order(:id))
|
|
10
12
|
end
|
|
11
13
|
|
|
12
14
|
def show
|
|
@@ -2,10 +2,15 @@ module CompletionKit
|
|
|
2
2
|
module Api
|
|
3
3
|
module V1
|
|
4
4
|
class RunsController < BaseController
|
|
5
|
-
before_action :set_run, only: [:show, :update, :destroy, :generate, :retry_failures]
|
|
5
|
+
before_action :set_run, only: [:show, :update, :destroy, :generate, :retry_failures, :rerun, :regrade, :compare]
|
|
6
6
|
|
|
7
7
|
def index
|
|
8
|
-
|
|
8
|
+
scope = Run.includes(:tags)
|
|
9
|
+
scope = scope.where(status: params[:status]) if params[:status].present?
|
|
10
|
+
scope = scope.where(prompt_id: params[:prompt_id]) if params[:prompt_id].present?
|
|
11
|
+
scope = scope.where(dataset_id: params[:dataset_id]) if params[:dataset_id].present?
|
|
12
|
+
scope = filter_by_tags(scope)
|
|
13
|
+
render json: paginate(scope.order(created_at: :desc))
|
|
9
14
|
end
|
|
10
15
|
|
|
11
16
|
def show
|
|
@@ -71,8 +76,76 @@ module CompletionKit
|
|
|
71
76
|
render json: @run.reload, status: :accepted
|
|
72
77
|
end
|
|
73
78
|
|
|
79
|
+
def rerun
|
|
80
|
+
new_run = Run.create!(
|
|
81
|
+
prompt_id: @run.prompt_id,
|
|
82
|
+
dataset_id: @run.dataset_id,
|
|
83
|
+
judge_model: @run.judge_model,
|
|
84
|
+
temperature: @run.temperature,
|
|
85
|
+
output_column: @run.output_column,
|
|
86
|
+
tag_names: @run.tag_names,
|
|
87
|
+
status: "pending"
|
|
88
|
+
)
|
|
89
|
+
new_run.replace_metrics!(@run.metric_ids)
|
|
90
|
+
if new_run.start!
|
|
91
|
+
render json: new_run.reload, status: :accepted
|
|
92
|
+
else
|
|
93
|
+
render json: { errors: [new_run.failure_summary || "Could not start the new run."] }, status: :unprocessable_entity
|
|
94
|
+
end
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
def regrade
|
|
98
|
+
if @run.regrade!
|
|
99
|
+
render json: @run.reload, status: :accepted
|
|
100
|
+
else
|
|
101
|
+
render json: { error: "Nothing to re-grade. The run has no succeeded responses or no metrics attached." }, status: :unprocessable_entity
|
|
102
|
+
end
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
def compare
|
|
106
|
+
other = Run.find(params[:with])
|
|
107
|
+
comparison = build_run_comparison(@run, other)
|
|
108
|
+
render json: { left_run_id: @run.id, right_run_id: other.id, metric_ids: comparison[:metric_ids], rows: comparison[:rows] }
|
|
109
|
+
rescue ActiveRecord::RecordNotFound
|
|
110
|
+
render json: { error: "Other run not found. Pass ?with=<run_id>." }, status: :not_found
|
|
111
|
+
end
|
|
112
|
+
|
|
74
113
|
private
|
|
75
114
|
|
|
115
|
+
def build_run_comparison(left, right)
|
|
116
|
+
left_responses = left.responses.includes(:reviews).order(:row_index, :id)
|
|
117
|
+
right_responses = right.responses.includes(:reviews).order(:row_index, :id)
|
|
118
|
+
right_by_input = right_responses.each_with_object({}) { |r, h| h[r.input_data.to_s] ||= r }
|
|
119
|
+
all_reviews = left_responses.flat_map(&:reviews) + right_responses.flat_map(&:reviews)
|
|
120
|
+
metric_ids = all_reviews.map(&:metric_id).compact.uniq
|
|
121
|
+
metric_versions = MetricVersion.where(id: all_reviews.map(&:metric_version_id).compact.uniq).index_by(&:id)
|
|
122
|
+
|
|
123
|
+
rows = left_responses.map do |lr|
|
|
124
|
+
rr = right_by_input[lr.input_data.to_s]
|
|
125
|
+
{
|
|
126
|
+
left_response_id: lr.id,
|
|
127
|
+
right_response_id: rr&.id,
|
|
128
|
+
row_index: lr.row_index,
|
|
129
|
+
per_metric: metric_ids.map do |mid|
|
|
130
|
+
l_review = lr.reviews.find { |r| r.metric_id == mid }
|
|
131
|
+
r_review = rr && rr.reviews.find { |r| r.metric_id == mid }
|
|
132
|
+
next nil if l_review.nil? && r_review.nil?
|
|
133
|
+
anchor = l_review || r_review
|
|
134
|
+
{
|
|
135
|
+
metric_id: mid,
|
|
136
|
+
metric_name: anchor.metric_name,
|
|
137
|
+
left_score: l_review ? l_review.ai_score : nil,
|
|
138
|
+
right_score: r_review ? r_review.ai_score : nil,
|
|
139
|
+
left_metric_version_id: l_review&.metric_version_id,
|
|
140
|
+
right_metric_version_id: r_review&.metric_version_id,
|
|
141
|
+
delta: (l_review&.ai_score && r_review&.ai_score) ? (r_review.ai_score.to_f - l_review.ai_score.to_f).round(2) : nil
|
|
142
|
+
}
|
|
143
|
+
end.compact
|
|
144
|
+
}
|
|
145
|
+
end
|
|
146
|
+
{ rows: rows, metric_ids: metric_ids }
|
|
147
|
+
end
|
|
148
|
+
|
|
76
149
|
def set_run
|
|
77
150
|
@run = Run.find(params[:id])
|
|
78
151
|
rescue ActiveRecord::RecordNotFound
|
|
@@ -42,8 +42,7 @@ module CompletionKit
|
|
|
42
42
|
.limit(50)
|
|
43
43
|
@edit_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
|
|
44
44
|
@suggestion_draft = MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
|
|
45
|
-
@improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree"
|
|
46
|
-
metric_version_id: @published_metric_version.id).count
|
|
45
|
+
@improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
|
|
47
46
|
@versions = MetricVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
|
|
48
47
|
end
|
|
49
48
|
|
|
@@ -157,9 +156,20 @@ module CompletionKit
|
|
|
157
156
|
return
|
|
158
157
|
end
|
|
159
158
|
|
|
160
|
-
version.
|
|
161
|
-
|
|
162
|
-
|
|
159
|
+
was_published_already = version.published?
|
|
160
|
+
reverting = was_published_already && !version.current?
|
|
161
|
+
previously_current = MetricVersion.current.find_by(metric_id: @metric.id)
|
|
162
|
+
|
|
163
|
+
if reverting
|
|
164
|
+
audit = version.revert!
|
|
165
|
+
prior_label = previously_current.version_label
|
|
166
|
+
redirect_to metric_path(@metric),
|
|
167
|
+
notice: "Reverted to #{@metric.name} #{version.version_label} (now logged as #{audit.version_label}). Pinned cases still flow to the judge, and calibration verdicts collected against #{prior_label} stay tied to it."
|
|
168
|
+
else
|
|
169
|
+
version.publish!
|
|
170
|
+
redirect_to metric_path(@metric),
|
|
171
|
+
notice: "#{@metric.name} #{version.version_label} is now the published version."
|
|
172
|
+
end
|
|
163
173
|
end
|
|
164
174
|
|
|
165
175
|
def add_few_shot
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
2
|
class RunsController < ApplicationController
|
|
3
3
|
include CompletionKit::TagFiltering
|
|
4
|
-
before_action :set_run, only: [:show, :edit, :update, :destroy, :generate, :suggest, :retry_failures, :rerun, :refresh_status]
|
|
4
|
+
before_action :set_run, only: [:show, :edit, :update, :destroy, :generate, :suggest, :retry_failures, :rerun, :regrade, :refresh_status, :compare]
|
|
5
5
|
before_action :load_form_collections, only: [:new, :edit, :create, :update]
|
|
6
6
|
|
|
7
7
|
def index
|
|
@@ -78,6 +78,29 @@ module CompletionKit
|
|
|
78
78
|
end
|
|
79
79
|
end
|
|
80
80
|
|
|
81
|
+
def compare
|
|
82
|
+
other_id = params[:with]
|
|
83
|
+
if other_id.blank?
|
|
84
|
+
@other_runs = Run.where(dataset_id: @run.dataset_id, prompt_id: @run.prompt_id)
|
|
85
|
+
.where.not(id: @run.id)
|
|
86
|
+
.order(created_at: :desc)
|
|
87
|
+
.limit(50)
|
|
88
|
+
return render(:compare_picker)
|
|
89
|
+
end
|
|
90
|
+
|
|
91
|
+
@other_run = Run.find(other_id)
|
|
92
|
+
@comparison = build_run_comparison(@run, @other_run)
|
|
93
|
+
render(:compare)
|
|
94
|
+
end
|
|
95
|
+
|
|
96
|
+
def regrade
|
|
97
|
+
if @run.regrade!
|
|
98
|
+
redirect_to run_path(@run), notice: "Re-grading existing responses with the current judge."
|
|
99
|
+
else
|
|
100
|
+
redirect_to run_path(@run), alert: "Nothing to re-grade. The run has no succeeded responses or no metrics attached."
|
|
101
|
+
end
|
|
102
|
+
end
|
|
103
|
+
|
|
81
104
|
def rerun
|
|
82
105
|
new_run = Run.create!(
|
|
83
106
|
prompt_id: @run.prompt_id,
|
|
@@ -153,7 +176,7 @@ module CompletionKit
|
|
|
153
176
|
failed_response_ids.each { |rid| GenerateRowJob.perform_later(@run.id, rid) }
|
|
154
177
|
end
|
|
155
178
|
|
|
156
|
-
@run.
|
|
179
|
+
@run.broadcast_ui
|
|
157
180
|
redirect_to run_path(@run)
|
|
158
181
|
end
|
|
159
182
|
|
|
@@ -163,6 +186,45 @@ module CompletionKit
|
|
|
163
186
|
@run = Run.find(params[:id])
|
|
164
187
|
end
|
|
165
188
|
|
|
189
|
+
def build_run_comparison(left, right)
|
|
190
|
+
left_responses = left.responses.includes(:reviews).order(:row_index, :id)
|
|
191
|
+
right_responses = right.responses.includes(:reviews).order(:row_index, :id)
|
|
192
|
+
right_by_input = right_responses.each_with_object({}) { |r, h| h[r.input_data.to_s] ||= r }
|
|
193
|
+
|
|
194
|
+
all_reviews = left_responses.flat_map(&:reviews) + right_responses.flat_map(&:reviews)
|
|
195
|
+
metric_ids = all_reviews.map(&:metric_id).compact.uniq
|
|
196
|
+
metric_versions = MetricVersion.where(id: all_reviews.map(&:metric_version_id).compact.uniq).index_by(&:id)
|
|
197
|
+
|
|
198
|
+
rows = left_responses.map do |lr|
|
|
199
|
+
rr = right_by_input[lr.input_data.to_s]
|
|
200
|
+
{
|
|
201
|
+
left_response: lr,
|
|
202
|
+
right_response: rr,
|
|
203
|
+
per_metric: metric_ids.map do |mid|
|
|
204
|
+
l_review = lr.reviews.find { |r| r.metric_id == mid }
|
|
205
|
+
r_review = rr && rr.reviews.find { |r| r.metric_id == mid }
|
|
206
|
+
next nil if l_review.nil? && r_review.nil?
|
|
207
|
+
anchor = l_review || r_review
|
|
208
|
+
{
|
|
209
|
+
metric_id: mid,
|
|
210
|
+
metric_name: anchor.metric_name,
|
|
211
|
+
left_score: l_review ? l_review.ai_score : nil,
|
|
212
|
+
right_score: r_review ? r_review.ai_score : nil,
|
|
213
|
+
left_version_label: version_label_for(l_review, metric_versions),
|
|
214
|
+
right_version_label: version_label_for(r_review, metric_versions),
|
|
215
|
+
delta: (l_review&.ai_score && r_review&.ai_score) ? (r_review.ai_score.to_f - l_review.ai_score.to_f).round(2) : nil
|
|
216
|
+
}
|
|
217
|
+
end.compact
|
|
218
|
+
}
|
|
219
|
+
end
|
|
220
|
+
{ rows: rows, metric_ids: metric_ids }
|
|
221
|
+
end
|
|
222
|
+
|
|
223
|
+
def version_label_for(review, metric_versions)
|
|
224
|
+
return nil if review.nil? || review.metric_version_id.nil?
|
|
225
|
+
metric_versions[review.metric_version_id]&.version_label
|
|
226
|
+
end
|
|
227
|
+
|
|
166
228
|
def load_form_collections
|
|
167
229
|
@prompts = Prompt.order(:name)
|
|
168
230
|
@datasets = Dataset.order(:name)
|
|
@@ -53,20 +53,6 @@ module CompletionKit
|
|
|
53
53
|
end
|
|
54
54
|
end
|
|
55
55
|
|
|
56
|
-
def ck_run_status_label(run)
|
|
57
|
-
case run.status
|
|
58
|
-
when "pending" then "Ready to run"
|
|
59
|
-
when "running"
|
|
60
|
-
if run.progress_total.to_i > 0
|
|
61
|
-
"Running (#{run.progress_current}/#{run.progress_total})"
|
|
62
|
-
else
|
|
63
|
-
"Running…"
|
|
64
|
-
end
|
|
65
|
-
when "completed" then "Completed"
|
|
66
|
-
when "failed" then "Failed"
|
|
67
|
-
else run.status.capitalize
|
|
68
|
-
end
|
|
69
|
-
end
|
|
70
56
|
|
|
71
57
|
def ck_provider_label(provider)
|
|
72
58
|
CompletionKit::ProviderCredential::PROVIDER_LABELS[provider.to_s] || provider.to_s.titleize
|
|
@@ -31,8 +31,7 @@ module CompletionKit
|
|
|
31
31
|
before_perform do |job|
|
|
32
32
|
response = Response.find_by(id: job.arguments.last)
|
|
33
33
|
next unless response
|
|
34
|
-
response.
|
|
35
|
-
response.run.send(:broadcast_response_update, response) if response.run
|
|
34
|
+
response.update!(status: "retrying", attempts: response.attempts + 1)
|
|
36
35
|
end
|
|
37
36
|
|
|
38
37
|
def perform(run_id, response_id)
|
|
@@ -61,12 +60,10 @@ module CompletionKit
|
|
|
61
60
|
response_text: text,
|
|
62
61
|
error_provider: nil, error_class: nil, error_status: nil, error_message: nil
|
|
63
62
|
)
|
|
64
|
-
run.send(:broadcast_response_update, response)
|
|
65
|
-
run.send(:broadcast_progress)
|
|
66
63
|
|
|
67
64
|
if run.judge_configured?
|
|
68
65
|
run.metrics.each do |metric|
|
|
69
|
-
JudgeReviewJob.perform_later(response.id, metric.id)
|
|
66
|
+
JudgeReviewJob.perform_later(response.id, metric.id, run.id)
|
|
70
67
|
end
|
|
71
68
|
end
|
|
72
69
|
|
|
@@ -87,15 +84,13 @@ module CompletionKit
|
|
|
87
84
|
response = Response.find_by(id: response_id)
|
|
88
85
|
return unless response
|
|
89
86
|
|
|
90
|
-
response.
|
|
87
|
+
response.update!(
|
|
91
88
|
status: "failed",
|
|
92
89
|
error_provider: provider_for(response),
|
|
93
90
|
error_class: error.class.name,
|
|
94
91
|
error_status: error.respond_to?(:status) ? error.status : nil,
|
|
95
92
|
error_message: error.message.to_s.truncate(2000)
|
|
96
93
|
)
|
|
97
|
-
response.run&.send(:broadcast_response_update, response)
|
|
98
|
-
response.run&.send(:broadcast_progress)
|
|
99
94
|
end
|
|
100
95
|
|
|
101
96
|
def provider_for(response)
|