completion-kit 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/stylesheets/completion_kit/application.css +18 -4
- data/app/controllers/completion_kit/api/v1/base_controller.rb +22 -0
- data/app/controllers/completion_kit/api/v1/calibrations_controller.rb +22 -3
- data/app/controllers/completion_kit/api/v1/datasets_controller.rb +3 -1
- data/app/controllers/completion_kit/api/v1/metric_groups_controller.rb +3 -1
- data/app/controllers/completion_kit/api/v1/metric_versions_controller.rb +51 -0
- data/app/controllers/completion_kit/api/v1/metrics_controller.rb +49 -2
- data/app/controllers/completion_kit/api/v1/prompts_controller.rb +3 -1
- data/app/controllers/completion_kit/api/v1/provider_credentials_controller.rb +1 -1
- data/app/controllers/completion_kit/api/v1/responses_controller.rb +3 -1
- data/app/controllers/completion_kit/api/v1/runs_controller.rb +75 -2
- data/app/controllers/completion_kit/api/v1/tags_controller.rb +1 -1
- data/app/controllers/completion_kit/metrics_controller.rb +3 -3
- data/app/controllers/completion_kit/runs_controller.rb +1 -1
- data/app/helpers/completion_kit/application_helper.rb +0 -14
- data/app/jobs/completion_kit/generate_row_job.rb +3 -8
- data/app/jobs/completion_kit/judge_review_job.rb +6 -9
- data/app/models/completion_kit/metric.rb +1 -0
- data/app/models/completion_kit/metric_version.rb +16 -0
- data/app/models/completion_kit/response.rb +13 -17
- data/app/models/completion_kit/review.rb +18 -22
- data/app/models/completion_kit/run.rb +27 -23
- data/app/models/concerns/completion_kit/has_job_status.rb +31 -0
- data/app/services/completion_kit/mcp_dispatcher.rb +3 -1
- data/app/services/completion_kit/mcp_tools/metric_versions.rb +67 -0
- data/app/services/completion_kit/starter_metrics.rb +5 -5
- data/app/views/completion_kit/api_reference/_body.html.erb +91 -6
- data/app/views/completion_kit/api_reference/index.html.erb +8 -0
- data/app/views/completion_kit/metrics/index.html.erb +3 -3
- data/app/views/completion_kit/metrics/show.html.erb +1 -0
- data/config/routes.rb +16 -1
- data/lib/completion_kit/version.rb +1 -1
- metadata +4 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 285fba79d665c4fe077b42f0e8ce888ce84a314b95698a561eea9fb48c75b045
|
|
4
|
+
data.tar.gz: a984971e294bda341824e3696cab11662f82fcee2c50974798356754ba55126c
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 1a12beaf77a9d8071949bede78336910eb8da43598609ac6307e09493d1c088a82cc3056ccaf63b3ae4eeb4ea022d19c182a05c0f037bc8a31ba21246cc5bd56
|
|
7
|
+
data.tar.gz: e007e6eeb9f7e89f3aa5ba8397338ce19778b5040f184b8cb6c012faf3f6ea464f6c3d5423fd7f7b36882494fbb90d70e068da7fe15bb76e67e9992585ba80c6
|
|
@@ -3117,6 +3117,7 @@ select.ck-input {
|
|
|
3117
3117
|
#ck-tab-datasets:checked ~ .ck-api-tabs__nav label[for="ck-tab-datasets"],
|
|
3118
3118
|
#ck-tab-metrics:checked ~ .ck-api-tabs__nav label[for="ck-tab-metrics"],
|
|
3119
3119
|
#ck-tab-metric-groups:checked ~ .ck-api-tabs__nav label[for="ck-tab-metric-groups"],
|
|
3120
|
+
#ck-tab-calibrations:checked ~ .ck-api-tabs__nav label[for="ck-tab-calibrations"],
|
|
3120
3121
|
#ck-tab-tags:checked ~ .ck-api-tabs__nav label[for="ck-tab-tags"],
|
|
3121
3122
|
#ck-tab-providers:checked ~ .ck-api-tabs__nav label[for="ck-tab-providers"] {
|
|
3122
3123
|
color: var(--ck-accent);
|
|
@@ -3131,8 +3132,9 @@ select.ck-input {
|
|
|
3131
3132
|
#ck-tab-datasets:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(5),
|
|
3132
3133
|
#ck-tab-metrics:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(6),
|
|
3133
3134
|
#ck-tab-metric-groups:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(7),
|
|
3134
|
-
#ck-tab-tags:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(8),
|
|
3135
|
-
#ck-tab-providers:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(9) {
|
|
3135
|
+
#ck-tab-calibrations:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(8),
|
|
3136
|
+
#ck-tab-tags:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(9),
|
|
3137
|
+
#ck-tab-providers:checked ~ .ck-api-tabs__panels .ck-api-tabs__panel:nth-child(10) {
|
|
3136
3138
|
display: block;
|
|
3137
3139
|
}
|
|
3138
3140
|
|
|
@@ -3172,6 +3174,7 @@ select.ck-input {
|
|
|
3172
3174
|
#ck-tab-datasets:checked ~ .ck-api-tabs__nav label[for="ck-tab-datasets"],
|
|
3173
3175
|
#ck-tab-metrics:checked ~ .ck-api-tabs__nav label[for="ck-tab-metrics"],
|
|
3174
3176
|
#ck-tab-metric-groups:checked ~ .ck-api-tabs__nav label[for="ck-tab-metric-groups"],
|
|
3177
|
+
#ck-tab-calibrations:checked ~ .ck-api-tabs__nav label[for="ck-tab-calibrations"],
|
|
3175
3178
|
#ck-tab-tags:checked ~ .ck-api-tabs__nav label[for="ck-tab-tags"],
|
|
3176
3179
|
#ck-tab-providers:checked ~ .ck-api-tabs__nav label[for="ck-tab-providers"] {
|
|
3177
3180
|
border-left-color: transparent;
|
|
@@ -3603,6 +3606,11 @@ select.ck-input {
|
|
|
3603
3606
|
border-color: var(--ck-line);
|
|
3604
3607
|
color: var(--ck-dim);
|
|
3605
3608
|
}
|
|
3609
|
+
.ck-source-chip--revert {
|
|
3610
|
+
border-color: rgba(245, 158, 11, 0.35);
|
|
3611
|
+
background: rgba(245, 158, 11, 0.08);
|
|
3612
|
+
color: rgb(217, 119, 6);
|
|
3613
|
+
}
|
|
3606
3614
|
.ck-source-chip--current {
|
|
3607
3615
|
border-color: var(--ck-line-strong);
|
|
3608
3616
|
color: var(--ck-text);
|
|
@@ -6021,8 +6029,14 @@ a.tag-mark {
|
|
|
6021
6029
|
}
|
|
6022
6030
|
.ck-starter-grid {
|
|
6023
6031
|
display: grid;
|
|
6024
|
-
grid-template-columns: repeat(
|
|
6025
|
-
gap:
|
|
6032
|
+
grid-template-columns: repeat(4, 1fr);
|
|
6033
|
+
gap: 12px;
|
|
6034
|
+
}
|
|
6035
|
+
@media (max-width: 1000px) {
|
|
6036
|
+
.ck-starter-grid { grid-template-columns: repeat(2, 1fr); }
|
|
6037
|
+
}
|
|
6038
|
+
@media (max-width: 600px) {
|
|
6039
|
+
.ck-starter-grid { grid-template-columns: 1fr; }
|
|
6026
6040
|
}
|
|
6027
6041
|
.ck-starter-card {
|
|
6028
6042
|
display: flex;
|
|
@@ -25,6 +25,28 @@ module CompletionKit
|
|
|
25
25
|
render json: {error: "Record not found"}, status: :not_found
|
|
26
26
|
end
|
|
27
27
|
|
|
28
|
+
PAGINATION_DEFAULT_LIMIT = 50
|
|
29
|
+
PAGINATION_MAX_LIMIT = 500
|
|
30
|
+
|
|
31
|
+
def paginate(scope)
|
|
32
|
+
total = scope.count
|
|
33
|
+
limit = (params[:limit].presence || PAGINATION_DEFAULT_LIMIT).to_i
|
|
34
|
+
limit = PAGINATION_DEFAULT_LIMIT if limit <= 0
|
|
35
|
+
limit = PAGINATION_MAX_LIMIT if limit > PAGINATION_MAX_LIMIT
|
|
36
|
+
offset = params[:offset].to_i
|
|
37
|
+
offset = 0 if offset < 0
|
|
38
|
+
response.set_header("X-Total-Count", total.to_s)
|
|
39
|
+
response.set_header("X-Limit", limit.to_s)
|
|
40
|
+
response.set_header("X-Offset", offset.to_s)
|
|
41
|
+
scope.limit(limit).offset(offset)
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
def filter_by_tags(scope)
|
|
45
|
+
names = Array(params[:tag]).map(&:to_s).reject(&:blank?)
|
|
46
|
+
return scope if names.empty?
|
|
47
|
+
scope.joins(:tags).where(completion_kit_tags: { name: names }).distinct
|
|
48
|
+
end
|
|
49
|
+
|
|
28
50
|
end
|
|
29
51
|
end
|
|
30
52
|
end
|
|
@@ -3,10 +3,18 @@ module CompletionKit
|
|
|
3
3
|
module V1
|
|
4
4
|
class CalibrationsController < BaseController
|
|
5
5
|
before_action :ensure_calibration_enabled
|
|
6
|
-
before_action :
|
|
6
|
+
before_action :set_nested_scope, only: [:create]
|
|
7
|
+
before_action :load_calibration, only: [:destroy]
|
|
7
8
|
|
|
8
9
|
def index
|
|
9
|
-
|
|
10
|
+
scope = Calibration.all
|
|
11
|
+
scope = scope.where(run_id: params[:run_id]) if params[:run_id].present?
|
|
12
|
+
scope = scope.where(response_id: params[:response_id]) if params[:response_id].present?
|
|
13
|
+
scope = scope.where(metric_id: params[:metric_id]) if params[:metric_id].present?
|
|
14
|
+
scope = scope.where(metric_version_id: params[:metric_version_id]) if params[:metric_version_id].present?
|
|
15
|
+
scope = scope.where(created_by: params[:created_by]) if params[:created_by].present?
|
|
16
|
+
scope = scope.where(verdict: params[:verdict]) if params[:verdict].present?
|
|
17
|
+
render json: paginate(scope.order(:created_at))
|
|
10
18
|
end
|
|
11
19
|
|
|
12
20
|
def create
|
|
@@ -26,13 +34,18 @@ module CompletionKit
|
|
|
26
34
|
end
|
|
27
35
|
end
|
|
28
36
|
|
|
37
|
+
def destroy
|
|
38
|
+
@calibration.destroy!
|
|
39
|
+
head :no_content
|
|
40
|
+
end
|
|
41
|
+
|
|
29
42
|
private
|
|
30
43
|
|
|
31
44
|
def ensure_calibration_enabled
|
|
32
45
|
render(json: { error: "Calibration disabled" }, status: :not_found) unless CompletionKit.config.judge_calibration_enabled
|
|
33
46
|
end
|
|
34
47
|
|
|
35
|
-
def
|
|
48
|
+
def set_nested_scope
|
|
36
49
|
@run = Run.find(params[:run_id])
|
|
37
50
|
@response = @run.responses.find(params[:response_id])
|
|
38
51
|
@metric = Metric.find(params[:metric_id])
|
|
@@ -40,6 +53,12 @@ module CompletionKit
|
|
|
40
53
|
not_found
|
|
41
54
|
end
|
|
42
55
|
|
|
56
|
+
def load_calibration
|
|
57
|
+
@calibration = Calibration.find(params[:id])
|
|
58
|
+
rescue ActiveRecord::RecordNotFound
|
|
59
|
+
not_found
|
|
60
|
+
end
|
|
61
|
+
|
|
43
62
|
def scope_calibrations
|
|
44
63
|
Calibration.where(run_id: @run.id, response_id: @response.id, metric_id: @metric.id)
|
|
45
64
|
end
|
|
@@ -5,7 +5,9 @@ module CompletionKit
|
|
|
5
5
|
before_action :set_dataset, only: [:show, :update, :destroy]
|
|
6
6
|
|
|
7
7
|
def index
|
|
8
|
-
|
|
8
|
+
scope = Dataset.includes(:tags)
|
|
9
|
+
scope = filter_by_tags(scope)
|
|
10
|
+
render json: paginate(scope.order(created_at: :desc))
|
|
9
11
|
end
|
|
10
12
|
|
|
11
13
|
def show
|
|
@@ -5,7 +5,9 @@ module CompletionKit
|
|
|
5
5
|
before_action :set_metric_group, only: [:show, :update, :destroy]
|
|
6
6
|
|
|
7
7
|
def index
|
|
8
|
-
|
|
8
|
+
scope = MetricGroup.includes(:tags)
|
|
9
|
+
scope = filter_by_tags(scope)
|
|
10
|
+
render json: paginate(scope.order(created_at: :desc))
|
|
9
11
|
end
|
|
10
12
|
|
|
11
13
|
def show
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
module CompletionKit
|
|
2
|
+
module Api
|
|
3
|
+
module V1
|
|
4
|
+
class MetricVersionsController < BaseController
|
|
5
|
+
before_action :set_metric
|
|
6
|
+
before_action :set_version, only: [:show, :publish, :destroy]
|
|
7
|
+
|
|
8
|
+
def index
|
|
9
|
+
render json: paginate(@metric.metric_versions.order(version_number: :desc))
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
def show
|
|
13
|
+
render json: @version
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
def publish
|
|
17
|
+
if @version.published? && !@version.current?
|
|
18
|
+
audit = @version.revert!
|
|
19
|
+
render json: audit
|
|
20
|
+
else
|
|
21
|
+
@version.publish!
|
|
22
|
+
render json: @version.reload
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
def destroy
|
|
27
|
+
if @version.published?
|
|
28
|
+
render json: { error: "Cannot dismiss a published version. Publish a different version as current instead." }, status: :conflict
|
|
29
|
+
return
|
|
30
|
+
end
|
|
31
|
+
@version.destroy!
|
|
32
|
+
head :no_content
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
private
|
|
36
|
+
|
|
37
|
+
def set_metric
|
|
38
|
+
@metric = Metric.find(params[:metric_id])
|
|
39
|
+
rescue ActiveRecord::RecordNotFound
|
|
40
|
+
not_found
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
def set_version
|
|
44
|
+
@version = @metric.metric_versions.find(params[:id])
|
|
45
|
+
rescue ActiveRecord::RecordNotFound
|
|
46
|
+
not_found
|
|
47
|
+
end
|
|
48
|
+
end
|
|
49
|
+
end
|
|
50
|
+
end
|
|
51
|
+
end
|
|
@@ -2,10 +2,12 @@ module CompletionKit
|
|
|
2
2
|
module Api
|
|
3
3
|
module V1
|
|
4
4
|
class MetricsController < BaseController
|
|
5
|
-
before_action :set_metric, only: [:show, :update, :destroy]
|
|
5
|
+
before_action :set_metric, only: [:show, :update, :destroy, :suggest_variants, :add_few_shot, :remove_few_shot]
|
|
6
6
|
|
|
7
7
|
def index
|
|
8
|
-
|
|
8
|
+
scope = Metric.includes(:tags)
|
|
9
|
+
scope = filter_by_tags(scope)
|
|
10
|
+
render json: paginate(scope.order(created_at: :desc))
|
|
9
11
|
end
|
|
10
12
|
|
|
11
13
|
def show
|
|
@@ -34,6 +36,51 @@ module CompletionKit
|
|
|
34
36
|
head :no_content
|
|
35
37
|
end
|
|
36
38
|
|
|
39
|
+
def suggest_variants
|
|
40
|
+
disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
|
|
41
|
+
if disagreement_count.zero?
|
|
42
|
+
render json: { error: "Mark at least one case as Disagree before asking the model to suggest a change." }, status: :unprocessable_entity
|
|
43
|
+
return
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all
|
|
47
|
+
generator = MetricVariantGenerator.new(@metric, count: params[:count].to_i, model: params[:model])
|
|
48
|
+
variants = generator.call
|
|
49
|
+
if variants.empty?
|
|
50
|
+
render json: { error: "The model returned no usable variants. Try again with a different model." }, status: :unprocessable_entity
|
|
51
|
+
return
|
|
52
|
+
end
|
|
53
|
+
versions = generator.persist!(variants)
|
|
54
|
+
render json: versions, status: :created
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
def add_few_shot
|
|
58
|
+
calibration = Calibration.where(metric_id: @metric.id, verdict: "disagree").find(params[:calibration_id])
|
|
59
|
+
review = calibration.response.reviews.find_by(metric_id: @metric.id)
|
|
60
|
+
examples = Array(@metric.few_shot_examples)
|
|
61
|
+
examples << {
|
|
62
|
+
"input" => calibration.response.input_data.to_s.truncate(2000),
|
|
63
|
+
"response" => calibration.response.response_text.to_s.truncate(2000),
|
|
64
|
+
"judge_score" => review&.ai_score&.to_f,
|
|
65
|
+
"judge_feedback" => review&.ai_feedback.to_s.truncate(1000),
|
|
66
|
+
"human_score" => calibration.corrected_score&.to_f,
|
|
67
|
+
"human_note" => calibration.note.to_s.truncate(1000),
|
|
68
|
+
"calibration_id" => calibration.id,
|
|
69
|
+
"added_at" => Time.current.utc.iso8601
|
|
70
|
+
}
|
|
71
|
+
@metric.update!(few_shot_examples: examples)
|
|
72
|
+
render json: @metric.reload
|
|
73
|
+
rescue ActiveRecord::RecordNotFound
|
|
74
|
+
render json: { error: "Calibration not found or not a disagree on this metric." }, status: :not_found
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
def remove_few_shot
|
|
78
|
+
cal_id = params[:calibration_id].to_i
|
|
79
|
+
remaining = Array(@metric.few_shot_examples).reject { |fs| fs["calibration_id"].to_i == cal_id }
|
|
80
|
+
@metric.update!(few_shot_examples: remaining)
|
|
81
|
+
render json: @metric.reload
|
|
82
|
+
end
|
|
83
|
+
|
|
37
84
|
private
|
|
38
85
|
|
|
39
86
|
def set_metric
|
|
@@ -5,7 +5,9 @@ module CompletionKit
|
|
|
5
5
|
before_action :set_prompt, only: [:show, :update, :destroy, :publish]
|
|
6
6
|
|
|
7
7
|
def index
|
|
8
|
-
|
|
8
|
+
scope = Prompt.includes(:tags)
|
|
9
|
+
scope = filter_by_tags(scope)
|
|
10
|
+
render json: paginate(scope.order(created_at: :desc))
|
|
9
11
|
end
|
|
10
12
|
|
|
11
13
|
def show
|
|
@@ -6,7 +6,9 @@ module CompletionKit
|
|
|
6
6
|
before_action :set_response, only: [:show]
|
|
7
7
|
|
|
8
8
|
def index
|
|
9
|
-
|
|
9
|
+
scope = @run.responses.includes(:reviews)
|
|
10
|
+
scope = scope.where(status: params[:status]) if params[:status].present?
|
|
11
|
+
render json: paginate(scope.order(:id))
|
|
10
12
|
end
|
|
11
13
|
|
|
12
14
|
def show
|
|
@@ -2,10 +2,15 @@ module CompletionKit
|
|
|
2
2
|
module Api
|
|
3
3
|
module V1
|
|
4
4
|
class RunsController < BaseController
|
|
5
|
-
before_action :set_run, only: [:show, :update, :destroy, :generate, :retry_failures]
|
|
5
|
+
before_action :set_run, only: [:show, :update, :destroy, :generate, :retry_failures, :rerun, :regrade, :compare]
|
|
6
6
|
|
|
7
7
|
def index
|
|
8
|
-
|
|
8
|
+
scope = Run.includes(:tags)
|
|
9
|
+
scope = scope.where(status: params[:status]) if params[:status].present?
|
|
10
|
+
scope = scope.where(prompt_id: params[:prompt_id]) if params[:prompt_id].present?
|
|
11
|
+
scope = scope.where(dataset_id: params[:dataset_id]) if params[:dataset_id].present?
|
|
12
|
+
scope = filter_by_tags(scope)
|
|
13
|
+
render json: paginate(scope.order(created_at: :desc))
|
|
9
14
|
end
|
|
10
15
|
|
|
11
16
|
def show
|
|
@@ -71,8 +76,76 @@ module CompletionKit
|
|
|
71
76
|
render json: @run.reload, status: :accepted
|
|
72
77
|
end
|
|
73
78
|
|
|
79
|
+
def rerun
|
|
80
|
+
new_run = Run.create!(
|
|
81
|
+
prompt_id: @run.prompt_id,
|
|
82
|
+
dataset_id: @run.dataset_id,
|
|
83
|
+
judge_model: @run.judge_model,
|
|
84
|
+
temperature: @run.temperature,
|
|
85
|
+
output_column: @run.output_column,
|
|
86
|
+
tag_names: @run.tag_names,
|
|
87
|
+
status: "pending"
|
|
88
|
+
)
|
|
89
|
+
new_run.replace_metrics!(@run.metric_ids)
|
|
90
|
+
if new_run.start!
|
|
91
|
+
render json: new_run.reload, status: :accepted
|
|
92
|
+
else
|
|
93
|
+
render json: { errors: [new_run.failure_summary || "Could not start the new run."] }, status: :unprocessable_entity
|
|
94
|
+
end
|
|
95
|
+
end
|
|
96
|
+
|
|
97
|
+
def regrade
|
|
98
|
+
if @run.regrade!
|
|
99
|
+
render json: @run.reload, status: :accepted
|
|
100
|
+
else
|
|
101
|
+
render json: { error: "Nothing to re-grade. The run has no succeeded responses or no metrics attached." }, status: :unprocessable_entity
|
|
102
|
+
end
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
def compare
|
|
106
|
+
other = Run.find(params[:with])
|
|
107
|
+
comparison = build_run_comparison(@run, other)
|
|
108
|
+
render json: { left_run_id: @run.id, right_run_id: other.id, metric_ids: comparison[:metric_ids], rows: comparison[:rows] }
|
|
109
|
+
rescue ActiveRecord::RecordNotFound
|
|
110
|
+
render json: { error: "Other run not found. Pass ?with=<run_id>." }, status: :not_found
|
|
111
|
+
end
|
|
112
|
+
|
|
74
113
|
private
|
|
75
114
|
|
|
115
|
+
def build_run_comparison(left, right)
|
|
116
|
+
left_responses = left.responses.includes(:reviews).order(:row_index, :id)
|
|
117
|
+
right_responses = right.responses.includes(:reviews).order(:row_index, :id)
|
|
118
|
+
right_by_input = right_responses.each_with_object({}) { |r, h| h[r.input_data.to_s] ||= r }
|
|
119
|
+
all_reviews = left_responses.flat_map(&:reviews) + right_responses.flat_map(&:reviews)
|
|
120
|
+
metric_ids = all_reviews.map(&:metric_id).compact.uniq
|
|
121
|
+
metric_versions = MetricVersion.where(id: all_reviews.map(&:metric_version_id).compact.uniq).index_by(&:id)
|
|
122
|
+
|
|
123
|
+
rows = left_responses.map do |lr|
|
|
124
|
+
rr = right_by_input[lr.input_data.to_s]
|
|
125
|
+
{
|
|
126
|
+
left_response_id: lr.id,
|
|
127
|
+
right_response_id: rr&.id,
|
|
128
|
+
row_index: lr.row_index,
|
|
129
|
+
per_metric: metric_ids.map do |mid|
|
|
130
|
+
l_review = lr.reviews.find { |r| r.metric_id == mid }
|
|
131
|
+
r_review = rr && rr.reviews.find { |r| r.metric_id == mid }
|
|
132
|
+
next nil if l_review.nil? && r_review.nil?
|
|
133
|
+
anchor = l_review || r_review
|
|
134
|
+
{
|
|
135
|
+
metric_id: mid,
|
|
136
|
+
metric_name: anchor.metric_name,
|
|
137
|
+
left_score: l_review ? l_review.ai_score : nil,
|
|
138
|
+
right_score: r_review ? r_review.ai_score : nil,
|
|
139
|
+
left_metric_version_id: l_review&.metric_version_id,
|
|
140
|
+
right_metric_version_id: r_review&.metric_version_id,
|
|
141
|
+
delta: (l_review&.ai_score && r_review&.ai_score) ? (r_review.ai_score.to_f - l_review.ai_score.to_f).round(2) : nil
|
|
142
|
+
}
|
|
143
|
+
end.compact
|
|
144
|
+
}
|
|
145
|
+
end
|
|
146
|
+
{ rows: rows, metric_ids: metric_ids }
|
|
147
|
+
end
|
|
148
|
+
|
|
76
149
|
def set_run
|
|
77
150
|
@run = Run.find(params[:id])
|
|
78
151
|
rescue ActiveRecord::RecordNotFound
|
|
@@ -160,13 +160,13 @@ module CompletionKit
|
|
|
160
160
|
reverting = was_published_already && !version.current?
|
|
161
161
|
previously_current = MetricVersion.current.find_by(metric_id: @metric.id)
|
|
162
162
|
|
|
163
|
-
version.publish!
|
|
164
|
-
|
|
165
163
|
if reverting
|
|
164
|
+
audit = version.revert!
|
|
166
165
|
prior_label = previously_current.version_label
|
|
167
166
|
redirect_to metric_path(@metric),
|
|
168
|
-
notice: "Reverted to #{@metric.name} #{version.version_label}. Pinned cases still flow to the judge, and calibration verdicts collected against #{prior_label} stay tied to it."
|
|
167
|
+
notice: "Reverted to #{@metric.name} #{version.version_label} (now logged as #{audit.version_label}). Pinned cases still flow to the judge, and calibration verdicts collected against #{prior_label} stay tied to it."
|
|
169
168
|
else
|
|
169
|
+
version.publish!
|
|
170
170
|
redirect_to metric_path(@metric),
|
|
171
171
|
notice: "#{@metric.name} #{version.version_label} is now the published version."
|
|
172
172
|
end
|
|
@@ -53,20 +53,6 @@ module CompletionKit
|
|
|
53
53
|
end
|
|
54
54
|
end
|
|
55
55
|
|
|
56
|
-
def ck_run_status_label(run)
|
|
57
|
-
case run.status
|
|
58
|
-
when "pending" then "Ready to run"
|
|
59
|
-
when "running"
|
|
60
|
-
if run.progress_total.to_i > 0
|
|
61
|
-
"Running (#{run.progress_current}/#{run.progress_total})"
|
|
62
|
-
else
|
|
63
|
-
"Running…"
|
|
64
|
-
end
|
|
65
|
-
when "completed" then "Completed"
|
|
66
|
-
when "failed" then "Failed"
|
|
67
|
-
else run.status.capitalize
|
|
68
|
-
end
|
|
69
|
-
end
|
|
70
56
|
|
|
71
57
|
def ck_provider_label(provider)
|
|
72
58
|
CompletionKit::ProviderCredential::PROVIDER_LABELS[provider.to_s] || provider.to_s.titleize
|
|
@@ -31,8 +31,7 @@ module CompletionKit
|
|
|
31
31
|
before_perform do |job|
|
|
32
32
|
response = Response.find_by(id: job.arguments.last)
|
|
33
33
|
next unless response
|
|
34
|
-
response.
|
|
35
|
-
response.run.send(:broadcast_response_update, response) if response.run
|
|
34
|
+
response.update!(status: "retrying", attempts: response.attempts + 1)
|
|
36
35
|
end
|
|
37
36
|
|
|
38
37
|
def perform(run_id, response_id)
|
|
@@ -61,12 +60,10 @@ module CompletionKit
|
|
|
61
60
|
response_text: text,
|
|
62
61
|
error_provider: nil, error_class: nil, error_status: nil, error_message: nil
|
|
63
62
|
)
|
|
64
|
-
run.send(:broadcast_response_update, response)
|
|
65
|
-
run.send(:broadcast_progress)
|
|
66
63
|
|
|
67
64
|
if run.judge_configured?
|
|
68
65
|
run.metrics.each do |metric|
|
|
69
|
-
JudgeReviewJob.perform_later(response.id, metric.id)
|
|
66
|
+
JudgeReviewJob.perform_later(response.id, metric.id, run.id)
|
|
70
67
|
end
|
|
71
68
|
end
|
|
72
69
|
|
|
@@ -87,15 +84,13 @@ module CompletionKit
|
|
|
87
84
|
response = Response.find_by(id: response_id)
|
|
88
85
|
return unless response
|
|
89
86
|
|
|
90
|
-
response.
|
|
87
|
+
response.update!(
|
|
91
88
|
status: "failed",
|
|
92
89
|
error_provider: provider_for(response),
|
|
93
90
|
error_class: error.class.name,
|
|
94
91
|
error_status: error.respond_to?(:status) ? error.status : nil,
|
|
95
92
|
error_message: error.message.to_s.truncate(2000)
|
|
96
93
|
)
|
|
97
|
-
response.run&.send(:broadcast_response_update, response)
|
|
98
|
-
response.run&.send(:broadcast_progress)
|
|
99
94
|
end
|
|
100
95
|
|
|
101
96
|
def provider_for(response)
|
|
@@ -5,7 +5,9 @@ module CompletionKit
|
|
|
5
5
|
queue_as :llm
|
|
6
6
|
|
|
7
7
|
limits_concurrency to: ENV.fetch("COMPLETION_KIT_PER_RUN_CONCURRENCY", 5).to_i,
|
|
8
|
-
key: ->(response_id,
|
|
8
|
+
key: ->(response_id, _metric_id, run_id = nil) {
|
|
9
|
+
"run:#{run_id || Response.where(id: response_id).pick(:run_id)}"
|
|
10
|
+
},
|
|
9
11
|
duration: 10.minutes
|
|
10
12
|
|
|
11
13
|
def self.rate_limit_wait(executions)
|
|
@@ -29,7 +31,7 @@ module CompletionKit
|
|
|
29
31
|
end
|
|
30
32
|
|
|
31
33
|
before_perform do |job|
|
|
32
|
-
response_id, metric_id = job.arguments
|
|
34
|
+
response_id, metric_id, _run_id = job.arguments
|
|
33
35
|
response = Response.find_by(id: response_id)
|
|
34
36
|
next unless response
|
|
35
37
|
review = response.reviews.find_or_initialize_by(metric_id: metric_id)
|
|
@@ -37,10 +39,9 @@ module CompletionKit
|
|
|
37
39
|
review.attempts = (review.attempts || 0) + 1
|
|
38
40
|
review.status = "retrying"
|
|
39
41
|
review.save!(validate: false)
|
|
40
|
-
response.run.send(:broadcast_response_update, response) if response.run
|
|
41
42
|
end
|
|
42
43
|
|
|
43
|
-
def perform(response_id, metric_id)
|
|
44
|
+
def perform(response_id, metric_id, _run_id = nil)
|
|
44
45
|
@response_id = response_id
|
|
45
46
|
@metric_id = metric_id
|
|
46
47
|
|
|
@@ -75,8 +76,6 @@ module CompletionKit
|
|
|
75
76
|
review.save!
|
|
76
77
|
|
|
77
78
|
confirm_judging_capability(run.judge_model)
|
|
78
|
-
run.send(:broadcast_response_update, response)
|
|
79
|
-
run.send(:broadcast_progress)
|
|
80
79
|
enqueue_completion_check
|
|
81
80
|
end
|
|
82
81
|
|
|
@@ -107,13 +106,11 @@ module CompletionKit
|
|
|
107
106
|
error_message: error.message.to_s.truncate(2000)
|
|
108
107
|
)
|
|
109
108
|
review.save!(validate: false)
|
|
110
|
-
response.run&.send(:broadcast_response_update, response)
|
|
111
|
-
response.run&.send(:broadcast_progress)
|
|
112
109
|
end
|
|
113
110
|
|
|
114
111
|
def provider_for(response)
|
|
115
112
|
run = response.run
|
|
116
|
-
return nil unless run
|
|
113
|
+
return nil unless run.judge_model
|
|
117
114
|
ApiConfig.provider_for_model(run.judge_model)
|
|
118
115
|
end
|
|
119
116
|
|
|
@@ -12,6 +12,7 @@ module CompletionKit
|
|
|
12
12
|
|
|
13
13
|
has_many :metric_group_memberships, dependent: :destroy
|
|
14
14
|
has_many :metric_groups, through: :metric_group_memberships, source: :metric_group
|
|
15
|
+
has_many :metric_versions, dependent: :destroy
|
|
15
16
|
has_many :reviews, dependent: :nullify
|
|
16
17
|
has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
|
|
17
18
|
|
|
@@ -53,6 +53,22 @@ module CompletionKit
|
|
|
53
53
|
self
|
|
54
54
|
end
|
|
55
55
|
|
|
56
|
+
def revert!
|
|
57
|
+
raise ArgumentError, "only a published version can be reverted to" unless published?
|
|
58
|
+
audit = nil
|
|
59
|
+
MetricVersion.transaction do
|
|
60
|
+
audit = self.class.create!(
|
|
61
|
+
metric: metric,
|
|
62
|
+
instruction: instruction,
|
|
63
|
+
rubric_bands: rubric_bands,
|
|
64
|
+
state: "draft",
|
|
65
|
+
source: "revert"
|
|
66
|
+
)
|
|
67
|
+
audit.publish!
|
|
68
|
+
end
|
|
69
|
+
audit
|
|
70
|
+
end
|
|
71
|
+
|
|
56
72
|
def as_json(options = {})
|
|
57
73
|
{
|
|
58
74
|
id: id,
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
2
|
class Response < ApplicationRecord
|
|
3
|
-
|
|
4
|
-
TERMINAL_STATUSES = %w[succeeded failed].freeze
|
|
3
|
+
include HasJobStatus
|
|
5
4
|
|
|
6
5
|
belongs_to :run
|
|
7
6
|
has_many :reviews, dependent: :destroy
|
|
@@ -10,17 +9,11 @@ module CompletionKit
|
|
|
10
9
|
delegate :prompt, to: :run
|
|
11
10
|
|
|
12
11
|
validates :response_text, presence: true, if: :succeeded?
|
|
13
|
-
validates :status, inclusion: { in: STATUSES }
|
|
14
12
|
|
|
15
13
|
before_validation :set_default_status, on: :create
|
|
16
14
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
end
|
|
20
|
-
|
|
21
|
-
def succeeded?
|
|
22
|
-
status == "succeeded"
|
|
23
|
-
end
|
|
15
|
+
after_save_commit :broadcast_row_update, unless: :destroyed?
|
|
16
|
+
after_save_commit :broadcast_run_progress, if: :should_broadcast_progress?
|
|
24
17
|
|
|
25
18
|
def as_json(options = {})
|
|
26
19
|
{
|
|
@@ -47,19 +40,22 @@ module CompletionKit
|
|
|
47
40
|
def fully_reviewed?
|
|
48
41
|
metric_ids = run.metric_ids
|
|
49
42
|
return true if metric_ids.empty?
|
|
50
|
-
reviewed_metric_ids = reviews.where(status:
|
|
43
|
+
reviewed_metric_ids = reviews.where(status: HasJobStatus::TERMINAL_STATUSES).pluck(:metric_id).uniq
|
|
51
44
|
(metric_ids - reviewed_metric_ids).empty?
|
|
52
45
|
end
|
|
53
46
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
47
|
+
private
|
|
48
|
+
|
|
49
|
+
def broadcast_row_update
|
|
50
|
+
run.broadcast_response_update(self)
|
|
57
51
|
end
|
|
58
52
|
|
|
59
|
-
|
|
53
|
+
def broadcast_run_progress
|
|
54
|
+
run.broadcast_progress
|
|
55
|
+
end
|
|
60
56
|
|
|
61
|
-
def
|
|
62
|
-
|
|
57
|
+
def should_broadcast_progress?
|
|
58
|
+
saved_change_to_status? && terminal?
|
|
63
59
|
end
|
|
64
60
|
end
|
|
65
61
|
end
|