completion-kit 0.5.34 → 0.5.35
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/stylesheets/completion_kit/application.css.erb +54 -0
- data/app/controllers/completion_kit/api/v1/calibrations_controller.rb +57 -0
- data/app/controllers/completion_kit/calibrations_controller.rb +50 -0
- data/app/models/completion_kit/calibration.rb +47 -0
- data/app/models/completion_kit/judge_version.rb +32 -0
- data/app/services/completion_kit/mcp_dispatcher.rb +3 -1
- data/app/services/completion_kit/mcp_tools/calibrations.rb +73 -0
- data/app/views/completion_kit/calibrations/_buttons.html.erb +50 -0
- data/app/views/completion_kit/responses/show.html.erb +9 -0
- data/config/routes.rb +8 -2
- data/db/migrate/20260522000001_create_completion_kit_judge_versions.rb +28 -0
- data/db/migrate/20260522000002_create_completion_kit_calibrations.rb +32 -0
- data/lib/completion_kit/version.rb +1 -1
- data/lib/completion_kit.rb +2 -0
- metadata +9 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 7ec9f216056f47d007b8a512a009dea02aeaad328a9c4832ac0c1e122b816b15
|
|
4
|
+
data.tar.gz: 413f1b8e8ca28ed2c14e55210299a28bef42acb1a437a3ee9c3bd88b62d03bf9
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: c837fd6ddc33c5799145bac2a0b1dba4ca1df807365c621330b619c47607e666a30325b47ce6d34e2ecb93e13d4e14a9538d597e0d56e47b9f7a1a836432be0d
|
|
7
|
+
data.tar.gz: 06b9ac7200883cd19d11ff127898d52d940b3f7a110cd5e7744bfcb2abab797fa05e923ba76121f6c00c6aeb8ac5d7b5d5e8b79b07d773ad0c0d5650ec41b90d
|
|
@@ -5119,3 +5119,57 @@ a.tag-mark {
|
|
|
5119
5119
|
outline: 2px solid var(--ck-accent);
|
|
5120
5120
|
outline-offset: 2px;
|
|
5121
5121
|
}
|
|
5122
|
+
|
|
5123
|
+
/* Calibration widget container, separated from the content above by a dashed rule. */
.ck-calibration {
  border-top: 1px dashed rgba(255, 255, 255, 0.08);
  margin-top: 12px;
  padding-top: 12px;
}

/* Prompt line: dimmed text laid out in a row with the optional count badge. */
.ck-calibration__prompt {
  display: flex;
  align-items: center;
  gap: 8px;
  margin: 0 0 8px;
  color: var(--ck-dim);
  font-size: 0.8rem;
}

.ck-calibration__count {
  color: var(--ck-accent);
  font-size: 0.75rem;
}

/* Row of verdict pills; wraps when the row is too narrow. */
.ck-calibration__buttons {
  display: flex;
  flex-wrap: wrap;
  gap: 8px;
}

/* A single verdict pill in its resting (unselected) state. */
.ck-calibration__pill {
  display: inline-flex;
  align-items: center;
  gap: 6px;
  padding: 6px 12px;
  border: 1px solid rgba(255, 255, 255, 0.18);
  border-radius: 999px;
  background: transparent;
  color: inherit;
  font-size: 0.85rem;
  cursor: pointer;
}

.ck-calibration__pill:hover, .ck-calibration__pill:focus-visible {
  border-color: var(--ck-accent);
}

/* Selected pill: filled with the accent colour, dark text on top. */
.ck-calibration__pill.is-active {
  border-color: var(--ck-accent);
  background: var(--ck-accent);
  color: #0b1320;
}

/* Follow-up form (score and/or note) stacked vertically under the pills. */
.ck-calibration__detail {
  display: flex;
  flex-direction: column;
  gap: 8px;
  margin-top: 10px;
}

.ck-calibration__value {
  color: var(--ck-accent);
  font-weight: 600;
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
module CompletionKit
  module Api
    module V1
      # JSON endpoints for calibrations belonging to one run / response / metric.
      #
      # Two filters run before every action: the first answers 404 when the
      # calibration feature flag is off, the second loads the records named by
      # the request parameters.
      class CalibrationsController < BaseController
        before_action :ensure_calibration_enabled
        before_action :set_scope

        # Renders every calibration stored for the scoped run/response/metric.
        def index
          render json: scope_calibrations
        end

        # Finds the calibration for this scope and author, or builds a new one,
        # then saves it with the current judge version and the permitted
        # request parameters. Responds 201 for a new record, 200 for an
        # existing one and 422 with the validation errors when saving fails.
        #
        # NOTE(review): only the permitted keys present in the request are
        # assigned, so on an existing record an omitted corrected_score or
        # note keeps its stored value.
        def create
          calibration = scope_calibrations.find_or_initialize_by(created_by: created_by_param)
          newly_built = calibration.new_record?

          fixed_attributes = {
            run: @run,
            response: @response,
            metric: @metric,
            judge_version: JudgeVersion.ensure_current_for(@metric)
          }
          calibration.assign_attributes(fixed_attributes.merge(calibration_params))

          if !calibration.save
            render json: { errors: calibration.errors }, status: :unprocessable_entity
          elsif newly_built
            render json: calibration, status: :created
          else
            render json: calibration, status: :ok
          end
        end

        private

        # Halts the request with a JSON 404 when calibration is switched off.
        def ensure_calibration_enabled
          return if CompletionKit.config.judge_calibration_enabled

          render(json: { error: "Calibration disabled" }, status: :not_found)
        end

        # Loads @run, @response (looked up through the run's responses) and
        # @metric. A missing record is handed to not_found, which is not
        # defined in this file (presumably inherited from BaseController).
        def set_scope
          @run = Run.find(params[:run_id])
          @response = @run.responses.find(params[:response_id])
          @metric = Metric.find(params[:metric_id])
        rescue ActiveRecord::RecordNotFound
          not_found
        end

        # Relation of calibrations matching the loaded run, response and metric.
        def scope_calibrations
          Calibration.where(run_id: @run.id, response_id: @response.id, metric_id: @metric.id)
        end

        # Permitted, symbol-keyed attributes taken from the request.
        def calibration_params
          params.permit(:verdict, :corrected_score, :note).to_h.symbolize_keys
        end

        # Author label for the calibration; falls back to "api" when blank.
        def created_by_param
          params[:created_by].presence || "api"
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
module CompletionKit
  # Turbo endpoint behind the verdict pills and detail forms rendered by the
  # calibrations/buttons partial.
  #
  # Two filters run before the action: the first answers 404 when the
  # calibration feature flag is off, the second loads the run, response and
  # metric named by the request parameters.
  class CalibrationsController < ApplicationController
    before_action :ensure_calibration_enabled
    before_action :set_scope

    # Upserts the calling operator's calibration for the scoped response and
    # metric, then replaces the widget through a Turbo Stream.
    #
    # The "Disagree" pill posts only the verdict, and the model refuses a
    # disagree verdict without a corrected score. The partial shows the score
    # slider only when the calibration it is given already says "disagree".
    # So that first click is answered by re-rendering the widget with the
    # unsaved calibration, which lets the operator pick a score and submit
    # again. Any other validation failure still flashes the errors and
    # redirects to the response page.
    def create
      calibration = Calibration.find_or_initialize_by(
        run_id: @run.id, response_id: @response.id, metric_id: @metric.id, created_by: calibration_creator
      )
      calibration.assign_attributes(
        judge_version: JudgeVersion.ensure_current_for(@metric),
        verdict: params[:verdict],
        corrected_score: params[:corrected_score].presence,
        note: params[:note].presence
      )

      if calibration.save || awaiting_corrected_score?(calibration)
        render_buttons(calibration)
      else
        flash[:alert] = calibration.errors.full_messages.to_sentence
        redirect_to run_response_path(@run, @response)
      end
    end

    private

    # Halts the request with an empty 404 when calibration is switched off.
    def ensure_calibration_enabled
      head :not_found unless CompletionKit.config.judge_calibration_enabled
    end

    # Loads @run, @response (looked up through the run's responses) and @metric.
    def set_scope
      @run = Run.find(params[:run_id])
      @response = @run.responses.find(params[:response_id])
      @metric = Metric.find(params[:metric_id])
    end

    # The response's review for the scoped metric, or nil when there is none.
    def review_for_metric
      @response.reviews.find_by(metric_id: @metric.id)
    end

    # Author label: the first non-blank of the HTTP_X_REMOTE_USER request env
    # entry, the configured username, and the literal "operator".
    def calibration_creator
      request.env["HTTP_X_REMOTE_USER"].presence || CompletionKit.config.username.presence || "operator"
    end

    # True when a failed save is explained solely by a "disagree" verdict that
    # arrived without a corrected score. Must only be called after a save
    # attempt, because it inspects the errors that attempt recorded.
    def awaiting_corrected_score?(calibration)
      calibration.verdict == "disagree" &&
        params[:corrected_score].blank? &&
        calibration.errors.attribute_names == [:corrected_score]
    end

    # Replaces the calibration widget for this response/metric with a fresh
    # render of the buttons partial.
    def render_buttons(calibration)
      render turbo_stream: turbo_stream.replace(
        "calibration_#{@response.id}_#{@metric.id}",
        partial: "completion_kit/calibrations/buttons",
        locals: { review: review_for_metric, calibration: calibration, run: @run, response_row: @response, metric: @metric }
      )
    end
  end
end
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
module CompletionKit
  # A verdict recorded against one response/metric pair within a run, tied to
  # the judge version it was recorded under.
  class Calibration < ApplicationRecord
    VERDICTS = %w[agree disagree borderline].freeze

    belongs_to :run
    belongs_to :response
    belongs_to :metric
    belongs_to :judge_version

    validates :verdict, presence: true, inclusion: { in: VERDICTS }
    # One calibration per response, metric and author.
    validates :response_id, uniqueness: { scope: %i[metric_id created_by] }
    validate :corrected_score_required_when_disagreeing
    validate :corrected_score_within_rubric

    scope :for_run, ->(run_id) { where(run_id: run_id) }
    scope :for_metric, ->(metric_id) { where(metric_id: metric_id) }

    # Fixed JSON shape for this record; the options argument is accepted but
    # not consulted.
    def as_json(options = {})
      exposed = %i[
        id run_id response_id metric_id judge_version_id
        verdict corrected_score note created_by created_at
      ]
      exposed.to_h { |attribute| [attribute, public_send(attribute)] }
    end

    private

    # A "disagree" verdict must come with a corrected score.
    def corrected_score_required_when_disagreeing
      if verdict == "disagree" && corrected_score.blank?
        errors.add(:corrected_score, "must be set when disagreeing with the judge")
      end
    end

    # When present, the corrected score has to fall within 1..5 inclusive.
    def corrected_score_within_rubric
      return if corrected_score.blank?

      numeric = corrected_score.to_f
      in_range = 1 <= numeric && numeric <= 5
      errors.add(:corrected_score, "must be between 1 and 5") unless in_range
    end
  end
end
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
module CompletionKit
  # A stored copy of a metric's instruction and rubric bands that calibrations
  # point at.
  class JudgeVersion < ApplicationRecord
    belongs_to :metric
    has_many :calibrations, dependent: :destroy

    # rubric_bands is persisted as JSON text.
    serialize :rubric_bands, coder: JSON

    validates :metric_id, presence: true

    scope :current, -> { where(current: true) }

    # Returns the current version for the metric, creating one from the
    # metric's instruction and rubric_bands when none exists. Raises if the
    # new record cannot be saved (create!).
    #
    # NOTE(review): an existing current row is returned as-is; nothing here
    # compares it with the metric's present instruction or rubric_bands.
    def self.ensure_current_for(metric)
      existing = current.find_by(metric_id: metric.id)
      return existing if existing

      create!(
        metric: metric,
        instruction: metric.instruction,
        rubric_bands: metric.rubric_bands,
        current: true
      )
    end

    # Fixed JSON shape for this record; the options argument is accepted but
    # not consulted.
    def as_json(options = {})
      %i[id metric_id instruction rubric_bands current created_at].to_h do |attribute|
        [attribute, public_send(attribute)]
      end
    end
  end
end
|
|
@@ -33,7 +33,8 @@ module CompletionKit
|
|
|
33
33
|
McpTools::Metrics.definitions +
|
|
34
34
|
McpTools::MetricGroups.definitions +
|
|
35
35
|
McpTools::ProviderCredentials.definitions +
|
|
36
|
-
McpTools::Tags.definitions
|
|
36
|
+
McpTools::Tags.definitions +
|
|
37
|
+
McpTools::Calibrations.definitions
|
|
37
38
|
end
|
|
38
39
|
|
|
39
40
|
def self.call_tool(name, arguments)
|
|
@@ -46,6 +47,7 @@ module CompletionKit
|
|
|
46
47
|
when /\Ametric_groups_/ then McpTools::MetricGroups.call(name, arguments)
|
|
47
48
|
when /\Aprovider_credentials_/ then McpTools::ProviderCredentials.call(name, arguments)
|
|
48
49
|
when /\Atags_/ then McpTools::Tags.call(name, arguments)
|
|
50
|
+
when /\Acalibrations_/ then McpTools::Calibrations.call(name, arguments)
|
|
49
51
|
else raise MethodNotFound, "Unknown tool: #{name}"
|
|
50
52
|
end
|
|
51
53
|
end
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
module CompletionKit
  module McpTools
    # MCP tools for listing and upserting calibrations.
    #
    # Both handlers honour CompletionKit.config.judge_calibration_enabled, the
    # same flag the HTTP controllers check, so switching the feature off
    # disables it over MCP as well.
    module Calibrations
      extend Base

      # Tool name => description, JSON input schema and handler method name.
      TOOLS = {
        "calibrations_list" => {
          description: "List calibrations. Filter by run_id, response_id, metric_id, or created_by.",
          inputSchema: {
            type: "object",
            properties: {
              run_id: {type: "integer"},
              response_id: {type: "integer"},
              metric_id: {type: "integer"},
              created_by: {type: "string"}
            },
            required: []
          },
          handler: :list
        },
        "calibrations_create" => {
          description: "Upsert a calibration for (run, response, metric, created_by). Verdict is one of agree, disagree, borderline. corrected_score (1..5) is required when verdict is 'disagree'.",
          inputSchema: {
            type: "object",
            properties: {
              run_id: {type: "integer"},
              response_id: {type: "integer"},
              metric_id: {type: "integer"},
              verdict: {type: "string", enum: %w[agree disagree borderline]},
              corrected_score: {type: "number"},
              note: {type: "string"},
              created_by: {type: "string"}
            },
            required: ["run_id", "response_id", "metric_id", "verdict"]
          },
          handler: :create
        }
      }.freeze

      # Lists calibrations ordered by created_at, narrowed by whichever of
      # run_id, response_id, metric_id and created_by are given in args.
      # Returns an error result when calibration is disabled.
      def self.list(args)
        return error_result("Calibration disabled") unless CompletionKit.config.judge_calibration_enabled

        scope = CompletionKit::Calibration.all
        %w[run_id response_id metric_id created_by].each do |key|
          scope = scope.where(key => args[key]) if args[key]
        end
        text_result(scope.order(:created_at).map(&:as_json))
      end

      # Finds or builds the calibration for (run, response, metric,
      # created_by) and saves it with the current judge version plus the
      # given verdict, corrected_score and note. created_by falls back to
      # "mcp" when blank. Returns the saved record as a text result, or an
      # error result carrying the validation messages. Also returns an error
      # result when calibration is disabled.
      def self.create(args)
        return error_result("Calibration disabled") unless CompletionKit.config.judge_calibration_enabled

        run = CompletionKit::Run.find(args["run_id"])
        # Looked up through the run, so the response must belong to it.
        response = run.responses.find(args["response_id"])
        metric = CompletionKit::Metric.find(args["metric_id"])
        created_by = args["created_by"].presence || "mcp"

        calibration = CompletionKit::Calibration.find_or_initialize_by(
          run_id: run.id, response_id: response.id, metric_id: metric.id, created_by: created_by
        )
        calibration.assign_attributes(
          judge_version: CompletionKit::JudgeVersion.ensure_current_for(metric),
          verdict: args["verdict"],
          corrected_score: args["corrected_score"],
          note: args["note"]
        )

        if calibration.save
          text_result(calibration.as_json)
        else
          error_result(calibration.errors.full_messages.join(", "))
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
<div id="calibration_<%= response_row.id %>_<%= metric.id %>" class="ck-calibration">
  <%# Locals used below: review, calibration (may be nil), run, response_row, metric. %>
  <%
    selected = calibration&.verdict
    collected = CompletionKit::Calibration.where(response_id: response_row.id, metric_id: metric.id).count
    labels = { "agree" => "👍 Agree", "disagree" => "👎 Disagree" }
  %>
  <p class="ck-calibration__prompt">
    How does this score feel?
    <% if collected > 0 %>
      <span class="ck-calibration__count"><%= pluralize(collected, "verdict") %> collected</span>
    <% end %>
  </p>

  <%# One pill per verdict; the pill matching the stored verdict is marked active. %>
  <div class="ck-calibration__buttons">
    <% CompletionKit::Calibration::VERDICTS.each do |verdict| %>
      <% active = verdict == selected %>
      <%= button_to run_response_calibrations_path(run, response_row, metric_id: metric.id, verdict: verdict),
            method: :post,
            form: { data: { turbo: "true" } },
            class: "ck-calibration__pill ck-calibration__pill--#{verdict}#{' is-active' if active}",
            "aria-pressed": active.to_s do %>
        <%= labels.fetch(verdict, "🤔 Borderline") %>
      <% end %>
    <% end %>
  </div>

  <%# Follow-up form: "disagree" gets a score slider plus a note, "borderline" only a note. %>
  <% if %w[disagree borderline].include?(selected) %>
    <% disagreeing = selected == "disagree" %>
    <%= form_with url: run_response_calibrations_path(run, response_row),
          method: :post, local: false,
          class: "ck-calibration__detail" do |f| %>
      <%= hidden_field_tag :metric_id, metric.id %>
      <%= hidden_field_tag :verdict, selected %>
      <% if disagreeing %>
        <%# Slider starts at the stored score, else the review's ai_score, else 3. %>
        <% starting_score = calibration.corrected_score || review&.ai_score || 3 %>
        <label class="ck-label">
          Your score
          <span class="ck-calibration__value" data-calibration-value><%= starting_score %></span>
        </label>
        <input type="range" name="corrected_score" min="1" max="5" step="0.5"
               value="<%= starting_score %>"
               oninput="this.closest('.ck-calibration__detail').querySelector('[data-calibration-value]').textContent = this.value"
               class="ck-slider">
      <% end %>
      <textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="<%= disagreeing ? "Why? (optional)" : "What made this borderline? (optional)" %>"><%= calibration.note %></textarea>
      <%= f.submit "Save", class: ck_button_classes(:dark) %>
    <% end %>
  <% end %>
</div>
|
|
@@ -116,6 +116,15 @@
|
|
|
116
116
|
<div class="ck-note-box"><%= review.ai_feedback %></div>
|
|
117
117
|
</div>
|
|
118
118
|
<% end %>
|
|
119
|
+
<% if CompletionKit.config.judge_calibration_enabled && review.metric && review.ai_score %>
|
|
120
|
+
<% existing = CompletionKit::Calibration.find_by(
|
|
121
|
+
response_id: @response.id, metric_id: review.metric_id,
|
|
122
|
+
created_by: CompletionKit.config.username.presence || "operator"
|
|
123
|
+
) %>
|
|
124
|
+
<%= render "completion_kit/calibrations/buttons",
|
|
125
|
+
review: review, calibration: existing, run: @run,
|
|
126
|
+
response_row: @response, metric: review.metric %>
|
|
127
|
+
<% end %>
|
|
119
128
|
</div>
|
|
120
129
|
<% end %>
|
|
121
130
|
</div>
|
data/config/routes.rb
CHANGED
|
@@ -26,7 +26,9 @@ CompletionKit::Engine.routes.draw do
|
|
|
26
26
|
post :rerun
|
|
27
27
|
get :refresh_status
|
|
28
28
|
end
|
|
29
|
-
resources :responses, only: [:show]
|
|
29
|
+
resources :responses, only: [:show] do
|
|
30
|
+
resources :calibrations, only: [:create]
|
|
31
|
+
end
|
|
30
32
|
end
|
|
31
33
|
|
|
32
34
|
resources :suggestions, only: [:show] do
|
|
@@ -54,7 +56,11 @@ CompletionKit::Engine.routes.draw do
|
|
|
54
56
|
post :generate
|
|
55
57
|
post :retry_failures
|
|
56
58
|
end
|
|
57
|
-
resources :responses, only: [:index, :show]
|
|
59
|
+
resources :responses, only: [:index, :show] do
|
|
60
|
+
resources :metrics, only: [] do
|
|
61
|
+
resources :calibrations, only: [:index, :create]
|
|
62
|
+
end
|
|
63
|
+
end
|
|
58
64
|
end
|
|
59
65
|
resources :datasets
|
|
60
66
|
resources :metrics
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Creates the judge versions table and, when migrating up, seeds one current
# row per existing metric.
class CreateCompletionKitJudgeVersions < ActiveRecord::Migration[8.1]
  def change
    create_table :completion_kit_judge_versions do |t|
      t.references :metric,
                   null: false,
                   foreign_key: { to_table: :completion_kit_metrics, on_delete: :cascade },
                   index: { name: "index_ck_judge_versions_on_metric_id" }
      t.text :instruction
      t.text :rubric_bands
      t.boolean :current, null: false, default: true
      t.timestamps
    end

    add_index :completion_kit_judge_versions,
              %i[metric_id current],
              name: "index_ck_judge_versions_on_metric_current"

    # Backfill runs only on the way up; rolling back simply drops the table.
    up_only do
      # Throwaway models bound straight to the tables, so the migration does
      # not depend on the application's model classes.
      metrics = Class.new(ActiveRecord::Base) { self.table_name = "completion_kit_metrics" }
      snapshots = Class.new(ActiveRecord::Base) { self.table_name = "completion_kit_judge_versions" }

      metrics.find_each do |metric|
        snapshots.create!(
          metric_id: metric.id,
          instruction: metric["instruction"],
          rubric_bands: metric["rubric_bands"],
          current: true
        )
      end
    end
  end
end
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# Creates the calibrations table: four required parent references, the verdict
# fields, and a unique index over (response, metric, author).
class CreateCompletionKitCalibrations < ActiveRecord::Migration[8.1]
  def change
    # Parent reference name => table its foreign key points at, in column order.
    parents = {
      run: :completion_kit_runs,
      response: :completion_kit_responses,
      metric: :completion_kit_metrics,
      judge_version: :completion_kit_judge_versions
    }

    create_table :completion_kit_calibrations do |t|
      parents.each do |name, table|
        t.references name,
                     null: false,
                     foreign_key: { to_table: table, on_delete: :cascade },
                     index: { name: "index_ck_calibrations_on_#{name}_id" }
      end

      t.string :verdict, null: false
      # NOTE(review): created_by is nullable; how the unique index below treats
      # NULL values depends on the database — confirm this is intended.
      t.string :created_by
      t.decimal :corrected_score, precision: 4, scale: 1
      t.text :note
      t.timestamps
    end

    add_index :completion_kit_calibrations,
              %i[response_id metric_id created_by],
              unique: true,
              name: "index_ck_calibrations_on_response_metric_user"
  end
end
|
data/lib/completion_kit.rb
CHANGED
|
@@ -12,6 +12,7 @@ module CompletionKit
|
|
|
12
12
|
attr_accessor :api_reference_authentication_partial
|
|
13
13
|
attr_accessor :api_rate_limit, :web_rate_limit
|
|
14
14
|
attr_accessor :allow_loopback_endpoints
|
|
15
|
+
attr_accessor :judge_calibration_enabled
|
|
15
16
|
|
|
16
17
|
def initialize
|
|
17
18
|
@openai_api_key = ENV['OPENAI_API_KEY']
|
|
@@ -27,6 +28,7 @@ module CompletionKit
|
|
|
27
28
|
@web_rate_limit = 300
|
|
28
29
|
|
|
29
30
|
@allow_loopback_endpoints = true
|
|
31
|
+
@judge_calibration_enabled = true
|
|
30
32
|
|
|
31
33
|
@api_reference_authentication_partial = "completion_kit/api_reference/authentication"
|
|
32
34
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: completion-kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.5.34
|
|
4
|
+
version: 0.5.35
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Damien Bastin
|
|
@@ -235,6 +235,7 @@ files:
|
|
|
235
235
|
- app/assets/javascripts/completion_kit/application.js
|
|
236
236
|
- app/assets/stylesheets/completion_kit/application.css.erb
|
|
237
237
|
- app/controllers/completion_kit/api/v1/base_controller.rb
|
|
238
|
+
- app/controllers/completion_kit/api/v1/calibrations_controller.rb
|
|
238
239
|
- app/controllers/completion_kit/api/v1/datasets_controller.rb
|
|
239
240
|
- app/controllers/completion_kit/api/v1/metric_groups_controller.rb
|
|
240
241
|
- app/controllers/completion_kit/api/v1/metrics_controller.rb
|
|
@@ -245,6 +246,7 @@ files:
|
|
|
245
246
|
- app/controllers/completion_kit/api/v1/tags_controller.rb
|
|
246
247
|
- app/controllers/completion_kit/api_reference_controller.rb
|
|
247
248
|
- app/controllers/completion_kit/application_controller.rb
|
|
249
|
+
- app/controllers/completion_kit/calibrations_controller.rb
|
|
248
250
|
- app/controllers/completion_kit/dashboard_controller.rb
|
|
249
251
|
- app/controllers/completion_kit/dashboard_dismissals_controller.rb
|
|
250
252
|
- app/controllers/completion_kit/datasets_controller.rb
|
|
@@ -267,8 +269,10 @@ files:
|
|
|
267
269
|
- app/jobs/completion_kit/run_completion_check_job.rb
|
|
268
270
|
- app/mailers/completion_kit/application_mailer.rb
|
|
269
271
|
- app/models/completion_kit/application_record.rb
|
|
272
|
+
- app/models/completion_kit/calibration.rb
|
|
270
273
|
- app/models/completion_kit/dashboard_dismissal.rb
|
|
271
274
|
- app/models/completion_kit/dataset.rb
|
|
275
|
+
- app/models/completion_kit/judge_version.rb
|
|
272
276
|
- app/models/completion_kit/mcp_session.rb
|
|
273
277
|
- app/models/completion_kit/metric.rb
|
|
274
278
|
- app/models/completion_kit/metric_group.rb
|
|
@@ -292,6 +296,7 @@ files:
|
|
|
292
296
|
- app/services/completion_kit/llm_client.rb
|
|
293
297
|
- app/services/completion_kit/mcp_dispatcher.rb
|
|
294
298
|
- app/services/completion_kit/mcp_tools/base.rb
|
|
299
|
+
- app/services/completion_kit/mcp_tools/calibrations.rb
|
|
295
300
|
- app/services/completion_kit/mcp_tools/datasets.rb
|
|
296
301
|
- app/services/completion_kit/mcp_tools/metric_groups.rb
|
|
297
302
|
- app/services/completion_kit/mcp_tools/metrics.rb
|
|
@@ -317,6 +322,7 @@ files:
|
|
|
317
322
|
- app/views/completion_kit/api_reference/_resource_card.html.erb
|
|
318
323
|
- app/views/completion_kit/api_reference/_resource_list.html.erb
|
|
319
324
|
- app/views/completion_kit/api_reference/index.html.erb
|
|
325
|
+
- app/views/completion_kit/calibrations/_buttons.html.erb
|
|
320
326
|
- app/views/completion_kit/dashboard/_eye_icon.html.erb
|
|
321
327
|
- app/views/completion_kit/dashboard/_eye_off_icon.html.erb
|
|
322
328
|
- app/views/completion_kit/dashboard/_failures_card.html.erb
|
|
@@ -399,6 +405,8 @@ files:
|
|
|
399
405
|
- db/migrate/20260513000001_create_completion_kit_mcp_sessions.rb
|
|
400
406
|
- db/migrate/20260514000001_allow_judge_only_runs.rb
|
|
401
407
|
- db/migrate/20260516000001_create_completion_kit_dashboard_dismissals.rb
|
|
408
|
+
- db/migrate/20260522000001_create_completion_kit_judge_versions.rb
|
|
409
|
+
- db/migrate/20260522000002_create_completion_kit_calibrations.rb
|
|
402
410
|
- lib/completion-kit.rb
|
|
403
411
|
- lib/completion_kit.rb
|
|
404
412
|
- lib/completion_kit/concurrency_check.rb
|