completion-kit 0.5.33 → 0.5.35
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/stylesheets/completion_kit/application.css.erb +85 -0
- data/app/controllers/completion_kit/api/v1/calibrations_controller.rb +57 -0
- data/app/controllers/completion_kit/calibrations_controller.rb +50 -0
- data/app/helpers/completion_kit/application_helper.rb +14 -0
- data/app/models/completion_kit/calibration.rb +47 -0
- data/app/models/completion_kit/judge_version.rb +32 -0
- data/app/models/completion_kit/provider_credential.rb +4 -24
- data/app/services/completion_kit/api_config.rb +20 -14
- data/app/services/completion_kit/mcp_dispatcher.rb +3 -1
- data/app/services/completion_kit/mcp_tools/calibrations.rb +73 -0
- data/app/services/completion_kit/ollama_client.rb +5 -1
- data/app/services/completion_kit/provider_endpoint.rb +47 -0
- data/app/views/completion_kit/calibrations/_buttons.html.erb +50 -0
- data/app/views/completion_kit/datasets/_form.html.erb +5 -3
- data/app/views/completion_kit/datasets/index.html.erb +6 -6
- data/app/views/completion_kit/metric_groups/index.html.erb +5 -5
- data/app/views/completion_kit/metrics/_form.html.erb +5 -3
- data/app/views/completion_kit/metrics/index.html.erb +5 -5
- data/app/views/completion_kit/prompts/_form.html.erb +7 -4
- data/app/views/completion_kit/prompts/index.html.erb +7 -7
- data/app/views/completion_kit/provider_credentials/_form.html.erb +5 -3
- data/app/views/completion_kit/responses/show.html.erb +10 -1
- data/app/views/completion_kit/runs/_form.html.erb +5 -3
- data/app/views/completion_kit/runs/_response_row.html.erb +1 -1
- data/app/views/completion_kit/runs/_row.html.erb +1 -1
- data/app/views/completion_kit/runs/_status_header.html.erb +2 -2
- data/app/views/completion_kit/runs/_status_panel.html.erb +1 -1
- data/app/views/completion_kit/runs/_table.html.erb +6 -6
- data/app/views/completion_kit/suggestions/show.html.erb +1 -1
- data/app/views/completion_kit/tags/_picker.html.erb +2 -2
- data/app/views/completion_kit/tags/index.html.erb +4 -4
- data/config/routes.rb +8 -2
- data/db/migrate/20260522000001_create_completion_kit_judge_versions.rb +28 -0
- data/db/migrate/20260522000002_create_completion_kit_calibrations.rb +32 -0
- data/lib/completion_kit/version.rb +1 -1
- data/lib/completion_kit.rb +5 -0
- metadata +10 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 7ec9f216056f47d007b8a512a009dea02aeaad328a9c4832ac0c1e122b816b15
|
|
4
|
+
data.tar.gz: 413f1b8e8ca28ed2c14e55210299a28bef42acb1a437a3ee9c3bd88b62d03bf9
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: c837fd6ddc33c5799145bac2a0b1dba4ca1df807365c621330b619c47607e666a30325b47ce6d34e2ecb93e13d4e14a9538d597e0d56e47b9f7a1a836432be0d
|
|
7
|
+
data.tar.gz: 06b9ac7200883cd19d11ff127898d52d940b3f7a110cd5e7744bfcb2abab797fa05e923ba76121f6c00c6aeb8ac5d7b5d5e8b79b07d773ad0c0d5650ec41b90d
|
|
@@ -5088,3 +5088,88 @@ a.tag-mark {
|
|
|
5088
5088
|
letter-spacing: 0.04em;
|
|
5089
5089
|
text-transform: uppercase;
|
|
5090
5090
|
}
|
|
5091
|
+
|
|
5092
|
+
.ck-record-name {
|
|
5093
|
+
color: inherit;
|
|
5094
|
+
text-decoration: none;
|
|
5095
|
+
}
|
|
5096
|
+
.ck-record-name:hover,
|
|
5097
|
+
.ck-record-name:focus-visible {
|
|
5098
|
+
color: var(--ck-accent);
|
|
5099
|
+
}
|
|
5100
|
+
|
|
5101
|
+
.ck-field-error {
|
|
5102
|
+
color: var(--ck-error, #d93232);
|
|
5103
|
+
font-size: 0.85rem;
|
|
5104
|
+
margin: 4px 0 0;
|
|
5105
|
+
}
|
|
5106
|
+
|
|
5107
|
+
.ck-visually-hidden {
|
|
5108
|
+
position: absolute;
|
|
5109
|
+
width: 1px;
|
|
5110
|
+
height: 1px;
|
|
5111
|
+
padding: 0;
|
|
5112
|
+
margin: -1px;
|
|
5113
|
+
overflow: hidden;
|
|
5114
|
+
clip: rect(0 0 0 0);
|
|
5115
|
+
white-space: nowrap;
|
|
5116
|
+
border: 0;
|
|
5117
|
+
}
|
|
5118
|
+
.tag-mark:focus-within {
|
|
5119
|
+
outline: 2px solid var(--ck-accent);
|
|
5120
|
+
outline-offset: 2px;
|
|
5121
|
+
}
|
|
5122
|
+
|
|
5123
|
+
.ck-calibration {
|
|
5124
|
+
margin-top: 12px;
|
|
5125
|
+
padding-top: 12px;
|
|
5126
|
+
border-top: 1px dashed rgba(255, 255, 255, 0.08);
|
|
5127
|
+
}
|
|
5128
|
+
.ck-calibration__prompt {
|
|
5129
|
+
font-size: 0.8rem;
|
|
5130
|
+
color: var(--ck-dim);
|
|
5131
|
+
margin: 0 0 8px;
|
|
5132
|
+
display: flex;
|
|
5133
|
+
align-items: center;
|
|
5134
|
+
gap: 8px;
|
|
5135
|
+
}
|
|
5136
|
+
.ck-calibration__count {
|
|
5137
|
+
font-size: 0.75rem;
|
|
5138
|
+
color: var(--ck-accent);
|
|
5139
|
+
}
|
|
5140
|
+
.ck-calibration__buttons {
|
|
5141
|
+
display: flex;
|
|
5142
|
+
gap: 8px;
|
|
5143
|
+
flex-wrap: wrap;
|
|
5144
|
+
}
|
|
5145
|
+
.ck-calibration__pill {
|
|
5146
|
+
display: inline-flex;
|
|
5147
|
+
align-items: center;
|
|
5148
|
+
gap: 6px;
|
|
5149
|
+
padding: 6px 12px;
|
|
5150
|
+
border-radius: 999px;
|
|
5151
|
+
font-size: 0.85rem;
|
|
5152
|
+
background: transparent;
|
|
5153
|
+
border: 1px solid rgba(255, 255, 255, 0.18);
|
|
5154
|
+
color: inherit;
|
|
5155
|
+
cursor: pointer;
|
|
5156
|
+
}
|
|
5157
|
+
.ck-calibration__pill:hover,
|
|
5158
|
+
.ck-calibration__pill:focus-visible {
|
|
5159
|
+
border-color: var(--ck-accent);
|
|
5160
|
+
}
|
|
5161
|
+
.ck-calibration__pill.is-active {
|
|
5162
|
+
background: var(--ck-accent);
|
|
5163
|
+
color: #0b1320;
|
|
5164
|
+
border-color: var(--ck-accent);
|
|
5165
|
+
}
|
|
5166
|
+
.ck-calibration__detail {
|
|
5167
|
+
margin-top: 10px;
|
|
5168
|
+
display: flex;
|
|
5169
|
+
flex-direction: column;
|
|
5170
|
+
gap: 8px;
|
|
5171
|
+
}
|
|
5172
|
+
.ck-calibration__value {
|
|
5173
|
+
color: var(--ck-accent);
|
|
5174
|
+
font-weight: 600;
|
|
5175
|
+
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
module CompletionKit
|
|
2
|
+
module Api
|
|
3
|
+
module V1
|
|
4
|
+
class CalibrationsController < BaseController
|
|
5
|
+
before_action :ensure_calibration_enabled
|
|
6
|
+
before_action :set_scope
|
|
7
|
+
|
|
8
|
+
def index
|
|
9
|
+
render json: scope_calibrations
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
def create
|
|
13
|
+
calibration = scope_calibrations.find_or_initialize_by(created_by: created_by_param)
|
|
14
|
+
calibration.assign_attributes(
|
|
15
|
+
run: @run,
|
|
16
|
+
response: @response,
|
|
17
|
+
metric: @metric,
|
|
18
|
+
judge_version: JudgeVersion.ensure_current_for(@metric),
|
|
19
|
+
**calibration_params
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
if calibration.save
|
|
23
|
+
render json: calibration, status: calibration.previously_new_record? ? :created : :ok
|
|
24
|
+
else
|
|
25
|
+
render json: { errors: calibration.errors }, status: :unprocessable_entity
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
private
|
|
30
|
+
|
|
31
|
+
def ensure_calibration_enabled
|
|
32
|
+
render(json: { error: "Calibration disabled" }, status: :not_found) unless CompletionKit.config.judge_calibration_enabled
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
def set_scope
|
|
36
|
+
@run = Run.find(params[:run_id])
|
|
37
|
+
@response = @run.responses.find(params[:response_id])
|
|
38
|
+
@metric = Metric.find(params[:metric_id])
|
|
39
|
+
rescue ActiveRecord::RecordNotFound
|
|
40
|
+
not_found
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
def scope_calibrations
|
|
44
|
+
Calibration.where(run_id: @run.id, response_id: @response.id, metric_id: @metric.id)
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
def calibration_params
|
|
48
|
+
params.permit(:verdict, :corrected_score, :note).to_h.symbolize_keys
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
def created_by_param
|
|
52
|
+
params[:created_by].presence || "api"
|
|
53
|
+
end
|
|
54
|
+
end
|
|
55
|
+
end
|
|
56
|
+
end
|
|
57
|
+
end
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
module CompletionKit
|
|
2
|
+
class CalibrationsController < ApplicationController
|
|
3
|
+
before_action :ensure_calibration_enabled
|
|
4
|
+
before_action :set_scope
|
|
5
|
+
|
|
6
|
+
def create
|
|
7
|
+
created_by = calibration_creator
|
|
8
|
+
calibration = Calibration.find_or_initialize_by(
|
|
9
|
+
run_id: @run.id, response_id: @response.id, metric_id: @metric.id, created_by: created_by
|
|
10
|
+
)
|
|
11
|
+
calibration.assign_attributes(
|
|
12
|
+
judge_version: JudgeVersion.ensure_current_for(@metric),
|
|
13
|
+
verdict: params[:verdict],
|
|
14
|
+
corrected_score: params[:corrected_score].presence,
|
|
15
|
+
note: params[:note].presence
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
if calibration.save
|
|
19
|
+
render turbo_stream: turbo_stream.replace(
|
|
20
|
+
"calibration_#{@response.id}_#{@metric.id}",
|
|
21
|
+
partial: "completion_kit/calibrations/buttons",
|
|
22
|
+
locals: { review: review_for_metric, calibration: calibration, run: @run, response_row: @response, metric: @metric }
|
|
23
|
+
)
|
|
24
|
+
else
|
|
25
|
+
flash[:alert] = calibration.errors.full_messages.to_sentence
|
|
26
|
+
redirect_to run_response_path(@run, @response)
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
private
|
|
31
|
+
|
|
32
|
+
def ensure_calibration_enabled
|
|
33
|
+
head :not_found unless CompletionKit.config.judge_calibration_enabled
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
def set_scope
|
|
37
|
+
@run = Run.find(params[:run_id])
|
|
38
|
+
@response = @run.responses.find(params[:response_id])
|
|
39
|
+
@metric = Metric.find(params[:metric_id])
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
def review_for_metric
|
|
43
|
+
@response.reviews.find_by(metric_id: @metric.id)
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
def calibration_creator
|
|
47
|
+
request.env["HTTP_X_REMOTE_USER"].presence || CompletionKit.config.username.presence || "operator"
|
|
48
|
+
end
|
|
49
|
+
end
|
|
50
|
+
end
|
|
@@ -218,6 +218,20 @@ module CompletionKit
|
|
|
218
218
|
"#{base_path}?#{{ tag: next_set.map(&:name) }.to_query}"
|
|
219
219
|
end
|
|
220
220
|
|
|
221
|
+
def ck_field_aria(form, field)
|
|
222
|
+
return {} unless form.object.errors[field].any?
|
|
223
|
+
{ "aria-invalid" => "true", "aria-describedby" => ck_field_error_id(form, field) }
|
|
224
|
+
end
|
|
225
|
+
|
|
226
|
+
def ck_field_error(form, field)
|
|
227
|
+
return nil unless form.object.errors[field].any?
|
|
228
|
+
content_tag(:p, form.object.errors[field].first, class: "ck-field-error", id: ck_field_error_id(form, field))
|
|
229
|
+
end
|
|
230
|
+
|
|
231
|
+
def ck_field_error_id(form, field)
|
|
232
|
+
"#{form.object.model_name.param_key}_#{field}_error"
|
|
233
|
+
end
|
|
234
|
+
|
|
221
235
|
private
|
|
222
236
|
|
|
223
237
|
def diff_tokens(old_text, new_text, side)
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
module CompletionKit
|
|
2
|
+
class Calibration < ApplicationRecord
|
|
3
|
+
VERDICTS = %w[agree disagree borderline].freeze
|
|
4
|
+
|
|
5
|
+
belongs_to :run
|
|
6
|
+
belongs_to :response
|
|
7
|
+
belongs_to :metric
|
|
8
|
+
belongs_to :judge_version
|
|
9
|
+
|
|
10
|
+
validates :verdict, presence: true, inclusion: { in: VERDICTS }
|
|
11
|
+
validates :response_id,
|
|
12
|
+
uniqueness: { scope: [:metric_id, :created_by] }
|
|
13
|
+
validate :corrected_score_required_when_disagreeing
|
|
14
|
+
validate :corrected_score_within_rubric
|
|
15
|
+
|
|
16
|
+
scope :for_run, ->(run_id) { where(run_id: run_id) }
|
|
17
|
+
scope :for_metric, ->(metric_id) { where(metric_id: metric_id) }
|
|
18
|
+
|
|
19
|
+
def as_json(options = {})
|
|
20
|
+
{
|
|
21
|
+
id: id,
|
|
22
|
+
run_id: run_id,
|
|
23
|
+
response_id: response_id,
|
|
24
|
+
metric_id: metric_id,
|
|
25
|
+
judge_version_id: judge_version_id,
|
|
26
|
+
verdict: verdict,
|
|
27
|
+
corrected_score: corrected_score,
|
|
28
|
+
note: note,
|
|
29
|
+
created_by: created_by,
|
|
30
|
+
created_at: created_at
|
|
31
|
+
}
|
|
32
|
+
end
|
|
33
|
+
|
|
34
|
+
private
|
|
35
|
+
|
|
36
|
+
def corrected_score_required_when_disagreeing
|
|
37
|
+
return unless verdict == "disagree"
|
|
38
|
+
errors.add(:corrected_score, "must be set when disagreeing with the judge") if corrected_score.blank?
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
def corrected_score_within_rubric
|
|
42
|
+
return if corrected_score.blank?
|
|
43
|
+
score = corrected_score.to_f
|
|
44
|
+
errors.add(:corrected_score, "must be between 1 and 5") unless score >= 1 && score <= 5
|
|
45
|
+
end
|
|
46
|
+
end
|
|
47
|
+
end
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
module CompletionKit
|
|
2
|
+
class JudgeVersion < ApplicationRecord
|
|
3
|
+
belongs_to :metric
|
|
4
|
+
has_many :calibrations, dependent: :destroy
|
|
5
|
+
|
|
6
|
+
serialize :rubric_bands, coder: JSON
|
|
7
|
+
|
|
8
|
+
validates :metric_id, presence: true
|
|
9
|
+
|
|
10
|
+
scope :current, -> { where(current: true) }
|
|
11
|
+
|
|
12
|
+
def self.ensure_current_for(metric)
|
|
13
|
+
current.find_by(metric_id: metric.id) || create!(
|
|
14
|
+
metric: metric,
|
|
15
|
+
instruction: metric.instruction,
|
|
16
|
+
rubric_bands: metric.rubric_bands,
|
|
17
|
+
current: true
|
|
18
|
+
)
|
|
19
|
+
end
|
|
20
|
+
|
|
21
|
+
def as_json(options = {})
|
|
22
|
+
{
|
|
23
|
+
id: id,
|
|
24
|
+
metric_id: metric_id,
|
|
25
|
+
instruction: instruction,
|
|
26
|
+
rubric_bands: rubric_bands,
|
|
27
|
+
current: current,
|
|
28
|
+
created_at: created_at
|
|
29
|
+
}
|
|
30
|
+
end
|
|
31
|
+
end
|
|
32
|
+
end
|
|
@@ -1,6 +1,3 @@
|
|
|
1
|
-
require "ipaddr"
|
|
2
|
-
require "resolv"
|
|
3
|
-
|
|
4
1
|
module CompletionKit
|
|
5
2
|
class ProviderCredential < ApplicationRecord
|
|
6
3
|
include Turbo::Broadcastable
|
|
@@ -8,7 +5,7 @@ module CompletionKit
|
|
|
8
5
|
PROVIDER_LABELS = {
|
|
9
6
|
"openai" => "OpenAI",
|
|
10
7
|
"anthropic" => "Anthropic",
|
|
11
|
-
"ollama" => "Ollama /
|
|
8
|
+
"ollama" => "Ollama / OpenAI-compatible endpoint",
|
|
12
9
|
"openrouter" => "OpenRouter"
|
|
13
10
|
}.freeze
|
|
14
11
|
|
|
@@ -139,29 +136,12 @@ module CompletionKit
|
|
|
139
136
|
def api_endpoint_not_internal
|
|
140
137
|
return if api_endpoint.blank?
|
|
141
138
|
|
|
142
|
-
|
|
143
|
-
|
|
139
|
+
issues = ProviderEndpoint.validate(api_endpoint)
|
|
140
|
+
if issues.include?(:invalid_url)
|
|
144
141
|
errors.add(:api_endpoint, "must be a valid http or https URL")
|
|
145
|
-
|
|
146
|
-
end
|
|
147
|
-
|
|
148
|
-
if endpoint_addresses(uri.host).any? { |ip| ip.private? || ip.link_local? }
|
|
142
|
+
elsif issues.include?(:unsafe_host)
|
|
149
143
|
errors.add(:api_endpoint, "must not point at a private or internal address")
|
|
150
144
|
end
|
|
151
145
|
end
|
|
152
|
-
|
|
153
|
-
def safe_http_uri(value)
|
|
154
|
-
uri = URI.parse(value.to_s.strip)
|
|
155
|
-
uri if uri.is_a?(URI::HTTP) && uri.host.present?
|
|
156
|
-
rescue URI::InvalidURIError
|
|
157
|
-
nil
|
|
158
|
-
end
|
|
159
|
-
|
|
160
|
-
def endpoint_addresses(host)
|
|
161
|
-
bare = host.delete_prefix("[").delete_suffix("]")
|
|
162
|
-
[IPAddr.new(bare)]
|
|
163
|
-
rescue IPAddr::InvalidAddressError
|
|
164
|
-
Resolv.getaddresses(host).map { |addr| IPAddr.new(addr) }
|
|
165
|
-
end
|
|
166
146
|
end
|
|
167
147
|
end
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
2
|
class ApiConfig
|
|
3
|
+
PROVIDERS = %w[openai anthropic ollama openrouter].freeze
|
|
4
|
+
|
|
3
5
|
def self.for_model(model_name)
|
|
4
6
|
provider = provider_for_model(model_name)
|
|
5
7
|
provider ? for_provider(provider) : {}
|
|
@@ -9,21 +11,25 @@ module CompletionKit
|
|
|
9
11
|
provider = provider_name.to_s
|
|
10
12
|
stored = ProviderCredential.find_by(provider: provider)&.config_hash || {}
|
|
11
13
|
|
|
12
|
-
defaults =
|
|
13
|
-
|
|
14
|
-
{ provider: "openai", api_key: CompletionKit.config.openai_api_key || ENV["OPENAI_API_KEY"] }
|
|
15
|
-
when "anthropic"
|
|
16
|
-
{ provider: "anthropic", api_key: CompletionKit.config.anthropic_api_key || ENV["ANTHROPIC_API_KEY"] }
|
|
17
|
-
when "ollama"
|
|
18
|
-
{
|
|
19
|
-
provider: "ollama",
|
|
20
|
-
api_key: CompletionKit.config.ollama_api_key || ENV["OLLAMA_API_KEY"],
|
|
21
|
-
api_endpoint: CompletionKit.config.ollama_api_endpoint || ENV["OLLAMA_API_ENDPOINT"]
|
|
22
|
-
}
|
|
23
|
-
when "openrouter"
|
|
24
|
-
{ provider: "openrouter", api_key: ENV["OPENROUTER_API_KEY"] }
|
|
14
|
+
defaults = if CompletionKit.config.tenant_scope
|
|
15
|
+
PROVIDERS.include?(provider) ? { provider: provider } : {}
|
|
25
16
|
else
|
|
26
|
-
|
|
17
|
+
case provider
|
|
18
|
+
when "openai"
|
|
19
|
+
{ provider: "openai", api_key: CompletionKit.config.openai_api_key || ENV["OPENAI_API_KEY"] }
|
|
20
|
+
when "anthropic"
|
|
21
|
+
{ provider: "anthropic", api_key: CompletionKit.config.anthropic_api_key || ENV["ANTHROPIC_API_KEY"] }
|
|
22
|
+
when "ollama"
|
|
23
|
+
{
|
|
24
|
+
provider: "ollama",
|
|
25
|
+
api_key: CompletionKit.config.ollama_api_key || ENV["OLLAMA_API_KEY"],
|
|
26
|
+
api_endpoint: CompletionKit.config.ollama_api_endpoint || ENV["OLLAMA_API_ENDPOINT"]
|
|
27
|
+
}
|
|
28
|
+
when "openrouter"
|
|
29
|
+
{ provider: "openrouter", api_key: ENV["OPENROUTER_API_KEY"] }
|
|
30
|
+
else
|
|
31
|
+
{}
|
|
32
|
+
end
|
|
27
33
|
end
|
|
28
34
|
|
|
29
35
|
defaults.merge(stored.compact)
|
|
@@ -33,7 +33,8 @@ module CompletionKit
|
|
|
33
33
|
McpTools::Metrics.definitions +
|
|
34
34
|
McpTools::MetricGroups.definitions +
|
|
35
35
|
McpTools::ProviderCredentials.definitions +
|
|
36
|
-
McpTools::Tags.definitions
|
|
36
|
+
McpTools::Tags.definitions +
|
|
37
|
+
McpTools::Calibrations.definitions
|
|
37
38
|
end
|
|
38
39
|
|
|
39
40
|
def self.call_tool(name, arguments)
|
|
@@ -46,6 +47,7 @@ module CompletionKit
|
|
|
46
47
|
when /\Ametric_groups_/ then McpTools::MetricGroups.call(name, arguments)
|
|
47
48
|
when /\Aprovider_credentials_/ then McpTools::ProviderCredentials.call(name, arguments)
|
|
48
49
|
when /\Atags_/ then McpTools::Tags.call(name, arguments)
|
|
50
|
+
when /\Acalibrations_/ then McpTools::Calibrations.call(name, arguments)
|
|
49
51
|
else raise MethodNotFound, "Unknown tool: #{name}"
|
|
50
52
|
end
|
|
51
53
|
end
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
module CompletionKit
|
|
2
|
+
module McpTools
|
|
3
|
+
module Calibrations
|
|
4
|
+
extend Base
|
|
5
|
+
|
|
6
|
+
TOOLS = {
|
|
7
|
+
"calibrations_list" => {
|
|
8
|
+
description: "List calibrations. Filter by run_id, response_id, metric_id, or created_by.",
|
|
9
|
+
inputSchema: {
|
|
10
|
+
type: "object",
|
|
11
|
+
properties: {
|
|
12
|
+
run_id: {type: "integer"},
|
|
13
|
+
response_id: {type: "integer"},
|
|
14
|
+
metric_id: {type: "integer"},
|
|
15
|
+
created_by: {type: "string"}
|
|
16
|
+
},
|
|
17
|
+
required: []
|
|
18
|
+
},
|
|
19
|
+
handler: :list
|
|
20
|
+
},
|
|
21
|
+
"calibrations_create" => {
|
|
22
|
+
description: "Upsert a calibration for (run, response, metric, created_by). Verdict is one of agree, disagree, borderline. corrected_score (1..5) is required when verdict is 'disagree'.",
|
|
23
|
+
inputSchema: {
|
|
24
|
+
type: "object",
|
|
25
|
+
properties: {
|
|
26
|
+
run_id: {type: "integer"},
|
|
27
|
+
response_id: {type: "integer"},
|
|
28
|
+
metric_id: {type: "integer"},
|
|
29
|
+
verdict: {type: "string", enum: %w[agree disagree borderline]},
|
|
30
|
+
corrected_score: {type: "number"},
|
|
31
|
+
note: {type: "string"},
|
|
32
|
+
created_by: {type: "string"}
|
|
33
|
+
},
|
|
34
|
+
required: ["run_id", "response_id", "metric_id", "verdict"]
|
|
35
|
+
},
|
|
36
|
+
handler: :create
|
|
37
|
+
}
|
|
38
|
+
}.freeze
|
|
39
|
+
|
|
40
|
+
def self.list(args)
|
|
41
|
+
scope = CompletionKit::Calibration.all
|
|
42
|
+
scope = scope.where(run_id: args["run_id"]) if args["run_id"]
|
|
43
|
+
scope = scope.where(response_id: args["response_id"]) if args["response_id"]
|
|
44
|
+
scope = scope.where(metric_id: args["metric_id"]) if args["metric_id"]
|
|
45
|
+
scope = scope.where(created_by: args["created_by"]) if args["created_by"]
|
|
46
|
+
text_result(scope.order(:created_at).map(&:as_json))
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
def self.create(args)
|
|
50
|
+
run = CompletionKit::Run.find(args["run_id"])
|
|
51
|
+
response = run.responses.find(args["response_id"])
|
|
52
|
+
metric = CompletionKit::Metric.find(args["metric_id"])
|
|
53
|
+
created_by = args["created_by"].presence || "mcp"
|
|
54
|
+
|
|
55
|
+
calibration = CompletionKit::Calibration.find_or_initialize_by(
|
|
56
|
+
run_id: run.id, response_id: response.id, metric_id: metric.id, created_by: created_by
|
|
57
|
+
)
|
|
58
|
+
calibration.assign_attributes(
|
|
59
|
+
judge_version: CompletionKit::JudgeVersion.ensure_current_for(metric),
|
|
60
|
+
verdict: args["verdict"],
|
|
61
|
+
corrected_score: args["corrected_score"],
|
|
62
|
+
note: args["note"]
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
if calibration.save
|
|
66
|
+
text_result(calibration.as_json)
|
|
67
|
+
else
|
|
68
|
+
error_result(calibration.errors.full_messages.join(", "))
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
end
|
|
@@ -7,6 +7,7 @@ module CompletionKit
|
|
|
7
7
|
def generate_completion(prompt, options = {})
|
|
8
8
|
@temperature_dropped = false
|
|
9
9
|
return "Error: API endpoint not configured" unless configured?
|
|
10
|
+
return "Error: API endpoint resolves to a private address" unless ProviderEndpoint.safe?(api_endpoint)
|
|
10
11
|
|
|
11
12
|
model = options[:model]
|
|
12
13
|
max_tokens = options[:max_tokens] || 1000
|
|
@@ -44,6 +45,7 @@ module CompletionKit
|
|
|
44
45
|
|
|
45
46
|
def available_models
|
|
46
47
|
return [] unless configured?
|
|
48
|
+
return [] unless ProviderEndpoint.safe?(api_endpoint)
|
|
47
49
|
|
|
48
50
|
response = build_connection(api_endpoint).get("/v1/models") do |req|
|
|
49
51
|
req.headers["Authorization"] = "Bearer #{api_key}" if api_key.present?
|
|
@@ -74,7 +76,9 @@ module CompletionKit
|
|
|
74
76
|
end
|
|
75
77
|
|
|
76
78
|
def api_endpoint
|
|
77
|
-
|
|
79
|
+
raw = @config[:api_endpoint] || ENV["OLLAMA_API_ENDPOINT"]
|
|
80
|
+
raw ||= "http://localhost:11434/v1" if CompletionKit.config.allow_loopback_endpoints
|
|
81
|
+
raw.to_s.delete_suffix("/")
|
|
78
82
|
end
|
|
79
83
|
|
|
80
84
|
def post_completion(model:, prompt:, max_tokens:, temperature:)
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
require "ipaddr"
|
|
2
|
+
require "resolv"
|
|
3
|
+
|
|
4
|
+
module CompletionKit
|
|
5
|
+
module ProviderEndpoint
|
|
6
|
+
ZERO_NET = IPAddr.new("0.0.0.0/8").freeze
|
|
7
|
+
|
|
8
|
+
module_function
|
|
9
|
+
|
|
10
|
+
def validate(url)
|
|
11
|
+
uri = parse(url)
|
|
12
|
+
return [:invalid_url] unless uri
|
|
13
|
+
addrs = addresses(uri.host)
|
|
14
|
+
return [:unresolvable] if addrs.empty?
|
|
15
|
+
return [:unsafe_host] if addrs.any? { |ip| unsafe?(ip) }
|
|
16
|
+
[]
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
def safe?(url)
|
|
20
|
+
errors = validate(url)
|
|
21
|
+
errors.empty? || errors == [:unresolvable]
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
def parse(value)
|
|
25
|
+
uri = URI.parse(value.to_s.strip)
|
|
26
|
+
uri if uri.is_a?(URI::HTTP) && uri.host.present?
|
|
27
|
+
rescue URI::InvalidURIError
|
|
28
|
+
nil
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
def addresses(host)
|
|
32
|
+
bare = host.delete_prefix("[").delete_suffix("]")
|
|
33
|
+
[IPAddr.new(bare)]
|
|
34
|
+
rescue IPAddr::InvalidAddressError
|
|
35
|
+
Resolv.getaddresses(host).map { |addr| IPAddr.new(addr) }
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
def unsafe?(ip)
|
|
39
|
+
return true if ip.private?
|
|
40
|
+
return true if ip.link_local?
|
|
41
|
+
return true if ip.to_i.zero?
|
|
42
|
+
return true if ip.ipv4? && ZERO_NET.include?(ip)
|
|
43
|
+
return true if ip.loopback? && !CompletionKit.config.allow_loopback_endpoints
|
|
44
|
+
false
|
|
45
|
+
end
|
|
46
|
+
end
|
|
47
|
+
end
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
<div id="calibration_<%= response_row.id %>_<%= metric.id %>" class="ck-calibration">
|
|
2
|
+
<% current_verdict = calibration&.verdict %>
|
|
3
|
+
<% verdict_count = CompletionKit::Calibration.where(response_id: response_row.id, metric_id: metric.id).count %>
|
|
4
|
+
<p class="ck-calibration__prompt">
|
|
5
|
+
How does this score feel?
|
|
6
|
+
<% if verdict_count > 0 %>
|
|
7
|
+
<span class="ck-calibration__count"><%= pluralize(verdict_count, "verdict") %> collected</span>
|
|
8
|
+
<% end %>
|
|
9
|
+
</p>
|
|
10
|
+
<div class="ck-calibration__buttons">
|
|
11
|
+
<% CompletionKit::Calibration::VERDICTS.each do |verdict| %>
|
|
12
|
+
<%= button_to run_response_calibrations_path(run, response_row, metric_id: metric.id, verdict: verdict),
|
|
13
|
+
method: :post,
|
|
14
|
+
form: { data: { turbo: "true" } },
|
|
15
|
+
class: "ck-calibration__pill ck-calibration__pill--#{verdict}#{' is-active' if verdict == current_verdict}",
|
|
16
|
+
"aria-pressed": (verdict == current_verdict).to_s do %>
|
|
17
|
+
<% case verdict
|
|
18
|
+
when "agree" %>👍 Agree<% when "disagree" %>👎 Disagree<% else %>🤔 Borderline<% end %>
|
|
19
|
+
<% end %>
|
|
20
|
+
<% end %>
|
|
21
|
+
</div>
|
|
22
|
+
|
|
23
|
+
<% if current_verdict == "disagree" %>
|
|
24
|
+
<%= form_with url: run_response_calibrations_path(run, response_row),
|
|
25
|
+
method: :post, local: false,
|
|
26
|
+
class: "ck-calibration__detail" do |f| %>
|
|
27
|
+
<%= hidden_field_tag :metric_id, metric.id %>
|
|
28
|
+
<%= hidden_field_tag :verdict, "disagree" %>
|
|
29
|
+
<label class="ck-label">
|
|
30
|
+
Your score
|
|
31
|
+
<span class="ck-calibration__value" data-calibration-value><%= calibration.corrected_score || review&.ai_score || 3 %></span>
|
|
32
|
+
</label>
|
|
33
|
+
<input type="range" name="corrected_score" min="1" max="5" step="0.5"
|
|
34
|
+
value="<%= calibration.corrected_score || review&.ai_score || 3 %>"
|
|
35
|
+
oninput="this.closest('.ck-calibration__detail').querySelector('[data-calibration-value]').textContent = this.value"
|
|
36
|
+
class="ck-slider">
|
|
37
|
+
<textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="Why? (optional)"><%= calibration.note %></textarea>
|
|
38
|
+
<%= f.submit "Save", class: ck_button_classes(:dark) %>
|
|
39
|
+
<% end %>
|
|
40
|
+
<% elsif current_verdict == "borderline" %>
|
|
41
|
+
<%= form_with url: run_response_calibrations_path(run, response_row),
|
|
42
|
+
method: :post, local: false,
|
|
43
|
+
class: "ck-calibration__detail" do |f| %>
|
|
44
|
+
<%= hidden_field_tag :metric_id, metric.id %>
|
|
45
|
+
<%= hidden_field_tag :verdict, "borderline" %>
|
|
46
|
+
<textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="What made this borderline? (optional)"><%= calibration.note %></textarea>
|
|
47
|
+
<%= f.submit "Save", class: ck_button_classes(:dark) %>
|
|
48
|
+
<% end %>
|
|
49
|
+
<% end %>
|
|
50
|
+
</div>
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
<%= form_with(model: dataset, local: true) do |form| %>
|
|
2
2
|
<% if dataset.errors.any? %>
|
|
3
|
-
<div class="ck-flash ck-flash--alert">
|
|
3
|
+
<div class="ck-flash ck-flash--alert" role="alert">
|
|
4
4
|
<p class="ck-flash__title"><%= pluralize(dataset.errors.count, "problem") %> prevented this dataset from being saved.</p>
|
|
5
5
|
<ul class="ck-error-list">
|
|
6
6
|
<% dataset.errors.full_messages.each do |message| %>
|
|
@@ -13,12 +13,14 @@
|
|
|
13
13
|
<div class="ck-card ck-form-card">
|
|
14
14
|
<div class="ck-field">
|
|
15
15
|
<%= form.label :name, "Name", class: "ck-label" %>
|
|
16
|
-
<%= form.text_field :name, class: "ck-input", placeholder: "Customer support tickets" %>
|
|
16
|
+
<%= form.text_field :name, class: "ck-input", placeholder: "Customer support tickets", **ck_field_aria(form, :name) %>
|
|
17
|
+
<%= ck_field_error(form, :name) %>
|
|
17
18
|
</div>
|
|
18
19
|
|
|
19
20
|
<div class="ck-field">
|
|
20
21
|
<%= form.label :csv_data, "CSV data", class: "ck-label" %>
|
|
21
|
-
<%= form.text_area :csv_data, rows: 12, class: "ck-input ck-input--area ck-input--code", placeholder: "content,audience\nFirst ticket text,internal\nSecond ticket text,customer" %>
|
|
22
|
+
<%= form.text_area :csv_data, rows: 12, class: "ck-input ck-input--area ck-input--code", placeholder: "content,audience\nFirst ticket text,internal\nSecond ticket text,customer", **ck_field_aria(form, :csv_data) %>
|
|
23
|
+
<%= ck_field_error(form, :csv_data) %>
|
|
22
24
|
</div>
|
|
23
25
|
|
|
24
26
|
<%= render "completion_kit/tags/picker", record: dataset, param_namespace: :dataset %>
|
|
@@ -17,18 +17,18 @@
|
|
|
17
17
|
<table class="ck-results-table ck-datasets-table">
|
|
18
18
|
<thead>
|
|
19
19
|
<tr>
|
|
20
|
-
<th>Name</th>
|
|
21
|
-
<th>Rows</th>
|
|
22
|
-
<th>Used in</th>
|
|
23
|
-
<th>Created</th>
|
|
24
|
-
<th></th>
|
|
20
|
+
<th scope="col">Name</th>
|
|
21
|
+
<th scope="col">Rows</th>
|
|
22
|
+
<th scope="col">Used in</th>
|
|
23
|
+
<th scope="col">Created</th>
|
|
24
|
+
<th scope="col"></th>
|
|
25
25
|
</tr>
|
|
26
26
|
</thead>
|
|
27
27
|
<tbody>
|
|
28
28
|
<% @datasets.each do |dataset| %>
|
|
29
29
|
<tr onclick="window.location='<%= dataset_path(dataset) %>'" style="cursor: pointer;">
|
|
30
30
|
<td>
|
|
31
|
-
|
|
31
|
+
<%= link_to dataset_path(dataset), class: "ck-record-name" do %><strong><%= dataset.name %></strong><% end %>
|
|
32
32
|
<% if dataset.tags.any? %>
|
|
33
33
|
<div class="tag-marks-row">
|
|
34
34
|
<%= render "completion_kit/tags/marks", tags: dataset.tags %>
|
|
@@ -22,17 +22,17 @@
|
|
|
22
22
|
<table class="ck-results-table ck-metric-groups-table">
|
|
23
23
|
<thead>
|
|
24
24
|
<tr>
|
|
25
|
-
<th>Name</th>
|
|
26
|
-
<th>Description</th>
|
|
27
|
-
<th>Members</th>
|
|
28
|
-
<th></th>
|
|
25
|
+
<th scope="col">Name</th>
|
|
26
|
+
<th scope="col">Description</th>
|
|
27
|
+
<th scope="col">Members</th>
|
|
28
|
+
<th scope="col"></th>
|
|
29
29
|
</tr>
|
|
30
30
|
</thead>
|
|
31
31
|
<tbody>
|
|
32
32
|
<% @metric_groups.each do |metric_group| %>
|
|
33
33
|
<tr onclick="window.location='<%= metric_group_path(metric_group) %>'" style="cursor: pointer;">
|
|
34
34
|
<td>
|
|
35
|
-
|
|
35
|
+
<%= link_to metric_group_path(metric_group), class: "ck-record-name" do %><strong><%= metric_group.name %></strong><% end %>
|
|
36
36
|
<% if metric_group.tags.any? %>
|
|
37
37
|
<div class="tag-marks-row">
|
|
38
38
|
<%= render "completion_kit/tags/marks", tags: metric_group.tags %>
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
<%= form_with(model: metric, local: true) do |form| %>
|
|
2
2
|
<% if metric.errors.any? %>
|
|
3
|
-
<div class="ck-flash ck-flash--alert">
|
|
3
|
+
<div class="ck-flash ck-flash--alert" role="alert">
|
|
4
4
|
<p class="ck-flash__title"><%= pluralize(metric.errors.count, "problem") %> prevented this metric from being saved.</p>
|
|
5
5
|
<ul class="ck-error-list">
|
|
6
6
|
<% metric.errors.full_messages.each do |message| %>
|
|
@@ -13,13 +13,15 @@
|
|
|
13
13
|
<div class="ck-card ck-form-card">
|
|
14
14
|
<div class="ck-field">
|
|
15
15
|
<%= form.label :name, "Metric name", class: "ck-label" %>
|
|
16
|
-
<%= form.text_field :name, class: "ck-input", placeholder: "Helpfulness" %>
|
|
16
|
+
<%= form.text_field :name, class: "ck-input", placeholder: "Helpfulness", **ck_field_aria(form, :name) %>
|
|
17
|
+
<%= ck_field_error(form, :name) %>
|
|
17
18
|
</div>
|
|
18
19
|
|
|
19
20
|
<div class="ck-field ck-field--spacious">
|
|
20
21
|
<p class="ck-section-title">Instruction</p>
|
|
21
22
|
<p class="ck-hint">What should the judge assess? This instruction is sent to the LLM judge when scoring outputs.</p>
|
|
22
|
-
<%= form.text_area :instruction, rows: 8, class: "ck-input ck-input--area", placeholder: "Evaluate whether the output..." %>
|
|
23
|
+
<%= form.text_area :instruction, rows: 8, class: "ck-input ck-input--area", placeholder: "Evaluate whether the output...", **ck_field_aria(form, :instruction) %>
|
|
24
|
+
<%= ck_field_error(form, :instruction) %>
|
|
23
25
|
</div>
|
|
24
26
|
|
|
25
27
|
<div class="ck-field ck-field--spacious">
|
|
@@ -17,17 +17,17 @@
|
|
|
17
17
|
<table class="ck-results-table ck-metrics-table">
|
|
18
18
|
<thead>
|
|
19
19
|
<tr>
|
|
20
|
-
<th>Name</th>
|
|
21
|
-
<th>Instruction</th>
|
|
22
|
-
<th>In groups</th>
|
|
23
|
-
<th></th>
|
|
20
|
+
<th scope="col">Name</th>
|
|
21
|
+
<th scope="col">Instruction</th>
|
|
22
|
+
<th scope="col">In groups</th>
|
|
23
|
+
<th scope="col"></th>
|
|
24
24
|
</tr>
|
|
25
25
|
</thead>
|
|
26
26
|
<tbody>
|
|
27
27
|
<% @metrics.each do |metric| %>
|
|
28
28
|
<tr onclick="window.location='<%= metric_path(metric) %>'" style="cursor: pointer;">
|
|
29
29
|
<td>
|
|
30
|
-
|
|
30
|
+
<%= link_to metric_path(metric), class: "ck-record-name" do %><strong><%= metric.name %></strong><% end %>
|
|
31
31
|
<% if metric.tags.any? %>
|
|
32
32
|
<div class="tag-marks-row">
|
|
33
33
|
<%= render "completion_kit/tags/marks", tags: metric.tags %>
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
<%= form_with(model: prompt, local: true) do |form| %>
|
|
2
2
|
<% if prompt.errors.any? %>
|
|
3
|
-
<div class="ck-flash ck-flash--alert">
|
|
3
|
+
<div class="ck-flash ck-flash--alert" role="alert">
|
|
4
4
|
<p class="ck-flash__title"><%= pluralize(prompt.errors.count, "problem") %> prevented this prompt from being saved.</p>
|
|
5
5
|
<ul class="ck-error-list">
|
|
6
6
|
<% prompt.errors.full_messages.each do |message| %>
|
|
@@ -17,17 +17,20 @@
|
|
|
17
17
|
<div class="ck-card ck-form-card">
|
|
18
18
|
<div class="ck-field">
|
|
19
19
|
<%= form.label :name, "Name", class: "ck-label" %>
|
|
20
|
-
<%= form.text_field :name, class: "ck-input", placeholder: "Support summary", autocomplete: "off", data: { "1p-ignore": "" } %>
|
|
20
|
+
<%= form.text_field :name, class: "ck-input", placeholder: "Support summary", autocomplete: "off", data: { "1p-ignore": "" }, **ck_field_aria(form, :name) %>
|
|
21
|
+
<%= ck_field_error(form, :name) %>
|
|
21
22
|
</div>
|
|
22
23
|
|
|
23
24
|
<div class="ck-field">
|
|
24
25
|
<%= form.label :description, class: "ck-label" %>
|
|
25
|
-
<%= form.text_area :description, rows: 3, class: "ck-input ck-input--area", placeholder: "Short note about this prompt." %>
|
|
26
|
+
<%= form.text_area :description, rows: 3, class: "ck-input ck-input--area", placeholder: "Short note about this prompt.", **ck_field_aria(form, :description) %>
|
|
27
|
+
<%= ck_field_error(form, :description) %>
|
|
26
28
|
</div>
|
|
27
29
|
|
|
28
30
|
<div class="ck-field">
|
|
29
31
|
<%= form.label :template, "Prompt text", class: "ck-label" %>
|
|
30
|
-
<%= form.text_area :template, rows: 12, class: "ck-input ck-input--area ck-input--code", placeholder: "Summarize {{content}} for {{audience}}" %>
|
|
32
|
+
<%= form.text_area :template, rows: 12, class: "ck-input ck-input--area ck-input--code", placeholder: "Summarize {{content}} for {{audience}}", **ck_field_aria(form, :template) %>
|
|
33
|
+
<%= ck_field_error(form, :template) %>
|
|
31
34
|
<p class="ck-hint">Use <code>{{variable}}</code>. Match your dataset column names.</p>
|
|
32
35
|
</div>
|
|
33
36
|
|
|
@@ -17,19 +17,19 @@
|
|
|
17
17
|
<table class="ck-results-table ck-prompts-table">
|
|
18
18
|
<thead>
|
|
19
19
|
<tr>
|
|
20
|
-
<th>Name</th>
|
|
21
|
-
<th>Version</th>
|
|
22
|
-
<th>Model</th>
|
|
23
|
-
<th>Best score</th>
|
|
24
|
-
<th>Runs</th>
|
|
25
|
-
<th></th>
|
|
20
|
+
<th scope="col">Name</th>
|
|
21
|
+
<th scope="col">Version</th>
|
|
22
|
+
<th scope="col">Model</th>
|
|
23
|
+
<th scope="col">Best score</th>
|
|
24
|
+
<th scope="col">Runs</th>
|
|
25
|
+
<th scope="col"></th>
|
|
26
26
|
</tr>
|
|
27
27
|
</thead>
|
|
28
28
|
<tbody>
|
|
29
29
|
<% @prompts.each do |prompt| %>
|
|
30
30
|
<tr onclick="window.location='<%= prompt_path(prompt) %>'" style="cursor: pointer;">
|
|
31
31
|
<td>
|
|
32
|
-
|
|
32
|
+
<%= link_to prompt_path(prompt), class: "ck-record-name" do %><strong><%= prompt.name %></strong><% end %>
|
|
33
33
|
<% if prompt.description.present? %>
|
|
34
34
|
<p class="ck-prompts-table__desc"><%= truncate(prompt.description, length: 120) %></p>
|
|
35
35
|
<% end %>
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
<%= form_with(model: provider_credential, local: true) do |form| %>
|
|
2
2
|
<% if provider_credential.errors.any? %>
|
|
3
|
-
<div class="ck-flash ck-flash--alert">
|
|
3
|
+
<div class="ck-flash ck-flash--alert" role="alert">
|
|
4
4
|
<p class="ck-flash__title"><%= pluralize(provider_credential.errors.count, "problem") %> prevented this provider credential from being saved.</p>
|
|
5
5
|
<ul class="ck-error-list">
|
|
6
6
|
<% provider_credential.errors.full_messages.each do |message| %>
|
|
@@ -18,12 +18,14 @@
|
|
|
18
18
|
|
|
19
19
|
<div class="ck-field">
|
|
20
20
|
<%= form.label :api_key, "API key", class: "ck-label" %>
|
|
21
|
-
<%= form.text_area :api_key, rows: 3, class: "ck-input ck-input--area ck-input--code", placeholder: "Paste the provider API key" %>
|
|
21
|
+
<%= form.text_area :api_key, rows: 3, class: "ck-input ck-input--area ck-input--code", placeholder: "Paste the provider API key", **ck_field_aria(form, :api_key) %>
|
|
22
|
+
<%= ck_field_error(form, :api_key) %>
|
|
22
23
|
</div>
|
|
23
24
|
|
|
24
25
|
<div class="ck-field">
|
|
25
26
|
<%= form.label :api_endpoint, "API endpoint", class: "ck-label" %>
|
|
26
|
-
<%= form.text_field :api_endpoint, class: "ck-input", placeholder: "Only needed for Ollama or custom OpenAI-compatible endpoints" %>
|
|
27
|
+
<%= form.text_field :api_endpoint, class: "ck-input", placeholder: "Only needed for Ollama or custom OpenAI-compatible endpoints", **ck_field_aria(form, :api_endpoint) %>
|
|
28
|
+
<%= ck_field_error(form, :api_endpoint) %>
|
|
27
29
|
</div>
|
|
28
30
|
|
|
29
31
|
<div class="ck-actions">
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
<ol class="ck-breadcrumb">
|
|
1
|
+
<ol class="ck-breadcrumb" aria-label="Breadcrumb">
|
|
2
2
|
<% if @run.prompt %>
|
|
3
3
|
<li><%= link_to "Prompts", prompts_path %></li>
|
|
4
4
|
<li><%= link_to @run.prompt.name, prompt_path(@run.prompt) %></li>
|
|
@@ -116,6 +116,15 @@
|
|
|
116
116
|
<div class="ck-note-box"><%= review.ai_feedback %></div>
|
|
117
117
|
</div>
|
|
118
118
|
<% end %>
|
|
119
|
+
<% if CompletionKit.config.judge_calibration_enabled && review.metric && review.ai_score %>
|
|
120
|
+
<% existing = CompletionKit::Calibration.find_by(
|
|
121
|
+
response_id: @response.id, metric_id: review.metric_id,
|
|
122
|
+
created_by: CompletionKit.config.username.presence || "operator"
|
|
123
|
+
) %>
|
|
124
|
+
<%= render "completion_kit/calibrations/buttons",
|
|
125
|
+
review: review, calibration: existing, run: @run,
|
|
126
|
+
response_row: @response, metric: review.metric %>
|
|
127
|
+
<% end %>
|
|
119
128
|
</div>
|
|
120
129
|
<% end %>
|
|
121
130
|
</div>
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
<%= form_with(model: run, local: true) do |form| %>
|
|
2
2
|
<% if run.errors.any? %>
|
|
3
|
-
<div class="ck-flash ck-flash--alert">
|
|
3
|
+
<div class="ck-flash ck-flash--alert" role="alert">
|
|
4
4
|
<p class="ck-flash__title"><%= pluralize(run.errors.count, "problem") %> prevented this run from being saved.</p>
|
|
5
5
|
<ul class="ck-error-list">
|
|
6
6
|
<% run.errors.full_messages.each do |message| %>
|
|
@@ -13,7 +13,8 @@
|
|
|
13
13
|
<div class="ck-card ck-form-card">
|
|
14
14
|
<div class="ck-field">
|
|
15
15
|
<%= form.label :name, "Name (auto-generated if blank)", class: "ck-label" %>
|
|
16
|
-
<%= form.text_field :name, class: "ck-input", placeholder: run.name.presence || "Auto-generated from prompt + version + timestamp" %>
|
|
16
|
+
<%= form.text_field :name, class: "ck-input", placeholder: run.name.presence || "Auto-generated from prompt + version + timestamp", **ck_field_aria(form, :name) %>
|
|
17
|
+
<%= ck_field_error(form, :name) %>
|
|
17
18
|
</div>
|
|
18
19
|
|
|
19
20
|
<div class="ck-field">
|
|
@@ -56,7 +57,8 @@
|
|
|
56
57
|
|
|
57
58
|
<div class="ck-field" id="output-column-field" hidden>
|
|
58
59
|
<%= form.label :output_column, "Output column", class: "ck-label" %>
|
|
59
|
-
<%= form.text_field :output_column, value: run.output_column.presence || "actual_output", class: "ck-input", id: "run_output_column", placeholder: "actual_output" %>
|
|
60
|
+
<%= form.text_field :output_column, value: run.output_column.presence || "actual_output", class: "ck-input", id: "run_output_column", placeholder: "actual_output", **ck_field_aria(form, :output_column) %>
|
|
61
|
+
<%= ck_field_error(form, :output_column) %>
|
|
60
62
|
<p class="ck-field-hint">Name of the dataset column whose value will be graded as the response. Defaults to <code>actual_output</code>.</p>
|
|
61
63
|
</div>
|
|
62
64
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
<% clickable = response.succeeded? %>
|
|
2
2
|
<tr id="response_<%= response.id %>"<% if clickable %> onclick="window.location='<%= run_response_path(run, response, sort: params[:sort]) %>'" style="cursor: pointer;"<% end %>>
|
|
3
|
-
<td class="ck-response-cell__index"
|
|
3
|
+
<td class="ck-response-cell__index"><% if clickable %><%= link_to index, run_response_path(run, response, sort: params[:sort]), class: "ck-record-name" %><% else %><%= index %><% end %></td>
|
|
4
4
|
<td class="ck-response-cell__text">
|
|
5
5
|
<% if response.status == "failed" %>
|
|
6
6
|
<% err = response.error_payload %>
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
<div class="ck-runs-table__identity">
|
|
4
4
|
<span class="ck-run-name">
|
|
5
5
|
<span class="<%= ck_run_dot(run) %>"></span>
|
|
6
|
-
|
|
6
|
+
<%= link_to ck_run_path(run), class: "ck-record-name" do %><strong><%= run.name %></strong><% end %>
|
|
7
7
|
</span>
|
|
8
8
|
<div class="ck-runs-table__config">
|
|
9
9
|
<% if run.prompt %>
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
<div id="run_status_header">
|
|
1
|
+
<div id="run_status_header" aria-live="polite">
|
|
2
2
|
<% if run.status == "failed" %>
|
|
3
|
-
<div class="ck-flash ck-flash--alert">
|
|
3
|
+
<div class="ck-flash ck-flash--alert" role="alert">
|
|
4
4
|
<%= run.failure_summary.presence || run.error_message.presence || "Run failed." %>
|
|
5
5
|
</div>
|
|
6
6
|
<% end %>
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
<% snap = run.progress_snapshot %>
|
|
2
|
-
<div id="run_status_panel">
|
|
2
|
+
<div id="run_status_panel" aria-live="polite" aria-atomic="true">
|
|
3
3
|
<% if run.status.in?(%w[running completed]) && snap[:generated_total] > 0 %>
|
|
4
4
|
<% failed_count = snap[:generated_failed] + snap[:judged_failed] %>
|
|
5
5
|
<% has_judge = snap[:judged_total] > 0 || run.judge_configured? %>
|
|
@@ -3,12 +3,12 @@
|
|
|
3
3
|
<table class="ck-results-table ck-runs-table" style="margin-top: 0.5rem;">
|
|
4
4
|
<thead>
|
|
5
5
|
<tr>
|
|
6
|
-
<th>Run</th>
|
|
7
|
-
<th>Responses</th>
|
|
8
|
-
<th>Metrics</th>
|
|
9
|
-
<th>Avg score</th>
|
|
10
|
-
<th>When</th>
|
|
11
|
-
<th></th>
|
|
6
|
+
<th scope="col">Run</th>
|
|
7
|
+
<th scope="col">Responses</th>
|
|
8
|
+
<th scope="col">Metrics</th>
|
|
9
|
+
<th scope="col">Avg score</th>
|
|
10
|
+
<th scope="col">When</th>
|
|
11
|
+
<th scope="col"></th>
|
|
12
12
|
</tr>
|
|
13
13
|
</thead>
|
|
14
14
|
<tbody>
|
|
@@ -7,8 +7,8 @@
|
|
|
7
7
|
<% all_tags.each do |tag| %>
|
|
8
8
|
<% checked = selected_ids.include?(tag.id) %>
|
|
9
9
|
<label class="tag-mark" style="--mark-color: var(--tag-<%= tag.color %>);">
|
|
10
|
-
<%= check_box_tag "#{param_namespace}[tag_names][]", tag.name, checked, hidden:
|
|
11
|
-
|
|
10
|
+
<%= check_box_tag "#{param_namespace}[tag_names][]", tag.name, checked, class: "ck-visually-hidden", "aria-label": "Tag: #{tag.name}" %>
|
|
11
|
+
<span aria-hidden="true"><%= tag.name %></span>
|
|
12
12
|
</label>
|
|
13
13
|
<% end %>
|
|
14
14
|
<%= text_field_tag "#{param_namespace}[tag_names][]", "",
|
|
@@ -14,9 +14,9 @@
|
|
|
14
14
|
<table class="ck-results-table ck-tags-table">
|
|
15
15
|
<thead>
|
|
16
16
|
<tr>
|
|
17
|
-
<th>Tag</th>
|
|
18
|
-
<th>Applied to</th>
|
|
19
|
-
<th></th>
|
|
17
|
+
<th scope="col">Tag</th>
|
|
18
|
+
<th scope="col">Applied to</th>
|
|
19
|
+
<th scope="col"></th>
|
|
20
20
|
</tr>
|
|
21
21
|
</thead>
|
|
22
22
|
<tbody>
|
|
@@ -25,7 +25,7 @@
|
|
|
25
25
|
<% by_type = @tagging_by_type.select { |(tid, _), _| tid == tag.id } %>
|
|
26
26
|
<% breakdown = by_type.map { |(_, type), n| pluralize(n, type.demodulize.titleize.downcase) }.join(" · ") %>
|
|
27
27
|
<tr onclick="window.location='<%= edit_tag_path(tag) %>'" style="cursor: pointer;">
|
|
28
|
-
<td
|
|
28
|
+
<td><%= link_to edit_tag_path(tag), class: "ck-record-name" do %><span class="tag-mark tag-mark--lg" style="--mark-color: var(--tag-<%= tag.color %>);"><%= tag.name %></span><% end %></td>
|
|
29
29
|
<td data-label="Applied to" class="ck-meta-copy">
|
|
30
30
|
<% if count.zero? %>
|
|
31
31
|
<span class="ck-tags-table__unused">Not used yet</span>
|
data/config/routes.rb
CHANGED
|
@@ -26,7 +26,9 @@ CompletionKit::Engine.routes.draw do
|
|
|
26
26
|
post :rerun
|
|
27
27
|
get :refresh_status
|
|
28
28
|
end
|
|
29
|
-
resources :responses, only: [:show]
|
|
29
|
+
resources :responses, only: [:show] do
|
|
30
|
+
resources :calibrations, only: [:create]
|
|
31
|
+
end
|
|
30
32
|
end
|
|
31
33
|
|
|
32
34
|
resources :suggestions, only: [:show] do
|
|
@@ -54,7 +56,11 @@ CompletionKit::Engine.routes.draw do
|
|
|
54
56
|
post :generate
|
|
55
57
|
post :retry_failures
|
|
56
58
|
end
|
|
57
|
-
resources :responses, only: [:index, :show]
|
|
59
|
+
resources :responses, only: [:index, :show] do
|
|
60
|
+
resources :metrics, only: [] do
|
|
61
|
+
resources :calibrations, only: [:index, :create]
|
|
62
|
+
end
|
|
63
|
+
end
|
|
58
64
|
end
|
|
59
65
|
resources :datasets
|
|
60
66
|
resources :metrics
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
class CreateCompletionKitJudgeVersions < ActiveRecord::Migration[8.1]
|
|
2
|
+
def change
|
|
3
|
+
create_table :completion_kit_judge_versions do |t|
|
|
4
|
+
t.references :metric,
|
|
5
|
+
null: false,
|
|
6
|
+
foreign_key: { to_table: :completion_kit_metrics, on_delete: :cascade },
|
|
7
|
+
index: { name: "index_ck_judge_versions_on_metric_id" }
|
|
8
|
+
t.text :instruction
|
|
9
|
+
t.text :rubric_bands
|
|
10
|
+
t.boolean :current, null: false, default: true
|
|
11
|
+
t.timestamps
|
|
12
|
+
end
|
|
13
|
+
|
|
14
|
+
add_index :completion_kit_judge_versions,
|
|
15
|
+
[:metric_id, :current],
|
|
16
|
+
name: "index_ck_judge_versions_on_metric_current"
|
|
17
|
+
|
|
18
|
+
reversible do |dir|
|
|
19
|
+
dir.up do
|
|
20
|
+
metric_model = Class.new(ActiveRecord::Base) { self.table_name = "completion_kit_metrics" }
|
|
21
|
+
jv_model = Class.new(ActiveRecord::Base) { self.table_name = "completion_kit_judge_versions" }
|
|
22
|
+
metric_model.find_each do |m|
|
|
23
|
+
jv_model.create!(metric_id: m.id, instruction: m["instruction"], rubric_bands: m["rubric_bands"], current: true)
|
|
24
|
+
end
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
class CreateCompletionKitCalibrations < ActiveRecord::Migration[8.1]
|
|
2
|
+
def change
|
|
3
|
+
create_table :completion_kit_calibrations do |t|
|
|
4
|
+
t.references :run,
|
|
5
|
+
null: false,
|
|
6
|
+
foreign_key: { to_table: :completion_kit_runs, on_delete: :cascade },
|
|
7
|
+
index: { name: "index_ck_calibrations_on_run_id" }
|
|
8
|
+
t.references :response,
|
|
9
|
+
null: false,
|
|
10
|
+
foreign_key: { to_table: :completion_kit_responses, on_delete: :cascade },
|
|
11
|
+
index: { name: "index_ck_calibrations_on_response_id" }
|
|
12
|
+
t.references :metric,
|
|
13
|
+
null: false,
|
|
14
|
+
foreign_key: { to_table: :completion_kit_metrics, on_delete: :cascade },
|
|
15
|
+
index: { name: "index_ck_calibrations_on_metric_id" }
|
|
16
|
+
t.references :judge_version,
|
|
17
|
+
null: false,
|
|
18
|
+
foreign_key: { to_table: :completion_kit_judge_versions, on_delete: :cascade },
|
|
19
|
+
index: { name: "index_ck_calibrations_on_judge_version_id" }
|
|
20
|
+
t.string :verdict, null: false
|
|
21
|
+
t.string :created_by
|
|
22
|
+
t.decimal :corrected_score, precision: 4, scale: 1
|
|
23
|
+
t.text :note
|
|
24
|
+
t.timestamps
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
add_index :completion_kit_calibrations,
|
|
28
|
+
[:response_id, :metric_id, :created_by],
|
|
29
|
+
unique: true,
|
|
30
|
+
name: "index_ck_calibrations_on_response_metric_user"
|
|
31
|
+
end
|
|
32
|
+
end
|
data/lib/completion_kit.rb
CHANGED
|
@@ -11,6 +11,8 @@ module CompletionKit
|
|
|
11
11
|
attr_accessor :tenant_scope, :tenant_scope_columns
|
|
12
12
|
attr_accessor :api_reference_authentication_partial
|
|
13
13
|
attr_accessor :api_rate_limit, :web_rate_limit
|
|
14
|
+
attr_accessor :allow_loopback_endpoints
|
|
15
|
+
attr_accessor :judge_calibration_enabled
|
|
14
16
|
|
|
15
17
|
def initialize
|
|
16
18
|
@openai_api_key = ENV['OPENAI_API_KEY']
|
|
@@ -25,6 +27,9 @@ module CompletionKit
|
|
|
25
27
|
@api_rate_limit = 120
|
|
26
28
|
@web_rate_limit = 300
|
|
27
29
|
|
|
30
|
+
@allow_loopback_endpoints = true
|
|
31
|
+
@judge_calibration_enabled = true
|
|
32
|
+
|
|
28
33
|
@api_reference_authentication_partial = "completion_kit/api_reference/authentication"
|
|
29
34
|
end
|
|
30
35
|
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: completion-kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.5.33
|
|
4
|
+
version: 0.5.35
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Damien Bastin
|
|
@@ -235,6 +235,7 @@ files:
|
|
|
235
235
|
- app/assets/javascripts/completion_kit/application.js
|
|
236
236
|
- app/assets/stylesheets/completion_kit/application.css.erb
|
|
237
237
|
- app/controllers/completion_kit/api/v1/base_controller.rb
|
|
238
|
+
- app/controllers/completion_kit/api/v1/calibrations_controller.rb
|
|
238
239
|
- app/controllers/completion_kit/api/v1/datasets_controller.rb
|
|
239
240
|
- app/controllers/completion_kit/api/v1/metric_groups_controller.rb
|
|
240
241
|
- app/controllers/completion_kit/api/v1/metrics_controller.rb
|
|
@@ -245,6 +246,7 @@ files:
|
|
|
245
246
|
- app/controllers/completion_kit/api/v1/tags_controller.rb
|
|
246
247
|
- app/controllers/completion_kit/api_reference_controller.rb
|
|
247
248
|
- app/controllers/completion_kit/application_controller.rb
|
|
249
|
+
- app/controllers/completion_kit/calibrations_controller.rb
|
|
248
250
|
- app/controllers/completion_kit/dashboard_controller.rb
|
|
249
251
|
- app/controllers/completion_kit/dashboard_dismissals_controller.rb
|
|
250
252
|
- app/controllers/completion_kit/datasets_controller.rb
|
|
@@ -267,8 +269,10 @@ files:
|
|
|
267
269
|
- app/jobs/completion_kit/run_completion_check_job.rb
|
|
268
270
|
- app/mailers/completion_kit/application_mailer.rb
|
|
269
271
|
- app/models/completion_kit/application_record.rb
|
|
272
|
+
- app/models/completion_kit/calibration.rb
|
|
270
273
|
- app/models/completion_kit/dashboard_dismissal.rb
|
|
271
274
|
- app/models/completion_kit/dataset.rb
|
|
275
|
+
- app/models/completion_kit/judge_version.rb
|
|
272
276
|
- app/models/completion_kit/mcp_session.rb
|
|
273
277
|
- app/models/completion_kit/metric.rb
|
|
274
278
|
- app/models/completion_kit/metric_group.rb
|
|
@@ -292,6 +296,7 @@ files:
|
|
|
292
296
|
- app/services/completion_kit/llm_client.rb
|
|
293
297
|
- app/services/completion_kit/mcp_dispatcher.rb
|
|
294
298
|
- app/services/completion_kit/mcp_tools/base.rb
|
|
299
|
+
- app/services/completion_kit/mcp_tools/calibrations.rb
|
|
295
300
|
- app/services/completion_kit/mcp_tools/datasets.rb
|
|
296
301
|
- app/services/completion_kit/mcp_tools/metric_groups.rb
|
|
297
302
|
- app/services/completion_kit/mcp_tools/metrics.rb
|
|
@@ -308,6 +313,7 @@ files:
|
|
|
308
313
|
- app/services/completion_kit/open_ai_client.rb
|
|
309
314
|
- app/services/completion_kit/open_router_client.rb
|
|
310
315
|
- app/services/completion_kit/prompt_improvement_service.rb
|
|
316
|
+
- app/services/completion_kit/provider_endpoint.rb
|
|
311
317
|
- app/services/completion_kit/worker_health.rb
|
|
312
318
|
- app/validators/completion_kit/tenant_scoped_uniqueness_validator.rb
|
|
313
319
|
- app/views/completion_kit/api_reference/_authentication.html.erb
|
|
@@ -316,6 +322,7 @@ files:
|
|
|
316
322
|
- app/views/completion_kit/api_reference/_resource_card.html.erb
|
|
317
323
|
- app/views/completion_kit/api_reference/_resource_list.html.erb
|
|
318
324
|
- app/views/completion_kit/api_reference/index.html.erb
|
|
325
|
+
- app/views/completion_kit/calibrations/_buttons.html.erb
|
|
319
326
|
- app/views/completion_kit/dashboard/_eye_icon.html.erb
|
|
320
327
|
- app/views/completion_kit/dashboard/_eye_off_icon.html.erb
|
|
321
328
|
- app/views/completion_kit/dashboard/_failures_card.html.erb
|
|
@@ -398,6 +405,8 @@ files:
|
|
|
398
405
|
- db/migrate/20260513000001_create_completion_kit_mcp_sessions.rb
|
|
399
406
|
- db/migrate/20260514000001_allow_judge_only_runs.rb
|
|
400
407
|
- db/migrate/20260516000001_create_completion_kit_dashboard_dismissals.rb
|
|
408
|
+
- db/migrate/20260522000001_create_completion_kit_judge_versions.rb
|
|
409
|
+
- db/migrate/20260522000002_create_completion_kit_calibrations.rb
|
|
401
410
|
- lib/completion-kit.rb
|
|
402
411
|
- lib/completion_kit.rb
|
|
403
412
|
- lib/completion_kit/concurrency_check.rb
|