completion-kit 0.11.0 → 0.12.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/javascripts/completion_kit/application.js +32 -0
- data/app/assets/stylesheets/completion_kit/application.css +51 -51
- data/app/controllers/completion_kit/{calibrations_controller.rb → agreements_controller.rb} +19 -19
- data/app/controllers/completion_kit/api/v1/{calibrations_controller.rb → agreements_controller.rb} +18 -18
- data/app/controllers/completion_kit/api/v1/metric_versions_controller.rb +2 -7
- data/app/controllers/completion_kit/api/v1/metrics_controller.rb +1 -1
- data/app/controllers/completion_kit/metrics_controller.rb +10 -11
- data/app/controllers/completion_kit/provider_credentials_controller.rb +7 -2
- data/app/jobs/completion_kit/judge_review_job.rb +2 -2
- data/app/jobs/completion_kit/model_discovery_job.rb +2 -2
- data/app/models/completion_kit/{calibration.rb → agreement.rb} +1 -1
- data/app/models/completion_kit/metric_version.rb +1 -17
- data/app/models/completion_kit/provider_credential.rb +22 -12
- data/app/models/completion_kit/review.rb +1 -0
- data/app/services/completion_kit/{calibration_math.rb → agreement_math.rb} +1 -1
- data/app/services/completion_kit/mcp_dispatcher.rb +2 -2
- data/app/services/completion_kit/mcp_tools/{calibrations.rb → agreements.rb} +11 -11
- data/app/services/completion_kit/mcp_tools/judges.rb +3 -3
- data/app/services/completion_kit/mcp_tools/metric_versions.rb +2 -7
- data/app/services/completion_kit/{metric_calibration_examples.rb → metric_agreement_examples.rb} +6 -6
- data/app/services/completion_kit/{metric_calibration_stats.rb → metric_agreement_stats.rb} +6 -6
- data/app/services/completion_kit/metric_improvement_validator.rb +1 -1
- data/app/services/completion_kit/metric_variant_generator.rb +2 -2
- data/app/services/completion_kit/model_discovery_service.rb +16 -3
- data/app/views/completion_kit/{calibrations → agreements}/_buttons.html.erb +33 -33
- data/app/views/completion_kit/{calibrations → agreements}/_trust_panel.html.erb +5 -5
- data/app/views/completion_kit/api_reference/_body.html.erb +15 -15
- data/app/views/completion_kit/metrics/_guiding_examples.html.erb +1 -1
- data/app/views/completion_kit/metrics/edit.html.erb +1 -1
- data/app/views/completion_kit/metrics/show.html.erb +6 -6
- data/app/views/completion_kit/provider_credentials/_models_card.html.erb +1 -1
- data/app/views/completion_kit/provider_credentials/statuses.turbo_stream.erb +4 -0
- data/app/views/completion_kit/responses/show.html.erb +4 -4
- data/app/views/completion_kit/runs/show.html.erb +1 -1
- data/config/routes.rb +4 -3
- data/db/migrate/20260531000002_backfill_review_metric_versions.rb +33 -0
- data/db/migrate/20260531000003_add_metric_version_fk_to_reviews.rb +6 -0
- data/db/migrate/20260531000004_rename_calibrations_to_agreements.rb +19 -0
- data/lib/completion_kit/version.rb +1 -1
- data/lib/completion_kit.rb +2 -2
- metadata +14 -10
|
@@ -3,7 +3,7 @@ module CompletionKit
|
|
|
3
3
|
STATES = %w[draft published].freeze
|
|
4
4
|
|
|
5
5
|
belongs_to :metric
|
|
6
|
-
has_many :calibrations, dependent: :destroy
|
|
6
|
+
has_many :agreements, dependent: :destroy
|
|
7
7
|
|
|
8
8
|
serialize :rubric_bands, coder: JSON
|
|
9
9
|
serialize :validation_summary, coder: JSON
|
|
@@ -83,22 +83,6 @@ module CompletionKit
|
|
|
83
83
|
self
|
|
84
84
|
end
|
|
85
85
|
|
|
86
|
-
def revert!
|
|
87
|
-
raise ArgumentError, "only a published version can be reverted to" unless published?
|
|
88
|
-
audit = nil
|
|
89
|
-
MetricVersion.transaction do
|
|
90
|
-
audit = self.class.create!(
|
|
91
|
-
metric: metric,
|
|
92
|
-
instruction: instruction,
|
|
93
|
-
rubric_bands: rubric_bands,
|
|
94
|
-
state: "draft",
|
|
95
|
-
source: "revert"
|
|
96
|
-
)
|
|
97
|
-
audit.publish!
|
|
98
|
-
end
|
|
99
|
-
audit
|
|
100
|
-
end
|
|
101
|
-
|
|
102
86
|
def as_json(options = {})
|
|
103
87
|
{
|
|
104
88
|
id: id,
|
|
@@ -79,11 +79,13 @@ module CompletionKit
|
|
|
79
79
|
end
|
|
80
80
|
|
|
81
81
|
def broadcast_discovery_progress
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
82
|
+
safely_broadcast do
|
|
83
|
+
broadcast_replace_to(
|
|
84
|
+
"completion_kit_provider_#{id}",
|
|
85
|
+
target: "discovery_status_#{id}",
|
|
86
|
+
html: render_partial("completion_kit/provider_credentials/discovery_status", provider_credential: self)
|
|
87
|
+
)
|
|
88
|
+
end
|
|
87
89
|
broadcast_provider_models
|
|
88
90
|
end
|
|
89
91
|
|
|
@@ -93,13 +95,15 @@ module CompletionKit
|
|
|
93
95
|
end
|
|
94
96
|
|
|
95
97
|
def broadcast_provider_models
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
98
|
+
safely_broadcast do
|
|
99
|
+
Turbo::StreamsChannel.broadcast_action_to(
|
|
100
|
+
"completion_kit_provider_#{id}",
|
|
101
|
+
action: "replace",
|
|
102
|
+
target: "provider_models_#{id}",
|
|
103
|
+
method: "morph",
|
|
104
|
+
html: render_partial("completion_kit/provider_credentials/models_card", provider_credential: self)
|
|
105
|
+
)
|
|
106
|
+
end
|
|
103
107
|
end
|
|
104
108
|
|
|
105
109
|
private
|
|
@@ -133,6 +137,12 @@ module CompletionKit
|
|
|
133
137
|
CompletionKit::ApplicationController.render(partial: partial, locals: locals)
|
|
134
138
|
end
|
|
135
139
|
|
|
140
|
+
def safely_broadcast
|
|
141
|
+
yield
|
|
142
|
+
rescue StandardError => e
|
|
143
|
+
Rails.logger.error("[CompletionKit] discovery broadcast render failed: #{e.class}: #{e.message}")
|
|
144
|
+
end
|
|
145
|
+
|
|
136
146
|
def api_endpoint_not_internal
|
|
137
147
|
return if api_endpoint.blank?
|
|
138
148
|
|
|
@@ -8,6 +8,7 @@ module CompletionKit
|
|
|
8
8
|
has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
|
|
9
9
|
|
|
10
10
|
validates :metric_name, presence: true
|
|
11
|
+
validates :metric_version, presence: true
|
|
11
12
|
validates :ai_score, numericality: { greater_than_or_equal_to: 1, less_than_or_equal_to: 5 }, allow_nil: true
|
|
12
13
|
|
|
13
14
|
before_validation :set_default_status
|
|
@@ -35,7 +35,7 @@ module CompletionKit
|
|
|
35
35
|
McpTools::MetricVersions.definitions +
|
|
36
36
|
McpTools::ProviderCredentials.definitions +
|
|
37
37
|
McpTools::Tags.definitions +
|
|
38
|
-
McpTools::Calibrations.definitions +
|
|
38
|
+
McpTools::Agreements.definitions +
|
|
39
39
|
McpTools::Judges.definitions
|
|
40
40
|
end
|
|
41
41
|
|
|
@@ -50,7 +50,7 @@ module CompletionKit
|
|
|
50
50
|
when /\Ametrics_/ then McpTools::Metrics.call(name, arguments)
|
|
51
51
|
when /\Aprovider_credentials_/ then McpTools::ProviderCredentials.call(name, arguments)
|
|
52
52
|
when /\Atags_/ then McpTools::Tags.call(name, arguments)
|
|
53
|
-
when /\Acalibrations_/ then McpTools::Calibrations.call(name, arguments)
|
|
53
|
+
when /\Aagreements_/ then McpTools::Agreements.call(name, arguments)
|
|
54
54
|
when /\Ajudges_/ then McpTools::Judges.call(name, arguments)
|
|
55
55
|
else raise MethodNotFound, "Unknown tool: #{name}"
|
|
56
56
|
end
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
2
|
module McpTools
|
|
3
|
-
module Calibrations
|
|
3
|
+
module Agreements
|
|
4
4
|
extend Base
|
|
5
5
|
|
|
6
6
|
TOOLS = {
|
|
7
|
-
"
|
|
8
|
-
description: "List
|
|
7
|
+
"agreements_list" => {
|
|
8
|
+
description: "List agreements. Filter by run_id, response_id, metric_id, or created_by.",
|
|
9
9
|
inputSchema: {
|
|
10
10
|
type: "object",
|
|
11
11
|
properties: {
|
|
@@ -18,8 +18,8 @@ module CompletionKit
|
|
|
18
18
|
},
|
|
19
19
|
handler: :list
|
|
20
20
|
},
|
|
21
|
-
"
|
|
22
|
-
description: "Upsert
|
|
21
|
+
"agreements_create" => {
|
|
22
|
+
description: "Upsert an agreement for (run, response, metric, created_by). Verdict is one of agree, disagree, borderline. corrected_score (1..5) is required when verdict is 'disagree'.",
|
|
23
23
|
inputSchema: {
|
|
24
24
|
type: "object",
|
|
25
25
|
properties: {
|
|
@@ -38,7 +38,7 @@ module CompletionKit
|
|
|
38
38
|
}.freeze
|
|
39
39
|
|
|
40
40
|
def self.list(args)
|
|
41
|
-
scope = CompletionKit::Calibration.all
|
|
41
|
+
scope = CompletionKit::Agreement.all
|
|
42
42
|
scope = scope.where(run_id: args["run_id"]) if args["run_id"]
|
|
43
43
|
scope = scope.where(response_id: args["response_id"]) if args["response_id"]
|
|
44
44
|
scope = scope.where(metric_id: args["metric_id"]) if args["metric_id"]
|
|
@@ -52,20 +52,20 @@ module CompletionKit
|
|
|
52
52
|
metric = CompletionKit::Metric.find(args["metric_id"])
|
|
53
53
|
created_by = args["created_by"].presence || "mcp"
|
|
54
54
|
|
|
55
|
-
|
|
55
|
+
agreement = CompletionKit::Agreement.find_or_initialize_by(
|
|
56
56
|
run_id: run.id, response_id: response.id, metric_id: metric.id, created_by: created_by
|
|
57
57
|
)
|
|
58
|
-
|
|
58
|
+
agreement.assign_attributes(
|
|
59
59
|
metric_version: CompletionKit::MetricVersion.ensure_current_for(metric),
|
|
60
60
|
verdict: args["verdict"],
|
|
61
61
|
corrected_score: args["corrected_score"],
|
|
62
62
|
note: args["note"]
|
|
63
63
|
)
|
|
64
64
|
|
|
65
|
-
if
|
|
66
|
-
text_result(
|
|
65
|
+
if agreement.save
|
|
66
|
+
text_result(agreement.as_json)
|
|
67
67
|
else
|
|
68
|
-
error_result(
|
|
68
|
+
error_result(agreement.errors.full_messages.join(", "))
|
|
69
69
|
end
|
|
70
70
|
end
|
|
71
71
|
end
|
|
@@ -33,7 +33,7 @@ module CompletionKit
|
|
|
33
33
|
handler: :replay
|
|
34
34
|
},
|
|
35
35
|
"judges_compare" => {
|
|
36
|
-
description: "Compare two metric versions'
|
|
36
|
+
description: "Compare two metric versions' agreement stats side by side. Pass either two metric_version_ids or one metric_id with metric_version_a_id / metric_version_b_id.",
|
|
37
37
|
inputSchema: {
|
|
38
38
|
type: "object",
|
|
39
39
|
properties: {
|
|
@@ -77,8 +77,8 @@ module CompletionKit
|
|
|
77
77
|
metric = CompletionKit::Metric.find(args["metric_id"])
|
|
78
78
|
a = CompletionKit::MetricVersion.where(metric_id: metric.id).find(args["metric_version_a_id"])
|
|
79
79
|
b = CompletionKit::MetricVersion.where(metric_id: metric.id).find(args["metric_version_b_id"])
|
|
80
|
-
stats_a = CompletionKit::MetricCalibrationStats.for(metric, metric_version: a)
|
|
81
|
-
stats_b = CompletionKit::MetricCalibrationStats.for(metric, metric_version: b)
|
|
80
|
+
stats_a = CompletionKit::MetricAgreementStats.for(metric, metric_version: a)
|
|
81
|
+
stats_b = CompletionKit::MetricAgreementStats.for(metric, metric_version: b)
|
|
82
82
|
text_result({
|
|
83
83
|
metric_id: metric.id,
|
|
84
84
|
a: metric_version_payload(a, stats_a),
|
|
@@ -47,13 +47,8 @@ module CompletionKit
|
|
|
47
47
|
|
|
48
48
|
def self.publish(args)
|
|
49
49
|
version = CompletionKit::MetricVersion.find(args["metric_version_id"])
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
text_result(audit.as_json)
|
|
53
|
-
else
|
|
54
|
-
version.publish!
|
|
55
|
-
text_result(version.reload.as_json)
|
|
56
|
-
end
|
|
50
|
+
version.publish!
|
|
51
|
+
text_result(version.reload.as_json)
|
|
57
52
|
end
|
|
58
53
|
|
|
59
54
|
def self.dismiss(args)
|
data/app/services/completion_kit/{metric_calibration_examples.rb → metric_agreement_examples.rb}
RENAMED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
|
-
module MetricCalibrationExamples
|
|
2
|
+
module MetricAgreementExamples
|
|
3
3
|
DEFAULT_JUDGE_EXAMPLE_LIMIT = 5
|
|
4
4
|
|
|
5
5
|
module_function
|
|
@@ -9,18 +9,18 @@ module CompletionKit
|
|
|
9
9
|
end
|
|
10
10
|
|
|
11
11
|
def disagreements_for(metric, limit: 8)
|
|
12
|
-
|
|
12
|
+
agreements_for(metric, verdict: "disagree", limit: limit)
|
|
13
13
|
end
|
|
14
14
|
|
|
15
15
|
def borderlines_for(metric, limit: 6)
|
|
16
|
-
|
|
16
|
+
agreements_for(metric, verdict: "borderline", limit: limit)
|
|
17
17
|
end
|
|
18
18
|
|
|
19
19
|
def judge_examples_for(metric, exclude_response_id: nil, limit: DEFAULT_JUDGE_EXAMPLE_LIMIT)
|
|
20
20
|
current_version = MetricVersion.current.find_by(metric_id: metric.id)
|
|
21
21
|
return [] unless current_version
|
|
22
22
|
|
|
23
|
-
relation =
|
|
23
|
+
relation = Agreement
|
|
24
24
|
.where(metric_id: metric.id, metric_version_id: current_version.id, excluded_from_examples: false)
|
|
25
25
|
.where.not(corrected_score: nil)
|
|
26
26
|
relation = relation.where.not(response_id: exclude_response_id) if exclude_response_id
|
|
@@ -28,8 +28,8 @@ module CompletionKit
|
|
|
28
28
|
.reject { |example| example[:judge_score].nil? }
|
|
29
29
|
end
|
|
30
30
|
|
|
31
|
-
def
|
|
32
|
-
base =
|
|
31
|
+
def agreements_for(metric, verdict:, limit:)
|
|
32
|
+
base = Agreement.where(metric_id: metric.id, verdict: verdict)
|
|
33
33
|
current_version = MetricVersion.current.find_by(metric_id: metric.id)
|
|
34
34
|
scoped = current_version ? base.where(metric_version_id: current_version.id) : base
|
|
35
35
|
effective = scoped.exists? ? scoped : base
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
|
-
class MetricCalibrationStats
|
|
2
|
+
class MetricAgreementStats
|
|
3
3
|
PROVISIONAL_MIN = 10
|
|
4
4
|
FIRM_MIN = 30
|
|
5
5
|
|
|
@@ -49,7 +49,7 @@ module CompletionKit
|
|
|
49
49
|
end
|
|
50
50
|
|
|
51
51
|
def call
|
|
52
|
-
scope =
|
|
52
|
+
scope = Agreement.where(metric_id: @metric.id)
|
|
53
53
|
if @metric_version
|
|
54
54
|
scope = scope.where(metric_version_id: @metric_version.id)
|
|
55
55
|
elsif !@all_versions
|
|
@@ -62,12 +62,12 @@ module CompletionKit
|
|
|
62
62
|
disagrees = verdicts.count { |v, _, _| v == "disagree" }
|
|
63
63
|
borderlines = verdicts.count { |v, _, _| v == "borderline" }
|
|
64
64
|
|
|
65
|
-
ci = CalibrationMath.wilson_interval(successes: agrees, n: n)
|
|
65
|
+
ci = AgreementMath.wilson_interval(successes: agrees, n: n)
|
|
66
66
|
|
|
67
67
|
pairs = score_pairs(verdicts)
|
|
68
|
-
mae_value = CalibrationMath.mae(pairs)
|
|
69
|
-
pearson_value = CalibrationMath.pearson(pairs)
|
|
70
|
-
kappa_value = CalibrationMath.quadratic_weighted_kappa(pairs, categories: 1..5)
|
|
68
|
+
mae_value = AgreementMath.mae(pairs)
|
|
69
|
+
pearson_value = AgreementMath.pearson(pairs)
|
|
70
|
+
kappa_value = AgreementMath.quadratic_weighted_kappa(pairs, categories: 1..5)
|
|
71
71
|
|
|
72
72
|
Result.new(
|
|
73
73
|
sample_size: n,
|
|
@@ -28,7 +28,7 @@ module CompletionKit
|
|
|
28
28
|
current = MetricVersion.current.find_by(metric_id: @metric.id)
|
|
29
29
|
return [] unless current
|
|
30
30
|
|
|
31
|
-
base =
|
|
31
|
+
base = Agreement.where(metric_id: @metric.id, metric_version_id: current.id, verdict: %w[agree disagree])
|
|
32
32
|
@key_size_before_cap = base.count
|
|
33
33
|
base.includes(response: :reviews)
|
|
34
34
|
.order(created_at: :desc)
|
|
@@ -41,8 +41,8 @@ module CompletionKit
|
|
|
41
41
|
private
|
|
42
42
|
|
|
43
43
|
def build_meta_prompt
|
|
44
|
-
disagreements =
|
|
45
|
-
borderlines =
|
|
44
|
+
disagreements = MetricAgreementExamples.disagreements_for(@metric)
|
|
45
|
+
borderlines = MetricAgreementExamples.borderlines_for(@metric)
|
|
46
46
|
sections = []
|
|
47
47
|
sections << "You are an expert evaluator. The judge below is misaligned with humans. Propose #{@count == 1 ? "a single" : "#{@count}"} concrete rewrite that closes the gap."
|
|
48
48
|
sections << ""
|
|
@@ -12,7 +12,7 @@ module CompletionKit
|
|
|
12
12
|
@api_endpoint = config[:api_endpoint]
|
|
13
13
|
end
|
|
14
14
|
|
|
15
|
-
def refresh!(&on_progress)
|
|
15
|
+
def refresh!(force: false, &on_progress)
|
|
16
16
|
discovered = fetch_models
|
|
17
17
|
reconcile(discovered)
|
|
18
18
|
# OpenRouter publishes capability metadata (output modalities, etc.), so we
|
|
@@ -20,6 +20,7 @@ module CompletionKit
|
|
|
20
20
|
# Judging stays unknown ("?") until a real run proves it.
|
|
21
21
|
return if @provider == "openrouter"
|
|
22
22
|
|
|
23
|
+
reset_failed_generation if force
|
|
23
24
|
probe_new_models(&on_progress)
|
|
24
25
|
end
|
|
25
26
|
|
|
@@ -181,6 +182,11 @@ module CompletionKit
|
|
|
181
182
|
end
|
|
182
183
|
end
|
|
183
184
|
|
|
185
|
+
def reset_failed_generation
|
|
186
|
+
Model.where(provider: @provider, status: %w[active failed], supports_generation: false)
|
|
187
|
+
.update_all(supports_generation: nil, generation_error: nil)
|
|
188
|
+
end
|
|
189
|
+
|
|
184
190
|
def probe_new_models(&on_progress)
|
|
185
191
|
candidates = Model.where(provider: @provider, status: %w[active failed])
|
|
186
192
|
.where("supports_generation IS NULL OR supports_judging IS NULL OR (generation_error IS NOT NULL AND #{retryable_error_sql('generation_error')}) OR (judging_error IS NOT NULL AND #{retryable_error_sql('judging_error')})")
|
|
@@ -220,7 +226,7 @@ module CompletionKit
|
|
|
220
226
|
|
|
221
227
|
def probe_generation(model)
|
|
222
228
|
probe_input = "Reply with exactly this token and nothing else: PING-OK"
|
|
223
|
-
response = send_probe(model.model_id, probe_input,
|
|
229
|
+
response = send_probe(model.model_id, probe_input, probe_max_output_tokens)
|
|
224
230
|
if response.success?
|
|
225
231
|
text = extract_text(response).to_s
|
|
226
232
|
if text.blank?
|
|
@@ -251,7 +257,7 @@ module CompletionKit
|
|
|
251
257
|
AI output to evaluate: The sky is blue.
|
|
252
258
|
PROMPT
|
|
253
259
|
|
|
254
|
-
response = send_probe(model.model_id, judge_input,
|
|
260
|
+
response = send_probe(model.model_id, judge_input, probe_max_output_tokens)
|
|
255
261
|
if response.success?
|
|
256
262
|
text = extract_text(response).to_s
|
|
257
263
|
if text.match?(/Score:\s*\d/i)
|
|
@@ -269,6 +275,13 @@ module CompletionKit
|
|
|
269
275
|
model.judging_error = e.message
|
|
270
276
|
end
|
|
271
277
|
|
|
278
|
+
OPENAI_REASONING_PROBE_BUDGET = 65_536
|
|
279
|
+
CHAT_PROBE_BUDGET = 1_024
|
|
280
|
+
|
|
281
|
+
def probe_max_output_tokens
|
|
282
|
+
@provider == "openai" ? OPENAI_REASONING_PROBE_BUDGET : CHAT_PROBE_BUDGET
|
|
283
|
+
end
|
|
284
|
+
|
|
272
285
|
def send_probe(model_id, input, max_tokens)
|
|
273
286
|
case @provider
|
|
274
287
|
when "openai" then openai_probe(model_id, input, max_tokens)
|
|
@@ -1,34 +1,34 @@
|
|
|
1
|
-
<div id="
|
|
2
|
-
<% current_verdict =
|
|
1
|
+
<div id="agreement_<%= response_row.id %>_<%= metric.id %>" class="ck-agreement">
|
|
2
|
+
<% current_verdict = agreement&.verdict %>
|
|
3
3
|
<% pending_verdict = local_assigns[:pending_verdict] %>
|
|
4
4
|
<% active_verdict = pending_verdict || current_verdict %>
|
|
5
5
|
<% error = local_assigns[:error] %>
|
|
6
6
|
<% me = CompletionKit.config.username.presence || "operator" %>
|
|
7
|
-
<%
|
|
7
|
+
<% other_agreements = CompletionKit::Agreement
|
|
8
8
|
.where(response_id: response_row.id, metric_id: metric.id)
|
|
9
9
|
.where.not(created_by: me)
|
|
10
10
|
.order(created_at: :asc).to_a %>
|
|
11
11
|
<% verdict_icons = { "agree" => "hand-thumb-up", "disagree" => "hand-thumb-down", "borderline" => "scale" } %>
|
|
12
|
-
<p class="ck-
|
|
13
|
-
<span class="ck-
|
|
14
|
-
<% if
|
|
15
|
-
<span class="ck-
|
|
16
|
-
<span class="ck-
|
|
12
|
+
<p class="ck-agreement__prompt">
|
|
13
|
+
<span class="ck-agreement__label">Your verdict</span>
|
|
14
|
+
<% if other_agreements.any? %>
|
|
15
|
+
<span class="ck-agreement__meta"><%= pluralize(other_agreements.size, "other verdict") %> on this score</span>
|
|
16
|
+
<span class="ck-agreement__sep">·</span>
|
|
17
17
|
<% end %>
|
|
18
|
-
<%= link_to metric_path(metric), class: "ck-
|
|
18
|
+
<%= link_to metric_path(metric, anchor: "agreement"), class: "ck-agreement__meta-link" do %><%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true" %>Agreement →<% end %>
|
|
19
19
|
</p>
|
|
20
|
-
<div class="ck-
|
|
20
|
+
<div class="ck-agreement__buttons">
|
|
21
21
|
<% verdict_icons = { "agree" => "hand-thumb-up", "disagree" => "hand-thumb-down", "borderline" => "scale" } %>
|
|
22
22
|
<% verdict_hints = {
|
|
23
23
|
"agree" => "The score looks right.",
|
|
24
24
|
"disagree" => "The score is wrong — you'll pick the right one.",
|
|
25
25
|
"borderline" => "The rubric is unclear here; either score could be defensible."
|
|
26
26
|
} %>
|
|
27
|
-
<% CompletionKit::
|
|
28
|
-
<%= button_to
|
|
27
|
+
<% CompletionKit::Agreement::VERDICTS.each do |verdict| %>
|
|
28
|
+
<%= button_to run_response_agreements_path(run, response_row, metric_id: metric.id, verdict: verdict),
|
|
29
29
|
method: :post,
|
|
30
30
|
form: { data: { turbo: "true" } },
|
|
31
|
-
class: "ck-
|
|
31
|
+
class: "ck-agreement__pill ck-agreement__pill--#{verdict}#{' is-active' if verdict == active_verdict}",
|
|
32
32
|
"aria-pressed": (verdict == active_verdict).to_s,
|
|
33
33
|
title: verdict_hints[verdict] do %>
|
|
34
34
|
<%= heroicon_tag verdict_icons[verdict], variant: :outline, size: 14, "aria-hidden": "true" %>
|
|
@@ -38,26 +38,26 @@
|
|
|
38
38
|
</div>
|
|
39
39
|
|
|
40
40
|
<% if error.present? %>
|
|
41
|
-
<p class="ck-
|
|
41
|
+
<p class="ck-agreement__error" role="alert"><%= error %></p>
|
|
42
42
|
<% end %>
|
|
43
43
|
|
|
44
|
-
<% if
|
|
45
|
-
<details class="ck-
|
|
46
|
-
<summary class="ck-
|
|
44
|
+
<% if other_agreements.any? %>
|
|
45
|
+
<details class="ck-agreement__others">
|
|
46
|
+
<summary class="ck-agreement__others-summary">
|
|
47
47
|
<%= heroicon_tag "chevron-right", variant: :outline, size: 14, "aria-hidden": "true" %>
|
|
48
|
-
<span>What others said (<%=
|
|
48
|
+
<span>What others said (<%= other_agreements.size %>)</span>
|
|
49
49
|
</summary>
|
|
50
|
-
<ul class="ck-
|
|
51
|
-
<%
|
|
52
|
-
<li class="ck-
|
|
53
|
-
<div class="ck-
|
|
54
|
-
<span class="ck-
|
|
50
|
+
<ul class="ck-agreement__others-list">
|
|
51
|
+
<% other_agreements.each do |other| %>
|
|
52
|
+
<li class="ck-agreement__others-item ck-agreement__others-item--<%= other.verdict %>">
|
|
53
|
+
<div class="ck-agreement__others-row">
|
|
54
|
+
<span class="ck-agreement__others-verdict">
|
|
55
55
|
<%= heroicon_tag verdict_icons[other.verdict], variant: :outline, size: 14, "aria-hidden": "true" %>
|
|
56
56
|
<%= other.verdict %>
|
|
57
57
|
</span>
|
|
58
|
-
<span class="ck-
|
|
58
|
+
<span class="ck-agreement__others-by"><%= other.created_by %></span>
|
|
59
59
|
<% if other.corrected_score %>
|
|
60
|
-
<span class="ck-
|
|
60
|
+
<span class="ck-agreement__others-stars" aria-label="<%= pluralize(other.corrected_score.to_i, 'star') %>" title="<%= pluralize(other.corrected_score.to_i, 'star') %>">
|
|
61
61
|
<% 5.times do |i| %>
|
|
62
62
|
<svg viewBox="0 0 24 24" width="12" height="12" stroke-width="1.75" class="ck-star <%= i < other.corrected_score.to_i ? "ck-star--filled" : "ck-star--empty" %>" aria-hidden="true"><polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2"/></svg>
|
|
63
63
|
<% end %>
|
|
@@ -65,7 +65,7 @@
|
|
|
65
65
|
<% end %>
|
|
66
66
|
</div>
|
|
67
67
|
<% if other.note.to_s.present? %>
|
|
68
|
-
<p class="ck-
|
|
68
|
+
<p class="ck-agreement__others-note">"<%= other.note.to_s.truncate(140) %>"</p>
|
|
69
69
|
<% end %>
|
|
70
70
|
</li>
|
|
71
71
|
<% end %>
|
|
@@ -74,10 +74,10 @@
|
|
|
74
74
|
<% end %>
|
|
75
75
|
|
|
76
76
|
<% if active_verdict == "disagree" %>
|
|
77
|
-
<% existing_score = (
|
|
78
|
-
<%= form_with url:
|
|
77
|
+
<% existing_score = (agreement&.corrected_score || review&.ai_score)&.round %>
|
|
78
|
+
<%= form_with url: run_response_agreements_path(run, response_row),
|
|
79
79
|
method: :post, local: false,
|
|
80
|
-
class: "ck-
|
|
80
|
+
class: "ck-agreement__detail" do |f| %>
|
|
81
81
|
<%= hidden_field_tag :metric_id, metric.id %>
|
|
82
82
|
<%= hidden_field_tag :verdict, "disagree" %>
|
|
83
83
|
<p class="ck-label">What should the score have been?</p>
|
|
@@ -93,16 +93,16 @@
|
|
|
93
93
|
<% end %>
|
|
94
94
|
</div>
|
|
95
95
|
</fieldset>
|
|
96
|
-
<textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="Why? (optional)"><%=
|
|
96
|
+
<textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="Why? (optional)"><%= agreement&.note %></textarea>
|
|
97
97
|
<%= f.submit "Save", class: "#{ck_button_classes(:dark)}#{' ck-button--just-saved' if local_assigns[:just_saved]}" %>
|
|
98
98
|
<% end %>
|
|
99
99
|
<% elsif active_verdict == "borderline" %>
|
|
100
|
-
<%= form_with url:
|
|
100
|
+
<%= form_with url: run_response_agreements_path(run, response_row),
|
|
101
101
|
method: :post, local: false,
|
|
102
|
-
class: "ck-
|
|
102
|
+
class: "ck-agreement__detail" do |f| %>
|
|
103
103
|
<%= hidden_field_tag :metric_id, metric.id %>
|
|
104
104
|
<%= hidden_field_tag :verdict, "borderline" %>
|
|
105
|
-
<textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="What made this borderline? (optional)"><%=
|
|
105
|
+
<textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="What made this borderline? (optional)"><%= agreement&.note %></textarea>
|
|
106
106
|
<%= f.submit "Save", class: "#{ck_button_classes(:dark)}#{' ck-button--just-saved' if local_assigns[:just_saved]}" %>
|
|
107
107
|
<% end %>
|
|
108
108
|
<% end %>
|
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
<% current_metric_version = metric && CompletionKit::MetricVersion.current.find_by(metric_id: metric.id) %>
|
|
5
5
|
<% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric && current_metric_version
|
|
6
6
|
created_by = CompletionKit.config.username.presence || "operator"
|
|
7
|
-
verdicted_ids = CompletionKit::
|
|
7
|
+
verdicted_ids = CompletionKit::Agreement.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
|
|
8
8
|
CompletionKit::Response.joins(:reviews)
|
|
9
9
|
.where(reviews: { metric_id: metric.id, metric_version_id: current_metric_version.id })
|
|
10
10
|
.where.not(reviews: { ai_score: nil })
|
|
@@ -12,20 +12,20 @@
|
|
|
12
12
|
.order(created_at: :desc).first
|
|
13
13
|
end %>
|
|
14
14
|
<% prior_version_verdicts = if stats.sample_size.zero? && metric && current_metric_version
|
|
15
|
-
CompletionKit::
|
|
15
|
+
CompletionKit::Agreement.where(metric_id: metric.id).where.not(metric_version_id: current_metric_version.id).count
|
|
16
16
|
else
|
|
17
17
|
0
|
|
18
18
|
end %>
|
|
19
19
|
|
|
20
|
-
<p class="ck-trust-line ck-trust-line--<%= stats.gate %>">
|
|
20
|
+
<p id="agreement" class="ck-trust-line ck-trust-line--<%= stats.gate %>">
|
|
21
21
|
<% if stats.sample_size.zero? %>
|
|
22
22
|
<span class="ck-trust-line__lead">Not measured yet.</span>
|
|
23
|
-
<span class="ck-trust-line__hint"><%= current_metric_version ? "#{current_metric_version.version_label} needs" : "Needs" %> <%= CompletionKit::
|
|
23
|
+
<span class="ck-trust-line__hint"><%= current_metric_version ? "#{current_metric_version.version_label} needs" : "Needs" %> <%= CompletionKit::MetricAgreementStats::PROVISIONAL_MIN %> human reviews of the judge's scores.</span>
|
|
24
24
|
<% if target_response %>
|
|
25
25
|
<%= link_to "Review a judge's score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
|
|
26
26
|
<% end %>
|
|
27
27
|
<% elsif stats.counter_only? %>
|
|
28
|
-
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Human reviews</span> <strong><%= stats.sample_size %> / <%= CompletionKit::
|
|
28
|
+
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Human reviews</span> <strong><%= stats.sample_size %> / <%= CompletionKit::MetricAgreementStats::PROVISIONAL_MIN %></strong></span>
|
|
29
29
|
<% if stats.short_to_target > 0 %><span class="ck-trust-line__hint"><%= stats.short_to_target %> more to report a rate</span><% end %>
|
|
30
30
|
<% if target_response %>
|
|
31
31
|
<%= link_to "Review another score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
|
|
@@ -17,7 +17,7 @@
|
|
|
17
17
|
<input type="radio" name="ck-api-tab" id="ck-tab-datasets" class="ck-api-tabs__radio">
|
|
18
18
|
<input type="radio" name="ck-api-tab" id="ck-tab-metrics" class="ck-api-tabs__radio">
|
|
19
19
|
<input type="radio" name="ck-api-tab" id="ck-tab-metric-groups" class="ck-api-tabs__radio">
|
|
20
|
-
<input type="radio" name="ck-api-tab" id="ck-tab-
|
|
20
|
+
<input type="radio" name="ck-api-tab" id="ck-tab-agreements" class="ck-api-tabs__radio">
|
|
21
21
|
<input type="radio" name="ck-api-tab" id="ck-tab-tags" class="ck-api-tabs__radio">
|
|
22
22
|
<input type="radio" name="ck-api-tab" id="ck-tab-providers" class="ck-api-tabs__radio">
|
|
23
23
|
|
|
@@ -29,7 +29,7 @@
|
|
|
29
29
|
<label for="ck-tab-datasets" class="ck-api-tabs__label">Datasets <span class="ck-api-tabs__count">5</span></label>
|
|
30
30
|
<label for="ck-tab-metrics" class="ck-api-tabs__label">Metrics <span class="ck-api-tabs__count">10</span></label>
|
|
31
31
|
<label for="ck-tab-metric-groups" class="ck-api-tabs__label">Metric Groups <span class="ck-api-tabs__count">5</span></label>
|
|
32
|
-
<label for="ck-tab-
|
|
32
|
+
<label for="ck-tab-agreements" class="ck-api-tabs__label">Agreements <span class="ck-api-tabs__count">3</span></label>
|
|
33
33
|
<label for="ck-tab-tags" class="ck-api-tabs__label">Tags <span class="ck-api-tabs__count">5</span></label>
|
|
34
34
|
<label for="ck-tab-providers" class="ck-api-tabs__label">Providers <span class="ck-api-tabs__count">5</span></label>
|
|
35
35
|
</nav>
|
|
@@ -238,8 +238,8 @@
|
|
|
238
238
|
} %>
|
|
239
239
|
|
|
240
240
|
<div class="ck-api-endpoint" style="padding-top: 1.5rem;">
|
|
241
|
-
<p class="ck-kicker" style="margin-bottom: 0.5rem;">
|
|
242
|
-
<p class="ck-meta-copy">Drive metric improvement from disagree-flagged
|
|
241
|
+
<p class="ck-kicker" style="margin-bottom: 0.5rem;">Agreement loop</p>
|
|
242
|
+
<p class="ck-meta-copy">Drive metric improvement from disagree-flagged agreements: ask the model to rewrite the instruction and rubric into a new draft version.</p>
|
|
243
243
|
</div>
|
|
244
244
|
<div class="ck-api-endpoint">
|
|
245
245
|
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/metrics/:id/suggest_variants</p>
|
|
@@ -250,7 +250,7 @@
|
|
|
250
250
|
|
|
251
251
|
<div class="ck-api-endpoint" style="padding-top: 1.5rem;">
|
|
252
252
|
<p class="ck-kicker" style="margin-bottom: 0.5rem;">Metric versions</p>
|
|
253
|
-
<p class="ck-meta-copy">Every metric carries a history of versions (the current published one, prior published ones, and any draft suggestions). Reviews and
|
|
253
|
+
<p class="ck-meta-copy">Every metric carries a history of versions (the current published one, prior published ones, and any draft suggestions). Reviews and agreements record the version they ran against, so the API can surface stale state and let you revert.</p>
|
|
254
254
|
</div>
|
|
255
255
|
<div class="ck-api-endpoint">
|
|
256
256
|
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> /api/v1/metrics/:metric_id/metric_versions</p>
|
|
@@ -294,23 +294,23 @@
|
|
|
294
294
|
</div>
|
|
295
295
|
|
|
296
296
|
<div class="ck-api-tabs__panel">
|
|
297
|
-
<h2 class="ck-section-title">
|
|
298
|
-
<p class="ck-copy">Per-verdict feedback events on a response/metric pair: agree, disagree (with a corrected score and note), or borderline.
|
|
297
|
+
<h2 class="ck-section-title">Agreements</h2>
|
|
298
|
+
<p class="ck-copy">Per-verdict feedback events on a response/metric pair: agree, disagree (with a corrected score and note), or borderline. Agreements capture the metric version that was current when the verdict was cast, which is what drives the trust signal and the "stale" indicators across the rest of the API.</p>
|
|
299
299
|
<div class="ck-api-endpoint">
|
|
300
|
-
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> /api/v1/
|
|
301
|
-
<p class="ck-meta-copy">List
|
|
300
|
+
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> /api/v1/agreements</p>
|
|
301
|
+
<p class="ck-meta-copy">List agreements across all runs. Supports filtering by any combination of the query params below.</p>
|
|
302
302
|
<p class="ck-api-params"><strong>Optional filters:</strong> <code>run_id</code>, <code>response_id</code>, <code>metric_id</code>, <code>metric_version_id</code>, <code>created_by</code>, <code>verdict</code> (<code>agree</code>, <code>disagree</code>, or <code>borderline</code>)</p>
|
|
303
|
-
<%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl \"#{base_url}/api/v1/
|
|
303
|
+
<%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl \"#{base_url}/api/v1/agreements?metric_id=1&verdict=disagree\" \\\n -H \"Authorization: Bearer #{token}\"" %>
|
|
304
304
|
</div>
|
|
305
305
|
<div class="ck-api-endpoint">
|
|
306
|
-
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/runs/:run_id/responses/:response_id/metrics/:metric_id/
|
|
307
|
-
<p class="ck-meta-copy">Cast
|
|
306
|
+
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/runs/:run_id/responses/:response_id/metrics/:metric_id/agreements</p>
|
|
307
|
+
<p class="ck-meta-copy">Cast an agreement on a specific response/metric pair. The metric version on the record is set automatically from the run's review.</p>
|
|
308
308
|
<p class="ck-api-params"><strong>Required:</strong> <code>verdict</code>, <code>created_by</code> <strong>Optional:</strong> <code>corrected_score</code>, <code>note</code></p>
|
|
309
|
-
<%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/runs/1/responses/42/metrics/3/
|
|
309
|
+
<%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/runs/1/responses/42/metrics/3/agreements \\\n -H \"Authorization: Bearer #{token}\" \\\n -H \"Content-Type: application/json\" \\\n -d '{\"verdict\": \"disagree\", \"corrected_score\": 3, \"note\": \"too generous\", \"created_by\": \"alice\"}'" %>
|
|
310
310
|
</div>
|
|
311
311
|
<div class="ck-api-endpoint">
|
|
312
|
-
<p class="ck-api-method"><span class="ck-chip" style="color: var(--ck-danger);">DELETE</span> /api/v1/
|
|
313
|
-
<p class="ck-meta-copy">Delete
|
|
312
|
+
<p class="ck-api-method"><span class="ck-chip" style="color: var(--ck-danger);">DELETE</span> /api/v1/agreements/:id</p>
|
|
313
|
+
<p class="ck-meta-copy">Delete an agreement. Returns 204 No Content.</p>
|
|
314
314
|
</div>
|
|
315
315
|
</div>
|
|
316
316
|
|