completion-kit 0.10.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/stylesheets/completion_kit/application.css +118 -55
- data/app/controllers/completion_kit/{calibrations_controller.rb → agreements_controller.rb} +19 -19
- data/app/controllers/completion_kit/api/v1/{calibrations_controller.rb → agreements_controller.rb} +18 -18
- data/app/controllers/completion_kit/api/v1/metric_versions_controller.rb +2 -7
- data/app/controllers/completion_kit/api/v1/metrics_controller.rb +1 -1
- data/app/controllers/completion_kit/metrics_controller.rb +18 -23
- data/app/jobs/completion_kit/judge_review_job.rb +2 -2
- data/app/jobs/completion_kit/metric_suggestion_job.rb +46 -0
- data/app/models/completion_kit/{calibration.rb → agreement.rb} +1 -1
- data/app/models/completion_kit/metric_version.rb +2 -17
- data/app/models/completion_kit/review.rb +1 -0
- data/app/services/completion_kit/{calibration_math.rb → agreement_math.rb} +1 -1
- data/app/services/completion_kit/mcp_dispatcher.rb +2 -2
- data/app/services/completion_kit/mcp_tools/{calibrations.rb → agreements.rb} +11 -11
- data/app/services/completion_kit/mcp_tools/judges.rb +3 -3
- data/app/services/completion_kit/mcp_tools/metric_versions.rb +2 -7
- data/app/services/completion_kit/{metric_calibration_examples.rb → metric_agreement_examples.rb} +6 -6
- data/app/services/completion_kit/{metric_calibration_stats.rb → metric_agreement_stats.rb} +6 -6
- data/app/services/completion_kit/metric_improvement_validator.rb +101 -0
- data/app/services/completion_kit/metric_variant_generator.rb +2 -2
- data/app/views/completion_kit/{calibrations → agreements}/_buttons.html.erb +33 -33
- data/app/views/completion_kit/{calibrations → agreements}/_trust_panel.html.erb +6 -9
- data/app/views/completion_kit/api_reference/_body.html.erb +15 -15
- data/app/views/completion_kit/metrics/_guiding_examples.html.erb +1 -1
- data/app/views/completion_kit/metrics/_suggestion_failed.html.erb +3 -0
- data/app/views/completion_kit/metrics/_suggestion_pending.html.erb +3 -0
- data/app/views/completion_kit/metrics/_suggestion_ready.html.erb +4 -0
- data/app/views/completion_kit/metrics/_validation_scoreboard.html.erb +12 -0
- data/app/views/completion_kit/metrics/edit.html.erb +1 -1
- data/app/views/completion_kit/metrics/show.html.erb +25 -11
- data/app/views/completion_kit/responses/show.html.erb +4 -4
- data/app/views/completion_kit/runs/show.html.erb +1 -1
- data/config/routes.rb +3 -3
- data/db/migrate/20260531000001_add_validation_summary_to_completion_kit_metric_versions.rb +5 -0
- data/db/migrate/20260531000002_backfill_review_metric_versions.rb +33 -0
- data/db/migrate/20260531000003_add_metric_version_fk_to_reviews.rb +6 -0
- data/db/migrate/20260531000004_rename_calibrations_to_agreements.rb +19 -0
- data/lib/completion_kit/version.rb +1 -1
- data/lib/completion_kit.rb +2 -2
- metadata +20 -10
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
require "faraday"
|
|
2
|
+
|
|
3
|
+
module CompletionKit
  # Generates one suggested draft version for a metric, validates it with
  # MetricImprovementValidator, stores the resulting summary on the draft and
  # broadcasts a Turbo Stream replacement describing the outcome (ready or
  # failed) to the metric's suggestion stream.
  class MetricSuggestionJob < ApplicationJob
    FAILED_PARTIAL = "completion_kit/metrics/suggestion_failed"
    READY_PARTIAL = "completion_kit/metrics/suggestion_ready"

    queue_as :llm

    # Catch-all for unexpected errors. It MUST be declared before the
    # `retry_on` handlers below: rescue handlers are searched from the last
    # declared to the first, so a catch-all declared after `retry_on` would
    # match every StandardError first and the retries would never run.
    rescue_from(StandardError) { |error| notify_failure(error) }

    # Transient transport / rate-limit failures are retried. The block runs
    # only once the attempts are exhausted (it receives the job and the
    # error), so the failure is still reported and shown instead of bubbling.
    retry_on(Faraday::TimeoutError, Faraday::ConnectionFailed, wait: :polynomially_longer, attempts: 5) do |job, error|
      job.send(:notify_failure, error)
    end
    retry_on(CompletionKit::RateLimitError, wait: :polynomially_longer, attempts: 5) do |job, error|
      job.send(:notify_failure, error)
    end

    # @param metric_id [Integer] id of the Metric to build a suggestion for;
    #   the job is a no-op when no such metric exists.
    def perform(metric_id)
      @metric = Metric.find_by(id: metric_id)
      return unless @metric

      # Only one suggestion draft is kept per metric: drop earlier ones.
      MetricVersion.drafts.where(metric_id: @metric.id, source: "suggestion").destroy_all

      generator = MetricVariantGenerator.new(@metric, count: 1)
      variants = generator.call
      if variants.empty?
        broadcast_failure
        return
      end

      draft = generator.persist!(variants).max_by(&:version_number)
      summary = MetricImprovementValidator.new(@metric, draft).call
      draft.update!(validation_summary: summary)

      broadcast_status(@metric, partial: READY_PARTIAL, locals: { metric: @metric, draft: draft })
    end

    private

    # Reports the error as handled and, when a metric was loaded, replaces
    # the suggestion status with the "failed" partial.
    def notify_failure(error)
      Rails.error.report(error, handled: true, context: { job: self.class.name })
      # @metric is nil when the failure happened before (or while) loading
      # it; there is no stream to broadcast to in that case.
      broadcast_failure if @metric
    end

    # Broadcasts the "failed" partial for the current metric.
    def broadcast_failure
      broadcast_status(@metric, partial: FAILED_PARTIAL, locals: { metric: @metric })
    end

    # Renders +partial+ with +locals+ and replaces the metric's suggestion
    # status element on the metric's suggestion stream.
    def broadcast_status(metric, partial:, locals:)
      html = CompletionKit::ApplicationController.render(partial: partial, locals: locals)
      Turbo::StreamsChannel.broadcast_replace_to(
        "metric_#{metric.id}_suggestion",
        target: "ck-suggestion-status-#{metric.id}",
        html: html
      )
    end
  end
end
|
|
@@ -3,9 +3,10 @@ module CompletionKit
|
|
|
3
3
|
STATES = %w[draft published].freeze
|
|
4
4
|
|
|
5
5
|
belongs_to :metric
|
|
6
|
-
has_many :
|
|
6
|
+
has_many :agreements, dependent: :destroy
|
|
7
7
|
|
|
8
8
|
serialize :rubric_bands, coder: JSON
|
|
9
|
+
serialize :validation_summary, coder: JSON
|
|
9
10
|
|
|
10
11
|
before_validation :assign_version_number, on: :create
|
|
11
12
|
|
|
@@ -82,22 +83,6 @@ module CompletionKit
|
|
|
82
83
|
self
|
|
83
84
|
end
|
|
84
85
|
|
|
85
|
-
def revert!
|
|
86
|
-
raise ArgumentError, "only a published version can be reverted to" unless published?
|
|
87
|
-
audit = nil
|
|
88
|
-
MetricVersion.transaction do
|
|
89
|
-
audit = self.class.create!(
|
|
90
|
-
metric: metric,
|
|
91
|
-
instruction: instruction,
|
|
92
|
-
rubric_bands: rubric_bands,
|
|
93
|
-
state: "draft",
|
|
94
|
-
source: "revert"
|
|
95
|
-
)
|
|
96
|
-
audit.publish!
|
|
97
|
-
end
|
|
98
|
-
audit
|
|
99
|
-
end
|
|
100
|
-
|
|
101
86
|
def as_json(options = {})
|
|
102
87
|
{
|
|
103
88
|
id: id,
|
|
@@ -8,6 +8,7 @@ module CompletionKit
|
|
|
8
8
|
has_many :dashboard_dismissals, as: :dismissable, dependent: :destroy
|
|
9
9
|
|
|
10
10
|
validates :metric_name, presence: true
|
|
11
|
+
validates :metric_version, presence: true
|
|
11
12
|
validates :ai_score, numericality: { greater_than_or_equal_to: 1, less_than_or_equal_to: 5 }, allow_nil: true
|
|
12
13
|
|
|
13
14
|
before_validation :set_default_status
|
|
@@ -35,7 +35,7 @@ module CompletionKit
|
|
|
35
35
|
McpTools::MetricVersions.definitions +
|
|
36
36
|
McpTools::ProviderCredentials.definitions +
|
|
37
37
|
McpTools::Tags.definitions +
|
|
38
|
-
McpTools::
|
|
38
|
+
McpTools::Agreements.definitions +
|
|
39
39
|
McpTools::Judges.definitions
|
|
40
40
|
end
|
|
41
41
|
|
|
@@ -50,7 +50,7 @@ module CompletionKit
|
|
|
50
50
|
when /\Ametrics_/ then McpTools::Metrics.call(name, arguments)
|
|
51
51
|
when /\Aprovider_credentials_/ then McpTools::ProviderCredentials.call(name, arguments)
|
|
52
52
|
when /\Atags_/ then McpTools::Tags.call(name, arguments)
|
|
53
|
-
when /\
|
|
53
|
+
when /\Aagreements_/ then McpTools::Agreements.call(name, arguments)
|
|
54
54
|
when /\Ajudges_/ then McpTools::Judges.call(name, arguments)
|
|
55
55
|
else raise MethodNotFound, "Unknown tool: #{name}"
|
|
56
56
|
end
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
2
|
module McpTools
|
|
3
|
-
module
|
|
3
|
+
module Agreements
|
|
4
4
|
extend Base
|
|
5
5
|
|
|
6
6
|
TOOLS = {
|
|
7
|
-
"
|
|
8
|
-
description: "List
|
|
7
|
+
"agreements_list" => {
|
|
8
|
+
description: "List agreements. Filter by run_id, response_id, metric_id, or created_by.",
|
|
9
9
|
inputSchema: {
|
|
10
10
|
type: "object",
|
|
11
11
|
properties: {
|
|
@@ -18,8 +18,8 @@ module CompletionKit
|
|
|
18
18
|
},
|
|
19
19
|
handler: :list
|
|
20
20
|
},
|
|
21
|
-
"
|
|
22
|
-
description: "Upsert
|
|
21
|
+
"agreements_create" => {
|
|
22
|
+
description: "Upsert an agreement for (run, response, metric, created_by). Verdict is one of agree, disagree, borderline. corrected_score (1..5) is required when verdict is 'disagree'.",
|
|
23
23
|
inputSchema: {
|
|
24
24
|
type: "object",
|
|
25
25
|
properties: {
|
|
@@ -38,7 +38,7 @@ module CompletionKit
|
|
|
38
38
|
}.freeze
|
|
39
39
|
|
|
40
40
|
def self.list(args)
|
|
41
|
-
scope = CompletionKit::
|
|
41
|
+
scope = CompletionKit::Agreement.all
|
|
42
42
|
scope = scope.where(run_id: args["run_id"]) if args["run_id"]
|
|
43
43
|
scope = scope.where(response_id: args["response_id"]) if args["response_id"]
|
|
44
44
|
scope = scope.where(metric_id: args["metric_id"]) if args["metric_id"]
|
|
@@ -52,20 +52,20 @@ module CompletionKit
|
|
|
52
52
|
metric = CompletionKit::Metric.find(args["metric_id"])
|
|
53
53
|
created_by = args["created_by"].presence || "mcp"
|
|
54
54
|
|
|
55
|
-
|
|
55
|
+
agreement = CompletionKit::Agreement.find_or_initialize_by(
|
|
56
56
|
run_id: run.id, response_id: response.id, metric_id: metric.id, created_by: created_by
|
|
57
57
|
)
|
|
58
|
-
|
|
58
|
+
agreement.assign_attributes(
|
|
59
59
|
metric_version: CompletionKit::MetricVersion.ensure_current_for(metric),
|
|
60
60
|
verdict: args["verdict"],
|
|
61
61
|
corrected_score: args["corrected_score"],
|
|
62
62
|
note: args["note"]
|
|
63
63
|
)
|
|
64
64
|
|
|
65
|
-
if
|
|
66
|
-
text_result(
|
|
65
|
+
if agreement.save
|
|
66
|
+
text_result(agreement.as_json)
|
|
67
67
|
else
|
|
68
|
-
error_result(
|
|
68
|
+
error_result(agreement.errors.full_messages.join(", "))
|
|
69
69
|
end
|
|
70
70
|
end
|
|
71
71
|
end
|
|
@@ -33,7 +33,7 @@ module CompletionKit
|
|
|
33
33
|
handler: :replay
|
|
34
34
|
},
|
|
35
35
|
"judges_compare" => {
|
|
36
|
-
description: "Compare two metric versions'
|
|
36
|
+
description: "Compare two metric versions' agreement stats side by side. Pass either two metric_version_ids or one metric_id with metric_version_a_id / metric_version_b_id.",
|
|
37
37
|
inputSchema: {
|
|
38
38
|
type: "object",
|
|
39
39
|
properties: {
|
|
@@ -77,8 +77,8 @@ module CompletionKit
|
|
|
77
77
|
metric = CompletionKit::Metric.find(args["metric_id"])
|
|
78
78
|
a = CompletionKit::MetricVersion.where(metric_id: metric.id).find(args["metric_version_a_id"])
|
|
79
79
|
b = CompletionKit::MetricVersion.where(metric_id: metric.id).find(args["metric_version_b_id"])
|
|
80
|
-
stats_a = CompletionKit::
|
|
81
|
-
stats_b = CompletionKit::
|
|
80
|
+
stats_a = CompletionKit::MetricAgreementStats.for(metric, metric_version: a)
|
|
81
|
+
stats_b = CompletionKit::MetricAgreementStats.for(metric, metric_version: b)
|
|
82
82
|
text_result({
|
|
83
83
|
metric_id: metric.id,
|
|
84
84
|
a: metric_version_payload(a, stats_a),
|
|
@@ -47,13 +47,8 @@ module CompletionKit
|
|
|
47
47
|
|
|
48
48
|
def self.publish(args)
|
|
49
49
|
version = CompletionKit::MetricVersion.find(args["metric_version_id"])
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
text_result(audit.as_json)
|
|
53
|
-
else
|
|
54
|
-
version.publish!
|
|
55
|
-
text_result(version.reload.as_json)
|
|
56
|
-
end
|
|
50
|
+
version.publish!
|
|
51
|
+
text_result(version.reload.as_json)
|
|
57
52
|
end
|
|
58
53
|
|
|
59
54
|
def self.dismiss(args)
|
data/app/services/completion_kit/{metric_calibration_examples.rb → metric_agreement_examples.rb}
RENAMED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
|
-
module
|
|
2
|
+
module MetricAgreementExamples
|
|
3
3
|
DEFAULT_JUDGE_EXAMPLE_LIMIT = 5
|
|
4
4
|
|
|
5
5
|
module_function
|
|
@@ -9,18 +9,18 @@ module CompletionKit
|
|
|
9
9
|
end
|
|
10
10
|
|
|
11
11
|
def disagreements_for(metric, limit: 8)
|
|
12
|
-
|
|
12
|
+
agreements_for(metric, verdict: "disagree", limit: limit)
|
|
13
13
|
end
|
|
14
14
|
|
|
15
15
|
def borderlines_for(metric, limit: 6)
|
|
16
|
-
|
|
16
|
+
agreements_for(metric, verdict: "borderline", limit: limit)
|
|
17
17
|
end
|
|
18
18
|
|
|
19
19
|
def judge_examples_for(metric, exclude_response_id: nil, limit: DEFAULT_JUDGE_EXAMPLE_LIMIT)
|
|
20
20
|
current_version = MetricVersion.current.find_by(metric_id: metric.id)
|
|
21
21
|
return [] unless current_version
|
|
22
22
|
|
|
23
|
-
relation =
|
|
23
|
+
relation = Agreement
|
|
24
24
|
.where(metric_id: metric.id, metric_version_id: current_version.id, excluded_from_examples: false)
|
|
25
25
|
.where.not(corrected_score: nil)
|
|
26
26
|
relation = relation.where.not(response_id: exclude_response_id) if exclude_response_id
|
|
@@ -28,8 +28,8 @@ module CompletionKit
|
|
|
28
28
|
.reject { |example| example[:judge_score].nil? }
|
|
29
29
|
end
|
|
30
30
|
|
|
31
|
-
def
|
|
32
|
-
base =
|
|
31
|
+
def agreements_for(metric, verdict:, limit:)
|
|
32
|
+
base = Agreement.where(metric_id: metric.id, verdict: verdict)
|
|
33
33
|
current_version = MetricVersion.current.find_by(metric_id: metric.id)
|
|
34
34
|
scoped = current_version ? base.where(metric_version_id: current_version.id) : base
|
|
35
35
|
effective = scoped.exists? ? scoped : base
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
|
-
class
|
|
2
|
+
class MetricAgreementStats
|
|
3
3
|
PROVISIONAL_MIN = 10
|
|
4
4
|
FIRM_MIN = 30
|
|
5
5
|
|
|
@@ -49,7 +49,7 @@ module CompletionKit
|
|
|
49
49
|
end
|
|
50
50
|
|
|
51
51
|
def call
|
|
52
|
-
scope =
|
|
52
|
+
scope = Agreement.where(metric_id: @metric.id)
|
|
53
53
|
if @metric_version
|
|
54
54
|
scope = scope.where(metric_version_id: @metric_version.id)
|
|
55
55
|
elsif !@all_versions
|
|
@@ -62,12 +62,12 @@ module CompletionKit
|
|
|
62
62
|
disagrees = verdicts.count { |v, _, _| v == "disagree" }
|
|
63
63
|
borderlines = verdicts.count { |v, _, _| v == "borderline" }
|
|
64
64
|
|
|
65
|
-
ci =
|
|
65
|
+
ci = AgreementMath.wilson_interval(successes: agrees, n: n)
|
|
66
66
|
|
|
67
67
|
pairs = score_pairs(verdicts)
|
|
68
|
-
mae_value =
|
|
69
|
-
pearson_value =
|
|
70
|
-
kappa_value =
|
|
68
|
+
mae_value = AgreementMath.mae(pairs)
|
|
69
|
+
pearson_value = AgreementMath.pearson(pairs)
|
|
70
|
+
kappa_value = AgreementMath.quadratic_weighted_kappa(pairs, categories: 1..5)
|
|
71
71
|
|
|
72
72
|
Result.new(
|
|
73
73
|
sample_size: n,
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
module CompletionKit
  # Re-scores a candidate metric version against the human verdicts recorded
  # for the metric's current version (the "answer key") and summarises how
  # many verdicts the candidate fixes, keeps, breaks or leaves wrong.
  class MetricImprovementValidator
    # Maximum number of (most recent) agreements used as the answer key.
    ANSWER_KEY_LIMIT = 30

    # @param metric [#id] the metric being improved
    # @param candidate the candidate version; passed through to the scorer
    # @param scorer [#call, nil] callable taking (response, candidate) and
    #   returning a score; defaults to re-judging via JudgeService
    def initialize(metric, candidate, scorer: nil)
      @metric = metric
      @candidate = candidate
      @scorer = scorer || method(:rescore)
    end

    # @return [Hash] string-keyed summary: "total" (answer-key size),
    #   "tested" (entries that produced a score), "capped", the per-outcome
    #   counts, "before"/"after" agreement counts and the per-entry "rows"
    def call
      key = answer_key
      rows = key.filter_map do |entry|
        score = score_for(entry)
        # A missing score means the judge produced nothing usable; counting
        # it as 0 would record a false "break"/"still_off" for the candidate.
        classify(entry, score.to_i) unless score.nil?
      end
      summarize(rows, key.size, key_capped?)
    end

    private

    # Scores one answer-key entry with the candidate. Failures are reported
    # and yield nil so that a single bad entry does not abort validation.
    def score_for(entry)
      @scorer.call(entry[:response], @candidate)
    rescue StandardError => error
      Rails.error.report(
        error,
        handled: true,
        context: { validator: self.class.name, response_id: entry[:response].id }
      )
      nil
    end

    # Builds the answer key from agree/disagree agreements on the metric's
    # current version, newest first, capped at ANSWER_KEY_LIMIT. Also records
    # the uncapped size so #key_capped? can tell whether entries were dropped.
    def answer_key
      current = MetricVersion.current.find_by(metric_id: @metric.id)
      return [] unless current

      base = Agreement.where(metric_id: @metric.id, metric_version_id: current.id, verdict: %w[agree disagree])
      @key_size_before_cap = base.count
      base.includes(response: :reviews)
          .order(created_at: :desc)
          .limit(ANSWER_KEY_LIMIT)
          .filter_map { |agreement| key_entry(agreement) }
    end

    # Turns one agreement into an answer-key entry, or nil when it cannot be
    # used (no response, no response text, or no known human position).
    def key_entry(agreement)
      response = agreement.response
      return unless response && response.response_text.present?

      # The human position is the corrected score on a disagreement, and the
      # judge's own score (which the human accepted) on an agreement.
      position =
        if agreement.verdict == "disagree"
          agreement.corrected_score
        else
          response.reviews.find { |review| review.metric_id == @metric.id }&.ai_score
        end
      return if position.nil?

      { response: response, verdict: agreement.verdict, position: position }
    end

    # True when more agreements existed than the answer key was allowed to hold.
    def key_capped?
      @key_size_before_cap.to_i > ANSWER_KEY_LIMIT
    end

    # Compares the candidate's score with the human position for one entry.
    # Outcomes: "fix"/"still_off" for former disagreements, "keep"/"break"
    # for former agreements.
    def classify(entry, candidate_score)
      position = entry[:position].to_i
      matched = candidate_score == position
      outcome =
        if entry[:verdict] == "disagree"
          matched ? "fix" : "still_off"
        else
          matched ? "keep" : "break"
        end

      {
        "response_id" => entry[:response].id,
        "verdict" => entry[:verdict],
        "position" => position,
        "candidate_score" => candidate_score,
        "outcome" => outcome
      }
    end

    # Aggregates classified rows into the summary hash returned by #call.
    def summarize(rows, total, capped)
      outcomes = rows.map { |row| row["outcome"] }.tally
      fixes = outcomes.fetch("fix", 0)
      keeps = outcomes.fetch("keep", 0)

      {
        "total" => total,
        "tested" => rows.size,
        "capped" => capped,
        "fixes" => fixes,
        "keeps" => keeps,
        "breaks" => outcomes.fetch("break", 0),
        "still_off" => outcomes.fetch("still_off", 0),
        # Agreement count before the change: entries the human agreed with.
        "before" => rows.count { |row| row["verdict"] == "agree" },
        # Agreement count after the change: entries the candidate matches.
        "after" => fixes + keeps,
        "rows" => rows
      }
    end

    # Default scorer: re-judges the response with the candidate's instruction
    # and rubric, using the judge model of the response's run.
    def rescore(response, candidate)
      run = response.run
      config = ApiConfig.for_model(run.judge_model).merge(judge_model: run.judge_model)
      rubric_text = Metric.rubric_text_for(Metric.normalize_rubric_bands(candidate.rubric_bands))
      result = JudgeService.new(config).evaluate(
        response.response_text,
        response.expected_output,
        run.prompt&.template,
        criteria: candidate.instruction.to_s,
        rubric_text: rubric_text,
        input_data: response.input_data
      )
      result[:score]
    end
  end
end
|
|
@@ -41,8 +41,8 @@ module CompletionKit
|
|
|
41
41
|
private
|
|
42
42
|
|
|
43
43
|
def build_meta_prompt
|
|
44
|
-
disagreements =
|
|
45
|
-
borderlines =
|
|
44
|
+
disagreements = MetricAgreementExamples.disagreements_for(@metric)
|
|
45
|
+
borderlines = MetricAgreementExamples.borderlines_for(@metric)
|
|
46
46
|
sections = []
|
|
47
47
|
sections << "You are an expert evaluator. The judge below is misaligned with humans. Propose #{@count == 1 ? "a single" : "#{@count}"} concrete rewrite that closes the gap."
|
|
48
48
|
sections << ""
|
|
@@ -1,34 +1,34 @@
|
|
|
1
|
-
<div id="
|
|
2
|
-
<% current_verdict =
|
|
1
|
+
<div id="agreement_<%= response_row.id %>_<%= metric.id %>" class="ck-agreement">
|
|
2
|
+
<% current_verdict = agreement&.verdict %>
|
|
3
3
|
<% pending_verdict = local_assigns[:pending_verdict] %>
|
|
4
4
|
<% active_verdict = pending_verdict || current_verdict %>
|
|
5
5
|
<% error = local_assigns[:error] %>
|
|
6
6
|
<% me = CompletionKit.config.username.presence || "operator" %>
|
|
7
|
-
<%
|
|
7
|
+
<% other_agreements = CompletionKit::Agreement
|
|
8
8
|
.where(response_id: response_row.id, metric_id: metric.id)
|
|
9
9
|
.where.not(created_by: me)
|
|
10
10
|
.order(created_at: :asc).to_a %>
|
|
11
11
|
<% verdict_icons = { "agree" => "hand-thumb-up", "disagree" => "hand-thumb-down", "borderline" => "scale" } %>
|
|
12
|
-
<p class="ck-
|
|
13
|
-
<span class="ck-
|
|
14
|
-
<% if
|
|
15
|
-
<span class="ck-
|
|
16
|
-
<span class="ck-
|
|
12
|
+
<p class="ck-agreement__prompt">
|
|
13
|
+
<span class="ck-agreement__label">Your verdict</span>
|
|
14
|
+
<% if other_agreements.any? %>
|
|
15
|
+
<span class="ck-agreement__meta"><%= pluralize(other_agreements.size, "other verdict") %> on this score</span>
|
|
16
|
+
<span class="ck-agreement__sep">·</span>
|
|
17
17
|
<% end %>
|
|
18
|
-
<%= link_to metric_path(metric), class: "ck-
|
|
18
|
+
<%= link_to metric_path(metric, anchor: "agreement"), class: "ck-agreement__meta-link" do %><%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true" %>Agreement →<% end %>
|
|
19
19
|
</p>
|
|
20
|
-
<div class="ck-
|
|
20
|
+
<div class="ck-agreement__buttons">
|
|
21
21
|
<% verdict_icons = { "agree" => "hand-thumb-up", "disagree" => "hand-thumb-down", "borderline" => "scale" } %>
|
|
22
22
|
<% verdict_hints = {
|
|
23
23
|
"agree" => "The score looks right.",
|
|
24
24
|
"disagree" => "The score is wrong — you'll pick the right one.",
|
|
25
25
|
"borderline" => "The rubric is unclear here; either score could be defensible."
|
|
26
26
|
} %>
|
|
27
|
-
<% CompletionKit::
|
|
28
|
-
<%= button_to
|
|
27
|
+
<% CompletionKit::Agreement::VERDICTS.each do |verdict| %>
|
|
28
|
+
<%= button_to run_response_agreements_path(run, response_row, metric_id: metric.id, verdict: verdict),
|
|
29
29
|
method: :post,
|
|
30
30
|
form: { data: { turbo: "true" } },
|
|
31
|
-
class: "ck-
|
|
31
|
+
class: "ck-agreement__pill ck-agreement__pill--#{verdict}#{' is-active' if verdict == active_verdict}",
|
|
32
32
|
"aria-pressed": (verdict == active_verdict).to_s,
|
|
33
33
|
title: verdict_hints[verdict] do %>
|
|
34
34
|
<%= heroicon_tag verdict_icons[verdict], variant: :outline, size: 14, "aria-hidden": "true" %>
|
|
@@ -38,26 +38,26 @@
|
|
|
38
38
|
</div>
|
|
39
39
|
|
|
40
40
|
<% if error.present? %>
|
|
41
|
-
<p class="ck-
|
|
41
|
+
<p class="ck-agreement__error" role="alert"><%= error %></p>
|
|
42
42
|
<% end %>
|
|
43
43
|
|
|
44
|
-
<% if
|
|
45
|
-
<details class="ck-
|
|
46
|
-
<summary class="ck-
|
|
44
|
+
<% if other_agreements.any? %>
|
|
45
|
+
<details class="ck-agreement__others">
|
|
46
|
+
<summary class="ck-agreement__others-summary">
|
|
47
47
|
<%= heroicon_tag "chevron-right", variant: :outline, size: 14, "aria-hidden": "true" %>
|
|
48
|
-
<span>What others said (<%=
|
|
48
|
+
<span>What others said (<%= other_agreements.size %>)</span>
|
|
49
49
|
</summary>
|
|
50
|
-
<ul class="ck-
|
|
51
|
-
<%
|
|
52
|
-
<li class="ck-
|
|
53
|
-
<div class="ck-
|
|
54
|
-
<span class="ck-
|
|
50
|
+
<ul class="ck-agreement__others-list">
|
|
51
|
+
<% other_agreements.each do |other| %>
|
|
52
|
+
<li class="ck-agreement__others-item ck-agreement__others-item--<%= other.verdict %>">
|
|
53
|
+
<div class="ck-agreement__others-row">
|
|
54
|
+
<span class="ck-agreement__others-verdict">
|
|
55
55
|
<%= heroicon_tag verdict_icons[other.verdict], variant: :outline, size: 14, "aria-hidden": "true" %>
|
|
56
56
|
<%= other.verdict %>
|
|
57
57
|
</span>
|
|
58
|
-
<span class="ck-
|
|
58
|
+
<span class="ck-agreement__others-by"><%= other.created_by %></span>
|
|
59
59
|
<% if other.corrected_score %>
|
|
60
|
-
<span class="ck-
|
|
60
|
+
<span class="ck-agreement__others-stars" aria-label="<%= pluralize(other.corrected_score.to_i, 'star') %>" title="<%= pluralize(other.corrected_score.to_i, 'star') %>">
|
|
61
61
|
<% 5.times do |i| %>
|
|
62
62
|
<svg viewBox="0 0 24 24" width="12" height="12" stroke-width="1.75" class="ck-star <%= i < other.corrected_score.to_i ? "ck-star--filled" : "ck-star--empty" %>" aria-hidden="true"><polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2"/></svg>
|
|
63
63
|
<% end %>
|
|
@@ -65,7 +65,7 @@
|
|
|
65
65
|
<% end %>
|
|
66
66
|
</div>
|
|
67
67
|
<% if other.note.to_s.present? %>
|
|
68
|
-
<p class="ck-
|
|
68
|
+
<p class="ck-agreement__others-note">"<%= other.note.to_s.truncate(140) %>"</p>
|
|
69
69
|
<% end %>
|
|
70
70
|
</li>
|
|
71
71
|
<% end %>
|
|
@@ -74,10 +74,10 @@
|
|
|
74
74
|
<% end %>
|
|
75
75
|
|
|
76
76
|
<% if active_verdict == "disagree" %>
|
|
77
|
-
<% existing_score = (
|
|
78
|
-
<%= form_with url:
|
|
77
|
+
<% existing_score = (agreement&.corrected_score || review&.ai_score)&.round %>
|
|
78
|
+
<%= form_with url: run_response_agreements_path(run, response_row),
|
|
79
79
|
method: :post, local: false,
|
|
80
|
-
class: "ck-
|
|
80
|
+
class: "ck-agreement__detail" do |f| %>
|
|
81
81
|
<%= hidden_field_tag :metric_id, metric.id %>
|
|
82
82
|
<%= hidden_field_tag :verdict, "disagree" %>
|
|
83
83
|
<p class="ck-label">What should the score have been?</p>
|
|
@@ -93,16 +93,16 @@
|
|
|
93
93
|
<% end %>
|
|
94
94
|
</div>
|
|
95
95
|
</fieldset>
|
|
96
|
-
<textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="Why? (optional)"><%=
|
|
96
|
+
<textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="Why? (optional)"><%= agreement&.note %></textarea>
|
|
97
97
|
<%= f.submit "Save", class: "#{ck_button_classes(:dark)}#{' ck-button--just-saved' if local_assigns[:just_saved]}" %>
|
|
98
98
|
<% end %>
|
|
99
99
|
<% elsif active_verdict == "borderline" %>
|
|
100
|
-
<%= form_with url:
|
|
100
|
+
<%= form_with url: run_response_agreements_path(run, response_row),
|
|
101
101
|
method: :post, local: false,
|
|
102
|
-
class: "ck-
|
|
102
|
+
class: "ck-agreement__detail" do |f| %>
|
|
103
103
|
<%= hidden_field_tag :metric_id, metric.id %>
|
|
104
104
|
<%= hidden_field_tag :verdict, "borderline" %>
|
|
105
|
-
<textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="What made this borderline? (optional)"><%=
|
|
105
|
+
<textarea name="note" rows="2" class="ck-input ck-input--area" placeholder="What made this borderline? (optional)"><%= agreement&.note %></textarea>
|
|
106
106
|
<%= f.submit "Save", class: "#{ck_button_classes(:dark)}#{' ck-button--just-saved' if local_assigns[:just_saved]}" %>
|
|
107
107
|
<% end %>
|
|
108
108
|
<% end %>
|
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
<% current_metric_version = metric && CompletionKit::MetricVersion.current.find_by(metric_id: metric.id) %>
|
|
5
5
|
<% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric && current_metric_version
|
|
6
6
|
created_by = CompletionKit.config.username.presence || "operator"
|
|
7
|
-
verdicted_ids = CompletionKit::
|
|
7
|
+
verdicted_ids = CompletionKit::Agreement.where(metric_id: metric.id, created_by: created_by, metric_version_id: current_metric_version.id).pluck(:response_id)
|
|
8
8
|
CompletionKit::Response.joins(:reviews)
|
|
9
9
|
.where(reviews: { metric_id: metric.id, metric_version_id: current_metric_version.id })
|
|
10
10
|
.where.not(reviews: { ai_score: nil })
|
|
@@ -12,29 +12,26 @@
|
|
|
12
12
|
.order(created_at: :desc).first
|
|
13
13
|
end %>
|
|
14
14
|
<% prior_version_verdicts = if stats.sample_size.zero? && metric && current_metric_version
|
|
15
|
-
CompletionKit::
|
|
15
|
+
CompletionKit::Agreement.where(metric_id: metric.id).where.not(metric_version_id: current_metric_version.id).count
|
|
16
16
|
else
|
|
17
17
|
0
|
|
18
18
|
end %>
|
|
19
19
|
|
|
20
|
-
<p class="ck-trust-line ck-trust-line--<%= stats.gate %>">
|
|
20
|
+
<p id="agreement" class="ck-trust-line ck-trust-line--<%= stats.gate %>">
|
|
21
21
|
<% if stats.sample_size.zero? %>
|
|
22
22
|
<span class="ck-trust-line__lead">Not measured yet.</span>
|
|
23
|
-
<span class="ck-trust-line__hint"><%= current_metric_version ? "#{current_metric_version.version_label} needs" : "Needs" %> <%= CompletionKit::
|
|
23
|
+
<span class="ck-trust-line__hint"><%= current_metric_version ? "#{current_metric_version.version_label} needs" : "Needs" %> <%= CompletionKit::MetricAgreementStats::PROVISIONAL_MIN %> human reviews of the judge's scores.</span>
|
|
24
24
|
<% if target_response %>
|
|
25
25
|
<%= link_to "Review a judge's score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
|
|
26
26
|
<% end %>
|
|
27
27
|
<% elsif stats.counter_only? %>
|
|
28
|
-
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Human reviews</span> <strong><%= stats.sample_size %> / <%= CompletionKit::
|
|
28
|
+
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Human reviews</span> <strong><%= stats.sample_size %> / <%= CompletionKit::MetricAgreementStats::PROVISIONAL_MIN %></strong></span>
|
|
29
29
|
<% if stats.short_to_target > 0 %><span class="ck-trust-line__hint"><%= stats.short_to_target %> more to report a rate</span><% end %>
|
|
30
30
|
<% if target_response %>
|
|
31
31
|
<%= link_to "Review another score →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-cal-link" %>
|
|
32
32
|
<% end %>
|
|
33
33
|
<% else %>
|
|
34
|
-
<span class="ck-cal-stat"><span class="ck-cal-stat__label">
|
|
35
|
-
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Margin</span> ±<%= (stats.margin * 100).round %> pt</span>
|
|
36
|
-
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Read</span> <%= stats.firm? ? "settled" : "early" %></span>
|
|
37
|
-
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Sample</span> <%= stats.sample_size %></span>
|
|
34
|
+
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Agrees with you</span> <strong class="ck-trust-line__figure">~<%= (stats.agreement_point * 100).round %>%</strong> of <%= stats.sample_size %> reviews</span>
|
|
38
35
|
<% if stats.borderline_rate && stats.borderline_rate > 0 %>
|
|
39
36
|
<% level = stats.borderline_rate > 0.30 ? "danger" : stats.borderline_rate > 0.15 ? "warning" : "ok" %>
|
|
40
37
|
<span class="ck-cal-stat"><span class="ck-cal-stat__label">Unclear</span> <span class="ck-trust-line__borderline ck-trust-line__borderline--<%= level %>"><%= (stats.borderline_rate * 100).round %>%</span></span>
|