completion-kit 0.5.41 → 0.5.43
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/javascripts/completion_kit/application.js +17 -0
- data/app/assets/stylesheets/completion_kit/application.css +557 -23
- data/app/controllers/completion_kit/metrics_controller.rb +62 -24
- data/app/jobs/completion_kit/judge_review_job.rb +11 -0
- data/app/models/completion_kit/judge_version.rb +32 -1
- data/app/models/completion_kit/starter_metric_dismissal.rb +5 -0
- data/app/services/completion_kit/judge_variant_generator.rb +8 -6
- data/app/services/completion_kit/metric_calibration_stats.rb +16 -4
- data/app/services/completion_kit/starter_metrics.rb +94 -0
- data/app/views/completion_kit/api_reference/_body.html.erb +1 -1
- data/app/views/completion_kit/calibrations/_buttons.html.erb +43 -6
- data/app/views/completion_kit/calibrations/_trust_panel.html.erb +27 -28
- data/app/views/completion_kit/metrics/_form.html.erb +90 -4
- data/app/views/completion_kit/metrics/_rubric_diff.html.erb +25 -0
- data/app/views/completion_kit/metrics/_rubric_hint.html.erb +4 -0
- data/app/views/completion_kit/metrics/_starter_card.html.erb +15 -0
- data/app/views/completion_kit/metrics/edit.html.erb +5 -1
- data/app/views/completion_kit/metrics/index.html.erb +32 -6
- data/app/views/completion_kit/metrics/show.html.erb +132 -126
- data/app/views/completion_kit/metrics/starter_preview.html.erb +45 -0
- data/app/views/completion_kit/responses/show.html.erb +1 -1
- data/app/views/completion_kit/runs/_status_panel.html.erb +2 -2
- data/config/routes.rb +7 -1
- data/db/migrate/20260524000001_create_completion_kit_starter_metric_dismissals.rb +12 -0
- data/db/migrate/20260525000001_add_version_number_and_published_at_to_judge_versions.rb +24 -0
- data/lib/completion_kit/version.rb +1 -1
- metadata +9 -1
|
@@ -1,21 +1,50 @@
|
|
|
1
1
|
module CompletionKit
|
|
2
2
|
class MetricsController < ApplicationController
|
|
3
3
|
include CompletionKit::TagFiltering
|
|
4
|
-
before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :publish_draft, :suggest_variants, :dismiss_suggestion]
|
|
4
|
+
before_action :set_metric, only: [:show, :edit, :update, :destroy, :add_few_shot, :remove_few_shot, :publish_draft, :suggest_variants, :dismiss_suggestion]
|
|
5
5
|
|
|
6
6
|
def index
|
|
7
7
|
@metrics = apply_tag_filter(Metric.includes(:metric_groups, :tags).order(:name))
|
|
8
|
+
@available_starters = StarterMetrics.available
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def starter_preview
|
|
12
|
+
@starter = StarterMetrics.find(params[:key])
|
|
13
|
+
return redirect_to(metrics_path, alert: "Unknown starter metric.") unless @starter
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
# Creates a Metric from the starter identified by params[:key].
#
# Redirects to the metrics index with an alert when the key matches no
# starter, when a metric with the starter's name already exists, or when the
# new metric cannot be saved. On success, redirects to the new metric's page
# with a notice.
#
# Builds the record with new + save rather than create!: create! raises when
# the save is rejected, which would surface as an unhandled error instead of
# an alert. NOTE(review): Metric's validations are not visible from here, so
# which inputs can be rejected should be confirmed against the model.
def adopt_starter
  starter = StarterMetrics.find(params[:key])
  return redirect_to(metrics_path, alert: "Unknown starter metric.") unless starter
  if Metric.exists?(name: starter.name)
    return redirect_to(metrics_path, alert: "A metric named \"#{starter.name}\" already exists.")
  end

  metric = Metric.new(
    name: starter.name,
    instruction: starter.instruction,
    rubric_bands: starter.rubric_bands
  )
  unless metric.save
    reasons = metric.errors.full_messages.join(", ")
    return redirect_to(metrics_path, alert: "Could not add the \"#{starter.name}\" starter: #{reasons}")
  end

  redirect_to metric_path(metric), notice: "Added the \"#{starter.name}\" starter. Tweak any band before you run a judge against it."
end
|
|
29
|
+
|
|
30
|
+
def dismiss_starter
|
|
31
|
+
starter = StarterMetrics.find(params[:key])
|
|
32
|
+
return redirect_to(metrics_path, alert: "Unknown starter metric.") unless starter
|
|
33
|
+
StarterMetricDismissal.find_or_create_by(starter_key: starter.key)
|
|
34
|
+
redirect_to metrics_path, notice: "Dismissed \"#{starter.name}\". It won't appear here again."
|
|
8
35
|
end
|
|
9
36
|
|
|
10
37
|
def show
|
|
38
|
+
@published_judge_version = JudgeVersion.ensure_current_for(@metric)
|
|
11
39
|
@disagreements = Calibration.where(metric_id: @metric.id, verdict: "disagree")
|
|
12
|
-
.includes(response: [:reviews, :run])
|
|
40
|
+
.includes(:judge_version, response: [:reviews, :run])
|
|
13
41
|
.order(created_at: :desc)
|
|
14
42
|
.limit(50)
|
|
15
43
|
@edit_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
|
|
16
|
-
@published_judge_version = JudgeVersion.published.where(metric_id: @metric.id, current: true).first
|
|
17
44
|
@suggestion_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
|
|
18
|
-
@improve_disagreement_count = @
|
|
45
|
+
@improve_disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree",
|
|
46
|
+
judge_version_id: @published_judge_version.id).count
|
|
47
|
+
@versions = JudgeVersion.where(metric_id: @metric.id).order(version_number: :desc).to_a
|
|
19
48
|
end
|
|
20
49
|
|
|
21
50
|
def new
|
|
@@ -23,6 +52,9 @@ module CompletionKit
|
|
|
23
52
|
end
|
|
24
53
|
|
|
25
54
|
def edit
|
|
55
|
+
@suggestion_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "suggestion").order(created_at: :desc).first
|
|
56
|
+
@edit_draft = JudgeVersion.drafts.where(metric_id: @metric.id, source: "edit").order(created_at: :desc).first
|
|
57
|
+
@published_judge_version = JudgeVersion.published.where(metric_id: @metric.id, current: true).first
|
|
26
58
|
end
|
|
27
59
|
|
|
28
60
|
def create
|
|
@@ -49,9 +81,10 @@ module CompletionKit
|
|
|
49
81
|
end
|
|
50
82
|
|
|
51
83
|
def suggest_variants
|
|
84
|
+
target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
|
|
52
85
|
disagreement_count = Calibration.where(metric_id: @metric.id, verdict: "disagree").count
|
|
53
86
|
if disagreement_count.zero?
|
|
54
|
-
redirect_to
|
|
87
|
+
redirect_to target, alert: "Mark at least one row as Disagree before asking the model to suggest a change."
|
|
55
88
|
return
|
|
56
89
|
end
|
|
57
90
|
|
|
@@ -60,38 +93,36 @@ module CompletionKit
|
|
|
60
93
|
generator = JudgeVariantGenerator.new(@metric, count: 1)
|
|
61
94
|
variants = generator.call
|
|
62
95
|
if variants.empty?
|
|
63
|
-
redirect_to
|
|
96
|
+
redirect_to target, alert: "The model returned no usable variants. Try again with a different model."
|
|
64
97
|
return
|
|
65
98
|
end
|
|
66
99
|
generator.persist!(variants)
|
|
67
|
-
redirect_to
|
|
100
|
+
redirect_to target, notice: "Drafted a new version. Review it below."
|
|
68
101
|
end
|
|
69
102
|
|
|
70
103
|
def dismiss_suggestion
|
|
71
|
-
draft = JudgeVersion.drafts.where(metric_id: @metric.id
|
|
104
|
+
draft = JudgeVersion.drafts.where(metric_id: @metric.id).find_by(id: params[:draft_id])
|
|
72
105
|
draft&.destroy
|
|
73
|
-
|
|
106
|
+
target = params[:back_to] == "edit" ? edit_metric_path(@metric) : metric_path(@metric)
|
|
107
|
+
redirect_to target, notice: "Dismissed."
|
|
74
108
|
end
|
|
75
109
|
|
|
76
110
|
def publish_draft
|
|
77
|
-
scope = JudgeVersion.
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
111
|
+
scope = JudgeVersion.where(metric_id: @metric.id)
|
|
112
|
+
version = if params[:draft_id].present?
|
|
113
|
+
scope.find_by(id: params[:draft_id])
|
|
114
|
+
else
|
|
115
|
+
JudgeVersion.drafts.where(metric_id: @metric.id).order(created_at: :desc).first
|
|
116
|
+
end
|
|
117
|
+
|
|
118
|
+
if version.nil?
|
|
119
|
+
redirect_to metric_path(@metric), alert: "No version to publish."
|
|
82
120
|
return
|
|
83
121
|
end
|
|
84
122
|
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
@metric.update_columns(
|
|
89
|
-
instruction: draft.instruction,
|
|
90
|
-
rubric_bands: Array(draft.rubric_bands).to_json
|
|
91
|
-
)
|
|
92
|
-
end
|
|
93
|
-
|
|
94
|
-
redirect_to metric_path(@metric), notice: "This judge version is now live."
|
|
123
|
+
version.publish!
|
|
124
|
+
redirect_to metric_path(@metric),
|
|
125
|
+
notice: "#{@metric.name} #{version.version_label} is now the published version."
|
|
95
126
|
end
|
|
96
127
|
|
|
97
128
|
def add_few_shot
|
|
@@ -112,6 +143,13 @@ module CompletionKit
|
|
|
112
143
|
redirect_to metric_path(@metric), notice: "Got it. The judge will remember this next time it grades."
|
|
113
144
|
end
|
|
114
145
|
|
|
146
|
+
# Forgets one remembered few-shot example on the metric.
#
# Reads params[:calibration_id], drops every entry of
# @metric.few_shot_examples whose "calibration_id" matches it, saves the
# remaining entries and redirects back to the metric page with a notice.
#
# A missing or non-numeric id coerces to 0 through #to_i, and so does a nil
# "calibration_id" on a stored example. Without the guard, such a request
# would match and delete every example that has no calibration id, so it is
# turned away with an alert and nothing is changed.
def remove_few_shot
  cal_id = params[:calibration_id].to_i
  return redirect_to(metric_path(@metric), alert: "Pick a remembered example to forget.") unless cal_id.positive?

  remaining = Array(@metric.few_shot_examples).reject { |fs| fs["calibration_id"].to_i == cal_id }
  @metric.update!(few_shot_examples: remaining)
  redirect_to metric_path(@metric), notice: "Forgotten. The judge won't see this case next time."
end
|
|
152
|
+
|
|
115
153
|
private
|
|
116
154
|
|
|
117
155
|
def set_metric
|
|
@@ -57,6 +57,7 @@ module CompletionKit
|
|
|
57
57
|
run.prompt&.template,
|
|
58
58
|
criteria: metric.instruction.to_s,
|
|
59
59
|
rubric_text: metric.display_rubric_text,
|
|
60
|
+
human_examples: few_shot_payload(metric),
|
|
60
61
|
input_data: response.input_data
|
|
61
62
|
)
|
|
62
63
|
|
|
@@ -119,5 +120,15 @@ module CompletionKit
|
|
|
119
120
|
response = Response.find_by(id: response_id)
|
|
120
121
|
RunCompletionCheckJob.perform_later(response.run_id) if response
|
|
121
122
|
end
|
|
123
|
+
|
|
124
|
+
def few_shot_payload(metric)
|
|
125
|
+
Array(metric.few_shot_examples).map do |fs|
|
|
126
|
+
{
|
|
127
|
+
human_score: fs["human_score"],
|
|
128
|
+
response_text: fs["response"].to_s,
|
|
129
|
+
human_note: fs["human_note"].to_s
|
|
130
|
+
}
|
|
131
|
+
end
|
|
132
|
+
end
|
|
122
133
|
end
|
|
123
134
|
end
|
|
@@ -7,8 +7,11 @@ module CompletionKit
|
|
|
7
7
|
|
|
8
8
|
serialize :rubric_bands, coder: JSON
|
|
9
9
|
|
|
10
|
+
before_validation :assign_version_number, on: :create
|
|
11
|
+
|
|
10
12
|
validates :metric_id, presence: true
|
|
11
13
|
validates :state, inclusion: { in: STATES }
|
|
14
|
+
validates :version_number, presence: true, uniqueness: { scope: :metric_id }
|
|
12
15
|
|
|
13
16
|
scope :current, -> { where(current: true) }
|
|
14
17
|
scope :published, -> { where(state: "published") }
|
|
@@ -20,7 +23,8 @@ module CompletionKit
|
|
|
20
23
|
instruction: metric.instruction,
|
|
21
24
|
rubric_bands: metric.rubric_bands,
|
|
22
25
|
current: true,
|
|
23
|
-
state: "published"
|
|
26
|
+
state: "published",
|
|
27
|
+
published_at: Time.current
|
|
24
28
|
)
|
|
25
29
|
end
|
|
26
30
|
|
|
@@ -32,17 +36,44 @@ module CompletionKit
|
|
|
32
36
|
state == "published"
|
|
33
37
|
end
|
|
34
38
|
|
|
39
|
+
def version_label
|
|
40
|
+
"v#{version_number}"
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
def publish!
|
|
44
|
+
JudgeVersion.transaction do
|
|
45
|
+
self.class.where(metric_id: metric_id).where.not(id: id).update_all(current: false)
|
|
46
|
+
reload
|
|
47
|
+
update!(state: "published", current: true, published_at: published_at || Time.current)
|
|
48
|
+
metric.update_columns(
|
|
49
|
+
instruction: instruction,
|
|
50
|
+
rubric_bands: Array(rubric_bands).to_json
|
|
51
|
+
)
|
|
52
|
+
end
|
|
53
|
+
self
|
|
54
|
+
end
|
|
55
|
+
|
|
35
56
|
def as_json(options = {})
|
|
36
57
|
{
|
|
37
58
|
id: id,
|
|
38
59
|
metric_id: metric_id,
|
|
60
|
+
version_number: version_number,
|
|
39
61
|
instruction: instruction,
|
|
40
62
|
rubric_bands: rubric_bands,
|
|
41
63
|
current: current,
|
|
42
64
|
state: state,
|
|
43
65
|
source: source,
|
|
66
|
+
published_at: published_at,
|
|
44
67
|
created_at: created_at
|
|
45
68
|
}
|
|
46
69
|
end
|
|
70
|
+
|
|
71
|
+
private
|
|
72
|
+
|
|
73
|
+
def assign_version_number
|
|
74
|
+
return if version_number.present?
|
|
75
|
+
max = self.class.where(metric_id: metric_id).maximum(:version_number).to_i
|
|
76
|
+
self.version_number = max + 1
|
|
77
|
+
end
|
|
47
78
|
end
|
|
48
79
|
end
|
|
@@ -86,7 +86,7 @@ module CompletionKit
|
|
|
86
86
|
sections << "REASONING: <one short sentence: what changes and why>"
|
|
87
87
|
sections << "INSTRUCTION:"
|
|
88
88
|
sections << "<the rewritten instruction>"
|
|
89
|
-
sections << "RUBRIC: # optional
|
|
89
|
+
sections << "RUBRIC: # optional. Omit this block if the rubric is unchanged."
|
|
90
90
|
sections << "5: <description for 5 stars>"
|
|
91
91
|
sections << "4: <description for 4 stars>"
|
|
92
92
|
sections << "3: <description for 3 stars>"
|
|
@@ -133,11 +133,13 @@ module CompletionKit
|
|
|
133
133
|
end
|
|
134
134
|
|
|
135
135
|
def calibrations_for(metric, verdict:, limit:)
|
|
136
|
-
Calibration.where(metric_id: metric.id, verdict: verdict)
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
136
|
+
scope = Calibration.where(metric_id: metric.id, verdict: verdict)
|
|
137
|
+
current_version = JudgeVersion.current.find_by(metric_id: metric.id)
|
|
138
|
+
scope = scope.where(judge_version_id: current_version.id) if current_version
|
|
139
|
+
scope.includes(response: :reviews)
|
|
140
|
+
.order(created_at: :desc)
|
|
141
|
+
.limit(limit)
|
|
142
|
+
.map do |cal|
|
|
141
143
|
review = cal.response.reviews.find { |r| r.metric_id == metric.id }
|
|
142
144
|
{
|
|
143
145
|
input: cal.response.input_data,
|
|
@@ -31,18 +31,30 @@ module CompletionKit
|
|
|
31
31
|
end
|
|
32
32
|
end
|
|
33
33
|
|
|
34
|
-
|
|
35
|
-
|
|
34
|
+
CURRENT = :current
|
|
35
|
+
|
|
36
|
+
def self.for(metric, judge_version: CURRENT)
|
|
37
|
+
resolved = case judge_version
|
|
38
|
+
when CURRENT then JudgeVersion.current.find_by(metric_id: metric.id)
|
|
39
|
+
when nil then nil
|
|
40
|
+
else judge_version
|
|
41
|
+
end
|
|
42
|
+
new(metric: metric, judge_version: resolved, all_versions: judge_version.nil?).call
|
|
36
43
|
end
|
|
37
44
|
|
|
38
|
-
def initialize(metric:, judge_version: nil)
|
|
45
|
+
def initialize(metric:, judge_version: nil, all_versions: false)
|
|
39
46
|
@metric = metric
|
|
40
47
|
@judge_version = judge_version
|
|
48
|
+
@all_versions = all_versions
|
|
41
49
|
end
|
|
42
50
|
|
|
43
51
|
def call
|
|
44
52
|
scope = Calibration.where(metric_id: @metric.id)
|
|
45
|
-
|
|
53
|
+
if @judge_version
|
|
54
|
+
scope = scope.where(judge_version_id: @judge_version.id)
|
|
55
|
+
elsif !@all_versions
|
|
56
|
+
scope = scope.none
|
|
57
|
+
end
|
|
46
58
|
|
|
47
59
|
verdicts = scope.pluck(:verdict, :corrected_score, :response_id)
|
|
48
60
|
n = verdicts.length
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
module CompletionKit
  # Built-in catalogue of "starter" metrics and helpers for looking them up.
  #
  # Each entry is a Starter value carrying a lookup key, a display name, two
  # pieces of explanatory copy (description, catches), the judge instruction
  # and five rubric bands (one hash per star rating, 5 down to 1).
  module StarterMetrics
    Starter = Struct.new(:key, :name, :description, :catches, :instruction, :rubric_bands, keyword_init: true)

    # The array and every Starter in it are frozen, so the shared catalogue
    # cannot have entries added or attributes reassigned at runtime. The nested
    # strings and rubric band arrays are left unfrozen because they are handed
    # directly to callers outside this module.
    ALL = [
      Starter.new(
        key: "correctness",
        name: "Correctness",
        description: "Is the output factually right and free of made-up information?",
        catches: "Hallucinations, wrong facts, subtle distortions. The most universally-asked question about an LLM's output.",
        instruction: "Is the output factually right and free of made-up information? Penalise hallucinations and subtle factual distortions; reward outputs whose every claim checks out.",
        rubric_bands: [
          { "stars" => 5, "description" => "Every fact in the output checks out." },
          { "stars" => 4, "description" => "Right in substance; minor imprecision or omission." },
          { "stars" => 3, "description" => "Mostly right, one or two facts are off." },
          { "stars" => 2, "description" => "Mostly wrong with a few right details." },
          { "stars" => 1, "description" => "Wrong, misleading, or contains fabricated facts." }
        ]
      ),
      Starter.new(
        key: "instruction_following",
        name: "Instruction following",
        description: "Did the model do everything that was asked?",
        catches: "The response is factually right but ignores \"answer in two sentences\", \"use bullet points\", \"do not include X\". Distinct from Correctness — a response can be right and still fail this.",
        instruction: "Did the model do every concrete thing the prompt asked for? Score against the explicit requirements in the prompt (format constraints, count limits, exclusions, audience cues). Factual accuracy is a different dimension — score that elsewhere.",
        rubric_bands: [
          { "stars" => 5, "description" => "Followed every requirement in the prompt exactly." },
          { "stars" => 4, "description" => "Followed every requirement with a small slip." },
          { "stars" => 3, "description" => "Did the main thing, missed at least one explicit requirement." },
          { "stars" => 2, "description" => "Did some of what was asked, missed the main requirement." },
          { "stars" => 1, "description" => "Ignored the instructions or did something different." }
        ]
      ),
      Starter.new(
        key: "format_compliance",
        name: "Format compliance",
        description: "Does the output follow the required structure?",
        catches: "Invalid JSON, missing schema fields, extra prose around a structured response, wrong casing on keys. Critical for any LLM wired into an API.",
        instruction: "Does the output match the format the prompt asked for — JSON shape, schema, keys, casing, no stray prose? Score on whether a downstream parser would accept it without massaging.",
        rubric_bands: [
          { "stars" => 5, "description" => "Exact spec, ready to consume programmatically." },
          { "stars" => 4, "description" => "Spec-compliant with one cosmetic issue." },
          { "stars" => 3, "description" => "Right shape, minor deviations (extra commentary, casing, ordering)." },
          { "stars" => 2, "description" => "Right format with substantive deviations (missing required fields, wrong types)." },
          { "stars" => 1, "description" => "Wrong format or unparseable." }
        ]
      ),
      Starter.new(
        key: "tone",
        name: "Tone",
        description: "Does the voice fit the audience the prompt asked for?",
        catches: "Rude, robotic, off-brand, too casual, too formal. The dimension hardest to eyeball at scale and the one most user-facing surfaces care about.",
        instruction: "Does the voice match the audience and brand the prompt called for? Reward outputs that sound like the persona the prompt asked for. Penalise rude, robotic, off-brand, or wrong-register replies.",
        rubric_bands: [
          { "stars" => 5, "description" => "Sounds like the brand or persona the prompt asked for." },
          { "stars" => 4, "description" => "Right tone with a slip or two." },
          { "stars" => 3, "description" => "Acceptable, generic, no personality." },
          { "stars" => 2, "description" => "Mismatched tone; sounds like a different audience." },
          { "stars" => 1, "description" => "Off-tone in a way a user would notice (rude, condescending, jarring)." }
        ]
      ),
      Starter.new(
        key: "conciseness",
        name: "Conciseness",
        description: "Is it the right length — no padding, no missing detail?",
        catches: "Rambling responses, repetitive caveats, over-hedging. LLMs default to verbose. Conciseness is the dimension where users most often see scores move after tuning.",
        instruction: "Is the output the right length for the task — no padding, no missing detail, no hedging filler? Penalise rambling, repetition, over-caveating, and unnecessary preamble. Penalise too-short outputs that drop information.",
        rubric_bands: [
          { "stars" => 5, "description" => "Exactly as long as the task needs, no more, no less." },
          { "stars" => 4, "description" => "Right length with a small redundancy." },
          { "stars" => 3, "description" => "Acceptable; trims could happen or detail could be added." },
          { "stars" => 2, "description" => "Noticeable filler or visible gaps." },
          { "stars" => 1, "description" => "Padded, repetitive, or so short it loses information." }
        ]
      )
    ].each(&:freeze).freeze

    module_function

    # Looks up a starter by its key.
    #
    # @param key [String, Symbol, nil] the starter key; compared as a String,
    #   so :tone and "tone" find the same entry
    # @return [Starter, nil] the matching starter, or nil when none matches
    def find(key)
      wanted = key.to_s
      ALL.find { |s| s.key == wanted }
    end

    # Starters that can still be offered: those with no Metric of the same
    # name and no StarterMetricDismissal recorded under their key.
    #
    # @return [Array<Starter>] the remaining starters, in catalogue order
    def available
      adopted_names = Metric.where(name: ALL.map(&:name)).pluck(:name).to_set
      dismissed_keys = StarterMetricDismissal.pluck(:starter_key).to_set
      ALL.reject { |s| adopted_names.include?(s.name) || dismissed_keys.include?(s.key) }
    end

    # Whether a Metric with the starter's name already exists.
    #
    # @param starter [Starter]
    # @return [Boolean]
    def adopted?(starter)
      Metric.exists?(name: starter.name)
    end
  end
end
|
|
@@ -187,7 +187,7 @@
|
|
|
187
187
|
</div>
|
|
188
188
|
<%= render "completion_kit/api_reference/resource_list", title: "Your datasets",
|
|
189
189
|
items: datasets.map { |d|
|
|
190
|
-
{ name: d.name, meta: pluralize([d.csv_data.to_s.lines.count - 1, 0].max, "
|
|
190
|
+
{ name: d.name, meta: pluralize([d.csv_data.to_s.lines.count - 1, 0].max, "entry"),
|
|
191
191
|
url: "#{base_url}/api/v1/datasets/#{d.id}", dom_id: "dataset_ep_#{d.id}" }
|
|
192
192
|
} %>
|
|
193
193
|
</div>
|
|
@@ -3,14 +3,19 @@
|
|
|
3
3
|
<% pending_verdict = local_assigns[:pending_verdict] %>
|
|
4
4
|
<% active_verdict = pending_verdict || current_verdict %>
|
|
5
5
|
<% error = local_assigns[:error] %>
|
|
6
|
-
<%
|
|
6
|
+
<% me = CompletionKit.config.username.presence || "operator" %>
|
|
7
|
+
<% other_calibrations = CompletionKit::Calibration
|
|
8
|
+
.where(response_id: response_row.id, metric_id: metric.id)
|
|
9
|
+
.where.not(created_by: me)
|
|
10
|
+
.order(created_at: :asc).to_a %>
|
|
11
|
+
<% verdict_icons = { "agree" => "hand-thumb-up", "disagree" => "hand-thumb-down", "borderline" => "scale" } %>
|
|
7
12
|
<p class="ck-calibration__prompt">
|
|
8
13
|
<span class="ck-calibration__label">Your verdict</span>
|
|
9
|
-
<% if
|
|
10
|
-
<span class="ck-
|
|
11
|
-
|
|
12
|
-
<span class="ck-calibration__hint">Tell us what you think — was the score right? Verdicts roll up into the metric's <%= link_to "trust level", metric_path(metric), class: "ck-link" %>.</span>
|
|
14
|
+
<% if other_calibrations.any? %>
|
|
15
|
+
<span class="ck-calibration__meta"><%= pluralize(other_calibrations.size, "other verdict") %> on this score</span>
|
|
16
|
+
<span class="ck-calibration__sep">·</span>
|
|
13
17
|
<% end %>
|
|
18
|
+
<%= link_to metric_path(metric), class: "ck-calibration__meta-link" do %><%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true" %>Calibration →<% end %>
|
|
14
19
|
</p>
|
|
15
20
|
<div class="ck-calibration__buttons">
|
|
16
21
|
<% verdict_icons = { "agree" => "hand-thumb-up", "disagree" => "hand-thumb-down", "borderline" => "scale" } %>
|
|
@@ -36,6 +41,38 @@
|
|
|
36
41
|
<p class="ck-calibration__error" role="alert"><%= error %></p>
|
|
37
42
|
<% end %>
|
|
38
43
|
|
|
44
|
+
<% if other_calibrations.any? %>
|
|
45
|
+
<details class="ck-calibration__others">
|
|
46
|
+
<summary class="ck-calibration__others-summary">
|
|
47
|
+
<%= heroicon_tag "chevron-right", variant: :outline, size: 14, "aria-hidden": "true" %>
|
|
48
|
+
<span>What others said (<%= other_calibrations.size %>)</span>
|
|
49
|
+
</summary>
|
|
50
|
+
<ul class="ck-calibration__others-list">
|
|
51
|
+
<% other_calibrations.each do |other| %>
|
|
52
|
+
<li class="ck-calibration__others-item ck-calibration__others-item--<%= other.verdict %>">
|
|
53
|
+
<div class="ck-calibration__others-row">
|
|
54
|
+
<span class="ck-calibration__others-verdict">
|
|
55
|
+
<%= heroicon_tag verdict_icons[other.verdict], variant: :outline, size: 14, "aria-hidden": "true" %>
|
|
56
|
+
<%= other.verdict %>
|
|
57
|
+
</span>
|
|
58
|
+
<span class="ck-calibration__others-by"><%= other.created_by %></span>
|
|
59
|
+
<% if other.corrected_score %>
|
|
60
|
+
<span class="ck-calibration__others-stars" aria-label="<%= pluralize(other.corrected_score.to_i, 'star') %>" title="<%= pluralize(other.corrected_score.to_i, 'star') %>">
|
|
61
|
+
<% 5.times do |i| %>
|
|
62
|
+
<svg viewBox="0 0 24 24" width="12" height="12" stroke-width="1.75" class="ck-star <%= i < other.corrected_score.to_i ? "ck-star--filled" : "ck-star--empty" %>" aria-hidden="true"><polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2"/></svg>
|
|
63
|
+
<% end %>
|
|
64
|
+
</span>
|
|
65
|
+
<% end %>
|
|
66
|
+
</div>
|
|
67
|
+
<% if other.note.to_s.present? %>
|
|
68
|
+
<p class="ck-calibration__others-note">"<%= other.note.to_s.truncate(140) %>"</p>
|
|
69
|
+
<% end %>
|
|
70
|
+
</li>
|
|
71
|
+
<% end %>
|
|
72
|
+
</ul>
|
|
73
|
+
</details>
|
|
74
|
+
<% end %>
|
|
75
|
+
|
|
39
76
|
<% if active_verdict == "disagree" %>
|
|
40
77
|
<% existing_score = (calibration&.corrected_score || review&.ai_score)&.round %>
|
|
41
78
|
<%= form_with url: run_response_calibrations_path(run, response_row),
|
|
@@ -51,7 +88,7 @@
|
|
|
51
88
|
<% radio_id = "ck-star-#{response_row.id}-#{metric.id}-#{n}" %>
|
|
52
89
|
<input type="radio" name="corrected_score" id="<%= radio_id %>" value="<%= n %>" <%= "checked" if existing_score == n %> required>
|
|
53
90
|
<label for="<%= radio_id %>" title="<%= pluralize(n, 'star') %>" aria-label="<%= pluralize(n, 'star') %>">
|
|
54
|
-
<svg viewBox="0 0 24 24"
|
|
91
|
+
<svg viewBox="0 0 24 24" stroke-width="1.5" aria-hidden="true"><polygon points="12 2 15.09 8.26 22 9.27 17 14.14 18.18 21.02 12 17.77 5.82 21.02 7 14.14 2 9.27 8.91 8.26 12 2"/></svg>
|
|
55
92
|
</label>
|
|
56
93
|
<% end %>
|
|
57
94
|
</div>
|
|
@@ -1,31 +1,30 @@
|
|
|
1
1
|
<% stats = local_assigns[:stats] %>
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
2
|
+
<% metric = local_assigns[:metric] %>
|
|
3
|
+
<% anchor = metric&.name&.parameterize %>
|
|
4
|
+
<% target_response = if (stats.sample_size.zero? || stats.counter_only?) && metric
|
|
5
|
+
created_by = CompletionKit.config.username.presence || "operator"
|
|
6
|
+
verdicted_ids = CompletionKit::Calibration.where(metric_id: metric.id, created_by: created_by).pluck(:response_id)
|
|
7
|
+
CompletionKit::Response.joins(:reviews)
|
|
8
|
+
.where(reviews: { metric_id: metric.id })
|
|
9
|
+
.where.not(reviews: { ai_score: nil })
|
|
10
|
+
.where.not(id: verdicted_ids)
|
|
11
|
+
.order(created_at: :desc).first
|
|
12
|
+
end %>
|
|
13
|
+
|
|
14
|
+
<p class="ck-trust-line ck-trust-line--<%= stats.gate %>">
|
|
15
|
+
<span class="ck-trust-line__label"><%= heroicon_tag "adjustments-horizontal", variant: :outline, "aria-hidden": "true" %>Calibration</span>
|
|
16
|
+
<% if stats.sample_size.zero? %>
|
|
17
|
+
<span class="ck-trust-line__state">Not measured yet.</span>
|
|
18
|
+
<span class="ck-trust-line__hint">Needs <%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %> verdicts on the judge's scores.<% if target_response %>
|
|
19
|
+
<%= link_to "Give a verdict →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-trust-line__link" %>
|
|
20
|
+
<% else %> Run this metric on a dataset, then give a verdict.<% end %></span>
|
|
21
|
+
<% elsif stats.counter_only? %>
|
|
22
|
+
<span class="ck-trust-line__counter"><%= stats.sample_size %>/<%= CompletionKit::MetricCalibrationStats::PROVISIONAL_MIN %></span>
|
|
23
|
+
<span class="ck-trust-line__hint"><%= pluralize(stats.sample_size, "verdict") %><% if stats.short_to_target > 0 %> · <%= stats.short_to_target %> more before this can be measured<% end %><% if target_response %> · <%= link_to "Give another verdict →", run_response_path(target_response.run, target_response, anchor: anchor), class: "ck-trust-line__link" %><% end %></span>
|
|
9
24
|
<% else %>
|
|
10
|
-
<
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
title="The range we're confident the true rate sits in, given how few verdicts we have so far.">±<%= (stats.margin * 100).round %> pt</span>
|
|
15
|
-
<span class="ck-trust-panel__gate"
|
|
16
|
-
title="<%= stats.firm? ? 'Enough verdicts for a reliable read.' : 'Early read. Keep giving verdicts to tighten the margin.' %>"><%= stats.firm? ? "settled" : "early read" %></span>
|
|
17
|
-
</div>
|
|
18
|
-
<div class="ck-trust-panel__details">
|
|
19
|
-
<span><%= pluralize(stats.sample_size, "verdict") %></span>
|
|
20
|
-
<% if stats.borderline_rate && stats.borderline_rate > 0 %>
|
|
21
|
-
<% level = if stats.borderline_rate > 0.30 then "danger"
|
|
22
|
-
elsif stats.borderline_rate > 0.15 then "warning"
|
|
23
|
-
else "ok" end %>
|
|
24
|
-
<span class="ck-trust-panel__borderline ck-trust-panel__borderline--<%= level %>"
|
|
25
|
-
title="<%= level == 'ok' ? 'Some reviewers said the rubric was unclear here.' : 'A lot of reviewers say the rubric is unclear here. Consider splitting the metric or rewriting the rubric.' %>">
|
|
26
|
-
<%= (stats.borderline_rate * 100).round %>% said "unclear"
|
|
27
|
-
</span>
|
|
28
|
-
<% end %>
|
|
29
|
-
</div>
|
|
25
|
+
<span class="ck-trust-line__score" title="Roughly how often the judge and the humans agreed.">~<%= (stats.agreement_point * 100).round %>%</span>
|
|
26
|
+
<span class="ck-trust-line__margin" title="The range we're confident the true rate sits in.">±<%= (stats.margin * 100).round %> pt</span>
|
|
27
|
+
<span class="ck-trust-line__gate" title="<%= stats.firm? ? 'Enough verdicts for a reliable read.' : 'Early read. Keep giving verdicts.' %>"><%= stats.firm? ? "settled" : "early" %></span>
|
|
28
|
+
<span class="ck-trust-line__hint"><%= pluralize(stats.sample_size, "verdict") %><% if stats.borderline_rate && stats.borderline_rate > 0 %><% level = stats.borderline_rate > 0.30 ? "danger" : stats.borderline_rate > 0.15 ? "warning" : "ok" %> · <span class="ck-trust-line__borderline ck-trust-line__borderline--<%= level %>" title="<%= level == 'ok' ? '' : 'Reviewers said the rubric was unclear here.' %>"><%= (stats.borderline_rate * 100).round %>% unclear</span><% end %></span>
|
|
30
29
|
<% end %>
|
|
31
|
-
</
|
|
30
|
+
</p>
|