completion-kit 0.15.1 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/app/assets/stylesheets/completion_kit/application.css +17 -0
- data/app/controllers/completion_kit/runs_controller.rb +3 -5
- data/app/controllers/completion_kit/suggestions_controller.rb +10 -0
- data/app/jobs/completion_kit/prompt_suggestion_job.rb +55 -0
- data/app/models/completion_kit/suggestion.rb +27 -1
- data/app/services/completion_kit/api_config.rb +14 -8
- data/app/services/completion_kit/judge_service.rb +14 -2
- data/app/services/completion_kit/mcp_tools/metrics.rb +1 -1
- data/app/services/completion_kit/mcp_tools/prompts.rb +9 -2
- data/app/services/completion_kit/metric_variant_generator.rb +3 -1
- data/app/services/completion_kit/onboarding/sample_data.rb +6 -3
- data/app/services/completion_kit/prompt_improvement_validator.rb +106 -0
- data/app/views/completion_kit/api_reference/_body.html.erb +4 -3
- data/app/views/completion_kit/suggestions/_scoreboard.html.erb +16 -0
- data/app/views/completion_kit/suggestions/_state.html.erb +56 -0
- data/app/views/completion_kit/suggestions/show.html.erb +2 -32
- data/db/migrate/20260611000001_add_validation_to_completion_kit_suggestions.rb +6 -0
- data/lib/completion_kit/version.rb +1 -1
- data/lib/completion_kit.rb +1 -1
- metadata +6 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: c07d6c47ea3beb045a95e9ef2aa91a4e2afbe24668810826d7e50b1e8202b5d0
|
|
4
|
+
data.tar.gz: b60493ca5889e0a90c5168f3fbdf6ceab3c621cf7790152334c51b988c564aca
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: af5b8ff0082999d04cc348a75b3fd91f2b4b5fab71424bccdfccb2f0431d18f9998175b50f89676f31144d3d8524a2da14c258c7306ce0c7aedbdd40ce9b89ea
|
|
7
|
+
data.tar.gz: e999cad7d73effda025db4d70b3c1d854e406158cd6308f2624a7bfb82b28abcd62af073dbc3bef6705dec4ecce1d45d1d832e89478837abbe91b220a407e489
|
|
@@ -4116,6 +4116,23 @@ table.ck-runs-table {
|
|
|
4116
4116
|
height: 14px;
|
|
4117
4117
|
}
|
|
4118
4118
|
|
|
4119
|
+
.ck-suggest-progress {
|
|
4120
|
+
margin: 1.5rem 0;
|
|
4121
|
+
padding: 1.5rem;
|
|
4122
|
+
border: 1px solid var(--ck-line);
|
|
4123
|
+
border-radius: var(--ck-radius-lg);
|
|
4124
|
+
background: var(--ck-surface);
|
|
4125
|
+
}
|
|
4126
|
+
|
|
4127
|
+
.ck-suggest-progress .ck-meta-copy {
|
|
4128
|
+
margin: 0.5rem 0 0;
|
|
4129
|
+
max-width: 62ch;
|
|
4130
|
+
}
|
|
4131
|
+
|
|
4132
|
+
.ck-suggest-progress form {
|
|
4133
|
+
margin-top: 1rem;
|
|
4134
|
+
}
|
|
4135
|
+
|
|
4119
4136
|
.ck-suggest-reasoning {
|
|
4120
4137
|
margin: 1.5rem 0;
|
|
4121
4138
|
padding: 1.25rem 1.5rem;
|
|
@@ -137,14 +137,12 @@ module CompletionKit
|
|
|
137
137
|
return
|
|
138
138
|
end
|
|
139
139
|
|
|
140
|
-
service = PromptImprovementService.new(@run)
|
|
141
|
-
result = service.suggest
|
|
142
140
|
suggestion = @run.suggestions.create!(
|
|
143
141
|
prompt: @run.prompt,
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
original_template: result["original_template"]
|
|
142
|
+
original_template: @run.prompt.template,
|
|
143
|
+
status: "pending"
|
|
147
144
|
)
|
|
145
|
+
PromptSuggestionJob.perform_later(suggestion.id)
|
|
148
146
|
redirect_to suggestion_path(suggestion, from: "run")
|
|
149
147
|
end
|
|
150
148
|
|
|
@@ -8,6 +8,16 @@ module CompletionKit
|
|
|
8
8
|
end
|
|
9
9
|
|
|
10
10
|
def apply
|
|
11
|
+
if @suggestion.applied_at?
|
|
12
|
+
redirect_to suggestion_path(@suggestion), notice: "Suggestion already applied."
|
|
13
|
+
return
|
|
14
|
+
end
|
|
15
|
+
|
|
16
|
+
unless @suggestion.ready?
|
|
17
|
+
redirect_to suggestion_path(@suggestion), alert: "This suggestion isn't ready to apply yet."
|
|
18
|
+
return
|
|
19
|
+
end
|
|
20
|
+
|
|
11
21
|
run = @suggestion.run
|
|
12
22
|
new_prompt = run.prompt.clone_as_new_version(template: @suggestion.suggested_template)
|
|
13
23
|
new_prompt.publish!
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
require "faraday"
|
|
2
|
+
|
|
3
|
+
module CompletionKit
  # Background job that fills in a pending Suggestion.
  #
  # It asks PromptImprovementService for a rewritten template, re-scores that
  # rewrite with PromptImprovementValidator, stores the outcome on the
  # suggestion ("ready" or "failed") and broadcasts the re-rendered state
  # partial over Turbo Streams.
  class PromptSuggestionJob < ApplicationJob
    queue_as :llm

    # Catch-all for non-transient failures: report and mark the suggestion failed.
    #
    # This MUST be declared before the retry_on handlers below. Rescue handlers
    # are searched from the most recently declared to the oldest, so a
    # StandardError handler declared last would also match the transient errors
    # and they would never be retried.
    rescue_from(StandardError) { |error| fail_suggestion(error) }

    # Transient network / rate-limit errors are retried. The block runs only
    # once the attempts are exhausted, so the suggestion does not stay
    # "pending" forever. The block is evaluated in class scope, hence `send`
    # to reach the private instance method.
    retry_on Faraday::TimeoutError, Faraday::ConnectionFailed, wait: :polynomially_longer, attempts: 5 do |job, error|
      job.send(:fail_suggestion, error)
    end
    retry_on CompletionKit::RateLimitError, wait: :polynomially_longer, attempts: 5 do |job, error|
      job.send(:fail_suggestion, error)
    end

    # @param suggestion_id [Integer] id of the Suggestion to fill in; a missing
    #   record is ignored (the job returns without doing anything).
    def perform(suggestion_id)
      @suggestion = Suggestion.find_by(id: suggestion_id)
      return unless @suggestion

      run = @suggestion.run
      result = PromptImprovementService.new(run).suggest

      # No usable rewrite came back: fail without running validation.
      if result["suggested_template"].blank?
        @suggestion.update!(status: "failed")
        broadcast(@suggestion)
        return
      end

      summary = PromptImprovementValidator.new(run, result["suggested_template"]).call
      @suggestion.update!(
        reasoning: result["reasoning"],
        suggested_template: result["suggested_template"],
        validation_summary: summary,
        status: "ready"
      )
      broadcast(@suggestion)
    end

    private

    # Reports the error and, when a suggestion was loaded, flags it as failed
    # and pushes the new state to the page. update_columns is used so the
    # status is written without running validations or callbacks.
    def fail_suggestion(error)
      Rails.error.report(error, handled: true, context: { job: self.class.name })
      return unless @suggestion

      @suggestion.update_columns(status: "failed")
      broadcast(@suggestion)
    end

    # Renders the suggestion state partial and replaces the status element on
    # any page subscribed to this suggestion's stream.
    def broadcast(suggestion)
      html = CompletionKit::ApplicationController.render(
        partial: "completion_kit/suggestions/state",
        locals: { suggestion: suggestion, run: suggestion.run }
      )
      Turbo::StreamsChannel.broadcast_replace_to(
        "completion_kit_suggestion_#{suggestion.id}",
        target: "ck-suggestion-status-#{suggestion.id}",
        html: html
      )
    end
  end
end
|
|
@@ -3,6 +3,32 @@ module CompletionKit
|
|
|
3
3
|
belongs_to :run
|
|
4
4
|
belongs_to :prompt
|
|
5
5
|
|
|
6
|
-
|
|
6
|
+
serialize :validation_summary, coder: JSON
|
|
7
|
+
|
|
8
|
+
validates :suggested_template, presence: true, if: :ready?
|
|
9
|
+
|
|
10
|
+
# True when the stored status is the string "pending".
def pending?
  "pending" == status
end
|
|
13
|
+
|
|
14
|
+
# True when the stored status is the string "failed".
def failed?
  "failed" == status
end
|
|
17
|
+
|
|
18
|
+
# True for every status other than pending or failed (including a nil status).
def ready?
  !(pending? || failed?)
end
|
|
21
|
+
|
|
22
|
+
# True when a validation summary exists and it carries an "after_avg" value.
def validated?
  summary = validation_summary
  return false unless summary.present?

  summary["after_avg"].present?
end
|
|
26
|
+
|
|
27
|
+
# True when a validated rewrite either lowered the average score or regressed
# more responses than it improved. Always false when not validated.
def net_negative?
  if validated?
    summary = validation_summary
    score_dropped = summary["after_avg"].to_f < summary["before_avg"].to_f
    score_dropped || summary["regressed"].to_i > summary["improved"].to_i
  else
    false
  end
end
|
|
7
33
|
end
|
|
8
34
|
end
|
|
@@ -39,14 +39,20 @@ module CompletionKit
|
|
|
39
39
|
available_match = available_models.find { |model| model[:id] == model_name.to_s }
|
|
40
40
|
return available_match[:provider] if available_match
|
|
41
41
|
|
|
42
|
-
case model_name.to_s
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
42
|
+
guess = case model_name.to_s
|
|
43
|
+
when /\Agpt-/ then "openai"
|
|
44
|
+
when /\Aclaude-/ then "anthropic"
|
|
45
|
+
end
|
|
46
|
+
configured = ProviderCredential.distinct.pluck(:provider)
|
|
47
|
+
return guess if configured.empty?
|
|
48
|
+
|
|
49
|
+
guess if guess && configured.include?(guess)
|
|
50
|
+
end
|
|
51
|
+
|
|
52
|
+
# Resolves the judge model id to use when none is given explicitly.
#
# Reads CompletionKit.config.judge_model (calling it first when it responds
# to #call). When that yields nothing present, falls back to the model_id of
# the first judging model ordered by provider and display name, or nil when
# there is none.
def self.default_judge_model
  setting = CompletionKit.config.judge_model
  setting = setting.call if setting.respond_to?(:call)
  return setting if setting.present?

  Model.for_judging.order(:provider, :display_name).first&.model_id
end
|
|
51
57
|
|
|
52
58
|
def self.valid_for_model?(model_name)
|
|
@@ -6,7 +6,7 @@ module CompletionKit
|
|
|
6
6
|
class JudgeService
|
|
7
7
|
def initialize(config = {})
|
|
8
8
|
@config = config
|
|
9
|
-
@judge_model = config[:judge_model] ||
|
|
9
|
+
@judge_model = config[:judge_model].presence || ApiConfig.default_judge_model
|
|
10
10
|
@judge_client = LlmClient.for_model(@judge_model, ApiConfig.for_model(@judge_model))
|
|
11
11
|
end
|
|
12
12
|
|
|
@@ -43,11 +43,23 @@ module CompletionKit
|
|
|
43
43
|
judge_prompt += "\nCriteria: #{criteria}\n"
|
|
44
44
|
end
|
|
45
45
|
|
|
46
|
+
judge_prompt += "\nScore strictly on the dimension described above. Do not raise or lower the score for qualities the rubric and criteria do not mention.\n"
|
|
47
|
+
|
|
46
48
|
judge_prompt += human_examples_block(human_examples)
|
|
47
49
|
|
|
50
|
+
if prompt.present?
|
|
51
|
+
judge_prompt += <<~PROMPT
|
|
52
|
+
|
|
53
|
+
The prompt that generated the output is shown below for reference. Weigh it only when the dimension you are scoring is about adherence to what was asked: following instructions, matching a required format or schema, or hitting a requested tone or persona. For dimensions about the output's intrinsic quality, such as factual correctness or conciseness, judge the output on its own and ignore the prompt's specific rules. If the output breaks a prompt rule that is unrelated to the dimension you are scoring, such as a content restriction, a banned topic, or a length limit, do not lower the score for breaking it.
|
|
54
|
+
|
|
55
|
+
Original prompt: #{prompt}
|
|
56
|
+
|
|
57
|
+
Reminder: score only the dimension named in the criteria above.
|
|
58
|
+
PROMPT
|
|
59
|
+
end
|
|
60
|
+
|
|
48
61
|
judge_prompt += <<~PROMPT
|
|
49
62
|
|
|
50
|
-
Original prompt: #{prompt || "Not provided"}
|
|
51
63
|
#{input_data.present? ? "Input data: #{input_data}" : ""}
|
|
52
64
|
#{expected_output.present? ? "Expected output: #{expected_output}" : ""}
|
|
53
65
|
AI output to evaluate: #{output}
|
|
@@ -52,7 +52,7 @@ module CompletionKit
|
|
|
52
52
|
properties: {
|
|
53
53
|
metric_id: {type: "integer"},
|
|
54
54
|
count: {type: "integer", description: "How many variants to request (default 1, max 3). One focused rewrite beats five reworded copies."},
|
|
55
|
-
model: {type: "string", description: "Override the model used to generate variants. Defaults to
|
|
55
|
+
model: {type: "string", description: "Override the model used to generate variants. Defaults to the configured judge model or an available judging model."}
|
|
56
56
|
},
|
|
57
57
|
required: ["metric_id"]
|
|
58
58
|
},
|
|
@@ -110,18 +110,25 @@ module CompletionKit
|
|
|
110
110
|
return error_result("Judge-only runs don't have a prompt to improve.") if run.prompt.nil?
|
|
111
111
|
|
|
112
112
|
result = PromptImprovementService.new(run).suggest
|
|
113
|
+
return error_result("The model didn't return a usable rewrite.") if result["suggested_template"].blank?
|
|
114
|
+
|
|
115
|
+
validation = PromptImprovementValidator.new(run, result["suggested_template"]).call
|
|
113
116
|
suggestion = run.suggestions.create!(
|
|
114
117
|
prompt: run.prompt,
|
|
115
118
|
reasoning: result["reasoning"],
|
|
116
119
|
suggested_template: result["suggested_template"],
|
|
117
|
-
original_template: result["original_template"]
|
|
120
|
+
original_template: result["original_template"],
|
|
121
|
+
validation_summary: validation,
|
|
122
|
+
status: "ready"
|
|
118
123
|
)
|
|
119
124
|
text_result(
|
|
120
125
|
suggestion_id: suggestion.id,
|
|
121
126
|
prompt_id: run.prompt.id,
|
|
122
127
|
reasoning: suggestion.reasoning,
|
|
123
128
|
suggested_template: suggestion.suggested_template,
|
|
124
|
-
original_template: suggestion.original_template
|
|
129
|
+
original_template: suggestion.original_template,
|
|
130
|
+
validation: validation,
|
|
131
|
+
net_negative: suggestion.net_negative?
|
|
125
132
|
)
|
|
126
133
|
end
|
|
127
134
|
end
|
|
@@ -10,10 +10,12 @@ module CompletionKit
|
|
|
10
10
|
@metric = metric
|
|
11
11
|
n = count.to_i
|
|
12
12
|
@count = n < 1 ? DEFAULT_VARIANT_COUNT : [n, MAX_VARIANT_COUNT].min
|
|
13
|
-
@model = model ||
|
|
13
|
+
@model = model.presence || ApiConfig.default_judge_model
|
|
14
14
|
end
|
|
15
15
|
|
|
16
16
|
def call
|
|
17
|
+
raise CompletionKit::ConfigurationError, "No judging model available; set CompletionKit.config.judge_model or add a provider with a judging model" if @model.blank?
|
|
18
|
+
|
|
17
19
|
client = LlmClient.for_model(@model, ApiConfig.for_model(@model))
|
|
18
20
|
raw = client.generate_completion(build_meta_prompt, model: @model, max_tokens: 2500, temperature: DEFAULT_TEMPERATURE)
|
|
19
21
|
parse(raw).first(@count)
|
|
@@ -15,8 +15,7 @@ module CompletionKit
|
|
|
15
15
|
SAMPLE_PROMPT = {
|
|
16
16
|
name: "Sample: Support reply",
|
|
17
17
|
description: "A starter prompt. Drafts a warm, professional reply to a customer support ticket. Edit it or delete it; it's just here to get you going.",
|
|
18
|
-
template: "You are a senior customer-support specialist. Write a warm, professional reply to this ticket. Acknowledge the customer's situation, be specific about next steps, and don't be defensive.\n\nTicket:\n{{ticket}}"
|
|
19
|
-
llm_model: "gpt-4o-mini"
|
|
18
|
+
template: "You are a senior customer-support specialist. Write a warm, professional reply to this ticket. Acknowledge the customer's situation, be specific about next steps, and don't be defensive.\n\nTicket:\n{{ticket}}"
|
|
20
19
|
}.freeze
|
|
21
20
|
|
|
22
21
|
module_function
|
|
@@ -25,11 +24,15 @@ module CompletionKit
|
|
|
25
24
|
return if CompletionKit::Prompt.exists? || CompletionKit::Dataset.exists?
|
|
26
25
|
|
|
27
26
|
CompletionKit::Dataset.create!(name: "Sample: Customer tickets", csv_data: SAMPLE_CSV)
|
|
27
|
+
|
|
28
|
+
model = CompletionKit::Model.for_generation.order(:provider, :display_name).first&.model_id
|
|
29
|
+
return unless model
|
|
30
|
+
|
|
28
31
|
CompletionKit::Prompt.create!(
|
|
29
32
|
name: SAMPLE_PROMPT[:name],
|
|
30
33
|
description: SAMPLE_PROMPT[:description],
|
|
31
34
|
template: SAMPLE_PROMPT[:template],
|
|
32
|
-
llm_model:
|
|
35
|
+
llm_model: model
|
|
33
36
|
)
|
|
34
37
|
end
|
|
35
38
|
end
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
require "json"
|
|
2
|
+
|
|
3
|
+
module CompletionKit
  # Re-scores a candidate prompt template against responses of an existing run.
  #
  # For up to HELD_OUT_LIMIT eligible responses it regenerates the output with
  # the candidate template, has the judge score the new output, and compares
  # that score with the response's existing score. The result is a plain Hash
  # with string keys (see #summarize).
  class PromptImprovementValidator
    # Maximum number of responses re-generated and re-judged per validation.
    HELD_OUT_LIMIT = 30

    # Minimal object exposing #template, handed to CsvProcessor.apply_variables.
    # NOTE(review): presumably apply_variables only reads #template; confirm.
    Candidate = Struct.new(:template)

    # @param run the run whose responses, prompt, metrics and judge model are used
    # @param candidate_template [String] the rewritten prompt template to test
    # @param generator [#call, nil] optional replacement for #generate,
    #   called with (response)
    # @param judge [#call, nil] optional replacement for #judge_score,
    #   called with (response, new_text)
    def initialize(run, candidate_template, generator: nil, judge: nil)
      @run = run
      @candidate = candidate_template
      @generator = generator || method(:generate)
      @judge = judge || method(:judge_score)
    end

    # Runs the validation and returns the summary Hash.
    #
    # Best effort per response: a blank generation, a nil judge score, or any
    # StandardError raised while handling a response drops that response from
    # the rows instead of aborting the whole validation.
    def call
      rows = held_out.filter_map do |response|
        new_text = @generator.call(response)
        next if new_text.blank?

        after = @judge.call(response, new_text)
        next if after.nil?

        row_for(response, after)
      rescue StandardError
        next
      end
      # @total is set by #held_out, which has already run above.
      summarize(rows, @total.to_i, @total.to_i > HELD_OUT_LIMIT)
    end

    private

    # Responses eligible for re-scoring: non-empty response text and input
    # data, and at least one review with an AI score. Records the full count
    # in @total and returns the first HELD_OUT_LIMIT ordered by row_index.
    def held_out
      scope = @run.responses
        .where.not(response_text: [nil, ""])
        .where.not(input_data: [nil, ""])
        .where(id: Review.where.not(ai_score: nil).select(:response_id))
      @total = scope.count
      scope.order(:row_index).limit(HELD_OUT_LIMIT).to_a
    end

    # Builds the per-response comparison row (scores rounded to 2 decimals).
    def row_for(response, after)
      before = response.score
      {
        "response_id" => response.id,
        "before" => before.round(2),
        "after" => after.to_f.round(2),
        "delta" => (after.to_f - before).round(2)
      }
    end

    # Aggregates the rows into the summary stored on the suggestion.
    #
    # @param rows [Array<Hash>] rows produced by #row_for
    # @param total [Integer] number of eligible responses before the limit
    # @param capped [Boolean] whether total exceeded HELD_OUT_LIMIT
    def summarize(rows, total, capped)
      improved = rows.count { |r| r["after"] > r["before"] }
      regressed = rows.count { |r| r["after"] < r["before"] }
      {
        "total" => total,
        "tested" => rows.size,
        "capped" => capped,
        "before_avg" => avg(rows.map { |r| r["before"] }),
        "after_avg" => avg(rows.map { |r| r["after"] }),
        "improved" => improved,
        "regressed" => regressed,
        "unchanged" => rows.size - improved - regressed,
        "rows" => rows
      }
    end

    # Mean of the values rounded to 2 decimals, or nil for an empty list.
    # The sum is converted to Float first: with all-Integer values a plain
    # `sum / size` is integer division and silently truncates the mean.
    def avg(values)
      return nil if values.empty?

      (values.sum.to_f / values.size).round(2)
    end

    # Renders the candidate template with the response's input data and asks
    # the run's prompt model for a completion.
    #
    # @raise [CompletionKit::ConfigurationError] when the client is not configured
    # @raise [StandardError] when the returned text starts with "Error:"
    def generate(response)
      rendered = CsvProcessor.apply_variables(Candidate.new(@candidate), parse_input(response.input_data))
      model = @run.prompt.llm_model
      client = LlmClient.for_model(model, ApiConfig.for_model(model))
      raise CompletionKit::ConfigurationError, client.configuration_errors.join(", ") unless client.configured?

      text = client.generate_completion(rendered, model: model, temperature: @run.temperature)
      raise StandardError, text if text.to_s.start_with?("Error:")

      text
    end

    # Scores the regenerated text on every metric of the run with the run's
    # judge model and returns the mean of the scores that came back
    # (nil when none did).
    def judge_score(response, new_text)
      config = ApiConfig.for_model(@run.judge_model).merge(judge_model: @run.judge_model)
      judge = JudgeService.new(config)
      scores = @run.metrics.filter_map do |metric|
        judge.evaluate(
          new_text, response.expected_output, @candidate,
          criteria: metric.instruction.to_s,
          rubric_text: metric.display_rubric_text,
          input_data: response.input_data
        )[:score]
      end
      avg(scores)
    end

    # Parses the stored input data as JSON; unparseable input yields {}.
    def parse_input(raw)
      JSON.parse(raw)
    rescue JSON::ParserError
      {}
    end
  end
end
|
|
@@ -46,7 +46,7 @@
|
|
|
46
46
|
<span class="ck-mcp-install-card__icon">▶</span>
|
|
47
47
|
Claude Code
|
|
48
48
|
</div>
|
|
49
|
-
<%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "claude mcp add
|
|
49
|
+
<%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "claude mcp add --transport http completion-kit \\\n #{base_url}/mcp \\\n --header \"Authorization: Bearer #{token}\"" %>
|
|
50
50
|
</div>
|
|
51
51
|
<div class="ck-mcp-install-card">
|
|
52
52
|
<div class="ck-mcp-install-card__header">
|
|
@@ -199,9 +199,10 @@
|
|
|
199
199
|
</div>
|
|
200
200
|
<div class="ck-api-endpoint">
|
|
201
201
|
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">POST</span> /api/v1/datasets</p>
|
|
202
|
-
<p class="ck-meta-copy">Create a dataset.</p>
|
|
203
|
-
<p class="ck-api-params"><strong>Required:</strong> <code>name</code>, <code>csv_data</code
|
|
202
|
+
<p class="ck-meta-copy">Create a dataset from inline CSV or an uploaded CSV file.</p>
|
|
203
|
+
<p class="ck-api-params"><strong>Required:</strong> <code>name</code>, and either <code>csv_data</code> (inline CSV) or a multipart <code>file</code> (CSV upload, preferred for large datasets)</p>
|
|
204
204
|
<%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/datasets \\\n -H \"Authorization: Bearer #{token}\" \\\n -H \"Content-Type: application/json\" \\\n -d '{\"name\": \"tickets\", \"csv_data\": \"text,expected_output\\\\nHello,Hi\"}'" %>
|
|
205
|
+
<%= render "completion_kit/api_reference/example", base_url: base_url, token: token, real_token: real_token, cmd: "curl -X POST #{base_url}/api/v1/datasets \\\n -H \"Authorization: Bearer #{token}\" \\\n -F \"name=tickets\" \\\n -F \"file=@tickets.csv\"" %>
|
|
205
206
|
</div>
|
|
206
207
|
<div class="ck-api-endpoint">
|
|
207
208
|
<p class="ck-api-method"><span class="ck-chip ck-chip--soft">GET</span> <span class="ck-chip ck-chip--soft">PATCH</span> <span class="ck-chip" style="color: var(--ck-danger);">DELETE</span> /api/v1/datasets/:id</p>
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
<% s = summary %>
|
|
2
|
+
<div class="ck-scoreboard">
|
|
3
|
+
<% if s["after_avg"] && s["before_avg"] %>
|
|
4
|
+
<p class="ck-scoreboard__headline">Scored <strong><%= s["after_avg"] %></strong> across <%= pluralize(s["tested"], "held-out response") %> <span class="ck-scoreboard__was">was <%= s["before_avg"] %></span></p>
|
|
5
|
+
<ul class="ck-scoreboard__tally">
|
|
6
|
+
<li class="ck-scoreboard__stat ck-scoreboard__stat--fix">Improved <strong><%= s["improved"] %></strong></li>
|
|
7
|
+
<li class="ck-scoreboard__stat ck-scoreboard__stat--keep">Held <strong><%= s["unchanged"] %></strong></li>
|
|
8
|
+
<li class="ck-scoreboard__stat ck-scoreboard__stat--break">Regressed <strong><%= s["regressed"] %></strong></li>
|
|
9
|
+
</ul>
|
|
10
|
+
<% else %>
|
|
11
|
+
<p class="ck-scoreboard__headline">Couldn't re-score this rewrite against the run's responses.</p>
|
|
12
|
+
<% end %>
|
|
13
|
+
<% if s["capped"] %>
|
|
14
|
+
<%# The validator takes its sample with order(:row_index).limit(30), so these are the first 30 rows, not the newest. %>
<p class="ck-scoreboard__note">Tested against the first 30 of this run's <%= s["total"] %> scored responses.</p>
|
|
15
|
+
<% end %>
|
|
16
|
+
</div>
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
<div id="ck-suggestion-status-<%= suggestion.id %>" class="ck-suggestion-state">
|
|
2
|
+
<% if suggestion.pending? %>
|
|
3
|
+
<div class="ck-suggest-progress">
|
|
4
|
+
<p class="ck-kicker">Validating</p>
|
|
5
|
+
<p class="ck-meta-copy">Drafting a stronger prompt and re-scoring it against this run's responses. This page updates on its own when it finishes.</p>
|
|
6
|
+
</div>
|
|
7
|
+
<% elsif suggestion.failed? %>
|
|
8
|
+
<div class="ck-suggest-progress">
|
|
9
|
+
<p class="ck-kicker">Try again</p>
|
|
10
|
+
<p class="ck-meta-copy">We couldn't produce a validated rewrite this time. Review a few more responses, then try again.</p>
|
|
11
|
+
<%= button_to "Try again", suggest_run_path(run), method: :post, class: ck_button_classes(:light, variant: :outline), form_class: "inline-block" %>
|
|
12
|
+
</div>
|
|
13
|
+
<% else %>
|
|
14
|
+
<% if suggestion.validation_summary.present? %>
|
|
15
|
+
<%= render "completion_kit/suggestions/scoreboard", summary: suggestion.validation_summary %>
|
|
16
|
+
<% end %>
|
|
17
|
+
|
|
18
|
+
<div class="ck-suggest-reasoning">
|
|
19
|
+
<p class="ck-kicker">Why these changes</p>
|
|
20
|
+
<div class="ck-suggest-reasoning__body"><%= simple_format(suggestion.reasoning) %></div>
|
|
21
|
+
</div>
|
|
22
|
+
|
|
23
|
+
<div class="ck-suggest-diff">
|
|
24
|
+
<div class="ck-suggest-diff__pane">
|
|
25
|
+
<div class="ck-suggest-diff__header">
|
|
26
|
+
<span class="ck-suggest-diff__label ck-suggest-diff__label--before">Original prompt</span>
|
|
27
|
+
<span class="ck-suggest-diff__version"><%= suggestion.prompt.version_label %></span>
|
|
28
|
+
</div>
|
|
29
|
+
<pre class="ck-suggest-diff__code"><%= ck_word_diff_old(suggestion.original_template, suggestion.suggested_template) %></pre>
|
|
30
|
+
</div>
|
|
31
|
+
<div class="ck-suggest-diff__pane">
|
|
32
|
+
<div class="ck-suggest-diff__header">
|
|
33
|
+
<span class="ck-suggest-diff__label ck-suggest-diff__label--after">Suggested prompt</span>
|
|
34
|
+
</div>
|
|
35
|
+
<pre class="ck-suggest-diff__code"><%= ck_word_diff_new(suggestion.original_template, suggestion.suggested_template) %></pre>
|
|
36
|
+
</div>
|
|
37
|
+
</div>
|
|
38
|
+
|
|
39
|
+
<div class="ck-suggest-full">
|
|
40
|
+
<p class="ck-kicker">Full suggested prompt</p>
|
|
41
|
+
<pre class="ck-code ck-code--dark"><%= suggestion.suggested_template %></pre>
|
|
42
|
+
</div>
|
|
43
|
+
|
|
44
|
+
<div class="ck-actions">
|
|
45
|
+
<% if suggestion.applied_at? %>
|
|
46
|
+
<span class="ck-chip" style="background: var(--ck-success-soft); color: var(--ck-success);">Applied</span>
|
|
47
|
+
<% elsif !suggestion.validated? %>
|
|
48
|
+
<%= button_to "Apply anyway", apply_suggestion_path(suggestion), method: :post, class: ck_button_classes(:light, variant: :outline), form: { class: "inline-block", data: { turbo_confirm: "This rewrite couldn't be re-scored against the run's responses. Apply it anyway?" } } %>
|
|
49
|
+
<% elsif suggestion.net_negative? %>
|
|
50
|
+
<%= button_to "Apply anyway", apply_suggestion_path(suggestion), method: :post, class: ck_button_classes(:light, variant: :outline), form: { class: "inline-block", data: { turbo_confirm: "This rewrite scored lower than the original on the held-out responses. Apply it anyway?" } } %>
|
|
51
|
+
<% else %>
|
|
52
|
+
<%= button_to "Apply suggestion", apply_suggestion_path(suggestion), method: :post, class: ck_button_classes(:dark), form_class: "inline-block" %>
|
|
53
|
+
<% end %>
|
|
54
|
+
</div>
|
|
55
|
+
<% end %>
|
|
56
|
+
</div>
|
|
@@ -19,8 +19,6 @@
|
|
|
19
19
|
· <%= @run.responses.count %> responses scored
|
|
20
20
|
<% if @run.avg_score %>
|
|
21
21
|
<span class="<%= ck_badge_classes(ck_score_kind(@run.avg_score)) %>"><%= @run.avg_score %></span>
|
|
22
|
-
<% else %>
|
|
23
|
-
—
|
|
24
22
|
<% end %>
|
|
25
23
|
</p>
|
|
26
24
|
</div>
|
|
@@ -30,36 +28,8 @@
|
|
|
30
28
|
<% else %>
|
|
31
29
|
<%= link_to "Back to prompt", prompt_path(@run.prompt), class: ck_button_classes(:light, variant: :outline) %>
|
|
32
30
|
<% end %>
|
|
33
|
-
<% if @suggestion.applied_at? %>
|
|
34
|
-
<span class="ck-chip" style="background: var(--ck-success-soft); color: var(--ck-success);">Applied</span>
|
|
35
|
-
<% else %>
|
|
36
|
-
<%= button_to "Apply suggestion", apply_suggestion_path(@suggestion), method: :post, class: ck_button_classes(:dark), form_class: "inline-block" %>
|
|
37
|
-
<% end %>
|
|
38
31
|
</div>
|
|
39
32
|
</section>
|
|
40
33
|
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
<div class="ck-suggest-reasoning__body"><%= simple_format(@suggestion.reasoning) %></div>
|
|
44
|
-
</div>
|
|
45
|
-
|
|
46
|
-
<div class="ck-suggest-diff">
|
|
47
|
-
<div class="ck-suggest-diff__pane">
|
|
48
|
-
<div class="ck-suggest-diff__header">
|
|
49
|
-
<span class="ck-suggest-diff__label ck-suggest-diff__label--before">Original prompt</span>
|
|
50
|
-
<span class="ck-suggest-diff__version"><%= @suggestion.prompt.version_label %></span>
|
|
51
|
-
</div>
|
|
52
|
-
<pre class="ck-suggest-diff__code"><%= ck_word_diff_old(@suggestion.original_template, @suggestion.suggested_template) %></pre>
|
|
53
|
-
</div>
|
|
54
|
-
<div class="ck-suggest-diff__pane">
|
|
55
|
-
<div class="ck-suggest-diff__header">
|
|
56
|
-
<span class="ck-suggest-diff__label ck-suggest-diff__label--after">Suggested prompt</span>
|
|
57
|
-
</div>
|
|
58
|
-
<pre class="ck-suggest-diff__code"><%= ck_word_diff_new(@suggestion.original_template, @suggestion.suggested_template) %></pre>
|
|
59
|
-
</div>
|
|
60
|
-
</div>
|
|
61
|
-
|
|
62
|
-
<div class="ck-suggest-full">
|
|
63
|
-
<p class="ck-kicker">Full suggested prompt</p>
|
|
64
|
-
<pre class="ck-code ck-code--dark"><%= @suggestion.suggested_template %></pre>
|
|
65
|
-
</div>
|
|
34
|
+
<%= turbo_stream_from "completion_kit_suggestion_#{@suggestion.id}" %>
|
|
35
|
+
<%= render "completion_kit/suggestions/state", suggestion: @suggestion, run: @run %>
|
data/lib/completion_kit.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: completion-kit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.15.1
|
|
4
|
+
version: 0.16.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Damien Bastin
|
|
@@ -268,6 +268,7 @@ files:
|
|
|
268
268
|
- app/jobs/completion_kit/judge_review_job.rb
|
|
269
269
|
- app/jobs/completion_kit/metric_suggestion_job.rb
|
|
270
270
|
- app/jobs/completion_kit/model_discovery_job.rb
|
|
271
|
+
- app/jobs/completion_kit/prompt_suggestion_job.rb
|
|
271
272
|
- app/jobs/completion_kit/run_completion_check_job.rb
|
|
272
273
|
- app/mailers/completion_kit/application_mailer.rb
|
|
273
274
|
- app/models/completion_kit/agreement.rb
|
|
@@ -324,6 +325,7 @@ files:
|
|
|
324
325
|
- app/services/completion_kit/open_ai_client.rb
|
|
325
326
|
- app/services/completion_kit/open_router_client.rb
|
|
326
327
|
- app/services/completion_kit/prompt_improvement_service.rb
|
|
328
|
+
- app/services/completion_kit/prompt_improvement_validator.rb
|
|
327
329
|
- app/services/completion_kit/provider_endpoint.rb
|
|
328
330
|
- app/services/completion_kit/starter_metrics.rb
|
|
329
331
|
- app/services/completion_kit/worker_health.rb
|
|
@@ -396,6 +398,8 @@ files:
|
|
|
396
398
|
- app/views/completion_kit/runs/new.html.erb
|
|
397
399
|
- app/views/completion_kit/runs/show.html.erb
|
|
398
400
|
- app/views/completion_kit/shared/_settings_nav.html.erb
|
|
401
|
+
- app/views/completion_kit/suggestions/_scoreboard.html.erb
|
|
402
|
+
- app/views/completion_kit/suggestions/_state.html.erb
|
|
399
403
|
- app/views/completion_kit/suggestions/show.html.erb
|
|
400
404
|
- app/views/completion_kit/tags/_filter_bar.html.erb
|
|
401
405
|
- app/views/completion_kit/tags/_form.html.erb
|
|
@@ -444,6 +448,7 @@ files:
|
|
|
444
448
|
- db/migrate/20260531000002_backfill_review_metric_versions.rb
|
|
445
449
|
- db/migrate/20260531000003_add_metric_version_fk_to_reviews.rb
|
|
446
450
|
- db/migrate/20260531000004_rename_calibrations_to_agreements.rb
|
|
451
|
+
- db/migrate/20260611000001_add_validation_to_completion_kit_suggestions.rb
|
|
447
452
|
- lib/completion-kit.rb
|
|
448
453
|
- lib/completion_kit.rb
|
|
449
454
|
- lib/completion_kit/concurrency_check.rb
|