completion-kit 0.1.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/MIT-LICENSE +20 -0
- data/README.md +192 -0
- data/Rakefile +12 -0
- data/app/assets/config/completion_kit_manifest.js +1 -0
- data/app/assets/config/manifest.js +3 -0
- data/app/assets/images/completion_kit/logo.svg +6 -0
- data/app/assets/javascripts/completion_kit/evaluation_steps_controller.js +25 -0
- data/app/assets/stylesheets/completion_kit/application.css +2214 -0
- data/app/controllers/completion_kit/api/v1/base_controller.rb +29 -0
- data/app/controllers/completion_kit/api/v1/criteria_controller.rb +62 -0
- data/app/controllers/completion_kit/api/v1/datasets_controller.rb +51 -0
- data/app/controllers/completion_kit/api/v1/metrics_controller.rb +51 -0
- data/app/controllers/completion_kit/api/v1/prompts_controller.rb +64 -0
- data/app/controllers/completion_kit/api/v1/provider_credentials_controller.rb +51 -0
- data/app/controllers/completion_kit/api/v1/responses_controller.rb +32 -0
- data/app/controllers/completion_kit/api/v1/runs_controller.rb +71 -0
- data/app/controllers/completion_kit/api_reference_controller.rb +9 -0
- data/app/controllers/completion_kit/application_controller.rb +31 -0
- data/app/controllers/completion_kit/criteria_controller.rb +67 -0
- data/app/controllers/completion_kit/datasets_controller.rb +53 -0
- data/app/controllers/completion_kit/mcp_controller.rb +57 -0
- data/app/controllers/completion_kit/metrics_controller.rb +52 -0
- data/app/controllers/completion_kit/prompts_controller.rb +69 -0
- data/app/controllers/completion_kit/provider_credentials_controller.rb +63 -0
- data/app/controllers/completion_kit/responses_controller.rb +44 -0
- data/app/controllers/completion_kit/runs_controller.rb +131 -0
- data/app/helpers/completion_kit/application_helper.rb +193 -0
- data/app/jobs/completion_kit/application_job.rb +4 -0
- data/app/jobs/completion_kit/generate_job.rb +12 -0
- data/app/jobs/completion_kit/judge_job.rb +12 -0
- data/app/jobs/completion_kit/model_discovery_job.rb +29 -0
- data/app/mailers/completion_kit/application_mailer.rb +6 -0
- data/app/models/completion_kit/application_record.rb +5 -0
- data/app/models/completion_kit/criteria.rb +22 -0
- data/app/models/completion_kit/criteria_membership.rb +20 -0
- data/app/models/completion_kit/dataset.rb +24 -0
- data/app/models/completion_kit/metric.rb +97 -0
- data/app/models/completion_kit/model.rb +13 -0
- data/app/models/completion_kit/prompt.rb +99 -0
- data/app/models/completion_kit/provider_credential.rb +114 -0
- data/app/models/completion_kit/response.rb +30 -0
- data/app/models/completion_kit/review.rb +28 -0
- data/app/models/completion_kit/run.rb +253 -0
- data/app/models/completion_kit/run_metric.rb +6 -0
- data/app/models/completion_kit/suggestion.rb +8 -0
- data/app/services/completion_kit/anthropic_client.rb +86 -0
- data/app/services/completion_kit/api_config.rb +80 -0
- data/app/services/completion_kit/csv_processor.rb +65 -0
- data/app/services/completion_kit/judge_service.rb +87 -0
- data/app/services/completion_kit/llm_client.rb +45 -0
- data/app/services/completion_kit/mcp_dispatcher.rb +53 -0
- data/app/services/completion_kit/mcp_tools/criteria.rb +106 -0
- data/app/services/completion_kit/mcp_tools/datasets.rb +90 -0
- data/app/services/completion_kit/mcp_tools/metrics.rb +98 -0
- data/app/services/completion_kit/mcp_tools/prompts.rb +112 -0
- data/app/services/completion_kit/mcp_tools/provider_credentials.rb +97 -0
- data/app/services/completion_kit/mcp_tools/responses.rb +45 -0
- data/app/services/completion_kit/mcp_tools/runs.rb +130 -0
- data/app/services/completion_kit/model_discovery_service.rb +223 -0
- data/app/services/completion_kit/ollama_client.rb +80 -0
- data/app/services/completion_kit/open_ai_client.rb +71 -0
- data/app/services/completion_kit/open_router_client.rb +69 -0
- data/app/services/completion_kit/prompt_improvement_service.rb +81 -0
- data/app/views/completion_kit/api_reference/_example.html.erb +6 -0
- data/app/views/completion_kit/api_reference/index.html.erb +308 -0
- data/app/views/completion_kit/criteria/_form.html.erb +46 -0
- data/app/views/completion_kit/criteria/edit.html.erb +14 -0
- data/app/views/completion_kit/criteria/index.html.erb +37 -0
- data/app/views/completion_kit/criteria/new.html.erb +13 -0
- data/app/views/completion_kit/criteria/show.html.erb +37 -0
- data/app/views/completion_kit/datasets/_form.html.erb +29 -0
- data/app/views/completion_kit/datasets/edit.html.erb +13 -0
- data/app/views/completion_kit/datasets/index.html.erb +38 -0
- data/app/views/completion_kit/datasets/new.html.erb +12 -0
- data/app/views/completion_kit/datasets/show.html.erb +45 -0
- data/app/views/completion_kit/metrics/_form.html.erb +72 -0
- data/app/views/completion_kit/metrics/edit.html.erb +13 -0
- data/app/views/completion_kit/metrics/index.html.erb +34 -0
- data/app/views/completion_kit/metrics/new.html.erb +12 -0
- data/app/views/completion_kit/metrics/show.html.erb +49 -0
- data/app/views/completion_kit/prompts/_form.html.erb +52 -0
- data/app/views/completion_kit/prompts/edit.html.erb +13 -0
- data/app/views/completion_kit/prompts/index.html.erb +46 -0
- data/app/views/completion_kit/prompts/new.html.erb +12 -0
- data/app/views/completion_kit/prompts/show.html.erb +156 -0
- data/app/views/completion_kit/provider_credentials/_discovery_status.html.erb +30 -0
- data/app/views/completion_kit/provider_credentials/_form.html.erb +71 -0
- data/app/views/completion_kit/provider_credentials/edit.html.erb +12 -0
- data/app/views/completion_kit/provider_credentials/index.html.erb +41 -0
- data/app/views/completion_kit/provider_credentials/new.html.erb +12 -0
- data/app/views/completion_kit/responses/show.html.erb +87 -0
- data/app/views/completion_kit/runs/_actions.html.erb +14 -0
- data/app/views/completion_kit/runs/_form.html.erb +159 -0
- data/app/views/completion_kit/runs/_progress.html.erb +18 -0
- data/app/views/completion_kit/runs/_response_row.html.erb +13 -0
- data/app/views/completion_kit/runs/_sort_toolbar.html.erb +8 -0
- data/app/views/completion_kit/runs/_status_header.html.erb +15 -0
- data/app/views/completion_kit/runs/edit.html.erb +14 -0
- data/app/views/completion_kit/runs/index.html.erb +43 -0
- data/app/views/completion_kit/runs/new.html.erb +12 -0
- data/app/views/completion_kit/runs/show.html.erb +79 -0
- data/app/views/completion_kit/runs/suggestion.html.erb +47 -0
- data/app/views/layouts/completion_kit/application.html.erb +77 -0
- data/config/routes.rb +55 -0
- data/db/migrate/20260311000001_create_completion_kit_tables.rb +87 -0
- data/db/migrate/20260326000001_rename_criteria_to_instruction_on_metrics_and_reviews.rb +6 -0
- data/db/migrate/20260327000001_add_progress_to_runs.rb +6 -0
- data/db/migrate/20260327100001_replace_criteria_with_direct_metrics_on_runs.rb +12 -0
- data/db/migrate/20260328000001_add_error_message_to_runs.rb +5 -0
- data/db/migrate/20260329000001_create_completion_kit_models.rb +20 -0
- data/db/migrate/20260401170001_add_discovery_columns_to_completion_kit_provider_credentials.rb +7 -0
- data/db/migrate/20260403000001_add_temperature_to_completion_kit_runs.rb +5 -0
- data/db/migrate/20260403000002_create_completion_kit_suggestions.rb +13 -0
- data/db/migrate/20260403000003_add_applied_at_to_completion_kit_suggestions.rb +5 -0
- data/lib/completion-kit.rb +1 -0
- data/lib/completion_kit/engine.rb +35 -0
- data/lib/completion_kit/version.rb +3 -0
- data/lib/completion_kit.rb +55 -0
- data/lib/generators/completion_kit/install_generator.rb +21 -0
- data/lib/generators/completion_kit/templates/README +20 -0
- data/lib/generators/completion_kit/templates/initializer.rb +43 -0
- metadata +361 -0
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
module CompletionKit
  # A named scoring dimension with an instruction, optional evaluation steps
  # and a five-band star rubric. `rubric_bands` and `evaluation_steps` are
  # persisted through the JSON coder.
  class Metric < ApplicationRecord
    # Fallback description for every star level, ordered from 5 stars down to 1.
    DEFAULT_RUBRIC_BANDS = [
      { "stars" => 5, "description" => "Fully meets or exceeds all criteria. No meaningful issues." },
      { "stars" => 4, "description" => "Meets criteria well. Minor issues only." },
      { "stars" => 3, "description" => "Meets criteria adequately. Some room for improvement." },
      { "stars" => 2, "description" => "Partially meets criteria. Significant gaps or frequent errors." },
      { "stars" => 1, "description" => "Fails to meet the criteria. Major errors or completely off-target." }
    ].freeze

    has_many :criteria_memberships, dependent: :destroy
    has_many :criterias, through: :criteria_memberships, source: :criteria
    has_many :reviews, dependent: :nullify

    serialize :rubric_bands, coder: JSON
    serialize :evaluation_steps, coder: JSON

    validates :name, presence: true
    validates :key, uniqueness: true, allow_nil: true

    before_validation :generate_key
    before_validation :normalize_rubric_bands
    before_validation :set_defaults

    # @return [Array<Hash>] a fresh copy of DEFAULT_RUBRIC_BANDS whose hashes
    #   can be mutated without touching the frozen constant
    def self.default_rubric_bands
      DEFAULT_RUBRIC_BANDS.map(&:dup)
    end

    # @return [String] the default rubric rendered by {rubric_text_for}
    def self.default_rubric_text
      rubric_text_for(default_rubric_bands)
    end

    # Renders bands as "<n> stars: <description>" paragraphs, highest star
    # count first, separated by blank lines.
    #
    # @param bands [Array<Hash>, nil] hashes with "stars" and "description" keys
    # @return [String]
    def self.rubric_text_for(bands)
      # Coerce with to_i before negating: unary minus on a String returns the
      # string itself, so string star values used to sort in ascending order.
      ordered = Array(bands).sort_by { |band| -band["stars"].to_i }

      paragraphs = ordered.map do |band|
        stars = band["stars"].to_i
        label = stars == 1 ? "1 star" : "#{stars} stars"
        "#{label}: #{band["description"]}"
      end

      paragraphs.join("\n\n")
    end

    # Coerces arbitrary band input into exactly five bands (5 down to 1 stars).
    # Entries whose star value is outside 1..5 are ignored, later entries for
    # the same star count win, and blank descriptions fall back to the default
    # text for that level.
    #
    # @param raw_bands [Hash, Array, nil] a Hash contributes its values; anything
    #   else is wrapped with Array()
    # @return [Array<Hash>] hashes with "stars" (Integer) and "description" (String)
    def self.normalize_rubric_bands(raw_bands)
      candidates = raw_bands.is_a?(Hash) ? raw_bands.values : Array(raw_bands)

      descriptions_by_stars = candidates.each_with_object({}) do |candidate, found|
        next unless candidate.respond_to?(:to_h)

        attrs = candidate.to_h.stringify_keys
        stars = attrs["stars"].to_i
        next unless (1..5).cover?(stars)

        found[stars] = attrs["description"].to_s.strip
      end

      default_rubric_bands.map do |fallback|
        stars = fallback["stars"]
        {
          "stars" => stars,
          "description" => descriptions_by_stars[stars].presence || fallback["description"]
        }
      end
    end

    # @return [Array<Hash>] the stored bands normalized to the full five-band shape
    def rubric_bands_for_form
      self.class.normalize_rubric_bands(rubric_bands)
    end

    # @return [String] the normalized rubric rendered as text
    def display_rubric_text
      self.class.rubric_text_for(rubric_bands_for_form)
    end

    # Fixed JSON shape for this record; `options` is accepted for interface
    # compatibility but not consulted.
    def as_json(options = {})
      {
        id: id, name: name, key: key, instruction: instruction,
        evaluation_steps: evaluation_steps, rubric_bands: rubric_bands,
        created_at: created_at, updated_at: updated_at
      }
    end

    private

    # Derives the key from the name when no key was supplied. A blank string
    # counts as "not supplied": the uniqueness validation only exempts nil, so
    # keeping "" would make every key-less record collide with the others.
    def generate_key
      return if key.present? || name.blank?

      self.key = name.parameterize.presence
    end

    # Fills in an empty step list and the default rubric when none was given.
    def set_defaults
      self.evaluation_steps ||= []
      self.rubric_bands = self.class.default_rubric_bands if rubric_bands.blank?
    end

    # Normalizes user-supplied bands; blank input is left for set_defaults.
    def normalize_rubric_bands
      self.rubric_bands = self.class.normalize_rubric_bands(rubric_bands) if rubric_bands.present?
    end
  end
end
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
module CompletionKit
  # A model offered by a provider, identified by the (provider, model_id) pair
  # and tagged with a lifecycle status.
  class Model < ApplicationRecord
    # Every value the status column may hold.
    STATUSES = %w[active retired failed].freeze

    validates :provider, :model_id, presence: true
    validates :model_id, uniqueness: { scope: :provider }
    validates :status, presence: true, inclusion: { in: STATUSES }

    # Models whose status is "active".
    scope :active, -> { where(status: "active") }

    # Active models flagged with supports_generation.
    scope :for_generation, -> { active.where(supports_generation: true) }

    # Active models flagged with supports_judging.
    scope :for_judging, -> { active.where(supports_judging: true) }
  end
end
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
module CompletionKit
  # A versioned prompt template. Versions of the same prompt share a
  # family_key; `current` marks the version that lookups resolve to.
  class Prompt < ApplicationRecord
    has_many :runs, dependent: :destroy
    has_many :responses, through: :runs

    validates :name, :template, :llm_model, :family_key, presence: true
    validates :version_number, presence: true, numericality: { only_integer: true, greater_than: 0 }

    before_validation :assign_family_key, on: :create
    before_validation :assign_version_number, on: :create
    before_validation :set_defaults

    # Versions flagged current, newest first.
    scope :current_versions, -> { where(current: true).order(created_at: :desc) }

    # Thin pass-through to ApiConfig.available_models.
    def self.available_models(provider: nil)
      ApiConfig.available_models(provider: provider)
    end

    # Resolves a current prompt by family key, then by exact name, then by
    # slug (the slug comparison is done in Ruby over the loaded current versions).
    #
    # @raise [ActiveRecord::RecordNotFound] when nothing matches
    def self.current_for(identifier)
      candidates = current_versions

      match = candidates.find_by(family_key: identifier)
      match ||= candidates.find_by(name: identifier)
      match ||= candidates.find { |prompt| prompt.slug == identifier.to_s }
      match || raise(ActiveRecord::RecordNotFound)
    end

    # @return [String] the name lower-cased, with runs of characters outside
    #   a-z/0-9 collapsed to "-" and a leading/trailing "-" removed
    def slug
      dashed = name.to_s.downcase.strip.gsub(/[^a-z0-9]+/, "-")
      dashed.gsub(/\A-|-\z/, "")
    end

    # Delegates variable extraction to CsvProcessor.
    def variables
      CsvProcessor.extract_variables(self)
    end

    # @return [String] e.g. "v3"
    def version_label
      "v#{version_number}"
    end

    # @return [String] the name followed by the version label
    def display_name
      "#{name} — #{version_label}"
    end

    # All versions in this prompt's family, highest version number first.
    def family_versions
      self.class.where(family_key: family_key).order(version_number: :desc, created_at: :desc)
    end

    # Creates a non-current, unpublished copy in the same family with the next
    # version number. Non-nil entries in `overrides` replace the copied values.
    #
    # @param overrides [Hash]
    # @return [Prompt] the newly created version
    def clone_as_new_version(overrides = {})
      copied = {
        name: name,
        description: description,
        template: template,
        llm_model: llm_model,
        family_key: family_key,
        version_number: next_version_number,
        current: false,
        published_at: nil
      }

      self.class.create!(copied.merge(overrides.compact))
    end

    # Makes this version the family's only current one, inside a transaction.
    def publish!
      transaction do
        siblings = self.class.where(family_key: family_key).where.not(id: id)
        siblings.update_all(current: false)
        reload
        update!(current: true, published_at: Time.current)
      end
    end

    # Fixed JSON shape for this record; `options` is accepted but not consulted.
    def as_json(options = {})
      {
        id: id, name: name, description: description, template: template,
        llm_model: llm_model, family_key: family_key, version_number: version_number,
        current: current, created_at: created_at, updated_at: updated_at
      }
    end

    private

    # New prompts without a family start a fresh one.
    def assign_family_key
      self.family_key ||= SecureRandom.uuid
    end

    def assign_version_number
      self.version_number ||= next_version_number
    end

    # One more than the highest version stored for the family (1 when none).
    def next_version_number
      highest = self.class.where(family_key: family_key).maximum(:version_number)
      highest.to_i + 1
    end

    # Prompts are current unless told otherwise; current prompts get a
    # published_at timestamp if they lack one.
    def set_defaults
      self.current = true if current.nil?
      return unless current?

      self.published_at ||= Time.current
    end
  end
end
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
module CompletionKit
  # Stored credentials for one LLM provider. Committing a save starts model
  # discovery, and discovery progress is pushed to the UI over Turbo Streams.
  class ProviderCredential < ApplicationRecord
    include Turbo::Broadcastable

    PROVIDERS = %w[openai anthropic ollama openrouter].freeze
    PROVIDER_LABELS = {
      "openai" => "OpenAI",
      "anthropic" => "Anthropic",
      "ollama" => "Ollama / local endpoint",
      "openrouter" => "OpenRouter"
    }.freeze

    encrypts :api_key

    validates :provider, presence: true, inclusion: { in: PROVIDERS }, uniqueness: true

    # Enqueue only once the row is committed: a job enqueued from after_save
    # can be picked up by a worker before the surrounding transaction commits,
    # at which point the record (or its new values) is not yet visible.
    after_commit :enqueue_discovery, on: %i[create update]

    # Fixed JSON shape; the api_key is deliberately not part of it.
    # `options` is accepted but not consulted.
    def as_json(options = {})
      {
        id: id, provider: provider, api_endpoint: api_endpoint,
        created_at: created_at, updated_at: updated_at
      }
    end

    # @return [String] the human label for the provider, falling back to the
    #   titleized provider value (empty string when provider is nil)
    def display_provider
      PROVIDER_LABELS.fetch(provider) { provider.to_s.titleize }
    end

    # @return [Hash] provider, api_key and api_endpoint with nil entries removed
    def config_hash
      {
        provider: provider,
        api_key: api_key,
        api_endpoint: api_endpoint
      }.compact
    end

    # Best-effort model listing from the provider client; any error yields [].
    def available_models
      LlmClient.for_provider(provider, config_hash).available_models
    rescue StandardError
      []
    end

    # Best-effort configuration check; any error yields false.
    def configured?
      LlmClient.for_provider(provider, config_hash).configured?
    rescue StandardError
      false
    end

    # @return [Integer] current prompts whose llm_model belongs to this provider
    def prompt_count
      model_ids = provider_model_ids
      return 0 if model_ids.empty?

      Prompt.where(llm_model: model_ids, current: true).count
    end

    # @return [Integer] runs whose judge_model belongs to this provider
    def judge_count
      model_ids = provider_model_ids
      return 0 if model_ids.empty?

      Run.where(judge_model: model_ids).count
    end

    # @return [Time, nil] created_at of the newest non-pending run that used
    #   one of this provider's models for generation or judging
    def last_used_at
      model_ids = provider_model_ids
      return nil if model_ids.empty?

      prompt_scope = Prompt.where(llm_model: model_ids).select(:id)
      Run.where("prompt_id IN (:prompts) OR judge_model IN (:models)",
                prompts: prompt_scope, models: model_ids)
         .where.not(status: "pending")
         .maximum(:created_at)
    end

    # Re-renders the discovery status partial for subscribers of this credential.
    def broadcast_discovery_progress
      broadcast_replace_to(
        discovery_stream,
        target: "discovery_status_#{id}",
        html: render_partial("completion_kit/provider_credentials/discovery_status", provider_credential: self)
      )
    end

    # Final discovery broadcast: status partial plus refreshed model dropdowns.
    def broadcast_discovery_complete
      broadcast_discovery_progress
      broadcast_model_dropdowns
    end

    private

    # model_id values of every Model row recorded for this provider.
    def provider_model_ids
      Model.where(provider: provider).pluck(:model_id)
    end

    # Name of the Turbo stream used for all broadcasts of this credential.
    def discovery_stream
      "completion_kit_provider_#{id}"
    end

    # Resets the discovery counters and hands the work to ModelDiscoveryJob.
    # update_columns is used so the reset does not trigger callbacks again.
    def enqueue_discovery
      update_columns(discovery_status: "discovering", discovery_current: 0, discovery_total: 0)
      ModelDiscoveryJob.perform_later(id)
    end

    # Replaces the generation and judge <select> elements with fresh options.
    def broadcast_model_dropdowns
      helper = CompletionKit::ApplicationController.helpers
      gen_html = helper.ck_model_options_html(:generation)
      judge_html = '<option value="">None</option>' + helper.ck_model_options_html(:judging)

      broadcast_select("prompt_llm_model", "prompt[llm_model]", gen_html)
      broadcast_select("run_judge_model", "run[judge_model]", judge_html)
    end

    # Broadcasts a replacement <select> whose DOM id doubles as the stream target.
    def broadcast_select(dom_id, field_name, options_html)
      Turbo::StreamsChannel.broadcast_action_to(
        discovery_stream,
        action: :replace,
        target: dom_id,
        html: "<select name=\"#{field_name}\" id=\"#{dom_id}\" class=\"ck-input\">#{options_html}</select>"
      )
    end

    def render_partial(partial, locals)
      CompletionKit::ApplicationController.render(partial: partial, locals: locals)
    end
  end
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
module CompletionKit
  # One generated output of a run, together with the reviews scoring it.
  class Response < ApplicationRecord
    belongs_to :run
    has_many :reviews, dependent: :destroy

    validates :response_text, presence: true

    delegate :prompt, to: :run

    # Fixed JSON shape including the derived score, the reviewed flag and the
    # serialized reviews. `options` is accepted but not consulted.
    def as_json(options = {})
      {
        id: id, run_id: run_id, input_data: input_data,
        response_text: response_text, expected_output: expected_output,
        created_at: created_at, score: score, reviewed: reviewed?,
        reviews: reviews.map(&:as_json)
      }
    end

    # @return [Float, nil] mean ai_score over reviews that have one, rounded
    #   to two decimals; nil when no review carries a score
    def score
      values = reviews.map(&:ai_score).select(&:present?).map(&:to_f)
      values.empty? ? nil : (values.sum / values.length).round(2)
    end

    # @return [Boolean] whether at least one review carries an ai_score
    def reviewed?
      reviews.map(&:ai_score).any?(&:present?)
    end
  end
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
module CompletionKit
  # The judge's verdict on one response for one metric. The metric link is
  # optional; metric_name is always required.
  class Review < ApplicationRecord
    # Every value the status column may hold.
    STATUSES = %w[pending evaluated failed].freeze

    belongs_to :response
    belongs_to :metric, optional: true

    before_validation :set_default_status

    validates :metric_name, presence: true
    validates :status, inclusion: { in: STATUSES }
    validates :ai_score, numericality: { greater_than_or_equal_to: 1, less_than_or_equal_to: 5 }, allow_nil: true

    # Fixed JSON shape for this record; `options` is accepted but not consulted.
    def as_json(options = {})
      {
        id: id, response_id: response_id, metric_id: metric_id,
        metric_name: metric_name, ai_score: ai_score,
        ai_feedback: ai_feedback, status: status
      }
    end

    private

    # Reviews without an explicit status start out as "pending".
    def set_default_status
      self.status = "pending" unless status
    end
  end
end
|
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
module CompletionKit
  # One execution of a prompt, optionally over a dataset and optionally
  # followed by judging against the run's metrics. Progress and results are
  # streamed to the UI over the "completion_kit_run_<id>" Turbo stream.
  class Run < ApplicationRecord
    include Turbo::Broadcastable

    STATUSES = %w[pending generating judging completed failed].freeze

    belongs_to :prompt
    belongs_to :dataset, optional: true
    has_many :responses, dependent: :destroy
    has_many :run_metrics, -> { order(:position) }, dependent: :destroy
    has_many :metrics, through: :run_metrics
    has_many :suggestions, dependent: :destroy

    validates :name, presence: true
    validates :status, inclusion: { in: STATUSES }

    before_validation :set_default_status, on: :create
    before_validation :set_auto_name, on: :create

    # @return [Boolean] true when a judge model is set, at least one metric is
    #   attached and ApiConfig accepts the judge model
    def judge_configured?
      judge_model.present? && metrics.any? && ApiConfig.valid_for_model?(judge_model)
    end

    # @return [Float, nil] mean ai_score over every review of every response,
    #   rounded to two decimals; nil when nothing has been scored
    def avg_score
      scores = responses.flat_map(&:reviews).map(&:ai_score).compact.map(&:to_f)
      return nil if scores.empty?

      (scores.sum / scores.length).round(2)
    end

    # @return [Array<Hash>] one `{ name:, avg: }` entry per metric name, with
    #   the mean ai_score rounded to one decimal
    def metric_averages
      scored = responses.flat_map(&:reviews).select { |review| review.ai_score.present? }

      scored.group_by(&:metric_name).map do |name, reviews|
        scores = reviews.map { |review| review.ai_score.to_f }
        { name: name, avg: (scores.sum / scores.length).round(1) }
      end
    end

    # Generates one response per dataset row (or a single response when the
    # run has no dataset), replacing any existing responses, then judges them
    # when a judge is configured.
    #
    # Failures never raise: the message is added to `errors`, a persisted run
    # is marked "failed" with that message, and false is returned.
    #
    # @return [Boolean] true when generation (and judging, if it ran) succeeded
    def generate_responses!
      rows = dataset ? CsvProcessor.process_self(self) : [{}]
      return fail_run("Dataset has no rows", broadcast: false) if rows.empty?

      client = LlmClient.for_model(prompt.llm_model, ApiConfig.for_model(prompt.llm_model))
      unless client.configured?
        return fail_run("LLM API not configured: #{client.configuration_errors.join(', ')}", broadcast: false)
      end

      update!(status: "generating", progress_current: 0, progress_total: rows.length, error_message: nil)
      responses.destroy_all
      broadcast_ui
      broadcast_clear_responses

      rows.each_with_index do |row, index|
        input = row.empty? ? nil : row.to_json
        rendered = CsvProcessor.apply_variables(prompt, row)
        completion = client.generate_completion(rendered, model: prompt.llm_model, temperature: temperature)

        created = responses.create!(
          input_data: input,
          response_text: completion,
          expected_output: row["expected_output"]
        )

        update_columns(progress_current: index + 1)
        broadcast_progress
        broadcast_response(created)
      end

      # Propagate the judging outcome so a failed judging pass is not
      # reported to the caller as success.
      return judge_responses! if judge_configured?

      update!(status: "completed")
      broadcast_ui
      true
    rescue StandardError => e
      fail_run(e.message)
    end

    # Scores every response against every metric with the judge model,
    # creating or updating one review per (response, metric) pair.
    #
    # Failures never raise; they are recorded the same way as in
    # #generate_responses!.
    #
    # @return [Boolean] true when every evaluation was stored
    def judge_responses!
      total_evaluations = responses.count * metrics.count
      update!(status: "judging", progress_current: 0, progress_total: total_evaluations, error_message: nil)
      broadcast_ui

      judge = JudgeService.new(ApiConfig.for_model(judge_model).merge(judge_model: judge_model))
      completed = 0

      responses.find_each do |response|
        metrics.each do |metric|
          record_review(response, metric, evaluate_response(judge, response, metric))

          completed += 1
          update_columns(progress_current: completed)
          broadcast_progress
        end

        broadcast_response_update(response)
      end

      update!(status: "completed")
      broadcast_ui
      true
    rescue StandardError => e
      fail_run(e.message)
    end

    # Fixed JSON shape including derived counts and the average score.
    # `options` is accepted but not consulted.
    def as_json(options = {})
      {
        id: id, name: name, status: status, prompt_id: prompt_id,
        dataset_id: dataset_id, judge_model: judge_model, temperature: temperature,
        created_at: created_at, updated_at: updated_at,
        responses_count: responses.count, avg_score: avg_score,
        progress_current: progress_current, progress_total: progress_total,
        error_message: error_message, metric_ids: metric_ids
      }
    end

    private

    # Records a failure: adds the message to `errors` and, for persisted runs,
    # stores status "failed" plus the message and optionally refreshes the UI.
    # Always returns false so callers can `return fail_run(...)`.
    def fail_run(message, broadcast: true)
      errors.add(:base, message)

      if persisted?
        update_columns(status: "failed", error_message: message)
        broadcast_ui if broadcast
      end

      false
    end

    # The metric's instruction text, or "" when the object has no instruction.
    def instruction_for(metric)
      metric.respond_to?(:instruction) ? metric.instruction.to_s : ""
    end

    # Asks the judge to evaluate one response against one metric.
    def evaluate_response(judge, response, metric)
      judge.evaluate(
        response.response_text,
        response.expected_output,
        prompt.template,
        criteria: instruction_for(metric),
        evaluation_steps: metric.respond_to?(:evaluation_steps) ? metric.evaluation_steps : nil,
        rubric_text: metric.respond_to?(:display_rubric_text) ? metric.display_rubric_text : nil,
        input_data: response.input_data
      )
    end

    # Stores the evaluation on the review for this (response, metric) pair,
    # creating the review when it does not exist yet.
    def record_review(response, metric, evaluation)
      review = response.reviews.find_or_initialize_by(metric_id: metric.id)
      review.assign_attributes(
        metric_name: metric.name,
        instruction: instruction_for(metric),
        status: "evaluated",
        ai_score: evaluation[:score],
        ai_feedback: evaluation[:feedback]
      )
      review.save!
    end

    # Name of the Turbo stream used for every broadcast of this run.
    def run_stream
      "completion_kit_run_#{id}"
    end

    # Refreshes every live region of the run page. The status header is
    # covered by broadcast_progress, which re-broadcasts it itself.
    def broadcast_ui
      broadcast_progress
      broadcast_actions
      broadcast_sort_toolbar
    end

    def render_engine_partial(partial, locals)
      CompletionKit::ApplicationController.render(partial: partial, locals: locals)
    end

    # Replaces `target` on the run stream with the given partial rendered for this run.
    def broadcast_run_partial(target, partial)
      broadcast_replace_to(run_stream, target: target, html: render_engine_partial(partial, run: self))
    end

    # Reloads first so the partials reflect values written via update_columns.
    def broadcast_progress
      reload
      broadcast_run_partial("run_progress", "completion_kit/runs/progress")
      broadcast_status_header
    end

    def broadcast_status_header
      broadcast_run_partial("run_status_header", "completion_kit/runs/status_header")
    end

    def broadcast_actions
      broadcast_run_partial("run_actions", "completion_kit/runs/actions")
    end

    def broadcast_sort_toolbar
      broadcast_run_partial("run_sort_toolbar", "completion_kit/runs/sort_toolbar")
    end

    # Empties the responses container on the page.
    def broadcast_clear_responses
      broadcast_replace_to(
        run_stream,
        target: "run_responses",
        html: '<div id="run_responses"></div>'
      )
    end

    # Renders the row partial for a response; its index is the number of this
    # run's responses with an id up to and including the response's own.
    def response_row_html(response)
      position = responses.where("id <= ?", response.id).count
      render_engine_partial("completion_kit/runs/response_row", run: self, response: response, index: position)
    end

    def broadcast_response(response)
      broadcast_append_to(run_stream, target: "run_responses", html: response_row_html(response))
    end

    def broadcast_response_update(response)
      broadcast_replace_to(run_stream, target: "response_#{response.id}", html: response_row_html(response))
    end

    def set_default_status
      self.status ||= "pending"
    end

    # Names an unnamed run "<prompt name> — v<version> #<n>", where n is one
    # more than the number of runs already stored for the prompt.
    def set_auto_name
      return if name.present?
      return unless prompt.present?

      count = Run.where(prompt_id: prompt_id).count + 1
      self.name = "#{prompt.name} — v#{prompt.version_number} ##{count}"
    end
  end
end
|