completion-kit 0.1.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123) hide show
  1. checksums.yaml +7 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README.md +192 -0
  4. data/Rakefile +12 -0
  5. data/app/assets/config/completion_kit_manifest.js +1 -0
  6. data/app/assets/config/manifest.js +3 -0
  7. data/app/assets/images/completion_kit/logo.svg +6 -0
  8. data/app/assets/javascripts/completion_kit/evaluation_steps_controller.js +25 -0
  9. data/app/assets/stylesheets/completion_kit/application.css +2214 -0
  10. data/app/controllers/completion_kit/api/v1/base_controller.rb +29 -0
  11. data/app/controllers/completion_kit/api/v1/criteria_controller.rb +62 -0
  12. data/app/controllers/completion_kit/api/v1/datasets_controller.rb +51 -0
  13. data/app/controllers/completion_kit/api/v1/metrics_controller.rb +51 -0
  14. data/app/controllers/completion_kit/api/v1/prompts_controller.rb +64 -0
  15. data/app/controllers/completion_kit/api/v1/provider_credentials_controller.rb +51 -0
  16. data/app/controllers/completion_kit/api/v1/responses_controller.rb +32 -0
  17. data/app/controllers/completion_kit/api/v1/runs_controller.rb +71 -0
  18. data/app/controllers/completion_kit/api_reference_controller.rb +9 -0
  19. data/app/controllers/completion_kit/application_controller.rb +31 -0
  20. data/app/controllers/completion_kit/criteria_controller.rb +67 -0
  21. data/app/controllers/completion_kit/datasets_controller.rb +53 -0
  22. data/app/controllers/completion_kit/mcp_controller.rb +57 -0
  23. data/app/controllers/completion_kit/metrics_controller.rb +52 -0
  24. data/app/controllers/completion_kit/prompts_controller.rb +69 -0
  25. data/app/controllers/completion_kit/provider_credentials_controller.rb +63 -0
  26. data/app/controllers/completion_kit/responses_controller.rb +44 -0
  27. data/app/controllers/completion_kit/runs_controller.rb +131 -0
  28. data/app/helpers/completion_kit/application_helper.rb +193 -0
  29. data/app/jobs/completion_kit/application_job.rb +4 -0
  30. data/app/jobs/completion_kit/generate_job.rb +12 -0
  31. data/app/jobs/completion_kit/judge_job.rb +12 -0
  32. data/app/jobs/completion_kit/model_discovery_job.rb +29 -0
  33. data/app/mailers/completion_kit/application_mailer.rb +6 -0
  34. data/app/models/completion_kit/application_record.rb +5 -0
  35. data/app/models/completion_kit/criteria.rb +22 -0
  36. data/app/models/completion_kit/criteria_membership.rb +20 -0
  37. data/app/models/completion_kit/dataset.rb +24 -0
  38. data/app/models/completion_kit/metric.rb +97 -0
  39. data/app/models/completion_kit/model.rb +13 -0
  40. data/app/models/completion_kit/prompt.rb +99 -0
  41. data/app/models/completion_kit/provider_credential.rb +114 -0
  42. data/app/models/completion_kit/response.rb +30 -0
  43. data/app/models/completion_kit/review.rb +28 -0
  44. data/app/models/completion_kit/run.rb +253 -0
  45. data/app/models/completion_kit/run_metric.rb +6 -0
  46. data/app/models/completion_kit/suggestion.rb +8 -0
  47. data/app/services/completion_kit/anthropic_client.rb +86 -0
  48. data/app/services/completion_kit/api_config.rb +80 -0
  49. data/app/services/completion_kit/csv_processor.rb +65 -0
  50. data/app/services/completion_kit/judge_service.rb +87 -0
  51. data/app/services/completion_kit/llm_client.rb +45 -0
  52. data/app/services/completion_kit/mcp_dispatcher.rb +53 -0
  53. data/app/services/completion_kit/mcp_tools/criteria.rb +106 -0
  54. data/app/services/completion_kit/mcp_tools/datasets.rb +90 -0
  55. data/app/services/completion_kit/mcp_tools/metrics.rb +98 -0
  56. data/app/services/completion_kit/mcp_tools/prompts.rb +112 -0
  57. data/app/services/completion_kit/mcp_tools/provider_credentials.rb +97 -0
  58. data/app/services/completion_kit/mcp_tools/responses.rb +45 -0
  59. data/app/services/completion_kit/mcp_tools/runs.rb +130 -0
  60. data/app/services/completion_kit/model_discovery_service.rb +223 -0
  61. data/app/services/completion_kit/ollama_client.rb +80 -0
  62. data/app/services/completion_kit/open_ai_client.rb +71 -0
  63. data/app/services/completion_kit/open_router_client.rb +69 -0
  64. data/app/services/completion_kit/prompt_improvement_service.rb +81 -0
  65. data/app/views/completion_kit/api_reference/_example.html.erb +6 -0
  66. data/app/views/completion_kit/api_reference/index.html.erb +308 -0
  67. data/app/views/completion_kit/criteria/_form.html.erb +46 -0
  68. data/app/views/completion_kit/criteria/edit.html.erb +14 -0
  69. data/app/views/completion_kit/criteria/index.html.erb +37 -0
  70. data/app/views/completion_kit/criteria/new.html.erb +13 -0
  71. data/app/views/completion_kit/criteria/show.html.erb +37 -0
  72. data/app/views/completion_kit/datasets/_form.html.erb +29 -0
  73. data/app/views/completion_kit/datasets/edit.html.erb +13 -0
  74. data/app/views/completion_kit/datasets/index.html.erb +38 -0
  75. data/app/views/completion_kit/datasets/new.html.erb +12 -0
  76. data/app/views/completion_kit/datasets/show.html.erb +45 -0
  77. data/app/views/completion_kit/metrics/_form.html.erb +72 -0
  78. data/app/views/completion_kit/metrics/edit.html.erb +13 -0
  79. data/app/views/completion_kit/metrics/index.html.erb +34 -0
  80. data/app/views/completion_kit/metrics/new.html.erb +12 -0
  81. data/app/views/completion_kit/metrics/show.html.erb +49 -0
  82. data/app/views/completion_kit/prompts/_form.html.erb +52 -0
  83. data/app/views/completion_kit/prompts/edit.html.erb +13 -0
  84. data/app/views/completion_kit/prompts/index.html.erb +46 -0
  85. data/app/views/completion_kit/prompts/new.html.erb +12 -0
  86. data/app/views/completion_kit/prompts/show.html.erb +156 -0
  87. data/app/views/completion_kit/provider_credentials/_discovery_status.html.erb +30 -0
  88. data/app/views/completion_kit/provider_credentials/_form.html.erb +71 -0
  89. data/app/views/completion_kit/provider_credentials/edit.html.erb +12 -0
  90. data/app/views/completion_kit/provider_credentials/index.html.erb +41 -0
  91. data/app/views/completion_kit/provider_credentials/new.html.erb +12 -0
  92. data/app/views/completion_kit/responses/show.html.erb +87 -0
  93. data/app/views/completion_kit/runs/_actions.html.erb +14 -0
  94. data/app/views/completion_kit/runs/_form.html.erb +159 -0
  95. data/app/views/completion_kit/runs/_progress.html.erb +18 -0
  96. data/app/views/completion_kit/runs/_response_row.html.erb +13 -0
  97. data/app/views/completion_kit/runs/_sort_toolbar.html.erb +8 -0
  98. data/app/views/completion_kit/runs/_status_header.html.erb +15 -0
  99. data/app/views/completion_kit/runs/edit.html.erb +14 -0
  100. data/app/views/completion_kit/runs/index.html.erb +43 -0
  101. data/app/views/completion_kit/runs/new.html.erb +12 -0
  102. data/app/views/completion_kit/runs/show.html.erb +79 -0
  103. data/app/views/completion_kit/runs/suggestion.html.erb +47 -0
  104. data/app/views/layouts/completion_kit/application.html.erb +77 -0
  105. data/config/routes.rb +55 -0
  106. data/db/migrate/20260311000001_create_completion_kit_tables.rb +87 -0
  107. data/db/migrate/20260326000001_rename_criteria_to_instruction_on_metrics_and_reviews.rb +6 -0
  108. data/db/migrate/20260327000001_add_progress_to_runs.rb +6 -0
  109. data/db/migrate/20260327100001_replace_criteria_with_direct_metrics_on_runs.rb +12 -0
  110. data/db/migrate/20260328000001_add_error_message_to_runs.rb +5 -0
  111. data/db/migrate/20260329000001_create_completion_kit_models.rb +20 -0
  112. data/db/migrate/20260401170001_add_discovery_columns_to_completion_kit_provider_credentials.rb +7 -0
  113. data/db/migrate/20260403000001_add_temperature_to_completion_kit_runs.rb +5 -0
  114. data/db/migrate/20260403000002_create_completion_kit_suggestions.rb +13 -0
  115. data/db/migrate/20260403000003_add_applied_at_to_completion_kit_suggestions.rb +5 -0
  116. data/lib/completion-kit.rb +1 -0
  117. data/lib/completion_kit/engine.rb +35 -0
  118. data/lib/completion_kit/version.rb +3 -0
  119. data/lib/completion_kit.rb +55 -0
  120. data/lib/generators/completion_kit/install_generator.rb +21 -0
  121. data/lib/generators/completion_kit/templates/README +20 -0
  122. data/lib/generators/completion_kit/templates/initializer.rb +43 -0
  123. metadata +361 -0
@@ -0,0 +1,97 @@
module CompletionKit
  # A named scoring dimension: an instruction, optional evaluation steps and a
  # five-band (1-5 stars) rubric. `rubric_bands` and `evaluation_steps` are
  # serialized with the JSON coder.
  class Metric < ApplicationRecord
    # Rubric used when a metric has no bands of its own; also supplies the
    # description for any star level whose custom description is blank.
    DEFAULT_RUBRIC_BANDS = [
      { "stars" => 5, "description" => "Fully meets or exceeds all criteria. No meaningful issues." },
      { "stars" => 4, "description" => "Meets criteria well. Minor issues only." },
      { "stars" => 3, "description" => "Meets criteria adequately. Some room for improvement." },
      { "stars" => 2, "description" => "Partially meets criteria. Significant gaps or frequent errors." },
      { "stars" => 1, "description" => "Fails to meet the criteria. Major errors or completely off-target." }
    ].freeze

    has_many :criteria_memberships, dependent: :destroy
    has_many :criterias, through: :criteria_memberships, source: :criteria
    has_many :reviews, dependent: :nullify

    serialize :rubric_bands, coder: JSON
    serialize :evaluation_steps, coder: JSON

    validates :name, presence: true
    validates :key, uniqueness: true, allow_nil: true

    # Order matters: bands are normalized first, then defaults fill whatever is
    # still blank.
    before_validation :generate_key
    before_validation :normalize_rubric_bands
    before_validation :set_defaults

    # @return [Array<Hash>] a fresh copy of the default bands; each hash is
    #   duplicated so callers cannot mutate the frozen constant's elements.
    def self.default_rubric_bands
      DEFAULT_RUBRIC_BANDS.map(&:dup)
    end

    # @return [String] the default rubric rendered as text
    def self.default_rubric_text
      rubric_text_for(default_rubric_bands)
    end

    # Renders bands as "N stars: description" paragraphs, highest rating first,
    # separated by blank lines.
    #
    # @param bands [Array<Hash>, nil] hashes with "stars" and "description" keys
    # @return [String]
    def self.rubric_text_for(bands)
      # Sort on the numeric value: "stars" may arrive as a String, and unary
      # minus on a String (String#-@) does not negate it, which previously
      # produced ascending order (or a comparison error for mixed types).
      ordered = Array(bands).sort_by { |band| -band["stars"].to_f }

      ordered.map do |band|
        stars = band["stars"].to_i
        label = stars == 1 ? "1 star" : "#{stars} stars"
        "#{label}: #{band["description"]}"
      end.join("\n\n")
    end

    # Coerces arbitrary band input into exactly five bands (5 down to 1 stars).
    # Entries whose "stars" is outside 1..5 are ignored; when several entries
    # share a star value the last one wins; blank descriptions fall back to the
    # default text for that star level.
    #
    # @param raw_bands [Hash, Array, nil] a Hash contributes its values; anything
    #   else is wrapped with Array()
    # @return [Array<Hash>] five hashes with "stars" and "description" keys
    def self.normalize_rubric_bands(raw_bands)
      candidates = raw_bands.is_a?(Hash) ? raw_bands.values : Array(raw_bands)

      descriptions = candidates.each_with_object({}) do |band, by_stars|
        next unless band.respond_to?(:to_h)

        attrs = band.to_h.stringify_keys
        stars = attrs["stars"].to_i
        next unless (1..5).cover?(stars)

        by_stars[stars] = attrs["description"].to_s.strip
      end

      default_rubric_bands.map do |default_band|
        stars = default_band["stars"]
        {
          "stars" => stars,
          "description" => descriptions[stars].presence || default_band["description"]
        }
      end
    end

    # @return [Array<Hash>] this metric's bands normalized to the full 5..1 set
    def rubric_bands_for_form
      self.class.normalize_rubric_bands(rubric_bands)
    end

    # @return [String] this metric's normalized rubric rendered as text
    def display_rubric_text
      self.class.rubric_text_for(rubric_bands_for_form)
    end

    # Fixed JSON shape for this metric; `options` is accepted for interface
    # compatibility but not consulted.
    def as_json(options = {})
      {
        id: id, name: name, key: key, instruction: instruction,
        evaluation_steps: evaluation_steps, rubric_bands: rubric_bands,
        created_at: created_at, updated_at: updated_at
      }
    end

    private

    # Derives `key` from `name` when no key was supplied. A blank key ("" or
    # whitespace) counts as missing: the uniqueness validation only skips nil,
    # so leaving "" in place would make every blank-keyed metric collide.
    def generate_key
      return if key.present? || name.blank?

      # `presence` keeps the key nil when the name has no parameterizable
      # characters, rather than storing an empty string.
      self.key = name.parameterize.presence
    end

    # Ensures `evaluation_steps` is at least an empty array and `rubric_bands`
    # is populated.
    def set_defaults
      self.evaluation_steps ||= []
      self.rubric_bands = self.class.default_rubric_bands if rubric_bands.blank?
    end

    # Rewrites user-supplied bands into the canonical five-band form.
    def normalize_rubric_bands
      self.rubric_bands = self.class.normalize_rubric_bands(rubric_bands) if rubric_bands.present?
    end
  end
end
@@ -0,0 +1,13 @@
module CompletionKit
  # A model identifier recorded for a provider, with a lifecycle status and
  # flags saying whether it may be used for generation and/or judging.
  class Model < ApplicationRecord
    # Allowed values for `status`.
    STATUSES = %w[active retired failed].freeze

    validates :provider, :model_id, presence: true
    # A model id only needs to be unique within its provider.
    validates :model_id, uniqueness: { scope: :provider }
    validates :status, presence: true, inclusion: { in: STATUSES }

    # Rows whose status is "active".
    scope :active, -> { where(status: "active") }
    # Active rows flagged as usable for generation.
    scope :for_generation, -> { active.where(supports_generation: true) }
    # Active rows flagged as usable for judging.
    scope :for_judging, -> { active.where(supports_judging: true) }
  end
end
@@ -0,0 +1,99 @@
module CompletionKit
  # A versioned prompt template. Versions of the same prompt share a
  # `family_key`; `current` flags the version selected by `publish!`.
  class Prompt < ApplicationRecord
    has_many :runs, dependent: :destroy
    has_many :responses, through: :runs

    validates :name, :template, :llm_model, :family_key, presence: true
    validates :version_number, presence: true, numericality: { only_integer: true, greater_than: 0 }

    before_validation :assign_family_key, on: :create
    before_validation :assign_version_number, on: :create
    before_validation :set_defaults

    # Prompts flagged as current, newest first.
    scope :current_versions, -> { where(current: true).order(created_at: :desc) }

    # Thin delegation to ApiConfig.available_models.
    def self.available_models(provider: nil)
      ApiConfig.available_models(provider: provider)
    end

    # Looks up a current prompt by family key, then by name, then by slug.
    #
    # @param identifier [String, #to_s]
    # @return [Prompt]
    # @raise [ActiveRecord::RecordNotFound] when nothing matches
    def self.current_for(identifier)
      candidates = current_versions
      match = candidates.find_by(family_key: identifier) || candidates.find_by(name: identifier)
      # The slug is computed in Ruby, so this step scans the loaded records.
      match ||= candidates.find { |candidate| candidate.slug == identifier.to_s }
      match || raise(ActiveRecord::RecordNotFound)
    end

    # Lower-cased name with every run of characters outside a-z/0-9 collapsed
    # to one hyphen and a leading/trailing hyphen removed.
    def slug
      lowered = name.to_s.downcase.strip
      hyphenated = lowered.gsub(/[^a-z0-9]+/, "-")
      hyphenated.gsub(/\A-|-\z/, "")
    end

    # Variables of this prompt as reported by CsvProcessor.extract_variables.
    def variables
      CsvProcessor.extract_variables(self)
    end

    # @return [String] e.g. "v3"
    def version_label
      "v#{version_number}"
    end

    # @return [String] name followed by the version label
    def display_name
      "#{name} — #{version_label}"
    end

    # Every prompt in this family, highest version first (ties: newest first).
    def family_versions
      self.class.where(family_key: family_key).order(version_number: :desc, created_at: :desc)
    end

    # Creates the next version in this family as an unpublished copy.
    #
    # @param overrides [Hash] attributes replacing the copied ones; nil values
    #   are dropped before merging, so they never blank out a copied attribute
    # @return [Prompt] the newly created record
    # @raise [ActiveRecord::RecordInvalid] when the copy fails validation
    def clone_as_new_version(overrides = {})
      copied = {
        name: name,
        description: description,
        template: template,
        llm_model: llm_model,
        family_key: family_key,
        version_number: next_version_number,
        current: false,
        published_at: nil
      }

      self.class.create!(copied.merge(overrides.compact))
    end

    # Marks this version as the family's current one inside a transaction:
    # siblings are un-flagged, then this record is reloaded and flagged with a
    # fresh `published_at`.
    def publish!
      transaction do
        siblings = self.class.where(family_key: family_key).where.not(id: id)
        siblings.update_all(current: false)
        reload
        update!(current: true, published_at: Time.current)
      end
    end

    # Fixed JSON shape for this prompt; `options` is accepted but not consulted.
    def as_json(options = {})
      {
        id: id,
        name: name,
        description: description,
        template: template,
        llm_model: llm_model,
        family_key: family_key,
        version_number: version_number,
        current: current,
        created_at: created_at,
        updated_at: updated_at
      }
    end

    private

    # A prompt created without a family starts a new one.
    def assign_family_key
      self.family_key ||= SecureRandom.uuid
    end

    def assign_version_number
      self.version_number ||= next_version_number
    end

    # One more than the highest version stored for this family (1 when none).
    def next_version_number
      highest = self.class.where(family_key: family_key).maximum(:version_number)
      highest.to_i + 1
    end

    # `current` defaults to true; a current prompt gets a `published_at`
    # timestamp if it has none yet.
    def set_defaults
      self.current = true if current.nil?
      self.published_at ||= Time.current if current?
    end
  end
end
@@ -0,0 +1,114 @@
module CompletionKit
  # Stored credentials (API key and optional endpoint) for one LLM provider.
  # At most one record exists per provider. Saving a credential schedules a
  # ModelDiscoveryJob and progress is pushed to the UI through Turbo streams.
  class ProviderCredential < ApplicationRecord
    include Turbo::Broadcastable

    PROVIDERS = %w[openai anthropic ollama openrouter].freeze
    PROVIDER_LABELS = {
      "openai" => "OpenAI",
      "anthropic" => "Anthropic",
      "ollama" => "Ollama / local endpoint",
      "openrouter" => "OpenRouter"
    }.freeze

    encrypts :api_key

    validates :provider, presence: true, inclusion: { in: PROVIDERS }, uniqueness: true

    # Enqueue only once the row is committed: a job enqueued from inside the
    # still-open save transaction can be picked up before the record is visible
    # to the worker.
    after_commit :enqueue_discovery, on: %i[create update]

    # Fixed JSON shape; deliberately omits `api_key`. `options` is accepted but
    # not consulted.
    def as_json(options = {})
      {
        id: id, provider: provider, api_endpoint: api_endpoint,
        created_at: created_at, updated_at: updated_at
      }
    end

    # Human-readable provider name. Unknown providers are titleized; a nil
    # provider (e.g. an unsaved, blank record) yields "" instead of raising.
    #
    # @return [String]
    def display_provider
      PROVIDER_LABELS.fetch(provider) { provider.to_s.titleize }
    end

    # @return [Hash] provider, api_key and api_endpoint with nil entries removed
    def config_hash
      {
        provider: provider,
        api_key: api_key,
        api_endpoint: api_endpoint
      }.compact
    end

    # Models reported by the provider client; best-effort, so any error is
    # reported as an empty list.
    def available_models
      LlmClient.for_provider(provider, config_hash).available_models
    rescue StandardError
      []
    end

    # Whether the provider client considers itself configured; any error counts
    # as "not configured".
    def configured?
      LlmClient.for_provider(provider, config_hash).configured?
    rescue StandardError
      false
    end

    # @return [Integer] number of current prompts whose model belongs to this provider
    def prompt_count
      model_ids = discovered_model_ids
      return 0 if model_ids.empty?

      Prompt.where(llm_model: model_ids, current: true).count
    end

    # @return [Integer] number of runs judged by one of this provider's models
    def judge_count
      model_ids = discovered_model_ids
      return 0 if model_ids.empty?

      Run.where(judge_model: model_ids).count
    end

    # Creation time of the most recent non-pending run that generated with or
    # was judged by one of this provider's models.
    #
    # @return [Time, nil] nil when the provider has no models or no such run
    def last_used_at
      model_ids = discovered_model_ids
      return nil if model_ids.empty?

      prompt_scope = Prompt.where(llm_model: model_ids).select(:id)
      Run.where("prompt_id IN (:prompts) OR judge_model IN (:models)",
                prompts: prompt_scope, models: model_ids)
         .where.not(status: "pending")
         .maximum(:created_at)
    end

    # Replaces this credential's discovery-status element on its Turbo stream.
    def broadcast_discovery_progress
      broadcast_replace_to(
        "completion_kit_provider_#{id}",
        target: "discovery_status_#{id}",
        html: render_partial("completion_kit/provider_credentials/discovery_status", provider_credential: self)
      )
    end

    # Final broadcast for a discovery pass: status element plus the model
    # dropdowns.
    def broadcast_discovery_complete
      broadcast_discovery_progress
      broadcast_model_dropdowns
    end

    private

    # Model ids stored for this credential's provider.
    def discovered_model_ids
      Model.where(provider: provider).pluck(:model_id)
    end

    # Resets the discovery counters without re-running callbacks
    # (update_columns), then schedules the discovery job.
    def enqueue_discovery
      update_columns(discovery_status: "discovering", discovery_current: 0, discovery_total: 0)
      ModelDiscoveryJob.perform_later(id)
    end

    # Replaces the prompt-model and judge-model <select> elements on this
    # credential's stream with freshly rendered options.
    def broadcast_model_dropdowns
      helper = ApplicationController.helpers
      gen_html = helper.ck_model_options_html(:generation)
      judge_html = '<option value="">None</option>' + helper.ck_model_options_html(:judging)

      Turbo::StreamsChannel.broadcast_action_to(
        "completion_kit_provider_#{id}",
        action: :replace,
        target: "prompt_llm_model",
        html: "<select name=\"prompt[llm_model]\" id=\"prompt_llm_model\" class=\"ck-input\">#{gen_html}</select>"
      )
      Turbo::StreamsChannel.broadcast_action_to(
        "completion_kit_provider_#{id}",
        action: :replace,
        target: "run_judge_model",
        html: "<select name=\"run[judge_model]\" id=\"run_judge_model\" class=\"ck-input\">#{judge_html}</select>"
      )
    end

    # Renders an engine partial outside of a request cycle.
    def render_partial(partial, locals)
      CompletionKit::ApplicationController.render(partial: partial, locals: locals)
    end
  end
end
@@ -0,0 +1,30 @@
module CompletionKit
  # One generated completion belonging to a run, together with the reviews
  # judges produced for it.
  class Response < ApplicationRecord
    belongs_to :run
    has_many :reviews, dependent: :destroy

    delegate :prompt, to: :run

    validates :response_text, presence: true

    # Fixed JSON shape including the derived score and nested reviews;
    # `options` is accepted but not consulted.
    def as_json(options = {})
      {
        id: id,
        run_id: run_id,
        input_data: input_data,
        response_text: response_text,
        expected_output: expected_output,
        created_at: created_at,
        score: score,
        reviewed: reviewed?,
        reviews: reviews.map(&:as_json)
      }
    end

    # Mean of the AI scores over all reviews that have one, rounded to two
    # decimals.
    #
    # @return [Float, nil] nil when no review carries a score
    def score
      values = reviews.filter_map { |review| review.ai_score.to_f if review.ai_score.present? }
      return nil if values.empty?

      mean = values.sum / values.length
      mean.round(2)
    end

    # @return [Boolean] true once at least one review carries an AI score
    def reviewed?
      reviews.map(&:ai_score).any?(&:present?)
    end
  end
end
@@ -0,0 +1,28 @@
module CompletionKit
  # A judge's verdict on one response for one metric: a score from 1 to 5 plus
  # feedback text. The metric link is optional; `metric_name` is always
  # required.
  class Review < ApplicationRecord
    # Allowed values for `status`.
    STATUSES = %w[pending evaluated failed].freeze

    belongs_to :response
    belongs_to :metric, optional: true

    validates :metric_name, presence: true
    validates :status, inclusion: { in: STATUSES }
    validates :ai_score,
              numericality: { greater_than_or_equal_to: 1, less_than_or_equal_to: 5 },
              allow_nil: true

    before_validation :set_default_status

    # Fixed JSON shape for this review; `options` is accepted but not consulted.
    def as_json(options = {})
      {
        id: id,
        response_id: response_id,
        metric_id: metric_id,
        metric_name: metric_name,
        ai_score: ai_score,
        ai_feedback: ai_feedback,
        status: status
      }
    end

    private

    # Reviews without an explicit status start out as "pending".
    def set_default_status
      self.status ||= "pending"
    end
  end
end
@@ -0,0 +1,253 @@
module CompletionKit
  # One execution of a prompt, optionally over a dataset: generates a response
  # per input row and, when a judge model and metrics are configured, scores
  # each response against each metric. Progress and results are pushed to the
  # UI through Turbo streams on "completion_kit_run_<id>".
  class Run < ApplicationRecord
    include Turbo::Broadcastable

    # Allowed values for `status`.
    STATUSES = %w[pending generating judging completed failed].freeze

    belongs_to :prompt
    belongs_to :dataset, optional: true
    has_many :responses, dependent: :destroy
    has_many :run_metrics, -> { order(:position) }, dependent: :destroy
    has_many :metrics, through: :run_metrics
    has_many :suggestions, dependent: :destroy

    validates :name, presence: true
    validates :status, inclusion: { in: STATUSES }

    before_validation :set_default_status, on: :create
    before_validation :set_auto_name, on: :create

    # @return [Boolean] true when a judge model is set, at least one metric is
    #   attached and ApiConfig accepts the judge model
    def judge_configured?
      judge_model.present? && metrics.any? && ApiConfig.valid_for_model?(judge_model)
    end

    # Mean AI score over every scored review of every response, rounded to two
    # decimals.
    #
    # @return [Float, nil] nil when nothing has been scored
    def avg_score
      scores = scored_reviews.map { |review| review.ai_score.to_f }
      return nil if scores.empty?

      (scores.sum / scores.length).round(2)
    end

    # Mean AI score per metric name, rounded to one decimal.
    #
    # @return [Array<Hash>] entries of the form { name:, avg: }
    def metric_averages
      scored_reviews.group_by(&:metric_name).map do |name, reviews|
        scores = reviews.map { |review| review.ai_score.to_f }
        { name: name, avg: (scores.sum / scores.length).round(1) }
      end
    end

    # Generates one response per dataset row (or a single response when the run
    # has no dataset), replacing any existing responses, then judges them when
    # a judge is configured.
    #
    # Failures are recorded on the run (`status` "failed", `error_message`) and
    # in `errors` rather than raised.
    #
    # @return [Boolean] true when the run finished as "completed"; false when
    #   the dataset is empty, the client is not configured, or generation or
    #   judging failed
    def generate_responses!
      rows = dataset ? CsvProcessor.process_self(self) : [{}]

      if rows.empty?
        errors.add(:base, "Dataset has no rows")
        return false
      end

      client = LlmClient.for_model(prompt.llm_model, ApiConfig.for_model(prompt.llm_model))

      unless client.configured?
        msg = "LLM API not configured: #{client.configuration_errors.join(', ')}"
        errors.add(:base, msg)
        update_columns(status: "failed", error_message: msg) if persisted?
        return false
      end

      update!(status: "generating", progress_current: 0, progress_total: rows.length, error_message: nil)
      responses.destroy_all
      broadcast_ui
      broadcast_clear_responses

      rows.each_with_index do |row, index|
        input = row.empty? ? nil : row.to_json
        rendered = CsvProcessor.apply_variables(prompt, row)
        response_text = client.generate_completion(rendered, model: prompt.llm_model, temperature: temperature)

        resp = responses.create!(
          input_data: input,
          response_text: response_text,
          expected_output: row["expected_output"]
        )

        update_columns(progress_current: index + 1)
        broadcast_progress
        broadcast_response(resp)
      end

      # Propagate the judging outcome: judge_responses! rescues its own errors
      # and returns false, which must not be reported as success here.
      return judge_responses! if judge_configured?

      update!(status: "completed")
      broadcast_ui
      true
    rescue StandardError => e
      record_failure(e)
    end

    # Scores every response against every attached metric with the configured
    # judge model, creating or updating one review per (response, metric) pair.
    #
    # Failures are recorded on the run and in `errors` rather than raised.
    #
    # @return [Boolean] true when all evaluations completed, false otherwise
    def judge_responses!
      total_evaluations = responses.count * metrics.count
      update!(status: "judging", progress_current: 0, progress_total: total_evaluations, error_message: nil)
      broadcast_ui

      judge = JudgeService.new(ApiConfig.for_model(judge_model).merge(judge_model: judge_model))
      evaluation_count = 0

      responses.find_each do |response|
        metrics.each do |metric|
          judge_response(judge, response, metric)

          evaluation_count += 1
          update_columns(progress_current: evaluation_count)
          broadcast_progress
        end

        broadcast_response_update(response)
      end

      update!(status: "completed")
      broadcast_ui
      true
    rescue StandardError => e
      record_failure(e)
    end

    # Fixed JSON shape including derived counts and averages; `options` is
    # accepted but not consulted.
    def as_json(options = {})
      {
        id: id, name: name, status: status, prompt_id: prompt_id,
        dataset_id: dataset_id, judge_model: judge_model, temperature: temperature,
        created_at: created_at, updated_at: updated_at,
        responses_count: responses.count, avg_score: avg_score,
        progress_current: progress_current, progress_total: progress_total,
        error_message: error_message, metric_ids: metric_ids
      }
    end

    private

    # Reviews across all responses that carry an AI score.
    def scored_reviews
      responses.flat_map(&:reviews).select { |review| review.ai_score.present? }
    end

    # Evaluates one response against one metric and persists the outcome on the
    # matching review (created on first evaluation, updated afterwards).
    def judge_response(judge, response, metric)
      instruction = metric.respond_to?(:instruction) ? metric.instruction.to_s : ""

      evaluation = judge.evaluate(
        response.response_text,
        response.expected_output,
        prompt.template,
        criteria: instruction,
        evaluation_steps: metric.respond_to?(:evaluation_steps) ? metric.evaluation_steps : nil,
        rubric_text: metric.respond_to?(:display_rubric_text) ? metric.display_rubric_text : nil,
        input_data: response.input_data
      )

      review = response.reviews.find_or_initialize_by(metric_id: metric.id)
      review.assign_attributes(
        metric_name: metric.name,
        instruction: instruction,
        status: "evaluated",
        ai_score: evaluation[:score],
        ai_feedback: evaluation[:feedback]
      )
      review.save!
    end

    # Shared failure path: marks the run as failed, records the message in
    # `errors` and refreshes the UI. Persistence and broadcasting are skipped
    # for unsaved runs, which have no row to update and no stream id.
    #
    # @return [false]
    def record_failure(error)
      update_columns(status: "failed", error_message: error.message) if persisted?
      errors.add(:base, error.message)
      broadcast_ui if persisted?
      false
    end

    # Refreshes every run-level UI fragment. The status header is not sent
    # separately because broadcast_progress already sends it.
    def broadcast_ui
      broadcast_progress
      broadcast_actions
      broadcast_sort_toolbar
    end

    # Renders an engine partial outside of a request cycle.
    def render_engine_partial(partial, locals)
      CompletionKit::ApplicationController.render(
        partial: partial,
        locals: locals
      )
    end

    # Reloads the record (progress is written with update_columns) and replaces
    # the progress element, followed by the status header.
    def broadcast_progress
      reload
      broadcast_replace_to(
        "completion_kit_run_#{id}",
        target: "run_progress",
        html: render_engine_partial("completion_kit/runs/progress", run: self)
      )
      broadcast_status_header
    end

    def broadcast_status_header
      broadcast_replace_to(
        "completion_kit_run_#{id}",
        target: "run_status_header",
        html: render_engine_partial("completion_kit/runs/status_header", run: self)
      )
    end

    def broadcast_actions
      broadcast_replace_to(
        "completion_kit_run_#{id}",
        target: "run_actions",
        html: render_engine_partial("completion_kit/runs/actions", run: self)
      )
    end

    def broadcast_sort_toolbar
      broadcast_replace_to(
        "completion_kit_run_#{id}",
        target: "run_sort_toolbar",
        html: render_engine_partial("completion_kit/runs/sort_toolbar", run: self)
      )
    end

    # Empties the response list in the UI before regeneration.
    def broadcast_clear_responses
      broadcast_replace_to(
        "completion_kit_run_#{id}",
        target: "run_responses",
        html: '<div id="run_responses"></div>'
      )
    end

    # Appends a newly generated response row to the list.
    def broadcast_response(response)
      broadcast_append_to(
        "completion_kit_run_#{id}",
        target: "run_responses",
        html: render_response_row(response)
      )
    end

    # Replaces an existing response row, e.g. once its reviews are in.
    def broadcast_response_update(response)
      broadcast_replace_to(
        "completion_kit_run_#{id}",
        target: "response_#{response.id}",
        html: render_response_row(response)
      )
    end

    # Renders the row partial for a response; its index is the count of this
    # run's responses with an id up to and including the response's own.
    def render_response_row(response)
      render_engine_partial(
        "completion_kit/runs/response_row",
        run: self,
        response: response,
        index: responses.where("id <= ?", response.id).count
      )
    end

    def set_default_status
      self.status ||= "pending"
    end

    # Names an unnamed run after its prompt, the prompt's version and the
    # number of runs already stored for that prompt plus one.
    def set_auto_name
      return if name.present?
      return unless prompt.present?

      count = Run.where(prompt_id: prompt_id).count + 1
      self.name = "#{prompt.name} — v#{prompt.version_number} ##{count}"
    end
  end
end
@@ -0,0 +1,6 @@
module CompletionKit
  # Join record attaching one metric to one run.
  class RunMetric < ApplicationRecord
    belongs_to :run
    belongs_to :metric
  end
end
@@ -0,0 +1,8 @@
module CompletionKit
  # A suggested replacement template, tied to both a run and a prompt.
  class Suggestion < ApplicationRecord
    belongs_to :run
    belongs_to :prompt

    # A suggestion without template text is not valid.
    validates :suggested_template, presence: true
  end
end