leva 0.2.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. checksums.yaml +4 -4
  2. data/README.md +55 -1
  3. data/app/assets/stylesheets/leva/application.css +165 -25
  4. data/app/controllers/leva/dataset_optimizations_controller.rb +64 -0
  5. data/app/controllers/leva/experiments_controller.rb +14 -6
  6. data/app/controllers/leva/workbench_controller.rb +26 -10
  7. data/app/helpers/leva/application_helper.rb +32 -16
  8. data/app/models/leva/dataset.rb +1 -0
  9. data/app/models/leva/experiment.rb +1 -0
  10. data/app/models/leva/optimization_run.rb +137 -0
  11. data/app/models/leva/prompt.rb +10 -0
  12. data/app/services/leva/class_loader.rb +37 -0
  13. data/app/services/leva/dataset_converter.rb +64 -0
  14. data/app/services/leva/optimizers/base.rb +183 -0
  15. data/app/services/leva/optimizers/bootstrap.rb +92 -0
  16. data/app/services/leva/optimizers/gepa_optimizer.rb +59 -0
  17. data/app/services/leva/optimizers/miprov2_optimizer.rb +52 -0
  18. data/app/services/leva/prompt_optimizer.rb +305 -0
  19. data/app/services/leva/signature_generator.rb +129 -0
  20. data/app/views/leva/datasets/show.html.erb +3 -0
  21. data/app/views/leva/experiments/_experiment.html.erb +9 -10
  22. data/app/views/leva/experiments/_form.html.erb +10 -0
  23. data/app/views/leva/experiments/index.html.erb +2 -1
  24. data/app/views/leva/experiments/show.html.erb +20 -21
  25. data/app/views/leva/optimization_runs/show.html.erb +698 -0
  26. data/app/views/leva/runner_results/show.html.erb +18 -48
  27. data/app/views/leva/workbench/_results_section.html.erb +3 -11
  28. data/db/migrate/20241204000001_create_leva_optimization_runs.rb +25 -0
  29. data/lib/generators/leva/templates/eval.rb.erb +4 -2
  30. data/lib/leva/errors.rb +18 -0
  31. data/lib/leva/version.rb +1 -1
  32. data/lib/leva.rb +1 -0
  33. metadata +16 -3
data/app/services/leva/prompt_optimizer.rb
@@ -0,0 +1,305 @@
+# frozen_string_literal: true
+
+module Leva
+  # Optimizes prompts using DSPy.rb optimizers.
+  #
+  # This service coordinates the optimization process, delegating
+  # the actual optimization work to strategy classes.
+  #
+  # @example Optimize a prompt for a dataset
+  #   optimizer = Leva::PromptOptimizer.new(dataset: dataset, mode: :medium)
+  #   result = optimizer.optimize
+  #   # => { system_prompt: "...", user_prompt: "...", metadata: {...} }
+  #
+  # @example With GEPA optimizer
+  #   optimizer = Leva::PromptOptimizer.new(dataset: dataset, optimizer: :gepa, mode: :medium)
+  #   result = optimizer.optimize
+  class PromptOptimizer
+    # Minimum number of examples required for optimization
+    MINIMUM_EXAMPLES = 10
+
+    # Available optimizers with their strategy classes
+    OPTIMIZERS = {
+      bootstrap: {
+        name: "Bootstrap",
+        strategy_class: Leva::Optimizers::Bootstrap,
+        gem: nil,
+        description: "Fast and simple. Automatically selects optimal few-shot examples from your dataset. " \
+                     "Best for quick iteration and when you have limited data (10-50 examples). " \
+                     "Does not modify instructions, only adds demonstrations."
+      },
+      gepa: {
+        name: "GEPA",
+        strategy_class: Leva::Optimizers::GepaOptimizer,
+        gem: "dspy-gepa",
+        description: "State-of-the-art optimizer using reflective prompt evolution. Uses LLM reflection " \
+                     "to identify what works and propose improvements. Outperforms MIPROv2 by 10-14% " \
+                     "while being more sample efficient. Best choice for maximum quality."
+      },
+      miprov2: {
+        name: "MIPROv2",
+        strategy_class: Leva::Optimizers::Miprov2Optimizer,
+        gem: "dspy-miprov2",
+        description: "Uses Bayesian optimization to search for optimal instructions and few-shot examples. " \
+                     "Good for larger datasets (200+ examples). More computationally demanding but thorough. " \
+                     "Can overfit on small datasets."
+      }
+    }.freeze
+
+    # Default optimizer
+    DEFAULT_OPTIMIZER = :bootstrap
+
+    # Optimization modes with their approximate durations
+    MODES = {
+      light: { description: "Fast optimization (~5 min)", trials: 5 },
+      medium: { description: "Balanced optimization (~15 min)", trials: 15 },
+      heavy: { description: "Thorough optimization (~30 min)", trials: 30 }
+    }.freeze
+
+    # Default model if none specified (fast and cheap)
+    DEFAULT_MODEL = "gemini-2.5-flash"
+
+    # Returns available models from RubyLLM.
+    # Results are cached for 5 minutes to avoid repeated expensive calls.
+    #
+    # @return [Array<RubyLLM::Model>] All available chat models
+    def self.available_models
+      Rails.cache.fetch("leva/available_models", expires_in: 5.minutes) do
+        RubyLLM.models.chat_models
+      end
+    end
+
+    # Finds a model by ID.
+    #
+    # @param model_id [String] The model ID to find
+    # @return [RubyLLM::Model, nil] The model, or nil if not found
+    def self.find_model(model_id)
+      RubyLLM.models.find(model_id)
+    rescue RubyLLM::ModelNotFoundError
+      nil
+    end
+
+    # @return [Leva::Dataset] The dataset being optimized
+    attr_reader :dataset
+
+    # @return [Symbol] The optimization mode (:light, :medium, :heavy)
+    attr_reader :mode
+
+    # @return [String] The model to use for optimization
+    attr_reader :model
+
+    # @return [Symbol] The optimizer to use (:bootstrap, :gepa, :miprov2)
+    attr_reader :optimizer
+
+    # @param dataset [Leva::Dataset] The dataset to optimize for
+    # @param metric [Proc, nil] Custom evaluation metric (default: exact string match)
+    # @param mode [Symbol] Optimization intensity (:light, :medium, :heavy)
+    # @param model [String, nil] The model to use (default: DEFAULT_MODEL)
+    # @param optimizer [Symbol, String] The optimizer to use (default: :bootstrap)
+    # @param progress_callback [Proc, nil] Callback for progress updates
+    def initialize(dataset:, metric: nil, mode: :light, model: nil, optimizer: nil, progress_callback: nil)
+      @dataset = dataset
+      @metric = metric || default_metric
+      @mode = mode.to_sym
+      @model = model.presence || DEFAULT_MODEL
+      @optimizer = (optimizer.presence || DEFAULT_OPTIMIZER).to_sym
+      @progress_callback = progress_callback
+      @last_progress = nil
+    end
+
+    # Runs the optimization process.
+    #
+    # @return [Hash] Hash containing :system_prompt, :user_prompt, and :metadata
+    # @raise [Leva::InsufficientDataError] If the dataset has too few records
+    # @raise [Leva::DspyConfigurationError] If DSPy is not configured
+    def optimize
+      report_progress(step: "validating", progress: 0)
+      validate_dataset!
+      validate_dspy_configuration!
+      validate_optimizer!
+
+      report_progress(step: "splitting_data", progress: 10)
+      splits = DatasetConverter.new(@dataset).split
+
+      report_progress(step: "generating_signature", progress: 20)
+      signature = SignatureGenerator.new(@dataset).generate
+
+      # Delegate to optimizer strategy
+      strategy = build_optimizer_strategy
+      result = strategy.optimize(splits, signature)
+
+      report_progress(step: "complete", progress: 100)
+
+      build_final_result(result, splits, strategy.optimizer_type)
+    end
+
+    # Checks if the dataset is ready for optimization.
+    #
+    # @return [Boolean] True if the dataset can be optimized
+    def can_optimize?
+      @dataset.dataset_records.count >= MINIMUM_EXAMPLES
+    end
+
+    # Returns the number of additional records needed for optimization.
+    #
+    # @return [Integer] Number of records still needed (0 if ready)
+    def records_needed
+      [ MINIMUM_EXAMPLES - @dataset.dataset_records.count, 0 ].max
+    end
+
+    # Checks if a specific optimizer is available.
+    #
+    # @param optimizer_type [Symbol] The optimizer to check
+    # @return [Boolean] True if the optimizer is available
+    def self.optimizer_available?(optimizer_type)
+      optimizer_type = optimizer_type.to_sym
+      return true if optimizer_type == :bootstrap
+
+      case optimizer_type
+      when :gepa
+        !!defined?(DSPy::Teleprompt::GEPA)
+      when :miprov2
+        !!defined?(DSPy::Teleprompt::MIPROv2)
+      else
+        false
+      end
+    end
+
+    private
+
+    # Builds the optimizer strategy instance.
+    #
+    # @return [Leva::Optimizers::Base] The optimizer strategy
+    def build_optimizer_strategy
+      config = OPTIMIZERS[@optimizer]
+      config[:strategy_class].new(
+        model: @model,
+        metric: @metric,
+        mode: @mode,
+        progress_callback: @progress_callback
+      )
+    end
+
+    # Builds the final result hash from optimization.
+    #
+    # @param result [Hash] The optimizer result with :instruction, :few_shot_examples, :score
+    # @param splits [Hash] The data splits
+    # @param optimizer_type [Symbol] The optimizer that was used
+    # @return [Hash] The formatted result
+    def build_final_result(result, splits, optimizer_type)
+      sample_record = @dataset.dataset_records.first&.recordable
+      input_fields = sample_record&.to_llm_context&.keys || []
+
+      formatted_examples = result[:few_shot_examples].map do |ex|
+        { input: ex[:input], output: ex.dig(:expected, :output) }
+      end
+
+      {
+        system_prompt: result[:instruction],
+        user_prompt: build_user_prompt_template(input_fields),
+        metadata: {
+          optimization: {
+            score: result[:score],
+            mode: @mode.to_s,
+            optimizer: optimizer_type.to_s,
+            model: @model,
+            few_shot_examples: formatted_examples,
+            optimized_at: Time.current.iso8601,
+            dataset_size: @dataset.dataset_records.count,
+            train_size: splits[:train].size,
+            val_size: splits[:val].size,
+            test_size: splits[:test].size
+          }
+        }
+      }
+    end
+
+    # Reports progress to the callback if provided.
+    # Throttles updates to only report when progress changes by 5% or more.
+    #
+    # @param step [String] Current step name
+    # @param progress [Integer] Progress percentage (0-100)
+    # @param examples_processed [Integer, nil] Number of examples processed
+    # @param total [Integer, nil] Total examples to process
+    # @return [void]
+    def report_progress(step:, progress:, examples_processed: nil, total: nil)
+      return unless @progress_callback
+
+      # Skip if progress hasn't changed by at least 5%
+      return if @last_progress && (progress - @last_progress).abs < 5
+
+      @last_progress = progress
+      @progress_callback.call(
+        step: step,
+        progress: progress,
+        examples_processed: examples_processed,
+        total: total
+      )
+    end
+
+    # Validates that the dataset has enough records.
+    #
+    # @raise [Leva::InsufficientDataError] If the dataset has too few records
+    def validate_dataset!
+      count = @dataset.dataset_records.count
+      return if count >= MINIMUM_EXAMPLES
+
+      raise InsufficientDataError,
+            "Dataset needs at least #{MINIMUM_EXAMPLES} records for optimization, has #{count}"
+    end
+
+    # Validates that DSPy is properly configured.
+    #
+    # @raise [Leva::DspyConfigurationError] If DSPy is not configured
+    def validate_dspy_configuration!
+      unless defined?(DSPy) && defined?(DSPy::Predict)
+        raise DspyConfigurationError, "DSPy is not installed. Add 'dspy' gem to your Gemfile."
+      end
+    end
+
+    # Validates that the selected optimizer is available.
+    #
+    # @raise [Leva::DspyConfigurationError] If the optimizer is not available
+    def validate_optimizer!
+      return if @optimizer == :bootstrap
+      return if self.class.optimizer_available?(@optimizer)
+
+      gem_name = OPTIMIZERS.dig(@optimizer, :gem)
+      raise DspyConfigurationError, <<~MSG.strip
+        #{@optimizer.to_s.upcase} optimizer is not available. Install it:
+
+          gem 'dspy'
+          gem '#{gem_name}'
+
+        Or set DSPY_WITH_#{@optimizer.to_s.upcase}=1 before requiring dspy.
+      MSG
+    end
+
+    # Returns the default evaluation metric (case-insensitive exact match).
+    # Handles both Hash examples and DSPy::Example objects.
+    #
+    # @return [Proc] The default metric function
+    def default_metric
+      lambda do |example, prediction|
+        # Handle both Hash and DSPy::Example
+        expected_output = if example.is_a?(Hash)
+                            example.dig(:expected, :output)
+                          else
+                            # DSPy::Example has an expected_values method that returns a Hash
+                            example.expected_values[:output]
+                          end
+        expected = expected_output.to_s.strip.downcase
+        actual = prediction.to_s.strip.downcase
+        expected == actual ? 1.0 : 0.0
+      end
+    end
+
+    # Builds the user prompt template with Liquid placeholders.
+    #
+    # @param input_fields [Array<Symbol>] The input field names
+    # @return [String] The user prompt template
+    def build_user_prompt_template(input_fields)
+      input_fields.map { |field| "{{ #{field} }}" }.join("\n\n")
+    end
+  end
+end
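Taken together, the service above can be driven as follows. This is a minimal sketch, not code from the gem: the dataset lookup is hypothetical, the callback simply logs the keyword arguments that report_progress emits, and the result keys come from build_final_result.

dataset = Leva::Dataset.find_by!(name: "sentiment")  # hypothetical dataset

# Receives exactly the keywords report_progress passes to @progress_callback.call
progress = ->(step:, progress:, examples_processed: nil, total: nil) do
  Rails.logger.info("[leva] #{step}: #{progress}%")
end

optimizer = Leva::PromptOptimizer.new(
  dataset: dataset,
  optimizer: Leva::PromptOptimizer.optimizer_available?(:gepa) ? :gepa : :bootstrap,
  mode: :light,
  progress_callback: progress
)

if optimizer.can_optimize?
  result = optimizer.optimize
  puts result[:system_prompt]                    # the optimized instruction
  puts result[:metadata][:optimization][:score]  # score from the optimizer run
else
  warn "Need #{optimizer.records_needed} more records"
end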
data/app/services/leva/signature_generator.rb
@@ -0,0 +1,129 @@
+# frozen_string_literal: true
+
+module Leva
+  # Generates DSPy signatures from Leva dataset records.
+  #
+  # This service analyzes the structure of dataset records and generates
+  # a dynamic DSPy::Signature class that matches the input/output schema.
+  #
+  # @example Generate a signature from a dataset
+  #   generator = Leva::SignatureGenerator.new(dataset)
+  #   signature_class = generator.generate
+  #   predictor = DSPy::Predict.new(signature_class)
+  class SignatureGenerator
+    # @param dataset [Leva::Dataset] The dataset to analyze
+    # @param description [String, nil] Optional description for the signature
+    def initialize(dataset, description: nil)
+      @dataset = dataset
+      @description = description
+      @sample_record = dataset.dataset_records.first&.recordable
+    end
+
+    # Generates a DSPy::Signature class based on the dataset structure.
+    #
+    # @return [Class, nil] A dynamically generated DSPy::Signature subclass, or nil if no sample
+    def generate
+      return nil unless @sample_record
+
+      input_fields = extract_input_fields
+      output_type = infer_output_type(@sample_record.ground_truth)
+      description = @description || generate_description
+
+      build_signature_class(input_fields, output_type, description)
+    end
+
+    # Returns the input field names that will be used in the signature.
+    #
+    # @return [Array<Symbol>] Array of input field names
+    def input_field_names
+      return [] unless @sample_record
+
+      extract_input_fields.keys
+    end
+
+    private

+    # Extracts input fields from the sample record's LLM context.
+    #
+    # @return [Hash<Symbol, Class>] Map of field names to their inferred types
+    def extract_input_fields
+      context = @sample_record.to_llm_context
+      context.transform_values { |value| infer_type(value) }
+    end
+
+    # Infers the Ruby type for a given value.
+    #
+    # @param value [Object] The value to analyze
+    # @return [Class] The inferred type (String, Integer, Float, Array, or Hash)
+    def infer_type(value)
+      case value
+      when String then String
+      when Integer then Integer
+      when Float then Float
+      when Array then Array
+      when Hash then Hash
+      else String
+      end
+    end
+
+    # Infers the output type from ground truth.
+    #
+    # @param ground_truth [Object] The ground truth value to analyze
+    # @return [Symbol] The output type (:string, :array, or :hash)
+    def infer_output_type(ground_truth)
+      case ground_truth
+      when String then :string
+      when Array then :array
+      when Hash then :hash
+      else :string
+      end
+    end
+
+    # Generates a description for the signature based on the dataset.
+    #
+    # @return [String] A descriptive string for the signature
+    def generate_description
+      # Analyze ground truth values to determine the task type
+      ground_truths = @dataset.dataset_records.limit(20).map { |r| r.recordable.ground_truth }.compact
+      unique_outputs = ground_truths.uniq
+
+      if unique_outputs.size <= 10
+        # Classification task - be explicit about the output format
+        "Classify the input. Respond with ONLY one of these exact values, nothing else: #{unique_outputs.join(', ')}"
+      else
+        # Generation task
+        "Generate output for the given input from dataset: #{@dataset.name}"
+      end
+    end
+
+    # Builds the DSPy::Signature class dynamically.
+    #
+    # @param input_fields [Hash<Symbol, Class>] Input field definitions
+    # @param output_type [Symbol] The output type
+    # @param description [String] Description for the signature
+    # @return [Class] The generated DSPy::Signature subclass
+    # @raise [Leva::DspyConfigurationError] If DSPy is not available
+    def build_signature_class(input_fields, output_type, description)
+      unless defined?(DSPy::Signature)
+        raise DspyConfigurationError, "DSPy is required for signature generation"
+      end
+
+      captured_input_fields = input_fields
+      captured_description = description
+
+      Class.new(DSPy::Signature) do
+        description captured_description
+
+        input do
+          captured_input_fields.each do |name, _type|
+            const name, String
+          end
+        end
+
+        output do
+          const :output, String
+        end
+      end
+    end
+  end
+end
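Note what build_signature_class actually emits: although infer_type and infer_output_type compute richer types, the generated class declares every input field and the single output as String. For a record whose to_llm_context returns { text: ..., source: ... } (hypothetical field names and labels), generate produces roughly the equivalent of this hand-written signature:

# Hand-written equivalent of the dynamically built class (sketch)
class GeneratedSignature < DSPy::Signature
  description "Classify the input. Respond with ONLY one of these exact values, nothing else: positive, negative"

  input do
    const :text, String    # one const per to_llm_context key
    const :source, String
  end

  output do
    const :output, String  # always a single String output
  end
end

predictor = DSPy::Predict.new(GeneratedSignature)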
data/app/views/leva/datasets/show.html.erb
@@ -84,6 +84,9 @@
   <% end %>
 </section>
 
+<%# Optimized Prompts Section - TODO: Enable when DSPy routes are added %>
+<%# This feature is available in the PromptOptimizer service but UI routes are pending %>
+
 <%# Experiments Section %>
 <section>
   <div class="section-header">
data/app/views/leva/experiments/_experiment.html.erb
@@ -8,6 +8,9 @@
     else 'status-dot-pending'
   end
   run_count = experiment.runner_results.count
+
+  # Group evaluation results by evaluator_class to avoid N+1 queries
+  grouped_results = experiment.evaluation_results.group_by(&:evaluator_class)
 %>
 <tr class="experiment-row" onclick="window.location='<%= experiment_path(experiment) %>'">
   <td>
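The grouped_results change trades one query per evaluator column for a single load: group_by(&:evaluator_class) walks the loaded association once and partitions it in memory. A sketch of the difference (evaluator name hypothetical):

# Before: one query per evaluator column rendered in the row
results = experiment.evaluation_results.where(evaluator_class: "SentimentAccuracyEval")

# After: one query total, then an in-memory lookup per column
grouped_results = experiment.evaluation_results.group_by(&:evaluator_class)
# => { "SentimentAccuracyEval" => [#<Leva::EvaluationResult ...>, ...], ... }
results = grouped_results["SentimentAccuracyEval"] || []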
@@ -21,6 +24,9 @@
   <td>
     <span class="cell-dataset"><%= experiment.dataset&.name || '—' %></span>
   </td>
+  <td>
+    <span class="cell-model font-mono text-sm"><%= experiment.metadata&.dig("model") || '—' %></span>
+  </td>
   <td class="text-right text-nowrap">
     <span class="cell-timestamp"><%= time_ago_in_words(experiment.created_at) %></span>
   </td>
@@ -33,22 +39,15 @@
   <td class="text-right">
     <span class="cell-count"><%= run_count %></span>
   </td>
-  <% Leva::EvaluationResult.distinct.pluck(:evaluator_class).each do |evaluator_class| %>
+  <% @evaluator_classes.each do |evaluator_class| %>
     <td class="text-right">
-      <% results = experiment.evaluation_results.where(evaluator_class: evaluator_class) %>
+      <% results = grouped_results[evaluator_class] || [] %>
       <% if results.any? %>
         <%
           avg_score = (results.sum(&:score) / results.size.to_f)
           score_pct = (avg_score * 100).round
-          score_class = case avg_score
-                        when 0...0.2 then 'score-bad'
-                        when 0.2...0.4 then 'score-poor'
-                        when 0.4...0.6 then 'score-fair'
-                        when 0.6...0.8 then 'score-good'
-                        else 'score-excellent'
-                        end
         %>
-        <span class="score-pill <%= score_class %>"><%= score_pct %>%</span>
+        <span class="score-pill <%= score_class(avg_score) %>"><%= score_pct %>%</span>
       <% else %>
         <span class="score-empty">—</span>
       <% end %>
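The inline case expression deleted here (and twice more in experiments/show.html.erb below) is replaced by a score_class helper call. The helper itself is not shown in this excerpt, but given the application_helper.rb changes in the file list it plausibly looks like this, reconstructed from the removed thresholds:

# Sketch of the consolidated helper; actual location/signature not shown in this diff
module Leva
  module ApplicationHelper
    # Maps a 0.0-1.0 score to the CSS class the views previously computed inline
    def score_class(score)
      case score
      when 0...0.2 then "score-bad"
      when 0.2...0.4 then "score-poor"
      when 0.4...0.6 then "score-fair"
      when 0.6...0.8 then "score-good"
      else "score-excellent"
      end
    end
  end
end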
data/app/views/leva/experiments/_form.html.erb
@@ -50,6 +50,16 @@
                 class: "form-select" %>
     <p class="form-hint">The runner executes your model logic for each dataset record.</p>
   </div>
+
+  <div class="form-group" id="model-selection-group">
+    <label for="experiment_metadata_model" class="form-label">Model (for LLM runners)</label>
+    <select name="experiment[metadata][model]" id="experiment_metadata_model" class="form-select">
+      <% Leva::PromptOptimizer.available_models.each do |m| %>
+        <option value="<%= m.id %>" <%= 'selected' if @experiment.metadata&.dig("model") == m.id || (@experiment.metadata.blank? && m.id == "gemini-2.5-flash") %>><%= m.name %></option>
+      <% end %>
+    </select>
+    <p class="form-hint">The AI model to use when running LLM-based runners like SentimentLlmRun.</p>
+  </div>
 </div>
 
 <hr class="form-divider">
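Because the select posts a nested experiment[metadata][model] parameter, the controller must permit it. The experiments_controller.rb changes (+14 -6 in the file list) are not shown in this excerpt, but the strong-parameters line would plausibly resemble:

# Hypothetical sketch - the actual permit list in experiments_controller.rb
# is not part of this excerpt, and the other attribute names are assumptions.
params.require(:experiment).permit(:name, :dataset_id, :prompt_id, :runner_class, metadata: [ :model ])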
data/app/views/leva/experiments/index.html.erb
@@ -21,10 +21,11 @@
       <tr>
         <th>Experiment</th>
         <th style="width: 140px;">Dataset</th>
+        <th style="width: 140px;">Model</th>
         <th class="text-right" style="width: 90px;">Created</th>
         <th class="text-center" style="width: 90px;">Status</th>
         <th class="text-right" style="width: 60px;">Runs</th>
-        <% Leva::EvaluationResult.distinct.pluck(:evaluator_class).each do |evaluator_class| %>
+        <% @evaluator_classes.each do |evaluator_class| %>
           <%
             # Clean up evaluator name: "SentimentAccuracyEval" -> "Accuracy"
             # Remove common prefixes/suffixes and module names
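Both this header row and the per-row partial above now read @evaluator_classes instead of plucking inside the view, so the distinct query runs once per request rather than once per render. A plausible controller-side assignment (the controller diffs are not shown in this excerpt):

# In the controller action that renders this index (sketch)
@evaluator_classes = Leva::EvaluationResult.distinct.pluck(:evaluator_class)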
data/app/views/leva/experiments/show.html.erb
@@ -55,12 +55,27 @@
   </div>
   <div class="exp-meta-item">
     <span class="exp-meta-label">Prompt</span>
-    <span class="exp-meta-value"><%= @experiment.prompt ? @experiment.prompt.name : '—' %></span>
+    <span class="exp-meta-value">
+      <% if @experiment.prompt %>
+        <%= @experiment.prompt.name %>
+        <% if @experiment.prompt.optimized? %>
+          <span class="badge badge-optimized" title="Generated by <%= @experiment.prompt.optimizer_name&.titleize || 'optimizer' %>">Optimized</span>
+        <% end %>
+      <% else %>
+        —
+      <% end %>
+    </span>
   </div>
   <div class="exp-meta-item">
     <span class="exp-meta-label">Runner</span>
     <span class="exp-meta-value font-mono text-sm"><%= @experiment.runner_class&.demodulize || '—' %></span>
   </div>
+  <% if @experiment.metadata&.dig("model").present? %>
+    <div class="exp-meta-item">
+      <span class="exp-meta-label">Model</span>
+      <span class="exp-meta-value font-mono text-sm"><%= @experiment.metadata["model"] %></span>
+    </div>
+  <% end %>
   <div class="exp-meta-item">
     <span class="exp-meta-label">Created</span>
     <span class="exp-meta-value"><%= time_ago_in_words(@experiment.created_at) %> ago</span>
@@ -79,13 +94,6 @@
         <%
           avg_score = (results.sum(&:score) / results.size.to_f).round(2)
           score_pct = (avg_score * 100).round
-          score_class = case avg_score
-                        when 0...0.2 then 'score-bad'
-                        when 0.2...0.4 then 'score-poor'
-                        when 0.4...0.6 then 'score-fair'
-                        when 0.6...0.8 then 'score-good'
-                        else 'score-excellent'
-                        end
           short_name = evaluator_class.demodulize
                          .gsub(/Evaluator$/, '')
                          .gsub(/Eval$/, '')
@@ -93,10 +101,10 @@
           short_name = short_name.presence || evaluator_class.demodulize.gsub(/Eval(uator)?$/, '')
         %>
         <div class="eval-summary-card" title="<%= results.size %> evaluations">
-          <span class="eval-summary-score <%= score_class %>"><%= score_pct %><span class="eval-summary-pct">%</span></span>
+          <span class="eval-summary-score <%= score_class(avg_score) %>"><%= score_pct %><span class="eval-summary-pct">%</span></span>
           <span class="eval-summary-name"><%= short_name %></span>
           <div class="eval-summary-bar">
-            <div class="eval-summary-bar-fill <%= score_class %>" style="width: <%= score_pct %>%"></div>
+            <div class="eval-summary-bar-fill <%= score_class(avg_score) %>" style="width: <%= score_pct %>%"></div>
           </div>
           <span class="eval-summary-count"><%= results.size %> runs</span>
         </div>
@@ -139,7 +147,7 @@
         <span class="row-title"><%= runner_result.dataset_record.display_name %></span>
       </td>
       <td>
-        <span class="prediction-badge"><%= truncate(runner_result.prediction.to_s.strip, length: 25) %></span>
+        <span class="prediction-badge"><%= truncate(runner_result.parsed_predictions.first.to_s.presence || runner_result.prediction.to_s.strip, length: 25) %></span>
       </td>
       <td class="text-muted"><%= truncate(runner_result.ground_truth.to_s.strip.presence || '—', length: 25) %></td>
       <% @experiment.evaluation_results.group_by(&:evaluator_class).keys.each do |evaluator_class| %>
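parsed_predictions is new here and its definition is not in this excerpt; the .first.to_s.presence || prediction fallback implies it returns an array of extracted answers that may be empty. A purely hypothetical shape:

# Hypothetical - the actual extraction logic is not shown in this diff
def parsed_predictions
  Array(JSON.parse(prediction.to_s))
rescue JSON::ParserError
  []
end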
@@ -147,16 +155,7 @@
         <% eval_result = runner_result.evaluation_results.find_by(evaluator_class: evaluator_class) %>
         <% if eval_result %>
           <% score = eval_result.score %>
-          <%
-            score_class = case score
-                          when 0...0.2 then 'score-bad'
-                          when 0.2...0.4 then 'score-poor'
-                          when 0.4...0.6 then 'score-fair'
-                          when 0.6...0.8 then 'score-good'
-                          else 'score-excellent'
-                          end
-          %>
-          <span class="score-inline <%= score_class %>"><%= sprintf('%.2f', score) %></span>
+          <span class="score-inline <%= score_class(score) %>"><%= sprintf('%.2f', score) %></span>
         <% else %>
           <span class="text-subtle">—</span>
         <% end %>