leva 0.2.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +55 -1
- data/app/assets/stylesheets/leva/application.css +165 -25
- data/app/controllers/leva/dataset_optimizations_controller.rb +64 -0
- data/app/controllers/leva/experiments_controller.rb +14 -6
- data/app/controllers/leva/workbench_controller.rb +26 -10
- data/app/helpers/leva/application_helper.rb +32 -16
- data/app/models/leva/dataset.rb +1 -0
- data/app/models/leva/experiment.rb +1 -0
- data/app/models/leva/optimization_run.rb +137 -0
- data/app/models/leva/prompt.rb +10 -0
- data/app/services/leva/class_loader.rb +37 -0
- data/app/services/leva/dataset_converter.rb +64 -0
- data/app/services/leva/optimizers/base.rb +183 -0
- data/app/services/leva/optimizers/bootstrap.rb +92 -0
- data/app/services/leva/optimizers/gepa_optimizer.rb +59 -0
- data/app/services/leva/optimizers/miprov2_optimizer.rb +52 -0
- data/app/services/leva/prompt_optimizer.rb +305 -0
- data/app/services/leva/signature_generator.rb +129 -0
- data/app/views/leva/datasets/show.html.erb +3 -0
- data/app/views/leva/experiments/_experiment.html.erb +9 -10
- data/app/views/leva/experiments/_form.html.erb +10 -0
- data/app/views/leva/experiments/index.html.erb +2 -1
- data/app/views/leva/experiments/show.html.erb +20 -21
- data/app/views/leva/optimization_runs/show.html.erb +698 -0
- data/app/views/leva/runner_results/show.html.erb +18 -48
- data/app/views/leva/workbench/_results_section.html.erb +3 -11
- data/db/migrate/20241204000001_create_leva_optimization_runs.rb +25 -0
- data/lib/generators/leva/templates/eval.rb.erb +4 -2
- data/lib/leva/errors.rb +18 -0
- data/lib/leva/version.rb +1 -1
- data/lib/leva.rb +1 -0
- metadata +16 -3
data/app/services/leva/prompt_optimizer.rb (new file)
@@ -0,0 +1,305 @@
```ruby
# frozen_string_literal: true

module Leva
  # Optimizes prompts using DSPy.rb optimizers.
  #
  # This service coordinates the optimization process, delegating
  # the actual optimization work to strategy classes.
  #
  # @example Optimize a prompt for a dataset
  #   optimizer = Leva::PromptOptimizer.new(dataset: dataset, mode: :medium)
  #   result = optimizer.optimize
  #   # => { system_prompt: "...", user_prompt: "...", metadata: {...} }
  #
  # @example With GEPA optimizer
  #   optimizer = Leva::PromptOptimizer.new(dataset: dataset, optimizer: :gepa, mode: :medium)
  #   result = optimizer.optimize
  class PromptOptimizer
    # Minimum number of examples required for optimization
    MINIMUM_EXAMPLES = 10

    # Available optimizers with their strategy classes
    OPTIMIZERS = {
      bootstrap: {
        name: "Bootstrap",
        strategy_class: Leva::Optimizers::Bootstrap,
        gem: nil,
        description: "Fast and simple. Automatically selects optimal few-shot examples from your dataset. " \
                     "Best for quick iteration and when you have limited data (10-50 examples). " \
                     "Does not modify instructions, only adds demonstrations."
      },
      gepa: {
        name: "GEPA",
        strategy_class: Leva::Optimizers::GepaOptimizer,
        gem: "dspy-gepa",
        description: "State-of-the-art optimizer using reflective prompt evolution. Uses LLM reflection " \
                     "to identify what works and propose improvements. Outperforms MIPROv2 by 10-14% " \
                     "while being more sample efficient. Best choice for maximum quality."
      },
      miprov2: {
        name: "MIPROv2",
        strategy_class: Leva::Optimizers::Miprov2Optimizer,
        gem: "dspy-miprov2",
        description: "Uses Bayesian optimization to search for optimal instructions and few-shot examples. " \
                     "Good for larger datasets (200+ examples). More computationally demanding but thorough. " \
                     "Can overfit on small datasets."
      }
    }.freeze

    # Default optimizer
    DEFAULT_OPTIMIZER = :bootstrap

    # Optimization modes with their approximate durations
    MODES = {
      light: { description: "Fast optimization (~5 min)", trials: 5 },
      medium: { description: "Balanced optimization (~15 min)", trials: 15 },
      heavy: { description: "Thorough optimization (~30 min)", trials: 30 }
    }.freeze

    # Default model if none specified (fast and cheap)
    DEFAULT_MODEL = "gemini-2.5-flash"

    # Returns available models from RubyLLM.
    # Results are cached for 5 minutes to avoid repeated expensive calls.
    #
    # @return [Array<RubyLLM::Model>] All available chat models
    def self.available_models
      Rails.cache.fetch("leva/available_models", expires_in: 5.minutes) do
        RubyLLM.models.chat_models
      end
    end

    # Finds a model by ID.
    #
    # @param model_id [String] The model ID to find
    # @return [RubyLLM::Model, nil] The model or nil if not found
    def self.find_model(model_id)
      RubyLLM.models.find(model_id)
    rescue RubyLLM::ModelNotFoundError
      nil
    end

    # @return [Leva::Dataset] The dataset being optimized
    attr_reader :dataset

    # @return [Symbol] The optimization mode (:light, :medium, :heavy)
    attr_reader :mode

    # @return [String] The model to use for optimization
    attr_reader :model

    # @return [Symbol] The optimizer to use (:bootstrap, :gepa, :miprov2)
    attr_reader :optimizer

    # @param dataset [Leva::Dataset] The dataset to optimize for
    # @param metric [Proc, nil] Custom evaluation metric (default: exact string match)
    # @param mode [Symbol] Optimization intensity (:light, :medium, :heavy)
    # @param model [String, nil] The model to use (default: DEFAULT_MODEL)
    # @param optimizer [Symbol, String] The optimizer to use (default: :bootstrap)
    # @param progress_callback [Proc, nil] Callback for progress updates
    def initialize(dataset:, metric: nil, mode: :light, model: nil, optimizer: nil, progress_callback: nil)
      @dataset = dataset
      @metric = metric || default_metric
      @mode = mode.to_sym
      @model = model.presence || DEFAULT_MODEL
      @optimizer = (optimizer.presence || DEFAULT_OPTIMIZER).to_sym
      @progress_callback = progress_callback
      @last_progress = nil
    end

    # Runs the optimization process.
    #
    # @return [Hash] Hash containing :system_prompt, :user_prompt, and :metadata
    # @raise [Leva::InsufficientDataError] If dataset has too few records
    # @raise [Leva::DspyConfigurationError] If DSPy is not configured
    def optimize
      report_progress(step: "validating", progress: 0)
      validate_dataset!
      validate_dspy_configuration!
      validate_optimizer!

      report_progress(step: "splitting_data", progress: 10)
      splits = DatasetConverter.new(@dataset).split

      report_progress(step: "generating_signature", progress: 20)
      signature = SignatureGenerator.new(@dataset).generate

      # Delegate to optimizer strategy
      strategy = build_optimizer_strategy
      result = strategy.optimize(splits, signature)

      report_progress(step: "complete", progress: 100)

      build_final_result(result, splits, strategy.optimizer_type)
    end

    # Checks if the dataset is ready for optimization.
    #
    # @return [Boolean] True if the dataset can be optimized
    def can_optimize?
      @dataset.dataset_records.count >= MINIMUM_EXAMPLES
    end

    # Returns the number of additional records needed for optimization.
    #
    # @return [Integer] Number of records still needed (0 if ready)
    def records_needed
      [ MINIMUM_EXAMPLES - @dataset.dataset_records.count, 0 ].max
    end

    # Checks if a specific optimizer is available.
    #
    # @param optimizer_type [Symbol] The optimizer to check
    # @return [Boolean] True if the optimizer is available
    def self.optimizer_available?(optimizer_type)
      optimizer_type = optimizer_type.to_sym
      return true if optimizer_type == :bootstrap

      case optimizer_type
      when :gepa
        !!defined?(DSPy::Teleprompt::GEPA)
      when :miprov2
        !!defined?(DSPy::Teleprompt::MIPROv2)
      else
        false
      end
    end

    private

    # Builds the optimizer strategy instance.
    #
    # @return [Leva::Optimizers::Base] The optimizer strategy
    def build_optimizer_strategy
      config = OPTIMIZERS[@optimizer]
      config[:strategy_class].new(
        model: @model,
        metric: @metric,
        mode: @mode,
        progress_callback: @progress_callback
      )
    end

    # Builds the final result hash from optimization.
    #
    # @param result [Hash] The optimizer result with :instruction, :few_shot_examples, :score
    # @param splits [Hash] The data splits
    # @param optimizer_type [Symbol] The optimizer that was used
    # @return [Hash] The formatted result
    def build_final_result(result, splits, optimizer_type)
      sample_record = @dataset.dataset_records.first&.recordable
      input_fields = sample_record&.to_llm_context&.keys || []

      formatted_examples = result[:few_shot_examples].map do |ex|
        { input: ex[:input], output: ex.dig(:expected, :output) }
      end

      {
        system_prompt: result[:instruction],
        user_prompt: build_user_prompt_template(input_fields),
        metadata: {
          optimization: {
            score: result[:score],
            mode: @mode.to_s,
            optimizer: optimizer_type.to_s,
            model: @model,
            few_shot_examples: formatted_examples,
            optimized_at: Time.current.iso8601,
            dataset_size: @dataset.dataset_records.count,
            train_size: splits[:train].size,
            val_size: splits[:val].size,
            test_size: splits[:test].size
          }
        }
      }
    end

    # Reports progress to the callback if provided.
    # Throttles updates to only report when progress changes by 5% or more.
    #
    # @param step [String] Current step name
    # @param progress [Integer] Progress percentage (0-100)
    # @param examples_processed [Integer, nil] Number of examples processed
    # @param total [Integer, nil] Total examples to process
    # @return [void]
    def report_progress(step:, progress:, examples_processed: nil, total: nil)
      return unless @progress_callback

      # Skip if progress hasn't changed by at least 5%
      return if @last_progress && (progress - @last_progress).abs < 5

      @last_progress = progress
      @progress_callback.call(
        step: step,
        progress: progress,
        examples_processed: examples_processed,
        total: total
      )
    end

    # Validates that the dataset has enough records.
    #
    # @raise [Leva::InsufficientDataError] If dataset has too few records
    def validate_dataset!
      count = @dataset.dataset_records.count
      return if count >= MINIMUM_EXAMPLES

      raise InsufficientDataError,
            "Dataset needs at least #{MINIMUM_EXAMPLES} records for optimization, has #{count}"
    end

    # Validates that DSPy is properly configured.
    #
    # @raise [Leva::DspyConfigurationError] If DSPy is not configured
    def validate_dspy_configuration!
      unless defined?(DSPy) && defined?(DSPy::Predict)
        raise DspyConfigurationError, "DSPy is not installed. Add 'dspy' gem to your Gemfile."
      end
    end

    # Validates that the selected optimizer is available.
    #
    # @raise [Leva::DspyConfigurationError] If optimizer is not available
    def validate_optimizer!
      return if @optimizer == :bootstrap
      return if self.class.optimizer_available?(@optimizer)

      gem_name = OPTIMIZERS.dig(@optimizer, :gem)
      raise DspyConfigurationError, <<~MSG.strip
        #{@optimizer.to_s.upcase} optimizer is not available. Install it:

          gem 'dspy'
          gem '#{gem_name}'

        Or set DSPY_WITH_#{@optimizer.to_s.upcase}=1 before requiring dspy.
      MSG
    end

    # Returns the default evaluation metric (case-insensitive exact match).
    # Handles both Hash examples and DSPy::Example objects.
    #
    # @return [Proc] The default metric function
    def default_metric
      lambda do |example, prediction|
        # Handle both Hash and DSPy::Example
        expected_output = if example.is_a?(Hash)
          example.dig(:expected, :output)
        else
          # DSPy::Example has expected_values method to get Hash
          example.expected_values[:output]
        end
        expected = expected_output.to_s.strip.downcase
        actual = prediction.to_s.strip.downcase
        expected == actual ? 1.0 : 0.0
      end
    end

    # Builds the user prompt template with Liquid placeholders.
    #
    # @param input_fields [Array<Symbol>] The input field names
    # @return [String] The user prompt template
    def build_user_prompt_template(input_fields)
      input_fields.map { |field| "{{ #{field} }}" }.join("\n\n")
    end
  end
end
```
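A minimal usage sketch assembled from the service above. The constructor keywords, the progress-callback keywords, and the result shape all come from the code; the `dataset` variable and the logging line are illustrative.

```ruby
# Sketch only: assumes `dataset` is a Leva::Dataset with >= 10 records.
optimizer = Leva::PromptOptimizer.new(
  dataset: dataset,
  optimizer: :bootstrap, # :gepa / :miprov2 require their extra gems (see OPTIMIZERS)
  mode: :light,
  progress_callback: lambda do |step:, progress:, examples_processed:, total:|
    Rails.logger.info("[leva optimize] #{step}: #{progress}%")
  end
)

if optimizer.can_optimize?
  result = optimizer.optimize
  result[:system_prompt]                   # optimized instruction text
  result[:user_prompt]                     # e.g. "{{ text }}" (Liquid placeholders)
  result[:metadata][:optimization][:score] # score reported by the strategy
else
  optimizer.records_needed # how many more dataset records are required
end
```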
data/app/services/leva/signature_generator.rb (new file)
@@ -0,0 +1,129 @@
```ruby
# frozen_string_literal: true

module Leva
  # Generates DSPy signatures from Leva dataset records.
  #
  # This service analyzes the structure of dataset records and generates
  # a dynamic DSPy::Signature class that matches the input/output schema.
  #
  # @example Generate a signature from a dataset
  #   generator = Leva::SignatureGenerator.new(dataset)
  #   signature_class = generator.generate
  #   predictor = DSPy::Predict.new(signature_class)
  class SignatureGenerator
    # @param dataset [Leva::Dataset] The dataset to analyze
    # @param description [String, nil] Optional description for the signature
    def initialize(dataset, description: nil)
      @dataset = dataset
      @description = description
      @sample_record = dataset.dataset_records.first&.recordable
    end

    # Generates a DSPy::Signature class based on the dataset structure.
    #
    # @return [Class, nil] A dynamically generated DSPy::Signature subclass, or nil if no sample
    def generate
      return nil unless @sample_record

      input_fields = extract_input_fields
      output_type = infer_output_type(@sample_record.ground_truth)
      description = @description || generate_description

      build_signature_class(input_fields, output_type, description)
    end

    # Returns the input field names that will be used in the signature.
    #
    # @return [Array<Symbol>] Array of input field names
    def input_field_names
      return [] unless @sample_record

      extract_input_fields.keys
    end

    private

    # Extracts input fields from the sample record's LLM context.
    #
    # @return [Hash<Symbol, Class>] Map of field names to their inferred types
    def extract_input_fields
      context = @sample_record.to_llm_context
      context.transform_values { |value| infer_type(value) }
    end

    # Infers the Ruby type for a given value.
    #
    # @param value [Object] The value to analyze
    # @return [Class] The inferred type (String, Integer, Float, Array, or Hash)
    def infer_type(value)
      case value
      when String then String
      when Integer then Integer
      when Float then Float
      when Array then Array
      when Hash then Hash
      else String
      end
    end

    # Infers the output type from ground truth.
    #
    # @param ground_truth [Object] The ground truth value to analyze
    # @return [Symbol] The output type (:string, :array, or :hash)
    def infer_output_type(ground_truth)
      case ground_truth
      when String then :string
      when Array then :array
      when Hash then :hash
      else :string
      end
    end

    # Generates a description for the signature based on the dataset.
    #
    # @return [String] A descriptive string for the signature
    def generate_description
      # Analyze ground truth values to determine task type
      ground_truths = @dataset.dataset_records.limit(20).map { |r| r.recordable.ground_truth }.compact
      unique_outputs = ground_truths.uniq

      if unique_outputs.size <= 10
        # Classification task - be explicit about output format
        "Classify the input. Respond with ONLY one of these exact values, nothing else: #{unique_outputs.join(', ')}"
      else
        # Generation task
        "Generate output for the given input from dataset: #{@dataset.name}"
      end
    end

    # Builds the DSPy::Signature class dynamically.
    #
    # @param input_fields [Hash<Symbol, Class>] Input field definitions
    # @param output_type [Symbol] The output type
    # @param description [String] Description for the signature
    # @return [Class] The generated DSPy::Signature subclass
    # @raise [Leva::DspyConfigurationError] If DSPy is not available
    def build_signature_class(input_fields, output_type, description)
      unless defined?(DSPy::Signature)
        raise DspyConfigurationError, "DSPy is required for signature generation"
      end

      captured_input_fields = input_fields
      captured_description = description

      Class.new(DSPy::Signature) do
        description captured_description

        input do
          captured_input_fields.each do |name, _type|
            const name, String
          end
        end

        output do
          const :output, String
        end
      end
    end
  end
end
```
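To make the dynamic class concrete: for a record whose `to_llm_context` returns `{ text: "..." }` and whose ground truths are the labels `positive` and `negative` (hypothetical example values, not from the diff), `#generate` builds something equivalent to this hand-written signature:

```ruby
# Hand-written equivalent of the Class.new(DSPy::Signature) block above,
# assuming one input field (:text) and the example labels named in the lead-in.
class EquivalentSignature < DSPy::Signature
  description "Classify the input. Respond with ONLY one of these exact values, nothing else: positive, negative"

  input do
    const :text, String
  end

  output do
    const :output, String
  end
end

predictor = DSPy::Predict.new(EquivalentSignature) # as in the class-level @example
```

Note that the emitted signature declares every input and the output as `String` (`const name, String`); the richer types inferred by `extract_input_fields` and `infer_output_type` inform the analysis but are not yet reflected in the generated class.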
data/app/views/leva/datasets/show.html.erb
```diff
@@ -84,6 +84,9 @@
   <% end %>
 </section>
 
+<%# Optimized Prompts Section - TODO: Enable when DSPy routes are added %>
+<%# This feature is available in the PromptOptimizer service but UI routes are pending %>
+
 <%# Experiments Section %>
 <section>
   <div class="section-header">
```
data/app/views/leva/experiments/_experiment.html.erb
```diff
@@ -8,6 +8,9 @@
     else 'status-dot-pending'
   end
   run_count = experiment.runner_results.count
+
+  # Group evaluation results by evaluator_class to avoid N+1 queries
+  grouped_results = experiment.evaluation_results.group_by(&:evaluator_class)
 %>
 <tr class="experiment-row" onclick="window.location='<%= experiment_path(experiment) %>'">
   <td>
@@ -21,6 +24,9 @@
   <td>
     <span class="cell-dataset"><%= experiment.dataset&.name || '—' %></span>
   </td>
+  <td>
+    <span class="cell-model font-mono text-sm"><%= experiment.metadata&.dig("model") || '—' %></span>
+  </td>
   <td class="text-right text-nowrap">
     <span class="cell-timestamp"><%= time_ago_in_words(experiment.created_at) %></span>
   </td>
@@ -33,22 +39,15 @@
   <td class="text-right">
     <span class="cell-count"><%= run_count %></span>
   </td>
-  <%
+  <% @evaluator_classes.each do |evaluator_class| %>
   <td class="text-right">
-    <% results =
+    <% results = grouped_results[evaluator_class] || [] %>
     <% if results.any? %>
       <%
         avg_score = (results.sum(&:score) / results.size.to_f)
         score_pct = (avg_score * 100).round
-        score_class = case avg_score
-                      when 0...0.2 then 'score-bad'
-                      when 0.2...0.4 then 'score-poor'
-                      when 0.4...0.6 then 'score-fair'
-                      when 0.6...0.8 then 'score-good'
-                      else 'score-excellent'
-                      end
       %>
-      <span class="score-pill <%= score_class %>"><%= score_pct %>%</span>
+      <span class="score-pill <%= score_class(avg_score) %>"><%= score_pct %>%</span>
     <% else %>
       <span class="score-empty">—</span>
     <% end %>
```
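The repeated inline `case` blocks are replaced across the views by calls to a shared `score_class` helper. Its definition is not shown in this diff, but it plausibly lands in `data/app/helpers/leva/application_helper.rb` (+32 -16 in the file list above); a sketch consistent with the thresholds removed here:

```ruby
# Plausible shape of the shared helper the views now call; the thresholds
# are taken verbatim from the inline case statements deleted in this release.
module Leva
  module ApplicationHelper
    # Maps a 0.0-1.0 score to a CSS class for color coding.
    def score_class(score)
      case score
      when 0...0.2 then "score-bad"
      when 0.2...0.4 then "score-poor"
      when 0.4...0.6 then "score-fair"
      when 0.6...0.8 then "score-good"
      else "score-excellent"
      end
    end
  end
end
```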
data/app/views/leva/experiments/_form.html.erb
```diff
@@ -50,6 +50,16 @@
                 class: "form-select" %>
     <p class="form-hint">The runner executes your model logic for each dataset record.</p>
   </div>
+
+  <div class="form-group" id="model-selection-group">
+    <label for="experiment_metadata_model" class="form-label">Model (for LLM runners)</label>
+    <select name="experiment[metadata][model]" id="experiment_metadata_model" class="form-select">
+      <% Leva::PromptOptimizer.available_models.each do |m| %>
+        <option value="<%= m.id %>" <%= 'selected' if @experiment.metadata&.dig("model") == m.id || (@experiment.metadata.blank? && m.id == "gemini-2.5-flash") %>><%= m.name %></option>
+      <% end %>
+    </select>
+    <p class="form-hint">The AI model to use when running LLM-based runners like SentimentLlmRun.</p>
+  </div>
 </div>
 
 <hr class="form-divider">
```
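The selected model is stored under `experiment.metadata["model"]`; how a runner consumes it is not part of this diff. A hypothetical sketch, assuming the runner can reach its experiment via an accessor and a `text` attribute on the record (`SentimentLlmRun` is only named in the form hint above):

```ruby
# Hypothetical runner sketch -- the `experiment` accessor, the record's `text`
# attribute, and the prompt wording are assumptions, not the gem's code.
# RubyLLM.chat(model:) and Chat#ask are RubyLLM's chat API.
class SentimentLlmRun < Leva::BaseRun
  def execute(record)
    model = experiment&.metadata&.dig("model") || Leva::PromptOptimizer::DEFAULT_MODEL
    chat = RubyLLM.chat(model: model)
    chat.ask("Classify the sentiment of: #{record.text}").content
  end
end
```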
data/app/views/leva/experiments/index.html.erb
```diff
@@ -21,10 +21,11 @@
   <tr>
     <th>Experiment</th>
     <th style="width: 140px;">Dataset</th>
+    <th style="width: 140px;">Model</th>
     <th class="text-right" style="width: 90px;">Created</th>
     <th class="text-center" style="width: 90px;">Status</th>
     <th class="text-right" style="width: 60px;">Runs</th>
-    <%
+    <% @evaluator_classes.each do |evaluator_class| %>
     <%
       # Clean up evaluator name: "SentimentAccuracyEval" -> "Accuracy"
       # Remove common prefixes/suffixes and module names
```
data/app/views/leva/experiments/show.html.erb
```diff
@@ -55,12 +55,27 @@
   </div>
   <div class="exp-meta-item">
     <span class="exp-meta-label">Prompt</span>
-    <span class="exp-meta-value"
+    <span class="exp-meta-value">
+      <% if @experiment.prompt %>
+        <%= @experiment.prompt.name %>
+        <% if @experiment.prompt.optimized? %>
+          <span class="badge badge-optimized" title="Generated by <%= @experiment.prompt.optimizer_name&.titleize || 'optimizer' %>">Optimized</span>
+        <% end %>
+      <% else %>
+        —
+      <% end %>
+    </span>
   </div>
   <div class="exp-meta-item">
     <span class="exp-meta-label">Runner</span>
     <span class="exp-meta-value font-mono text-sm"><%= @experiment.runner_class&.demodulize || '—' %></span>
   </div>
+  <% if @experiment.metadata&.dig("model").present? %>
+    <div class="exp-meta-item">
+      <span class="exp-meta-label">Model</span>
+      <span class="exp-meta-value font-mono text-sm"><%= @experiment.metadata["model"] %></span>
+    </div>
+  <% end %>
   <div class="exp-meta-item">
     <span class="exp-meta-label">Created</span>
     <span class="exp-meta-value"><%= time_ago_in_words(@experiment.created_at) %> ago</span>
@@ -79,13 +94,6 @@
       <%
         avg_score = (results.sum(&:score) / results.size.to_f).round(2)
         score_pct = (avg_score * 100).round
-        score_class = case avg_score
-                      when 0...0.2 then 'score-bad'
-                      when 0.2...0.4 then 'score-poor'
-                      when 0.4...0.6 then 'score-fair'
-                      when 0.6...0.8 then 'score-good'
-                      else 'score-excellent'
-                      end
         short_name = evaluator_class.demodulize
           .gsub(/Evaluator$/, '')
           .gsub(/Eval$/, '')
@@ -93,10 +101,10 @@
         short_name = short_name.presence || evaluator_class.demodulize.gsub(/Eval(uator)?$/, '')
       %>
       <div class="eval-summary-card" title="<%= results.size %> evaluations">
-        <span class="eval-summary-score <%= score_class %>"><%= score_pct %><span class="eval-summary-pct">%</span></span>
+        <span class="eval-summary-score <%= score_class(avg_score) %>"><%= score_pct %><span class="eval-summary-pct">%</span></span>
         <span class="eval-summary-name"><%= short_name %></span>
         <div class="eval-summary-bar">
-          <div class="eval-summary-bar-fill <%= score_class %>" style="width: <%= score_pct %>%"></div>
+          <div class="eval-summary-bar-fill <%= score_class(avg_score) %>" style="width: <%= score_pct %>%"></div>
         </div>
         <span class="eval-summary-count"><%= results.size %> runs</span>
       </div>
@@ -139,7 +147,7 @@
       <span class="row-title"><%= runner_result.dataset_record.display_name %></span>
     </td>
     <td>
-      <span class="prediction-badge"><%= truncate(runner_result.prediction.to_s.strip, length: 25) %></span>
+      <span class="prediction-badge"><%= truncate(runner_result.parsed_predictions.first.to_s.presence || runner_result.prediction.to_s.strip, length: 25) %></span>
     </td>
     <td class="text-muted"><%= truncate(runner_result.ground_truth.to_s.strip.presence || '—', length: 25) %></td>
     <% @experiment.evaluation_results.group_by(&:evaluator_class).keys.each do |evaluator_class| %>
@@ -147,16 +155,7 @@
       <% eval_result = runner_result.evaluation_results.find_by(evaluator_class: evaluator_class) %>
       <% if eval_result %>
         <% score = eval_result.score %>
-
-          score_class = case score
-                        when 0...0.2 then 'score-bad'
-                        when 0.2...0.4 then 'score-poor'
-                        when 0.4...0.6 then 'score-fair'
-                        when 0.6...0.8 then 'score-good'
-                        else 'score-excellent'
-                        end
-        %>
-        <span class="score-inline <%= score_class %>"><%= sprintf('%.2f', score) %></span>
+        <span class="score-inline <%= score_class(score) %>"><%= sprintf('%.2f', score) %></span>
       <% else %>
         <span class="text-subtle">—</span>
       <% end %>
```