leva 0.2.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +57 -0
- data/app/assets/stylesheets/leva/application.css +9 -0
- data/app/controllers/leva/dataset_optimizations_controller.rb +64 -0
- data/app/controllers/leva/experiments_controller.rb +14 -6
- data/app/controllers/leva/workbench_controller.rb +26 -10
- data/app/helpers/leva/application_helper.rb +32 -16
- data/app/models/leva/dataset.rb +1 -0
- data/app/models/leva/experiment.rb +1 -0
- data/app/models/leva/optimization_run.rb +137 -0
- data/app/models/leva/prompt.rb +10 -0
- data/app/services/leva/class_loader.rb +37 -0
- data/app/services/leva/dataset_converter.rb +64 -0
- data/app/services/leva/optimizers/base.rb +183 -0
- data/app/services/leva/optimizers/bootstrap.rb +92 -0
- data/app/services/leva/optimizers/gepa_optimizer.rb +59 -0
- data/app/services/leva/optimizers/miprov2_optimizer.rb +52 -0
- data/app/services/leva/prompt_optimizer.rb +305 -0
- data/app/services/leva/signature_generator.rb +129 -0
- data/app/views/leva/dataset_optimizations/new.html.erb +145 -0
- data/app/views/leva/datasets/show.html.erb +65 -0
- data/app/views/leva/experiments/_experiment.html.erb +9 -10
- data/app/views/leva/experiments/_form.html.erb +10 -0
- data/app/views/leva/experiments/index.html.erb +2 -1
- data/app/views/leva/experiments/show.html.erb +20 -21
- data/app/views/leva/optimization_runs/show.html.erb +698 -0
- data/app/views/leva/runner_results/show.html.erb +18 -48
- data/app/views/leva/workbench/_results_section.html.erb +1 -9
- data/config/routes.rb +2 -0
- data/db/migrate/20241204000001_create_leva_optimization_runs.rb +25 -0
- data/lib/generators/leva/templates/eval.rb.erb +4 -2
- data/lib/leva/errors.rb +18 -0
- data/lib/leva/version.rb +1 -1
- data/lib/leva.rb +1 -0
- metadata +17 -3

data/app/services/leva/signature_generator.rb

@@ -0,0 +1,129 @@
+# frozen_string_literal: true
+
+module Leva
+  # Generates DSPy signatures from Leva dataset records.
+  #
+  # This service analyzes the structure of dataset records and generates
+  # a dynamic DSPy::Signature class that matches the input/output schema.
+  #
+  # @example Generate a signature from a dataset
+  #   generator = Leva::SignatureGenerator.new(dataset)
+  #   signature_class = generator.generate
+  #   predictor = DSPy::Predict.new(signature_class)
+  class SignatureGenerator
+    # @param dataset [Leva::Dataset] The dataset to analyze
+    # @param description [String, nil] Optional description for the signature
+    def initialize(dataset, description: nil)
+      @dataset = dataset
+      @description = description
+      @sample_record = dataset.dataset_records.first&.recordable
+    end
+
+    # Generates a DSPy::Signature class based on the dataset structure.
+    #
+    # @return [Class, nil] A dynamically generated DSPy::Signature subclass, or nil if no sample
+    def generate
+      return nil unless @sample_record
+
+      input_fields = extract_input_fields
+      output_type = infer_output_type(@sample_record.ground_truth)
+      description = @description || generate_description
+
+      build_signature_class(input_fields, output_type, description)
+    end
+
+    # Returns the input field names that will be used in the signature.
+    #
+    # @return [Array<Symbol>] Array of input field names
+    def input_field_names
+      return [] unless @sample_record
+
+      extract_input_fields.keys
+    end
+
+    private
+
+    # Extracts input fields from the sample record's LLM context.
+    #
+    # @return [Hash<Symbol, Class>] Map of field names to their inferred types
+    def extract_input_fields
+      context = @sample_record.to_llm_context
+      context.transform_values { |value| infer_type(value) }
+    end
+
+    # Infers the Ruby type for a given value.
+    #
+    # @param value [Object] The value to analyze
+    # @return [Class] The inferred type (String, Integer, Float, Array, or Hash)
+    def infer_type(value)
+      case value
+      when String then String
+      when Integer then Integer
+      when Float then Float
+      when Array then Array
+      when Hash then Hash
+      else String
+      end
+    end
+
+    # Infers the output type from ground truth.
+    #
+    # @param ground_truth [Object] The ground truth value to analyze
+    # @return [Symbol] The output type (:string, :array, or :hash)
+    def infer_output_type(ground_truth)
+      case ground_truth
+      when String then :string
+      when Array then :array
+      when Hash then :hash
+      else :string
+      end
+    end
+
+    # Generates a description for the signature based on the dataset.
+    #
+    # @return [String] A descriptive string for the signature
+    def generate_description
+      # Analyze ground truth values to determine task type
+      ground_truths = @dataset.dataset_records.limit(20).map { |r| r.recordable.ground_truth }.compact
+      unique_outputs = ground_truths.uniq
+
+      if unique_outputs.size <= 10
+        # Classification task - be explicit about output format
+        "Classify the input. Respond with ONLY one of these exact values, nothing else: #{unique_outputs.join(', ')}"
+      else
+        # Generation task
+        "Generate output for the given input from dataset: #{@dataset.name}"
+      end
+    end
+
+    # Builds the DSPy::Signature class dynamically.
+    #
+    # @param input_fields [Hash<Symbol, Class>] Input field definitions
+    # @param output_type [Symbol] The output type
+    # @param description [String] Description for the signature
+    # @return [Class] The generated DSPy::Signature subclass
+    # @raise [Leva::DspyConfigurationError] If DSPy is not available
+    def build_signature_class(input_fields, output_type, description)
+      unless defined?(DSPy::Signature)
+        raise DspyConfigurationError, "DSPy is required for signature generation"
+      end
+
+      captured_input_fields = input_fields
+      captured_description = description
+
+      Class.new(DSPy::Signature) do
+        description captured_description
+
+        input do
+          captured_input_fields.each do |name, _type|
+            const name, String
+          end
+        end
+
+        output do
+          const :output, String
+        end
+      end
+    end
+  end
+end
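
For orientation, the @example in the class header corresponds to roughly the workflow below. This is a sketch rather than code shipped in the gem: the dataset lookup is a placeholder, and it assumes DSPy is installed and configured.

```ruby
# Illustrative use of the new service (mirrors the @example above).
# `dataset` is any Leva::Dataset whose records respond to #to_llm_context
# and #ground_truth; the lookup here is a placeholder.
dataset = Leva::Dataset.first

generator = Leva::SignatureGenerator.new(dataset, description: "Classify the input")
signature_class = generator.generate   # nil when the dataset has no records

if signature_class
  fields = generator.input_field_names # field names taken from the sample record's LLM context
  predictor = DSPy::Predict.new(signature_class)
end
```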

data/app/views/leva/dataset_optimizations/new.html.erb

@@ -0,0 +1,145 @@
+<% content_for :title, 'Optimize Prompt' %>
+<div class="container page">
+  <nav class="breadcrumb mb-4">
+    <%= link_to "Datasets", datasets_path, class: "breadcrumb-link" %>
+    <span class="breadcrumb-sep">/</span>
+    <%= link_to @dataset.name, dataset_path(@dataset), class: "breadcrumb-link" %>
+    <span class="breadcrumb-sep">/</span>
+    <span class="breadcrumb-current">Optimize</span>
+  </nav>
+
+  <div class="page-header mb-6">
+    <div>
+      <h1>Optimize Prompt</h1>
+      <p class="text-muted text-sm mt-1">Use DSPy to automatically optimize your prompt with few-shot examples</p>
+    </div>
+  </div>
+
+  <% if @can_optimize %>
+    <%= form_with url: dataset_optimization_path(@dataset), method: :post, local: true, class: "card p-6" do |form| %>
+      <%# Basic Information %>
+      <div class="form-section">
+        <div class="form-group">
+          <%= form.label :prompt_name, "Prompt Name", class: "form-label" %>
+          <%= form.text_field :prompt_name, value: "Optimized: #{@dataset.name}", autofocus: true, class: "form-input", placeholder: "e.g., Optimized Sentiment Classifier" %>
+          <p class="form-hint">Name for the new optimized prompt that will be created.</p>
+        </div>
+      </div>
+
+      <hr class="form-divider">
+
+      <%# Optimization Settings %>
+      <div class="form-section">
+        <h4 class="form-section-title">Optimization Settings</h4>
+
+        <div class="form-row">
+          <div class="form-group flex-1">
+            <%= form.label :optimizer, "Optimizer", class: "form-label" %>
+            <%= form.select :optimizer,
+                @optimizers.map { |k, v| [v[:name], k] },
+                {},
+                class: "form-select" %>
+            <p class="form-hint">Algorithm used to optimize the prompt.</p>
+          </div>
+
+          <div class="form-group flex-1">
+            <%= form.label :mode, "Mode", class: "form-label" %>
+            <%= form.select :mode,
+                @modes.map { |k, v| ["#{v[:name]} (#{v[:description]})", k] },
+                {},
+                class: "form-select" %>
+            <p class="form-hint">Higher modes use more examples but take longer.</p>
+          </div>
+        </div>
+
+        <div class="form-group">
+          <%= form.label :model, "Model", class: "form-label" %>
+          <%= form.select :model,
+              @models.map { |m| [m.name, m.id] },
+              {},
+              class: "form-select" %>
+          <p class="form-hint">The AI model to use during optimization.</p>
+        </div>
+      </div>
+
+      <hr class="form-divider">
+
+      <%# Dataset Info %>
+      <div class="form-section">
+        <h4 class="form-section-title">Dataset Information</h4>
+        <div class="info-grid">
+          <div class="info-item">
+            <span class="info-label">Records Available</span>
+            <span class="info-value"><%= @record_count %></span>
+          </div>
+          <div class="info-item">
+            <span class="info-label">Minimum Required</span>
+            <span class="info-value">10</span>
+          </div>
+        </div>
+      </div>
+
+      <div class="form-actions">
+        <%= link_to "Cancel", dataset_path(@dataset), class: "btn btn-ghost" %>
+        <%= form.submit "Start Optimization", class: "btn btn-primary" %>
+      </div>
+    <% end %>
+  <% else %>
+    <div class="card p-6">
+      <div class="setup-required">
+        <div class="setup-required-icon">
+          <svg class="icon-xl" fill="none" viewBox="0 0 24 24" stroke="currentColor">
+            <path stroke-linecap="round" stroke-linejoin="round" stroke-width="1.5" d="M12 9v2m0 4h.01m-6.938 4h13.856c1.54 0 2.502-1.667 1.732-2.5L13.732 4c-.77-.833-1.964-.833-2.732 0L4.082 16.5c-.77.833.192 2.5 1.732 2.5z" />
+          </svg>
+        </div>
+        <h3 class="setup-required-title">More Records Needed</h3>
+        <p class="setup-required-desc">You need at least <strong>10 records</strong> to optimize a prompt. Currently you have <strong><%= @record_count %></strong>.</p>
+        <p class="setup-required-hint mt-2">Add <strong><%= @records_needed %></strong> more records to enable optimization.</p>
+      </div>
+
+      <div class="form-actions">
+        <%= link_to dataset_path(@dataset), class: "btn btn-ghost" do %>
+          <svg class="icon-sm" viewBox="0 0 20 20" fill="currentColor">
+            <path fill-rule="evenodd" d="M9.707 16.707a1 1 0 01-1.414 0l-6-6a1 1 0 010-1.414l6-6a1 1 0 011.414 1.414L5.414 9H17a1 1 0 110 2H5.414l4.293 4.293a1 1 0 010 1.414z" clip-rule="evenodd" />
+          </svg>
+          Back to Dataset
+        <% end %>
+      </div>
+    </div>
+  <% end %>
+</div>
+
+<style>
+  .info-grid {
+    display: grid;
+    grid-template-columns: repeat(2, 1fr);
+    gap: 1rem;
+  }
+
+  .info-item {
+    background: var(--bg-secondary);
+    border-radius: 0.5rem;
+    padding: 1rem;
+    text-align: center;
+  }
+
+  .info-label {
+    display: block;
+    font-size: 0.75rem;
+    color: var(--text-muted);
+    text-transform: uppercase;
+    letter-spacing: 0.04em;
+    margin-bottom: 0.25rem;
+  }
+
+  .info-value {
+    font-size: 1.5rem;
+    font-weight: 600;
+    font-family: 'Fira Code', monospace;
+  }
+
+  .setup-required-hint {
+    color: var(--text-muted);
+    font-size: 0.875rem;
+  }
+</style>
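
Submitting this form issues a POST to the dataset optimization route added in config/routes.rb, presumably handled by the new Leva::DatasetOptimizationsController#create. Because form_with is called with a bare url:, the fields arrive as top-level params; the values below are illustrative only.

```ruby
# Hypothetical params hash for the create action; keys come from the form
# fields above, values are invented for illustration.
optimization_params = {
  "prompt_name" => "Optimized: Sentiment Dataset",
  "optimizer"   => "bootstrap",          # a key of @optimizers
  "mode"        => "light",              # a key of @modes
  "model"       => "gemini-2.5-flash"    # an id from @models
}
```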

data/app/views/leva/datasets/show.html.erb

@@ -84,6 +84,70 @@
   <% end %>
 </section>
 
+<%# Prompt Optimization Section %>
+<section class="mb-8">
+  <div class="section-header">
+    <h3 class="section-title">Prompt Optimization</h3>
+    <span class="section-count"><%= @dataset.optimization_runs.count %></span>
+    <div class="ml-auto">
+      <% optimizer = Leva::PromptOptimizer.new(dataset: @dataset) %>
+      <% if optimizer.can_optimize? %>
+        <%= link_to new_dataset_optimization_path(@dataset), class: "btn btn-primary btn-sm" do %>
+          <svg class="icon-sm" viewBox="0 0 20 20" fill="currentColor">
+            <path fill-rule="evenodd" d="M11.3 1.046A1 1 0 0112 2v5h4a1 1 0 01.82 1.573l-7 10A1 1 0 018 18v-5H4a1 1 0 01-.82-1.573l7-10a1 1 0 011.12-.38z" clip-rule="evenodd" />
+          </svg>
+          Optimize Prompt
+        <% end %>
+      <% else %>
+        <button class="btn btn-ghost btn-sm" disabled title="Need <%= optimizer.records_needed %> more records">
+          <svg class="icon-sm" viewBox="0 0 20 20" fill="currentColor">
+            <path fill-rule="evenodd" d="M11.3 1.046A1 1 0 0112 2v5h4a1 1 0 01.82 1.573l-7 10A1 1 0 018 18v-5H4a1 1 0 01-.82-1.573l7-10a1 1 0 011.12-.38z" clip-rule="evenodd" />
+          </svg>
+          Need <%= optimizer.records_needed %> more records
+        </button>
+      <% end %>
+    </div>
+  </div>
+
+  <% if @dataset.optimization_runs.any? %>
+    <div class="table-wrapper">
+      <div class="table-scroll">
+        <table class="table table-clickable">
+          <thead>
+            <tr>
+              <th>Prompt Name</th>
+              <th>Optimizer</th>
+              <th>Mode</th>
+              <th>Status</th>
+              <th class="text-right">Created</th>
+            </tr>
+          </thead>
+          <tbody>
+            <% @dataset.optimization_runs.order(created_at: :desc).each do |run| %>
+              <tr class="clickable-row" onclick="window.location='<%= optimization_run_path(run) %>'">
+                <td><span class="row-title"><%= run.prompt_name %></span></td>
+                <td><%= run.optimizer&.titleize || 'Bootstrap' %></td>
+                <td><%= run.mode&.titleize || 'Light' %></td>
+                <td>
+                  <span class="badge badge-<%= run.status == 'completed' ? 'success' : (run.status == 'failed' ? 'error' : 'warning') %>">
+                    <%= run.status&.titleize || 'Pending' %>
+                  </span>
+                </td>
+                <td class="text-right text-muted"><%= time_ago_in_words(run.created_at) %> ago</td>
+              </tr>
+            <% end %>
+          </tbody>
+        </table>
+      </div>
+    </div>
+  <% else %>
+    <div class="empty-state-inline">
+      <p class="text-muted text-sm">No optimization runs yet.</p>
+      <p class="text-xs text-subtle mt-2">Use DSPy to optimize your prompts with few-shot examples.</p>
+    </div>
+  <% end %>
+</section>
+
 <%# Experiments Section %>
 <section>
   <div class="section-header">
@@ -123,6 +187,7 @@
       </tr>
     </thead>
     <tbody>
+      <% @evaluator_classes = Leva::EvaluationResult.distinct.pluck(:evaluator_class) %>
       <%= render partial: 'leva/experiments/experiment', collection: @dataset.experiments %>
     </tbody>
   </table>
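
The new section gates the "Optimize Prompt" button on Leva::PromptOptimizer, the same service that enforces the 10-record minimum shown on the optimization form. The check the view performs amounts to roughly the following (a sketch; the method names are taken from the view above, the dataset lookup is a placeholder):

```ruby
# Sketch of the gating logic for a given dataset.
dataset = Leva::Dataset.first  # placeholder lookup
optimizer = Leva::PromptOptimizer.new(dataset: dataset)

if optimizer.can_optimize?
  # Enough records: render the link to new_dataset_optimization_path(dataset).
else
  # Not enough yet: the disabled button reports how many records are missing.
  puts "Need #{optimizer.records_needed} more records"
end
```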

data/app/views/leva/experiments/_experiment.html.erb

@@ -8,6 +8,9 @@
     else 'status-dot-pending'
     end
   run_count = experiment.runner_results.count
+
+  # Group evaluation results by evaluator_class to avoid N+1 queries
+  grouped_results = experiment.evaluation_results.group_by(&:evaluator_class)
 %>
 <tr class="experiment-row" onclick="window.location='<%= experiment_path(experiment) %>'">
   <td>
@@ -21,6 +24,9 @@
   <td>
     <span class="cell-dataset"><%= experiment.dataset&.name || '—' %></span>
   </td>
+  <td>
+    <span class="cell-model font-mono text-sm"><%= experiment.metadata&.dig("model") || '—' %></span>
+  </td>
   <td class="text-right text-nowrap">
     <span class="cell-timestamp"><%= time_ago_in_words(experiment.created_at) %></span>
   </td>
@@ -33,22 +39,15 @@
   <td class="text-right">
     <span class="cell-count"><%= run_count %></span>
   </td>
-  <%
+  <% @evaluator_classes.each do |evaluator_class| %>
     <td class="text-right">
-      <% results =
+      <% results = grouped_results[evaluator_class] || [] %>
       <% if results.any? %>
         <%
           avg_score = (results.sum(&:score) / results.size.to_f)
           score_pct = (avg_score * 100).round
-          score_class = case avg_score
-                        when 0...0.2 then 'score-bad'
-                        when 0.2...0.4 then 'score-poor'
-                        when 0.4...0.6 then 'score-fair'
-                        when 0.6...0.8 then 'score-good'
-                        else 'score-excellent'
-                        end
         %>
-        <span class="score-pill <%= score_class %>"><%= score_pct %>%</span>
+        <span class="score-pill <%= score_class(avg_score) %>"><%= score_pct %>%</span>
       <% else %>
         <span class="score-empty">—</span>
       <% end %>
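
The inline score bucketing is replaced by a score_class view helper. Its implementation is not shown in this hunk (it presumably lives in app/helpers/leva/application_helper.rb, which changes by +32 -16 in this release); reconstructed from the removed case statements, it would look roughly like this:

```ruby
# Hypothetical reconstruction of the helper the views now call.
# Thresholds are copied verbatim from the inline code removed above.
def score_class(score)
  case score
  when 0...0.2 then 'score-bad'
  when 0.2...0.4 then 'score-poor'
  when 0.4...0.6 then 'score-fair'
  when 0.6...0.8 then 'score-good'
  else 'score-excellent'
  end
end
```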

data/app/views/leva/experiments/_form.html.erb

@@ -50,6 +50,16 @@
           class: "form-select" %>
       <p class="form-hint">The runner executes your model logic for each dataset record.</p>
     </div>
+
+    <div class="form-group" id="model-selection-group">
+      <label for="experiment_metadata_model" class="form-label">Model (for LLM runners)</label>
+      <select name="experiment[metadata][model]" id="experiment_metadata_model" class="form-select">
+        <% Leva::PromptOptimizer.available_models.each do |m| %>
+          <option value="<%= m.id %>" <%= 'selected' if @experiment.metadata&.dig("model") == m.id || (@experiment.metadata.blank? && m.id == "gemini-2.5-flash") %>><%= m.name %></option>
+        <% end %>
+      </select>
+      <p class="form-hint">The AI model to use when running LLM-based runners like SentimentLlmRun.</p>
+    </div>
   </div>
 
   <hr class="form-divider">

data/app/views/leva/experiments/index.html.erb

@@ -21,10 +21,11 @@
       <tr>
         <th>Experiment</th>
         <th style="width: 140px;">Dataset</th>
+        <th style="width: 140px;">Model</th>
         <th class="text-right" style="width: 90px;">Created</th>
         <th class="text-center" style="width: 90px;">Status</th>
         <th class="text-right" style="width: 60px;">Runs</th>
-        <%
+        <% @evaluator_classes.each do |evaluator_class| %>
          <%
            # Clean up evaluator name: "SentimentAccuracyEval" -> "Accuracy"
            # Remove common prefixes/suffixes and module names

data/app/views/leva/experiments/show.html.erb

@@ -55,12 +55,27 @@
     </div>
     <div class="exp-meta-item">
       <span class="exp-meta-label">Prompt</span>
-      <span class="exp-meta-value"
+      <span class="exp-meta-value">
+        <% if @experiment.prompt %>
+          <%= @experiment.prompt.name %>
+          <% if @experiment.prompt.optimized? %>
+            <span class="badge badge-optimized" title="Generated by <%= @experiment.prompt.optimizer_name&.titleize || 'optimizer' %>">Optimized</span>
+          <% end %>
+        <% else %>
+          —
+        <% end %>
+      </span>
     </div>
     <div class="exp-meta-item">
       <span class="exp-meta-label">Runner</span>
       <span class="exp-meta-value font-mono text-sm"><%= @experiment.runner_class&.demodulize || '—' %></span>
     </div>
+    <% if @experiment.metadata&.dig("model").present? %>
+      <div class="exp-meta-item">
+        <span class="exp-meta-label">Model</span>
+        <span class="exp-meta-value font-mono text-sm"><%= @experiment.metadata["model"] %></span>
+      </div>
+    <% end %>
     <div class="exp-meta-item">
       <span class="exp-meta-label">Created</span>
       <span class="exp-meta-value"><%= time_ago_in_words(@experiment.created_at) %> ago</span>
@@ -79,13 +94,6 @@
           <%
             avg_score = (results.sum(&:score) / results.size.to_f).round(2)
             score_pct = (avg_score * 100).round
-            score_class = case avg_score
-                          when 0...0.2 then 'score-bad'
-                          when 0.2...0.4 then 'score-poor'
-                          when 0.4...0.6 then 'score-fair'
-                          when 0.6...0.8 then 'score-good'
-                          else 'score-excellent'
-                          end
             short_name = evaluator_class.demodulize
               .gsub(/Evaluator$/, '')
               .gsub(/Eval$/, '')
@@ -93,10 +101,10 @@
             short_name = short_name.presence || evaluator_class.demodulize.gsub(/Eval(uator)?$/, '')
           %>
           <div class="eval-summary-card" title="<%= results.size %> evaluations">
-            <span class="eval-summary-score <%= score_class %>"><%= score_pct %><span class="eval-summary-pct">%</span></span>
+            <span class="eval-summary-score <%= score_class(avg_score) %>"><%= score_pct %><span class="eval-summary-pct">%</span></span>
             <span class="eval-summary-name"><%= short_name %></span>
             <div class="eval-summary-bar">
-              <div class="eval-summary-bar-fill <%= score_class %>" style="width: <%= score_pct %>%"></div>
+              <div class="eval-summary-bar-fill <%= score_class(avg_score) %>" style="width: <%= score_pct %>%"></div>
             </div>
             <span class="eval-summary-count"><%= results.size %> runs</span>
           </div>
@@ -139,7 +147,7 @@
           <span class="row-title"><%= runner_result.dataset_record.display_name %></span>
         </td>
         <td>
-          <span class="prediction-badge"><%= truncate(runner_result.prediction.to_s.strip, length: 25) %></span>
+          <span class="prediction-badge"><%= truncate(runner_result.parsed_predictions.first.to_s.presence || runner_result.prediction.to_s.strip, length: 25) %></span>
         </td>
         <td class="text-muted"><%= truncate(runner_result.ground_truth.to_s.strip.presence || '—', length: 25) %></td>
         <% @experiment.evaluation_results.group_by(&:evaluator_class).keys.each do |evaluator_class| %>
@@ -147,16 +155,7 @@
           <% eval_result = runner_result.evaluation_results.find_by(evaluator_class: evaluator_class) %>
           <% if eval_result %>
             <% score = eval_result.score %>
-
-              score_class = case score
-                            when 0...0.2 then 'score-bad'
-                            when 0.2...0.4 then 'score-poor'
-                            when 0.4...0.6 then 'score-fair'
-                            when 0.6...0.8 then 'score-good'
-                            else 'score-excellent'
-                            end
-            %>
-            <span class="score-inline <%= score_class %>"><%= sprintf('%.2f', score) %></span>
+            <span class="score-inline <%= score_class(score) %>"><%= sprintf('%.2f', score) %></span>
           <% else %>
             <span class="text-subtle">—</span>
           <% end %>