leva 0.2.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +55 -1
- data/app/assets/stylesheets/leva/application.css +165 -25
- data/app/controllers/leva/dataset_optimizations_controller.rb +64 -0
- data/app/controllers/leva/experiments_controller.rb +14 -6
- data/app/controllers/leva/workbench_controller.rb +26 -10
- data/app/helpers/leva/application_helper.rb +32 -16
- data/app/models/leva/dataset.rb +1 -0
- data/app/models/leva/experiment.rb +1 -0
- data/app/models/leva/optimization_run.rb +137 -0
- data/app/models/leva/prompt.rb +10 -0
- data/app/services/leva/class_loader.rb +37 -0
- data/app/services/leva/dataset_converter.rb +64 -0
- data/app/services/leva/optimizers/base.rb +183 -0
- data/app/services/leva/optimizers/bootstrap.rb +92 -0
- data/app/services/leva/optimizers/gepa_optimizer.rb +59 -0
- data/app/services/leva/optimizers/miprov2_optimizer.rb +52 -0
- data/app/services/leva/prompt_optimizer.rb +305 -0
- data/app/services/leva/signature_generator.rb +129 -0
- data/app/views/leva/datasets/show.html.erb +3 -0
- data/app/views/leva/experiments/_experiment.html.erb +9 -10
- data/app/views/leva/experiments/_form.html.erb +10 -0
- data/app/views/leva/experiments/index.html.erb +2 -1
- data/app/views/leva/experiments/show.html.erb +20 -21
- data/app/views/leva/optimization_runs/show.html.erb +698 -0
- data/app/views/leva/runner_results/show.html.erb +18 -48
- data/app/views/leva/workbench/_results_section.html.erb +3 -11
- data/db/migrate/20241204000001_create_leva_optimization_runs.rb +25 -0
- data/lib/generators/leva/templates/eval.rb.erb +4 -2
- data/lib/leva/errors.rb +18 -0
- data/lib/leva/version.rb +1 -1
- data/lib/leva.rb +1 -0
- metadata +16 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz: '
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: '03694d16308b610d8c1cc83ec070cf2c0a03273d93b4e220834ff063f8df5b0a'
|
|
4
|
+
data.tar.gz: 31fa8e5737410dbb9b5729bf43616ef037fbad1c6b8188e60649a5156c8f87c1
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: f12f9ec8d00a5dcd9a8c003a598d9ec316be4bd8b8b2deb7a99680a14dcd64790b496829e7635e28f5b86dd7a5f484b9043b504bda24f7e3d0fd75b8e4eee271
|
|
7
|
+
data.tar.gz: 293f53edc39d95ed612b0ce0e0e5097f38e888990c7e8530b54da6afcf2015ae7f150f8f9bd9d2bb1171c5bf18c0c4a34180482594c376ed17341ae42bce9f09
|
data/README.md
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
|
|
6
6
|
Leva is a Ruby on Rails framework for evaluating Language Models (LLMs) using ActiveRecord datasets on production models. It provides a flexible structure for creating experiments, managing datasets, and implementing various evaluation logic on production data with security in mind.
|
|
7
7
|
|
|
8
|
-

|
|
9
9
|
|
|
10
10
|
## Installation
|
|
11
11
|
|
|
@@ -181,6 +181,60 @@ experiment.evaluation_results.group_by(&:evaluator_class).each do |evaluator_cla
|
|
|
181
181
|
end
|
|
182
182
|
```
|
|
183
183
|
|
|
184
|
+
## Prompt Optimization (DSPy Integration)
|
|
185
|
+
|
|
186
|
+
Leva includes optional prompt optimization powered by [DSPy.rb](https://github.com/kieranklaassen/dspy.rb). This feature automatically finds optimal prompts and few-shot examples for your datasets.
|
|
187
|
+
|
|
188
|
+
**Requirements:**
|
|
189
|
+
- Ruby 3.3.0 or higher
|
|
190
|
+
- DSPy gem and optional optimizer gems
|
|
191
|
+
|
|
192
|
+
### Installation
|
|
193
|
+
|
|
194
|
+
Add the DSPy gems to your Gemfile:
|
|
195
|
+
|
|
196
|
+
```ruby
|
|
197
|
+
gem "dspy" # Core DSPy functionality (required)
|
|
198
|
+
gem "dspy-gepa" # GEPA optimizer (optional, recommended)
|
|
199
|
+
gem "dspy-miprov2" # MIPROv2 optimizer (optional)
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
### Available Optimizers
|
|
203
|
+
|
|
204
|
+
| Optimizer | Best For | Description |
|
|
205
|
+
|-----------|----------|-------------|
|
|
206
|
+
| **Bootstrap** | Quick iteration, small datasets | Fast selection of few-shot examples. No gem required. |
|
|
207
|
+
| **GEPA** | Maximum quality | State-of-the-art reflective prompt evolution. 10-14% better than MIPROv2. |
|
|
208
|
+
| **MIPROv2** | Large datasets (200+) | Bayesian optimization for instructions and examples. |
|
|
209
|
+
|
|
210
|
+
### Usage
|
|
211
|
+
|
|
212
|
+
```ruby
|
|
213
|
+
# Create an optimizer for your dataset
|
|
214
|
+
optimizer = Leva::PromptOptimizer.new(
|
|
215
|
+
dataset: dataset,
|
|
216
|
+
optimizer: :gepa, # :bootstrap, :gepa, or :miprov2
|
|
217
|
+
mode: :medium, # :light, :medium, or :heavy
|
|
218
|
+
model: "gpt-4o-mini" # Any model supported by RubyLLM
|
|
219
|
+
)
|
|
220
|
+
|
|
221
|
+
# Run optimization
|
|
222
|
+
result = optimizer.optimize
|
|
223
|
+
|
|
224
|
+
# Result contains optimized prompts
|
|
225
|
+
result[:system_prompt] # Optimized instruction
|
|
226
|
+
result[:user_prompt] # Template with Liquid variables
|
|
227
|
+
result[:metadata] # Score, examples, and optimization details
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
### Optimization Modes
|
|
231
|
+
|
|
232
|
+
| Mode | Duration | Use Case |
|
|
233
|
+
|------|----------|----------|
|
|
234
|
+
| `:light` | ~5 min | Quick experiments |
|
|
235
|
+
| `:medium` | ~15 min | Balanced quality/speed |
|
|
236
|
+
| `:heavy` | ~30 min | Production prompts |
|
|
237
|
+
|
|
184
238
|
## Configuration
|
|
185
239
|
|
|
186
240
|
Ensure you set up any required API keys or other configurations in your Rails credentials or environment variables.
|
|
@@ -947,6 +947,15 @@ strong {
|
|
|
947
947
|
color: var(--info-400);
|
|
948
948
|
}
|
|
949
949
|
|
|
950
|
+
.badge-optimized {
|
|
951
|
+
background: rgba(168, 127, 219, 0.15);
|
|
952
|
+
color: #b794f4;
|
|
953
|
+
font-size: 0.65rem;
|
|
954
|
+
padding: 2px 6px;
|
|
955
|
+
margin-left: 6px;
|
|
956
|
+
vertical-align: middle;
|
|
957
|
+
}
|
|
958
|
+
|
|
950
959
|
/* Status Dots */
|
|
951
960
|
.status-dot {
|
|
952
961
|
display: inline-block;
|
|
@@ -1330,6 +1339,13 @@ strong {
|
|
|
1330
1339
|
padding: var(--space-4);
|
|
1331
1340
|
overflow-y: auto;
|
|
1332
1341
|
transition: width var(--transition-base), padding var(--transition-base);
|
|
1342
|
+
/* Hide scrollbar while maintaining scroll */
|
|
1343
|
+
scrollbar-width: none;
|
|
1344
|
+
-ms-overflow-style: none;
|
|
1345
|
+
}
|
|
1346
|
+
|
|
1347
|
+
.panel-right::-webkit-scrollbar {
|
|
1348
|
+
display: none;
|
|
1333
1349
|
}
|
|
1334
1350
|
|
|
1335
1351
|
.panel-right.resizing {
|
|
@@ -1833,6 +1849,40 @@ dialog::backdrop {
|
|
|
1833
1849
|
}
|
|
1834
1850
|
|
|
1835
1851
|
|
|
1852
|
+
/* ============================================
|
|
1853
|
+
SCROLLBAR HIDING (while maintaining scroll)
|
|
1854
|
+
============================================ */
|
|
1855
|
+
|
|
1856
|
+
/* Hide scrollbars globally in workbench panels */
|
|
1857
|
+
.panel,
|
|
1858
|
+
.panel-right,
|
|
1859
|
+
.sidebar-content,
|
|
1860
|
+
.output-block,
|
|
1861
|
+
.result-block,
|
|
1862
|
+
.main-content,
|
|
1863
|
+
.prompt-textarea,
|
|
1864
|
+
.prompt-sidebar,
|
|
1865
|
+
.layout-workbench,
|
|
1866
|
+
.content-body,
|
|
1867
|
+
.output-value {
|
|
1868
|
+
scrollbar-width: none; /* Firefox */
|
|
1869
|
+
-ms-overflow-style: none; /* IE 10+ */
|
|
1870
|
+
}
|
|
1871
|
+
|
|
1872
|
+
.panel::-webkit-scrollbar,
|
|
1873
|
+
.panel-right::-webkit-scrollbar,
|
|
1874
|
+
.sidebar-content::-webkit-scrollbar,
|
|
1875
|
+
.output-block::-webkit-scrollbar,
|
|
1876
|
+
.result-block::-webkit-scrollbar,
|
|
1877
|
+
.main-content::-webkit-scrollbar,
|
|
1878
|
+
.prompt-textarea::-webkit-scrollbar,
|
|
1879
|
+
.prompt-sidebar::-webkit-scrollbar,
|
|
1880
|
+
.layout-workbench::-webkit-scrollbar,
|
|
1881
|
+
.content-body::-webkit-scrollbar,
|
|
1882
|
+
.output-value::-webkit-scrollbar {
|
|
1883
|
+
display: none; /* Chrome, Safari, Opera */
|
|
1884
|
+
}
|
|
1885
|
+
|
|
1836
1886
|
/* ============================================
|
|
1837
1887
|
WORKBENCH SPECIFIC
|
|
1838
1888
|
============================================ */
|
|
@@ -1993,9 +2043,11 @@ dialog::backdrop {
|
|
|
1993
2043
|
============================================ */
|
|
1994
2044
|
|
|
1995
2045
|
.run-controls {
|
|
1996
|
-
padding
|
|
1997
|
-
|
|
1998
|
-
|
|
2046
|
+
padding: var(--space-4);
|
|
2047
|
+
background: rgba(26, 25, 24, 0.4);
|
|
2048
|
+
border-radius: var(--radius-lg);
|
|
2049
|
+
border: 1px solid var(--gray-800);
|
|
2050
|
+
margin-bottom: var(--space-3);
|
|
1999
2051
|
}
|
|
2000
2052
|
|
|
2001
2053
|
.run-selects {
|
|
@@ -2024,22 +2076,26 @@ dialog::backdrop {
|
|
|
2024
2076
|
============================================ */
|
|
2025
2077
|
|
|
2026
2078
|
.output-section {
|
|
2027
|
-
padding
|
|
2028
|
-
|
|
2029
|
-
|
|
2079
|
+
padding: var(--space-4);
|
|
2080
|
+
background: rgba(26, 25, 24, 0.4);
|
|
2081
|
+
border-radius: var(--radius-lg);
|
|
2082
|
+
margin-bottom: var(--space-3);
|
|
2083
|
+
border: 1px solid var(--gray-800);
|
|
2030
2084
|
}
|
|
2031
2085
|
|
|
2032
2086
|
.output-header {
|
|
2033
2087
|
display: flex;
|
|
2034
|
-
align-items:
|
|
2088
|
+
align-items: center;
|
|
2035
2089
|
justify-content: space-between;
|
|
2036
|
-
margin-bottom: var(--space-
|
|
2090
|
+
margin-bottom: var(--space-4);
|
|
2091
|
+
padding-bottom: var(--space-3);
|
|
2092
|
+
border-bottom: 1px solid var(--gray-800);
|
|
2037
2093
|
}
|
|
2038
2094
|
|
|
2039
2095
|
.output-title {
|
|
2040
2096
|
font-size: var(--text-xs);
|
|
2041
2097
|
font-weight: 600;
|
|
2042
|
-
color: var(--gray-
|
|
2098
|
+
color: var(--gray-300);
|
|
2043
2099
|
text-transform: uppercase;
|
|
2044
2100
|
letter-spacing: var(--tracking-wide);
|
|
2045
2101
|
}
|
|
@@ -2047,48 +2103,125 @@ dialog::backdrop {
|
|
|
2047
2103
|
.output-meta {
|
|
2048
2104
|
font-size: 10px;
|
|
2049
2105
|
color: var(--gray-500);
|
|
2106
|
+
display: flex;
|
|
2107
|
+
align-items: center;
|
|
2108
|
+
gap: var(--space-2);
|
|
2050
2109
|
}
|
|
2051
2110
|
|
|
2052
2111
|
.output-grid {
|
|
2053
2112
|
display: flex;
|
|
2054
2113
|
flex-direction: column;
|
|
2055
|
-
gap: var(--space-
|
|
2114
|
+
gap: var(--space-3);
|
|
2056
2115
|
}
|
|
2057
2116
|
|
|
2058
2117
|
.output-block {
|
|
2059
2118
|
background: var(--gray-800);
|
|
2060
|
-
border-radius: var(--radius-
|
|
2061
|
-
padding: var(--space-
|
|
2119
|
+
border-radius: var(--radius-md);
|
|
2120
|
+
padding: var(--space-3) var(--space-4);
|
|
2062
2121
|
display: flex;
|
|
2063
|
-
align-items:
|
|
2122
|
+
align-items: flex-start;
|
|
2064
2123
|
gap: var(--space-3);
|
|
2124
|
+
transition: background-color 0.15s ease, border-color 0.15s ease;
|
|
2125
|
+
}
|
|
2126
|
+
|
|
2127
|
+
.output-block:hover {
|
|
2128
|
+
background: var(--gray-750, #2a2928);
|
|
2065
2129
|
}
|
|
2066
2130
|
|
|
2067
2131
|
.output-block--expected {
|
|
2068
|
-
background: rgba(125, 179, 103, 0.
|
|
2069
|
-
border-left:
|
|
2132
|
+
background: rgba(125, 179, 103, 0.06);
|
|
2133
|
+
border-left: 3px solid var(--success-500);
|
|
2134
|
+
padding: var(--space-2) var(--space-3);
|
|
2135
|
+
}
|
|
2136
|
+
|
|
2137
|
+
.output-block--expected:hover {
|
|
2138
|
+
background: rgba(125, 179, 103, 0.1);
|
|
2139
|
+
}
|
|
2140
|
+
|
|
2141
|
+
.output-block--expected .output-value {
|
|
2142
|
+
font-weight: 500;
|
|
2143
|
+
color: var(--gray-50);
|
|
2070
2144
|
}
|
|
2071
2145
|
|
|
2072
2146
|
.output-block--got {
|
|
2073
|
-
|
|
2147
|
+
background: var(--gray-850, #222120);
|
|
2148
|
+
border: 1px solid var(--gray-700);
|
|
2149
|
+
border-left: 3px solid var(--gray-600);
|
|
2150
|
+
}
|
|
2151
|
+
|
|
2152
|
+
.output-block--parsed {
|
|
2153
|
+
background: rgba(212, 168, 74, 0.06);
|
|
2154
|
+
border-left: 3px solid var(--accent-500);
|
|
2155
|
+
padding: var(--space-2) var(--space-3);
|
|
2156
|
+
}
|
|
2157
|
+
|
|
2158
|
+
.output-block--parsed:hover {
|
|
2159
|
+
background: rgba(212, 168, 74, 0.1);
|
|
2160
|
+
}
|
|
2161
|
+
|
|
2162
|
+
.output-block--parsed .output-value {
|
|
2163
|
+
font-weight: 500;
|
|
2164
|
+
color: var(--accent-300);
|
|
2074
2165
|
}
|
|
2075
2166
|
|
|
2076
2167
|
.output-label {
|
|
2077
2168
|
font-size: 10px;
|
|
2078
2169
|
font-weight: 600;
|
|
2079
|
-
color: var(--gray-
|
|
2170
|
+
color: var(--gray-400);
|
|
2080
2171
|
text-transform: uppercase;
|
|
2081
|
-
letter-spacing: 0.
|
|
2172
|
+
letter-spacing: 0.05em;
|
|
2082
2173
|
flex-shrink: 0;
|
|
2083
|
-
width:
|
|
2174
|
+
min-width: 64px;
|
|
2175
|
+
padding-top: 2px;
|
|
2084
2176
|
}
|
|
2085
2177
|
|
|
2086
2178
|
.output-value {
|
|
2087
2179
|
font-family: var(--font-mono);
|
|
2088
2180
|
font-size: var(--text-sm);
|
|
2089
2181
|
color: var(--gray-100);
|
|
2090
|
-
line-height: 1.
|
|
2182
|
+
line-height: 1.6;
|
|
2091
2183
|
word-break: break-word;
|
|
2184
|
+
white-space: pre-wrap;
|
|
2185
|
+
flex: 1;
|
|
2186
|
+
min-width: 0;
|
|
2187
|
+
}
|
|
2188
|
+
|
|
2189
|
+
/* Long output values get constrained height with scroll */
|
|
2190
|
+
.output-block--got .output-value {
|
|
2191
|
+
max-height: 180px;
|
|
2192
|
+
overflow-y: auto;
|
|
2193
|
+
scrollbar-width: none;
|
|
2194
|
+
-ms-overflow-style: none;
|
|
2195
|
+
font-size: 12px;
|
|
2196
|
+
line-height: 1.7;
|
|
2197
|
+
color: var(--gray-200);
|
|
2198
|
+
}
|
|
2199
|
+
|
|
2200
|
+
.output-block--got .output-value::-webkit-scrollbar {
|
|
2201
|
+
display: none;
|
|
2202
|
+
}
|
|
2203
|
+
|
|
2204
|
+
/* Fade effect for long content */
|
|
2205
|
+
.output-block--got {
|
|
2206
|
+
position: relative;
|
|
2207
|
+
}
|
|
2208
|
+
|
|
2209
|
+
.output-block--got::after {
|
|
2210
|
+
content: '';
|
|
2211
|
+
position: absolute;
|
|
2212
|
+
bottom: 0;
|
|
2213
|
+
left: 0;
|
|
2214
|
+
right: 0;
|
|
2215
|
+
height: 40px;
|
|
2216
|
+
background: linear-gradient(transparent, var(--gray-850, #222120));
|
|
2217
|
+
pointer-events: none;
|
|
2218
|
+
border-radius: 0 0 var(--radius-md) var(--radius-md);
|
|
2219
|
+
opacity: 0;
|
|
2220
|
+
transition: opacity 0.2s;
|
|
2221
|
+
}
|
|
2222
|
+
|
|
2223
|
+
.output-block--got:has(.output-value:not(:hover)):after {
|
|
2224
|
+
opacity: 0.8;
|
|
2092
2225
|
}
|
|
2093
2226
|
|
|
2094
2227
|
.output-empty {
|
|
@@ -2103,20 +2236,25 @@ dialog::backdrop {
|
|
|
2103
2236
|
============================================ */
|
|
2104
2237
|
|
|
2105
2238
|
.eval-section {
|
|
2106
|
-
|
|
2239
|
+
padding: var(--space-4);
|
|
2240
|
+
background: rgba(26, 25, 24, 0.4);
|
|
2241
|
+
border-radius: var(--radius-lg);
|
|
2242
|
+
border: 1px solid var(--gray-800);
|
|
2107
2243
|
}
|
|
2108
2244
|
|
|
2109
2245
|
.eval-header {
|
|
2110
2246
|
display: flex;
|
|
2111
2247
|
align-items: center;
|
|
2112
2248
|
justify-content: space-between;
|
|
2113
|
-
margin-bottom: var(--space-
|
|
2249
|
+
margin-bottom: var(--space-4);
|
|
2250
|
+
padding-bottom: var(--space-3);
|
|
2251
|
+
border-bottom: 1px solid var(--gray-800);
|
|
2114
2252
|
}
|
|
2115
2253
|
|
|
2116
2254
|
.eval-title {
|
|
2117
2255
|
font-size: var(--text-xs);
|
|
2118
2256
|
font-weight: 600;
|
|
2119
|
-
color: var(--gray-
|
|
2257
|
+
color: var(--gray-300);
|
|
2120
2258
|
text-transform: uppercase;
|
|
2121
2259
|
letter-spacing: var(--tracking-wide);
|
|
2122
2260
|
}
|
|
@@ -2130,8 +2268,8 @@ dialog::backdrop {
|
|
|
2130
2268
|
.eval-card {
|
|
2131
2269
|
flex: 1 1 calc(50% - var(--space-1));
|
|
2132
2270
|
min-width: 0;
|
|
2133
|
-
background: var(--gray-
|
|
2134
|
-
border: 1px solid
|
|
2271
|
+
background: var(--gray-850, #222120);
|
|
2272
|
+
border: 1px solid var(--gray-700);
|
|
2135
2273
|
border-radius: var(--radius-md);
|
|
2136
2274
|
padding: var(--space-3);
|
|
2137
2275
|
cursor: pointer;
|
|
@@ -2143,6 +2281,8 @@ dialog::backdrop {
|
|
|
2143
2281
|
|
|
2144
2282
|
.eval-card:hover {
|
|
2145
2283
|
background: var(--gray-750, #2a2928);
|
|
2284
|
+
border-color: var(--gray-600);
|
|
2285
|
+
transform: translateY(-1px);
|
|
2146
2286
|
}
|
|
2147
2287
|
|
|
2148
2288
|
.eval-card-inner {
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Leva
|
|
4
|
+
class DatasetOptimizationsController < ApplicationController
|
|
5
|
+
before_action :set_dataset
|
|
6
|
+
|
|
7
|
+
# GET /datasets/:dataset_id/optimization/new
|
|
8
|
+
# Shows the prompt optimization form
|
|
9
|
+
# @return [void]
|
|
10
|
+
def new
|
|
11
|
+
@record_count = @dataset.dataset_records.count
|
|
12
|
+
@prompt_optimizer = PromptOptimizer.new(dataset: @dataset)
|
|
13
|
+
@can_optimize = @prompt_optimizer.can_optimize?
|
|
14
|
+
@records_needed = @prompt_optimizer.records_needed
|
|
15
|
+
@modes = PromptOptimizer::MODES
|
|
16
|
+
@models = PromptOptimizer.available_models
|
|
17
|
+
@optimizers = PromptOptimizer::OPTIMIZERS
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
# POST /datasets/:dataset_id/optimization
|
|
21
|
+
# Starts the prompt optimization job with progress tracking
|
|
22
|
+
# @return [void]
|
|
23
|
+
def create
|
|
24
|
+
opt_params = optimization_params
|
|
25
|
+
|
|
26
|
+
@optimization_run = @dataset.optimization_runs.create!(
|
|
27
|
+
prompt_name: opt_params[:prompt_name],
|
|
28
|
+
mode: opt_params[:mode],
|
|
29
|
+
model: opt_params[:model],
|
|
30
|
+
optimizer: opt_params[:optimizer],
|
|
31
|
+
status: :pending
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
PromptOptimizationJob.perform_later(optimization_run_id: @optimization_run.id)
|
|
35
|
+
|
|
36
|
+
redirect_to optimization_run_path(@optimization_run)
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
private
|
|
40
|
+
|
|
41
|
+
# Use callbacks to share common setup or constraints between actions.
|
|
42
|
+
# @return [void]
|
|
43
|
+
def set_dataset
|
|
44
|
+
@dataset = Dataset.find(params[:dataset_id])
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
# Strong parameters for optimization run creation.
|
|
48
|
+
# @return [Hash]
|
|
49
|
+
# @raise [ActionController::BadRequest] If model is invalid
|
|
50
|
+
def optimization_params
|
|
51
|
+
model = params[:model].presence || PromptOptimizer::DEFAULT_MODEL
|
|
52
|
+
unless PromptOptimizer.find_model(model)
|
|
53
|
+
raise ActionController::BadRequest, "Invalid model: #{model}"
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
{
|
|
57
|
+
prompt_name: params[:prompt_name].presence || "Optimized: #{@dataset.name}",
|
|
58
|
+
mode: params[:mode].presence || "light",
|
|
59
|
+
model: model,
|
|
60
|
+
optimizer: params[:optimizer].presence || PromptOptimizer::DEFAULT_OPTIMIZER.to_s
|
|
61
|
+
}
|
|
62
|
+
end
|
|
63
|
+
end
|
|
64
|
+
end
|
|
@@ -2,8 +2,6 @@
|
|
|
2
2
|
|
|
3
3
|
module Leva
|
|
4
4
|
class ExperimentsController < ApplicationController
|
|
5
|
-
include ApplicationHelper
|
|
6
|
-
|
|
7
5
|
before_action :set_experiment, only: [ :show, :edit, :update ]
|
|
8
6
|
before_action :check_editable, only: [ :edit, :update ]
|
|
9
7
|
before_action :load_runners_and_evaluators, only: [ :new, :edit, :create, :update ]
|
|
@@ -11,7 +9,8 @@ module Leva
|
|
|
11
9
|
# GET /experiments
|
|
12
10
|
# @return [void]
|
|
13
11
|
def index
|
|
14
|
-
@experiments = Experiment.all
|
|
12
|
+
@experiments = Experiment.includes(:evaluation_results).all
|
|
13
|
+
@evaluator_classes = Leva::EvaluationResult.distinct.pluck(:evaluator_class)
|
|
15
14
|
end
|
|
16
15
|
|
|
17
16
|
# GET /experiments/1
|
|
@@ -83,12 +82,21 @@ module Leva
|
|
|
83
82
|
# Only allow a list of trusted parameters through.
|
|
84
83
|
# @return [ActionController::Parameters]
|
|
85
84
|
def experiment_params
|
|
86
|
-
params.require(:experiment).permit(:name, :description, :dataset_id, :prompt_id, :runner_class, evaluator_classes: [])
|
|
85
|
+
permitted = params.require(:experiment).permit(:name, :description, :dataset_id, :prompt_id, :runner_class, evaluator_classes: [], metadata: {})
|
|
86
|
+
# Ensure metadata is a hash, not ActionController::Parameters
|
|
87
|
+
if permitted[:metadata].present?
|
|
88
|
+
metadata_hash = permitted[:metadata].to_h
|
|
89
|
+
if metadata_hash.to_json.bytesize > 100_000
|
|
90
|
+
raise ActionController::BadRequest, "Metadata exceeds maximum size of 100KB"
|
|
91
|
+
end
|
|
92
|
+
permitted[:metadata] = metadata_hash
|
|
93
|
+
end
|
|
94
|
+
permitted
|
|
87
95
|
end
|
|
88
96
|
|
|
89
97
|
def load_runners_and_evaluators
|
|
90
|
-
@runners =
|
|
91
|
-
@evaluators =
|
|
98
|
+
@runners = Leva::ClassLoader.runners
|
|
99
|
+
@evaluators = Leva::ClassLoader.evaluators
|
|
92
100
|
end
|
|
93
101
|
|
|
94
102
|
def check_editable
|
|
@@ -19,14 +19,12 @@ module Leva
|
|
|
19
19
|
@selected_dataset_record = params[:dataset_record_id] || DatasetRecord.first&.id
|
|
20
20
|
|
|
21
21
|
# Get merged context if runner and dataset record are available
|
|
22
|
-
if @selected_runner && @dataset_record
|
|
23
|
-
runner_class = @selected_runner.constantize
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
@merged_context = @record_context.merge(@runner_context)
|
|
29
|
-
end
|
|
22
|
+
if @selected_runner && @dataset_record && valid_runner?(@selected_runner)
|
|
23
|
+
runner_class = @selected_runner.constantize
|
|
24
|
+
runner = runner_class.new
|
|
25
|
+
@record_context = @dataset_record.recordable.to_llm_context
|
|
26
|
+
@runner_context = runner.to_llm_context(@dataset_record.recordable)
|
|
27
|
+
@merged_context = @record_context.merge(@runner_context)
|
|
30
28
|
end
|
|
31
29
|
end
|
|
32
30
|
|
|
@@ -67,8 +65,8 @@ module Leva
|
|
|
67
65
|
def run
|
|
68
66
|
return redirect_to workbench_index_path, alert: "Please select a record and a runner" unless @dataset_record && run_params[:runner]
|
|
69
67
|
|
|
68
|
+
return redirect_to workbench_index_path, alert: "Invalid runner selected" unless valid_runner?(run_params[:runner])
|
|
70
69
|
runner_class = run_params[:runner].constantize
|
|
71
|
-
return redirect_to workbench_index_path, alert: "Invalid runner selected" unless runner_class < Leva::BaseRun
|
|
72
70
|
|
|
73
71
|
runner = runner_class.new
|
|
74
72
|
runner_result = runner.execute_and_store(nil, @dataset_record, @prompt)
|
|
@@ -90,8 +88,8 @@ module Leva
|
|
|
90
88
|
def run_evaluator
|
|
91
89
|
return redirect_to workbench_index_path, alert: "No runner result available" unless @runner_result
|
|
92
90
|
|
|
91
|
+
return redirect_to workbench_index_path, alert: "Invalid evaluator selected" unless allowed_evaluator_names.include?(params[:evaluator])
|
|
93
92
|
evaluator_class = params[:evaluator].constantize
|
|
94
|
-
return redirect_to workbench_index_path, alert: "Invalid evaluator selected" unless evaluator_class < Leva::BaseEval
|
|
95
93
|
|
|
96
94
|
evaluator = evaluator_class.new
|
|
97
95
|
evaluator.evaluate_and_store(nil, @runner_result)
|
|
@@ -120,5 +118,23 @@ module Leva
|
|
|
120
118
|
def set_runner_result
|
|
121
119
|
@runner_result = @dataset_record.runner_results.last if @dataset_record
|
|
122
120
|
end
|
|
121
|
+
|
|
122
|
+
def allowed_runner_names
|
|
123
|
+
@allowed_runner_names ||= load_runners.map(&:name)
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
def allowed_evaluator_names
|
|
127
|
+
@allowed_evaluator_names ||= load_evaluators.map(&:name)
|
|
128
|
+
end
|
|
129
|
+
|
|
130
|
+
def valid_runner?(runner_name)
|
|
131
|
+
return true if allowed_runner_names.include?(runner_name)
|
|
132
|
+
|
|
133
|
+
# Also accept any class that inherits from BaseRun (for testing)
|
|
134
|
+
klass = runner_name.constantize
|
|
135
|
+
klass < Leva::BaseRun
|
|
136
|
+
rescue NameError
|
|
137
|
+
false
|
|
138
|
+
end
|
|
123
139
|
end
|
|
124
140
|
end
|
|
@@ -4,14 +4,44 @@ module Leva
|
|
|
4
4
|
#
|
|
5
5
|
# @return [Array<Class>] An array of evaluator classes
|
|
6
6
|
def load_evaluators
|
|
7
|
-
|
|
7
|
+
Leva::ClassLoader.evaluators
|
|
8
8
|
end
|
|
9
9
|
|
|
10
10
|
# Loads all runner classes that inherit from Leva::BaseRun
|
|
11
11
|
#
|
|
12
12
|
# @return [Array<Class>] An array of runner classes
|
|
13
13
|
def load_runners
|
|
14
|
-
|
|
14
|
+
Leva::ClassLoader.runners
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
# Returns the CSS class for a score value.
|
|
18
|
+
#
|
|
19
|
+
# @param score [Float, nil] The score value (0.0 - 1.0)
|
|
20
|
+
# @return [String] The CSS class for the score
|
|
21
|
+
def score_class(score)
|
|
22
|
+
return "" if score.nil?
|
|
23
|
+
|
|
24
|
+
case score
|
|
25
|
+
when 0...0.2 then "score-bad"
|
|
26
|
+
when 0.2...0.4 then "score-poor"
|
|
27
|
+
when 0.4...0.6 then "score-fair"
|
|
28
|
+
when 0.6...0.8 then "score-good"
|
|
29
|
+
else "score-excellent"
|
|
30
|
+
end
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
# Returns the display name for a model.
|
|
34
|
+
#
|
|
35
|
+
# Uses RubyLLM to find the model and get its display name,
|
|
36
|
+
# falling back to extracting the name from the model ID.
|
|
37
|
+
#
|
|
38
|
+
# @param model_id [String] The model ID
|
|
39
|
+
# @return [String] The display name for the model
|
|
40
|
+
def model_display_name(model_id)
|
|
41
|
+
return "—" if model_id.blank?
|
|
42
|
+
|
|
43
|
+
@models_cache ||= Leva::PromptOptimizer.available_models.index_by(&:id)
|
|
44
|
+
@models_cache[model_id]&.name || model_id.split("/").last
|
|
15
45
|
end
|
|
16
46
|
|
|
17
47
|
# Loads predefined prompts from markdown files
|
|
@@ -25,19 +55,5 @@ module Leva
|
|
|
25
55
|
end
|
|
26
56
|
prompts
|
|
27
57
|
end
|
|
28
|
-
|
|
29
|
-
private
|
|
30
|
-
|
|
31
|
-
# Loads classes from a specified directory that inherit from a given base class
|
|
32
|
-
#
|
|
33
|
-
# @param directory [String] The directory path to load classes from
|
|
34
|
-
# @param base_class [Class] The base class that loaded classes should inherit from
|
|
35
|
-
# @return [Array<Class>] An array of loaded classes
|
|
36
|
-
def load_classes_from_directory(directory, base_class)
|
|
37
|
-
classes = Dir[Rails.root.join(directory, "*.rb")].map do |file|
|
|
38
|
-
File.basename(file, ".rb").camelize.constantize
|
|
39
|
-
end.select { |klass| klass < base_class }
|
|
40
|
-
classes.empty? ? [] : classes
|
|
41
|
-
end
|
|
42
58
|
end
|
|
43
59
|
end
|
data/app/models/leva/dataset.rb
CHANGED