leva 0.1.9.1 → 0.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +21 -5
- data/app/assets/stylesheets/leva/application.css +3083 -15
- data/app/controllers/leva/application_controller.rb +1 -1
- data/app/controllers/leva/dataset_records_controller.rb +1 -1
- data/app/controllers/leva/datasets_controller.rb +6 -6
- data/app/controllers/leva/design_system_controller.rb +9 -0
- data/app/controllers/leva/experiments_controller.rb +8 -8
- data/app/controllers/leva/runner_results_controller.rb +1 -1
- data/app/controllers/leva/workbench_controller.rb +26 -15
- data/app/helpers/leva/application_helper.rb +7 -7
- data/app/jobs/leva/experiment_job.rb +1 -1
- data/app/jobs/leva/run_eval_job.rb +1 -1
- data/app/models/concerns/leva/recordable.rb +5 -5
- data/app/models/leva/dataset.rb +1 -1
- data/app/models/leva/evaluation_result.rb +1 -1
- data/app/models/leva/experiment.rb +1 -1
- data/app/models/leva/prompt.rb +1 -1
- data/app/views/layouts/leva/application.html.erb +23 -24
- data/app/views/leva/dataset_records/index.html.erb +70 -43
- data/app/views/leva/dataset_records/show.html.erb +115 -25
- data/app/views/leva/datasets/_dataset.html.erb +11 -18
- data/app/views/leva/datasets/_form.html.erb +18 -14
- data/app/views/leva/datasets/edit.html.erb +16 -4
- data/app/views/leva/datasets/index.html.erb +33 -41
- data/app/views/leva/datasets/new.html.erb +15 -4
- data/app/views/leva/datasets/show.html.erb +120 -139
- data/app/views/leva/design_system/index.html.erb +1731 -0
- data/app/views/leva/experiments/_experiment.html.erb +46 -31
- data/app/views/leva/experiments/_form.html.erb +62 -35
- data/app/views/leva/experiments/edit.html.erb +17 -3
- data/app/views/leva/experiments/index.html.erb +41 -36
- data/app/views/leva/experiments/new.html.erb +52 -4
- data/app/views/leva/experiments/show.html.erb +155 -98
- data/app/views/leva/runner_results/show.html.erb +271 -54
- data/app/views/leva/workbench/_evaluation_area.html.erb +18 -4
- data/app/views/leva/workbench/_prompt_content.html.erb +124 -73
- data/app/views/leva/workbench/_prompt_form.html.erb +24 -23
- data/app/views/leva/workbench/_prompt_sidebar.html.erb +57 -12
- data/app/views/leva/workbench/_results_section.html.erb +274 -112
- data/app/views/leva/workbench/_top_bar.html.erb +16 -6
- data/app/views/leva/workbench/edit.html.erb +46 -15
- data/app/views/leva/workbench/index.html.erb +5 -8
- data/app/views/leva/workbench/new.html.erb +74 -42
- data/config/routes.rb +11 -9
- data/db/migrate/20240813173033_create_leva_dataset_records.rb +1 -0
- data/db/migrate/20240813173035_create_leva_experiments.rb +2 -0
- data/db/migrate/{20240816201419_create_leva_runner_results.rb → 20240813173040_create_leva_runner_results.rb} +4 -1
- data/db/migrate/20240813173050_create_leva_evaluation_results.rb +3 -3
- data/lib/generators/leva/eval_generator.rb +4 -4
- data/lib/generators/leva/runner_generator.rb +4 -4
- data/lib/generators/leva/templates/runner.rb.erb +20 -0
- data/lib/leva/version.rb +1 -1
- data/lib/leva.rb +24 -2
- metadata +5 -11
- data/db/migrate/20240816201433_update_leva_evaluation_results.rb +0 -8
- data/db/migrate/20240821163608_make_experiment_optional_for_runner_results.rb +0 -6
- data/db/migrate/20240821181934_add_prompt_to_leva_runner_results.rb +0 -5
- data/db/migrate/20240821183153_add_runner_and_evaluator_to_leva_experiments.rb +0 -6
- data/db/migrate/20240821191713_add_actual_result_to_leva_dataset_records.rb +0 -5
- data/db/migrate/20240822143201_remove_actual_result_from_leva_runner_results.rb +0 -5
- data/db/migrate/20240912183556_add_runner_class_to_leva_runner_results.rb +0 -5
- data/lib/tasks/auto_annotate_models.rake +0 -59
|
@@ -4,117 +4,174 @@
|
|
|
4
4
|
<meta http-equiv="refresh" content="5">
|
|
5
5
|
<% end %>
|
|
6
6
|
<% end %>
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
<div class="
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
7
|
+
|
|
8
|
+
<div class="container page">
|
|
9
|
+
<div class="page-header">
|
|
10
|
+
<div class="page-header-content">
|
|
11
|
+
<div class="page-header-title-row">
|
|
12
|
+
<h1><%= @experiment.name %></h1>
|
|
13
|
+
<%
|
|
14
|
+
status_dot = case @experiment.status
|
|
15
|
+
when 'pending' then 'status-dot-pending'
|
|
16
|
+
when 'running' then 'status-dot-info'
|
|
17
|
+
when 'completed' then 'status-dot-success'
|
|
18
|
+
when 'failed' then 'status-dot-error'
|
|
19
|
+
else 'status-dot-pending'
|
|
20
|
+
end
|
|
21
|
+
%>
|
|
22
|
+
<span class="status-indicator">
|
|
23
|
+
<span class="status-dot <%= status_dot %>"></span>
|
|
24
|
+
<span class="text-sm text-muted"><%= @experiment.status&.capitalize || 'N/A' %></span>
|
|
25
|
+
</span>
|
|
26
|
+
</div>
|
|
27
|
+
<% if @experiment.description.present? %>
|
|
28
|
+
<p class="text-muted text-sm mt-1"><%= @experiment.description %></p>
|
|
29
|
+
<% end %>
|
|
30
|
+
</div>
|
|
31
|
+
<div class="btn-group">
|
|
32
|
+
<% if @experiment.status != 'completed' %>
|
|
33
|
+
<%= link_to edit_experiment_path(@experiment), class: 'btn btn-ghost' do %>
|
|
34
|
+
<svg class="icon-sm" viewBox="0 0 20 20" fill="currentColor">
|
|
35
|
+
<path d="M13.586 3.586a2 2 0 112.828 2.828l-.793.793-2.828-2.828.793-.793zM11.379 5.793L3 14.172V17h2.828l8.38-8.379-2.83-2.828z" />
|
|
23
36
|
</svg>
|
|
24
|
-
|
|
37
|
+
Edit
|
|
25
38
|
<% end %>
|
|
26
|
-
|
|
39
|
+
<% end %>
|
|
40
|
+
<%= button_to rerun_experiment_path(@experiment), method: :post, class: 'btn btn-primary', data: { confirm: 'Are you sure you want to rerun this experiment? This will delete all existing results.' } do %>
|
|
41
|
+
<svg class="icon-sm" viewBox="0 0 20 20" fill="currentColor">
|
|
42
|
+
<path fill-rule="evenodd" d="M4 2a1 1 0 011 1v2.101a7.002 7.002 0 0111.601 2.566 1 1 0 11-1.885.666A5.002 5.002 0 005.999 7H9a1 1 0 010 2H4a1 1 0 01-1-1V3a1 1 0 011-1zm.008 9.057a1 1 0 011.276.61A5.002 5.002 0 0014.001 13H11a1 1 0 110-2h5a1 1 0 011 1v5a1 1 0 11-2 0v-2.101a7.002 7.002 0 01-11.601-2.566 1 1 0 01.61-1.276z" clip-rule="evenodd" />
|
|
43
|
+
</svg>
|
|
44
|
+
Rerun
|
|
45
|
+
<% end %>
|
|
27
46
|
</div>
|
|
28
|
-
<p class="text-gray-400"><%= @experiment.description %></p>
|
|
29
|
-
<p class="text-indigo-300 mt-2">Status: <%= @experiment.status&.capitalize || 'N/A' %></p>
|
|
30
47
|
</div>
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
48
|
+
|
|
49
|
+
<%# Experiment Configuration %>
|
|
50
|
+
<section class="mb-6">
|
|
51
|
+
<div class="exp-meta">
|
|
52
|
+
<div class="exp-meta-item">
|
|
53
|
+
<span class="exp-meta-label">Dataset</span>
|
|
54
|
+
<%= link_to @experiment.dataset.name, dataset_path(@experiment.dataset), class: 'exp-meta-value exp-meta-link' %>
|
|
55
|
+
</div>
|
|
56
|
+
<div class="exp-meta-item">
|
|
57
|
+
<span class="exp-meta-label">Prompt</span>
|
|
58
|
+
<span class="exp-meta-value"><%= @experiment.prompt ? @experiment.prompt.name : '—' %></span>
|
|
59
|
+
</div>
|
|
60
|
+
<div class="exp-meta-item">
|
|
61
|
+
<span class="exp-meta-label">Runner</span>
|
|
62
|
+
<span class="exp-meta-value font-mono text-sm"><%= @experiment.runner_class&.demodulize || '—' %></span>
|
|
63
|
+
</div>
|
|
64
|
+
<div class="exp-meta-item">
|
|
65
|
+
<span class="exp-meta-label">Created</span>
|
|
66
|
+
<span class="exp-meta-value"><%= time_ago_in_words(@experiment.created_at) %> ago</span>
|
|
67
|
+
</div>
|
|
68
|
+
</div>
|
|
69
|
+
</section>
|
|
70
|
+
|
|
71
|
+
<%# Evaluation Summary %>
|
|
72
|
+
<% if @experiment.evaluation_results.any? %>
|
|
73
|
+
<section class="mb-8">
|
|
74
|
+
<div class="section-header">
|
|
75
|
+
<h3 class="section-title">Evaluation Scores</h3>
|
|
76
|
+
</div>
|
|
77
|
+
<div class="eval-summary">
|
|
35
78
|
<% @experiment.evaluation_results.group_by(&:evaluator_class).each do |evaluator_class, results| %>
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
79
|
+
<%
|
|
80
|
+
avg_score = (results.sum(&:score) / results.size.to_f).round(2)
|
|
81
|
+
score_pct = (avg_score * 100).round
|
|
82
|
+
score_class = case avg_score
|
|
83
|
+
when 0...0.2 then 'score-bad'
|
|
84
|
+
when 0.2...0.4 then 'score-poor'
|
|
85
|
+
when 0.4...0.6 then 'score-fair'
|
|
86
|
+
when 0.6...0.8 then 'score-good'
|
|
87
|
+
else 'score-excellent'
|
|
88
|
+
end
|
|
89
|
+
short_name = evaluator_class.demodulize
|
|
90
|
+
.gsub(/Evaluator$/, '')
|
|
91
|
+
.gsub(/Eval$/, '')
|
|
92
|
+
.gsub(/^Sentiment/, '')
|
|
93
|
+
short_name = short_name.presence || evaluator_class.demodulize.gsub(/Eval(uator)?$/, '')
|
|
94
|
+
%>
|
|
95
|
+
<div class="eval-summary-card" title="<%= results.size %> evaluations">
|
|
96
|
+
<span class="eval-summary-score <%= score_class %>"><%= score_pct %><span class="eval-summary-pct">%</span></span>
|
|
97
|
+
<span class="eval-summary-name"><%= short_name %></span>
|
|
98
|
+
<div class="eval-summary-bar">
|
|
99
|
+
<div class="eval-summary-bar-fill <%= score_class %>" style="width: <%= score_pct %>%"></div>
|
|
100
|
+
</div>
|
|
101
|
+
<span class="eval-summary-count"><%= results.size %> runs</span>
|
|
49
102
|
</div>
|
|
50
103
|
<% end %>
|
|
51
104
|
</div>
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
<
|
|
57
|
-
<
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
</div>
|
|
62
|
-
<div class="bg-gray-800 rounded-lg shadow-lg p-6">
|
|
63
|
-
<h2 class="text-2xl font-semibold text-indigo-300 mb-4">Runner Results</h2>
|
|
105
|
+
</section>
|
|
106
|
+
<% end %>
|
|
107
|
+
|
|
108
|
+
<%# Runner Results %>
|
|
109
|
+
<section>
|
|
110
|
+
<div class="section-header">
|
|
111
|
+
<h3 class="section-title">Results</h3>
|
|
112
|
+
<span class="section-count"><%= @experiment.runner_results.count %> runs</span>
|
|
113
|
+
</div>
|
|
64
114
|
<% if @experiment.runner_results.any? %>
|
|
65
|
-
<div class="
|
|
66
|
-
<
|
|
67
|
-
<
|
|
68
|
-
<
|
|
69
|
-
<th class="px-6 py-3 text-left text-xs font-medium text-indigo-300 uppercase tracking-wider">Dataset Record</th>
|
|
70
|
-
<th class="px-6 py-3 text-left text-xs font-medium text-indigo-300 uppercase tracking-wider">Prompt</th>
|
|
71
|
-
<th class="px-6 py-3 text-left text-xs font-medium text-indigo-300 uppercase tracking-wider">Prediction</th>
|
|
72
|
-
<th class="px-6 py-3 text-left text-xs font-medium text-indigo-300 uppercase tracking-wider">Ground Truth</th>
|
|
73
|
-
<% @experiment.evaluation_results.group_by(&:evaluator_class).keys.each do |evaluator_class| %>
|
|
74
|
-
<th class="px-6 py-3 text-left text-xs font-medium text-indigo-300 uppercase tracking-wider"><%= evaluator_class %></th>
|
|
75
|
-
<% end %>
|
|
76
|
-
<th class="px-6 py-3 text-left text-xs font-medium text-indigo-300 uppercase tracking-wider">Created At</th>
|
|
77
|
-
<th class="px-6 py-3 text-left text-xs font-medium text-indigo-300 uppercase tracking-wider">Actions</th>
|
|
78
|
-
</tr>
|
|
79
|
-
</thead>
|
|
80
|
-
<tbody class="bg-gray-700 divide-y divide-gray-600">
|
|
81
|
-
<% @experiment.runner_results.each do |runner_result| %>
|
|
115
|
+
<div class="table-wrapper">
|
|
116
|
+
<div class="table-scroll">
|
|
117
|
+
<table class="table table-clickable table-results">
|
|
118
|
+
<thead>
|
|
82
119
|
<tr>
|
|
83
|
-
<
|
|
84
|
-
<
|
|
85
|
-
<
|
|
86
|
-
<td class="px-6 py-4 text-sm text-gray-300"><%= truncate(runner_result.ground_truth, length: 30) %></td>
|
|
120
|
+
<th>Record</th>
|
|
121
|
+
<th>Prediction</th>
|
|
122
|
+
<th>Expected</th>
|
|
87
123
|
<% @experiment.evaluation_results.group_by(&:evaluator_class).keys.each do |evaluator_class| %>
|
|
88
|
-
<%
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
when 0.6...0.8 then 'text-lime-500'
|
|
97
|
-
when 0.8...1.0 then 'text-green-400'
|
|
98
|
-
else 'text-green-300'
|
|
99
|
-
end %>
|
|
100
|
-
<span class="<%= color_class %> font-semibold"><%= sprintf('%.2f', score) %></span>
|
|
101
|
-
<% else %>
|
|
102
|
-
<span class="text-gray-400">N/A</span>
|
|
103
|
-
<% end %>
|
|
104
|
-
</td>
|
|
124
|
+
<%
|
|
125
|
+
short_name = evaluator_class.demodulize
|
|
126
|
+
.gsub(/Evaluator$/, '')
|
|
127
|
+
.gsub(/Eval$/, '')
|
|
128
|
+
.gsub(/^Sentiment/, '')
|
|
129
|
+
short_name = short_name.presence || evaluator_class.demodulize.gsub(/Eval(uator)?$/, '')
|
|
130
|
+
%>
|
|
131
|
+
<th class="text-right" style="width: 80px;"><%= short_name %></th>
|
|
105
132
|
<% end %>
|
|
106
|
-
<td class="px-6 py-4 whitespace-nowrap text-sm text-gray-300"><%= runner_result.created_at.strftime("%Y-%m-%d %H:%M:%S") %></td>
|
|
107
|
-
<td class="px-6 py-4 whitespace-nowrap text-sm text-gray-300">
|
|
108
|
-
<%= link_to 'View Details', experiment_runner_result_path(@experiment, runner_result), class: 'text-indigo-400 hover:underline mr-2' %>
|
|
109
|
-
<%= link_to 'Experiment', workbench_index_path(prompt_id: runner_result.prompt_id, dataset_record_id: runner_result.dataset_record_id, runner: @experiment.runner_class), class: 'text-indigo-400 hover:underline' %>
|
|
110
|
-
</td>
|
|
111
133
|
</tr>
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
134
|
+
</thead>
|
|
135
|
+
<tbody>
|
|
136
|
+
<% @experiment.runner_results.each do |runner_result| %>
|
|
137
|
+
<tr class="clickable-row" onclick="window.location='<%= experiment_runner_result_path(@experiment, runner_result) %>'">
|
|
138
|
+
<td>
|
|
139
|
+
<span class="row-title"><%= runner_result.dataset_record.display_name %></span>
|
|
140
|
+
</td>
|
|
141
|
+
<td>
|
|
142
|
+
<span class="prediction-badge"><%= truncate(runner_result.prediction.to_s.strip, length: 25) %></span>
|
|
143
|
+
</td>
|
|
144
|
+
<td class="text-muted"><%= truncate(runner_result.ground_truth.to_s.strip.presence || '—', length: 25) %></td>
|
|
145
|
+
<% @experiment.evaluation_results.group_by(&:evaluator_class).keys.each do |evaluator_class| %>
|
|
146
|
+
<td class="text-right">
|
|
147
|
+
<% eval_result = runner_result.evaluation_results.find_by(evaluator_class: evaluator_class) %>
|
|
148
|
+
<% if eval_result %>
|
|
149
|
+
<% score = eval_result.score %>
|
|
150
|
+
<%
|
|
151
|
+
score_class = case score
|
|
152
|
+
when 0...0.2 then 'score-bad'
|
|
153
|
+
when 0.2...0.4 then 'score-poor'
|
|
154
|
+
when 0.4...0.6 then 'score-fair'
|
|
155
|
+
when 0.6...0.8 then 'score-good'
|
|
156
|
+
else 'score-excellent'
|
|
157
|
+
end
|
|
158
|
+
%>
|
|
159
|
+
<span class="score-inline <%= score_class %>"><%= sprintf('%.2f', score) %></span>
|
|
160
|
+
<% else %>
|
|
161
|
+
<span class="text-subtle">—</span>
|
|
162
|
+
<% end %>
|
|
163
|
+
</td>
|
|
164
|
+
<% end %>
|
|
165
|
+
</tr>
|
|
166
|
+
<% end %>
|
|
167
|
+
</tbody>
|
|
168
|
+
</table>
|
|
169
|
+
</div>
|
|
115
170
|
</div>
|
|
116
171
|
<% else %>
|
|
117
|
-
<
|
|
172
|
+
<div class="empty-state-inline">
|
|
173
|
+
<p class="text-muted">No results yet. Run the experiment to see results.</p>
|
|
174
|
+
</div>
|
|
118
175
|
<% end %>
|
|
119
|
-
</
|
|
120
|
-
</div>
|
|
176
|
+
</section>
|
|
177
|
+
</div>
|
|
@@ -1,64 +1,281 @@
|
|
|
1
|
-
<% content_for :title,
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
1
|
+
<% content_for :title, @runner_result.dataset_record.display_name %>
|
|
2
|
+
<%
|
|
3
|
+
# Calculate overall status based on average score
|
|
4
|
+
avg_score = @runner_result.evaluation_results.any? ? @runner_result.evaluation_results.average(:score).to_f : nil
|
|
5
|
+
status_class = case avg_score
|
|
6
|
+
when nil then 'status-dot-pending'
|
|
7
|
+
when 0...0.4 then 'status-dot-error'
|
|
8
|
+
when 0.4...0.7 then 'status-dot-warning'
|
|
9
|
+
else 'status-dot-success'
|
|
10
|
+
end
|
|
11
|
+
%>
|
|
12
|
+
<div class="container page">
|
|
13
|
+
<%# Header with back link %>
|
|
14
|
+
<div class="page-header">
|
|
15
|
+
<div>
|
|
16
|
+
<nav class="breadcrumb mb-2">
|
|
17
|
+
<%= link_to experiments_path, class: 'breadcrumb-link', title: 'All Experiments' do %>
|
|
18
|
+
<svg class="icon-sm" style="vertical-align: -2px; margin-right: 2px;" viewBox="0 0 20 20" fill="currentColor">
|
|
19
|
+
<path d="M10.707 2.293a1 1 0 00-1.414 0l-7 7a1 1 0 001.414 1.414L4 10.414V17a1 1 0 001 1h2a1 1 0 001-1v-2a1 1 0 011-1h2a1 1 0 011 1v2a1 1 0 001 1h2a1 1 0 001-1v-6.586l.293.293a1 1 0 001.414-1.414l-7-7z" />
|
|
20
|
+
</svg>
|
|
21
|
+
<% end %>
|
|
22
|
+
<span class="breadcrumb-sep">/</span>
|
|
23
|
+
<%= link_to @experiment.name, experiment_path(@experiment), class: 'breadcrumb-link' %>
|
|
24
|
+
<span class="breadcrumb-sep">/</span>
|
|
25
|
+
<span class="breadcrumb-current">Result #<%= @runner_result.id %></span>
|
|
26
|
+
</nav>
|
|
27
|
+
<h1 class="flex items-center gap-3">
|
|
28
|
+
<span class="status-dot <%= status_class %>"></span>
|
|
29
|
+
<%= @runner_result.dataset_record.display_name %>
|
|
30
|
+
</h1>
|
|
20
31
|
</div>
|
|
21
|
-
<div class="
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
<h3 class="text-lg font-semibold text-indigo-200 mb-2"><%= eval_result.evaluator_class %></h3>
|
|
28
|
-
<% score = eval_result.score %>
|
|
29
|
-
<% color_class = case score
|
|
30
|
-
when 0...0.2 then 'text-red-500'
|
|
31
|
-
when 0.2...0.4 then 'text-orange-500'
|
|
32
|
-
when 0.4...0.6 then 'text-yellow-500'
|
|
33
|
-
when 0.6...0.8 then 'text-lime-500'
|
|
34
|
-
when 0.8...1.0 then 'text-green-400'
|
|
35
|
-
else 'text-green-300'
|
|
36
|
-
end %>
|
|
37
|
-
<p class="text-xl font-bold <%= color_class %>"><%= sprintf('%.2f', score) %></p>
|
|
38
|
-
</div>
|
|
39
|
-
<% end %>
|
|
40
|
-
</div>
|
|
41
|
-
<% else %>
|
|
42
|
-
<p class="text-gray-400">No evaluation results available.</p>
|
|
32
|
+
<div class="btn-group">
|
|
33
|
+
<%= link_to workbench_index_path(prompt_id: @runner_result.prompt_id, dataset_record_id: @runner_result.dataset_record_id, runner: @experiment.runner_class), class: 'btn btn-secondary' do %>
|
|
34
|
+
<svg class="icon-sm" viewBox="0 0 20 20" fill="currentColor">
|
|
35
|
+
<path d="M13.586 3.586a2 2 0 112.828 2.828l-.793.793-2.828-2.828.793-.793zM11.379 5.793L3 14.172V17h2.828l8.38-8.379-2.83-2.828z" />
|
|
36
|
+
</svg>
|
|
37
|
+
Open in Workbench
|
|
43
38
|
<% end %>
|
|
44
39
|
</div>
|
|
45
40
|
</div>
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
41
|
+
|
|
42
|
+
<%# Inline metadata with icons %>
|
|
43
|
+
<div class="result-meta-bar mb-6">
|
|
44
|
+
<div class="result-meta-item">
|
|
45
|
+
<svg class="icon-sm text-muted" viewBox="0 0 20 20" fill="currentColor">
|
|
46
|
+
<path fill-rule="evenodd" d="M4 4a2 2 0 012-2h4.586A2 2 0 0112 2.586L15.414 6A2 2 0 0116 7.414V16a2 2 0 01-2 2H6a2 2 0 01-2-2V4zm2 6a1 1 0 011-1h6a1 1 0 110 2H7a1 1 0 01-1-1zm1 3a1 1 0 100 2h6a1 1 0 100-2H7z" clip-rule="evenodd" />
|
|
47
|
+
</svg>
|
|
48
|
+
<span class="result-meta-label">Prompt</span>
|
|
49
|
+
<%= link_to @runner_result.prompt.name, prompt_path(@runner_result.prompt), class: 'result-meta-value result-meta-link' %>
|
|
50
|
+
<span class="tag tag-gray">v<%= @runner_result.prompt_version %></span>
|
|
51
|
+
</div>
|
|
52
|
+
<div class="result-meta-divider"></div>
|
|
53
|
+
<div class="result-meta-item">
|
|
54
|
+
<svg class="icon-sm text-muted" viewBox="0 0 20 20" fill="currentColor">
|
|
55
|
+
<path fill-rule="evenodd" d="M11.3 1.046A1 1 0 0112 2v5h4a1 1 0 01.82 1.573l-7 10A1 1 0 018 18v-5H4a1 1 0 01-.82-1.573l7-10a1 1 0 011.12-.38z" clip-rule="evenodd" />
|
|
56
|
+
</svg>
|
|
57
|
+
<span class="result-meta-label">Runner</span>
|
|
58
|
+
<span class="result-meta-value font-mono text-sm"><%= @experiment.runner_class.to_s.demodulize %></span>
|
|
59
|
+
</div>
|
|
60
|
+
<div class="result-meta-divider"></div>
|
|
61
|
+
<div class="result-meta-item">
|
|
62
|
+
<svg class="icon-sm text-muted" viewBox="0 0 20 20" fill="currentColor">
|
|
63
|
+
<path fill-rule="evenodd" d="M10 18a8 8 0 100-16 8 8 0 000 16zm1-12a1 1 0 10-2 0v4a1 1 0 00.293.707l2.828 2.829a1 1 0 101.415-1.415L11 9.586V6z" clip-rule="evenodd" />
|
|
64
|
+
</svg>
|
|
65
|
+
<span class="result-meta-label">Ran</span>
|
|
66
|
+
<span class="result-meta-value" title="<%= @runner_result.created_at.strftime('%B %d, %Y at %I:%M %p') %>"><%= time_ago_in_words(@runner_result.created_at) %> ago</span>
|
|
67
|
+
</div>
|
|
68
|
+
<% if @runner_result.evaluation_results.any? %>
|
|
69
|
+
<div class="result-meta-divider"></div>
|
|
70
|
+
<div class="result-meta-item">
|
|
71
|
+
<svg class="icon-sm text-muted" viewBox="0 0 20 20" fill="currentColor">
|
|
72
|
+
<path d="M9 2a1 1 0 000 2h2a1 1 0 100-2H9z" />
|
|
73
|
+
<path fill-rule="evenodd" d="M4 5a2 2 0 012-2 3 3 0 003 3h2a3 3 0 003-3 2 2 0 012 2v11a2 2 0 01-2 2H6a2 2 0 01-2-2V5zm9.707 5.707a1 1 0 00-1.414-1.414L9 12.586l-1.293-1.293a1 1 0 00-1.414 1.414l2 2a1 1 0 001.414 0l4-4z" clip-rule="evenodd" />
|
|
74
|
+
</svg>
|
|
75
|
+
<span class="result-meta-label">Evaluations</span>
|
|
76
|
+
<span class="result-meta-value"><%= @runner_result.evaluation_results.count %></span>
|
|
77
|
+
</div>
|
|
78
|
+
<% end %>
|
|
79
|
+
</div>
|
|
80
|
+
|
|
81
|
+
<%# Evaluation Scores Section %>
|
|
82
|
+
<% if @runner_result.evaluation_results.any? %>
|
|
83
|
+
<div class="result-section mb-6">
|
|
84
|
+
<div class="result-section-header">
|
|
85
|
+
<h2 class="result-section-title">
|
|
86
|
+
<svg class="icon-sm" viewBox="0 0 20 20" fill="currentColor">
|
|
87
|
+
<path d="M9.049 2.927c.3-.921 1.603-.921 1.902 0l1.07 3.292a1 1 0 00.95.69h3.462c.969 0 1.371 1.24.588 1.81l-2.8 2.034a1 1 0 00-.364 1.118l1.07 3.292c.3.921-.755 1.688-1.54 1.118l-2.8-2.034a1 1 0 00-1.175 0l-2.8 2.034c-.784.57-1.838-.197-1.539-1.118l1.07-3.292a1 1 0 00-.364-1.118L2.98 8.72c-.783-.57-.38-1.81.588-1.81h3.461a1 1 0 00.951-.69l1.07-3.292z" />
|
|
88
|
+
</svg>
|
|
89
|
+
Evaluation Scores
|
|
90
|
+
</h2>
|
|
91
|
+
<span class="result-section-count"><%= @runner_result.evaluation_results.count %> evaluator<%= @runner_result.evaluation_results.count == 1 ? '' : 's' %></span>
|
|
92
|
+
</div>
|
|
93
|
+
<div class="eval-summary">
|
|
94
|
+
<% @runner_result.evaluation_results.each do |eval_result| %>
|
|
95
|
+
<%
|
|
96
|
+
score = eval_result.score
|
|
97
|
+
score_pct = (score * 100).round
|
|
98
|
+
score_class = case score
|
|
99
|
+
when 0...0.2 then 'score-bad'
|
|
100
|
+
when 0.2...0.4 then 'score-poor'
|
|
101
|
+
when 0.4...0.6 then 'score-fair'
|
|
102
|
+
when 0.6...0.8 then 'score-good'
|
|
103
|
+
else 'score-excellent'
|
|
104
|
+
end
|
|
105
|
+
score_label = case score
|
|
106
|
+
when 0...0.2 then 'Bad'
|
|
107
|
+
when 0.2...0.4 then 'Poor'
|
|
108
|
+
when 0.4...0.6 then 'Fair'
|
|
109
|
+
when 0.6...0.8 then 'Good'
|
|
110
|
+
else 'Excellent'
|
|
111
|
+
end
|
|
112
|
+
short_name = eval_result.evaluator_class.demodulize
|
|
113
|
+
.gsub(/Evaluator$/, '')
|
|
114
|
+
.gsub(/Eval$/, '')
|
|
115
|
+
.gsub(/^Sentiment/, '')
|
|
116
|
+
short_name = short_name.presence || eval_result.evaluator_class.demodulize.gsub(/Eval(uator)?$/, '')
|
|
117
|
+
%>
|
|
118
|
+
<div class="eval-summary-card" title="<%= eval_result.evaluator_class %>">
|
|
119
|
+
<span class="eval-summary-score <%= score_class %>"><%= score_pct %><span class="eval-summary-pct">%</span></span>
|
|
120
|
+
<span class="eval-summary-name"><%= short_name %></span>
|
|
121
|
+
<div class="eval-summary-bar">
|
|
122
|
+
<div class="eval-summary-bar-fill <%= score_class %>" style="width: <%= score_pct %>%"></div>
|
|
123
|
+
</div>
|
|
124
|
+
<span class="eval-summary-label <%= score_class %>"><%= score_label %></span>
|
|
55
125
|
</div>
|
|
56
126
|
<% end %>
|
|
57
127
|
</div>
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
128
|
+
</div>
|
|
129
|
+
<% end %>
|
|
130
|
+
|
|
131
|
+
<%# Predictions and Ground Truth - Side by side comparison %>
|
|
132
|
+
<%
|
|
133
|
+
predictions = @runner_result.parsed_predictions.presence || [@runner_result.prediction].compact
|
|
134
|
+
prediction_text = predictions.first.to_s
|
|
135
|
+
expected_text = @runner_result.ground_truth.to_s
|
|
136
|
+
is_match = prediction_text.strip == expected_text.strip && prediction_text.present?
|
|
137
|
+
%>
|
|
138
|
+
<div class="result-section mb-6">
|
|
139
|
+
<div class="result-section-header">
|
|
140
|
+
<h2 class="result-section-title">
|
|
141
|
+
<svg class="icon-sm" viewBox="0 0 20 20" fill="currentColor">
|
|
142
|
+
<path fill-rule="evenodd" d="M4 2a1 1 0 011 1v2.101a7.002 7.002 0 0111.601 2.566 1 1 0 11-1.885.666A5.002 5.002 0 005.999 7H9a1 1 0 010 2H4a1 1 0 01-1-1V3a1 1 0 011-1zm.008 9.057a1 1 0 011.276.61A5.002 5.002 0 0014.001 13H11a1 1 0 110-2h5a1 1 0 011 1v5a1 1 0 11-2 0v-2.101a7.002 7.002 0 01-11.601-2.566 1 1 0 01.61-1.276z" clip-rule="evenodd" />
|
|
143
|
+
</svg>
|
|
144
|
+
Output Comparison
|
|
145
|
+
</h2>
|
|
146
|
+
<% if is_match %>
|
|
147
|
+
<span class="comparison-match-badge comparison-match-badge--match">
|
|
148
|
+
<svg class="icon-sm" viewBox="0 0 20 20" fill="currentColor">
|
|
149
|
+
<path fill-rule="evenodd" d="M10 18a8 8 0 100-16 8 8 0 000 16zm3.707-9.293a1 1 0 00-1.414-1.414L9 10.586 7.707 9.293a1 1 0 00-1.414 1.414l2 2a1 1 0 001.414 0l4-4z" clip-rule="evenodd" />
|
|
150
|
+
</svg>
|
|
151
|
+
Exact Match
|
|
152
|
+
</span>
|
|
153
|
+
<% elsif prediction_text.present? && expected_text.present? %>
|
|
154
|
+
<span class="comparison-match-badge comparison-match-badge--mismatch">
|
|
155
|
+
<svg class="icon-sm" viewBox="0 0 20 20" fill="currentColor">
|
|
156
|
+
<path fill-rule="evenodd" d="M18 10a8 8 0 11-16 0 8 8 0 0116 0zm-7 4a1 1 0 11-2 0 1 1 0 012 0zm-1-9a1 1 0 00-1 1v4a1 1 0 102 0V6a1 1 0 00-1-1z" clip-rule="evenodd" />
|
|
157
|
+
</svg>
|
|
158
|
+
Difference
|
|
159
|
+
</span>
|
|
160
|
+
<% end %>
|
|
161
|
+
</div>
|
|
162
|
+
|
|
163
|
+
<div class="result-comparison">
|
|
164
|
+
<div class="result-panel result-panel--prediction">
|
|
165
|
+
<div class="result-panel-header">
|
|
166
|
+
<div class="result-panel-header-content">
|
|
167
|
+
<svg class="icon-sm" viewBox="0 0 20 20" fill="currentColor">
|
|
168
|
+
<path fill-rule="evenodd" d="M11.3 1.046A1 1 0 0112 2v5h4a1 1 0 01.82 1.573l-7 10A1 1 0 018 18v-5H4a1 1 0 01-.82-1.573l7-10a1 1 0 011.12-.38z" clip-rule="evenodd" />
|
|
169
|
+
</svg>
|
|
170
|
+
<span class="result-panel-label">Prediction</span>
|
|
171
|
+
</div>
|
|
172
|
+
<span class="result-panel-meta"><%= prediction_text.length %> chars</span>
|
|
173
|
+
</div>
|
|
174
|
+
<div class="result-panel-content">
|
|
175
|
+
<% if predictions.any? %>
|
|
176
|
+
<% predictions.each_with_index do |prediction, index| %>
|
|
177
|
+
<div class="result-value-block <%= index > 0 ? 'mt-3' : '' %>">
|
|
178
|
+
<% if predictions.size > 1 %>
|
|
179
|
+
<span class="result-index"><%= index + 1 %></span>
|
|
180
|
+
<% end %>
|
|
181
|
+
<pre class="result-code"><code><%= prediction.presence || '—' %></code></pre>
|
|
182
|
+
</div>
|
|
183
|
+
<% end %>
|
|
184
|
+
<% else %>
|
|
185
|
+
<div class="result-empty-state">
|
|
186
|
+
<svg class="icon-lg text-muted" viewBox="0 0 20 20" fill="currentColor">
|
|
187
|
+
<path fill-rule="evenodd" d="M18 10a8 8 0 11-16 0 8 8 0 0116 0zm-7-4a1 1 0 11-2 0 1 1 0 012 0zM9 9a1 1 0 000 2v3a1 1 0 001 1h1a1 1 0 100-2v-3a1 1 0 00-1-1H9z" clip-rule="evenodd" />
|
|
188
|
+
</svg>
|
|
189
|
+
<span>No prediction generated</span>
|
|
190
|
+
</div>
|
|
191
|
+
<% end %>
|
|
192
|
+
</div>
|
|
193
|
+
</div>
|
|
194
|
+
<div class="result-panel result-panel--expected">
|
|
195
|
+
<div class="result-panel-header">
|
|
196
|
+
<div class="result-panel-header-content">
|
|
197
|
+
<svg class="icon-sm" viewBox="0 0 20 20" fill="currentColor">
|
|
198
|
+
<path fill-rule="evenodd" d="M6.267 3.455a3.066 3.066 0 001.745-.723 3.066 3.066 0 013.976 0 3.066 3.066 0 001.745.723 3.066 3.066 0 012.812 2.812c.051.643.304 1.254.723 1.745a3.066 3.066 0 010 3.976 3.066 3.066 0 00-.723 1.745 3.066 3.066 0 01-2.812 2.812 3.066 3.066 0 00-1.745.723 3.066 3.066 0 01-3.976 0 3.066 3.066 0 00-1.745-.723 3.066 3.066 0 01-2.812-2.812 3.066 3.066 0 00-.723-1.745 3.066 3.066 0 010-3.976 3.066 3.066 0 00.723-1.745 3.066 3.066 0 012.812-2.812zm7.44 5.252a1 1 0 00-1.414-1.414L9 10.586 7.707 9.293a1 1 0 00-1.414 1.414l2 2a1 1 0 001.414 0l4-4z" clip-rule="evenodd" />
|
|
199
|
+
</svg>
|
|
200
|
+
<span class="result-panel-label">Expected (Ground Truth)</span>
|
|
201
|
+
</div>
|
|
202
|
+
<span class="result-panel-meta"><%= expected_text.length %> chars</span>
|
|
203
|
+
</div>
|
|
204
|
+
<div class="result-panel-content">
|
|
205
|
+
<% if expected_text.present? %>
|
|
206
|
+
<pre class="result-code"><code><%= expected_text %></code></pre>
|
|
207
|
+
<% else %>
|
|
208
|
+
<div class="result-empty-state">
|
|
209
|
+
<svg class="icon-lg text-muted" viewBox="0 0 20 20" fill="currentColor">
|
|
210
|
+
<path fill-rule="evenodd" d="M18 10a8 8 0 11-16 0 8 8 0 0116 0zm-7-4a1 1 0 11-2 0 1 1 0 012 0zM9 9a1 1 0 000 2v3a1 1 0 001 1h1a1 1 0 100-2v-3a1 1 0 00-1-1H9z" clip-rule="evenodd" />
|
|
211
|
+
</svg>
|
|
212
|
+
<span>No ground truth defined</span>
|
|
213
|
+
</div>
|
|
214
|
+
<% end %>
|
|
215
|
+
</div>
|
|
61
216
|
</div>
|
|
62
217
|
</div>
|
|
63
218
|
</div>
|
|
64
|
-
|
|
219
|
+
|
|
220
|
+
<%# Raw Output - Enhanced Collapsible Section %>
|
|
221
|
+
<% if @runner_result.respond_to?(:raw_output) && @runner_result.raw_output.present? %>
|
|
222
|
+
<%
|
|
223
|
+
raw_output = @runner_result.raw_output
|
|
224
|
+
line_count = raw_output.lines.count
|
|
225
|
+
char_count = raw_output.length
|
|
226
|
+
%>
|
|
227
|
+
<div class="result-section">
|
|
228
|
+
<details class="raw-output-collapsible">
|
|
229
|
+
<summary class="raw-output-header">
|
|
230
|
+
<div class="raw-output-header-left">
|
|
231
|
+
<svg class="icon-sm raw-output-icon" viewBox="0 0 20 20" fill="currentColor">
|
|
232
|
+
<path fill-rule="evenodd" d="M12.316 3.051a1 1 0 01.633 1.265l-4 12a1 1 0 11-1.898-.632l4-12a1 1 0 011.265-.633zM5.707 6.293a1 1 0 010 1.414L3.414 10l2.293 2.293a1 1 0 11-1.414 1.414l-3-3a1 1 0 010-1.414l3-3a1 1 0 011.414 0zm8.586 0a1 1 0 011.414 0l3 3a1 1 0 010 1.414l-3 3a1 1 0 11-1.414-1.414L16.586 10l-2.293-2.293a1 1 0 010-1.414z" clip-rule="evenodd" />
|
|
233
|
+
</svg>
|
|
234
|
+
<span class="raw-output-title">Raw Output</span>
|
|
235
|
+
<span class="raw-output-stats">
|
|
236
|
+
<span class="raw-output-stat"><%= line_count %> line<%= line_count == 1 ? '' : 's' %></span>
|
|
237
|
+
<span class="raw-output-stat-sep"></span>
|
|
238
|
+
<span class="raw-output-stat"><%= number_to_human_size(char_count) %></span>
|
|
239
|
+
</span>
|
|
240
|
+
</div>
|
|
241
|
+
<div class="raw-output-header-right">
|
|
242
|
+
<span class="raw-output-expand-hint">
|
|
243
|
+
<span class="expand-text">Show</span>
|
|
244
|
+
<span class="collapse-text">Hide</span>
|
|
245
|
+
</span>
|
|
246
|
+
<svg class="raw-output-chevron" viewBox="0 0 20 20" fill="currentColor">
|
|
247
|
+
<path fill-rule="evenodd" d="M5.293 7.293a1 1 0 011.414 0L10 10.586l3.293-3.293a1 1 0 111.414 1.414l-4 4a1 1 0 01-1.414 0l-4-4a1 1 0 010-1.414z" clip-rule="evenodd" />
|
|
248
|
+
</svg>
|
|
249
|
+
</div>
|
|
250
|
+
</summary>
|
|
251
|
+
<div class="raw-output-body">
|
|
252
|
+
<div class="raw-output-toolbar">
|
|
253
|
+
<span class="raw-output-lang-hint">
|
|
254
|
+
<svg class="icon-sm" viewBox="0 0 20 20" fill="currentColor">
|
|
255
|
+
<path fill-rule="evenodd" d="M18 10a8 8 0 11-16 0 8 8 0 0116 0zm-7-4a1 1 0 11-2 0 1 1 0 012 0zM9 9a1 1 0 000 2v3a1 1 0 001 1h1a1 1 0 100-2v-3a1 1 0 00-1-1H9z" clip-rule="evenodd" />
|
|
256
|
+
</svg>
|
|
257
|
+
Response from LLM
|
|
258
|
+
</span>
|
|
259
|
+
</div>
|
|
260
|
+
<pre class="raw-output-code"><code><%= raw_output %></code></pre>
|
|
261
|
+
</div>
|
|
262
|
+
</details>
|
|
263
|
+
</div>
|
|
264
|
+
<% end %>
|
|
265
|
+
|
|
266
|
+
<%# Dataset Record Link - Footer %>
|
|
267
|
+
<div class="result-footer mt-8">
|
|
268
|
+
<div class="result-footer-content">
|
|
269
|
+
<span class="result-footer-label">Dataset Record:</span>
|
|
270
|
+
<%= link_to dataset_dataset_record_path(@runner_result.dataset_record.dataset, @runner_result.dataset_record), class: 'result-footer-link' do %>
|
|
271
|
+
<svg class="icon-sm" viewBox="0 0 20 20" fill="currentColor">
|
|
272
|
+
<path d="M4 4a2 2 0 012-2h4.586A2 2 0 0112 2.586L15.414 6A2 2 0 0116 7.414V16a2 2 0 01-2 2H6a2 2 0 01-2-2V4z" />
|
|
273
|
+
</svg>
|
|
274
|
+
<%= @runner_result.dataset_record.display_name %>
|
|
275
|
+
<svg class="icon-sm" viewBox="0 0 20 20" fill="currentColor">
|
|
276
|
+
<path fill-rule="evenodd" d="M7.293 14.707a1 1 0 010-1.414L10.586 10 7.293 6.707a1 1 0 011.414-1.414l4 4a1 1 0 010 1.414l-4 4a1 1 0 01-1.414 0z" clip-rule="evenodd" />
|
|
277
|
+
</svg>
|
|
278
|
+
<% end %>
|
|
279
|
+
</div>
|
|
280
|
+
</div>
|
|
281
|
+
</div>
|