leva 0.1.9.1 → 0.1.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. checksums.yaml +4 -4
  2. data/README.md +21 -5
  3. data/app/assets/stylesheets/leva/application.css +3083 -15
  4. data/app/controllers/leva/application_controller.rb +1 -1
  5. data/app/controllers/leva/dataset_records_controller.rb +1 -1
  6. data/app/controllers/leva/datasets_controller.rb +6 -6
  7. data/app/controllers/leva/design_system_controller.rb +9 -0
  8. data/app/controllers/leva/experiments_controller.rb +8 -8
  9. data/app/controllers/leva/runner_results_controller.rb +1 -1
  10. data/app/controllers/leva/workbench_controller.rb +26 -15
  11. data/app/helpers/leva/application_helper.rb +7 -7
  12. data/app/jobs/leva/experiment_job.rb +1 -1
  13. data/app/jobs/leva/run_eval_job.rb +1 -1
  14. data/app/models/concerns/leva/recordable.rb +5 -5
  15. data/app/models/leva/dataset.rb +1 -1
  16. data/app/models/leva/evaluation_result.rb +1 -1
  17. data/app/models/leva/experiment.rb +1 -1
  18. data/app/models/leva/prompt.rb +1 -1
  19. data/app/views/layouts/leva/application.html.erb +23 -24
  20. data/app/views/leva/dataset_records/index.html.erb +70 -43
  21. data/app/views/leva/dataset_records/show.html.erb +115 -25
  22. data/app/views/leva/datasets/_dataset.html.erb +11 -18
  23. data/app/views/leva/datasets/_form.html.erb +18 -14
  24. data/app/views/leva/datasets/edit.html.erb +16 -4
  25. data/app/views/leva/datasets/index.html.erb +33 -41
  26. data/app/views/leva/datasets/new.html.erb +15 -4
  27. data/app/views/leva/datasets/show.html.erb +120 -139
  28. data/app/views/leva/design_system/index.html.erb +1731 -0
  29. data/app/views/leva/experiments/_experiment.html.erb +46 -31
  30. data/app/views/leva/experiments/_form.html.erb +62 -35
  31. data/app/views/leva/experiments/edit.html.erb +17 -3
  32. data/app/views/leva/experiments/index.html.erb +41 -36
  33. data/app/views/leva/experiments/new.html.erb +52 -4
  34. data/app/views/leva/experiments/show.html.erb +155 -98
  35. data/app/views/leva/runner_results/show.html.erb +271 -54
  36. data/app/views/leva/workbench/_evaluation_area.html.erb +18 -4
  37. data/app/views/leva/workbench/_prompt_content.html.erb +124 -73
  38. data/app/views/leva/workbench/_prompt_form.html.erb +24 -23
  39. data/app/views/leva/workbench/_prompt_sidebar.html.erb +57 -12
  40. data/app/views/leva/workbench/_results_section.html.erb +274 -112
  41. data/app/views/leva/workbench/_top_bar.html.erb +16 -6
  42. data/app/views/leva/workbench/edit.html.erb +46 -15
  43. data/app/views/leva/workbench/index.html.erb +5 -8
  44. data/app/views/leva/workbench/new.html.erb +74 -42
  45. data/config/routes.rb +11 -9
  46. data/db/migrate/20240813173033_create_leva_dataset_records.rb +1 -0
  47. data/db/migrate/20240813173035_create_leva_experiments.rb +2 -0
  48. data/db/migrate/{20240816201419_create_leva_runner_results.rb → 20240813173040_create_leva_runner_results.rb} +4 -1
  49. data/db/migrate/20240813173050_create_leva_evaluation_results.rb +3 -3
  50. data/lib/generators/leva/eval_generator.rb +4 -4
  51. data/lib/generators/leva/runner_generator.rb +4 -4
  52. data/lib/generators/leva/templates/runner.rb.erb +20 -0
  53. data/lib/leva/version.rb +1 -1
  54. data/lib/leva.rb +24 -2
  55. metadata +5 -11
  56. data/db/migrate/20240816201433_update_leva_evaluation_results.rb +0 -8
  57. data/db/migrate/20240821163608_make_experiment_optional_for_runner_results.rb +0 -6
  58. data/db/migrate/20240821181934_add_prompt_to_leva_runner_results.rb +0 -5
  59. data/db/migrate/20240821183153_add_runner_and_evaluator_to_leva_experiments.rb +0 -6
  60. data/db/migrate/20240821191713_add_actual_result_to_leva_dataset_records.rb +0 -5
  61. data/db/migrate/20240822143201_remove_actual_result_from_leva_runner_results.rb +0 -5
  62. data/db/migrate/20240912183556_add_runner_class_to_leva_runner_results.rb +0 -5
  63. data/lib/tasks/auto_annotate_models.rake +0 -59
@@ -4,117 +4,174 @@
4
4
  <meta http-equiv="refresh" content="5">
5
5
  <% end %>
6
6
  <% end %>
7
- <div class="container mx-auto px-4 py-8 bg-gray-950 text-white">
8
- <div class="mb-8">
9
- <div class="flex justify-between items-center">
10
- <h1 class="text-3xl font-bold text-indigo-400 mb-2"><%= @experiment.name %></h1>
11
- <div class="flex items-center space-x-4">
12
- <% if @experiment.status != 'completed' %>
13
- <%= link_to edit_experiment_path(@experiment), class: 'btn btn-secondary flex items-center' do %>
14
- <svg xmlns="http://www.w3.org/2000/svg" class="h-5 w-5 mr-2" viewBox="0 0 20 20" fill="currentColor">
15
- <path d="M13.586 3.586a2 2 0 112.828 2.828l-.793.793-2.828-2.828.793-.793zM11.379 5.793L3 14.172V17h2.828l8.38-8.379-2.83-2.828z" />
16
- </svg>
17
- Edit Experiment
18
- <% end %>
19
- <% end %>
20
- <%= button_to rerun_experiment_path(@experiment), method: :post, class: 'btn btn-primary flex items-center', data: { confirm: 'Are you sure you want to rerun this experiment? This will delete all existing results.' } do %>
21
- <svg xmlns="http://www.w3.org/2000/svg" class="h-5 w-5 mr-2" viewBox="0 0 20 20" fill="currentColor">
22
- <path fill-rule="evenodd" d="M4 2a1 1 0 011 1v2.101a7.002 7.002 0 0111.601 2.566 1 1 0 11-1.885.666A5.002 5.002 0 005.999 7H9a1 1 0 010 2H4a1 1 0 01-1-1V3a1 1 0 011-1zm.008 9.057a1 1 0 011.276.61A5.002 5.002 0 0014.001 13H11a1 1 0 110-2h5a1 1 0 011 1v5a1 1 0 11-2 0v-2.101a7.002 7.002 0 01-11.601-2.566 1 1 0 01.61-1.276z" clip-rule="evenodd" />
7
+
8
+ <div class="container page">
9
+ <div class="page-header">
10
+ <div class="page-header-content">
11
+ <div class="page-header-title-row">
12
+ <h1><%= @experiment.name %></h1>
13
+ <%
14
+ status_dot = case @experiment.status
15
+ when 'pending' then 'status-dot-pending'
16
+ when 'running' then 'status-dot-info'
17
+ when 'completed' then 'status-dot-success'
18
+ when 'failed' then 'status-dot-error'
19
+ else 'status-dot-pending'
20
+ end
21
+ %>
22
+ <span class="status-indicator">
23
+ <span class="status-dot <%= status_dot %>"></span>
24
+ <span class="text-sm text-muted"><%= @experiment.status&.capitalize || 'N/A' %></span>
25
+ </span>
26
+ </div>
27
+ <% if @experiment.description.present? %>
28
+ <p class="text-muted text-sm mt-1"><%= @experiment.description %></p>
29
+ <% end %>
30
+ </div>
31
+ <div class="btn-group">
32
+ <% if @experiment.status != 'completed' %>
33
+ <%= link_to edit_experiment_path(@experiment), class: 'btn btn-ghost' do %>
34
+ <svg class="icon-sm" viewBox="0 0 20 20" fill="currentColor">
35
+ <path d="M13.586 3.586a2 2 0 112.828 2.828l-.793.793-2.828-2.828.793-.793zM11.379 5.793L3 14.172V17h2.828l8.38-8.379-2.83-2.828z" />
23
36
  </svg>
24
- Rerun Experiment
37
+ Edit
25
38
  <% end %>
26
- </div>
39
+ <% end %>
40
+ <%= button_to rerun_experiment_path(@experiment), method: :post, class: 'btn btn-primary', data: { confirm: 'Are you sure you want to rerun this experiment? This will delete all existing results.' } do %>
41
+ <svg class="icon-sm" viewBox="0 0 20 20" fill="currentColor">
42
+ <path fill-rule="evenodd" d="M4 2a1 1 0 011 1v2.101a7.002 7.002 0 0111.601 2.566 1 1 0 11-1.885.666A5.002 5.002 0 005.999 7H9a1 1 0 010 2H4a1 1 0 01-1-1V3a1 1 0 011-1zm.008 9.057a1 1 0 011.276.61A5.002 5.002 0 0014.001 13H11a1 1 0 110-2h5a1 1 0 011 1v5a1 1 0 11-2 0v-2.101a7.002 7.002 0 01-11.601-2.566 1 1 0 01.61-1.276z" clip-rule="evenodd" />
43
+ </svg>
44
+ Rerun
45
+ <% end %>
27
46
  </div>
28
- <p class="text-gray-400"><%= @experiment.description %></p>
29
- <p class="text-indigo-300 mt-2">Status: <%= @experiment.status&.capitalize || 'N/A' %></p>
30
47
  </div>
31
- <div class="bg-gray-800 rounded-lg shadow-lg p-6 mb-8">
32
- <h2 class="text-3xl font-semibold text-indigo-300 mb-6">Evaluation Summary</h2>
33
- <% if @experiment.evaluation_results.any? %>
34
- <div class="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-6">
48
+
49
+ <%# Experiment Configuration %>
50
+ <section class="mb-6">
51
+ <div class="exp-meta">
52
+ <div class="exp-meta-item">
53
+ <span class="exp-meta-label">Dataset</span>
54
+ <%= link_to @experiment.dataset.name, dataset_path(@experiment.dataset), class: 'exp-meta-value exp-meta-link' %>
55
+ </div>
56
+ <div class="exp-meta-item">
57
+ <span class="exp-meta-label">Prompt</span>
58
+ <span class="exp-meta-value"><%= @experiment.prompt ? @experiment.prompt.name : '—' %></span>
59
+ </div>
60
+ <div class="exp-meta-item">
61
+ <span class="exp-meta-label">Runner</span>
62
+ <span class="exp-meta-value font-mono text-sm"><%= @experiment.runner_class&.demodulize || '—' %></span>
63
+ </div>
64
+ <div class="exp-meta-item">
65
+ <span class="exp-meta-label">Created</span>
66
+ <span class="exp-meta-value"><%= time_ago_in_words(@experiment.created_at) %> ago</span>
67
+ </div>
68
+ </div>
69
+ </section>
70
+
71
+ <%# Evaluation Summary %>
72
+ <% if @experiment.evaluation_results.any? %>
73
+ <section class="mb-8">
74
+ <div class="section-header">
75
+ <h3 class="section-title">Evaluation Scores</h3>
76
+ </div>
77
+ <div class="eval-summary">
35
78
  <% @experiment.evaluation_results.group_by(&:evaluator_class).each do |evaluator_class, results| %>
36
- <div class="bg-gray-700 rounded-lg p-6">
37
- <h3 class="text-xl font-semibold text-indigo-200 mb-4"><%= evaluator_class %></h3>
38
- <% avg_score = (results.sum(&:score) / results.size.to_f).round(2) %>
39
- <% color_class = case avg_score
40
- when 0...0.2 then 'text-red-500'
41
- when 0.2...0.4 then 'text-orange-500'
42
- when 0.4...0.6 then 'text-yellow-500'
43
- when 0.6...0.8 then 'text-lime-500'
44
- when 0.8...1.0 then 'text-green-400'
45
- else 'text-green-300'
46
- end %>
47
- <p class="text-2xl font-bold <%= color_class %> mb-2"><%= sprintf('%.2f', avg_score) %></p>
48
- <p class="text-gray-300">Number of Evaluations: <%= results.size %></p>
79
+ <%
80
+ avg_score = (results.sum(&:score) / results.size.to_f).round(2)
81
+ score_pct = (avg_score * 100).round
82
+ score_class = case avg_score
83
+ when 0...0.2 then 'score-bad'
84
+ when 0.2...0.4 then 'score-poor'
85
+ when 0.4...0.6 then 'score-fair'
86
+ when 0.6...0.8 then 'score-good'
87
+ else 'score-excellent'
88
+ end
89
+ short_name = evaluator_class.demodulize
90
+ .gsub(/Evaluator$/, '')
91
+ .gsub(/Eval$/, '')
92
+ .gsub(/^Sentiment/, '')
93
+ short_name = short_name.presence || evaluator_class.demodulize.gsub(/Eval(uator)?$/, '')
94
+ %>
95
+ <div class="eval-summary-card" title="<%= results.size %> evaluations">
96
+ <span class="eval-summary-score <%= score_class %>"><%= score_pct %><span class="eval-summary-pct">%</span></span>
97
+ <span class="eval-summary-name"><%= short_name %></span>
98
+ <div class="eval-summary-bar">
99
+ <div class="eval-summary-bar-fill <%= score_class %>" style="width: <%= score_pct %>%"></div>
100
+ </div>
101
+ <span class="eval-summary-count"><%= results.size %> runs</span>
49
102
  </div>
50
103
  <% end %>
51
104
  </div>
52
- <% else %>
53
- <p class="text-gray-400 text-xl">No evaluation results available yet.</p>
54
- <% end %>
55
- </div>
56
- <div class="bg-gray-800 rounded-lg shadow-lg p-6 mb-8">
57
- <h2 class="text-2xl font-semibold text-indigo-300 mb-4">Experiment Details</h2>
58
- <p class="text-gray-400">Dataset: <%= link_to @experiment.dataset.name, dataset_path(@experiment.dataset), class: 'text-indigo-400 hover:underline' %></p>
59
- <p class="text-gray-400">Prompt: <%= @experiment.prompt ? @experiment.prompt.name : 'Not specified' %></p>
60
- <!-- Add more experiment details as needed -->
61
- </div>
62
- <div class="bg-gray-800 rounded-lg shadow-lg p-6">
63
- <h2 class="text-2xl font-semibold text-indigo-300 mb-4">Runner Results</h2>
105
+ </section>
106
+ <% end %>
107
+
108
+ <%# Runner Results %>
109
+ <section>
110
+ <div class="section-header">
111
+ <h3 class="section-title">Results</h3>
112
+ <span class="section-count"><%= @experiment.runner_results.count %> runs</span>
113
+ </div>
64
114
  <% if @experiment.runner_results.any? %>
65
- <div class="overflow-x-auto">
66
- <table class="min-w-full divide-y divide-gray-700">
67
- <thead>
68
- <tr>
69
- <th class="px-6 py-3 text-left text-xs font-medium text-indigo-300 uppercase tracking-wider">Dataset Record</th>
70
- <th class="px-6 py-3 text-left text-xs font-medium text-indigo-300 uppercase tracking-wider">Prompt</th>
71
- <th class="px-6 py-3 text-left text-xs font-medium text-indigo-300 uppercase tracking-wider">Prediction</th>
72
- <th class="px-6 py-3 text-left text-xs font-medium text-indigo-300 uppercase tracking-wider">Ground Truth</th>
73
- <% @experiment.evaluation_results.group_by(&:evaluator_class).keys.each do |evaluator_class| %>
74
- <th class="px-6 py-3 text-left text-xs font-medium text-indigo-300 uppercase tracking-wider"><%= evaluator_class %></th>
75
- <% end %>
76
- <th class="px-6 py-3 text-left text-xs font-medium text-indigo-300 uppercase tracking-wider">Created At</th>
77
- <th class="px-6 py-3 text-left text-xs font-medium text-indigo-300 uppercase tracking-wider">Actions</th>
78
- </tr>
79
- </thead>
80
- <tbody class="bg-gray-700 divide-y divide-gray-600">
81
- <% @experiment.runner_results.each do |runner_result| %>
115
+ <div class="table-wrapper">
116
+ <div class="table-scroll">
117
+ <table class="table table-clickable table-results">
118
+ <thead>
82
119
  <tr>
83
- <td class="px-6 py-4 whitespace-nowrap text-sm text-gray-300"><%= runner_result.dataset_record.display_name %></td>
84
- <td class="px-6 py-4 whitespace-nowrap text-sm text-gray-300"><%= runner_result.prompt.name %> (v<%= runner_result.prompt_version %>)</td>
85
- <td class="px-6 py-4 text-sm text-gray-300"><%= truncate(runner_result.prediction, length: 30) %></td>
86
- <td class="px-6 py-4 text-sm text-gray-300"><%= truncate(runner_result.ground_truth, length: 30) %></td>
120
+ <th>Record</th>
121
+ <th>Prediction</th>
122
+ <th>Expected</th>
87
123
  <% @experiment.evaluation_results.group_by(&:evaluator_class).keys.each do |evaluator_class| %>
88
- <% eval_result = runner_result.evaluation_results.find_by(evaluator_class: evaluator_class) %>
89
- <td class="px-6 py-4 whitespace-nowrap text-sm">
90
- <% if eval_result %>
91
- <% score = eval_result.score %>
92
- <% color_class = case score
93
- when 0...0.2 then 'text-red-500'
94
- when 0.2...0.4 then 'text-orange-500'
95
- when 0.4...0.6 then 'text-yellow-500'
96
- when 0.6...0.8 then 'text-lime-500'
97
- when 0.8...1.0 then 'text-green-400'
98
- else 'text-green-300'
99
- end %>
100
- <span class="<%= color_class %> font-semibold"><%= sprintf('%.2f', score) %></span>
101
- <% else %>
102
- <span class="text-gray-400">N/A</span>
103
- <% end %>
104
- </td>
124
+ <%
125
+ short_name = evaluator_class.demodulize
126
+ .gsub(/Evaluator$/, '')
127
+ .gsub(/Eval$/, '')
128
+ .gsub(/^Sentiment/, '')
129
+ short_name = short_name.presence || evaluator_class.demodulize.gsub(/Eval(uator)?$/, '')
130
+ %>
131
+ <th class="text-right" style="width: 80px;"><%= short_name %></th>
105
132
  <% end %>
106
- <td class="px-6 py-4 whitespace-nowrap text-sm text-gray-300"><%= runner_result.created_at.strftime("%Y-%m-%d %H:%M:%S") %></td>
107
- <td class="px-6 py-4 whitespace-nowrap text-sm text-gray-300">
108
- <%= link_to 'View Details', experiment_runner_result_path(@experiment, runner_result), class: 'text-indigo-400 hover:underline mr-2' %>
109
- <%= link_to 'Experiment', workbench_index_path(prompt_id: runner_result.prompt_id, dataset_record_id: runner_result.dataset_record_id, runner: @experiment.runner_class), class: 'text-indigo-400 hover:underline' %>
110
- </td>
111
133
  </tr>
112
- <% end %>
113
- </tbody>
114
- </table>
134
+ </thead>
135
+ <tbody>
136
+ <% @experiment.runner_results.each do |runner_result| %>
137
+ <tr class="clickable-row" onclick="window.location='<%= experiment_runner_result_path(@experiment, runner_result) %>'">
138
+ <td>
139
+ <span class="row-title"><%= runner_result.dataset_record.display_name %></span>
140
+ </td>
141
+ <td>
142
+ <span class="prediction-badge"><%= truncate(runner_result.prediction.to_s.strip, length: 25) %></span>
143
+ </td>
144
+ <td class="text-muted"><%= truncate(runner_result.ground_truth.to_s.strip.presence || '—', length: 25) %></td>
145
+ <% @experiment.evaluation_results.group_by(&:evaluator_class).keys.each do |evaluator_class| %>
146
+ <td class="text-right">
147
+ <% eval_result = runner_result.evaluation_results.find_by(evaluator_class: evaluator_class) %>
148
+ <% if eval_result %>
149
+ <% score = eval_result.score %>
150
+ <%
151
+ score_class = case score
152
+ when 0...0.2 then 'score-bad'
153
+ when 0.2...0.4 then 'score-poor'
154
+ when 0.4...0.6 then 'score-fair'
155
+ when 0.6...0.8 then 'score-good'
156
+ else 'score-excellent'
157
+ end
158
+ %>
159
+ <span class="score-inline <%= score_class %>"><%= sprintf('%.2f', score) %></span>
160
+ <% else %>
161
+ <span class="text-subtle">—</span>
162
+ <% end %>
163
+ </td>
164
+ <% end %>
165
+ </tr>
166
+ <% end %>
167
+ </tbody>
168
+ </table>
169
+ </div>
115
170
  </div>
116
171
  <% else %>
117
- <p class="text-gray-400">No runner results available yet.</p>
172
+ <div class="empty-state-inline">
173
+ <p class="text-muted">No results yet. Run the experiment to see results.</p>
174
+ </div>
118
175
  <% end %>
119
- </div>
120
- </div>
176
+ </section>
177
+ </div>
@@ -1,64 +1,281 @@
1
- <% content_for :title, "Runner Result Details" %>
2
- <div class="container mx-auto px-4 py-8 bg-gray-950 text-white">
3
- <div class="mb-8">
4
- <h1 class="text-3xl font-bold text-indigo-400 mb-2">Runner Result Details</h1>
5
- <%= link_to "Back to Experiment", experiment_path(@experiment), class: "text-indigo-400 hover:underline" %>
6
- </div>
7
- <div class="grid grid-cols-1 md:grid-cols-2 gap-6 mb-8">
8
- <div class="bg-gray-800 rounded-lg shadow-lg p-6">
9
- <h2 class="text-2xl font-semibold text-indigo-300 mb-4">Details</h2>
10
- <p class="text-gray-400">
11
- <strong class="text-indigo-300">Dataset Record:</strong>
12
- <%= link_to @runner_result.dataset_record.display_name, dataset_dataset_record_path(@runner_result.dataset_record.dataset, @runner_result.dataset_record), class: "text-indigo-400 hover:underline" %>
13
- </p>
14
- <p class="text-gray-400">
15
- <strong class="text-indigo-300">Prompt:</strong>
16
- <%= link_to "#{@runner_result.prompt.name} (v#{@runner_result.prompt_version})", prompt_path(@runner_result.prompt), class: "text-indigo-400 hover:underline" %>
17
- </p>
18
- <p class="text-gray-400"><strong class="text-indigo-300">Created At:</strong> <%= @runner_result.created_at.strftime("%Y-%m-%d %H:%M:%S") %></p>
19
- <%= link_to 'Run in Workbench', workbench_index_path(prompt_id: @runner_result.prompt_id, dataset_record_id: @runner_result.dataset_record_id, runner: @experiment.runner_class), class: 'mt-4 inline-block px-4 py-2 bg-indigo-600 text-white rounded hover:bg-indigo-700 transition-colors duration-200' %>
1
+ <% content_for :title, @runner_result.dataset_record.display_name %>
2
+ <%
3
+ # Calculate overall status based on average score
4
+ avg_score = @runner_result.evaluation_results.any? ? @runner_result.evaluation_results.average(:score).to_f : nil
5
+ status_class = case avg_score
6
+ when nil then 'status-dot-pending'
7
+ when 0...0.4 then 'status-dot-error'
8
+ when 0.4...0.7 then 'status-dot-warning'
9
+ else 'status-dot-success'
10
+ end
11
+ %>
12
+ <div class="container page">
13
+ <%# Header with back link %>
14
+ <div class="page-header">
15
+ <div>
16
+ <nav class="breadcrumb mb-2">
17
+ <%= link_to experiments_path, class: 'breadcrumb-link', title: 'All Experiments' do %>
18
+ <svg class="icon-sm" style="vertical-align: -2px; margin-right: 2px;" viewBox="0 0 20 20" fill="currentColor">
19
+ <path d="M10.707 2.293a1 1 0 00-1.414 0l-7 7a1 1 0 001.414 1.414L4 10.414V17a1 1 0 001 1h2a1 1 0 001-1v-2a1 1 0 011-1h2a1 1 0 011 1v2a1 1 0 001 1h2a1 1 0 001-1v-6.586l.293.293a1 1 0 001.414-1.414l-7-7z" />
20
+ </svg>
21
+ <% end %>
22
+ <span class="breadcrumb-sep">/</span>
23
+ <%= link_to @experiment.name, experiment_path(@experiment), class: 'breadcrumb-link' %>
24
+ <span class="breadcrumb-sep">/</span>
25
+ <span class="breadcrumb-current">Result #<%= @runner_result.id %></span>
26
+ </nav>
27
+ <h1 class="flex items-center gap-3">
28
+ <span class="status-dot <%= status_class %>"></span>
29
+ <%= @runner_result.dataset_record.display_name %>
30
+ </h1>
20
31
  </div>
21
- <div class="bg-gray-800 rounded-lg shadow-lg p-6">
22
- <h2 class="text-2xl font-semibold text-indigo-300 mb-4">Evaluation Results</h2>
23
- <% if @runner_result.evaluation_results.any? %>
24
- <div class="space-y-4">
25
- <% @runner_result.evaluation_results.each do |eval_result| %>
26
- <div class="bg-gray-700 rounded-lg p-4">
27
- <h3 class="text-lg font-semibold text-indigo-200 mb-2"><%= eval_result.evaluator_class %></h3>
28
- <% score = eval_result.score %>
29
- <% color_class = case score
30
- when 0...0.2 then 'text-red-500'
31
- when 0.2...0.4 then 'text-orange-500'
32
- when 0.4...0.6 then 'text-yellow-500'
33
- when 0.6...0.8 then 'text-lime-500'
34
- when 0.8...1.0 then 'text-green-400'
35
- else 'text-green-300'
36
- end %>
37
- <p class="text-xl font-bold <%= color_class %>"><%= sprintf('%.2f', score) %></p>
38
- </div>
39
- <% end %>
40
- </div>
41
- <% else %>
42
- <p class="text-gray-400">No evaluation results available.</p>
32
+ <div class="btn-group">
33
+ <%= link_to workbench_index_path(prompt_id: @runner_result.prompt_id, dataset_record_id: @runner_result.dataset_record_id, runner: @experiment.runner_class), class: 'btn btn-secondary' do %>
34
+ <svg class="icon-sm" viewBox="0 0 20 20" fill="currentColor">
35
+ <path d="M13.586 3.586a2 2 0 112.828 2.828l-.793.793-2.828-2.828.793-.793zM11.379 5.793L3 14.172V17h2.828l8.38-8.379-2.83-2.828z" />
36
+ </svg>
37
+ Open in Workbench
43
38
  <% end %>
44
39
  </div>
45
40
  </div>
46
- <div class="bg-gray-800 rounded-lg shadow-lg p-6 mb-8">
47
- <h2 class="text-2xl font-semibold text-indigo-300 mb-4">Predictions and Ground Truth</h2>
48
- <div class="grid grid-cols-1 md:grid-cols-2 gap-6">
49
- <div>
50
- <h3 class="text-xl font-semibold text-indigo-200 mb-2">Predictions</h3>
51
- <% @runner_result.parsed_predictions.each_with_index do |prediction, index| %>
52
- <div class="mb-4">
53
- <h4 class="text-lg font-semibold text-indigo-100 mb-2">Prediction <%= index + 1 %></h4>
54
- <pre class="bg-gray-700 p-4 rounded-lg mt-2 text-sm text-gray-300 whitespace-pre-wrap"><%= prediction %></pre>
41
+
42
+ <%# Inline metadata with icons %>
43
+ <div class="result-meta-bar mb-6">
44
+ <div class="result-meta-item">
45
+ <svg class="icon-sm text-muted" viewBox="0 0 20 20" fill="currentColor">
46
+ <path fill-rule="evenodd" d="M4 4a2 2 0 012-2h4.586A2 2 0 0112 2.586L15.414 6A2 2 0 0116 7.414V16a2 2 0 01-2 2H6a2 2 0 01-2-2V4zm2 6a1 1 0 011-1h6a1 1 0 110 2H7a1 1 0 01-1-1zm1 3a1 1 0 100 2h6a1 1 0 100-2H7z" clip-rule="evenodd" />
47
+ </svg>
48
+ <span class="result-meta-label">Prompt</span>
49
+ <%= link_to @runner_result.prompt.name, prompt_path(@runner_result.prompt), class: 'result-meta-value result-meta-link' %>
50
+ <span class="tag tag-gray">v<%= @runner_result.prompt_version %></span>
51
+ </div>
52
+ <div class="result-meta-divider"></div>
53
+ <div class="result-meta-item">
54
+ <svg class="icon-sm text-muted" viewBox="0 0 20 20" fill="currentColor">
55
+ <path fill-rule="evenodd" d="M11.3 1.046A1 1 0 0112 2v5h4a1 1 0 01.82 1.573l-7 10A1 1 0 018 18v-5H4a1 1 0 01-.82-1.573l7-10a1 1 0 011.12-.38z" clip-rule="evenodd" />
56
+ </svg>
57
+ <span class="result-meta-label">Runner</span>
58
+ <span class="result-meta-value font-mono text-sm"><%= @experiment.runner_class.to_s.demodulize %></span>
59
+ </div>
60
+ <div class="result-meta-divider"></div>
61
+ <div class="result-meta-item">
62
+ <svg class="icon-sm text-muted" viewBox="0 0 20 20" fill="currentColor">
63
+ <path fill-rule="evenodd" d="M10 18a8 8 0 100-16 8 8 0 000 16zm1-12a1 1 0 10-2 0v4a1 1 0 00.293.707l2.828 2.829a1 1 0 101.415-1.415L11 9.586V6z" clip-rule="evenodd" />
64
+ </svg>
65
+ <span class="result-meta-label">Ran</span>
66
+ <span class="result-meta-value" title="<%= @runner_result.created_at.strftime('%B %d, %Y at %I:%M %p') %>"><%= time_ago_in_words(@runner_result.created_at) %> ago</span>
67
+ </div>
68
+ <% if @runner_result.evaluation_results.any? %>
69
+ <div class="result-meta-divider"></div>
70
+ <div class="result-meta-item">
71
+ <svg class="icon-sm text-muted" viewBox="0 0 20 20" fill="currentColor">
72
+ <path d="M9 2a1 1 0 000 2h2a1 1 0 100-2H9z" />
73
+ <path fill-rule="evenodd" d="M4 5a2 2 0 012-2 3 3 0 003 3h2a3 3 0 003-3 2 2 0 012 2v11a2 2 0 01-2 2H6a2 2 0 01-2-2V5zm9.707 5.707a1 1 0 00-1.414-1.414L9 12.586l-1.293-1.293a1 1 0 00-1.414 1.414l2 2a1 1 0 001.414 0l4-4z" clip-rule="evenodd" />
74
+ </svg>
75
+ <span class="result-meta-label">Evaluations</span>
76
+ <span class="result-meta-value"><%= @runner_result.evaluation_results.count %></span>
77
+ </div>
78
+ <% end %>
79
+ </div>
80
+
81
+ <%# Evaluation Scores Section %>
82
+ <% if @runner_result.evaluation_results.any? %>
83
+ <div class="result-section mb-6">
84
+ <div class="result-section-header">
85
+ <h2 class="result-section-title">
86
+ <svg class="icon-sm" viewBox="0 0 20 20" fill="currentColor">
87
+ <path d="M9.049 2.927c.3-.921 1.603-.921 1.902 0l1.07 3.292a1 1 0 00.95.69h3.462c.969 0 1.371 1.24.588 1.81l-2.8 2.034a1 1 0 00-.364 1.118l1.07 3.292c.3.921-.755 1.688-1.54 1.118l-2.8-2.034a1 1 0 00-1.175 0l-2.8 2.034c-.784.57-1.838-.197-1.539-1.118l1.07-3.292a1 1 0 00-.364-1.118L2.98 8.72c-.783-.57-.38-1.81.588-1.81h3.461a1 1 0 00.951-.69l1.07-3.292z" />
88
+ </svg>
89
+ Evaluation Scores
90
+ </h2>
91
+ <span class="result-section-count"><%= @runner_result.evaluation_results.count %> evaluator<%= @runner_result.evaluation_results.count == 1 ? '' : 's' %></span>
92
+ </div>
93
+ <div class="eval-summary">
94
+ <% @runner_result.evaluation_results.each do |eval_result| %>
95
+ <%
96
+ score = eval_result.score
97
+ score_pct = (score * 100).round
98
+ score_class = case score
99
+ when 0...0.2 then 'score-bad'
100
+ when 0.2...0.4 then 'score-poor'
101
+ when 0.4...0.6 then 'score-fair'
102
+ when 0.6...0.8 then 'score-good'
103
+ else 'score-excellent'
104
+ end
105
+ score_label = case score
106
+ when 0...0.2 then 'Bad'
107
+ when 0.2...0.4 then 'Poor'
108
+ when 0.4...0.6 then 'Fair'
109
+ when 0.6...0.8 then 'Good'
110
+ else 'Excellent'
111
+ end
112
+ short_name = eval_result.evaluator_class.demodulize
113
+ .gsub(/Evaluator$/, '')
114
+ .gsub(/Eval$/, '')
115
+ .gsub(/^Sentiment/, '')
116
+ short_name = short_name.presence || eval_result.evaluator_class.demodulize.gsub(/Eval(uator)?$/, '')
117
+ %>
118
+ <div class="eval-summary-card" title="<%= eval_result.evaluator_class %>">
119
+ <span class="eval-summary-score <%= score_class %>"><%= score_pct %><span class="eval-summary-pct">%</span></span>
120
+ <span class="eval-summary-name"><%= short_name %></span>
121
+ <div class="eval-summary-bar">
122
+ <div class="eval-summary-bar-fill <%= score_class %>" style="width: <%= score_pct %>%"></div>
123
+ </div>
124
+ <span class="eval-summary-label <%= score_class %>"><%= score_label %></span>
55
125
  </div>
56
126
  <% end %>
57
127
  </div>
58
- <div>
59
- <h3 class="text-xl font-semibold text-indigo-200 mb-2">Ground Truth</h3>
60
- <pre class="bg-gray-700 p-4 rounded-lg mt-2 text-sm text-gray-300 whitespace-pre-wrap"><%= @runner_result.ground_truth %></pre>
128
+ </div>
129
+ <% end %>
130
+
131
+ <%# Predictions and Ground Truth - Side by side comparison %>
132
+ <%
133
+ predictions = @runner_result.parsed_predictions.presence || [@runner_result.prediction].compact
134
+ prediction_text = predictions.first.to_s
135
+ expected_text = @runner_result.ground_truth.to_s
136
+ is_match = prediction_text.strip == expected_text.strip && prediction_text.present?
137
+ %>
138
+ <div class="result-section mb-6">
139
+ <div class="result-section-header">
140
+ <h2 class="result-section-title">
141
+ <svg class="icon-sm" viewBox="0 0 20 20" fill="currentColor">
142
+ <path fill-rule="evenodd" d="M4 2a1 1 0 011 1v2.101a7.002 7.002 0 0111.601 2.566 1 1 0 11-1.885.666A5.002 5.002 0 005.999 7H9a1 1 0 010 2H4a1 1 0 01-1-1V3a1 1 0 011-1zm.008 9.057a1 1 0 011.276.61A5.002 5.002 0 0014.001 13H11a1 1 0 110-2h5a1 1 0 011 1v5a1 1 0 11-2 0v-2.101a7.002 7.002 0 01-11.601-2.566 1 1 0 01.61-1.276z" clip-rule="evenodd" />
143
+ </svg>
144
+ Output Comparison
145
+ </h2>
146
+ <% if is_match %>
147
+ <span class="comparison-match-badge comparison-match-badge--match">
148
+ <svg class="icon-sm" viewBox="0 0 20 20" fill="currentColor">
149
+ <path fill-rule="evenodd" d="M10 18a8 8 0 100-16 8 8 0 000 16zm3.707-9.293a1 1 0 00-1.414-1.414L9 10.586 7.707 9.293a1 1 0 00-1.414 1.414l2 2a1 1 0 001.414 0l4-4z" clip-rule="evenodd" />
150
+ </svg>
151
+ Exact Match
152
+ </span>
153
+ <% elsif prediction_text.present? && expected_text.present? %>
154
+ <span class="comparison-match-badge comparison-match-badge--mismatch">
155
+ <svg class="icon-sm" viewBox="0 0 20 20" fill="currentColor">
156
+ <path fill-rule="evenodd" d="M18 10a8 8 0 11-16 0 8 8 0 0116 0zm-7 4a1 1 0 11-2 0 1 1 0 012 0zm-1-9a1 1 0 00-1 1v4a1 1 0 102 0V6a1 1 0 00-1-1z" clip-rule="evenodd" />
157
+ </svg>
158
+ Difference
159
+ </span>
160
+ <% end %>
161
+ </div>
162
+
163
+ <div class="result-comparison">
164
+ <div class="result-panel result-panel--prediction">
165
+ <div class="result-panel-header">
166
+ <div class="result-panel-header-content">
167
+ <svg class="icon-sm" viewBox="0 0 20 20" fill="currentColor">
168
+ <path fill-rule="evenodd" d="M11.3 1.046A1 1 0 0112 2v5h4a1 1 0 01.82 1.573l-7 10A1 1 0 018 18v-5H4a1 1 0 01-.82-1.573l7-10a1 1 0 011.12-.38z" clip-rule="evenodd" />
169
+ </svg>
170
+ <span class="result-panel-label">Prediction</span>
171
+ </div>
172
+ <span class="result-panel-meta"><%= prediction_text.length %> chars</span>
173
+ </div>
174
+ <div class="result-panel-content">
175
+ <% if predictions.any? %>
176
+ <% predictions.each_with_index do |prediction, index| %>
177
+ <div class="result-value-block <%= index > 0 ? 'mt-3' : '' %>">
178
+ <% if predictions.size > 1 %>
179
+ <span class="result-index"><%= index + 1 %></span>
180
+ <% end %>
181
+ <pre class="result-code"><code><%= prediction.presence || '—' %></code></pre>
182
+ </div>
183
+ <% end %>
184
+ <% else %>
185
+ <div class="result-empty-state">
186
+ <svg class="icon-lg text-muted" viewBox="0 0 20 20" fill="currentColor">
187
+ <path fill-rule="evenodd" d="M18 10a8 8 0 11-16 0 8 8 0 0116 0zm-7-4a1 1 0 11-2 0 1 1 0 012 0zM9 9a1 1 0 000 2v3a1 1 0 001 1h1a1 1 0 100-2v-3a1 1 0 00-1-1H9z" clip-rule="evenodd" />
188
+ </svg>
189
+ <span>No prediction generated</span>
190
+ </div>
191
+ <% end %>
192
+ </div>
193
+ </div>
194
+ <div class="result-panel result-panel--expected">
195
+ <div class="result-panel-header">
196
+ <div class="result-panel-header-content">
197
+ <svg class="icon-sm" viewBox="0 0 20 20" fill="currentColor">
198
+ <path fill-rule="evenodd" d="M6.267 3.455a3.066 3.066 0 001.745-.723 3.066 3.066 0 013.976 0 3.066 3.066 0 001.745.723 3.066 3.066 0 012.812 2.812c.051.643.304 1.254.723 1.745a3.066 3.066 0 010 3.976 3.066 3.066 0 00-.723 1.745 3.066 3.066 0 01-2.812 2.812 3.066 3.066 0 00-1.745.723 3.066 3.066 0 01-3.976 0 3.066 3.066 0 00-1.745-.723 3.066 3.066 0 01-2.812-2.812 3.066 3.066 0 00-.723-1.745 3.066 3.066 0 010-3.976 3.066 3.066 0 00.723-1.745 3.066 3.066 0 012.812-2.812zm7.44 5.252a1 1 0 00-1.414-1.414L9 10.586 7.707 9.293a1 1 0 00-1.414 1.414l2 2a1 1 0 001.414 0l4-4z" clip-rule="evenodd" />
199
+ </svg>
200
+ <span class="result-panel-label">Expected (Ground Truth)</span>
201
+ </div>
202
+ <span class="result-panel-meta"><%= expected_text.length %> chars</span>
203
+ </div>
204
+ <div class="result-panel-content">
205
+ <% if expected_text.present? %>
206
+ <pre class="result-code"><code><%= expected_text %></code></pre>
207
+ <% else %>
208
+ <div class="result-empty-state">
209
+ <svg class="icon-lg text-muted" viewBox="0 0 20 20" fill="currentColor">
210
+ <path fill-rule="evenodd" d="M18 10a8 8 0 11-16 0 8 8 0 0116 0zm-7-4a1 1 0 11-2 0 1 1 0 012 0zM9 9a1 1 0 000 2v3a1 1 0 001 1h1a1 1 0 100-2v-3a1 1 0 00-1-1H9z" clip-rule="evenodd" />
211
+ </svg>
212
+ <span>No ground truth defined</span>
213
+ </div>
214
+ <% end %>
215
+ </div>
61
216
  </div>
62
217
  </div>
63
218
  </div>
64
- </div>
219
+
220
+ <%# Raw Output - Enhanced Collapsible Section %>
221
+ <% if @runner_result.respond_to?(:raw_output) && @runner_result.raw_output.present? %>
222
+ <%
223
+ raw_output = @runner_result.raw_output
224
+ line_count = raw_output.lines.count
225
+ char_count = raw_output.length
226
+ %>
227
+ <div class="result-section">
228
+ <details class="raw-output-collapsible">
229
+ <summary class="raw-output-header">
230
+ <div class="raw-output-header-left">
231
+ <svg class="icon-sm raw-output-icon" viewBox="0 0 20 20" fill="currentColor">
232
+ <path fill-rule="evenodd" d="M12.316 3.051a1 1 0 01.633 1.265l-4 12a1 1 0 11-1.898-.632l4-12a1 1 0 011.265-.633zM5.707 6.293a1 1 0 010 1.414L3.414 10l2.293 2.293a1 1 0 11-1.414 1.414l-3-3a1 1 0 010-1.414l3-3a1 1 0 011.414 0zm8.586 0a1 1 0 011.414 0l3 3a1 1 0 010 1.414l-3 3a1 1 0 11-1.414-1.414L16.586 10l-2.293-2.293a1 1 0 010-1.414z" clip-rule="evenodd" />
233
+ </svg>
234
+ <span class="raw-output-title">Raw Output</span>
235
+ <span class="raw-output-stats">
236
+ <span class="raw-output-stat"><%= line_count %> line<%= line_count == 1 ? '' : 's' %></span>
237
+ <span class="raw-output-stat-sep"></span>
238
+ <span class="raw-output-stat"><%= number_to_human_size(char_count) %></span>
239
+ </span>
240
+ </div>
241
+ <div class="raw-output-header-right">
242
+ <span class="raw-output-expand-hint">
243
+ <span class="expand-text">Show</span>
244
+ <span class="collapse-text">Hide</span>
245
+ </span>
246
+ <svg class="raw-output-chevron" viewBox="0 0 20 20" fill="currentColor">
247
+ <path fill-rule="evenodd" d="M5.293 7.293a1 1 0 011.414 0L10 10.586l3.293-3.293a1 1 0 111.414 1.414l-4 4a1 1 0 01-1.414 0l-4-4a1 1 0 010-1.414z" clip-rule="evenodd" />
248
+ </svg>
249
+ </div>
250
+ </summary>
251
+ <div class="raw-output-body">
252
+ <div class="raw-output-toolbar">
253
+ <span class="raw-output-lang-hint">
254
+ <svg class="icon-sm" viewBox="0 0 20 20" fill="currentColor">
255
+ <path fill-rule="evenodd" d="M18 10a8 8 0 11-16 0 8 8 0 0116 0zm-7-4a1 1 0 11-2 0 1 1 0 012 0zM9 9a1 1 0 000 2v3a1 1 0 001 1h1a1 1 0 100-2v-3a1 1 0 00-1-1H9z" clip-rule="evenodd" />
256
+ </svg>
257
+ Response from LLM
258
+ </span>
259
+ </div>
260
+ <pre class="raw-output-code"><code><%= raw_output %></code></pre>
261
+ </div>
262
+ </details>
263
+ </div>
264
+ <% end %>
265
+
266
+ <%# Dataset Record Link - Footer %>
267
+ <div class="result-footer mt-8">
268
+ <div class="result-footer-content">
269
+ <span class="result-footer-label">Dataset Record:</span>
270
+ <%= link_to dataset_dataset_record_path(@runner_result.dataset_record.dataset, @runner_result.dataset_record), class: 'result-footer-link' do %>
271
+ <svg class="icon-sm" viewBox="0 0 20 20" fill="currentColor">
272
+ <path d="M4 4a2 2 0 012-2h4.586A2 2 0 0112 2.586L15.414 6A2 2 0 0116 7.414V16a2 2 0 01-2 2H6a2 2 0 01-2-2V4z" />
273
+ </svg>
274
+ <%= @runner_result.dataset_record.display_name %>
275
+ <svg class="icon-sm" viewBox="0 0 20 20" fill="currentColor">
276
+ <path fill-rule="evenodd" d="M7.293 14.707a1 1 0 010-1.414L10.586 10 7.293 6.707a1 1 0 011.414-1.414l4 4a1 1 0 010 1.414l-4 4a1 1 0 01-1.414 0z" clip-rule="evenodd" />
277
+ </svg>
278
+ <% end %>
279
+ </div>
280
+ </div>
281
+ </div>