ruby_llm-evals 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +180 -8
- data/Rakefile +0 -2
- data/app/assets/stylesheets/ruby_llm/evals/application.css +15 -0
- data/app/assets/stylesheets/ruby_llm/evals/bulma.min.css +3 -0
- data/app/assets/stylesheets/ruby_llm/evals/json_editor.css +25 -0
- data/app/controllers/concerns/ruby_llm/evals/prompt_executions/prompt_execution_scoped.rb +19 -0
- data/app/controllers/ruby_llm/evals/application_controller.rb +14 -0
- data/app/controllers/ruby_llm/evals/prompt_executions/failures_controller.rb +15 -0
- data/app/controllers/ruby_llm/evals/prompt_executions/passages_controller.rb +15 -0
- data/app/controllers/ruby_llm/evals/prompt_executions/retries_controller.rb +16 -0
- data/app/controllers/ruby_llm/evals/prompts_controller.rb +87 -0
- data/app/controllers/ruby_llm/evals/runs_controller.rb +46 -0
- data/app/helpers/ruby_llm/evals/application_helper.rb +39 -0
- data/app/helpers/ruby_llm/evals/prompt_executions_helper.rb +6 -0
- data/app/helpers/ruby_llm/evals/prompts_helper.rb +37 -0
- data/app/helpers/ruby_llm/evals/runs_helper.rb +6 -0
- data/app/javascript/ruby_llm/evals/application.js +3 -0
- data/app/javascript/ruby_llm/evals/controllers/application.js +13 -0
- data/app/javascript/ruby_llm/evals/controllers/eval_type_selector_controller.js +37 -0
- data/app/javascript/ruby_llm/evals/controllers/file_input_controller.js +21 -0
- data/app/javascript/ruby_llm/evals/controllers/index.js +4 -0
- data/app/javascript/ruby_llm/evals/controllers/json_editor_controller.js +129 -0
- data/app/javascript/ruby_llm/evals/controllers/provider_model_controller.js +85 -0
- data/app/javascript/ruby_llm/evals/controllers/schema_selector_controller.js +31 -0
- data/app/jobs/ruby_llm/evals/application_job.rb +6 -0
- data/app/jobs/ruby_llm/evals/execute_sample_job.rb +26 -0
- data/app/jobs/ruby_llm/evals/perform_run_job.rb +21 -0
- data/app/mailers/ruby_llm/evals/application_mailer.rb +8 -0
- data/app/models/concerns/ruby_llm/evals/job_trackable.rb +15 -0
- data/app/models/ruby_llm/evals/application_record.rb +7 -0
- data/app/models/ruby_llm/evals/page.rb +53 -0
- data/app/models/ruby_llm/evals/prompt.rb +55 -0
- data/app/models/ruby_llm/evals/prompt_execution.rb +169 -0
- data/app/models/ruby_llm/evals/run.rb +45 -0
- data/app/models/ruby_llm/evals/sample.rb +20 -0
- data/app/schemas/ruby_llm/evals/judge_verdict_schema.rb +8 -0
- data/app/views/layouts/ruby_llm/evals/application.html.erb +29 -0
- data/app/views/ruby_llm/evals/application/_flashes.html.erb +9 -0
- data/app/views/ruby_llm/evals/application/_nav.html.erb +12 -0
- data/app/views/ruby_llm/evals/application/_pagination.html.erb +7 -0
- data/app/views/ruby_llm/evals/application/_tabs.html.erb +6 -0
- data/app/views/ruby_llm/evals/prompts/_filters.html.erb +15 -0
- data/app/views/ruby_llm/evals/prompts/_form.html.erb +104 -0
- data/app/views/ruby_llm/evals/prompts/_prompt.html.erb +14 -0
- data/app/views/ruby_llm/evals/prompts/compare.html.erb +90 -0
- data/app/views/ruby_llm/evals/prompts/edit.html.erb +5 -0
- data/app/views/ruby_llm/evals/prompts/index.html.erb +32 -0
- data/app/views/ruby_llm/evals/prompts/new.html.erb +5 -0
- data/app/views/ruby_llm/evals/prompts/show.html.erb +107 -0
- data/app/views/ruby_llm/evals/runs/_filters.html.erb +17 -0
- data/app/views/ruby_llm/evals/runs/_run.html.erb +13 -0
- data/app/views/ruby_llm/evals/runs/index.html.erb +30 -0
- data/app/views/ruby_llm/evals/runs/show.html.erb +188 -0
- data/app/views/ruby_llm/evals/samples/_form.html.erb +88 -0
- data/config/importmap.rb +13 -0
- data/config/locales/en.yml +7 -0
- data/config/routes.rb +20 -1
- data/db/migrate/20251022211228_create_ruby_llm_evals_prompts.rb +21 -0
- data/db/migrate/20251022211229_create_ruby_llm_evals_samples.rb +14 -0
- data/db/migrate/20251022211230_create_ruby_llm_evals_runs.rb +21 -0
- data/db/migrate/20251022211231_create_ruby_llm_evals_prompt_executions.rb +26 -0
- data/lib/activemodel/validations/json_validator.rb +14 -0
- data/lib/ruby_llm/evals/engine.rb +49 -1
- data/lib/ruby_llm/evals/version.rb +2 -2
- data/lib/ruby_llm/evals.rb +7 -3
- metadata +65 -6
- /data/lib/tasks/{ruby_llm/evals_tasks.rake → ruby_llm_evals_tasks.rake} +0 -0
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
<%# Runs comparison for one prompt: one column per run, one row per metric, then one row per sample. %>
<% content_for :title, "#{@prompt.name} - Runs Comparison" %>

<h1 class="title"><%= @prompt.name %> - Runs Comparison</h1>

<% if @runs.any? %>
  <%# The wrapper scrolls horizontally so a wide table (many runs) does not break the layout. %>
  <div style="overflow-x: auto;">
    <table class="table is-bordered is-striped is-fullwidth">
      <thead>
        <tr>
          <th>Metric</th>
          <% @runs.each do |run| %>
            <th>
              <%= link_to "Run ##{run.id}", run_path(run) %>
              <div class="is-size-7 has-text-grey">
                <%= run.created_at %>
              </div>
            </th>
          <% end %>
        </tr>
      </thead>
      <tbody>
        <tr>
          <th>Provider/Model</th>
          <% @runs.each do |run| %>
            <td><%= run.provider %>/<%= run.model %></td>
          <% end %>
        </tr>

        <tr>
          <th>Temperature</th>
          <% @runs.each do |run| %>
            <td><%= run.temperature || "N/A" %></td>
          <% end %>
        </tr>

        <tr>
          <th>Instructions</th>
          <% @runs.each do |run| %>
            <td><%= truncate run.instructions %></td>
          <% end %>
        </tr>

        <tr>
          <th>Message</th>
          <% @runs.each do |run| %>
            <td><%= truncate run.message %></td>
          <% end %>
        </tr>

        <tr>
          <th>Accuracy</th>
          <% @runs.each do |run| %>
            <td><%= accuracy(run) %></td>
          <% end %>
        </tr>

        <tr>
          <th>Cost</th>
          <% @runs.each do |run| %>
            <td>$<%= run.total_cost %></td>
          <% end %>
        </tr>

        <tr>
          <th>Duration</th>
          <% @runs.each do |run| %>
            <td><%= duration(run) %></td>
          <% end %>
        </tr>

        <%# One row per sample; iterating an empty collection renders nothing, so no separate emptiness check is needed. %>
        <% @samples.each do |sample| %>
          <tr>
            <th>
              Sample #<%= sample.id %>
            </th>
            <% @runs.each do |run| %>
              <td class="has-text-centered">
                <%# The lookup yields nil when this run has no execution for the sample; status_indicator receives that nil. %>
                <%= status_indicator run.prompt_executions.find { |pe| pe.sample.id == sample.id } %>
              </td>
            <% end %>
          </tr>
        <% end %>
      </tbody>
    </table>
  </div>
<% else %>
  <div class="mt-6 has-text-centered is-size-4 has-text-grey">There are no runs</div>
<% end %>
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
<%# Prompts index: filter bar, "New prompt" action, the current page of prompts, and pagination. %>
<% content_for :title, "Prompts" %>
<% prompt_records = @prompts.records %>

<div class="level">
  <div class="level-left">
    <%= render "filters", clear_path: prompts_path %>
  </div>
  <div class="level-right is-align-self-flex-end">
    <%= link_to "New prompt", new_prompt_path, class: "button is-link" %>
  </div>
</div>

<% if prompt_records.empty? %>
  <div class="mt-6 has-text-centered is-size-4 has-text-grey">There are no prompts</div>
<% else %>
  <table class="table is-hoverable is-fullwidth">
    <thead>
      <tr>
        <th>Name</th>
        <th>Provider/Model</th>
        <th>Instructions</th>
        <th>Message</th>
        <th>Runs</th>
        <th></th>
      </tr>
    </thead>
    <tbody>
      <%= render partial: "ruby_llm/evals/prompts/prompt", collection: prompt_records %>
    </tbody>
  </table>
<% end %>

<%# The pagination partial receives the page object itself, not just its records. %>
<%= render "ruby_llm/evals/application/pagination", page: @prompts %>
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
<%# Prompt detail page: configuration table, the prompt's samples (with attached files), and actions. %>
<% content_for :title, @prompt.name %>

<h1 class="title"><%= @prompt.name %></h1>

<table class="table">
  <tbody>
    <tr>
      <th>Slug</th>
      <td><%= @prompt.slug %></td>
    </tr>
    <tr>
      <th>Provider</th>
      <td><%= @prompt.provider %></td>
    </tr>
    <tr>
      <th>Model</th>
      <td><%= @prompt.model %></td>
    </tr>
    <tr>
      <th>Temperature</th>
      <td><%= @prompt.temperature %></td>
    </tr>
    <tr>
      <th>Params</th>
      <td><%= json @prompt.params %></td>
    </tr>
    <tr>
      <th>Tools</th>
      <td><%= @prompt.tools.try :to_sentence %></td>
    </tr>
    <tr>
      <th>Schema</th>
      <td>
        <%# schema_other takes precedence over schema when both are present. %>
        <% if @prompt.schema_other.present? %>
          <%= json @prompt.schema_other %>
        <% elsif @prompt.schema.present? %>
          <%= @prompt.schema %>
        <% end %>
      </td>
    </tr>
    <tr>
      <th>Instructions</th>
      <td style="white-space: pre-wrap;"><%= @prompt.instructions %></td>
    </tr>
    <tr>
      <th>Message</th>
      <td style="white-space: pre-wrap;"><%= @prompt.message %></td>
    </tr>
  </tbody>
</table>

<h2 class="mt-6 subtitle">Samples</h2>

<% if @prompt.samples.any? %>
  <table class="table is-fullwidth">
    <thead>
      <tr>
        <th>Sample</th>
        <th>Variables</th>
        <th>Eval type</th>
        <th>Expected output</th>
        <th>Files</th>
      </tr>
    </thead>
    <tbody>
      <% @prompt.samples.each do |sample| %>
        <tr>
          <td>#<%= sample.id %></td>
          <td><%= json sample.variables %></td>
          <td>
            <%= sample.eval_type %>
            <% if sample.llm_judge? %>
              <div>(<%= sample.judge_provider %>/<%= sample.judge_model %>)</div>
            <% end %>
          </td>
          <td><%= expected_output sample %></td>
          <td>
            <ul>
              <% sample.files.each do |file| %>
                <li>
                  <%# Show an inline image when the file can be previewed or resized; otherwise fall back to a plain link. %>
                  <%# image_tag adds no alt attribute by itself, so the file name is supplied as the text alternative. %>
                  <% if file.previewable? %>
                    <%= image_tag main_app.url_for(file.preview(resize_to_limit: [400, 400]).processed), alt: file.filename.to_s %>
                  <% elsif file.variable? %>
                    <%= image_tag main_app.url_for(file.variant(resize_to_limit: [400, 400]).processed), alt: file.filename.to_s %>
                  <% else %>
                    <%= link_to file.filename, main_app.url_for(file) %>
                  <% end %>
                  <div><%= link_to file.filename, main_app.rails_blob_path(file, disposition: "attachment") %> (<%= number_to_human_size(file.byte_size) %>)</div>
                </li>
              <% end %>
            </ul>
          </td>
        </tr>
      <% end %>
    </tbody>
  </table>
<% else %>
  <div class="mt-6 has-text-centered is-size-4 has-text-grey">There are no samples</div>
<% end %>

<div class="mt-6 buttons">
  <%# Comparing is only offered once the prompt has at least one run. %>
  <% if @prompt.runs.any? %>
    <%= link_to "Compare runs", compare_prompt_path(@prompt), class: "button" %>
  <% end %>
  <%= link_to "Edit this prompt", edit_prompt_path(@prompt), class: "button" %>
  <%= button_to "Destroy this prompt", @prompt, method: :delete, class: "button is-danger" %>
</div>
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
<%# Filter bar for the runs list: a GET form with a prompt dropdown wired to the auto-submit Stimulus controller, plus a "Clear" link. %>
<%# Expects a `clear_path` local: the URL the "Clear" link points at. %>
<div class="level">
  <div class="level-left">
    <%= form_with scope: :filter, url: runs_path, method: :get, data: {controller: "auto-submit"} do |form| %>
      <div class="field is-grouped">
        <div>
          <%= form.label :ruby_llm_evals_prompt_id, "Prompt", class: "label" %>
          <div class="select">
            <%# Ordered by name so the options appear in a stable, predictable order (an unordered query has no guaranteed order). %>
            <%= form.collection_select :ruby_llm_evals_prompt_id, RubyLLM::Evals::Prompt.order(:name), :id, :name, {selected: filter_param.dig(:filter, :ruby_llm_evals_prompt_id), include_blank: true}, data: {action: "auto-submit#submit"} %>
          </div>
        </div>
        <div class="is-align-self-flex-end">
          <%= link_to "Clear", clear_path, class: "button" %>
        </div>
      </div>
    <% end %>
  </div>
</div>
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
<%# One table row describing a run; expects a `run` local. %>
<% prompt = run.prompt %>
<tr>
  <td><%= link_to("##{run.id}", run_path(run)) %></td>
  <td><%= link_to(prompt.name, prompt) %></td>
  <td><%= run.provider %>/<%= run.model %></td>
  <td><%= truncate(run.instructions) %></td>
  <td><%= truncate(run.message) %></td>
  <td><%= run.started_at %></td>
  <td><%= run.ended_at %></td>
  <td><%= duration(run) %></td>
  <td><%= accuracy(run) %></td>
  <td>$<%= run.cost %></td>
  <td><%= button_to("Destroy", run, method: :delete, class: "button is-danger") %></td>
</tr>
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
<%# Runs index: filter bar, the current page of runs, and pagination. %>
<% content_for :title, "Runs" %>
<% run_records = @runs.records %>

<%= render "filters", clear_path: runs_path %>

<% if run_records.empty? %>
  <div class="mt-6 has-text-centered is-size-4 has-text-grey">There are no runs</div>
<% else %>
  <table class="table is-hoverable is-fullwidth">
    <thead>
      <tr>
        <th>Run</th>
        <th>Prompt</th>
        <th>Provider/Model</th>
        <th>Instructions</th>
        <th>Message</th>
        <th>Started at</th>
        <th>Ended at</th>
        <th>Duration</th>
        <th>Accuracy</th>
        <th>Cost</th>
        <th></th>
      </tr>
    </thead>
    <tbody>
      <%= render partial: "ruby_llm/evals/runs/run", collection: run_records %>
    </tbody>
  </table>
<% end %>

<%# The pagination partial receives the page object itself, not just its records. %>
<%= render "ruby_llm/evals/application/pagination", page: @runs %>
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
<%# Run detail page: the run's configuration and totals, then one row per prompt execution with its output, evaluation and actions. %>
<% content_for :title, "Run ##{@run.id}" %>

<h1 class="title">Run #<%= @run.id %></h1>

<table class="table">
  <tbody>
    <tr>
      <th>Prompt</th>
      <td><%= link_to @run.prompt.name, @run.prompt %></td>
    </tr>
    <tr>
      <th>Provider/Model</th>
      <td><%= @run.provider %>/<%= @run.model %></td>
    </tr>
    <tr>
      <th>Temperature</th>
      <td><%= @run.temperature %></td>
    </tr>
    <tr>
      <th>Params</th>
      <td><%= json @run.params %></td>
    </tr>
    <tr>
      <th>Tools</th>
      <td><%= @run.tools.try :to_sentence %></td>
    </tr>
    <tr>
      <th>Schema</th>
      <td>
        <%# schema_other takes precedence over schema when both are present. %>
        <% if @run.schema_other.present? %>
          <%= json @run.schema_other %>
        <% elsif @run.schema.present? %>
          <%= @run.schema %>
        <% end %>
      </td>
    </tr>
    <tr>
      <th>Instructions</th>
      <td style="white-space: pre-wrap;"><%= @run.instructions %></td>
    </tr>
    <tr>
      <th>Message</th>
      <td style="white-space: pre-wrap;"><%= @run.message %></td>
    </tr>
    <tr>
      <th>ActiveJob ID</th>
      <td><%= @run.active_job_id %></td>
    </tr>
    <tr>
      <th>Started at</th>
      <td><%= @run.started_at %></td>
    </tr>
    <tr>
      <th>Ended at</th>
      <td><%= @run.ended_at %></td>
    </tr>
    <tr>
      <th>Duration</th>
      <td><%= duration @run %></td>
    </tr>
    <tr>
      <th>Accuracy</th>
      <td><%= accuracy @run %></td>
    </tr>
    <tr>
      <th>Cost</th>
      <td>$<%= @run.cost %></td>
    </tr>
    <%# Judge and total cost rows are only shown when the run has a judge cost above zero. %>
    <% if @run.judge_cost > 0 %>
      <tr>
        <th>Judge Cost</th>
        <td>$<%= @run.judge_cost %></td>
      </tr>
      <tr>
        <th>Total Cost</th>
        <td>$<%= @run.total_cost %></td>
      </tr>
    <% end %>
  </tbody>
</table>

<h2 class="mt-6 subtitle">Prompt executions</h2>

<table class="table is-fullwidth">
  <thead>
    <tr>
      <th>Active Job ID</th>
      <th>Variables</th>
      <th>With</th>
      <th>Output message</th>
      <th>Eval type</th>
      <th>Expected output</th>
      <th>Tokens (input/output)</th>
      <th>Duration</th>
      <th>Cost</th>
      <th>Passed?</th>
      <th></th>
    </tr>
  </thead>
  <tbody>
    <% @run.prompt_executions.each do |prompt_execution| %>
      <tr>
        <td>
          <%= prompt_execution.active_job_id %>
          <%# A warning marker carries the error message as its tooltip. %>
          <% if prompt_execution.error_message.present? %>
            <span title="<%= prompt_execution.error_message %>">⚠️</span>
          <% end %>
        </td>
        <td><%= json prompt_execution.variables %></td>
        <td>
          <ul>
            <% prompt_execution.files.each do |file| %>
              <li>
                <%# Show an inline image when the file can be previewed or resized; otherwise fall back to a plain link. %>
                <%# image_tag adds no alt attribute by itself, so the file name is supplied as the text alternative. %>
                <% if file.previewable? %>
                  <%= image_tag main_app.url_for(file.preview(resize_to_limit: [400, 400]).processed), alt: file.filename.to_s %>
                <% elsif file.variable? %>
                  <%= image_tag main_app.url_for(file.variant(resize_to_limit: [400, 400]).processed), alt: file.filename.to_s %>
                <% else %>
                  <%= link_to file.filename, main_app.url_for(file) %>
                <% end %>
                <div><%= link_to file.filename, main_app.rails_blob_path(file, disposition: "attachment") %> (<%= number_to_human_size(file.byte_size) %>)</div>
              </li>
            <% end %>
          </ul>
        </td>
        <td>
          <% if @run.schema_other.present? || @run.schema.present? %>
            <% if prompt_execution.message.present? %>
              <%# With a schema the output is expected to be JSON, but a malformed reply must not crash the page: fall back to the raw text. %>
              <% parsed_message = begin
                   JSON.parse(prompt_execution.message)
                 rescue JSON::ParserError
                   nil
                 end %>
              <% if parsed_message.nil? %>
                <div style="white-space: pre-wrap;"><%= prompt_execution.message %></div>
              <% else %>
                <%= json parsed_message %>
              <% end %>
            <% end %>
          <% else %>
            <div style="white-space: pre-wrap;"><%= prompt_execution.message %></div>
          <% end %>
        </td>
        <td>
          <%= prompt_execution.eval_type %>
          <% if prompt_execution.llm_judge? %>
            <div>(<%= prompt_execution.judge_provider %>/<%= prompt_execution.judge_model %>)</div>
          <% end %>
        </td>
        <td>
          <%# For LLM-judged executions the expected output is shown as the judging criteria, with the judge's explanation when present. %>
          <% if prompt_execution.llm_judge? %>
            <div><strong>Criteria:</strong> <%= prompt_execution.expected_output %></div>
            <% if prompt_execution.judge_message&.[]("explanation").present? %>
              <div><strong>Explanation:</strong> <%= prompt_execution.judge_message["explanation"] %></div>
            <% end %>
          <% else %>
            <%= expected_output prompt_execution %>
          <% end %>
        </td>
        <td>
          <div><%= prompt_execution.input %>/<%= prompt_execution.output %></div>
          <% if prompt_execution.llm_judge? %>
            <div><%= prompt_execution.judge_input %>/<%= prompt_execution.judge_output %> (judge)</div>
          <% end %>
        </td>
        <td><%= duration prompt_execution %></td>
        <td>
          <div>$<%= prompt_execution.cost %></div>
          <% if prompt_execution.llm_judge? %>
            <div>$<%= prompt_execution.judge_cost %> (judge)</div>
          <% end %>
        </td>
        <td><%= status_indicator(prompt_execution) %></td>
        <td>
          <div class="buttons">
            <% if prompt_execution.error_message.present? %>
              <%# `resource :retry` nested under `resources :prompt_executions` is named prompt_execution_retry, like passage and failure below. %>
              <%= button_to "Retry", prompt_execution_retry_path(prompt_execution), method: :post, class: "button is-warning is-light" %>
            <% end %>
            <% if prompt_execution.sample.human_judge? %>
              <% if prompt_execution.passed.nil? %>
                <%# No verdict yet: offer both outcomes. %>
                <%= button_to "Pass", prompt_execution_passage_path(prompt_execution), method: :post, class: "button is-success is-light" %>
                <%= button_to "Fail", prompt_execution_failure_path(prompt_execution), method: :post, class: "button is-danger is-light" %>
              <% else %>
                <%# A verdict exists: the single button posts to the opposite outcome. %>
                <% target_path = prompt_execution.passed? ? prompt_execution_failure_path(prompt_execution) : prompt_execution_passage_path(prompt_execution) %>
                <%= button_to "Toggle", target_path, method: :post, class: "button" %>
              <% end %>
            <% end %>
          </div>
        </td>
      </tr>
    <% end %>
  </tbody>
</table>

<div class="mt-6 buttons">
  <%= button_to "Destroy this run", @run, method: :delete, class: "button is-danger" %>
</div>
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
<%# Nested-form fields for one sample: variables (JSON editor), eval type, judge provider/model, expected output, and file attachments. %>
<%# Expects a `form` local: the form builder for the sample. %>
<% sample = form.object %>
<div class="box nested-form-wrapper" data-new-record="<%= sample.new_record? %>" data-controller="eval-type-selector">
  <div class="columns">
    <div class="column field">
      <%= form.label :variables, class: "label" %>
      <div data-controller="json-editor" data-json-editor-height-value="150px">
        <div data-json-editor-target="editor" class="json-editor"></div>
        <%= form.text_area :variables, value: sample.variables&.to_json, class: "textarea", data: { json_editor_target: "textarea" } %>
        <p class="help is-danger is-hidden" data-json-editor-target="error"></p>
      </div>
      <p class="help is-danger"><%= sample.errors.full_messages_for(:variables).join(", ") %></p>
    </div>

    <div class="column">
      <div class="field">
        <%= form.label :eval_type, class: "label" %>
        <div class="select is-fullwidth">
          <%= form.select :eval_type, RubyLLM::Evals::Sample.eval_types, {}, data: { action: "change->eval-type-selector#toggle", eval_type_selector_target: "select" } %>
        </div>
        <p class="help is-danger"><%= sample.errors.full_messages_for(:eval_type).join(", ") %></p>
      </div>

      <%# Judge fields are rendered hidden unless the sample is LLM-judged. %>
      <div data-eval-type-selector-target="judgeFields" data-controller="provider-model" data-provider-model-models-by-provider-value="<%= models_by_provider_data %>" style="<%= sample.llm_judge? ? nil : 'display: none;' %>">
        <div class="field">
          <%= form.label :judge_provider, class: "label" %>
          <div class="select is-fullwidth">
            <%= form.select :judge_provider, options_for_select(RubyLLM.providers.map { |provider| [provider.name, provider.slug] }, sample.judge_provider), { include_blank: true }, data: { provider_model_target: "provider", action: "change->provider-model#provider-changed" } %>
          </div>
          <p class="help is-danger"><%= sample.errors.full_messages_for(:judge_provider).join(", ") %></p>
        </div>

        <div class="field">
          <%= form.label :judge_model, class: "label" %>
          <%# The select is given no name (name: nil) and starts hidden; the text field below is the named judge_model input. %>
          <div class="select is-fullwidth is-hidden" data-provider-model-target="modelSelectWrapper">
            <%= form.select :judge_model, [], {}, { class: "input", name: nil, data: { provider_model_target: "modelSelect", action: "change->provider-model#syncToInput" } } %>
          </div>
          <%= form.text_field :judge_model, class: "input", data: { provider_model_target: "modelInput" } %>
          <p class="help is-danger"><%= sample.errors.full_messages_for(:judge_model).join(", ") %></p>
        </div>
      </div>
    </div>

    <%# Expected output is rendered hidden for human-judged samples. %>
    <div class="column field" data-eval-type-selector-target="expectedOutput" style="<%= sample.human_judge? ? 'display: none;' : nil %>">
      <%= form.label :expected_output, class: "label" %>
      <%= form.text_area :expected_output, class: "textarea" %>
      <p class="help is-danger"><%= sample.errors.full_messages_for(:expected_output).join(", ") %></p>
    </div>

    <div class="column field">
      <%= form.label :files, "Attach files", class: "label" %>

      <%# Already-attached files are re-submitted as signed ids through hidden fields. %>
      <% if sample.persisted? && sample.files.attached? %>
        <% sample.files.each do |file| %>
          <%= form.hidden_field :files, multiple: true, value: file.signed_id %>
        <% end %>
      <% end %>

      <div class="file has-name is-fullwidth" data-controller="file-input">
        <label class="file-label">
          <%= form.file_field :files, multiple: true, class: "file-input", data: { action: "change->file-input#update", file_input_target: "input" } %>
          <span class="file-cta">
            <span class="file-label">Choose files...</span>
          </span>
          <span class="file-name" data-file-input-target="name">No files selected</span>
        </label>
      </div>

      <% if sample.persisted? && sample.files.any? %>
        <div>
          <p>Current attachments:</p>
          <ul>
            <% sample.files.each do |file| %>
              <li>
                <span><%= file.filename %> (<%= number_to_human_size(file.byte_size) %>)</span>
                <%# NOTE(review): this sends a DELETE request to the blob download path; confirm a route actually handles it, otherwise "Remove" cannot work. %>
                <%= link_to "Remove", main_app.rails_blob_path(file, disposition: "attachment"), data: { turbo_method: :delete, turbo_confirm: "Are you sure?" } %>
              </li>
            <% end %>
          </ul>
        </div>
      <% end %>
    </div>
  </div>

  <div class="field">
    <button type="button" class="button is-danger is-outlined" data-action="nested-form#remove">Remove sample</button>
  </div>

  <%= form.hidden_field :_destroy %>
</div>
|
data/config/importmap.rb
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Import map for the engine's JavaScript.

# Hotwire packages, pinned to bare asset file names and preloaded.
{
  "@hotwired/turbo-rails" => "turbo.min.js",
  "@hotwired/stimulus" => "stimulus.min.js",
  "@hotwired/stimulus-loading" => "stimulus-loading.js"
}.each do |package, asset|
  pin package, to: asset, preload: true
end

# Stimulus components loaded from the jspm CDN (only the nested-form component is preloaded).
pin "@stimulus-components/auto-submit", to: "https://ga.jspm.io/npm:@stimulus-components/auto-submit@6.0.0/dist/stimulus-auto-submit.mjs"
pin "@stimulus-components/rails-nested-form", to: "https://ga.jspm.io/npm:@stimulus-components/rails-nested-form@5.0.0/dist/stimulus-rails-nested-form.mjs", preload: true

# Ace editor core, JSON mode and GitHub themes, all from the same jsDelivr build directory.
ace_base_url = "https://cdn.jsdelivr.net/npm/ace-builds@1.35.5/src-min-noconflict"
{
  "ace-builds" => "ace",
  "ace-mode-json" => "mode-json",
  "ace-theme-github" => "theme-github",
  "ace-theme-github-dark" => "theme-github_dark"
}.each do |package, script|
  pin package, to: "#{ace_base_url}/#{script}.min.js", preload: true
end

# Engine entry point and its Stimulus controllers.
pin "application", to: "ruby_llm/evals/application.js", preload: true
pin_all_from RubyLLM::Evals::Engine.root.join("app/javascript/ruby_llm/evals/controllers"), under: "controllers", to: "ruby_llm/evals/controllers"
|
data/config/routes.rb
CHANGED
|
@@ -1,2 +1,21 @@
|
|
|
1
|
-
|
|
1
|
+
# Routes for the RubyLLM::Evals engine.
RubyLLM::Evals::Engine.routes.draw do
  # Runs can be listed, shown and destroyed here; they are created under a prompt (below).
  resources :runs, only: [:destroy, :index, :show]

  # Prompt executions expose no actions of their own, only nested singular
  # create-only resources whose controllers live in the prompt_executions module.
  resources :prompt_executions, only: [] do
    scope module: :prompt_executions do
      [:failure, :passage, :retry].each do |nested_resource|
        resource nested_resource, only: [:create]
      end
    end
  end

  resources :prompts do
    resources :runs, only: [:create]

    get :compare, on: :member
  end

  root to: "prompts#index"
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# Creates the prompts table: a named, slugged prompt with its provider/model
# settings and the instruction and message texts.
class CreateRubyLLMEvalsPrompts < ActiveRecord::Migration[7.0]
  def change
    create_table(:ruby_llm_evals_prompts) do |table|
      table.string :name, null: false
      table.string :slug, null: false
      table.string :provider, null: false
      table.string :model, null: false
      table.float :temperature
      table.json :params
      table.json :tools
      table.string :schema
      table.json :schema_other
      table.text :instructions
      table.text :message

      table.timestamps
    end

    # Both the name and the slug must be unique across prompts.
    %i[name slug].each do |column|
      add_index :ruby_llm_evals_prompts, column, unique: true
    end
  end
end
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# Creates the samples table: each sample belongs to a prompt and stores its
# evaluation type, expected output, judge provider/model and variables.
class CreateRubyLLMEvalsSamples < ActiveRecord::Migration[7.0]
  def change
    create_table(:ruby_llm_evals_samples) do |table|
      # Required reference to the owning prompt, with a foreign key constraint.
      table.belongs_to :ruby_llm_evals_prompt, null: false, foreign_key: true
      table.string :eval_type, null: false
      table.text :expected_output
      table.string :judge_model
      table.string :judge_provider
      table.json :variables

      table.timestamps
    end
  end
end
|