leva 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +55 -16
- data/app/controllers/leva/dataset_records_controller.rb +21 -0
- data/app/controllers/leva/datasets_controller.rb +9 -2
- data/app/controllers/leva/experiments_controller.rb +34 -9
- data/app/controllers/leva/runner_results_controller.rb +8 -0
- data/app/controllers/leva/workbench_controller.rb +85 -12
- data/app/helpers/leva/application_helper.rb +39 -0
- data/app/javascript/controllers/prompt_form_controller.js +45 -0
- data/app/javascript/controllers/prompt_selector_controller.js +31 -0
- data/app/jobs/leva/experiment_job.rb +9 -4
- data/app/jobs/leva/run_eval_job.rb +40 -0
- data/app/models/concerns/leva/recordable.rb +37 -0
- data/app/models/leva/dataset.rb +15 -6
- data/app/models/leva/dataset_record.rb +40 -1
- data/app/models/leva/evaluation_result.rb +15 -7
- data/app/models/leva/experiment.rb +24 -12
- data/app/models/leva/prompt.rb +14 -1
- data/app/models/leva/runner_result.rb +56 -0
- data/app/views/layouts/leva/application.html.erb +24 -13
- data/app/views/leva/dataset_records/index.html.erb +49 -0
- data/app/views/leva/dataset_records/show.html.erb +30 -0
- data/app/views/leva/datasets/_dataset.html.erb +18 -0
- data/app/views/leva/datasets/_form.html.erb +24 -0
- data/app/views/leva/datasets/edit.html.erb +5 -0
- data/app/views/leva/datasets/index.html.erb +51 -38
- data/app/views/leva/datasets/new.html.erb +5 -0
- data/app/views/leva/datasets/show.html.erb +160 -8
- data/app/views/leva/experiments/_experiment.html.erb +42 -0
- data/app/views/leva/experiments/_form.html.erb +49 -0
- data/app/views/leva/experiments/edit.html.erb +5 -0
- data/app/views/leva/experiments/index.html.erb +53 -37
- data/app/views/leva/experiments/new.html.erb +5 -0
- data/app/views/leva/experiments/show.html.erb +115 -19
- data/app/views/leva/runner_results/show.html.erb +64 -0
- data/app/views/leva/workbench/_evaluation_area.html.erb +5 -0
- data/app/views/leva/workbench/_prompt_content.html.erb +216 -0
- data/app/views/leva/workbench/_prompt_form.html.erb +89 -0
- data/app/views/leva/workbench/_prompt_sidebar.html.erb +21 -0
- data/app/views/leva/workbench/_results_section.html.erb +159 -0
- data/app/views/leva/workbench/_top_bar.html.erb +10 -0
- data/app/views/leva/workbench/edit.html.erb +20 -0
- data/app/views/leva/workbench/index.html.erb +5 -91
- data/app/views/leva/workbench/new.html.erb +79 -36
- data/config/routes.rb +15 -6
- data/db/migrate/20240813172916_create_leva_datasets.rb +1 -0
- data/db/migrate/20240813173035_create_leva_experiments.rb +1 -0
- data/db/migrate/20240816201419_create_leva_runner_results.rb +11 -0
- data/db/migrate/20240816201433_update_leva_evaluation_results.rb +8 -0
- data/db/migrate/20240821163608_make_experiment_optional_for_runner_results.rb +6 -0
- data/db/migrate/20240821181934_add_prompt_to_leva_runner_results.rb +5 -0
- data/db/migrate/20240821183153_add_runner_and_evaluator_to_leva_experiments.rb +6 -0
- data/db/migrate/20240821191713_add_actual_result_to_leva_dataset_records.rb +5 -0
- data/db/migrate/20240822143201_remove_actual_result_from_leva_runner_results.rb +5 -0
- data/db/migrate/20240912183556_add_runner_class_to_leva_runner_results.rb +5 -0
- data/lib/generators/leva/templates/eval.rb.erb +7 -8
- data/lib/generators/leva/templates/runner.rb.erb +25 -0
- data/lib/leva/version.rb +1 -1
- data/lib/leva.rb +84 -44
- metadata +49 -5
- data/app/evals/test_sentiment_accuracy_eval.rb +0 -6
- data/app/runners/test_sentiment_run.rb +0 -13
- data/lib/leva/base_eval.rb +0 -75
@@ -1,38 +1,81 @@
|
|
1
|
-
<% content_for :title,
|
2
|
-
<div class="px-4
|
3
|
-
<
|
4
|
-
|
5
|
-
|
6
|
-
<
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
<div>
|
14
|
-
<h3 class="text-lg leading-6 font-medium text-gray-900">Experiment Information</h3>
|
15
|
-
<p class="mt-1 text-sm text-gray-500">Provide details for your new experiment.</p>
|
16
|
-
</div>
|
17
|
-
<div class="mt-6 grid grid-cols-1 gap-y-6 gap-x-4 sm:grid-cols-6">
|
18
|
-
<div class="sm:col-span-4">
|
19
|
-
<%= form.label :name, class: "block text-sm font-medium text-gray-700" %>
|
20
|
-
<div class="mt-1">
|
21
|
-
<%= form.text_field :name, class: "shadow-sm focus:ring-indigo-500 focus:border-indigo-500 block w-full sm:text-sm border-gray-300 rounded-md" %>
|
22
|
-
</div>
|
23
|
-
</div>
|
24
|
-
<div class="sm:col-span-2">
|
25
|
-
<%= form.label :dataset_id, class: "block text-sm font-medium text-gray-700" %>
|
26
|
-
<div class="mt-1">
|
27
|
-
<%= form.select :dataset_id, options_for_select(@datasets.map { |dataset| [dataset.name, dataset.id] }), class: "shadow-sm focus:ring-indigo-500 focus:border-indigo-500 block w-full sm:text-sm border-gray-300 rounded-md" %>
|
28
|
-
</div>
|
29
|
-
</div>
|
30
|
-
</div>
|
31
|
-
</div>
|
32
|
-
</div>
|
33
|
-
<div class="pt-5">
|
34
|
-
<%= form.submit class: "btn btn-primary btn-block" %>
|
1
|
+
<% content_for :title, "New Prompt" %>
|
2
|
+
<div class="container mx-auto px-4 py-8 bg-gray-950 text-white">
|
3
|
+
<h1 class="text-3xl font-bold text-indigo-400 mb-6">New Prompt</h1>
|
4
|
+
<%= form_with(model: @prompt, url: workbench_index_path, local: true, class: "bg-gray-800 rounded-lg shadow-lg p-6", data: { controller: "prompt-selector" }) do |form| %>
|
5
|
+
<% if @prompt.errors.any? %>
|
6
|
+
<div class="bg-red-900 border border-red-700 text-red-100 px-4 py-3 rounded-lg mb-4">
|
7
|
+
<h2><%= pluralize(@prompt.errors.count, "error") %> prohibited this prompt from being saved:</h2>
|
8
|
+
<ul>
|
9
|
+
<% @prompt.errors.full_messages.each do |message| %>
|
10
|
+
<li><%= message %></li>
|
11
|
+
<% end %>
|
12
|
+
</ul>
|
35
13
|
</div>
|
36
14
|
<% end %>
|
37
|
-
|
38
|
-
|
15
|
+
<div class="mb-4">
|
16
|
+
<%= form.label :name, class: "block text-sm font-semibold mb-2 text-indigo-300" %>
|
17
|
+
<%= form.text_field :name, autofocus: true, class: "w-full bg-gray-700 text-white p-3 rounded-lg focus:ring-2 focus:ring-indigo-500 focus:outline-none" %>
|
18
|
+
</div>
|
19
|
+
<div class="mb-4">
|
20
|
+
<%= form.label :system_prompt, class: "block text-sm font-semibold mb-2 text-indigo-300" %>
|
21
|
+
<%= form.text_area :system_prompt, rows: 2, class: "w-full bg-gray-700 text-white p-3 rounded-lg focus:ring-2 focus:ring-indigo-500 focus:outline-none" %>
|
22
|
+
</div>
|
23
|
+
<div class="mb-4">
|
24
|
+
<%= form.label :predefined_prompt, "Select Predefined Prompt", class: "block text-sm font-semibold mb-2 text-indigo-300" %>
|
25
|
+
<%= form.select :predefined_prompt,
|
26
|
+
options_for_select([['Custom Prompt', '']] + @predefined_prompts.map { |name, content| [name, content] }),
|
27
|
+
{},
|
28
|
+
class: "w-full bg-gray-700 text-white p-3 rounded-lg focus:ring-2 focus:ring-indigo-500 focus:outline-none",
|
29
|
+
data: { action: "change->prompt-selector#toggleUserPrompt" }
|
30
|
+
%>
|
31
|
+
</div>
|
32
|
+
<div class="mb-4" data-prompt-selector-target="userPromptField">
|
33
|
+
<%= form.label :user_prompt, class: "block text-sm font-semibold mb-2 text-indigo-300" %>
|
34
|
+
<%= form.text_area :user_prompt, rows: 5, class: "w-full bg-gray-700 text-white p-3 rounded-lg focus:ring-2 focus:ring-indigo-500 focus:outline-none" %>
|
35
|
+
</div>
|
36
|
+
<div class="mb-4 hidden" data-prompt-selector-target="promptPreview">
|
37
|
+
<h3 class="text-lg font-semibold mb-2 text-indigo-300">Prompt Preview</h3>
|
38
|
+
<div class="bg-gray-700 text-white p-3 rounded-lg" data-prompt-selector-target="previewContent"></div>
|
39
|
+
</div>
|
40
|
+
<div class="flex items-center justify-end space-x-4">
|
41
|
+
<%= link_to "Cancel", workbench_index_path, class: "px-3 py-2 rounded-md text-sm font-medium text-gray-300 hover:bg-gray-800 hover:text-white transition-colors duration-150 ease-in-out" %>
|
42
|
+
<%= form.submit "Create Prompt", class: "px-3 py-2 rounded-md text-sm font-medium bg-indigo-600 text-white shadow-lg hover:bg-indigo-700 transition-colors duration-150 ease-in-out" %>
|
43
|
+
</div>
|
44
|
+
<% end %>
|
45
|
+
</div>
|
46
|
+
<script>
|
47
|
+
(() => {
|
48
|
+
const application = Stimulus.Application.start()
|
49
|
+
|
50
|
+
application.register("prompt-selector", class extends Stimulus.Controller {
|
51
|
+
static targets = ["userPromptField", "promptPreview", "previewContent"]
|
52
|
+
|
53
|
+
toggleUserPrompt(event) {
|
54
|
+
const selectedContent = event.target.value
|
55
|
+
if (selectedContent) {
|
56
|
+
this.userPromptFieldTarget.style.display = 'none'
|
57
|
+
this.promptPreviewTarget.classList.remove('hidden')
|
58
|
+
this.loadPredefinedPrompt(selectedContent)
|
59
|
+
} else {
|
60
|
+
this.userPromptFieldTarget.style.display = 'block'
|
61
|
+
this.promptPreviewTarget.classList.add('hidden')
|
62
|
+
this.clearUserPrompt()
|
63
|
+
}
|
64
|
+
}
|
65
|
+
|
66
|
+
loadPredefinedPrompt(content) {
|
67
|
+
const userPromptTextarea = this.userPromptFieldTarget.querySelector('textarea')
|
68
|
+
userPromptTextarea.value = content
|
69
|
+
this.previewContentTarget.innerHTML = marked.parse(content)
|
70
|
+
}
|
71
|
+
|
72
|
+
clearUserPrompt() {
|
73
|
+
const userPromptTextarea = this.userPromptFieldTarget.querySelector('textarea')
|
74
|
+
userPromptTextarea.value = ''
|
75
|
+
this.previewContentTarget.innerHTML = ''
|
76
|
+
}
|
77
|
+
})
|
78
|
+
})()
|
79
|
+
</script>
|
80
|
+
<!-- Include marked.js for Markdown parsing -->
|
81
|
+
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
|
data/config/routes.rb
CHANGED
@@ -1,12 +1,21 @@
|
|
1
1
|
Leva::Engine.routes.draw do
|
2
2
|
root 'workbench#index'
|
3
3
|
|
4
|
-
resources :datasets
|
5
|
-
|
4
|
+
resources :datasets do
|
5
|
+
resources :dataset_records, path: 'records', only: [:index, :show]
|
6
|
+
end
|
7
|
+
resources :experiments, except: [:destroy] do
|
8
|
+
member do
|
9
|
+
post :rerun
|
10
|
+
end
|
11
|
+
resources :runner_results, only: [:show]
|
12
|
+
end
|
6
13
|
resources :prompts
|
7
|
-
resources :workbench, only: [:index, :new, :
|
8
|
-
|
9
|
-
|
10
|
-
|
14
|
+
resources :workbench, only: [:index, :new, :create, :edit, :update] do
|
15
|
+
collection do
|
16
|
+
post 'run'
|
17
|
+
post 'run_all_evals'
|
18
|
+
post 'run_evaluator'
|
19
|
+
end
|
11
20
|
end
|
12
21
|
end
|
@@ -2,6 +2,7 @@ class CreateLevaExperiments < ActiveRecord::Migration[7.2]
|
|
2
2
|
def change
|
3
3
|
create_table :leva_experiments do |t|
|
4
4
|
t.string :name
|
5
|
+
t.text :description
|
5
6
|
t.references :dataset, null: false, foreign_key: { to_table: :leva_datasets }
|
6
7
|
t.references :prompt, null: true, foreign_key: { to_table: :leva_prompts }
|
7
8
|
t.integer :status
|
@@ -0,0 +1,11 @@
|
|
1
|
+
class CreateLevaRunnerResults < ActiveRecord::Migration[7.2]
|
2
|
+
def change
|
3
|
+
create_table :leva_runner_results do |t|
|
4
|
+
t.references :experiment, null: false, foreign_key: { to_table: :leva_experiments }
|
5
|
+
t.references :dataset_record, null: false, foreign_key: { to_table: :leva_dataset_records }
|
6
|
+
t.text :prediction
|
7
|
+
|
8
|
+
t.timestamps
|
9
|
+
end
|
10
|
+
end
|
11
|
+
end
|
@@ -0,0 +1,8 @@
|
|
1
|
+
class UpdateLevaEvaluationResults < ActiveRecord::Migration[7.2]
|
2
|
+
def change
|
3
|
+
add_reference :leva_evaluation_results, :runner_result, null: false, foreign_key: { to_table: :leva_runner_results }
|
4
|
+
add_column :leva_evaluation_results, :evaluator_class, :string, null: false
|
5
|
+
remove_column :leva_evaluation_results, :prediction, :string
|
6
|
+
remove_column :leva_evaluation_results, :label, :string
|
7
|
+
end
|
8
|
+
end
|
@@ -1,15 +1,14 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
class <%= class_name %>Eval < Leva::BaseEval
|
4
|
-
# @param
|
5
|
-
# @param
|
6
|
-
# @return [
|
7
|
-
def evaluate(
|
4
|
+
# @param runner_result [Leva::RunnerResult] The runner result to evaluate
|
5
|
+
# @param recordable [YourRecordClass] The recordable object to evaluate
|
6
|
+
# @return [Float] The score of the evaluation
|
7
|
+
def evaluate(runner_result, recordable)
|
8
8
|
# Implement your evaluation logic here
|
9
|
+
# You can access the ground truth using recordable.ground_truth
|
9
10
|
|
10
|
-
|
11
|
-
|
12
|
-
score: score
|
13
|
-
)
|
11
|
+
# Example implementation:
|
12
|
+
runner_result.parsed_predictions.first == recordable.ground_truth ? 1.0 : 0.0
|
14
13
|
end
|
15
14
|
end
|
@@ -8,4 +8,29 @@ class <%= class_name %>Run < Leva::BaseRun
|
|
8
8
|
# This could involve calling an API, running a local model, etc.
|
9
9
|
# Return the result of the run to be used to evaluate the model
|
10
10
|
end
|
11
|
+
|
12
|
+
# Uncomment and modify this method to customize parsed predictions
|
13
|
+
# @param runner_result [Leva::RunnerResult] The runner result to parse
|
14
|
+
# @return [Array<String>] The parsed predictions
|
15
|
+
# def parsed_predictions(runner_result)
|
16
|
+
# # Example: Extract predictions from XML-like tags
|
17
|
+
# runner_result.prediction.scan(/<prediction>(.*?)<\/prediction>/).flatten
|
18
|
+
# end
|
19
|
+
|
20
|
+
# Uncomment and modify this method to customize ground truth extraction
|
21
|
+
# @param runner_result [Leva::RunnerResult] The runner result to get ground truth from
|
22
|
+
# @return [String] The ground truth for the runner result
|
23
|
+
# def ground_truth(runner_result)
|
24
|
+
# # Example: Extract ground truth from a specific field
|
25
|
+
# runner_result.dataset_record.recordable.custom_ground_truth_field
|
26
|
+
# end
|
27
|
+
|
28
|
+
# Uncomment and modify this method to customize regex extraction
|
29
|
+
# @param runner_result [Leva::RunnerResult] The runner result to extract regex from
|
30
|
+
# @return [Regexp, nil] The regex pattern to use for parsing predictions
|
31
|
+
# def extract_regex_pattern(runner_result)
|
32
|
+
# # Your custom regex extraction logic here
|
33
|
+
# # For example:
|
34
|
+
# # /\<result\>(.*?)\<\/result\>/
|
35
|
+
# end
|
11
36
|
end
|
data/lib/leva/version.rb
CHANGED
data/lib/leva.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require "leva/version"
|
2
2
|
require "leva/engine"
|
3
|
+
require "liquid"
|
3
4
|
|
4
5
|
module Leva
|
5
6
|
# Runs an evaluation experiment with the given run and evals.
|
@@ -9,9 +10,34 @@ module Leva
|
|
9
10
|
# @param evals [Array<Leva::BaseEval>] The evaluation implementations to use.
|
10
11
|
# @return [void]
|
11
12
|
def self.run_evaluation(experiment:, run:, evals:)
|
12
|
-
|
13
|
+
experiment.update(status: :running)
|
14
|
+
|
15
|
+
experiment.dataset.dataset_records.find_each do |dataset_record|
|
16
|
+
runner_result = run.execute_and_store(experiment, dataset_record, experiment.prompt)
|
17
|
+
|
18
|
+
evals.each do |eval|
|
19
|
+
eval.evaluate_and_store(experiment, runner_result)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
experiment.update(status: :completed)
|
24
|
+
rescue StandardError => e
|
25
|
+
experiment.update(status: :failed)
|
26
|
+
Rails.logger.error "Error in experiment #{experiment.name}: #{e.message}"
|
27
|
+
end
|
28
|
+
|
29
|
+
# Runs a single evaluation for a dataset record
|
30
|
+
#
|
31
|
+
# @param experiment [Leva::Experiment] The experiment to run.
|
32
|
+
# @param run [Leva::BaseRun] The run implementation to use.
|
33
|
+
# @param evals [Array<Leva::BaseEval>] The evaluation implementations to use.
|
34
|
+
# @param dataset_record [Leva::DatasetRecord] The dataset record to process.
|
35
|
+
# @return [void]
|
36
|
+
def self.run_single_evaluation(experiment:, run:, evals:, dataset_record:)
|
37
|
+
runner_result = run.execute_and_store(experiment, dataset_record, experiment.prompt)
|
38
|
+
|
13
39
|
evals.each do |eval|
|
14
|
-
eval.
|
40
|
+
eval.evaluate_and_store(experiment, runner_result)
|
15
41
|
end
|
16
42
|
end
|
17
43
|
|
@@ -29,20 +55,47 @@ module Leva
|
|
29
55
|
raise NotImplementedError, "#{self.class} must implement #execute"
|
30
56
|
end
|
31
57
|
|
32
|
-
#
|
58
|
+
# Executes the run on a given dataset record and stores the result.
|
33
59
|
#
|
34
|
-
# @param experiment [Leva::Experiment] The experiment
|
35
|
-
# @
|
36
|
-
|
60
|
+
# @param experiment [Leva::Experiment, nil] The experiment being run, if any.
|
61
|
+
# @param dataset_record [Leva::DatasetRecord] The dataset record to run the model on.
|
62
|
+
# @param prompt [Leva::Prompt] The prompt to store the version of.
|
63
|
+
# @return [Leva::RunnerResult] The stored runner result.
|
64
|
+
def execute_and_store(experiment, dataset_record, prompt)
|
65
|
+
# Expose these to the subclass execution
|
37
66
|
@experiment = experiment
|
38
|
-
@prompt =
|
67
|
+
@prompt = prompt
|
68
|
+
|
69
|
+
result = execute(dataset_record.recordable)
|
70
|
+
RunnerResult.create!(
|
71
|
+
experiment: experiment,
|
72
|
+
dataset_record: dataset_record,
|
73
|
+
prompt: prompt,
|
74
|
+
prediction: result,
|
75
|
+
runner_class: self.class.name
|
76
|
+
)
|
77
|
+
end
|
39
78
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
79
|
+
# @param runner_result [Leva::RunnerResult] The runner result to parse
|
80
|
+
# @return [Array<String>] The parsed predictions
|
81
|
+
def parsed_predictions(runner_result)
|
82
|
+
if extract_regex_pattern(runner_result)
|
83
|
+
runner_result.prediction.scan(extract_regex_pattern(runner_result)).map { |match| match.first&.strip }.compact
|
84
|
+
else
|
85
|
+
[runner_result.prediction]
|
44
86
|
end
|
45
|
-
|
87
|
+
end
|
88
|
+
|
89
|
+
# @param runner_result [Leva::RunnerResult] The runner result to extract regex from
|
90
|
+
# @return [Regexp, nil] The regex pattern to use for parsing predictions
|
91
|
+
def extract_regex_pattern(runner_result)
|
92
|
+
runner_result.dataset_record.recordable.extract_regex_pattern if runner_result.dataset_record.recordable.respond_to?(:extract_regex_pattern)
|
93
|
+
end
|
94
|
+
|
95
|
+
# @param runner_result [Leva::RunnerResult] The runner result to get ground truth from
|
96
|
+
# @return [String] The ground truth for the runner result
|
97
|
+
def ground_truth(runner_result)
|
98
|
+
runner_result.dataset_record.ground_truth
|
46
99
|
end
|
47
100
|
end
|
48
101
|
|
@@ -51,47 +104,34 @@ module Leva
|
|
51
104
|
# @abstract Subclass and override {#evaluate} to implement
|
52
105
|
# custom evaluation logic.
|
53
106
|
class BaseEval
|
54
|
-
# Evaluates the model's prediction against the
|
107
|
+
# Evaluates the model's prediction against the ground truth.
|
55
108
|
#
|
56
109
|
# @param prediction [Object] The model's prediction.
|
57
|
-
# @param
|
58
|
-
# @return [
|
110
|
+
# @param recordable [Object] The recordable object containing the ground truth.
|
111
|
+
# @return [Float] The evaluation score.
|
59
112
|
# @raise [NotImplementedError] if the method is not implemented in a subclass.
|
60
|
-
def evaluate(prediction,
|
113
|
+
def evaluate(prediction, recordable)
|
61
114
|
raise NotImplementedError, "#{self.class} must implement #evaluate"
|
62
115
|
end
|
63
116
|
|
64
|
-
# Evaluates
|
117
|
+
# Evaluates a single runner result and stores the evaluation.
|
65
118
|
#
|
66
|
-
# @param experiment [Leva::Experiment] The experiment
|
67
|
-
# @param
|
68
|
-
# @return [
|
69
|
-
def
|
70
|
-
experiment
|
71
|
-
|
72
|
-
evaluation = evaluate(prediction, dataset_record.recordable)
|
73
|
-
|
74
|
-
Leva::EvaluationResult.create!(
|
75
|
-
experiment: experiment,
|
76
|
-
dataset_record: dataset_record,
|
77
|
-
prediction: prediction,
|
78
|
-
score: evaluation.score,
|
79
|
-
label: evaluation.label
|
80
|
-
)
|
81
|
-
end
|
82
|
-
end
|
83
|
-
end
|
119
|
+
# @param experiment [Leva::Experiment, nil] The experiment being evaluated, if any.
|
120
|
+
# @param runner_result [Leva::RunnerResult] The runner result to evaluate.
|
121
|
+
# @return [Leva::EvaluationResult] The stored evaluation result.
|
122
|
+
def evaluate_and_store(experiment, runner_result)
|
123
|
+
@experiment = experiment
|
124
|
+
@runner_result = runner_result
|
84
125
|
|
85
|
-
|
86
|
-
class Result
|
87
|
-
attr_reader :label, :prediction, :score
|
126
|
+
score = evaluate(runner_result, runner_result.dataset_record.recordable)
|
88
127
|
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
128
|
+
EvaluationResult.create!(
|
129
|
+
experiment: experiment,
|
130
|
+
dataset_record: runner_result.dataset_record,
|
131
|
+
runner_result: runner_result,
|
132
|
+
score: score,
|
133
|
+
evaluator_class: self.class.name
|
134
|
+
)
|
95
135
|
end
|
96
136
|
end
|
97
137
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: leva
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kieran Klaassen
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-09-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rails
|
@@ -24,6 +24,20 @@ dependencies:
|
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: 7.2.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: liquid
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 5.5.0
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 5.5.0
|
27
41
|
description: Leva is a Ruby on Rails framework for evaluating Language Models (LLMs)
|
28
42
|
using ActiveRecord datasets. It provides a flexible structure for creating experiments,
|
29
43
|
managing datasets, and implementing various evaluation logic.
|
@@ -39,26 +53,49 @@ files:
|
|
39
53
|
- app/assets/config/leva_manifest.js
|
40
54
|
- app/assets/stylesheets/leva/application.css
|
41
55
|
- app/controllers/leva/application_controller.rb
|
56
|
+
- app/controllers/leva/dataset_records_controller.rb
|
42
57
|
- app/controllers/leva/datasets_controller.rb
|
43
58
|
- app/controllers/leva/experiments_controller.rb
|
59
|
+
- app/controllers/leva/runner_results_controller.rb
|
44
60
|
- app/controllers/leva/workbench_controller.rb
|
45
|
-
- app/evals/test_sentiment_accuracy_eval.rb
|
46
61
|
- app/helpers/leva/application_helper.rb
|
62
|
+
- app/javascript/controllers/prompt_form_controller.js
|
63
|
+
- app/javascript/controllers/prompt_selector_controller.js
|
47
64
|
- app/jobs/leva/application_job.rb
|
48
65
|
- app/jobs/leva/experiment_job.rb
|
66
|
+
- app/jobs/leva/run_eval_job.rb
|
49
67
|
- app/mailers/leva/application_mailer.rb
|
68
|
+
- app/models/concerns/leva/recordable.rb
|
50
69
|
- app/models/leva/application_record.rb
|
51
70
|
- app/models/leva/dataset.rb
|
52
71
|
- app/models/leva/dataset_record.rb
|
53
72
|
- app/models/leva/evaluation_result.rb
|
54
73
|
- app/models/leva/experiment.rb
|
55
74
|
- app/models/leva/prompt.rb
|
56
|
-
- app/
|
75
|
+
- app/models/leva/runner_result.rb
|
57
76
|
- app/views/layouts/leva/application.html.erb
|
77
|
+
- app/views/leva/dataset_records/index.html.erb
|
78
|
+
- app/views/leva/dataset_records/show.html.erb
|
79
|
+
- app/views/leva/datasets/_dataset.html.erb
|
80
|
+
- app/views/leva/datasets/_form.html.erb
|
81
|
+
- app/views/leva/datasets/edit.html.erb
|
58
82
|
- app/views/leva/datasets/index.html.erb
|
83
|
+
- app/views/leva/datasets/new.html.erb
|
59
84
|
- app/views/leva/datasets/show.html.erb
|
85
|
+
- app/views/leva/experiments/_experiment.html.erb
|
86
|
+
- app/views/leva/experiments/_form.html.erb
|
87
|
+
- app/views/leva/experiments/edit.html.erb
|
60
88
|
- app/views/leva/experiments/index.html.erb
|
89
|
+
- app/views/leva/experiments/new.html.erb
|
61
90
|
- app/views/leva/experiments/show.html.erb
|
91
|
+
- app/views/leva/runner_results/show.html.erb
|
92
|
+
- app/views/leva/workbench/_evaluation_area.html.erb
|
93
|
+
- app/views/leva/workbench/_prompt_content.html.erb
|
94
|
+
- app/views/leva/workbench/_prompt_form.html.erb
|
95
|
+
- app/views/leva/workbench/_prompt_sidebar.html.erb
|
96
|
+
- app/views/leva/workbench/_results_section.html.erb
|
97
|
+
- app/views/leva/workbench/_top_bar.html.erb
|
98
|
+
- app/views/leva/workbench/edit.html.erb
|
62
99
|
- app/views/leva/workbench/index.html.erb
|
63
100
|
- app/views/leva/workbench/new.html.erb
|
64
101
|
- config/routes.rb
|
@@ -67,12 +104,19 @@ files:
|
|
67
104
|
- db/migrate/20240813173034_create_leva_prompts.rb
|
68
105
|
- db/migrate/20240813173035_create_leva_experiments.rb
|
69
106
|
- db/migrate/20240813173050_create_leva_evaluation_results.rb
|
107
|
+
- db/migrate/20240816201419_create_leva_runner_results.rb
|
108
|
+
- db/migrate/20240816201433_update_leva_evaluation_results.rb
|
109
|
+
- db/migrate/20240821163608_make_experiment_optional_for_runner_results.rb
|
110
|
+
- db/migrate/20240821181934_add_prompt_to_leva_runner_results.rb
|
111
|
+
- db/migrate/20240821183153_add_runner_and_evaluator_to_leva_experiments.rb
|
112
|
+
- db/migrate/20240821191713_add_actual_result_to_leva_dataset_records.rb
|
113
|
+
- db/migrate/20240822143201_remove_actual_result_from_leva_runner_results.rb
|
114
|
+
- db/migrate/20240912183556_add_runner_class_to_leva_runner_results.rb
|
70
115
|
- lib/generators/leva/eval_generator.rb
|
71
116
|
- lib/generators/leva/runner_generator.rb
|
72
117
|
- lib/generators/leva/templates/eval.rb.erb
|
73
118
|
- lib/generators/leva/templates/runner.rb.erb
|
74
119
|
- lib/leva.rb
|
75
|
-
- lib/leva/base_eval.rb
|
76
120
|
- lib/leva/engine.rb
|
77
121
|
- lib/leva/version.rb
|
78
122
|
- lib/tasks/auto_annotate_models.rake
|
@@ -1,13 +0,0 @@
|
|
1
|
-
class TestSentimentRun < Leva::BaseRun
|
2
|
-
def execute(record)
|
3
|
-
# Simple sentiment analysis logic for testing
|
4
|
-
case record.content.downcase
|
5
|
-
when /love|great|excellent/
|
6
|
-
"Positive"
|
7
|
-
when /terrible|bad|awful/
|
8
|
-
"Negative"
|
9
|
-
else
|
10
|
-
"Neutral"
|
11
|
-
end
|
12
|
-
end
|
13
|
-
end
|