leva 0.1.4 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +52 -16
- data/app/controllers/leva/dataset_records_controller.rb +21 -0
- data/app/controllers/leva/datasets_controller.rb +9 -2
- data/app/controllers/leva/experiments_controller.rb +34 -9
- data/app/controllers/leva/runner_results_controller.rb +8 -0
- data/app/controllers/leva/workbench_controller.rb +85 -12
- data/app/helpers/leva/application_helper.rb +39 -0
- data/app/javascript/controllers/prompt_form_controller.js +45 -0
- data/app/javascript/controllers/prompt_selector_controller.js +31 -0
- data/app/jobs/leva/experiment_job.rb +9 -4
- data/app/jobs/leva/run_eval_job.rb +40 -0
- data/app/models/concerns/leva/recordable.rb +37 -0
- data/app/models/leva/dataset.rb +15 -6
- data/app/models/leva/dataset_record.rb +43 -5
- data/app/models/leva/evaluation_result.rb +22 -14
- data/app/models/leva/experiment.rb +26 -14
- data/app/models/leva/prompt.rb +14 -1
- data/app/models/leva/runner_result.rb +54 -0
- data/app/views/layouts/leva/application.html.erb +24 -13
- data/app/views/leva/dataset_records/index.html.erb +49 -0
- data/app/views/leva/dataset_records/show.html.erb +30 -0
- data/app/views/leva/datasets/_dataset.html.erb +18 -0
- data/app/views/leva/datasets/_form.html.erb +24 -0
- data/app/views/leva/datasets/edit.html.erb +5 -0
- data/app/views/leva/datasets/index.html.erb +51 -38
- data/app/views/leva/datasets/new.html.erb +5 -0
- data/app/views/leva/datasets/show.html.erb +160 -8
- data/app/views/leva/experiments/_experiment.html.erb +42 -0
- data/app/views/leva/experiments/_form.html.erb +49 -0
- data/app/views/leva/experiments/edit.html.erb +5 -0
- data/app/views/leva/experiments/index.html.erb +53 -37
- data/app/views/leva/experiments/new.html.erb +5 -0
- data/app/views/leva/experiments/show.html.erb +115 -19
- data/app/views/leva/runner_results/show.html.erb +64 -0
- data/app/views/leva/workbench/_evaluation_area.html.erb +5 -0
- data/app/views/leva/workbench/_prompt_content.html.erb +216 -0
- data/app/views/leva/workbench/_prompt_form.html.erb +89 -0
- data/app/views/leva/workbench/_prompt_sidebar.html.erb +21 -0
- data/app/views/leva/workbench/_results_section.html.erb +159 -0
- data/app/views/leva/workbench/_top_bar.html.erb +10 -0
- data/app/views/leva/workbench/edit.html.erb +20 -0
- data/app/views/leva/workbench/index.html.erb +5 -91
- data/app/views/leva/workbench/new.html.erb +79 -36
- data/config/routes.rb +15 -6
- data/db/migrate/20240813172916_create_leva_datasets.rb +1 -0
- data/db/migrate/20240813173033_create_leva_dataset_records.rb +1 -1
- data/db/migrate/20240813173035_create_leva_experiments.rb +3 -2
- data/db/migrate/20240813173050_create_leva_evaluation_results.rb +2 -2
- data/db/migrate/20240816201419_create_leva_runner_results.rb +11 -0
- data/db/migrate/20240816201433_update_leva_evaluation_results.rb +8 -0
- data/db/migrate/20240821163608_make_experiment_optional_for_runner_results.rb +6 -0
- data/db/migrate/20240821181934_add_prompt_to_leva_runner_results.rb +5 -0
- data/db/migrate/20240821183153_add_runner_and_evaluator_to_leva_experiments.rb +6 -0
- data/db/migrate/20240821191713_add_actual_result_to_leva_dataset_records.rb +5 -0
- data/db/migrate/20240822143201_remove_actual_result_from_leva_runner_results.rb +5 -0
- data/lib/generators/leva/templates/eval.rb.erb +6 -7
- data/lib/leva/version.rb +1 -1
- data/lib/leva.rb +62 -45
- metadata +48 -5
- data/app/evals/test_sentiment_accuracy_eval.rb +0 -6
- data/app/runners/test_sentiment_run.rb +0 -13
- data/lib/leva/base_eval.rb +0 -75
@@ -1,38 +1,81 @@
|
|
1
|
-
<% content_for :title,
|
2
|
-
<div class="px-4
|
3
|
-
<
|
4
|
-
|
5
|
-
|
6
|
-
<
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
<div>
|
14
|
-
<h3 class="text-lg leading-6 font-medium text-gray-900">Experiment Information</h3>
|
15
|
-
<p class="mt-1 text-sm text-gray-500">Provide details for your new experiment.</p>
|
16
|
-
</div>
|
17
|
-
<div class="mt-6 grid grid-cols-1 gap-y-6 gap-x-4 sm:grid-cols-6">
|
18
|
-
<div class="sm:col-span-4">
|
19
|
-
<%= form.label :name, class: "block text-sm font-medium text-gray-700" %>
|
20
|
-
<div class="mt-1">
|
21
|
-
<%= form.text_field :name, class: "shadow-sm focus:ring-indigo-500 focus:border-indigo-500 block w-full sm:text-sm border-gray-300 rounded-md" %>
|
22
|
-
</div>
|
23
|
-
</div>
|
24
|
-
<div class="sm:col-span-2">
|
25
|
-
<%= form.label :dataset_id, class: "block text-sm font-medium text-gray-700" %>
|
26
|
-
<div class="mt-1">
|
27
|
-
<%= form.select :dataset_id, options_for_select(@datasets.map { |dataset| [dataset.name, dataset.id] }), class: "shadow-sm focus:ring-indigo-500 focus:border-indigo-500 block w-full sm:text-sm border-gray-300 rounded-md" %>
|
28
|
-
</div>
|
29
|
-
</div>
|
30
|
-
</div>
|
31
|
-
</div>
|
32
|
-
</div>
|
33
|
-
<div class="pt-5">
|
34
|
-
<%= form.submit class: "btn btn-primary btn-block" %>
|
1
|
+
<% content_for :title, "New Prompt" %>
|
2
|
+
<div class="container mx-auto px-4 py-8 bg-gray-950 text-white">
|
3
|
+
<h1 class="text-3xl font-bold text-indigo-400 mb-6">New Prompt</h1>
|
4
|
+
<%= form_with(model: @prompt, url: workbench_index_path, local: true, class: "bg-gray-800 rounded-lg shadow-lg p-6", data: { controller: "prompt-selector" }) do |form| %>
|
5
|
+
<% if @prompt.errors.any? %>
|
6
|
+
<div class="bg-red-900 border border-red-700 text-red-100 px-4 py-3 rounded-lg mb-4">
|
7
|
+
<h2><%= pluralize(@prompt.errors.count, "error") %> prohibited this prompt from being saved:</h2>
|
8
|
+
<ul>
|
9
|
+
<% @prompt.errors.full_messages.each do |message| %>
|
10
|
+
<li><%= message %></li>
|
11
|
+
<% end %>
|
12
|
+
</ul>
|
35
13
|
</div>
|
36
14
|
<% end %>
|
37
|
-
|
38
|
-
|
15
|
+
<div class="mb-4">
|
16
|
+
<%= form.label :name, class: "block text-sm font-semibold mb-2 text-indigo-300" %>
|
17
|
+
<%= form.text_field :name, autofocus: true, class: "w-full bg-gray-700 text-white p-3 rounded-lg focus:ring-2 focus:ring-indigo-500 focus:outline-none" %>
|
18
|
+
</div>
|
19
|
+
<div class="mb-4">
|
20
|
+
<%= form.label :system_prompt, class: "block text-sm font-semibold mb-2 text-indigo-300" %>
|
21
|
+
<%= form.text_area :system_prompt, rows: 2, class: "w-full bg-gray-700 text-white p-3 rounded-lg focus:ring-2 focus:ring-indigo-500 focus:outline-none" %>
|
22
|
+
</div>
|
23
|
+
<div class="mb-4">
|
24
|
+
<%= form.label :predefined_prompt, "Select Predefined Prompt", class: "block text-sm font-semibold mb-2 text-indigo-300" %>
|
25
|
+
<%= form.select :predefined_prompt,
|
26
|
+
options_for_select([['Custom Prompt', '']] + @predefined_prompts.map { |name, content| [name, content] }),
|
27
|
+
{},
|
28
|
+
class: "w-full bg-gray-700 text-white p-3 rounded-lg focus:ring-2 focus:ring-indigo-500 focus:outline-none",
|
29
|
+
data: { action: "change->prompt-selector#toggleUserPrompt" }
|
30
|
+
%>
|
31
|
+
</div>
|
32
|
+
<div class="mb-4" data-prompt-selector-target="userPromptField">
|
33
|
+
<%= form.label :user_prompt, class: "block text-sm font-semibold mb-2 text-indigo-300" %>
|
34
|
+
<%= form.text_area :user_prompt, rows: 5, class: "w-full bg-gray-700 text-white p-3 rounded-lg focus:ring-2 focus:ring-indigo-500 focus:outline-none" %>
|
35
|
+
</div>
|
36
|
+
<div class="mb-4 hidden" data-prompt-selector-target="promptPreview">
|
37
|
+
<h3 class="text-lg font-semibold mb-2 text-indigo-300">Prompt Preview</h3>
|
38
|
+
<div class="bg-gray-700 text-white p-3 rounded-lg" data-prompt-selector-target="previewContent"></div>
|
39
|
+
</div>
|
40
|
+
<div class="flex items-center justify-end space-x-4">
|
41
|
+
<%= link_to "Cancel", workbench_index_path, class: "px-3 py-2 rounded-md text-sm font-medium text-gray-300 hover:bg-gray-800 hover:text-white transition-colors duration-150 ease-in-out" %>
|
42
|
+
<%= form.submit "Create Prompt", class: "px-3 py-2 rounded-md text-sm font-medium bg-indigo-600 text-white shadow-lg hover:bg-indigo-700 transition-colors duration-150 ease-in-out" %>
|
43
|
+
</div>
|
44
|
+
<% end %>
|
45
|
+
</div>
|
46
|
+
<script>
|
47
|
+
(() => {
|
48
|
+
const application = Stimulus.Application.start()
|
49
|
+
|
50
|
+
application.register("prompt-selector", class extends Stimulus.Controller {
|
51
|
+
static targets = ["userPromptField", "promptPreview", "previewContent"]
|
52
|
+
|
53
|
+
toggleUserPrompt(event) {
|
54
|
+
const selectedContent = event.target.value
|
55
|
+
if (selectedContent) {
|
56
|
+
this.userPromptFieldTarget.style.display = 'none'
|
57
|
+
this.promptPreviewTarget.classList.remove('hidden')
|
58
|
+
this.loadPredefinedPrompt(selectedContent)
|
59
|
+
} else {
|
60
|
+
this.userPromptFieldTarget.style.display = 'block'
|
61
|
+
this.promptPreviewTarget.classList.add('hidden')
|
62
|
+
this.clearUserPrompt()
|
63
|
+
}
|
64
|
+
}
|
65
|
+
|
66
|
+
loadPredefinedPrompt(content) {
|
67
|
+
const userPromptTextarea = this.userPromptFieldTarget.querySelector('textarea')
|
68
|
+
userPromptTextarea.value = content
|
69
|
+
this.previewContentTarget.innerHTML = marked.parse(content)
|
70
|
+
}
|
71
|
+
|
72
|
+
clearUserPrompt() {
|
73
|
+
const userPromptTextarea = this.userPromptFieldTarget.querySelector('textarea')
|
74
|
+
userPromptTextarea.value = ''
|
75
|
+
this.previewContentTarget.innerHTML = ''
|
76
|
+
}
|
77
|
+
})
|
78
|
+
})()
|
79
|
+
</script>
|
80
|
+
<!-- Include marked.js for Markdown parsing -->
|
81
|
+
<script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
|
data/config/routes.rb
CHANGED
@@ -1,12 +1,21 @@
|
|
1
1
|
Leva::Engine.routes.draw do
|
2
2
|
root 'workbench#index'
|
3
3
|
|
4
|
-
resources :datasets
|
5
|
-
|
4
|
+
resources :datasets do
|
5
|
+
resources :dataset_records, path: 'records', only: [:index, :show]
|
6
|
+
end
|
7
|
+
resources :experiments, except: [:destroy] do
|
8
|
+
member do
|
9
|
+
post :rerun
|
10
|
+
end
|
11
|
+
resources :runner_results, only: [:show]
|
12
|
+
end
|
6
13
|
resources :prompts
|
7
|
-
resources :workbench, only: [:index, :new, :
|
8
|
-
|
9
|
-
|
10
|
-
|
14
|
+
resources :workbench, only: [:index, :new, :create, :edit, :update] do
|
15
|
+
collection do
|
16
|
+
post 'run'
|
17
|
+
post 'run_all_evals'
|
18
|
+
post 'run_evaluator'
|
19
|
+
end
|
11
20
|
end
|
12
21
|
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
class CreateLevaDatasetRecords < ActiveRecord::Migration[7.2]
|
2
2
|
def change
|
3
3
|
create_table :leva_dataset_records do |t|
|
4
|
-
t.references :
|
4
|
+
t.references :dataset, null: false, foreign_key: { to_table: :leva_datasets }
|
5
5
|
t.references :recordable, polymorphic: true, null: false
|
6
6
|
|
7
7
|
t.timestamps
|
@@ -2,8 +2,9 @@ class CreateLevaExperiments < ActiveRecord::Migration[7.2]
|
|
2
2
|
def change
|
3
3
|
create_table :leva_experiments do |t|
|
4
4
|
t.string :name
|
5
|
-
t.
|
6
|
-
t.references :
|
5
|
+
t.text :description
|
6
|
+
t.references :dataset, null: false, foreign_key: { to_table: :leva_datasets }
|
7
|
+
t.references :prompt, null: true, foreign_key: { to_table: :leva_prompts }
|
7
8
|
t.integer :status
|
8
9
|
t.text :metadata
|
9
10
|
|
@@ -1,8 +1,8 @@
|
|
1
1
|
class CreateLevaEvaluationResults < ActiveRecord::Migration[7.2]
|
2
2
|
def change
|
3
3
|
create_table :leva_evaluation_results do |t|
|
4
|
-
t.references :
|
5
|
-
t.references :
|
4
|
+
t.references :experiment, null: false, foreign_key: { to_table: :leva_experiments }
|
5
|
+
t.references :dataset_record, null: false, foreign_key: { to_table: :leva_dataset_records }
|
6
6
|
t.string :prediction
|
7
7
|
t.float :score
|
8
8
|
t.string :label
|
@@ -0,0 +1,11 @@
|
|
1
|
+
class CreateLevaRunnerResults < ActiveRecord::Migration[7.2]
|
2
|
+
def change
|
3
|
+
create_table :leva_runner_results do |t|
|
4
|
+
t.references :experiment, null: false, foreign_key: { to_table: :leva_experiments }
|
5
|
+
t.references :dataset_record, null: false, foreign_key: { to_table: :leva_dataset_records }
|
6
|
+
t.text :prediction
|
7
|
+
|
8
|
+
t.timestamps
|
9
|
+
end
|
10
|
+
end
|
11
|
+
end
|
@@ -0,0 +1,8 @@
|
|
1
|
+
class UpdateLevaEvaluationResults < ActiveRecord::Migration[7.2]
|
2
|
+
def change
|
3
|
+
add_reference :leva_evaluation_results, :runner_result, null: false, foreign_key: { to_table: :leva_runner_results }
|
4
|
+
add_column :leva_evaluation_results, :evaluator_class, :string, null: false
|
5
|
+
remove_column :leva_evaluation_results, :prediction, :string
|
6
|
+
remove_column :leva_evaluation_results, :label, :string
|
7
|
+
end
|
8
|
+
end
|
@@ -2,14 +2,13 @@
|
|
2
2
|
|
3
3
|
class <%= class_name %>Eval < Leva::BaseEval
|
4
4
|
# @param prediction [String] The prediction to evaluate
|
5
|
-
# @param
|
6
|
-
# @return [
|
7
|
-
def evaluate(prediction,
|
5
|
+
# @param recordable [YourRecordClass] The recordable object to evaluate
|
6
|
+
# @return [Float] The score of the evaluation
|
7
|
+
def evaluate(prediction, recordable)
|
8
8
|
# Implement your evaluation logic here
|
9
|
+
# You can access the ground truth using recordable.ground_truth
|
9
10
|
|
10
|
-
|
11
|
-
|
12
|
-
score: score
|
13
|
-
)
|
11
|
+
# Example implementation:
|
12
|
+
prediction == recordable.ground_truth ? 1.0 : 0.0
|
14
13
|
end
|
15
14
|
end
|
data/lib/leva/version.rb
CHANGED
data/lib/leva.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require "leva/version"
|
2
2
|
require "leva/engine"
|
3
|
+
require "liquid"
|
3
4
|
|
4
5
|
module Leva
|
5
6
|
# Runs an evaluation experiment with the given run and evals.
|
@@ -9,9 +10,34 @@ module Leva
|
|
9
10
|
# @param evals [Array<Leva::BaseEval>] The evaluation implementations to use.
|
10
11
|
# @return [void]
|
11
12
|
def self.run_evaluation(experiment:, run:, evals:)
|
12
|
-
|
13
|
+
experiment.update(status: :running)
|
14
|
+
|
15
|
+
experiment.dataset.dataset_records.find_each do |dataset_record|
|
16
|
+
runner_result = run.execute_and_store(experiment, dataset_record, experiment.prompt)
|
17
|
+
|
18
|
+
evals.each do |eval|
|
19
|
+
eval.evaluate_and_store(experiment, runner_result)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
experiment.update(status: :completed)
|
24
|
+
rescue StandardError => e
|
25
|
+
experiment.update(status: :failed)
|
26
|
+
Rails.logger.error "Error in experiment #{experiment.name}: #{e.message}"
|
27
|
+
end
|
28
|
+
|
29
|
+
# Runs a single evaluation for a dataset record
|
30
|
+
#
|
31
|
+
# @param experiment [Leva::Experiment] The experiment to run.
|
32
|
+
# @param run [Leva::BaseRun] The run implementation to use.
|
33
|
+
# @param evals [Array<Leva::BaseEval>] The evaluation implementations to use.
|
34
|
+
# @param dataset_record [Leva::DatasetRecord] The dataset record to process.
|
35
|
+
# @return [void]
|
36
|
+
def self.run_single_evaluation(experiment:, run:, evals:, dataset_record:)
|
37
|
+
runner_result = run.execute_and_store(experiment, dataset_record, experiment.prompt)
|
38
|
+
|
13
39
|
evals.each do |eval|
|
14
|
-
eval.
|
40
|
+
eval.evaluate_and_store(experiment, runner_result)
|
15
41
|
end
|
16
42
|
end
|
17
43
|
|
@@ -29,20 +55,24 @@ module Leva
|
|
29
55
|
raise NotImplementedError, "#{self.class} must implement #execute"
|
30
56
|
end
|
31
57
|
|
32
|
-
#
|
58
|
+
# Executes the run on a given dataset record and stores the result.
|
33
59
|
#
|
34
|
-
# @param experiment [Leva::Experiment] The experiment
|
35
|
-
# @
|
36
|
-
|
60
|
+
# @param experiment [Leva::Experiment, nil] The experiment being run, if any.
|
61
|
+
# @param dataset_record [Leva::DatasetRecord] The dataset record to run the model on.
|
62
|
+
# @param prompt [Leva::Prompt] The prompt to store the version of.
|
63
|
+
# @return [Leva::RunnerResult] The stored runner result.
|
64
|
+
def execute_and_store(experiment, dataset_record, prompt)
|
65
|
+
# Expose these to the subclass execution
|
37
66
|
@experiment = experiment
|
38
|
-
@prompt =
|
67
|
+
@prompt = prompt
|
39
68
|
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
69
|
+
result = execute(dataset_record.recordable)
|
70
|
+
RunnerResult.create!(
|
71
|
+
experiment: experiment,
|
72
|
+
dataset_record: dataset_record,
|
73
|
+
prompt: prompt,
|
74
|
+
prediction: result,
|
75
|
+
)
|
46
76
|
end
|
47
77
|
end
|
48
78
|
|
@@ -51,47 +81,34 @@ module Leva
|
|
51
81
|
# @abstract Subclass and override {#evaluate} to implement
|
52
82
|
# custom evaluation logic.
|
53
83
|
class BaseEval
|
54
|
-
# Evaluates the model's prediction against the
|
84
|
+
# Evaluates the model's prediction against the ground truth.
|
55
85
|
#
|
56
86
|
# @param prediction [Object] The model's prediction.
|
57
|
-
# @param
|
58
|
-
# @return [
|
87
|
+
# @param recordable [Object] The recordable object containing the ground truth.
|
88
|
+
# @return [Float] The evaluation score.
|
59
89
|
# @raise [NotImplementedError] if the method is not implemented in a subclass.
|
60
|
-
def evaluate(prediction,
|
90
|
+
def evaluate(prediction, recordable)
|
61
91
|
raise NotImplementedError, "#{self.class} must implement #evaluate"
|
62
92
|
end
|
63
93
|
|
64
|
-
# Evaluates
|
94
|
+
# Evaluates a single runner result and stores the evaluation.
|
65
95
|
#
|
66
|
-
# @param experiment [Leva::Experiment] The experiment
|
67
|
-
# @param
|
68
|
-
# @return [
|
69
|
-
def
|
70
|
-
experiment
|
71
|
-
|
72
|
-
evaluation = evaluate(prediction, dataset_record.recordable)
|
73
|
-
|
74
|
-
Leva::EvaluationResult.create!(
|
75
|
-
experiment: experiment,
|
76
|
-
dataset_record: dataset_record,
|
77
|
-
prediction: prediction,
|
78
|
-
score: evaluation.score,
|
79
|
-
label: evaluation.label
|
80
|
-
)
|
81
|
-
end
|
82
|
-
end
|
83
|
-
end
|
96
|
+
# @param experiment [Leva::Experiment, nil] The experiment being evaluated, if any.
|
97
|
+
# @param runner_result [Leva::RunnerResult] The runner result to evaluate.
|
98
|
+
# @return [Leva::EvaluationResult] The stored evaluation result.
|
99
|
+
def evaluate_and_store(experiment, runner_result)
|
100
|
+
@experiment = experiment
|
101
|
+
@runner_result = runner_result
|
84
102
|
|
85
|
-
|
86
|
-
class Result
|
87
|
-
attr_reader :label, :prediction, :score
|
103
|
+
score = evaluate(runner_result, runner_result.dataset_record.recordable)
|
88
104
|
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
105
|
+
EvaluationResult.create!(
|
106
|
+
experiment: experiment,
|
107
|
+
dataset_record: runner_result.dataset_record,
|
108
|
+
runner_result: runner_result,
|
109
|
+
score: score,
|
110
|
+
evaluator_class: self.class.name
|
111
|
+
)
|
95
112
|
end
|
96
113
|
end
|
97
114
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: leva
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kieran Klaassen
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-09-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rails
|
@@ -24,6 +24,20 @@ dependencies:
|
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: 7.2.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: liquid
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 5.5.0
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 5.5.0
|
27
41
|
description: Leva is a Ruby on Rails framework for evaluating Language Models (LLMs)
|
28
42
|
using ActiveRecord datasets. It provides a flexible structure for creating experiments,
|
29
43
|
managing datasets, and implementing various evaluation logic.
|
@@ -39,26 +53,49 @@ files:
|
|
39
53
|
- app/assets/config/leva_manifest.js
|
40
54
|
- app/assets/stylesheets/leva/application.css
|
41
55
|
- app/controllers/leva/application_controller.rb
|
56
|
+
- app/controllers/leva/dataset_records_controller.rb
|
42
57
|
- app/controllers/leva/datasets_controller.rb
|
43
58
|
- app/controllers/leva/experiments_controller.rb
|
59
|
+
- app/controllers/leva/runner_results_controller.rb
|
44
60
|
- app/controllers/leva/workbench_controller.rb
|
45
|
-
- app/evals/test_sentiment_accuracy_eval.rb
|
46
61
|
- app/helpers/leva/application_helper.rb
|
62
|
+
- app/javascript/controllers/prompt_form_controller.js
|
63
|
+
- app/javascript/controllers/prompt_selector_controller.js
|
47
64
|
- app/jobs/leva/application_job.rb
|
48
65
|
- app/jobs/leva/experiment_job.rb
|
66
|
+
- app/jobs/leva/run_eval_job.rb
|
49
67
|
- app/mailers/leva/application_mailer.rb
|
68
|
+
- app/models/concerns/leva/recordable.rb
|
50
69
|
- app/models/leva/application_record.rb
|
51
70
|
- app/models/leva/dataset.rb
|
52
71
|
- app/models/leva/dataset_record.rb
|
53
72
|
- app/models/leva/evaluation_result.rb
|
54
73
|
- app/models/leva/experiment.rb
|
55
74
|
- app/models/leva/prompt.rb
|
56
|
-
- app/
|
75
|
+
- app/models/leva/runner_result.rb
|
57
76
|
- app/views/layouts/leva/application.html.erb
|
77
|
+
- app/views/leva/dataset_records/index.html.erb
|
78
|
+
- app/views/leva/dataset_records/show.html.erb
|
79
|
+
- app/views/leva/datasets/_dataset.html.erb
|
80
|
+
- app/views/leva/datasets/_form.html.erb
|
81
|
+
- app/views/leva/datasets/edit.html.erb
|
58
82
|
- app/views/leva/datasets/index.html.erb
|
83
|
+
- app/views/leva/datasets/new.html.erb
|
59
84
|
- app/views/leva/datasets/show.html.erb
|
85
|
+
- app/views/leva/experiments/_experiment.html.erb
|
86
|
+
- app/views/leva/experiments/_form.html.erb
|
87
|
+
- app/views/leva/experiments/edit.html.erb
|
60
88
|
- app/views/leva/experiments/index.html.erb
|
89
|
+
- app/views/leva/experiments/new.html.erb
|
61
90
|
- app/views/leva/experiments/show.html.erb
|
91
|
+
- app/views/leva/runner_results/show.html.erb
|
92
|
+
- app/views/leva/workbench/_evaluation_area.html.erb
|
93
|
+
- app/views/leva/workbench/_prompt_content.html.erb
|
94
|
+
- app/views/leva/workbench/_prompt_form.html.erb
|
95
|
+
- app/views/leva/workbench/_prompt_sidebar.html.erb
|
96
|
+
- app/views/leva/workbench/_results_section.html.erb
|
97
|
+
- app/views/leva/workbench/_top_bar.html.erb
|
98
|
+
- app/views/leva/workbench/edit.html.erb
|
62
99
|
- app/views/leva/workbench/index.html.erb
|
63
100
|
- app/views/leva/workbench/new.html.erb
|
64
101
|
- config/routes.rb
|
@@ -67,12 +104,18 @@ files:
|
|
67
104
|
- db/migrate/20240813173034_create_leva_prompts.rb
|
68
105
|
- db/migrate/20240813173035_create_leva_experiments.rb
|
69
106
|
- db/migrate/20240813173050_create_leva_evaluation_results.rb
|
107
|
+
- db/migrate/20240816201419_create_leva_runner_results.rb
|
108
|
+
- db/migrate/20240816201433_update_leva_evaluation_results.rb
|
109
|
+
- db/migrate/20240821163608_make_experiment_optional_for_runner_results.rb
|
110
|
+
- db/migrate/20240821181934_add_prompt_to_leva_runner_results.rb
|
111
|
+
- db/migrate/20240821183153_add_runner_and_evaluator_to_leva_experiments.rb
|
112
|
+
- db/migrate/20240821191713_add_actual_result_to_leva_dataset_records.rb
|
113
|
+
- db/migrate/20240822143201_remove_actual_result_from_leva_runner_results.rb
|
70
114
|
- lib/generators/leva/eval_generator.rb
|
71
115
|
- lib/generators/leva/runner_generator.rb
|
72
116
|
- lib/generators/leva/templates/eval.rb.erb
|
73
117
|
- lib/generators/leva/templates/runner.rb.erb
|
74
118
|
- lib/leva.rb
|
75
|
-
- lib/leva/base_eval.rb
|
76
119
|
- lib/leva/engine.rb
|
77
120
|
- lib/leva/version.rb
|
78
121
|
- lib/tasks/auto_annotate_models.rake
|
@@ -1,13 +0,0 @@
|
|
1
|
-
class TestSentimentRun < Leva::BaseRun
|
2
|
-
def execute(record)
|
3
|
-
# Simple sentiment analysis logic for testing
|
4
|
-
case record.content.downcase
|
5
|
-
when /love|great|excellent/
|
6
|
-
"Positive"
|
7
|
-
when /terrible|bad|awful/
|
8
|
-
"Negative"
|
9
|
-
else
|
10
|
-
"Neutral"
|
11
|
-
end
|
12
|
-
end
|
13
|
-
end
|
data/lib/leva/base_eval.rb
DELETED
@@ -1,75 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Leva
|
4
|
-
class BaseEval
|
5
|
-
class << self
|
6
|
-
attr_reader :dataset_record_class_name
|
7
|
-
|
8
|
-
# Set the dataset record class for the eval
|
9
|
-
# @param class_name [String] The name of the dataset record class
|
10
|
-
def leva_dataset_record_class(class_name)
|
11
|
-
@dataset_record_class_name = class_name
|
12
|
-
end
|
13
|
-
|
14
|
-
# Run the experiment
|
15
|
-
# @param experiment [Leva::Experiment] The experiment to run
|
16
|
-
def run_experiment(experiment)
|
17
|
-
new.run_experiment(experiment)
|
18
|
-
end
|
19
|
-
end
|
20
|
-
|
21
|
-
# Run the experiment
|
22
|
-
# @param experiment [Leva::Experiment] The experiment to run
|
23
|
-
def run_experiment(experiment)
|
24
|
-
@experiment = experiment
|
25
|
-
@experiment.update(status: :running)
|
26
|
-
|
27
|
-
@experiment.dataset.records.each do |record|
|
28
|
-
@record = record
|
29
|
-
unless @record.class_name == self.class.dataset_record_class_name
|
30
|
-
raise ArgumentError, "Record class #{@record.class_name} does not match expected class #{self.class.dataset_record_class_name}"
|
31
|
-
end
|
32
|
-
ExperimentJob.perform_later(self, @record)
|
33
|
-
end
|
34
|
-
|
35
|
-
@experiment.update(status: :completed)
|
36
|
-
rescue StandardError => e
|
37
|
-
@experiment.update(status: :failed)
|
38
|
-
Rails.logger.error "Error in experiment #{@experiment.name}: #{e.message}"
|
39
|
-
end
|
40
|
-
|
41
|
-
# Run the evaluation for a single record
|
42
|
-
# @param record [ActiveRecord::Base] The record to evaluate
|
43
|
-
# @return [Leva::Result] The result of the evaluation
|
44
|
-
def run_each(record)
|
45
|
-
raise NotImplementedError, "Subclasses must implement the 'run_each' method"
|
46
|
-
end
|
47
|
-
|
48
|
-
# Save the result of an evaluation
|
49
|
-
# @param result [Leva::Result] The result of the evaluation
|
50
|
-
def save_result(result)
|
51
|
-
Leva::EvaluationResult.create!(
|
52
|
-
experiment: @experiment,
|
53
|
-
dataset_record: Leva::DatasetRecord.find_by(recordable: @record, dataset: @experiment.dataset),
|
54
|
-
prediction: result.prediction,
|
55
|
-
score: result.score,
|
56
|
-
label: result.label
|
57
|
-
)
|
58
|
-
end
|
59
|
-
end
|
60
|
-
|
61
|
-
# Represents the result of an evaluation
|
62
|
-
class Result
|
63
|
-
attr_reader :label, :prediction, :score
|
64
|
-
|
65
|
-
# Initialize a new Result
|
66
|
-
# @param label [String] The label for the result
|
67
|
-
# @param prediction [String] The prediction made by the evaluation
|
68
|
-
# @param score [Float] The score of the evaluation (0.0 to 1.0)
|
69
|
-
def initialize(label:, prediction:, score:)
|
70
|
-
@label = label
|
71
|
-
@prediction = prediction
|
72
|
-
@score = score
|
73
|
-
end
|
74
|
-
end
|
75
|
-
end
|