leva 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +55 -16
- data/app/controllers/leva/dataset_records_controller.rb +21 -0
- data/app/controllers/leva/datasets_controller.rb +9 -2
- data/app/controllers/leva/experiments_controller.rb +34 -9
- data/app/controllers/leva/runner_results_controller.rb +8 -0
- data/app/controllers/leva/workbench_controller.rb +85 -12
- data/app/helpers/leva/application_helper.rb +39 -0
- data/app/javascript/controllers/prompt_form_controller.js +45 -0
- data/app/javascript/controllers/prompt_selector_controller.js +31 -0
- data/app/jobs/leva/experiment_job.rb +9 -4
- data/app/jobs/leva/run_eval_job.rb +40 -0
- data/app/models/concerns/leva/recordable.rb +37 -0
- data/app/models/leva/dataset.rb +15 -6
- data/app/models/leva/dataset_record.rb +40 -1
- data/app/models/leva/evaluation_result.rb +15 -7
- data/app/models/leva/experiment.rb +24 -12
- data/app/models/leva/prompt.rb +14 -1
- data/app/models/leva/runner_result.rb +56 -0
- data/app/views/layouts/leva/application.html.erb +24 -13
- data/app/views/leva/dataset_records/index.html.erb +49 -0
- data/app/views/leva/dataset_records/show.html.erb +30 -0
- data/app/views/leva/datasets/_dataset.html.erb +18 -0
- data/app/views/leva/datasets/_form.html.erb +24 -0
- data/app/views/leva/datasets/edit.html.erb +5 -0
- data/app/views/leva/datasets/index.html.erb +51 -38
- data/app/views/leva/datasets/new.html.erb +5 -0
- data/app/views/leva/datasets/show.html.erb +160 -8
- data/app/views/leva/experiments/_experiment.html.erb +42 -0
- data/app/views/leva/experiments/_form.html.erb +49 -0
- data/app/views/leva/experiments/edit.html.erb +5 -0
- data/app/views/leva/experiments/index.html.erb +53 -37
- data/app/views/leva/experiments/new.html.erb +5 -0
- data/app/views/leva/experiments/show.html.erb +115 -19
- data/app/views/leva/runner_results/show.html.erb +64 -0
- data/app/views/leva/workbench/_evaluation_area.html.erb +5 -0
- data/app/views/leva/workbench/_prompt_content.html.erb +216 -0
- data/app/views/leva/workbench/_prompt_form.html.erb +89 -0
- data/app/views/leva/workbench/_prompt_sidebar.html.erb +21 -0
- data/app/views/leva/workbench/_results_section.html.erb +159 -0
- data/app/views/leva/workbench/_top_bar.html.erb +10 -0
- data/app/views/leva/workbench/edit.html.erb +20 -0
- data/app/views/leva/workbench/index.html.erb +5 -91
- data/app/views/leva/workbench/new.html.erb +79 -36
- data/config/routes.rb +15 -6
- data/db/migrate/20240813172916_create_leva_datasets.rb +1 -0
- data/db/migrate/20240813173035_create_leva_experiments.rb +1 -0
- data/db/migrate/20240816201419_create_leva_runner_results.rb +11 -0
- data/db/migrate/20240816201433_update_leva_evaluation_results.rb +8 -0
- data/db/migrate/20240821163608_make_experiment_optional_for_runner_results.rb +6 -0
- data/db/migrate/20240821181934_add_prompt_to_leva_runner_results.rb +5 -0
- data/db/migrate/20240821183153_add_runner_and_evaluator_to_leva_experiments.rb +6 -0
- data/db/migrate/20240821191713_add_actual_result_to_leva_dataset_records.rb +5 -0
- data/db/migrate/20240822143201_remove_actual_result_from_leva_runner_results.rb +5 -0
- data/db/migrate/20240912183556_add_runner_class_to_leva_runner_results.rb +5 -0
- data/lib/generators/leva/templates/eval.rb.erb +7 -8
- data/lib/generators/leva/templates/runner.rb.erb +25 -0
- data/lib/leva/version.rb +1 -1
- data/lib/leva.rb +84 -44
- metadata +49 -5
- data/app/evals/test_sentiment_accuracy_eval.rb +0 -6
- data/app/runners/test_sentiment_run.rb +0 -13
- data/lib/leva/base_eval.rb +0 -75
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0b12a341adbc24fa5c29fe3bf9961345fe02f66fb31089908a61ccf9d4061dff
|
4
|
+
data.tar.gz: 3e74533f1417a0f88aeab8f7942af4df7a6ad0eca6af89c1e29a1cd178cb3121
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ad517366dd537adda1d937252e494215d36164f8461dbd818794d746bfb84838e7fa6889e29fa1cd873fdca7992f545832718acb0346ee2cd2dee80bb55ee9df
|
7
|
+
data.tar.gz: 5bf3f499fb42c99c898df934e7c188c85f2d10cb0ad77552b5d7a349552b9f614bc742be52648570ca13da9858e7144ce2dae39196d9a083b8aaac1f6c266b59
|
data/README.md
CHANGED
@@ -2,6 +2,9 @@
|
|
2
2
|
|
3
3
|
Leva is a Ruby on Rails framework for evaluating Language Models (LLMs) using ActiveRecord datasets on production models. It provides a flexible structure for creating experiments, managing datasets, and implementing various evaluation logic on production data with security in mind.
|
4
4
|
|
5
|
+

|
6
|
+

|
7
|
+
|
5
8
|
## Installation
|
6
9
|
|
7
10
|
Add this line to your application's Gemfile:
|
@@ -27,14 +30,46 @@ rails db:migrate
|
|
27
30
|
|
28
31
|
### 1. Setting up Datasets
|
29
32
|
|
30
|
-
First, create a dataset and add any ActiveRecord records you want to evaluate against:
|
33
|
+
First, create a dataset and add any ActiveRecord records you want to evaluate against. To make your models compatible with Leva, include the `Leva::Recordable` concern in your model:
|
31
34
|
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
35
|
+
````ruby
|
36
|
+
class TextContent < ApplicationRecord
|
37
|
+
include Leva::Recordable
|
38
|
+
|
39
|
+
# @return [String] The ground truth label for the record
|
40
|
+
def ground_truth
|
41
|
+
expected_label
|
42
|
+
end
|
43
|
+
|
44
|
+
# @return [Hash] A hash of attributes to be displayed in the dataset records index
|
45
|
+
def index_attributes
|
46
|
+
{
|
47
|
+
text: text,
|
48
|
+
expected_label: expected_label,
|
49
|
+
created_at: created_at.strftime('%Y-%m-%d %H:%M:%S')
|
50
|
+
}
|
51
|
+
end
|
52
|
+
|
53
|
+
# @return [Hash] A hash of attributes to be displayed in the dataset record show view
|
54
|
+
def show_attributes
|
55
|
+
{
|
56
|
+
text: text,
|
57
|
+
expected_label: expected_label,
|
58
|
+
created_at: created_at.strftime('%Y-%m-%d %H:%M:%S')
|
59
|
+
}
|
60
|
+
end
|
61
|
+
|
62
|
+
# @return [Hash] A hash of attributes to be displayed in the dataset record show view
|
63
|
+
def to_llm_context
|
64
|
+
{
|
65
|
+
text: text,
|
66
|
+
expected_label: expected_label,
|
67
|
+
created_at: created_at.strftime('%Y-%m-%d %H:%M:%S')
|
68
|
+
}
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
dataset = Leva::Dataset.create(name: "Sentiment Analysis Dataset") dataset.add_record TextContent.create(text: "I love this product!", expected_label: "Positive") dataset.add_record TextContent.create(text: "Terrible experience", expected_label: "Negative") dataset.add_record TextContent.create(text: "It's ok", expected_label: "Neutral")
|
38
73
|
|
39
74
|
### 2. Implementing Runs
|
40
75
|
|
@@ -42,7 +77,7 @@ Create a run class to handle the execution of your inference logic:
|
|
42
77
|
|
43
78
|
```bash
|
44
79
|
rails generate leva:runner sentiment
|
45
|
-
|
80
|
+
````
|
46
81
|
|
47
82
|
```ruby
|
48
83
|
class SentimentRun < Leva::BaseRun
|
@@ -64,17 +99,17 @@ rails generate leva:eval sentiment_accuracy
|
|
64
99
|
|
65
100
|
```ruby
|
66
101
|
class SentimentAccuracyEval < Leva::BaseEval
|
67
|
-
def evaluate(prediction,
|
68
|
-
score = prediction ==
|
69
|
-
|
102
|
+
def evaluate(prediction, record)
|
103
|
+
score = prediction == record.expected_label ? 1.0 : 0.0
|
104
|
+
[score, record.expected_label]
|
70
105
|
end
|
71
106
|
end
|
72
107
|
|
73
108
|
class SentimentF1Eval < Leva::BaseEval
|
74
|
-
def evaluate(prediction,
|
109
|
+
def evaluate(prediction, record)
|
75
110
|
# Calculate F1 score
|
76
111
|
# ...
|
77
|
-
|
112
|
+
[f1_score, record.f1_score]
|
78
113
|
end
|
79
114
|
end
|
80
115
|
```
|
@@ -122,9 +157,9 @@ Leva.run_evaluation(experiment: experiment, run: run, evals: evals)
|
|
122
157
|
After the experiments are complete, analyze the results:
|
123
158
|
|
124
159
|
```ruby
|
125
|
-
experiment.evaluation_results.group_by(&:
|
160
|
+
experiment.evaluation_results.group_by(&:evaluator_class).each do |evaluator_class, results|
|
126
161
|
average_score = results.average(&:score)
|
127
|
-
puts "#{
|
162
|
+
puts "#{evaluator_class.capitalize} Average Score: #{average_score}"
|
128
163
|
end
|
129
164
|
```
|
130
165
|
|
@@ -139,13 +174,13 @@ Ensure you set up any required API keys or other configurations in your Rails cr
|
|
139
174
|
- `Leva`: Handles the process of running experiments.
|
140
175
|
- `Leva::BaseRun`: Base class for run implementations.
|
141
176
|
- `Leva::BaseEval`: Base class for eval implementations.
|
142
|
-
- `Leva::Result`: Represents the result of an evaluation.
|
143
177
|
|
144
178
|
### Models
|
145
179
|
|
146
180
|
- `Leva::Dataset`: Represents a collection of data to be evaluated.
|
147
181
|
- `Leva::DatasetRecord`: Represents individual records within a dataset.
|
148
182
|
- `Leva::Experiment`: Represents a single run of an evaluation on a dataset.
|
183
|
+
- `Leva::RunnerResult`: Stores the results of each run execution.
|
149
184
|
- `Leva::EvaluationResult`: Stores the results of each evaluation.
|
150
185
|
- `Leva::Prompt`: Represents a prompt for an LLM.
|
151
186
|
|
@@ -156,3 +191,7 @@ Bug reports and pull requests are welcome on GitHub at https://github.com/kieran
|
|
156
191
|
## License
|
157
192
|
|
158
193
|
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
194
|
+
|
195
|
+
## Roadmap
|
196
|
+
|
197
|
+
- [ ] Parallelize evaluation
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module Leva
|
2
|
+
class DatasetRecordsController < ApplicationController
|
3
|
+
before_action :set_dataset
|
4
|
+
|
5
|
+
# GET /datasets/:dataset_id/records
|
6
|
+
def index
|
7
|
+
@records = @dataset.dataset_records.includes(:recordable)
|
8
|
+
end
|
9
|
+
|
10
|
+
# GET /datasets/:dataset_id/records/:id
|
11
|
+
def show
|
12
|
+
@record = @dataset.dataset_records.find(params[:id])
|
13
|
+
end
|
14
|
+
|
15
|
+
private
|
16
|
+
|
17
|
+
def set_dataset
|
18
|
+
@dataset = Dataset.find(params[:dataset_id])
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
@@ -13,6 +13,8 @@ module Leva
|
|
13
13
|
# GET /datasets/1
|
14
14
|
# @return [void]
|
15
15
|
def show
|
16
|
+
@experiments = @dataset.experiments
|
17
|
+
@new_experiment = Experiment.new(dataset: @dataset)
|
16
18
|
end
|
17
19
|
|
18
20
|
# GET /datasets/new
|
@@ -24,6 +26,7 @@ module Leva
|
|
24
26
|
# GET /datasets/1/edit
|
25
27
|
# @return [void]
|
26
28
|
def edit
|
29
|
+
# The @dataset is already set by the before_action
|
27
30
|
end
|
28
31
|
|
29
32
|
# POST /datasets
|
@@ -51,8 +54,12 @@ module Leva
|
|
51
54
|
# DELETE /datasets/1
|
52
55
|
# @return [void]
|
53
56
|
def destroy
|
54
|
-
@dataset.
|
55
|
-
|
57
|
+
if @dataset.dataset_records.any?
|
58
|
+
redirect_to @dataset, alert: 'Cannot delete dataset with existing records.'
|
59
|
+
else
|
60
|
+
@dataset.destroy
|
61
|
+
redirect_to datasets_url, notice: 'Dataset was successfully destroyed.'
|
62
|
+
end
|
56
63
|
end
|
57
64
|
|
58
65
|
private
|
@@ -2,7 +2,11 @@
|
|
2
2
|
|
3
3
|
module Leva
|
4
4
|
class ExperimentsController < ApplicationController
|
5
|
-
|
5
|
+
include ApplicationHelper
|
6
|
+
|
7
|
+
before_action :set_experiment, only: [:show, :edit, :update]
|
8
|
+
before_action :check_editable, only: [:edit, :update]
|
9
|
+
before_action :load_runners_and_evaluators, only: [:new, :edit, :create, :update]
|
6
10
|
|
7
11
|
# GET /experiments
|
8
12
|
# @return [void]
|
@@ -13,17 +17,19 @@ module Leva
|
|
13
17
|
# GET /experiments/1
|
14
18
|
# @return [void]
|
15
19
|
def show
|
20
|
+
@experiment = Experiment.includes(runner_results: :evaluation_results).find(params[:id])
|
16
21
|
end
|
17
22
|
|
18
23
|
# GET /experiments/new
|
19
24
|
# @return [void]
|
20
25
|
def new
|
21
|
-
@experiment = Experiment.new
|
26
|
+
@experiment = Experiment.new(dataset_id: params[:dataset_id])
|
22
27
|
end
|
23
28
|
|
24
29
|
# GET /experiments/1/edit
|
25
30
|
# @return [void]
|
26
31
|
def edit
|
32
|
+
# The @experiment is already set by the before_action
|
27
33
|
end
|
28
34
|
|
29
35
|
# POST /experiments
|
@@ -32,8 +38,8 @@ module Leva
|
|
32
38
|
@experiment = Experiment.new(experiment_params)
|
33
39
|
|
34
40
|
if @experiment.save
|
35
|
-
ExperimentJob.perform_later(@experiment)
|
36
|
-
redirect_to @experiment, notice: 'Experiment was successfully created.'
|
41
|
+
ExperimentJob.perform_later(@experiment) unless @experiment.completed?
|
42
|
+
redirect_to @experiment, notice: 'Experiment was successfully created and is now running.'
|
37
43
|
else
|
38
44
|
render :new
|
39
45
|
end
|
@@ -49,11 +55,21 @@ module Leva
|
|
49
55
|
end
|
50
56
|
end
|
51
57
|
|
52
|
-
#
|
58
|
+
# POST /experiments/1/rerun
|
53
59
|
# @return [void]
|
54
|
-
def
|
55
|
-
@experiment.
|
56
|
-
|
60
|
+
def rerun
|
61
|
+
@experiment = Experiment.find(params[:id])
|
62
|
+
|
63
|
+
# Delete existing runner results and evaluation results
|
64
|
+
@experiment.runner_results.destroy_all
|
65
|
+
|
66
|
+
# Reset experiment status to pending
|
67
|
+
@experiment.update(status: :pending)
|
68
|
+
|
69
|
+
# Queue the job again
|
70
|
+
ExperimentJob.perform_later(@experiment)
|
71
|
+
|
72
|
+
redirect_to @experiment, notice: 'Experiment has been reset and is now running again.'
|
57
73
|
end
|
58
74
|
|
59
75
|
private
|
@@ -67,7 +83,16 @@ module Leva
|
|
67
83
|
# Only allow a list of trusted parameters through.
|
68
84
|
# @return [ActionController::Parameters]
|
69
85
|
def experiment_params
|
70
|
-
params.require(:experiment).permit(:name, :description, :dataset_id)
|
86
|
+
params.require(:experiment).permit(:name, :description, :dataset_id, :prompt_id, :runner_class, evaluator_classes: [])
|
87
|
+
end
|
88
|
+
|
89
|
+
def load_runners_and_evaluators
|
90
|
+
@runners = load_runners
|
91
|
+
@evaluators = load_evaluators
|
92
|
+
end
|
93
|
+
|
94
|
+
def check_editable
|
95
|
+
redirect_to @experiment, alert: 'Completed experiments cannot be edited.' if @experiment.completed?
|
71
96
|
end
|
72
97
|
end
|
73
98
|
end
|
@@ -2,39 +2,112 @@
|
|
2
2
|
|
3
3
|
module Leva
|
4
4
|
class WorkbenchController < ApplicationController
|
5
|
+
include ApplicationHelper
|
6
|
+
|
7
|
+
before_action :set_prompt, only: [:index, :edit, :update, :run, :run_all_evals, :run_evaluator]
|
8
|
+
before_action :set_dataset_record, only: [:index, :run, :run_all_evals, :run_evaluator]
|
9
|
+
before_action :set_runner_result, only: [:index, :run_all_evals, :run_evaluator]
|
10
|
+
|
5
11
|
# GET /workbench
|
6
12
|
# @return [void]
|
7
13
|
def index
|
8
14
|
@prompts = Prompt.all
|
9
|
-
@selected_prompt =
|
10
|
-
@evaluators =
|
15
|
+
@selected_prompt = @prompt || Prompt.first
|
16
|
+
@evaluators = load_evaluators
|
17
|
+
@runners = load_runners
|
18
|
+
@selected_runner = params[:runner] || @runners.first&.name
|
19
|
+
@selected_dataset_record = params[:dataset_record_id] || DatasetRecord.first&.id
|
11
20
|
end
|
12
21
|
|
13
22
|
# GET /workbench/new
|
14
23
|
# @return [void]
|
15
24
|
def new
|
16
|
-
@
|
25
|
+
@prompt = Prompt.new
|
26
|
+
@predefined_prompts = load_predefined_prompts
|
27
|
+
end
|
28
|
+
|
29
|
+
# POST /workbench
|
30
|
+
# @return [void]
|
31
|
+
def create
|
32
|
+
@prompt = Prompt.new(prompt_params)
|
33
|
+
if @prompt.save
|
34
|
+
redirect_to workbench_index_path(prompt_id: @prompt.id), notice: 'Prompt was successfully created.'
|
35
|
+
else
|
36
|
+
render :new
|
37
|
+
end
|
17
38
|
end
|
18
39
|
|
19
40
|
# GET /workbench/1
|
20
41
|
# @return [void]
|
21
|
-
def
|
22
|
-
|
42
|
+
def edit
|
43
|
+
end
|
44
|
+
|
45
|
+
# PATCH/PUT /workbench/1
|
46
|
+
# @return [void]
|
47
|
+
def update
|
48
|
+
@prompt = Prompt.find(params[:id])
|
49
|
+
if @prompt.update(prompt_params)
|
50
|
+
render json: { status: 'success', message: 'Prompt updated successfully' }
|
51
|
+
else
|
52
|
+
render json: { status: 'error', errors: @prompt.errors.full_messages }, status: :unprocessable_entity
|
53
|
+
end
|
23
54
|
end
|
24
55
|
|
25
56
|
def run
|
26
|
-
|
27
|
-
|
57
|
+
return redirect_to workbench_index_path, alert: 'Please select a record and a runner' unless @dataset_record && run_params[:runner]
|
58
|
+
|
59
|
+
runner_class = run_params[:runner].constantize
|
60
|
+
return redirect_to workbench_index_path, alert: 'Invalid runner selected' unless runner_class < Leva::BaseRun
|
61
|
+
|
62
|
+
runner = runner_class.new
|
63
|
+
runner_result = runner.execute_and_store(nil, @dataset_record, @prompt)
|
64
|
+
|
65
|
+
redirect_to workbench_index_path(prompt_id: @prompt.id, dataset_record_id: @dataset_record.id, runner: run_params[:runner]), notice: 'Run completed successfully'
|
28
66
|
end
|
29
67
|
|
30
|
-
def
|
31
|
-
|
32
|
-
|
68
|
+
def run_all_evals
|
69
|
+
return redirect_to workbench_index_path, alert: 'No runner result available' unless @runner_result
|
70
|
+
|
71
|
+
load_evaluators.each do |evaluator_class|
|
72
|
+
evaluator = evaluator_class.new
|
73
|
+
evaluator.evaluate_and_store(nil, @runner_result)
|
74
|
+
end
|
75
|
+
|
76
|
+
redirect_to workbench_index_path(prompt_id: @prompt.id, dataset_record_id: @dataset_record.id, runner: params[:runner]), notice: 'All evaluations completed successfully'
|
33
77
|
end
|
34
78
|
|
35
79
|
def run_evaluator
|
36
|
-
|
37
|
-
|
80
|
+
return redirect_to workbench_index_path, alert: 'No runner result available' unless @runner_result
|
81
|
+
|
82
|
+
evaluator_class = params[:evaluator].constantize
|
83
|
+
return redirect_to workbench_index_path, alert: 'Invalid evaluator selected' unless evaluator_class < Leva::BaseEval
|
84
|
+
|
85
|
+
evaluator = evaluator_class.new
|
86
|
+
evaluator.evaluate_and_store(nil, @runner_result)
|
87
|
+
|
88
|
+
redirect_to workbench_index_path(prompt_id: @prompt.id, dataset_record_id: @dataset_record.id, runner: params[:runner]), notice: 'Evaluator run successfully'
|
89
|
+
end
|
90
|
+
|
91
|
+
private
|
92
|
+
|
93
|
+
def set_prompt
|
94
|
+
@prompt = params[:prompt_id] ? Prompt.find(params[:prompt_id]) : Prompt.first
|
95
|
+
end
|
96
|
+
|
97
|
+
def prompt_params
|
98
|
+
params.require(:prompt).permit(:name, :system_prompt, :user_prompt, :version)
|
99
|
+
end
|
100
|
+
|
101
|
+
def set_dataset_record
|
102
|
+
@dataset_record = DatasetRecord.find_by(id: params[:dataset_record_id]) || DatasetRecord.first
|
103
|
+
end
|
104
|
+
|
105
|
+
def run_params
|
106
|
+
params.permit(:runner, :prompt_id, :dataset_record_id)
|
107
|
+
end
|
108
|
+
|
109
|
+
def set_runner_result
|
110
|
+
@runner_result = @dataset_record.runner_results.last if @dataset_record
|
38
111
|
end
|
39
112
|
end
|
40
113
|
end
|
@@ -1,4 +1,43 @@
|
|
1
1
|
module Leva
|
2
2
|
module ApplicationHelper
|
3
|
+
# Loads all evaluator classes that inherit from Leva::BaseEval
|
4
|
+
#
|
5
|
+
# @return [Array<Class>] An array of evaluator classes
|
6
|
+
def load_evaluators
|
7
|
+
load_classes_from_directory('app/evals', Leva::BaseEval) || []
|
8
|
+
end
|
9
|
+
|
10
|
+
# Loads all runner classes that inherit from Leva::BaseRun
|
11
|
+
#
|
12
|
+
# @return [Array<Class>] An array of runner classes
|
13
|
+
def load_runners
|
14
|
+
load_classes_from_directory('app/runners', Leva::BaseRun) || []
|
15
|
+
end
|
16
|
+
|
17
|
+
# Loads predefined prompts from markdown files
|
18
|
+
#
|
19
|
+
# @return [Array<Array<String, String>>] An array of prompt name and content pairs
|
20
|
+
def load_predefined_prompts
|
21
|
+
prompts = Dir.glob(Rails.root.join('app', 'prompts', '*.md')).map do |file|
|
22
|
+
name = File.basename(file, '.md').titleize
|
23
|
+
content = File.read(file)
|
24
|
+
[name, content]
|
25
|
+
end
|
26
|
+
prompts
|
27
|
+
end
|
28
|
+
|
29
|
+
private
|
30
|
+
|
31
|
+
# Loads classes from a specified directory that inherit from a given base class
|
32
|
+
#
|
33
|
+
# @param directory [String] The directory path to load classes from
|
34
|
+
# @param base_class [Class] The base class that loaded classes should inherit from
|
35
|
+
# @return [Array<Class>] An array of loaded classes
|
36
|
+
def load_classes_from_directory(directory, base_class)
|
37
|
+
classes = Dir[Rails.root.join(directory, '*.rb')].map do |file|
|
38
|
+
File.basename(file, '.rb').camelize.constantize
|
39
|
+
end.select { |klass| klass < base_class }
|
40
|
+
classes.empty? ? [] : classes
|
41
|
+
end
|
3
42
|
end
|
4
43
|
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
import { Controller } from "@hotwired/stimulus";
|
2
|
+
|
3
|
+
export default class extends Controller {
|
4
|
+
static targets = ["form"];
|
5
|
+
|
6
|
+
autoSave() {
|
7
|
+
clearTimeout(this.timeout);
|
8
|
+
this.timeout = setTimeout(() => {
|
9
|
+
this.submitForm();
|
10
|
+
}, 500);
|
11
|
+
}
|
12
|
+
|
13
|
+
submitForm() {
|
14
|
+
const form = this.element;
|
15
|
+
const formData = new FormData(form);
|
16
|
+
|
17
|
+
fetch(form.action, {
|
18
|
+
method: form.method,
|
19
|
+
body: formData,
|
20
|
+
headers: {
|
21
|
+
Accept: "application/json",
|
22
|
+
"X-CSRF-Token": document.querySelector('meta[name="csrf-token"]').content,
|
23
|
+
},
|
24
|
+
})
|
25
|
+
.then((response) => response.json())
|
26
|
+
.then((data) => {
|
27
|
+
const statusElement = document.getElementById("form-status");
|
28
|
+
if (data.status === "success") {
|
29
|
+
statusElement.textContent = "Changes saved successfully";
|
30
|
+
statusElement.classList.add("text-green-500");
|
31
|
+
statusElement.classList.remove("text-red-500");
|
32
|
+
} else {
|
33
|
+
statusElement.textContent = `Error: ${data.errors.join(", ")}`;
|
34
|
+
statusElement.classList.add("text-red-500");
|
35
|
+
statusElement.classList.remove("text-green-500");
|
36
|
+
}
|
37
|
+
setTimeout(() => {
|
38
|
+
statusElement.textContent = "";
|
39
|
+
}, 3000);
|
40
|
+
})
|
41
|
+
.catch((error) => {
|
42
|
+
console.error("Error:", error);
|
43
|
+
});
|
44
|
+
}
|
45
|
+
}
|
@@ -0,0 +1,31 @@
|
|
1
|
+
import { Controller } from "@hotwired/stimulus";
|
2
|
+
|
3
|
+
export default class extends Controller {
|
4
|
+
static targets = ["userPromptField"];
|
5
|
+
|
6
|
+
toggleUserPrompt(event) {
|
7
|
+
const selectedFile = event.target.value;
|
8
|
+
if (selectedFile) {
|
9
|
+
this.userPromptFieldTarget.style.display = "none";
|
10
|
+
this.loadPredefinedPrompt(selectedFile);
|
11
|
+
} else {
|
12
|
+
this.userPromptFieldTarget.style.display = "block";
|
13
|
+
this.clearUserPrompt();
|
14
|
+
}
|
15
|
+
}
|
16
|
+
|
17
|
+
loadPredefinedPrompt(file) {
|
18
|
+
fetch(file)
|
19
|
+
.then((response) => response.text())
|
20
|
+
.then((content) => {
|
21
|
+
const userPromptTextarea = this.userPromptFieldTarget.querySelector("textarea");
|
22
|
+
userPromptTextarea.value = content;
|
23
|
+
})
|
24
|
+
.catch((error) => console.error("Error loading predefined prompt:", error));
|
25
|
+
}
|
26
|
+
|
27
|
+
clearUserPrompt() {
|
28
|
+
const userPromptTextarea = this.userPromptFieldTarget.querySelector("textarea");
|
29
|
+
userPromptTextarea.value = "";
|
30
|
+
}
|
31
|
+
}
|
@@ -4,13 +4,18 @@ module Leva
|
|
4
4
|
class ExperimentJob < ApplicationJob
|
5
5
|
queue_as :default
|
6
6
|
|
7
|
-
# Perform the experiment
|
7
|
+
# Perform the experiment by scheduling all dataset records for evaluation
|
8
8
|
#
|
9
9
|
# @param experiment [Experiment] The experiment to run
|
10
10
|
# @return [void]
|
11
|
-
def perform(
|
12
|
-
|
13
|
-
|
11
|
+
def perform(experiment)
|
12
|
+
return if experiment.completed? || experiment.running?
|
13
|
+
|
14
|
+
experiment.update!(status: :running)
|
15
|
+
|
16
|
+
experiment.dataset.dataset_records.each_with_index do |record, index|
|
17
|
+
RunEvalJob.set(wait: 3.seconds * index).perform_later(experiment.id, record.id)
|
18
|
+
end
|
14
19
|
end
|
15
20
|
end
|
16
21
|
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Leva
|
4
|
+
class RunEvalJob < ApplicationJob
|
5
|
+
queue_as :default
|
6
|
+
|
7
|
+
# Perform a single run and evaluation for a dataset record
|
8
|
+
#
|
9
|
+
# @param experiment_id [Integer] The ID of the experiment
|
10
|
+
# @param dataset_record_id [Integer] The ID of the dataset record
|
11
|
+
# @return [void]
|
12
|
+
def perform(experiment_id, dataset_record_id)
|
13
|
+
experiment = Experiment.find(experiment_id)
|
14
|
+
dataset_record = DatasetRecord.find(dataset_record_id)
|
15
|
+
|
16
|
+
run = constantize_class(experiment.runner_class).new
|
17
|
+
evals = experiment.evaluator_classes.compact.reject(&:empty?).map { |klass| constantize_class(klass).new }
|
18
|
+
|
19
|
+
Leva.run_single_evaluation(experiment: experiment, run: run, evals: evals, dataset_record: dataset_record)
|
20
|
+
|
21
|
+
experiment.update!(status: :completed) if is_last(experiment)
|
22
|
+
end
|
23
|
+
|
24
|
+
private
|
25
|
+
|
26
|
+
def constantize_class(class_name)
|
27
|
+
class_name.constantize
|
28
|
+
rescue NameError => e
|
29
|
+
raise NameError, "Invalid class name: #{class_name}. Error: #{e.message}"
|
30
|
+
end
|
31
|
+
|
32
|
+
# Check if all dataset records for the experiment have a runner result
|
33
|
+
#
|
34
|
+
# @param experiment [Experiment] The experiment to check
|
35
|
+
# @return [Boolean] True if all dataset records have a runner result, false otherwise
|
36
|
+
def is_last(experiment)
|
37
|
+
experiment.dataset.dataset_records.count == experiment.runner_results.count
|
38
|
+
end
|
39
|
+
end
|
40
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module Leva
|
2
|
+
module Recordable
|
3
|
+
extend ActiveSupport::Concern
|
4
|
+
|
5
|
+
included do
|
6
|
+
has_many :dataset_records, as: :recordable, class_name: 'Leva::DatasetRecord', dependent: :destroy
|
7
|
+
has_many :datasets, through: :dataset_records, class_name: 'Leva::Dataset'
|
8
|
+
has_many :runner_results, through: :dataset_records, class_name: 'Leva::RunnerResult'
|
9
|
+
has_many :evaluation_results, through: :runner_results, class_name: 'Leva::EvaluationResult'
|
10
|
+
end
|
11
|
+
|
12
|
+
# @return [String] The ground truth label for the record
|
13
|
+
def ground_truth
|
14
|
+
raise NotImplementedError, "#{self.class} must implement #ground_truth"
|
15
|
+
end
|
16
|
+
|
17
|
+
# @return [Hash] A hash of attributes to be displayed in the dataset records index
|
18
|
+
def index_attributes
|
19
|
+
raise NotImplementedError, "#{self.class} must implement #index_attributes"
|
20
|
+
end
|
21
|
+
|
22
|
+
# @return [Hash] A hash of attributes to be displayed in the dataset record show view
|
23
|
+
def show_attributes
|
24
|
+
raise NotImplementedError, "#{self.class} must implement #show_attributes"
|
25
|
+
end
|
26
|
+
|
27
|
+
# @return [Hash] A hash of attributes to be liquified for LLM context
|
28
|
+
def to_llm_context
|
29
|
+
raise NotImplementedError, "#{self.class} must implement #to_llm_context"
|
30
|
+
end
|
31
|
+
|
32
|
+
# @return [Regexp] A regex pattern to extract the contents of a LLM response
|
33
|
+
def extract_regex_pattern
|
34
|
+
false
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
data/app/models/leva/dataset.rb
CHANGED
@@ -2,18 +2,27 @@
|
|
2
2
|
#
|
3
3
|
# Table name: leva_datasets
|
4
4
|
#
|
5
|
-
# id
|
6
|
-
#
|
7
|
-
#
|
8
|
-
#
|
5
|
+
# id :integer not null, primary key
|
6
|
+
# description :text
|
7
|
+
# name :string
|
8
|
+
# created_at :datetime not null
|
9
|
+
# updated_at :datetime not null
|
9
10
|
#
|
10
11
|
module Leva
|
11
12
|
class Dataset < ApplicationRecord
|
12
13
|
has_many :dataset_records, dependent: :destroy
|
13
14
|
has_many :experiments, dependent: :destroy
|
14
15
|
|
16
|
+
validates :name, presence: true
|
17
|
+
|
18
|
+
# Adds a record to the dataset if it doesn't already exist
|
19
|
+
#
|
20
|
+
# @param record [ActiveRecord::Base] The record to be added to the dataset
|
21
|
+
# @return [Leva::DatasetRecord, nil] The created dataset record or nil if it already exists
|
15
22
|
def add_record(record)
|
16
|
-
dataset_records.
|
23
|
+
dataset_records.find_or_create_by(recordable: record) do |dr|
|
24
|
+
dr.recordable = record
|
25
|
+
end
|
17
26
|
end
|
18
27
|
end
|
19
|
-
end
|
28
|
+
end
|