leva 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. checksums.yaml +4 -4
  2. data/README.md +55 -16
  3. data/app/controllers/leva/dataset_records_controller.rb +21 -0
  4. data/app/controllers/leva/datasets_controller.rb +9 -2
  5. data/app/controllers/leva/experiments_controller.rb +34 -9
  6. data/app/controllers/leva/runner_results_controller.rb +8 -0
  7. data/app/controllers/leva/workbench_controller.rb +85 -12
  8. data/app/helpers/leva/application_helper.rb +39 -0
  9. data/app/javascript/controllers/prompt_form_controller.js +45 -0
  10. data/app/javascript/controllers/prompt_selector_controller.js +31 -0
  11. data/app/jobs/leva/experiment_job.rb +9 -4
  12. data/app/jobs/leva/run_eval_job.rb +40 -0
  13. data/app/models/concerns/leva/recordable.rb +37 -0
  14. data/app/models/leva/dataset.rb +15 -6
  15. data/app/models/leva/dataset_record.rb +40 -1
  16. data/app/models/leva/evaluation_result.rb +15 -7
  17. data/app/models/leva/experiment.rb +24 -12
  18. data/app/models/leva/prompt.rb +14 -1
  19. data/app/models/leva/runner_result.rb +56 -0
  20. data/app/views/layouts/leva/application.html.erb +24 -13
  21. data/app/views/leva/dataset_records/index.html.erb +49 -0
  22. data/app/views/leva/dataset_records/show.html.erb +30 -0
  23. data/app/views/leva/datasets/_dataset.html.erb +18 -0
  24. data/app/views/leva/datasets/_form.html.erb +24 -0
  25. data/app/views/leva/datasets/edit.html.erb +5 -0
  26. data/app/views/leva/datasets/index.html.erb +51 -38
  27. data/app/views/leva/datasets/new.html.erb +5 -0
  28. data/app/views/leva/datasets/show.html.erb +160 -8
  29. data/app/views/leva/experiments/_experiment.html.erb +42 -0
  30. data/app/views/leva/experiments/_form.html.erb +49 -0
  31. data/app/views/leva/experiments/edit.html.erb +5 -0
  32. data/app/views/leva/experiments/index.html.erb +53 -37
  33. data/app/views/leva/experiments/new.html.erb +5 -0
  34. data/app/views/leva/experiments/show.html.erb +115 -19
  35. data/app/views/leva/runner_results/show.html.erb +64 -0
  36. data/app/views/leva/workbench/_evaluation_area.html.erb +5 -0
  37. data/app/views/leva/workbench/_prompt_content.html.erb +216 -0
  38. data/app/views/leva/workbench/_prompt_form.html.erb +89 -0
  39. data/app/views/leva/workbench/_prompt_sidebar.html.erb +21 -0
  40. data/app/views/leva/workbench/_results_section.html.erb +159 -0
  41. data/app/views/leva/workbench/_top_bar.html.erb +10 -0
  42. data/app/views/leva/workbench/edit.html.erb +20 -0
  43. data/app/views/leva/workbench/index.html.erb +5 -91
  44. data/app/views/leva/workbench/new.html.erb +79 -36
  45. data/config/routes.rb +15 -6
  46. data/db/migrate/20240813172916_create_leva_datasets.rb +1 -0
  47. data/db/migrate/20240813173035_create_leva_experiments.rb +1 -0
  48. data/db/migrate/20240816201419_create_leva_runner_results.rb +11 -0
  49. data/db/migrate/20240816201433_update_leva_evaluation_results.rb +8 -0
  50. data/db/migrate/20240821163608_make_experiment_optional_for_runner_results.rb +6 -0
  51. data/db/migrate/20240821181934_add_prompt_to_leva_runner_results.rb +5 -0
  52. data/db/migrate/20240821183153_add_runner_and_evaluator_to_leva_experiments.rb +6 -0
  53. data/db/migrate/20240821191713_add_actual_result_to_leva_dataset_records.rb +5 -0
  54. data/db/migrate/20240822143201_remove_actual_result_from_leva_runner_results.rb +5 -0
  55. data/db/migrate/20240912183556_add_runner_class_to_leva_runner_results.rb +5 -0
  56. data/lib/generators/leva/templates/eval.rb.erb +7 -8
  57. data/lib/generators/leva/templates/runner.rb.erb +25 -0
  58. data/lib/leva/version.rb +1 -1
  59. data/lib/leva.rb +84 -44
  60. metadata +49 -5
  61. data/app/evals/test_sentiment_accuracy_eval.rb +0 -6
  62. data/app/runners/test_sentiment_run.rb +0 -13
  63. data/lib/leva/base_eval.rb +0 -75
@@ -1,38 +1,81 @@
1
- <% content_for :title, 'New Experiment' %>
2
- <div class="px-4 sm:px-6 lg:px-8">
3
- <div class="sm:flex sm:items-center">
4
- <div class="sm:flex-auto">
5
- <h1 class="text-2xl font-semibold text-gray-900">New Experiment</h1>
6
- <p class="mt-2 text-sm text-gray-700">Create a new experiment in your workbench.</p>
7
- </div>
8
- </div>
9
- <div class="mt-8 max-w-xl">
10
- <%= form_with(model: @experiment, url: experiments_path, local: true, class: "space-y-8 divide-y divide-gray-200") do |form| %>
11
- <div class="space-y-8 divide-y divide-gray-200">
12
- <div>
13
- <div>
14
- <h3 class="text-lg leading-6 font-medium text-gray-900">Experiment Information</h3>
15
- <p class="mt-1 text-sm text-gray-500">Provide details for your new experiment.</p>
16
- </div>
17
- <div class="mt-6 grid grid-cols-1 gap-y-6 gap-x-4 sm:grid-cols-6">
18
- <div class="sm:col-span-4">
19
- <%= form.label :name, class: "block text-sm font-medium text-gray-700" %>
20
- <div class="mt-1">
21
- <%= form.text_field :name, class: "shadow-sm focus:ring-indigo-500 focus:border-indigo-500 block w-full sm:text-sm border-gray-300 rounded-md" %>
22
- </div>
23
- </div>
24
- <div class="sm:col-span-2">
25
- <%= form.label :dataset_id, class: "block text-sm font-medium text-gray-700" %>
26
- <div class="mt-1">
27
- <%= form.select :dataset_id, options_for_select(@datasets.map { |dataset| [dataset.name, dataset.id] }), class: "shadow-sm focus:ring-indigo-500 focus:border-indigo-500 block w-full sm:text-sm border-gray-300 rounded-md" %>
28
- </div>
29
- </div>
30
- </div>
31
- </div>
32
- </div>
33
- <div class="pt-5">
34
- <%= form.submit class: "btn btn-primary btn-block" %>
1
+ <% content_for :title, "New Prompt" %>
2
+ <div class="container mx-auto px-4 py-8 bg-gray-950 text-white">
3
+ <h1 class="text-3xl font-bold text-indigo-400 mb-6">New Prompt</h1>
4
+ <%= form_with(model: @prompt, url: workbench_index_path, local: true, class: "bg-gray-800 rounded-lg shadow-lg p-6", data: { controller: "prompt-selector" }) do |form| %>
5
+ <% if @prompt.errors.any? %>
6
+ <div class="bg-red-900 border border-red-700 text-red-100 px-4 py-3 rounded-lg mb-4">
7
+ <h2><%= pluralize(@prompt.errors.count, "error") %> prohibited this prompt from being saved:</h2>
8
+ <ul>
9
+ <% @prompt.errors.full_messages.each do |message| %>
10
+ <li><%= message %></li>
11
+ <% end %>
12
+ </ul>
35
13
  </div>
36
14
  <% end %>
37
- </div>
38
- </div>
15
+ <div class="mb-4">
16
+ <%= form.label :name, class: "block text-sm font-semibold mb-2 text-indigo-300" %>
17
+ <%= form.text_field :name, autofocus: true, class: "w-full bg-gray-700 text-white p-3 rounded-lg focus:ring-2 focus:ring-indigo-500 focus:outline-none" %>
18
+ </div>
19
+ <div class="mb-4">
20
+ <%= form.label :system_prompt, class: "block text-sm font-semibold mb-2 text-indigo-300" %>
21
+ <%= form.text_area :system_prompt, rows: 2, class: "w-full bg-gray-700 text-white p-3 rounded-lg focus:ring-2 focus:ring-indigo-500 focus:outline-none" %>
22
+ </div>
23
+ <div class="mb-4">
24
+ <%= form.label :predefined_prompt, "Select Predefined Prompt", class: "block text-sm font-semibold mb-2 text-indigo-300" %>
25
+ <%= form.select :predefined_prompt,
26
+ options_for_select([['Custom Prompt', '']] + @predefined_prompts.map { |name, content| [name, content] }),
27
+ {},
28
+ class: "w-full bg-gray-700 text-white p-3 rounded-lg focus:ring-2 focus:ring-indigo-500 focus:outline-none",
29
+ data: { action: "change->prompt-selector#toggleUserPrompt" }
30
+ %>
31
+ </div>
32
+ <div class="mb-4" data-prompt-selector-target="userPromptField">
33
+ <%= form.label :user_prompt, class: "block text-sm font-semibold mb-2 text-indigo-300" %>
34
+ <%= form.text_area :user_prompt, rows: 5, class: "w-full bg-gray-700 text-white p-3 rounded-lg focus:ring-2 focus:ring-indigo-500 focus:outline-none" %>
35
+ </div>
36
+ <div class="mb-4 hidden" data-prompt-selector-target="promptPreview">
37
+ <h3 class="text-lg font-semibold mb-2 text-indigo-300">Prompt Preview</h3>
38
+ <div class="bg-gray-700 text-white p-3 rounded-lg" data-prompt-selector-target="previewContent"></div>
39
+ </div>
40
+ <div class="flex items-center justify-end space-x-4">
41
+ <%= link_to "Cancel", workbench_index_path, class: "px-3 py-2 rounded-md text-sm font-medium text-gray-300 hover:bg-gray-800 hover:text-white transition-colors duration-150 ease-in-out" %>
42
+ <%= form.submit "Create Prompt", class: "px-3 py-2 rounded-md text-sm font-medium bg-indigo-600 text-white shadow-lg hover:bg-indigo-700 transition-colors duration-150 ease-in-out" %>
43
+ </div>
44
+ <% end %>
45
+ </div>
46
+ <script>
47
+ (() => {
48
+ const application = Stimulus.Application.start()
49
+
50
+ application.register("prompt-selector", class extends Stimulus.Controller {
51
+ static targets = ["userPromptField", "promptPreview", "previewContent"]
52
+
53
+ toggleUserPrompt(event) {
54
+ const selectedContent = event.target.value
55
+ if (selectedContent) {
56
+ this.userPromptFieldTarget.style.display = 'none'
57
+ this.promptPreviewTarget.classList.remove('hidden')
58
+ this.loadPredefinedPrompt(selectedContent)
59
+ } else {
60
+ this.userPromptFieldTarget.style.display = 'block'
61
+ this.promptPreviewTarget.classList.add('hidden')
62
+ this.clearUserPrompt()
63
+ }
64
+ }
65
+
66
+ loadPredefinedPrompt(content) {
67
+ const userPromptTextarea = this.userPromptFieldTarget.querySelector('textarea')
68
+ userPromptTextarea.value = content
69
+ this.previewContentTarget.innerHTML = marked.parse(content)
70
+ }
71
+
72
+ clearUserPrompt() {
73
+ const userPromptTextarea = this.userPromptFieldTarget.querySelector('textarea')
74
+ userPromptTextarea.value = ''
75
+ this.previewContentTarget.innerHTML = ''
76
+ }
77
+ })
78
+ })()
79
+ </script>
80
+ <!-- Include marked.js for Markdown parsing -->
81
+ <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
data/config/routes.rb CHANGED
@@ -1,12 +1,21 @@
1
1
  Leva::Engine.routes.draw do
2
2
  root 'workbench#index'
3
3
 
4
- resources :datasets
5
- resources :experiments
4
+ resources :datasets do
5
+ resources :dataset_records, path: 'records', only: [:index, :show]
6
+ end
7
+ resources :experiments, except: [:destroy] do
8
+ member do
9
+ post :rerun
10
+ end
11
+ resources :runner_results, only: [:show]
12
+ end
6
13
  resources :prompts
7
- resources :workbench, only: [:index, :new, :show] do
8
- post 'run', on: :collection
9
- post 'run_with_evaluation', on: :collection
10
- post 'run_evaluator', on: :collection
14
+ resources :workbench, only: [:index, :new, :create, :edit, :update] do
15
+ collection do
16
+ post 'run'
17
+ post 'run_all_evals'
18
+ post 'run_evaluator'
19
+ end
11
20
  end
12
21
  end
@@ -2,6 +2,7 @@ class CreateLevaDatasets < ActiveRecord::Migration[7.2]
2
2
  def change
3
3
  create_table :leva_datasets do |t|
4
4
  t.string :name
5
+ t.text :description
5
6
 
6
7
  t.timestamps
7
8
  end
@@ -2,6 +2,7 @@ class CreateLevaExperiments < ActiveRecord::Migration[7.2]
2
2
  def change
3
3
  create_table :leva_experiments do |t|
4
4
  t.string :name
5
+ t.text :description
5
6
  t.references :dataset, null: false, foreign_key: { to_table: :leva_datasets }
6
7
  t.references :prompt, null: true, foreign_key: { to_table: :leva_prompts }
7
8
  t.integer :status
@@ -0,0 +1,11 @@
1
+ class CreateLevaRunnerResults < ActiveRecord::Migration[7.2]
2
+ def change
3
+ create_table :leva_runner_results do |t|
4
+ t.references :experiment, null: false, foreign_key: { to_table: :leva_experiments }
5
+ t.references :dataset_record, null: false, foreign_key: { to_table: :leva_dataset_records }
6
+ t.text :prediction
7
+
8
+ t.timestamps
9
+ end
10
+ end
11
+ end
@@ -0,0 +1,8 @@
1
+ class UpdateLevaEvaluationResults < ActiveRecord::Migration[7.2]
2
+ def change
3
+ add_reference :leva_evaluation_results, :runner_result, null: false, foreign_key: { to_table: :leva_runner_results }
4
+ add_column :leva_evaluation_results, :evaluator_class, :string, null: false
5
+ remove_column :leva_evaluation_results, :prediction, :string
6
+ remove_column :leva_evaluation_results, :label, :string
7
+ end
8
+ end
@@ -0,0 +1,6 @@
1
+ class MakeExperimentOptionalForRunnerResults < ActiveRecord::Migration[7.2]
2
+ def change
3
+ change_column_null :leva_runner_results, :experiment_id, true
4
+ change_column_null :leva_evaluation_results, :experiment_id, true
5
+ end
6
+ end
@@ -0,0 +1,5 @@
1
+ class AddPromptToLevaRunnerResults < ActiveRecord::Migration[7.2]
2
+ def change
3
+ add_reference :leva_runner_results, :prompt, null: false, foreign_key: { to_table: :leva_prompts }
4
+ end
5
+ end
@@ -0,0 +1,6 @@
1
+ class AddRunnerAndEvaluatorToLevaExperiments < ActiveRecord::Migration[7.2]
2
+ def change
3
+ add_column :leva_experiments, :runner_class, :string
4
+ add_column :leva_experiments, :evaluator_classes, :text
5
+ end
6
+ end
@@ -0,0 +1,5 @@
1
+ class AddActualResultToLevaDatasetRecords < ActiveRecord::Migration[7.2]
2
+ def change
3
+ add_column :leva_runner_results, :actual_result, :text
4
+ end
5
+ end
@@ -0,0 +1,5 @@
1
+ class RemoveActualResultFromLevaRunnerResults < ActiveRecord::Migration[7.2]
2
+ def change
3
+ remove_column :leva_runner_results, :actual_result, :text
4
+ end
5
+ end
@@ -0,0 +1,5 @@
1
+ class AddRunnerClassToLevaRunnerResults < ActiveRecord::Migration[7.2]
2
+ def change
3
+ add_column :leva_runner_results, :runner_class, :string
4
+ end
5
+ end
@@ -1,15 +1,14 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class <%= class_name %>Eval < Leva::BaseEval
4
- # @param prediction [String] The prediction to evaluate
5
- # @param record [YourRecordClass] The record to evaluate
6
- # @return [Leva::Result] The result of the evaluation
7
- def evaluate(prediction, record)
4
+ # @param runner_result [Leva::RunnerResult] The runner result to evaluate
5
+ # @param recordable [YourRecordClass] The recordable object to evaluate
6
+ # @return [Float] The score of the evaluation
7
+ def evaluate(runner_result, recordable)
8
8
  # Implement your evaluation logic here
9
+ # You can access the ground truth using recordable.ground_truth
9
10
 
10
- Leva::Result.new(
11
- label: "<%= file_name.underscore %>",
12
- score: score
13
- )
11
+ # Example implementation:
12
+ runner_result.parsed_predictions.first == recordable.ground_truth ? 1.0 : 0.0
14
13
  end
15
14
  end
@@ -8,4 +8,29 @@ class <%= class_name %>Run < Leva::BaseRun
8
8
  # This could involve calling an API, running a local model, etc.
9
9
  # Return the result of the run to be used to evaluate the model
10
10
  end
11
+
12
+ # Uncomment and modify this method to customize parsed predictions
13
+ # @param runner_result [Leva::RunnerResult] The runner result to parse
14
+ # @return [Array<String>] The parsed predictions
15
+ # def parsed_predictions(runner_result)
16
+ # # Example: Extract predictions from XML-like tags
17
+ # runner_result.prediction.scan(/<prediction>(.*?)<\/prediction>/).flatten
18
+ # end
19
+
20
+ # Uncomment and modify this method to customize ground truth extraction
21
+ # @param runner_result [Leva::RunnerResult] The runner result to get ground truth from
22
+ # @return [String] The ground truth for the runner result
23
+ # def ground_truth(runner_result)
24
+ # # Example: Extract ground truth from a specific field
25
+ # runner_result.dataset_record.recordable.custom_ground_truth_field
26
+ # end
27
+
28
+ # Uncomment and modify this method to customize regex extraction
29
+ # @param runner_result [Leva::RunnerResult] The runner result to extract regex from
30
+ # @return [Regexp, nil] The regex pattern to use for parsing predictions
31
+ # def extract_regex_pattern(runner_result)
32
+ # # Your custom regex extraction logic here
33
+ # # For example:
34
+ # # /\<result\>(.*?)\<\/result\>/
35
+ # end
11
36
  end
data/lib/leva/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Leva
2
- VERSION = "0.1.5"
2
+ VERSION = "0.1.7"
3
3
  end
data/lib/leva.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require "leva/version"
2
2
  require "leva/engine"
3
+ require "liquid"
3
4
 
4
5
  module Leva
5
6
  # Runs an evaluation experiment with the given run and evals.
@@ -9,9 +10,34 @@ module Leva
9
10
  # @param evals [Array<Leva::BaseEval>] The evaluation implementations to use.
10
11
  # @return [void]
11
12
  def self.run_evaluation(experiment:, run:, evals:)
12
- results = run.run(experiment)
13
+ experiment.update(status: :running)
14
+
15
+ experiment.dataset.dataset_records.find_each do |dataset_record|
16
+ runner_result = run.execute_and_store(experiment, dataset_record, experiment.prompt)
17
+
18
+ evals.each do |eval|
19
+ eval.evaluate_and_store(experiment, runner_result)
20
+ end
21
+ end
22
+
23
+ experiment.update(status: :completed)
24
+ rescue StandardError => e
25
+ experiment.update(status: :failed)
26
+ Rails.logger.error "Error in experiment #{experiment.name}: #{e.message}"
27
+ end
28
+
29
+ # Runs a single evaluation for a dataset record
30
+ #
31
+ # @param experiment [Leva::Experiment] The experiment to run.
32
+ # @param run [Leva::BaseRun] The run implementation to use.
33
+ # @param evals [Array<Leva::BaseEval>] The evaluation implementations to use.
34
+ # @param dataset_record [Leva::DatasetRecord] The dataset record to process.
35
+ # @return [void]
36
+ def self.run_single_evaluation(experiment:, run:, evals:, dataset_record:)
37
+ runner_result = run.execute_and_store(experiment, dataset_record, experiment.prompt)
38
+
13
39
  evals.each do |eval|
14
- eval.evaluate_all(experiment, results)
40
+ eval.evaluate_and_store(experiment, runner_result)
15
41
  end
16
42
  end
17
43
 
@@ -29,20 +55,47 @@ module Leva
29
55
  raise NotImplementedError, "#{self.class} must implement #execute"
30
56
  end
31
57
 
32
- # Runs the model on all records in an experiment.
58
+ # Executes the run on a given dataset record and stores the result.
33
59
  #
34
- # @param experiment [Leva::Experiment] The experiment to run.
35
- # @return [Hash] A hash mapping dataset_record_ids to their execution results.
36
- def run(experiment)
60
+ # @param experiment [Leva::Experiment, nil] The experiment being run, if any.
61
+ # @param dataset_record [Leva::DatasetRecord] The dataset record to run the model on.
62
+ # @param prompt [Leva::Prompt] The prompt to store the version of.
63
+ # @return [Leva::RunnerResult] The stored runner result.
64
+ def execute_and_store(experiment, dataset_record, prompt)
65
+ # Expose these to the subclass execution
37
66
  @experiment = experiment
38
- @prompt = experiment.prompt
67
+ @prompt = prompt
68
+
69
+ result = execute(dataset_record.recordable)
70
+ RunnerResult.create!(
71
+ experiment: experiment,
72
+ dataset_record: dataset_record,
73
+ prompt: prompt,
74
+ prediction: result,
75
+ runner_class: self.class.name
76
+ )
77
+ end
39
78
 
40
- results = {}
41
- experiment.dataset.dataset_records.find_each do |dataset_record|
42
- result = execute(dataset_record.recordable)
43
- results[dataset_record.id] = result
79
+ # @param runner_result [Leva::RunnerResult] The runner result to parse
80
+ # @return [Array<String>] The parsed predictions
81
+ def parsed_predictions(runner_result)
82
+ if extract_regex_pattern(runner_result)
83
+ runner_result.prediction.scan(extract_regex_pattern(runner_result)).map { |match| match.first&.strip }.compact
84
+ else
85
+ [runner_result.prediction]
44
86
  end
45
- results
87
+ end
88
+
89
+ # @param runner_result [Leva::RunnerResult] The runner result to extract regex from
90
+ # @return [Regexp, nil] The regex pattern to use for parsing predictions
91
+ def extract_regex_pattern(runner_result)
92
+ runner_result.dataset_record.recordable.extract_regex_pattern if runner_result.dataset_record.recordable.respond_to?(:extract_regex_pattern)
93
+ end
94
+
95
+ # @param runner_result [Leva::RunnerResult] The runner result to get ground truth from
96
+ # @return [String] The ground truth for the runner result
97
+ def ground_truth(runner_result)
98
+ runner_result.dataset_record.ground_truth
46
99
  end
47
100
  end
48
101
 
@@ -51,47 +104,34 @@ module Leva
51
104
  # @abstract Subclass and override {#evaluate} to implement
52
105
  # custom evaluation logic.
53
106
  class BaseEval
54
- # Evaluates the model's prediction against the expected result.
107
+ # Evaluates the model's prediction against the ground truth.
55
108
  #
56
109
  # @param prediction [Object] The model's prediction.
57
- # @param record [Object] The expected result.
58
- # @return [Leva::Result] The evaluation result.
110
+ # @param recordable [Object] The recordable object containing the ground truth.
111
+ # @return [Float] The evaluation score.
59
112
  # @raise [NotImplementedError] if the method is not implemented in a subclass.
60
- def evaluate(prediction, record)
113
+ def evaluate(prediction, recordable)
61
114
  raise NotImplementedError, "#{self.class} must implement #evaluate"
62
115
  end
63
116
 
64
- # Evaluates all results for an experiment.
117
+ # Evaluates a single runner result and stores the evaluation.
65
118
  #
66
- # @param experiment [Leva::Experiment] The experiment to evaluate.
67
- # @param results [Hash] A hash mapping dataset_record_ids to their execution results.
68
- # @return [void]
69
- def evaluate_all(experiment, results)
70
- experiment.dataset.dataset_records.find_each do |dataset_record|
71
- prediction = results[dataset_record.id]
72
- evaluation = evaluate(prediction, dataset_record.recordable)
73
-
74
- Leva::EvaluationResult.create!(
75
- experiment: experiment,
76
- dataset_record: dataset_record,
77
- prediction: prediction,
78
- score: evaluation.score,
79
- label: evaluation.label
80
- )
81
- end
82
- end
83
- end
119
+ # @param experiment [Leva::Experiment, nil] The experiment being evaluated, if any.
120
+ # @param runner_result [Leva::RunnerResult] The runner result to evaluate.
121
+ # @return [Leva::EvaluationResult] The stored evaluation result.
122
+ def evaluate_and_store(experiment, runner_result)
123
+ @experiment = experiment
124
+ @runner_result = runner_result
84
125
 
85
- # Represents the result of an evaluation
86
- class Result
87
- attr_reader :label, :prediction, :score
126
+ score = evaluate(runner_result, runner_result.dataset_record.recordable)
88
127
 
89
- # Initialize a new Result
90
- # @param label [String] The label for the result
91
- # @param score [Float] The score of the evaluation (0.0 to 1.0)
92
- def initialize(label:, score:)
93
- @label = label
94
- @score = score
128
+ EvaluationResult.create!(
129
+ experiment: experiment,
130
+ dataset_record: runner_result.dataset_record,
131
+ runner_result: runner_result,
132
+ score: score,
133
+ evaluator_class: self.class.name
134
+ )
95
135
  end
96
136
  end
97
137
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: leva
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kieran Klaassen
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-08-14 00:00:00.000000000 Z
11
+ date: 2024-09-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rails
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: 7.2.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: liquid
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 5.5.0
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 5.5.0
27
41
  description: Leva is a Ruby on Rails framework for evaluating Language Models (LLMs)
28
42
  using ActiveRecord datasets. It provides a flexible structure for creating experiments,
29
43
  managing datasets, and implementing various evaluation logic.
@@ -39,26 +53,49 @@ files:
39
53
  - app/assets/config/leva_manifest.js
40
54
  - app/assets/stylesheets/leva/application.css
41
55
  - app/controllers/leva/application_controller.rb
56
+ - app/controllers/leva/dataset_records_controller.rb
42
57
  - app/controllers/leva/datasets_controller.rb
43
58
  - app/controllers/leva/experiments_controller.rb
59
+ - app/controllers/leva/runner_results_controller.rb
44
60
  - app/controllers/leva/workbench_controller.rb
45
- - app/evals/test_sentiment_accuracy_eval.rb
46
61
  - app/helpers/leva/application_helper.rb
62
+ - app/javascript/controllers/prompt_form_controller.js
63
+ - app/javascript/controllers/prompt_selector_controller.js
47
64
  - app/jobs/leva/application_job.rb
48
65
  - app/jobs/leva/experiment_job.rb
66
+ - app/jobs/leva/run_eval_job.rb
49
67
  - app/mailers/leva/application_mailer.rb
68
+ - app/models/concerns/leva/recordable.rb
50
69
  - app/models/leva/application_record.rb
51
70
  - app/models/leva/dataset.rb
52
71
  - app/models/leva/dataset_record.rb
53
72
  - app/models/leva/evaluation_result.rb
54
73
  - app/models/leva/experiment.rb
55
74
  - app/models/leva/prompt.rb
56
- - app/runners/test_sentiment_run.rb
75
+ - app/models/leva/runner_result.rb
57
76
  - app/views/layouts/leva/application.html.erb
77
+ - app/views/leva/dataset_records/index.html.erb
78
+ - app/views/leva/dataset_records/show.html.erb
79
+ - app/views/leva/datasets/_dataset.html.erb
80
+ - app/views/leva/datasets/_form.html.erb
81
+ - app/views/leva/datasets/edit.html.erb
58
82
  - app/views/leva/datasets/index.html.erb
83
+ - app/views/leva/datasets/new.html.erb
59
84
  - app/views/leva/datasets/show.html.erb
85
+ - app/views/leva/experiments/_experiment.html.erb
86
+ - app/views/leva/experiments/_form.html.erb
87
+ - app/views/leva/experiments/edit.html.erb
60
88
  - app/views/leva/experiments/index.html.erb
89
+ - app/views/leva/experiments/new.html.erb
61
90
  - app/views/leva/experiments/show.html.erb
91
+ - app/views/leva/runner_results/show.html.erb
92
+ - app/views/leva/workbench/_evaluation_area.html.erb
93
+ - app/views/leva/workbench/_prompt_content.html.erb
94
+ - app/views/leva/workbench/_prompt_form.html.erb
95
+ - app/views/leva/workbench/_prompt_sidebar.html.erb
96
+ - app/views/leva/workbench/_results_section.html.erb
97
+ - app/views/leva/workbench/_top_bar.html.erb
98
+ - app/views/leva/workbench/edit.html.erb
62
99
  - app/views/leva/workbench/index.html.erb
63
100
  - app/views/leva/workbench/new.html.erb
64
101
  - config/routes.rb
@@ -67,12 +104,19 @@ files:
67
104
  - db/migrate/20240813173034_create_leva_prompts.rb
68
105
  - db/migrate/20240813173035_create_leva_experiments.rb
69
106
  - db/migrate/20240813173050_create_leva_evaluation_results.rb
107
+ - db/migrate/20240816201419_create_leva_runner_results.rb
108
+ - db/migrate/20240816201433_update_leva_evaluation_results.rb
109
+ - db/migrate/20240821163608_make_experiment_optional_for_runner_results.rb
110
+ - db/migrate/20240821181934_add_prompt_to_leva_runner_results.rb
111
+ - db/migrate/20240821183153_add_runner_and_evaluator_to_leva_experiments.rb
112
+ - db/migrate/20240821191713_add_actual_result_to_leva_dataset_records.rb
113
+ - db/migrate/20240822143201_remove_actual_result_from_leva_runner_results.rb
114
+ - db/migrate/20240912183556_add_runner_class_to_leva_runner_results.rb
70
115
  - lib/generators/leva/eval_generator.rb
71
116
  - lib/generators/leva/runner_generator.rb
72
117
  - lib/generators/leva/templates/eval.rb.erb
73
118
  - lib/generators/leva/templates/runner.rb.erb
74
119
  - lib/leva.rb
75
- - lib/leva/base_eval.rb
76
120
  - lib/leva/engine.rb
77
121
  - lib/leva/version.rb
78
122
  - lib/tasks/auto_annotate_models.rake
@@ -1,6 +0,0 @@
1
- class TestSentimentAccuracyEval < Leva::BaseEval
2
- def evaluate(prediction, expected)
3
- score = prediction == expected ? 1.0 : 0.0
4
- Leva::Result.new(label: 'sentiment_accuracy', score: score)
5
- end
6
- end
@@ -1,13 +0,0 @@
1
- class TestSentimentRun < Leva::BaseRun
2
- def execute(record)
3
- # Simple sentiment analysis logic for testing
4
- case record.content.downcase
5
- when /love|great|excellent/
6
- "Positive"
7
- when /terrible|bad|awful/
8
- "Negative"
9
- else
10
- "Neutral"
11
- end
12
- end
13
- end