leva 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. checksums.yaml +4 -4
  2. data/README.md +52 -16
  3. data/app/controllers/leva/dataset_records_controller.rb +21 -0
  4. data/app/controllers/leva/datasets_controller.rb +9 -2
  5. data/app/controllers/leva/experiments_controller.rb +34 -9
  6. data/app/controllers/leva/runner_results_controller.rb +8 -0
  7. data/app/controllers/leva/workbench_controller.rb +85 -12
  8. data/app/helpers/leva/application_helper.rb +39 -0
  9. data/app/javascript/controllers/prompt_form_controller.js +45 -0
  10. data/app/javascript/controllers/prompt_selector_controller.js +31 -0
  11. data/app/jobs/leva/experiment_job.rb +9 -4
  12. data/app/jobs/leva/run_eval_job.rb +40 -0
  13. data/app/models/concerns/leva/recordable.rb +37 -0
  14. data/app/models/leva/dataset.rb +15 -6
  15. data/app/models/leva/dataset_record.rb +43 -5
  16. data/app/models/leva/evaluation_result.rb +22 -14
  17. data/app/models/leva/experiment.rb +26 -14
  18. data/app/models/leva/prompt.rb +14 -1
  19. data/app/models/leva/runner_result.rb +54 -0
  20. data/app/views/layouts/leva/application.html.erb +24 -13
  21. data/app/views/leva/dataset_records/index.html.erb +49 -0
  22. data/app/views/leva/dataset_records/show.html.erb +30 -0
  23. data/app/views/leva/datasets/_dataset.html.erb +18 -0
  24. data/app/views/leva/datasets/_form.html.erb +24 -0
  25. data/app/views/leva/datasets/edit.html.erb +5 -0
  26. data/app/views/leva/datasets/index.html.erb +51 -38
  27. data/app/views/leva/datasets/new.html.erb +5 -0
  28. data/app/views/leva/datasets/show.html.erb +160 -8
  29. data/app/views/leva/experiments/_experiment.html.erb +42 -0
  30. data/app/views/leva/experiments/_form.html.erb +49 -0
  31. data/app/views/leva/experiments/edit.html.erb +5 -0
  32. data/app/views/leva/experiments/index.html.erb +53 -37
  33. data/app/views/leva/experiments/new.html.erb +5 -0
  34. data/app/views/leva/experiments/show.html.erb +115 -19
  35. data/app/views/leva/runner_results/show.html.erb +64 -0
  36. data/app/views/leva/workbench/_evaluation_area.html.erb +5 -0
  37. data/app/views/leva/workbench/_prompt_content.html.erb +216 -0
  38. data/app/views/leva/workbench/_prompt_form.html.erb +89 -0
  39. data/app/views/leva/workbench/_prompt_sidebar.html.erb +21 -0
  40. data/app/views/leva/workbench/_results_section.html.erb +159 -0
  41. data/app/views/leva/workbench/_top_bar.html.erb +10 -0
  42. data/app/views/leva/workbench/edit.html.erb +20 -0
  43. data/app/views/leva/workbench/index.html.erb +5 -91
  44. data/app/views/leva/workbench/new.html.erb +79 -36
  45. data/config/routes.rb +15 -6
  46. data/db/migrate/20240813172916_create_leva_datasets.rb +1 -0
  47. data/db/migrate/20240813173033_create_leva_dataset_records.rb +1 -1
  48. data/db/migrate/20240813173035_create_leva_experiments.rb +3 -2
  49. data/db/migrate/20240813173050_create_leva_evaluation_results.rb +2 -2
  50. data/db/migrate/20240816201419_create_leva_runner_results.rb +11 -0
  51. data/db/migrate/20240816201433_update_leva_evaluation_results.rb +8 -0
  52. data/db/migrate/20240821163608_make_experiment_optional_for_runner_results.rb +6 -0
  53. data/db/migrate/20240821181934_add_prompt_to_leva_runner_results.rb +5 -0
  54. data/db/migrate/20240821183153_add_runner_and_evaluator_to_leva_experiments.rb +6 -0
  55. data/db/migrate/20240821191713_add_actual_result_to_leva_dataset_records.rb +5 -0
  56. data/db/migrate/20240822143201_remove_actual_result_from_leva_runner_results.rb +5 -0
  57. data/lib/generators/leva/templates/eval.rb.erb +6 -7
  58. data/lib/leva/version.rb +1 -1
  59. data/lib/leva.rb +62 -45
  60. metadata +48 -5
  61. data/app/evals/test_sentiment_accuracy_eval.rb +0 -6
  62. data/app/runners/test_sentiment_run.rb +0 -13
  63. data/lib/leva/base_eval.rb +0 -75
@@ -1,38 +1,81 @@
1
- <% content_for :title, 'New Experiment' %>
2
- <div class="px-4 sm:px-6 lg:px-8">
3
- <div class="sm:flex sm:items-center">
4
- <div class="sm:flex-auto">
5
- <h1 class="text-2xl font-semibold text-gray-900">New Experiment</h1>
6
- <p class="mt-2 text-sm text-gray-700">Create a new experiment in your workbench.</p>
7
- </div>
8
- </div>
9
- <div class="mt-8 max-w-xl">
10
- <%= form_with(model: @experiment, url: experiments_path, local: true, class: "space-y-8 divide-y divide-gray-200") do |form| %>
11
- <div class="space-y-8 divide-y divide-gray-200">
12
- <div>
13
- <div>
14
- <h3 class="text-lg leading-6 font-medium text-gray-900">Experiment Information</h3>
15
- <p class="mt-1 text-sm text-gray-500">Provide details for your new experiment.</p>
16
- </div>
17
- <div class="mt-6 grid grid-cols-1 gap-y-6 gap-x-4 sm:grid-cols-6">
18
- <div class="sm:col-span-4">
19
- <%= form.label :name, class: "block text-sm font-medium text-gray-700" %>
20
- <div class="mt-1">
21
- <%= form.text_field :name, class: "shadow-sm focus:ring-indigo-500 focus:border-indigo-500 block w-full sm:text-sm border-gray-300 rounded-md" %>
22
- </div>
23
- </div>
24
- <div class="sm:col-span-2">
25
- <%= form.label :dataset_id, class: "block text-sm font-medium text-gray-700" %>
26
- <div class="mt-1">
27
- <%= form.select :dataset_id, options_for_select(@datasets.map { |dataset| [dataset.name, dataset.id] }), class: "shadow-sm focus:ring-indigo-500 focus:border-indigo-500 block w-full sm:text-sm border-gray-300 rounded-md" %>
28
- </div>
29
- </div>
30
- </div>
31
- </div>
32
- </div>
33
- <div class="pt-5">
34
- <%= form.submit class: "btn btn-primary btn-block" %>
1
+ <% content_for :title, "New Prompt" %>
2
+ <div class="container mx-auto px-4 py-8 bg-gray-950 text-white">
3
+ <h1 class="text-3xl font-bold text-indigo-400 mb-6">New Prompt</h1>
4
+ <%= form_with(model: @prompt, url: workbench_index_path, local: true, class: "bg-gray-800 rounded-lg shadow-lg p-6", data: { controller: "prompt-selector" }) do |form| %>
5
+ <% if @prompt.errors.any? %>
6
+ <div class="bg-red-900 border border-red-700 text-red-100 px-4 py-3 rounded-lg mb-4">
7
+ <h2><%= pluralize(@prompt.errors.count, "error") %> prohibited this prompt from being saved:</h2>
8
+ <ul>
9
+ <% @prompt.errors.full_messages.each do |message| %>
10
+ <li><%= message %></li>
11
+ <% end %>
12
+ </ul>
35
13
  </div>
36
14
  <% end %>
37
- </div>
38
- </div>
15
+ <div class="mb-4">
16
+ <%= form.label :name, class: "block text-sm font-semibold mb-2 text-indigo-300" %>
17
+ <%= form.text_field :name, autofocus: true, class: "w-full bg-gray-700 text-white p-3 rounded-lg focus:ring-2 focus:ring-indigo-500 focus:outline-none" %>
18
+ </div>
19
+ <div class="mb-4">
20
+ <%= form.label :system_prompt, class: "block text-sm font-semibold mb-2 text-indigo-300" %>
21
+ <%= form.text_area :system_prompt, rows: 2, class: "w-full bg-gray-700 text-white p-3 rounded-lg focus:ring-2 focus:ring-indigo-500 focus:outline-none" %>
22
+ </div>
23
+ <div class="mb-4">
24
+ <%= form.label :predefined_prompt, "Select Predefined Prompt", class: "block text-sm font-semibold mb-2 text-indigo-300" %>
25
+ <%= form.select :predefined_prompt,
26
+ options_for_select([['Custom Prompt', '']] + @predefined_prompts.map { |name, content| [name, content] }),
27
+ {},
28
+ class: "w-full bg-gray-700 text-white p-3 rounded-lg focus:ring-2 focus:ring-indigo-500 focus:outline-none",
29
+ data: { action: "change->prompt-selector#toggleUserPrompt" }
30
+ %>
31
+ </div>
32
+ <div class="mb-4" data-prompt-selector-target="userPromptField">
33
+ <%= form.label :user_prompt, class: "block text-sm font-semibold mb-2 text-indigo-300" %>
34
+ <%= form.text_area :user_prompt, rows: 5, class: "w-full bg-gray-700 text-white p-3 rounded-lg focus:ring-2 focus:ring-indigo-500 focus:outline-none" %>
35
+ </div>
36
+ <div class="mb-4 hidden" data-prompt-selector-target="promptPreview">
37
+ <h3 class="text-lg font-semibold mb-2 text-indigo-300">Prompt Preview</h3>
38
+ <div class="bg-gray-700 text-white p-3 rounded-lg" data-prompt-selector-target="previewContent"></div>
39
+ </div>
40
+ <div class="flex items-center justify-end space-x-4">
41
+ <%= link_to "Cancel", workbench_index_path, class: "px-3 py-2 rounded-md text-sm font-medium text-gray-300 hover:bg-gray-800 hover:text-white transition-colors duration-150 ease-in-out" %>
42
+ <%= form.submit "Create Prompt", class: "px-3 py-2 rounded-md text-sm font-medium bg-indigo-600 text-white shadow-lg hover:bg-indigo-700 transition-colors duration-150 ease-in-out" %>
43
+ </div>
44
+ <% end %>
45
+ </div>
46
+ <script>
47
+ (() => {
48
+ const application = Stimulus.Application.start()
49
+
50
+ application.register("prompt-selector", class extends Stimulus.Controller {
51
+ static targets = ["userPromptField", "promptPreview", "previewContent"]
52
+
53
+ toggleUserPrompt(event) {
54
+ const selectedContent = event.target.value
55
+ if (selectedContent) {
56
+ this.userPromptFieldTarget.style.display = 'none'
57
+ this.promptPreviewTarget.classList.remove('hidden')
58
+ this.loadPredefinedPrompt(selectedContent)
59
+ } else {
60
+ this.userPromptFieldTarget.style.display = 'block'
61
+ this.promptPreviewTarget.classList.add('hidden')
62
+ this.clearUserPrompt()
63
+ }
64
+ }
65
+
66
+ loadPredefinedPrompt(content) {
67
+ const userPromptTextarea = this.userPromptFieldTarget.querySelector('textarea')
68
+ userPromptTextarea.value = content
69
+ this.previewContentTarget.innerHTML = marked.parse(content)
70
+ }
71
+
72
+ clearUserPrompt() {
73
+ const userPromptTextarea = this.userPromptFieldTarget.querySelector('textarea')
74
+ userPromptTextarea.value = ''
75
+ this.previewContentTarget.innerHTML = ''
76
+ }
77
+ })
78
+ })()
79
+ </script>
80
+ <!-- Include marked.js for Markdown parsing -->
81
+ <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
data/config/routes.rb CHANGED
@@ -1,12 +1,21 @@
1
1
  Leva::Engine.routes.draw do
2
2
  root 'workbench#index'
3
3
 
4
- resources :datasets
5
- resources :experiments
4
+ resources :datasets do
5
+ resources :dataset_records, path: 'records', only: [:index, :show]
6
+ end
7
+ resources :experiments, except: [:destroy] do
8
+ member do
9
+ post :rerun
10
+ end
11
+ resources :runner_results, only: [:show]
12
+ end
6
13
  resources :prompts
7
- resources :workbench, only: [:index, :new, :show] do
8
- post 'run', on: :collection
9
- post 'run_with_evaluation', on: :collection
10
- post 'run_evaluator', on: :collection
14
+ resources :workbench, only: [:index, :new, :create, :edit, :update] do
15
+ collection do
16
+ post 'run'
17
+ post 'run_all_evals'
18
+ post 'run_evaluator'
19
+ end
11
20
  end
12
21
  end
@@ -2,6 +2,7 @@ class CreateLevaDatasets < ActiveRecord::Migration[7.2]
2
2
  def change
3
3
  create_table :leva_datasets do |t|
4
4
  t.string :name
5
+ t.text :description
5
6
 
6
7
  t.timestamps
7
8
  end
@@ -1,7 +1,7 @@
1
1
  class CreateLevaDatasetRecords < ActiveRecord::Migration[7.2]
2
2
  def change
3
3
  create_table :leva_dataset_records do |t|
4
- t.references :leva_dataset, null: false, foreign_key: true
4
+ t.references :dataset, null: false, foreign_key: { to_table: :leva_datasets }
5
5
  t.references :recordable, polymorphic: true, null: false
6
6
 
7
7
  t.timestamps
@@ -2,8 +2,9 @@ class CreateLevaExperiments < ActiveRecord::Migration[7.2]
2
2
  def change
3
3
  create_table :leva_experiments do |t|
4
4
  t.string :name
5
- t.references :leva_dataset, null: false, foreign_key: true
6
- t.references :leva_prompt, null: true, foreign_key: true
5
+ t.text :description
6
+ t.references :dataset, null: false, foreign_key: { to_table: :leva_datasets }
7
+ t.references :prompt, null: true, foreign_key: { to_table: :leva_prompts }
7
8
  t.integer :status
8
9
  t.text :metadata
9
10
 
@@ -1,8 +1,8 @@
1
1
  class CreateLevaEvaluationResults < ActiveRecord::Migration[7.2]
2
2
  def change
3
3
  create_table :leva_evaluation_results do |t|
4
- t.references :leva_experiment, null: false, foreign_key: true
5
- t.references :leva_dataset_record, null: false, foreign_key: true
4
+ t.references :experiment, null: false, foreign_key: { to_table: :leva_experiments }
5
+ t.references :dataset_record, null: false, foreign_key: { to_table: :leva_dataset_records }
6
6
  t.string :prediction
7
7
  t.float :score
8
8
  t.string :label
@@ -0,0 +1,11 @@
1
+ class CreateLevaRunnerResults < ActiveRecord::Migration[7.2]
2
+ def change
3
+ create_table :leva_runner_results do |t|
4
+ t.references :experiment, null: false, foreign_key: { to_table: :leva_experiments }
5
+ t.references :dataset_record, null: false, foreign_key: { to_table: :leva_dataset_records }
6
+ t.text :prediction
7
+
8
+ t.timestamps
9
+ end
10
+ end
11
+ end
@@ -0,0 +1,8 @@
1
+ class UpdateLevaEvaluationResults < ActiveRecord::Migration[7.2]
2
+ def change
3
+ add_reference :leva_evaluation_results, :runner_result, null: false, foreign_key: { to_table: :leva_runner_results }
4
+ add_column :leva_evaluation_results, :evaluator_class, :string, null: false
5
+ remove_column :leva_evaluation_results, :prediction, :string
6
+ remove_column :leva_evaluation_results, :label, :string
7
+ end
8
+ end
@@ -0,0 +1,6 @@
1
+ class MakeExperimentOptionalForRunnerResults < ActiveRecord::Migration[7.2]
2
+ def change
3
+ change_column_null :leva_runner_results, :experiment_id, true
4
+ change_column_null :leva_evaluation_results, :experiment_id, true
5
+ end
6
+ end
@@ -0,0 +1,5 @@
1
+ class AddPromptToLevaRunnerResults < ActiveRecord::Migration[7.2]
2
+ def change
3
+ add_reference :leva_runner_results, :prompt, null: false, foreign_key: { to_table: :leva_prompts }
4
+ end
5
+ end
@@ -0,0 +1,6 @@
1
+ class AddRunnerAndEvaluatorToLevaExperiments < ActiveRecord::Migration[7.2]
2
+ def change
3
+ add_column :leva_experiments, :runner_class, :string
4
+ add_column :leva_experiments, :evaluator_classes, :text
5
+ end
6
+ end
@@ -0,0 +1,5 @@
1
+ class AddActualResultToLevaDatasetRecords < ActiveRecord::Migration[7.2]
2
+ def change
3
+ add_column :leva_runner_results, :actual_result, :text
4
+ end
5
+ end
@@ -0,0 +1,5 @@
1
+ class RemoveActualResultFromLevaRunnerResults < ActiveRecord::Migration[7.2]
2
+ def change
3
+ remove_column :leva_runner_results, :actual_result, :text
4
+ end
5
+ end
@@ -2,14 +2,13 @@
2
2
 
3
3
  class <%= class_name %>Eval < Leva::BaseEval
4
4
  # @param prediction [String] The prediction to evaluate
5
- # @param record [YourRecordClass] The record to evaluate
6
- # @return [Leva::Result] The result of the evaluation
7
- def evaluate(prediction, record)
5
+ # @param recordable [YourRecordClass] The recordable object to evaluate
6
+ # @return [Float] The score of the evaluation
7
+ def evaluate(prediction, recordable)
8
8
  # Implement your evaluation logic here
9
+ # You can access the ground truth using recordable.ground_truth
9
10
 
10
- Leva::Result.new(
11
- label: "<%= file_name.underscore %>",
12
- score: score
13
- )
11
+ # Example implementation:
12
+ prediction == recordable.ground_truth ? 1.0 : 0.0
14
13
  end
15
14
  end
data/lib/leva/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Leva
2
- VERSION = "0.1.4"
2
+ VERSION = "0.1.6"
3
3
  end
data/lib/leva.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  require "leva/version"
2
2
  require "leva/engine"
3
+ require "liquid"
3
4
 
4
5
  module Leva
5
6
  # Runs an evaluation experiment with the given run and evals.
@@ -9,9 +10,34 @@ module Leva
9
10
  # @param evals [Array<Leva::BaseEval>] The evaluation implementations to use.
10
11
  # @return [void]
11
12
  def self.run_evaluation(experiment:, run:, evals:)
12
- results = run.run(experiment)
13
+ experiment.update(status: :running)
14
+
15
+ experiment.dataset.dataset_records.find_each do |dataset_record|
16
+ runner_result = run.execute_and_store(experiment, dataset_record, experiment.prompt)
17
+
18
+ evals.each do |eval|
19
+ eval.evaluate_and_store(experiment, runner_result)
20
+ end
21
+ end
22
+
23
+ experiment.update(status: :completed)
24
+ rescue StandardError => e
25
+ experiment.update(status: :failed)
26
+ Rails.logger.error "Error in experiment #{experiment.name}: #{e.message}"
27
+ end
28
+
29
+ # Runs a single evaluation for a dataset record
30
+ #
31
+ # @param experiment [Leva::Experiment] The experiment to run.
32
+ # @param run [Leva::BaseRun] The run implementation to use.
33
+ # @param evals [Array<Leva::BaseEval>] The evaluation implementations to use.
34
+ # @param dataset_record [Leva::DatasetRecord] The dataset record to process.
35
+ # @return [void]
36
+ def self.run_single_evaluation(experiment:, run:, evals:, dataset_record:)
37
+ runner_result = run.execute_and_store(experiment, dataset_record, experiment.prompt)
38
+
13
39
  evals.each do |eval|
14
- eval.evaluate_all(experiment, results)
40
+ eval.evaluate_and_store(experiment, runner_result)
15
41
  end
16
42
  end
17
43
 
@@ -29,20 +55,24 @@ module Leva
29
55
  raise NotImplementedError, "#{self.class} must implement #execute"
30
56
  end
31
57
 
32
- # Runs the model on all records in an experiment.
58
+ # Executes the run on a given dataset record and stores the result.
33
59
  #
34
- # @param experiment [Leva::Experiment] The experiment to run.
35
- # @return [Hash] A hash mapping dataset_record_ids to their execution results.
36
- def run(experiment)
60
+ # @param experiment [Leva::Experiment, nil] The experiment being run, if any.
61
+ # @param dataset_record [Leva::DatasetRecord] The dataset record to run the model on.
62
+ # @param prompt [Leva::Prompt] The prompt to store the version of.
63
+ # @return [Leva::RunnerResult] The stored runner result.
64
+ def execute_and_store(experiment, dataset_record, prompt)
65
+ # Expose these to the subclass execution
37
66
  @experiment = experiment
38
- @prompt = experiment.prompt
67
+ @prompt = prompt
39
68
 
40
- results = {}
41
- experiment.dataset.dataset_records.find_each do |dataset_record|
42
- result = execute(dataset_record.recordable)
43
- results[dataset_record.id] = result
44
- end
45
- results
69
+ result = execute(dataset_record.recordable)
70
+ RunnerResult.create!(
71
+ experiment: experiment,
72
+ dataset_record: dataset_record,
73
+ prompt: prompt,
74
+ prediction: result,
75
+ )
46
76
  end
47
77
  end
48
78
 
@@ -51,47 +81,34 @@ module Leva
51
81
  # @abstract Subclass and override {#evaluate} to implement
52
82
  # custom evaluation logic.
53
83
  class BaseEval
54
- # Evaluates the model's prediction against the expected result.
84
+ # Evaluates the model's prediction against the ground truth.
55
85
  #
56
86
  # @param prediction [Object] The model's prediction.
57
- # @param record [Object] The expected result.
58
- # @return [Leva::Result] The evaluation result.
87
+ # @param recordable [Object] The recordable object containing the ground truth.
88
+ # @return [Float] The evaluation score.
59
89
  # @raise [NotImplementedError] if the method is not implemented in a subclass.
60
- def evaluate(prediction, record)
90
+ def evaluate(prediction, recordable)
61
91
  raise NotImplementedError, "#{self.class} must implement #evaluate"
62
92
  end
63
93
 
64
- # Evaluates all results for an experiment.
94
+ # Evaluates a single runner result and stores the evaluation.
65
95
  #
66
- # @param experiment [Leva::Experiment] The experiment to evaluate.
67
- # @param results [Hash] A hash mapping dataset_record_ids to their execution results.
68
- # @return [void]
69
- def evaluate_all(experiment, results)
70
- experiment.dataset.dataset_records.find_each do |dataset_record|
71
- prediction = results[dataset_record.id]
72
- evaluation = evaluate(prediction, dataset_record.recordable)
73
-
74
- Leva::EvaluationResult.create!(
75
- experiment: experiment,
76
- dataset_record: dataset_record,
77
- prediction: prediction,
78
- score: evaluation.score,
79
- label: evaluation.label
80
- )
81
- end
82
- end
83
- end
96
+ # @param experiment [Leva::Experiment, nil] The experiment being evaluated, if any.
97
+ # @param runner_result [Leva::RunnerResult] The runner result to evaluate.
98
+ # @return [Leva::EvaluationResult] The stored evaluation result.
99
+ def evaluate_and_store(experiment, runner_result)
100
+ @experiment = experiment
101
+ @runner_result = runner_result
84
102
 
85
- # Represents the result of an evaluation
86
- class Result
87
- attr_reader :label, :prediction, :score
103
+ score = evaluate(runner_result, runner_result.dataset_record.recordable)
88
104
 
89
- # Initialize a new Result
90
- # @param label [String] The label for the result
91
- # @param score [Float] The score of the evaluation (0.0 to 1.0)
92
- def initialize(label:, score:)
93
- @label = label
94
- @score = score
105
+ EvaluationResult.create!(
106
+ experiment: experiment,
107
+ dataset_record: runner_result.dataset_record,
108
+ runner_result: runner_result,
109
+ score: score,
110
+ evaluator_class: self.class.name
111
+ )
95
112
  end
96
113
  end
97
114
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: leva
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.1.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kieran Klaassen
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-08-14 00:00:00.000000000 Z
11
+ date: 2024-09-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rails
@@ -24,6 +24,20 @@ dependencies:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
26
  version: 7.2.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: liquid
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 5.5.0
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 5.5.0
27
41
  description: Leva is a Ruby on Rails framework for evaluating Language Models (LLMs)
28
42
  using ActiveRecord datasets. It provides a flexible structure for creating experiments,
29
43
  managing datasets, and implementing various evaluation logic.
@@ -39,26 +53,49 @@ files:
39
53
  - app/assets/config/leva_manifest.js
40
54
  - app/assets/stylesheets/leva/application.css
41
55
  - app/controllers/leva/application_controller.rb
56
+ - app/controllers/leva/dataset_records_controller.rb
42
57
  - app/controllers/leva/datasets_controller.rb
43
58
  - app/controllers/leva/experiments_controller.rb
59
+ - app/controllers/leva/runner_results_controller.rb
44
60
  - app/controllers/leva/workbench_controller.rb
45
- - app/evals/test_sentiment_accuracy_eval.rb
46
61
  - app/helpers/leva/application_helper.rb
62
+ - app/javascript/controllers/prompt_form_controller.js
63
+ - app/javascript/controllers/prompt_selector_controller.js
47
64
  - app/jobs/leva/application_job.rb
48
65
  - app/jobs/leva/experiment_job.rb
66
+ - app/jobs/leva/run_eval_job.rb
49
67
  - app/mailers/leva/application_mailer.rb
68
+ - app/models/concerns/leva/recordable.rb
50
69
  - app/models/leva/application_record.rb
51
70
  - app/models/leva/dataset.rb
52
71
  - app/models/leva/dataset_record.rb
53
72
  - app/models/leva/evaluation_result.rb
54
73
  - app/models/leva/experiment.rb
55
74
  - app/models/leva/prompt.rb
56
- - app/runners/test_sentiment_run.rb
75
+ - app/models/leva/runner_result.rb
57
76
  - app/views/layouts/leva/application.html.erb
77
+ - app/views/leva/dataset_records/index.html.erb
78
+ - app/views/leva/dataset_records/show.html.erb
79
+ - app/views/leva/datasets/_dataset.html.erb
80
+ - app/views/leva/datasets/_form.html.erb
81
+ - app/views/leva/datasets/edit.html.erb
58
82
  - app/views/leva/datasets/index.html.erb
83
+ - app/views/leva/datasets/new.html.erb
59
84
  - app/views/leva/datasets/show.html.erb
85
+ - app/views/leva/experiments/_experiment.html.erb
86
+ - app/views/leva/experiments/_form.html.erb
87
+ - app/views/leva/experiments/edit.html.erb
60
88
  - app/views/leva/experiments/index.html.erb
89
+ - app/views/leva/experiments/new.html.erb
61
90
  - app/views/leva/experiments/show.html.erb
91
+ - app/views/leva/runner_results/show.html.erb
92
+ - app/views/leva/workbench/_evaluation_area.html.erb
93
+ - app/views/leva/workbench/_prompt_content.html.erb
94
+ - app/views/leva/workbench/_prompt_form.html.erb
95
+ - app/views/leva/workbench/_prompt_sidebar.html.erb
96
+ - app/views/leva/workbench/_results_section.html.erb
97
+ - app/views/leva/workbench/_top_bar.html.erb
98
+ - app/views/leva/workbench/edit.html.erb
62
99
  - app/views/leva/workbench/index.html.erb
63
100
  - app/views/leva/workbench/new.html.erb
64
101
  - config/routes.rb
@@ -67,12 +104,18 @@ files:
67
104
  - db/migrate/20240813173034_create_leva_prompts.rb
68
105
  - db/migrate/20240813173035_create_leva_experiments.rb
69
106
  - db/migrate/20240813173050_create_leva_evaluation_results.rb
107
+ - db/migrate/20240816201419_create_leva_runner_results.rb
108
+ - db/migrate/20240816201433_update_leva_evaluation_results.rb
109
+ - db/migrate/20240821163608_make_experiment_optional_for_runner_results.rb
110
+ - db/migrate/20240821181934_add_prompt_to_leva_runner_results.rb
111
+ - db/migrate/20240821183153_add_runner_and_evaluator_to_leva_experiments.rb
112
+ - db/migrate/20240821191713_add_actual_result_to_leva_dataset_records.rb
113
+ - db/migrate/20240822143201_remove_actual_result_from_leva_runner_results.rb
70
114
  - lib/generators/leva/eval_generator.rb
71
115
  - lib/generators/leva/runner_generator.rb
72
116
  - lib/generators/leva/templates/eval.rb.erb
73
117
  - lib/generators/leva/templates/runner.rb.erb
74
118
  - lib/leva.rb
75
- - lib/leva/base_eval.rb
76
119
  - lib/leva/engine.rb
77
120
  - lib/leva/version.rb
78
121
  - lib/tasks/auto_annotate_models.rake
@@ -1,6 +0,0 @@
1
- class TestSentimentAccuracyEval < Leva::BaseEval
2
- def evaluate(prediction, expected)
3
- score = prediction == expected ? 1.0 : 0.0
4
- Leva::Result.new(label: 'sentiment_accuracy', score: score)
5
- end
6
- end
@@ -1,13 +0,0 @@
1
- class TestSentimentRun < Leva::BaseRun
2
- def execute(record)
3
- # Simple sentiment analysis logic for testing
4
- case record.content.downcase
5
- when /love|great|excellent/
6
- "Positive"
7
- when /terrible|bad|awful/
8
- "Negative"
9
- else
10
- "Neutral"
11
- end
12
- end
13
- end
@@ -1,75 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Leva
4
- class BaseEval
5
- class << self
6
- attr_reader :dataset_record_class_name
7
-
8
- # Set the dataset record class for the eval
9
- # @param class_name [String] The name of the dataset record class
10
- def leva_dataset_record_class(class_name)
11
- @dataset_record_class_name = class_name
12
- end
13
-
14
- # Run the experiment
15
- # @param experiment [Leva::Experiment] The experiment to run
16
- def run_experiment(experiment)
17
- new.run_experiment(experiment)
18
- end
19
- end
20
-
21
- # Run the experiment
22
- # @param experiment [Leva::Experiment] The experiment to run
23
- def run_experiment(experiment)
24
- @experiment = experiment
25
- @experiment.update(status: :running)
26
-
27
- @experiment.dataset.records.each do |record|
28
- @record = record
29
- unless @record.class_name == self.class.dataset_record_class_name
30
- raise ArgumentError, "Record class #{@record.class_name} does not match expected class #{self.class.dataset_record_class_name}"
31
- end
32
- ExperimentJob.perform_later(self, @record)
33
- end
34
-
35
- @experiment.update(status: :completed)
36
- rescue StandardError => e
37
- @experiment.update(status: :failed)
38
- Rails.logger.error "Error in experiment #{@experiment.name}: #{e.message}"
39
- end
40
-
41
- # Run the evaluation for a single record
42
- # @param record [ActiveRecord::Base] The record to evaluate
43
- # @return [Leva::Result] The result of the evaluation
44
- def run_each(record)
45
- raise NotImplementedError, "Subclasses must implement the 'run_each' method"
46
- end
47
-
48
- # Save the result of an evaluation
49
- # @param result [Leva::Result] The result of the evaluation
50
- def save_result(result)
51
- Leva::EvaluationResult.create!(
52
- experiment: @experiment,
53
- dataset_record: Leva::DatasetRecord.find_by(recordable: @record, dataset: @experiment.dataset),
54
- prediction: result.prediction,
55
- score: result.score,
56
- label: result.label
57
- )
58
- end
59
- end
60
-
61
- # Represents the result of an evaluation
62
- class Result
63
- attr_reader :label, :prediction, :score
64
-
65
- # Initialize a new Result
66
- # @param label [String] The label for the result
67
- # @param prediction [String] The prediction made by the evaluation
68
- # @param score [Float] The score of the evaluation (0.0 to 1.0)
69
- def initialize(label:, prediction:, score:)
70
- @label = label
71
- @prediction = prediction
72
- @score = score
73
- end
74
- end
75
- end