leva 0.1.6 → 0.1.8
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +5 -0
- data/app/assets/stylesheets/leva/application.css +13 -0
- data/app/models/leva/dataset_record.rb +2 -1
- data/app/models/leva/runner_result.rb +11 -9
- data/app/views/leva/workbench/_prompt_content.html.erb +9 -5
- data/app/views/leva/workbench/_results_section.html.erb +4 -4
- data/db/migrate/20240912183556_add_runner_class_to_leva_runner_results.rb +5 -0
- data/lib/generators/leva/templates/eval.rb.erb +3 -3
- data/lib/generators/leva/templates/runner.rb.erb +25 -0
- data/lib/leva/version.rb +1 -1
- data/lib/leva.rb +23 -0
- metadata +3 -2
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA256:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: 9d8e48a71a34a53451bc1756a6cdc7694ee6ea9747861fdb977b0859979bd101
         | 
| 4 | 
            +
              data.tar.gz: 83c44769f588b3daeeaa6bd750f9e9b643639871de57dd381cd84a76cd6177d6
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 74ad608263e8fe369693537d76247757bd39417bbf1e034ffc2402684a9800ed25803d1ff3506d05776c9c523ad7fd84254e78290fc99f21f7180f67e1e12667
         | 
| 7 | 
            +
              data.tar.gz: 3a2e4b9701d6cb63c05c2f672b44c8bfd16f0de5a6a98c63e4b6a13de6685efe001cf320304be1fa1aa6c6b56cec658ec0fb95e731fbf078055d5deaffb10959
         | 
    
        data/README.md
    CHANGED
    
    | @@ -1,7 +1,12 @@ | |
| 1 1 | 
             
            # Leva - Flexible Evaluation Framework for Language Models
         | 
| 2 2 |  | 
| 3 | 
            +
            [![Gem Version](https://badge.fury.io/rb/leva.svg)](https://badge.fury.io/rb/leva)
         | 
| 4 | 
            +
             | 
| 3 5 | 
             
            Leva is a Ruby on Rails framework for evaluating Language Models (LLMs) using ActiveRecord datasets on production models. It provides a flexible structure for creating experiments, managing datasets, and implementing various evaluation logic on production data with security in mind.
         | 
| 4 6 |  | 
| 7 | 
            +
            
         | 
| 8 | 
            +
            
         | 
| 9 | 
            +
             | 
| 5 10 | 
             
            ## Installation
         | 
| 6 11 |  | 
| 7 12 | 
             
            Add this line to your application's Gemfile:
         | 
| @@ -13,3 +13,16 @@ | |
| 13 13 | 
             
             *= require_tree .
         | 
| 14 14 | 
             
             *= require_self
         | 
| 15 15 | 
             
             */
         | 
| 16 | 
            +
             | 
| 17 | 
            +
            /* Global styles for text overflow handling */
         | 
| 18 | 
            +
            pre {
         | 
| 19 | 
            +
              word-wrap: break-word;
         | 
| 20 | 
            +
              word-break: break-word;
         | 
| 21 | 
            +
              max-width: 100%;
         | 
| 22 | 
            +
            }
         | 
| 23 | 
            +
             | 
| 24 | 
            +
            textarea {
         | 
| 25 | 
            +
              word-wrap: break-word;
         | 
| 26 | 
            +
              word-break: break-word;
         | 
| 27 | 
            +
              max-width: 100%;
         | 
| 28 | 
            +
            }
         | 
| @@ -3,6 +3,7 @@ | |
| 3 3 | 
             
            # Table name: leva_dataset_records
         | 
| 4 4 | 
             
            #
         | 
| 5 5 | 
             
            #  id              :integer          not null, primary key
         | 
| 6 | 
            +
            #  actual_result   :text
         | 
| 6 7 | 
             
            #  recordable_type :string           not null
         | 
| 7 8 | 
             
            #  created_at      :datetime         not null
         | 
| 8 9 | 
             
            #  updated_at      :datetime         not null
         | 
| @@ -61,4 +62,4 @@ module Leva | |
| 61 62 | 
             
                  end
         | 
| 62 63 | 
             
                end
         | 
| 63 64 | 
             
              end
         | 
| 64 | 
            -
            end
         | 
| 65 | 
            +
            end
         | 
| @@ -5,6 +5,7 @@ | |
| 5 5 | 
             
            #  id                :integer          not null, primary key
         | 
| 6 6 | 
             
            #  prediction        :text
         | 
| 7 7 | 
             
            #  prompt_version    :integer
         | 
| 8 | 
            +
            #  runner_class      :string
         | 
| 8 9 | 
             
            #  created_at        :datetime         not null
         | 
| 9 10 | 
             
            #  updated_at        :datetime         not null
         | 
| 10 11 | 
             
            #  dataset_record_id :integer          not null
         | 
| @@ -32,23 +33,24 @@ module Leva | |
| 32 33 |  | 
| 33 34 | 
             
                validates :prediction, presence: true
         | 
| 34 35 | 
             
                validates :prompt, presence: true
         | 
| 36 | 
            +
                validates :runner_class, presence: true
         | 
| 35 37 |  | 
| 36 38 | 
             
                delegate :ground_truth, to: :dataset_record
         | 
| 37 39 |  | 
| 38 40 | 
             
                # @return [Array<String>] The parsed draft responses
         | 
| 39 41 | 
             
                def parsed_predictions
         | 
| 40 | 
            -
                  @parsed_predictions ||=
         | 
| 41 | 
            -
             | 
| 42 | 
            -
             | 
| 43 | 
            -
             | 
| 44 | 
            -
             | 
| 45 | 
            -
             | 
| 42 | 
            +
                  @parsed_predictions ||= runner&.parsed_predictions(self) || []
         | 
| 43 | 
            +
                end
         | 
| 44 | 
            +
             | 
| 45 | 
            +
                # @return [String] The ground truth for this runner result
         | 
| 46 | 
            +
                def ground_truth
         | 
| 47 | 
            +
                  @ground_truth ||= runner&.ground_truth(self)
         | 
| 46 48 | 
             
                end
         | 
| 47 49 |  | 
| 48 50 | 
             
                private
         | 
| 49 51 |  | 
| 50 | 
            -
                def  | 
| 51 | 
            -
                   | 
| 52 | 
            +
                def runner
         | 
| 53 | 
            +
                  @runner ||= runner_class&.constantize&.new
         | 
| 52 54 | 
             
                end
         | 
| 53 55 | 
             
              end
         | 
| 54 | 
            -
            end
         | 
| 56 | 
            +
            end
         | 
| @@ -12,7 +12,7 @@ | |
| 12 12 | 
             
                    </button>
         | 
| 13 13 | 
             
                  </div>
         | 
| 14 14 | 
             
                  <textarea
         | 
| 15 | 
            -
                    class="w-full bg-gray-800 text-white p-3 rounded-lg focus:ring-2 focus:ring-indigo-500 focus:outline-none min-h-[100px] overflow- | 
| 15 | 
            +
                    class="w-full bg-gray-800 text-white p-3 rounded-lg focus:ring-2 focus:ring-indigo-500 focus:outline-none min-h-[100px] overflow-y-auto resize-none break-words"
         | 
| 16 16 | 
             
                    name="prompt[system_prompt]"
         | 
| 17 17 | 
             
                    data-prompt-autosave-target="input"
         | 
| 18 18 | 
             
                    id="systemPrompt"
         | 
| @@ -31,7 +31,7 @@ | |
| 31 31 | 
             
                    </button>
         | 
| 32 32 | 
             
                  </div>
         | 
| 33 33 | 
             
                  <textarea
         | 
| 34 | 
            -
                    class="w-full bg-gray-800 text-white p-3 rounded-lg focus:ring-2 focus:ring-indigo-500 focus:outline-none min-h-[200px] overflow- | 
| 34 | 
            +
                    class="w-full bg-gray-800 text-white p-3 rounded-lg focus:ring-2 focus:ring-indigo-500 focus:outline-none min-h-[200px] overflow-y-auto resize-none break-words"
         | 
| 35 35 | 
             
                    name="prompt[user_prompt]"
         | 
| 36 36 | 
             
                    data-prompt-autosave-target="input"
         | 
| 37 37 | 
             
                    id="userPrompt"
         | 
| @@ -59,7 +59,7 @@ | |
| 59 59 | 
             
                              Copy
         | 
| 60 60 | 
             
                            </button>
         | 
| 61 61 | 
             
                          </summary>
         | 
| 62 | 
            -
                          <pre class="text-xs text-gray-300 mt-1 whitespace-pre-wrap" id="liquidTag<%= key %>"><%= value.to_s %></pre>
         | 
| 62 | 
            +
                          <pre class="text-xs text-gray-300 mt-1 whitespace-pre-wrap break-words overflow-x-auto max-w-full" id="liquidTag<%= key %>"><%= value.to_s %></pre>
         | 
| 63 63 | 
             
                        </details>
         | 
| 64 64 | 
             
                      <% end %>
         | 
| 65 65 | 
             
                    </div>
         | 
| @@ -77,7 +77,7 @@ | |
| 77 77 | 
             
                        Copy
         | 
| 78 78 | 
             
                      </button>
         | 
| 79 79 | 
             
                    </div>
         | 
| 80 | 
            -
                    <pre class="w-full bg-gray-800 text-white p-3 rounded-lg text-sm whitespace-pre-wrap" id="fullPrompt"><%= Liquid::Template.parse(@selected_prompt.user_prompt).render(@dataset_record.recordable.to_llm_context.stringify_keys) %></pre>
         | 
| 80 | 
            +
                    <pre class="w-full bg-gray-800 text-white p-3 rounded-lg text-sm whitespace-pre-wrap overflow-x-auto break-words max-w-full" id="fullPrompt"><%= Liquid::Template.parse(@selected_prompt.user_prompt).render(@dataset_record.recordable.to_llm_context.stringify_keys) %></pre>
         | 
| 81 81 | 
             
                  </div>
         | 
| 82 82 | 
             
                <% end %>
         | 
| 83 83 | 
             
                <div class="text-sm text-center" data-prompt-autosave-target="status"></div>
         | 
| @@ -102,7 +102,11 @@ | |
| 102 102 | 
             
                      const textareas = textarea ? [textarea] : this.inputTargets
         | 
| 103 103 | 
             
                      textareas.forEach(ta => {
         | 
| 104 104 | 
             
                        ta.style.height = 'auto'
         | 
| 105 | 
            -
                        ta.style.height = ta.scrollHeight + 'px'
         | 
| 105 | 
            +
                        ta.style.height = (ta.scrollHeight + 5) + 'px'
         | 
| 106 | 
            +
                        
         | 
| 107 | 
            +
                        // Ensure horizontal text wrapping
         | 
| 108 | 
            +
                        ta.style.wordBreak = 'break-word'
         | 
| 109 | 
            +
                        ta.style.wordWrap = 'break-word'
         | 
| 106 110 | 
             
                      })
         | 
| 107 111 | 
             
                    }
         | 
| 108 112 |  | 
| @@ -1,4 +1,4 @@ | |
| 1 | 
            -
            <div class="w-1/2 bg-gray-900 border-l border-gray-800 p-5 overflow-y-auto" data-controller="button-loader">
         | 
| 1 | 
            +
            <div class="w-1/2 bg-gray-900 border-l border-gray-800 p-5 overflow-y-auto overflow-x-hidden" data-controller="button-loader">
         | 
| 2 2 | 
             
              <!-- Runner Dropdown -->
         | 
| 3 3 | 
             
              <div class="mb-5">
         | 
| 4 4 | 
             
                <h3 class="text-sm font-semibold mb-2 text-indigo-300">Select Runner</h3>
         | 
| @@ -42,17 +42,17 @@ | |
| 42 42 | 
             
                <% if @dataset_record && (runner_result = @dataset_record.runner_results.last) %>
         | 
| 43 43 | 
             
                  <div class="mb-3">
         | 
| 44 44 | 
             
                    <h4 class="text-xs font-semibold text-indigo-200 mb-1">Ground Truth:</h4>
         | 
| 45 | 
            -
                    <pre class="text-sm text-gray-300 whitespace-pre-wrap bg-gray-700 p-2 rounded"><%=  | 
| 45 | 
            +
                    <pre class="text-sm text-gray-300 whitespace-pre-wrap break-words overflow-x-auto max-w-full bg-gray-700 p-2 rounded"><%= runner_result.ground_truth %></pre>
         | 
| 46 46 | 
             
                  </div>
         | 
| 47 47 | 
             
                  <div>
         | 
| 48 48 | 
             
                    <h4 class="text-xs font-semibold text-indigo-200 mb-1">Raw Prediction:</h4>
         | 
| 49 | 
            -
                    <pre class="text-sm text-gray-300 whitespace-pre-wrap bg-gray-700 p-2 rounded"><%= runner_result.prediction %></pre>
         | 
| 49 | 
            +
                    <pre class="text-sm text-gray-300 whitespace-pre-wrap break-words overflow-x-auto max-w-full bg-gray-700 p-2 rounded"><%= runner_result.prediction %></pre>
         | 
| 50 50 | 
             
                  </div>
         | 
| 51 51 | 
             
                  <% if runner_result.dataset_record.recordable.extract_regex_pattern %>
         | 
| 52 52 | 
             
                    <div>
         | 
| 53 53 | 
             
                      <h4 class="text-xs font-semibold text-indigo-200 my-2 gap-2">Parsed Predictions:  <%= runner_result.dataset_record.recordable.extract_regex_pattern.to_s %></h4>
         | 
| 54 54 | 
             
                      <% runner_result.parsed_predictions.each do |prediction| %>
         | 
| 55 | 
            -
                        <pre class="text-sm text-gray-300 whitespace-pre-wrap bg-gray-700 p-2 rounded mb-2"><%= prediction %></pre>
         | 
| 55 | 
            +
                        <pre class="text-sm text-gray-300 whitespace-pre-wrap break-words overflow-x-auto max-w-full bg-gray-700 p-2 rounded mb-2"><%= prediction %></pre>
         | 
| 56 56 | 
             
                      <% end %>
         | 
| 57 57 | 
             
                    </div>
         | 
| 58 58 | 
             
                  <% end %>
         | 
| @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            # frozen_string_literal: true
         | 
| 2 2 |  | 
| 3 3 | 
             
            class <%= class_name %>Eval < Leva::BaseEval
         | 
| 4 | 
            -
              # @param  | 
| 4 | 
            +
              # @param runner_result [Leva::RunnerResult] The runner result to evaluate
         | 
| 5 5 | 
             
              # @param recordable [YourRecordClass] The recordable object to evaluate
         | 
| 6 6 | 
             
              # @return [Float] The score of the evaluation
         | 
| 7 | 
            -
              def evaluate( | 
| 7 | 
            +
              def evaluate(runner_result, recordable)
         | 
| 8 8 | 
             
                # Implement your evaluation logic here
         | 
| 9 9 | 
             
                # You can access the ground truth using recordable.ground_truth
         | 
| 10 10 |  | 
| 11 11 | 
             
                # Example implementation:
         | 
| 12 | 
            -
                 | 
| 12 | 
            +
                runner_result.parsed_predictions.first == recordable.ground_truth ? 1.0 : 0.0
         | 
| 13 13 | 
             
              end
         | 
| 14 14 | 
             
            end
         | 
| @@ -8,4 +8,29 @@ class <%= class_name %>Run < Leva::BaseRun | |
| 8 8 | 
             
                # This could involve calling an API, running a local model, etc.
         | 
| 9 9 | 
             
                # Return the result of the run to be used to evaluate the model
         | 
| 10 10 | 
             
              end
         | 
| 11 | 
            +
             | 
| 12 | 
            +
              # Uncomment and modify this method to customize parsed predictions
         | 
| 13 | 
            +
              # @param runner_result [Leva::RunnerResult] The runner result to parse
         | 
| 14 | 
            +
              # @return [Array<String>] The parsed predictions
         | 
| 15 | 
            +
              # def parsed_predictions(runner_result)
         | 
| 16 | 
            +
              #   # Example: Extract predictions from XML-like tags
         | 
| 17 | 
            +
              #   runner_result.prediction.scan(/<prediction>(.*?)<\/prediction>/).flatten
         | 
| 18 | 
            +
              # end
         | 
| 19 | 
            +
             | 
| 20 | 
            +
              # Uncomment and modify this method to customize ground truth extraction
         | 
| 21 | 
            +
              # @param runner_result [Leva::RunnerResult] The runner result to get ground truth from
         | 
| 22 | 
            +
              # @return [String] The ground truth for the runner result
         | 
| 23 | 
            +
              # def ground_truth(runner_result)
         | 
| 24 | 
            +
              #   # Example: Extract ground truth from a specific field
         | 
| 25 | 
            +
              #   runner_result.dataset_record.recordable.custom_ground_truth_field
         | 
| 26 | 
            +
              # end
         | 
| 27 | 
            +
             | 
| 28 | 
            +
              # Uncomment and modify this method to customize regex extraction
         | 
| 29 | 
            +
              # @param runner_result [Leva::RunnerResult] The runner result to extract regex from
         | 
| 30 | 
            +
              # @return [Regexp, nil] The regex pattern to use for parsing predictions
         | 
| 31 | 
            +
              # def extract_regex_pattern(runner_result)
         | 
| 32 | 
            +
              #   # Your custom regex extraction logic here
         | 
| 33 | 
            +
              #   # For example:
         | 
| 34 | 
            +
              #   # /\<result\>(.*?)\<\/result\>/
         | 
| 35 | 
            +
              # end
         | 
| 11 36 | 
             
            end
         | 
    
        data/lib/leva/version.rb
    CHANGED
    
    
    
        data/lib/leva.rb
    CHANGED
    
    | @@ -72,8 +72,31 @@ module Leva | |
| 72 72 | 
             
                    dataset_record: dataset_record,
         | 
| 73 73 | 
             
                    prompt: prompt,
         | 
| 74 74 | 
             
                    prediction: result,
         | 
| 75 | 
            +
                    runner_class: self.class.name
         | 
| 75 76 | 
             
                  )
         | 
| 76 77 | 
             
                end
         | 
| 78 | 
            +
             | 
| 79 | 
            +
                # @param runner_result [Leva::RunnerResult] The runner result to parse
         | 
| 80 | 
            +
                # @return [Array<String>] The parsed predictions
         | 
| 81 | 
            +
                def parsed_predictions(runner_result)
         | 
| 82 | 
            +
                  if extract_regex_pattern(runner_result)
         | 
| 83 | 
            +
                    runner_result.prediction.scan(extract_regex_pattern(runner_result)).map { |match| match.first&.strip }.compact
         | 
| 84 | 
            +
                  else
         | 
| 85 | 
            +
                    [runner_result.prediction]
         | 
| 86 | 
            +
                  end
         | 
| 87 | 
            +
                end
         | 
| 88 | 
            +
             | 
| 89 | 
            +
                # @param runner_result [Leva::RunnerResult] The runner result to extract regex from
         | 
| 90 | 
            +
                # @return [Regexp, nil] The regex pattern to use for parsing predictions
         | 
| 91 | 
            +
                def extract_regex_pattern(runner_result)
         | 
| 92 | 
            +
                  runner_result.dataset_record.recordable.extract_regex_pattern if runner_result.dataset_record.recordable.respond_to?(:extract_regex_pattern)
         | 
| 93 | 
            +
                end
         | 
| 94 | 
            +
             | 
| 95 | 
            +
                # @param runner_result [Leva::RunnerResult] The runner result to get ground truth from
         | 
| 96 | 
            +
                # @return [String] The ground truth for the runner result
         | 
| 97 | 
            +
                def ground_truth(runner_result)
         | 
| 98 | 
            +
                  runner_result.dataset_record.ground_truth
         | 
| 99 | 
            +
                end
         | 
| 77 100 | 
             
              end
         | 
| 78 101 |  | 
| 79 102 | 
             
              # Base class for all evaluation implementations in Leva.
         | 
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: leva
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0.1. | 
| 4 | 
            +
              version: 0.1.8
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Kieran Klaassen
         | 
| 8 8 | 
             
            autorequire:
         | 
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date:  | 
| 11 | 
            +
            date: 2025-03-13 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: rails
         | 
| @@ -111,6 +111,7 @@ files: | |
| 111 111 | 
             
            - db/migrate/20240821183153_add_runner_and_evaluator_to_leva_experiments.rb
         | 
| 112 112 | 
             
            - db/migrate/20240821191713_add_actual_result_to_leva_dataset_records.rb
         | 
| 113 113 | 
             
            - db/migrate/20240822143201_remove_actual_result_from_leva_runner_results.rb
         | 
| 114 | 
            +
            - db/migrate/20240912183556_add_runner_class_to_leva_runner_results.rb
         | 
| 114 115 | 
             
            - lib/generators/leva/eval_generator.rb
         | 
| 115 116 | 
             
            - lib/generators/leva/runner_generator.rb
         | 
| 116 117 | 
             
            - lib/generators/leva/templates/eval.rb.erb
         |