ruby_llm-evals 0.0.1 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +180 -8
- data/Rakefile +0 -2
- data/app/assets/stylesheets/ruby_llm/evals/application.css +15 -0
- data/app/assets/stylesheets/ruby_llm/evals/bulma.min.css +3 -0
- data/app/assets/stylesheets/ruby_llm/evals/json_editor.css +25 -0
- data/app/controllers/concerns/ruby_llm/evals/prompt_executions/prompt_execution_scoped.rb +19 -0
- data/app/controllers/ruby_llm/evals/application_controller.rb +14 -0
- data/app/controllers/ruby_llm/evals/prompt_executions/failures_controller.rb +15 -0
- data/app/controllers/ruby_llm/evals/prompt_executions/passages_controller.rb +15 -0
- data/app/controllers/ruby_llm/evals/prompt_executions/retries_controller.rb +16 -0
- data/app/controllers/ruby_llm/evals/prompts_controller.rb +87 -0
- data/app/controllers/ruby_llm/evals/runs_controller.rb +46 -0
- data/app/helpers/ruby_llm/evals/application_helper.rb +39 -0
- data/app/helpers/ruby_llm/evals/prompt_executions_helper.rb +6 -0
- data/app/helpers/ruby_llm/evals/prompts_helper.rb +37 -0
- data/app/helpers/ruby_llm/evals/runs_helper.rb +6 -0
- data/app/javascript/ruby_llm/evals/application.js +3 -0
- data/app/javascript/ruby_llm/evals/controllers/application.js +13 -0
- data/app/javascript/ruby_llm/evals/controllers/eval_type_selector_controller.js +37 -0
- data/app/javascript/ruby_llm/evals/controllers/file_input_controller.js +21 -0
- data/app/javascript/ruby_llm/evals/controllers/index.js +4 -0
- data/app/javascript/ruby_llm/evals/controllers/json_editor_controller.js +129 -0
- data/app/javascript/ruby_llm/evals/controllers/provider_model_controller.js +85 -0
- data/app/javascript/ruby_llm/evals/controllers/schema_selector_controller.js +31 -0
- data/app/jobs/ruby_llm/evals/application_job.rb +6 -0
- data/app/jobs/ruby_llm/evals/execute_sample_job.rb +26 -0
- data/app/jobs/ruby_llm/evals/perform_run_job.rb +21 -0
- data/app/mailers/ruby_llm/evals/application_mailer.rb +8 -0
- data/app/models/concerns/ruby_llm/evals/job_trackable.rb +15 -0
- data/app/models/ruby_llm/evals/application_record.rb +7 -0
- data/app/models/ruby_llm/evals/page.rb +53 -0
- data/app/models/ruby_llm/evals/prompt.rb +55 -0
- data/app/models/ruby_llm/evals/prompt_execution.rb +169 -0
- data/app/models/ruby_llm/evals/run.rb +45 -0
- data/app/models/ruby_llm/evals/sample.rb +20 -0
- data/app/schemas/ruby_llm/evals/judge_verdict_schema.rb +8 -0
- data/app/views/layouts/ruby_llm/evals/application.html.erb +29 -0
- data/app/views/ruby_llm/evals/application/_flashes.html.erb +9 -0
- data/app/views/ruby_llm/evals/application/_nav.html.erb +12 -0
- data/app/views/ruby_llm/evals/application/_pagination.html.erb +7 -0
- data/app/views/ruby_llm/evals/application/_tabs.html.erb +6 -0
- data/app/views/ruby_llm/evals/prompts/_filters.html.erb +15 -0
- data/app/views/ruby_llm/evals/prompts/_form.html.erb +104 -0
- data/app/views/ruby_llm/evals/prompts/_prompt.html.erb +14 -0
- data/app/views/ruby_llm/evals/prompts/compare.html.erb +90 -0
- data/app/views/ruby_llm/evals/prompts/edit.html.erb +5 -0
- data/app/views/ruby_llm/evals/prompts/index.html.erb +32 -0
- data/app/views/ruby_llm/evals/prompts/new.html.erb +5 -0
- data/app/views/ruby_llm/evals/prompts/show.html.erb +107 -0
- data/app/views/ruby_llm/evals/runs/_filters.html.erb +17 -0
- data/app/views/ruby_llm/evals/runs/_run.html.erb +13 -0
- data/app/views/ruby_llm/evals/runs/index.html.erb +30 -0
- data/app/views/ruby_llm/evals/runs/show.html.erb +188 -0
- data/app/views/ruby_llm/evals/samples/_form.html.erb +88 -0
- data/config/importmap.rb +13 -0
- data/config/locales/en.yml +7 -0
- data/config/routes.rb +20 -1
- data/db/migrate/20251022211228_create_ruby_llm_evals_prompts.rb +21 -0
- data/db/migrate/20251022211229_create_ruby_llm_evals_samples.rb +14 -0
- data/db/migrate/20251022211230_create_ruby_llm_evals_runs.rb +21 -0
- data/db/migrate/20251022211231_create_ruby_llm_evals_prompt_executions.rb +26 -0
- data/lib/activemodel/validations/json_validator.rb +14 -0
- data/lib/ruby_llm/evals/engine.rb +49 -1
- data/lib/ruby_llm/evals/version.rb +2 -2
- data/lib/ruby_llm/evals.rb +7 -3
- metadata +65 -6
- /data/lib/tasks/{ruby_llm/evals_tasks.rake → ruby_llm_evals_tasks.rake} +0 -0
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 3ac750c3e2f75518591afb49f7914ba0512f1c8837f76af0d111d1e20707f127
|
|
4
|
+
data.tar.gz: 1430a5bbf5ccaa37bd86fb0065ed472ab331ca3ff5ec78a00687d5b8c0998b8c
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 2ae0ec74f8d651e3f55559767c5757a2b6680d8eb8fbd65a80696236c7e6104e089540c37ece8671b5150b22823f4aafd4ad6183d9584ffb95c9e28fc7f72f4c
|
|
7
|
+
data.tar.gz: 8b751eb6510b03aa7929a446b8435942e206814dd0c71edf4985bd46ff12a58f742b28bd31c251cdb49d38f6915330a204d7c582618e8d01bec4a684ab54ce2e
|
data/README.md
CHANGED
|
@@ -1,10 +1,12 @@
|
|
|
1
|
-
#
|
|
2
|
-
Short description and motivation.
|
|
1
|
+
# RubyLLM::Evals
|
|
3
2
|
|
|
4
|
-
|
|
5
|
-
How to use my plugin.
|
|
3
|
+
Test, compare, and improve your LLM prompts within your Rails application.
|
|
6
4
|
|
|
7
5
|
## Installation
|
|
6
|
+
|
|
7
|
+
> [!NOTE]
|
|
8
|
+
> This engine relies on ActiveJob, ActiveStorage, and [RubyLLM](https://github.com/crmne/ruby_llm). Make sure you have them installed and configured.
|
|
9
|
+
|
|
8
10
|
Add this line to your application's Gemfile:
|
|
9
11
|
|
|
10
12
|
```ruby
|
|
@@ -12,17 +14,187 @@ gem "ruby_llm-evals"
|
|
|
12
14
|
```
|
|
13
15
|
|
|
14
16
|
And then execute:
|
|
17
|
+
|
|
15
18
|
```bash
|
|
16
19
|
$ bundle
|
|
17
20
|
```
|
|
18
21
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
+
To copy and migrate RubyLLM::Evals's migrations, run:
|
|
23
|
+
|
|
24
|
+
```
|
|
25
|
+
$ rails ruby_llm_evals:install:migrations db:migrate
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
And then mount the engine in your `config/routes.rb`:
|
|
29
|
+
|
|
30
|
+
```ruby
|
|
31
|
+
Rails.application.routes.draw do
|
|
32
|
+
# ...
|
|
33
|
+
|
|
34
|
+
mount RubyLLM::Evals::Engine, at: "/evals"
|
|
35
|
+
end
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Now you should be able to browse to `/evals` and create, test, compare, and improve your LLM prompts. Continue reading to see what a typical workflow looks like, and how you can leverage your app's data to add samples to your prompts.
|
|
39
|
+
|
|
40
|
+

|
|
41
|
+

|
|
42
|
+

|
|
43
|
+
|
|
44
|
+
### Authentication and authorization
|
|
45
|
+
|
|
46
|
+
RubyLLM::Evals leaves authentication and authorization to the user. If no authentication is enforced, `/evals` will be available to everyone.
|
|
47
|
+
|
|
48
|
+
To enforce authentication, you can use route [constraints](https://guides.rubyonrails.org/routing.html#advanced-constraints), or set up an HTTP Basic auth middleware.
|
|
49
|
+
|
|
50
|
+
For example, if you're using devise, you can do this:
|
|
51
|
+
|
|
52
|
+
```ruby
|
|
53
|
+
# config/routes.rb
|
|
54
|
+
authenticate :user do
|
|
55
|
+
mount RubyLLM::Evals::Engine, at: "/evals"
|
|
56
|
+
end
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
See more examples [here](https://github.com/heartcombo/devise/wiki/How-To%3A-Define-resource-actions-that-require-authentication-using-routes.rb).
|
|
60
|
+
|
|
61
|
+
However, if you're using Rails' default authentication generator, or an authentication solution that doesn't provide constraints, you need to roll your own solution:
|
|
62
|
+
|
|
63
|
+
```ruby
|
|
64
|
+
# config/routes.rb
|
|
65
|
+
constraints ->(request) { Constraints::Auth.authenticated?(request) } do
|
|
66
|
+
mount RubyLLM::Evals::Engine, at: "/evals"
|
|
67
|
+
end
|
|
68
|
+
|
|
69
|
+
# lib/constraints/auth.rb
|
|
70
|
+
class Constraints::Auth
|
|
71
|
+
def self.authenticated?(request)
|
|
72
|
+
cookies = ActionDispatch::Cookies::CookieJar.build(request, request.cookies)
|
|
73
|
+
|
|
74
|
+
Session.find_by id: cookies.signed[:session_id]
|
|
75
|
+
end
|
|
76
|
+
end
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
You can also set up an HTTP Basic auth middleware in the engine:
|
|
80
|
+
|
|
81
|
+
```ruby
|
|
82
|
+
# config/initializers/ruby_llm-evals.rb
|
|
83
|
+
RubyLLM::Evals::Engine.middleware.use(Rack::Auth::Basic) do |username, password|
|
|
84
|
+
ActiveSupport::SecurityUtils.secure_compare(Rails.application.credentials.ruby_llm_evals_username, username) &
|
|
85
|
+
ActiveSupport::SecurityUtils.secure_compare(Rails.application.credentials.ruby_llm_evals_password, password)
|
|
86
|
+
end
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Usage
|
|
90
|
+
|
|
91
|
+
### Workflow
|
|
92
|
+
|
|
93
|
+
A typical workflow looks like this:
|
|
94
|
+
|
|
95
|
+
#### Create a prompt
|
|
96
|
+
|
|
97
|
+
A prompt represents an LLM prompt template with:
|
|
98
|
+
|
|
99
|
+
* Provider: see [available providers](https://github.com/crmne/ruby_llm/tree/main/lib/ruby_llm/providers)
|
|
100
|
+
* Model: see [available models](https://github.com/crmne/ruby_llm/blob/main/lib/ruby_llm/models.json). In case you're selecting a local provider (eg. Ollama), you can enter the model name in a text field.
|
|
101
|
+
* Instructions: optional, the system prompt.
|
|
102
|
+
* Message: message template.
|
|
103
|
+
* Temperature: optional, controls randomness (0.0 to 1.0). Lower values make output more focused and deterministic.
|
|
104
|
+
* Params: optional, additional provider-specific parameters as JSON (e.g., `{"max_tokens": 1000}`).
|
|
105
|
+
* Tools: optional, array of tool class names that the LLM can use (e.g., `["Weather", "Calculator"]`). See how tools are defined in [RubyLLM](https://rubyllm.com/tools/).
|
|
106
|
+
* Schema: optional, a Ruby class name (e.g., `User`) to structure the LLM's response, or use "other" to provide a custom JSON schema in the Schema Other field. See [RubyLLM structured output](https://rubyllm.com/structured-output/).
|
|
107
|
+
|
|
108
|
+
Both the instructions and the message template can contain Liquid tags that are rendered at runtime. To add a variable, enclose its name in double curly braces, e.g. `{{name}}`.
|
|
109
|
+
|
|
110
|
+
> [!NOTE]
|
|
111
|
+
> In order to use a provider, you must have it configured in `config/initializers/ruby_llm.rb` as explained [here](https://rubyllm.com/configuration/#provider-configuration)
|
|
112
|
+
|
|
113
|
+
#### Add samples
|
|
114
|
+
|
|
115
|
+
When creating/editing a prompt you can add samples, where you can define:
|
|
116
|
+
|
|
117
|
+
* Variables: a JSON that contains the values to use when executing the prompt. Eg: `{ "name": "Patricio" }`
|
|
118
|
+
* Eval type: the evaluation criteria: exact match, contains, regex, or human review.
|
|
119
|
+
* Expected output: optional if the eval type is `human`
|
|
120
|
+
* Files: optional attachments.
|
|
121
|
+
|
|
122
|
+
#### Run evaluations
|
|
123
|
+
|
|
124
|
+
Once you have a prompt with its samples, you can run the evaluations. This will enqueue a job that creates a run and executes each sample with the current prompt configuration.
|
|
125
|
+
|
|
126
|
+
The run will save the current prompt configuration for later analysis, such as the current provider/model, instructions, messages, variables, etc.
|
|
127
|
+
|
|
128
|
+
#### Analyze the results
|
|
129
|
+
|
|
130
|
+
You can view the accuracy, cost, and duration of the entire run and each individual prompt execution.
|
|
131
|
+
|
|
132
|
+
If you chose the human review eval type, this is when you review whether each eval passed or not.
|
|
133
|
+
|
|
134
|
+
### Beyond a typical workflow
|
|
135
|
+
|
|
136
|
+
#### Using your data to create prompts/samples
|
|
137
|
+
|
|
138
|
+
Suppose you want to categorize images. You can have a prompt (eg. `image-categorization`) and then add your data to the eval set:
|
|
139
|
+
|
|
140
|
+
```ruby
|
|
141
|
+
prompt = RubyLLM::Evals::Prompt.find_by slug: "image-categorization"
|
|
142
|
+
|
|
143
|
+
Image.where(category: nil).take(50).each do |image|
|
|
144
|
+
sample = prompt.samples.create eval_type: :human_judge
|
|
145
|
+
sample.files.attach image.attachment.blob
|
|
146
|
+
end
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
Then you can iterate over the prompt trying to find the best configuration possible.
|
|
150
|
+
|
|
151
|
+
#### Using the prompt
|
|
152
|
+
|
|
153
|
+
Once you've tested and refined your prompt, you can use it in your application code.
|
|
154
|
+
|
|
155
|
+
Execute prompts by their slug to get a response object with content and metadata:
|
|
156
|
+
|
|
157
|
+
```ruby
|
|
158
|
+
# Simple execution without variables
|
|
159
|
+
response = RubyLLM::Evals::Prompt.execute("image-categorization")
|
|
160
|
+
response.content # => "landscape"
|
|
161
|
+
|
|
162
|
+
# With variables
|
|
163
|
+
response = RubyLLM::Evals::Prompt.execute(
|
|
164
|
+
"text-summarization",
|
|
165
|
+
variables: { "text" => "Long article content here..." }
|
|
166
|
+
)
|
|
167
|
+
response.content # => "Brief summary of the article"
|
|
168
|
+
|
|
169
|
+
# With file attachments
|
|
170
|
+
response = RubyLLM::Evals::Prompt.execute(
|
|
171
|
+
"image-categorization",
|
|
172
|
+
files: [image.attachment.blob]
|
|
173
|
+
)
|
|
174
|
+
response.content # => "person"
|
|
175
|
+
|
|
176
|
+
# Access token counts and metadata
|
|
177
|
+
response = RubyLLM::Evals::Prompt.execute(
|
|
178
|
+
"sentiment-analysis",
|
|
179
|
+
variables: { "text" => "I love this product!" }
|
|
180
|
+
)
|
|
181
|
+
response.content # => "positive"
|
|
182
|
+
response.input_tokens # => 25
|
|
183
|
+
response.output_tokens # => 3
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
You can also execute a prompt directly on a Prompt instance:
|
|
187
|
+
|
|
188
|
+
```ruby
|
|
189
|
+
prompt = RubyLLM::Evals::Prompt.find_by(slug: "sentiment-analysis")
|
|
190
|
+
response = prompt.execute(variables: { "text" => "I love this product!" })
|
|
191
|
+
response.content # => "positive"
|
|
22
192
|
```
|
|
23
193
|
|
|
24
194
|
## Contributing
|
|
25
|
-
|
|
195
|
+
|
|
196
|
+
You can open an issue or a PR on GitHub.
|
|
26
197
|
|
|
27
198
|
## License
|
|
199
|
+
|
|
28
200
|
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
data/Rakefile
CHANGED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* This is a manifest file that'll be compiled into application.css, which will include all the files
|
|
3
|
+
* listed below.
|
|
4
|
+
*
|
|
5
|
+
* Any CSS and SCSS file within this directory, lib/assets/stylesheets, vendor/assets/stylesheets,
|
|
6
|
+
* or any plugin's vendor/assets/stylesheets directory can be referenced here using a relative path.
|
|
7
|
+
*
|
|
8
|
+
* You're free to add application-wide styles to this file and they'll appear at the bottom of the
|
|
9
|
+
* compiled file so the styles you add here take precedence over styles defined in any other CSS/SCSS
|
|
10
|
+
* files in this directory. Styles in this file should be added after the last require_* statement.
|
|
11
|
+
* It is generally better to create a new file per style scope.
|
|
12
|
+
*
|
|
13
|
+
*= require_tree .
|
|
14
|
+
*= require_self
|
|
15
|
+
*/
|