ruby_llm-evals 0.0.1 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +180 -8
  3. data/Rakefile +0 -2
  4. data/app/assets/stylesheets/ruby_llm/evals/application.css +15 -0
  5. data/app/assets/stylesheets/ruby_llm/evals/bulma.min.css +3 -0
  6. data/app/assets/stylesheets/ruby_llm/evals/json_editor.css +25 -0
  7. data/app/controllers/concerns/ruby_llm/evals/prompt_executions/prompt_execution_scoped.rb +19 -0
  8. data/app/controllers/ruby_llm/evals/application_controller.rb +14 -0
  9. data/app/controllers/ruby_llm/evals/prompt_executions/failures_controller.rb +15 -0
  10. data/app/controllers/ruby_llm/evals/prompt_executions/passages_controller.rb +15 -0
  11. data/app/controllers/ruby_llm/evals/prompt_executions/retries_controller.rb +16 -0
  12. data/app/controllers/ruby_llm/evals/prompts_controller.rb +87 -0
  13. data/app/controllers/ruby_llm/evals/runs_controller.rb +46 -0
  14. data/app/helpers/ruby_llm/evals/application_helper.rb +39 -0
  15. data/app/helpers/ruby_llm/evals/prompt_executions_helper.rb +6 -0
  16. data/app/helpers/ruby_llm/evals/prompts_helper.rb +37 -0
  17. data/app/helpers/ruby_llm/evals/runs_helper.rb +6 -0
  18. data/app/javascript/ruby_llm/evals/application.js +3 -0
  19. data/app/javascript/ruby_llm/evals/controllers/application.js +13 -0
  20. data/app/javascript/ruby_llm/evals/controllers/eval_type_selector_controller.js +37 -0
  21. data/app/javascript/ruby_llm/evals/controllers/file_input_controller.js +21 -0
  22. data/app/javascript/ruby_llm/evals/controllers/index.js +4 -0
  23. data/app/javascript/ruby_llm/evals/controllers/json_editor_controller.js +129 -0
  24. data/app/javascript/ruby_llm/evals/controllers/provider_model_controller.js +85 -0
  25. data/app/javascript/ruby_llm/evals/controllers/schema_selector_controller.js +31 -0
  26. data/app/jobs/ruby_llm/evals/application_job.rb +6 -0
  27. data/app/jobs/ruby_llm/evals/execute_sample_job.rb +26 -0
  28. data/app/jobs/ruby_llm/evals/perform_run_job.rb +21 -0
  29. data/app/mailers/ruby_llm/evals/application_mailer.rb +8 -0
  30. data/app/models/concerns/ruby_llm/evals/job_trackable.rb +15 -0
  31. data/app/models/ruby_llm/evals/application_record.rb +7 -0
  32. data/app/models/ruby_llm/evals/page.rb +53 -0
  33. data/app/models/ruby_llm/evals/prompt.rb +55 -0
  34. data/app/models/ruby_llm/evals/prompt_execution.rb +169 -0
  35. data/app/models/ruby_llm/evals/run.rb +45 -0
  36. data/app/models/ruby_llm/evals/sample.rb +20 -0
  37. data/app/schemas/ruby_llm/evals/judge_verdict_schema.rb +8 -0
  38. data/app/views/layouts/ruby_llm/evals/application.html.erb +29 -0
  39. data/app/views/ruby_llm/evals/application/_flashes.html.erb +9 -0
  40. data/app/views/ruby_llm/evals/application/_nav.html.erb +12 -0
  41. data/app/views/ruby_llm/evals/application/_pagination.html.erb +7 -0
  42. data/app/views/ruby_llm/evals/application/_tabs.html.erb +6 -0
  43. data/app/views/ruby_llm/evals/prompts/_filters.html.erb +15 -0
  44. data/app/views/ruby_llm/evals/prompts/_form.html.erb +104 -0
  45. data/app/views/ruby_llm/evals/prompts/_prompt.html.erb +14 -0
  46. data/app/views/ruby_llm/evals/prompts/compare.html.erb +90 -0
  47. data/app/views/ruby_llm/evals/prompts/edit.html.erb +5 -0
  48. data/app/views/ruby_llm/evals/prompts/index.html.erb +32 -0
  49. data/app/views/ruby_llm/evals/prompts/new.html.erb +5 -0
  50. data/app/views/ruby_llm/evals/prompts/show.html.erb +107 -0
  51. data/app/views/ruby_llm/evals/runs/_filters.html.erb +17 -0
  52. data/app/views/ruby_llm/evals/runs/_run.html.erb +13 -0
  53. data/app/views/ruby_llm/evals/runs/index.html.erb +30 -0
  54. data/app/views/ruby_llm/evals/runs/show.html.erb +188 -0
  55. data/app/views/ruby_llm/evals/samples/_form.html.erb +88 -0
  56. data/config/importmap.rb +13 -0
  57. data/config/locales/en.yml +7 -0
  58. data/config/routes.rb +20 -1
  59. data/db/migrate/20251022211228_create_ruby_llm_evals_prompts.rb +21 -0
  60. data/db/migrate/20251022211229_create_ruby_llm_evals_samples.rb +14 -0
  61. data/db/migrate/20251022211230_create_ruby_llm_evals_runs.rb +21 -0
  62. data/db/migrate/20251022211231_create_ruby_llm_evals_prompt_executions.rb +26 -0
  63. data/lib/activemodel/validations/json_validator.rb +14 -0
  64. data/lib/ruby_llm/evals/engine.rb +49 -1
  65. data/lib/ruby_llm/evals/version.rb +2 -2
  66. data/lib/ruby_llm/evals.rb +7 -3
  67. metadata +65 -6
  68. /data/lib/tasks/{ruby_llm/evals_tasks.rake → ruby_llm_evals_tasks.rake} +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 85722601415fbabae3058f547a2cb814db4b6c4895bbc31d5eaac0fd6af68fb9
4
- data.tar.gz: 732663d65dfb1707a1bcc4188ed0f5d98ac062fba7f02ae018037d8b7e0129c1
3
+ metadata.gz: 3ac750c3e2f75518591afb49f7914ba0512f1c8837f76af0d111d1e20707f127
4
+ data.tar.gz: 1430a5bbf5ccaa37bd86fb0065ed472ab331ca3ff5ec78a00687d5b8c0998b8c
5
5
  SHA512:
6
- metadata.gz: 32246c8e3d33f6230893a295c4a7dc347d81634a52d83b607b455c1522a55609ad22d866b636cbde7838fe54eb58ae8e7dbc29b98853aeba7859488de021a097
7
- data.tar.gz: efa95f8b3f32cdde9b1fe40e649c9c028d73f85a31024a219903104dba0032a4ec47fd991c42a7bf0496d2325609ba3d0c3151472f6ba52e3a2c5b9bbf32a7df
6
+ metadata.gz: 2ae0ec74f8d651e3f55559767c5757a2b6680d8eb8fbd65a80696236c7e6104e089540c37ece8671b5150b22823f4aafd4ad6183d9584ffb95c9e28fc7f72f4c
7
+ data.tar.gz: 8b751eb6510b03aa7929a446b8435942e206814dd0c71edf4985bd46ff12a58f742b28bd31c251cdb49d38f6915330a204d7c582618e8d01bec4a684ab54ce2e
data/README.md CHANGED
@@ -1,10 +1,12 @@
1
- # RubyLlm::Evals
2
- Short description and motivation.
1
+ # RubyLLM::Evals
3
2
 
4
- ## Usage
5
- How to use my plugin.
3
+ Test, compare, and improve your LLM prompts within your Rails application.
6
4
 
7
5
  ## Installation
6
+
7
+ > [!NOTE]
8
+ > This engine relies on ActiveJob, ActiveStorage, and [RubyLLM](https://github.com/crmne/ruby_llm). Make sure you have them installed and configured.
9
+
8
10
  Add this line to your application's Gemfile:
9
11
 
10
12
  ```ruby
@@ -12,17 +14,187 @@ gem "ruby_llm-evals"
12
14
  ```
13
15
 
14
16
  And then execute:
17
+
15
18
  ```bash
16
19
  $ bundle
17
20
  ```
18
21
 
19
- Or install it yourself as:
20
- ```bash
21
- $ gem install ruby_llm-evals
22
+ To copy and migrate RubyLLM::Evals's migrations, run:
23
+
24
+ ```
25
+ $ rails ruby_llm_evals:install:migrations db:migrate
26
+ ```
27
+
28
+ And then mount the engine in your `config/routes.rb`:
29
+
30
+ ```ruby
31
+ Rails.application.routes.draw do
32
+ # ...
33
+
34
+ mount RubyLLM::Evals::Engine, at: "/evals"
35
+ end
36
+ ```
37
+
38
+ Now you should be able to browse to `/evals` and create, test, compare, and improve your LLM prompts. Continue reading to see what a typical workflow looks like, and how you can leverage your app's data to add samples to your prompts.
39
+
40
+ ![prompts](./assets/prompts.png)
41
+ ![runs](./assets/runs.png)
42
+ ![run](./assets/run.png)
43
+
44
+ ### Authentication and authorization
45
+
46
+ RubyLLM::Evals leaves authentication and authorization to the user. If no authentication is enforced, `/evals` will be available to everyone.
47
+
48
+ To enforce authentication, you can use route [constraints](https://guides.rubyonrails.org/routing.html#advanced-constraints), or set up an HTTP Basic auth middleware.
49
+
50
+ For example, if you're using devise, you can do this:
51
+
52
+ ```ruby
53
+ # config/routes.rb
54
+ authenticate :user do
55
+ mount RubyLLM::Evals::Engine, at: "/evals"
56
+ end
57
+ ```
58
+
59
+ See more examples [here](https://github.com/heartcombo/devise/wiki/How-To%3A-Define-resource-actions-that-require-authentication-using-routes.rb).
60
+
61
+ However, if you're using Rails' default authentication generator, or an authentication solution that doesn't provide constraints, you need to roll your own solution:
62
+
63
+ ```ruby
64
+ # config/routes.rb
65
+ constraints ->(request) { Constraints::Auth.authenticated?(request) } do
66
+ mount RubyLLM::Evals::Engine, at: "/evals"
67
+ end
68
+
69
+ # lib/constraints/auth.rb
70
+ class Constraints::Auth
71
+ def self.authenticated?(request)
72
+ cookies = ActionDispatch::Cookies::CookieJar.build(request, request.cookies)
73
+
74
+ Session.find_by id: cookies.signed[:session_id]
75
+ end
76
+ end
77
+ ```
78
+
79
+ You can also set up an HTTP Basic auth middleware in the engine:
80
+
81
+ ```ruby
82
+ # config/initializers/ruby_llm-evals.rb
83
+ RubyLLM::Evals::Engine.middleware.use(Rack::Auth::Basic) do |username, password|
84
+ ActiveSupport::SecurityUtils.secure_compare(Rails.application.credentials.ruby_llm_evals_username, username) &
85
+ ActiveSupport::SecurityUtils.secure_compare(Rails.application.credentials.ruby_llm_evals_password, password)
86
+ end
87
+ ```
88
+
89
+ ## Usage
90
+
91
+ ### Workflow
92
+
93
+ A typical workflow looks like this:
94
+
95
+ #### Create a prompt
96
+
97
+ A prompt represents an LLM prompt template with:
98
+
99
+ * Provider: see [available providers](https://github.com/crmne/ruby_llm/tree/main/lib/ruby_llm/providers)
100
+ * Model: see [available models](https://github.com/crmne/ruby_llm/blob/main/lib/ruby_llm/models.json). If you select a local provider (e.g., Ollama), you can enter the model name in a text field.
101
+ * Instructions: optional, the system prompt.
102
+ * Message: message template.
103
+ * Temperature: optional, controls randomness (0.0 to 1.0). Lower values make output more focused and deterministic.
104
+ * Params: optional, additional provider-specific parameters as JSON (e.g., `{"max_tokens": 1000}`).
105
+ * Tools: optional, array of tool class names that the LLM can use (e.g., `["Weather", "Calculator"]`). See how tools are defined in [RubyLLM](https://rubyllm.com/tools/).
106
+ * Schema: optional, a Ruby class name (e.g., `User`) to structure the LLM's response, or use "other" to provide a custom JSON schema in the Schema Other field. See [RubyLLM structured output](https://rubyllm.com/structured-output/).
107
+
108
+ Both the instructions and the message template can contain Liquid tags that will be rendered at runtime. To add variables, enclose them in double braces, e.g., `{{name}}`.
109
+
110
+ > [!NOTE]
111
+ > In order to use a provider, you must have it configured in `config/initializers/ruby_llm.rb` as explained [here](https://rubyllm.com/configuration/#provider-configuration)
112
+
113
+ #### Add samples
114
+
115
+ When creating/editing a prompt you can add samples, where you can define:
116
+
117
+ * Variables: a JSON that contains the values to use when executing the prompt. Eg: `{ "name": "Patricio" }`
118
+ * Eval type: the evaluation criteria: exact match, contains, regex, or human review.
119
+ * Expected output: optional if the eval type is `human`
120
+ * Files: optional attachments.
121
+
122
+ #### Run evaluations
123
+
124
+ Once you have a prompt with its samples, you can run the evaluations. This will enqueue a job that creates a run and executes each sample with the current prompt configuration.
125
+
126
+ The run will save the current prompt configuration for later analysis, such as the current provider/model, instructions, messages, variables, etc.
127
+
128
+ #### Analyze the results
129
+
130
+ You can view the accuracy, cost, and duration of the entire run and each individual prompt execution.
131
+
132
+ If you chose the human review eval type, this is when you review whether each eval passed or not.
133
+
134
+ ### Beyond a typical workflow
135
+
136
+ #### Using your data to create prompts/samples
137
+
138
+ Suppose you want to categorize images. You can have a prompt (eg. `image-categorization`) and then add your data to the eval set:
139
+
140
+ ```ruby
141
+ prompt = RubyLLM::Evals::Prompt.find_by slug: "image-categorization"
142
+
143
+ Image.where(category: nil).take(50).each do |image|
144
+ sample = prompt.samples.create eval_type: :human_judge
145
+ sample.files.attach image.attachment.blob
146
+ end
147
+ ```
148
+
149
+ Then you can iterate over the prompt trying to find the best configuration possible.
150
+
151
+ #### Using the prompt
152
+
153
+ Once you've tested and refined your prompt, you can use it in your application code.
154
+
155
+ Execute prompts by their slug to get a response object with content and metadata:
156
+
157
+ ```ruby
158
+ # Simple execution without variables
159
+ response = RubyLLM::Evals::Prompt.execute("image-categorization")
160
+ response.content # => "landscape"
161
+
162
+ # With variables
163
+ response = RubyLLM::Evals::Prompt.execute(
164
+ "text-summarization",
165
+ variables: { "text" => "Long article content here..." }
166
+ )
167
+ response.content # => "Brief summary of the article"
168
+
169
+ # With file attachments
170
+ response = RubyLLM::Evals::Prompt.execute(
171
+ "image-categorization",
172
+ files: [image.attachment.blob]
173
+ )
174
+ response.content # => "person"
175
+
176
+ # Access token counts and metadata
177
+ response = RubyLLM::Evals::Prompt.execute(
178
+ "sentiment-analysis",
179
+ variables: { "text" => "I love this product!" }
180
+ )
181
+ response.content # => "positive"
182
+ response.input_tokens # => 25
183
+ response.output_tokens # => 3
184
+ ```
185
+
186
+ You can also execute a prompt directly on a Prompt instance:
187
+
188
+ ```ruby
189
+ prompt = RubyLLM::Evals::Prompt.find_by(slug: "sentiment-analysis")
190
+ response = prompt.execute(variables: { "text" => "I love this product!" })
191
+ response.content # => "positive"
22
192
  ```
23
193
 
24
194
  ## Contributing
25
- Contribution directions go here.
195
+
196
+ You can open an issue or a PR on GitHub.
26
197
 
27
198
  ## License
199
+
28
200
  The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/Rakefile CHANGED
@@ -3,6 +3,4 @@ require "bundler/setup"
3
3
  APP_RAKEFILE = File.expand_path("test/dummy/Rakefile", __dir__)
4
4
  load "rails/tasks/engine.rake"
5
5
 
6
- load "rails/tasks/statistics.rake"
7
-
8
6
  require "bundler/gem_tasks"
@@ -0,0 +1,15 @@
1
+ /*
2
+ * This is a manifest file that'll be compiled into application.css, which will include all the files
3
+ * listed below.
4
+ *
5
+ * Any CSS and SCSS file within this directory, lib/assets/stylesheets, vendor/assets/stylesheets,
6
+ * or any plugin's vendor/assets/stylesheets directory can be referenced here using a relative path.
7
+ *
8
+ * You're free to add application-wide styles to this file and they'll appear at the bottom of the
9
+ * compiled file so the styles you add here take precedence over styles defined in any other CSS/SCSS
10
+ * files in this directory. Styles in this file should be added after the last require_* statement.
11
+ * It is generally better to create a new file per style scope.
12
+ *
13
+ *= require_tree .
14
+ *= require_self
15
+ */