evals 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.env.example +1 -0
- data/README.md +8 -2
- data/examples/demo.rb +130 -0
- data/lib/evals/prompt_evaluator.rb +566 -0
- data/lib/evals/version.rb +1 -1
- data/lib/evals.rb +1 -0
- metadata +19 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d605671c8627234dfb82834acb733e308b2cf9dc8f4ec1c9d3335d6cb63a42ab
+  data.tar.gz: fbff5f3dabd7f3721a09f068318d7912c642b75811673789bbee50bb1fbec31d
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d86068f9647b5a8f32dfc71f914f5e5afa74a98692c2d264a3c684ea9f9c096453767a5e0072adbe8e7d3d9440a35280e69de1e1165d22ef27054f6ad2ddc5d4
+  data.tar.gz: 5d4086f3a135e70d7e47f52b81ff9f411e0dc21bf47128ca0be9c14f417c98329f40ed004f90c7528951ce02cd6193ceb29b2ca16b2b6ea9c9c1ef54df6ed756
data/.env.example
ADDED
@@ -0,0 +1 @@
+ANTHROPIC_API_KEY=
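For context on how this key is consumed: the new demo and Evals::PromptEvaluator (both added below) read it from the environment. A minimal sketch of that wiring, reusing only names that appear elsewhere in this release:

```ruby
# Sketch only, not part of the gem: mirrors the setup in data/examples/demo.rb.
require "dotenv/load" # loads .env (including ANTHROPIC_API_KEY) into ENV
require "anthropic"

client = Anthropic::Client.new(api_key: ENV["ANTHROPIC_API_KEY"])
```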
data/README.md
CHANGED
@@ -1,6 +1,8 @@
 # Evals
 
-A library for LLM
+A Ruby library for evaluating LLM responses.
+
+Based on the Prompt Evaluation example in the [Anthropic Skilljar course](https://anthropic.skilljar.com/claude-with-the-anthropic-api).
 
 ## Installation
 
@@ -16,9 +18,13 @@ If bundler is not being used to manage dependencies, install the gem by executin
 gem install evals
 ```
 
+## Configuration
+
+Copy `.env.example` to `.env` and add your Anthropic API key.
+
 ## Usage
 
-
+See [examples/demo.rb](examples/demo.rb) for an example.
 
 ## Development
 
data/examples/demo.rb
ADDED
@@ -0,0 +1,130 @@
+#!/usr/bin/env ruby
+
+require "bundler/setup"
+require "json"
+require "dotenv/load"
+require "anthropic"
+require "evals"
+
+# Client Initialization and helper functions
+Dotenv.load
+
+CLIENT = Anthropic::Client.new(api_key: ENV["ANTHROPIC_API_KEY"])
+MODEL = "claude-3-5-haiku-latest"
+
+def add_user_message(messages, text)
+  messages << {role: "user", content: text}
+end
+
+def add_assistant_message(messages, text)
+  messages << {role: "assistant", content: text}
+end
+
+def chat(messages, system: nil, temperature: 1.0, stop_sequences: [])
+  params = {
+    model: MODEL,
+    max_tokens: 1000,
+    messages: messages,
+    temperature: temperature,
+    stop_sequences: stop_sequences
+  }
+
+  params[:system] = system if system
+
+  response = CLIENT.messages.create(params)
+  response.content[0].text
+end
+
+# Create an instance of Evals::PromptEvaluator
+# Increase `max_concurrent_tasks` for greater concurrency, but beware of rate limit errors!
+evaluator = Evals::PromptEvaluator.new(max_concurrent_tasks: 1)
+
+# Generate dataset
+evaluator.generate_dataset(
+  # Describe the purpose or goal of the prompt you're trying to test
+  "Write a compact, concise 1 day meal plan for a single athlete",
+  # Describe the different inputs that your prompt requires
+  prompt_inputs_spec: {
+    "height" => "Athlete's height in cm",
+    "weight" => "Athlete's weight in kg",
+    "goal" => "Goal of the athlete",
+    "restrictions" => "Dietary restrictions of the athlete"
+  },
+  # Where to write the generated dataset
+  output_file: "dataset.json",
+  # Number of test cases to generate (recommend keeping this low if you're getting rate limit errors)
+  num_cases: 1
+)
+
+# Define and run the prompt you want to evaluate, returning the raw model output
+# This function is executed once for each test case
+def run_prompt(prompt_inputs)
+  prompt = <<~PROMPT
+    Generate a one-day meal plan for an athlete that meets their dietary restrictions.
+
+    <athlete_information>
+    - Height: #{prompt_inputs["height"]}
+    - Weight: #{prompt_inputs["weight"]}
+    - Goal: #{prompt_inputs["goal"]}
+    - Dietary restrictions: #{prompt_inputs["restrictions"]}
+    </athlete_information>
+
+    Guidelines:
+    1. Include accurate daily calorie amount
+    2. Show protein, fat, and carb amounts
+    3. Specify when to eat each meal
+    4. Use only foods that fit restrictions
+    5. List all portion sizes in grams
+    6. Keep budget-friendly if mentioned
+
+    Here is an example with a sample input and an ideal output:
+    <sample_input>
+    height: 170
+    weight: 70
+    goal: Maintain fitness and improve cholesterol levels
+    restrictions: High cholesterol
+    </sample_input>
+    <ideal_output>
+    Here is a one-day meal plan for an athlete aiming to maintain fitness and improve cholesterol levels:
+
+    * **Calorie Target:** Approximately 2500 calories
+    * **Macronutrient Breakdown:** Protein (140g), Fat (70g), Carbs (340g)
+
+    **Meal Plan:**
+
+    * **Breakfast (7:00 AM):** Oatmeal (80g dry weight) with berries (100g) and walnuts (15g). Skim milk (240g).
+      * Protein: 15g, Fat: 15g, Carbs: 60g
+    * **Mid-Morning Snack (10:00 AM):** Apple (150g) with almond butter (30g).
+      * Protein: 7g, Fat: 18g, Carbs: 25g
+    * **Lunch (1:00 PM):** Grilled chicken breast (120g) salad with mixed greens (150g), cucumber (50g), tomato (50g), and a light vinaigrette dressing (30g). Whole wheat bread (60g).
+      * Protein: 40g, Fat: 15g, Carbs: 70g
+    * **Afternoon Snack (4:00 PM):** Greek yogurt (170g, non-fat) with a banana (120g).
+      * Protein: 20g, Fat: 0g, Carbs: 40g
+    * **Dinner (7:00 PM):** Baked salmon (140g) with steamed broccoli (200g) and quinoa (75g dry weight).
+      * Protein: 40g, Fat: 20g, Carbs: 80g
+    * **Evening Snack (9:00 PM):** Small handful of almonds (20g).
+      * Protein: 8g, Fat: 12g, Carbs: 15g
+
+    This meal plan prioritizes lean protein sources, whole grains, fruits, and vegetables, while limiting saturated and trans fats to support healthy cholesterol levels.
+    </ideal_output>
+    This example meal plan is well-structured, provides detailed information on food choices and quantities, and aligns with the athlete's goals and restrictions.
+  PROMPT
+
+  messages = []
+  add_user_message(messages, prompt)
+  chat(messages)
+end
+
+# Run evaluation
+evaluator.run_evaluation(
+  method(:run_prompt),
+  "dataset.json",
+  extra_criteria: <<~CRITERIA
+    The output should include:
+    - Daily caloric total
+    - Macronutrient breakdown
+    - Meals with exact foods, portions, and timing
+  CRITERIA
+)
+
+puts "Evaluation completed. Results saved to output.json and output.html"
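The demo's final line points at output.json and output.html. Going by the result hash that Evals::PromptEvaluator#run_test_case assembles (see the file below), a hedged sketch for inspecting the JSON results afterwards could be:

```ruby
require "json"

# Sketch only: assumes the keys written by run_test_case in this release
# ("test_case", "score", "reasoning") and the "scenario" field set per test case.
results = JSON.parse(File.read("output.json"))
results.each do |result|
  puts "#{result["test_case"]["scenario"]} scored #{result["score"]}: #{result["reasoning"]}"
end
```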
data/lib/evals/prompt_evaluator.rb
ADDED
@@ -0,0 +1,566 @@
+require "json"
+require "net/http"
+require "uri"
+
+module Evals
+  class PromptEvaluator
+    def initialize(max_concurrent_tasks: 3)
+      @max_concurrent_tasks = max_concurrent_tasks
+      @client = Anthropic::Client.new(api_key: ENV["ANTHROPIC_API_KEY"])
+      @model = "claude-3-5-haiku-latest"
+    end
+
+    def render(template_string, variables)
+      placeholders = template_string.scan(/{([^{}]+)}/)
+
+      result = template_string
+      placeholders.flatten.each do |placeholder|
+        if variables.key?(placeholder)
+          result = result.gsub("{#{placeholder}}", variables[placeholder].to_s)
+        end
+      end
+
+      result.gsub("{{", "{").gsub("}}", "}")
+    end
+
+    def add_user_message(messages, text)
+      messages << {role: "user", content: text}
+    end
+
+    def add_assistant_message(messages, text)
+      messages << {role: "assistant", content: text}
+    end
+
+    def chat(messages, system: nil, temperature: 1.0, stop_sequences: [])
+      params = {
+        model: @model,
+        max_tokens: 1000,
+        messages: messages,
+        temperature: temperature,
+        stop_sequences: stop_sequences
+      }
+
+      params[:system] = system if system
+
+      response = @client.messages.create(params)
+      response.content[0].text
+    end
+
+    def generate_unique_ideas(task_description, prompt_inputs_spec, num_cases)
+      prompt = <<~TEXT
+        Generate #{num_cases} unique, diverse ideas for testing a prompt that accomplishes this task:
+
+        <task_description>
+        #{task_description}
+        </task_description>
+
+        The prompt will receive the following inputs
+        <prompt_inputs>
+        #{prompt_inputs_spec}
+        </prompt_inputs>
+
+        Each idea should represent a distinct scenario or example that tests different aspects of the task.
+
+        Output Format:
+        Provide your response as a structured JSON array where each item is a brief description of the idea.
+
+        Example:
+        ```json
+        [
+          "Testing with technical computer science terminology",
+          "Testing with medical research findings",
+          "Testing with complex mathematical concepts",
+          ...
+        ]
+        ```
+
+        Ensure each idea is:
+        - Clearly distinct from the others
+        - Relevant to the task description
+        - Specific enough to guide generation of a full test case
+        - Quick to solve without requiring extensive computation or multi-step processing
+        - Solvable with no more than 400 tokens of output
+
+        Remember, only generate #{num_cases} unique ideas
+      TEXT
+
+      system_prompt = "You are a test scenario designer specialized in creating diverse, unique testing scenarios."
+
+      example_prompt_inputs = ""
+      prompt_inputs_spec.each do |key, value|
+        val = value.gsub("\n", "\\n")
+        example_prompt_inputs += "\"#{key}\": str # #{val},"
+      end
+
+      rendered_prompt = render(
+        prompt.strip,
+        {
+          "task_description" => task_description,
+          "num_cases" => num_cases,
+          "prompt_inputs" => example_prompt_inputs
+        }
+      )
+
+      messages = []
+      add_user_message(messages, rendered_prompt)
+      add_assistant_message(messages, "```json")
+      text = chat(
+        messages,
+        stop_sequences: ["```"],
+        system: system_prompt,
+        temperature: 1.0
+      )
+
+      JSON.parse(text)
+    end
+
+    def generate_test_case(task_description, idea, prompt_inputs_spec = {})
+      example_prompt_inputs = ""
+      prompt_inputs_spec.each do |key, value|
+        val = value.gsub("\n", "\\n")
+        example_prompt_inputs += "\"#{key}\": \"EXAMPLE_VALUE\", // #{val}\n"
+      end
+
+      allowed_keys = prompt_inputs_spec.keys.map { |key| "\"#{key}\"" }.join(", ")
+
+      prompt = <<~TEXT
+        Generate a single detailed test case for a prompt evaluation based on:
+
+        <task_description>
+        #{task_description}
+        </task_description>
+
+        <specific_idea>
+        #{idea}
+        </specific_idea>
+
+        <allowed_input_keys>
+        #{allowed_keys}
+        </allowed_input_keys>
+
+        Output Format:
+        ```json
+        {
+          "prompt_inputs": {
+            #{example_prompt_inputs}
+          },
+          "solution_criteria": ["criterion 1", "criterion 2", ...] // Concise list of criteria for evaluating the solution, 1 to 4 items
+        }
+        ```
+
+        IMPORTANT REQUIREMENTS:
+        - You MUST ONLY use these exact input keys in your prompt_inputs: #{allowed_keys}
+        - Do NOT add any additional keys to prompt_inputs
+        - All keys listed in allowed_input_keys must be included in your response
+        - Make the test case realistic and practically useful
+        - Include measurable, concise solution criteria
+        - The solution criteria should ONLY address the direct requirements of the task description and the generated prompt_inputs
+        - Avoid over-specifying criteria with requirements that go beyond the core task
+        - Keep solution criteria simple, focused, and directly tied to the fundamental task
+        - The test case should be tailored to the specific idea provided
+        - Quick to solve without requiring extensive computation or multi-step processing
+        - Solvable with no more than 400 tokens of output
+        - DO NOT include any fields beyond those specified in the output format
+      TEXT
+
+      system_prompt = "You are a test case creator specializing in designing evaluation scenarios."
+
+      rendered_prompt = render(
+        prompt.strip,
+        {
+          "allowed_keys" => allowed_keys,
+          "task_description" => task_description,
+          "idea" => idea,
+          "example_prompt_inputs" => example_prompt_inputs
+        }
+      )
+
+      messages = []
+      add_user_message(messages, rendered_prompt)
+      add_assistant_message(messages, "```json")
+      text = chat(
+        messages,
+        stop_sequences: ["```"],
+        system: system_prompt,
+        temperature: 0.7
+      )
+
+      test_case = JSON.parse(text)
+      test_case["task_description"] = task_description
+      test_case["scenario"] = idea
+
+      test_case
+    end
+
+    def generate_dataset(task_description, prompt_inputs_spec: {}, num_cases: 1, output_file: "dataset.json")
+      ideas = generate_unique_ideas(task_description, prompt_inputs_spec, num_cases)
+
+      dataset = []
+      completed = 0
+      total = ideas.length
+      last_reported_percentage = 0
+
+      threads = ideas.map do |idea|
+        Thread.new do
+          generate_test_case(task_description, idea, prompt_inputs_spec)
+        end
+      end
+
+      threads.each do |thread|
+        result = thread.value
+        completed += 1
+        current_percentage = ((completed.to_f / total) * 100).to_i
+        milestone_percentage = (current_percentage / 20) * 20
+
+        if milestone_percentage > last_reported_percentage
+          puts "Generated #{completed}/#{total} test cases"
+          last_reported_percentage = milestone_percentage
+        end
+
+        dataset << result
+      rescue => e
+        puts "Error generating test case: #{e}"
+      end
+
+      File.write(output_file, JSON.pretty_generate(dataset))
+      dataset
+    end
+
+    def grade_output(test_case, output, extra_criteria)
+      prompt_inputs = ""
+      test_case["prompt_inputs"].each do |key, value|
+        val = value.gsub("\n", "\\n")
+        prompt_inputs += "\"#{key}\":\"#{val}\",\n"
+      end
+
+      extra_criteria_section = ""
+      if extra_criteria
+        extra_criteria_template = <<~TEXT
+          Mandatory Requirements - ANY VIOLATION MEANS AUTOMATIC FAILURE (score of 3 or lower):
+          <extra_important_criteria>
+          #{extra_criteria}
+          </extra_important_criteria>
+        TEXT
+        extra_criteria_section = render(
+          extra_criteria_template.strip,
+          {"extra_criteria" => extra_criteria}
+        )
+      end
+
+      eval_template = <<~TEXT
+        Your task is to evaluate the following AI-generated solution with EXTREME RIGOR.
+
+        Original task description:
+        <task_description>
+        #{test_case["task_description"]}
+        </task_description>
+
+        Original task inputs:
+        <task_inputs>
+        { #{prompt_inputs} }
+        </task_inputs>
+
+        Solution to Evaluate:
+        <solution>
+        #{output}
+        </solution>
+
+        Criteria you should use to evaluate the solution:
+        <criteria>
+        #{test_case["solution_criteria"].join("\n")}
+        </criteria>
+
+        #{extra_criteria_section}
+
+        Scoring Guidelines:
+        * Score 1-3: Solution fails to meet one or more MANDATORY requirements
+        * Score 4-6: Solution meets all mandatory requirements but has significant deficiencies in secondary criteria
+        * Score 7-8: Solution meets all mandatory requirements and most secondary criteria, with minor issues
+        * Score 9-10: Solution meets all mandatory and secondary criteria
+
+        IMPORTANT SCORING INSTRUCTIONS:
+        * Grade the output based ONLY on the listed criteria. Do not add your own extra requirements.
+        * If a solution meets all of the mandatory and secondary criteria give it a 10
+        * Don't complain that the solution "only" meets the mandatory and secondary criteria. Solutions shouldn't go above and beyond - they should meet the exact listed criteria.
+        * ANY violation of a mandatory requirement MUST result in a score of 3 or lower
+        * The full 1-10 scale should be utilized - don't hesitate to give low scores when warranted
+
+        Output Format
+        Provide your evaluation as a structured JSON object with the following fields, in this specific order:
+        - "strengths": An array of 1-3 key strengths
+        - "weaknesses": An array of 1-3 key areas for improvement
+        - "reasoning": A concise explanation of your overall assessment
+        - "score": A number between 1-10
+
+        Respond with JSON. Keep your response concise and direct.
+      TEXT
+
+      eval_prompt = render(
+        eval_template.strip,
+        {
+          "task_description" => test_case["task_description"],
+          "prompt_inputs" => prompt_inputs,
+          "output" => output,
+          "solution_criteria" => test_case["solution_criteria"].join("\n"),
+          "extra_criteria_section" => extra_criteria_section
+        }
+      )
+
+      messages = []
+      add_user_message(messages, eval_prompt)
+      add_assistant_message(messages, "```json")
+      eval_text = chat(
+        messages,
+        stop_sequences: ["```"],
+        temperature: 0.0
+      )
+
+      JSON.parse(eval_text)
+    end
+
+    def run_test_case(test_case, run_prompt_function, extra_criteria = nil)
+      output = run_prompt_function.call(test_case["prompt_inputs"])
+
+      model_grade = grade_output(test_case, output, extra_criteria)
+      model_score = model_grade["score"]
+      reasoning = model_grade["reasoning"]
+
+      {
+        "output" => output,
+        "test_case" => test_case,
+        "score" => model_score,
+        "reasoning" => reasoning
+      }
+    end
+
+    def run_evaluation(run_prompt_function, dataset_file, extra_criteria: nil, json_output_file: "output.json", html_output_file: "output.html")
+      dataset = JSON.parse(File.read(dataset_file))
+
+      results = []
+      completed = 0
+      total = dataset.length
+      last_reported_percentage = 0
+
+      threads = dataset.map do |test_case|
+        Thread.new do
+          run_test_case(test_case, run_prompt_function, extra_criteria)
+        end
+      end
+
+      threads.each do |thread|
+        result = thread.value
+        completed += 1
+        current_percentage = ((completed.to_f / total) * 100).to_i
+        milestone_percentage = (current_percentage / 20) * 20
+
+        if milestone_percentage > last_reported_percentage
+          puts "Graded #{completed}/#{total} test cases"
+          last_reported_percentage = milestone_percentage
+        end
+
+        results << result
+      end
+
+      average_score = results.sum { |result| result["score"] } / results.length.to_f
+      puts "Average score: #{average_score}"
+
+      File.write(json_output_file, JSON.pretty_generate(results))
+
+      html = generate_prompt_evaluation_report(results)
+      File.write(html_output_file, html)
+
+      results
+    end
+
+    private
+
+    def generate_prompt_evaluation_report(evaluation_results)
+      total_tests = evaluation_results.length
+      scores = evaluation_results.map { |result| result["score"] }
+      avg_score = scores.empty? ? 0 : scores.sum.to_f / scores.length
+      max_possible_score = 10
+      pass_rate = if total_tests > 0
+        100.0 * scores.count { |s| s >= 7 } / total_tests
+      else
+        0
+      end
+
+      html = <<~HTML
+        <!DOCTYPE html>
+        <html lang="en">
+        <head>
+          <meta charset="UTF-8">
+          <meta name="viewport" content="width=device-width, initial-scale=1.0">
+          <title>Prompt Evaluation Report</title>
+          <style>
+            body {
+              font-family: Arial, sans-serif;
+              line-height: 1.6;
+              margin: 0;
+              padding: 20px;
+              color: #333;
+            }
+            .header {
+              background-color: #f0f0f0;
+              padding: 20px;
+              border-radius: 5px;
+              margin-bottom: 20px;
+            }
+            .summary-stats {
+              display: flex;
+              justify-content: space-between;
+              flex-wrap: wrap;
+              gap: 10px;
+            }
+            .stat-box {
+              background-color: #fff;
+              border-radius: 5px;
+              padding: 15px;
+              box-shadow: 0 2px 5px rgba(0,0,0,0.1);
+              flex-basis: 30%;
+              min-width: 200px;
+            }
+            .stat-value {
+              font-size: 24px;
+              font-weight: bold;
+              margin-top: 5px;
+            }
+            table {
+              width: 100%;
+              border-collapse: collapse;
+              margin-top: 20px;
+            }
+            th {
+              background-color: #4a4a4a;
+              color: white;
+              text-align: left;
+              padding: 12px;
+            }
+            td {
+              padding: 10px;
+              border-bottom: 1px solid #ddd;
+              vertical-align: top;
+            }
+            tr:nth-child(even) {
+              background-color: #f9f9f9;
+            }
+            .output-cell {
+              white-space: pre-wrap;
+            }
+            .score {
+              font-weight: bold;
+              padding: 5px 10px;
+              border-radius: 3px;
+              display: inline-block;
+            }
+            .score-high {
+              background-color: #c8e6c9;
+              color: #2e7d32;
+            }
+            .score-medium {
+              background-color: #fff9c4;
+              color: #f57f17;
+            }
+            .score-low {
+              background-color: #ffcdd2;
+              color: #c62828;
+            }
+            .output {
+              overflow: auto;
+              white-space: pre-wrap;
+            }
+            .output pre {
+              background-color: #f5f5f5;
+              border: 1px solid #ddd;
+              border-radius: 4px;
+              padding: 10px;
+              margin: 0;
+              font-family: 'Consolas', 'Monaco', 'Courier New', monospace;
+              font-size: 14px;
+              line-height: 1.4;
+              color: #333;
+              box-shadow: inset 0 1px 3px rgba(0, 0, 0, 0.1);
+              overflow-x: auto;
+              white-space: pre-wrap;
+              word-wrap: break-word;
+            }
+            td {
+              width: 20%;
+            }
+            .score-col {
+              width: 80px;
+            }
+          </style>
+        </head>
+        <body>
+          <div class="header">
+            <h1>Prompt Evaluation Report</h1>
+            <div class="summary-stats">
+              <div class="stat-box">
+                <div>Total Test Cases</div>
+                <div class="stat-value">#{total_tests}</div>
+              </div>
+              <div class="stat-box">
+                <div>Average Score</div>
+                <div class="stat-value">#{sprintf("%.1f", avg_score)} / #{max_possible_score}</div>
+              </div>
+              <div class="stat-box">
+                <div>Pass Rate (≥7)</div>
+                <div class="stat-value">#{sprintf("%.1f", pass_rate)}%</div>
+              </div>
+            </div>
+          </div>
+
+          <table>
+            <thead>
+              <tr>
+                <th>Scenario</th>
+                <th>Prompt Inputs</th>
+                <th>Solution Criteria</th>
+                <th>Output</th>
+                <th>Score</th>
+                <th>Reasoning</th>
+              </tr>
+            </thead>
+            <tbody>
+      HTML
+
+      evaluation_results.each do |result|
+        prompt_inputs_html = result["test_case"]["prompt_inputs"].map do |key, value|
+          "<strong>#{key}:</strong> #{value}"
+        end.join("<br>")
+
+        criteria_string = result["test_case"]["solution_criteria"].join("<br>• ")
+
+        score = result["score"]
+        score_class = if score >= 8
+          "score-high"
+        elsif score <= 5
+          "score-low"
+        else
+          "score-medium"
+        end
+
+        html += <<~HTML
+          <tr>
+            <td>#{result["test_case"]["scenario"]}</td>
+            <td class="prompt-inputs">#{prompt_inputs_html}</td>
+            <td class="criteria">• #{criteria_string}</td>
+            <td class="output"><pre>#{result["output"]}</pre></td>
+            <td class="score-col"><span class="score #{score_class}">#{score}</span></td>
+            <td class="reasoning">#{result["reasoning"]}</td>
+          </tr>
+        HTML
+      end
+
+      html += <<~HTML
+            </tbody>
+          </table>
+        </body>
+        </html>
+      HTML
+
+      html
+    end
+  end
+end
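One pattern worth noting in the file above: every model call that expects JSON pre-fills the assistant turn with the opening of a fenced JSON block and passes the fence as a stop sequence, so the returned text can be handed straight to JSON.parse. A minimal standalone sketch of that technique (not part of the gem, using the same client calls that appear above):

```ruby
# Sketch only: isolates the prefill-and-stop pattern PromptEvaluator#chat relies on.
require "json"
require "anthropic"

client = Anthropic::Client.new(api_key: ENV["ANTHROPIC_API_KEY"])

messages = [
  {role: "user", content: "List two primary colors as a JSON array of strings."},
  {role: "assistant", content: "```json"} # prefill: the reply continues inside the fence
]

params = {
  model: "claude-3-5-haiku-latest",
  max_tokens: 200,
  messages: messages,
  stop_sequences: ["```"] # generation halts at the closing fence
}

response = client.messages.create(params)
colors = JSON.parse(response.content[0].text)
puts colors.inspect
```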
data/lib/evals/version.rb
CHANGED
data/lib/evals.rb
CHANGED
metadata
CHANGED
@@ -1,26 +1,43 @@
 --- !ruby/object:Gem::Specification
 name: evals
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - Andy Waite
 bindir: exe
 cert_chain: []
 date: 1980-01-02 00:00:00.000000000 Z
-dependencies:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: anthropic
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.0'
 email:
 - 13400+andyw8@users.noreply.github.com
 executables: []
 extensions: []
 extra_rdoc_files: []
 files:
+- ".env.example"
 - ".ruby-version"
 - ".standard.yml"
 - LICENSE.txt
 - README.md
 - Rakefile
+- examples/demo.rb
 - lib/evals.rb
+- lib/evals/prompt_evaluator.rb
 - lib/evals/version.rb
 - sig/evals.rbs
 homepage: https://github.com/andyw8/evals
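The dependency block above means evals 0.1.1 now declares anthropic (~> 1.0) as a runtime dependency, so a consumer only needs to list the gem itself. A hedged Gemfile sketch:

```ruby
# Gemfile sketch: anthropic (~> 1.0) is resolved transitively via evals 0.1.1.
source "https://rubygems.org"

gem "evals", "0.1.1"
gem "dotenv" # assumption: only needed if you follow the README's .env-based configuration
```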