qualspec 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.qualspec_cassettes/comparison_test.yml +439 -0
- data/.qualspec_cassettes/quick_test.yml +232 -0
- data/.rspec +3 -0
- data/.rubocop.yml +1 -0
- data/.rubocop_todo.yml +70 -0
- data/CHANGELOG.md +16 -0
- data/README.md +84 -0
- data/Rakefile +8 -0
- data/docs/configuration.md +132 -0
- data/docs/evaluation-suites.md +180 -0
- data/docs/getting-started.md +102 -0
- data/docs/recording.md +196 -0
- data/docs/rspec-integration.md +233 -0
- data/docs/rubrics.md +174 -0
- data/examples/cassettes/qualspec_rspec_integration_basic_evaluation_evaluates_responses_with_inline_criteria.yml +65 -0
- data/examples/cassettes/qualspec_rspec_integration_basic_evaluation_provides_detailed_feedback_on_failure.yml +64 -0
- data/examples/cassettes/qualspec_rspec_integration_comparative_evaluation_compares_multiple_responses.yml +74 -0
- data/examples/cassettes/qualspec_rspec_integration_score_matchers_supports_score_comparisons.yml +65 -0
- data/examples/cassettes/qualspec_rspec_integration_vcr_integration_records_and_plays_back_api_calls_automatically.yml +65 -0
- data/examples/cassettes/qualspec_rspec_integration_with_context_uses_context_in_evaluation.yml +67 -0
- data/examples/cassettes/qualspec_rspec_integration_with_rubrics_evaluates_using_builtin_rubrics.yml +67 -0
- data/examples/comparison.rb +22 -0
- data/examples/model_comparison.rb +38 -0
- data/examples/persona_test.rb +49 -0
- data/examples/quick_test.rb +28 -0
- data/examples/report.html +399 -0
- data/examples/rspec_example_spec.rb +153 -0
- data/exe/qualspec +142 -0
- data/lib/qualspec/builtin_rubrics.rb +83 -0
- data/lib/qualspec/client.rb +127 -0
- data/lib/qualspec/configuration.rb +32 -0
- data/lib/qualspec/evaluation.rb +52 -0
- data/lib/qualspec/judge.rb +217 -0
- data/lib/qualspec/recorder.rb +55 -0
- data/lib/qualspec/rspec/configuration.rb +49 -0
- data/lib/qualspec/rspec/evaluation_result.rb +142 -0
- data/lib/qualspec/rspec/helpers.rb +155 -0
- data/lib/qualspec/rspec/matchers.rb +163 -0
- data/lib/qualspec/rspec.rb +66 -0
- data/lib/qualspec/rubric.rb +43 -0
- data/lib/qualspec/suite/behavior.rb +43 -0
- data/lib/qualspec/suite/builtin_behaviors.rb +84 -0
- data/lib/qualspec/suite/candidate.rb +30 -0
- data/lib/qualspec/suite/dsl.rb +64 -0
- data/lib/qualspec/suite/html_reporter.rb +673 -0
- data/lib/qualspec/suite/reporter.rb +274 -0
- data/lib/qualspec/suite/runner.rb +261 -0
- data/lib/qualspec/suite/scenario.rb +57 -0
- data/lib/qualspec/version.rb +5 -0
- data/lib/qualspec.rb +103 -0
- data/sig/qualspec.rbs +4 -0
- metadata +142 -0
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
---
|
|
2
|
+
http_interactions:
|
|
3
|
+
- request:
|
|
4
|
+
method: post
|
|
5
|
+
uri: https://openrouter.ai/api/v1/chat/completions
|
|
6
|
+
body:
|
|
7
|
+
encoding: UTF-8
|
|
8
|
+
string: '{"model":"google/gemini-3-flash-preview","messages":[{"role":"user","content":"Hello!
|
|
9
|
+
How are you today?"}]}'
|
|
10
|
+
headers:
|
|
11
|
+
Content-Type:
|
|
12
|
+
- application/json
|
|
13
|
+
Authorization:
|
|
14
|
+
- Bearer <API_KEY>
|
|
15
|
+
User-Agent:
|
|
16
|
+
- Faraday v2.14.0
|
|
17
|
+
response:
|
|
18
|
+
status:
|
|
19
|
+
code: 200
|
|
20
|
+
message: OK
|
|
21
|
+
headers:
|
|
22
|
+
date:
|
|
23
|
+
- Thu, 25 Dec 2025 00:26:39 GMT
|
|
24
|
+
content-type:
|
|
25
|
+
- application/json
|
|
26
|
+
transfer-encoding:
|
|
27
|
+
- chunked
|
|
28
|
+
connection:
|
|
29
|
+
- keep-alive
|
|
30
|
+
content-encoding:
|
|
31
|
+
- gzip
|
|
32
|
+
access-control-allow-origin:
|
|
33
|
+
- "*"
|
|
34
|
+
vary:
|
|
35
|
+
- Accept-Encoding
|
|
36
|
+
permissions-policy:
|
|
37
|
+
- payment=(self "https://checkout.stripe.com" "https://connect-js.stripe.com"
|
|
38
|
+
"https://js.stripe.com" "https://*.js.stripe.com" "https://hooks.stripe.com")
|
|
39
|
+
referrer-policy:
|
|
40
|
+
- no-referrer, strict-origin-when-cross-origin
|
|
41
|
+
x-content-type-options:
|
|
42
|
+
- nosniff
|
|
43
|
+
server:
|
|
44
|
+
- cloudflare
|
|
45
|
+
cf-ray:
|
|
46
|
+
- 9b343544ce98a201-MSP
|
|
47
|
+
body:
|
|
48
|
+
encoding: ASCII-8BIT
|
|
49
|
+
string: "\n \n\n \n{\"id\":\"gen-1766622398-RkCshdTuH7jzWuOcPVbF\",\"provider\":\"Google\",\"model\":\"google/gemini-3-flash-preview\",\"object\":\"chat.completion\",\"created\":1766622398,\"choices\":[{\"logprobs\":null,\"finish_reason\":\"stop\",\"native_finish_reason\":\"STOP\",\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"I'm
|
|
50
|
+
doing great, thank you for asking! I'm ready to help you with whatever you
|
|
51
|
+
need. \\n\\nHow are things going with you? Is there anything specific I can
|
|
52
|
+
help you with today?\",\"refusal\":null,\"reasoning\":null}}],\"usage\":{\"prompt_tokens\":7,\"completion_tokens\":43,\"total_tokens\":50,\"cost\":0.0001325,\"is_byok\":false,\"prompt_tokens_details\":{\"cached_tokens\":0,\"audio_tokens\":0,\"video_tokens\":0},\"cost_details\":{\"upstream_inference_cost\":null,\"upstream_inference_prompt_cost\":0.0000035,\"upstream_inference_completions_cost\":0.000129},\"completion_tokens_details\":{\"reasoning_tokens\":0,\"image_tokens\":0}}}"
|
|
53
|
+
recorded_at: Thu, 25 Dec 2025 00:26:39 GMT
|
|
54
|
+
- request:
|
|
55
|
+
method: post
|
|
56
|
+
uri: https://openrouter.ai/api/v1/chat/completions
|
|
57
|
+
body:
|
|
58
|
+
encoding: UTF-8
|
|
59
|
+
string: '{"model":"google/gemini-3-flash-preview","messages":[{"role":"system","content":"You
|
|
60
|
+
are an evaluation judge. You will be given a response and one or more evaluation
|
|
61
|
+
criteria.\nYour job is to score how well the response meets the criteria.\n\nScoring:\n-
|
|
62
|
+
0: Completely fails to meet the criteria\n- 1-3: Mostly fails, with minor
|
|
63
|
+
positive elements\n- 4-6: Partially meets criteria, significant room for improvement\n-
|
|
64
|
+
7-8: Mostly meets criteria with minor issues\n- 9: Meets criteria well\n-
|
|
65
|
+
10: Perfectly meets all criteria\n\nBe strict but fair. Consider each criterion
|
|
66
|
+
carefully.\n\nYou MUST respond with valid JSON in this exact format:\n{\"score\":
|
|
67
|
+
<0-10>, \"reasoning\": \"Brief explanation of the score\"}\n\nYour reasoning
|
|
68
|
+
should be concise (1-2 sentences max).\n"},{"role":"user","content":"## Response
|
|
69
|
+
to evaluate:\nI''m doing great, thank you for asking! I''m ready to help you
|
|
70
|
+
with whatever you need. \n\nHow are things going with you? Is there anything
|
|
71
|
+
specific I can help you with today?\n\n## Additional context:\nUser prompt:
|
|
72
|
+
Hello! How are you today?\n\n## Evaluation criteria:\nresponds in a friendly
|
|
73
|
+
manner\nis appropriate in length\n\nScore this response from 0-10. Respond
|
|
74
|
+
with JSON only."}],"response_format":{"type":"json_object"}}'
|
|
75
|
+
headers:
|
|
76
|
+
Content-Type:
|
|
77
|
+
- application/json
|
|
78
|
+
Authorization:
|
|
79
|
+
- Bearer <API_KEY>
|
|
80
|
+
User-Agent:
|
|
81
|
+
- Faraday v2.14.0
|
|
82
|
+
response:
|
|
83
|
+
status:
|
|
84
|
+
code: 200
|
|
85
|
+
message: OK
|
|
86
|
+
headers:
|
|
87
|
+
date:
|
|
88
|
+
- Thu, 25 Dec 2025 00:26:40 GMT
|
|
89
|
+
content-type:
|
|
90
|
+
- application/json
|
|
91
|
+
transfer-encoding:
|
|
92
|
+
- chunked
|
|
93
|
+
connection:
|
|
94
|
+
- keep-alive
|
|
95
|
+
content-encoding:
|
|
96
|
+
- gzip
|
|
97
|
+
access-control-allow-origin:
|
|
98
|
+
- "*"
|
|
99
|
+
vary:
|
|
100
|
+
- Accept-Encoding
|
|
101
|
+
permissions-policy:
|
|
102
|
+
- payment=(self "https://checkout.stripe.com" "https://connect-js.stripe.com"
|
|
103
|
+
"https://js.stripe.com" "https://*.js.stripe.com" "https://hooks.stripe.com")
|
|
104
|
+
referrer-policy:
|
|
105
|
+
- no-referrer, strict-origin-when-cross-origin
|
|
106
|
+
x-content-type-options:
|
|
107
|
+
- nosniff
|
|
108
|
+
server:
|
|
109
|
+
- cloudflare
|
|
110
|
+
cf-ray:
|
|
111
|
+
- 9b34354c5bada1d1-MSP
|
|
112
|
+
body:
|
|
113
|
+
encoding: ASCII-8BIT
|
|
114
|
+
string: "\n \n{\"id\":\"gen-1766622399-NovBTMk4uVXaHxIReaM3\",\"provider\":\"Google
|
|
115
|
+
AI Studio\",\"model\":\"google/gemini-3-flash-preview\",\"object\":\"chat.completion\",\"created\":1766622399,\"choices\":[{\"logprobs\":null,\"finish_reason\":\"stop\",\"native_finish_reason\":\"STOP\",\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"{\\\"score\\\":
|
|
116
|
+
10, \\\"reasoning\\\": \\\"The response is warm and friendly while maintaining
|
|
117
|
+
a perfectly concise and appropriate length for the greeting.\\\"}\",\"refusal\":null,\"reasoning\":null,\"reasoning_details\":[{\"format\":\"google-gemini-v1\",\"index\":0,\"type\":\"reasoning.encrypted\",\"data\":\"EjQKMgFyyNp8oYVEKG82B2pc5SoTKa1N7Hmj3FgVav3EngLEb3J24El8ffeyEsycGIylLBcv\"}]}}],\"usage\":{\"prompt_tokens\":265,\"completion_tokens\":31,\"total_tokens\":296,\"cost\":0.0002255,\"is_byok\":false,\"prompt_tokens_details\":{\"cached_tokens\":0,\"audio_tokens\":0,\"video_tokens\":0},\"cost_details\":{\"upstream_inference_cost\":null,\"upstream_inference_prompt_cost\":0.0001325,\"upstream_inference_completions_cost\":0.000093},\"completion_tokens_details\":{\"reasoning_tokens\":0,\"image_tokens\":0}}}"
|
|
118
|
+
recorded_at: Thu, 25 Dec 2025 00:26:40 GMT
|
|
119
|
+
- request:
|
|
120
|
+
method: post
|
|
121
|
+
uri: https://openrouter.ai/api/v1/chat/completions
|
|
122
|
+
body:
|
|
123
|
+
encoding: UTF-8
|
|
124
|
+
string: '{"model":"google/gemini-3-flash-preview","messages":[{"role":"user","content":"What
|
|
125
|
+
is 7 times 8?"}]}'
|
|
126
|
+
headers:
|
|
127
|
+
Content-Type:
|
|
128
|
+
- application/json
|
|
129
|
+
Authorization:
|
|
130
|
+
- Bearer <API_KEY>
|
|
131
|
+
User-Agent:
|
|
132
|
+
- Faraday v2.14.0
|
|
133
|
+
response:
|
|
134
|
+
status:
|
|
135
|
+
code: 200
|
|
136
|
+
message: OK
|
|
137
|
+
headers:
|
|
138
|
+
date:
|
|
139
|
+
- Thu, 25 Dec 2025 00:26:40 GMT
|
|
140
|
+
content-type:
|
|
141
|
+
- application/json
|
|
142
|
+
transfer-encoding:
|
|
143
|
+
- chunked
|
|
144
|
+
connection:
|
|
145
|
+
- keep-alive
|
|
146
|
+
content-encoding:
|
|
147
|
+
- gzip
|
|
148
|
+
access-control-allow-origin:
|
|
149
|
+
- "*"
|
|
150
|
+
vary:
|
|
151
|
+
- Accept-Encoding
|
|
152
|
+
permissions-policy:
|
|
153
|
+
- payment=(self "https://checkout.stripe.com" "https://connect-js.stripe.com"
|
|
154
|
+
"https://js.stripe.com" "https://*.js.stripe.com" "https://hooks.stripe.com")
|
|
155
|
+
referrer-policy:
|
|
156
|
+
- no-referrer, strict-origin-when-cross-origin
|
|
157
|
+
x-content-type-options:
|
|
158
|
+
- nosniff
|
|
159
|
+
server:
|
|
160
|
+
- cloudflare
|
|
161
|
+
cf-ray:
|
|
162
|
+
- 9b343551efdee02b-MSP
|
|
163
|
+
body:
|
|
164
|
+
encoding: ASCII-8BIT
|
|
165
|
+
string: "\n \n{\"id\":\"gen-1766622400-FZhCKv98cSCsFMYiK3QW\",\"provider\":\"Google
|
|
166
|
+
AI Studio\",\"model\":\"google/gemini-3-flash-preview\",\"object\":\"chat.completion\",\"created\":1766622400,\"choices\":[{\"logprobs\":null,\"finish_reason\":\"stop\",\"native_finish_reason\":\"STOP\",\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"7
|
|
167
|
+
times 8 is **56**.\",\"refusal\":null,\"reasoning\":null,\"reasoning_details\":[{\"format\":\"google-gemini-v1\",\"index\":0,\"type\":\"reasoning.encrypted\",\"data\":\"EjQKMgFyyNp88llrW7rph3pu6U7rTIoXdbn+DMh3mGII1VQPcclBfCOU7g7KA42LUdRSJuDx\"}]}}],\"usage\":{\"prompt_tokens\":9,\"completion_tokens\":9,\"total_tokens\":18,\"cost\":0.0000315,\"is_byok\":false,\"prompt_tokens_details\":{\"cached_tokens\":0,\"audio_tokens\":0,\"video_tokens\":0},\"cost_details\":{\"upstream_inference_cost\":null,\"upstream_inference_prompt_cost\":0.0000045,\"upstream_inference_completions_cost\":0.000027},\"completion_tokens_details\":{\"reasoning_tokens\":0,\"image_tokens\":0}}}"
|
|
168
|
+
recorded_at: Thu, 25 Dec 2025 00:26:40 GMT
|
|
169
|
+
- request:
|
|
170
|
+
method: post
|
|
171
|
+
uri: https://openrouter.ai/api/v1/chat/completions
|
|
172
|
+
body:
|
|
173
|
+
encoding: UTF-8
|
|
174
|
+
string: '{"model":"google/gemini-3-flash-preview","messages":[{"role":"system","content":"You
|
|
175
|
+
are an evaluation judge. You will be given a response and one or more evaluation
|
|
176
|
+
criteria.\nYour job is to score how well the response meets the criteria.\n\nScoring:\n-
|
|
177
|
+
0: Completely fails to meet the criteria\n- 1-3: Mostly fails, with minor
|
|
178
|
+
positive elements\n- 4-6: Partially meets criteria, significant room for improvement\n-
|
|
179
|
+
7-8: Mostly meets criteria with minor issues\n- 9: Meets criteria well\n-
|
|
180
|
+
10: Perfectly meets all criteria\n\nBe strict but fair. Consider each criterion
|
|
181
|
+
carefully.\n\nYou MUST respond with valid JSON in this exact format:\n{\"score\":
|
|
182
|
+
<0-10>, \"reasoning\": \"Brief explanation of the score\"}\n\nYour reasoning
|
|
183
|
+
should be concise (1-2 sentences max).\n"},{"role":"user","content":"## Response
|
|
184
|
+
to evaluate:\n7 times 8 is **56**.\n\n## Additional context:\nUser prompt:
|
|
185
|
+
What is 7 times 8?\n\n## Evaluation criteria:\nprovides the correct answer
|
|
186
|
+
(56)\ndoesn''t over-explain\n\nScore this response from 0-10. Respond with
|
|
187
|
+
JSON only."}],"response_format":{"type":"json_object"}}'
|
|
188
|
+
headers:
|
|
189
|
+
Content-Type:
|
|
190
|
+
- application/json
|
|
191
|
+
Authorization:
|
|
192
|
+
- Bearer <API_KEY>
|
|
193
|
+
User-Agent:
|
|
194
|
+
- Faraday v2.14.0
|
|
195
|
+
response:
|
|
196
|
+
status:
|
|
197
|
+
code: 200
|
|
198
|
+
message: OK
|
|
199
|
+
headers:
|
|
200
|
+
date:
|
|
201
|
+
- Thu, 25 Dec 2025 00:26:41 GMT
|
|
202
|
+
content-type:
|
|
203
|
+
- application/json
|
|
204
|
+
transfer-encoding:
|
|
205
|
+
- chunked
|
|
206
|
+
connection:
|
|
207
|
+
- keep-alive
|
|
208
|
+
content-encoding:
|
|
209
|
+
- gzip
|
|
210
|
+
access-control-allow-origin:
|
|
211
|
+
- "*"
|
|
212
|
+
vary:
|
|
213
|
+
- Accept-Encoding
|
|
214
|
+
permissions-policy:
|
|
215
|
+
- payment=(self "https://checkout.stripe.com" "https://connect-js.stripe.com"
|
|
216
|
+
"https://js.stripe.com" "https://*.js.stripe.com" "https://hooks.stripe.com")
|
|
217
|
+
referrer-policy:
|
|
218
|
+
- no-referrer, strict-origin-when-cross-origin
|
|
219
|
+
x-content-type-options:
|
|
220
|
+
- nosniff
|
|
221
|
+
server:
|
|
222
|
+
- cloudflare
|
|
223
|
+
cf-ray:
|
|
224
|
+
- 9b34355638a3aced-MSP
|
|
225
|
+
body:
|
|
226
|
+
encoding: ASCII-8BIT
|
|
227
|
+
string: "\n \n\n \n{\"id\":\"gen-1766622401-oGF5F5TOjGtBql8KEu7Y\",\"provider\":\"Google
|
|
228
|
+
AI Studio\",\"model\":\"google/gemini-3-flash-preview\",\"object\":\"chat.completion\",\"created\":1766622401,\"choices\":[{\"logprobs\":null,\"finish_reason\":\"stop\",\"native_finish_reason\":\"STOP\",\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"{\\\"score\\\":
|
|
229
|
+
10, \\\"reasoning\\\": \\\"The response provides the correct mathematical
|
|
230
|
+
answer and is perfectly concise without any unnecessary explanation.\\\"}\",\"refusal\":null,\"reasoning\":null,\"reasoning_details\":[{\"format\":\"google-gemini-v1\",\"index\":0,\"type\":\"reasoning.encrypted\",\"data\":\"EjQKMgFyyNp8+lZt6C5XsGlmh2G0c/l+peG53Biq0ZHTLBRF89P+OMYjEIbfqTnuKPtgolfx\"}]}}],\"usage\":{\"prompt_tokens\":237,\"completion_tokens\":29,\"total_tokens\":266,\"cost\":0.0002055,\"is_byok\":false,\"prompt_tokens_details\":{\"cached_tokens\":0,\"audio_tokens\":0,\"video_tokens\":0},\"cost_details\":{\"upstream_inference_cost\":null,\"upstream_inference_prompt_cost\":0.0001185,\"upstream_inference_completions_cost\":0.000087},\"completion_tokens_details\":{\"reasoning_tokens\":0,\"image_tokens\":0}}}"
|
|
231
|
+
recorded_at: Thu, 25 Dec 2025 00:26:41 GMT
|
|
232
|
+
recorded_with: VCR 6.4.0
|
data/.rspec
ADDED
data/.rubocop.yml
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
inherit_from: .rubocop_todo.yml
|
data/.rubocop_todo.yml
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# This configuration was generated by
|
|
2
|
+
# `rubocop --auto-gen-config`
|
|
3
|
+
# on 2025-12-25 02:22:03 UTC using RuboCop version 1.82.1.
|
|
4
|
+
# The point is for the user to remove these configuration records
|
|
5
|
+
# one by one as the offenses are removed from the code base.
|
|
6
|
+
# Note that changes in the inspected code, or installation of new
|
|
7
|
+
# versions of RuboCop, may require this file to be generated again.
|
|
8
|
+
|
|
9
|
+
# Offense count: 5
|
|
10
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
11
|
+
Lint/RedundantCopDisableDirective:
|
|
12
|
+
Exclude:
|
|
13
|
+
- 'lib/qualspec/rubric.rb'
|
|
14
|
+
- 'lib/qualspec/suite/behavior.rb'
|
|
15
|
+
- 'lib/qualspec/suite/dsl.rb'
|
|
16
|
+
- 'lib/qualspec/suite/scenario.rb'
|
|
17
|
+
|
|
18
|
+
# Offense count: 17
|
|
19
|
+
# Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes.
|
|
20
|
+
Metrics/AbcSize:
|
|
21
|
+
Max: 74
|
|
22
|
+
|
|
23
|
+
# Offense count: 6
|
|
24
|
+
# Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
|
|
25
|
+
# AllowedMethods: refine
|
|
26
|
+
Metrics/BlockLength:
|
|
27
|
+
Max: 76
|
|
28
|
+
|
|
29
|
+
# Offense count: 4
|
|
30
|
+
# Configuration parameters: CountComments, CountAsOne.
|
|
31
|
+
Metrics/ClassLength:
|
|
32
|
+
Max: 552
|
|
33
|
+
|
|
34
|
+
# Offense count: 9
|
|
35
|
+
# Configuration parameters: AllowedMethods, AllowedPatterns.
|
|
36
|
+
Metrics/CyclomaticComplexity:
|
|
37
|
+
Max: 16
|
|
38
|
+
|
|
39
|
+
# Offense count: 30
|
|
40
|
+
# Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
|
|
41
|
+
Metrics/MethodLength:
|
|
42
|
+
Max: 69
|
|
43
|
+
|
|
44
|
+
# Offense count: 2
|
|
45
|
+
# Configuration parameters: CountKeywordArgs, MaxOptionalParameters.
|
|
46
|
+
Metrics/ParameterLists:
|
|
47
|
+
Max: 9
|
|
48
|
+
|
|
49
|
+
# Offense count: 6
|
|
50
|
+
# Configuration parameters: AllowedMethods, AllowedPatterns.
|
|
51
|
+
Metrics/PerceivedComplexity:
|
|
52
|
+
Max: 16
|
|
53
|
+
|
|
54
|
+
# Offense count: 21
|
|
55
|
+
# Configuration parameters: AllowedConstants.
|
|
56
|
+
Style/Documentation:
|
|
57
|
+
Enabled: false
|
|
58
|
+
|
|
59
|
+
# Offense count: 3
|
|
60
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
61
|
+
Style/EvalWithLocation:
|
|
62
|
+
Exclude:
|
|
63
|
+
- 'lib/qualspec/suite/builtin_behaviors.rb'
|
|
64
|
+
|
|
65
|
+
# Offense count: 1
|
|
66
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
67
|
+
# Configuration parameters: AllowHeredoc, AllowURI, AllowQualifiedName, URISchemes, AllowRBSInlineAnnotation, AllowCopDirectives, AllowedPatterns, SplitStrings.
|
|
68
|
+
# URISchemes: http, https
|
|
69
|
+
Layout/LineLength:
|
|
70
|
+
Max: 187
|
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## [0.0.1] - 2025-12-25
|
|
4
|
+
|
|
5
|
+
Initial release.
|
|
6
|
+
|
|
7
|
+
### Features
|
|
8
|
+
|
|
9
|
+
- DSL for defining evaluation suites to compare LLM responses
|
|
10
|
+
- RSpec integration with custom matchers (`be_passing`, `have_score_above`, etc.)
|
|
11
|
+
- LLM-as-judge scoring system (0-10 scale)
|
|
12
|
+
- Built-in rubrics: `tool_calling`, `helpful`, `concise`, `in_character`, `safety`, and more
|
|
13
|
+
- Shared behaviors for reusable test scenarios
|
|
14
|
+
- VCR-based recording/playback for deterministic tests
|
|
15
|
+
- HTML and JSON report generation
|
|
16
|
+
- OpenRouter API support with configurable models
|
data/README.md
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# Qualspec
|
|
2
|
+
|
|
3
|
+
LLM-judged qualitative testing for Ruby. Evaluate AI agents, compare models, and test subjective qualities that traditional assertions can't capture.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```ruby
|
|
8
|
+
gem "qualspec"
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Configuration
|
|
12
|
+
|
|
13
|
+
Set your API key (required):
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
export QUALSPEC_API_KEY=your_openrouter_key
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
### Environment Variables
|
|
20
|
+
|
|
21
|
+
| Variable | Description | Default |
|
|
22
|
+
|----------|-------------|---------|
|
|
23
|
+
| `QUALSPEC_API_KEY` | API key (required) | - |
|
|
24
|
+
| `QUALSPEC_API_URL` | API endpoint | `https://openrouter.ai/api/v1` |
|
|
25
|
+
| `QUALSPEC_MODEL` | Default model for candidates | `google/gemini-3-flash-preview` |
|
|
26
|
+
| `QUALSPEC_JUDGE_MODEL` | Model used as judge | Same as `QUALSPEC_MODEL` |
|
|
27
|
+
|
|
28
|
+
## Quick Start
|
|
29
|
+
|
|
30
|
+
### Compare Models (CLI)
|
|
31
|
+
|
|
32
|
+
```ruby
|
|
33
|
+
# eval/comparison.rb
|
|
34
|
+
Qualspec.evaluation "Model Comparison" do
|
|
35
|
+
candidates do
|
|
36
|
+
candidate "gpt4", model: "openai/gpt-4"
|
|
37
|
+
candidate "claude", model: "anthropic/claude-3-sonnet"
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
scenario "helpfulness" do
|
|
41
|
+
prompt "How do I center a div in CSS?"
|
|
42
|
+
eval "provides a working solution"
|
|
43
|
+
eval "explains the approach"
|
|
44
|
+
end
|
|
45
|
+
end
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
# Run comparison
|
|
50
|
+
qualspec eval/comparison.rb
|
|
51
|
+
|
|
52
|
+
# Generate HTML report
|
|
53
|
+
qualspec --html report.html eval/comparison.rb
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### Test Your Agent (RSpec)
|
|
57
|
+
|
|
58
|
+
```ruby
|
|
59
|
+
require "qualspec/rspec"
|
|
60
|
+
|
|
61
|
+
RSpec.describe MyAgent do
|
|
62
|
+
include Qualspec::RSpec::Helpers
|
|
63
|
+
|
|
64
|
+
it "responds helpfully" do
|
|
65
|
+
response = MyAgent.call("Hello")
|
|
66
|
+
|
|
67
|
+
result = qualspec_evaluate(response, "responds in a friendly manner")
|
|
68
|
+
expect(result).to be_passing
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Documentation
|
|
74
|
+
|
|
75
|
+
- [Getting Started](docs/getting-started.md)
|
|
76
|
+
- [Evaluation Suites](docs/evaluation-suites.md) - CLI for model comparison
|
|
77
|
+
- [RSpec Integration](docs/rspec-integration.md) - Testing your agents
|
|
78
|
+
- [Rubrics](docs/rubrics.md) - Builtin and custom evaluation criteria
|
|
79
|
+
- [Configuration](docs/configuration.md) - All options
|
|
80
|
+
- [Recording](docs/recording.md) - VCR integration
|
|
81
|
+
|
|
82
|
+
## License
|
|
83
|
+
|
|
84
|
+
MIT
|
data/Rakefile
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
# Configuration
|
|
2
|
+
|
|
3
|
+
## Environment Variables
|
|
4
|
+
|
|
5
|
+
| Variable | Description | Default |
|
|
6
|
+
|----------|-------------|---------|
|
|
7
|
+
| `QUALSPEC_API_KEY` | API key (required) | - |
|
|
8
|
+
| `QUALSPEC_API_URL` | API endpoint | `https://openrouter.ai/api/v1` |
|
|
9
|
+
| `QUALSPEC_MODEL` | Default model for candidates | `google/gemini-3-flash-preview` |
|
|
10
|
+
| `QUALSPEC_JUDGE_MODEL` | Model for judging | Same as `QUALSPEC_MODEL` |
|
|
11
|
+
| `QUALSPEC_SSL_VERIFY` | SSL verification (disable with `false`) | `true` |
|
|
12
|
+
|
|
13
|
+
### Required Setup
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
export QUALSPEC_API_KEY=your_openrouter_api_key
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
### Using Different Providers
|
|
20
|
+
|
|
21
|
+
**OpenRouter (default):**
|
|
22
|
+
```bash
|
|
23
|
+
export QUALSPEC_API_KEY=sk-or-...
|
|
24
|
+
# API URL defaults to https://openrouter.ai/api/v1
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
**OpenAI:**
|
|
28
|
+
```bash
|
|
29
|
+
export QUALSPEC_API_KEY=sk-...
|
|
30
|
+
export QUALSPEC_API_URL=https://api.openai.com/v1
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
**Ollama (local):**
|
|
34
|
+
```bash
|
|
35
|
+
export QUALSPEC_API_URL=http://localhost:11434/v1
|
|
36
|
+
# No API key needed for local Ollama
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Programmatic Configuration
|
|
40
|
+
|
|
41
|
+
```ruby
|
|
42
|
+
Qualspec.configure do |config|
|
|
43
|
+
# API settings
|
|
44
|
+
config.api_url = "https://openrouter.ai/api/v1"
|
|
45
|
+
config.api_key = ENV["MY_API_KEY"]
|
|
46
|
+
|
|
47
|
+
# Models
|
|
48
|
+
config.default_model = "google/gemini-2.5-flash-preview"
|
|
49
|
+
config.judge_model = "openai/gpt-4"
|
|
50
|
+
|
|
51
|
+
# Timeouts
|
|
52
|
+
config.request_timeout = 120 # seconds
|
|
53
|
+
|
|
54
|
+
# Custom judge prompt (optional)
|
|
55
|
+
config.judge_system_prompt = <<~PROMPT
|
|
56
|
+
You are a strict evaluator...
|
|
57
|
+
PROMPT
|
|
58
|
+
end
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## RSpec Configuration
|
|
62
|
+
|
|
63
|
+
Additional configuration for RSpec integration:
|
|
64
|
+
|
|
65
|
+
```ruby
|
|
66
|
+
Qualspec::RSpec.configure do |config|
|
|
67
|
+
# Default pass threshold (0-10)
|
|
68
|
+
config.default_threshold = 7
|
|
69
|
+
|
|
70
|
+
# VCR cassette directory
|
|
71
|
+
config.vcr_cassette_dir = "spec/cassettes/qualspec"
|
|
72
|
+
|
|
73
|
+
# VCR recording mode (:new_episodes, :none, :all, :once)
|
|
74
|
+
config.record_mode = :new_episodes
|
|
75
|
+
|
|
76
|
+
# Auto-load builtin rubrics
|
|
77
|
+
config.load_builtins = true
|
|
78
|
+
end
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## CLI Options
|
|
82
|
+
|
|
83
|
+
Override configuration via command line:
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
# Override judge model
|
|
87
|
+
qualspec -m openai/gpt-4 eval/suite.rb
|
|
88
|
+
|
|
89
|
+
# Override API URL
|
|
90
|
+
qualspec -u https://api.openai.com/v1 eval/suite.rb
|
|
91
|
+
|
|
92
|
+
# Override API key
|
|
93
|
+
qualspec -k sk-xxx eval/suite.rb
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Configuration Precedence
|
|
97
|
+
|
|
98
|
+
1. CLI options (highest)
|
|
99
|
+
2. Programmatic configuration
|
|
100
|
+
3. Environment variables
|
|
101
|
+
4. Defaults (lowest)
|
|
102
|
+
|
|
103
|
+
## Supported Providers
|
|
104
|
+
|
|
105
|
+
Any OpenAI-compatible API works:
|
|
106
|
+
|
|
107
|
+
| Provider | API URL |
|
|
108
|
+
|----------|---------|
|
|
109
|
+
| OpenRouter | `https://openrouter.ai/api/v1` |
|
|
110
|
+
| OpenAI | `https://api.openai.com/v1` |
|
|
111
|
+
| Ollama | `http://localhost:11434/v1` |
|
|
112
|
+
| Azure OpenAI | `https://YOUR.openai.azure.com/openai/deployments/YOUR_DEPLOYMENT` |
|
|
113
|
+
| Together AI | `https://api.together.xyz/v1` |
|
|
114
|
+
| Groq | `https://api.groq.com/openai/v1` |
|
|
115
|
+
|
|
116
|
+
## SSL Configuration
|
|
117
|
+
|
|
118
|
+
SSL verification is enabled by default. Disable it if you encounter certificate issues:
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
export QUALSPEC_SSL_VERIFY=false
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## Accessing Configuration
|
|
125
|
+
|
|
126
|
+
```ruby
|
|
127
|
+
config = Qualspec.configuration
|
|
128
|
+
|
|
129
|
+
config.api_url # Current API URL
|
|
130
|
+
config.judge_model # Current judge model
|
|
131
|
+
config.api_headers # Headers sent with requests
|
|
132
|
+
```
|