qualspec 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. checksums.yaml +7 -0
  2. data/.qualspec_cassettes/comparison_test.yml +439 -0
  3. data/.qualspec_cassettes/quick_test.yml +232 -0
  4. data/.rspec +3 -0
  5. data/.rubocop.yml +1 -0
  6. data/.rubocop_todo.yml +70 -0
  7. data/CHANGELOG.md +16 -0
  8. data/README.md +84 -0
  9. data/Rakefile +8 -0
  10. data/docs/configuration.md +132 -0
  11. data/docs/evaluation-suites.md +180 -0
  12. data/docs/getting-started.md +102 -0
  13. data/docs/recording.md +196 -0
  14. data/docs/rspec-integration.md +233 -0
  15. data/docs/rubrics.md +174 -0
  16. data/examples/cassettes/qualspec_rspec_integration_basic_evaluation_evaluates_responses_with_inline_criteria.yml +65 -0
  17. data/examples/cassettes/qualspec_rspec_integration_basic_evaluation_provides_detailed_feedback_on_failure.yml +64 -0
  18. data/examples/cassettes/qualspec_rspec_integration_comparative_evaluation_compares_multiple_responses.yml +74 -0
  19. data/examples/cassettes/qualspec_rspec_integration_score_matchers_supports_score_comparisons.yml +65 -0
  20. data/examples/cassettes/qualspec_rspec_integration_vcr_integration_records_and_plays_back_api_calls_automatically.yml +65 -0
  21. data/examples/cassettes/qualspec_rspec_integration_with_context_uses_context_in_evaluation.yml +67 -0
  22. data/examples/cassettes/qualspec_rspec_integration_with_rubrics_evaluates_using_builtin_rubrics.yml +67 -0
  23. data/examples/comparison.rb +22 -0
  24. data/examples/model_comparison.rb +38 -0
  25. data/examples/persona_test.rb +49 -0
  26. data/examples/quick_test.rb +28 -0
  27. data/examples/report.html +399 -0
  28. data/examples/rspec_example_spec.rb +153 -0
  29. data/exe/qualspec +142 -0
  30. data/lib/qualspec/builtin_rubrics.rb +83 -0
  31. data/lib/qualspec/client.rb +127 -0
  32. data/lib/qualspec/configuration.rb +32 -0
  33. data/lib/qualspec/evaluation.rb +52 -0
  34. data/lib/qualspec/judge.rb +217 -0
  35. data/lib/qualspec/recorder.rb +55 -0
  36. data/lib/qualspec/rspec/configuration.rb +49 -0
  37. data/lib/qualspec/rspec/evaluation_result.rb +142 -0
  38. data/lib/qualspec/rspec/helpers.rb +155 -0
  39. data/lib/qualspec/rspec/matchers.rb +163 -0
  40. data/lib/qualspec/rspec.rb +66 -0
  41. data/lib/qualspec/rubric.rb +43 -0
  42. data/lib/qualspec/suite/behavior.rb +43 -0
  43. data/lib/qualspec/suite/builtin_behaviors.rb +84 -0
  44. data/lib/qualspec/suite/candidate.rb +30 -0
  45. data/lib/qualspec/suite/dsl.rb +64 -0
  46. data/lib/qualspec/suite/html_reporter.rb +673 -0
  47. data/lib/qualspec/suite/reporter.rb +274 -0
  48. data/lib/qualspec/suite/runner.rb +261 -0
  49. data/lib/qualspec/suite/scenario.rb +57 -0
  50. data/lib/qualspec/version.rb +5 -0
  51. data/lib/qualspec.rb +103 -0
  52. data/sig/qualspec.rbs +4 -0
  53. metadata +142 -0
data/.qualspec_cassettes/quick_test.yml ADDED
@@ -0,0 +1,232 @@
1
+ ---
2
+ http_interactions:
3
+ - request:
4
+ method: post
5
+ uri: https://openrouter.ai/api/v1/chat/completions
6
+ body:
7
+ encoding: UTF-8
8
+ string: '{"model":"google/gemini-3-flash-preview","messages":[{"role":"user","content":"Hello!
9
+ How are you today?"}]}'
10
+ headers:
11
+ Content-Type:
12
+ - application/json
13
+ Authorization:
14
+ - Bearer <API_KEY>
15
+ User-Agent:
16
+ - Faraday v2.14.0
17
+ response:
18
+ status:
19
+ code: 200
20
+ message: OK
21
+ headers:
22
+ date:
23
+ - Thu, 25 Dec 2025 00:26:39 GMT
24
+ content-type:
25
+ - application/json
26
+ transfer-encoding:
27
+ - chunked
28
+ connection:
29
+ - keep-alive
30
+ content-encoding:
31
+ - gzip
32
+ access-control-allow-origin:
33
+ - "*"
34
+ vary:
35
+ - Accept-Encoding
36
+ permissions-policy:
37
+ - payment=(self "https://checkout.stripe.com" "https://connect-js.stripe.com"
38
+ "https://js.stripe.com" "https://*.js.stripe.com" "https://hooks.stripe.com")
39
+ referrer-policy:
40
+ - no-referrer, strict-origin-when-cross-origin
41
+ x-content-type-options:
42
+ - nosniff
43
+ server:
44
+ - cloudflare
45
+ cf-ray:
46
+ - 9b343544ce98a201-MSP
47
+ body:
48
+ encoding: ASCII-8BIT
49
+ string: "\n \n\n \n{\"id\":\"gen-1766622398-RkCshdTuH7jzWuOcPVbF\",\"provider\":\"Google\",\"model\":\"google/gemini-3-flash-preview\",\"object\":\"chat.completion\",\"created\":1766622398,\"choices\":[{\"logprobs\":null,\"finish_reason\":\"stop\",\"native_finish_reason\":\"STOP\",\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"I'm
50
+ doing great, thank you for asking! I'm ready to help you with whatever you
51
+ need. \\n\\nHow are things going with you? Is there anything specific I can
52
+ help you with today?\",\"refusal\":null,\"reasoning\":null}}],\"usage\":{\"prompt_tokens\":7,\"completion_tokens\":43,\"total_tokens\":50,\"cost\":0.0001325,\"is_byok\":false,\"prompt_tokens_details\":{\"cached_tokens\":0,\"audio_tokens\":0,\"video_tokens\":0},\"cost_details\":{\"upstream_inference_cost\":null,\"upstream_inference_prompt_cost\":0.0000035,\"upstream_inference_completions_cost\":0.000129},\"completion_tokens_details\":{\"reasoning_tokens\":0,\"image_tokens\":0}}}"
53
+ recorded_at: Thu, 25 Dec 2025 00:26:39 GMT
54
+ - request:
55
+ method: post
56
+ uri: https://openrouter.ai/api/v1/chat/completions
57
+ body:
58
+ encoding: UTF-8
59
+ string: '{"model":"google/gemini-3-flash-preview","messages":[{"role":"system","content":"You
60
+ are an evaluation judge. You will be given a response and one or more evaluation
61
+ criteria.\nYour job is to score how well the response meets the criteria.\n\nScoring:\n-
62
+ 0: Completely fails to meet the criteria\n- 1-3: Mostly fails, with minor
63
+ positive elements\n- 4-6: Partially meets criteria, significant room for improvement\n-
64
+ 7-8: Mostly meets criteria with minor issues\n- 9: Meets criteria well\n-
65
+ 10: Perfectly meets all criteria\n\nBe strict but fair. Consider each criterion
66
+ carefully.\n\nYou MUST respond with valid JSON in this exact format:\n{\"score\":
67
+ <0-10>, \"reasoning\": \"Brief explanation of the score\"}\n\nYour reasoning
68
+ should be concise (1-2 sentences max).\n"},{"role":"user","content":"## Response
69
+ to evaluate:\nI''m doing great, thank you for asking! I''m ready to help you
70
+ with whatever you need. \n\nHow are things going with you? Is there anything
71
+ specific I can help you with today?\n\n## Additional context:\nUser prompt:
72
+ Hello! How are you today?\n\n## Evaluation criteria:\nresponds in a friendly
73
+ manner\nis appropriate in length\n\nScore this response from 0-10. Respond
74
+ with JSON only."}],"response_format":{"type":"json_object"}}'
75
+ headers:
76
+ Content-Type:
77
+ - application/json
78
+ Authorization:
79
+ - Bearer <API_KEY>
80
+ User-Agent:
81
+ - Faraday v2.14.0
82
+ response:
83
+ status:
84
+ code: 200
85
+ message: OK
86
+ headers:
87
+ date:
88
+ - Thu, 25 Dec 2025 00:26:40 GMT
89
+ content-type:
90
+ - application/json
91
+ transfer-encoding:
92
+ - chunked
93
+ connection:
94
+ - keep-alive
95
+ content-encoding:
96
+ - gzip
97
+ access-control-allow-origin:
98
+ - "*"
99
+ vary:
100
+ - Accept-Encoding
101
+ permissions-policy:
102
+ - payment=(self "https://checkout.stripe.com" "https://connect-js.stripe.com"
103
+ "https://js.stripe.com" "https://*.js.stripe.com" "https://hooks.stripe.com")
104
+ referrer-policy:
105
+ - no-referrer, strict-origin-when-cross-origin
106
+ x-content-type-options:
107
+ - nosniff
108
+ server:
109
+ - cloudflare
110
+ cf-ray:
111
+ - 9b34354c5bada1d1-MSP
112
+ body:
113
+ encoding: ASCII-8BIT
114
+ string: "\n \n{\"id\":\"gen-1766622399-NovBTMk4uVXaHxIReaM3\",\"provider\":\"Google
115
+ AI Studio\",\"model\":\"google/gemini-3-flash-preview\",\"object\":\"chat.completion\",\"created\":1766622399,\"choices\":[{\"logprobs\":null,\"finish_reason\":\"stop\",\"native_finish_reason\":\"STOP\",\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"{\\\"score\\\":
116
+ 10, \\\"reasoning\\\": \\\"The response is warm and friendly while maintaining
117
+ a perfectly concise and appropriate length for the greeting.\\\"}\",\"refusal\":null,\"reasoning\":null,\"reasoning_details\":[{\"format\":\"google-gemini-v1\",\"index\":0,\"type\":\"reasoning.encrypted\",\"data\":\"EjQKMgFyyNp8oYVEKG82B2pc5SoTKa1N7Hmj3FgVav3EngLEb3J24El8ffeyEsycGIylLBcv\"}]}}],\"usage\":{\"prompt_tokens\":265,\"completion_tokens\":31,\"total_tokens\":296,\"cost\":0.0002255,\"is_byok\":false,\"prompt_tokens_details\":{\"cached_tokens\":0,\"audio_tokens\":0,\"video_tokens\":0},\"cost_details\":{\"upstream_inference_cost\":null,\"upstream_inference_prompt_cost\":0.0001325,\"upstream_inference_completions_cost\":0.000093},\"completion_tokens_details\":{\"reasoning_tokens\":0,\"image_tokens\":0}}}"
118
+ recorded_at: Thu, 25 Dec 2025 00:26:40 GMT
119
+ - request:
120
+ method: post
121
+ uri: https://openrouter.ai/api/v1/chat/completions
122
+ body:
123
+ encoding: UTF-8
124
+ string: '{"model":"google/gemini-3-flash-preview","messages":[{"role":"user","content":"What
125
+ is 7 times 8?"}]}'
126
+ headers:
127
+ Content-Type:
128
+ - application/json
129
+ Authorization:
130
+ - Bearer <API_KEY>
131
+ User-Agent:
132
+ - Faraday v2.14.0
133
+ response:
134
+ status:
135
+ code: 200
136
+ message: OK
137
+ headers:
138
+ date:
139
+ - Thu, 25 Dec 2025 00:26:40 GMT
140
+ content-type:
141
+ - application/json
142
+ transfer-encoding:
143
+ - chunked
144
+ connection:
145
+ - keep-alive
146
+ content-encoding:
147
+ - gzip
148
+ access-control-allow-origin:
149
+ - "*"
150
+ vary:
151
+ - Accept-Encoding
152
+ permissions-policy:
153
+ - payment=(self "https://checkout.stripe.com" "https://connect-js.stripe.com"
154
+ "https://js.stripe.com" "https://*.js.stripe.com" "https://hooks.stripe.com")
155
+ referrer-policy:
156
+ - no-referrer, strict-origin-when-cross-origin
157
+ x-content-type-options:
158
+ - nosniff
159
+ server:
160
+ - cloudflare
161
+ cf-ray:
162
+ - 9b343551efdee02b-MSP
163
+ body:
164
+ encoding: ASCII-8BIT
165
+ string: "\n \n{\"id\":\"gen-1766622400-FZhCKv98cSCsFMYiK3QW\",\"provider\":\"Google
166
+ AI Studio\",\"model\":\"google/gemini-3-flash-preview\",\"object\":\"chat.completion\",\"created\":1766622400,\"choices\":[{\"logprobs\":null,\"finish_reason\":\"stop\",\"native_finish_reason\":\"STOP\",\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"7
167
+ times 8 is **56**.\",\"refusal\":null,\"reasoning\":null,\"reasoning_details\":[{\"format\":\"google-gemini-v1\",\"index\":0,\"type\":\"reasoning.encrypted\",\"data\":\"EjQKMgFyyNp88llrW7rph3pu6U7rTIoXdbn+DMh3mGII1VQPcclBfCOU7g7KA42LUdRSJuDx\"}]}}],\"usage\":{\"prompt_tokens\":9,\"completion_tokens\":9,\"total_tokens\":18,\"cost\":0.0000315,\"is_byok\":false,\"prompt_tokens_details\":{\"cached_tokens\":0,\"audio_tokens\":0,\"video_tokens\":0},\"cost_details\":{\"upstream_inference_cost\":null,\"upstream_inference_prompt_cost\":0.0000045,\"upstream_inference_completions_cost\":0.000027},\"completion_tokens_details\":{\"reasoning_tokens\":0,\"image_tokens\":0}}}"
168
+ recorded_at: Thu, 25 Dec 2025 00:26:40 GMT
169
+ - request:
170
+ method: post
171
+ uri: https://openrouter.ai/api/v1/chat/completions
172
+ body:
173
+ encoding: UTF-8
174
+ string: '{"model":"google/gemini-3-flash-preview","messages":[{"role":"system","content":"You
175
+ are an evaluation judge. You will be given a response and one or more evaluation
176
+ criteria.\nYour job is to score how well the response meets the criteria.\n\nScoring:\n-
177
+ 0: Completely fails to meet the criteria\n- 1-3: Mostly fails, with minor
178
+ positive elements\n- 4-6: Partially meets criteria, significant room for improvement\n-
179
+ 7-8: Mostly meets criteria with minor issues\n- 9: Meets criteria well\n-
180
+ 10: Perfectly meets all criteria\n\nBe strict but fair. Consider each criterion
181
+ carefully.\n\nYou MUST respond with valid JSON in this exact format:\n{\"score\":
182
+ <0-10>, \"reasoning\": \"Brief explanation of the score\"}\n\nYour reasoning
183
+ should be concise (1-2 sentences max).\n"},{"role":"user","content":"## Response
184
+ to evaluate:\n7 times 8 is **56**.\n\n## Additional context:\nUser prompt:
185
+ What is 7 times 8?\n\n## Evaluation criteria:\nprovides the correct answer
186
+ (56)\ndoesn''t over-explain\n\nScore this response from 0-10. Respond with
187
+ JSON only."}],"response_format":{"type":"json_object"}}'
188
+ headers:
189
+ Content-Type:
190
+ - application/json
191
+ Authorization:
192
+ - Bearer <API_KEY>
193
+ User-Agent:
194
+ - Faraday v2.14.0
195
+ response:
196
+ status:
197
+ code: 200
198
+ message: OK
199
+ headers:
200
+ date:
201
+ - Thu, 25 Dec 2025 00:26:41 GMT
202
+ content-type:
203
+ - application/json
204
+ transfer-encoding:
205
+ - chunked
206
+ connection:
207
+ - keep-alive
208
+ content-encoding:
209
+ - gzip
210
+ access-control-allow-origin:
211
+ - "*"
212
+ vary:
213
+ - Accept-Encoding
214
+ permissions-policy:
215
+ - payment=(self "https://checkout.stripe.com" "https://connect-js.stripe.com"
216
+ "https://js.stripe.com" "https://*.js.stripe.com" "https://hooks.stripe.com")
217
+ referrer-policy:
218
+ - no-referrer, strict-origin-when-cross-origin
219
+ x-content-type-options:
220
+ - nosniff
221
+ server:
222
+ - cloudflare
223
+ cf-ray:
224
+ - 9b34355638a3aced-MSP
225
+ body:
226
+ encoding: ASCII-8BIT
227
+ string: "\n \n\n \n{\"id\":\"gen-1766622401-oGF5F5TOjGtBql8KEu7Y\",\"provider\":\"Google
228
+ AI Studio\",\"model\":\"google/gemini-3-flash-preview\",\"object\":\"chat.completion\",\"created\":1766622401,\"choices\":[{\"logprobs\":null,\"finish_reason\":\"stop\",\"native_finish_reason\":\"STOP\",\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"{\\\"score\\\":
229
+ 10, \\\"reasoning\\\": \\\"The response provides the correct mathematical
230
+ answer and is perfectly concise without any unnecessary explanation.\\\"}\",\"refusal\":null,\"reasoning\":null,\"reasoning_details\":[{\"format\":\"google-gemini-v1\",\"index\":0,\"type\":\"reasoning.encrypted\",\"data\":\"EjQKMgFyyNp8+lZt6C5XsGlmh2G0c/l+peG53Biq0ZHTLBRF89P+OMYjEIbfqTnuKPtgolfx\"}]}}],\"usage\":{\"prompt_tokens\":237,\"completion_tokens\":29,\"total_tokens\":266,\"cost\":0.0002055,\"is_byok\":false,\"prompt_tokens_details\":{\"cached_tokens\":0,\"audio_tokens\":0,\"video_tokens\":0},\"cost_details\":{\"upstream_inference_cost\":null,\"upstream_inference_prompt_cost\":0.0001185,\"upstream_inference_completions_cost\":0.000087},\"completion_tokens_details\":{\"reasoning_tokens\":0,\"image_tokens\":0}}}"
231
+ recorded_at: Thu, 25 Dec 2025 00:26:41 GMT
232
+ recorded_with: VCR 6.4.0
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
data/.rubocop.yml ADDED
@@ -0,0 +1 @@
1
+ inherit_from: .rubocop_todo.yml
data/.rubocop_todo.yml ADDED
@@ -0,0 +1,70 @@
1
+ # This configuration was generated by
2
+ # `rubocop --auto-gen-config`
3
+ # on 2025-12-25 02:22:03 UTC using RuboCop version 1.82.1.
4
+ # The point is for the user to remove these configuration records
5
+ # one by one as the offenses are removed from the code base.
6
+ # Note that changes in the inspected code, or installation of new
7
+ # versions of RuboCop, may require this file to be generated again.
8
+
9
+ # Offense count: 5
10
+ # This cop supports safe autocorrection (--autocorrect).
11
+ Lint/RedundantCopDisableDirective:
12
+ Exclude:
13
+ - 'lib/qualspec/rubric.rb'
14
+ - 'lib/qualspec/suite/behavior.rb'
15
+ - 'lib/qualspec/suite/dsl.rb'
16
+ - 'lib/qualspec/suite/scenario.rb'
17
+
18
+ # Offense count: 17
19
+ # Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes.
20
+ Metrics/AbcSize:
21
+ Max: 74
22
+
23
+ # Offense count: 6
24
+ # Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
25
+ # AllowedMethods: refine
26
+ Metrics/BlockLength:
27
+ Max: 76
28
+
29
+ # Offense count: 4
30
+ # Configuration parameters: CountComments, CountAsOne.
31
+ Metrics/ClassLength:
32
+ Max: 552
33
+
34
+ # Offense count: 9
35
+ # Configuration parameters: AllowedMethods, AllowedPatterns.
36
+ Metrics/CyclomaticComplexity:
37
+ Max: 16
38
+
39
+ # Offense count: 30
40
+ # Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
41
+ Metrics/MethodLength:
42
+ Max: 69
43
+
44
+ # Offense count: 2
45
+ # Configuration parameters: CountKeywordArgs, MaxOptionalParameters.
46
+ Metrics/ParameterLists:
47
+ Max: 9
48
+
49
+ # Offense count: 6
50
+ # Configuration parameters: AllowedMethods, AllowedPatterns.
51
+ Metrics/PerceivedComplexity:
52
+ Max: 16
53
+
54
+ # Offense count: 21
55
+ # Configuration parameters: AllowedConstants.
56
+ Style/Documentation:
57
+ Enabled: false
58
+
59
+ # Offense count: 3
60
+ # This cop supports safe autocorrection (--autocorrect).
61
+ Style/EvalWithLocation:
62
+ Exclude:
63
+ - 'lib/qualspec/suite/builtin_behaviors.rb'
64
+
65
+ # Offense count: 1
66
+ # This cop supports safe autocorrection (--autocorrect).
67
+ # Configuration parameters: AllowHeredoc, AllowURI, AllowQualifiedName, URISchemes, AllowRBSInlineAnnotation, AllowCopDirectives, AllowedPatterns, SplitStrings.
68
+ # URISchemes: http, https
69
+ Layout/LineLength:
70
+ Max: 187
data/CHANGELOG.md ADDED
@@ -0,0 +1,16 @@
1
+ # Changelog
2
+
3
+ ## [0.0.1] - 2025-12-25
4
+
5
+ Initial release.
6
+
7
+ ### Features
8
+
9
+ - DSL for defining evaluation suites to compare LLM responses
10
+ - RSpec integration with custom matchers (`be_passing`, `have_score_above`, etc.)
11
+ - LLM-as-judge scoring system (0-10 scale)
12
+ - Built-in rubrics: `tool_calling`, `helpful`, `concise`, `in_character`, `safety`, and more
13
+ - Shared behaviors for reusable test scenarios
14
+ - VCR-based recording/playback for deterministic tests
15
+ - HTML and JSON report generation
16
+ - OpenRouter API support with configurable models
data/README.md ADDED
@@ -0,0 +1,84 @@
1
+ # Qualspec
2
+
3
+ LLM-judged qualitative testing for Ruby. Evaluate AI agents, compare models, and test subjective qualities that traditional assertions can't capture.
4
+
5
+ ## Installation
6
+
7
+ ```ruby
8
+ gem "qualspec"
9
+ ```
10
+
11
+ ## Configuration
12
+
13
+ Set your API key (required):
14
+
15
+ ```bash
16
+ export QUALSPEC_API_KEY=your_openrouter_key
17
+ ```
18
+
19
+ ### Environment Variables
20
+
21
+ | Variable | Description | Default |
22
+ |----------|-------------|---------|
23
+ | `QUALSPEC_API_KEY` | API key (required) | - |
24
+ | `QUALSPEC_API_URL` | API endpoint | `https://openrouter.ai/api/v1` |
25
+ | `QUALSPEC_MODEL` | Default model for candidates | `google/gemini-3-flash-preview` |
26
+ | `QUALSPEC_JUDGE_MODEL` | Model used as judge | Same as `QUALSPEC_MODEL` |
27
+
28
+ ## Quick Start
29
+
30
+ ### Compare Models (CLI)
31
+
32
+ ```ruby
33
+ # eval/comparison.rb
34
+ Qualspec.evaluation "Model Comparison" do
35
+ candidates do
36
+ candidate "gpt4", model: "openai/gpt-4"
37
+ candidate "claude", model: "anthropic/claude-3-sonnet"
38
+ end
39
+
40
+ scenario "helpfulness" do
41
+ prompt "How do I center a div in CSS?"
42
+ eval "provides a working solution"
43
+ eval "explains the approach"
44
+ end
45
+ end
46
+ ```
47
+
48
+ ```bash
49
+ # Run comparison
50
+ qualspec eval/comparison.rb
51
+
52
+ # Generate HTML report
53
+ qualspec --html report.html eval/comparison.rb
54
+ ```
55
+
56
+ ### Test Your Agent (RSpec)
57
+
58
+ ```ruby
59
+ require "qualspec/rspec"
60
+
61
+ RSpec.describe MyAgent do
62
+ include Qualspec::RSpec::Helpers
63
+
64
+ it "responds helpfully" do
65
+ response = MyAgent.call("Hello")
66
+
67
+ result = qualspec_evaluate(response, "responds in a friendly manner")
68
+ expect(result).to be_passing
69
+ end
70
+ end
71
+ ```
72
+
73
+ ## Documentation
74
+
75
+ - [Getting Started](docs/getting-started.md)
76
+ - [Evaluation Suites](docs/evaluation-suites.md) - CLI for model comparison
77
+ - [RSpec Integration](docs/rspec-integration.md) - Testing your agents
78
+ - [Rubrics](docs/rubrics.md) - Built-in and custom evaluation criteria
79
+ - [Configuration](docs/configuration.md) - All options
80
+ - [Recording](docs/recording.md) - VCR integration
81
+
82
+ ## License
83
+
84
+ MIT
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/gem_tasks'
4
+ require 'rspec/core/rake_task'
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ task default: :spec
data/docs/configuration.md ADDED
@@ -0,0 +1,132 @@
1
+ # Configuration
2
+
3
+ ## Environment Variables
4
+
5
+ | Variable | Description | Default |
6
+ |----------|-------------|---------|
7
+ | `QUALSPEC_API_KEY` | API key (required) | - |
8
+ | `QUALSPEC_API_URL` | API endpoint | `https://openrouter.ai/api/v1` |
9
+ | `QUALSPEC_MODEL` | Default model for candidates | `google/gemini-3-flash-preview` |
10
+ | `QUALSPEC_JUDGE_MODEL` | Model for judging | Same as `QUALSPEC_MODEL` |
11
+ | `QUALSPEC_SSL_VERIFY` | SSL verification (disable with `false`) | `true` |
12
+
13
+ ### Required Setup
14
+
15
+ ```bash
16
+ export QUALSPEC_API_KEY=your_openrouter_api_key
17
+ ```
18
+
19
+ ### Using Different Providers
20
+
21
+ **OpenRouter (default):**
22
+ ```bash
23
+ export QUALSPEC_API_KEY=sk-or-...
24
+ # API URL defaults to https://openrouter.ai/api/v1
25
+ ```
26
+
27
+ **OpenAI:**
28
+ ```bash
29
+ export QUALSPEC_API_KEY=sk-...
30
+ export QUALSPEC_API_URL=https://api.openai.com/v1
31
+ ```
32
+
33
+ **Ollama (local):**
34
+ ```bash
35
+ export QUALSPEC_API_URL=http://localhost:11434/v1
36
+ # No API key needed for local Ollama
37
+ ```
38
+
39
+ ## Programmatic Configuration
40
+
41
+ ```ruby
42
+ Qualspec.configure do |config|
43
+ # API settings
44
+ config.api_url = "https://openrouter.ai/api/v1"
45
+ config.api_key = ENV["MY_API_KEY"]
46
+
47
+ # Models
48
+ config.default_model = "google/gemini-2.5-flash-preview"
49
+ config.judge_model = "openai/gpt-4"
50
+
51
+ # Timeouts
52
+ config.request_timeout = 120 # seconds
53
+
54
+ # Custom judge prompt (optional)
55
+ config.judge_system_prompt = <<~PROMPT
56
+ You are a strict evaluator...
57
+ PROMPT
58
+ end
59
+ ```
60
+
61
+ ## RSpec Configuration
62
+
63
+ Additional configuration for RSpec integration:
64
+
65
+ ```ruby
66
+ Qualspec::RSpec.configure do |config|
67
+ # Default pass threshold (0-10)
68
+ config.default_threshold = 7
69
+
70
+ # VCR cassette directory
71
+ config.vcr_cassette_dir = "spec/cassettes/qualspec"
72
+
73
+ # VCR recording mode (:new_episodes, :none, :all, :once)
74
+ config.record_mode = :new_episodes
75
+
76
+ # Auto-load builtin rubrics
77
+ config.load_builtins = true
78
+ end
79
+ ```
80
+
81
+ ## CLI Options
82
+
83
+ Override configuration via command line:
84
+
85
+ ```bash
86
+ # Override judge model
87
+ qualspec -m openai/gpt-4 eval/suite.rb
88
+
89
+ # Override API URL
90
+ qualspec -u https://api.openai.com/v1 eval/suite.rb
91
+
92
+ # Override API key
93
+ qualspec -k sk-xxx eval/suite.rb
94
+ ```
95
+
96
+ ## Configuration Precedence
97
+
98
+ 1. CLI options (highest)
99
+ 2. Programmatic configuration
100
+ 3. Environment variables
101
+ 4. Defaults (lowest)
102
+
103
+ ## Supported Providers
104
+
105
+ Any OpenAI-compatible API works:
106
+
107
+ | Provider | API URL |
108
+ |----------|---------|
109
+ | OpenRouter | `https://openrouter.ai/api/v1` |
110
+ | OpenAI | `https://api.openai.com/v1` |
111
+ | Ollama | `http://localhost:11434/v1` |
112
+ | Azure OpenAI | `https://YOUR.openai.azure.com/openai/deployments/YOUR_DEPLOYMENT` |
113
+ | Together AI | `https://api.together.xyz/v1` |
114
+ | Groq | `https://api.groq.com/openai/v1` |
115
+
116
+ ## SSL Configuration
117
+
118
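+ SSL verification is enabled by default. Disable it only if you encounter certificate issues in an environment you trust (for example, a local proxy with a self-signed certificate), because turning it off removes protection against man-in-the-middle attacks: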
119
+
120
+ ```bash
121
+ export QUALSPEC_SSL_VERIFY=false
122
+ ```
123
+
124
+ ## Accessing Configuration
125
+
126
+ ```ruby
127
+ config = Qualspec.configuration
128
+
129
+ config.api_url # Current API URL
130
+ config.judge_model # Current judge model
131
+ config.api_headers # Headers sent with requests
132
+ ```