rspec-llm 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.idea/rspec-llm.iml +4 -0
- data/.rubocop.yml +1 -1
- data/CHANGELOG.md +30 -0
- data/README.md +34 -5
- data/lib/rspec/llm/adapters/base.rb +9 -0
- data/lib/rspec/llm/adapters/fake.rb +22 -0
- data/lib/rspec/llm/adapters/ruby_llm.rb +20 -0
- data/lib/rspec/llm/matchers/match_json_schema.rb +37 -3
- data/lib/rspec/llm/matchers/pass_llm_judge.rb +35 -5
- data/lib/rspec/llm/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 91156e04ac3b433d59baecabcc24e857e6ede80327e9d08f453a34f0c9d511bd
|
|
4
|
+
data.tar.gz: c0fdf5f3a4e7af09e69062a9989dac19ef1e34db5e382fa2420cf51945c852d9
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: e1ca1d12ed29de3c2560c82675a953fa169958f96fa9fe5b7e215017c4d9607ce4be370b4898d232e24118350b483543bb4b7950fd48aebe1c825d8f5229a3f3
|
|
7
|
+
data.tar.gz: 80858d6da5a91633e237552510e6d804a2e15cf4f8693f1d567afacd52335660e23aeb60ac53092b8979b6199fa690a9be105a47e63079896438903ebe9c1509
|
data/.idea/rspec-llm.iml
CHANGED
|
@@ -20,6 +20,7 @@
|
|
|
20
20
|
<orderEntry type="library" scope="PROVIDED" name="crack (v1.0.1, rbenv: 3.4.9) [gem]" level="application" />
|
|
21
21
|
<orderEntry type="library" scope="PROVIDED" name="csv (v3.3.5, rbenv: 3.4.9) [gem]" level="application" />
|
|
22
22
|
<orderEntry type="library" scope="PROVIDED" name="diff-lcs (v1.6.2, rbenv: 3.4.9) [gem]" level="application" />
|
|
23
|
+
<orderEntry type="library" scope="PROVIDED" name="docile (v1.4.1, rbenv: 3.4.9) [gem]" level="application" />
|
|
23
24
|
<orderEntry type="library" scope="PROVIDED" name="event_stream_parser (v1.0.0, rbenv: 3.4.9) [gem]" level="application" />
|
|
24
25
|
<orderEntry type="library" scope="PROVIDED" name="faraday (v2.14.2, rbenv: 3.4.9) [gem]" level="application" />
|
|
25
26
|
<orderEntry type="library" scope="PROVIDED" name="faraday-multipart (v1.2.0, rbenv: 3.4.9) [gem]" level="application" />
|
|
@@ -56,6 +57,9 @@
|
|
|
56
57
|
<orderEntry type="library" scope="PROVIDED" name="ruby-progressbar (v1.13.0, rbenv: 3.4.9) [gem]" level="application" />
|
|
57
58
|
<orderEntry type="library" scope="TEST" name="ruby_llm (v1.15.0, rbenv: 3.4.9) [gem]" level="application" />
|
|
58
59
|
<orderEntry type="library" scope="PROVIDED" name="ruby_llm-schema (v0.4.0, rbenv: 3.4.9) [gem]" level="application" />
|
|
60
|
+
<orderEntry type="library" scope="TEST" name="simplecov (v0.22.0, rbenv: 3.4.9) [gem]" level="application" />
|
|
61
|
+
<orderEntry type="library" scope="PROVIDED" name="simplecov-html (v0.13.2, rbenv: 3.4.9) [gem]" level="application" />
|
|
62
|
+
<orderEntry type="library" scope="PROVIDED" name="simplecov_json_formatter (v0.1.4, rbenv: 3.4.9) [gem]" level="application" />
|
|
59
63
|
<orderEntry type="library" scope="PROVIDED" name="unicode-display_width (v3.2.0, rbenv: 3.4.9) [gem]" level="application" />
|
|
60
64
|
<orderEntry type="library" scope="PROVIDED" name="unicode-emoji (v4.2.0, rbenv: 3.4.9) [gem]" level="application" />
|
|
61
65
|
<orderEntry type="library" scope="PROVIDED" name="uri (v1.1.1, rbenv: 3.4.9) [gem]" level="application" />
|
data/.rubocop.yml
CHANGED
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,35 @@
|
|
|
1
1
|
## [Unreleased]
|
|
2
2
|
|
|
3
|
+
## [0.2.0] - 2026-06-05
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
|
|
7
|
+
- **`pass_llm_judge` — structured JSON evaluation** (#1).
|
|
8
|
+
When the `ruby_llm` gem is loaded, the judge prompt is now sent with a
|
|
9
|
+
`RubyLLM::Schema` contract that constrains the model to return a
|
|
10
|
+
machine-readable `{ passed: boolean, reason: string }` payload. This
|
|
11
|
+
eliminates brittle first-token YES/NO parsing and surfaces a full explanation
|
|
12
|
+
in every failure message. Adapters or configurations that do not support
|
|
13
|
+
structured output fall back automatically to the original text-parsing
|
|
14
|
+
strategy, so no existing code or specs need to change.
|
|
15
|
+
- **`match_json_schema` — class-based schema introspection** (#1).
|
|
16
|
+
The matcher now accepts any Ruby class as its argument in addition to raw
|
|
17
|
+
JSON Schema hashes:
|
|
18
|
+
- `Data.define` / `Struct` → attributes are read via `.members`.
|
|
19
|
+
- PORO / ActiveModel → attributes are discovered from public writer methods
|
|
20
|
+
(`#name=`).
|
|
21
|
+
|
|
22
|
+
The derived schema marks every attribute as `required` (typed `string`).
|
|
23
|
+
Raw Hash schemas continue to work without any changes.
|
|
24
|
+
- `Adapters::Base#chat_structured(messages, schema: nil)` — new adapter
|
|
25
|
+
surface for structured-output requests. The default implementation falls
|
|
26
|
+
back to `#chat`, preserving full backward compatibility for custom adapters.
|
|
27
|
+
- `Adapters::RubyLLM#chat_structured` — calls `client.with_schema(schema)`
|
|
28
|
+
before `client.ask` so the underlying model returns a parsed Hash.
|
|
29
|
+
- `Adapters::Fake#chat_structured` — automatically JSON-parses stub strings
|
|
30
|
+
into Hashes, making it trivial to write hermetic structured-output tests
|
|
31
|
+
without needing the `ruby_llm-schema` gem.
|
|
32
|
+
|
|
3
33
|
## [0.1.0] - 2026-06-04
|
|
4
34
|
|
|
5
35
|
### Added
|
data/README.md
CHANGED
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
# rspec-llm
|
|
2
2
|
|
|
3
|
-
[](https://github.com/washu/rspec-llm/actions/workflows/ci.yml)
|
|
4
|
+
[](https://codecov.io/gh/washu/rspec-llm)
|
|
5
|
+
[](https://rubygems.org/gems/rspec-llm)
|
|
6
|
+
[](https://rubygems.org/gems/rspec-llm)
|
|
4
7
|
|
|
5
8
|
RSpec matchers, helpers, and a thin DSL for testing LLM-backed code in Ruby.
|
|
6
9
|
|
|
@@ -74,7 +77,7 @@ end
|
|
|
74
77
|
|
|
75
78
|
### `pass_llm_judge`
|
|
76
79
|
|
|
77
|
-
Sends the response and a criterion to the configured judge model
|
|
80
|
+
Sends the response and a criterion to the configured judge model. When the `ruby_llm` gem is loaded, the judge uses a `RubyLLM::Schema`-backed structured contract that forces the model to return `{ passed: boolean, reason: string }` — eliminating brittle first-token YES/NO parsing and always surfacing a rich explanation in the failure message. For all other adapters the matcher falls back to parsing the first YES/NO token transparently.
|
|
78
81
|
|
|
79
82
|
```ruby
|
|
80
83
|
expect(reply).to pass_llm_judge("is polite and apologetic")
|
|
@@ -92,7 +95,9 @@ expect(reply).to match_llm_intent("a refund confirmation for order #12345")
|
|
|
92
95
|
|
|
93
96
|
### `match_json_schema`
|
|
94
97
|
|
|
95
|
-
Parses the actual value as JSON (or accepts a Hash/Array directly) and validates against
|
|
98
|
+
Parses the actual value as JSON (or accepts a Hash/Array directly) and validates against a schema via the `json-schema` gem. The schema argument can be:
|
|
99
|
+
|
|
100
|
+
**A raw JSON Schema hash** (original behaviour — fully backward-compatible):
|
|
96
101
|
|
|
97
102
|
```ruby
|
|
98
103
|
schema = {
|
|
@@ -103,6 +108,24 @@ schema = {
|
|
|
103
108
|
expect(response).to match_json_schema(schema)
|
|
104
109
|
```
|
|
105
110
|
|
|
111
|
+
**A Ruby class** — `Data.define`, `Struct`, or any PORO with `attr_accessor`. The matcher introspects the class and derives a schema in which every attribute is a required property typed as `string`:
|
|
112
|
+
|
|
113
|
+
```ruby
|
|
114
|
+
# Data.define (Ruby >= 3.2)
|
|
115
|
+
UserProfile = Data.define(:full_name, :verified_email)
|
|
116
|
+
expect(response).to match_json_schema(UserProfile)
|
|
117
|
+
|
|
118
|
+
# Struct
|
|
119
|
+
Point = Struct.new(:x, :y)
|
|
120
|
+
expect(response).to match_json_schema(Point)
|
|
121
|
+
|
|
122
|
+
# PORO
|
|
123
|
+
class OrderSummary
|
|
124
|
+
attr_accessor :order_id, :total, :status
|
|
125
|
+
end
|
|
126
|
+
expect(response).to match_json_schema(OrderSummary)
|
|
127
|
+
```
|
|
128
|
+
|
|
106
129
|
### `be_semantically_similar_to`
|
|
107
130
|
|
|
108
131
|
Embeds both sides via the configured `embedder`, computes cosine similarity, and compares to the threshold. Override the threshold per-matcher with `.within(0.9)`.
|
|
@@ -129,9 +152,15 @@ RSpec.describe "Greeter" do
|
|
|
129
152
|
end
|
|
130
153
|
```
|
|
131
154
|
|
|
132
|
-
Use `stub_llm_judge` to stub the judge model separately — handy when testing your own code that wraps `pass_llm_judge`:
|
|
155
|
+
Use `stub_llm_judge` to stub the judge model separately — handy when testing your own code that wraps `pass_llm_judge`. Stub with a JSON string to exercise the structured-output path, or with a `YES`/`NO` string to exercise the text-parsing fallback:
|
|
133
156
|
|
|
134
157
|
```ruby
|
|
158
|
+
# Structured output (recommended — matches real ruby_llm behaviour)
|
|
159
|
+
stub_llm_judge do |fake|
|
|
160
|
+
fake.default('{"passed":true,"reason":"Looks good to me."}')
|
|
161
|
+
end
|
|
162
|
+
|
|
163
|
+
# Legacy text format (still works for backward compatibility)
|
|
135
164
|
stub_llm_judge do |fake|
|
|
136
165
|
fake.default("YES\nLooks good to me.")
|
|
137
166
|
end
|
|
@@ -187,7 +216,7 @@ bin/console # interactive prompt
|
|
|
187
216
|
|
|
188
217
|
## Contributing
|
|
189
218
|
|
|
190
|
-
Bug reports and pull requests welcome on GitHub at https://github.com/
|
|
219
|
+
Bug reports and pull requests welcome on GitHub at https://github.com/washu/rspec-llm.
|
|
191
220
|
|
|
192
221
|
## License
|
|
193
222
|
|
|
@@ -40,6 +40,15 @@ module RSpec
|
|
|
40
40
|
raise NotImplementedError
|
|
41
41
|
end
|
|
42
42
|
|
|
43
|
+
# Send a chat with an optional structured output schema. When a
|
|
44
|
+
# RubyLLM::Schema class is provided via +schema:+, adapters that support
|
|
45
|
+
# structured output will return a Hash instead of a String. Adapters that
|
|
46
|
+
# do not support structured output fall back to plain #chat and return a
|
|
47
|
+
# String; callers are responsible for parsing that fallback.
#
# This base implementation ignores +schema+ and delegates to #chat.
#
# @param messages the chat messages, forwarded to #chat unchanged
# @param schema optional structured-output schema; not used here
# @return whatever #chat returns for +messages+
|
|
48
|
+
def chat_structured(messages, schema: nil) # rubocop:disable Lint/UnusedMethodArgument
|
|
49
|
+
chat(messages)
|
|
50
|
+
end
|
|
51
|
+
|
|
43
52
|
# Embed text. Returns Array<Float>. Optional — adapters may raise
|
|
44
53
|
# NotImplementedError if the underlying client doesn't support it.
|
|
45
54
|
def embed(_text)
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "json"
|
|
4
|
+
|
|
3
5
|
module RSpec
|
|
4
6
|
module LLM
|
|
5
7
|
module Adapters
|
|
@@ -67,6 +69,26 @@ module RSpec
|
|
|
67
69
|
response.is_a?(Proc) ? response.call(prompt) : response
|
|
68
70
|
end
|
|
69
71
|
|
|
72
|
+
# Extends #chat with structured-output support for tests. The stubbed
|
|
73
|
+
# reply is always JSON-parsed, whether or not +schema+ is given (the
|
|
74
|
+
# argument is accepted for interface compatibility and never read); if
|
|
75
|
+
# parsing yields a Hash it is returned, with symbolized keys, so callers
|
|
76
|
+
# receive a Hash just as they would from a structured adapter. Stubs that
|
|
77
|
+
# are already a Hash are returned unchanged.
|
|
78
|
+
#
|
|
79
|
+
# Replies that are not valid JSON, or whose JSON is not an object, are
|
|
80
|
+
# returned as-is — this keeps existing YES/NO-style stubs working and lets
|
|
81
|
+
# tests stub structured replies without the RubyLLM::Schema gem loaded.
|
|
82
|
+
def chat_structured(messages, schema: nil) # rubocop:disable Lint/UnusedMethodArgument
|
|
83
|
+
response = chat(messages)
|
|
84
|
+
return response if response.is_a?(Hash)
|
|
85
|
+
|
|
86
|
+
# to_s guards against non-String stubs before parsing.
parsed = JSON.parse(response.to_s, symbolize_names: true)
|
|
87
|
+
# Valid JSON that is not an object (array, number, ...) falls back to the raw stub.
parsed.is_a?(Hash) ? parsed : response
|
|
88
|
+
rescue JSON::ParserError
|
|
89
|
+
response
|
|
90
|
+
end
|
|
91
|
+
|
|
70
92
|
def embed(text)
|
|
71
93
|
raise NotImplementedError, "configure with #embed_with { |text| vector }" unless @embedder
|
|
72
94
|
|
|
@@ -20,6 +20,26 @@ module RSpec
|
|
|
20
20
|
extract_content(response)
|
|
21
21
|
end
|
|
22
22
|
|
|
23
|
+
# Uses +with_schema+ on the underlying client when a schema is provided,
|
|
24
|
+
# so the reply can come back structured instead of as raw text. Falls back
|
|
25
|
+
# to plain #chat when +schema+ is nil or the client lacks +with_schema+.
#
# On the structured path only messages with role "system" that precede the
# final message are forwarded (through +with_instructions+, when the client
# responds to it); the final message's content is sent with +ask+. Any other
# earlier messages are not sent on this path.
#
# NOTE(review): +with_schema+ is applied to +client+ and never undone here —
# confirm whether later plain #chat calls on the same client are affected.
|
|
26
|
+
def chat_structured(messages, schema: nil)
|
|
27
|
+
return chat(messages) unless schema && client.respond_to?(:with_schema)
|
|
28
|
+
|
|
29
|
+
normalized = normalize_messages(messages)
|
|
30
|
+
last = normalized.last
|
|
31
|
+
# Everything except the final message, filtered down to system prompts.
system_msgs = normalized[0..-2].select { |m| m[:role] == "system" }
|
|
32
|
+
if system_msgs.any? && client.respond_to?(:with_instructions)
|
|
33
|
+
system_msgs.each do |m|
|
|
34
|
+
client.with_instructions(m[:content])
|
|
35
|
+
end
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
client.with_schema(schema)
|
|
39
|
+
response = client.ask(last[:content])
|
|
40
|
+
extract_content(response)
|
|
41
|
+
end
|
|
42
|
+
|
|
23
43
|
def embed(text)
|
|
24
44
|
return @embedder.call(text) if @embedder
|
|
25
45
|
|
|
@@ -7,10 +7,23 @@ module RSpec
|
|
|
7
7
|
module LLM
|
|
8
8
|
module Matchers
|
|
9
9
|
# Asserts the actual value parses as JSON and conforms to the provided
|
|
10
|
-
#
|
|
10
|
+
# schema. The schema argument may be:
|
|
11
|
+
#
|
|
12
|
+
# * A Hash — raw JSON Schema (original behaviour, fully backward-compatible).
|
|
13
|
+
# * A Class — any Ruby class whose attributes can be introspected:
|
|
14
|
+
# * +Data.define+ / +Struct+: attributes are read via +.members+.
|
|
15
|
+
# * PORO / ActiveModel: attributes are discovered from public writer
|
|
16
|
+
# methods (+#name=+).
|
|
17
|
+
# In both cases a JSON Schema is derived automatically; every attribute
|
|
18
|
+
# is typed as +string+ and marked +required+.
|
|
11
19
|
class MatchJsonSchema
|
|
12
20
|
# @param schema [Hash, Class] a raw JSON Schema, or a class to derive one
#   from. For a Class, its name (or +inspect+ when it has no name) is kept
#   in @class_name and the schema is built by schema_from_class; any other
#   value is stored as the schema unchanged.
def initialize(schema)
|
|
13
|
-
|
|
21
|
+
if schema.is_a?(Class)
|
|
22
|
+
@class_name = schema.name || schema.inspect
|
|
23
|
+
@schema = schema_from_class(schema)
|
|
24
|
+
else
|
|
25
|
+
@schema = schema
|
|
26
|
+
end
|
|
14
27
|
end
|
|
15
28
|
|
|
16
29
|
def matches?(actual)
|
|
@@ -23,7 +36,7 @@ module RSpec
|
|
|
23
36
|
end
|
|
24
37
|
|
|
25
38
|
# Matcher description; names the class when the schema was derived from one
# (@class_name is only set for Class arguments).
def description
|
|
26
|
-
"match JSON schema"
|
|
39
|
+
@class_name ? "match JSON schema for #{@class_name}" : "match JSON schema"
|
|
27
40
|
end
|
|
28
41
|
|
|
29
42
|
def failure_message
|
|
@@ -40,6 +53,27 @@ module RSpec
|
|
|
40
53
|
|
|
41
54
|
private
|
|
42
55
|
|
|
56
|
+
# Derives a JSON Schema hash from a Ruby class by duck-typing its shape.
#
# * Classes that respond to +.members+ (Struct, Data.define) contribute
#   their member names in declaration order.
# * Any other class contributes the names of the public attribute writers
#   it defines itself (+#name=+), sorted alphabetically. Operator methods
#   that merely end in "=" (+==+, +<=+, +>=+, +===+, +[]=+) are not
#   attribute writers and are ignored.
#
# Every attribute is typed as +string+ and listed under +required+.
#
# @param klass [Class] the class to introspect
# @return [Hash] a JSON Schema object definition with string keys
def schema_from_class(klass)
  members = if klass.respond_to?(:members)
              klass.members.map(&:to_s)
            else
              # Identifier-style writers only: without this, "==".chomp("=")
              # would produce a bogus required attribute named "=".
              writer = /\A[[:alpha:]_][[:alnum:]_]*=\z/
              klass.public_instance_methods(false)
                   .map(&:to_s)
                   .grep(writer)
                   .map { |m| m.chomp("=") }
                   .sort
            end

  {
    "type" => "object",
    "required" => members,
    "properties" => members.to_h { |m| [m, { "type" => "string" }] }
  }
end
|
|
76
|
+
|
|
43
77
|
def parse(actual)
|
|
44
78
|
return actual if actual.is_a?(Hash) || actual.is_a?(Array)
|
|
45
79
|
|
|
@@ -4,8 +4,18 @@ module RSpec
|
|
|
4
4
|
module LLM
|
|
5
5
|
module Matchers
|
|
6
6
|
# LLM-as-judge matcher. Asks the configured judge model whether the
|
|
7
|
-
# actual response satisfies the given criterion.
|
|
8
|
-
#
|
|
7
|
+
# actual response satisfies the given criterion.
|
|
8
|
+
#
|
|
9
|
+
# When the +ruby_llm+ gem is loaded, the judge prompt is sent with a
|
|
10
|
+
# structured +RubyLLM::Schema+ contract that forces the model to return
|
|
11
|
+
# a machine-readable JSON payload:
|
|
12
|
+
#
|
|
13
|
+
# { passed: true/false, reason: "..." }
|
|
14
|
+
#
|
|
15
|
+
# This eliminates brittle first-token YES/NO parsing and surfaces a rich
|
|
16
|
+
# reason string for failure messages. Adapters (or gem configurations)
|
|
17
|
+
# that do not support structured output fall back automatically to the
|
|
18
|
+
# original YES/NO text-parsing strategy.
|
|
9
19
|
class PassLlmJudge
|
|
10
20
|
def initialize(criterion)
|
|
11
21
|
@criterion = criterion
|
|
@@ -20,8 +30,16 @@ module RSpec
|
|
|
20
30
|
|
|
21
31
|
# Asks the judge adapter for a verdict on +actual+ (coerced with +to_s+).
# A Hash reply is read as a structured verdict (+passed+ / +reason+, under
# symbol or string keys); any other reply is stringified, kept in
# @verdict_text, and handed to parse_verdict.
#
# @param actual [#to_s] the response under test
# @return [Boolean] true only when the verdict is exactly +true+
def matches?(actual)
|
|
22
32
|
@actual = actual.to_s
|
|
23
|
-
|
|
24
|
-
|
|
33
|
+
result = judge_adapter.chat_structured(prompt_for(@actual, @criterion), schema: judge_schema)
|
|
34
|
+
|
|
35
|
+
if result.is_a?(Hash)
|
|
36
|
+
@verdict = result[:passed] || result["passed"]
|
|
37
|
+
@reason = (result[:reason] || result["reason"] || "").to_s
|
|
38
|
+
else
|
|
39
|
+
@verdict_text = result.to_s
|
|
40
|
+
@verdict, @reason = parse_verdict(@verdict_text)
|
|
41
|
+
end
|
|
42
|
+
|
|
25
43
|
# Strict comparison: a nil or non-boolean verdict counts as a failure.
@verdict == true
|
|
26
44
|
end
|
|
27
45
|
|
|
@@ -41,6 +59,18 @@ module RSpec
|
|
|
41
59
|
|
|
42
60
|
private
|
|
43
61
|
|
|
62
|
+
# Returns a RubyLLM::Schema class that constrains the judge's reply to a
|
|
63
|
+
# structured { passed:, reason: } payload, or +nil+ when ::RubyLLM::Schema
|
|
64
|
+
# is not defined (triggering the text-parsing fallback path). The schema is
# built on first use and memoized in @judge_schema.
|
|
65
|
+
def judge_schema
|
|
66
|
+
return nil unless defined?(::RubyLLM::Schema)
|
|
67
|
+
|
|
68
|
+
@judge_schema ||= ::RubyLLM::Schema.create do
|
|
69
|
+
boolean :passed, description: "True if the text meets the criteria, false otherwise"
|
|
70
|
+
string :reason, description: "Detailed explanation of why the criteria was or was not met"
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
|
|
44
74
|
def judge_adapter
|
|
45
75
|
@judge || RSpec::LLM.judge or raise(
|
|
46
76
|
RSpec::LLM::Error,
|
|
@@ -63,7 +93,7 @@ module RSpec
|
|
|
63
93
|
end
|
|
64
94
|
|
|
65
95
|
# Returns @reason, or the stripped raw judge reply when @reason is empty.
# @verdict_text is only assigned on the text-parsing path, so it may be nil
# after a structured reply — hence the +|| ""+ guard.
def format_reason
|
|
66
|
-
@reason.empty? ? @verdict_text.
|
|
96
|
+
@reason.empty? ? (@verdict_text || "").strip : @reason
|
|
67
97
|
end
|
|
68
98
|
end
|
|
69
99
|
end
|
data/lib/rspec/llm/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: rspec-llm
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.0
|
|
4
|
+
version: 0.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Sal Scotto
|
|
@@ -108,7 +108,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
108
108
|
requirements:
|
|
109
109
|
- - ">="
|
|
110
110
|
- !ruby/object:Gem::Version
|
|
111
|
-
version: 3.
|
|
111
|
+
version: 3.3.0
|
|
112
112
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
113
113
|
requirements:
|
|
114
114
|
- - ">="
|