rspec-llm 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.idea/.gitignore +5 -0
- data/.idea/inspectionProfiles/Project_Default.xml +5 -0
- data/.idea/jsLibraryMappings.xml +6 -0
- data/.idea/misc.xml +17 -0
- data/.idea/modules.xml +8 -0
- data/.idea/rspec-llm.iml +66 -0
- data/.idea/vcs.xml +6 -0
- data/.rspec +3 -0
- data/.rubocop.yml +33 -0
- data/CHANGELOG.md +11 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE.txt +21 -0
- data/README.md +194 -0
- data/Rakefile +12 -0
- data/build_release.sh +13 -0
- data/lib/rspec/llm/adapters/base.rb +64 -0
- data/lib/rspec/llm/adapters/fake.rb +114 -0
- data/lib/rspec/llm/adapters/langchain.rb +40 -0
- data/lib/rspec/llm/adapters/ruby_llm.rb +50 -0
- data/lib/rspec/llm/configuration.rb +42 -0
- data/lib/rspec/llm/dsl.rb +39 -0
- data/lib/rspec/llm/helpers.rb +31 -0
- data/lib/rspec/llm/matchers/be_semantically_similar_to.rb +75 -0
- data/lib/rspec/llm/matchers/match_json_schema.rb +54 -0
- data/lib/rspec/llm/matchers/match_llm_intent.rb +30 -0
- data/lib/rspec/llm/matchers/pass_llm_judge.rb +71 -0
- data/lib/rspec/llm/matchers.rb +30 -0
- data/lib/rspec/llm/rspec.rb +12 -0
- data/lib/rspec/llm/version.rb +7 -0
- data/lib/rspec/llm.rb +43 -0
- data/sig/rspec/llm.rbs +6 -0
- metadata +121 -0
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RSpec
  module LLM
    module Adapters
      # In-memory programmable adapter. Use in unit-style specs to avoid
      # hitting a real LLM while still exercising matcher and DSL behavior.
      #
      #   fake = RSpec::LLM::Adapters::Fake.new
      #   fake.respond_to("Summarize: foo").with("foo summary")
      #   fake.respond_to_pattern(/^Summarize/).with { |prompt| "summary of #{prompt}" }
      #   fake.default("I don't know")
      #   fake.embed_with { |text| [text.length.to_f, 0.0, 0.0] }
      #
      # Stubs are consulted in registration order; the first one whose matcher
      # accepts the prompt wins.
      class Fake < Base
        # Raised by #chat when no stub matches the prompt and no default is set.
        class UnstubbedPromptError < StandardError; end

        # Builder returned by #respond_to / #respond_to_pattern so callers can
        # chain `.with(text)` or `.with { |prompt| ... }`.
        class Stub
          # @param parent [Fake] adapter the finished stub is registered on
          # @param matcher [Object] exact prompt or Regexp to match against
          def initialize(parent, matcher)
            @parent = parent
            @matcher = matcher
          end

          # Registers the canned response and returns the parent adapter so
          # further stubs can be chained.
          #
          # @param text [Object, nil] fixed response; takes precedence over the block
          # @yieldparam prompt [String] the prompt being answered
          # @raise [ArgumentError] when neither text nor a block is given
          # @return [Fake]
          def with(text = nil, &block)
            raise ArgumentError, "pass text or a block" if text.nil? && block.nil?

            # `register` is private on Fake; this builder is its only caller.
            @parent.send(:register, @matcher, text || block)
            @parent
          end
        end

        # Prompts passed to #chat, in call order.
        attr_reader :call_log

        def initialize(client = nil)
          super
          @stubs = []
          @default = nil
          @embedder = nil
          @call_log = []
        end

        # Starts a stub that matches when the prompt equals +prompt+.
        def respond_to(prompt)
          Stub.new(self, prompt)
        end

        # Starts a stub that matches when +regex+ matches the prompt.
        def respond_to_pattern(regex)
          Stub.new(self, regex)
        end

        # Sets the response used when no stub matches.
        #
        # @raise [ArgumentError] when neither text nor a block is given
        # @return [self]
        def default(text = nil, &block)
          raise ArgumentError, "pass text or a block" if text.nil? && block.nil?

          @default = text || block
          self
        end

        # Sets the block used by #embed.
        #
        # @return [self]
        def embed_with(&block)
          @embedder = block
          self
        end

        # Answers with the stubbed response for the last user message (or the
        # last message of any role when there is no user message). The prompt
        # is appended to #call_log before lookup, so unstubbed prompts are
        # logged too.
        #
        # @raise [UnstubbedPromptError] when nothing matches and no default is set
        def chat(messages)
          # normalize_messages is not defined in this class; presumably it is
          # inherited from Base — confirm its return shape there.
          prompt = last_user_message(normalize_messages(messages))
          @call_log << prompt
          response = lookup(prompt)
          response.is_a?(Proc) ? response.call(prompt) : response
        end

        # @raise [NotImplementedError] when no embedder block was configured
        def embed(text)
          raise NotImplementedError, "configure with #embed_with { |text| vector }" unless @embedder

          @embedder.call(text)
        end

        # Drops all stubs, the default, the embedder, and the call log.
        #
        # @return [self]
        def reset!
          @stubs.clear
          @default = nil
          @embedder = nil
          @call_log.clear
          self
        end

        private

        def register(matcher, value)
          @stubs << [matcher, value]
        end

        # Content of the most recent "user" message, falling back to the last
        # message. An empty list yields "" rather than crashing on nil, so the
        # caller reaches the normal default / UnstubbedPromptError path.
        def last_user_message(messages)
          message = messages.reverse_each.find { |m| m[:role].to_s == "user" } || messages.last
          message ? message[:content].to_s : ""
        end

        def lookup(prompt)
          stub = @stubs.find { |matcher, _| matches?(matcher, prompt) }
          return stub.last if stub
          return @default if @default

          raise UnstubbedPromptError,
                "No fake response stubbed for prompt: #{prompt.inspect}. " \
                "Use fake.respond_to(...).with(...) or fake.default(...)."
        end

        def matches?(matcher, prompt)
          case matcher
          when Regexp then matcher.match?(prompt)
          else matcher == prompt
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RSpec
  module LLM
    module Adapters
      # Adapter for the langchainrb gem (https://github.com/patterns-ai-core/langchainrb).
      # Wraps a Langchain::LLM::* instance.
      class Langchain < Base
        # Sends the normalized messages to the wrapped client and returns the
        # reply text.
        def chat(messages)
          extract_content(client.chat(messages: normalize_messages(messages)))
        end

        # Asks the wrapped client to embed +text+ and returns the vector.
        #
        # @raise [NotImplementedError] when no vector can be pulled from the response
        def embed(text)
          extract_vector(client.embed(text: text))
        end

        private

        # Picks the reply text out of a response: a String is returned as-is,
        # then a truthy #chat_completion, then a truthy #completion, and
        # finally #to_s.
        def extract_content(response)
          if response.is_a?(String)
            response
          elsif response.respond_to?(:chat_completion) && response.chat_completion
            response.chat_completion
          elsif response.respond_to?(:completion) && response.completion
            response.completion
          else
            response.to_s
          end
        end

        # Picks the vector out of a response: an Array is returned as-is, then
        # a truthy #embedding, then the first entry of a truthy #embeddings.
        def extract_vector(response)
          if response.is_a?(Array)
            response
          elsif response.respond_to?(:embedding) && response.embedding
            response.embedding
          elsif response.respond_to?(:embeddings) && response.embeddings
            response.embeddings.first
          else
            raise NotImplementedError, "Cannot extract vector from #{response.class}"
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RSpec
  module LLM
    module Adapters
      # Adapter for the ruby_llm gem (https://github.com/crmne/ruby_llm).
      # Wraps a RubyLLM::Chat instance and exposes the common adapter surface.
      class RubyLLM < Base
        # Sends the last normalized message to the client via #ask. System
        # messages that precede it are forwarded through #with_instructions
        # when the client supports it; other earlier messages are not sent.
        #
        # @raise [ArgumentError] when there is no message to send
        # @return [Object] the response content (see #extract_content)
        def chat(messages)
          normalized = normalize_messages(messages)
          last = normalized.last
          raise ArgumentError, "no messages to send" if last.nil?

          # Compare roles as strings so :system and "system" are both honored,
          # matching how the Fake adapter compares roles.
          system_msgs = normalized[0..-2].select { |m| m[:role].to_s == "system" }
          if system_msgs.any? && client.respond_to?(:with_instructions)
            system_msgs.each { |m| client.with_instructions(m[:content]) }
          end

          extract_content(client.ask(last[:content]))
        end

        # Embeds +text+ with the override set by #with_embedder if present,
        # otherwise with ::RubyLLM.embed.
        #
        # @raise [NotImplementedError] when ::RubyLLM.embed is unavailable
        def embed(text)
          return @embedder.call(text) if @embedder

          raise NotImplementedError, "RubyLLM.embed is not available" unless defined?(::RubyLLM) && ::RubyLLM.respond_to?(:embed)

          result = ::RubyLLM.embed(text)
          vectors = result.respond_to?(:vectors) ? result.vectors : result
          # A nested array is unwrapped to its first row; a flat one is returned as-is.
          vectors.is_a?(Array) && vectors.first.is_a?(Array) ? vectors.first : vectors
        end

        # Override the embedder (useful in tests). Accepts a callable.
        #
        # @return [self]
        def with_embedder(callable)
          @embedder = callable
          self
        end

        private

        # Returns a String unchanged, otherwise #content when available,
        # otherwise #to_s.
        def extract_content(response)
          return response if response.is_a?(String)
          return response.content if response.respond_to?(:content)

          response.to_s
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RSpec
  module LLM
    # Gem-wide settings: the default client, the judge used by LLM-as-judge
    # matchers, the embedder callable used by the similarity matcher, and the
    # tunable similarity threshold and judge prompt template.
    class Configuration
      DEFAULT_SIMILARITY_THRESHOLD = 0.8

      # Template filled in with `response:` and `criterion:` named references.
      DEFAULT_JUDGE_PROMPT = <<~PROMPT
        You are a strict evaluator. Read the response and decide whether it satisfies the criterion.
        Reply with YES or NO on the first line, then a single short sentence explaining your decision.

        Response:
        %<response>s

        Criterion:
        %<criterion>s
      PROMPT

      attr_accessor :client, :judge, :embedder, :similarity_threshold, :judge_prompt_template

      # Starts with the default threshold and prompt; client, judge, and
      # embedder stay unset.
      def initialize
        @similarity_threshold = DEFAULT_SIMILARITY_THRESHOLD
        @judge_prompt_template = DEFAULT_JUDGE_PROMPT
      end

      # The client passed through Adapters::Base.wrap, or nil when unset.
      def client_adapter
        Adapters::Base.wrap(client) unless client.nil?
      end

      # The judge passed through Adapters::Base.wrap, or nil when unset.
      def judge_adapter
        Adapters::Base.wrap(judge) unless judge.nil?
      end
    end
  end
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RSpec
  module LLM
    # Lightweight DSL for grouping LLM evaluations. Sugar over `describe` + `it`.
    #
    #   RSpec.describe_llm "Summarizer" do
    #     evaluate "single sentence",
    #              prompt: "Summarize: ...",
    #              expect: [pass_llm_judge("is one sentence"), match_llm_intent("a summary")]
    #   end
    module DSL
      # Group-level methods added to every group created by `describe_llm`.
      module GroupMethods
        # Defines one example that sends +prompt+ to the adapter and runs the
        # reply through each matcher in +expect+ as a positive expectation.
        #
        # @param name [String] example description passed to `it`
        # @param prompt [Object] value handed to the adapter's #chat
        # @param expect [Object, Array] one matcher or a list of matchers
        # @param client [Object, nil] per-example client; wrapped with
        #   Adapters::Base.wrap. Falls back to RSpec::LLM.client when nil.
        # @raise [RSpec::LLM::Error] at example run time when no client is available
        def evaluate(name, prompt:, expect:, client: nil)
          it(name) do
            adapter = if client
                        RSpec::LLM::Adapters::Base.wrap(client)
                      else
                        RSpec::LLM.client
                      end
            raise RSpec::LLM::Error, "No LLM client configured" if adapter.nil?

            reply = adapter.chat(prompt)
            handler = ::RSpec::Expectations::PositiveExpectationHandler
            Array(expect).each { |matcher| handler.handle_matcher(reply, matcher) }
          end
        end
      end

      # Calls `describe` with the given arguments, extends the resulting group
      # with GroupMethods, then evaluates the caller's block inside it so
      # `evaluate` is available there.
      def describe_llm(*args, &user_block)
        group_methods = ::RSpec::LLM::DSL::GroupMethods
        describe(*args) do
          extend(group_methods)
          class_exec(&user_block) if user_block
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RSpec
  module LLM
    # Methods mixed into example groups via RSpec.configure (see rspec.rb).
    module Helpers
      # The configured client as returned by RSpec::LLM.client.
      def llm
        RSpec::LLM.client
      end

      # Builds a Fake adapter, yields it for stubbing when a block is given,
      # then installs it as the global client and returns it. The fake is
      # installed only after the block has run.
      #
      # NOTE(review): cleanup between examples is expected to come from the
      # RSpec wiring in rspec.rb — confirm a hook there actually restores the
      # configuration; otherwise the fake stays installed until the
      # configuration is changed or RSpec::LLM.reset! is called.
      def stub_llm
        RSpec::LLM::Adapters::Fake.new.tap do |fake|
          yield fake if block_given?
          RSpec::LLM.configuration.client = fake
        end
      end

      # Same as #stub_llm, but installs the fake as the judge.
      def stub_llm_judge
        RSpec::LLM::Adapters::Fake.new.tap do |fake|
          yield fake if block_given?
          RSpec::LLM.configuration.judge = fake
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RSpec
  module LLM
    module Matchers
      # Embeds the actual and expected texts with the configured embedder and
      # passes when their cosine similarity is at least the threshold. The
      # threshold comes from the configuration unless overridden with
      # `.within(0.9)`.
      class BeSemanticallySimilarTo
        def initialize(expected)
          @expected = expected
          @threshold = nil
        end

        # Overrides the similarity threshold for this matcher.
        #
        # @return [self]
        def within(threshold)
          @threshold = threshold
          self
        end

        # @raise [RSpec::LLM::Error] when no embedder is configured
        # @raise [ArgumentError] when a vector is empty or the sizes differ
        def matches?(actual)
          embed = embedder
          @actual = actual.to_s
          @actual_vec = embed.call(@actual)
          @expected_vec = embed.call(@expected.to_s)
          @similarity = cosine(@actual_vec, @expected_vec)
          @similarity >= threshold_value
        end

        def description
          "be semantically similar to #{@expected.inspect} (>= #{threshold_value})"
        end

        def failure_message
          "expected response to be semantically similar to expected text " \
            "(similarity #{format_similarity} < threshold #{threshold_value}).\n" \
            "#{expected_and_actual}"
        end

        def failure_message_when_negated
          "expected response NOT to be semantically similar to expected text " \
            "(similarity #{format_similarity} >= threshold #{threshold_value}).\n" \
            "#{expected_and_actual}"
        end

        private

        # The configured embedder callable; raises when none is set.
        def embedder
          RSpec::LLM.configuration.embedder || raise(
            RSpec::LLM::Error,
            "No embedder configured. Set RSpec::LLM.configure { |c| c.embedder = ->(text) { ... } }."
          )
        end

        def threshold_value
          @threshold || RSpec::LLM.configuration.similarity_threshold
        end

        # Cosine similarity of two equally sized, non-empty vectors. Returns
        # 0.0 when either vector has zero magnitude.
        def cosine(vec_a, vec_b)
          raise ArgumentError, "embedder returned empty vector" if vec_a.empty? || vec_b.empty?
          unless vec_a.size == vec_b.size
            raise ArgumentError, "vector dimensions differ (#{vec_a.size} vs #{vec_b.size})"
          end

          dot = vec_a.zip(vec_b).sum { |a, b| a * b }
          norm_a, norm_b = [vec_a, vec_b].map { |vec| Math.sqrt(vec.sum { |v| v * v }) }
          return 0.0 if norm_a.zero? || norm_b.zero?

          dot / (norm_a * norm_b)
        end

        def format_similarity
          format("%.4f", @similarity)
        end

        # Shared tail of both failure messages.
        def expected_and_actual
          "Expected: #{@expected.inspect}\nActual: #{@actual.inspect}"
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
require "json-schema"
|
|
5
|
+
|
|
6
|
+
module RSpec
  module LLM
    module Matchers
      # Asserts the actual value parses as JSON and conforms to the provided
      # JSON Schema (a Hash, JSON string, or schema file path).
      class MatchJsonSchema
        # @param schema [Object] passed unchanged to JSON::Validator
        def initialize(schema)
          @schema = schema
        end

        # A Hash or Array is validated as-is; anything else is converted with
        # #to_s and parsed as JSON first.
        #
        # @return [Boolean] false when parsing fails or validation reports errors
        def matches?(actual)
          @actual = actual
          # Clear state left by an earlier call so a reused matcher does not
          # keep failing after a single parse error.
          @parse_error = nil
          @errors = []
          @parsed = parse(actual)
          return false if @parse_error

          # The data is already parsed; parse_data: false stops the validator
          # from re-interpreting a String value as JSON text or a location.
          @errors = JSON::Validator.fully_validate(@schema, @parsed, parse_data: false)
          @errors.empty?
        end

        def description
          "match JSON schema"
        end

        def failure_message
          if @parse_error
            "expected response to parse as JSON, but got: #{@parse_error}\n\nResponse:\n#{@actual}"
          else
            "expected response to match JSON schema, but got errors:\n#{@errors.join("\n")}\n\nResponse:\n#{@actual}"
          end
        end

        def failure_message_when_negated
          "expected response NOT to match JSON schema, but it did.\n\nResponse:\n#{@actual}"
        end

        private

        # Returns the parsed value, or nil after recording the parser's message
        # in @parse_error.
        def parse(actual)
          return actual if actual.is_a?(Hash) || actual.is_a?(Array)

          JSON.parse(actual.to_s)
        rescue JSON::ParserError => e
          @parse_error = e.message
          nil
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RSpec
  module LLM
    module Matchers
      # Variant of PassLlmJudge whose criterion is phrased as an intent the
      # response should satisfy. Only the wording differs; judging is
      # inherited.
      class MatchLlmIntent < PassLlmJudge
        # @param intent [Object] interpolated into the criterion given to the judge
        def initialize(intent)
          @intent = intent
          super("The response matches the following intent: #{intent}")
        end

        def description
          "match LLM intent: #{@intent.inspect}"
        end

        def failure_message
          "expected response to match intent #{@intent.inspect}, #{judge_feedback}"
        end

        def failure_message_when_negated
          "expected response NOT to match intent #{@intent.inspect}, #{judge_feedback}"
        end

        private

        # Shared tail of both failure messages: the judge's reason followed by
        # the response text.
        def judge_feedback
          "but judge said: #{format_reason}\n\nResponse:\n#{@actual}"
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module RSpec
  module LLM
    module Matchers
      # LLM-as-judge matcher. Sends the response and criterion to a judge and
      # passes when the first whitespace-delimited token of the reply starts
      # with YES (case-insensitive).
      class PassLlmJudge
        def initialize(criterion)
          @criterion = criterion
          @judge = nil
        end

        # Override the judge for this matcher invocation (optional). The
        # object is used as given and must respond to #chat.
        #
        # @return [self]
        def using(judge)
          @judge = judge
          self
        end

        # @raise [RSpec::LLM::Error] when no judge is available
        def matches?(actual)
          @actual = actual.to_s
          question = prompt_for(@actual, @criterion)
          @verdict_text = judge_adapter.chat(question)
          @verdict, @reason = parse_verdict(@verdict_text)
          @verdict == true
        end

        def description
          "pass LLM judge with criterion: #{@criterion.inspect}"
        end

        def failure_message
          "expected response to pass judge criterion #{@criterion.inspect}, #{judge_feedback}"
        end

        def failure_message_when_negated
          "expected response NOT to pass judge criterion #{@criterion.inspect}, #{judge_feedback}"
        end

        private

        # The per-matcher judge if set, otherwise RSpec::LLM.judge.
        def judge_adapter
          @judge || RSpec::LLM.judge || raise(
            RSpec::LLM::Error,
            "No judge configured. Call RSpec::LLM.configure { |c| c.judge = ... } " \
            "or .using(client) on the matcher."
          )
        end

        # Fills the configured judge prompt template.
        def prompt_for(response, criterion)
          template = RSpec::LLM.configuration.judge_prompt_template
          format(template, response: response, criterion: criterion)
        end

        # Splits the judge's reply into [verdict, reason]. The reason is
        # everything after the first line, or the whole reply when there is
        # nothing after it.
        def parse_verdict(text)
          reply = text.to_s.strip
          leading = reply.split(/\s+/, 2).first.to_s.upcase
          explanation = reply.lines.drop(1).join.strip
          explanation = reply if explanation.empty?
          [leading.start_with?("YES"), explanation]
        end

        def format_reason
          @reason.empty? ? @verdict_text.to_s.strip : @reason
        end

        # Shared tail of both failure messages.
        def judge_feedback
          "but judge said: #{format_reason}\n\nResponse:\n#{@actual}"
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "matchers/pass_llm_judge"
|
|
4
|
+
require_relative "matchers/match_llm_intent"
|
|
5
|
+
require_relative "matchers/match_json_schema"
|
|
6
|
+
require_relative "matchers/be_semantically_similar_to"
|
|
7
|
+
|
|
8
|
+
module RSpec
  module LLM
    # Factory methods for the matcher classes in this namespace. Meant to be
    # included in example groups (lib/rspec/llm/rspec.rb does this through
    # RSpec.configure).
    module Matchers
      # @return [PassLlmJudge] matcher judging the response against +criterion+
      def pass_llm_judge(criterion)
        PassLlmJudge.new(criterion)
      end

      # @return [MatchLlmIntent] matcher judging the response against +intent+
      def match_llm_intent(intent)
        MatchLlmIntent.new(intent)
      end

      # @return [MatchJsonSchema] matcher validating the response against +schema+
      def match_json_schema(schema)
        MatchJsonSchema.new(schema)
      end

      # @return [BeSemanticallySimilarTo] matcher comparing the response with +expected+
      def be_semantically_similar_to(expected)
        BeSemanticallySimilarTo.new(expected)
      end
    end
  end
end
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Wire matchers, helpers, and DSL into RSpec automatically when the gem is
|
|
4
|
+
# required. Users only need `require "rspec/llm"` in their spec_helper.
|
|
5
|
+
|
|
6
|
+
RSpec.configure do |config|
  config.include RSpec::LLM::Matchers
  config.include RSpec::LLM::Helpers

  # Snapshot the LLM settings before each example and put them back
  # afterwards, so clients or judges installed inside an example (for
  # instance through the stub helpers) do not leak into later examples.
  # Settings made outside examples are kept, because the snapshot is taken
  # when each example starts.
  config.around do |example|
    settings = RSpec::LLM.configuration
    saved = %i[client judge embedder similarity_threshold judge_prompt_template]
            .map { |name| [name, settings.public_send(name)] }
    begin
      example.run
    ensure
      # Restored through the public configure API, so this also works if the
      # example replaced the configuration object with RSpec::LLM.reset!.
      RSpec::LLM.configure do |current|
        saved.each { |name, value| current.public_send("#{name}=", value) }
      end
    end
  end
end

# Expose `describe_llm` at the top level by extending RSpec itself.
RSpec.extend(RSpec::LLM::DSL)
|
data/lib/rspec/llm.rb
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "rspec/core"
|
|
4
|
+
require "rspec/expectations"
|
|
5
|
+
|
|
6
|
+
require_relative "llm/version"
|
|
7
|
+
require_relative "llm/configuration"
|
|
8
|
+
require_relative "llm/adapters/base"
|
|
9
|
+
require_relative "llm/adapters/fake"
|
|
10
|
+
require_relative "llm/adapters/ruby_llm"
|
|
11
|
+
require_relative "llm/adapters/langchain"
|
|
12
|
+
require_relative "llm/matchers"
|
|
13
|
+
require_relative "llm/helpers"
|
|
14
|
+
require_relative "llm/dsl"
|
|
15
|
+
require_relative "llm/rspec"
|
|
16
|
+
|
|
17
|
+
module RSpec
  # Entry point for the gem: holds the shared Configuration and exposes the
  # configured client and judge.
  module LLM
    # Base error raised by this gem.
    class Error < StandardError; end

    # The shared Configuration, created on first use.
    def self.configuration
      @configuration ||= Configuration.new
    end

    # Yields the shared configuration for modification.
    #
    #   RSpec::LLM.configure { |c| c.client = my_client }
    def self.configure
      yield configuration
    end

    # Replaces the shared configuration with a fresh one and returns it.
    def self.reset!
      @configuration = Configuration.new
    end

    # The configured client adapter, or nil when none is configured.
    def self.client
      configuration.client_adapter
    end

    # The configured judge adapter, falling back to the client.
    def self.judge
      configuration.judge_adapter || client
    end
  end
end
|