rspec-llm 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,114 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RSpec
4
+ module LLM
5
+ module Adapters
6
+ # In-memory programmable adapter. Use in unit-style specs to avoid
7
+ # hitting a real LLM while still exercising matcher and DSL behavior.
8
+ #
9
+ # fake = RSpec::LLM::Adapters::Fake.new
10
+ # fake.respond_to("Summarize: foo").with("foo summary")
11
+ # fake.respond_to_pattern(/^Summarize/).with { |prompt| "summary of #{prompt}" }
12
+ # fake.default("I don't know")
13
+ # fake.embed_with { |text| [text.length.to_f, 0.0, 0.0] }
14
+ class Fake < Base
15
+ class UnstubbedPromptError < StandardError; end
16
+
17
+ # Builder returned by #respond_to / #respond_to_pattern so callers can
18
+ # chain `.with(text)` or `.with { |prompt| ... }`.
19
+ class Stub
20
+ def initialize(parent, matcher)
21
+ @parent = parent
22
+ @matcher = matcher
23
+ end
24
+
25
+ def with(text = nil, &block)
26
+ raise ArgumentError, "pass text or a block" if text.nil? && block.nil?
27
+
28
+ @parent.send(:register, @matcher, text || block)
29
+ @parent
30
+ end
31
+ end
32
+
33
+ def initialize(client = nil)
34
+ super
35
+ @stubs = []
36
+ @default = nil
37
+ @embedder = nil
38
+ @call_log = []
39
+ end
40
+
41
+ attr_reader :call_log
42
+
43
+ def respond_to(prompt)
44
+ Stub.new(self, prompt)
45
+ end
46
+
47
+ def respond_to_pattern(regex)
48
+ Stub.new(self, regex)
49
+ end
50
+
51
+ def default(text = nil, &block)
52
+ raise ArgumentError, "pass text or a block" if text.nil? && block.nil?
53
+
54
+ @default = text || block
55
+ self
56
+ end
57
+
58
+ def embed_with(&block)
59
+ @embedder = block
60
+ self
61
+ end
62
+
63
+ def chat(messages)
64
+ prompt = last_user_message(normalize_messages(messages))
65
+ @call_log << prompt
66
+ response = lookup(prompt)
67
+ response.is_a?(Proc) ? response.call(prompt) : response
68
+ end
69
+
70
+ def embed(text)
71
+ raise NotImplementedError, "configure with #embed_with { |text| vector }" unless @embedder
72
+
73
+ @embedder.call(text)
74
+ end
75
+
76
+ def reset!
77
+ @stubs.clear
78
+ @default = nil
79
+ @embedder = nil
80
+ @call_log.clear
81
+ self
82
+ end
83
+
84
+ private
85
+
86
+ def register(matcher, value)
87
+ @stubs << [matcher, value]
88
+ end
89
+
90
+ def last_user_message(messages)
91
+ last_user = messages.reverse.find { |m| m[:role].to_s == "user" }
92
+ (last_user || messages.last)[:content].to_s
93
+ end
94
+
95
+ def lookup(prompt)
96
+ stub = @stubs.find { |matcher, _| matches?(matcher, prompt) }
97
+ return stub.last if stub
98
+ return @default if @default
99
+
100
+ raise UnstubbedPromptError,
101
+ "No fake response stubbed for prompt: #{prompt.inspect}. " \
102
+ "Use fake.respond_to(...).with(...) or fake.default(...)."
103
+ end
104
+
105
+ def matches?(matcher, prompt)
106
+ case matcher
107
+ when Regexp then matcher.match?(prompt)
108
+ else matcher == prompt
109
+ end
110
+ end
111
+ end
112
+ end
113
+ end
114
+ end
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RSpec
4
+ module LLM
5
+ module Adapters
6
+ # Adapter for the langchainrb gem (https://github.com/patterns-ai-core/langchainrb).
7
+ # Wraps a Langchain::LLM::* instance.
8
+ class Langchain < Base
9
+ def chat(messages)
10
+ normalized = normalize_messages(messages)
11
+ response = client.chat(messages: normalized)
12
+ extract_content(response)
13
+ end
14
+
15
+ def embed(text)
16
+ response = client.embed(text: text)
17
+ extract_vector(response)
18
+ end
19
+
20
+ private
21
+
22
+ def extract_content(response)
23
+ return response if response.is_a?(String)
24
+ return response.chat_completion if response.respond_to?(:chat_completion) && response.chat_completion
25
+ return response.completion if response.respond_to?(:completion) && response.completion
26
+
27
+ response.to_s
28
+ end
29
+
30
+ def extract_vector(response)
31
+ return response if response.is_a?(Array)
32
+ return response.embedding if response.respond_to?(:embedding) && response.embedding
33
+ return response.embeddings.first if response.respond_to?(:embeddings) && response.embeddings
34
+
35
+ raise NotImplementedError, "Cannot extract vector from #{response.class}"
36
+ end
37
+ end
38
+ end
39
+ end
40
+ end
@@ -0,0 +1,50 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RSpec
4
+ module LLM
5
+ module Adapters
6
+ # Adapter for the ruby_llm gem (https://github.com/crmne/ruby_llm).
7
+ # Wraps a RubyLLM::Chat instance and exposes the common adapter surface.
8
+ class RubyLLM < Base
9
+ def chat(messages)
10
+ normalized = normalize_messages(messages)
11
+ last = normalized.last
12
+ system_msgs = normalized[0..-2].select { |m| m[:role] == "system" }
13
+ if system_msgs.any? && client.respond_to?(:with_instructions)
14
+ system_msgs.each do |m|
15
+ client.with_instructions(m[:content])
16
+ end
17
+ end
18
+
19
+ response = client.ask(last[:content])
20
+ extract_content(response)
21
+ end
22
+
23
+ def embed(text)
24
+ return @embedder.call(text) if @embedder
25
+
26
+ raise NotImplementedError, "RubyLLM.embed is not available" unless defined?(::RubyLLM) && ::RubyLLM.respond_to?(:embed)
27
+
28
+ result = ::RubyLLM.embed(text)
29
+ vectors = result.respond_to?(:vectors) ? result.vectors : result
30
+ vectors.is_a?(Array) && vectors.first.is_a?(Array) ? vectors.first : vectors
31
+ end
32
+
33
+ # Override the embedder (useful in tests). Accepts a callable.
34
+ def with_embedder(callable)
35
+ @embedder = callable
36
+ self
37
+ end
38
+
39
+ private
40
+
41
+ def extract_content(response)
42
+ return response if response.is_a?(String)
43
+ return response.content if response.respond_to?(:content)
44
+
45
+ response.to_s
46
+ end
47
+ end
48
+ end
49
+ end
50
+ end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RSpec
4
+ module LLM
5
+ # Holds gem-wide configuration: the default client used by specs, the judge
6
+ # model used by LLM-as-judge matchers, an embedder callable used by the
7
+ # similarity matcher, and tunable thresholds/prompts.
8
+ class Configuration
9
+ DEFAULT_SIMILARITY_THRESHOLD = 0.8
10
+
11
+ DEFAULT_JUDGE_PROMPT = <<~PROMPT
12
+ You are a strict evaluator. Read the response and decide whether it satisfies the criterion.
13
+ Reply with YES or NO on the first line, then a single short sentence explaining your decision.
14
+
15
+ Response:
16
+ %<response>s
17
+
18
+ Criterion:
19
+ %<criterion>s
20
+ PROMPT
21
+
22
+ attr_accessor :client, :judge, :embedder, :similarity_threshold, :judge_prompt_template
23
+
24
+ def initialize
25
+ @similarity_threshold = DEFAULT_SIMILARITY_THRESHOLD
26
+ @judge_prompt_template = DEFAULT_JUDGE_PROMPT
27
+ end
28
+
29
+ def client_adapter
30
+ return nil if client.nil?
31
+
32
+ Adapters::Base.wrap(client)
33
+ end
34
+
35
+ def judge_adapter
36
+ return nil if judge.nil?
37
+
38
+ Adapters::Base.wrap(judge)
39
+ end
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RSpec
4
+ module LLM
5
+ # Lightweight DSL for grouping LLM evaluations. Sugar over `describe` + `it`.
6
+ #
7
+ # RSpec.describe_llm "Summarizer" do
8
+ # evaluate "single sentence",
9
+ # prompt: "Summarize: ...",
10
+ # expect: [pass_llm_judge("is one sentence"), match_llm_intent("a summary")]
11
+ # end
12
+ module DSL
13
+ # Class-level helper exposed inside `describe_llm` blocks.
14
+ module GroupMethods
15
+ def evaluate(name, prompt:, expect:, client: nil)
16
+ it(name) do
17
+ adapter = client ? RSpec::LLM::Adapters::Base.wrap(client) : RSpec::LLM.client
18
+ raise RSpec::LLM::Error, "No LLM client configured" if adapter.nil?
19
+
20
+ response = adapter.chat(prompt)
21
+ Array(expect).each do |matcher|
22
+ ::RSpec::Expectations::PositiveExpectationHandler.handle_matcher(response, matcher)
23
+ end
24
+ end
25
+ end
26
+ end
27
+
28
+ # Defines RSpec.describe_llm — a sugar for RSpec.describe that extends the
29
+ # resulting example group with GroupMethods so `evaluate` is available
30
+ # inside the user's block.
31
+ def describe_llm(*args, &user_block)
32
+ describe(*args) do
33
+ extend(::RSpec::LLM::DSL::GroupMethods)
34
+ class_exec(&user_block) if user_block
35
+ end
36
+ end
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RSpec
4
+ module LLM
5
+ # Methods mixed into example groups via RSpec.configure (see rspec.rb).
6
+ module Helpers
7
+ # The configured client wrapped as an Adapter. Returns nil if not configured.
8
+ def llm
9
+ RSpec::LLM.client
10
+ end
11
+
12
+ # Configure a Fake adapter and yield it for stubbing. Replaces the global
13
+ # client for the duration of the example. After the example RSpec::LLM is
14
+ # reset via the `after` hook installed in rspec.rb.
15
+ def stub_llm
16
+ fake = RSpec::LLM::Adapters::Fake.new
17
+ yield fake if block_given?
18
+ RSpec::LLM.configuration.client = fake
19
+ fake
20
+ end
21
+
22
+ # Same, but installs the fake as the judge.
23
+ def stub_llm_judge
24
+ fake = RSpec::LLM::Adapters::Fake.new
25
+ yield fake if block_given?
26
+ RSpec::LLM.configuration.judge = fake
27
+ fake
28
+ end
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,75 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RSpec
4
+ module LLM
5
+ module Matchers
6
+ # Embeds the actual and expected texts via the configured embedder and
7
+ # asserts cosine similarity >= threshold (default from configuration,
8
+ # overridable with .within(0.9)).
9
# Matcher comparing two texts by the cosine similarity of their embeddings.
# Passes when the similarity is >= the threshold (configuration default,
# or the value given to #within).
class BeSemanticallySimilarTo
  def initialize(expected)
    @expected = expected
    @threshold = nil
  end

  # Overrides the threshold for this matcher only. Returns self for chaining.
  def within(threshold)
    tap { @threshold = threshold }
  end

  # Embeds the stringified actual and expected values (actual first) and
  # compares their cosine similarity with the threshold.
  #
  # @raise [RSpec::LLM::Error] when no embedder is configured
  # @raise [ArgumentError] when a vector is empty or the sizes differ
  def matches?(actual)
    @actual = actual.to_s
    @actual_vec, @expected_vec = [@actual, @expected.to_s].map { |text| embedder.call(text) }
    @similarity = cosine(@actual_vec, @expected_vec)
    @similarity >= threshold_value
  end

  def description
    "be semantically similar to #{@expected.inspect} (>= #{threshold_value})"
  end

  def failure_message
    comparison_message("to be", "<")
  end

  def failure_message_when_negated
    comparison_message("NOT to be", ">=")
  end

  private

  # Shared body of both failure messages; only the verb phrase and the
  # comparison operator differ.
  def comparison_message(verb, operator)
    [
      "expected response #{verb} semantically similar to expected text " \
      "(similarity #{format_similarity} #{operator} threshold #{threshold_value}).",
      "Expected: #{@expected.inspect}",
      "Actual: #{@actual.inspect}"
    ].join("\n")
  end

  def embedder
    configured = RSpec::LLM.configuration.embedder
    return configured if configured

    raise RSpec::LLM::Error,
          "No embedder configured. Set RSpec::LLM.configure { |c| c.embedder = ->(text) { ... } }."
  end

  def threshold_value
    return @threshold if @threshold

    RSpec::LLM.configuration.similarity_threshold
  end

  # Cosine similarity of two equally sized, non-empty vectors. Returns 0.0
  # when either vector has zero magnitude.
  def cosine(vec_a, vec_b)
    raise ArgumentError, "embedder returned empty vector" if vec_a.empty? || vec_b.empty?

    unless vec_a.size == vec_b.size
      raise ArgumentError, "vector dimensions differ (#{vec_a.size} vs #{vec_b.size})"
    end

    dot = vec_a.zip(vec_b).sum { |left, right| left * right }
    mag_a, mag_b = [vec_a, vec_b].map { |vec| Math.sqrt(vec.sum { |component| component * component }) }
    return 0.0 if mag_a.zero? || mag_b.zero?

    dot / (mag_a * mag_b)
  end

  def format_similarity
    "%.4f" % @similarity
  end
end
73
+ end
74
+ end
75
+ end
@@ -0,0 +1,54 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+ require "json-schema"
5
+
6
+ module RSpec
7
+ module LLM
8
+ module Matchers
9
+ # Asserts the actual value parses as JSON and conforms to the provided
10
+ # JSON Schema (a Hash, JSON string, or schema file path).
11
+ class MatchJsonSchema
12
+ def initialize(schema)
13
+ @schema = schema
14
+ end
15
+
16
+ def matches?(actual)
17
+ @actual = actual
18
+ @parsed = parse(actual)
19
+ return false if @parse_error
20
+
21
+ @errors = JSON::Validator.fully_validate(@schema, @parsed)
22
+ @errors.empty?
23
+ end
24
+
25
+ def description
26
+ "match JSON schema"
27
+ end
28
+
29
+ def failure_message
30
+ if @parse_error
31
+ "expected response to parse as JSON, but got: #{@parse_error}\n\nResponse:\n#{@actual}"
32
+ else
33
+ "expected response to match JSON schema, but got errors:\n#{@errors.join("\n")}\n\nResponse:\n#{@actual}"
34
+ end
35
+ end
36
+
37
+ def failure_message_when_negated
38
+ "expected response NOT to match JSON schema, but it did.\n\nResponse:\n#{@actual}"
39
+ end
40
+
41
+ private
42
+
43
+ def parse(actual)
44
+ return actual if actual.is_a?(Hash) || actual.is_a?(Array)
45
+
46
+ JSON.parse(actual.to_s)
47
+ rescue JSON::ParserError => e
48
+ @parse_error = e.message
49
+ nil
50
+ end
51
+ end
52
+ end
53
+ end
54
+ end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RSpec
4
+ module LLM
5
+ module Matchers
6
+ # Variant of PassLlmJudge that frames the criterion as an intent the
7
+ # response should satisfy. Sugar — the heavy lifting is identical.
8
+ class MatchLlmIntent < PassLlmJudge
9
+ def initialize(intent)
10
+ super("The response matches the following intent: #{intent}")
11
+ @intent = intent
12
+ end
13
+
14
+ def description
15
+ "match LLM intent: #{@intent.inspect}"
16
+ end
17
+
18
+ def failure_message
19
+ "expected response to match intent #{@intent.inspect}, " \
20
+ "but judge said: #{format_reason}\n\nResponse:\n#{@actual}"
21
+ end
22
+
23
+ def failure_message_when_negated
24
+ "expected response NOT to match intent #{@intent.inspect}, " \
25
+ "but judge said: #{format_reason}\n\nResponse:\n#{@actual}"
26
+ end
27
+ end
28
+ end
29
+ end
30
+ end
@@ -0,0 +1,71 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RSpec
4
+ module LLM
5
+ module Matchers
6
+ # LLM-as-judge matcher. Asks the configured judge model whether the
7
+ # actual response satisfies the given criterion. Parses YES/NO from the
8
+ # first non-whitespace token of the judge's reply.
9
+ class PassLlmJudge
10
+ def initialize(criterion)
11
+ @criterion = criterion
12
+ @judge = nil
13
+ end
14
+
15
+ # Override the judge for this matcher invocation (optional).
16
+ def using(judge)
17
+ @judge = judge
18
+ self
19
+ end
20
+
21
+ def matches?(actual)
22
+ @actual = actual.to_s
23
+ @verdict_text = judge_adapter.chat(prompt_for(@actual, @criterion))
24
+ @verdict, @reason = parse_verdict(@verdict_text)
25
+ @verdict == true
26
+ end
27
+
28
+ def description
29
+ "pass LLM judge with criterion: #{@criterion.inspect}"
30
+ end
31
+
32
+ def failure_message
33
+ "expected response to pass judge criterion #{@criterion.inspect}, " \
34
+ "but judge said: #{format_reason}\n\nResponse:\n#{@actual}"
35
+ end
36
+
37
+ def failure_message_when_negated
38
+ "expected response NOT to pass judge criterion #{@criterion.inspect}, " \
39
+ "but judge said: #{format_reason}\n\nResponse:\n#{@actual}"
40
+ end
41
+
42
+ private
43
+
44
+ def judge_adapter
45
+ @judge || RSpec::LLM.judge or raise(
46
+ RSpec::LLM::Error,
47
+ "No judge configured. Call RSpec::LLM.configure { |c| c.judge = ... } " \
48
+ "or .using(client) on the matcher."
49
+ )
50
+ end
51
+
52
+ def prompt_for(response, criterion)
53
+ format(RSpec::LLM.configuration.judge_prompt_template, response: response, criterion: criterion)
54
+ end
55
+
56
+ def parse_verdict(text)
57
+ stripped = text.to_s.strip
58
+ first_token = stripped.split(/\s+/, 2).first.to_s.upcase
59
+ verdict = first_token.start_with?("YES")
60
+ reason = stripped.lines.drop(1).join.strip
61
+ reason = stripped if reason.empty?
62
+ [verdict, reason]
63
+ end
64
+
65
+ def format_reason
66
+ @reason.empty? ? @verdict_text.to_s.strip : @reason
67
+ end
68
+ end
69
+ end
70
+ end
71
+ end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "matchers/pass_llm_judge"
4
+ require_relative "matchers/match_llm_intent"
5
+ require_relative "matchers/match_json_schema"
6
+ require_relative "matchers/be_semantically_similar_to"
7
+
8
+ module RSpec
9
+ module LLM
10
+ # Module to be included in example groups (RSpec.configure does this
11
+ # automatically via lib/rspec/llm/rspec.rb). Exposes the matcher DSL.
12
module Matchers
  # @param criterion [Object] criterion the judge evaluates
  # @return [RSpec::LLM::Matchers::PassLlmJudge]
  def pass_llm_judge(criterion)
    ::RSpec::LLM::Matchers::PassLlmJudge.new(criterion)
  end

  # @param intent [Object] intent the response should satisfy
  # @return [RSpec::LLM::Matchers::MatchLlmIntent]
  def match_llm_intent(intent)
    ::RSpec::LLM::Matchers::MatchLlmIntent.new(intent)
  end

  # @param schema [Object] schema handed to the matcher
  # @return [RSpec::LLM::Matchers::MatchJsonSchema]
  def match_json_schema(schema)
    ::RSpec::LLM::Matchers::MatchJsonSchema.new(schema)
  end

  # @param expected [Object] reference text
  # @return [RSpec::LLM::Matchers::BeSemanticallySimilarTo]
  def be_semantically_similar_to(expected)
    ::RSpec::LLM::Matchers::BeSemanticallySimilarTo.new(expected)
  end
end
29
+ end
30
+ end
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Wire matchers, helpers, and DSL into RSpec automatically when the gem is
4
+ # required. Users only need `require "rspec/llm"` in their spec_helper.
5
+
6
RSpec.configure do |config|
  config.include RSpec::LLM::Matchers
  config.include RSpec::LLM::Helpers

  # The stub_llm / stub_llm_judge helpers assign fakes to the global
  # configuration. Snapshot the client and judge around every example and put
  # them back afterwards so a fake installed by one example cannot leak into
  # the next. Only these two settings are restored: a full RSpec::LLM.reset!
  # would also discard whatever the suite configured once up front.
  config.around(:each) do |example|
    settings = RSpec::LLM.configuration
    saved_client = settings.client
    saved_judge = settings.judge
    begin
      example.run
    ensure
      # Re-read the configuration: the example may have called
      # RSpec::LLM.reset!, which replaces the configuration object.
      current = RSpec::LLM.configuration
      current.client = saved_client
      current.judge = saved_judge
    end
  end
end

# Expose `describe_llm` at the top level by extending RSpec itself.
RSpec.extend(RSpec::LLM::DSL)
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RSpec
4
+ module LLM
5
+ VERSION = "0.1.0"
6
+ end
7
+ end
data/lib/rspec/llm.rb ADDED
@@ -0,0 +1,43 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "rspec/core"
4
+ require "rspec/expectations"
5
+
6
+ require_relative "llm/version"
7
+ require_relative "llm/configuration"
8
+ require_relative "llm/adapters/base"
9
+ require_relative "llm/adapters/fake"
10
+ require_relative "llm/adapters/ruby_llm"
11
+ require_relative "llm/adapters/langchain"
12
+ require_relative "llm/matchers"
13
+ require_relative "llm/helpers"
14
+ require_relative "llm/dsl"
15
+ require_relative "llm/rspec"
16
+
17
+ module RSpec
18
module LLM
  # Base error class raised by rspec-llm itself.
  class Error < StandardError; end

  # The shared Configuration, created on first access.
  def self.configuration
    @configuration || reset!
  end

  # Yields the shared configuration; returns the block's result.
  def self.configure
    yield(configuration)
  end

  # Replaces the shared configuration with a new one and returns it.
  def self.reset!
    @configuration = Configuration.new
  end

  # Shortcut for configuration.client_adapter.
  def self.client
    configuration.client_adapter
  end

  # configuration.judge_adapter, falling back to .client when that is nil.
  def self.judge
    dedicated = configuration.judge_adapter
    dedicated || client
  end
end
43
+ end
data/sig/rspec/llm.rbs ADDED
@@ -0,0 +1,6 @@
1
# Signatures for the rspec-llm gem. The module names must match the Ruby
# constants the gem defines (RSpec::LLM, not Rspec::Llm).
module RSpec
  module LLM
    VERSION: String
    # See the writing guide of rbs: https://github.com/ruby/rbs#guides
  end
end