deja 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +24 -0
- data/LICENSE +21 -0
- data/README.md +226 -0
- data/lib/deja/adapters/anthropic.rb +45 -0
- data/lib/deja/adapters/base.rb +83 -0
- data/lib/deja/cache.rb +281 -0
- data/lib/deja/configuration.rb +86 -0
- data/lib/deja/judges/anthropic.rb +32 -0
- data/lib/deja/judges/base.rb +69 -0
- data/lib/deja/requirements_cache.rb +117 -0
- data/lib/deja/rspec.rb +162 -0
- data/lib/deja/session.rb +53 -0
- data/lib/deja/version.rb +5 -0
- data/lib/deja.rb +84 -0
- metadata +120 -0
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "pathname"
|
|
4
|
+
|
|
5
|
+
module Deja
  # Holds everything host-specific so the gem itself stays ignorant of your app.
  # You register at least one provider; cache_root has a sensible default, and the
  # judge settings only matter if you use the `meet_requirements` matcher.
  class Configuration
    # Default recorded-cache location, relative to project_root.
    DEFAULT_CACHE_SUBPATH = "spec/support/deja_cache"

    # project_root: directory display in error messages is computed relative to
    # this (a Pathname, initially Dir.pwd).
    # adapters: the registered provider adapters, keyed by registration name.
    attr_reader :project_root, :adapters

    def initialize
      @cache_root = nil
      @project_root = Pathname.new(Dir.pwd)
      @judge_attrs = {}
      @judge_client = nil
      @adapters = {}
    end

    # Attrs that override the `meet_requirements` judge's defaults. Set
    # provider-specific args here (model, temperature, …) without Deja having to
    # name each one — different judge LLMs expose different args. The defaults
    # themselves live with the judge code, not here, since they're specific to
    # whatever LLM the judge speaks. `messages` and `output_config` are reserved
    # by the matcher and can't be overridden.
    #
    # Returns the stored Hash (never nil), so in-place edits such as
    # `c.judge_attrs[:model] = "..."` stick.
    def judge_attrs
      @judge_attrs ||= {}
    end

    # Replace the judge overrides. Accepts a Hash or nil (nil clears them).
    #
    # Keys are symbolized on the way in: the judge defaults and the matcher's
    # reserved keys are symbol-keyed, so a string-keyed override (for example
    # one loaded from YAML) would otherwise sit next to the default instead of
    # replacing it, and a "messages" key would slip past the reserved-key merge.
    def judge_attrs=(attrs)
      @judge_attrs = (attrs || {}).to_h.transform_keys do |key|
        key.respond_to?(:to_sym) ? key.to_sym : key
      end
    end

    # Where recorded cache files live. Defaults to project_root/spec/support/deja_cache.
    def cache_root
      @cache_root || project_root.join(DEFAULT_CACHE_SUBPATH)
    end

    # Accepts a String or Pathname (e.g. Rails.root.join(...)). Assigning nil (or
    # false) restores the default location.
    def cache_root=(value)
      @cache_root = value && Pathname.new(value.to_s)
    end

    # Accepts a String or Pathname; stored as a Pathname.
    def project_root=(value)
      @project_root = Pathname.new(value.to_s)
    end

    # Register a provider adapter. `provider` is a built-in adapter name (today:
    # `:anthropic`). `install` swaps your app's client for Deja's stub and runs in
    # the example's context (RSpec's `allow` is available). `real_client` is an
    # optional block building a live client; it defaults per provider. `as` names
    # the registration when you want two of the same provider.
    #
    #   c.register :anthropic,
    #     install: ->(client) { allow(AnthropicClient).to receive(:client).and_return(client) },
    #     real_client: -> { Anthropic::Client.new(api_key: my_key) }
    #
    # Registering again under the same `as` replaces the earlier adapter.
    def register(provider, install:, real_client: nil, as: provider)
      @adapters[as] = Deja::Adapters.build(provider, key: as, install:, real_client:)
    end

    # How to build the client used by the `meet_requirements` judge. Required if
    # you use that matcher — there is no default, so the judge's auth/model is an
    # explicit choice. The block returns a client.
    #
    #   c.judge_client { Anthropic::Client.new }
    #
    # Called with no block, returns the configured proc (raises Deja::Error if
    # unset).
    def judge_client(&block)
      if block
        @judge_client = block
      else
        @judge_client || raise(Deja::Error, <<~MSG)
          Deja.configuration.judge_client is not set. The `meet_requirements`
          matcher needs a client to judge values against requirements. Set one in
          your Deja.configure block:

            Deja.configure do |c|
              c.judge_client { Anthropic::Client.new }
            end
        MSG
      end
    end
  end
end
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "deja/judges/base"
|
|
4
|
+
|
|
5
|
+
module Deja
  module Judges
    # Judge adapter for clients built with the Anthropic Ruby SDK. Inside this
    # namespace a bare `Anthropic` resolves to this class, so the SDK constant is
    # always written as `::Anthropic`.
    class Anthropic < Base
      # Request attrs returned by #defaults.
      DEFAULTS = {
        model: "claude-sonnet-4-5",
        max_tokens: 512,
        system: "You evaluate whether a candidate value meets a set of requirements. " \
                "Use the structured output schema to return your verdict.",
      }.freeze

      class << self
        # Truthy only when the SDK's client class is loaded and `client` is an
        # instance of it. The `defined?` guard means the SDK constant is never
        # touched when the SDK isn't loaded.
        def handles?(client)
          defined?(::Anthropic::Client) && client.is_a?(::Anthropic::Client)
        end

        # Client name reported by Deja::Judges.descriptions.
        def client_description = "Anthropic::Client"
      end

      # The frozen DEFAULTS hash.
      def defaults = DEFAULTS
    end

    register(Anthropic)
  end
end
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Deja
  # Judge adapters teach the `meet_requirements` matcher how to judge with a given
  # LLM client. An adapter is selected by the *type* of the object your
  # `judge_client` returns, so the right defaults follow from the provider you
  # chose rather than being assumed globally.
  #
  # Today an adapter supplies the default request attrs (model, etc.). The matcher
  # still builds the request and parses the response (both Anthropic-shaped); as
  # more judge providers are added, that construction/parsing is meant to move
  # onto the adapter too — which is why dispatch already happens here.
  module Judges
    # `||=` rather than `=` so loading this file a second time does not wipe
    # adapters that have already registered themselves.
    @registered ||= []

    class << self
      # Built-in judge adapters register themselves. Newest-first, so a more
      # specific adapter registered later can shadow a more general one.
      #
      # Registering a class that is already present moves it to the front
      # instead of adding a second entry, so repeated registration cannot
      # duplicate it in lookups or in the error text. Returns the registry.
      def register(klass)
        @registered.delete(klass)
        @registered.unshift(klass)
      end

      # The registered adapter classes, newest first.
      def registered
        @registered
      end

      # The adapter for the client your `judge_client` returned. Raises
      # Deja::Error, listing what Deja can judge with, when no registered
      # adapter handles it.
      def for_client(client)
        klass = @registered.find { |candidate| candidate.handles?(client) }
        unless klass
          raise Deja::Error, <<~MSG
            No Deja judge adapter handles #{client.class} (the object your
            judge_client returned). Deja can judge with: #{descriptions}.
            Point judge_client at one of those, or add a Deja::Judges::Base
            subclass that handles your client.
          MSG
        end

        klass.new(client)
      end

      # Comma-separated client names of every registered adapter, used in the
      # error above. Says so explicitly when nothing is registered, rather than
      # leaving a blank in the sentence.
      def descriptions
        return "(no judge adapters registered)" if @registered.empty?

        @registered.map(&:client_description).join(", ")
      end
    end

    # Superclass for judge adapters. Subclasses implement .handles? and
    # #defaults, and may override .client_description.
    class Base
      # The judge client this adapter wraps.
      attr_reader :client

      def initialize(client)
        @client = client
      end

      # Does this adapter handle the given judge-client instance?
      def self.handles?(_client)
        raise NotImplementedError, "#{name} must implement .handles?"
      end

      # Human-readable client name, used in error messages. Defaults to the
      # adapter's class name.
      def self.client_description
        name
      end

      # Default request attrs for this judge (model, etc.). The matcher merges the
      # user's judge_attrs over these, then its own reserved keys over both.
      def defaults
        raise NotImplementedError, "#{self.class} must implement #defaults"
      end
    end
  end
end
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "deja/cache"
|
|
4
|
+
|
|
5
|
+
module Deja
  # Cache layer behind the `meet_requirements` matcher (defined in deja/rspec.rb).
  # Stores confirmed requirement/value pairs keyed by a hash of the requirements
  # text. One file per test: `<cache_root>/meets_requirements/<suite>/<id>.yaml`.
  #
  #   test_suite: <derived from spec file path>
  #   test_name:  <full RSpec description>
  #   summary:    <human-readable counts: assertions / total confirmed values>
  #   assertions:
  #     - hash: <12-char fingerprint of the requirements text — used for lookup>
  #       requirements: <the requirements text — auditable from the file alone>
  #       confirmed_values:
  #         - <values previously approved by the LLM judge>
  #
  # Pruning mirrors Deja::Cache: at the end of a passing example (when
  # ALLOW_LLM_CALL=1), assertions whose hash wasn't touched are dropped — so
  # changing the requirements text blows away the now-stale confirmed values.
  #
  # A cache file that exists but is blank is treated the same as a missing
  # file, so it cannot crash lookups, appends, or pruning.
  module RequirementsCache
    module_function

    # Directory holding every meet_requirements cache file.
    def cache_dir
      Deja.configuration.cache_root.join("meets_requirements")
    end

    # Confirmed values recorded for `requirements` in the current example's
    # cache file ([] when there are none). Marks the requirements as touched so
    # pruning keeps them.
    def values_for(requirements)
      record_touched(requirements)
      assertion = load_assertion(requirements)
      assertion ? assertion.fetch("confirmed_values") : []
    end

    # Record `value` as confirmed for `requirements` and rewrite the cache file
    # (creating it and its directory when needed).
    def append!(requirements, value)
      record_touched(requirements)
      data = load_or_init
      upsert_assertion(data, requirements, value)
      data["summary"] = build_summary(data["assertions"])
      cache_file.write(YAML.dump(Deja::Cache.stringify(data)))
    end

    # Drop assertions whose hash was not looked up or appended during the current
    # example. Deletes the file when nothing is left; leaves it untouched when
    # nothing is stale.
    def prune_untouched_in_current_example!
      data = read_cache
      return unless data

      touched = touched_hashes
      fresh_assertions = data["assertions"].select { |a| touched.include?(a["hash"]) }
      return if fresh_assertions.size == data["assertions"].size

      if fresh_assertions.empty?
        cache_file.delete
      else
        data["assertions"] = fresh_assertions
        data["summary"] = build_summary(fresh_assertions)
        cache_file.write(YAML.dump(Deja::Cache.stringify(data)))
      end
    end

    # Path of the current example's cache file.
    def cache_file
      cache_dir.join(Deja::Cache.test_suite, "#{Deja::Cache.current_id!}.yaml")
    end

    # 12-character fingerprint of the requirements text. Leading and trailing
    # whitespace is ignored.
    def requirements_hash(requirements)
      Digest::SHA256.hexdigest(requirements.strip)[0, 12]
    end

    # Parsed contents of the cache file, or nil when the file is missing or
    # blank (YAML parses an empty document to nil/false rather than a Hash).
    def read_cache
      return nil unless cache_file.exist?

      YAML.safe_load(cache_file.read) || nil
    end

    # The stored assertion entry for `requirements`, or nil.
    def load_assertion(requirements)
      data = read_cache
      return nil unless data

      hash = requirements_hash(requirements)
      data.fetch("assertions").find { |a| a["hash"] == hash }
    end

    # Existing file contents, or a fresh skeleton (after making sure the target
    # directory exists) when there is nothing usable on disk yet.
    def load_or_init
      existing = read_cache
      return existing if existing

      FileUtils.mkdir_p(cache_file.dirname)
      {
        "test_suite" => Deja::Cache.test_suite,
        "test_name" => Deja::Cache.current_test_name,
        "summary" => "",
        "assertions" => [],
      }
    end

    # Add `value` to the assertion for `requirements`, creating the assertion
    # when it is new. A value that is already confirmed is not stored twice.
    def upsert_assertion(data, requirements, value)
      hash = requirements_hash(requirements)
      existing = data["assertions"].find { |a| a["hash"] == hash }
      if existing
        confirmed = existing.fetch("confirmed_values")
        existing["confirmed_values"] = confirmed + [value] unless confirmed.include?(value)
      else
        data["assertions"] << {
          "hash" => hash,
          "requirements" => requirements.strip,
          "confirmed_values" => [value],
        }
      end
    end

    # Human-readable counts for the file's `summary` field.
    def build_summary(assertions)
      total_values = assertions.sum { |a| a["confirmed_values"].size }
      "#{assertions.size} #{assertions.size == 1 ? 'assertion' : 'assertions'}, " \
        "#{total_values} confirmed #{total_values == 1 ? 'value' : 'values'} total."
    end

    # Remember that `requirements` was used by the current example.
    def record_touched(requirements)
      touched_hashes << requirements_hash(requirements)
    end

    # Per-example set of touched requirement hashes, kept in the example's
    # metadata.
    def touched_hashes
      Deja::Cache.current_example!.metadata[:touched_meet_requirements_hashes] ||= Set.new
    end
  end
end
|
data/lib/deja/rspec.rb
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "deja"
|
|
4
|
+
require "json"
|
|
5
|
+
|
|
6
|
+
# Require the RSpec libraries we actually use, not the "rspec" meta-gem — hosts
|
|
7
|
+
# on rspec-rails have rspec-core/expectations/mocks but not the meta-gem.
|
|
8
|
+
require "rspec/core"
|
|
9
|
+
require "rspec/expectations"
|
|
10
|
+
require "rspec/mocks"
|
|
11
|
+
|
|
12
|
+
module Deja
  # The test-facing DSL, mixed into every example by the RSpec.configure block
  # below. Require "deja/rspec" from your spec setup to install it.
  module Helpers
    # Call from the top of an `it` block (per-test — the id should be distinct
    # for each test) to install the caching client and set the cache id used for
    # this example.
    def use_llm_cache(id)
      RSpec.current_example.metadata[:llm_cache_id] = id
      Deja::Session.enable
    end

    # Assert the code path under test never reaches the LLM. Call from a `before`
    # block or the top of an example.
    def forbid_llm_calls
      Deja::Session.forbid
    end

    # Assert exactly one LLM call happened (across all providers) and return its
    # kwargs.
    def expect_llm_called
      Deja::Session.expect_called
    end

    # Read a value from a recorded cache YAML file by walking `path`. Each segment
    # is a string key (for hashes) or an integer index (for arrays). Raises with
    # the path traversed so far if any segment is missing — so a renamed key or
    # shifted index fails loud rather than returning nil.
    #
    #   cached_llm_value("2026-04-30_17-03",
    #     "calls", 0, "response", "tool_uses", 0, "input", "session_instructions")
    #
    # Negative indexes count from the end of the array (-1 is the last element)
    # and are range-checked like positive ones. With an empty `path` the whole
    # parsed document is returned. Every failure is a RuntimeError.
    def cached_llm_value(id, *path)
      file = Deja::Cache.cache_dir.join(Deja::Cache.test_suite, "#{id}.yaml")
      rel = Deja::Cache.display_path(file)
      raise "No cached LLM file at #{rel}" unless file.exist?

      current = YAML.safe_load(file.read)
      path.each_with_index do |segment, i|
        # Breadcrumb of the segments already walked, for the error messages.
        crumb = i.zero? ? "<root>" : path[0...i].map(&:inspect).join("/")
        current = case current
                  when Hash
                    unless current.key?(segment)
                      raise "No key #{segment.inspect} at #{crumb} in #{rel}; available: #{current.keys.inspect}"
                    end
                    current[segment]
                  when Array
                    unless segment.is_a?(Integer)
                      raise "Expected integer index at #{crumb} in #{rel}, got #{segment.inspect}"
                    end
                    # Check both ends: Array#[] returns nil (no error) for a
                    # negative index that reaches past the start.
                    unless segment >= -current.size && segment < current.size
                      raise "Index #{segment} out of range at #{crumb} (size #{current.size}) in #{rel}"
                    end
                    current[segment]
                  else
                    raise "Cannot traverse into #{current.class} at #{crumb} in #{rel}"
                  end
      end
      current
    end
  end
end
|
|
73
|
+
|
|
74
|
+
# `meet_requirements(requirements_text)` asserts that an LLM-generated value
|
|
75
|
+
# satisfies a free-text description without pinning to a specific stringification.
|
|
76
|
+
#
|
|
77
|
+
# 1. Looks for the requirements_hash in the cache. If `actual` is already a
|
|
78
|
+
# confirmed value, passes — no LLM call.
|
|
79
|
+
# 2. Otherwise, with ALLOW_LLM_CALL=1, asks the judge model whether `actual`
|
|
80
|
+
# meets the requirements (structured output). On "yes", caches and passes.
|
|
81
|
+
# 3. Otherwise, fails telling you to re-record under ALLOW_LLM_CALL=1.
|
|
82
|
+
RSpec::Matchers.define :meet_requirements do |requirements|
  match do |actual|
    @requirements = requirements

    # Step 1: a value already confirmed for these requirements passes without
    # any LLM call.
    next true if Deja::RequirementsCache.values_for(requirements).include?(actual)

    # Step 3: without ALLOW_LLM_CALL there is nothing more to try.
    if ENV["ALLOW_LLM_CALL"].nil?
      cache_path = Deja::Cache.display_path(Deja::RequirementsCache.cache_file)
      @reason = "value is not in #{cache_path} for the current requirements. " \
                "Set ALLOW_LLM_CALL=1 to verify it against the requirements via LLM and add it to the cache."
      next false
    end

    # Step 2: ask the dedicated judge client. It is independent of whatever
    # provider the spec is recording and sits outside the Deja::Cache layer, so
    # the meet_requirements cache is the only cache that tracks these calls.
    settings = Deja.configuration
    client = settings.judge_client.call

    # The adapter is picked by the client's type and supplies provider defaults;
    # judge_attrs go on top of those.
    base_attrs = Deja::Judges.for_client(client).defaults.merge(settings.judge_attrs)

    question = "Requirements:\n#{requirements}\n\nCandidate value:\n#{actual}\n\n" \
               "Does the candidate value meet the requirements?"
    verdict_schema = {
      "type" => "object",
      "properties" => {
        "meets_requirements" => {"type" => "boolean"},
        "reason" => {"type" => "string"},
      },
      "required" => ["meets_requirements", "reason"],
      "additionalProperties" => false,
    }

    # messages and output_config are reserved: they carry the requirements and
    # the structured-output contract parsed below, so they are merged last and
    # win over both the defaults and judge_attrs.
    request = base_attrs.merge(
      messages: [{role: "user", content: question}],
      output_config: {format: {type: :json_schema, schema: verdict_schema}},
    )

    reply = client.messages.create(**request)
    verdict = JSON.parse(reply.content.first.text)

    unless verdict["meets_requirements"]
      @reason = "LLM judge rejected the value: #{verdict['reason']}"
      next false
    end

    # Approved: remember the value so later runs pass from the cache alone.
    Deja::RequirementsCache.append!(requirements, actual)
    true
  end

  failure_message do |actual|
    "expected value to meet requirements\n#{@reason}\nGot: #{actual.inspect}"
  end
end
|
|
146
|
+
|
|
147
|
+
RSpec.configure do |rspec|
  rspec.include Deja::Helpers

  # Prune stale entries (calls/assertions whose hash wasn't looked up this
  # example) only on the re-record path: the example passed, it set a cache id
  # via use_llm_cache, and ALLOW_LLM_CALL is set. Cache-only runs leave both
  # files alone so a temporarily-disabled call/assertion doesn't lose its cached
  # entry.
  rspec.after(:each) do |example|
    re_recording = example.exception.nil? &&
                   example.metadata[:llm_cache_id] &&
                   ENV["ALLOW_LLM_CALL"]
    next unless re_recording

    Deja::Cache.prune_untouched_in_current_example!
    Deja::RequirementsCache.prune_untouched_in_current_example!
  end
end
|
data/lib/deja/session.rb
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Deja
  # The per-example runtime. Installs every registered adapter's caching stub (so
  # a suite can mix providers — each test exercises whichever it actually calls),
  # and aggregates the captured calls across adapters.
  module Session
    module_function

    # Reset the captured call log, then install each registered adapter's mock
    # client. Raises Deja::Error when no provider has been registered.
    def enable
      Deja.reset_calls!
      registered = Deja.adapters
      if registered.empty?
        raise Deja::Error, "No providers registered. Call `c.register :anthropic, ...` inside Deja.configure."
      end

      registered.each do |adapter|
        install(adapter, adapter.build_mock_client)
      end
    end

    # Install a poison client for every adapter so any LLM access raises.
    def forbid
      Deja.adapters.each do |adapter|
        install(adapter, poison_client)
      end
    end

    # Runs an adapter's install block in the current example's context (so RSpec's
    # `allow` is available), handing it the client to return.
    def install(adapter, client)
      example_instance!.instance_exec(client, &adapter.install_block)
    end

    # Assert exactly one call was captured across all adapters; return its kwargs.
    def expect_called
      # The expectation runs inside the example instance, where `expect` lives.
      example_instance!.instance_exec { expect(Deja.calls.size).to eq(1) }
      Deja.calls.first[:kwargs]
    end

    # The running example's group instance; raises Deja::Error outside an example.
    def example_instance!
      instance = RSpec.current_example&.example_group_instance
      raise Deja::Error, "Deja must be used inside an RSpec example" unless instance

      instance
    end

    # A bare object that claims to respond to everything and raises on any
    # method call that reaches method_missing.
    def poison_client
      trap = Object.new

      def trap.method_missing(*)
        raise("LLM should not be called (deja forbid_llm_calls)")
      end

      def trap.respond_to_missing?(*)
        true
      end

      trap
    end
  end
end
|
data/lib/deja/version.rb
ADDED
data/lib/deja.rb
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "deja/version"
|
|
4
|
+
|
|
5
|
+
# Deja records a non-deterministic call (today: an Anthropic LLM call) the first
|
|
6
|
+
# time it happens and replays the recorded response on every run after that, so
|
|
7
|
+
# tests that exercise real model behavior stay fast, offline, and deterministic.
|
|
8
|
+
#
|
|
9
|
+
# Providers are pluggable via adapters (see Deja::Adapters) — a suite can mix
|
|
10
|
+
# them, and each test exercises whichever it actually calls.
|
|
11
|
+
#
|
|
12
|
+
# It also ships `meet_requirements`, an RSpec matcher that asserts an LLM-produced
|
|
13
|
+
# value satisfies a free-text description (judged once, then cached).
|
|
14
|
+
#
|
|
15
|
+
# See README.md for the full record/replay workflow and configuration.
|
|
16
|
+
module Deja
  # Base class for every error Deja raises itself.
  class Error < StandardError; end

  # Raised on a cache miss when ALLOW_LLM_CALL is not set — i.e. replay mode hit
  # a request it has never recorded.
  class MissingCacheError < Error; end

  # Raised when an LLM call is made before `use_llm_cache(id)` set a cache id.
  class MissingIdError < Error; end

  class << self
    # Configure the gem. Yields the Configuration; returns it.
    #
    #   Deja.configure do |c|
    #     c.cache_root = Rails.root.join("spec/support/cache")
    #     c.register :anthropic,
    #       install: ->(client) { allow(AnthropicClient).to receive(:client).and_return(client) }
    #   end
    #
    # The block is optional: without one this simply returns the current
    # configuration instead of raising LocalJumpError.
    def configure
      yield(configuration) if block_given?
      configuration
    end

    # The current Configuration, created on first use.
    def configuration
      @configuration ||= Configuration.new
    end

    # Drops configuration and the captured call log — used between examples and by
    # the gem's own suite.
    def reset_configuration!
      @configuration = Configuration.new
      reset_calls!
    end

    # Register a provider adapter (delegates to the configuration). See
    # Configuration#register.
    def register(provider, **opts)
      configuration.register(provider, **opts)
    end

    # The registered adapters, in registration order.
    def adapters
      configuration.adapters.values
    end

    # --- captured calls (reset per example by Session.enable) ---

    # The captured call log: an Array of {provider:, method:, kwargs:} hashes.
    def calls
      @calls ||= []
    end

    # Append one captured call to the log.
    def record_call(provider, method, kwargs)
      calls << {provider:, method:, kwargs:}
    end

    # Replace the call log with a fresh empty Array.
    def reset_calls!
      @calls = []
    end
  end
end
|
|
76
|
+
|
|
77
|
+
require "deja/configuration"
|
|
78
|
+
require "deja/cache"
|
|
79
|
+
require "deja/requirements_cache"
|
|
80
|
+
require "deja/adapters/base"
|
|
81
|
+
require "deja/adapters/anthropic"
|
|
82
|
+
require "deja/judges/base"
|
|
83
|
+
require "deja/judges/anthropic"
|
|
84
|
+
require "deja/session"
|