rspec-agents 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/rspec-agents +24 -0
- data/lib/async_workers/channel_config.rb +34 -0
- data/lib/async_workers/doc/process_manager_design.md +512 -0
- data/lib/async_workers/errors.rb +21 -0
- data/lib/async_workers/managed_process.rb +284 -0
- data/lib/async_workers/output_stream.rb +86 -0
- data/lib/async_workers/rpc_channel.rb +159 -0
- data/lib/async_workers/transport/base.rb +57 -0
- data/lib/async_workers/transport/stdio_transport.rb +91 -0
- data/lib/async_workers/transport/unix_socket_transport.rb +112 -0
- data/lib/async_workers/worker_group.rb +175 -0
- data/lib/async_workers.rb +17 -0
- data/lib/rspec/agents/agent_response.rb +61 -0
- data/lib/rspec/agents/agents/base.rb +123 -0
- data/lib/rspec/agents/cli.rb +342 -0
- data/lib/rspec/agents/conversation.rb +308 -0
- data/lib/rspec/agents/criterion.rb +237 -0
- data/lib/rspec/agents/doc/2026_01_22_observer-system-design.md +757 -0
- data/lib/rspec/agents/doc/2026_01_23_parallel_spec_runner-design.md +1060 -0
- data/lib/rspec/agents/doc/2026_01_27_event_serialization-design.md +294 -0
- data/lib/rspec/agents/doc/2026_01_27_experiment_aggregation_design.md +831 -0
- data/lib/rspec/agents/doc/2026_01_29_rspec-agents-studio-design.md +1332 -0
- data/lib/rspec/agents/doc/2026_01_29_testing-framework-design.md +1037 -0
- data/lib/rspec/agents/doc/2026_02_04-parallel-runner-ui.md +537 -0
- data/lib/rspec/agents/doc/2026_02_05_html_renderer_extensions.md +708 -0
- data/lib/rspec/agents/doc/scenario_guide.md +289 -0
- data/lib/rspec/agents/dsl/agent_proxy.rb +141 -0
- data/lib/rspec/agents/dsl/criterion_definition.rb +78 -0
- data/lib/rspec/agents/dsl/graph_builder.rb +38 -0
- data/lib/rspec/agents/dsl/runner_factory.rb +52 -0
- data/lib/rspec/agents/dsl/scenario_set_dsl.rb +166 -0
- data/lib/rspec/agents/dsl/test_context.rb +223 -0
- data/lib/rspec/agents/dsl/user_proxy.rb +71 -0
- data/lib/rspec/agents/dsl.rb +398 -0
- data/lib/rspec/agents/evaluation_result.rb +44 -0
- data/lib/rspec/agents/event_bus.rb +78 -0
- data/lib/rspec/agents/events.rb +141 -0
- data/lib/rspec/agents/isolated_event_bus.rb +86 -0
- data/lib/rspec/agents/judge.rb +244 -0
- data/lib/rspec/agents/llm/anthropic.rb +143 -0
- data/lib/rspec/agents/llm/base.rb +64 -0
- data/lib/rspec/agents/llm/mock.rb +181 -0
- data/lib/rspec/agents/llm/response.rb +52 -0
- data/lib/rspec/agents/matchers.rb +554 -0
- data/lib/rspec/agents/message.rb +81 -0
- data/lib/rspec/agents/metadata.rb +120 -0
- data/lib/rspec/agents/observers/base.rb +70 -0
- data/lib/rspec/agents/observers/parallel_terminal_observer.rb +151 -0
- data/lib/rspec/agents/observers/rpc_notify_observer.rb +43 -0
- data/lib/rspec/agents/observers/terminal_observer.rb +103 -0
- data/lib/rspec/agents/parallel/controller.rb +284 -0
- data/lib/rspec/agents/parallel/example_discovery.rb +153 -0
- data/lib/rspec/agents/parallel/partitioner.rb +31 -0
- data/lib/rspec/agents/parallel/run_result.rb +22 -0
- data/lib/rspec/agents/parallel/ui/interactive_ui.rb +605 -0
- data/lib/rspec/agents/parallel/ui/interleaved_ui.rb +139 -0
- data/lib/rspec/agents/parallel/ui/output_adapter.rb +127 -0
- data/lib/rspec/agents/parallel/ui/quiet_ui.rb +100 -0
- data/lib/rspec/agents/parallel/ui/ui_factory.rb +53 -0
- data/lib/rspec/agents/parallel/ui/ui_mode.rb +101 -0
- data/lib/rspec/agents/prompt_builders/base.rb +113 -0
- data/lib/rspec/agents/prompt_builders/criterion_evaluation.rb +136 -0
- data/lib/rspec/agents/prompt_builders/goal_achievement_evaluation.rb +142 -0
- data/lib/rspec/agents/prompt_builders/grounding_evaluation.rb +172 -0
- data/lib/rspec/agents/prompt_builders/intent_evaluation.rb +111 -0
- data/lib/rspec/agents/prompt_builders/topic_classification.rb +105 -0
- data/lib/rspec/agents/prompt_builders/user_simulation.rb +131 -0
- data/lib/rspec/agents/runners/headless_runner.rb +272 -0
- data/lib/rspec/agents/runners/parallel_terminal_runner.rb +220 -0
- data/lib/rspec/agents/runners/terminal_runner.rb +186 -0
- data/lib/rspec/agents/runners/user_simulator.rb +261 -0
- data/lib/rspec/agents/scenario.rb +133 -0
- data/lib/rspec/agents/scenario_loader.rb +145 -0
- data/lib/rspec/agents/serialization/conversation_renderer.rb +161 -0
- data/lib/rspec/agents/serialization/extension.rb +199 -0
- data/lib/rspec/agents/serialization/extensions/core_extension.rb +66 -0
- data/lib/rspec/agents/serialization/presenters.rb +281 -0
- data/lib/rspec/agents/serialization/run_data_aggregator.rb +197 -0
- data/lib/rspec/agents/serialization/run_data_builder.rb +189 -0
- data/lib/rspec/agents/serialization/templates/_alpine.min.js +5 -0
- data/lib/rspec/agents/serialization/templates/_base_components.css +196 -0
- data/lib/rspec/agents/serialization/templates/_base_components.js +46 -0
- data/lib/rspec/agents/serialization/templates/_conversation_fragment.html.haml +34 -0
- data/lib/rspec/agents/serialization/templates/_metadata_default.html.haml +17 -0
- data/lib/rspec/agents/serialization/templates/_scripts.js +89 -0
- data/lib/rspec/agents/serialization/templates/_styles.css +1211 -0
- data/lib/rspec/agents/serialization/templates/conversation_document.html.haml +29 -0
- data/lib/rspec/agents/serialization/templates/test_suite.html.haml +238 -0
- data/lib/rspec/agents/serialization/test_suite_renderer.rb +207 -0
- data/lib/rspec/agents/serialization.rb +374 -0
- data/lib/rspec/agents/simulator_config.rb +336 -0
- data/lib/rspec/agents/spec_executor.rb +494 -0
- data/lib/rspec/agents/stable_example_id.rb +147 -0
- data/lib/rspec/agents/templates/user_simulation.erb +9 -0
- data/lib/rspec/agents/tool_call.rb +53 -0
- data/lib/rspec/agents/topic.rb +307 -0
- data/lib/rspec/agents/topic_graph.rb +236 -0
- data/lib/rspec/agents/triggers.rb +122 -0
- data/lib/rspec/agents/turn.rb +63 -0
- data/lib/rspec/agents/turn_executor.rb +91 -0
- data/lib/rspec/agents/version.rb +7 -0
- data/lib/rspec/agents.rb +145 -0
- metadata +242 -0
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
# RSpec Agents Scenario Guide
|
|
2
|
+
|
|
3
|
+
This guide provides practical examples for writing agent tests using the rspec-agents DSL.
|
|
4
|
+
|
|
5
|
+
## Quick Reference
|
|
6
|
+
|
|
7
|
+
### Criteria Types
|
|
8
|
+
|
|
9
|
+
```ruby
|
|
10
|
+
# 1. Named criterion (defined at describe/context level)
|
|
11
|
+
criterion :friendly, "The agent's response should be friendly"
|
|
12
|
+
expect(agent).to satisfy(:friendly)
|
|
13
|
+
|
|
14
|
+
# 2. Adhoc criterion (inline string description, evaluated by LLM)
|
|
15
|
+
expect(agent).to satisfy("Der Agent fragt nach Details oder zeigt das Formular an")
|
|
16
|
+
|
|
17
|
+
# 3. Lambda criterion (code-based, no LLM)
|
|
18
|
+
expect(agent).to satisfy(->(turn) { turn.agent_response.text.length <= 500 })
|
|
19
|
+
|
|
20
|
+
# 4. Named lambda criterion
|
|
21
|
+
expect(agent).to satisfy(:concise, ->(turn) { turn.agent_response.text.length <= 300 })
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
### Soft vs Hard Assertions
|
|
25
|
+
|
|
26
|
+
```ruby
|
|
27
|
+
# Soft: records the result and continues even if the criterion fails
|
|
28
|
+
evaluate(agent).to satisfy(:friendly)
|
|
29
|
+
|
|
30
|
+
# Hard: test fails immediately if not met
|
|
31
|
+
expect(agent).to satisfy(:friendly)
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## Common Patterns
|
|
37
|
+
|
|
38
|
+
### Basic Scripted Conversation
|
|
39
|
+
|
|
40
|
+
```ruby
|
|
41
|
+
RSpec.describe "Room Booking Agent", type: :agent do
|
|
42
|
+
criterion :friendly, "The agent's response should be friendly"
|
|
43
|
+
|
|
44
|
+
it "handles booking request" do
|
|
45
|
+
user.says "Hi, I need to book a meeting room"
|
|
46
|
+
|
|
47
|
+
evaluate(agent).to satisfy(:friendly)
|
|
48
|
+
expect(agent).not_to call_tool(:book_room)
|
|
49
|
+
|
|
50
|
+
user.says "The Blue Room for tomorrow at 2pm"
|
|
51
|
+
|
|
52
|
+
expect(agent).to call_tool(:check_availability)
|
|
53
|
+
end
|
|
54
|
+
end
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Using Adhoc Criteria
|
|
58
|
+
|
|
59
|
+
Adhoc criteria are useful when you need a one-off evaluation without defining a named criterion. The string description is passed directly to the LLM judge.
|
|
60
|
+
|
|
61
|
+
```ruby
|
|
62
|
+
it "handles form display" do
|
|
63
|
+
user.says "I want to book a room"
|
|
64
|
+
|
|
65
|
+
# German description - LLM evaluates naturally
|
|
66
|
+
expect(agent).to satisfy("Der Agent fragt nach Details oder zeigt das Formular an")
|
|
67
|
+
|
|
68
|
+
# English description
|
|
69
|
+
evaluate(agent).to satisfy("The agent acknowledges the request")
|
|
70
|
+
|
|
71
|
+
# Mix named and adhoc
|
|
72
|
+
evaluate(agent).to satisfy(:friendly)
|
|
73
|
+
expect(agent).to satisfy("Response contains actionable next steps")
|
|
74
|
+
end
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Combining Criterion Types
|
|
78
|
+
|
|
79
|
+
```ruby
|
|
80
|
+
it "validates response quality" do
|
|
81
|
+
user.says "Search for venues in Stuttgart"
|
|
82
|
+
|
|
83
|
+
# Named criterion
|
|
84
|
+
evaluate(agent).to satisfy(:friendly)
|
|
85
|
+
|
|
86
|
+
# Adhoc criterion for specific requirement
|
|
87
|
+
expect(agent).to satisfy("The agent confirms the search location")
|
|
88
|
+
|
|
89
|
+
# Lambda for deterministic checks
|
|
90
|
+
expect(agent).to satisfy(->(turn) {
|
|
91
|
+
turn.agent_response.tool_calls.any? { |tc| tc.name == :search_venues }
|
|
92
|
+
})
|
|
93
|
+
|
|
94
|
+
# Multiple in one call
|
|
95
|
+
evaluate(agent).to satisfy(
|
|
96
|
+
:helpful,
|
|
97
|
+
"Response is professional",
|
|
98
|
+
->(turn) { turn.agent_response.text.length < 1000 }
|
|
99
|
+
)
|
|
100
|
+
end
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Scenario-Based Testing (from JSON file)
|
|
104
|
+
|
|
105
|
+
```ruby
|
|
106
|
+
RSpec.describe "Event Booking Agent", type: :agent do
|
|
107
|
+
criterion :friendly, "The agent's response should be friendly"
|
|
108
|
+
criterion :helpful, "The agent should move toward the user's goal"
|
|
109
|
+
|
|
110
|
+
scenario_set "venue_searches", from: "scenarios/venue_search.json" do |scenario|
|
|
111
|
+
it "handles #{scenario[:name]}" do
|
|
112
|
+
user.simulate do
|
|
113
|
+
goal scenario[:goal]
|
|
114
|
+
personality scenario[:personality]
|
|
115
|
+
end
|
|
116
|
+
|
|
117
|
+
# Adhoc criterion using scenario data
|
|
118
|
+
expect(agent).to satisfy("The agent addresses: #{scenario[:goal]}")
|
|
119
|
+
end
|
|
120
|
+
end
|
|
121
|
+
end
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### Scenario-Based Testing (inline array)
|
|
125
|
+
|
|
126
|
+
Scenarios can be defined directly in the test file using `scenarios:` instead of `from:`. This is useful for self-contained tests or dynamically generated scenarios.
|
|
127
|
+
|
|
128
|
+
```ruby
|
|
129
|
+
RSpec.describe "Event Booking Agent", type: :agent do
|
|
130
|
+
criterion :friendly, "The agent's response should be friendly"
|
|
131
|
+
criterion :helpful, "The agent should move toward the user's goal"
|
|
132
|
+
|
|
133
|
+
scenarios = [
|
|
134
|
+
{
|
|
135
|
+
id: "weihnachtsfeier",
|
|
136
|
+
name: "Unternehmens-Weihnachtsfeier",
|
|
137
|
+
goal: "Eine festliche Weihnachtsfeier für 60 Mitarbeiter in Berlin organisieren",
|
|
138
|
+
personality: "Festlich gestimmt, achtet auf Details wie Dekoration und Menü",
|
|
139
|
+
context: ["Dezember-Termin", "Abendveranstaltung mit Dinner", "Unterhaltungsprogramm gewünscht"]
|
|
140
|
+
},
|
|
141
|
+
{
|
|
142
|
+
id: "produktlaunch",
|
|
143
|
+
name: "Produktlaunch-Event",
|
|
144
|
+
goal: "Ein Produktlaunch-Event für 100 Gäste in Hamburg organisieren",
|
|
145
|
+
personality: "Marketing-orientiert, achtet auf Präsentationstechnik und Impression",
|
|
146
|
+
context: ["Presse und Kunden eingeladen", "Moderne Location gewünscht", "Catering wichtig"]
|
|
147
|
+
},
|
|
148
|
+
{
|
|
149
|
+
id: "vertriebstagung",
|
|
150
|
+
name: "Vertriebstagung",
|
|
151
|
+
goal: "Eine zweitägige Vertriebstagung für 50 Außendienstmitarbeiter in Düsseldorf",
|
|
152
|
+
personality: "Ergebnisorientiert, fokussiert auf Motivation und Schulung",
|
|
153
|
+
context: ["Motivationstraining geplant", "Award-Verleihung am Abend", "Networking wichtig"]
|
|
154
|
+
},
|
|
155
|
+
{
|
|
156
|
+
id: "klausurtagung",
|
|
157
|
+
name: "Management-Klausurtagung",
|
|
158
|
+
goal: "Eine vertrauliche Klausurtagung für 12 Führungskräfte in einem ruhigen Hotel im Schwarzwald",
|
|
159
|
+
personality: "Diskret, bevorzugt ruhige und abgeschiedene Locations",
|
|
160
|
+
context: ["Strategische Themen", "Absolute Vertraulichkeit", "Keine Ablenkungen"]
|
|
161
|
+
},
|
|
162
|
+
{
|
|
163
|
+
id: "azubi_onboarding",
|
|
164
|
+
name: "Azubi-Onboarding",
|
|
165
|
+
goal: "Ein Onboarding-Event für 20 neue Auszubildende in der Nähe von München",
|
|
166
|
+
personality: "Jugendlich, achtet auf abwechslungsreiches Programm",
|
|
167
|
+
context: ["Junge Teilnehmer", "Teambuilding-Aktivitäten wichtig", "Lockere Atmosphäre"]
|
|
168
|
+
}
|
|
169
|
+
]
|
|
170
|
+
|
|
171
|
+
scenario_set "corporate_events", scenarios: scenarios do |scenario|
|
|
172
|
+
it "handles #{scenario[:name]}" do
|
|
173
|
+
user.simulate do
|
|
174
|
+
goal scenario[:goal]
|
|
175
|
+
personality scenario[:personality]
|
|
176
|
+
context { scenario[:context].each { |c| note c } }
|
|
177
|
+
end
|
|
178
|
+
|
|
179
|
+
evaluate(agent).to satisfy(:friendly)
|
|
180
|
+
evaluate(agent).to satisfy(:helpful)
|
|
181
|
+
end
|
|
182
|
+
end
|
|
183
|
+
end
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
**When to use each approach:**
|
|
187
|
+
|
|
188
|
+
| Approach | Use When |
|
|
189
|
+
|----------|----------|
|
|
190
|
+
| `from: "file.json"` | Scenarios shared across multiple test files |
|
|
191
|
+
| `scenarios: [...]` | Self-contained tests, dynamically generated scenarios |
|
|
192
|
+
|
|
193
|
+
---
|
|
194
|
+
|
|
195
|
+
## When to Use Each Criterion Type
|
|
196
|
+
|
|
197
|
+
| Type | Use When | Example |
|
|
198
|
+
|------|----------|---------|
|
|
199
|
+
| Named | Reused across multiple tests | `:friendly`, `:helpful` |
|
|
200
|
+
| Adhoc | One-off, test-specific requirement | `"Der Agent zeigt das Formular an"` |
|
|
201
|
+
| Lambda | Deterministic, no LLM needed | `->(turn) { turn.agent_response.text.length < 500 }` |
|
|
202
|
+
| Named Lambda | Reusable code check with a name | `:concise, ->(turn) { ... }` |
|
|
203
|
+
|
|
204
|
+
### Adhoc Criterion Best Practices
|
|
205
|
+
|
|
206
|
+
1. **Use natural language**: Write descriptions as you would explain to a human
|
|
207
|
+
2. **Be specific**: "The agent asks for the event date" is better than "Agent asks questions"
|
|
208
|
+
3. **Language flexibility**: Use any language - the LLM judge handles German, English, etc.
|
|
209
|
+
4. **Mix with named criteria**: Use adhoc for edge cases, named for common patterns
|
|
210
|
+
|
|
211
|
+
```ruby
|
|
212
|
+
# Good: specific, actionable
|
|
213
|
+
expect(agent).to satisfy("The agent provides at least 3 venue options")
|
|
214
|
+
expect(agent).to satisfy("Der Agent fragt nach dem Budget")
|
|
215
|
+
|
|
216
|
+
# Avoid: too vague
|
|
217
|
+
expect(agent).to satisfy("Good response")
|
|
218
|
+
expect(agent).to satisfy("Agent works correctly")
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
---
|
|
222
|
+
|
|
223
|
+
## Tool Call Assertions
|
|
224
|
+
|
|
225
|
+
```ruby
|
|
226
|
+
# Basic tool call check
|
|
227
|
+
expect(agent).to call_tool(:search_venues)
|
|
228
|
+
|
|
229
|
+
# With parameters
|
|
230
|
+
expect(agent).to call_tool(:book_room).with(
|
|
231
|
+
room: "Blue Room",
|
|
232
|
+
capacity: be >= 10
|
|
233
|
+
)
|
|
234
|
+
|
|
235
|
+
# Negated
|
|
236
|
+
expect(agent).not_to call_tool(:book_room)
|
|
237
|
+
|
|
238
|
+
# Conversation-level (across all turns)
|
|
239
|
+
expect(conversation).to have_tool_call(:search_venues)
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
---
|
|
243
|
+
|
|
244
|
+
## Grounding Assertions
|
|
245
|
+
|
|
246
|
+
```ruby
|
|
247
|
+
# Verify claims are grounded in tool results
|
|
248
|
+
expect(agent).to be_grounded_in(:venues, :pricing)
|
|
249
|
+
|
|
250
|
+
# Specify source tools
|
|
251
|
+
expect(agent).to be_grounded_in(:venues, from_tools: [:search_venues])
|
|
252
|
+
|
|
253
|
+
# Forbid ungrounded claims
|
|
254
|
+
expect(agent).not_to claim(:availability)
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
---
|
|
258
|
+
|
|
259
|
+
## Topic Tracking
|
|
260
|
+
|
|
261
|
+
**No self-loops:** A topic cannot list itself in `next:`. The tracker stays in a topic until a transition occurs naturally.
|
|
262
|
+
|
|
263
|
+
```ruby
|
|
264
|
+
it "progresses through booking flow" do
|
|
265
|
+
expect_conversation_to do
|
|
266
|
+
use_topic :greeting, next: :gathering_details
|
|
267
|
+
use_topic :gathering_details, next: :confirming
|
|
268
|
+
use_topic :confirming
|
|
269
|
+
end
|
|
270
|
+
|
|
271
|
+
user.says "Hello!"
|
|
272
|
+
expect(agent).to be_in_topic(:greeting)
|
|
273
|
+
|
|
274
|
+
user.says "I need a room for 10 people tomorrow"
|
|
275
|
+
expect(agent).to be_in_topic(:gathering_details)
|
|
276
|
+
end
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
---
|
|
280
|
+
|
|
281
|
+
## Goal Achievement
|
|
282
|
+
|
|
283
|
+
```ruby
|
|
284
|
+
# Check stated goal (from simulator config)
|
|
285
|
+
expect(agent).to have_achieved_stated_goal
|
|
286
|
+
|
|
287
|
+
# Check custom goal description
|
|
288
|
+
expect(agent).to have_achieved_goal("User received venue options under budget")
|
|
289
|
+
```
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
module RSpec
  module Agents
    module DSL
      # Proxy for agent state inspection in scripted tests.
      #
      # Wraps a turn executor and exposes read-only accessors for its state
      # (conversation, current response, current topic, current turn) together
      # with assertion helpers. Helpers that need a judge return a fixed
      # fallback hash when no judge was supplied or when there is no current
      # turn, instead of raising.
      class AgentProxy
        # @param turn_executor [TurnExecutor]
        # @param judge [Judge, nil] Optional judge for LLM-based assertions
        def initialize(turn_executor:, judge: nil)
          @turn_executor = turn_executor
          @judge = judge
        end

        # Get the turn executor (runner)
        # @return [TurnExecutor]
        def runner
          @turn_executor
        end

        # Get the conversation (delegates to turn executor)
        # @return [Conversation]
        def conversation
          @turn_executor.conversation
        end

        # Get the current response object
        # @return [AgentResponse, nil]
        def response
          @turn_executor.current_response
        end

        # Get the response text
        # @return [String, nil] nil when there is no current response
        def last_response
          response&.text
        end

        # Get the current topic
        # @return [Symbol, nil]
        def current_topic
          @turn_executor.current_topic
        end

        # Get the current turn
        # @return [Turn, nil]
        def current_turn
          @turn_executor.current_turn
        end

        # Get tool calls from current response
        # @return [Array<ToolCall>] empty array when there is no response or
        #   the response reports no tool calls
        def tool_calls
          response&.tool_calls || []
        end

        # Check if agent called a specific tool
        # @param name [Symbol, String]
        # @param params [Hash, nil]
        # @return [Boolean] false when there is no current response
        def called_tool?(name, params: nil)
          response&.has_tool_call?(name, params: params) || false
        end

        # Check if in a specific topic (delegates to turn executor)
        # @param topic_name [Symbol]
        # @return [Boolean]
        def in_topic?(topic_name)
          @turn_executor.in_topic?(topic_name)
        end

        # Check if current response matches a pattern
        #
        # @param pattern [Regexp] Pattern to match
        # @return [Boolean] false when there is no current response
        def response_matches?(pattern)
          response&.match?(pattern) || false
        end

        # Evaluate a criterion against the current turn
        # Uses the Criterion class to normalize and evaluate all criterion types
        #
        # @param criterion [Symbol, String, Proc, Criterion] Criterion to evaluate
        # @return [Hash] { satisfied: Boolean, reasoning: String }
        def evaluate_criterion(criterion)
          turn = current_turn
          return { satisfied: false, reasoning: "No turn to evaluate" } unless turn

          # Normalize to Criterion object
          criterion_obj = criterion.is_a?(Criterion) ? criterion : Criterion.from(criterion)

          criterion_obj.evaluate(
            turn: turn,
            judge: @judge,
            conversation: conversation,
            # Without a judge there is no registry to consult; pass an empty one.
            criteria_registry: @judge&.criteria || {}
          )
        end

        # Check if response is grounded in tool results
        #
        # Without a judge the check is skipped and reported as grounded; with a
        # judge but no current turn it is reported as not grounded.
        #
        # @param claim_types [Array<Symbol>] Claim types to verify
        # @param from_tools [Array<Symbol>] Tool names that should provide grounding
        # @return [Hash] { grounded: Boolean, violations: Array }
        def check_grounding(claim_types, from_tools: [])
          return { grounded: true, violations: [] } unless @judge

          turn = current_turn
          return { grounded: false, violations: ["No turn to evaluate"] } unless turn

          @judge.evaluate_grounding(claim_types, [turn], from_tools: from_tools)
        end

        # Check for forbidden claims
        #
        # Reports no violation when there is no judge or no current turn.
        #
        # @param claim_types [Array<Symbol>] Claim types that are forbidden
        # @return [Hash] { violated: Boolean, claims_found: Array }
        def check_forbidden_claims(claim_types)
          return { violated: false, claims_found: [] } unless @judge

          turn = current_turn
          return { violated: false, claims_found: [] } unless turn

          @judge.evaluate_forbidden_claims(claim_types, [turn])
        end

        # Check if agent demonstrates expected intent
        #
        # When no judge is configured or there is no current turn, a fallback
        # hash with `matches: false` and `observed_intent: nil` is returned, so
        # the fallback carries all three documented keys.
        #
        # @param intent_description [String] Description of expected intent
        # @return [Hash] { matches: Boolean, observed_intent: String, reasoning: String }
        def check_intent(intent_description)
          return intent_unavailable("No judge configured") unless @judge

          turn = current_turn
          return intent_unavailable("No turn to evaluate") unless turn

          @judge.evaluate_intent(intent_description, turn)
        end

        private

        # Build the fallback result for check_intent with all documented keys.
        # @param reasoning [String] Why the intent could not be evaluated
        # @return [Hash] { matches: false, observed_intent: nil, reasoning: String }
        def intent_unavailable(reasoning)
          { matches: false, observed_intent: nil, reasoning: reasoning }
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
module RSpec
  module Agents
    module DSL
      # Describes a single criterion: a name, an optional description,
      # illustrative examples (good, bad, edge cases) and optional code-based
      # evaluation blocks. An optional definition block is evaluated in the
      # context of the new instance, so the methods below act as a small DSL.
      class CriterionDefinition
        attr_reader :name, :good_examples, :bad_examples, :edge_cases,
                    :match_block, :match_messages_block

        # @param name [Symbol, String] Converted to a Symbol
        # @param description [String, nil]
        # @yield Optional block, run via instance_eval on this definition
        def initialize(name, description: nil, &block)
          @name = name.to_sym
          @description = description
          @good_examples = []
          @bad_examples = []
          @edge_cases = []
          @match_block = @match_messages_block = nil
          instance_eval(&block) if block
        end

        # Reader and writer in one: with no argument (or nil) returns the
        # stored description, otherwise stores and returns the given text.
        # @param text [String, nil]
        # @return [String, nil]
        def description(text = nil)
          return @description if text.nil?

          @description = text
        end

        # Record an example that satisfies the criterion
        # @param text [String]
        # @param explanation [String]
        def good_example(text, explanation:)
          @good_examples.push({ text: text, explanation: explanation })
        end

        # Record an example that violates the criterion
        # @param text [String]
        # @param explanation [String]
        def bad_example(text, explanation:)
          @bad_examples.push({ text: text, explanation: explanation })
        end

        # Record a borderline example together with its expected verdict
        # @param text [String]
        # @param verdict [Boolean]
        # @param explanation [String]
        def edge_case(text, verdict:, explanation:)
          @edge_cases.push({ text: text, verdict: verdict, explanation: explanation })
        end

        # Store the code-based evaluation block
        # @yield [conversation]
        def match(&block)
          @match_block = block
        end

        # Store the message-based evaluation block
        # @yield [messages]
        def match_messages(&block)
          @match_messages_block = block
        end

        # Whether either evaluation block has been set
        # @return [Boolean]
        def code_based?
          [@match_block, @match_messages_block].any?
        end

        # Whether any good example, bad example or edge case has been recorded
        # @return [Boolean]
        def has_examples?
          [@good_examples, @bad_examples, @edge_cases].any?(&:any?)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
module RSpec
  module Agents
    module DSL
      # Assembles a TopicGraph from DSL calls. Topics are either wired in from
      # a set of shared topics (use_topic) or defined inline (topic); build
      # hands back the graph that was populated.
      class GraphBuilder
        # @param shared_topics [Hash<Symbol, Topic>]
        def initialize(shared_topics = {})
          @graph = TopicGraph.new
          @shared_topics = shared_topics
        end

        # Wire an existing shared topic into the graph.
        # Additional keyword options are accepted but not read by this method.
        # @param name [Symbol] Converted with to_sym before being passed on
        # @param next [Symbol, Array<Symbol>, nil]
        def use_topic(name, next: nil, **options)
          @graph.use_topic(
            name.to_sym,
            # `next` is a reserved word, so the keyword argument can only be
            # read through the binding.
            next_topics: binding.local_variable_get(:next),
            shared_topics: @shared_topics
          )
        end

        # Define a topic inline and add it to the graph
        # @param name [Symbol]
        # @param next [Symbol, Array<Symbol>, nil]
        # @yield Block for topic definition, forwarded to Topic.new
        def topic(name, next: nil, &block)
          # `next` is a reserved word; read the keyword through the binding.
          successors = binding.local_variable_get(:next)
          @graph.add_topic(Topic.new(name, &block), next_topics: successors)
        end

        # Return the TopicGraph populated so far
        # @return [TopicGraph]
        def build
          @graph
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
module RSpec
  module Agents
    module DSL
      # Creates runner objects and wires them to the collaborators supplied by
      # a test context (agent, conversation, topic graph, LLM, judge, event
      # bus), so callers do not assemble those dependencies themselves.
      class RunnerFactory
        # @param context [TestContext]
        def initialize(context)
          @context = context
        end

        # Build a turn executor for step-by-step test conversations
        # @return [TurnExecutor]
        def build_turn_executor
          dependencies = {
            agent: @context.build_agent,
            conversation: @context.conversation,
            graph: @context.topic_graph,
            judge: fresh_judge,
            event_bus: @context.event_bus
          }
          TurnExecutor.new(**dependencies)
        end

        # Build an agent proxy for assertions
        # @param turn_executor [TurnExecutor]
        # @return [AgentProxy]
        def build_agent_proxy(turn_executor)
          AgentProxy.new(turn_executor: turn_executor, judge: fresh_judge)
        end

        # Build a user simulator for LLM-driven conversations.
        # The LLM requested from the context here is handed both to the
        # simulator and to the judge it receives.
        # @param simulator_config [SimulatorConfig]
        # @return [Runners::UserSimulator]
        def build_user_simulator(simulator_config)
          shared_llm = @context.build_llm
          dependencies = {
            agent: @context.build_agent,
            llm: shared_llm,
            judge: @context.build_judge(shared_llm),
            graph: @context.topic_graph,
            simulator_config: simulator_config,
            event_bus: @context.event_bus,
            conversation: @context.conversation
          }
          Runners::UserSimulator.new(**dependencies)
        end

        private

        # Ask the context for a judge built on an LLM requested for this call.
        def fresh_judge
          @context.build_judge(@context.build_llm)
        end
      end
    end
  end
end
|