rspec-agents 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. checksums.yaml +7 -0
  2. data/bin/rspec-agents +24 -0
  3. data/lib/async_workers/channel_config.rb +34 -0
  4. data/lib/async_workers/doc/process_manager_design.md +512 -0
  5. data/lib/async_workers/errors.rb +21 -0
  6. data/lib/async_workers/managed_process.rb +284 -0
  7. data/lib/async_workers/output_stream.rb +86 -0
  8. data/lib/async_workers/rpc_channel.rb +159 -0
  9. data/lib/async_workers/transport/base.rb +57 -0
  10. data/lib/async_workers/transport/stdio_transport.rb +91 -0
  11. data/lib/async_workers/transport/unix_socket_transport.rb +112 -0
  12. data/lib/async_workers/worker_group.rb +175 -0
  13. data/lib/async_workers.rb +17 -0
  14. data/lib/rspec/agents/agent_response.rb +61 -0
  15. data/lib/rspec/agents/agents/base.rb +123 -0
  16. data/lib/rspec/agents/cli.rb +342 -0
  17. data/lib/rspec/agents/conversation.rb +308 -0
  18. data/lib/rspec/agents/criterion.rb +237 -0
  19. data/lib/rspec/agents/doc/2026_01_22_observer-system-design.md +757 -0
  20. data/lib/rspec/agents/doc/2026_01_23_parallel_spec_runner-design.md +1060 -0
  21. data/lib/rspec/agents/doc/2026_01_27_event_serialization-design.md +294 -0
  22. data/lib/rspec/agents/doc/2026_01_27_experiment_aggregation_design.md +831 -0
  23. data/lib/rspec/agents/doc/2026_01_29_rspec-agents-studio-design.md +1332 -0
  24. data/lib/rspec/agents/doc/2026_01_29_testing-framework-design.md +1037 -0
  25. data/lib/rspec/agents/doc/2026_02_04-parallel-runner-ui.md +537 -0
  26. data/lib/rspec/agents/doc/2026_02_05_html_renderer_extensions.md +708 -0
  27. data/lib/rspec/agents/doc/scenario_guide.md +289 -0
  28. data/lib/rspec/agents/dsl/agent_proxy.rb +141 -0
  29. data/lib/rspec/agents/dsl/criterion_definition.rb +78 -0
  30. data/lib/rspec/agents/dsl/graph_builder.rb +38 -0
  31. data/lib/rspec/agents/dsl/runner_factory.rb +52 -0
  32. data/lib/rspec/agents/dsl/scenario_set_dsl.rb +166 -0
  33. data/lib/rspec/agents/dsl/test_context.rb +223 -0
  34. data/lib/rspec/agents/dsl/user_proxy.rb +71 -0
  35. data/lib/rspec/agents/dsl.rb +398 -0
  36. data/lib/rspec/agents/evaluation_result.rb +44 -0
  37. data/lib/rspec/agents/event_bus.rb +78 -0
  38. data/lib/rspec/agents/events.rb +141 -0
  39. data/lib/rspec/agents/isolated_event_bus.rb +86 -0
  40. data/lib/rspec/agents/judge.rb +244 -0
  41. data/lib/rspec/agents/llm/anthropic.rb +143 -0
  42. data/lib/rspec/agents/llm/base.rb +64 -0
  43. data/lib/rspec/agents/llm/mock.rb +181 -0
  44. data/lib/rspec/agents/llm/response.rb +52 -0
  45. data/lib/rspec/agents/matchers.rb +554 -0
  46. data/lib/rspec/agents/message.rb +81 -0
  47. data/lib/rspec/agents/metadata.rb +120 -0
  48. data/lib/rspec/agents/observers/base.rb +70 -0
  49. data/lib/rspec/agents/observers/parallel_terminal_observer.rb +151 -0
  50. data/lib/rspec/agents/observers/rpc_notify_observer.rb +43 -0
  51. data/lib/rspec/agents/observers/terminal_observer.rb +103 -0
  52. data/lib/rspec/agents/parallel/controller.rb +284 -0
  53. data/lib/rspec/agents/parallel/example_discovery.rb +153 -0
  54. data/lib/rspec/agents/parallel/partitioner.rb +31 -0
  55. data/lib/rspec/agents/parallel/run_result.rb +22 -0
  56. data/lib/rspec/agents/parallel/ui/interactive_ui.rb +605 -0
  57. data/lib/rspec/agents/parallel/ui/interleaved_ui.rb +139 -0
  58. data/lib/rspec/agents/parallel/ui/output_adapter.rb +127 -0
  59. data/lib/rspec/agents/parallel/ui/quiet_ui.rb +100 -0
  60. data/lib/rspec/agents/parallel/ui/ui_factory.rb +53 -0
  61. data/lib/rspec/agents/parallel/ui/ui_mode.rb +101 -0
  62. data/lib/rspec/agents/prompt_builders/base.rb +113 -0
  63. data/lib/rspec/agents/prompt_builders/criterion_evaluation.rb +136 -0
  64. data/lib/rspec/agents/prompt_builders/goal_achievement_evaluation.rb +142 -0
  65. data/lib/rspec/agents/prompt_builders/grounding_evaluation.rb +172 -0
  66. data/lib/rspec/agents/prompt_builders/intent_evaluation.rb +111 -0
  67. data/lib/rspec/agents/prompt_builders/topic_classification.rb +105 -0
  68. data/lib/rspec/agents/prompt_builders/user_simulation.rb +131 -0
  69. data/lib/rspec/agents/runners/headless_runner.rb +272 -0
  70. data/lib/rspec/agents/runners/parallel_terminal_runner.rb +220 -0
  71. data/lib/rspec/agents/runners/terminal_runner.rb +186 -0
  72. data/lib/rspec/agents/runners/user_simulator.rb +261 -0
  73. data/lib/rspec/agents/scenario.rb +133 -0
  74. data/lib/rspec/agents/scenario_loader.rb +145 -0
  75. data/lib/rspec/agents/serialization/conversation_renderer.rb +161 -0
  76. data/lib/rspec/agents/serialization/extension.rb +199 -0
  77. data/lib/rspec/agents/serialization/extensions/core_extension.rb +66 -0
  78. data/lib/rspec/agents/serialization/presenters.rb +281 -0
  79. data/lib/rspec/agents/serialization/run_data_aggregator.rb +197 -0
  80. data/lib/rspec/agents/serialization/run_data_builder.rb +189 -0
  81. data/lib/rspec/agents/serialization/templates/_alpine.min.js +5 -0
  82. data/lib/rspec/agents/serialization/templates/_base_components.css +196 -0
  83. data/lib/rspec/agents/serialization/templates/_base_components.js +46 -0
  84. data/lib/rspec/agents/serialization/templates/_conversation_fragment.html.haml +34 -0
  85. data/lib/rspec/agents/serialization/templates/_metadata_default.html.haml +17 -0
  86. data/lib/rspec/agents/serialization/templates/_scripts.js +89 -0
  87. data/lib/rspec/agents/serialization/templates/_styles.css +1211 -0
  88. data/lib/rspec/agents/serialization/templates/conversation_document.html.haml +29 -0
  89. data/lib/rspec/agents/serialization/templates/test_suite.html.haml +238 -0
  90. data/lib/rspec/agents/serialization/test_suite_renderer.rb +207 -0
  91. data/lib/rspec/agents/serialization.rb +374 -0
  92. data/lib/rspec/agents/simulator_config.rb +336 -0
  93. data/lib/rspec/agents/spec_executor.rb +494 -0
  94. data/lib/rspec/agents/stable_example_id.rb +147 -0
  95. data/lib/rspec/agents/templates/user_simulation.erb +9 -0
  96. data/lib/rspec/agents/tool_call.rb +53 -0
  97. data/lib/rspec/agents/topic.rb +307 -0
  98. data/lib/rspec/agents/topic_graph.rb +236 -0
  99. data/lib/rspec/agents/triggers.rb +122 -0
  100. data/lib/rspec/agents/turn.rb +63 -0
  101. data/lib/rspec/agents/turn_executor.rb +91 -0
  102. data/lib/rspec/agents/version.rb +7 -0
  103. data/lib/rspec/agents.rb +145 -0
  104. metadata +242 -0
@@ -0,0 +1,289 @@
1
+ # RSpec Agents Scenario Guide
2
+
3
+ This guide provides practical examples for writing agent tests using the rspec-agents DSL.
4
+
5
+ ## Quick Reference
6
+
7
+ ### Criteria Types
8
+
9
+ ```ruby
10
+ # 1. Named criterion (defined at describe/context level)
11
+ criterion :friendly, "The agent's response should be friendly"
12
+ expect(agent).to satisfy(:friendly)
13
+
14
+ # 2. Adhoc criterion (inline string description, evaluated by LLM)
15
+ expect(agent).to satisfy("Der Agent fragt nach Details oder zeigt das Formular an")
16
+
17
+ # 3. Lambda criterion (code-based, no LLM)
18
+ expect(agent).to satisfy(->(turn) { turn.agent_response.text.length <= 500 })
19
+
20
+ # 4. Named lambda criterion
21
+ expect(agent).to satisfy(:concise, ->(turn) { turn.agent_response.text.length <= 300 })
22
+ ```
23
+
24
+ ### Soft vs Hard Assertions
25
+
26
+ ```ruby
27
+ # Soft: records the result and continues even if the criterion is not met
28
+ evaluate(agent).to satisfy(:friendly)
29
+
30
+ # Hard: test fails immediately if not met
31
+ expect(agent).to satisfy(:friendly)
32
+ ```
33
+
34
+ ---
35
+
36
+ ## Common Patterns
37
+
38
+ ### Basic Scripted Conversation
39
+
40
+ ```ruby
41
+ RSpec.describe "Room Booking Agent", type: :agent do
42
+ criterion :friendly, "The agent's response should be friendly"
43
+
44
+ it "handles booking request" do
45
+ user.says "Hi, I need to book a meeting room"
46
+
47
+ evaluate(agent).to satisfy(:friendly)
48
+ expect(agent).not_to call_tool(:book_room)
49
+
50
+ user.says "The Blue Room for tomorrow at 2pm"
51
+
52
+ expect(agent).to call_tool(:check_availability)
53
+ end
54
+ end
55
+ ```
56
+
57
+ ### Using Adhoc Criteria
58
+
59
+ Adhoc criteria are useful when you need a one-off evaluation without defining a named criterion. The string description is passed directly to the LLM judge.
60
+
61
+ ```ruby
62
+ it "handles form display" do
63
+ user.says "I want to book a room"
64
+
65
+ # German description - LLM evaluates naturally
66
+ expect(agent).to satisfy("Der Agent fragt nach Details oder zeigt das Formular an")
67
+
68
+ # English description
69
+ evaluate(agent).to satisfy("The agent acknowledges the request")
70
+
71
+ # Mix named and adhoc
72
+ evaluate(agent).to satisfy(:friendly)
73
+ expect(agent).to satisfy("Response contains actionable next steps")
74
+ end
75
+ ```
76
+
77
+ ### Combining Criterion Types
78
+
79
+ ```ruby
80
+ it "validates response quality" do
81
+ user.says "Search for venues in Stuttgart"
82
+
83
+ # Named criterion
84
+ evaluate(agent).to satisfy(:friendly)
85
+
86
+ # Adhoc criterion for specific requirement
87
+ expect(agent).to satisfy("The agent confirms the search location")
88
+
89
+ # Lambda for deterministic checks
90
+ expect(agent).to satisfy(->(turn) {
91
+ turn.agent_response.tool_calls.any? { |tc| tc.name == :search_venues }
92
+ })
93
+
94
+ # Multiple in one call
95
+ evaluate(agent).to satisfy(
96
+ :helpful,
97
+ "Response is professional",
98
+ ->(turn) { turn.agent_response.text.length < 1000 }
99
+ )
100
+ end
101
+ ```
102
+
103
+ ### Scenario-Based Testing (from JSON file)
104
+
105
+ ```ruby
106
+ RSpec.describe "Event Booking Agent", type: :agent do
107
+ criterion :friendly, "The agent's response should be friendly"
108
+ criterion :helpful, "The agent should move toward the user's goal"
109
+
110
+ scenario_set "venue_searches", from: "scenarios/venue_search.json" do |scenario|
111
+ it "handles #{scenario[:name]}" do
112
+ user.simulate do
113
+ goal scenario[:goal]
114
+ personality scenario[:personality]
115
+ end
116
+
117
+ # Adhoc criterion using scenario data
118
+ expect(agent).to satisfy("The agent addresses: #{scenario[:goal]}")
119
+ end
120
+ end
121
+ end
122
+ ```
123
+
124
+ ### Scenario-Based Testing (inline array)
125
+
126
+ Scenarios can be defined directly in the test file using `scenarios:` instead of `from:`. This is useful for self-contained tests or dynamically generated scenarios.
127
+
128
+ ```ruby
129
+ RSpec.describe "Event Booking Agent", type: :agent do
130
+ criterion :friendly, "The agent's response should be friendly"
131
+ criterion :helpful, "The agent should move toward the user's goal"
132
+
133
+ scenarios = [
134
+ {
135
+ id: "weihnachtsfeier",
136
+ name: "Unternehmens-Weihnachtsfeier",
137
+ goal: "Eine festliche Weihnachtsfeier für 60 Mitarbeiter in Berlin organisieren",
138
+ personality: "Festlich gestimmt, achtet auf Details wie Dekoration und Menü",
139
+ context: ["Dezember-Termin", "Abendveranstaltung mit Dinner", "Unterhaltungsprogramm gewünscht"]
140
+ },
141
+ {
142
+ id: "produktlaunch",
143
+ name: "Produktlaunch-Event",
144
+ goal: "Ein Produktlaunch-Event für 100 Gäste in Hamburg organisieren",
145
+ personality: "Marketing-orientiert, achtet auf Präsentationstechnik und Impression",
146
+ context: ["Presse und Kunden eingeladen", "Moderne Location gewünscht", "Catering wichtig"]
147
+ },
148
+ {
149
+ id: "vertriebstagung",
150
+ name: "Vertriebstagung",
151
+ goal: "Eine zweitägige Vertriebstagung für 50 Außendienstmitarbeiter in Düsseldorf",
152
+ personality: "Ergebnisorientiert, fokussiert auf Motivation und Schulung",
153
+ context: ["Motivationstraining geplant", "Award-Verleihung am Abend", "Networking wichtig"]
154
+ },
155
+ {
156
+ id: "klausurtagung",
157
+ name: "Management-Klausurtagung",
158
+ goal: "Eine vertrauliche Klausurtagung für 12 Führungskräfte in einem ruhigen Hotel im Schwarzwald",
159
+ personality: "Diskret, bevorzugt ruhige und abgeschiedene Locations",
160
+ context: ["Strategische Themen", "Absolute Vertraulichkeit", "Keine Ablenkungen"]
161
+ },
162
+ {
163
+ id: "azubi_onboarding",
164
+ name: "Azubi-Onboarding",
165
+ goal: "Ein Onboarding-Event für 20 neue Auszubildende in der Nähe von München",
166
+ personality: "Jugendlich, achtet auf abwechslungsreiches Programm",
167
+ context: ["Junge Teilnehmer", "Teambuilding-Aktivitäten wichtig", "Lockere Atmosphäre"]
168
+ }
169
+ ]
170
+
171
+ scenario_set "corporate_events", scenarios: scenarios do |scenario|
172
+ it "handles #{scenario[:name]}" do
173
+ user.simulate do
174
+ goal scenario[:goal]
175
+ personality scenario[:personality]
176
+ context { scenario[:context].each { |c| note c } }
177
+ end
178
+
179
+ evaluate(agent).to satisfy(:friendly)
180
+ evaluate(agent).to satisfy(:helpful)
181
+ end
182
+ end
183
+ end
184
+ ```
185
+
186
+ **When to use each approach:**
187
+
188
+ | Approach | Use When |
189
+ |----------|----------|
190
+ | `from: "file.json"` | Scenarios shared across multiple test files |
191
+ | `scenarios: [...]` | Self-contained tests, dynamically generated scenarios |
192
+
193
+ ---
194
+
195
+ ## When to Use Each Criterion Type
196
+
197
+ | Type | Use When | Example |
198
+ |------|----------|---------|
199
+ | Named | Reused across multiple tests | `:friendly`, `:helpful` |
200
+ | Adhoc | One-off, test-specific requirement | `"Der Agent zeigt das Formular an"` |
201
+ | Lambda | Deterministic, no LLM needed | `->(turn) { turn.agent_response.text.length < 500 }` |
202
+ | Named Lambda | Reusable code check with a name | `:concise, ->(turn) { ... }` |
203
+
204
+ ### Adhoc Criterion Best Practices
205
+
206
+ 1. **Use natural language**: Write descriptions as you would explain to a human
207
+ 2. **Be specific**: "The agent asks for the event date" is better than "Agent asks questions"
208
+ 3. **Language flexibility**: Use any language - the LLM judge handles German, English, etc.
209
+ 4. **Mix with named criteria**: Use adhoc for edge cases, named for common patterns
210
+
211
+ ```ruby
212
+ # Good: specific, actionable
213
+ expect(agent).to satisfy("The agent provides at least 3 venue options")
214
+ expect(agent).to satisfy("Der Agent fragt nach dem Budget")
215
+
216
+ # Avoid: too vague
217
+ expect(agent).to satisfy("Good response")
218
+ expect(agent).to satisfy("Agent works correctly")
219
+ ```
220
+
221
+ ---
222
+
223
+ ## Tool Call Assertions
224
+
225
+ ```ruby
226
+ # Basic tool call check
227
+ expect(agent).to call_tool(:search_venues)
228
+
229
+ # With parameters
230
+ expect(agent).to call_tool(:book_room).with(
231
+ room: "Blue Room",
232
+ capacity: be >= 10
233
+ )
234
+
235
+ # Negated
236
+ expect(agent).not_to call_tool(:book_room)
237
+
238
+ # Conversation-level (across all turns)
239
+ expect(conversation).to have_tool_call(:search_venues)
240
+ ```
241
+
242
+ ---
243
+
244
+ ## Grounding Assertions
245
+
246
+ ```ruby
247
+ # Verify claims are grounded in tool results
248
+ expect(agent).to be_grounded_in(:venues, :pricing)
249
+
250
+ # Specify source tools
251
+ expect(agent).to be_grounded_in(:venues, from_tools: [:search_venues])
252
+
253
+ # Forbid ungrounded claims
254
+ expect(agent).not_to claim(:availability)
255
+ ```
256
+
257
+ ---
258
+
259
+ ## Topic Tracking
260
+
261
+ **No self-loops:** A topic cannot list itself in `next:`. The tracker stays in a topic until a transition occurs naturally.
262
+
263
+ ```ruby
264
+ it "progresses through booking flow" do
265
+ expect_conversation_to do
266
+ use_topic :greeting, next: :gathering_details
267
+ use_topic :gathering_details, next: :confirming
268
+ use_topic :confirming
269
+ end
270
+
271
+ user.says "Hello!"
272
+ expect(agent).to be_in_topic(:greeting)
273
+
274
+ user.says "I need a room for 10 people tomorrow"
275
+ expect(agent).to be_in_topic(:gathering_details)
276
+ end
277
+ ```
278
+
279
+ ---
280
+
281
+ ## Goal Achievement
282
+
283
+ ```ruby
284
+ # Check stated goal (from simulator config)
285
+ expect(agent).to have_achieved_stated_goal
286
+
287
+ # Check custom goal description
288
+ expect(agent).to have_achieved_goal("User received venue options under budget")
289
+ ```
@@ -0,0 +1,141 @@
1
module RSpec
  module Agents
    module DSL
      # Proxy for agent state inspection in scripted tests.
      #
      # Wraps a turn executor and exposes read-only accessors for its current
      # state (conversation, response, topic, turn) plus assertion helpers.
      # Helpers that need an LLM judge return a fixed fallback hash when no
      # judge was supplied or when there is no current turn.
      class AgentProxy
        # @param turn_executor [TurnExecutor] Source of conversation/turn state
        # @param judge [Judge, nil] Optional judge for LLM-based assertions
        def initialize(turn_executor:, judge: nil)
          @turn_executor = turn_executor
          @judge = judge
        end

        # Get the turn executor (runner)
        # @return [TurnExecutor]
        def runner
          @turn_executor
        end

        # Get the conversation (delegates to turn executor)
        # @return [Conversation]
        def conversation
          @turn_executor.conversation
        end

        # Get the current response object
        # @return [AgentResponse, nil]
        def response
          @turn_executor.current_response
        end

        # Get the response text
        # @return [String, nil] nil when there is no current response
        def last_response
          response&.text
        end

        # Get the current topic
        # @return [Symbol, nil]
        def current_topic
          @turn_executor.current_topic
        end

        # Get the current turn
        # @return [Turn, nil]
        def current_turn
          @turn_executor.current_turn
        end

        # Get tool calls from current response
        # @return [Array<ToolCall>] empty when there is no current response
        def tool_calls
          response&.tool_calls || []
        end

        # Check if agent called a specific tool
        # @param name [Symbol, String]
        # @param params [Hash, nil]
        # @return [Boolean] false when there is no current response
        def called_tool?(name, params: nil)
          response&.has_tool_call?(name, params: params) || false
        end

        # Check if in a specific topic
        # @param topic_name [Symbol]
        # @return [Boolean]
        def in_topic?(topic_name)
          @turn_executor.in_topic?(topic_name)
        end

        # Check if current response matches a pattern
        #
        # @param pattern [Regexp] Pattern to match
        # @return [Boolean] false when there is no current response
        def response_matches?(pattern)
          response&.match?(pattern) || false
        end

        # Evaluate a criterion against the current turn.
        # Anything that is not already a Criterion is normalized through
        # Criterion.from before evaluation.
        #
        # @param criterion [Symbol, String, Proc, Criterion] Criterion to evaluate
        # @return [Hash] { satisfied: Boolean, reasoning: String }
        def evaluate_criterion(criterion)
          turn = current_turn
          return { satisfied: false, reasoning: "No turn to evaluate" } unless turn

          # Normalize to Criterion object
          criterion_obj = criterion.is_a?(Criterion) ? criterion : Criterion.from(criterion)

          criterion_obj.evaluate(
            turn: turn,
            judge: @judge,
            conversation: conversation,
            # Without a judge there is no registry to consult; pass an empty one.
            criteria_registry: @judge&.criteria || {}
          )
        end

        # Check if response is grounded in tool results
        #
        # @param claim_types [Array<Symbol>] Claim types to verify
        # @param from_tools [Array<Symbol>] Tool names that should provide grounding
        # @return [Hash] { grounded: Boolean, violations: Array }
        def check_grounding(claim_types, from_tools: [])
          # No judge configured: the check is reported as grounded with no violations.
          return { grounded: true, violations: [] } unless @judge

          turn = current_turn
          return { grounded: false, violations: ["No turn to evaluate"] } unless turn

          @judge.evaluate_grounding(claim_types, [turn], from_tools: from_tools)
        end

        # Check for forbidden claims
        #
        # @param claim_types [Array<Symbol>] Claim types that are forbidden
        # @return [Hash] { violated: Boolean, claims_found: Array }
        def check_forbidden_claims(claim_types)
          # Neither a missing judge nor a missing turn is reported as a violation.
          return { violated: false, claims_found: [] } unless @judge

          turn = current_turn
          return { violated: false, claims_found: [] } unless turn

          @judge.evaluate_forbidden_claims(claim_types, [turn])
        end

        # Check if agent demonstrates expected intent
        #
        # The fallback results (no judge / no turn) carry every documented key,
        # with observed_intent set to nil, so callers can read the hash
        # uniformly regardless of which path produced it.
        #
        # @param intent_description [String] Description of expected intent
        # @return [Hash] { matches: Boolean, observed_intent: String, reasoning: String }
        def check_intent(intent_description)
          return unevaluated_intent("No judge configured") unless @judge

          turn = current_turn
          return unevaluated_intent("No turn to evaluate") unless turn

          @judge.evaluate_intent(intent_description, turn)
        end

        private

        # Build the result returned when intent could not be evaluated at all.
        # @param reasoning [String] Why no evaluation took place
        # @return [Hash] { matches: false, observed_intent: nil, reasoning: String }
        def unevaluated_intent(reasoning)
          { matches: false, observed_intent: nil, reasoning: reasoning }
        end
      end
    end
  end
end
@@ -0,0 +1,78 @@
1
module RSpec
  module Agents
    module DSL
      # Describes a single criterion: its name, an optional description,
      # optional good/bad/edge-case examples, and optional code-based
      # matcher blocks. An optional block is instance_eval'd on the new
      # object so the methods below can be used as a small DSL.
      class CriterionDefinition
        attr_reader :name, :good_examples, :bad_examples, :edge_cases
        attr_reader :match_block, :match_messages_block

        # @param name [Symbol, String] Converted to a Symbol
        # @param description [String, nil]
        # @yield Optional block, evaluated in the context of this instance
        def initialize(name, description: nil, &block)
          @name = name.to_sym
          @description = description
          @good_examples = []
          @bad_examples = []
          @edge_cases = []
          @match_block = nil
          @match_messages_block = nil
          instance_eval(&block) if block
        end

        # Combined reader/writer: with no argument (or nil) returns the
        # current description; with a value stores and returns it.
        # @param text [String, nil]
        # @return [String, nil]
        def description(text = nil)
          return @description if text.nil?

          @description = text
        end

        # Record an example that satisfies the criterion
        # @param text [String]
        # @param explanation [String]
        def good_example(text, explanation:)
          entry = { text: text, explanation: explanation }
          @good_examples << entry
        end

        # Record an example that violates the criterion
        # @param text [String]
        # @param explanation [String]
        def bad_example(text, explanation:)
          entry = { text: text, explanation: explanation }
          @bad_examples << entry
        end

        # Record a borderline example together with its expected verdict
        # @param text [String]
        # @param verdict [Boolean]
        # @param explanation [String]
        def edge_case(text, verdict:, explanation:)
          entry = { text: text, verdict: verdict, explanation: explanation }
          @edge_cases << entry
        end

        # Store the code-based evaluation block
        # @yield [conversation]
        def match(&block)
          @match_block = block
        end

        # Store the message-based evaluation block
        # @yield [messages]
        def match_messages(&block)
          @match_messages_block = block
        end

        # Whether either evaluation block has been set
        # @return [Boolean]
        def code_based?
          (@match_block || @match_messages_block) ? true : false
        end

        # Whether any good, bad, or edge-case example has been recorded
        # @return [Boolean]
        def has_examples?
          [@good_examples, @bad_examples, @edge_cases].any?(&:any?)
        end
      end
    end
  end
end
@@ -0,0 +1,38 @@
1
module RSpec
  module Agents
    module DSL
      # Collects DSL calls into a TopicGraph instance that is created up
      # front and handed back by #build.
      class GraphBuilder
        # @param shared_topics [Hash<Symbol, Topic>] Topics available to #use_topic
        def initialize(shared_topics = {})
          @graph = TopicGraph.new
          @shared_topics = shared_topics
        end

        # Wire an existing shared topic into the graph.
        # Any additional keyword options are accepted and ignored.
        # @param name [Symbol] Converted with #to_sym before being passed on
        # @param next [Symbol, Array<Symbol>, nil]
        def use_topic(name, next: nil, **_options)
          # `next` is a reserved word, so the keyword's value has to be read via the binding.
          successors = binding.local_variable_get(:next)
          @graph.use_topic(name.to_sym, next_topics: successors, shared_topics: @shared_topics)
        end

        # Define an inline topic from the block and add it to the graph
        # @param name [Symbol]
        # @param next [Symbol, Array<Symbol>, nil]
        # @yield Block for topic definition, forwarded to Topic.new
        def topic(name, next: nil, &block)
          # `next` is a reserved word, so the keyword's value has to be read via the binding.
          successors = binding.local_variable_get(:next)
          @graph.add_topic(Topic.new(name, &block), next_topics: successors)
        end

        # Return the graph assembled so far
        # @return [TopicGraph]
        def build
          @graph
        end
      end
    end
  end
end
@@ -0,0 +1,52 @@
1
module RSpec
  module Agents
    module DSL
      # Assembles runner-side objects (turn executor, agent proxy, user
      # simulator) from the pieces a test context exposes, so callers do
      # not have to wire the dependencies themselves.
      class RunnerFactory
        # @param context [TestContext] Supplies agent, LLM, judge, graph, event bus and conversation
        def initialize(context)
          @context = context
        end

        # Build a turn executor for step-by-step test conversations
        # @return [TurnExecutor]
        def build_turn_executor
          dependencies = {
            agent: @context.build_agent,
            conversation: @context.conversation,
            graph: @context.topic_graph,
            judge: judge_with_new_llm,
            event_bus: @context.event_bus
          }
          TurnExecutor.new(**dependencies)
        end

        # Build an agent proxy for assertions
        # @param turn_executor [TurnExecutor]
        # @return [AgentProxy]
        def build_agent_proxy(turn_executor)
          dependencies = {
            turn_executor: turn_executor,
            judge: judge_with_new_llm
          }
          AgentProxy.new(**dependencies)
        end

        # Build a user simulator for LLM-driven conversations.
        # The simulator and its judge are given the same LLM object.
        # @param simulator_config [SimulatorConfig]
        # @return [Runners::UserSimulator]
        def build_user_simulator(simulator_config)
          shared_llm = @context.build_llm
          dependencies = {
            agent: @context.build_agent,
            llm: shared_llm,
            judge: @context.build_judge(shared_llm),
            graph: @context.topic_graph,
            simulator_config: simulator_config,
            event_bus: @context.event_bus,
            conversation: @context.conversation
          }
          Runners::UserSimulator.new(**dependencies)
        end

        private

        # Ask the context for an LLM and wrap it in a judge.
        # @return [Judge]
        def judge_with_new_llm
          @context.build_judge(@context.build_llm)
        end
      end
    end
  end
end