tactus 0.31.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tactus/__init__.py +49 -0
- tactus/adapters/__init__.py +9 -0
- tactus/adapters/broker_log.py +76 -0
- tactus/adapters/cli_hitl.py +189 -0
- tactus/adapters/cli_log.py +223 -0
- tactus/adapters/cost_collector_log.py +56 -0
- tactus/adapters/file_storage.py +367 -0
- tactus/adapters/http_callback_log.py +109 -0
- tactus/adapters/ide_log.py +71 -0
- tactus/adapters/lua_tools.py +336 -0
- tactus/adapters/mcp.py +289 -0
- tactus/adapters/mcp_manager.py +196 -0
- tactus/adapters/memory.py +53 -0
- tactus/adapters/plugins.py +419 -0
- tactus/backends/http_backend.py +58 -0
- tactus/backends/model_backend.py +35 -0
- tactus/backends/pytorch_backend.py +110 -0
- tactus/broker/__init__.py +12 -0
- tactus/broker/client.py +247 -0
- tactus/broker/protocol.py +183 -0
- tactus/broker/server.py +1123 -0
- tactus/broker/stdio.py +12 -0
- tactus/cli/__init__.py +7 -0
- tactus/cli/app.py +2245 -0
- tactus/cli/commands/__init__.py +0 -0
- tactus/core/__init__.py +32 -0
- tactus/core/config_manager.py +790 -0
- tactus/core/dependencies/__init__.py +14 -0
- tactus/core/dependencies/registry.py +180 -0
- tactus/core/dsl_stubs.py +2117 -0
- tactus/core/exceptions.py +66 -0
- tactus/core/execution_context.py +480 -0
- tactus/core/lua_sandbox.py +508 -0
- tactus/core/message_history_manager.py +236 -0
- tactus/core/mocking.py +286 -0
- tactus/core/output_validator.py +291 -0
- tactus/core/registry.py +499 -0
- tactus/core/runtime.py +2907 -0
- tactus/core/template_resolver.py +142 -0
- tactus/core/yaml_parser.py +301 -0
- tactus/docker/Dockerfile +61 -0
- tactus/docker/entrypoint.sh +69 -0
- tactus/dspy/__init__.py +39 -0
- tactus/dspy/agent.py +1144 -0
- tactus/dspy/broker_lm.py +181 -0
- tactus/dspy/config.py +212 -0
- tactus/dspy/history.py +196 -0
- tactus/dspy/module.py +405 -0
- tactus/dspy/prediction.py +318 -0
- tactus/dspy/signature.py +185 -0
- tactus/formatting/__init__.py +7 -0
- tactus/formatting/formatter.py +437 -0
- tactus/ide/__init__.py +9 -0
- tactus/ide/coding_assistant.py +343 -0
- tactus/ide/server.py +2223 -0
- tactus/primitives/__init__.py +49 -0
- tactus/primitives/control.py +168 -0
- tactus/primitives/file.py +229 -0
- tactus/primitives/handles.py +378 -0
- tactus/primitives/host.py +94 -0
- tactus/primitives/human.py +342 -0
- tactus/primitives/json.py +189 -0
- tactus/primitives/log.py +187 -0
- tactus/primitives/message_history.py +157 -0
- tactus/primitives/model.py +163 -0
- tactus/primitives/procedure.py +564 -0
- tactus/primitives/procedure_callable.py +318 -0
- tactus/primitives/retry.py +155 -0
- tactus/primitives/session.py +152 -0
- tactus/primitives/state.py +182 -0
- tactus/primitives/step.py +209 -0
- tactus/primitives/system.py +93 -0
- tactus/primitives/tool.py +375 -0
- tactus/primitives/tool_handle.py +279 -0
- tactus/primitives/toolset.py +229 -0
- tactus/protocols/__init__.py +38 -0
- tactus/protocols/chat_recorder.py +81 -0
- tactus/protocols/config.py +97 -0
- tactus/protocols/cost.py +31 -0
- tactus/protocols/hitl.py +71 -0
- tactus/protocols/log_handler.py +27 -0
- tactus/protocols/models.py +355 -0
- tactus/protocols/result.py +33 -0
- tactus/protocols/storage.py +90 -0
- tactus/providers/__init__.py +13 -0
- tactus/providers/base.py +92 -0
- tactus/providers/bedrock.py +117 -0
- tactus/providers/google.py +105 -0
- tactus/providers/openai.py +98 -0
- tactus/sandbox/__init__.py +63 -0
- tactus/sandbox/config.py +171 -0
- tactus/sandbox/container_runner.py +1099 -0
- tactus/sandbox/docker_manager.py +433 -0
- tactus/sandbox/entrypoint.py +227 -0
- tactus/sandbox/protocol.py +213 -0
- tactus/stdlib/__init__.py +10 -0
- tactus/stdlib/io/__init__.py +13 -0
- tactus/stdlib/io/csv.py +88 -0
- tactus/stdlib/io/excel.py +136 -0
- tactus/stdlib/io/file.py +90 -0
- tactus/stdlib/io/fs.py +154 -0
- tactus/stdlib/io/hdf5.py +121 -0
- tactus/stdlib/io/json.py +109 -0
- tactus/stdlib/io/parquet.py +83 -0
- tactus/stdlib/io/tsv.py +88 -0
- tactus/stdlib/loader.py +274 -0
- tactus/stdlib/tac/tactus/tools/done.tac +33 -0
- tactus/stdlib/tac/tactus/tools/log.tac +50 -0
- tactus/testing/README.md +273 -0
- tactus/testing/__init__.py +61 -0
- tactus/testing/behave_integration.py +380 -0
- tactus/testing/context.py +486 -0
- tactus/testing/eval_models.py +114 -0
- tactus/testing/evaluation_runner.py +222 -0
- tactus/testing/evaluators.py +634 -0
- tactus/testing/events.py +94 -0
- tactus/testing/gherkin_parser.py +134 -0
- tactus/testing/mock_agent.py +315 -0
- tactus/testing/mock_dependencies.py +234 -0
- tactus/testing/mock_hitl.py +171 -0
- tactus/testing/mock_registry.py +168 -0
- tactus/testing/mock_tools.py +133 -0
- tactus/testing/models.py +115 -0
- tactus/testing/pydantic_eval_runner.py +508 -0
- tactus/testing/steps/__init__.py +13 -0
- tactus/testing/steps/builtin.py +902 -0
- tactus/testing/steps/custom.py +69 -0
- tactus/testing/steps/registry.py +68 -0
- tactus/testing/test_runner.py +489 -0
- tactus/tracing/__init__.py +5 -0
- tactus/tracing/trace_manager.py +417 -0
- tactus/utils/__init__.py +1 -0
- tactus/utils/cost_calculator.py +72 -0
- tactus/utils/model_pricing.py +132 -0
- tactus/utils/safe_file_library.py +502 -0
- tactus/utils/safe_libraries.py +234 -0
- tactus/validation/LuaLexerBase.py +66 -0
- tactus/validation/LuaParserBase.py +23 -0
- tactus/validation/README.md +224 -0
- tactus/validation/__init__.py +7 -0
- tactus/validation/error_listener.py +21 -0
- tactus/validation/generated/LuaLexer.interp +231 -0
- tactus/validation/generated/LuaLexer.py +5548 -0
- tactus/validation/generated/LuaLexer.tokens +124 -0
- tactus/validation/generated/LuaLexerBase.py +66 -0
- tactus/validation/generated/LuaParser.interp +173 -0
- tactus/validation/generated/LuaParser.py +6439 -0
- tactus/validation/generated/LuaParser.tokens +124 -0
- tactus/validation/generated/LuaParserBase.py +23 -0
- tactus/validation/generated/LuaParserVisitor.py +118 -0
- tactus/validation/generated/__init__.py +7 -0
- tactus/validation/grammar/LuaLexer.g4 +123 -0
- tactus/validation/grammar/LuaParser.g4 +178 -0
- tactus/validation/semantic_visitor.py +817 -0
- tactus/validation/validator.py +157 -0
- tactus-0.31.0.dist-info/METADATA +1809 -0
- tactus-0.31.0.dist-info/RECORD +160 -0
- tactus-0.31.0.dist-info/WHEEL +4 -0
- tactus-0.31.0.dist-info/entry_points.txt +2 -0
- tactus-0.31.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,902 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Built-in step definitions for Tactus primitives.
|
|
3
|
+
|
|
4
|
+
Provides a comprehensive library of steps for testing:
|
|
5
|
+
- Tool calls
|
|
6
|
+
- State management
|
|
7
|
+
- Procedure completion
|
|
8
|
+
- Iterations and timing
|
|
9
|
+
- Parameters and context
|
|
10
|
+
- Regex pattern matching
|
|
11
|
+
- Fuzzy string matching
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import logging
|
|
15
|
+
import re
|
|
16
|
+
import ast
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
from .registry import StepRegistry
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _parse_step_string_literal(value: str) -> tuple[str, bool]:
|
|
26
|
+
"""
|
|
27
|
+
Parse an optional quoted string literal from a step capture group.
|
|
28
|
+
|
|
29
|
+
Supports single-quoted or double-quoted Python-style escapes, e.g.:
|
|
30
|
+
"Hello! I'm World"
|
|
31
|
+
'He said: "hi"'
|
|
32
|
+
"Line 1\\nLine 2"
|
|
33
|
+
|
|
34
|
+
Returns:
|
|
35
|
+
(parsed_value, was_quoted)
|
|
36
|
+
"""
|
|
37
|
+
stripped = value.strip()
|
|
38
|
+
if len(stripped) >= 2 and stripped[0] in {"'", '"'} and stripped[-1] == stripped[0]:
|
|
39
|
+
try:
|
|
40
|
+
parsed = ast.literal_eval(stripped)
|
|
41
|
+
if isinstance(parsed, str):
|
|
42
|
+
return parsed, True
|
|
43
|
+
except Exception:
|
|
44
|
+
# Fall back to raw string if the literal is malformed.
|
|
45
|
+
return stripped, True
|
|
46
|
+
return value, False
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def register_builtin_steps(registry: StepRegistry) -> None:
    """
    Register every built-in step family with the given registry.

    Args:
        registry: StepRegistry that receives the step definitions
    """
    # Registration order is kept exactly as listed: tool, state, output,
    # completion, iteration/timing, parameter/context, agent, regex, model,
    # fuzzy.
    registrars = (
        register_tool_steps,
        register_state_steps,
        register_output_steps,
        register_completion_steps,
        register_iteration_steps,
        register_parameter_steps,
        register_agent_steps,
        register_regex_steps,
        register_model_steps,
        register_fuzzy_steps,
    )
    for registrar in registrars:
        registrar(registry)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
# Tool-related steps
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def register_tool_steps(registry: StepRegistry) -> None:
    """Register the tool-call assertion steps and the tool-mocking step."""
    # (pattern, handler) pairs, registered in this exact order. Singular and
    # plural wordings ("time"/"times") are registered as separate patterns.
    tool_steps = (
        (r"the (?P<tool>[-\w]+) tool should be called", step_tool_called),
        (r"the (?P<tool>[-\w]+) tool should not be called", step_tool_not_called),
        (
            r"the (?P<tool>[-\w]+) tool should be called at least (?P<n>\d+) time",
            step_tool_called_at_least,
        ),
        (
            r"the (?P<tool>[-\w]+) tool should be called at least (?P<n>\d+) times",
            step_tool_called_at_least,
        ),
        (
            r"the (?P<tool>[-\w]+) tool should be called exactly (?P<n>\d+) time",
            step_tool_called_exactly,
        ),
        (
            r"the (?P<tool>[-\w]+) tool should be called exactly (?P<n>\d+) times",
            step_tool_called_exactly,
        ),
        (
            r"the (?P<tool>[-\w]+) tool should be called with (?P<param>\w+)=(?P<value>.+)",
            step_tool_called_with_param,
        ),
        (
            r'the tool "(?P<tool>[-\w]+)" returns (?P<value>.+)',
            step_mock_tool_returns,
        ),
    )
    for pattern, handler in tool_steps:
        registry.register(pattern, handler)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def step_tool_called(context: Any, tool: str) -> None:
    """Assert that ``context.tool_called(tool)`` reports the tool as called."""
    was_called = context.tool_called(tool)
    assert was_called, f"Tool '{tool}' was not called"
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def step_tool_not_called(context: Any, tool: str) -> None:
    """Assert that ``context.tool_called(tool)`` reports the tool as not called."""
    was_called = context.tool_called(tool)
    assert not was_called, f"Tool '{tool}' was called but shouldn't be"
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def step_tool_called_at_least(context: Any, tool: str, n: str) -> None:
    """Assert that the tool's call count is greater than or equal to ``n``.

    ``n`` arrives as text from the step pattern and is converted with ``int``.
    """
    observed = context.tool_call_count(tool)
    required = int(n)
    enough = observed >= required
    assert enough, f"Tool '{tool}' called {observed} times, expected at least {required}"
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def step_tool_called_exactly(context: Any, tool: str, n: str) -> None:
    """Assert that the tool's call count equals ``n``.

    ``n`` arrives as text from the step pattern and is converted with ``int``.
    """
    observed = context.tool_call_count(tool)
    wanted = int(n)
    exact = observed == wanted
    assert exact, f"Tool '{tool}' called {observed} times, expected exactly {wanted}"
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def step_tool_called_with_param(context: Any, tool: str, param: str, value: str) -> None:
    """Assert that at least one recorded call of ``tool`` passed ``param=value``.

    ``value`` is the raw text captured from the step. A recorded argument
    matches when any of the following holds:

    - it equals the raw captured text (the original behaviour),
    - it equals the captured text with surrounding quotes decoded, so
      ``name="Alice"`` matches the argument ``Alice``,
    - its ``str()`` form equals the decoded text, so ``count=5`` matches the
      integer argument ``5``.

    Calls whose ``args`` entry is missing or ``None``, or that do not carry
    ``param`` at all, never match.

    Raises:
        AssertionError: if the tool was not called, or no call matches.
    """
    calls = context.tool_calls(tool)
    assert calls, f"Tool '{tool}' was not called"

    expected_text, _ = _parse_step_string_literal(value)
    missing = object()

    def _call_matches(call: Any) -> bool:
        # ``or {}`` tolerates a call recorded with args=None.
        actual = (call.get("args") or {}).get(param, missing)
        if actual is missing:
            return False
        return actual == value or actual == expected_text or str(actual) == expected_text

    found = any(_call_matches(call) for call in calls)
    assert found, f"Tool '{tool}' was not called with {param}={value}"
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def step_mock_tool_returns(context: Any, tool: str, value: str) -> None:
    """Configure the value a mocked tool returns for this scenario.

    Quoted text is passed on as a decoded string. Unquoted text is evaluated
    as a Python literal when possible and otherwise kept as the raw text.

    Raises:
        AssertionError: if the context has no ``mock_tool_returns`` attribute.
    """

    def _interpret(raw: str) -> Any:
        text, quoted = _parse_step_string_literal(raw)
        if quoted:
            return text
        try:
            return ast.literal_eval(text)
        except Exception:
            # Treat unquoted values as plain strings (e.g., positive/neutral)
            return text

    mock_result = _interpret(value)

    if hasattr(context, "mock_tool_returns"):
        context.mock_tool_returns(tool, mock_result)
        return

    raise AssertionError("Context does not support tool mocking")
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def step_procedure_started(context: Any) -> None:
    """Setup step: only checks that a test context object was supplied."""
    # Nothing is executed here; per the original notes, the actual run
    # happens in the "When" steps.
    context_ready = context is not None
    assert context_ready, "Test context not initialized"
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
# State-related steps
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def register_state_steps(registry: StepRegistry) -> None:
    """Register the state assertion steps (equality and key existence)."""
    state_steps = (
        (r"the state (?P<key>\w+) should be (?P<value>.+)", step_state_equals),
        (r"the state (?P<key>\w+) should exist", step_state_exists),
        (r"the state should contain (?P<key>\w+)", step_state_contains),
    )
    for pattern, handler in state_steps:
        registry.register(pattern, handler)
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def step_state_equals(context: Any, key: str, value: str) -> None:
    """Assert that the state value stored under ``key`` equals the expected text.

    The comparison always accepts an exact match between ``str(actual)``
    (``"None"`` for a missing value) and the expected text, which is the
    original behaviour. When the expected text was NOT quoted, a typed
    comparison is also accepted, mirroring the output steps:

    - ``true`` / ``false`` (any case) match a bool state, or a non-bool state
      whose lowercased text is the same word,
    - numeric text matches a non-bool state that converts to the same float
      (so ``5`` matches ``5.0``).

    Quoted expectations are compared as text only.

    Raises:
        AssertionError: if none of the accepted comparisons holds.
    """
    actual = context.state_get(key)
    expected, was_quoted = _parse_step_string_literal(value)
    actual_str = str(actual) if actual is not None else "None"

    def _typed_match() -> bool:
        lowered = expected.lower()
        if lowered in ("true", "false"):
            if isinstance(actual, bool):
                return actual == (lowered == "true")
            return actual_str.lower() == lowered
        if isinstance(actual, bool):
            # Never let True/False pass as the numbers 1/0.
            return False
        try:
            return float(actual) == float(expected)
        except (ValueError, TypeError):
            return False

    matched = actual_str == expected or (not was_quoted and _typed_match())
    assert matched, f"State '{key}' is '{actual_str}', expected '{expected}'"
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def step_state_exists(context: Any, key: str) -> None:
    """Assert that ``context.state_exists(key)`` is truthy."""
    assert context.state_exists(key), f"State key '{key}' does not exist"
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def step_state_contains(context: Any, key: str) -> None:
    """Assert that the state contains ``key`` (via ``context.state_exists``)."""
    assert context.state_exists(key), f"State does not contain key '{key}'"
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
# Output-related steps
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def register_output_steps(registry: StepRegistry) -> None:
    """Register the output assertion steps (scalar output and keyed output)."""
    # Kept in the original order; note the thresholded fuzzy-match pattern is
    # registered before the threshold-less one.
    output_steps = (
        (r"the output should exist", step_output_value_exists),
        (r"the output should be (?P<value>.+)", step_output_value_equals),
        (
            r"the output should fuzzy match (?P<value>.+) with threshold (?P<threshold>[0-9]*\.?[0-9]+)",
            step_output_value_fuzzy_match,
        ),
        (r"the output should fuzzy match (?P<value>.+)", step_output_value_fuzzy_match),
        (r"the output (?P<key>\w+) should be (?P<value>.+)", step_output_equals),
        (r"the output (?P<key>\w+) should not be (?P<value>.+)", step_output_not_equals),
        (r"the output (?P<key>\w+) should exist", step_output_exists),
        (r"the output should contain (?P<key>\w+)", step_output_contains),
    )
    for pattern, handler in output_steps:
        registry.register(pattern, handler)
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def step_output_equals(context: Any, key: str, value: str) -> None:
    """Assert that the output field ``key`` equals the expected value.

    Comparison rules, tried in order:

    1. Quoted expectation: compare ``str(actual)`` (``"None"`` for None)
       against the decoded text.
    2. ``true`` / ``false`` (any case): compare as bool when the output is a
       bool, otherwise compare lowercased text.
    3. Numeric expectation: compare as floats, converting the output if needed.
    4. Anything else, or a failed numeric conversion: plain text comparison.
    """
    observed = context.output_get(key)
    value, was_quoted = _parse_step_string_literal(value)

    def _as_text() -> str:
        return "None" if observed is None else str(observed)

    if was_quoted:
        text = _as_text()
        assert text == value, f"Output '{key}' is '{text}', expected '{value}'"
        return

    lowered = value.lower()
    if lowered in ("true", "false"):
        if isinstance(observed, bool):
            expected_bool = lowered == "true"
            assert (
                observed == expected_bool
            ), f"Output '{key}' is {observed}, expected {expected_bool}"
        else:
            assert (
                str(observed).lower() == lowered
            ), f"Output '{key}' is '{observed}', expected '{value}'"
        return

    try:
        expected_num = float(value)
        # Numbers are compared as-is; anything else must convert to float.
        observed_num = observed if isinstance(observed, (int, float)) else float(observed)
        assert (
            observed_num == expected_num
        ), f"Output '{key}' is {observed_num}, expected {expected_num}"
    except (ValueError, TypeError):
        # Either side is not numeric: fall back to text comparison.
        text = _as_text()
        assert text == value, f"Output '{key}' is '{text}', expected '{value}'"
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def step_output_value_exists(context: Any) -> None:
    """Assert that ``context.output_value()`` is not None."""
    output_present = context.output_value() is not None
    assert output_present, "Output is missing"
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def step_output_value_equals(context: Any, value: str) -> None:
    """Assert that the scalar output equals the expected value.

    Unquoted ``true`` / ``false`` are compared as booleans (or lowercased
    text for non-bool outputs), unquoted numeric text is compared as floats,
    and everything else - including every quoted expectation - is compared
    as text, with a None output rendered as ``"None"``.
    """
    result = context.output_value()
    value, was_quoted = _parse_step_string_literal(value)
    lowered = value.lower()

    if not was_quoted and lowered in ("true", "false"):
        if isinstance(result, bool):
            expected_bool = lowered == "true"
            assert result == expected_bool, f"Output is {result}, expected {expected_bool}"
        else:
            result_lower = "none" if result is None else str(result).lower()
            assert result_lower == lowered, f"Output is '{result}', expected '{value}'"
        return

    if not was_quoted:
        try:
            expected_num = float(value)
            if isinstance(result, (int, float)):
                assert result == expected_num, f"Output is {result}, expected {expected_num}"
            else:
                assert (
                    float(result) == expected_num
                ), f"Output is '{result}', expected {expected_num}"
        except (ValueError, TypeError):
            # Not numeric on one side: continue to the text comparison.
            pass
        else:
            return

    result_text = "None" if result is None else str(result)
    assert result_text == value, f"Output is '{result_text}', expected '{value}'"
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
def step_output_value_fuzzy_match(context: Any, value: str, threshold: str = "0.8") -> None:
    """Check if scalar output is similar to expected value above a threshold.

    This is a deterministic, non-LLM fuzzy match based on string similarity
    (``difflib.SequenceMatcher``) over normalized text.

    Default behavior:
    - Case-insensitive (compares lowercased text)
    - Punctuation-insensitive (strips punctuation)
    - Whitespace runs are collapsed to a single space
    - If one non-empty normalized text contains the other, the similarity is
      treated as 1.0

    Multi-match syntax (best-effort):
      Then the output should fuzzy match any of ["Hello", "Hi", "Hey"] with threshold 0.9

    The best similarity across all expected values is compared to the threshold.

    Raises:
        AssertionError: if the output is None, the threshold is not a number,
            no expected values could be extracted, or the best similarity is
            below the threshold.
    """
    import difflib

    def _normalize_text(text: str) -> str:
        # Lowercase + strip punctuation + collapse whitespace.
        without_punctuation = re.sub(r"[^\w\s]", "", text.lower())
        return re.sub(r"\s+", " ", without_punctuation).strip()

    actual = context.output_value()
    assert actual is not None, "Output is missing"

    try:
        threshold_f = float(threshold)
    except (TypeError, ValueError) as exc:
        raise AssertionError(f"Invalid threshold: {threshold}") from exc

    expected_raw, was_quoted = _parse_step_string_literal(value)
    if not was_quoted:
        expected_raw = expected_raw.strip()

    expected_values: list[str]
    any_of_prefix = "any of "

    if expected_raw.lower().startswith(any_of_prefix):
        values_str = expected_raw[len(any_of_prefix):].strip()
        try:
            parsed = ast.literal_eval(values_str)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Not a Python literal; fall back to comma splitting below.
            parsed = None

        if isinstance(parsed, (list, tuple)):
            expected_values = [str(item) for item in parsed]
        else:
            expected_values = [
                _parse_step_string_literal(part.strip())[0]
                for part in values_str.split(",")
                if part.strip()
            ]

        if not expected_values:
            raise AssertionError(f"No expected values provided: {value}")
    else:
        expected_values = [expected_raw]

    actual_norm = _normalize_text(str(actual))
    best_ratio = -1.0
    best_expected = None

    for expected in expected_values:
        expected_norm = _normalize_text(expected)
        # Both sides must be non-empty for the containment shortcut: an empty
        # string is a substring of everything, so an output that normalizes to
        # "" would otherwise score 1.0 against any expectation.
        contained = (
            bool(actual_norm)
            and bool(expected_norm)
            and (expected_norm in actual_norm or actual_norm in expected_norm)
        )
        if contained:
            ratio = 1.0
        else:
            ratio = difflib.SequenceMatcher(None, actual_norm, expected_norm).ratio()

        if ratio > best_ratio:
            best_ratio = ratio
            best_expected = expected

    assert best_ratio >= threshold_f, (
        f"Output similarity is {best_ratio:.3f} (threshold {threshold_f:.3f}). "
        f"Output is '{actual}', best match was '{best_expected}'. "
        f"Expected: {expected_values}"
    )
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
def step_output_not_equals(context: Any, key: str, value: str) -> None:
    """Assert that the output field ``key`` differs from the given value.

    Uses the same comparison ladder as the equality step, negated: quoted
    text first, then ``true`` / ``false``, then numeric comparison, and a
    plain text comparison when the numeric conversion fails.
    """
    observed = context.output_get(key)
    value, was_quoted = _parse_step_string_literal(value)

    def _as_text() -> str:
        return "None" if observed is None else str(observed)

    if was_quoted:
        text = _as_text()
        assert text != value, f"Output '{key}' is '{text}', should not be '{value}'"
        return

    lowered = value.lower()
    if lowered in ("true", "false"):
        if isinstance(observed, bool):
            expected_bool = lowered == "true"
            assert (
                observed != expected_bool
            ), f"Output '{key}' is {observed}, should not be {expected_bool}"
        else:
            assert (
                str(observed).lower() != lowered
            ), f"Output '{key}' is '{observed}', should not be '{value}'"
        return

    try:
        expected_num = float(value)
        # Numbers are compared as-is; anything else must convert to float.
        observed_num = observed if isinstance(observed, (int, float)) else float(observed)
        assert (
            observed_num != expected_num
        ), f"Output '{key}' is {observed_num}, should not be {expected_num}"
    except (ValueError, TypeError):
        # Either side is not numeric: fall back to text comparison.
        text = _as_text()
        assert text != value, f"Output '{key}' is '{text}', should not be '{value}'"
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
def step_output_exists(context: Any, key: str) -> None:
    """Assert that ``context.output_exists(key)`` is truthy."""
    assert context.output_exists(key), f"Output key '{key}' does not exist"
|
|
445
|
+
|
|
446
|
+
|
|
447
|
+
def step_output_contains(context: Any, key: str) -> None:
    """Assert that the output contains ``key`` (via ``context.output_exists``)."""
    assert context.output_exists(key), f"Output does not contain key '{key}'"
|
|
451
|
+
|
|
452
|
+
|
|
453
|
+
# Completion steps
|
|
454
|
+
|
|
455
|
+
|
|
456
|
+
def register_completion_steps(registry: StepRegistry) -> None:
    """Register the procedure start, completion, failure and stop-reason steps."""
    completion_steps = (
        (r"the procedure has started", step_procedure_started),
        (r"the procedure should complete successfully", step_procedure_completes),
        (r"the procedure should fail", step_procedure_fails),
        (r"the stop reason should be (?P<reason>.+)", step_stop_reason_equals),
        (r"the stop reason should contain (?P<text>.+)", step_stop_reason_contains),
    )
    for pattern, handler in completion_steps:
        registry.register(pattern, handler)
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
def step_procedure_completes(context: Any) -> None:
    """Assert that ``context.stop_success()`` is truthy."""
    succeeded = context.stop_success()
    assert succeeded, "Procedure did not complete successfully"
|
|
472
|
+
|
|
473
|
+
|
|
474
|
+
def step_procedure_fails(context: Any) -> None:
    """Assert that ``context.stop_success()`` is falsy."""
    succeeded = context.stop_success()
    assert not succeeded, "Procedure completed successfully but should have failed"
|
|
477
|
+
|
|
478
|
+
|
|
479
|
+
def step_stop_reason_equals(context: Any, reason: str) -> None:
    """Assert that ``context.stop_reason()`` equals ``reason`` exactly.

    The captured text is compared as-is; quotes are not stripped.
    """
    observed = context.stop_reason()
    same = observed == reason
    assert same, f"Stop reason is '{observed}', expected '{reason}'"
|
|
483
|
+
|
|
484
|
+
|
|
485
|
+
def step_stop_reason_contains(context: Any, text: str) -> None:
    """Assert that ``context.stop_reason()`` contains ``text``.

    A stop reason of None is reported as an ordinary assertion failure
    instead of surfacing as a ``TypeError`` from the membership test.

    Raises:
        AssertionError: if the stop reason is None or does not contain ``text``.
    """
    reason = context.stop_reason()
    # Short-circuit on None so ``text in None`` is never evaluated.
    found = reason is not None and text in reason
    assert found, f"Stop reason '{reason}' does not contain '{text}'"
|
|
489
|
+
|
|
490
|
+
|
|
491
|
+
# Iteration/timing steps
|
|
492
|
+
|
|
493
|
+
|
|
494
|
+
def register_iteration_steps(registry: StepRegistry) -> None:
    """Register the iteration-count and agent-turn assertion steps."""
    # Singular and plural wordings ("turn"/"turns") are separate patterns.
    iteration_steps = (
        (r"the total iterations should be less than (?P<n>\d+)", step_iterations_less_than),
        (
            r"the total iterations should be between (?P<min>\d+) and (?P<max>\d+)",
            step_iterations_between,
        ),
        (r"the agent should take at least (?P<n>\d+) turn", step_agent_turns_at_least),
        (r"the agent should take at least (?P<n>\d+) turns", step_agent_turns_at_least),
    )
    for pattern, handler in iteration_steps:
        registry.register(pattern, handler)
|
|
509
|
+
|
|
510
|
+
|
|
511
|
+
def step_iterations_less_than(context: Any, n: str) -> None:
    """Assert that ``context.iterations`` is strictly less than ``n``."""
    observed = context.iterations
    limit = int(n)
    under_limit = observed < limit
    assert under_limit, f"Total iterations is {observed}, expected less than {limit}"
|
|
518
|
+
|
|
519
|
+
|
|
520
|
+
def step_iterations_between(context: Any, min: str, max: str) -> None:
    """Assert that ``context.iterations`` lies in the inclusive range [min, max].

    The parameter names mirror the named groups of the step pattern, which is
    why they shadow the builtins inside this function.
    """
    observed = context.iterations
    lower, upper = int(min), int(max)
    in_range = lower <= observed <= upper
    assert in_range, f"Total iterations is {observed}, expected between {lower} and {upper}"
|
|
528
|
+
|
|
529
|
+
|
|
530
|
+
def step_agent_turns_at_least(context: Any, n: str) -> None:
    """Assert that ``context.agent_turns()`` is greater than or equal to ``n``."""
    observed = context.agent_turns()
    required = int(n)
    enough = observed >= required
    assert enough, f"Agent took {observed} turns, expected at least {required}"
|
|
535
|
+
|
|
536
|
+
|
|
537
|
+
# Parameter/context steps
|
|
538
|
+
|
|
539
|
+
|
|
540
|
+
def register_parameter_steps(registry: StepRegistry) -> None:
    """Register the parameter, agent-context and input-setting steps."""
    parameter_steps = (
        (r"the (?P<param>\w+) parameter is (?P<value>.+)", step_parameter_equals),
        (r"the agent'?s? context should include (?P<text>.+)", step_agent_context_includes),
        # Input-setting steps (Given clauses to set procedure inputs)
        (r'the input (?P<key>\w+) is "(?P<value>.+)"', step_input_set_string),
        (r"the input (?P<key>\w+) is \[(?P<values>.+)\]", step_input_set_array),
        (r"the input (?P<key>\w+) is (?P<value>-?\d+\.?\d*)", step_input_set_number),
    )
    for pattern, handler in parameter_steps:
        registry.register(pattern, handler)
|
|
555
|
+
|
|
556
|
+
|
|
557
|
+
def step_parameter_equals(context: Any, param: str, value: str) -> None:
    """Assert that the parameter's text form equals ``value``.

    The parameter is looked up in ``context.get_params()``; a missing or None
    parameter is rendered as ``"None"`` before the comparison.
    """
    actual = context.get_params().get(param)
    actual_str = "None" if actual is None else str(actual)
    assert actual_str == value, f"Parameter '{param}' is '{actual_str}', expected '{value}'"
|
|
563
|
+
|
|
564
|
+
|
|
565
|
+
def step_agent_context_includes(context: Any, text: str) -> None:
    """Assert that ``context.agent_context()`` contains ``text``.

    An agent context of None is reported as an ordinary assertion failure
    instead of surfacing as a ``TypeError`` from the membership test.

    Raises:
        AssertionError: if the agent context is None or does not contain ``text``.
    """
    agent_context = context.agent_context()
    # Short-circuit on None so ``text in None`` is never evaluated.
    included = agent_context is not None and text in agent_context
    assert included, f"Agent context does not include '{text}'"
|
|
569
|
+
|
|
570
|
+
|
|
571
|
+
def step_input_set_string(context: Any, key: str, value: str) -> None:
    """Store ``value`` unchanged as the input ``key`` via ``context.set_input``."""
    context.set_input(key, value)
|
|
574
|
+
|
|
575
|
+
|
|
576
|
+
def step_input_set_number(context: Any, key: str, value: str) -> None:
    """Store a numeric input: float when the text contains ``.``, int otherwise."""
    number = float(value) if "." in value else int(value)
    context.set_input(key, number)
|
|
583
|
+
|
|
584
|
+
|
|
585
|
+
def step_input_set_array(context: Any, key: str, values: str) -> None:
    """Set an array input parameter from the text between the brackets.

    The text is first evaluated as a Python list literal. If that fails, it
    is split on commas and each trimmed item is converted to float (when it
    contains ``.``) or int, falling back to the item text itself.

    Only the parsing is guarded: an exception raised by
    ``context.set_input`` propagates to the caller and the input is set
    exactly once.
    """

    def _coerce(item: str) -> Any:
        # Best-effort numeric conversion for the comma-split fallback.
        try:
            return float(item) if "." in item else int(item)
        except ValueError:
            return item

    try:
        parsed = ast.literal_eval(f"[{values}]")
    except (ValueError, SyntaxError):
        # Not a valid literal (e.g. bare words): fall back to comma-split.
        parsed = [_coerce(item.strip()) for item in values.split(",")]

    context.set_input(key, parsed)
|
|
607
|
+
|
|
608
|
+
|
|
609
|
+
# Agent steps
|
|
610
|
+
|
|
611
|
+
|
|
612
|
+
def register_agent_steps(registry: StepRegistry) -> None:
    """Register the agent-related step patterns on ``registry``.

    Each ``(pattern, handler)`` pair below is passed to
    ``registry.register`` in the order listed.
    """
    step_table = (
        (r"the (?P<agent>\w+) agent takes turn", step_agent_takes_turn),
        (r"the (?P<agent>\w+) agent takes turns", step_agent_takes_turn),
        (
            r'the agent "(?P<agent>[^"]+)" responds with (?P<message>.+)',
            step_mock_agent_responds_with,
        ),
        (
            r'the agent "(?P<agent>[^"]+)" calls tool "(?P<tool>[^"]+)" with args (?P<args>.+)',
            step_mock_agent_calls_tool_with_args,
        ),
        (
            r'the agent "(?P<agent>[^"]+)" returns data (?P<data>.+)',
            step_mock_agent_returns_data,
        ),
        (r"the message is (?P<message>.+)", step_set_scenario_message),
        (r"the procedure run", step_procedure_runs),
        (r"the procedure runs", step_procedure_runs),
    )
    for pattern, handler in step_table:
        registry.register(pattern, handler)
|
|
639
|
+
|
|
640
|
+
|
|
641
|
+
def step_agent_takes_turn(context: Any, agent: str) -> None:
    """Run the procedure for an "agent takes turn(s)" step.

    ``agent`` is accepted so the step pattern can capture it, but it is not
    used here: the procedure is executed as defined through
    ``context.run_procedure()``.
    """
    run = context.run_procedure
    run()
|
|
646
|
+
|
|
647
|
+
|
|
648
|
+
def step_mock_agent_responds_with(
    context: Any, agent: str, message: str, when_message: str | None = None
) -> None:
    """Configure a per-scenario mocked response for ``agent``.

    ``message`` (and ``when_message`` when given) are passed through
    ``_parse_step_string_literal``; the first element of each result is
    forwarded to ``context.mock_agent_response``.

    Raises:
        AssertionError: If ``context`` has no ``mock_agent_response`` attribute.
    """
    message, _ = _parse_step_string_literal(message)

    if when_message is None:
        trigger = None
    else:
        trigger, _ = _parse_step_string_literal(when_message)

    if hasattr(context, "mock_agent_response"):
        context.mock_agent_response(agent, message, when_message=trigger)
        return
    raise AssertionError("Context does not support agent mocking")
|
|
659
|
+
|
|
660
|
+
|
|
661
|
+
def step_set_scenario_message(context: Any, message: str) -> None:
    """Record the scenario's primary message on ``context``.

    ``message`` is passed through ``_parse_step_string_literal`` and the
    first element of the result is handed to ``context.set_scenario_message``.

    Raises:
        AssertionError: If ``context`` has no ``set_scenario_message`` attribute.
    """
    parsed_message, _ = _parse_step_string_literal(message)

    if hasattr(context, "set_scenario_message"):
        context.set_scenario_message(parsed_message)
        return
    raise AssertionError("Context does not support scenario message")
|
|
667
|
+
|
|
668
|
+
|
|
669
|
+
def step_mock_agent_calls_tool_with_args(context: Any, agent: str, tool: str, args: str) -> None:
    """Configure a per-scenario mocked tool call made by ``agent``.

    ``args`` is passed through ``_parse_step_string_literal``, then evaluated
    with ``ast.literal_eval``; the result must be a ``dict`` and is forwarded
    to ``context.mock_agent_tool_call``.

    Raises:
        AssertionError: If the literal cannot be evaluated, is not a dict, or
            ``context`` has no ``mock_agent_tool_call`` attribute.
    """
    literal_text, _ = _parse_step_string_literal(args)

    try:
        parsed_args = ast.literal_eval(literal_text)
    except Exception:
        raise AssertionError(f"Invalid tool args literal: {args}")

    is_mapping = isinstance(parsed_args, dict)
    if not is_mapping:
        raise AssertionError(f"Tool args must be an object/dict, got {type(parsed_args).__name__}")

    supports_mocking = hasattr(context, "mock_agent_tool_call")
    if not supports_mocking:
        raise AssertionError("Context does not support agent tool call mocking")

    context.mock_agent_tool_call(agent, tool, parsed_args)
|
|
684
|
+
|
|
685
|
+
|
|
686
|
+
def step_mock_agent_returns_data(context: Any, agent: str, data: str) -> None:
    """Configure structured mock data returned by ``agent``.

    ``data`` is passed through ``_parse_step_string_literal``, then evaluated
    with ``ast.literal_eval``; the result must be a ``dict`` and is forwarded
    to ``context.mock_agent_data``.

    Raises:
        AssertionError: If the literal cannot be evaluated, is not a dict, or
            ``context`` has no ``mock_agent_data`` attribute.
    """
    literal_text, _ = _parse_step_string_literal(data)

    try:
        parsed = ast.literal_eval(literal_text)
    except Exception:
        raise AssertionError(f"Invalid data literal: {data}")

    is_mapping = isinstance(parsed, dict)
    if not is_mapping:
        raise AssertionError(f"Data must be an object/dict, got {type(parsed).__name__}")

    supports_mocking = hasattr(context, "mock_agent_data")
    if not supports_mocking:
        raise AssertionError("Context does not support agent data mocking")

    context.mock_agent_data(agent, parsed)
|
|
701
|
+
|
|
702
|
+
|
|
703
|
+
def step_procedure_runs(context: Any) -> None:
    """Run the procedure and fail the step if it reported an error.

    After ``context.run_procedure()`` returns, a truthy
    ``context.execution_result`` (when the attribute exists) is inspected:
    if its ``"success"`` entry is present and falsy, the step fails with the
    ``"error"`` entry (or ``"Unknown error"``).

    Raises:
        AssertionError: If the execution result reports failure.
    """
    context.run_procedure()

    # Contexts without a (truthy) execution result have nothing to check.
    if not hasattr(context, "execution_result"):
        return
    if not context.execution_result:
        return

    result = context.execution_result
    if result.get("success", True):
        return

    error = result.get("error", "Unknown error")
    raise AssertionError(f"Procedure execution failed: {error}")
|
|
717
|
+
|
|
718
|
+
|
|
719
|
+
# Regex pattern matching steps
|
|
720
|
+
|
|
721
|
+
|
|
722
|
+
def register_regex_steps(registry: StepRegistry) -> None:
    """Register the regex pattern-matching steps on ``registry``.

    Covers output, state, stop-reason and tool-argument matching; the pairs
    are passed to ``registry.register`` in the order listed.
    """
    step_table = (
        # Output regex matching
        (
            r'the output (?P<key>\w+) should match pattern "(?P<pattern>.+)"',
            step_output_matches_pattern,
        ),
        # State regex matching
        (
            r'the state (?P<key>\w+) should match pattern "(?P<pattern>.+)"',
            step_state_matches_pattern,
        ),
        # Stop reason regex matching
        (
            r'the stop reason should match pattern "(?P<pattern>.+)"',
            step_stop_reason_matches_pattern,
        ),
        # Tool argument regex matching
        (
            r'the (?P<tool>[-\w]+) tool should be called with (?P<param>\w+) matching pattern "(?P<pattern>.+)"',
            step_tool_arg_matches_pattern,
        ),
    )
    for pattern, handler in step_table:
        registry.register(pattern, handler)
|
|
748
|
+
|
|
749
|
+
|
|
750
|
+
def step_output_matches_pattern(context: Any, key: str, pattern: str) -> None:
    """Assert that the output value for ``key`` matches the regex ``pattern``.

    The value from ``context.output_get(key)`` is converted to text (``None``
    becomes the empty string) and searched with ``re.search`` semantics, so
    the pattern may match anywhere in the text.

    Raises:
        AssertionError: If ``pattern`` is not a valid regex or nothing matches.
    """
    actual = context.output_get(key)
    actual_str = "" if actual is None else str(actual)

    try:
        regex = re.compile(pattern)
    except re.error as e:
        raise AssertionError(f"Invalid regex pattern '{pattern}': {e}")

    matched = regex.search(actual_str)
    assert matched, f"Output '{key}' value '{actual_str}' does not match pattern '{pattern}'"
|
|
762
|
+
|
|
763
|
+
|
|
764
|
+
def step_state_matches_pattern(context: Any, key: str, pattern: str) -> None:
    """Assert that the state value for ``key`` matches the regex ``pattern``.

    The value from ``context.state_get(key)`` is converted to text (``None``
    becomes the empty string) and searched with ``re.search`` semantics, so
    the pattern may match anywhere in the text.

    Raises:
        AssertionError: If ``pattern`` is not a valid regex or nothing matches.
    """
    actual = context.state_get(key)
    actual_str = "" if actual is None else str(actual)

    try:
        regex = re.compile(pattern)
    except re.error as e:
        raise AssertionError(f"Invalid regex pattern '{pattern}': {e}")

    matched = regex.search(actual_str)
    assert matched, f"State '{key}' value '{actual_str}' does not match pattern '{pattern}'"
|
|
776
|
+
|
|
777
|
+
|
|
778
|
+
def step_stop_reason_matches_pattern(context: Any, pattern: str) -> None:
    """Check if stop reason matches regex pattern.

    The value returned by ``context.stop_reason()`` is converted to text
    before matching (``None`` becomes the empty string), the same coercion
    the output/state pattern steps apply. A missing or non-string stop reason
    therefore yields an ordinary assertion result instead of a ``TypeError``
    from the regex engine.

    Raises:
        AssertionError: If ``pattern`` is not a valid regex, or it does not
            match anywhere in the stop-reason text.
    """
    actual = context.stop_reason()
    # re.Pattern.search only accepts str/bytes, so normalise first.
    actual_str = str(actual) if actual is not None else ""

    try:
        regex = re.compile(pattern)
    except re.error as e:
        raise AssertionError(f"Invalid regex pattern '{pattern}': {e}") from e

    assert regex.search(
        actual_str
    ), f"Stop reason '{actual_str}' does not match pattern '{pattern}'"
|
|
787
|
+
|
|
788
|
+
|
|
789
|
+
def step_tool_arg_matches_pattern(context: Any, tool: str, param: str, pattern: str) -> None:
    """Assert that some recorded call of ``tool`` has ``param`` matching ``pattern``.

    Calls come from ``context.tool_calls(tool)``; each call's ``"args"``
    mapping is consulted for ``param``. Values that are ``None`` (or absent)
    are skipped; other values are converted to text and searched with the
    compiled regex. The first match is enough.

    Raises:
        AssertionError: If the tool was never called, ``pattern`` is not a
            valid regex, or no call has a matching value.
    """
    calls = context.tool_calls(tool)
    assert calls, f"Tool '{tool}' was not called"

    try:
        regex = re.compile(pattern)
    except re.error as e:
        raise AssertionError(f"Invalid regex pattern '{pattern}': {e}")

    # Lazily walk the calls in order; any() stops at the first match.
    candidate_values = (call.get("args", {}).get(param) for call in calls)
    found = any(
        regex.search(str(candidate))
        for candidate in candidate_values
        if candidate is not None
    )

    assert found, f"Tool '{tool}' was not called with {param} matching pattern '{pattern}'"
|
|
809
|
+
|
|
810
|
+
|
|
811
|
+
# Fuzzy string matching steps
|
|
812
|
+
|
|
813
|
+
|
|
814
|
+
def register_fuzzy_steps(registry: StepRegistry) -> None:
    """Register the fuzzy string-matching steps on ``registry``.

    For both outputs and state, a default-threshold pattern is registered
    first, followed by the variant carrying an explicit percentage.

    NOTE(review): if the registry matches patterns by prefix rather than by
    full match, the default-threshold pattern could also match steps that
    carry a "with N% similarity" suffix - confirm against StepRegistry.
    """
    step_table = (
        # Output fuzzy matching (default threshold)
        (
            r'the output (?P<key>\w+) should be similar to "(?P<text>.+)"',
            step_output_similar_default,
        ),
        # Output fuzzy matching (custom threshold)
        (
            r'the output (?P<key>\w+) should be similar to "(?P<text>.+)" with (?P<threshold>\d+)% similarity',
            step_output_similar_threshold,
        ),
        # State fuzzy matching (default threshold)
        (
            r'the state (?P<key>\w+) should be similar to "(?P<text>.+)"',
            step_state_similar_default,
        ),
        # State fuzzy matching (custom threshold)
        (
            r'the state (?P<key>\w+) should be similar to "(?P<text>.+)" with (?P<threshold>\d+)% similarity',
            step_state_similar_threshold,
        ),
    )
    for pattern, handler in step_table:
        registry.register(pattern, handler)
|
|
840
|
+
|
|
841
|
+
|
|
842
|
+
def step_output_similar_default(context: Any, key: str, text: str) -> None:
    """Fuzzy-compare an output against ``text`` using the default 80% threshold.

    Delegates to ``step_output_similar_threshold`` with the threshold "80".
    """
    step_output_similar_threshold(context, key, text, threshold="80")
|
|
845
|
+
|
|
846
|
+
|
|
847
|
+
def step_output_similar_threshold(context: Any, key: str, text: str, threshold: str) -> None:
    """Assert that an output is at least ``threshold`` percent similar to ``text``.

    The value from ``context.output_get(key)`` is converted to text (``None``
    becomes the empty string) and scored against ``text`` with
    ``rapidfuzz.fuzz.ratio``; ``threshold`` is parsed with ``int``.

    Raises:
        AssertionError: If the similarity score is below the threshold.
    """
    from rapidfuzz import fuzz

    actual = context.output_get(key)
    actual_str = "" if actual is None else str(actual)

    threshold_val = int(threshold)
    similarity = fuzz.ratio(actual_str, text)

    meets_threshold = similarity >= threshold_val
    assert meets_threshold, (
        f"Output '{key}' similarity is {similarity}% (expected >= {threshold_val}%)\n"
        f" Actual: '{actual_str}'\n"
        f" Expected: '{text}'"
    )
|
|
862
|
+
|
|
863
|
+
|
|
864
|
+
def step_state_similar_default(context: Any, key: str, text: str) -> None:
    """Fuzzy-compare a state value against ``text`` using the default 80% threshold.

    Delegates to ``step_state_similar_threshold`` with the threshold "80".
    """
    step_state_similar_threshold(context, key, text, threshold="80")
|
|
867
|
+
|
|
868
|
+
|
|
869
|
+
def step_state_similar_threshold(context: Any, key: str, text: str, threshold: str) -> None:
    """Assert that a state value is at least ``threshold`` percent similar to ``text``.

    The value from ``context.state_get(key)`` is converted to text (``None``
    becomes the empty string) and scored against ``text`` with
    ``rapidfuzz.fuzz.ratio``; ``threshold`` is parsed with ``int``.

    Raises:
        AssertionError: If the similarity score is below the threshold.
    """
    from rapidfuzz import fuzz

    actual = context.state_get(key)
    actual_str = "" if actual is None else str(actual)

    threshold_val = int(threshold)
    similarity = fuzz.ratio(actual_str, text)

    meets_threshold = similarity >= threshold_val
    assert meets_threshold, (
        f"State '{key}' similarity is {similarity}% (expected >= {threshold_val}%)\n"
        f" Actual: '{actual_str}'\n"
        f" Expected: '{text}'"
    )
|
|
884
|
+
|
|
885
|
+
|
|
886
|
+
# Model-related steps
|
|
887
|
+
|
|
888
|
+
|
|
889
|
+
def register_model_steps(registry: StepRegistry) -> None:
    """Register the model-related step patterns on ``registry``."""
    # Model prediction step (When clause)
    predicts_pattern = r"the (?P<model>\w+) model predicts"
    registry.register(predicts_pattern, step_model_predicts)
|
|
894
|
+
|
|
895
|
+
|
|
896
|
+
def step_model_predicts(context: Any, model: str) -> None:
|
|
897
|
+
"""Trigger model prediction by running the procedure.
|
|
898
|
+
|
|
899
|
+
This step runs the procedure which should contain the model prediction.
|
|
900
|
+
"""
|
|
901
|
+
# Model prediction happens during procedure execution
|
|
902
|
+
context.run_procedure()
|