sandboxy 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sandboxy/__init__.py +3 -0
- sandboxy/agents/__init__.py +21 -0
- sandboxy/agents/base.py +66 -0
- sandboxy/agents/llm_prompt.py +308 -0
- sandboxy/agents/loader.py +222 -0
- sandboxy/api/__init__.py +5 -0
- sandboxy/api/app.py +76 -0
- sandboxy/api/routes/__init__.py +1 -0
- sandboxy/api/routes/agents.py +92 -0
- sandboxy/api/routes/local.py +1388 -0
- sandboxy/api/routes/tools.py +106 -0
- sandboxy/cli/__init__.py +1 -0
- sandboxy/cli/main.py +1196 -0
- sandboxy/cli/type_detector.py +48 -0
- sandboxy/config.py +49 -0
- sandboxy/core/__init__.py +1 -0
- sandboxy/core/async_runner.py +824 -0
- sandboxy/core/mdl_parser.py +441 -0
- sandboxy/core/runner.py +599 -0
- sandboxy/core/safe_eval.py +165 -0
- sandboxy/core/state.py +234 -0
- sandboxy/datasets/__init__.py +20 -0
- sandboxy/datasets/loader.py +193 -0
- sandboxy/datasets/runner.py +442 -0
- sandboxy/errors.py +166 -0
- sandboxy/local/context.py +235 -0
- sandboxy/local/results.py +173 -0
- sandboxy/logging.py +31 -0
- sandboxy/mcp/__init__.py +25 -0
- sandboxy/mcp/client.py +360 -0
- sandboxy/mcp/wrapper.py +99 -0
- sandboxy/providers/__init__.py +34 -0
- sandboxy/providers/anthropic_provider.py +271 -0
- sandboxy/providers/base.py +123 -0
- sandboxy/providers/http_client.py +101 -0
- sandboxy/providers/openai_provider.py +282 -0
- sandboxy/providers/openrouter.py +958 -0
- sandboxy/providers/registry.py +199 -0
- sandboxy/scenarios/__init__.py +11 -0
- sandboxy/scenarios/comparison.py +491 -0
- sandboxy/scenarios/loader.py +262 -0
- sandboxy/scenarios/runner.py +468 -0
- sandboxy/scenarios/unified.py +1434 -0
- sandboxy/session/__init__.py +21 -0
- sandboxy/session/manager.py +278 -0
- sandboxy/tools/__init__.py +34 -0
- sandboxy/tools/base.py +127 -0
- sandboxy/tools/loader.py +270 -0
- sandboxy/tools/yaml_tools.py +708 -0
- sandboxy/ui/__init__.py +27 -0
- sandboxy/ui/dist/assets/index-CgAkYWrJ.css +1 -0
- sandboxy/ui/dist/assets/index-D4zoGFcr.js +347 -0
- sandboxy/ui/dist/index.html +14 -0
- sandboxy/utils/__init__.py +3 -0
- sandboxy/utils/time.py +20 -0
- sandboxy-0.0.1.dist-info/METADATA +241 -0
- sandboxy-0.0.1.dist-info/RECORD +60 -0
- sandboxy-0.0.1.dist-info/WHEEL +4 -0
- sandboxy-0.0.1.dist-info/entry_points.txt +3 -0
- sandboxy-0.0.1.dist-info/licenses/LICENSE +201 -0
sandboxy/core/safe_eval.py ADDED
@@ -0,0 +1,165 @@
"""Safe expression evaluation module.

Provides sandboxed expression evaluation using simpleeval, replacing raw eval()
calls throughout the codebase with a secure, consistent implementation.

Security features:
- Only whitelisted functions/operators are allowed
- No access to __builtins__, __import__, or dunder methods
- Consistent evaluation context across all callers
- Proper error handling with typed exceptions
"""

import logging
from typing import Any

from simpleeval import EvalWithCompoundTypes, InvalidExpression

logger = logging.getLogger(__name__)


class EvaluationError(Exception):
    """Error during expression evaluation."""

    def __init__(self, message: str, expression: str | None = None) -> None:
        """Initialize evaluation error with message and optional expression.

        Args:
            message: Error message describing what went wrong.
            expression: The expression that caused the error, if applicable.
        """
        super().__init__(message)
        self.expression = expression


# Safe functions available in all evaluations
SAFE_FUNCTIONS = {
    # Type conversions
    "str": str,
    "int": int,
    "float": float,
    "bool": bool,
    "list": list,
    "dict": dict,
    # Math
    "abs": abs,
    "round": round,
    "min": min,
    "max": max,
    "sum": sum,
    # Collections
    "len": len,
    "any": any,
    "all": all,
    "sorted": sorted,
    "reversed": lambda x: list(reversed(x)),
    # Constants
    "True": True,
    "False": False,
    "None": None,
}


def safe_eval(
    expr: str,
    context: dict[str, Any] | None = None,
) -> Any:
    """Safely evaluate an expression with restricted scope.

    Uses simpleeval to provide a sandboxed evaluation environment.
    Only whitelisted functions are available.

    Args:
        expr: Expression to evaluate.
        context: Additional variables available in the expression.

    Returns:
        Result of the expression evaluation.

    Raises:
        EvaluationError: If the expression is invalid or evaluation fails.

    """
    if not expr or not expr.strip():
        raise EvaluationError("Empty expression", expr)

    # Build evaluation context
    names = dict(SAFE_FUNCTIONS)
    if context:
        names.update(context)

    try:
        evaluator = EvalWithCompoundTypes(names=names)
        return evaluator.eval(expr)
    except InvalidExpression as e:
        logger.debug("Invalid expression '%s': %s", expr, e)
        msg = f"Invalid expression: {e}"
        raise EvaluationError(msg, expr) from e
    except (TypeError, ValueError, KeyError, AttributeError) as e:
        logger.debug("Evaluation error for '%s': %s", expr, e)
        msg = f"Evaluation failed: {e}"
        raise EvaluationError(msg, expr) from e
    except Exception as e:
        # Catch any other errors from simpleeval
        logger.warning("Unexpected evaluation error for '%s': %s", expr, e)
        msg = f"Evaluation failed: {e}"
        raise EvaluationError(msg, expr) from e


def safe_eval_condition(
    condition: str,
    variables: dict[str, Any] | None = None,
) -> bool:
    """Safely evaluate a boolean condition.

    Args:
        condition: Condition expression (e.g., "x > 5", "name == 'test'").
        variables: Variables available in the condition.

    Returns:
        Boolean result of the condition.

    Note:
        Returns False if evaluation fails (defensive default for conditions).

    """
    if not condition or not condition.strip():
        return False

    try:
        result = safe_eval(condition, variables)
        return bool(result)
    except EvaluationError:
        return False


def safe_eval_formula(
    formula: str,
    check_values: dict[str, float],
    env_state: dict[str, Any] | None = None,
) -> float:
    """Safely evaluate a score formula.

    Args:
        formula: Formula expression (e.g., "score * 0.5 + bonus").
        check_values: Check result values available as variables.
        env_state: Optional environment state for advanced formulas.

    Returns:
        Numeric result of the formula.

    Raises:
        EvaluationError: If the formula is invalid or doesn't return a number.

    """
    context = dict(check_values)
    if env_state:
        context["env_state"] = env_state

    result = safe_eval(formula, context)

    try:
        return float(result)
    except (TypeError, ValueError) as e:
        msg = f"Formula must return a number, got {type(result).__name__}"
        raise EvaluationError(msg, formula) from e
sandboxy/core/state.py ADDED
@@ -0,0 +1,234 @@
"""Core state models for Sandboxy MDL and runtime."""

from enum import Enum
from typing import Any, Literal

from pydantic import BaseModel, Field

Role = Literal["system", "user", "assistant", "tool"]


class SessionState(str, Enum):
    """State of an interactive session."""

    IDLE = "idle"  # Created but not started
    RUNNING = "running"  # Executing steps
    AWAITING_USER = "awaiting_user"  # Paused waiting for user input
    AWAITING_AGENT = "awaiting_agent"  # Waiting for LLM response
    PAUSED = "paused"  # Manually paused
    COMPLETED = "completed"  # All steps done
    ERROR = "error"  # Execution failed


class StepAction(str, Enum):
    """Valid step actions in MDL."""

    INJECT_USER = "inject_user"  # Add scripted user message
    AWAIT_USER = "await_user"  # Wait for real user input (interactive)
    AWAIT_AGENT = "await_agent"  # Wait for agent response
    BRANCH = "branch"  # Conditional branching
    TOOL_CALL = "tool_call"  # Direct tool invocation (not via agent)


class ToolCall(BaseModel):
    """A tool call made by the assistant."""

    id: str
    name: str
    arguments: str  # JSON string


class Message(BaseModel):
    """A message in the conversation history."""

    role: Role
    content: str
    tool_name: str | None = None
    tool_call_id: str | None = None
    tool_calls: list[ToolCall] | None = None


class ToolRef(BaseModel):
    """Reference to a tool in a module's environment."""

    name: str
    type: str
    description: str = ""
    config: dict[str, Any] = Field(default_factory=dict)


class EnvConfig(BaseModel):
    """Environment configuration for a module."""

    sandbox_type: str = "local"
    tools: list[ToolRef] = Field(default_factory=list)
    initial_state: dict[str, Any] = Field(default_factory=dict)


class Step(BaseModel):
    """A step in the module's execution flow.

    Actions:
        inject_user: Add a scripted user message
            params: {content: str}
        await_user: Wait for real user input (interactive sessions only)
            params: {prompt?: str, timeout?: int}
        await_agent: Wait for agent response
            params: {}
        branch: Conditional branching
            params: {branch_name: str}
        tool_call: Direct tool invocation
            params: {tool: str, action: str, args: dict}
    """

    id: str
    action: str  # See StepAction enum
    params: dict[str, Any] = Field(default_factory=dict)
    condition: str | None = None  # Optional condition expression for conditional steps


class BranchCondition(BaseModel):
    """Condition for branching in the execution flow."""

    expr: str
    next_step: str


class CheckKind(str, Enum):
    """Types of evaluation checks."""

    CONTAINS = "contains"  # Check if target contains/doesn't contain a value
    REGEX = "regex"  # Check if target matches a regex pattern
    COUNT = "count"  # Check count of items (min/max)
    TOOL_CALLED = "tool_called"  # Check if a tool was called
    EQUALS = "equals"  # Check if target equals a value
    ENV_STATE = "env_state"  # Check environment state value
    # Legacy support
    DETERMINISTIC = "deterministic"  # Raw Python expression (deprecated)
    LLM = "llm"  # LLM-based evaluation (not implemented)


class CheckTarget(str, Enum):
    """Valid targets for evaluation checks."""

    AGENT_MESSAGES = "agent_messages"  # All agent message content
    USER_MESSAGES = "user_messages"  # All user message content
    ALL_MESSAGES = "all_messages"  # All message content
    TOOL_CALLS = "tool_calls"  # List of tool calls
    LAST_AGENT_MESSAGE = "last_agent_message"  # Most recent agent message
    LAST_USER_MESSAGE = "last_user_message"  # Most recent user message


class EvaluationCheck(BaseModel):
    """An evaluation check to run after module execution.

    Predefined check types:
        contains: Check if target contains a string
            - target: what to search (e.g., "agent_messages")
            - value: string to look for
            - expected: True if should contain, False if should not (default: True)
            - case_sensitive: whether to do case-sensitive match (default: False)

        regex: Check if target matches a regex pattern
            - target: what to search
            - pattern: regex pattern
            - expected: True if should match, False if should not (default: True)

        count: Check count of items
            - target: what to count (e.g., "agent_messages", "tool_calls")
            - min: minimum count (optional)
            - max: maximum count (optional)

        tool_called: Check if a specific tool was called
            - tool: tool name
            - action: action name (optional)
            - expected: True if should be called, False if should not (default: True)

        equals: Check if a value equals expected
            - target: what to check (e.g., "env.order_status")
            - value: expected value

        env_state: Check environment state
            - key: state key to check
            - value: expected value

        deterministic: (deprecated) Raw Python expression
            - expr: Python expression string
    """

    name: str
    kind: str  # See CheckKind enum
    # Common fields
    target: str | None = None  # What to evaluate (see CheckTarget)
    value: Any = None  # Value to check against
    expected: bool = True  # Expected result (True = should match/contain)
    # Type-specific fields
    pattern: str | None = None  # For regex
    case_sensitive: bool = False  # For contains
    min: int | None = None  # For count
    max: int | None = None  # For count
    tool: str | None = None  # For tool_called
    action: str | None = None  # For tool_called
    key: str | None = None  # For env_state
    # Legacy support
    config: dict[str, Any] = Field(default_factory=dict)  # For deterministic/llm


class VariableOption(BaseModel):
    """An option for a select/dropdown variable."""

    value: str
    label: str


class ModuleVariable(BaseModel):
    """A configurable variable for a module."""

    name: str
    label: str
    description: str = ""
    type: str = "string"  # "string" | "number" | "boolean" | "select" | "slider"
    default: Any = None
    options: list[VariableOption] | None = None  # For select type
    min: float | None = None  # For slider type
    max: float | None = None  # For slider type
    step: float | None = None  # For slider type


class ScoringConfig(BaseModel):
    """Configuration for how the final score is computed."""

    # Score formula using check names as variables, e.g.:
    # "Profit * 2 + Reputation + CustomersServed * 5 - CustomersLost * 10"
    formula: str | None = None

    # If no formula, use weighted average. Default weight is 1.0.
    weights: dict[str, float] = Field(default_factory=dict)

    # Normalization settings
    normalize: bool = False  # Normalize score to 0-1 range
    min_score: float = 0.0  # Expected minimum for normalization
    max_score: float = 100.0  # Expected maximum for normalization


class ModuleSpec(BaseModel):
    """Complete specification for an MDL module."""

    id: str
    description: str = ""
    variables: list[ModuleVariable] = Field(default_factory=list)
    agent_config: dict[str, Any] = Field(default_factory=dict)  # Override agent settings
    environment: EnvConfig
    steps: list[Step] = Field(default_factory=list)
    branches: dict[str, list[Step]] = Field(default_factory=dict)
    evaluation: list[EvaluationCheck] = Field(default_factory=list)
    scoring: ScoringConfig = Field(default_factory=ScoringConfig)  # Score computation config


class EvaluationResult(BaseModel):
    """Result of running evaluation checks."""

    checks: dict[str, Any] = Field(default_factory=dict)
    score: float = 0.0
    num_events: int = 0
    status: str = "ok"
sandboxy/datasets/__init__.py ADDED
@@ -0,0 +1,20 @@
"""Dataset support for multi-case benchmarking."""

from sandboxy.datasets.loader import Dataset, TestCase, load_dataset, load_multiple_datasets
from sandboxy.datasets.runner import (
    CaseResult,
    DatasetResult,
    run_dataset,
    run_dataset_parallel,
)

__all__ = [
    "Dataset",
    "TestCase",
    "load_dataset",
    "load_multiple_datasets",
    "CaseResult",
    "DatasetResult",
    "run_dataset",
    "run_dataset_parallel",
]
sandboxy/datasets/loader.py ADDED
@@ -0,0 +1,193 @@
"""Dataset loader for multi-case benchmarking."""

from __future__ import annotations

from dataclasses import dataclass, field
from itertools import product
from pathlib import Path
from typing import Any

import yaml


@dataclass
class TestCase:
    """Single test case from a dataset."""

    id: str
    expected: list[str] = field(default_factory=list)
    variables: dict[str, Any] = field(default_factory=dict)
    tool_responses: dict[str, Any] = field(default_factory=dict)
    tags: list[str] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)


@dataclass
class Dataset:
    """Collection of test cases."""

    id: str
    name: str = ""
    description: str = ""
    scenario_id: str | None = None
    cases: list[TestCase] = field(default_factory=list)
    source_path: Path | None = None


def load_dataset(path: Path) -> Dataset:
    """Load dataset from YAML file.

    Supports both manual case definitions and generators.

    Args:
        path: Path to the dataset YAML file

    Returns:
        Dataset with loaded/generated test cases

    """
    with open(path) as f:
        data = yaml.safe_load(f)

    dataset_id = data.get("id", path.stem)
    dataset = Dataset(
        id=dataset_id,
        name=data.get("name", dataset_id),
        description=data.get("description", ""),
        scenario_id=data.get("scenario_id"),
        source_path=path,
    )

    if "cases" in data:
        for case_data in data["cases"]:
            expected_raw = case_data.get("expected")
            if expected_raw is None:
                expected = []
            elif isinstance(expected_raw, list):
                expected = expected_raw
            else:
                expected = [expected_raw]

            case = TestCase(
                id=case_data.get("id", f"case_{len(dataset.cases)}"),
                expected=expected,
                variables=case_data.get("variables", {}),
                tool_responses=case_data.get("tool_responses", {}),
                tags=case_data.get("tags", []),
                metadata=case_data.get("metadata", {}),
            )
            dataset.cases.append(case)

    if "generator" in data:
        generated = _generate_cases(data["generator"])
        dataset.cases.extend(generated)

    return dataset


def load_multiple_datasets(paths: list[Path]) -> Dataset:
    """Load and merge multiple datasets."""
    if not paths:
        return Dataset(id="empty", name="Empty Dataset")

    merged = load_dataset(paths[0])

    for path in paths[1:]:
        ds = load_dataset(path)
        merged.cases.extend(ds.cases)
        merged.id = f"{merged.id}+{ds.id}"
        merged.name = f"{merged.name} + {ds.name}"

    return merged


def _generate_cases(config: dict[str, Any]) -> list[TestCase]:
    """Generate cases from dimension combinations."""
    dimensions = config.get("dimensions", {})
    rules = config.get("rules", [])
    tool_mapping = config.get("tool_mapping", {})

    if not dimensions:
        return []

    dim_names = list(dimensions.keys())
    dim_values = [dimensions[name] for name in dim_names]

    cases = []
    for i, combo in enumerate(product(*dim_values)):
        case_data = dict(zip(dim_names, combo, strict=True))
        expected = _find_expected(case_data, rules)
        tool_responses = _build_tool_responses(case_data, tool_mapping)

        cases.append(
            TestCase(
                id=f"gen_{i:04d}",
                expected=expected,
                variables=case_data.copy(),
                tool_responses=tool_responses,
            )
        )

    return cases


def _find_expected(case_data: dict[str, Any], rules: list[dict]) -> list[str]:
    """Find expected outcome(s) from rules."""
    for rule in rules:
        if "when" in rule:
            if _matches_rule(case_data, rule["when"]):
                expected = rule.get("expected")
                if expected is None:
                    return []
                return expected if isinstance(expected, list) else [expected]
        elif "otherwise" in rule or "expected" in rule:
            expected = rule.get("expected")
            if expected is None:
                return []
            return expected if isinstance(expected, list) else [expected]

    return []


def _matches_rule(data: dict[str, Any], conditions: dict[str, Any]) -> bool:
    """Check if case data matches rule conditions."""
    for key, condition in conditions.items():
        value = data.get(key)

        if isinstance(condition, dict):
            if "gte" in condition and value < condition["gte"]:
                return False
            if "lte" in condition and value > condition["lte"]:
                return False
            if "gt" in condition and value <= condition["gt"]:
                return False
            if "lt" in condition and value >= condition["lt"]:
                return False
            if "eq" in condition and value != condition["eq"]:
                return False
            if "ne" in condition and value == condition["ne"]:
                return False
            if "in" in condition and value not in condition["in"]:
                return False
        elif value != condition:
            return False

    return True


def _build_tool_responses(
    case_data: dict[str, Any], tool_mapping: dict[str, Any]
) -> dict[str, Any]:
    """Build tool response overrides from case data and mapping."""
    tool_responses: dict[str, Any] = {}

    for tool_name, mapping in tool_mapping.items():
        response: dict[str, Any] = {}
        for response_key, source_key in mapping.items():
            if isinstance(source_key, str) and source_key in case_data:
                response[response_key] = case_data[source_key]
            else:
                response[response_key] = source_key
        tool_responses[tool_name] = response

    return tool_responses