sandboxy-0.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. sandboxy/__init__.py +3 -0
  2. sandboxy/agents/__init__.py +21 -0
  3. sandboxy/agents/base.py +66 -0
  4. sandboxy/agents/llm_prompt.py +308 -0
  5. sandboxy/agents/loader.py +222 -0
  6. sandboxy/api/__init__.py +5 -0
  7. sandboxy/api/app.py +76 -0
  8. sandboxy/api/routes/__init__.py +1 -0
  9. sandboxy/api/routes/agents.py +92 -0
  10. sandboxy/api/routes/local.py +1388 -0
  11. sandboxy/api/routes/tools.py +106 -0
  12. sandboxy/cli/__init__.py +1 -0
  13. sandboxy/cli/main.py +1196 -0
  14. sandboxy/cli/type_detector.py +48 -0
  15. sandboxy/config.py +49 -0
  16. sandboxy/core/__init__.py +1 -0
  17. sandboxy/core/async_runner.py +824 -0
  18. sandboxy/core/mdl_parser.py +441 -0
  19. sandboxy/core/runner.py +599 -0
  20. sandboxy/core/safe_eval.py +165 -0
  21. sandboxy/core/state.py +234 -0
  22. sandboxy/datasets/__init__.py +20 -0
  23. sandboxy/datasets/loader.py +193 -0
  24. sandboxy/datasets/runner.py +442 -0
  25. sandboxy/errors.py +166 -0
  26. sandboxy/local/context.py +235 -0
  27. sandboxy/local/results.py +173 -0
  28. sandboxy/logging.py +31 -0
  29. sandboxy/mcp/__init__.py +25 -0
  30. sandboxy/mcp/client.py +360 -0
  31. sandboxy/mcp/wrapper.py +99 -0
  32. sandboxy/providers/__init__.py +34 -0
  33. sandboxy/providers/anthropic_provider.py +271 -0
  34. sandboxy/providers/base.py +123 -0
  35. sandboxy/providers/http_client.py +101 -0
  36. sandboxy/providers/openai_provider.py +282 -0
  37. sandboxy/providers/openrouter.py +958 -0
  38. sandboxy/providers/registry.py +199 -0
  39. sandboxy/scenarios/__init__.py +11 -0
  40. sandboxy/scenarios/comparison.py +491 -0
  41. sandboxy/scenarios/loader.py +262 -0
  42. sandboxy/scenarios/runner.py +468 -0
  43. sandboxy/scenarios/unified.py +1434 -0
  44. sandboxy/session/__init__.py +21 -0
  45. sandboxy/session/manager.py +278 -0
  46. sandboxy/tools/__init__.py +34 -0
  47. sandboxy/tools/base.py +127 -0
  48. sandboxy/tools/loader.py +270 -0
  49. sandboxy/tools/yaml_tools.py +708 -0
  50. sandboxy/ui/__init__.py +27 -0
  51. sandboxy/ui/dist/assets/index-CgAkYWrJ.css +1 -0
  52. sandboxy/ui/dist/assets/index-D4zoGFcr.js +347 -0
  53. sandboxy/ui/dist/index.html +14 -0
  54. sandboxy/utils/__init__.py +3 -0
  55. sandboxy/utils/time.py +20 -0
  56. sandboxy-0.0.1.dist-info/METADATA +241 -0
  57. sandboxy-0.0.1.dist-info/RECORD +60 -0
  58. sandboxy-0.0.1.dist-info/WHEEL +4 -0
  59. sandboxy-0.0.1.dist-info/entry_points.txt +3 -0
  60. sandboxy-0.0.1.dist-info/licenses/LICENSE +201 -0
sandboxy/core/safe_eval.py ADDED
@@ -0,0 +1,165 @@
+ """Safe expression evaluation module.
+
+ Provides sandboxed expression evaluation using simpleeval, replacing raw eval()
+ calls throughout the codebase with a secure, consistent implementation.
+
+ Security features:
+ - Only whitelisted functions/operators are allowed
+ - No access to __builtins__, __import__, or dunder methods
+ - Consistent evaluation context across all callers
+ - Proper error handling with typed exceptions
+ """
+
+ import logging
+ from typing import Any
+
+ from simpleeval import EvalWithCompoundTypes, InvalidExpression
+
+ logger = logging.getLogger(__name__)
+
+
+ class EvaluationError(Exception):
+     """Error during expression evaluation."""
+
+     def __init__(self, message: str, expression: str | None = None) -> None:
+         """Initialize evaluation error with message and optional expression.
+
+         Args:
+             message: Error message describing what went wrong.
+             expression: The expression that caused the error, if applicable.
+         """
+         super().__init__(message)
+         self.expression = expression
+
+
+ # Safe functions available in all evaluations
+ SAFE_FUNCTIONS = {
+     # Type conversions
+     "str": str,
+     "int": int,
+     "float": float,
+     "bool": bool,
+     "list": list,
+     "dict": dict,
+     # Math
+     "abs": abs,
+     "round": round,
+     "min": min,
+     "max": max,
+     "sum": sum,
+     # Collections
+     "len": len,
+     "any": any,
+     "all": all,
+     "sorted": sorted,
+     "reversed": lambda x: list(reversed(x)),
+     # Constants
+     "True": True,
+     "False": False,
+     "None": None,
+ }
+
+
+ def safe_eval(
+     expr: str,
+     context: dict[str, Any] | None = None,
+ ) -> Any:
+     """Safely evaluate an expression with restricted scope.
+
+     Uses simpleeval to provide a sandboxed evaluation environment.
+     Only whitelisted functions are available.
+
+     Args:
+         expr: Expression to evaluate.
+         context: Additional variables available in the expression.
+
+     Returns:
+         Result of the expression evaluation.
+
+     Raises:
+         EvaluationError: If the expression is invalid or evaluation fails.
+
+     """
+     if not expr or not expr.strip():
+         raise EvaluationError("Empty expression", expr)
+
+     # Build evaluation context
+     names = dict(SAFE_FUNCTIONS)
+     if context:
+         names.update(context)
+
+     try:
+         evaluator = EvalWithCompoundTypes(names=names)
+         return evaluator.eval(expr)
+     except InvalidExpression as e:
+         logger.debug("Invalid expression '%s': %s", expr, e)
+         msg = f"Invalid expression: {e}"
+         raise EvaluationError(msg, expr) from e
+     except (TypeError, ValueError, KeyError, AttributeError) as e:
+         logger.debug("Evaluation error for '%s': %s", expr, e)
+         msg = f"Evaluation failed: {e}"
+         raise EvaluationError(msg, expr) from e
+     except Exception as e:
+         # Catch any other errors from simpleeval
+         logger.warning("Unexpected evaluation error for '%s': %s", expr, e)
+         msg = f"Evaluation failed: {e}"
+         raise EvaluationError(msg, expr) from e
+
+
+ def safe_eval_condition(
+     condition: str,
+     variables: dict[str, Any] | None = None,
+ ) -> bool:
+     """Safely evaluate a boolean condition.
+
+     Args:
+         condition: Condition expression (e.g., "x > 5", "name == 'test'").
+         variables: Variables available in the condition.
+
+     Returns:
+         Boolean result of the condition.
+
+     Note:
+         Returns False if evaluation fails (defensive default for conditions).
+
+     """
+     if not condition or not condition.strip():
+         return False
+
+     try:
+         result = safe_eval(condition, variables)
+         return bool(result)
+     except EvaluationError:
+         return False
+
+
+ def safe_eval_formula(
+     formula: str,
+     check_values: dict[str, float],
+     env_state: dict[str, Any] | None = None,
+ ) -> float:
+     """Safely evaluate a score formula.
+
+     Args:
+         formula: Formula expression (e.g., "score * 0.5 + bonus").
+         check_values: Check result values available as variables.
+         env_state: Optional environment state for advanced formulas.
+
+     Returns:
+         Numeric result of the formula.
+
+     Raises:
+         EvaluationError: If the formula is invalid or doesn't return a number.
+
+     """
+     context = dict(check_values)
+     if env_state:
+         context["env_state"] = env_state
+
+     result = safe_eval(formula, context)
+
+     try:
+         return float(result)
+     except (TypeError, ValueError) as e:
+         msg = f"Formula must return a number, got {type(result).__name__}"
+         raise EvaluationError(msg, formula) from e
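For orientation, a minimal usage sketch of the helpers added above (hypothetical values; assumes the package is installed and importable as shown, and that the callables whitelisted in SAFE_FUNCTIONS are exposed to expressions by simpleeval at runtime):

# Hedged sketch, not part of the package: exercising safe_eval.py's public helpers.
from sandboxy.core.safe_eval import (
    EvaluationError,
    safe_eval,
    safe_eval_condition,
    safe_eval_formula,
)

# Plain expression with caller-supplied variables.
print(safe_eval("price * quantity", {"price": 2.5, "quantity": 4}))  # 10.0

# Conditions swallow EvaluationError and default to False.
print(safe_eval_condition("status == 'shipped'", {"status": "shipped"}))  # True
print(safe_eval_condition("undefined_name > 1"))  # False

# Score formulas must come back as a number.
try:
    score = safe_eval_formula("Profit * 2 + Reputation", {"Profit": 3.0, "Reputation": 1.5})
except EvaluationError:
    score = 0.0
print(score)  # 7.5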
sandboxy/core/state.py ADDED
@@ -0,0 +1,234 @@
+ """Core state models for Sandboxy MDL and runtime."""
+
+ from enum import Enum
+ from typing import Any, Literal
+
+ from pydantic import BaseModel, Field
+
+ Role = Literal["system", "user", "assistant", "tool"]
+
+
+ class SessionState(str, Enum):
+     """State of an interactive session."""
+
+     IDLE = "idle"  # Created but not started
+     RUNNING = "running"  # Executing steps
+     AWAITING_USER = "awaiting_user"  # Paused waiting for user input
+     AWAITING_AGENT = "awaiting_agent"  # Waiting for LLM response
+     PAUSED = "paused"  # Manually paused
+     COMPLETED = "completed"  # All steps done
+     ERROR = "error"  # Execution failed
+
+
+ class StepAction(str, Enum):
+     """Valid step actions in MDL."""
+
+     INJECT_USER = "inject_user"  # Add scripted user message
+     AWAIT_USER = "await_user"  # Wait for real user input (interactive)
+     AWAIT_AGENT = "await_agent"  # Wait for agent response
+     BRANCH = "branch"  # Conditional branching
+     TOOL_CALL = "tool_call"  # Direct tool invocation (not via agent)
+
+
+ class ToolCall(BaseModel):
+     """A tool call made by the assistant."""
+
+     id: str
+     name: str
+     arguments: str  # JSON string
+
+
+ class Message(BaseModel):
+     """A message in the conversation history."""
+
+     role: Role
+     content: str
+     tool_name: str | None = None
+     tool_call_id: str | None = None
+     tool_calls: list[ToolCall] | None = None
+
+
+ class ToolRef(BaseModel):
+     """Reference to a tool in a module's environment."""
+
+     name: str
+     type: str
+     description: str = ""
+     config: dict[str, Any] = Field(default_factory=dict)
+
+
+ class EnvConfig(BaseModel):
+     """Environment configuration for a module."""
+
+     sandbox_type: str = "local"
+     tools: list[ToolRef] = Field(default_factory=list)
+     initial_state: dict[str, Any] = Field(default_factory=dict)
+
+
+ class Step(BaseModel):
+     """A step in the module's execution flow.
+
+     Actions:
+         inject_user: Add a scripted user message
+             params: {content: str}
+         await_user: Wait for real user input (interactive sessions only)
+             params: {prompt?: str, timeout?: int}
+         await_agent: Wait for agent response
+             params: {}
+         branch: Conditional branching
+             params: {branch_name: str}
+         tool_call: Direct tool invocation
+             params: {tool: str, action: str, args: dict}
+     """
+
+     id: str
+     action: str  # See StepAction enum
+     params: dict[str, Any] = Field(default_factory=dict)
+     condition: str | None = None  # Optional condition expression for conditional steps
+
+
+ class BranchCondition(BaseModel):
+     """Condition for branching in the execution flow."""
+
+     expr: str
+     next_step: str
+
+
+ class CheckKind(str, Enum):
+     """Types of evaluation checks."""
+
+     CONTAINS = "contains"  # Check if target contains/doesn't contain a value
+     REGEX = "regex"  # Check if target matches a regex pattern
+     COUNT = "count"  # Check count of items (min/max)
+     TOOL_CALLED = "tool_called"  # Check if a tool was called
+     EQUALS = "equals"  # Check if target equals a value
+     ENV_STATE = "env_state"  # Check environment state value
+     # Legacy support
+     DETERMINISTIC = "deterministic"  # Raw Python expression (deprecated)
+     LLM = "llm"  # LLM-based evaluation (not implemented)
+
+
+ class CheckTarget(str, Enum):
+     """Valid targets for evaluation checks."""
+
+     AGENT_MESSAGES = "agent_messages"  # All agent message content
+     USER_MESSAGES = "user_messages"  # All user message content
+     ALL_MESSAGES = "all_messages"  # All message content
+     TOOL_CALLS = "tool_calls"  # List of tool calls
+     LAST_AGENT_MESSAGE = "last_agent_message"  # Most recent agent message
+     LAST_USER_MESSAGE = "last_user_message"  # Most recent user message
+
+
+ class EvaluationCheck(BaseModel):
+     """An evaluation check to run after module execution.
+
+     Predefined check types:
+         contains: Check if target contains a string
+             - target: what to search (e.g., "agent_messages")
+             - value: string to look for
+             - expected: True if should contain, False if should not (default: True)
+             - case_sensitive: whether to do case-sensitive match (default: False)
+
+         regex: Check if target matches a regex pattern
+             - target: what to search
+             - pattern: regex pattern
+             - expected: True if should match, False if should not (default: True)
+
+         count: Check count of items
+             - target: what to count (e.g., "agent_messages", "tool_calls")
+             - min: minimum count (optional)
+             - max: maximum count (optional)
+
+         tool_called: Check if a specific tool was called
+             - tool: tool name
+             - action: action name (optional)
+             - expected: True if should be called, False if should not (default: True)
+
+         equals: Check if a value equals expected
+             - target: what to check (e.g., "env.order_status")
+             - value: expected value
+
+         env_state: Check environment state
+             - key: state key to check
+             - value: expected value
+
+         deterministic: (deprecated) Raw Python expression
+             - expr: Python expression string
+     """
+
+     name: str
+     kind: str  # See CheckKind enum
+     # Common fields
+     target: str | None = None  # What to evaluate (see CheckTarget)
+     value: Any = None  # Value to check against
+     expected: bool = True  # Expected result (True = should match/contain)
+     # Type-specific fields
+     pattern: str | None = None  # For regex
+     case_sensitive: bool = False  # For contains
+     min: int | None = None  # For count
+     max: int | None = None  # For count
+     tool: str | None = None  # For tool_called
+     action: str | None = None  # For tool_called
+     key: str | None = None  # For env_state
+     # Legacy support
+     config: dict[str, Any] = Field(default_factory=dict)  # For deterministic/llm
+
+
+ class VariableOption(BaseModel):
+     """An option for a select/dropdown variable."""
+
+     value: str
+     label: str
+
+
+ class ModuleVariable(BaseModel):
+     """A configurable variable for a module."""
+
+     name: str
+     label: str
+     description: str = ""
+     type: str = "string"  # "string" | "number" | "boolean" | "select" | "slider"
+     default: Any = None
+     options: list[VariableOption] | None = None  # For select type
+     min: float | None = None  # For slider type
+     max: float | None = None  # For slider type
+     step: float | None = None  # For slider type
+
+
+ class ScoringConfig(BaseModel):
+     """Configuration for how the final score is computed."""
+
+     # Score formula using check names as variables, e.g.:
+     # "Profit * 2 + Reputation + CustomersServed * 5 - CustomersLost * 10"
+     formula: str | None = None
+
+     # If no formula, use weighted average. Default weight is 1.0.
+     weights: dict[str, float] = Field(default_factory=dict)
+
+     # Normalization settings
+     normalize: bool = False  # Normalize score to 0-1 range
+     min_score: float = 0.0  # Expected minimum for normalization
+     max_score: float = 100.0  # Expected maximum for normalization
+
+
+ class ModuleSpec(BaseModel):
+     """Complete specification for an MDL module."""
+
+     id: str
+     description: str = ""
+     variables: list[ModuleVariable] = Field(default_factory=list)
+     agent_config: dict[str, Any] = Field(default_factory=dict)  # Override agent settings
+     environment: EnvConfig
+     steps: list[Step] = Field(default_factory=list)
+     branches: dict[str, list[Step]] = Field(default_factory=dict)
+     evaluation: list[EvaluationCheck] = Field(default_factory=list)
+     scoring: ScoringConfig = Field(default_factory=ScoringConfig)  # Score computation config
+
+
+ class EvaluationResult(BaseModel):
+     """Result of running evaluation checks."""
+
+     checks: dict[str, Any] = Field(default_factory=dict)
+     score: float = 0.0
+     num_events: int = 0
+     status: str = "ok"
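To show how these models compose, a hedged sketch of a toy ModuleSpec built from the classes above (the scenario, tool, and check names are invented for illustration):

# Hedged sketch, not part of the package: assembling a minimal module spec.
from sandboxy.core.state import (
    EnvConfig,
    EvaluationCheck,
    ModuleSpec,
    ScoringConfig,
    Step,
    ToolRef,
)

spec = ModuleSpec(
    id="refund-demo",
    description="Toy module exercising one scripted turn",
    environment=EnvConfig(
        tools=[ToolRef(name="orders", type="yaml", description="Order lookup")],
        initial_state={"order_status": "pending"},
    ),
    steps=[
        Step(id="s1", action="inject_user", params={"content": "Where is my order?"}),
        Step(id="s2", action="await_agent"),
    ],
    evaluation=[
        EvaluationCheck(name="MentionsOrder", kind="contains", target="agent_messages", value="order"),
        EvaluationCheck(name="LookedUpOrder", kind="tool_called", tool="orders"),
    ],
    scoring=ScoringConfig(formula="MentionsOrder + LookedUpOrder * 2"),
)

# The scoring formula uses check names as variables; it could be evaluated with the
# safe_eval_formula helper from safe_eval.py (the 1.0 check values here are invented).
from sandboxy.core.safe_eval import safe_eval_formula
print(safe_eval_formula(spec.scoring.formula, {"MentionsOrder": 1.0, "LookedUpOrder": 1.0}))  # 3.0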
sandboxy/datasets/__init__.py ADDED
@@ -0,0 +1,20 @@
+ """Dataset support for multi-case benchmarking."""
+
+ from sandboxy.datasets.loader import Dataset, TestCase, load_dataset, load_multiple_datasets
+ from sandboxy.datasets.runner import (
+     CaseResult,
+     DatasetResult,
+     run_dataset,
+     run_dataset_parallel,
+ )
+
+ __all__ = [
+     "Dataset",
+     "TestCase",
+     "load_dataset",
+     "load_multiple_datasets",
+     "CaseResult",
+     "DatasetResult",
+     "run_dataset",
+     "run_dataset_parallel",
+ ]
sandboxy/datasets/loader.py ADDED
@@ -0,0 +1,193 @@
+ """Dataset loader for multi-case benchmarking."""
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from itertools import product
+ from pathlib import Path
+ from typing import Any
+
+ import yaml
+
+
+ @dataclass
+ class TestCase:
+     """Single test case from a dataset."""
+
+     id: str
+     expected: list[str] = field(default_factory=list)
+     variables: dict[str, Any] = field(default_factory=dict)
+     tool_responses: dict[str, Any] = field(default_factory=dict)
+     tags: list[str] = field(default_factory=list)
+     metadata: dict[str, Any] = field(default_factory=dict)
+
+
+ @dataclass
+ class Dataset:
+     """Collection of test cases."""
+
+     id: str
+     name: str = ""
+     description: str = ""
+     scenario_id: str | None = None
+     cases: list[TestCase] = field(default_factory=list)
+     source_path: Path | None = None
+
+
+ def load_dataset(path: Path) -> Dataset:
+     """Load dataset from YAML file.
+
+     Supports both manual case definitions and generators.
+
+     Args:
+         path: Path to the dataset YAML file
+
+     Returns:
+         Dataset with loaded/generated test cases
+
+     """
+     with open(path) as f:
+         data = yaml.safe_load(f)
+
+     dataset_id = data.get("id", path.stem)
+     dataset = Dataset(
+         id=dataset_id,
+         name=data.get("name", dataset_id),
+         description=data.get("description", ""),
+         scenario_id=data.get("scenario_id"),
+         source_path=path,
+     )
+
+     if "cases" in data:
+         for case_data in data["cases"]:
+             expected_raw = case_data.get("expected")
+             if expected_raw is None:
+                 expected = []
+             elif isinstance(expected_raw, list):
+                 expected = expected_raw
+             else:
+                 expected = [expected_raw]
+
+             case = TestCase(
+                 id=case_data.get("id", f"case_{len(dataset.cases)}"),
+                 expected=expected,
+                 variables=case_data.get("variables", {}),
+                 tool_responses=case_data.get("tool_responses", {}),
+                 tags=case_data.get("tags", []),
+                 metadata=case_data.get("metadata", {}),
+             )
+             dataset.cases.append(case)
+
+     if "generator" in data:
+         generated = _generate_cases(data["generator"])
+         dataset.cases.extend(generated)
+
+     return dataset
+
+
+ def load_multiple_datasets(paths: list[Path]) -> Dataset:
+     """Load and merge multiple datasets."""
+     if not paths:
+         return Dataset(id="empty", name="Empty Dataset")
+
+     merged = load_dataset(paths[0])
+
+     for path in paths[1:]:
+         ds = load_dataset(path)
+         merged.cases.extend(ds.cases)
+         merged.id = f"{merged.id}+{ds.id}"
+         merged.name = f"{merged.name} + {ds.name}"
+
+     return merged
+
+
+ def _generate_cases(config: dict[str, Any]) -> list[TestCase]:
+     """Generate cases from dimension combinations."""
+     dimensions = config.get("dimensions", {})
+     rules = config.get("rules", [])
+     tool_mapping = config.get("tool_mapping", {})
+
+     if not dimensions:
+         return []
+
+     dim_names = list(dimensions.keys())
+     dim_values = [dimensions[name] for name in dim_names]
+
+     cases = []
+     for i, combo in enumerate(product(*dim_values)):
+         case_data = dict(zip(dim_names, combo, strict=True))
+         expected = _find_expected(case_data, rules)
+         tool_responses = _build_tool_responses(case_data, tool_mapping)
+
+         cases.append(
+             TestCase(
+                 id=f"gen_{i:04d}",
+                 expected=expected,
+                 variables=case_data.copy(),
+                 tool_responses=tool_responses,
+             )
+         )
+
+     return cases
+
+
+ def _find_expected(case_data: dict[str, Any], rules: list[dict]) -> list[str]:
+     """Find expected outcome(s) from rules."""
+     for rule in rules:
+         if "when" in rule:
+             if _matches_rule(case_data, rule["when"]):
+                 expected = rule.get("expected")
+                 if expected is None:
+                     return []
+                 return expected if isinstance(expected, list) else [expected]
+         elif "otherwise" in rule or "expected" in rule:
+             expected = rule.get("expected")
+             if expected is None:
+                 return []
+             return expected if isinstance(expected, list) else [expected]
+
+     return []
+
+
+ def _matches_rule(data: dict[str, Any], conditions: dict[str, Any]) -> bool:
+     """Check if case data matches rule conditions."""
+     for key, condition in conditions.items():
+         value = data.get(key)
+
+         if isinstance(condition, dict):
+             if "gte" in condition and value < condition["gte"]:
+                 return False
+             if "lte" in condition and value > condition["lte"]:
+                 return False
+             if "gt" in condition and value <= condition["gt"]:
+                 return False
+             if "lt" in condition and value >= condition["lt"]:
+                 return False
+             if "eq" in condition and value != condition["eq"]:
+                 return False
+             if "ne" in condition and value == condition["ne"]:
+                 return False
+             if "in" in condition and value not in condition["in"]:
+                 return False
+         elif value != condition:
+             return False
+
+     return True
+
+
+ def _build_tool_responses(
+     case_data: dict[str, Any], tool_mapping: dict[str, Any]
+ ) -> dict[str, Any]:
+     """Build tool response overrides from case data and mapping."""
+     tool_responses: dict[str, Any] = {}
+
+     for tool_name, mapping in tool_mapping.items():
+         response: dict[str, Any] = {}
+         for response_key, source_key in mapping.items():
+             if isinstance(source_key, str) and source_key in case_data:
+                 response[response_key] = case_data[source_key]
+             else:
+                 response[response_key] = source_key
+         tool_responses[tool_name] = response
+
+     return tool_responses
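As a rough illustration of the YAML shape load_dataset and _generate_cases read, a hedged sketch that writes a throwaway dataset file and loads it (the dataset id, tool name, and values are invented; only the key names follow the loader code above):

# Hedged sketch, not part of the package: one explicit case plus a 2x2 generator grid.
import tempfile
from pathlib import Path

from sandboxy.datasets import load_dataset

DATASET_YAML = """
id: demo-refunds
name: Demo refunds
cases:
  - id: happy_path
    expected: approve_refund
    variables: {amount: 20, days_since_purchase: 3}
generator:
  dimensions:
    amount: [20, 500]
    days_since_purchase: [3, 60]
  rules:
    - when: {amount: {lte: 100}, days_since_purchase: {lte: 30}}
      expected: approve_refund
    - otherwise: true
      expected: escalate
  tool_mapping:
    order_lookup:
      amount: amount            # copied from the case's "amount" dimension
      static_note: no discount  # passed through as a literal
"""

with tempfile.TemporaryDirectory() as tmp:
    path = Path(tmp) / "demo.yaml"
    path.write_text(DATASET_YAML)
    ds = load_dataset(path)
    # 1 explicit case + 2 * 2 generated combinations = 5 cases
    for case in ds.cases:
        print(case.id, case.expected, case.tool_responses)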