sandboxy 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. sandboxy/__init__.py +3 -0
  2. sandboxy/agents/__init__.py +21 -0
  3. sandboxy/agents/base.py +66 -0
  4. sandboxy/agents/llm_prompt.py +308 -0
  5. sandboxy/agents/loader.py +222 -0
  6. sandboxy/api/__init__.py +5 -0
  7. sandboxy/api/app.py +76 -0
  8. sandboxy/api/routes/__init__.py +1 -0
  9. sandboxy/api/routes/agents.py +92 -0
  10. sandboxy/api/routes/local.py +1388 -0
  11. sandboxy/api/routes/tools.py +106 -0
  12. sandboxy/cli/__init__.py +1 -0
  13. sandboxy/cli/main.py +1196 -0
  14. sandboxy/cli/type_detector.py +48 -0
  15. sandboxy/config.py +49 -0
  16. sandboxy/core/__init__.py +1 -0
  17. sandboxy/core/async_runner.py +824 -0
  18. sandboxy/core/mdl_parser.py +441 -0
  19. sandboxy/core/runner.py +599 -0
  20. sandboxy/core/safe_eval.py +165 -0
  21. sandboxy/core/state.py +234 -0
  22. sandboxy/datasets/__init__.py +20 -0
  23. sandboxy/datasets/loader.py +193 -0
  24. sandboxy/datasets/runner.py +442 -0
  25. sandboxy/errors.py +166 -0
  26. sandboxy/local/context.py +235 -0
  27. sandboxy/local/results.py +173 -0
  28. sandboxy/logging.py +31 -0
  29. sandboxy/mcp/__init__.py +25 -0
  30. sandboxy/mcp/client.py +360 -0
  31. sandboxy/mcp/wrapper.py +99 -0
  32. sandboxy/providers/__init__.py +34 -0
  33. sandboxy/providers/anthropic_provider.py +271 -0
  34. sandboxy/providers/base.py +123 -0
  35. sandboxy/providers/http_client.py +101 -0
  36. sandboxy/providers/openai_provider.py +282 -0
  37. sandboxy/providers/openrouter.py +958 -0
  38. sandboxy/providers/registry.py +199 -0
  39. sandboxy/scenarios/__init__.py +11 -0
  40. sandboxy/scenarios/comparison.py +491 -0
  41. sandboxy/scenarios/loader.py +262 -0
  42. sandboxy/scenarios/runner.py +468 -0
  43. sandboxy/scenarios/unified.py +1434 -0
  44. sandboxy/session/__init__.py +21 -0
  45. sandboxy/session/manager.py +278 -0
  46. sandboxy/tools/__init__.py +34 -0
  47. sandboxy/tools/base.py +127 -0
  48. sandboxy/tools/loader.py +270 -0
  49. sandboxy/tools/yaml_tools.py +708 -0
  50. sandboxy/ui/__init__.py +27 -0
  51. sandboxy/ui/dist/assets/index-CgAkYWrJ.css +1 -0
  52. sandboxy/ui/dist/assets/index-D4zoGFcr.js +347 -0
  53. sandboxy/ui/dist/index.html +14 -0
  54. sandboxy/utils/__init__.py +3 -0
  55. sandboxy/utils/time.py +20 -0
  56. sandboxy-0.0.1.dist-info/METADATA +241 -0
  57. sandboxy-0.0.1.dist-info/RECORD +60 -0
  58. sandboxy-0.0.1.dist-info/WHEEL +4 -0
  59. sandboxy-0.0.1.dist-info/entry_points.txt +3 -0
  60. sandboxy-0.0.1.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,824 @@
1
+ """Async Runner - executes MDL modules with support for interactive sessions.
2
+
3
+ This runner is designed for use with WebSocket connections where execution
4
+ can be paused to await user input.
5
+ """
6
+
7
+ import asyncio
8
+ import json
9
+ import logging
10
+ import re
11
+ from collections.abc import AsyncGenerator
12
+ from typing import Any
13
+
14
+ from pydantic import BaseModel, Field
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+ from sandboxy.agents.base import Agent, AgentAction
19
+ from sandboxy.core.state import (
20
+ EvaluationResult,
21
+ Message,
22
+ ModuleSpec,
23
+ SessionState,
24
+ Step,
25
+ StepAction,
26
+ ToolCall,
27
+ )
28
+ from sandboxy.tools.base import Tool, ToolResult
29
+ from sandboxy.tools.loader import ToolLoader
30
+
31
+
32
class RunEvent(BaseModel):
    """Event emitted during module execution.

    ``AsyncRunner.run()`` yields these objects; the events produced by step
    handlers are also recorded in ``AsyncRunner.events``.
    """

    # Event discriminator. Values emitted in this module: "user", "agent",
    # "agent_stop", "tool_call", "tool_result", "branch", "awaiting_input",
    # "completed", "error".
    type: str
    # Event-specific data; a fresh empty dict is created when none is given.
    payload: dict[str, Any] = Field(default_factory=dict)
37
+
38
+
39
+ class AsyncRunner:
40
+ """Executes MDL modules asynchronously with support for interactive sessions.
41
+
42
+ This runner uses an async generator pattern to yield events and receive
43
+ user input for `await_user` steps.
44
+
45
+ Usage:
46
+ runner = AsyncRunner(module, agent)
47
+ async for event in runner.run():
48
+ if event.type == "awaiting_input":
49
+ # Get user input somehow
50
+ user_input = await get_user_input()
51
+ runner.provide_input(user_input)
52
+ else:
53
+ # Process other events
54
+ handle_event(event)
55
+ """
56
+
57
def __init__(self, module: ModuleSpec, agent: Agent) -> None:
    """Initialize async runner.

    Args:
        module: MDL module specification to execute.
        agent: Agent to run within the module.
    """
    import copy  # local import: only needed to snapshot the initial state

    self.module = module
    self.agent = agent
    self.events: list[RunEvent] = []
    self.history: list[Message] = []
    # Deep copy so that tools mutating nested values (e.g. env_state["orders"][...])
    # cannot leak their changes back into the shared ModuleSpec; a shallow
    # ``.copy()`` only protects the top-level keys.
    self.env_state: dict[str, Any] = copy.deepcopy(module.environment.initial_state)
    self.tools: dict[str, Tool] = ToolLoader.from_env_config(module.environment)

    # Session state
    self.state = SessionState.IDLE
    # Set while an await_user step is pending; resolved by provide_input().
    self._user_input_future: asyncio.Future[str] | None = None
    # Index of the step currently being executed within the active step list.
    self._step_index = 0
75
+
76
@property
def session_state(self) -> SessionState:
    """Read-only view of the runner's current ``SessionState``."""
    current = self.state
    return current
80
+
81
def provide_input(self, content: str) -> None:
    """Deliver the user's reply to a pending ``await_user`` step.

    Args:
        content: User's input text.

    Raises:
        RuntimeError: If no input is pending (there is no future, or the
            future has already been completed).
    """
    pending = self._user_input_future
    if pending is not None and not pending.done():
        pending.set_result(content)
        return
    raise RuntimeError("Not currently awaiting user input")
93
+
94
def inject_event(
    self, tool_name: str, event_type: str, args: dict[str, Any] | None = None
) -> dict[str, Any]:
    """Trigger a game event through a tool's ``trigger_event`` action.

    Intended for chaos injection: a frontend can fire events such as
    "heatwave" or "rush_hour" that modify the game state.

    Args:
        tool_name: Name of the tool to call (e.g., "stand" for lemonade stand).
        event_type: Type of event to trigger (e.g., "heatwave", "rush_hour").
        args: Optional extra arguments merged into the event arguments.

    Returns:
        The tool result data, or an empty dict when the tool returned none.

    Raises:
        ValueError: If the tool is not registered or the trigger fails.
    """
    if tool_name not in self.tools:
        raise ValueError(f"Tool not found: {tool_name}")

    # Extra args are merged last, so they may override the "event" key.
    event_args = {"event": event_type, **(args or {})}
    outcome = self.tools[tool_name].invoke("trigger_event", event_args, self.env_state)

    if outcome.success:
        return outcome.data or {}
    raise ValueError(f"Event trigger failed: {outcome.error}")
127
+
128
async def run(self) -> AsyncGenerator[RunEvent, None]:
    """Execute the module, yielding events as they occur.

    Execution starts at ``self._step_index`` and walks the active step list.
    A taken branch replaces the step list and restarts at index 0. After the
    last step the evaluation is run and a "completed" event is yielded.

    Any ``Exception`` raised while stepping or evaluating is logged with its
    traceback and reported as a single "error" event instead of propagating.

    Yields:
        RunEvent objects for each significant event during execution.
        When type is "awaiting_input", caller should get user input
        and call provide_input() before continuing iteration.
    """
    self.state = SessionState.RUNNING
    steps = self.module.steps

    try:
        while self._step_index < len(steps):
            step = steps[self._step_index]

            if step.action == StepAction.INJECT_USER.value:
                event = self._handle_inject_user(step)
                self.events.append(event)
                yield event

            elif step.action == StepAction.AWAIT_USER.value:
                # Yields awaiting_input, then the user message once provided
                async for event in self._handle_await_user(step):
                    self.events.append(event)
                    yield event

            elif step.action == StepAction.AWAIT_AGENT.value:
                self.state = SessionState.AWAITING_AGENT
                async for event in self._handle_await_agent(step):
                    self.events.append(event)
                    yield event
                self.state = SessionState.RUNNING

            elif step.action == StepAction.BRANCH.value:
                event, new_steps = self._handle_branch(step)
                if event:
                    self.events.append(event)
                    yield event
                if new_steps is not None:
                    # Jump into the branch: skip the index increment below
                    steps = new_steps
                    self._step_index = 0
                    continue

            elif step.action == StepAction.TOOL_CALL.value:
                async for event in self._handle_direct_tool_call(step):
                    self.events.append(event)
                    yield event

            # Steps with an unrecognized action fall through and are skipped.
            self._step_index += 1

        # Evaluation
        evaluation = self._evaluate()
        self.state = SessionState.COMPLETED

        yield RunEvent(
            type="completed",
            payload={
                "evaluation": evaluation.model_dump(),
                "num_events": len(self.events),
            },
        )

    except Exception as e:
        # The event only carries str(e), which can be empty and never has a
        # traceback, so record the full failure in the log as well.
        logger.exception("Module run failed at step index %d", self._step_index)
        self.state = SessionState.ERROR
        yield RunEvent(
            type="error",
            payload={"message": str(e)},
        )
196
+
197
def _handle_inject_user(self, step: Step) -> RunEvent:
    """Append the step's scripted user message to history and report it.

    The text comes from ``step.params["content"]`` (empty string if absent).
    """
    scripted_text = step.params.get("content", "")
    self.history.append(Message(role="user", content=scripted_text))

    payload = {"content": scripted_text, "step_id": step.id, "scripted": True}
    return RunEvent(type="user", payload=payload)
207
+
208
async def _handle_await_user(self, step: Step) -> AsyncGenerator[RunEvent, None]:
    """Handle await_user action - wait for real user input.

    Yields an "awaiting_input" event, waits until ``provide_input()``
    resolves the pending future (or the optional ``timeout`` step parameter
    elapses), then records the reply in history and yields a "user" event.

    On timeout the step's ``default`` parameter is used as the reply, or
    "[timeout - no input]" when no default is configured.
    """
    prompt = step.params.get("prompt", "")
    timeout = step.params.get("timeout")

    self.state = SessionState.AWAITING_USER

    # Create the future BEFORE announcing that input is needed. The generator
    # is suspended at the yield below while the consumer handles the event,
    # so a consumer calling provide_input() straight away must find a future.
    pending = asyncio.get_running_loop().create_future()
    self._user_input_future = pending

    # Yield event to signal we're waiting for input
    yield RunEvent(
        type="awaiting_input",
        payload={"prompt": prompt, "step_id": step.id, "timeout": timeout},
    )

    try:
        if timeout:
            content = await asyncio.wait_for(pending, timeout=timeout)
        else:
            content = await pending
    except asyncio.TimeoutError:
        # asyncio.TimeoutError only aliases the builtin TimeoutError from
        # Python 3.11; naming it explicitly also covers 3.10.
        content = step.params.get("default", "[timeout - no input]")
    finally:
        # Never leave a stale future behind, even on cancellation.
        self._user_input_future = None

    self.state = SessionState.RUNNING

    # Add user message to history
    msg = Message(role="user", content=content)
    self.history.append(msg)

    yield RunEvent(
        type="user",
        payload={"content": content, "step_id": step.id, "scripted": False},
    )
243
+
244
async def _handle_await_agent(
    self, step: Step, max_tool_calls: int = 10
) -> AsyncGenerator[RunEvent, None]:
    """Handle await_agent action - get agent response.

    The agent is stepped repeatedly: tool calls are executed and fed back
    until it produces a message ("agent" event) or stops ("agent_stop"
    event). If the agent stops right after using tools, it is nudged once
    per step with a hint message before the stop is accepted.

    Args:
        step: The await_agent step being executed.
        max_tool_calls: Upper bound on tool calls within this step. When it
            is reached the handler returns without a final agent event.

    Raises:
        ValueError: If the agent returns an action type other than
            "message", "tool_call" or "stop".
    """
    tool_call_count = 0
    # Local to this call: an instance-level flag would leak into later
    # await_agent steps whenever the retry ended with a normal message.
    nudged_after_tool = False

    while tool_call_count < max_tool_calls:
        # Build tool schemas for agent
        tool_schemas = self._get_tool_schemas()

        # Get agent action (this could be made async if agent supports it)
        action: AgentAction = self.agent.step(self.history, tool_schemas)

        if action.type == "message":
            msg = Message(role="assistant", content=action.content or "")
            self.history.append(msg)

            yield RunEvent(
                type="agent",
                payload={"content": msg.content, "step_id": step.id},
            )
            return  # Done with this await_agent step

        elif action.type == "tool_call":
            async for event in self._handle_tool_call(action, step):
                yield event
            tool_call_count += 1
            # Continue loop to let agent respond to tool result

        elif action.type == "stop":
            # Some models return empty content after tool calls - add a hint
            # and retry once so the agent answers based on the tool results.
            if tool_call_count > 0 and not nudged_after_tool:
                nudged_after_tool = True
                self.history.append(
                    Message(
                        role="user",
                        content="[System: Please respond to the customer based on the information you just retrieved.]",
                    )
                )
                continue  # Retry the loop

            yield RunEvent(
                type="agent_stop",
                payload={"step_id": step.id},
            )
            return

        else:
            # Without this guard an unexpected action type would loop forever,
            # because nothing above advances tool_call_count for it.
            raise ValueError(f"Unknown agent action type: {action.type}")

    logger.warning(
        "await_agent step %s hit the tool call limit (%d) without a final response",
        step.id,
        max_tool_calls,
    )
299
+
300
async def _handle_tool_call(
    self, action: AgentAction, step: Step
) -> AsyncGenerator[RunEvent, None]:
    """Execute a tool call requested by the agent.

    Yields a "tool_call" event, records the assistant's tool-call message,
    invokes the tool (if registered), yields a "tool_result" event, and
    finally appends the tool's reply to history.
    """
    name = action.tool_name or ""
    verb = action.tool_action or ""
    arguments = action.tool_args or {}

    # Prefer the model-supplied id; otherwise derive one from the call and
    # the number of events recorded so far.
    call_id = action.tool_call_id or f"call_{name}_{verb}_{len(self.events)}"

    yield RunEvent(
        type="tool_call",
        payload={"tool": name, "action": verb, "args": arguments, "step_id": step.id},
    )

    # Assistant turn announcing the call, using the "<tool>__<action>" name.
    requested_call = ToolCall(
        id=call_id,
        name=f"{name}__{verb}",
        arguments=json.dumps(arguments),
    )
    self.history.append(Message(role="assistant", content="", tool_calls=[requested_call]))

    missing_msg = f"Tool not found: {name}"
    result: ToolResult | None = None
    if name in self.tools:
        result = self.tools[name].invoke(verb, arguments, self.env_state)
        result_payload = result.model_dump()
    else:
        result_payload = {"success": False, "error": missing_msg}

    yield RunEvent(
        type="tool_result",
        payload={"tool": name, "action": verb, "result": result_payload},
    )

    # The history text is built only after the event has been yielded.
    if result is None:
        tool_text = missing_msg
    elif result.success:
        tool_text = json.dumps(result.data)
    else:
        tool_text = result.error or ""

    self.history.append(
        Message(role="tool", content=tool_text, tool_name=name, tool_call_id=call_id)
    )
378
+
379
async def _handle_direct_tool_call(self, step: Step) -> AsyncGenerator[RunEvent, None]:
    """Run a tool call scripted directly in the module (not chosen by the agent).

    Reads ``tool``, ``action`` and ``args`` from ``step.params``, yields a
    "tool_call" event flagged ``direct``, then a "tool_result" event. History
    is not modified.
    """
    params = step.params
    name = params.get("tool", "")
    verb = params.get("action", "")
    arguments = params.get("args", {})

    yield RunEvent(
        type="tool_call",
        payload={
            "tool": name,
            "action": verb,
            "args": arguments,
            "step_id": step.id,
            "direct": True,
        },
    )

    if name in self.tools:
        outcome = self.tools[name].invoke(verb, arguments, self.env_state).model_dump()
    else:
        outcome = {"success": False, "error": f"Tool not found: {name}"}

    yield RunEvent(
        type="tool_result",
        payload={"tool": name, "action": verb, "result": outcome},
    )
417
+
418
def _handle_branch(self, step: Step) -> tuple[RunEvent | None, list[Step] | None]:
    """Resolve a branch step.

    Returns:
        A "branch" event plus the step list of the named branch, or ``None``
        in place of the step list when the branch name is missing or unknown.
    """
    name = step.params.get("branch_name")
    announcement = RunEvent(type="branch", payload={"branch": name, "step_id": step.id})

    known = bool(name) and name in self.module.branches
    target_steps = self.module.branches[name] if known else None
    return announcement, target_steps
431
+
432
def _get_tool_schemas(self) -> list[dict[str, Any]]:
    """Describe every registered tool (name, description, actions) for the agent."""
    return [
        {
            "name": tool_name,
            "description": tool.description,
            "actions": tool.get_actions(),
        }
        for tool_name, tool in self.tools.items()
    ]
444
+
445
def _evaluate(self) -> EvaluationResult:
    """Run every configured check and fold the results into a final score."""
    # Results are keyed by check name; a later check with the same name
    # replaces an earlier one.
    outcomes: dict[str, Any] = {
        spec.name: self._run_check(spec) for spec in self.module.evaluation
    }

    # The scoring config decides how the outcomes are combined.
    final_score = self._compute_score(outcomes)

    return EvaluationResult(
        checks=outcomes,
        score=final_score,
        num_events=len(self.events),
        status="ok",
    )
463
+
464
+ def _compute_score(self, checks: dict[str, Any]) -> float:
465
+ """Compute final score based on scoring config.
466
+
467
+ Supports three modes:
468
+ 1. Formula: Use a Python expression with check names as variables
469
+ 2. Weighted average: Average checks with optional weights
470
+ 3. Default: Simple average of all numeric/boolean results
471
+ """
472
+ scoring = self.module.scoring
473
+
474
+ # Extract numeric values from checks for use in formulas
475
+ check_values: dict[str, float] = {}
476
+ for name, result in checks.items():
477
+ if isinstance(result, int | float):
478
+ check_values[name] = float(result)
479
+ elif isinstance(result, bool):
480
+ check_values[name] = 1.0 if result else 0.0
481
+ elif isinstance(result, dict):
482
+ if result.get("passed") is True:
483
+ check_values[name] = 1.0
484
+ elif result.get("passed") is False:
485
+ check_values[name] = 0.0
486
+ elif "value" in result and isinstance(result["value"], int | float):
487
+ check_values[name] = float(result["value"])
488
+
489
+ # Mode 1: Custom formula
490
+ if scoring.formula:
491
+ try:
492
+ score = self._eval_score_formula(scoring.formula, check_values)
493
+ except Exception:
494
+ # Fall back to weighted average on formula error
495
+ score = self._weighted_average(check_values, scoring.weights)
496
+ else:
497
+ # Mode 2/3: Weighted average (with optional weights)
498
+ score = self._weighted_average(check_values, scoring.weights)
499
+
500
+ # Normalize if requested
501
+ if scoring.normalize and scoring.max_score != scoring.min_score:
502
+ score = (score - scoring.min_score) / (scoring.max_score - scoring.min_score)
503
+ score = max(0.0, min(1.0, score)) # Clamp to 0-1
504
+
505
+ return score
506
+
507
def _eval_score_formula(self, formula: str, check_values: dict[str, float]) -> float:
    """Evaluate a score formula whose variables are the check names.

    Besides the check values, the formula may reference ``env_state`` and a
    small set of builtins (len, min, max, abs, sum, round, True/False/None).
    The result is converted to ``float``.

    SECURITY NOTE(review): ``eval`` with a trimmed ``__builtins__`` is not a
    sandbox - attribute access on any reachable object can still escape it.
    Only use with formulas from trusted module definitions.
    """
    allowed = {fn.__name__: fn for fn in (len, min, max, abs, sum, round)}
    allowed.update({"True": True, "False": False, "None": None})

    # Check values are merged last, so a check name may shadow "env_state".
    namespace: dict[str, Any] = {
        "__builtins__": allowed,
        "env_state": self.env_state,
        **check_values,
    }
    return float(eval(formula, namespace, {}))
527
+
528
def _weighted_average(self, values: dict[str, float], weights: dict[str, float]) -> float:
    """Return the weighted mean of ``values``.

    A check without an entry in ``weights`` counts with weight 1.0. The
    result is 0.0 when there are no values or the weights do not sum to a
    positive number.
    """
    if not values:
        return 0.0

    numerator = 0.0
    denominator = 0.0
    for check_name in values:
        w = weights.get(check_name, 1.0)
        numerator += values[check_name] * w
        denominator += w

    if denominator > 0:
        return numerator / denominator
    return 0.0
542
+
543
def _run_check(self, check: Any) -> dict[str, Any]:
    """Dispatch one evaluation check to the handler for its ``kind``.

    "llm" checks are reported as skipped, unknown kinds as errors, and any
    exception raised by a handler is converted into an error result.
    """
    kind = check.kind

    # (kind, handler method name); "deterministic" is the legacy
    # raw-Python-expression check.
    handlers = (
        ("contains", "_check_contains"),
        ("regex", "_check_regex"),
        ("count", "_check_count"),
        ("tool_called", "_check_tool_called"),
        ("equals", "_check_equals"),
        ("env_state", "_check_env_state"),
        ("deterministic", "_check_deterministic"),
    )

    try:
        for known_kind, method_name in handlers:
            if kind == known_kind:
                return getattr(self, method_name)(check)
        if kind == "llm":
            return {"status": "skipped", "reason": "LLM eval not implemented"}
        return {"status": "error", "error": f"Unknown check kind: {kind}"}
    except Exception as e:
        return {"status": "error", "error": str(e)}
568
+
569
def _get_target_text(self, target: str) -> str:
    """Return the message text selected by ``target``.

    "agent_messages", "user_messages" and "all_messages" give the matching
    message contents joined by single spaces; "last_agent_message" and
    "last_user_message" give the most recent matching content. Anything
    else, or no match, gives an empty string.
    """
    joined_role = {"agent_messages": "assistant", "user_messages": "user"}
    latest_role = {"last_agent_message": "assistant", "last_user_message": "user"}

    if target in joined_role:
        wanted = joined_role[target]
        return " ".join(m.content for m in self.history if m.role == wanted)
    if target == "all_messages":
        return " ".join(m.content for m in self.history)
    if target in latest_role:
        wanted = latest_role[target]
        newest_first = reversed(self.history)
        return next((m.content for m in newest_first if m.role == wanted), "")
    return ""
588
+
589
def _get_target_list(self, target: str) -> list[Any]:
    """Return the items selected by ``target`` (for counting checks).

    Message targets yield messages from history, "tool_calls" yields the
    recorded "tool_call" events, and any other target yields an empty list.
    """
    role_filter = {"agent_messages": "assistant", "user_messages": "user"}

    if target in role_filter:
        wanted = role_filter[target]
        return [m for m in self.history if m.role == wanted]
    if target == "all_messages":
        return list(self.history)
    if target == "tool_calls":
        return [ev for ev in self.events if ev.type == "tool_call"]
    return []
600
+
601
def _check_contains(self, check: Any) -> dict[str, Any]:
    """Check whether the target text contains ``check.value``.

    The target defaults to "agent_messages". The check passes when the
    substring's presence equals ``check.expected``; the comparison is
    lower-cased on both sides unless ``check.case_sensitive`` is set.
    """
    target = check.target or "agent_messages"
    expected = check.expected
    ignore_case = not check.case_sensitive

    needle = check.value or ""
    haystack = self._get_target_text(target)
    if ignore_case:
        haystack, needle = haystack.lower(), needle.lower()

    found = needle in haystack
    return {
        "passed": found == expected,
        "found": found,
        "expected": expected,
        # Report the value exactly as configured, not the lower-cased needle.
        "searched_for": check.value,
        "in": target,
    }
624
+
625
def _check_regex(self, check: Any) -> dict[str, Any]:
    """Check whether ``check.pattern`` is found anywhere in the target text.

    The target defaults to "agent_messages". Matching ignores case unless
    ``check.case_sensitive`` is set; the check passes when the match outcome
    equals ``check.expected``.
    """
    target = check.target or "agent_messages"
    pattern = check.pattern or ""
    expected = check.expected

    text = self._get_target_text(target)
    flags = 0 if check.case_sensitive else re.IGNORECASE
    matched = re.search(pattern, text, flags) is not None

    return {
        "passed": matched == expected,
        "matched": matched,
        "expected": expected,
        "pattern": pattern,
        "in": target,
    }
642
+
643
def _check_count(self, check: Any) -> dict[str, Any]:
    """Check that the number of target items lies within ``[min, max]``.

    Either bound may be ``None`` to leave that side open. The target
    defaults to "agent_messages".
    """
    target = check.target or "agent_messages"
    lower, upper = check.min, check.max

    count = len(self._get_target_list(target))

    too_few = lower is not None and count < lower
    too_many = upper is not None and count > upper

    return {
        "passed": not (too_few or too_many),
        "count": count,
        "min": lower,
        "max": upper,
        "target": target,
    }
665
+
666
def _check_tool_called(self, check: Any) -> dict[str, Any]:
    """Check whether a tool (optionally a specific action) was called.

    Scans the recorded "tool_call" events for ``check.tool``; when
    ``check.action`` is given the action must match too. The check passes
    when the outcome equals ``check.expected``.
    """
    tool_name = check.tool
    action_name = check.action
    expected = check.expected

    def matches(payload: dict[str, Any]) -> bool:
        # An unspecified action accepts any action of the tool.
        if payload.get("tool") != tool_name:
            return False
        return action_name is None or payload.get("action") == action_name

    called = any(matches(ev.payload) for ev in self.events if ev.type == "tool_call")

    return {
        "passed": called == expected,
        "called": called,
        "expected": expected,
        "tool": tool_name,
        "action": action_name,
    }
691
+
692
def _check_equals(self, check: Any) -> dict[str, Any]:
    """Check that a target equals ``check.value``.

    A target of the form ``env.<key>`` reads the top-level ``<key>`` from the
    environment state; any other target is resolved as message text.
    """
    env_prefix = "env."
    target = check.target or ""
    expected_value = check.value

    if target.startswith(env_prefix):
        actual_value = self.env_state.get(target[len(env_prefix):])
    else:
        actual_value = self._get_target_text(target)

    return {
        "passed": actual_value == expected_value,
        "actual": actual_value,
        "expected": expected_value,
        "target": target,
    }
712
+
713
def _get_nested_value(self, obj: Any, path: str) -> Any:
    """Walk ``obj`` along a dotted path (e.g., 'orders.ORD123.refunded').

    Dicts are traversed by key and other objects by attribute. ``None`` is
    returned as soon as a segment cannot be resolved.
    """
    node = obj
    for segment in path.split("."):
        if node is None:
            return None
        if isinstance(node, dict):
            node = node.get(segment)
            continue
        if not hasattr(node, segment):
            return None
        node = getattr(node, segment)
    return node
727
+
728
def _check_env_state(self, check: Any) -> dict[str, Any]:
    """Compare an environment state entry with ``check.value``.

    A key containing dots (e.g., "orders.ORD123.refunded") is resolved as a
    nested path; a plain key is looked up directly.
    """
    key = check.key or ""
    expected_value = check.value

    is_nested = "." in key
    actual_value = (
        self._get_nested_value(self.env_state, key) if is_nested else self.env_state.get(key)
    )

    return {
        "passed": actual_value == expected_value,
        "actual": actual_value,
        "expected": expected_value,
        "key": key,
    }
747
+
748
+ def _check_deterministic(self, check: Any) -> dict[str, Any]:
749
+ """Evaluate a deterministic check with Python expression and optional pass_if condition."""
750
+ expr = check.config.get("expr", "")
751
+ if not expr or expr == "TODO":
752
+ return {"status": "skipped", "reason": "No expression defined"}
753
+
754
+ context = {
755
+ "env_state": self.env_state,
756
+ "history": [msg.model_dump() for msg in self.history],
757
+ "events": [event.model_dump() for event in self.events],
758
+ }
759
+
760
+ try:
761
+ result = self._safe_eval(expr, context)
762
+
763
+ # Check for pass_if condition (e.g., ">=0", "<=5", ">=50")
764
+ pass_if = check.config.get("pass_if")
765
+ if pass_if and isinstance(result, int | float):
766
+ passed = self._evaluate_pass_condition(result, pass_if)
767
+ return {"passed": passed, "value": result, "condition": pass_if}
768
+ if isinstance(result, bool):
769
+ return {"passed": result}
770
+ # For numeric values without pass_if, just return the value (no pass/fail)
771
+ return {"value": result}
772
+ except Exception as e:
773
+ return {"status": "error", "error": str(e)}
774
+
775
def _evaluate_pass_condition(self, value: float, condition: str) -> bool:
    """Evaluate a pass_if condition like '>=0', '<=5', '>50'.

    The condition is an operator followed by a number. Supported operators
    are ``>=``, ``<=``, ``>``, ``<``, ``==`` (or ``=``) and ``!=`` (or
    ``<>``). Whitespace around the operator and an explicit ``+`` sign on the
    number are accepted.

    Args:
        value: The numeric result to test.
        condition: The condition string.

    Returns:
        The comparison result. A condition that cannot be parsed, or that
        uses an unknown operator, defaults to ``True`` (pass).

    Raises:
        ValueError: If the numeric part is malformed (e.g., "1.2.3").
    """
    # Leading whitespace and a "+" sign are tolerated on purpose: a parse
    # miss falls through to the permissive default below, which would
    # silently turn " >= 10" into an always-passing check.
    match = re.match(r"\s*([<>=!]+)\s*([-+]?[\d.]+)", condition)
    if not match:
        return True  # No valid condition, default to pass

    op, threshold_str = match.groups()
    threshold = float(threshold_str)

    outcomes = {
        ">=": value >= threshold,
        "<=": value <= threshold,
        ">": value > threshold,
        "<": value < threshold,
        "==": value == threshold,
        "=": value == threshold,
        "!=": value != threshold,
        "<>": value != threshold,
    }
    return outcomes.get(op, True)  # Unknown operator, default to pass
798
+
799
def _safe_eval(self, expr: str, context: dict[str, Any]) -> Any:
    """Evaluate ``expr`` with a reduced set of builtins (legacy support).

    Only basic constructors and aggregates are exposed as builtins; the
    entries of ``context`` are available as global names.

    SECURITY NOTE(review): despite the name, ``eval`` with a trimmed
    ``__builtins__`` is not a sandbox - attribute access on any reachable
    object can still escape it. Only use with expressions from trusted
    module definitions.
    """
    exposed = (
        len, str, int, float, bool, list, dict,
        sum, min, max, abs, round, any, all,
    )
    allowed: dict[str, Any] = {fn.__name__: fn for fn in exposed}
    allowed.update({"True": True, "False": False, "None": None})

    # Context entries are merged last, so they take precedence on a clash.
    scope = {"__builtins__": allowed, **context}
    return eval(expr, scope, {})