sandboxy-0.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. sandboxy/__init__.py +3 -0
  2. sandboxy/agents/__init__.py +21 -0
  3. sandboxy/agents/base.py +66 -0
  4. sandboxy/agents/llm_prompt.py +308 -0
  5. sandboxy/agents/loader.py +222 -0
  6. sandboxy/api/__init__.py +5 -0
  7. sandboxy/api/app.py +76 -0
  8. sandboxy/api/routes/__init__.py +1 -0
  9. sandboxy/api/routes/agents.py +92 -0
  10. sandboxy/api/routes/local.py +1388 -0
  11. sandboxy/api/routes/tools.py +106 -0
  12. sandboxy/cli/__init__.py +1 -0
  13. sandboxy/cli/main.py +1196 -0
  14. sandboxy/cli/type_detector.py +48 -0
  15. sandboxy/config.py +49 -0
  16. sandboxy/core/__init__.py +1 -0
  17. sandboxy/core/async_runner.py +824 -0
  18. sandboxy/core/mdl_parser.py +441 -0
  19. sandboxy/core/runner.py +599 -0
  20. sandboxy/core/safe_eval.py +165 -0
  21. sandboxy/core/state.py +234 -0
  22. sandboxy/datasets/__init__.py +20 -0
  23. sandboxy/datasets/loader.py +193 -0
  24. sandboxy/datasets/runner.py +442 -0
  25. sandboxy/errors.py +166 -0
  26. sandboxy/local/context.py +235 -0
  27. sandboxy/local/results.py +173 -0
  28. sandboxy/logging.py +31 -0
  29. sandboxy/mcp/__init__.py +25 -0
  30. sandboxy/mcp/client.py +360 -0
  31. sandboxy/mcp/wrapper.py +99 -0
  32. sandboxy/providers/__init__.py +34 -0
  33. sandboxy/providers/anthropic_provider.py +271 -0
  34. sandboxy/providers/base.py +123 -0
  35. sandboxy/providers/http_client.py +101 -0
  36. sandboxy/providers/openai_provider.py +282 -0
  37. sandboxy/providers/openrouter.py +958 -0
  38. sandboxy/providers/registry.py +199 -0
  39. sandboxy/scenarios/__init__.py +11 -0
  40. sandboxy/scenarios/comparison.py +491 -0
  41. sandboxy/scenarios/loader.py +262 -0
  42. sandboxy/scenarios/runner.py +468 -0
  43. sandboxy/scenarios/unified.py +1434 -0
  44. sandboxy/session/__init__.py +21 -0
  45. sandboxy/session/manager.py +278 -0
  46. sandboxy/tools/__init__.py +34 -0
  47. sandboxy/tools/base.py +127 -0
  48. sandboxy/tools/loader.py +270 -0
  49. sandboxy/tools/yaml_tools.py +708 -0
  50. sandboxy/ui/__init__.py +27 -0
  51. sandboxy/ui/dist/assets/index-CgAkYWrJ.css +1 -0
  52. sandboxy/ui/dist/assets/index-D4zoGFcr.js +347 -0
  53. sandboxy/ui/dist/index.html +14 -0
  54. sandboxy/utils/__init__.py +3 -0
  55. sandboxy/utils/time.py +20 -0
  56. sandboxy-0.0.1.dist-info/METADATA +241 -0
  57. sandboxy-0.0.1.dist-info/RECORD +60 -0
  58. sandboxy-0.0.1.dist-info/WHEEL +4 -0
  59. sandboxy-0.0.1.dist-info/entry_points.txt +3 -0
  60. sandboxy-0.0.1.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,1434 @@
1
+ """Unified scenario runner - handles all scenario variations based on YAML structure.
2
+
3
+ This module provides a single runner that can handle:
4
+ - Single-turn prompts
5
+ - Multi-turn conversations with tools
6
+ - Goal-based and judge-based evaluation
7
+ - Multi-model comparison with statistical runs
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import asyncio
13
+ import json
14
+ import logging
15
+ import time
16
+ import uuid
17
+ from dataclasses import dataclass, field
18
+ from datetime import datetime
19
+ from pathlib import Path
20
+ from typing import Any
21
+
22
+ import yaml
23
+ from pydantic import BaseModel, Field
24
+
25
+ from sandboxy.providers import get_registry
26
+ from sandboxy.tools.loader import get_tool_dirs
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
+ def _get_style_prompt(style: str) -> str:
32
+ """Get a simple style prompt modifier."""
33
+ styles = {
34
+ "brief": "Keep your response brief and to the point.",
35
+ "detailed": "Provide a detailed, comprehensive response.",
36
+ "technical": "Be technical and precise in your response.",
37
+ "casual": "Respond in a casual, friendly manner.",
38
+ }
39
+ return styles.get(style, f"Respond in a {style} manner.")
40
+
41
+
42
+ def generate_tool_call_id() -> str:
43
+ """Generate a tool call ID compatible with all providers.
44
+
45
+ Some providers (e.g., Mistral) require IDs to be exactly 9 alphanumeric characters.
46
+ """
47
+ # Use UUID and take first 9 alphanumeric characters
48
+ raw = uuid.uuid4().hex[:9] # hex is already alphanumeric (0-9, a-f)
49
+ return raw
50
+
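For a quick sense of the ID shape this helper produces (illustrative only, not part of the packaged module):

call_id = generate_tool_call_id()
len(call_id)       # 9
call_id.isalnum()  # True -- hex only, e.g. "3f9a1c2b7" (example value, not deterministic)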
51
+
52
+ # =============================================================================
53
+ # Data Models
54
+ # =============================================================================
55
+
56
+
57
+ class JudgeSpec(BaseModel):
58
+ """Specification for a judge (LLM or rule-based)."""
59
+
60
+ type: str = "llm" # llm, contains, regex, exact, length, consensus, computed
61
+ model: str | None = None # For LLM judge
62
+ rubric: str = "" # Scoring rubric for LLM judge
63
+ pattern: str | None = None # For contains, regex, exact
64
+ case_sensitive: bool = False
65
+ min_length: int | None = None # For length judge
66
+ max_length: int | None = None
67
+ voters: list[str] = Field(default_factory=list) # For consensus judge
68
+ helper: str | None = None # For computed judge
69
+ pass_threshold: float = 0.5
70
+
71
+
72
+ class GoalSpec(BaseModel):
73
+ """Specification for a goal (rule-based evaluation)."""
74
+
75
+ id: str
76
+ name: str = ""
77
+ description: str = ""
78
+ points: int = 0
79
+ detection: dict[str, Any] = Field(default_factory=dict)
80
+ outcome: bool = False # Mutually exclusive outcome goal (for dataset benchmarking)
81
+
82
+
83
+ class StepSpec(BaseModel):
84
+ """Specification for a conversation step."""
85
+
86
+ id: str
87
+ action: str # inject_user, await_user, await_agent, branch
88
+ params: dict[str, Any] = Field(default_factory=dict)
89
+
90
+
91
+ class EvaluationSpec(BaseModel):
92
+ """Evaluation configuration combining goals and judge."""
93
+
94
+ goals: list[GoalSpec] = Field(default_factory=list)
95
+ judge: JudgeSpec | None = None
96
+ max_score: float | None = None
97
+ formula: str | None = None
98
+
99
+
100
+ class McpServerSpec(BaseModel):
101
+ """MCP server connection specification."""
102
+
103
+ name: str
104
+ command: str | None = None
105
+ args: list[str] = Field(default_factory=list)
106
+ env: dict[str, str] = Field(default_factory=dict)
107
+ url: str | None = None
108
+ headers: dict[str, str] = Field(default_factory=dict)
109
+ transport: str = "auto"
110
+
111
+
112
+ class VariableSpec(BaseModel):
113
+ """Variable specification for scenario parameters."""
114
+
115
+ name: str
116
+ label: str = ""
117
+ type: str = "string" # string, number, boolean, select, slider
118
+ default: Any = None
119
+ options: list[str] = Field(default_factory=list)
120
+ min: float | None = None
121
+ max: float | None = None
122
+ step: float | None = None
123
+
124
+
125
+ class UnifiedScenarioSpec(BaseModel):
126
+ """Unified scenario specification.
127
+
128
+ The structure determines behavior:
129
+ - prompt only → single-turn
130
+ - steps → multi-turn
131
+ - tools/tools_from → enable tool use
132
+ - evaluation.goals → goal-based scoring
133
+ - evaluation.judge → LLM/rule-based judging
134
+ - style → response style constraint
135
+ """
136
+
137
+ # Metadata
138
+ id: str
139
+ name: str = ""
140
+ description: str = ""
141
+ category: str = ""
142
+ tags: list[str] = Field(default_factory=list)
143
+ variables: list[VariableSpec] = Field(default_factory=list)
144
+
145
+ # Agent configuration
146
+ system_prompt: str = ""
147
+
148
+ # Interaction (one of these)
149
+ prompt: str | None = None # Simple single-turn
150
+ steps: list[StepSpec] = Field(default_factory=list) # Multi-turn
151
+
152
+ # Tools
153
+ tools: dict[str, Any] = Field(default_factory=dict) # Inline definitions
154
+ tools_from: list[str] = Field(default_factory=list) # Library imports
155
+ mcp_servers: list[McpServerSpec] = Field(default_factory=list)
156
+
157
+ # State
158
+ initial_state: dict[str, Any] = Field(default_factory=dict)
159
+
160
+ # Evaluation
161
+ evaluation: EvaluationSpec = Field(default_factory=EvaluationSpec)
162
+
163
+ # Style (for blitz-like runs)
164
+ style: str | None = None
165
+
166
+ # Events (for chaos injection)
167
+ events: dict[str, Any] = Field(default_factory=dict)
168
+
169
+ def has_steps(self) -> bool:
170
+ """Check if this is a multi-turn scenario."""
171
+ return len(self.steps) > 0
172
+
173
+ def has_tools(self) -> bool:
174
+ """Check if this scenario uses tools."""
175
+ return bool(self.tools) or bool(self.tools_from) or bool(self.mcp_servers)
176
+
177
+ def has_evaluation(self) -> bool:
178
+ """Check if this scenario has evaluation configured."""
179
+ return bool(self.evaluation.goals) or self.evaluation.judge is not None
180
+
181
+ def has_goals(self) -> bool:
182
+ """Check if this scenario has goal-based evaluation."""
183
+ return bool(self.evaluation.goals)
184
+
185
+ def has_judge(self) -> bool:
186
+ """Check if this scenario has a judge configured."""
187
+ return self.evaluation.judge is not None
188
+
189
+
190
+ # =============================================================================
191
+ # Result Models
192
+ # =============================================================================
193
+
194
+
195
+ @dataclass
196
+ class Message:
197
+ """A message in conversation history."""
198
+
199
+ role: str # system, user, assistant, tool
200
+ content: str
201
+ tool_name: str | None = None
202
+ tool_call_id: str | None = None
203
+ tool_calls: list[dict[str, Any]] | None = None
204
+
205
+
206
+ @dataclass
207
+ class ToolCallRecord:
208
+ """Record of a tool call."""
209
+
210
+ tool: str
211
+ action: str
212
+ args: dict[str, Any]
213
+ result: Any = None
214
+ success: bool = True
215
+ error: str | None = None
216
+
217
+
218
+ @dataclass
219
+ class GoalResult:
220
+ """Result of evaluating a single goal."""
221
+
222
+ id: str
223
+ name: str
224
+ achieved: bool
225
+ points: int
226
+ reason: str = ""
227
+
228
+
229
+ @dataclass
230
+ class JudgeResult:
231
+ """Result from judge evaluation."""
232
+
233
+ score: float
234
+ passed: bool
235
+ reasoning: str
236
+ judge_type: str
237
+
238
+
239
+ @dataclass
240
+ class EvaluationResult:
241
+ """Combined evaluation result."""
242
+
243
+ goals: list[GoalResult] = field(default_factory=list)
244
+ judge: JudgeResult | None = None
245
+ total_score: float = 0.0
246
+ max_score: float = 0.0
247
+ percentage: float = 0.0
248
+
249
+ def to_dict(self) -> dict[str, Any]:
250
+ """Convert to dictionary."""
251
+ return {
252
+ "goals": [
253
+ {
254
+ "id": g.id,
255
+ "name": g.name,
256
+ "achieved": g.achieved,
257
+ "points": g.points,
258
+ "reason": g.reason,
259
+ }
260
+ for g in self.goals
261
+ ],
262
+ "judge": {
263
+ "score": self.judge.score,
264
+ "passed": self.judge.passed,
265
+ "reasoning": self.judge.reasoning,
266
+ "judge_type": self.judge.judge_type,
267
+ }
268
+ if self.judge
269
+ else None,
270
+ "total_score": self.total_score,
271
+ "max_score": self.max_score,
272
+ "percentage": self.percentage,
273
+ }
274
+
275
+
276
+ @dataclass
277
+ class RunResult:
278
+ """Result of running a single scenario with a single model."""
279
+
280
+ id: str
281
+ scenario_id: str
282
+ model: str
283
+ prompt: str | None = None
284
+ response: str = ""
285
+ history: list[Message] = field(default_factory=list)
286
+ tool_calls: list[ToolCallRecord] = field(default_factory=list)
287
+ final_state: dict[str, Any] = field(default_factory=dict)
288
+ evaluation: EvaluationResult | None = None
289
+ latency_ms: int = 0
290
+ input_tokens: int = 0
291
+ output_tokens: int = 0
292
+ cost_usd: float | None = None
293
+ error: str | None = None
294
+ created_at: datetime = field(default_factory=datetime.now)
295
+
296
+ def to_dict(self) -> dict[str, Any]:
297
+ """Convert to dictionary."""
298
+ return {
299
+ "id": self.id,
300
+ "scenario_id": self.scenario_id,
301
+ "model": self.model,
302
+ "prompt": self.prompt,
303
+ "response": self.response,
304
+ "history": [
305
+ {"role": m.role, "content": m.content, "tool_name": m.tool_name}
306
+ for m in self.history
307
+ ],
308
+ "tool_calls": [
309
+ {
310
+ "tool": tc.tool,
311
+ "action": tc.action,
312
+ "args": tc.args,
313
+ "result": tc.result,
314
+ "success": tc.success,
315
+ "error": tc.error,
316
+ }
317
+ for tc in self.tool_calls
318
+ ],
319
+ "final_state": self.final_state,
320
+ "evaluation": self.evaluation.to_dict() if self.evaluation else None,
321
+ "latency_ms": self.latency_ms,
322
+ "input_tokens": self.input_tokens,
323
+ "output_tokens": self.output_tokens,
324
+ "cost_usd": self.cost_usd,
325
+ "error": self.error,
326
+ "created_at": self.created_at.isoformat(),
327
+ }
328
+
329
+ def to_json(self, indent: int | None = 2) -> str:
330
+ """Serialize to JSON string."""
331
+ return json.dumps(self.to_dict(), indent=indent)
332
+
333
+ def pretty(self) -> str:
334
+ """Format for human-readable display."""
335
+ lines = [
336
+ f"Scenario: {self.scenario_id}",
337
+ f"Model: {self.model}",
338
+ f"Latency: {self.latency_ms}ms",
339
+ ]
340
+
341
+ if self.prompt:
342
+ lines.append(f"Prompt: {self.prompt[:100]}{'...' if len(self.prompt) > 100 else ''}")
343
+
344
+ if self.response:
345
+ resp = self.response[:200] + "..." if len(self.response) > 200 else self.response
346
+ lines.append(f"Response: {resp}")
347
+
348
+ if self.tool_calls:
349
+ lines.append(f"Tool Calls: {len(self.tool_calls)}")
350
+
351
+ if self.evaluation:
352
+ lines.append("")
353
+ if self.evaluation.goals:
354
+ achieved = sum(1 for g in self.evaluation.goals if g.achieved)
355
+ lines.append(f"Goals: {achieved}/{len(self.evaluation.goals)}")
356
+ if self.evaluation.judge:
357
+ lines.append(f"Judge Score: {self.evaluation.judge.score:.2f}")
358
+ lines.append(
359
+ f"Total Score: {self.evaluation.total_score:.1f}/{self.evaluation.max_score:.1f}"
360
+ )
361
+ lines.append(f"Percentage: {self.evaluation.percentage:.1f}%")
362
+
363
+ if self.error:
364
+ lines.append(f"Error: {self.error}")
365
+
366
+ return "\n".join(lines)
367
+
368
+
369
+ # =============================================================================
370
+ # Unified Runner
371
+ # =============================================================================
372
+
373
+
374
+ class UnifiedRunner:
375
+ """Single runner that handles all scenario variations.
376
+
377
+ Example:
378
+ runner = UnifiedRunner()
379
+
380
+ # Single model run
381
+ result = await runner.run(scenario, model="gpt-4o")
382
+
383
+ # Multi-model comparison
384
+ comparison = await runner.run_comparison(
385
+ scenario,
386
+ models=["gpt-4o", "claude-3.5-sonnet"],
387
+ runs_per_model=3,
388
+ )
389
+
390
+ """
391
+
392
+ def __init__(self, tool_dirs: list[Path] | None = None) -> None:
393
+ """Initialize the runner.
394
+
395
+ Args:
396
+ tool_dirs: Directories to search for tool libraries
397
+
398
+ """
399
+ self.tool_dirs = tool_dirs or get_tool_dirs()
400
+ self._registry = None
401
+
402
+ @property
403
+ def registry(self) -> Any:
404
+ """Get or create the provider registry."""
405
+ if self._registry is None:
406
+ self._registry = get_registry()
407
+ return self._registry
408
+
409
+ def _calculate_cost(self, model: str, input_tokens: int, output_tokens: int) -> float | None:
410
+ """Calculate cost in USD based on model pricing.
411
+
412
+ Uses pricing data from OpenRouter models registry.
413
+ """
414
+ try:
415
+ from sandboxy.providers.openrouter import OPENROUTER_MODELS
416
+
417
+ model_info = OPENROUTER_MODELS.get(model)
418
+ if not model_info or not model_info.input_cost_per_million:
419
+ return None
420
+
421
+ input_cost = (input_tokens / 1_000_000) * model_info.input_cost_per_million
422
+ output_cost = (output_tokens / 1_000_000) * model_info.output_cost_per_million
423
+ return round(input_cost + output_cost, 6)
424
+ except ImportError:
425
+ return None
426
+
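A worked example of the pricing arithmetic above, using hypothetical per-million rates (real values come from the OPENROUTER_MODELS registry):

# Assume $2.50 per million input tokens and $10.00 per million output tokens.
# For 1_200 input tokens and 350 output tokens:
#   input_cost  = (1_200 / 1_000_000) * 2.50  = 0.003
#   output_cost = (  350 / 1_000_000) * 10.00 = 0.0035
#   total       = round(0.003 + 0.0035, 6)    = 0.0065  # USD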
427
+ async def run(
428
+ self,
429
+ scenario: UnifiedScenarioSpec,
430
+ model: str,
431
+ variables: dict[str, Any] | None = None,
432
+ max_turns: int = 20,
433
+ max_tokens: int = 1024,
434
+ temperature: float = 0.7,
435
+ tool_overrides: dict[str, Any] | None = None,
436
+ expected_outcome: str | None = None,
437
+ ) -> RunResult:
438
+ """Run a scenario with a single model.
439
+
440
+ Args:
441
+ scenario: The scenario specification
442
+ model: Model ID to use
443
+ variables: Variable substitutions
444
+ max_turns: Maximum conversation turns
445
+ max_tokens: Maximum tokens per response
446
+ temperature: Sampling temperature
447
+ tool_overrides: Optional dict mapping "tool.action" to override response data.
448
+ Used by dataset benchmarking to inject test case data.
449
+ expected_outcome: Optional expected outcome goal ID for dataset benchmarking.
450
+ When set, only this outcome goal is evaluated (others skipped).
451
+
452
+ Returns:
453
+ RunResult with response and evaluation
454
+
455
+ """
456
+ start_time = time.perf_counter()
457
+ run_id = str(uuid.uuid4())
458
+
459
+ # Apply variable substitutions
460
+ scenario = self._apply_variables(scenario, variables or {})
461
+
462
+ try:
463
+ if scenario.has_steps():
464
+ result = await self._run_multi_turn(
465
+ scenario, model, max_turns, max_tokens, temperature, tool_overrides
466
+ )
467
+ else:
468
+ result = await self._run_single_turn(scenario, model, max_tokens, temperature)
469
+
470
+ # Run evaluation if configured
471
+ if scenario.has_evaluation():
472
+ result.evaluation = await self._evaluate(result, scenario, expected_outcome)
473
+
474
+ result.id = run_id
475
+ result.latency_ms = int((time.perf_counter() - start_time) * 1000)
476
+ return result
477
+
478
+ except Exception as e:
479
+ logger.exception(f"Run failed for {model}: {e}")
480
+ return RunResult(
481
+ id=run_id,
482
+ scenario_id=scenario.id,
483
+ model=model,
484
+ error=str(e),
485
+ latency_ms=int((time.perf_counter() - start_time) * 1000),
486
+ )
487
+
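A minimal usage sketch for the public entry point (illustrative only; the model ID is an assumption and a provider API key is expected to be configured for the registry):

import asyncio

async def main() -> None:
    runner = UnifiedRunner()
    scenario = UnifiedScenarioSpec(id="demo", prompt="Explain what a unit test is in one sentence.")
    result = await runner.run(scenario, model="gpt-4o-mini", max_tokens=256)  # model choice is illustrative
    print(result.pretty())

asyncio.run(main())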
488
+ async def _run_single_turn(
489
+ self,
490
+ scenario: UnifiedScenarioSpec,
491
+ model: str,
492
+ max_tokens: int,
493
+ temperature: float,
494
+ ) -> RunResult:
495
+ """Execute a single-turn prompt."""
496
+ prompt = scenario.prompt or ""
497
+
498
+ # Build system prompt with optional style
499
+ system_prompt = scenario.system_prompt or ""
500
+ if scenario.style:
501
+ style_instruction = _get_style_prompt(scenario.style)
502
+ system_prompt = f"{system_prompt}\n\n{style_instruction}".strip()
503
+
504
+ # Build messages
505
+ messages = []
506
+ if system_prompt:
507
+ messages.append({"role": "system", "content": system_prompt})
508
+ messages.append({"role": "user", "content": prompt})
509
+
510
+ # Call model
511
+ provider = self.registry.get_provider_for_model(model)
512
+ response = await provider.complete(
513
+ model=model,
514
+ messages=messages,
515
+ max_tokens=max_tokens,
516
+ temperature=temperature,
517
+ )
518
+
519
+ return RunResult(
520
+ id="", # Set by caller
521
+ scenario_id=scenario.id,
522
+ model=model,
523
+ prompt=prompt,
524
+ response=response.content,
525
+ history=[
526
+ Message(role="user", content=prompt),
527
+ Message(role="assistant", content=response.content),
528
+ ],
529
+ input_tokens=response.input_tokens,
530
+ output_tokens=response.output_tokens,
531
+ cost_usd=response.cost_usd,
532
+ )
533
+
534
+ async def _run_multi_turn(
535
+ self,
536
+ scenario: UnifiedScenarioSpec,
537
+ model: str,
538
+ max_turns: int,
539
+ max_tokens: int,
540
+ temperature: float,
541
+ tool_overrides: dict[str, Any] | None = None,
542
+ ) -> RunResult:
543
+ """Execute a multi-turn scenario with tools."""
544
+ from sandboxy.agents.base import AgentAction
545
+ from sandboxy.agents.loader import create_agent_from_model
546
+ from sandboxy.core.state import Message as CoreMessage
547
+ from sandboxy.core.state import ToolCall
548
+ from sandboxy.tools.yaml_tools import load_scenario_tools
549
+
550
+ # Load tools with optional overrides for dataset benchmarking
551
+ scenario_data = {
552
+ "tools_from": scenario.tools_from,
553
+ "tools": scenario.tools,
554
+ }
555
+ tools = load_scenario_tools(scenario_data, self.tool_dirs, tool_overrides)
556
+
557
+ # Load MCP tools if configured
558
+ mcp_manager = None
559
+ if scenario.mcp_servers:
560
+ try:
561
+ from sandboxy.mcp.client import McpManager, McpServerConfig
562
+
563
+ mcp_manager = McpManager()
564
+ configs = [
565
+ McpServerConfig(
566
+ name=s.name,
567
+ command=s.command,
568
+ args=s.args,
569
+ env=s.env,
570
+ url=s.url,
571
+ headers=s.headers,
572
+ transport=s.transport, # type: ignore
573
+ )
574
+ for s in scenario.mcp_servers
575
+ ]
576
+ mcp_tools = await mcp_manager.connect_all(configs)
577
+ for name, tool in mcp_tools.items():
578
+ if name not in tools:
579
+ tools[name] = tool
580
+ except ImportError:
581
+ logger.warning("MCP client not available")
582
+
583
+ try:
584
+ # Enhance system prompt when tools are available
585
+ system_prompt = scenario.system_prompt or ""
586
+
587
+ # Apply style if specified
588
+ if scenario.style:
589
+ style_instruction = _get_style_prompt(scenario.style)
590
+ system_prompt = f"{system_prompt}\n\n{style_instruction}".strip()
591
+
592
+ if tools:
593
+ tool_instruction = (
594
+ "\n\n---\n"
595
+ "IMPORTANT: You have tools available to take actions. "
596
+ "Do NOT just describe what commands to run or what steps to take. "
597
+ "Instead, USE YOUR TOOLS to actually execute actions, gather information, "
598
+ "and accomplish tasks directly. Act autonomously rather than giving instructions."
599
+ )
600
+ if tool_instruction not in system_prompt:
601
+ system_prompt = system_prompt + tool_instruction
602
+
603
+ # Create agent (use model ID directly)
604
+ agent = create_agent_from_model(model, system_prompt=system_prompt)
605
+
606
+ # Reset usage tracking if agent supports it
607
+ if hasattr(agent, "reset_usage"):
608
+ agent.reset_usage()
609
+
610
+ # Initialize state
611
+ env_state = dict(scenario.initial_state)
612
+ history: list[Message] = []
613
+ tool_call_log: list[ToolCallRecord] = []
614
+
615
+ # Add system prompt (use enhanced version with tool instructions)
616
+ if system_prompt:
617
+ history.append(Message(role="system", content=system_prompt))
618
+
619
+ # Execute steps
620
+ for step in scenario.steps:
621
+ if step.action == "inject_user":
622
+ content = step.params.get("content", "")
623
+ history.append(Message(role="user", content=content))
624
+
625
+ elif step.action == "await_agent":
626
+ # Get agent response with tool loop
627
+ tool_calls_made = 0
628
+ max_tool_calls = 10
629
+
630
+ while tool_calls_made < max_tool_calls:
631
+ # Build tool schemas
632
+ tool_schemas = [
633
+ {
634
+ "name": name,
635
+ "description": tool.description,
636
+ "actions": tool.get_actions(),
637
+ }
638
+ for name, tool in tools.items()
639
+ ]
640
+
641
+ # Convert to CoreMessage for agent
642
+ history_for_agent = [
643
+ CoreMessage(
644
+ role=m.role, # type: ignore
645
+ content=m.content,
646
+ tool_name=m.tool_name,
647
+ tool_call_id=m.tool_call_id,
648
+ tool_calls=[
649
+ ToolCall(
650
+ id=tc["id"],
651
+ name=tc["name"],
652
+ arguments=tc["arguments"],
653
+ )
654
+ for tc in m.tool_calls
655
+ ]
656
+ if m.tool_calls
657
+ else None,
658
+ )
659
+ for m in history
660
+ ]
661
+
662
+ # Get agent action
663
+ action: AgentAction = agent.step(
664
+ history_for_agent, tool_schemas if tools else None
665
+ )
666
+
667
+ if action.type == "message":
668
+ history.append(Message(role="assistant", content=action.content or ""))
669
+ break
670
+
671
+ if action.type == "tool_call":
672
+ tool_name = action.tool_name or ""
673
+ tool_action = action.tool_action or "call"
674
+ tool_args = action.tool_args or {}
675
+
676
+ tool_call_id = generate_tool_call_id()
677
+ function_name = f"{tool_name}__{tool_action}"
678
+
679
+ # Add assistant message with tool call
680
+ history.append(
681
+ Message(
682
+ role="assistant",
683
+ content="",
684
+ tool_calls=[
685
+ {
686
+ "id": tool_call_id,
687
+ "name": function_name,
688
+ "arguments": json.dumps(tool_args),
689
+ }
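An illustrative sketch (not part of the module) of how the spec's structure selects behavior: the first spec is single-turn with a rule-based judge, the second is multi-turn with an inline tool and a goal.

single = UnifiedScenarioSpec(
    id="greet",
    prompt="Say hello to the user.",
    evaluation=EvaluationSpec(judge=JudgeSpec(type="contains", pattern="hello")),
)
assert not single.has_steps() and single.has_judge()

multi = UnifiedScenarioSpec(  # field values are invented for illustration
    id="refund-flow",
    system_prompt="You are a support agent.",
    steps=[
        StepSpec(id="s1", action="inject_user", params={"content": "I want a refund."}),
        StepSpec(id="s2", action="await_agent"),
    ],
    tools={"crm": {"description": "crm tools", "actions": {}}},
    evaluation=EvaluationSpec(
        goals=[GoalSpec(id="used_crm", points=10, detection={"type": "tool_called", "tool": "crm"})]
    ),
)
assert multi.has_steps() and multi.has_tools() and multi.has_goals()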
690
+ ],
691
+ )
692
+ )
693
+
694
+ # Execute tool
695
+ if tool_name in tools:
696
+ tool = tools[tool_name]
697
+ if hasattr(tool, "invoke_async"):
698
+ result = await tool.invoke_async(
699
+ tool_action, tool_args, env_state
700
+ )
701
+ else:
702
+ result = tool.invoke(tool_action, tool_args, env_state)
703
+
704
+ tool_call_log.append(
705
+ ToolCallRecord(
706
+ tool=tool_name,
707
+ action=tool_action,
708
+ args=tool_args,
709
+ result=result.data,
710
+ success=result.success,
711
+ error=result.error,
712
+ )
713
+ )
714
+
715
+ result_content = (
716
+ result.data if result.success else (result.error or "")
717
+ )
718
+ if not isinstance(result_content, str):
719
+ result_content = json.dumps(result_content)
720
+
721
+ history.append(
722
+ Message(
723
+ role="tool",
724
+ content=result_content,
725
+ tool_name=tool_name,
726
+ tool_call_id=tool_call_id,
727
+ )
728
+ )
729
+ else:
730
+ error_msg = f"Tool not found: {tool_name}"
731
+ tool_call_log.append(
732
+ ToolCallRecord(
733
+ tool=tool_name,
734
+ action=tool_action,
735
+ args=tool_args,
736
+ success=False,
737
+ error=error_msg,
738
+ )
739
+ )
740
+ history.append(
741
+ Message(
742
+ role="tool",
743
+ content=error_msg,
744
+ tool_name=tool_name,
745
+ tool_call_id=tool_call_id,
746
+ )
747
+ )
748
+
749
+ tool_calls_made += 1
750
+
751
+ elif action.type == "stop":
752
+ break
753
+
754
+ elif step.action == "await_user":
755
+ # Skip in batch mode
756
+ logger.debug("Skipping await_user step (batch mode)")
757
+
758
+ # Get final response text
759
+ response_text = ""
760
+ for msg in reversed(history):
761
+ if msg.role == "assistant" and msg.content:
762
+ response_text = msg.content
763
+ break
764
+
765
+ # Get token usage from agent
766
+ input_tokens = 0
767
+ output_tokens = 0
768
+ cost_usd = None
769
+ if hasattr(agent, "get_usage"):
770
+ usage = agent.get_usage()
771
+ input_tokens = usage.get("input_tokens", 0)
772
+ output_tokens = usage.get("output_tokens", 0)
773
+ # Calculate cost from token counts
774
+ cost_usd = self._calculate_cost(model, input_tokens, output_tokens)
775
+
776
+ return RunResult(
777
+ id="",
778
+ scenario_id=scenario.id,
779
+ model=model,
780
+ response=response_text,
781
+ history=history,
782
+ tool_calls=tool_call_log,
783
+ final_state=env_state,
784
+ input_tokens=input_tokens,
785
+ output_tokens=output_tokens,
786
+ cost_usd=cost_usd,
787
+ )
788
+
789
+ finally:
790
+ if mcp_manager:
791
+ await mcp_manager.disconnect_all()
792
+
793
+ async def _evaluate(
794
+ self,
795
+ result: RunResult,
796
+ scenario: UnifiedScenarioSpec,
797
+ expected_outcome: str | None = None,
798
+ ) -> EvaluationResult:
799
+ """Evaluate a run result.
800
+
801
+ Args:
802
+ result: The run result to evaluate
803
+ scenario: The scenario specification
804
+ expected_outcome: Optional expected outcome goal ID for dataset benchmarking.
805
+ When set, only evaluates this outcome goal (others skipped).
806
+
807
+ """
808
+ eval_result = EvaluationResult()
809
+
810
+ # Goal-based evaluation
811
+ if scenario.has_goals():
812
+ # Separate process goals from outcome goals
813
+ process_goals = [g for g in scenario.evaluation.goals if not g.outcome]
814
+ outcome_goals = [g for g in scenario.evaluation.goals if g.outcome]
815
+
816
+ # Always evaluate process goals
817
+ for goal in process_goals:
818
+ goal_result = self._evaluate_goal(goal, result)
819
+ eval_result.goals.append(goal_result)
820
+
821
+ # Handle outcome goals based on expected_outcome
822
+ if expected_outcome:
823
+ # Only evaluate the expected outcome goal
824
+ for goal in outcome_goals:
825
+ if goal.id == expected_outcome:
826
+ goal_result = self._evaluate_goal(goal, result)
827
+ eval_result.goals.append(goal_result)
828
+ break
829
+ else:
830
+ # No expected outcome - evaluate all outcome goals (backward compatible)
831
+ for goal in outcome_goals:
832
+ goal_result = self._evaluate_goal(goal, result)
833
+ eval_result.goals.append(goal_result)
834
+
835
+ # Calculate goal score
836
+ eval_result.total_score = sum(g.points for g in eval_result.goals if g.achieved)
837
+
838
+ # Max score = process goals + only the expected outcome goal (if specified)
839
+ if expected_outcome:
840
+ eval_result.max_score = sum(g.points for g in process_goals)
841
+ for g in outcome_goals:
842
+ if g.id == expected_outcome:
843
+ eval_result.max_score += g.points
844
+ break
845
+ else:
846
+ eval_result.max_score = sum(g.points for g in scenario.evaluation.goals)
847
+
848
+ # Judge-based evaluation
849
+ if scenario.has_judge():
850
+ judge = scenario.evaluation.judge
851
+ assert judge is not None
852
+
853
+ if judge.type == "llm":
854
+ judge_result = await self._judge_with_llm(judge, result, scenario)
855
+ elif judge.type == "contains":
856
+ judge_result = self._judge_with_contains(judge, result)
857
+ elif judge.type == "regex":
858
+ judge_result = self._judge_with_regex(judge, result)
859
+ elif judge.type == "exact":
860
+ judge_result = self._judge_with_exact(judge, result)
861
+ elif judge.type == "length":
862
+ judge_result = self._judge_with_length(judge, result)
863
+ else:
864
+ judge_result = JudgeResult(
865
+ score=0.5,
866
+ passed=True,
867
+ reasoning=f"Unknown judge type: {judge.type}",
868
+ judge_type=judge.type,
869
+ )
870
+
871
+ eval_result.judge = judge_result
872
+
873
+ # If no goals, use judge score
874
+ if not scenario.has_goals():
875
+ # Scale judge score (0-1) to max_score or 100
876
+ max_score = scenario.evaluation.max_score or 100
877
+ eval_result.total_score = judge_result.score * max_score
878
+ eval_result.max_score = max_score
879
+
880
+ # Calculate percentage
881
+ if eval_result.max_score > 0:
882
+ eval_result.percentage = (eval_result.total_score / eval_result.max_score) * 100
883
+
884
+ return eval_result
885
+
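A worked example of the scoring above. Assume two process goals worth 10 points each (one achieved), an expected outcome goal worth 20 points that is achieved, and a second outcome goal that is skipped:

#   total_score = 10 (process) + 20 (expected outcome)       = 30
#   max_score   = 10 + 10 (process) + 20 (expected outcome)  = 40
#   percentage  = 30 / 40 * 100                              = 75.0
# With no goals and only a judge scoring 0.8 against the default max_score of 100:
#   total_score = 0.8 * 100 = 80.0, so percentage = 80.0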
886
+ def _evaluate_goal(self, goal: GoalSpec, result: RunResult) -> GoalResult:
887
+ """Evaluate a single goal."""
888
+ detection = goal.detection
889
+ detection_type = detection.get("type", "")
890
+ achieved = False
891
+ reason = ""
892
+
893
+ if detection_type == "env_state":
894
+ key = detection.get("key", "")
895
+ expected = detection.get("value")
896
+ actual = result.final_state.get(key)
897
+ achieved = actual == expected
898
+ reason = f"State[{key}] = {actual} (expected {expected})"
899
+
900
+ elif detection_type == "tool_called":
901
+ tool = detection.get("tool", "")
902
+ action = detection.get("action")
903
+ for tc in result.tool_calls:
904
+ if tc.tool == tool and (action is None or tc.action == action):
905
+ achieved = True
906
+ reason = f"Tool {tool} was called"
907
+ break
908
+ if not achieved:
909
+ reason = f"Tool {tool} was not called"
910
+
911
+ elif detection_type == "any_tool_called":
912
+ # Supports both "tool_name" and "tool_name.action" formats
913
+ tools = detection.get("tools", [])
914
+ for tc in result.tool_calls:
915
+ call_combined = f"{tc.tool}.{tc.action}"
916
+ # Match against tool name, action, or combined format
917
+ if tc.tool in tools or tc.action in tools or call_combined in tools:
918
+ achieved = True
919
+ reason = f"Tool {call_combined} was called"
920
+ break
921
+ if not achieved:
922
+ reason = f"None of {tools} were called"
923
+
924
+ elif detection_type == "agent_contains":
925
+ patterns = detection.get("patterns", [])
926
+ agent_text = " ".join(
927
+ m.content for m in result.history if m.role == "assistant" and m.content
928
+ ).lower()
929
+
930
+ for pattern in patterns:
931
+ # Convert to string in case YAML parsed as int (e.g., 10000 without quotes)
932
+ pattern_str = str(pattern).lower()
933
+ if pattern_str in agent_text:
934
+ achieved = True
935
+ reason = f"Agent response contains '{pattern}'"
936
+ break
937
+ if not achieved:
938
+ reason = f"Agent response does not contain any of {patterns}"
939
+
940
+ elif detection_type == "tool_sequence":
941
+ # Check if certain tools were called before others
942
+ required_before = detection.get("required_before", {})
943
+ tool_order = [tc.tool for tc in result.tool_calls]
944
+
945
+ all_satisfied = True
946
+ for tool, prereqs in required_before.items():
947
+ tool_idx = None
948
+ for i, t in enumerate(tool_order):
949
+ if t == tool:
950
+ tool_idx = i
951
+ break
952
+
953
+ if tool_idx is not None:
954
+ for prereq in prereqs:
955
+ prereq_idx = None
956
+ for i, t in enumerate(tool_order):
957
+ if t == prereq:
958
+ prereq_idx = i
959
+ break
960
+ if prereq_idx is None or prereq_idx > tool_idx:
961
+ all_satisfied = False
962
+ reason = f"{prereq} should be called before {tool}"
963
+ break
964
+
965
+ achieved = all_satisfied
966
+ if achieved:
967
+ reason = "Tool sequence requirements met"
968
+
969
+ return GoalResult(
970
+ id=goal.id,
971
+ name=goal.name or goal.id,
972
+ achieved=achieved,
973
+ points=goal.points if achieved else 0,
974
+ reason=reason,
975
+ )
976
+
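For reference, illustrative detection specs for the handlers above (keys and values are hypothetical). Note that the "count" type produced by the alternate goal format has no branch here, so such goals evaluate as not achieved.

detections = [
    {"type": "env_state", "key": "refund_issued", "value": True},        # final_state["refund_issued"] == True
    {"type": "tool_called", "tool": "crm", "action": "update_ticket"},   # that tool/action pair was invoked
    {"type": "any_tool_called", "tools": ["crm", "billing.refund"]},     # any listed tool, action, or "tool.action"
    {"type": "agent_contains", "patterns": ["refund", "10000"]},         # assistant text mentions any pattern (case-insensitive)
    {"type": "tool_sequence", "required_before": {"billing": ["crm"]}},  # crm must be called before billing
]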
977
+ async def _judge_with_llm(
978
+ self,
979
+ judge: JudgeSpec,
980
+ result: RunResult,
981
+ scenario: UnifiedScenarioSpec,
982
+ ) -> JudgeResult:
983
+ """Judge using LLM-as-a-judge."""
984
+ model = judge.model or "gpt-4o-mini"
985
+ rubric = judge.rubric or "Score the response from 0.0 to 1.0 based on quality."
986
+
987
+ # Build judge prompt
988
+ prompt_text = scenario.prompt or ""
989
+ if not prompt_text and result.history:
990
+ # Get first user message
991
+ for msg in result.history:
992
+ if msg.role == "user":
993
+ prompt_text = msg.content
994
+ break
995
+
996
+ # Build tool calls summary for judge
997
+ tool_calls_summary = ""
998
+ if result.tool_calls:
999
+ tool_lines = []
1000
+ for tc in result.tool_calls:
1001
+ status = "SUCCESS" if tc.success else "FAILED"
1002
+ tool_lines.append(f" - {tc.tool}.{tc.action}({tc.args}) -> {status}")
1003
+ tool_calls_summary = "\n".join(tool_lines)
1004
+
1005
+ judge_prompt = f"""You are evaluating an AI model's response.
1006
+
1007
+ ORIGINAL PROMPT:
1008
+ {prompt_text}
1009
+
1010
+ TOOLS CALLED BY MODEL:
1011
+ {tool_calls_summary if tool_calls_summary else "(No tools were called)"}
1012
+
1013
+ MODEL FINAL RESPONSE:
1014
+ {result.response}
1015
+
1016
+ EVALUATION RUBRIC:
1017
+ {rubric}
1018
+
1019
+ Evaluate the response and provide your assessment in this exact JSON format:
1020
+ {{"score": <0.0-1.0>, "passed": <true/false>, "reasoning": "<brief explanation>"}}
1021
+
1022
+ Respond with ONLY the JSON, no other text."""
1023
+
1024
+ try:
1025
+ provider = self.registry.get_provider_for_model(model)
1026
+ # Shield from external cancellation (e.g., MCP's anyio cancel scopes)
1027
+ response = await asyncio.shield(
1028
+ provider.complete(
1029
+ model=model,
1030
+ messages=[{"role": "user", "content": judge_prompt}],
1031
+ temperature=0.1,
1032
+ max_tokens=500,
1033
+ )
1034
+ )
1035
+
1036
+ # Parse JSON
1037
+ content = response.content.strip()
1038
+ if content.startswith("```"):
1039
+ content = content.split("```")[1]
1040
+ if content.startswith("json"):
1041
+ content = content[4:]
1042
+ content = content.strip()
1043
+
1044
+ data = json.loads(content)
1045
+ return JudgeResult(
1046
+ score=float(data.get("score", 0.5)),
1047
+ passed=bool(data.get("passed", True)),
1048
+ reasoning=str(data.get("reasoning", "No reasoning")),
1049
+ judge_type="llm",
1050
+ )
1051
+
1052
+ except asyncio.CancelledError as e:
1053
+ # MCP's anyio cancel scopes can cancel our HTTP calls
1054
+ logger.warning(f"LLM judge cancelled (likely MCP cleanup issue): {e}")
1055
+ return JudgeResult(
1056
+ score=0.5,
1057
+ passed=True,
1058
+ reasoning="Judge skipped: async operation was cancelled",
1059
+ judge_type="llm",
1060
+ )
1061
+ except Exception as e:
1062
+ logger.error(f"LLM judge error: {e}")
1063
+ return JudgeResult(
1064
+ score=0.5,
1065
+ passed=True,
1066
+ reasoning=f"Judge error: {e}",
1067
+ judge_type="llm",
1068
+ )
1069
+
1070
+ def _judge_with_contains(self, judge: JudgeSpec, result: RunResult) -> JudgeResult:
1071
+ """Judge by checking if response contains pattern."""
1072
+ pattern = judge.pattern or ""
1073
+ response = result.response
1074
+
1075
+ if not judge.case_sensitive:
1076
+ response = response.lower()
1077
+ pattern = pattern.lower()
1078
+
1079
+ found = pattern in response
1080
+ return JudgeResult(
1081
+ score=1.0 if found else 0.0,
1082
+ passed=found,
1083
+ reasoning=f"Contains '{judge.pattern}': {found}",
1084
+ judge_type="contains",
1085
+ )
1086
+
1087
+ def _judge_with_regex(self, judge: JudgeSpec, result: RunResult) -> JudgeResult:
1088
+ """Judge by regex pattern match."""
1089
+ import re
1090
+
1091
+ pattern = judge.pattern or ".*"
1092
+ flags = 0 if judge.case_sensitive else re.IGNORECASE
1093
+
1094
+ try:
1095
+ match = re.search(pattern, result.response, flags)
1096
+ found = match is not None
1097
+ return JudgeResult(
1098
+ score=1.0 if found else 0.0,
1099
+ passed=found,
1100
+ reasoning=f"Regex match: {found}",
1101
+ judge_type="regex",
1102
+ )
1103
+ except re.error as e:
1104
+ return JudgeResult(
1105
+ score=0.0,
1106
+ passed=False,
1107
+ reasoning=f"Invalid regex: {e}",
1108
+ judge_type="regex",
1109
+ )
1110
+
1111
+ def _judge_with_exact(self, judge: JudgeSpec, result: RunResult) -> JudgeResult:
1112
+ """Judge by exact match."""
1113
+ expected = judge.pattern or ""
1114
+ response = result.response.strip()
1115
+
1116
+ if not judge.case_sensitive:
1117
+ expected = expected.lower()
1118
+ response = response.lower()
1119
+
1120
+ match = response == expected
1121
+ return JudgeResult(
1122
+ score=1.0 if match else 0.0,
1123
+ passed=match,
1124
+ reasoning=f"Exact match: {match}",
1125
+ judge_type="exact",
1126
+ )
1127
+
1128
+ def _judge_with_length(self, judge: JudgeSpec, result: RunResult) -> JudgeResult:
1129
+ """Judge by response length."""
1130
+ length = len(result.response)
1131
+ reasons = []
1132
+
1133
+ passes_min = judge.min_length is None or length >= judge.min_length
1134
+ passes_max = judge.max_length is None or length <= judge.max_length
1135
+
1136
+ if not passes_min:
1137
+ reasons.append(f"too short ({length} < {judge.min_length})")
1138
+ if not passes_max:
1139
+ reasons.append(f"too long ({length} > {judge.max_length})")
1140
+
1141
+ passed = passes_min and passes_max
1142
+ reasoning = f"Length: {length} chars"
1143
+ if reasons:
1144
+ reasoning += f" - {', '.join(reasons)}"
1145
+
1146
+ return JudgeResult(
1147
+ score=1.0 if passed else 0.0,
1148
+ passed=passed,
1149
+ reasoning=reasoning,
1150
+ judge_type="length",
1151
+ )
1152
+
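Illustrative behavior of the rule-based judges above on a made-up response, calling the private helpers directly for demonstration:

runner = UnifiedRunner()
sample = RunResult(id="r1", scenario_id="demo", model="example-model", response="Total due: 42 EUR")

runner._judge_with_contains(JudgeSpec(type="contains", pattern="42 eur"), sample).passed        # True (case-insensitive)
runner._judge_with_regex(JudgeSpec(type="regex", pattern=r"\d+ EUR"), sample).passed            # True
runner._judge_with_exact(JudgeSpec(type="exact", pattern="total due: 42 eur"), sample).passed   # True after strip/lower
runner._judge_with_length(JudgeSpec(type="length", min_length=5, max_length=40), sample).passed # True (17 chars)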
1153
+ def _apply_variables(
1154
+ self,
1155
+ scenario: UnifiedScenarioSpec,
1156
+ variables: dict[str, Any],
1157
+ ) -> UnifiedScenarioSpec:
1158
+ """Apply variable substitutions to scenario."""
1159
+ import re
1160
+
1161
+ def interpolate(text: str) -> str:
1162
+ if not isinstance(text, str):
1163
+ return text
1164
+
1165
+ def replace(match: re.Match[str]) -> str:
1166
+ key = match.group(1)
1167
+ if key in variables:
1168
+ return str(variables[key])
1169
+ return match.group(0)
1170
+
1171
+ return re.sub(r"\{(\w+)\}", replace, text)
1172
+
1173
+ def interpolate_value(value: Any) -> Any:
1174
+ if isinstance(value, str):
1175
+ return interpolate(value)
1176
+ if isinstance(value, dict):
1177
+ return {k: interpolate_value(v) for k, v in value.items()}
1178
+ if isinstance(value, list):
1179
+ return [interpolate_value(item) for item in value]
1180
+ return value
1181
+
1182
+ # Create new spec with interpolated values
1183
+ new_steps = [
1184
+ StepSpec(
1185
+ id=s.id,
1186
+ action=s.action,
1187
+ params=interpolate_value(dict(s.params)),
1188
+ )
1189
+ for s in scenario.steps
1190
+ ]
1191
+
1192
+ return UnifiedScenarioSpec(
1193
+ id=scenario.id,
1194
+ name=scenario.name,
1195
+ description=interpolate(scenario.description),
1196
+ category=scenario.category,
1197
+ tags=scenario.tags,
1198
+ variables=scenario.variables,
1199
+ system_prompt=interpolate(scenario.system_prompt),
1200
+ prompt=interpolate(scenario.prompt) if scenario.prompt else None,
1201
+ steps=new_steps,
1202
+ tools=scenario.tools,
1203
+ tools_from=scenario.tools_from,
1204
+ mcp_servers=scenario.mcp_servers,
1205
+ initial_state=interpolate_value(dict(scenario.initial_state)),
1206
+ evaluation=scenario.evaluation,
1207
+ style=scenario.style,
1208
+ events=scenario.events,
1209
+ )
1210
+
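A small illustration of the {placeholder} interpolation performed above (spec fields and variable names are hypothetical):

spec = UnifiedScenarioSpec(
    id="packing",
    prompt="What should I pack for {days} days in {city}?",
    initial_state={"destination": "{city}"},
)
resolved = UnifiedRunner()._apply_variables(spec, {"city": "Oslo", "days": 3})
resolved.prompt         # "What should I pack for 3 days in Oslo?"
resolved.initial_state  # {"destination": "Oslo"}
# Placeholders with no matching variable are left untouched.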
1211
+
1212
+ # =============================================================================
1213
+ # Loader Functions
1214
+ # =============================================================================
1215
+
1216
+
1217
+ def load_unified_scenario(path: Path) -> UnifiedScenarioSpec:
1218
+ """Load a unified scenario from a YAML file.
1219
+
1220
+ Args:
1221
+ path: Path to the YAML file
1222
+
1223
+ Returns:
1224
+ UnifiedScenarioSpec
1225
+
1226
+ Raises:
1227
+ ValueError: If file cannot be loaded or parsed
1228
+
1229
+ """
1230
+ try:
1231
+ raw = yaml.safe_load(path.read_text())
1232
+ except yaml.YAMLError as e:
1233
+ raise ValueError(f"Invalid YAML: {e}") from e
1234
+ except FileNotFoundError as e:
1235
+ raise ValueError(f"File not found: {path}") from e
1236
+
1237
+ if not isinstance(raw, dict):
1238
+ raise ValueError("Scenario must be a YAML mapping")
1239
+
1240
+ return parse_unified_scenario(raw)
1241
+
1242
+
1243
+ def parse_unified_scenario(raw: dict[str, Any]) -> UnifiedScenarioSpec:
1244
+ """Parse raw YAML into UnifiedScenarioSpec.
1245
+
1246
+ Supports multiple scenario formats:
1247
+ - Standard: system_prompt, tools, evaluation.goals
1248
+ - Alternate: agent.system_prompt, environment.tools, evaluation list with kind:
1249
+ """
1250
+ # Parse tools_from
1251
+ tools_from = raw.get("tools_from", [])
1252
+ if isinstance(tools_from, str):
1253
+ tools_from = [tools_from]
1254
+
1255
+ # Support alternate system_prompt location (agent.system_prompt)
1256
+ system_prompt = raw.get("system_prompt", "")
1257
+ if not system_prompt:
1258
+ agent_config = raw.get("agent", {})
1259
+ if isinstance(agent_config, dict):
1260
+ system_prompt = agent_config.get("system_prompt", "")
1261
+
1262
+ # Support alternate tools location (environment.tools)
1263
+ tools = raw.get("tools", {})
1264
+ if not tools:
1265
+ env_config = raw.get("environment", {})
1266
+ if isinstance(env_config, dict):
1267
+ env_tools = env_config.get("tools", [])
1268
+ # Convert environment.tools format to standard format
1269
+ if isinstance(env_tools, list):
1270
+ for tool_def in env_tools:
1271
+ if isinstance(tool_def, dict):
1272
+ tool_name = tool_def.get("name", "")
1273
+ tool_config = tool_def.get("config", {})
1274
+ tool_actions = tool_config.get("tools", {})
1275
+ if tool_name and tool_actions:
1276
+ # Convert to standard inline tools format
1277
+ tools[tool_name] = {
1278
+ "description": f"{tool_name} tools",
1279
+ "actions": tool_actions,
1280
+ }
1281
+
1282
+ # Support alternate initial_state location (environment.initial_state)
1283
+ initial_state = raw.get("initial_state", {})
1284
+ if not initial_state:
1285
+ env_config = raw.get("environment", {})
1286
+ if isinstance(env_config, dict):
1287
+ initial_state = env_config.get("initial_state", {})
1288
+
1289
+ # Parse MCP servers
1290
+ mcp_servers = []
1291
+ for server in raw.get("mcp_servers", []):
1292
+ if isinstance(server, dict):
1293
+ mcp_servers.append(
1294
+ McpServerSpec(
1295
+ name=server.get("name", "unnamed"),
1296
+ command=server.get("command"),
1297
+ args=server.get("args", []),
1298
+ env=server.get("env", {}),
1299
+ url=server.get("url"),
1300
+ headers=server.get("headers", {}),
1301
+ transport=server.get("transport", "auto"),
1302
+ )
1303
+ )
1304
+
1305
+ # Parse steps
1306
+ steps = []
1307
+ for s in raw.get("steps", []):
1308
+ steps.append(
1309
+ StepSpec(
1310
+ id=s.get("id", f"step_{len(steps)}"),
1311
+ action=s.get("action", "await_agent"),
1312
+ params=s.get("params", {}),
1313
+ )
1314
+ )
1315
+
1316
+ # Parse variables
1317
+ variables = []
1318
+ for v in raw.get("variables", []):
1319
+ variables.append(
1320
+ VariableSpec(
1321
+ name=v.get("name", ""),
1322
+ label=v.get("label", ""),
1323
+ type=v.get("type", "string"),
1324
+ default=v.get("default"),
1325
+ options=v.get("options", []),
1326
+ min=v.get("min"),
1327
+ max=v.get("max"),
1328
+ step=v.get("step"),
1329
+ )
1330
+ )
1331
+
1332
+ # Parse evaluation
1333
+ evaluation_raw = raw.get("evaluation", {})
1334
+ # Handle case where evaluation is a list (of goals) instead of a dict
1335
+ if isinstance(evaluation_raw, list):
1336
+ goals_raw = evaluation_raw
1337
+ evaluation_raw = {}
1338
+ else:
1339
+ goals_raw = evaluation_raw.get("goals", []) or raw.get("goals", [])
1340
+ goals = []
1341
+ for g in goals_raw:
1342
+ # Support alternate format with 'kind:' instead of 'detection.type:'
1343
+ detection = g.get("detection", {})
1344
+ if not detection and "kind" in g:
1345
+ # Convert alternate format: kind, tool, action, key, value, target
1346
+ kind = g.get("kind", "")
1347
+ if kind == "tool_called":
1348
+ detection = {
1349
+ "type": "tool_called",
1350
+ "tool": g.get("tool"),
1351
+ "action": g.get("action"),
1352
+ }
1353
+ elif kind == "env_state":
1354
+ detection = {
1355
+ "type": "env_state",
1356
+ "key": g.get("key"),
1357
+ "value": g.get("value"),
1358
+ }
1359
+ elif kind == "contains":
1360
+ detection = {
1361
+ "type": "agent_contains",
1362
+ "patterns": [g.get("value", "")] if g.get("value") else [],
1363
+ }
1364
+ elif kind == "count":
1365
+ detection = {
1366
+ "type": "count",
1367
+ "target": g.get("target"),
1368
+ "max": g.get("max"),
1369
+ "min": g.get("min"),
1370
+ }
1371
+ else:
1372
+ detection = {"type": kind}
1373
+
1374
+ goals.append(
1375
+ GoalSpec(
1376
+ id=g.get("id", f"goal_{len(goals)}"),
1377
+ name=g.get("name", ""),
1378
+ description=g.get("description", ""),
1379
+ points=g.get("points", 10), # Default 10 points if not specified
1380
+ detection=detection,
1381
+ outcome=g.get("outcome", False), # Mutually exclusive outcome goal
1382
+ )
1383
+ )
1384
+
1385
+ judge = None
1386
+ judge_raw = evaluation_raw.get("judge")
1387
+ if judge_raw:
1388
+ judge = JudgeSpec(
1389
+ type=judge_raw.get("type", "llm"),
1390
+ model=judge_raw.get("model"),
1391
+ rubric=judge_raw.get("rubric", ""),
1392
+ pattern=judge_raw.get("pattern"),
1393
+ case_sensitive=judge_raw.get("case_sensitive", False),
1394
+ min_length=judge_raw.get("min_length"),
1395
+ max_length=judge_raw.get("max_length"),
1396
+ voters=judge_raw.get("voters", []),
1397
+ helper=judge_raw.get("helper"),
1398
+ pass_threshold=judge_raw.get("pass_threshold", 0.5),
1399
+ )
1400
+
1401
+ evaluation = EvaluationSpec(
1402
+ goals=goals,
1403
+ judge=judge,
1404
+ max_score=evaluation_raw.get("max_score"),
1405
+ formula=evaluation_raw.get("formula"),
1406
+ )
1407
+
1408
+ # Get metadata from raw or alternate location
1409
+ metadata = raw.get("metadata", {})
1410
+ category = raw.get("category", "")
1411
+ tags = raw.get("tags", [])
1412
+ if not category and isinstance(metadata, dict):
1413
+ category = metadata.get("category", "")
1414
+ if not tags and isinstance(metadata, dict):
1415
+ tags = metadata.get("tags", [])
1416
+
1417
+ return UnifiedScenarioSpec(
1418
+ id=raw.get("id", "unnamed"),
1419
+ name=raw.get("name", raw.get("id", "Unnamed Scenario")),
1420
+ description=raw.get("description", ""),
1421
+ category=category,
1422
+ tags=tags,
1423
+ variables=variables,
1424
+ system_prompt=system_prompt, # Use parsed value (supports agent.system_prompt)
1425
+ prompt=raw.get("prompt"),
1426
+ steps=steps,
1427
+ tools=tools, # Use parsed value (supports environment.tools)
1428
+ tools_from=tools_from,
1429
+ mcp_servers=mcp_servers,
1430
+ initial_state=initial_state, # Use parsed value (supports environment.initial_state)
1431
+ evaluation=evaluation,
1432
+ style=raw.get("style"),
1433
+ events=raw.get("events", {}),
1434
+ )
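A sketch of the alternate format this parser accepts (all field values invented for illustration): the system prompt under agent:, tools and initial state under environment:, and goals as a top-level evaluation list using kind:.

raw = {
    "id": "support-refund",
    "agent": {"system_prompt": "You are a support agent."},
    "environment": {
        "initial_state": {"ticket_open": True},
        "tools": [
            {"name": "crm", "config": {"tools": {"close_ticket": {"description": "Close the ticket"}}}},
        ],
    },
    "evaluation": [
        {"id": "closed", "kind": "env_state", "key": "ticket_open", "value": False, "points": 10},
        {"id": "polite", "kind": "contains", "value": "thank you", "points": 5},
    ],
}
spec = parse_unified_scenario(raw)
spec.system_prompt                                     # "You are a support agent."
"crm" in spec.tools                                    # True -- converted to the inline tools format
[g.detection["type"] for g in spec.evaluation.goals]   # ["env_state", "agent_contains"]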