sandboxy 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. sandboxy/__init__.py +3 -0
  2. sandboxy/agents/__init__.py +21 -0
  3. sandboxy/agents/base.py +66 -0
  4. sandboxy/agents/llm_prompt.py +308 -0
  5. sandboxy/agents/loader.py +222 -0
  6. sandboxy/api/__init__.py +5 -0
  7. sandboxy/api/app.py +76 -0
  8. sandboxy/api/routes/__init__.py +1 -0
  9. sandboxy/api/routes/agents.py +92 -0
  10. sandboxy/api/routes/local.py +1388 -0
  11. sandboxy/api/routes/tools.py +106 -0
  12. sandboxy/cli/__init__.py +1 -0
  13. sandboxy/cli/main.py +1196 -0
  14. sandboxy/cli/type_detector.py +48 -0
  15. sandboxy/config.py +49 -0
  16. sandboxy/core/__init__.py +1 -0
  17. sandboxy/core/async_runner.py +824 -0
  18. sandboxy/core/mdl_parser.py +441 -0
  19. sandboxy/core/runner.py +599 -0
  20. sandboxy/core/safe_eval.py +165 -0
  21. sandboxy/core/state.py +234 -0
  22. sandboxy/datasets/__init__.py +20 -0
  23. sandboxy/datasets/loader.py +193 -0
  24. sandboxy/datasets/runner.py +442 -0
  25. sandboxy/errors.py +166 -0
  26. sandboxy/local/context.py +235 -0
  27. sandboxy/local/results.py +173 -0
  28. sandboxy/logging.py +31 -0
  29. sandboxy/mcp/__init__.py +25 -0
  30. sandboxy/mcp/client.py +360 -0
  31. sandboxy/mcp/wrapper.py +99 -0
  32. sandboxy/providers/__init__.py +34 -0
  33. sandboxy/providers/anthropic_provider.py +271 -0
  34. sandboxy/providers/base.py +123 -0
  35. sandboxy/providers/http_client.py +101 -0
  36. sandboxy/providers/openai_provider.py +282 -0
  37. sandboxy/providers/openrouter.py +958 -0
  38. sandboxy/providers/registry.py +199 -0
  39. sandboxy/scenarios/__init__.py +11 -0
  40. sandboxy/scenarios/comparison.py +491 -0
  41. sandboxy/scenarios/loader.py +262 -0
  42. sandboxy/scenarios/runner.py +468 -0
  43. sandboxy/scenarios/unified.py +1434 -0
  44. sandboxy/session/__init__.py +21 -0
  45. sandboxy/session/manager.py +278 -0
  46. sandboxy/tools/__init__.py +34 -0
  47. sandboxy/tools/base.py +127 -0
  48. sandboxy/tools/loader.py +270 -0
  49. sandboxy/tools/yaml_tools.py +708 -0
  50. sandboxy/ui/__init__.py +27 -0
  51. sandboxy/ui/dist/assets/index-CgAkYWrJ.css +1 -0
  52. sandboxy/ui/dist/assets/index-D4zoGFcr.js +347 -0
  53. sandboxy/ui/dist/index.html +14 -0
  54. sandboxy/utils/__init__.py +3 -0
  55. sandboxy/utils/time.py +20 -0
  56. sandboxy-0.0.1.dist-info/METADATA +241 -0
  57. sandboxy-0.0.1.dist-info/RECORD +60 -0
  58. sandboxy-0.0.1.dist-info/WHEEL +4 -0
  59. sandboxy-0.0.1.dist-info/entry_points.txt +3 -0
  60. sandboxy-0.0.1.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,1388 @@
1
+ """API routes for local development mode."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ import yaml
10
+ from fastapi import APIRouter, HTTPException
11
+ from pydantic import BaseModel, Field
12
+
13
+ from sandboxy.local.context import get_local_context
14
+
15
+ router = APIRouter()
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class LocalFileInfo(BaseModel):
    """Summary record for one file found in the local workspace.

    Instances are built from the entries returned by the local context's
    ``discover()`` call (one per scenario, tool, or agent).
    """

    # Identifier the other /local endpoints use to look the file up.
    id: str
    # Display name.
    name: str
    # Free-text description.
    description: str
    # Kind of file; may be absent (None).
    type: str | None
    # Location of the file on disk.
    path: str
    # Location relative to some base directory -- presumably the workspace
    # root; confirm against the discovery code.
    relative_path: str
28
+
29
+
30
class LocalStatusResponse(BaseModel):
    """Payload returned by the ``/local/status`` endpoint."""

    # Always "local" unless a caller overrides it.
    mode: str = "local"
    # Root directory of the local context, as a string.
    root_dir: str
    # Discovered files, grouped by category.
    scenarios: list[LocalFileInfo]
    tools: list[LocalFileInfo]
    agents: list[LocalFileInfo]
38
+
39
+
40
class VariableInfo(BaseModel):
    """Description of a single variable a scenario accepts."""

    # Variable name as it appears in the scenario.
    name: str
    # Human-readable label.
    label: str = ""
    # One of: string, number, boolean, select
    type: str = "string"
    # Default value, if any.
    default: Any = None
    # Allowed choices; only meaningful for the "select" type.
    options: list[str] = []
    # Whether a value must be supplied.
    required: bool = True
49
+
50
+
51
class ScenarioDetail(BaseModel):
    """Full view of one scenario, including its parsed YAML body."""

    # Identity and summary fields, copied from the discovery entry.
    id: str
    name: str
    description: str
    type: str | None
    path: str
    # Parsed YAML document of the scenario file.
    content: dict[str, Any]
    # Variables that were explicitly defined or detected in the content.
    variables: list[VariableInfo] = []
61
+
62
+
63
@router.get("/local/status", response_model=LocalStatusResponse)
async def get_local_status() -> LocalStatusResponse:
    """Report the state of the local development environment.

    Returns:
        The workspace root directory plus every discovered scenario, tool
        and agent file.

    Raises:
        HTTPException: 500 when no local context is available.
    """
    local_ctx = get_local_context()
    if not local_ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    found = local_ctx.discover()

    def as_models(section: str) -> list[LocalFileInfo]:
        # Convert the raw discovery dicts of one category into response models.
        return [LocalFileInfo(**entry) for entry in found[section]]

    return LocalStatusResponse(
        root_dir=str(local_ctx.root_dir),
        scenarios=as_models("scenarios"),
        tools=as_models("tools"),
        agents=as_models("agents"),
    )
81
+
82
+
83
@router.get("/local/scenarios")
async def list_local_scenarios() -> list[LocalFileInfo]:
    """Return every scenario the local context discovers.

    Raises:
        HTTPException: 500 when no local context is available.
    """
    local_ctx = get_local_context()
    if not local_ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    scenario_entries = local_ctx.discover()["scenarios"]
    listing: list[LocalFileInfo] = []
    for entry in scenario_entries:
        listing.append(LocalFileInfo(**entry))
    return listing
92
+
93
+
94
def _extract_variables(content: dict[str, Any]) -> list[VariableInfo]:
    """Extract variables from scenario content.

    Variables come from two places:

    1. Entries explicitly declared in the top-level ``variables`` section.
    2. ``{var}`` placeholders found in string values anywhere in the content,
       except under keys named ``environment``, ``tools`` or ``config``.

    Explicit definitions win over detected ones. Detected-only variables are
    appended in sorted name order so the result is stable between calls.

    Args:
        content: Parsed scenario YAML.

    Returns:
        Explicitly defined variables (in declaration order) followed by
        detected ones (sorted by name).
    """
    import re

    # {{name}} is tool-parameter syntax and must not be seen as {name}.
    double_brace = re.compile(r"\{\{[^}]+\}\}")
    single_brace = re.compile(r"\{(\w+)\}")

    variables: dict[str, VariableInfo] = {}

    # 1. Explicitly defined variables. A bare `variables:` key parses to None,
    #    and hand-written files may contain non-mapping entries; neither should
    #    take the endpoint down.
    for var in content.get("variables") or []:
        if not isinstance(var, dict):
            continue
        name = var.get("name", "")
        if name:
            variables[name] = VariableInfo(
                name=name,
                label=var.get("label", name.replace("_", " ").title()),
                type=var.get("type", "string"),
                default=var.get("default"),
                options=var.get("options", []),
                required=var.get("required", True),
            )

    # 2. Detect {var} patterns in content (only in user-facing text, not tool
    #    definitions).
    def find_vars(obj: Any, found: set[str], skip_keys: set[str] | None = None) -> None:
        if skip_keys is None:
            skip_keys = set()
        if isinstance(obj, str):
            # Drop {{...}} first so its inner braces are not matched below.
            cleaned = double_brace.sub("", obj)
            for match in single_brace.findall(cleaned):
                # Names beginning with "state" are treated as state references.
                if not match.startswith("state"):
                    found.add(match)
        elif isinstance(obj, dict):
            for k, v in obj.items():
                # Skipped keys hold tool definitions with their own param syntax.
                if k in skip_keys:
                    continue
                find_vars(v, found, skip_keys)
        elif isinstance(obj, list):
            for item in obj:
                find_vars(item, found, skip_keys)

    detected: set[str] = set()
    skip_sections = {"environment", "tools", "config"}
    find_vars(content, detected, skip_sections)

    # Sorted: plain set iteration order for strings varies between processes.
    for name in sorted(detected):
        if name not in variables:
            variables[name] = VariableInfo(
                name=name,
                label=name.replace("_", " ").title(),
                type="string",
                default=None,
                options=[],
                required=True,
            )

    return list(variables.values())
166
+
167
+
168
@router.get("/local/scenarios/{scenario_id}")
async def get_local_scenario(scenario_id: str) -> ScenarioDetail:
    """Fetch one scenario, with its parsed YAML and its variables.

    Args:
        scenario_id: The scenario identifier.

    Returns:
        Scenario details including full YAML content and detected variables.

    Raises:
        HTTPException: 500 when not in local mode or the file cannot be
            read/parsed; 404 when no discovered scenario has this ID.
    """
    local_ctx = get_local_context()
    if not local_ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    scenario_entries = local_ctx.discover()["scenarios"]
    entry = next((item for item in scenario_entries if item["id"] == scenario_id), None)
    if entry is None:
        raise HTTPException(status_code=404, detail=f"Scenario not found: {scenario_id}")

    # Read and parse the file; any failure is reported as a 500.
    try:
        raw_text = Path(entry["path"]).read_text()
        parsed = yaml.safe_load(raw_text)
    except Exception as exc:
        raise HTTPException(
            status_code=500,
            detail=f"Error loading scenario: {exc}",
        ) from exc

    # An empty file parses to None; treat it as an empty document.
    detected_variables = _extract_variables(parsed or {})

    return ScenarioDetail(
        id=entry["id"],
        name=entry["name"],
        description=entry["description"],
        type=entry["type"],
        path=entry["path"],
        content=parsed or {},
        variables=detected_variables,
    )
210
+
211
+
212
@router.get("/local/tools")
async def list_local_tools() -> list[LocalFileInfo]:
    """Return every tool file the local context discovers.

    Raises:
        HTTPException: 500 when no local context is available.
    """
    local_ctx = get_local_context()
    if not local_ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    tool_entries = local_ctx.discover()["tools"]
    listing: list[LocalFileInfo] = []
    for entry in tool_entries:
        listing.append(LocalFileInfo(**entry))
    return listing
221
+
222
+
223
@router.get("/local/tools/{tool_id}")
async def get_local_tool(tool_id: str) -> dict[str, Any]:
    """Fetch one tool library together with its parsed YAML.

    Args:
        tool_id: The tool library identifier.

    Returns:
        A dict with ``id``, ``name``, ``description``, ``path`` and the parsed
        ``content`` (an empty dict for an empty file).

    Raises:
        HTTPException: 500 when not in local mode or the file cannot be
            read/parsed; 404 when no discovered tool has this ID.
    """
    local_ctx = get_local_context()
    if not local_ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    tool_entries = local_ctx.discover()["tools"]
    entry = next((item for item in tool_entries if item["id"] == tool_id), None)
    if entry is None:
        raise HTTPException(status_code=404, detail=f"Tool not found: {tool_id}")

    try:
        raw_text = Path(entry["path"]).read_text()
        parsed = yaml.safe_load(raw_text)
    except Exception as exc:
        raise HTTPException(
            status_code=500,
            detail=f"Error loading tool: {exc}",
        ) from exc

    payload: dict[str, Any] = {
        field: entry[field] for field in ("id", "name", "description", "path")
    }
    payload["content"] = parsed or {}
    return payload
260
+
261
+
262
@router.get("/local/agents")
async def list_local_agents() -> list[LocalFileInfo]:
    """Return every agent file the local context discovers.

    Raises:
        HTTPException: 500 when no local context is available.
    """
    local_ctx = get_local_context()
    if not local_ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    agent_entries = local_ctx.discover()["agents"]
    listing: list[LocalFileInfo] = []
    for entry in agent_entries:
        listing.append(LocalFileInfo(**entry))
    return listing
271
+
272
+
273
@router.get("/local/agents/{agent_id}")
async def get_local_agent(agent_id: str) -> dict[str, Any]:
    """Fetch one agent definition together with its parsed YAML.

    Args:
        agent_id: The agent identifier.

    Returns:
        A dict with ``id``, ``name``, ``description``, ``path`` and the parsed
        ``content`` (an empty dict for an empty file).

    Raises:
        HTTPException: 500 when not in local mode or the file cannot be
            read/parsed; 404 when no discovered agent has this ID.
    """
    local_ctx = get_local_context()
    if not local_ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    agent_entries = local_ctx.discover()["agents"]
    entry = next((item for item in agent_entries if item["id"] == agent_id), None)
    if entry is None:
        raise HTTPException(status_code=404, detail=f"Agent not found: {agent_id}")

    try:
        raw_text = Path(entry["path"]).read_text()
        parsed = yaml.safe_load(raw_text)
    except Exception as exc:
        raise HTTPException(
            status_code=500,
            detail=f"Error loading agent: {exc}",
        ) from exc

    payload: dict[str, Any] = {
        field: entry[field] for field in ("id", "name", "description", "path")
    }
    payload["content"] = parsed or {}
    return payload
310
+
311
+
312
@router.get("/local/runs")
async def list_local_runs() -> list[dict[str, Any]]:
    """List run results from the local runs/ directory.

    ``*.json`` files are visited in reverse-sorted filename order and at most
    100 readable entries are returned. Files that cannot be read or parsed
    are skipped and logged.

    Returns:
        One summary dict per run with ``filename``, ``path``, ``scenario_id``,
        ``timestamp`` and ``metadata``.

    Raises:
        HTTPException: 500 when no local context is available.
    """
    import json

    ctx = get_local_context()
    if not ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    max_runs = 100  # Limit to most recent 100

    runs: list[dict[str, Any]] = []
    if not ctx.runs_dir.exists():
        return runs

    for path in sorted(ctx.runs_dir.glob("*.json"), reverse=True):
        # Stop as soon as the page is full instead of parsing every file on
        # disk and discarding all but the first 100 afterwards.
        if len(runs) >= max_runs:
            break
        try:
            data = json.loads(path.read_text())
            runs.append(
                {
                    "filename": path.name,
                    "path": str(path),
                    "scenario_id": data.get("scenario_id"),
                    "timestamp": data.get("timestamp"),
                    "metadata": data.get("metadata", {}),
                }
            )
        except Exception as e:
            # Best effort: one bad file must not break the listing, but leave
            # a trace so it can be found.
            logger.warning("Skipping unreadable run file %s: %s", path, e)
            continue

    return runs
340
+
341
+
342
@router.get("/local/runs/{filename}")
async def get_local_run(filename: str) -> dict[str, Any]:
    """Get a specific run result.

    Args:
        filename: The run result filename (a single path component inside
            the runs/ directory).

    Returns:
        Full run result data.

    Raises:
        HTTPException: 500 when not in local mode or the file cannot be
            read/parsed; 400 when ``filename`` is not a plain file name;
            404 when no such file exists.
    """
    import json

    ctx = get_local_context()
    if not ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    # The name comes straight from the URL. Refuse anything that is not a
    # single path component so the lookup cannot leave runs_dir (e.g. ".."
    # or, on Windows, names containing backslashes or a drive prefix).
    if filename == ".." or Path(filename).name != filename:
        raise HTTPException(status_code=400, detail=f"Invalid run filename: {filename}")

    filepath = ctx.runs_dir / filename
    # is_file() rather than exists(): a directory is "not found", not a 500.
    if not filepath.is_file():
        raise HTTPException(status_code=404, detail=f"Run not found: {filename}")

    try:
        return json.loads(filepath.read_text())
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error loading run: {e}") from e
366
+
367
+
368
+ # =============================================================================
369
+ # Scenario Execution API
370
+ # =============================================================================
371
+
372
+
373
class RunScenarioRequest(BaseModel):
    """Body of a ``POST /local/run`` request."""

    # Scenario to execute and the model to run it with.
    scenario_id: str
    model: str
    # Values forwarded to the runner as the scenario's variables.
    variables: dict[str, Any] = Field(default_factory=dict)
    # Runner limits and sampling settings, passed through unchanged.
    max_turns: int = 20
    max_tokens: int = 1024
    temperature: float = 0.7
382
+
383
+
384
class RunScenarioResponse(BaseModel):
    """Result of a single-model scenario run."""

    # Identity of the run.
    id: str
    scenario_id: str
    model: str
    # Response text reported by the runner.
    response: str
    # Conversation messages, each as {"role", "content"}.
    history: list[dict[str, Any]]
    # Tool invocations, each as {"tool", "action", "args", "success"}.
    tool_calls: list[dict[str, Any]]
    # State reported by the runner at the end of the run.
    final_state: dict[str, Any]
    # Evaluation as a dict, or None when the run produced none.
    evaluation: dict[str, Any] | None
    latency_ms: int
    # Error reported by the runner, if any.
    error: str | None
397
+
398
+
399
class CompareModelsRequest(BaseModel):
    """Body of a ``POST /local/compare`` request."""

    scenario_id: str
    # Models to compare; the endpoint rejects an empty list.
    models: list[str]
    # How many times each model is run.
    runs_per_model: int = 1
    # Values forwarded to the comparison as the scenario's variables.
    variables: dict[str, Any] = Field(default_factory=dict)
    max_turns: int = 20
407
+
408
+
409
class CompareModelsResponse(BaseModel):
    """Outcome of a multi-model comparison."""

    scenario_id: str
    scenario_name: str
    models: list[str]
    runs_per_model: int
    # Per-key statistics, each converted to a dict.
    stats: dict[str, Any]
    # Ranking as reported by the comparison object.
    ranking: list[str]
    # Winner as reported by the comparison object; may be None.
    winner: str | None
    # Individual run results
    results: list[dict[str, Any]] = Field(default_factory=list)
420
+
421
+
422
@router.post("/local/run", response_model=RunScenarioResponse)
async def run_scenario(request: RunScenarioRequest) -> RunScenarioResponse:
    """Execute one scenario against one model and persist the result.

    Args:
        request: Run configuration including scenario_id, model, and variables.

    Returns:
        Scenario execution result.

    Raises:
        HTTPException: 500 when not in local mode or when loading, running
            or saving fails; 404 when the scenario ID is unknown.
    """
    local_ctx = get_local_context()
    if not local_ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    # Resolve the scenario ID to its file via discovery.
    scenario_entries = local_ctx.discover()["scenarios"]
    entry = next(
        (item for item in scenario_entries if item["id"] == request.scenario_id),
        None,
    )
    if entry is None:
        raise HTTPException(
            status_code=404,
            detail=f"Scenario not found: {request.scenario_id}",
        )
    scenario_path = Path(entry["path"])

    try:
        # Imported here so import failures are reported as a 500 as well.
        from sandboxy.scenarios.unified import UnifiedRunner, load_unified_scenario

        spec = load_unified_scenario(scenario_path)
        outcome = await UnifiedRunner().run(
            scenario=spec,
            model=request.model,
            variables=request.variables,
            max_turns=request.max_turns,
            max_tokens=request.max_tokens,
            temperature=request.temperature,
        )

        # Save result to runs/
        from sandboxy.local.results import save_run_result

        save_run_result(request.scenario_id, outcome.to_dict())

        messages = [{"role": msg.role, "content": msg.content} for msg in outcome.history]
        calls = [
            {
                "tool": call.tool,
                "action": call.action,
                "args": call.args,
                "success": call.success,
            }
            for call in outcome.tool_calls
        ]
        evaluation = outcome.evaluation.to_dict() if outcome.evaluation else None

        return RunScenarioResponse(
            id=outcome.id,
            scenario_id=outcome.scenario_id,
            model=outcome.model,
            response=outcome.response,
            history=messages,
            tool_calls=calls,
            final_state=outcome.final_state,
            evaluation=evaluation,
            latency_ms=outcome.latency_ms,
            error=outcome.error,
        )

    except Exception as e:
        logger.exception(f"Error running scenario: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
490
+
491
+
492
@router.post("/local/compare", response_model=CompareModelsResponse)
async def compare_models(request: CompareModelsRequest) -> CompareModelsResponse:
    """Run one scenario against several models and report the comparison.

    Args:
        request: Comparison configuration including scenario_id, models, and
            runs_per_model.

    Returns:
        Comparison results with statistics and ranking.

    Raises:
        HTTPException: 500 when not in local mode or the comparison fails;
            404 when the scenario ID is unknown; 400 when no model is given.
    """
    local_ctx = get_local_context()
    if not local_ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    # Resolve the scenario ID to its file via discovery.
    scenario_entries = local_ctx.discover()["scenarios"]
    entry = next(
        (item for item in scenario_entries if item["id"] == request.scenario_id),
        None,
    )
    if entry is None:
        raise HTTPException(
            status_code=404,
            detail=f"Scenario not found: {request.scenario_id}",
        )
    scenario_path = Path(entry["path"])

    if not request.models:
        raise HTTPException(
            status_code=400,
            detail="At least one model is required",
        )

    try:
        # Imported here so import failures are reported as a 500 as well.
        from sandboxy.scenarios.comparison import run_comparison
        from sandboxy.scenarios.unified import load_unified_scenario

        spec = load_unified_scenario(scenario_path)

        outcome = await run_comparison(
            scenario=spec,
            models=request.models,
            runs_per_model=request.runs_per_model,
            variables=request.variables,
            max_turns=request.max_turns,
        )

        # Save comparison result
        from sandboxy.local.results import save_run_result

        save_run_result(f"{request.scenario_id}_comparison", outcome.to_dict())

        stats_by_key = {key: stat.to_dict() for key, stat in outcome.stats.items()}

        return CompareModelsResponse(
            scenario_id=outcome.scenario_id,
            scenario_name=outcome.scenario_name,
            models=outcome.models,
            runs_per_model=outcome.runs_per_model,
            stats=stats_by_key,
            ranking=outcome.get_ranking(),
            winner=outcome.get_winner(),
            results=[run.to_dict() for run in outcome.results],
        )

    except Exception as e:
        logger.exception(f"Error comparing models: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
563
+
564
+
565
def get_model_pricing(model_id: str) -> dict[str, float] | None:
    """Look up per-million-token prices for a model in the OpenRouter table.

    Args:
        model_id: Key into ``OPENROUTER_MODELS``.

    Returns:
        ``{"input": ..., "output": ...}`` with missing/falsy costs reported
        as 0, or None when the model is not in the table.
    """
    from sandboxy.providers.openrouter import OPENROUTER_MODELS

    entry = OPENROUTER_MODELS.get(model_id)
    if entry:
        prices = {
            "input": entry.input_cost_per_million or 0,
            "output": entry.output_cost_per_million or 0,
        }
        return prices
    return None
576
+
577
+
578
def calculate_cost(model_id: str, input_tokens: int, output_tokens: int) -> float | None:
    """Compute the cost of a run from its token counts.

    Args:
        model_id: Model whose pricing is looked up via ``get_model_pricing``.
        input_tokens: Number of input tokens.
        output_tokens: Number of output tokens.

    Returns:
        Input cost plus output cost, using per-million-token prices, or None
        when no pricing is available for the model.
    """
    rates = get_model_pricing(model_id)
    if not rates:
        return None

    # Prices are per million tokens, so scale the counts down first.
    millions_in = input_tokens / 1_000_000
    millions_out = output_tokens / 1_000_000
    return millions_in * rates["input"] + millions_out * rates["output"]
586
+
587
+
588
@router.get("/local/models")
async def list_available_models() -> list[dict[str, Any]]:
    """List the models in the OpenRouter model table.

    Returns:
        One dict per model with ``id``, ``name``, a display ``price`` string,
        numeric ``pricing``, ``provider``, ``context_length`` and
        ``supports_vision``.
    """
    from sandboxy.providers.openrouter import OPENROUTER_MODELS

    models = []
    for model_id, info in OPENROUTER_MODELS.items():
        # Normalise missing costs once. The "pricing" dict already treated a
        # falsy cost as 0, but the display string formatted the raw value with
        # ":.2f", which raises TypeError when a cost is None.
        input_cost = info.input_cost_per_million or 0
        output_cost = info.output_cost_per_million or 0

        # Format price string
        if input_cost == 0 and output_cost == 0:
            price = "Free"
        else:
            price = f"${input_cost:.2f}/${output_cost:.2f}"

        models.append(
            {
                "id": model_id,
                "name": info.name,
                "price": price,
                "pricing": {
                    "input": input_cost,
                    "output": output_cost,
                },
                "provider": info.provider,
                "context_length": info.context_length,
                "supports_vision": info.supports_vision,
            }
        )

    return models
617
+
618
+
619
+ # =============================================================================
620
+ # Scenario Management API
621
+ # =============================================================================
622
+
623
+
624
class SaveScenarioRequest(BaseModel):
    """Body of a request that creates or updates a scenario file."""

    # Scenario identifier; used as the file stem when creating.
    id: str
    # Raw YAML text written to the file.
    content: str
629
+
630
+
631
class SaveScenarioResponse(BaseModel):
    """Acknowledgement returned after a scenario file is written."""

    id: str
    # Path of the file that was written.
    path: str
    # Human-readable confirmation.
    message: str
637
+
638
+
639
@router.post("/local/scenarios", response_model=SaveScenarioResponse)
async def save_scenario(request: SaveScenarioRequest) -> SaveScenarioResponse:
    """Save a scenario to the scenarios/ directory as ``<id>.yml``.

    Args:
        request: Scenario ID and YAML content.

    Returns:
        Saved scenario info.

    Raises:
        HTTPException: 500 when not in local mode; 400 when the ID is empty
            or malformed, or the content is not valid YAML.
    """
    import re

    ctx = get_local_context()
    if not ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    # Validate ID
    if not request.id:
        raise HTTPException(status_code=400, detail="Scenario ID is required")

    # fullmatch, not match-with-"$": "$" also matches just before a trailing
    # newline, which would let an ID like "abc\n" through into the file name.
    if not re.fullmatch(r"[a-z0-9-]+", request.id):
        raise HTTPException(
            status_code=400,
            detail="Scenario ID must contain only lowercase letters, numbers, and hyphens",
        )

    # Validate YAML (parsed only to check syntax; the raw text is what is saved)
    try:
        yaml.safe_load(request.content)
    except yaml.YAMLError as e:
        raise HTTPException(status_code=400, detail=f"Invalid YAML: {e}") from e

    # Ensure scenarios directory exists
    ctx.scenarios_dir.mkdir(parents=True, exist_ok=True)

    # Save file
    filepath = ctx.scenarios_dir / f"{request.id}.yml"
    filepath.write_text(request.content)

    return SaveScenarioResponse(
        id=request.id,
        path=str(filepath),
        message=f"Scenario saved to {filepath}",
    )
683
+
684
+
685
@router.put("/local/scenarios/{scenario_id}")
async def update_scenario(
    scenario_id: str,
    request: SaveScenarioRequest,
) -> SaveScenarioResponse:
    """Overwrite an existing scenario file with new YAML content.

    The file is looked up as ``<scenario_id>.yml`` and then
    ``<scenario_id>.yaml`` in the scenarios directory. Only
    ``request.content`` is used; ``request.id`` is ignored.

    Args:
        scenario_id: The scenario ID to update.
        request: New YAML content.

    Returns:
        Updated scenario info.

    Raises:
        HTTPException: 500 when not in local mode; 400 when the ID is not a
            plain file stem or the content is not valid YAML; 404 when no
            matching file exists.
    """
    ctx = get_local_context()
    if not ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    # The ID comes from the URL and becomes part of a path that is written
    # to. Refuse anything that is not a single path component so the write
    # cannot land outside scenarios_dir.
    if not scenario_id or scenario_id == ".." or Path(scenario_id).name != scenario_id:
        raise HTTPException(status_code=400, detail=f"Invalid scenario ID: {scenario_id}")

    # Find existing file (.yml first, then .yaml)
    filepath = None
    for suffix in (".yml", ".yaml"):
        candidate = ctx.scenarios_dir / f"{scenario_id}{suffix}"
        if candidate.exists():
            filepath = candidate
            break
    if filepath is None:
        raise HTTPException(status_code=404, detail=f"Scenario not found: {scenario_id}")

    # Validate YAML (parsed only to check syntax; the raw text is what is saved)
    try:
        yaml.safe_load(request.content)
    except yaml.YAMLError as e:
        raise HTTPException(status_code=400, detail=f"Invalid YAML: {e}") from e

    # Update file
    filepath.write_text(request.content)

    return SaveScenarioResponse(
        id=scenario_id,
        path=str(filepath),
        message=f"Scenario updated at {filepath}",
    )
724
+
725
+
726
@router.delete("/local/scenarios/{scenario_id}")
async def delete_scenario(scenario_id: str) -> dict[str, str]:
    """Delete a scenario file.

    The file is looked up as ``<scenario_id>.yml`` and then
    ``<scenario_id>.yaml`` in the scenarios directory.

    Args:
        scenario_id: The scenario ID to delete.

    Returns:
        Confirmation message.

    Raises:
        HTTPException: 500 when not in local mode; 400 when the ID is not a
            plain file stem; 404 when no matching file exists.
    """
    ctx = get_local_context()
    if not ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    # The ID comes from the URL and selects a file to remove. Refuse anything
    # that is not a single path component so the delete cannot reach outside
    # scenarios_dir.
    if not scenario_id or scenario_id == ".." or Path(scenario_id).name != scenario_id:
        raise HTTPException(status_code=400, detail=f"Invalid scenario ID: {scenario_id}")

    # Find existing file (.yml first, then .yaml)
    filepath = None
    for suffix in (".yml", ".yaml"):
        candidate = ctx.scenarios_dir / f"{scenario_id}{suffix}"
        if candidate.exists():
            filepath = candidate
            break
    if filepath is None:
        raise HTTPException(status_code=404, detail=f"Scenario not found: {scenario_id}")

    filepath.unlink()

    return {"message": f"Scenario {scenario_id} deleted"}
750
+
751
+
752
+ # =============================================================================
753
+ # Tool Management
754
+ # =============================================================================
755
+
756
+
757
class SaveToolRequest(BaseModel):
    """Body of a ``POST /local/tools`` request."""

    # Tool name; becomes the file stem.
    name: str
    # One of: yaml, python, or mcp
    toolType: str = "yaml"
    # YAML or Python source text, depending on toolType.
    content: str
763
+
764
+
765
class SaveToolResponse(BaseModel):
    """Acknowledgement returned after a tool file is written."""

    name: str
    # Path of the file that was written.
    path: str
    # Human-readable confirmation.
    message: str
771
+
772
+
773
@router.post("/local/tools", response_model=SaveToolResponse)
async def save_tool(request: SaveToolRequest) -> SaveToolResponse:
    """Save a new tool to the tools/ directory.

    Supports three tool types:
    - yaml: Declarative YAML mock tools (saved as ``<name>.yml``)
    - python: Python tool class (saved as ``<name>.py``)
    - mcp: MCP server configuration (saved as ``<name>.yml``)

    Any ``toolType`` other than "python" is handled as YAML.

    Args:
        request: Tool name, type, and content.

    Returns:
        Saved tool info.

    Raises:
        HTTPException: 500 when not in local mode; 400 when the name is
            empty or malformed, or YAML content does not parse.
    """
    import re

    ctx = get_local_context()
    if not ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    # Validate name
    if not request.name:
        raise HTTPException(status_code=400, detail="Tool name is required")

    # fullmatch, not match-with-"$": "$" also matches just before a trailing
    # newline, which would let a name like "abc\n" through into the file name.
    if not re.fullmatch(r"[a-z0-9_]+", request.name):
        raise HTTPException(
            status_code=400,
            detail="Tool name must contain only lowercase letters, numbers, and underscores",
        )

    # Ensure tools directory exists
    ctx.tools_dir.mkdir(parents=True, exist_ok=True)

    if request.toolType == "python":
        # NOTE(review): the Python source is written verbatim with no
        # validation, and tool discovery is refreshed right after. If
        # discovery imports files from this directory, this endpoint lets any
        # client run code on the host -- confirm it is only exposed locally.
        filepath = ctx.tools_dir / f"{request.name}.py"
    else:
        # "mcp" and the default YAML mock tool share the same handling:
        # syntax-check the YAML, then store it as <name>.yml.
        try:
            yaml.safe_load(request.content)
        except yaml.YAMLError as e:
            raise HTTPException(status_code=400, detail=f"Invalid YAML: {e}") from e
        filepath = ctx.tools_dir / f"{request.name}.yml"

    filepath.write_text(request.content)

    # Clear tool cache so new tools are discovered
    from sandboxy.tools.loader import discover_python_tools

    discover_python_tools(refresh=True)

    return SaveToolResponse(
        name=request.name,
        path=str(filepath),
        message=f"Tool saved to {filepath}",
    )
839
+
840
+
841
+ # =============================================================================
842
+ # Dataset Management API
843
+ # =============================================================================
844
+
845
+
846
class DatasetInfo(BaseModel):
    """Summary record for one dataset file."""

    # File stem of the dataset file.
    id: str
    name: str
    description: str
    # Number of listed cases, or an estimate for generated datasets.
    case_count: int
    # Location of the file on disk.
    path: str
    # Location relative to the workspace root directory.
    relative_path: str
855
+
856
+
857
class DatasetCaseInfo(BaseModel):
    """One test case inside a dataset."""

    id: str
    # Can have multiple expected outcomes
    expected: list[str] = Field(default_factory=list)
    # Per-case data copied from the dataset file.
    variables: dict[str, Any] = Field(default_factory=dict)
    tool_responses: dict[str, Any] = Field(default_factory=dict)
    tags: list[str] = Field(default_factory=list)
865
+
866
+
867
class DatasetDetail(BaseModel):
    """Full view of one dataset, including all of its cases."""

    id: str
    name: str
    description: str
    # Linked scenario for goal discovery
    scenario_id: str | None = None
    # Explicitly listed cases.
    cases: list[DatasetCaseInfo]
    # Raw ``generator`` section of the dataset file, when present.
    generator: dict[str, Any] | None = None
    # Location of the file on disk.
    path: str
877
+
878
+
879
class ScenarioGoalInfo(BaseModel):
    """Summary of one goal defined by a scenario."""

    id: str
    name: str
    description: str
    # NOTE(review): meaning of this flag is not visible here -- presumably
    # marks goals that represent an outcome; confirm against the goals route.
    outcome: bool = False
886
+
887
+
888
class SaveDatasetRequest(BaseModel):
    """Body of a request that saves a dataset file."""

    # Dataset identifier.
    id: str
    # Raw YAML text of the dataset.
    content: str
893
+
894
+
895
class SaveDatasetResponse(BaseModel):
    """Acknowledgement returned after a dataset is saved."""

    id: str
    # Path of the saved file.
    path: str
    # Human-readable confirmation.
    message: str
901
+
902
+
903
class RunDatasetRequest(BaseModel):
    """Body of a request that runs a scenario against a dataset."""

    # What to run: scenario, dataset, and model.
    scenario_id: str
    dataset_id: str
    model: str
    # Limits and sampling settings.
    max_turns: int = 20
    max_tokens: int = 1024
    temperature: float = 0.7
    # NOTE(review): presumably the number of cases run concurrently --
    # confirm against the dataset runner.
    parallel: int = 1
913
+
914
+
915
class RunDatasetResponse(BaseModel):
    """Aggregate result of running a scenario over a dataset."""

    # What was run.
    scenario_id: str
    model: str
    dataset_id: str
    # Case counts and pass rate.
    total_cases: int
    passed_cases: int
    failed_cases: int
    pass_rate: float
    # Score averages.
    avg_score: float
    avg_percentage: float
    # Integer counters grouped by expected outcome.
    by_expected: dict[str, dict[str, int]]
    total_time_ms: int
    # One dict per executed case.
    case_results: list[dict[str, Any]]
930
+
931
+
932
@router.get("/local/datasets", response_model=list[DatasetInfo])
async def list_local_datasets() -> list[DatasetInfo]:
    """List datasets from the local datasets/ directory.

    Both ``*.yml`` and ``*.yaml`` files are listed, matching what
    ``GET /local/datasets/{dataset_id}`` can serve. When both extensions
    exist for the same stem, the ``.yml`` file is the one reported, since it
    is also the one the detail endpoint opens first.

    ``case_count`` is the number of entries under ``cases``. For
    generator-based datasets it is the product of the lengths of the
    list-valued ``generator.dimensions`` entries.

    Returns:
        One ``DatasetInfo`` per non-empty dataset file, ordered by path.
        Files that fail to load are skipped and logged.

    Raises:
        HTTPException: 500 when no local context is available.
    """
    ctx = get_local_context()
    if not ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    datasets: list[DatasetInfo] = []
    datasets_dir = ctx.datasets_dir

    if not datasets_dir.exists():
        return datasets

    # One file per stem; "*.yml" is globbed last so it replaces a same-named
    # "*.yaml" entry.
    by_stem: dict[str, Path] = {}
    for pattern in ("*.yaml", "*.yml"):
        for path in datasets_dir.glob(pattern):
            by_stem[path.stem] = path

    for path in sorted(by_stem.values()):
        try:
            content = yaml.safe_load(path.read_text())
            if not content:
                continue

            case_count = 0
            if "cases" in content:
                # `cases:` with no value parses to None.
                case_count = len(content.get("cases") or [])
            elif "generator" in content:
                # Estimate generated case count
                gen = content.get("generator") or {}
                dims = gen.get("dimensions") or {}
                case_count = 1
                for values in dims.values():
                    if isinstance(values, list):
                        case_count *= len(values)

            datasets.append(
                DatasetInfo(
                    id=path.stem,
                    name=content.get("name", path.stem),
                    description=content.get("description", ""),
                    case_count=case_count,
                    path=str(path),
                    relative_path=str(path.relative_to(ctx.root_dir)),
                )
            )
        except Exception as e:
            # Best effort: one bad file must not break the listing.
            logger.warning(f"Error loading dataset {path}: {e}")
            continue

    return datasets
974
+
975
+
976
@router.get("/local/datasets/{dataset_id}", response_model=DatasetDetail)
async def get_local_dataset(dataset_id: str) -> DatasetDetail:
    """Get a specific dataset by ID.

    The file is looked up as ``<dataset_id>.yml`` and then
    ``<dataset_id>.yaml`` in the datasets directory.

    Args:
        dataset_id: The dataset identifier.

    Returns:
        Dataset details including all cases. An empty file yields a dataset
        with no cases.

    Raises:
        HTTPException: 500 when not in local mode, the file cannot be
            read/parsed, or its top level is not a mapping; 400 when the ID
            is not a plain file stem; 404 when no matching file exists.
    """
    ctx = get_local_context()
    if not ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    # The ID comes from the URL and becomes part of a path. Refuse anything
    # that is not a single path component so the read stays in datasets_dir.
    if not dataset_id or dataset_id == ".." or Path(dataset_id).name != dataset_id:
        raise HTTPException(status_code=400, detail=f"Invalid dataset ID: {dataset_id}")

    datasets_dir = ctx.datasets_dir
    filepath = datasets_dir / f"{dataset_id}.yml"

    if not filepath.exists():
        filepath = datasets_dir / f"{dataset_id}.yaml"
    if not filepath.exists():
        raise HTTPException(status_code=404, detail=f"Dataset not found: {dataset_id}")

    try:
        # An empty file parses to None; treat it as an empty document rather
        # than crashing on None.get below.
        content = yaml.safe_load(filepath.read_text()) or {}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error loading dataset: {e}") from e

    if not isinstance(content, dict):
        raise HTTPException(
            status_code=500,
            detail="Error loading dataset: top-level YAML value must be a mapping",
        )

    cases = []
    # `cases:` with no value parses to None.
    for case_data in content.get("cases") or []:
        if not isinstance(case_data, dict):
            # Skip malformed entries instead of failing the whole dataset.
            continue

        # Handle expected as string or list
        expected_raw = case_data.get("expected")
        if expected_raw is None:
            expected = []
        elif isinstance(expected_raw, list):
            expected = expected_raw
        else:
            expected = [expected_raw]

        cases.append(
            DatasetCaseInfo(
                id=case_data.get("id", ""),
                expected=expected,
                variables=case_data.get("variables", {}),
                tool_responses=case_data.get("tool_responses", {}),
                tags=case_data.get("tags", []),
            )
        )

    return DatasetDetail(
        id=dataset_id,
        name=content.get("name", dataset_id),
        description=content.get("description", ""),
        scenario_id=content.get("scenario_id"),
        cases=cases,
        generator=content.get("generator"),
        path=str(filepath),
    )
1033
+
1034
+
1035
@router.get("/local/scenarios/{scenario_id}/goals", response_model=list[ScenarioGoalInfo])
async def get_scenario_goals(scenario_id: str) -> list[ScenarioGoalInfo]:
    """Return the evaluation goals declared by a scenario.

    Used by the dataset editor to populate its goal dropdown.

    Args:
        scenario_id: The scenario identifier.

    Returns:
        One entry per goal (id, name, description, outcome); an empty list
        when the scenario has no evaluation section or no goals.

    Raises:
        HTTPException: 500 if not in local mode or the scenario fails to
            load; 404 if no discovered scenario has this ID.
    """
    ctx = get_local_context()
    if not ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    # Resolve the ID to a file path via the first matching discovered scenario.
    scenario_path = next(
        (
            Path(entry["path"])
            for entry in ctx.discover()["scenarios"]
            if entry["id"] == scenario_id
        ),
        None,
    )
    if not scenario_path:
        raise HTTPException(status_code=404, detail=f"Scenario not found: {scenario_id}")

    try:
        from sandboxy.scenarios.unified import load_unified_scenario

        spec = load_unified_scenario(scenario_path)
        evaluation = spec.evaluation
        if not (evaluation and evaluation.goals):
            return []

        return [
            ScenarioGoalInfo(
                id=goal.id,
                name=goal.name,
                description=goal.description,
                outcome=goal.outcome,
            )
            for goal in evaluation.goals
        ]

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error loading scenario: {e}") from e
1082
+
1083
+
1084
class ScenarioToolAction(BaseModel):
    """A single action exposed by a scenario tool."""

    # Action name, taken from the key in the tool's effective-actions mapping.
    name: str
    # Free-text description; defaults to an empty string.
    description: str = ""
1089
+
1090
+
1091
class ScenarioToolInfo(BaseModel):
    """A tool available to a scenario, together with its actions."""

    # Tool name.
    name: str
    # Free-text description; defaults to an empty string.
    description: str = ""
    # Actions the tool offers; defaults to an empty list.
    actions: list[ScenarioToolAction] = []
1097
+
1098
+
1099
@router.get("/local/scenarios/{scenario_id}/tools", response_model=list[ScenarioToolInfo])
async def get_scenario_tools(scenario_id: str) -> list[ScenarioToolInfo]:
    """Return the tools a scenario can use, for the dataset editor dropdown.

    Inline tools declared on the scenario come first, followed by the tools
    of every library named in ``spec.tools_from`` whose ``.yml`` (preferred)
    or ``.yaml`` file exists in the local tools directory. Libraries with no
    matching file are silently skipped.

    Args:
        scenario_id: The scenario identifier.

    Returns:
        List of tools with their actions.

    Raises:
        HTTPException: 500 if not in local mode or if loading the scenario
            or its tools fails; 404 if no discovered scenario has this ID.
    """

    def _to_info(tool_name, tool_spec) -> ScenarioToolInfo:
        # Convert one loaded tool spec into its API representation.
        return ScenarioToolInfo(
            name=tool_name,
            description=tool_spec.description,
            actions=[
                ScenarioToolAction(name=action_name, description=action_spec.description)
                for action_name, action_spec in tool_spec.get_effective_actions().items()
            ],
        )

    ctx = get_local_context()
    if not ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    # Resolve the ID to a file path via the first matching discovered scenario.
    scenario_path = next(
        (
            Path(entry["path"])
            for entry in ctx.discover()["scenarios"]
            if entry["id"] == scenario_id
        ),
        None,
    )
    if not scenario_path:
        raise HTTPException(status_code=404, detail=f"Scenario not found: {scenario_id}")

    try:
        from sandboxy.scenarios.unified import load_unified_scenario
        from sandboxy.tools.yaml_tools import YamlToolLoader

        spec = load_unified_scenario(scenario_path)
        loader = YamlToolLoader([ctx.tools_dir])

        tools_info: list[ScenarioToolInfo] = []

        # Tools declared inline on the scenario itself.
        if spec.tools:
            for tool_name, tool_spec in loader.parse_inline_tools(spec.tools).items():
                tools_info.append(_to_info(tool_name, tool_spec))

        # Tools pulled in from named library files.
        for lib_name in spec.tools_from:
            candidates = (
                ctx.tools_dir / f"{lib_name}.yml",
                ctx.tools_dir / f"{lib_name}.yaml",
            )
            lib_path = next((p for p in candidates if p.exists()), None)
            if lib_path is None:
                continue

            library = loader.load_library_file(lib_path)
            for tool_name, tool_spec in library.tools.items():
                tools_info.append(_to_info(tool_name, tool_spec))

        return tools_info

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error loading scenario tools: {e}") from e
1180
+
1181
+
1182
@router.post("/local/datasets", response_model=SaveDatasetResponse)
async def save_dataset(request: SaveDatasetRequest) -> SaveDatasetResponse:
    """Save a new dataset to the datasets/ directory.

    The content is written verbatim to ``<datasets_dir>/<id>.yml``; the
    directory is created if missing. An existing file with the same name is
    overwritten.

    Args:
        request: Dataset ID and YAML content.

    Returns:
        Saved dataset info.

    Raises:
        HTTPException: 500 if not in local mode; 400 if the ID is missing or
            malformed, or if the content is not valid YAML.
    """
    import re

    ctx = get_local_context()
    if not ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    # Validate ID
    if not request.id:
        raise HTTPException(status_code=400, detail="Dataset ID is required")

    # fullmatch, not match-with-"$": "$" also matches just before a trailing
    # newline, which would let an ID like "name\n" through and put a newline
    # in the file name.
    if not re.fullmatch(r"[a-z0-9_-]+", request.id):
        raise HTTPException(
            status_code=400,
            detail="Dataset ID must contain only lowercase letters, numbers, hyphens, and underscores",
        )

    # Validate YAML (parsed only to check syntax; the raw text is what is saved)
    try:
        yaml.safe_load(request.content)
    except yaml.YAMLError as e:
        raise HTTPException(status_code=400, detail=f"Invalid YAML: {e}") from e

    # Ensure datasets directory exists
    datasets_dir = ctx.datasets_dir
    datasets_dir.mkdir(parents=True, exist_ok=True)

    # Save file
    filepath = datasets_dir / f"{request.id}.yml"
    filepath.write_text(request.content)

    return SaveDatasetResponse(
        id=request.id,
        path=str(filepath),
        message=f"Dataset saved to {filepath}",
    )
1227
+
1228
+
1229
@router.put("/local/datasets/{dataset_id}")
async def update_dataset(
    dataset_id: str,
    request: SaveDatasetRequest,
) -> SaveDatasetResponse:
    """Overwrite the YAML content of an existing dataset file.

    The target is ``<dataset_id>.yml`` if present, otherwise
    ``<dataset_id>.yaml``. Only ``request.content`` is used; the ID comes
    from the URL path.

    Args:
        dataset_id: The dataset ID to update.
        request: New YAML content.

    Returns:
        Updated dataset info.

    Raises:
        HTTPException: 500 if not in local mode; 404 if no dataset file
            exists for the ID; 400 if the content is not valid YAML.
    """
    ctx = get_local_context()
    if not ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    # Prefer the ".yml" file and fall back to ".yaml".
    base_dir = ctx.datasets_dir
    candidates = (base_dir / f"{dataset_id}.yml", base_dir / f"{dataset_id}.yaml")
    filepath = next((p for p in candidates if p.exists()), None)
    if filepath is None:
        raise HTTPException(status_code=404, detail=f"Dataset not found: {dataset_id}")

    # Syntax check only; the raw text is what gets written.
    try:
        yaml.safe_load(request.content)
    except yaml.YAMLError as e:
        raise HTTPException(status_code=400, detail=f"Invalid YAML: {e}") from e

    filepath.write_text(request.content)

    return SaveDatasetResponse(
        id=dataset_id,
        path=str(filepath),
        message=f"Dataset updated at {filepath}",
    )
1268
+
1269
+
1270
@router.delete("/local/datasets/{dataset_id}")
async def delete_dataset(dataset_id: str) -> dict[str, str]:
    """Remove a dataset file from the local datasets directory.

    The file removed is ``<dataset_id>.yml`` if present, otherwise
    ``<dataset_id>.yaml``.

    Args:
        dataset_id: The dataset ID to delete.

    Returns:
        Confirmation message.

    Raises:
        HTTPException: 500 if not in local mode; 404 if no dataset file
            exists for the ID.
    """
    ctx = get_local_context()
    if not ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    # Prefer the ".yml" file and fall back to ".yaml".
    base_dir = ctx.datasets_dir
    candidates = (base_dir / f"{dataset_id}.yml", base_dir / f"{dataset_id}.yaml")
    target = next((p for p in candidates if p.exists()), None)
    if target is None:
        raise HTTPException(status_code=404, detail=f"Dataset not found: {dataset_id}")

    target.unlink()

    return {"message": f"Dataset {dataset_id} deleted"}
1294
+
1295
+
1296
@router.post("/local/run-dataset", response_model=RunDatasetResponse)
async def run_with_dataset(request: RunDatasetRequest) -> RunDatasetResponse:
    """Benchmark a scenario by running it over every case of a dataset.

    Uses the parallel runner when ``request.parallel`` is greater than 1 and
    the plain runner otherwise. The result is saved via ``save_run_result``
    under the name ``<scenario_id>_dataset_<dataset_id>`` before it is
    returned.

    Args:
        request: Run configuration including scenario_id, dataset_id, and model.

    Returns:
        Dataset benchmark results.

    Raises:
        HTTPException: 500 if not in local mode or if loading, running, or
            saving fails; 404 if the scenario or dataset cannot be found.
    """
    ctx = get_local_context()
    if not ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    # Resolve the scenario ID via the first matching discovered scenario.
    scenario_path = next(
        (
            Path(entry["path"])
            for entry in ctx.discover()["scenarios"]
            if entry["id"] == request.scenario_id
        ),
        None,
    )
    if not scenario_path:
        raise HTTPException(
            status_code=404,
            detail=f"Scenario not found: {request.scenario_id}",
        )

    # Resolve the dataset file, preferring ".yml" over ".yaml".
    base_dir = ctx.datasets_dir
    candidates = (
        base_dir / f"{request.dataset_id}.yml",
        base_dir / f"{request.dataset_id}.yaml",
    )
    dataset_path = next((p for p in candidates if p.exists()), None)
    if dataset_path is None:
        raise HTTPException(
            status_code=404,
            detail=f"Dataset not found: {request.dataset_id}",
        )

    try:
        from sandboxy.datasets import load_dataset, run_dataset, run_dataset_parallel
        from sandboxy.scenarios.unified import load_unified_scenario

        spec = load_unified_scenario(scenario_path)
        dataset = load_dataset(dataset_path)

        # Arguments shared by both runners.
        run_kwargs = {
            "scenario": spec,
            "model": request.model,
            "dataset": dataset,
            "max_turns": request.max_turns,
            "max_tokens": request.max_tokens,
            "temperature": request.temperature,
        }
        if request.parallel > 1:
            result = await run_dataset_parallel(
                **run_kwargs,
                max_concurrent=request.parallel,
            )
        else:
            result = await run_dataset(**run_kwargs)

        # Save the result before responding.
        from sandboxy.local.results import save_run_result

        result_name = f"{request.scenario_id}_dataset_{request.dataset_id}"
        save_run_result(result_name, result.to_dict())

        serialised_cases = [case.to_dict() for case in result.case_results]
        return RunDatasetResponse(
            scenario_id=result.scenario_id,
            model=result.model,
            dataset_id=result.dataset_id,
            total_cases=result.total_cases,
            passed_cases=result.passed_cases,
            failed_cases=result.failed_cases,
            pass_rate=result.pass_rate,
            avg_score=result.avg_score,
            avg_percentage=result.avg_percentage,
            by_expected=result.by_expected,
            total_time_ms=result.total_time_ms,
            case_results=serialised_cases,
        )

    except Exception as e:
        logger.exception(f"Error running dataset: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e