sandboxy 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sandboxy/__init__.py +3 -0
- sandboxy/agents/__init__.py +21 -0
- sandboxy/agents/base.py +66 -0
- sandboxy/agents/llm_prompt.py +308 -0
- sandboxy/agents/loader.py +222 -0
- sandboxy/api/__init__.py +5 -0
- sandboxy/api/app.py +76 -0
- sandboxy/api/routes/__init__.py +1 -0
- sandboxy/api/routes/agents.py +92 -0
- sandboxy/api/routes/local.py +1388 -0
- sandboxy/api/routes/tools.py +106 -0
- sandboxy/cli/__init__.py +1 -0
- sandboxy/cli/main.py +1196 -0
- sandboxy/cli/type_detector.py +48 -0
- sandboxy/config.py +49 -0
- sandboxy/core/__init__.py +1 -0
- sandboxy/core/async_runner.py +824 -0
- sandboxy/core/mdl_parser.py +441 -0
- sandboxy/core/runner.py +599 -0
- sandboxy/core/safe_eval.py +165 -0
- sandboxy/core/state.py +234 -0
- sandboxy/datasets/__init__.py +20 -0
- sandboxy/datasets/loader.py +193 -0
- sandboxy/datasets/runner.py +442 -0
- sandboxy/errors.py +166 -0
- sandboxy/local/context.py +235 -0
- sandboxy/local/results.py +173 -0
- sandboxy/logging.py +31 -0
- sandboxy/mcp/__init__.py +25 -0
- sandboxy/mcp/client.py +360 -0
- sandboxy/mcp/wrapper.py +99 -0
- sandboxy/providers/__init__.py +34 -0
- sandboxy/providers/anthropic_provider.py +271 -0
- sandboxy/providers/base.py +123 -0
- sandboxy/providers/http_client.py +101 -0
- sandboxy/providers/openai_provider.py +282 -0
- sandboxy/providers/openrouter.py +958 -0
- sandboxy/providers/registry.py +199 -0
- sandboxy/scenarios/__init__.py +11 -0
- sandboxy/scenarios/comparison.py +491 -0
- sandboxy/scenarios/loader.py +262 -0
- sandboxy/scenarios/runner.py +468 -0
- sandboxy/scenarios/unified.py +1434 -0
- sandboxy/session/__init__.py +21 -0
- sandboxy/session/manager.py +278 -0
- sandboxy/tools/__init__.py +34 -0
- sandboxy/tools/base.py +127 -0
- sandboxy/tools/loader.py +270 -0
- sandboxy/tools/yaml_tools.py +708 -0
- sandboxy/ui/__init__.py +27 -0
- sandboxy/ui/dist/assets/index-CgAkYWrJ.css +1 -0
- sandboxy/ui/dist/assets/index-D4zoGFcr.js +347 -0
- sandboxy/ui/dist/index.html +14 -0
- sandboxy/utils/__init__.py +3 -0
- sandboxy/utils/time.py +20 -0
- sandboxy-0.0.1.dist-info/METADATA +241 -0
- sandboxy-0.0.1.dist-info/RECORD +60 -0
- sandboxy-0.0.1.dist-info/WHEEL +4 -0
- sandboxy-0.0.1.dist-info/entry_points.txt +3 -0
- sandboxy-0.0.1.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,1388 @@
|
|
|
1
|
+
"""API routes for local development mode."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
import yaml
|
|
10
|
+
from fastapi import APIRouter, HTTPException
|
|
11
|
+
from pydantic import BaseModel, Field
|
|
12
|
+
|
|
13
|
+
from sandboxy.local.context import get_local_context
|
|
14
|
+
|
|
15
|
+
router = APIRouter()
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class LocalFileInfo(BaseModel):
    """Summary record for one file discovered in the local workspace.

    Instances are built from the dictionaries returned by the local
    context's ``discover()`` call (``LocalFileInfo(**entry)``).
    """

    # Identifier the /local/... routes compare against their path parameter.
    id: str
    name: str
    description: str
    type: str | None
    # Location of the file; the detail routes read its contents from here.
    path: str
    # NOTE(review): presumably relative to the workspace root - confirm
    # against the discovery code.
    relative_path: str
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class LocalStatusResponse(BaseModel):
    """Payload returned by the ``/local/status`` endpoint."""

    mode: str = "local"
    # String form of the local context's root directory.
    root_dir: str
    # One list per discovered section of the workspace.
    scenarios: list[LocalFileInfo]
    tools: list[LocalFileInfo]
    agents: list[LocalFileInfo]
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class VariableInfo(BaseModel):
    """Description of a single scenario variable."""

    name: str
    label: str = ""
    # One of: string, number, boolean, select.
    type: str = "string"
    default: Any = None
    # Choices offered when ``type`` is "select".
    options: list[str] = []
    required: bool = True
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class ScenarioDetail(BaseModel):
    """Full view of one scenario, including its parsed YAML."""

    id: str
    name: str
    description: str
    type: str | None
    path: str
    # Parsed YAML document of the scenario file.
    content: dict[str, Any]
    # Variables that were explicitly defined or detected in the content.
    variables: list[VariableInfo] = []
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@router.get("/local/status", response_model=LocalStatusResponse)
async def get_local_status() -> LocalStatusResponse:
    """Report the state of the local development environment.

    Returns:
        The root directory plus every discovered scenario, tool and agent.

    Raises:
        HTTPException: 500 when no local context is available.
    """
    local_ctx = get_local_context()
    if not local_ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    found = local_ctx.discover()

    def as_infos(section: str) -> list[LocalFileInfo]:
        # Convert the raw discovery dictionaries of one section into models.
        return [LocalFileInfo(**entry) for entry in found[section]]

    return LocalStatusResponse(
        root_dir=str(local_ctx.root_dir),
        scenarios=as_infos("scenarios"),
        tools=as_infos("tools"),
        agents=as_infos("agents"),
    )
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@router.get("/local/scenarios")
async def list_local_scenarios() -> list[LocalFileInfo]:
    """Return every scenario reported by local discovery.

    Raises:
        HTTPException: 500 when no local context is available.
    """
    local_ctx = get_local_context()
    if not local_ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    entries = local_ctx.discover()["scenarios"]
    return [LocalFileInfo(**entry) for entry in entries]
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _extract_variables(content: dict[str, Any]) -> list[VariableInfo]:
    """Extract variables from scenario content.

    Variables can be:
    1. Explicitly defined in the 'variables' section
    2. Detected from {var} patterns in string values of the content

    Explicit definitions come first, in file order; detected variables follow
    in the order they are first encountered, so the result is stable between
    calls for the same content.

    Args:
        content: Parsed scenario YAML

    Returns:
        List of defined and detected variables
    """
    import re

    variables: dict[str, VariableInfo] = {}

    # 1. Explicitly defined variables. An empty ``variables:`` key parses to
    # None and hand-edited files may hold non-mapping entries; ignore both
    # instead of failing the whole request.
    declared = content.get("variables")
    if not isinstance(declared, list):
        declared = []
    for var in declared:
        if not isinstance(var, dict):
            continue
        name = var.get("name", "")
        if not name or not isinstance(name, str):
            continue
        variables[name] = VariableInfo(
            name=name,
            label=var.get("label", name.replace("_", " ").title()),
            type=var.get("type", "string"),
            default=var.get("default"),
            options=var.get("options") or [],
            required=var.get("required", True),
        )

    # 2. Detect {var} patterns in the remaining content.
    double_brace = re.compile(r"\{\{[^}]+\}\}")
    single_brace = re.compile(r"\{(\w+)\}")

    def find_vars(obj: Any, found: dict[str, None], skip_keys: set[str]) -> None:
        if isinstance(obj, str):
            # {{name}} is a tool parameter reference: strip those first so
            # the single-brace search cannot match their inner part.
            cleaned = double_brace.sub("", obj)
            for match in single_brace.findall(cleaned):
                # Names beginning with "state" are treated as state
                # references rather than user variables.
                if not match.startswith("state"):
                    found[match] = None
        elif isinstance(obj, dict):
            for key, value in obj.items():
                # Skipped keys are ignored at every nesting level.
                if key in skip_keys:
                    continue
                find_vars(value, found, skip_keys)
        elif isinstance(obj, list):
            for item in obj:
                find_vars(item, found, skip_keys)

    # A dict is used as an insertion-ordered set so the output order is
    # deterministic (a plain set of strings iterates in hash order).
    detected: dict[str, None] = {}
    # 'environment' and 'tools' may hold tool definitions that use their own
    # parameter syntax; 'config' is skipped as well.
    skip_sections = {"environment", "tools", "config"}
    find_vars(content, detected, skip_sections)

    # Add detected variables that aren't already defined
    for name in detected:
        if name not in variables:
            variables[name] = VariableInfo(
                name=name,
                label=name.replace("_", " ").title(),
                type="string",
                default=None,
                options=[],
                required=True,
            )

    return list(variables.values())
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
@router.get("/local/scenarios/{scenario_id}")
async def get_local_scenario(scenario_id: str) -> ScenarioDetail:
    """Get a specific scenario by ID.

    Args:
        scenario_id: The scenario identifier.

    Returns:
        Scenario details including full YAML content and detected variables.

    Raises:
        HTTPException: 500 when not in local mode, when the file cannot be
            read or parsed, or when its top level is not a mapping; 404 when
            no discovered scenario has the given ID.
    """
    ctx = get_local_context()
    if not ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    discovered = ctx.discover()
    entry = next((s for s in discovered["scenarios"] if s["id"] == scenario_id), None)
    if entry is None:
        raise HTTPException(status_code=404, detail=f"Scenario not found: {scenario_id}")

    try:
        content = yaml.safe_load(Path(entry["path"]).read_text())
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Error loading scenario: {e}",
        ) from e

    # An empty file parses to None; treat it as an empty scenario.
    content = content or {}
    # Valid YAML can still be a list or scalar at the top level. Variable
    # extraction and the response model both need a mapping, so report that
    # explicitly instead of failing with an unhandled error.
    if not isinstance(content, dict):
        raise HTTPException(
            status_code=500,
            detail="Error loading scenario: top-level YAML value must be a mapping",
        )

    return ScenarioDetail(
        id=entry["id"],
        name=entry["name"],
        description=entry["description"],
        type=entry["type"],
        path=entry["path"],
        content=content,
        variables=_extract_variables(content),
    )
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
@router.get("/local/tools")
async def list_local_tools() -> list[LocalFileInfo]:
    """Return every tool reported by local discovery.

    Raises:
        HTTPException: 500 when no local context is available.
    """
    local_ctx = get_local_context()
    if not local_ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    entries = local_ctx.discover()["tools"]
    return [LocalFileInfo(**entry) for entry in entries]
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
@router.get("/local/tools/{tool_id}")
async def get_local_tool(tool_id: str) -> dict[str, Any]:
    """Look up one tool library and return it with its parsed YAML.

    Args:
        tool_id: The tool library identifier.

    Returns:
        The tool's id, name, description and path plus its parsed content
        (an empty dict when the file parses to a falsy value).

    Raises:
        HTTPException: 500 when not in local mode or the file cannot be
            loaded; 404 when no discovered tool has the given ID.
    """
    local_ctx = get_local_context()
    if not local_ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    tools = local_ctx.discover()["tools"]
    entry = next((item for item in tools if item["id"] == tool_id), None)
    if entry is None:
        raise HTTPException(status_code=404, detail=f"Tool not found: {tool_id}")

    try:
        parsed = yaml.safe_load(Path(entry["path"]).read_text())
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Error loading tool: {e}",
        ) from e

    summary = {key: entry[key] for key in ("id", "name", "description", "path")}
    summary["content"] = parsed or {}
    return summary
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
@router.get("/local/agents")
async def list_local_agents() -> list[LocalFileInfo]:
    """Return every agent reported by local discovery.

    Raises:
        HTTPException: 500 when no local context is available.
    """
    local_ctx = get_local_context()
    if not local_ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    entries = local_ctx.discover()["agents"]
    return [LocalFileInfo(**entry) for entry in entries]
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
@router.get("/local/agents/{agent_id}")
async def get_local_agent(agent_id: str) -> dict[str, Any]:
    """Look up one agent and return it with its parsed YAML.

    Args:
        agent_id: The agent identifier.

    Returns:
        The agent's id, name, description and path plus its parsed content
        (an empty dict when the file parses to a falsy value).

    Raises:
        HTTPException: 500 when not in local mode or the file cannot be
            loaded; 404 when no discovered agent has the given ID.
    """
    local_ctx = get_local_context()
    if not local_ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    agents = local_ctx.discover()["agents"]
    entry = next((item for item in agents if item["id"] == agent_id), None)
    if entry is None:
        raise HTTPException(status_code=404, detail=f"Agent not found: {agent_id}")

    try:
        parsed = yaml.safe_load(Path(entry["path"]).read_text())
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Error loading agent: {e}",
        ) from e

    summary = {key: entry[key] for key in ("id", "name", "description", "path")}
    summary["content"] = parsed or {}
    return summary
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
@router.get("/local/runs")
async def list_local_runs() -> list[dict[str, Any]]:
    """List run results from the local runs/ directory.

    Files are visited in descending filename order and at most 100 readable
    results are returned. Files that cannot be read or parsed are skipped.

    Returns:
        One summary dict per run file (filename, path, scenario_id,
        timestamp, metadata).

    Raises:
        HTTPException: 500 when no local context is available.
    """
    import json

    ctx = get_local_context()
    if not ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    max_runs = 100
    runs: list[dict[str, Any]] = []
    if not ctx.runs_dir.exists():
        return runs

    # NOTE(review): "most recent first" relies on filenames sorting
    # chronologically - confirm against how run files are named.
    for path in sorted(ctx.runs_dir.glob("*.json"), reverse=True):
        try:
            data = json.loads(path.read_text())
            summary = {
                "filename": path.name,
                "path": str(path),
                "scenario_id": data.get("scenario_id"),
                "timestamp": data.get("timestamp"),
                "metadata": data.get("metadata", {}),
            }
        except Exception:
            # Best effort: one corrupt file must not break the listing, but
            # leave a trace for debugging.
            logger.debug("Skipping unreadable run file %s", path, exc_info=True)
            continue

        runs.append(summary)
        # Stop as soon as the limit is reached instead of parsing every file
        # in the directory and slicing afterwards.
        if len(runs) >= max_runs:
            break

    return runs
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
@router.get("/local/runs/{filename}")
async def get_local_run(filename: str) -> dict[str, Any]:
    """Get a specific run result.

    Args:
        filename: The run result filename (a bare name inside runs/).

    Returns:
        Full run result data.

    Raises:
        HTTPException: 500 when not in local mode or the file cannot be
            parsed; 404 when the name is not a plain file inside runs/.
    """
    import json

    ctx = get_local_context()
    if not ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    # The name comes straight from the URL. Only accept a single path
    # component so the lookup can never leave the runs directory (for
    # example via ".." or a platform-specific separator).
    if filename in {"", ".", ".."} or Path(filename).name != filename:
        raise HTTPException(status_code=404, detail=f"Run not found: {filename}")

    filepath = ctx.runs_dir / filename
    # is_file() also rejects directories, which previously surfaced as a 500.
    if not filepath.is_file():
        raise HTTPException(status_code=404, detail=f"Run not found: {filename}")

    try:
        return json.loads(filepath.read_text())
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error loading run: {e}") from e
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
# =============================================================================
|
|
369
|
+
# Scenario Execution API
|
|
370
|
+
# =============================================================================
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
class RunScenarioRequest(BaseModel):
    """Body of a single-model scenario run request."""

    # ID of a discovered scenario.
    scenario_id: str
    # Model identifier forwarded to the runner.
    model: str
    # Values for the scenario's variables.
    variables: dict[str, Any] = Field(default_factory=dict)
    # Generation limits and sampling settings forwarded to the runner.
    max_turns: int = 20
    max_tokens: int = 1024
    temperature: float = 0.7
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
class RunScenarioResponse(BaseModel):
    """Result of a single-model scenario run."""

    id: str
    scenario_id: str
    model: str
    response: str
    # Conversation messages as {"role", "content"} dictionaries.
    history: list[dict[str, Any]]
    # Tool invocations as {"tool", "action", "args", "success"} dictionaries.
    tool_calls: list[dict[str, Any]]
    final_state: dict[str, Any]
    # None when the run produced no evaluation.
    evaluation: dict[str, Any] | None
    latency_ms: int
    error: str | None
|
|
397
|
+
|
|
398
|
+
|
|
399
|
+
class CompareModelsRequest(BaseModel):
    """Body of a multi-model comparison request."""

    # ID of a discovered scenario.
    scenario_id: str
    # Model identifiers to compare; the endpoint rejects an empty list.
    models: list[str]
    runs_per_model: int = 1
    variables: dict[str, Any] = Field(default_factory=dict)
    max_turns: int = 20
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
class CompareModelsResponse(BaseModel):
    """Outcome of a multi-model comparison."""

    scenario_id: str
    scenario_name: str
    models: list[str]
    runs_per_model: int
    # Statistics serialised to plain dictionaries, keyed as the comparison
    # result keys them.
    stats: dict[str, Any]
    ranking: list[str]
    winner: str | None
    # Individual run results, one dictionary per run.
    results: list[dict[str, Any]] = Field(default_factory=list)
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
@router.post("/local/run", response_model=RunScenarioResponse)
async def run_scenario(request: RunScenarioRequest) -> RunScenarioResponse:
    """Execute one scenario against one model and persist the result.

    Args:
        request: Run configuration including scenario_id, model, and variables.

    Returns:
        Scenario execution result.

    Raises:
        HTTPException: 500 when not in local mode or when loading, running
            or saving fails; 404 when the scenario ID is unknown.
    """
    local_ctx = get_local_context()
    if not local_ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    # Resolve the scenario file from the discovery listing.
    scenarios = local_ctx.discover()["scenarios"]
    entry = next((item for item in scenarios if item["id"] == request.scenario_id), None)
    if entry is None:
        raise HTTPException(
            status_code=404,
            detail=f"Scenario not found: {request.scenario_id}",
        )
    scenario_path = Path(entry["path"])

    try:
        # Imported inside the try so an import failure is also reported as
        # an HTTP 500.
        from sandboxy.scenarios.unified import UnifiedRunner, load_unified_scenario

        spec = load_unified_scenario(scenario_path)
        outcome = await UnifiedRunner().run(
            scenario=spec,
            model=request.model,
            variables=request.variables,
            max_turns=request.max_turns,
            max_tokens=request.max_tokens,
            temperature=request.temperature,
        )

        # Persist the result before building the response.
        from sandboxy.local.results import save_run_result

        save_run_result(request.scenario_id, outcome.to_dict())

        history = [{"role": msg.role, "content": msg.content} for msg in outcome.history]
        tool_calls = [
            {
                "tool": call.tool,
                "action": call.action,
                "args": call.args,
                "success": call.success,
            }
            for call in outcome.tool_calls
        ]
        evaluation = outcome.evaluation.to_dict() if outcome.evaluation else None

        return RunScenarioResponse(
            id=outcome.id,
            scenario_id=outcome.scenario_id,
            model=outcome.model,
            response=outcome.response,
            history=history,
            tool_calls=tool_calls,
            final_state=outcome.final_state,
            evaluation=evaluation,
            latency_ms=outcome.latency_ms,
            error=outcome.error,
        )

    except Exception as e:
        logger.exception(f"Error running scenario: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
@router.post("/local/compare", response_model=CompareModelsResponse)
async def compare_models(request: CompareModelsRequest) -> CompareModelsResponse:
    """Run one scenario against several models and compare the outcomes.

    Args:
        request: Comparison configuration including scenario_id, models, and runs_per_model.

    Returns:
        Comparison results with statistics and ranking.

    Raises:
        HTTPException: 500 when not in local mode or when the comparison
            fails; 404 when the scenario ID is unknown; 400 when no model
            is given.
    """
    local_ctx = get_local_context()
    if not local_ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    # Resolve the scenario file from the discovery listing.
    scenarios = local_ctx.discover()["scenarios"]
    entry = next((item for item in scenarios if item["id"] == request.scenario_id), None)
    if entry is None:
        raise HTTPException(
            status_code=404,
            detail=f"Scenario not found: {request.scenario_id}",
        )
    scenario_path = Path(entry["path"])

    if not request.models:
        raise HTTPException(
            status_code=400,
            detail="At least one model is required",
        )

    try:
        # Imported inside the try so an import failure is also reported as
        # an HTTP 500.
        from sandboxy.scenarios.comparison import run_comparison
        from sandboxy.scenarios.unified import load_unified_scenario

        spec = load_unified_scenario(scenario_path)
        outcome = await run_comparison(
            scenario=spec,
            models=request.models,
            runs_per_model=request.runs_per_model,
            variables=request.variables,
            max_turns=request.max_turns,
        )

        # Persist the comparison under a "<scenario_id>_comparison" key.
        from sandboxy.local.results import save_run_result

        save_run_result(
            f"{request.scenario_id}_comparison",
            outcome.to_dict(),
        )

        return CompareModelsResponse(
            scenario_id=outcome.scenario_id,
            scenario_name=outcome.scenario_name,
            models=outcome.models,
            runs_per_model=outcome.runs_per_model,
            stats={key: stat.to_dict() for key, stat in outcome.stats.items()},
            ranking=outcome.get_ranking(),
            winner=outcome.get_winner(),
            results=[run.to_dict() for run in outcome.results],
        )

    except Exception as e:
        logger.exception(f"Error comparing models: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
|
|
563
|
+
|
|
564
|
+
|
|
565
|
+
def get_model_pricing(model_id: str) -> dict[str, float] | None:
    """Look up per-million-token prices for a model in the OpenRouter table.

    Args:
        model_id: Key into ``OPENROUTER_MODELS``.

    Returns:
        ``{"input": ..., "output": ...}`` with falsy costs reported as 0,
        or None when the model is not in the table.
    """
    from sandboxy.providers.openrouter import OPENROUTER_MODELS

    entry = OPENROUTER_MODELS.get(model_id)
    if entry:
        return {
            "input": entry.input_cost_per_million or 0,
            "output": entry.output_cost_per_million or 0,
        }
    return None
|
|
576
|
+
|
|
577
|
+
|
|
578
|
+
def calculate_cost(model_id: str, input_tokens: int, output_tokens: int) -> float | None:
    """Compute the cost of a run from its token counts.

    Args:
        model_id: Model whose pricing should be applied.
        input_tokens: Number of input tokens used.
        output_tokens: Number of output tokens used.

    Returns:
        Input cost plus output cost, or None when no pricing is available
        for the model.
    """
    rates = get_model_pricing(model_id)
    if not rates:
        return None

    # Prices are expressed per one million tokens.
    return (input_tokens / 1_000_000) * rates["input"] + (output_tokens / 1_000_000) * rates[
        "output"
    ]
|
|
586
|
+
|
|
587
|
+
|
|
588
|
+
@router.get("/local/models")
async def list_available_models() -> list[dict[str, Any]]:
    """List the models in the OpenRouter model table.

    Returns:
        One dict per model with id, name, a display price string, numeric
        pricing, provider, context length and vision support.
    """
    from sandboxy.providers.openrouter import OPENROUTER_MODELS

    models = []
    for model_id, info in OPENROUTER_MODELS.items():
        # Normalise once so the display string and the numeric pricing agree.
        # The numeric fields already used ``or 0``; formatting the raw value
        # with ``:.2f`` would raise TypeError for a missing (None) cost.
        input_cost = info.input_cost_per_million or 0
        output_cost = info.output_cost_per_million or 0

        if input_cost == 0 and output_cost == 0:
            price = "Free"
        else:
            price = f"${input_cost:.2f}/${output_cost:.2f}"

        models.append(
            {
                "id": model_id,
                "name": info.name,
                "price": price,
                "pricing": {
                    "input": input_cost,
                    "output": output_cost,
                },
                "provider": info.provider,
                "context_length": info.context_length,
                "supports_vision": info.supports_vision,
            }
        )

    return models
|
|
617
|
+
|
|
618
|
+
|
|
619
|
+
# =============================================================================
|
|
620
|
+
# Scenario Management API
|
|
621
|
+
# =============================================================================
|
|
622
|
+
|
|
623
|
+
|
|
624
|
+
class SaveScenarioRequest(BaseModel):
    """Body of a scenario save or update request."""

    # Scenario identifier; the save endpoint uses it as the file stem.
    id: str
    # Raw YAML text written to the scenario file.
    content: str
|
|
629
|
+
|
|
630
|
+
|
|
631
|
+
class SaveScenarioResponse(BaseModel):
    """Confirmation returned after a scenario was written."""

    id: str
    # String form of the file path that was written.
    path: str
    # Human-readable confirmation text.
    message: str
|
|
637
|
+
|
|
638
|
+
|
|
639
|
+
@router.post("/local/scenarios", response_model=SaveScenarioResponse)
async def save_scenario(request: SaveScenarioRequest) -> SaveScenarioResponse:
    """Save a scenario to the scenarios/ directory as ``<id>.yml``.

    There is no existence check, so an existing file with the same name is
    overwritten.

    Args:
        request: Scenario ID and YAML content.

    Returns:
        Saved scenario info.

    Raises:
        HTTPException: 500 when not in local mode; 400 when the ID is
            missing or malformed, or the content is not valid YAML.
    """
    import re

    ctx = get_local_context()
    if not ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    # Validate ID
    if not request.id:
        raise HTTPException(status_code=400, detail="Scenario ID is required")

    # fullmatch, not match with "$": "$" also matches just before a trailing
    # newline, which would let "name\n" through and into the filename.
    if not re.fullmatch(r"[a-z0-9-]+", request.id):
        raise HTTPException(
            status_code=400,
            detail="Scenario ID must contain only lowercase letters, numbers, and hyphens",
        )

    # Validate YAML (the parsed value is discarded; the raw text is stored).
    try:
        yaml.safe_load(request.content)
    except yaml.YAMLError as e:
        raise HTTPException(status_code=400, detail=f"Invalid YAML: {e}") from e

    # Ensure scenarios directory exists
    ctx.scenarios_dir.mkdir(parents=True, exist_ok=True)

    # Save file
    filepath = ctx.scenarios_dir / f"{request.id}.yml"
    filepath.write_text(request.content)

    return SaveScenarioResponse(
        id=request.id,
        path=str(filepath),
        message=f"Scenario saved to {filepath}",
    )
|
|
683
|
+
|
|
684
|
+
|
|
685
|
+
@router.put("/local/scenarios/{scenario_id}")
async def update_scenario(
    scenario_id: str,
    request: SaveScenarioRequest,
) -> SaveScenarioResponse:
    """Update an existing scenario.

    The target file is ``<scenario_id>.yml`` or, failing that,
    ``<scenario_id>.yaml`` in the scenarios directory. Only
    ``request.content`` is used; ``request.id`` is ignored.

    Args:
        scenario_id: The scenario ID to update.
        request: New YAML content.

    Returns:
        Updated scenario info.

    Raises:
        HTTPException: 500 when not in local mode; 400 when the ID is not a
            plain file name or the content is not valid YAML; 404 when no
            matching file exists.
    """
    ctx = get_local_context()
    if not ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    # The ID is interpolated into a file path, so it must be a single path
    # component. Existing files may not follow the stricter pattern enforced
    # on creation, hence the looser check here.
    if Path(scenario_id).name != scenario_id:
        raise HTTPException(status_code=400, detail="Invalid scenario ID")

    # Find existing file, preferring .yml over .yaml
    candidates = (ctx.scenarios_dir / f"{scenario_id}{ext}" for ext in (".yml", ".yaml"))
    filepath = next((path for path in candidates if path.exists()), None)
    if filepath is None:
        raise HTTPException(status_code=404, detail=f"Scenario not found: {scenario_id}")

    # Validate YAML
    try:
        yaml.safe_load(request.content)
    except yaml.YAMLError as e:
        raise HTTPException(status_code=400, detail=f"Invalid YAML: {e}") from e

    # Update file
    filepath.write_text(request.content)

    return SaveScenarioResponse(
        id=scenario_id,
        path=str(filepath),
        message=f"Scenario updated at {filepath}",
    )
|
|
724
|
+
|
|
725
|
+
|
|
726
|
+
@router.delete("/local/scenarios/{scenario_id}")
async def delete_scenario(scenario_id: str) -> dict[str, str]:
    """Delete a scenario.

    The target file is ``<scenario_id>.yml`` or, failing that,
    ``<scenario_id>.yaml`` in the scenarios directory.

    Args:
        scenario_id: The scenario ID to delete.

    Returns:
        Confirmation message.

    Raises:
        HTTPException: 500 when not in local mode; 400 when the ID is not a
            plain file name; 404 when no matching file exists.
    """
    ctx = get_local_context()
    if not ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    # This endpoint removes files, so never let the ID address anything
    # outside the scenarios directory: it must be a single path component.
    if Path(scenario_id).name != scenario_id:
        raise HTTPException(status_code=400, detail="Invalid scenario ID")

    # Find existing file, preferring .yml over .yaml
    candidates = (ctx.scenarios_dir / f"{scenario_id}{ext}" for ext in (".yml", ".yaml"))
    filepath = next((path for path in candidates if path.exists()), None)
    if filepath is None:
        raise HTTPException(status_code=404, detail=f"Scenario not found: {scenario_id}")

    filepath.unlink()

    return {"message": f"Scenario {scenario_id} deleted"}
|
|
750
|
+
|
|
751
|
+
|
|
752
|
+
# =============================================================================
|
|
753
|
+
# Tool Management
|
|
754
|
+
# =============================================================================
|
|
755
|
+
|
|
756
|
+
|
|
757
|
+
class SaveToolRequest(BaseModel):
    """Body of a tool save request."""

    # Tool name; the save endpoint uses it as the file stem.
    name: str
    # One of: yaml, python, mcp.
    toolType: str = "yaml"
    # YAML or Python source text, depending on ``toolType``.
    content: str
|
|
763
|
+
|
|
764
|
+
|
|
765
|
+
class SaveToolResponse(BaseModel):
    """Confirmation returned after a tool was written."""

    name: str
    # String form of the file path that was written.
    path: str
    # Human-readable confirmation text.
    message: str
|
|
771
|
+
|
|
772
|
+
|
|
773
|
+
@router.post("/local/tools", response_model=SaveToolResponse)
async def save_tool(request: SaveToolRequest) -> SaveToolResponse:
    """Save a new tool to the tools/ directory.

    Supports three tool types:
    - yaml: Declarative YAML mock tools
    - python: Python tool class (generates .py file)
    - mcp: MCP server configuration (YAML with type: mcp)

    Any ``toolType`` other than "python" is stored as YAML. Python content
    is written as-is, without validation.

    Args:
        request: Tool name, type, and content.

    Returns:
        Saved tool info.

    Raises:
        HTTPException: 500 when not in local mode; 400 when the name is
            missing or malformed, or YAML content does not parse.
    """
    import re

    ctx = get_local_context()
    if not ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    # Validate name
    if not request.name:
        raise HTTPException(status_code=400, detail="Tool name is required")

    # fullmatch, not match with "$": "$" also matches just before a trailing
    # newline, which would let "name\n" through and into the filename.
    if not re.fullmatch(r"[a-z0-9_]+", request.name):
        raise HTTPException(
            status_code=400,
            detail="Tool name must contain only lowercase letters, numbers, and underscores",
        )

    # Ensure tools directory exists
    ctx.tools_dir.mkdir(parents=True, exist_ok=True)

    if request.toolType == "python":
        suffix = ".py"
    else:
        # "mcp" configs and the default YAML mock tools are handled the
        # same way: the text must parse as YAML and is stored as .yml.
        try:
            yaml.safe_load(request.content)
        except yaml.YAMLError as e:
            raise HTTPException(status_code=400, detail=f"Invalid YAML: {e}") from e
        suffix = ".yml"

    filepath = ctx.tools_dir / f"{request.name}{suffix}"
    filepath.write_text(request.content)

    # Refresh Python tool discovery so the new tool is picked up
    from sandboxy.tools.loader import discover_python_tools

    discover_python_tools(refresh=True)

    return SaveToolResponse(
        name=request.name,
        path=str(filepath),
        message=f"Tool saved to {filepath}",
    )
|
|
839
|
+
|
|
840
|
+
|
|
841
|
+
# =============================================================================
|
|
842
|
+
# Dataset Management API
|
|
843
|
+
# =============================================================================
|
|
844
|
+
|
|
845
|
+
|
|
846
|
+
class DatasetInfo(BaseModel):
    """Summary record for one dataset file, used by the dataset listing endpoint."""

    # Dataset identifier (the listing endpoint uses the file name without suffix).
    id: str
    # Display name; the listing endpoint falls back to the identifier when the
    # YAML document has no "name" key.
    name: str
    # Free-form description; empty string when the YAML has none.
    description: str
    # Number of explicit cases, or an estimate for generator-based datasets.
    case_count: int
    # Path of the dataset file as a string.
    path: str
    # Same file, expressed relative to the local context's root directory.
    relative_path: str
|
|
855
|
+
|
|
856
|
+
|
|
857
|
+
class DatasetCaseInfo(BaseModel):
    """One test case belonging to a dataset."""

    # Case identifier.
    id: str
    # Can have multiple expected outcomes
    expected: list[str] = Field(default_factory=list)
    # Per-case variables, taken from the case's "variables" key.
    variables: dict[str, Any] = Field(default_factory=dict)
    # Per-case tool responses, taken from the case's "tool_responses" key.
    tool_responses: dict[str, Any] = Field(default_factory=dict)
    # Free-form labels attached to the case.
    tags: list[str] = Field(default_factory=list)
|
|
865
|
+
|
|
866
|
+
|
|
867
|
+
class DatasetDetail(BaseModel):
    """Full description of a dataset, including every explicit case."""

    # Dataset identifier.
    id: str
    # Display name.
    name: str
    # Free-form description.
    description: str
    # Linked scenario for goal discovery
    scenario_id: str | None = None
    # Explicit cases declared in the dataset file.
    cases: list[DatasetCaseInfo]
    # Raw "generator" section of the dataset file, when present.
    generator: dict[str, Any] | None = None
    # Path of the dataset file as a string.
    path: str
|
|
877
|
+
|
|
878
|
+
|
|
879
|
+
class ScenarioGoalInfo(BaseModel):
    """A single evaluation goal exposed by a scenario."""

    # Goal identifier.
    id: str
    # Display name of the goal.
    name: str
    # Free-form description of the goal.
    description: str
    # Outcome flag copied from the scenario's goal definition.
    outcome: bool = False
|
|
886
|
+
|
|
887
|
+
|
|
888
|
+
class SaveDatasetRequest(BaseModel):
    """Payload for creating or updating a dataset file."""

    # Dataset identifier.
    id: str
    # YAML content
    content: str
|
|
893
|
+
|
|
894
|
+
|
|
895
|
+
class SaveDatasetResponse(BaseModel):
    """Result returned after a dataset file has been written."""

    # Identifier of the dataset that was written.
    id: str
    # Path of the written file as a string.
    path: str
    # Human-readable confirmation text.
    message: str
|
|
901
|
+
|
|
902
|
+
|
|
903
|
+
class RunDatasetRequest(BaseModel):
    """Parameters for running a scenario against every case of a dataset."""

    # Scenario to execute.
    scenario_id: str
    # Dataset providing the cases.
    dataset_id: str
    # Model identifier passed through to the dataset runner.
    model: str
    # Runner settings, forwarded unchanged.
    max_turns: int = 20
    max_tokens: int = 1024
    temperature: float = 0.7
    # Values greater than 1 select the parallel runner and are used as its
    # concurrency limit.
    parallel: int = 1
|
|
913
|
+
|
|
914
|
+
|
|
915
|
+
class RunDatasetResponse(BaseModel):
    """Aggregated results of a dataset run."""

    # Echo of what was run.
    scenario_id: str
    model: str
    dataset_id: str
    # Case totals.
    total_cases: int
    passed_cases: int
    failed_cases: int
    # Aggregate metrics copied from the runner's result object.
    pass_rate: float
    avg_score: float
    avg_percentage: float
    # Integer counters grouped under string keys, copied from the runner result.
    by_expected: dict[str, dict[str, int]]
    # Total run time in milliseconds.
    total_time_ms: int
    # One serialized dictionary per executed case.
    case_results: list[dict[str, Any]]
|
|
930
|
+
|
|
931
|
+
|
|
932
|
+
@router.get("/local/datasets", response_model=list[DatasetInfo])
async def list_local_datasets() -> list[DatasetInfo]:
    """List datasets from local datasets/ directory.

    Both ``.yml`` and ``.yaml`` files are listed, so every dataset that the
    single-dataset endpoints can resolve also shows up here. When a dataset
    exists under both suffixes it is reported once, using the ``.yml`` file.

    Files that are empty, unreadable or malformed are skipped (a warning is
    logged for the latter two) instead of failing the whole listing.

    Returns:
        One entry per dataset file, ordered by file path.

    Raises:
        HTTPException: 500 when no local context is available.
    """
    ctx = get_local_context()
    if not ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    datasets = []
    datasets_dir = ctx.datasets_dir

    if not datasets_dir.exists():
        return datasets

    # Index by stem so a dataset present as both x.yml and x.yaml is reported
    # once. "*.yml" is globbed last and therefore wins, mirroring the
    # .yml-first lookup used by the single-dataset endpoints.
    candidates = {}
    for pattern in ("*.yaml", "*.yml"):
        for path in datasets_dir.glob(pattern):
            candidates[path.stem] = path

    for path in sorted(candidates.values()):
        try:
            content = yaml.safe_load(path.read_text())
            if not content:
                # Empty document: nothing to describe.
                continue

            case_count = 0
            if "cases" in content:
                case_count = len(content.get("cases", []))
            elif "generator" in content:
                # Estimate generated case count as the product of the sizes
                # of all list-valued dimensions.
                gen = content.get("generator", {})
                dims = gen.get("dimensions", {})
                case_count = 1
                for values in dims.values():
                    if isinstance(values, list):
                        case_count *= len(values)

            datasets.append(
                DatasetInfo(
                    id=path.stem,
                    name=content.get("name", path.stem),
                    description=content.get("description", ""),
                    case_count=case_count,
                    path=str(path),
                    relative_path=str(path.relative_to(ctx.root_dir)),
                )
            )
        except Exception as e:
            # Best effort: one broken file must not hide the others.
            logger.warning(f"Error loading dataset {path}: {e}")
            continue

    return datasets
|
|
974
|
+
|
|
975
|
+
|
|
976
|
+
@router.get("/local/datasets/{dataset_id}", response_model=DatasetDetail)
async def get_local_dataset(dataset_id: str) -> DatasetDetail:
    """Get a specific dataset by ID.

    The dataset is looked up as ``<dataset_id>.yml`` first and
    ``<dataset_id>.yaml`` second inside the local datasets directory.

    Args:
        dataset_id: The dataset identifier.

    Returns:
        Dataset details including all cases.

    Raises:
        HTTPException: 500 when no local context is available, when the file
            cannot be read or parsed, or when its top level is not a YAML
            mapping; 404 when no matching file exists.
    """
    ctx = get_local_context()
    if not ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    datasets_dir = ctx.datasets_dir
    filepath = datasets_dir / f"{dataset_id}.yml"

    if not filepath.exists():
        filepath = datasets_dir / f"{dataset_id}.yaml"
        if not filepath.exists():
            raise HTTPException(status_code=404, detail=f"Dataset not found: {dataset_id}")

    try:
        content = yaml.safe_load(filepath.read_text())
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error loading dataset: {e}") from e

    # An empty file parses to None and a bare list/scalar is not a dataset
    # either; report that explicitly instead of failing on ``content.get``.
    if not isinstance(content, dict):
        raise HTTPException(
            status_code=500,
            detail=(
                "Error loading dataset: expected a YAML mapping, "
                f"got {type(content).__name__}"
            ),
        )

    cases = []
    # ``cases: null`` (or a generator-only dataset) yields no explicit cases.
    for case_data in content.get("cases") or []:
        # Handle expected as string or list
        expected_raw = case_data.get("expected")
        if expected_raw is None:
            expected = []
        elif isinstance(expected_raw, list):
            expected = expected_raw
        else:
            expected = [expected_raw]

        cases.append(
            DatasetCaseInfo(
                id=case_data.get("id", ""),
                expected=expected,
                variables=case_data.get("variables", {}),
                tool_responses=case_data.get("tool_responses", {}),
                tags=case_data.get("tags", []),
            )
        )

    return DatasetDetail(
        id=dataset_id,
        name=content.get("name", dataset_id),
        description=content.get("description", ""),
        scenario_id=content.get("scenario_id"),
        cases=cases,
        generator=content.get("generator"),
        path=str(filepath),
    )
|
|
1033
|
+
|
|
1034
|
+
|
|
1035
|
+
@router.get("/local/scenarios/{scenario_id}/goals", response_model=list[ScenarioGoalInfo])
async def get_scenario_goals(scenario_id: str) -> list[ScenarioGoalInfo]:
    """Return the evaluation goals of a scenario (feeds the dataset editor dropdown).

    Args:
        scenario_id: The scenario identifier.

    Returns:
        The scenario's goals with their outcome flag; empty when the scenario
        declares no evaluation goals.

    Raises:
        HTTPException: 500 when not in local mode or the scenario fails to
            load; 404 when no discovered scenario has this identifier.
    """
    ctx = get_local_context()
    if not ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    # Locate the first discovered scenario carrying the requested id.
    entry = next(
        (item for item in ctx.discover()["scenarios"] if item["id"] == scenario_id),
        None,
    )
    if entry is None:
        raise HTTPException(status_code=404, detail=f"Scenario not found: {scenario_id}")
    scenario_path = Path(entry["path"])

    try:
        from sandboxy.scenarios.unified import load_unified_scenario

        spec = load_unified_scenario(scenario_path)

        if not (spec.evaluation and spec.evaluation.goals):
            return []

        return [
            ScenarioGoalInfo(
                id=goal.id,
                name=goal.name,
                description=goal.description,
                outcome=goal.outcome,
            )
            for goal in spec.evaluation.goals
        ]

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error loading scenario: {e}") from e
|
|
1082
|
+
|
|
1083
|
+
|
|
1084
|
+
class ScenarioToolAction(BaseModel):
    """One callable action offered by a scenario tool."""

    # Action name.
    name: str
    # Optional description of the action; empty by default.
    description: str = ""
|
|
1089
|
+
|
|
1090
|
+
|
|
1091
|
+
class ScenarioToolInfo(BaseModel):
    """A tool available to a scenario, together with its actions."""

    # Tool name.
    name: str
    # Optional description of the tool; empty by default.
    description: str = ""
    # Actions the tool exposes; empty by default.
    actions: list[ScenarioToolAction] = []
|
|
1097
|
+
|
|
1098
|
+
|
|
1099
|
+
@router.get("/local/scenarios/{scenario_id}/tools", response_model=list[ScenarioToolInfo])
async def get_scenario_tools(scenario_id: str) -> list[ScenarioToolInfo]:
    """Return the tools of a scenario (feeds the dataset editor dropdown).

    Inline tools come first, followed by the tools of each library named in
    the scenario's ``tools_from``, in declaration order. Libraries are looked
    up in the local tools directory as ``<name>.yml`` and then
    ``<name>.yaml``; a library with no matching file is silently skipped.

    Args:
        scenario_id: The scenario identifier.

    Returns:
        List of tools with their actions.

    Raises:
        HTTPException: 500 when not in local mode or loading fails; 404 when
            no discovered scenario has this identifier.
    """
    ctx = get_local_context()
    if not ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    # Locate the first discovered scenario carrying the requested id.
    entry = next(
        (item for item in ctx.discover()["scenarios"] if item["id"] == scenario_id),
        None,
    )
    if entry is None:
        raise HTTPException(status_code=404, detail=f"Scenario not found: {scenario_id}")
    scenario_path = Path(entry["path"])

    def _describe(tool_name, tool_spec) -> ScenarioToolInfo:
        """Convert a parsed tool spec into its API representation."""
        actions = [
            ScenarioToolAction(name=action_name, description=action_spec.description)
            for action_name, action_spec in tool_spec.get_effective_actions().items()
        ]
        return ScenarioToolInfo(
            name=tool_name,
            description=tool_spec.description,
            actions=actions,
        )

    try:
        from sandboxy.scenarios.unified import load_unified_scenario
        from sandboxy.tools.yaml_tools import YamlToolLoader

        spec = load_unified_scenario(scenario_path)
        loader = YamlToolLoader([ctx.tools_dir])

        tools_info: list[ScenarioToolInfo] = []

        # Tools declared directly in the scenario.
        if spec.tools:
            for tool_name, tool_spec in loader.parse_inline_tools(spec.tools).items():
                tools_info.append(_describe(tool_name, tool_spec))

        # Tools pulled in from library files.
        for lib_name in spec.tools_from:
            lib_path = next(
                (
                    candidate
                    for candidate in (
                        ctx.tools_dir / f"{lib_name}.yml",
                        ctx.tools_dir / f"{lib_name}.yaml",
                    )
                    if candidate.exists()
                ),
                None,
            )
            if lib_path is None:
                continue
            library = loader.load_library_file(lib_path)
            for tool_name, tool_spec in library.tools.items():
                tools_info.append(_describe(tool_name, tool_spec))

        return tools_info

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error loading scenario tools: {e}") from e
|
|
1180
|
+
|
|
1181
|
+
|
|
1182
|
+
@router.post("/local/datasets", response_model=SaveDatasetResponse)
async def save_dataset(request: SaveDatasetRequest) -> SaveDatasetResponse:
    """Save a new dataset to the datasets/ directory.

    The content is written to ``<id>.yml``. No existence check is made, so a
    file with the same name is overwritten.

    Args:
        request: Dataset ID and YAML content.

    Returns:
        Saved dataset info.

    Raises:
        HTTPException: 500 when not in local mode; 400 when the ID is missing
            or malformed, or when the content is not valid YAML.
    """
    import re

    ctx = get_local_context()
    if not ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    # Validate ID
    if not request.id:
        raise HTTPException(status_code=400, detail="Dataset ID is required")

    # fullmatch, not match with "$": "$" also matches just before a trailing
    # newline, which would let an ID such as "abc\n" through and into the
    # file name.
    if not re.fullmatch(r"[a-z0-9_-]+", request.id):
        raise HTTPException(
            status_code=400,
            detail="Dataset ID must contain only lowercase letters, numbers, hyphens, and underscores",
        )

    # Validate YAML (parse only; the parsed value is not used)
    try:
        yaml.safe_load(request.content)
    except yaml.YAMLError as e:
        raise HTTPException(status_code=400, detail=f"Invalid YAML: {e}") from e

    # Ensure datasets directory exists
    datasets_dir = ctx.datasets_dir
    datasets_dir.mkdir(parents=True, exist_ok=True)

    # Save file
    filepath = datasets_dir / f"{request.id}.yml"
    filepath.write_text(request.content)

    return SaveDatasetResponse(
        id=request.id,
        path=str(filepath),
        message=f"Dataset saved to {filepath}",
    )
|
|
1227
|
+
|
|
1228
|
+
|
|
1229
|
+
@router.put("/local/datasets/{dataset_id}")
async def update_dataset(
    dataset_id: str,
    request: SaveDatasetRequest,
) -> SaveDatasetResponse:
    """Replace the content of an existing dataset file.

    The file is resolved as ``<dataset_id>.yml`` first and
    ``<dataset_id>.yaml`` second. Only ``request.content`` is used; the ``id``
    carried in the request body is not consulted.

    Args:
        dataset_id: The dataset ID to update.
        request: New YAML content.

    Returns:
        Updated dataset info.

    Raises:
        HTTPException: 500 when not in local mode; 404 when the dataset file
            does not exist; 400 when the content is not valid YAML.
    """
    ctx = get_local_context()
    if not ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    base_dir = ctx.datasets_dir
    target = next(
        (
            candidate
            for candidate in (base_dir / f"{dataset_id}.yml", base_dir / f"{dataset_id}.yaml")
            if candidate.exists()
        ),
        None,
    )
    if target is None:
        raise HTTPException(status_code=404, detail=f"Dataset not found: {dataset_id}")

    # Reject content that does not parse before touching the file.
    try:
        yaml.safe_load(request.content)
    except yaml.YAMLError as e:
        raise HTTPException(status_code=400, detail=f"Invalid YAML: {e}") from e

    target.write_text(request.content)

    return SaveDatasetResponse(
        id=dataset_id,
        path=str(target),
        message=f"Dataset updated at {target}",
    )
|
|
1268
|
+
|
|
1269
|
+
|
|
1270
|
+
@router.delete("/local/datasets/{dataset_id}")
async def delete_dataset(dataset_id: str) -> dict[str, str]:
    """Remove a dataset file from the datasets directory.

    The file is resolved as ``<dataset_id>.yml`` first and
    ``<dataset_id>.yaml`` second; only the first one found is removed.

    Args:
        dataset_id: The dataset ID to delete.

    Returns:
        Confirmation message.

    Raises:
        HTTPException: 500 when not in local mode; 404 when the dataset file
            does not exist.
    """
    ctx = get_local_context()
    if not ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    base_dir = ctx.datasets_dir
    target = next(
        (
            candidate
            for candidate in (base_dir / f"{dataset_id}.yml", base_dir / f"{dataset_id}.yaml")
            if candidate.exists()
        ),
        None,
    )
    if target is None:
        raise HTTPException(status_code=404, detail=f"Dataset not found: {dataset_id}")

    target.unlink()

    return {"message": f"Dataset {dataset_id} deleted"}
|
|
1294
|
+
|
|
1295
|
+
|
|
1296
|
+
@router.post("/local/run-dataset", response_model=RunDatasetResponse)
async def run_with_dataset(request: RunDatasetRequest) -> RunDatasetResponse:
    """Execute a scenario once per dataset case and report aggregate results.

    The parallel runner is used when ``request.parallel`` is greater than 1,
    otherwise the sequential one. The result is also persisted through
    ``save_run_result`` under the key
    ``<scenario_id>_dataset_<dataset_id>`` before the response is built.

    Args:
        request: Run configuration including scenario_id, dataset_id, and model.

    Returns:
        Dataset benchmark results.

    Raises:
        HTTPException: 500 when not in local mode or when loading, running or
            saving fails; 404 when the scenario or the dataset is not found.
    """
    ctx = get_local_context()
    if not ctx:
        raise HTTPException(status_code=500, detail="Not in local mode")

    # Locate the first discovered scenario carrying the requested id.
    entry = next(
        (item for item in ctx.discover()["scenarios"] if item["id"] == request.scenario_id),
        None,
    )
    if entry is None:
        raise HTTPException(
            status_code=404,
            detail=f"Scenario not found: {request.scenario_id}",
        )
    scenario_path = Path(entry["path"])

    # Locate the dataset file (.yml preferred over .yaml).
    base_dir = ctx.datasets_dir
    dataset_path = next(
        (
            candidate
            for candidate in (
                base_dir / f"{request.dataset_id}.yml",
                base_dir / f"{request.dataset_id}.yaml",
            )
            if candidate.exists()
        ),
        None,
    )
    if dataset_path is None:
        raise HTTPException(
            status_code=404,
            detail=f"Dataset not found: {request.dataset_id}",
        )

    try:
        from sandboxy.datasets import load_dataset, run_dataset, run_dataset_parallel
        from sandboxy.scenarios.unified import load_unified_scenario

        spec = load_unified_scenario(scenario_path)
        dataset = load_dataset(dataset_path)

        # Arguments shared by both runners.
        run_kwargs = {
            "scenario": spec,
            "model": request.model,
            "dataset": dataset,
            "max_turns": request.max_turns,
            "max_tokens": request.max_tokens,
            "temperature": request.temperature,
        }
        if request.parallel > 1:
            result = await run_dataset_parallel(**run_kwargs, max_concurrent=request.parallel)
        else:
            result = await run_dataset(**run_kwargs)

        # Persist the run before answering.
        from sandboxy.local.results import save_run_result

        save_run_result(
            f"{request.scenario_id}_dataset_{request.dataset_id}",
            result.to_dict(),
        )

        serialized_cases = [case.to_dict() for case in result.case_results]
        return RunDatasetResponse(
            scenario_id=result.scenario_id,
            model=result.model,
            dataset_id=result.dataset_id,
            total_cases=result.total_cases,
            passed_cases=result.passed_cases,
            failed_cases=result.failed_cases,
            pass_rate=result.pass_rate,
            avg_score=result.avg_score,
            avg_percentage=result.avg_percentage,
            by_expected=result.by_expected,
            total_time_ms=result.total_time_ms,
            case_results=serialized_cases,
        )

    except Exception as e:
        logger.exception(f"Error running dataset: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
|