sandboxy 0.0.3.tar.gz → 0.0.4.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sandboxy-0.0.3 → sandboxy-0.0.4}/PKG-INFO +37 -1
- {sandboxy-0.0.3 → sandboxy-0.0.4}/README.md +34 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/local-ui/src/hooks/useScenarioRun.ts +21 -4
- {sandboxy-0.0.3 → sandboxy-0.0.4}/local-ui/src/lib/api.ts +10 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/local-ui/src/pages/RunPage.tsx +110 -4
- {sandboxy-0.0.3 → sandboxy-0.0.4}/pyproject.toml +6 -1
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/api/routes/local.py +182 -19
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/cli/main.py +292 -31
- sandboxy-0.0.4/sandboxy/mlflow/__init__.py +38 -0
- sandboxy-0.0.4/sandboxy/mlflow/artifacts.py +184 -0
- sandboxy-0.0.4/sandboxy/mlflow/config.py +90 -0
- sandboxy-0.0.4/sandboxy/mlflow/exporter.py +439 -0
- sandboxy-0.0.4/sandboxy/mlflow/metrics.py +115 -0
- sandboxy-0.0.4/sandboxy/mlflow/tags.py +140 -0
- sandboxy-0.0.4/sandboxy/mlflow/tracing.py +126 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/scenarios/loader.py +44 -2
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/scenarios/runner.py +57 -2
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/tools/yaml_tools.py +18 -0
- sandboxy-0.0.4/sandboxy/ui/dist/assets/index-CU06wBqc.js +362 -0
- sandboxy-0.0.4/sandboxy/ui/dist/assets/index-Cgg2wY2m.css +1 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/ui/dist/index.html +2 -2
- sandboxy-0.0.4/tests/integration/test_mlflow_integration.py +245 -0
- sandboxy-0.0.4/tests/unit/mlflow/__init__.py +1 -0
- sandboxy-0.0.4/tests/unit/mlflow/test_artifacts.py +206 -0
- sandboxy-0.0.4/tests/unit/mlflow/test_config.py +127 -0
- sandboxy-0.0.4/tests/unit/mlflow/test_metrics.py +131 -0
- sandboxy-0.0.4/tests/unit/mlflow/test_tags.py +209 -0
- sandboxy-0.0.3/sandboxy/ui/dist/assets/index-CgAkYWrJ.css +0 -1
- sandboxy-0.0.3/sandboxy/ui/dist/assets/index-D4zoGFcr.js +0 -347
- {sandboxy-0.0.3 → sandboxy-0.0.4}/.env.example +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/.github/workflows/ci.yml +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/.github/workflows/publish.yml +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/.gitignore +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/CONTRIBUTING.md +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/LICENSE +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/Makefile +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/docs/yaml-tools.md +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/local-ui/index.html +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/local-ui/package-lock.json +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/local-ui/package.json +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/local-ui/postcss.config.js +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/local-ui/src/App.tsx +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/local-ui/src/components/Layout.tsx +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/local-ui/src/components/ModelSelector.tsx +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/local-ui/src/components/ResultDisplay.tsx +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/local-ui/src/hooks/useScenarioBuilder.ts +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/local-ui/src/hooks/useToolBuilder.ts +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/local-ui/src/index.css +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/local-ui/src/main.tsx +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/local-ui/src/pages/BuilderPage.tsx +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/local-ui/src/pages/DashboardPage.tsx +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/local-ui/src/pages/DatasetPage.tsx +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/local-ui/src/pages/ResultsPage.tsx +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/local-ui/src/pages/ToolBuilderPage.tsx +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/local-ui/tailwind.config.js +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/local-ui/tsconfig.json +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/local-ui/tsconfig.node.json +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/local-ui/vite.config.ts +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/__init__.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/agents/__init__.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/agents/base.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/agents/llm_prompt.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/agents/loader.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/api/__init__.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/api/app.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/api/routes/__init__.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/api/routes/agents.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/api/routes/tools.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/cli/__init__.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/cli/type_detector.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/config.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/core/__init__.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/core/async_runner.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/core/mdl_parser.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/core/runner.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/core/safe_eval.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/core/state.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/datasets/__init__.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/datasets/loader.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/datasets/runner.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/errors.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/local/context.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/local/results.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/logging.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/mcp/__init__.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/mcp/client.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/mcp/wrapper.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/providers/__init__.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/providers/anthropic_provider.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/providers/base.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/providers/http_client.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/providers/openai_provider.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/providers/openrouter.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/providers/registry.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/scenarios/__init__.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/scenarios/comparison.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/scenarios/unified.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/session/__init__.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/session/manager.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/tools/__init__.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/tools/base.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/tools/loader.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/ui/__init__.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/utils/__init__.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/sandboxy/utils/time.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/scenarios/customer_service.yml +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/tests/__init__.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/tests/conftest.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/tests/factories.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/tests/integration/__init__.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/tests/integration/api/__init__.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/tests/mocks/__init__.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/tests/mocks/providers.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/tests/unit/__init__.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/tests/unit/agents/__init__.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/tests/unit/agents/test_base.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/tests/unit/agents/test_llm_prompt.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/tests/unit/agents/test_loader.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/tests/unit/core/__init__.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/tests/unit/core/test_async_runner.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/tests/unit/core/test_mdl_parser.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/tests/unit/core/test_runner.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/tests/unit/core/test_safe_eval.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/tests/unit/core/test_state.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/tests/unit/providers/test_openrouter.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/tests/unit/tools/__init__.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/tests/unit/tools/test_base.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/tests/unit/tools/test_loader.py +0 -0
- {sandboxy-0.0.3 → sandboxy-0.0.4}/uv.lock +0 -0

--- sandboxy-0.0.3/PKG-INFO
+++ sandboxy-0.0.4/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sandboxy
-Version: 0.0.3
+Version: 0.0.4
 Summary: Open-source agent simulation and benchmarking platform
 Project-URL: Homepage, https://github.com/sandboxy-ai/sandboxy
 Project-URL: Repository, https://github.com/sandboxy-ai/sandboxy
@@ -39,6 +39,8 @@ Requires-Dist: pytest-xdist>=3.5.0; extra == 'dev'
 Requires-Dist: pytest>=8.0; extra == 'dev'
 Requires-Dist: respx>=0.21.0; extra == 'dev'
 Requires-Dist: ruff>=0.1; extra == 'dev'
+Provides-Extra: mlflow
+Requires-Dist: mlflow>=3.0; extra == 'mlflow'
 Description-Content-Type: text/markdown
 
 # Sandboxy
@@ -204,6 +206,39 @@ sandboxy list-models --search claude
 sandboxy list-models --free
 ```
 
+## MLflow Integration
+
+Export scenario run results to MLflow for experiment tracking and model comparison.
+
+```bash
+# Install with MLflow support
+pip install sandboxy[mlflow]
+
+# Export run to MLflow
+sandboxy scenario scenarios/test.yml -m openai/gpt-4o --mlflow-export
+
+# Custom experiment name
+sandboxy scenario scenarios/test.yml -m gpt-4o --mlflow-export --mlflow-experiment "my-evals"
+```
+
+Or enable in scenario YAML:
+
+```yaml
+id: my-scenario
+name: "My Test"
+
+mlflow:
+  enabled: true
+  experiment: "agent-evals"
+  tags:
+    team: "support"
+
+system_prompt: |
+  ...
+```
+
+Use the `MLFLOW_TRACKING_URI` env variable to configure the MLflow server.
+
 ## Configuration
 
 Environment variables (in `~/.sandboxy/.env` or project `.env`):
@@ -213,6 +248,7 @@ Environment variables (in `~/.sandboxy/.env` or project `.env`):
 | `OPENROUTER_API_KEY` | OpenRouter API key (400+ models) |
 | `OPENAI_API_KEY` | Direct OpenAI access |
 | `ANTHROPIC_API_KEY` | Direct Anthropic access |
+| `MLFLOW_TRACKING_URI` | MLflow tracking server URI |
 
 ## Project Structure
 

--- sandboxy-0.0.3/README.md
+++ sandboxy-0.0.4/README.md
@@ -161,6 +161,39 @@ sandboxy list-models --search claude
 sandboxy list-models --free
 ```
 
+## MLflow Integration
+
+Export scenario run results to MLflow for experiment tracking and model comparison.
+
+```bash
+# Install with MLflow support
+pip install sandboxy[mlflow]
+
+# Export run to MLflow
+sandboxy scenario scenarios/test.yml -m openai/gpt-4o --mlflow-export
+
+# Custom experiment name
+sandboxy scenario scenarios/test.yml -m gpt-4o --mlflow-export --mlflow-experiment "my-evals"
+```
+
+Or enable in scenario YAML:
+
+```yaml
+id: my-scenario
+name: "My Test"
+
+mlflow:
+  enabled: true
+  experiment: "agent-evals"
+  tags:
+    team: "support"
+
+system_prompt: |
+  ...
+```
+
+Use the `MLFLOW_TRACKING_URI` env variable to configure the MLflow server.
+
 ## Configuration
 
 Environment variables (in `~/.sandboxy/.env` or project `.env`):
@@ -170,6 +203,7 @@ Environment variables (in `~/.sandboxy/.env` or project `.env`):
 | `OPENROUTER_API_KEY` | OpenRouter API key (400+ models) |
 | `OPENAI_API_KEY` | Direct OpenAI access |
 | `ANTHROPIC_API_KEY` | Direct Anthropic access |
+| `MLFLOW_TRACKING_URI` | MLflow tracking server URI |
 
 ## Project Structure
 
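Note: once runs are exported, they can be queried programmatically as well as browsed in the MLflow UI. A minimal sketch, assuming the `agent-evals` experiment name from the README example and a local tracking server; the exact metric names sandboxy logs come from `sandboxy/mlflow/metrics.py`, which this diff adds but does not display:

```python
import mlflow

# Point the client at the tracking server; the MLFLOW_TRACKING_URI
# environment variable works equally well.
mlflow.set_tracking_uri("http://127.0.0.1:5000")

# Returns a pandas DataFrame with one row per exported run; metric and
# tag columns are prefixed, e.g. "metrics.pass_rate" or "tags.team".
runs = mlflow.search_runs(experiment_names=["agent-evals"])
print(runs[["run_id", "status", "start_time"]].head())
```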

--- sandboxy-0.0.3/local-ui/src/hooks/useScenarioRun.ts
+++ sandboxy-0.0.4/local-ui/src/hooks/useScenarioRun.ts
@@ -7,13 +7,20 @@ import { api, RunScenarioResponse, CompareModelsResponse } from '../lib/api'
 
 export type RunState = 'idle' | 'running' | 'completed' | 'error'
 
+export interface MlflowOptions {
+  enabled: boolean
+  trackingUri?: string
+  experiment?: string
+  tracing?: boolean
+}
+
 export interface UseScenarioRunResult {
   state: RunState
   result: RunScenarioResponse | null
   comparison: CompareModelsResponse | null
   error: string | null
-  runScenario: (scenarioId: string, model: string, variables?: Record<string, unknown>) => Promise<void>
-  compareModels: (scenarioId: string, models: string[], runsPerModel?: number, variables?: Record<string, unknown>) => Promise<void>
+  runScenario: (scenarioId: string, model: string, variables?: Record<string, unknown>, mlflow?: MlflowOptions) => Promise<void>
+  compareModels: (scenarioId: string, models: string[], runsPerModel?: number, variables?: Record<string, unknown>, mlflow?: MlflowOptions) => Promise<void>
   reset: () => void
 }
 
@@ -33,7 +40,8 @@ export function useScenarioRun(): UseScenarioRunResult {
   const runScenario = useCallback(async (
     scenarioId: string,
     model: string,
-    variables?: Record<string, unknown>
+    variables?: Record<string, unknown>,
+    mlflow?: MlflowOptions
   ) => {
     reset()
     setState('running')
@@ -43,6 +51,10 @@ export function useScenarioRun(): UseScenarioRunResult {
       scenario_id: scenarioId,
       model,
       variables,
+      mlflow_export: mlflow?.enabled,
+      mlflow_tracking_uri: mlflow?.trackingUri,
+      mlflow_experiment: mlflow?.experiment,
+      mlflow_tracing: mlflow?.tracing,
     })
 
     if (response.error) {
@@ -62,7 +74,8 @@ export function useScenarioRun(): UseScenarioRunResult {
     scenarioId: string,
     models: string[],
     runsPerModel: number = 1,
-    variables?: Record<string, unknown>
+    variables?: Record<string, unknown>,
+    mlflow?: MlflowOptions
   ) => {
     reset()
     setState('running')
@@ -73,6 +86,10 @@ export function useScenarioRun(): UseScenarioRunResult {
       models,
       runs_per_model: runsPerModel,
       variables,
+      mlflow_export: mlflow?.enabled,
+      mlflow_tracking_uri: mlflow?.trackingUri,
+      mlflow_experiment: mlflow?.experiment,
+      mlflow_tracing: mlflow?.tracing,
     })
 
     setState('completed')

--- sandboxy-0.0.3/local-ui/src/lib/api.ts
+++ sandboxy-0.0.4/local-ui/src/lib/api.ts
@@ -53,6 +53,10 @@ export interface RunScenarioRequest {
   max_turns?: number
   max_tokens?: number
   temperature?: number
+  mlflow_export?: boolean
+  mlflow_tracking_uri?: string
+  mlflow_experiment?: string
+  mlflow_tracing?: boolean
 }
 
 export interface HistoryMessage {
@@ -112,6 +116,10 @@ export interface CompareModelsRequest {
   runs_per_model?: number
   variables?: Record<string, unknown>
   max_turns?: number
+  mlflow_export?: boolean
+  mlflow_tracking_uri?: string
+  mlflow_experiment?: string
+  mlflow_tracing?: boolean
 }
 
 export interface ModelStats {
@@ -205,6 +213,8 @@ export interface RunDatasetRequest {
   max_tokens?: number
   temperature?: number
   parallel?: number
+  mlflow_enabled?: boolean
+  mlflow_experiment?: string
 }
 
 export interface CaseResultInfo {

--- sandboxy-0.0.3/local-ui/src/pages/RunPage.tsx
+++ sandboxy-0.0.4/local-ui/src/pages/RunPage.tsx
@@ -1,8 +1,8 @@
 import { useState, useEffect } from 'react'
 import { useParams, useSearchParams, Link } from 'react-router-dom'
-import { ArrowLeft, Play, Loader2, XCircle, Edit, Settings, Database, Check, X } from 'lucide-react'
+import { ArrowLeft, Play, Loader2, XCircle, Edit, Settings, Database, Check, X, ChevronDown, ChevronRight, ExternalLink } from 'lucide-react'
 import { api, ScenarioDetail, ModelInfo, VariableInfo, DatasetInfo, RunDatasetResponse, LocalFileInfo } from '../lib/api'
-import { useScenarioRun } from '../hooks/useScenarioRun'
+import { useScenarioRun, MlflowOptions } from '../hooks/useScenarioRun'
 import { SingleRunResult, ComparisonResult } from '../components/ResultDisplay'
 import { ModelSelector, MultiModelSelector } from '../components/ModelSelector'
 
@@ -33,6 +33,13 @@ export default function RunPage() {
   const [datasetRunning, setDatasetRunning] = useState(false)
   const [parallel, setParallel] = useState(parallelFromUrl)
 
+  // MLflow state
+  const [mlflowEnabled, setMlflowEnabled] = useState(false)
+  const [mlflowExpanded, setMlflowExpanded] = useState(false)
+  const [mlflowTrackingUri, setMlflowTrackingUri] = useState('')
+  const [mlflowExperiment, setMlflowExperiment] = useState('')
+  const [mlflowTracing, setMlflowTracing] = useState(true)
+
   const { state, result, comparison, error: runError, runScenario, compareModels } = useScenarioRun()
 
   useEffect(() => {
@@ -93,6 +100,13 @@ export default function RunPage() {
     const sid = selectedScenarioId || scenarioId
     if (!sid) return
 
+    const mlflowOptions: MlflowOptions | undefined = mlflowEnabled ? {
+      enabled: true,
+      trackingUri: mlflowTrackingUri || undefined,
+      experiment: mlflowExperiment || undefined,
+      tracing: mlflowTracing,
+    } : undefined
+
     if (runMode === 'dataset') {
       if (!selectedDataset || !selectedModel) return
       setDatasetRunning(true)
@@ -103,6 +117,8 @@ export default function RunPage() {
         dataset_id: selectedDataset,
         model: selectedModel,
         parallel,
+        mlflow_enabled: mlflowEnabled,
+        mlflow_experiment: mlflowExperiment || undefined,
       })
       setDatasetResult(result)
     } catch (err) {
@@ -112,10 +128,10 @@ export default function RunPage() {
       }
     } else if (runMode === 'single') {
       if (!selectedModel) return
-      await runScenario(sid, selectedModel, variables)
+      await runScenario(sid, selectedModel, variables, mlflowOptions)
     } else {
       if (selectedModels.length === 0) return
-      await compareModels(sid, selectedModels, runsPerModel, variables)
+      await compareModels(sid, selectedModels, runsPerModel, variables, mlflowOptions)
     }
   }
 
|
@@ -360,6 +376,96 @@ export default function RunPage() {
|
|
|
360
376
|
</div>
|
|
361
377
|
)}
|
|
362
378
|
|
|
379
|
+
{/* MLflow Section */}
|
|
380
|
+
<div className="mb-6 p-4 panel-subtle">
|
|
381
|
+
<button
|
|
382
|
+
onClick={() => setMlflowExpanded(!mlflowExpanded)}
|
|
383
|
+
className="flex items-center gap-2 w-full text-left"
|
|
384
|
+
>
|
|
385
|
+
{mlflowExpanded ? <ChevronDown size={18} /> : <ChevronRight size={18} />}
|
|
386
|
+
<span className="font-medium text-slate-100">MLflow Tracking</span>
|
|
387
|
+
{mlflowEnabled && (
|
|
388
|
+
<span className="ml-2 px-2 py-0.5 text-xs bg-green-500/20 text-green-400 rounded">
|
|
389
|
+
Enabled
|
|
390
|
+
</span>
|
|
391
|
+
)}
|
|
392
|
+
</button>
|
|
393
|
+
|
|
394
|
+
{mlflowExpanded && (
|
|
395
|
+
<div className="mt-4 space-y-4">
|
|
396
|
+
{/* Enable Toggle */}
|
|
397
|
+
<label className="flex items-center gap-3 cursor-pointer">
|
|
398
|
+
<input
|
|
399
|
+
type="checkbox"
|
|
400
|
+
checked={mlflowEnabled}
|
|
401
|
+
onChange={(e) => setMlflowEnabled(e.target.checked)}
|
|
402
|
+
disabled={state === 'running' || datasetRunning}
|
|
403
|
+
className="w-4 h-4 rounded border-slate-600 text-orange-400 focus:ring-orange-400"
|
|
404
|
+
/>
|
|
405
|
+
<span className="text-slate-200">Enable MLflow tracking</span>
|
|
406
|
+
</label>
|
|
407
|
+
|
|
408
|
+
{mlflowEnabled && (
|
|
409
|
+
<>
|
|
410
|
+
{/* Tracking URI - only for non-dataset runs (dataset uses env var) */}
|
|
411
|
+
{runMode !== 'dataset' && (
|
|
412
|
+
<div>
|
|
413
|
+
<label className="block text-sm font-medium text-slate-400 mb-1">
|
|
414
|
+
Tracking URI
|
|
415
|
+
</label>
|
|
416
|
+
<input
|
|
417
|
+
type="text"
|
|
418
|
+
value={mlflowTrackingUri}
|
|
419
|
+
onChange={(e) => setMlflowTrackingUri(e.target.value)}
|
|
420
|
+
disabled={state === 'running'}
|
|
421
|
+
placeholder="http://127.0.0.1:5000 (uses MLFLOW_TRACKING_URI if empty)"
|
|
422
|
+
className="w-full panel-subtle px-3 py-2 text-slate-100 text-sm focus:outline-none focus:ring-2 focus:ring-orange-400"
|
|
423
|
+
/>
|
|
424
|
+
</div>
|
|
425
|
+
)}
|
|
426
|
+
|
|
427
|
+
{/* Experiment Name */}
|
|
428
|
+
<div>
|
|
429
|
+
<label className="block text-sm font-medium text-slate-400 mb-1">
|
|
430
|
+
Experiment Name
|
|
431
|
+
</label>
|
|
432
|
+
<input
|
|
433
|
+
type="text"
|
|
434
|
+
value={mlflowExperiment}
|
|
435
|
+
onChange={(e) => setMlflowExperiment(e.target.value)}
|
|
436
|
+
disabled={state === 'running' || datasetRunning}
|
|
437
|
+
placeholder={runMode === 'dataset' ? `${scenario?.name || 'scenario'}-dataset` : (scenario?.name || 'Defaults to scenario name')}
|
|
438
|
+
className="w-full panel-subtle px-3 py-2 text-slate-100 text-sm focus:outline-none focus:ring-2 focus:ring-orange-400"
|
|
439
|
+
/>
|
|
440
|
+
</div>
|
|
441
|
+
|
|
442
|
+
{/* Tracing Toggle - only for non-dataset runs */}
|
|
443
|
+
{runMode !== 'dataset' && (
|
|
444
|
+
<label className="flex items-center gap-3 cursor-pointer">
|
|
445
|
+
<input
|
|
446
|
+
type="checkbox"
|
|
447
|
+
checked={mlflowTracing}
|
|
448
|
+
onChange={(e) => setMlflowTracing(e.target.checked)}
|
|
449
|
+
disabled={state === 'running'}
|
|
450
|
+
className="w-4 h-4 rounded border-slate-600 text-orange-400 focus:ring-orange-400"
|
|
451
|
+
/>
|
|
452
|
+
<div>
|
|
453
|
+
<span className="text-slate-200">Enable LLM Tracing</span>
|
|
454
|
+
<p className="text-xs text-slate-500">Capture detailed traces of each LLM call</p>
|
|
455
|
+
</div>
|
|
456
|
+
</label>
|
|
457
|
+
)}
|
|
458
|
+
|
|
459
|
+
<p className="text-xs text-slate-500 flex items-center gap-1">
|
|
460
|
+
<ExternalLink size={12} />
|
|
461
|
+
View results at your MLflow server after the run completes
|
|
462
|
+
</p>
|
|
463
|
+
</>
|
|
464
|
+
)}
|
|
465
|
+
</div>
|
|
466
|
+
)}
|
|
467
|
+
</div>
|
|
468
|
+
|
|
363
469
|
<button
|
|
364
470
|
onClick={handleRun}
|
|
365
471
|
disabled={
|
|

--- sandboxy-0.0.3/pyproject.toml
+++ sandboxy-0.0.4/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "sandboxy"
-version = "0.0.3"
+version = "0.0.4"
 description = "Open-source agent simulation and benchmarking platform"
 readme = "README.md"
 license = "Apache-2.0"
@@ -41,6 +41,9 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
+mlflow = [
+    "mlflow>=3.0",
+]
 dev = [
     "pytest>=8.0",
    "pytest-cov>=4.0",
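Note: the extra keeps `mlflow` out of the default install, which is why every call site in `local.py` below guards the import. The same pattern as a standalone helper (`mlflow_available` is a hypothetical name, not part of the package):

```python
def mlflow_available() -> bool:
    """Report whether the optional extra (pip install "sandboxy[mlflow]") is present."""
    try:
        import mlflow  # noqa: F401 -- installed only with the mlflow extra
    except ImportError:
        return False
    return True
```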
@@ -120,6 +123,8 @@ ignore = [
 # S307: eval usage with safety measures (safe_builtins, simpleeval)
 "sandboxy/tools/yaml_tools.py" = ["S307"]
 "sandboxy/core/safe_eval.py" = ["S307"]
+# S603, S607, S110: git subprocess call is safe (hardcoded command, known input)
+"sandboxy/mlflow/tags.py" = ["S603", "S607", "S110"]
 
 [tool.ruff.lint.pydocstyle]
 convention = "google"

--- sandboxy-0.0.3/sandboxy/api/routes/local.py
+++ sandboxy-0.0.4/sandboxy/api/routes/local.py
@@ -379,6 +379,10 @@ class RunScenarioRequest(BaseModel):
     max_turns: int = 20
     max_tokens: int = 1024
     temperature: float = 0.7
+    mlflow_export: bool = False
+    mlflow_tracking_uri: str | None = None
+    mlflow_experiment: str | None = None
+    mlflow_tracing: bool = True
 
 
 class RunScenarioResponse(BaseModel):
@@ -393,6 +397,9 @@ class RunScenarioResponse(BaseModel):
     final_state: dict[str, Any]
     evaluation: dict[str, Any] | None
     latency_ms: int
+    input_tokens: int = 0
+    output_tokens: int = 0
+    cost_usd: float | None = None
     error: str | None
 
 
@@ -404,6 +411,10 @@ class CompareModelsRequest(BaseModel):
     runs_per_model: int = 1
     variables: dict[str, Any] = Field(default_factory=dict)
     max_turns: int = 20
+    mlflow_export: bool = False
+    mlflow_tracking_uri: str | None = None
+    mlflow_experiment: str | None = None
+    mlflow_tracing: bool = True  # Enable LLM call tracing by default
 
 
 class CompareModelsResponse(BaseModel):
@@ -454,20 +465,73 @@ async def run_scenario(request: RunScenarioRequest) -> RunScenarioResponse:
     spec = load_unified_scenario(scenario_path)
     runner = UnifiedRunner()
 
-    result = await runner.run(
-        scenario=spec,
-        model=request.model,
-        variables=request.variables,
-        max_turns=request.max_turns,
-        max_tokens=request.max_tokens,
-        temperature=request.temperature,
-    )
+    # Setup MLflow if requested
+    mlflow_config = None
+    if request.mlflow_export:
+        try:
+            from sandboxy.mlflow import MLflowConfig
+
+            mlflow_config = MLflowConfig.resolve(
+                cli_export=True,
+                cli_tracking_uri=request.mlflow_tracking_uri,
+                cli_experiment=request.mlflow_experiment,
+                cli_tracing=request.mlflow_tracing,
+                yaml_config=None,
+                scenario_name=spec.name,
+            )
+        except ImportError:
+            pass  # MLflow not installed
+
+    # Run with MLflow context if enabled (connects traces to run)
+    if mlflow_config and mlflow_config.enabled:
+        from sandboxy.mlflow import MLflowExporter, mlflow_run_context
+        from sandboxy.mlflow.tracing import enable_tracing
+
+        if mlflow_config.tracing:
+            enable_tracing(
+                tracking_uri=mlflow_config.tracking_uri,
+                experiment_name=mlflow_config.experiment,
+            )
+
+        with mlflow_run_context(mlflow_config, run_name=request.model) as run_id:
+            result = await runner.run(
+                scenario=spec,
+                model=request.model,
+                variables=request.variables,
+                max_turns=request.max_turns,
+                max_tokens=request.max_tokens,
+                temperature=request.temperature,
+            )
+
+            if run_id:
+                exporter = MLflowExporter(mlflow_config)
+                exporter.log_to_active_run(
+                    result=result,
+                    scenario_path=scenario_path,
+                    scenario_name=spec.name,
+                    scenario_id=spec.id,
+                    agent_name=request.model,
+                )
+    else:
+        result = await runner.run(
+            scenario=spec,
+            model=request.model,
+            variables=request.variables,
+            max_turns=request.max_turns,
+            max_tokens=request.max_tokens,
+            temperature=request.temperature,
+        )
 
     # Save result to runs/
     from sandboxy.local.results import save_run_result
 
     save_run_result(request.scenario_id, result.to_dict())
 
+    # Calculate cost
+    input_tokens = result.input_tokens or 0
+    output_tokens = result.output_tokens or 0
+    cost_usd = calculate_cost(result.model, input_tokens, output_tokens)
+
     return RunScenarioResponse(
         id=result.id,
         scenario_id=result.scenario_id,
@@ -481,6 +545,9 @@ async def run_scenario(request: RunScenarioRequest) -> RunScenarioResponse:
         final_state=result.final_state,
         evaluation=result.evaluation.to_dict() if result.evaluation else None,
         latency_ms=result.latency_ms,
+        input_tokens=input_tokens,
+        output_tokens=output_tokens,
+        cost_usd=cost_usd,
         error=result.error,
     )
 
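Note: `mlflow_run_context` comes from the new `sandboxy/mlflow/__init__.py`, whose body is not shown in this diff. Judging by how the route uses it (it yields a `run_id`, and traces land in the same run), a plausible sketch is the following; the details are assumptions, not the package's actual code:

```python
from contextlib import contextmanager

@contextmanager
def mlflow_run_context(config, run_name=None):
    """Open an MLflow run for the duration of a scenario run and yield its id."""
    import mlflow

    if config.tracking_uri:
        mlflow.set_tracking_uri(config.tracking_uri)
    mlflow.set_experiment(config.experiment)
    with mlflow.start_run(run_name=run_name) as run:
        yield run.info.run_id
```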
@@ -530,6 +597,19 @@ async def compare_models(request: CompareModelsRequest) -> CompareModelsResponse:
 
     spec = load_unified_scenario(scenario_path)
 
+    # Enable MLflow tracing if requested (must be done BEFORE any LLM calls)
+    if request.mlflow_export and request.mlflow_tracing:
+        try:
+            from sandboxy.mlflow.tracing import enable_tracing
+
+            experiment = request.mlflow_experiment or spec.name
+            enable_tracing(
+                tracking_uri=request.mlflow_tracking_uri,
+                experiment_name=experiment,
+            )
+        except ImportError:
+            pass  # MLflow not installed
+
     comparison = await run_comparison(
         scenario=spec,
         models=request.models,
@@ -538,6 +618,31 @@ async def compare_models(request: CompareModelsRequest) -> CompareModelsResponse:
         max_turns=request.max_turns,
     )
 
+    # MLflow export (if enabled)
+    if request.mlflow_export:
+        try:
+            from sandboxy.mlflow import MLflowConfig, MLflowExporter
+
+            for result in comparison.results:
+                config = MLflowConfig.resolve(
+                    cli_export=True,
+                    cli_tracking_uri=request.mlflow_tracking_uri,
+                    cli_experiment=request.mlflow_experiment,
+                    cli_tracing=request.mlflow_tracing,
+                    yaml_config=None,
+                    scenario_name=spec.name,
+                )
+                exporter = MLflowExporter(config)
+                exporter.export(
+                    result=result.to_dict(),
+                    scenario_path=scenario_path,
+                    agent_name=result.model,
+                )
+        except ImportError:
+            logger.warning("MLflow not installed, skipping export")
+        except Exception as e:
+            logger.warning(f"Failed to export to MLflow: {e}")
+
     # Save comparison result
     from sandboxy.local.results import save_run_result
 
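Note: `enable_tracing` lives in the new `sandboxy/mlflow/tracing.py` (+126 lines, not shown here). MLflow 3.x exposes autologging hooks that record LLM calls as traces, so a function like it could be sketched as follows (assumed, not the actual implementation):

```python
def enable_tracing(tracking_uri=None, experiment_name=None):
    """Record subsequent LLM calls as MLflow traces."""
    import mlflow

    if tracking_uri:
        mlflow.set_tracking_uri(tracking_uri)
    if experiment_name:
        mlflow.set_experiment(experiment_name)
    # Instrument the OpenAI client; each completion call becomes a trace
    # attached to the active experiment (and the active run, if any).
    mlflow.openai.autolog()
```

This also explains the "must be done BEFORE any LLM calls" comment in the tracing hunk above: autologging only instruments calls made after it is switched on.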
@@ -905,6 +1010,8 @@ class RunDatasetRequest(BaseModel):
     max_tokens: int = 1024
     temperature: float = 0.7
     parallel: int = 1
+    mlflow_enabled: bool = False
+    mlflow_experiment: str | None = None
 
 
 class RunDatasetResponse(BaseModel):
@@ -1335,25 +1442,81 @@ async def run_with_dataset(request: RunDatasetRequest) -> RunDatasetResponse:
     spec = load_unified_scenario(scenario_path)
     dataset = load_dataset(dataset_path)
 
-    if request.parallel > 1:
-        result = await run_dataset_parallel(
+    # Setup MLflow if enabled
+    mlflow_config = None
+    if request.mlflow_enabled:
+        try:
+            from sandboxy.mlflow import MLflowConfig
+
+            mlflow_config = MLflowConfig(
+                enabled=True,
+                experiment=request.mlflow_experiment or f"{spec.name}-dataset",
+                tracing=False,  # Tracing not needed for dataset aggregates
+            )
+        except ImportError:
+            pass  # MLflow not installed
+
+    async def run_dataset_benchmark():
+        if request.parallel > 1:
+            return await run_dataset_parallel(
+                scenario=spec,
+                model=request.model,
+                dataset=dataset,
+                max_turns=request.max_turns,
+                max_tokens=request.max_tokens,
+                temperature=request.temperature,
+                max_concurrent=request.parallel,
+            )
+        return await run_dataset(
             scenario=spec,
             model=request.model,
             dataset=dataset,
             max_turns=request.max_turns,
             max_tokens=request.max_tokens,
             temperature=request.temperature,
-            max_concurrent=request.parallel,
         )
+
+    # Run with MLflow context if enabled
+    if mlflow_config and mlflow_config.enabled:
+        from sandboxy.mlflow import mlflow_run_context
+
+        run_name = f"{request.model}-{request.dataset_id}"
+        with mlflow_run_context(mlflow_config, run_name=run_name) as run_id:
+            result = await run_dataset_benchmark()
+
+            # Log aggregate metrics to MLflow
+            if run_id:
+                try:
+                    import mlflow
+
+                    mlflow.log_params(
+                        {
+                            "scenario_id": result.scenario_id,
+                            "dataset_id": result.dataset_id,
+                            "model": result.model,
+                            "total_cases": result.total_cases,
+                        }
+                    )
+                    mlflow.log_metrics(
+                        {
+                            "passed_cases": result.passed_cases,
+                            "failed_cases": result.failed_cases,
+                            "pass_rate": result.pass_rate,
+                            "avg_score": result.avg_score,
+                            "avg_percentage": result.avg_percentage,
+                            "total_time_ms": result.total_time_ms,
+                        }
+                    )
+                    # Log per-expected-outcome metrics
+                    for expected, counts in result.by_expected.items():
+                        total = counts.get("passed", 0) + counts.get("failed", 0)
+                        if total > 0:
+                            rate = counts.get("passed", 0) / total
+                            mlflow.log_metric(f"pass_rate_{expected}", rate)
+                except Exception as e:
+                    logger.warning(f"Failed to log dataset metrics to MLflow: {e}")
     else:
-        result = await run_dataset(
-            scenario=spec,
-            model=request.model,
-            dataset=dataset,
-            max_turns=request.max_turns,
-            max_tokens=request.max_tokens,
-            temperature=request.temperature,
-        )
+        result = await run_dataset_benchmark()
 
     # Save result
     from sandboxy.local.results import save_run_result