sandboxy 0.0.2.tar.gz → 0.0.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129)
  1. {sandboxy-0.0.2 → sandboxy-0.0.4}/PKG-INFO +37 -1
  2. {sandboxy-0.0.2 → sandboxy-0.0.4}/README.md +34 -0
  3. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/hooks/useScenarioRun.ts +21 -4
  4. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/lib/api.ts +10 -0
  5. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/pages/RunPage.tsx +110 -4
  6. {sandboxy-0.0.2 → sandboxy-0.0.4}/pyproject.toml +6 -1
  7. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/api/routes/local.py +182 -19
  8. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/cli/main.py +530 -174
  9. sandboxy-0.0.4/sandboxy/mlflow/__init__.py +38 -0
  10. sandboxy-0.0.4/sandboxy/mlflow/artifacts.py +184 -0
  11. sandboxy-0.0.4/sandboxy/mlflow/config.py +90 -0
  12. sandboxy-0.0.4/sandboxy/mlflow/exporter.py +439 -0
  13. sandboxy-0.0.4/sandboxy/mlflow/metrics.py +115 -0
  14. sandboxy-0.0.4/sandboxy/mlflow/tags.py +140 -0
  15. sandboxy-0.0.4/sandboxy/mlflow/tracing.py +126 -0
  16. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/scenarios/loader.py +44 -2
  17. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/scenarios/runner.py +57 -2
  18. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/tools/yaml_tools.py +18 -0
  19. sandboxy-0.0.4/sandboxy/ui/dist/assets/index-CU06wBqc.js +362 -0
  20. sandboxy-0.0.4/sandboxy/ui/dist/assets/index-Cgg2wY2m.css +1 -0
  21. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/ui/dist/index.html +2 -2
  22. sandboxy-0.0.4/tests/integration/test_mlflow_integration.py +245 -0
  23. sandboxy-0.0.4/tests/unit/mlflow/__init__.py +1 -0
  24. sandboxy-0.0.4/tests/unit/mlflow/test_artifacts.py +206 -0
  25. sandboxy-0.0.4/tests/unit/mlflow/test_config.py +127 -0
  26. sandboxy-0.0.4/tests/unit/mlflow/test_metrics.py +131 -0
  27. sandboxy-0.0.4/tests/unit/mlflow/test_tags.py +209 -0
  28. sandboxy-0.0.2/sandboxy/ui/dist/assets/index-CgAkYWrJ.css +0 -1
  29. sandboxy-0.0.2/sandboxy/ui/dist/assets/index-D4zoGFcr.js +0 -347
  30. {sandboxy-0.0.2 → sandboxy-0.0.4}/.env.example +0 -0
  31. {sandboxy-0.0.2 → sandboxy-0.0.4}/.github/workflows/ci.yml +0 -0
  32. {sandboxy-0.0.2 → sandboxy-0.0.4}/.github/workflows/publish.yml +0 -0
  33. {sandboxy-0.0.2 → sandboxy-0.0.4}/.gitignore +0 -0
  34. {sandboxy-0.0.2 → sandboxy-0.0.4}/CONTRIBUTING.md +0 -0
  35. {sandboxy-0.0.2 → sandboxy-0.0.4}/LICENSE +0 -0
  36. {sandboxy-0.0.2 → sandboxy-0.0.4}/Makefile +0 -0
  37. {sandboxy-0.0.2 → sandboxy-0.0.4}/docs/yaml-tools.md +0 -0
  38. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/index.html +0 -0
  39. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/package-lock.json +0 -0
  40. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/package.json +0 -0
  41. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/postcss.config.js +0 -0
  42. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/App.tsx +0 -0
  43. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/components/Layout.tsx +0 -0
  44. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/components/ModelSelector.tsx +0 -0
  45. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/components/ResultDisplay.tsx +0 -0
  46. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/hooks/useScenarioBuilder.ts +0 -0
  47. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/hooks/useToolBuilder.ts +0 -0
  48. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/index.css +0 -0
  49. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/main.tsx +0 -0
  50. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/pages/BuilderPage.tsx +0 -0
  51. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/pages/DashboardPage.tsx +0 -0
  52. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/pages/DatasetPage.tsx +0 -0
  53. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/pages/ResultsPage.tsx +0 -0
  54. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/pages/ToolBuilderPage.tsx +0 -0
  55. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/tailwind.config.js +0 -0
  56. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/tsconfig.json +0 -0
  57. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/tsconfig.node.json +0 -0
  58. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/vite.config.ts +0 -0
  59. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/__init__.py +0 -0
  60. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/agents/__init__.py +0 -0
  61. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/agents/base.py +0 -0
  62. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/agents/llm_prompt.py +0 -0
  63. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/agents/loader.py +0 -0
  64. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/api/__init__.py +0 -0
  65. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/api/app.py +0 -0
  66. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/api/routes/__init__.py +0 -0
  67. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/api/routes/agents.py +0 -0
  68. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/api/routes/tools.py +0 -0
  69. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/cli/__init__.py +0 -0
  70. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/cli/type_detector.py +0 -0
  71. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/config.py +0 -0
  72. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/core/__init__.py +0 -0
  73. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/core/async_runner.py +0 -0
  74. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/core/mdl_parser.py +0 -0
  75. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/core/runner.py +0 -0
  76. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/core/safe_eval.py +0 -0
  77. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/core/state.py +0 -0
  78. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/datasets/__init__.py +0 -0
  79. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/datasets/loader.py +0 -0
  80. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/datasets/runner.py +0 -0
  81. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/errors.py +0 -0
  82. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/local/context.py +0 -0
  83. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/local/results.py +0 -0
  84. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/logging.py +0 -0
  85. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/mcp/__init__.py +0 -0
  86. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/mcp/client.py +0 -0
  87. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/mcp/wrapper.py +0 -0
  88. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/providers/__init__.py +0 -0
  89. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/providers/anthropic_provider.py +0 -0
  90. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/providers/base.py +0 -0
  91. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/providers/http_client.py +0 -0
  92. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/providers/openai_provider.py +0 -0
  93. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/providers/openrouter.py +0 -0
  94. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/providers/registry.py +0 -0
  95. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/scenarios/__init__.py +0 -0
  96. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/scenarios/comparison.py +0 -0
  97. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/scenarios/unified.py +0 -0
  98. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/session/__init__.py +0 -0
  99. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/session/manager.py +0 -0
  100. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/tools/__init__.py +0 -0
  101. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/tools/base.py +0 -0
  102. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/tools/loader.py +0 -0
  103. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/ui/__init__.py +0 -0
  104. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/utils/__init__.py +0 -0
  105. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/utils/time.py +0 -0
  106. {sandboxy-0.0.2 → sandboxy-0.0.4}/scenarios/customer_service.yml +0 -0
  107. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/__init__.py +0 -0
  108. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/conftest.py +0 -0
  109. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/factories.py +0 -0
  110. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/integration/__init__.py +0 -0
  111. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/integration/api/__init__.py +0 -0
  112. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/mocks/__init__.py +0 -0
  113. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/mocks/providers.py +0 -0
  114. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/unit/__init__.py +0 -0
  115. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/unit/agents/__init__.py +0 -0
  116. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/unit/agents/test_base.py +0 -0
  117. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/unit/agents/test_llm_prompt.py +0 -0
  118. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/unit/agents/test_loader.py +0 -0
  119. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/unit/core/__init__.py +0 -0
  120. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/unit/core/test_async_runner.py +0 -0
  121. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/unit/core/test_mdl_parser.py +0 -0
  122. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/unit/core/test_runner.py +0 -0
  123. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/unit/core/test_safe_eval.py +0 -0
  124. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/unit/core/test_state.py +0 -0
  125. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/unit/providers/test_openrouter.py +0 -0
  126. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/unit/tools/__init__.py +0 -0
  127. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/unit/tools/test_base.py +0 -0
  128. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/unit/tools/test_loader.py +0 -0
  129. {sandboxy-0.0.2 → sandboxy-0.0.4}/uv.lock +0 -0

{sandboxy-0.0.2 → sandboxy-0.0.4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sandboxy
-Version: 0.0.2
+Version: 0.0.4
 Summary: Open-source agent simulation and benchmarking platform
 Project-URL: Homepage, https://github.com/sandboxy-ai/sandboxy
 Project-URL: Repository, https://github.com/sandboxy-ai/sandboxy
@@ -39,6 +39,8 @@ Requires-Dist: pytest-xdist>=3.5.0; extra == 'dev'
 Requires-Dist: pytest>=8.0; extra == 'dev'
 Requires-Dist: respx>=0.21.0; extra == 'dev'
 Requires-Dist: ruff>=0.1; extra == 'dev'
+Provides-Extra: mlflow
+Requires-Dist: mlflow>=3.0; extra == 'mlflow'
 Description-Content-Type: text/markdown

 # Sandboxy
@@ -204,6 +206,39 @@ sandboxy list-models --search claude
 sandboxy list-models --free
 ```

+## MLflow Integration
+
+Export scenario run results to MLflow for experiment tracking and model comparison.
+
+```bash
+# Install with MLflow support
+pip install sandboxy[mlflow]
+
+# Export run to MLflow
+sandboxy scenario scenarios/test.yml -m openai/gpt-4o --mlflow-export
+
+# Custom experiment name
+sandboxy scenario scenarios/test.yml -m gpt-4o --mlflow-export --mlflow-experiment "my-evals"
+```
+
+Or enable in scenario YAML:
+
+```yaml
+id: my-scenario
+name: "My Test"
+
+mlflow:
+  enabled: true
+  experiment: "agent-evals"
+  tags:
+    team: "support"
+
+system_prompt: |
+  ...
+```
+
+See `MLFLOW_TRACKING_URI` env variable to configure the MLflow server.
+
 ## Configuration

 Environment variables (in `~/.sandboxy/.env` or project `.env`):
@@ -213,6 +248,7 @@ Environment variables (in `~/.sandboxy/.env` or project `.env`):
 | `OPENROUTER_API_KEY` | OpenRouter API key (400+ models) |
 | `OPENAI_API_KEY` | Direct OpenAI access |
 | `ANTHROPIC_API_KEY` | Direct Anthropic access |
+| `MLFLOW_TRACKING_URI` | MLflow tracking server URI |

 ## Project Structure

{sandboxy-0.0.2 → sandboxy-0.0.4}/README.md

@@ -161,6 +161,39 @@ sandboxy list-models --search claude
 sandboxy list-models --free
 ```

+## MLflow Integration
+
+Export scenario run results to MLflow for experiment tracking and model comparison.
+
+```bash
+# Install with MLflow support
+pip install sandboxy[mlflow]
+
+# Export run to MLflow
+sandboxy scenario scenarios/test.yml -m openai/gpt-4o --mlflow-export
+
+# Custom experiment name
+sandboxy scenario scenarios/test.yml -m gpt-4o --mlflow-export --mlflow-experiment "my-evals"
+```
+
+Or enable in scenario YAML:
+
+```yaml
+id: my-scenario
+name: "My Test"
+
+mlflow:
+  enabled: true
+  experiment: "agent-evals"
+  tags:
+    team: "support"
+
+system_prompt: |
+  ...
+```
+
+See `MLFLOW_TRACKING_URI` env variable to configure the MLflow server.
+
 ## Configuration

 Environment variables (in `~/.sandboxy/.env` or project `.env`):
@@ -170,6 +203,7 @@ Environment variables (in `~/.sandboxy/.env` or project `.env`):
 | `OPENROUTER_API_KEY` | OpenRouter API key (400+ models) |
 | `OPENAI_API_KEY` | Direct OpenAI access |
 | `ANTHROPIC_API_KEY` | Direct Anthropic access |
+| `MLFLOW_TRACKING_URI` | MLflow tracking server URI |

 ## Project Structure

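The tracking-server resolution implied by the new README section is standard MLflow client behaviour: an explicitly supplied URI wins, otherwise the client falls back to `MLFLOW_TRACKING_URI`. For orientation, a minimal sketch using only the public `mlflow` API; the URI, experiment name, and logged values are placeholders (sandboxy's exporter fills these from the run result):

```python
import os
import mlflow

# An explicit URI takes precedence; otherwise MLflow reads MLFLOW_TRACKING_URI.
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI", "http://127.0.0.1:5000"))

# sandboxy defaults the experiment to the scenario name when none is given.
mlflow.set_experiment("agent-evals")

with mlflow.start_run(run_name="openai/gpt-4o"):
    mlflow.log_param("scenario_id", "my-scenario")
    mlflow.log_metric("pass_rate", 1.0)
```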
{sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/hooks/useScenarioRun.ts

@@ -7,13 +7,20 @@ import { api, RunScenarioResponse, CompareModelsResponse } from '../lib/api'

 export type RunState = 'idle' | 'running' | 'completed' | 'error'

+export interface MlflowOptions {
+  enabled: boolean
+  trackingUri?: string
+  experiment?: string
+  tracing?: boolean
+}
+
 export interface UseScenarioRunResult {
   state: RunState
   result: RunScenarioResponse | null
   comparison: CompareModelsResponse | null
   error: string | null
-  runScenario: (scenarioId: string, model: string, variables?: Record<string, unknown>) => Promise<void>
-  compareModels: (scenarioId: string, models: string[], runsPerModel?: number, variables?: Record<string, unknown>) => Promise<void>
+  runScenario: (scenarioId: string, model: string, variables?: Record<string, unknown>, mlflow?: MlflowOptions) => Promise<void>
+  compareModels: (scenarioId: string, models: string[], runsPerModel?: number, variables?: Record<string, unknown>, mlflow?: MlflowOptions) => Promise<void>
   reset: () => void
 }

@@ -33,7 +40,8 @@ export function useScenarioRun(): UseScenarioRunResult {
  const runScenario = useCallback(async (
    scenarioId: string,
    model: string,
-    variables?: Record<string, unknown>
+    variables?: Record<string, unknown>,
+    mlflow?: MlflowOptions
  ) => {
    reset()
    setState('running')
@@ -43,6 +51,10 @@ export function useScenarioRun(): UseScenarioRunResult {
      scenario_id: scenarioId,
      model,
      variables,
+      mlflow_export: mlflow?.enabled,
+      mlflow_tracking_uri: mlflow?.trackingUri,
+      mlflow_experiment: mlflow?.experiment,
+      mlflow_tracing: mlflow?.tracing,
    })

    if (response.error) {
@@ -62,7 +74,8 @@ export function useScenarioRun(): UseScenarioRunResult {
    scenarioId: string,
    models: string[],
    runsPerModel: number = 1,
-    variables?: Record<string, unknown>
+    variables?: Record<string, unknown>,
+    mlflow?: MlflowOptions
  ) => {
    reset()
    setState('running')
@@ -73,6 +86,10 @@ export function useScenarioRun(): UseScenarioRunResult {
      models,
      runs_per_model: runsPerModel,
      variables,
+      mlflow_export: mlflow?.enabled,
+      mlflow_tracking_uri: mlflow?.trackingUri,
+      mlflow_experiment: mlflow?.experiment,
+      mlflow_tracing: mlflow?.tracing,
    })

    setState('completed')
{sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/lib/api.ts

@@ -53,6 +53,10 @@ export interface RunScenarioRequest {
   max_turns?: number
   max_tokens?: number
   temperature?: number
+  mlflow_export?: boolean
+  mlflow_tracking_uri?: string
+  mlflow_experiment?: string
+  mlflow_tracing?: boolean
 }

 export interface HistoryMessage {
@@ -112,6 +116,10 @@ export interface CompareModelsRequest {
   runs_per_model?: number
   variables?: Record<string, unknown>
   max_turns?: number
+  mlflow_export?: boolean
+  mlflow_tracking_uri?: string
+  mlflow_experiment?: string
+  mlflow_tracing?: boolean
 }

 export interface ModelStats {
@@ -205,6 +213,8 @@ export interface RunDatasetRequest {
   max_tokens?: number
   temperature?: number
   parallel?: number
+  mlflow_enabled?: boolean
+  mlflow_experiment?: string
 }

 export interface CaseResultInfo {
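These optional fields map one-to-one onto the FastAPI request models shown further down in `sandboxy/api/routes/local.py`. A sketch of a single-run payload with MLflow export enabled; the field names come from the diff, while the values and the endpoint it would be posted to are illustrative only:

```python
payload = {
    "scenario_id": "my-scenario",
    "model": "openai/gpt-4o",
    "variables": {},
    # New in 0.0.4 — all optional; mirrors MlflowOptions in the local UI:
    "mlflow_export": True,
    "mlflow_tracking_uri": "http://127.0.0.1:5000",
    "mlflow_experiment": "agent-evals",
    "mlflow_tracing": True,
}
```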
{sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/pages/RunPage.tsx

@@ -1,8 +1,8 @@
 import { useState, useEffect } from 'react'
 import { useParams, useSearchParams, Link } from 'react-router-dom'
-import { ArrowLeft, Play, Loader2, XCircle, Edit, Settings, Database, Check, X } from 'lucide-react'
+import { ArrowLeft, Play, Loader2, XCircle, Edit, Settings, Database, Check, X, ChevronDown, ChevronRight, ExternalLink } from 'lucide-react'
 import { api, ScenarioDetail, ModelInfo, VariableInfo, DatasetInfo, RunDatasetResponse, LocalFileInfo } from '../lib/api'
-import { useScenarioRun } from '../hooks/useScenarioRun'
+import { useScenarioRun, MlflowOptions } from '../hooks/useScenarioRun'
 import { SingleRunResult, ComparisonResult } from '../components/ResultDisplay'
 import { ModelSelector, MultiModelSelector } from '../components/ModelSelector'

@@ -33,6 +33,13 @@ export default function RunPage() {
  const [datasetRunning, setDatasetRunning] = useState(false)
  const [parallel, setParallel] = useState(parallelFromUrl)

+  // MLflow state
+  const [mlflowEnabled, setMlflowEnabled] = useState(false)
+  const [mlflowExpanded, setMlflowExpanded] = useState(false)
+  const [mlflowTrackingUri, setMlflowTrackingUri] = useState('')
+  const [mlflowExperiment, setMlflowExperiment] = useState('')
+  const [mlflowTracing, setMlflowTracing] = useState(true)
+
  const { state, result, comparison, error: runError, runScenario, compareModels } = useScenarioRun()

  useEffect(() => {
@@ -93,6 +100,13 @@ export default function RunPage() {
    const sid = selectedScenarioId || scenarioId
    if (!sid) return

+    const mlflowOptions: MlflowOptions | undefined = mlflowEnabled ? {
+      enabled: true,
+      trackingUri: mlflowTrackingUri || undefined,
+      experiment: mlflowExperiment || undefined,
+      tracing: mlflowTracing,
+    } : undefined
+
    if (runMode === 'dataset') {
      if (!selectedDataset || !selectedModel) return
      setDatasetRunning(true)
@@ -103,6 +117,8 @@ export default function RunPage() {
          dataset_id: selectedDataset,
          model: selectedModel,
          parallel,
+          mlflow_enabled: mlflowEnabled,
+          mlflow_experiment: mlflowExperiment || undefined,
        })
        setDatasetResult(result)
      } catch (err) {
@@ -112,10 +128,10 @@ export default function RunPage() {
      }
    } else if (runMode === 'single') {
      if (!selectedModel) return
-      await runScenario(sid, selectedModel, variables)
+      await runScenario(sid, selectedModel, variables, mlflowOptions)
    } else {
      if (selectedModels.length === 0) return
-      await compareModels(sid, selectedModels, runsPerModel, variables)
+      await compareModels(sid, selectedModels, runsPerModel, variables, mlflowOptions)
    }
  }

@@ -360,6 +376,96 @@ export default function RunPage() {
        </div>
      )}

+      {/* MLflow Section */}
+      <div className="mb-6 p-4 panel-subtle">
+        <button
+          onClick={() => setMlflowExpanded(!mlflowExpanded)}
+          className="flex items-center gap-2 w-full text-left"
+        >
+          {mlflowExpanded ? <ChevronDown size={18} /> : <ChevronRight size={18} />}
+          <span className="font-medium text-slate-100">MLflow Tracking</span>
+          {mlflowEnabled && (
+            <span className="ml-2 px-2 py-0.5 text-xs bg-green-500/20 text-green-400 rounded">
+              Enabled
+            </span>
+          )}
+        </button>
+
+        {mlflowExpanded && (
+          <div className="mt-4 space-y-4">
+            {/* Enable Toggle */}
+            <label className="flex items-center gap-3 cursor-pointer">
+              <input
+                type="checkbox"
+                checked={mlflowEnabled}
+                onChange={(e) => setMlflowEnabled(e.target.checked)}
+                disabled={state === 'running' || datasetRunning}
+                className="w-4 h-4 rounded border-slate-600 text-orange-400 focus:ring-orange-400"
+              />
+              <span className="text-slate-200">Enable MLflow tracking</span>
+            </label>
+
+            {mlflowEnabled && (
+              <>
+                {/* Tracking URI - only for non-dataset runs (dataset uses env var) */}
+                {runMode !== 'dataset' && (
+                  <div>
+                    <label className="block text-sm font-medium text-slate-400 mb-1">
+                      Tracking URI
+                    </label>
+                    <input
+                      type="text"
+                      value={mlflowTrackingUri}
+                      onChange={(e) => setMlflowTrackingUri(e.target.value)}
+                      disabled={state === 'running'}
+                      placeholder="http://127.0.0.1:5000 (uses MLFLOW_TRACKING_URI if empty)"
+                      className="w-full panel-subtle px-3 py-2 text-slate-100 text-sm focus:outline-none focus:ring-2 focus:ring-orange-400"
+                    />
+                  </div>
+                )}
+
+                {/* Experiment Name */}
+                <div>
+                  <label className="block text-sm font-medium text-slate-400 mb-1">
+                    Experiment Name
+                  </label>
+                  <input
+                    type="text"
+                    value={mlflowExperiment}
+                    onChange={(e) => setMlflowExperiment(e.target.value)}
+                    disabled={state === 'running' || datasetRunning}
+                    placeholder={runMode === 'dataset' ? `${scenario?.name || 'scenario'}-dataset` : (scenario?.name || 'Defaults to scenario name')}
+                    className="w-full panel-subtle px-3 py-2 text-slate-100 text-sm focus:outline-none focus:ring-2 focus:ring-orange-400"
+                  />
+                </div>
+
+                {/* Tracing Toggle - only for non-dataset runs */}
+                {runMode !== 'dataset' && (
+                  <label className="flex items-center gap-3 cursor-pointer">
+                    <input
+                      type="checkbox"
+                      checked={mlflowTracing}
+                      onChange={(e) => setMlflowTracing(e.target.checked)}
+                      disabled={state === 'running'}
+                      className="w-4 h-4 rounded border-slate-600 text-orange-400 focus:ring-orange-400"
+                    />
+                    <div>
+                      <span className="text-slate-200">Enable LLM Tracing</span>
+                      <p className="text-xs text-slate-500">Capture detailed traces of each LLM call</p>
+                    </div>
+                  </label>
+                )}
+
+                <p className="text-xs text-slate-500 flex items-center gap-1">
+                  <ExternalLink size={12} />
+                  View results at your MLflow server after the run completes
+                </p>
+              </>
+            )}
+          </div>
+        )}
+      </div>
+
      <button
        onClick={handleRun}
        disabled={
4
4
 
5
5
  [project]
6
6
  name = "sandboxy"
7
- version = "0.0.2"
7
+ version = "0.0.4"
8
8
  description = "Open-source agent simulation and benchmarking platform"
9
9
  readme = "README.md"
10
10
  license = "Apache-2.0"
@@ -41,6 +41,9 @@ dependencies = [
41
41
  ]
42
42
 
43
43
  [project.optional-dependencies]
44
+ mlflow = [
45
+ "mlflow>=3.0",
46
+ ]
44
47
  dev = [
45
48
  "pytest>=8.0",
46
49
  "pytest-cov>=4.0",
@@ -120,6 +123,8 @@ ignore = [
120
123
  # S307: eval usage with safety measures (safe_builtins, simpleeval)
121
124
  "sandboxy/tools/yaml_tools.py" = ["S307"]
122
125
  "sandboxy/core/safe_eval.py" = ["S307"]
126
+ # S603, S607, S110: git subprocess call is safe (hardcoded command, known input)
127
+ "sandboxy/mlflow/tags.py" = ["S603", "S607", "S110"]
123
128
 
124
129
  [tool.ruff.lint.pydocstyle]
125
130
  convention = "google"
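Because `mlflow` ships only as an optional extra, everything downstream has to degrade gracefully when it is missing; the route changes below do this with lazy imports guarded by `ImportError`. A condensed sketch of that pattern, reusing the `MLflowConfig.resolve` call exactly as it appears in the diff (the wrapper name `build_mlflow_config` is illustrative, not part of the package):

```python
def build_mlflow_config(export: bool, experiment: str | None, scenario_name: str):
    """Return an MLflow config when export is requested and mlflow is installed, else None."""
    if not export:
        return None
    try:
        # Lazy import: `pip install sandboxy` without the [mlflow] extra still works.
        from sandboxy.mlflow import MLflowConfig
    except ImportError:
        return None  # Extra not installed; export is silently skipped.
    return MLflowConfig.resolve(
        cli_export=True,
        cli_tracking_uri=None,
        cli_experiment=experiment,
        cli_tracing=True,
        yaml_config=None,
        scenario_name=scenario_name,
    )
```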
{sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/api/routes/local.py

@@ -379,6 +379,10 @@ class RunScenarioRequest(BaseModel):
     max_turns: int = 20
     max_tokens: int = 1024
     temperature: float = 0.7
+    mlflow_export: bool = False
+    mlflow_tracking_uri: str | None = None
+    mlflow_experiment: str | None = None
+    mlflow_tracing: bool = True


 class RunScenarioResponse(BaseModel):
@@ -393,6 +397,9 @@ class RunScenarioResponse(BaseModel):
     final_state: dict[str, Any]
     evaluation: dict[str, Any] | None
     latency_ms: int
+    input_tokens: int = 0
+    output_tokens: int = 0
+    cost_usd: float | None = None
     error: str | None


@@ -404,6 +411,10 @@ class CompareModelsRequest(BaseModel):
     runs_per_model: int = 1
     variables: dict[str, Any] = Field(default_factory=dict)
     max_turns: int = 20
+    mlflow_export: bool = False
+    mlflow_tracking_uri: str | None = None
+    mlflow_experiment: str | None = None
+    mlflow_tracing: bool = True  # Enable LLM call tracing by default


 class CompareModelsResponse(BaseModel):
@@ -454,20 +465,73 @@ async def run_scenario(request: RunScenarioRequest) -> RunScenarioResponse:
     spec = load_unified_scenario(scenario_path)
     runner = UnifiedRunner()

-    result = await runner.run(
-        scenario=spec,
-        model=request.model,
-        variables=request.variables,
-        max_turns=request.max_turns,
-        max_tokens=request.max_tokens,
-        temperature=request.temperature,
-    )
+    # Setup MLflow if requested
+    mlflow_config = None
+    if request.mlflow_export:
+        try:
+            from sandboxy.mlflow import MLflowConfig
+
+            mlflow_config = MLflowConfig.resolve(
+                cli_export=True,
+                cli_tracking_uri=request.mlflow_tracking_uri,
+                cli_experiment=request.mlflow_experiment,
+                cli_tracing=request.mlflow_tracing,
+                yaml_config=None,
+                scenario_name=spec.name,
+            )
+        except ImportError:
+            pass  # MLflow not installed
+
+    # Run with MLflow context if enabled (connects traces to run)
+    if mlflow_config and mlflow_config.enabled:
+        from sandboxy.mlflow import MLflowExporter, mlflow_run_context
+        from sandboxy.mlflow.tracing import enable_tracing
+
+        if mlflow_config.tracing:
+            enable_tracing(
+                tracking_uri=mlflow_config.tracking_uri,
+                experiment_name=mlflow_config.experiment,
+            )
+
+        with mlflow_run_context(mlflow_config, run_name=request.model) as run_id:
+            result = await runner.run(
+                scenario=spec,
+                model=request.model,
+                variables=request.variables,
+                max_turns=request.max_turns,
+                max_tokens=request.max_tokens,
+                temperature=request.temperature,
+            )
+
+            if run_id:
+                exporter = MLflowExporter(mlflow_config)
+                exporter.log_to_active_run(
+                    result=result,
+                    scenario_path=scenario_path,
+                    scenario_name=spec.name,
+                    scenario_id=spec.id,
+                    agent_name=request.model,
+                )
+    else:
+        result = await runner.run(
+            scenario=spec,
+            model=request.model,
+            variables=request.variables,
+            max_turns=request.max_turns,
+            max_tokens=request.max_tokens,
+            temperature=request.temperature,
+        )

     # Save result to runs/
     from sandboxy.local.results import save_run_result

     save_run_result(request.scenario_id, result.to_dict())

+    # Calculate cost
+    input_tokens = result.input_tokens or 0
+    output_tokens = result.output_tokens or 0
+    cost_usd = calculate_cost(result.model, input_tokens, output_tokens)
+
     return RunScenarioResponse(
         id=result.id,
         scenario_id=result.scenario_id,
@@ -481,6 +545,9 @@ async def run_scenario(request: RunScenarioRequest) -> RunScenarioResponse:
         final_state=result.final_state,
         evaluation=result.evaluation.to_dict() if result.evaluation else None,
         latency_ms=result.latency_ms,
+        input_tokens=input_tokens,
+        output_tokens=output_tokens,
+        cost_usd=cost_usd,
         error=result.error,
     )

@@ -530,6 +597,19 @@ async def compare_models(request: CompareModelsRequest) -> CompareModelsResponse:

     spec = load_unified_scenario(scenario_path)

+    # Enable MLflow tracing if requested (must be done BEFORE any LLM calls)
+    if request.mlflow_export and request.mlflow_tracing:
+        try:
+            from sandboxy.mlflow.tracing import enable_tracing
+
+            experiment = request.mlflow_experiment or spec.name
+            enable_tracing(
+                tracking_uri=request.mlflow_tracking_uri,
+                experiment_name=experiment,
+            )
+        except ImportError:
+            pass  # MLflow not installed
+
     comparison = await run_comparison(
         scenario=spec,
         models=request.models,
@@ -538,6 +618,31 @@ async def compare_models(request: CompareModelsRequest) -> CompareModelsResponse:
         max_turns=request.max_turns,
     )

+    # MLflow export (if enabled)
+    if request.mlflow_export:
+        try:
+            from sandboxy.mlflow import MLflowConfig, MLflowExporter
+
+            for result in comparison.results:
+                config = MLflowConfig.resolve(
+                    cli_export=True,
+                    cli_tracking_uri=request.mlflow_tracking_uri,
+                    cli_experiment=request.mlflow_experiment,
+                    cli_tracing=request.mlflow_tracing,
+                    yaml_config=None,
+                    scenario_name=spec.name,
+                )
+                exporter = MLflowExporter(config)
+                exporter.export(
+                    result=result.to_dict(),
+                    scenario_path=scenario_path,
+                    agent_name=result.model,
+                )
+        except ImportError:
+            logger.warning("MLflow not installed, skipping export")
+        except Exception as e:
+            logger.warning(f"Failed to export to MLflow: {e}")
+
     # Save comparison result
     from sandboxy.local.results import save_run_result

@@ -905,6 +1010,8 @@ class RunDatasetRequest(BaseModel):
     max_tokens: int = 1024
     temperature: float = 0.7
     parallel: int = 1
+    mlflow_enabled: bool = False
+    mlflow_experiment: str | None = None


 class RunDatasetResponse(BaseModel):
@@ -1335,25 +1442,81 @@ async def run_with_dataset(request: RunDatasetRequest) -> RunDatasetResponse:
     spec = load_unified_scenario(scenario_path)
     dataset = load_dataset(dataset_path)

-    if request.parallel > 1:
-        result = await run_dataset_parallel(
+    # Setup MLflow if enabled
+    mlflow_config = None
+    if request.mlflow_enabled:
+        try:
+            from sandboxy.mlflow import MLflowConfig
+
+            mlflow_config = MLflowConfig(
+                enabled=True,
+                experiment=request.mlflow_experiment or f"{spec.name}-dataset",
+                tracing=False,  # Tracing not needed for dataset aggregates
+            )
+        except ImportError:
+            pass  # MLflow not installed
+
+    async def run_dataset_benchmark():
+        if request.parallel > 1:
+            return await run_dataset_parallel(
+                scenario=spec,
+                model=request.model,
+                dataset=dataset,
+                max_turns=request.max_turns,
+                max_tokens=request.max_tokens,
+                temperature=request.temperature,
+                max_concurrent=request.parallel,
+            )
+        return await run_dataset(
             scenario=spec,
             model=request.model,
             dataset=dataset,
             max_turns=request.max_turns,
             max_tokens=request.max_tokens,
             temperature=request.temperature,
-            max_concurrent=request.parallel,
         )
+
+    # Run with MLflow context if enabled
+    if mlflow_config and mlflow_config.enabled:
+        from sandboxy.mlflow import mlflow_run_context
+
+        run_name = f"{request.model}-{request.dataset_id}"
+        with mlflow_run_context(mlflow_config, run_name=run_name) as run_id:
+            result = await run_dataset_benchmark()
+
+            # Log aggregate metrics to MLflow
+            if run_id:
+                try:
+                    import mlflow
+
+                    mlflow.log_params(
+                        {
+                            "scenario_id": result.scenario_id,
+                            "dataset_id": result.dataset_id,
+                            "model": result.model,
+                            "total_cases": result.total_cases,
+                        }
+                    )
+                    mlflow.log_metrics(
+                        {
+                            "passed_cases": result.passed_cases,
+                            "failed_cases": result.failed_cases,
+                            "pass_rate": result.pass_rate,
+                            "avg_score": result.avg_score,
+                            "avg_percentage": result.avg_percentage,
+                            "total_time_ms": result.total_time_ms,
+                        }
+                    )
+                    # Log per-expected-outcome metrics
+                    for expected, counts in result.by_expected.items():
+                        total = counts.get("passed", 0) + counts.get("failed", 0)
+                        if total > 0:
+                            rate = counts.get("passed", 0) / total
+                            mlflow.log_metric(f"pass_rate_{expected}", rate)
+                except Exception as e:
+                    logger.warning(f"Failed to log dataset metrics to MLflow: {e}")
     else:
-        result = await run_dataset(
-            scenario=spec,
-            model=request.model,
-            dataset=dataset,
-            max_turns=request.max_turns,
-            max_tokens=request.max_tokens,
-            temperature=request.temperature,
-        )
+        result = await run_dataset_benchmark()

     # Save result
     from sandboxy.local.results import save_run_result
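
For dataset runs the handler logs only aggregates — parameters plus pass/score metrics — under one MLflow run rather than tracing each case. A standalone sketch of equivalent logging with the plain `mlflow` API, substituting `mlflow.start_run` for sandboxy's `mlflow_run_context` helper; the `result` object is assumed to expose the attributes the handler reads:

```python
import mlflow

def log_dataset_summary(result, experiment: str) -> None:
    """Mirror the aggregate metrics run_with_dataset() logs to MLflow."""
    mlflow.set_experiment(experiment)
    with mlflow.start_run(run_name=f"{result.model}-{result.dataset_id}"):
        mlflow.log_params({
            "scenario_id": result.scenario_id,
            "dataset_id": result.dataset_id,
            "model": result.model,
            "total_cases": result.total_cases,
        })
        mlflow.log_metrics({
            "passed_cases": result.passed_cases,
            "failed_cases": result.failed_cases,
            "pass_rate": result.pass_rate,
            "avg_score": result.avg_score,
            "total_time_ms": result.total_time_ms,
        })
        # One pass-rate metric per expected outcome, as in the handler.
        for expected, counts in result.by_expected.items():
            total = counts.get("passed", 0) + counts.get("failed", 0)
            if total:
                mlflow.log_metric(f"pass_rate_{expected}", counts.get("passed", 0) / total)
```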