sandboxy 0.0.3__tar.gz → 0.0.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. {sandboxy-0.0.3 → sandboxy-0.0.5}/PKG-INFO +103 -27
  2. {sandboxy-0.0.3 → sandboxy-0.0.5}/README.md +100 -26
  3. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/src/components/ModelSelector.tsx +66 -17
  4. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/src/hooks/useScenarioRun.ts +21 -4
  5. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/src/lib/api.ts +97 -2
  6. sandboxy-0.0.5/local-ui/src/pages/DashboardPage.tsx +416 -0
  7. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/src/pages/RunPage.tsx +110 -4
  8. {sandboxy-0.0.3 → sandboxy-0.0.5}/pyproject.toml +6 -1
  9. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/agents/llm_prompt.py +85 -14
  10. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/api/app.py +2 -1
  11. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/api/routes/local.py +216 -20
  12. sandboxy-0.0.5/sandboxy/api/routes/providers.py +369 -0
  13. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/cli/main.py +663 -31
  14. sandboxy-0.0.5/sandboxy/mlflow/__init__.py +38 -0
  15. sandboxy-0.0.5/sandboxy/mlflow/artifacts.py +184 -0
  16. sandboxy-0.0.5/sandboxy/mlflow/config.py +90 -0
  17. sandboxy-0.0.5/sandboxy/mlflow/exporter.py +445 -0
  18. sandboxy-0.0.5/sandboxy/mlflow/metrics.py +115 -0
  19. sandboxy-0.0.5/sandboxy/mlflow/tags.py +140 -0
  20. sandboxy-0.0.5/sandboxy/mlflow/tracing.py +126 -0
  21. sandboxy-0.0.5/sandboxy/providers/__init__.py +68 -0
  22. sandboxy-0.0.5/sandboxy/providers/config.py +243 -0
  23. sandboxy-0.0.5/sandboxy/providers/local.py +498 -0
  24. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/providers/registry.py +107 -13
  25. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/scenarios/loader.py +44 -2
  26. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/scenarios/runner.py +57 -2
  27. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/scenarios/unified.py +27 -3
  28. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/tools/yaml_tools.py +18 -0
  29. sandboxy-0.0.5/sandboxy/ui/dist/assets/index-CLxxjJuD.js +367 -0
  30. sandboxy-0.0.5/sandboxy/ui/dist/assets/index-DBB7ehs6.css +1 -0
  31. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/ui/dist/index.html +2 -2
  32. sandboxy-0.0.5/tests/integration/test_mlflow_integration.py +245 -0
  33. sandboxy-0.0.5/tests/unit/mlflow/__init__.py +1 -0
  34. sandboxy-0.0.5/tests/unit/mlflow/test_artifacts.py +206 -0
  35. sandboxy-0.0.5/tests/unit/mlflow/test_config.py +127 -0
  36. sandboxy-0.0.5/tests/unit/mlflow/test_metrics.py +131 -0
  37. sandboxy-0.0.5/tests/unit/mlflow/test_tags.py +209 -0
  38. sandboxy-0.0.3/local-ui/src/pages/DashboardPage.tsx +0 -163
  39. sandboxy-0.0.3/sandboxy/providers/__init__.py +0 -34
  40. sandboxy-0.0.3/sandboxy/ui/dist/assets/index-CgAkYWrJ.css +0 -1
  41. sandboxy-0.0.3/sandboxy/ui/dist/assets/index-D4zoGFcr.js +0 -347
  42. {sandboxy-0.0.3 → sandboxy-0.0.5}/.env.example +0 -0
  43. {sandboxy-0.0.3 → sandboxy-0.0.5}/.github/workflows/ci.yml +0 -0
  44. {sandboxy-0.0.3 → sandboxy-0.0.5}/.github/workflows/publish.yml +0 -0
  45. {sandboxy-0.0.3 → sandboxy-0.0.5}/.gitignore +0 -0
  46. {sandboxy-0.0.3 → sandboxy-0.0.5}/CONTRIBUTING.md +0 -0
  47. {sandboxy-0.0.3 → sandboxy-0.0.5}/LICENSE +0 -0
  48. {sandboxy-0.0.3 → sandboxy-0.0.5}/Makefile +0 -0
  49. {sandboxy-0.0.3 → sandboxy-0.0.5}/docs/yaml-tools.md +0 -0
  50. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/index.html +0 -0
  51. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/package-lock.json +0 -0
  52. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/package.json +0 -0
  53. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/postcss.config.js +0 -0
  54. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/src/App.tsx +0 -0
  55. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/src/components/Layout.tsx +0 -0
  56. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/src/components/ResultDisplay.tsx +0 -0
  57. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/src/hooks/useScenarioBuilder.ts +0 -0
  58. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/src/hooks/useToolBuilder.ts +0 -0
  59. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/src/index.css +0 -0
  60. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/src/main.tsx +0 -0
  61. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/src/pages/BuilderPage.tsx +0 -0
  62. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/src/pages/DatasetPage.tsx +0 -0
  63. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/src/pages/ResultsPage.tsx +0 -0
  64. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/src/pages/ToolBuilderPage.tsx +0 -0
  65. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/tailwind.config.js +0 -0
  66. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/tsconfig.json +0 -0
  67. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/tsconfig.node.json +0 -0
  68. {sandboxy-0.0.3 → sandboxy-0.0.5}/local-ui/vite.config.ts +0 -0
  69. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/__init__.py +0 -0
  70. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/agents/__init__.py +0 -0
  71. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/agents/base.py +0 -0
  72. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/agents/loader.py +0 -0
  73. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/api/__init__.py +0 -0
  74. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/api/routes/__init__.py +0 -0
  75. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/api/routes/agents.py +0 -0
  76. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/api/routes/tools.py +0 -0
  77. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/cli/__init__.py +0 -0
  78. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/cli/type_detector.py +0 -0
  79. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/config.py +0 -0
  80. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/core/__init__.py +0 -0
  81. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/core/async_runner.py +0 -0
  82. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/core/mdl_parser.py +0 -0
  83. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/core/runner.py +0 -0
  84. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/core/safe_eval.py +0 -0
  85. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/core/state.py +0 -0
  86. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/datasets/__init__.py +0 -0
  87. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/datasets/loader.py +0 -0
  88. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/datasets/runner.py +0 -0
  89. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/errors.py +0 -0
  90. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/local/context.py +0 -0
  91. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/local/results.py +0 -0
  92. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/logging.py +0 -0
  93. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/mcp/__init__.py +0 -0
  94. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/mcp/client.py +0 -0
  95. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/mcp/wrapper.py +0 -0
  96. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/providers/anthropic_provider.py +0 -0
  97. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/providers/base.py +0 -0
  98. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/providers/http_client.py +0 -0
  99. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/providers/openai_provider.py +0 -0
  100. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/providers/openrouter.py +0 -0
  101. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/scenarios/__init__.py +0 -0
  102. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/scenarios/comparison.py +0 -0
  103. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/session/__init__.py +0 -0
  104. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/session/manager.py +0 -0
  105. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/tools/__init__.py +0 -0
  106. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/tools/base.py +0 -0
  107. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/tools/loader.py +0 -0
  108. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/ui/__init__.py +0 -0
  109. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/utils/__init__.py +0 -0
  110. {sandboxy-0.0.3 → sandboxy-0.0.5}/sandboxy/utils/time.py +0 -0
  111. {sandboxy-0.0.3 → sandboxy-0.0.5}/scenarios/customer_service.yml +0 -0
  112. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/__init__.py +0 -0
  113. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/conftest.py +0 -0
  114. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/factories.py +0 -0
  115. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/integration/__init__.py +0 -0
  116. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/integration/api/__init__.py +0 -0
  117. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/mocks/__init__.py +0 -0
  118. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/mocks/providers.py +0 -0
  119. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/unit/__init__.py +0 -0
  120. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/unit/agents/__init__.py +0 -0
  121. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/unit/agents/test_base.py +0 -0
  122. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/unit/agents/test_llm_prompt.py +0 -0
  123. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/unit/agents/test_loader.py +0 -0
  124. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/unit/core/__init__.py +0 -0
  125. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/unit/core/test_async_runner.py +0 -0
  126. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/unit/core/test_mdl_parser.py +0 -0
  127. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/unit/core/test_runner.py +0 -0
  128. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/unit/core/test_safe_eval.py +0 -0
  129. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/unit/core/test_state.py +0 -0
  130. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/unit/providers/test_openrouter.py +0 -0
  131. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/unit/tools/__init__.py +0 -0
  132. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/unit/tools/test_base.py +0 -0
  133. {sandboxy-0.0.3 → sandboxy-0.0.5}/tests/unit/tools/test_loader.py +0 -0
  134. {sandboxy-0.0.3 → sandboxy-0.0.5}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sandboxy
3
- Version: 0.0.3
3
+ Version: 0.0.5
4
4
  Summary: Open-source agent simulation and benchmarking platform
5
5
  Project-URL: Homepage, https://github.com/sandboxy-ai/sandboxy
6
6
  Project-URL: Repository, https://github.com/sandboxy-ai/sandboxy
@@ -39,6 +39,8 @@ Requires-Dist: pytest-xdist>=3.5.0; extra == 'dev'
39
39
  Requires-Dist: pytest>=8.0; extra == 'dev'
40
40
  Requires-Dist: respx>=0.21.0; extra == 'dev'
41
41
  Requires-Dist: ruff>=0.1; extra == 'dev'
42
+ Provides-Extra: mlflow
43
+ Requires-Dist: mlflow>=3.0; extra == 'mlflow'
42
44
  Description-Content-Type: text/markdown
43
45
 
44
46
  # Sandboxy
@@ -118,7 +120,37 @@ Opens a browser with a local UI for browsing scenarios, running them, and viewin
118
120
 
119
121
  ## Writing Scenarios
120
122
 
121
- Scenarios are YAML files that define agent interactions:
123
+ Scenarios are YAML files that define agent interactions. Sandboxy supports two modes:
124
+
125
+ ### Single-turn mode
126
+
127
+ Use `prompt:` for simple request/response scenarios without tool use:
128
+
129
+ ```yaml
130
+ id: simple-qa
131
+ name: "Simple Q&A"
132
+
133
+ system_prompt: |
134
+ You are a helpful assistant.
135
+
136
+ prompt: |
137
+ What is the capital of France?
138
+
139
+ evaluation:
140
+ max_score: 100
141
+ goals:
142
+ - id: correct_answer
143
+ name: "Correct Answer"
144
+ points: 100
145
+ detection:
146
+ type: agent_contains
147
+ patterns:
148
+ - "Paris"
149
+ ```
150
+
151
+ ### Agentic mode
152
+
153
+ Use `steps:` for multi-turn scenarios with tool support:
122
154
 
123
155
  ```yaml
124
156
  id: customer-support
@@ -129,35 +161,45 @@ system_prompt: |
129
161
  You are a customer support agent for TechCo.
130
162
  Be helpful but follow company policy.
131
163
 
132
- user_prompt: |
133
- I want a refund for my purchase. Order #12345.
164
+ steps:
165
+ - id: user_request
166
+ action: inject_user
167
+ params:
168
+ content: "I want a refund for my purchase. Order #12345."
169
+ - id: agent_response
170
+ action: await_agent
134
171
 
135
- # Define tools the agent can use
172
+ # Tools are only available in agentic mode (with steps)
136
173
  tools:
137
- - name: lookup_order
174
+ lookup_order:
138
175
  description: "Look up order details"
139
- params:
140
- order_id:
141
- type: string
142
- required: true
143
- returns: "Order details for {{order_id}}"
144
-
145
- # Evaluation criteria
146
- goals:
147
- - name: acknowledged_request
148
- description: "Agent acknowledged the refund request"
149
- check:
150
- type: contains
151
- value: "refund"
152
-
153
- - name: looked_up_order
154
- description: "Agent used the lookup tool"
155
- check:
156
- type: tool_called
157
- tool: lookup_order
158
-
159
- scoring:
176
+ actions:
177
+ call:
178
+ params:
179
+ order_id:
180
+ type: string
181
+ required: true
182
+ returns: "Order details for {{order_id}}"
183
+
184
+ evaluation:
160
185
  max_score: 100
186
+ goals:
187
+ - id: acknowledged_request
188
+ name: "Acknowledged Request"
189
+ description: "Agent acknowledged the refund request"
190
+ points: 50
191
+ detection:
192
+ type: agent_contains
193
+ patterns:
194
+ - "refund"
195
+
196
+ - id: looked_up_order
197
+ name: "Looked Up Order"
198
+ description: "Agent used the lookup tool"
199
+ points: 50
200
+ detection:
201
+ type: tool_called
202
+ tool: lookup_order
161
203
  ```
162
204
 
163
205
  ## CLI Reference
@@ -204,6 +246,39 @@ sandboxy list-models --search claude
204
246
  sandboxy list-models --free
205
247
  ```
206
248
 
249
+ ## MLflow Integration
250
+
251
+ Export scenario run results to MLflow for experiment tracking and model comparison.
252
+
253
+ ```bash
254
+ # Install with MLflow support
255
+ pip install sandboxy[mlflow]
256
+
257
+ # Export run to MLflow
258
+ sandboxy scenario scenarios/test.yml -m openai/gpt-4o --mlflow-export
259
+
260
+ # Custom experiment name
261
+ sandboxy scenario scenarios/test.yml -m gpt-4o --mlflow-export --mlflow-experiment "my-evals"
262
+ ```
263
+
264
+ Or enable in scenario YAML:
265
+
266
+ ```yaml
267
+ id: my-scenario
268
+ name: "My Test"
269
+
270
+ mlflow:
271
+ enabled: true
272
+ experiment: "agent-evals"
273
+ tags:
274
+ team: "support"
275
+
276
+ system_prompt: |
277
+ ...
278
+ ```
279
+
280
+ See `MLFLOW_TRACKING_URI` env variable to configure the MLflow server.
281
+
207
282
  ## Configuration
208
283
 
209
284
  Environment variables (in `~/.sandboxy/.env` or project `.env`):
@@ -213,6 +288,7 @@ Environment variables (in `~/.sandboxy/.env` or project `.env`):
213
288
  | `OPENROUTER_API_KEY` | OpenRouter API key (400+ models) |
214
289
  | `OPENAI_API_KEY` | Direct OpenAI access |
215
290
  | `ANTHROPIC_API_KEY` | Direct Anthropic access |
291
+ | `MLFLOW_TRACKING_URI` | MLflow tracking server URI |
216
292
 
217
293
  ## Project Structure
218
294
 
@@ -75,7 +75,37 @@ Opens a browser with a local UI for browsing scenarios, running them, and viewin
75
75
 
76
76
  ## Writing Scenarios
77
77
 
78
- Scenarios are YAML files that define agent interactions:
78
+ Scenarios are YAML files that define agent interactions. Sandboxy supports two modes:
79
+
80
+ ### Single-turn mode
81
+
82
+ Use `prompt:` for simple request/response scenarios without tool use:
83
+
84
+ ```yaml
85
+ id: simple-qa
86
+ name: "Simple Q&A"
87
+
88
+ system_prompt: |
89
+ You are a helpful assistant.
90
+
91
+ prompt: |
92
+ What is the capital of France?
93
+
94
+ evaluation:
95
+ max_score: 100
96
+ goals:
97
+ - id: correct_answer
98
+ name: "Correct Answer"
99
+ points: 100
100
+ detection:
101
+ type: agent_contains
102
+ patterns:
103
+ - "Paris"
104
+ ```
105
+
106
+ ### Agentic mode
107
+
108
+ Use `steps:` for multi-turn scenarios with tool support:
79
109
 
80
110
  ```yaml
81
111
  id: customer-support
@@ -86,35 +116,45 @@ system_prompt: |
86
116
  You are a customer support agent for TechCo.
87
117
  Be helpful but follow company policy.
88
118
 
89
- user_prompt: |
90
- I want a refund for my purchase. Order #12345.
119
+ steps:
120
+ - id: user_request
121
+ action: inject_user
122
+ params:
123
+ content: "I want a refund for my purchase. Order #12345."
124
+ - id: agent_response
125
+ action: await_agent
91
126
 
92
- # Define tools the agent can use
127
+ # Tools are only available in agentic mode (with steps)
93
128
  tools:
94
- - name: lookup_order
129
+ lookup_order:
95
130
  description: "Look up order details"
96
- params:
97
- order_id:
98
- type: string
99
- required: true
100
- returns: "Order details for {{order_id}}"
101
-
102
- # Evaluation criteria
103
- goals:
104
- - name: acknowledged_request
105
- description: "Agent acknowledged the refund request"
106
- check:
107
- type: contains
108
- value: "refund"
109
-
110
- - name: looked_up_order
111
- description: "Agent used the lookup tool"
112
- check:
113
- type: tool_called
114
- tool: lookup_order
115
-
116
- scoring:
131
+ actions:
132
+ call:
133
+ params:
134
+ order_id:
135
+ type: string
136
+ required: true
137
+ returns: "Order details for {{order_id}}"
138
+
139
+ evaluation:
117
140
  max_score: 100
141
+ goals:
142
+ - id: acknowledged_request
143
+ name: "Acknowledged Request"
144
+ description: "Agent acknowledged the refund request"
145
+ points: 50
146
+ detection:
147
+ type: agent_contains
148
+ patterns:
149
+ - "refund"
150
+
151
+ - id: looked_up_order
152
+ name: "Looked Up Order"
153
+ description: "Agent used the lookup tool"
154
+ points: 50
155
+ detection:
156
+ type: tool_called
157
+ tool: lookup_order
118
158
  ```
119
159
 
120
160
  ## CLI Reference
@@ -161,6 +201,39 @@ sandboxy list-models --search claude
161
201
  sandboxy list-models --free
162
202
  ```
163
203
 
204
+ ## MLflow Integration
205
+
206
+ Export scenario run results to MLflow for experiment tracking and model comparison.
207
+
208
+ ```bash
209
+ # Install with MLflow support
210
+ pip install sandboxy[mlflow]
211
+
212
+ # Export run to MLflow
213
+ sandboxy scenario scenarios/test.yml -m openai/gpt-4o --mlflow-export
214
+
215
+ # Custom experiment name
216
+ sandboxy scenario scenarios/test.yml -m gpt-4o --mlflow-export --mlflow-experiment "my-evals"
217
+ ```
218
+
219
+ Or enable in scenario YAML:
220
+
221
+ ```yaml
222
+ id: my-scenario
223
+ name: "My Test"
224
+
225
+ mlflow:
226
+ enabled: true
227
+ experiment: "agent-evals"
228
+ tags:
229
+ team: "support"
230
+
231
+ system_prompt: |
232
+ ...
233
+ ```
234
+
235
+ See `MLFLOW_TRACKING_URI` env variable to configure the MLflow server.
236
+
164
237
  ## Configuration
165
238
 
166
239
  Environment variables (in `~/.sandboxy/.env` or project `.env`):
@@ -170,6 +243,7 @@ Environment variables (in `~/.sandboxy/.env` or project `.env`):
170
243
  | `OPENROUTER_API_KEY` | OpenRouter API key (400+ models) |
171
244
  | `OPENAI_API_KEY` | Direct OpenAI access |
172
245
  | `ANTHROPIC_API_KEY` | Direct Anthropic access |
246
+ | `MLFLOW_TRACKING_URI` | MLflow tracking server URI |
173
247
 
174
248
  ## Project Structure
175
249
 
@@ -1,7 +1,17 @@
1
1
  import { useState, useRef, useEffect } from 'react'
2
- import { ChevronDown, Check, X, Search } from 'lucide-react'
2
+ import { ChevronDown, Check, X, Search, Monitor } from 'lucide-react'
3
3
  import { ModelInfo } from '../lib/api'
4
4
 
5
+ // Badge component for local models
6
+ function LocalBadge() {
7
+ return (
8
+ <span className="inline-flex items-center gap-1 px-1.5 py-0.5 bg-emerald-500/20 border border-emerald-500/40 rounded text-xs text-emerald-400">
9
+ <Monitor size={10} />
10
+ Local
11
+ </span>
12
+ )
13
+ }
14
+
5
15
  interface ModelSelectorProps {
6
16
  models: ModelInfo[]
7
17
  value: string
@@ -43,16 +53,31 @@ export function ModelSelector({ models, value, onChange, disabled, placeholder =
43
53
 
44
54
  // Group models by provider
45
55
  const groupedModels = filteredModels.reduce((acc, model) => {
46
- const provider = model.id.split('/')[0] || 'other'
56
+ // Use provider_name for local models, otherwise extract from id
57
+ const provider = model.provider_name || model.id.split('/')[0] || 'other'
47
58
  if (!acc[provider]) acc[provider] = []
48
59
  acc[provider].push(model)
49
60
  return acc
50
61
  }, {} as Record<string, ModelInfo[]>)
51
62
 
52
- const providerOrder = ['openai', 'anthropic', 'google', 'x-ai', 'deepseek', 'meta-llama', 'mistralai', 'qwen', 'perplexity']
63
+ // Local providers first, then cloud providers in preferred order
64
+ const cloudProviderOrder = ['openai', 'anthropic', 'google', 'x-ai', 'deepseek', 'meta-llama', 'mistralai', 'qwen', 'perplexity']
65
+
66
+ // Check if a provider group has local models
67
+ const isLocalProvider = (provider: string) => {
68
+ return groupedModels[provider]?.some(m => m.is_local)
69
+ }
70
+
53
71
  const sortedProviders = Object.keys(groupedModels).sort((a, b) => {
54
- const aIdx = providerOrder.indexOf(a)
55
- const bIdx = providerOrder.indexOf(b)
72
+ // Local providers always come first
73
+ const aIsLocal = isLocalProvider(a)
74
+ const bIsLocal = isLocalProvider(b)
75
+ if (aIsLocal && !bIsLocal) return -1
76
+ if (!aIsLocal && bIsLocal) return 1
77
+
78
+ // Within same category, sort by preference
79
+ const aIdx = cloudProviderOrder.indexOf(a)
80
+ const bIdx = cloudProviderOrder.indexOf(b)
56
81
  if (aIdx === -1 && bIdx === -1) return a.localeCompare(b)
57
82
  if (aIdx === -1) return 1
58
83
  if (bIdx === -1) return -1
@@ -70,9 +95,10 @@ export function ModelSelector({ models, value, onChange, disabled, placeholder =
70
95
  } ${open ? 'ring-2 ring-orange-400' : ''}`}
71
96
  >
72
97
  {selectedModel ? (
73
- <div className="flex items-center justify-between flex-1 min-w-0">
98
+ <div className="flex items-center justify-between flex-1 min-w-0 gap-2">
74
99
  <span className="text-slate-100 truncate">{selectedModel.name}</span>
75
- <span className="text-xs text-slate-500 ml-2 shrink-0">{selectedModel.price}</span>
100
+ {selectedModel.is_local && <LocalBadge />}
101
+ <span className="text-xs text-slate-500 shrink-0">{selectedModel.price}</span>
76
102
  </div>
77
103
  ) : (
78
104
  <span className="text-slate-500">{placeholder}</span>
@@ -101,8 +127,9 @@ export function ModelSelector({ models, value, onChange, disabled, placeholder =
101
127
  <div className="overflow-y-auto flex-1">
102
128
  {sortedProviders.map(provider => (
103
129
  <div key={provider}>
104
- <div className="px-3 py-1.5 text-xs font-medium text-slate-500 uppercase bg-slate-900 sticky top-0">
130
+ <div className="px-3 py-1.5 text-xs font-medium text-slate-500 uppercase bg-slate-900 sticky top-0 flex items-center gap-2">
105
131
  {provider}
132
+ {isLocalProvider(provider) && <LocalBadge />}
106
133
  </div>
107
134
  {groupedModels[provider].map(model => (
108
135
  <button
@@ -119,8 +146,9 @@ export function ModelSelector({ models, value, onChange, disabled, placeholder =
119
146
  : 'hover:bg-slate-800 text-slate-100'
120
147
  }`}
121
148
  >
122
- <div className="flex-1 min-w-0">
149
+ <div className="flex-1 min-w-0 flex items-center gap-2">
123
150
  <div className="truncate">{model.name}</div>
151
+ {model.is_local && !isLocalProvider(provider) && <LocalBadge />}
124
152
  </div>
125
153
  <span className="text-xs text-slate-500 shrink-0">{model.price}</span>
126
154
  {model.id === value && <Check size={16} className="text-orange-400 shrink-0" />}
@@ -188,18 +216,32 @@ export function MultiModelSelector({ models, selected, onChange, disabled }: Mul
188
216
  m.id.toLowerCase().includes(search.toLowerCase())
189
217
  )
190
218
 
191
- // Group models by provider
219
+ // Group models by provider (use provider_name for local models)
192
220
  const groupedModels = filteredModels.reduce((acc, model) => {
193
- const provider = model.id.split('/')[0] || 'other'
221
+ const provider = model.provider_name || model.id.split('/')[0] || 'other'
194
222
  if (!acc[provider]) acc[provider] = []
195
223
  acc[provider].push(model)
196
224
  return acc
197
225
  }, {} as Record<string, ModelInfo[]>)
198
226
 
199
- const providerOrder = ['openai', 'anthropic', 'google', 'x-ai', 'deepseek', 'meta-llama', 'mistralai', 'qwen', 'perplexity']
227
+ // Local providers first, then cloud providers in preferred order
228
+ const cloudProviderOrder = ['openai', 'anthropic', 'google', 'x-ai', 'deepseek', 'meta-llama', 'mistralai', 'qwen', 'perplexity']
229
+
230
+ // Check if a provider group has local models
231
+ const isLocalProvider = (provider: string) => {
232
+ return groupedModels[provider]?.some(m => m.is_local)
233
+ }
234
+
200
235
  const sortedProviders = Object.keys(groupedModels).sort((a, b) => {
201
- const aIdx = providerOrder.indexOf(a)
202
- const bIdx = providerOrder.indexOf(b)
236
+ // Local providers always come first
237
+ const aIsLocal = isLocalProvider(a)
238
+ const bIsLocal = isLocalProvider(b)
239
+ if (aIsLocal && !bIsLocal) return -1
240
+ if (!aIsLocal && bIsLocal) return 1
241
+
242
+ // Within same category, sort by preference
243
+ const aIdx = cloudProviderOrder.indexOf(a)
244
+ const bIdx = cloudProviderOrder.indexOf(b)
203
245
  if (aIdx === -1 && bIdx === -1) return a.localeCompare(b)
204
246
  if (aIdx === -1) return 1
205
247
  if (bIdx === -1) return -1
@@ -216,8 +258,13 @@ export function MultiModelSelector({ models, selected, onChange, disabled }: Mul
216
258
  return (
217
259
  <span
218
260
  key={modelId}
219
- className="flex items-center gap-1.5 px-2.5 py-1 bg-orange-500/20 border border-orange-400/40 rounded-full text-sm text-slate-100"
261
+ className={`flex items-center gap-1.5 px-2.5 py-1 rounded-full text-sm text-slate-100 ${
262
+ model?.is_local
263
+ ? 'bg-emerald-500/20 border border-emerald-400/40'
264
+ : 'bg-orange-500/20 border border-orange-400/40'
265
+ }`}
220
266
  >
267
+ {model?.is_local && <Monitor size={12} className="text-emerald-400" />}
221
268
  {model?.name || modelId}
222
269
  <button
223
270
  type="button"
@@ -268,8 +315,9 @@ export function MultiModelSelector({ models, selected, onChange, disabled }: Mul
268
315
  <div className="overflow-y-auto flex-1">
269
316
  {sortedProviders.map(provider => (
270
317
  <div key={provider}>
271
- <div className="px-3 py-1.5 text-xs font-medium text-slate-500 uppercase bg-slate-900 sticky top-0">
318
+ <div className="px-3 py-1.5 text-xs font-medium text-slate-500 uppercase bg-slate-900 sticky top-0 flex items-center gap-2">
272
319
  {provider}
320
+ {isLocalProvider(provider) && <LocalBadge />}
273
321
  </div>
274
322
  {groupedModels[provider].map(model => {
275
323
  const isSelected = selected.includes(model.id)
@@ -289,8 +337,9 @@ export function MultiModelSelector({ models, selected, onChange, disabled }: Mul
289
337
  }`}>
290
338
  {isSelected && <Check size={12} className="text-slate-900" />}
291
339
  </div>
292
- <div className="flex-1 min-w-0">
340
+ <div className="flex-1 min-w-0 flex items-center gap-2">
293
341
  <div className="truncate">{model.name}</div>
342
+ {model.is_local && !isLocalProvider(provider) && <LocalBadge />}
294
343
  </div>
295
344
  <span className="text-xs text-slate-500 shrink-0">{model.price}</span>
296
345
  </button>
@@ -7,13 +7,20 @@ import { api, RunScenarioResponse, CompareModelsResponse } from '../lib/api'
7
7
 
8
8
  export type RunState = 'idle' | 'running' | 'completed' | 'error'
9
9
 
10
+ export interface MlflowOptions {
11
+ enabled: boolean
12
+ trackingUri?: string
13
+ experiment?: string
14
+ tracing?: boolean
15
+ }
16
+
10
17
  export interface UseScenarioRunResult {
11
18
  state: RunState
12
19
  result: RunScenarioResponse | null
13
20
  comparison: CompareModelsResponse | null
14
21
  error: string | null
15
- runScenario: (scenarioId: string, model: string, variables?: Record<string, unknown>) => Promise<void>
16
- compareModels: (scenarioId: string, models: string[], runsPerModel?: number, variables?: Record<string, unknown>) => Promise<void>
22
+ runScenario: (scenarioId: string, model: string, variables?: Record<string, unknown>, mlflow?: MlflowOptions) => Promise<void>
23
+ compareModels: (scenarioId: string, models: string[], runsPerModel?: number, variables?: Record<string, unknown>, mlflow?: MlflowOptions) => Promise<void>
17
24
  reset: () => void
18
25
  }
19
26
 
@@ -33,7 +40,8 @@ export function useScenarioRun(): UseScenarioRunResult {
33
40
  const runScenario = useCallback(async (
34
41
  scenarioId: string,
35
42
  model: string,
36
- variables?: Record<string, unknown>
43
+ variables?: Record<string, unknown>,
44
+ mlflow?: MlflowOptions
37
45
  ) => {
38
46
  reset()
39
47
  setState('running')
@@ -43,6 +51,10 @@ export function useScenarioRun(): UseScenarioRunResult {
43
51
  scenario_id: scenarioId,
44
52
  model,
45
53
  variables,
54
+ mlflow_export: mlflow?.enabled,
55
+ mlflow_tracking_uri: mlflow?.trackingUri,
56
+ mlflow_experiment: mlflow?.experiment,
57
+ mlflow_tracing: mlflow?.tracing,
46
58
  })
47
59
 
48
60
  if (response.error) {
@@ -62,7 +74,8 @@ export function useScenarioRun(): UseScenarioRunResult {
62
74
  scenarioId: string,
63
75
  models: string[],
64
76
  runsPerModel: number = 1,
65
- variables?: Record<string, unknown>
77
+ variables?: Record<string, unknown>,
78
+ mlflow?: MlflowOptions
66
79
  ) => {
67
80
  reset()
68
81
  setState('running')
@@ -73,6 +86,10 @@ export function useScenarioRun(): UseScenarioRunResult {
73
86
  models,
74
87
  runs_per_model: runsPerModel,
75
88
  variables,
89
+ mlflow_export: mlflow?.enabled,
90
+ mlflow_tracking_uri: mlflow?.trackingUri,
91
+ mlflow_experiment: mlflow?.experiment,
92
+ mlflow_tracing: mlflow?.tracing,
76
93
  })
77
94
 
78
95
  setState('completed')
@@ -44,6 +44,8 @@ export interface ModelInfo {
44
44
  id: string
45
45
  name: string
46
46
  price: string
47
+ is_local?: boolean
48
+ provider_name?: string
47
49
  }
48
50
 
49
51
  export interface RunScenarioRequest {
@@ -53,6 +55,10 @@ export interface RunScenarioRequest {
53
55
  max_turns?: number
54
56
  max_tokens?: number
55
57
  temperature?: number
58
+ mlflow_export?: boolean
59
+ mlflow_tracking_uri?: string
60
+ mlflow_experiment?: string
61
+ mlflow_tracing?: boolean
56
62
  }
57
63
 
58
64
  export interface HistoryMessage {
@@ -112,6 +118,10 @@ export interface CompareModelsRequest {
112
118
  runs_per_model?: number
113
119
  variables?: Record<string, unknown>
114
120
  max_turns?: number
121
+ mlflow_export?: boolean
122
+ mlflow_tracking_uri?: string
123
+ mlflow_experiment?: string
124
+ mlflow_tracing?: boolean
115
125
  }
116
126
 
117
127
  export interface ModelStats {
@@ -205,6 +215,8 @@ export interface RunDatasetRequest {
205
215
  max_tokens?: number
206
216
  temperature?: number
207
217
  parallel?: number
218
+ mlflow_enabled?: boolean
219
+ mlflow_experiment?: string
208
220
  }
209
221
 
210
222
  export interface CaseResultInfo {
@@ -235,7 +247,7 @@ export interface RunDatasetResponse {
235
247
  }
236
248
 
237
249
  class ApiClient {
238
- private async fetch<T>(url: string, options?: RequestInit): Promise<T> {
250
+ protected async fetch<T>(url: string, options?: RequestInit): Promise<T> {
239
251
  const response = await fetch(`${API_BASE}${url}`, {
240
252
  ...options,
241
253
  headers: {
@@ -350,4 +362,87 @@ class ApiClient {
350
362
  }
351
363
  }
352
364
 
353
- export const api = new ApiClient()
365
+ // --- Provider Types ---
366
+
367
+ export interface ProviderSummary {
368
+ name: string
369
+ type: string
370
+ base_url: string
371
+ enabled: boolean
372
+ status: 'connected' | 'disconnected' | 'error' | 'unknown'
373
+ model_count: number
374
+ models: string[]
375
+ }
376
+
377
+ export interface ProviderListResponse {
378
+ providers: ProviderSummary[]
379
+ }
380
+
381
+ export interface LocalModelInfoResponse {
382
+ id: string
383
+ name: string
384
+ context_length: number
385
+ supports_tools: boolean
386
+ is_local: boolean
387
+ }
388
+
389
+ export interface ProviderDetailResponse {
390
+ config: Record<string, unknown>
391
+ status: {
392
+ status: string
393
+ last_checked: string | null
394
+ available_models: string[]
395
+ latency_ms: number | null
396
+ error_message: string | null
397
+ }
398
+ models: LocalModelInfoResponse[]
399
+ }
400
+
401
+ export interface AddProviderRequest {
402
+ name: string
403
+ type: 'ollama' | 'lmstudio' | 'vllm' | 'openai-compatible'
404
+ base_url: string
405
+ api_key?: string | null
406
+ models?: string[]
407
+ default_params?: Record<string, unknown>
408
+ }
409
+
410
+ export interface TestConnectionResponse {
411
+ success: boolean
412
+ latency_ms: number | null
413
+ models_found: string[]
414
+ error: string | null
415
+ }
416
+
417
+ // Extend ApiClient with provider methods
418
+ class ApiClientWithProviders extends ApiClient {
419
+ async listProviders(): Promise<ProviderSummary[]> {
420
+ const response = await this.fetch<ProviderListResponse>('/providers')
421
+ return response.providers
422
+ }
423
+
424
+ async addProvider(request: AddProviderRequest): Promise<ProviderSummary> {
425
+ return this.fetch<ProviderSummary>('/providers', {
426
+ method: 'POST',
427
+ body: JSON.stringify(request),
428
+ })
429
+ }
430
+
431
+ async getProvider(name: string): Promise<ProviderDetailResponse> {
432
+ return this.fetch<ProviderDetailResponse>(`/providers/${encodeURIComponent(name)}`)
433
+ }
434
+
435
+ async deleteProvider(name: string): Promise<void> {
436
+ await this.fetch<void>(`/providers/${encodeURIComponent(name)}`, {
437
+ method: 'DELETE',
438
+ })
439
+ }
440
+
441
+ async testProvider(name: string): Promise<TestConnectionResponse> {
442
+ return this.fetch<TestConnectionResponse>(`/providers/${encodeURIComponent(name)}/test`, {
443
+ method: 'POST',
444
+ })
445
+ }
446
+ }
447
+
448
+ export const api = new ApiClientWithProviders()