sandboxy 0.0.3-py3-none-any.whl → 0.0.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sandboxy/api/routes/local.py +182 -19
- sandboxy/cli/main.py +292 -31
- sandboxy/mlflow/__init__.py +38 -0
- sandboxy/mlflow/artifacts.py +184 -0
- sandboxy/mlflow/config.py +90 -0
- sandboxy/mlflow/exporter.py +439 -0
- sandboxy/mlflow/metrics.py +115 -0
- sandboxy/mlflow/tags.py +140 -0
- sandboxy/mlflow/tracing.py +126 -0
- sandboxy/scenarios/loader.py +44 -2
- sandboxy/scenarios/runner.py +57 -2
- sandboxy/tools/yaml_tools.py +18 -0
- sandboxy/ui/dist/assets/index-CU06wBqc.js +362 -0
- sandboxy/ui/dist/assets/index-Cgg2wY2m.css +1 -0
- sandboxy/ui/dist/index.html +2 -2
- {sandboxy-0.0.3.dist-info → sandboxy-0.0.4.dist-info}/METADATA +37 -1
- {sandboxy-0.0.3.dist-info → sandboxy-0.0.4.dist-info}/RECORD +20 -13
- sandboxy/ui/dist/assets/index-CgAkYWrJ.css +0 -1
- sandboxy/ui/dist/assets/index-D4zoGFcr.js +0 -347
- {sandboxy-0.0.3.dist-info → sandboxy-0.0.4.dist-info}/WHEEL +0 -0
- {sandboxy-0.0.3.dist-info → sandboxy-0.0.4.dist-info}/entry_points.txt +0 -0
- {sandboxy-0.0.3.dist-info → sandboxy-0.0.4.dist-info}/licenses/LICENSE +0 -0
sandboxy/mlflow/exporter.py
ADDED
@@ -0,0 +1,439 @@
"""MLflow exporter for Sandboxy scenario run results.

This module handles exporting scenario run results to MLflow tracking server.
All methods are designed to be resilient - they log warnings on failure but
never raise exceptions that would crash the scenario run.
"""

from __future__ import annotations

import logging
from collections.abc import Generator
from contextlib import contextmanager
from pathlib import Path
from typing import TYPE_CHECKING, Any

from sandboxy.mlflow.artifacts import prepare_artifacts_dir
from sandboxy.mlflow.config import MLflowConfig
from sandboxy.mlflow.metrics import (
    build_goal_metrics,
    build_score_metrics,
    build_timing_metrics,
    build_token_metrics,
)
from sandboxy.mlflow.tags import build_standard_tags

if TYPE_CHECKING:
    from sandboxy.scenarios.unified import RunResult

logger = logging.getLogger(__name__)


@contextmanager
def mlflow_run_context(
    config: MLflowConfig,
    run_name: str | None = None,
) -> Generator[str | None, None, None]:
    """Context manager that starts an MLflow run before scenario execution.

    This allows traces from LLM calls to be attached to the run.
    Use this to wrap scenario execution so traces are connected to the run.

    Args:
        config: MLflow configuration
        run_name: Optional name for the run

    Yields:
        The run ID if successful, None otherwise

    Example:
        with mlflow_run_context(config, run_name="gpt-4o") as run_id:
            # Run scenario here - traces will be attached to this run
            result = run_scenario(...)
        # After context exits, log metrics with exporter.log_to_run(run_id, result)
    """
    if not config.enabled:
        yield None
        return

    try:
        import mlflow

        # Setup tracking
        if config.tracking_uri:
            mlflow.set_tracking_uri(config.tracking_uri)
        if config.experiment:
            mlflow.set_experiment(config.experiment)

        # Start run - traces during this context will be attached
        with mlflow.start_run(run_name=run_name) as run:
            yield run.info.run_id

    except ImportError:
        logger.warning("MLflow not installed")
        yield None
    except Exception as e:
        logger.warning(f"Failed to start MLflow run: {e}")
        yield None


class MLflowExporter:
    """Exports Sandboxy run results to MLflow tracking server.

    This exporter is designed to be resilient - it catches all exceptions
    and logs warnings instead of crashing the scenario run. MLflow failures
    should never prevent local results from being saved.
    """

    def __init__(self, config: MLflowConfig) -> None:
        """Initialize exporter with resolved configuration.

        Args:
            config: Resolved MLflowConfig instance
        """
        self.config = config
        self._mlflow = None  # Lazy import

    def _get_mlflow(self):
        """Lazy import mlflow to avoid import errors when not installed."""
        if self._mlflow is None:
            try:
                import mlflow

                self._mlflow = mlflow
            except ImportError as e:
                logger.warning(f"MLflow not installed: {e}")
                return None
        return self._mlflow

    def log_to_active_run(
        self,
        result: dict[str, Any] | Any,
        scenario_path: Path,
        scenario_name: str,
        scenario_id: str,
        agent_name: str = "default",
        dataset_case: str | None = None,
    ) -> bool:
        """Log results to the currently active MLflow run.

        Use this with mlflow_run_context() to connect traces to the run.
        The run should already be started via the context manager.

        Args:
            result: Completed scenario run result (dict or object)
            scenario_path: Path to scenario YAML file
            scenario_name: Human-readable scenario name
            scenario_id: Unique scenario identifier
            agent_name: Agent configuration name
            dataset_case: Optional dataset case identifier

        Returns:
            True on success, False on failure
        """
        if not self.config.enabled:
            return False

        mlflow = self._get_mlflow()
        if mlflow is None:
            return False

        try:
            self._log_parameters(result, scenario_name, scenario_id)
            self._log_metrics(result)
            self._log_tags(
                result,
                scenario_name,
                scenario_id,
                agent_name,
                dataset_case,
            )
            self._log_artifacts(result, scenario_path, scenario_name)
            return True

        except Exception as e:
            logger.warning(f"Failed to log to MLflow run: {e}")
            return False

    def export(
        self,
        result: RunResult,
        scenario_path: Path,
        scenario_name: str,
        scenario_id: str,
        agent_name: str = "default",
        dataset_case: str | None = None,
    ) -> str | None:
        """Export run result to MLflow (creates a new run).

        NOTE: This creates a NEW run. If you want traces connected to the run,
        use mlflow_run_context() + log_to_active_run() instead.

        Args:
            result: Completed scenario run result
            scenario_path: Path to scenario YAML file
            scenario_name: Human-readable scenario name
            scenario_id: Unique scenario identifier
            agent_name: Agent configuration name
            dataset_case: Optional dataset case identifier

        Returns:
            MLflow run ID on success, None on failure

        Note:
            This method NEVER raises exceptions. All errors are logged
            as warnings and the method returns None.
        """
        if not self.config.enabled:
            return None

        mlflow = self._get_mlflow()
        if mlflow is None:
            return None

        try:
            # Setup tracking
            if not self._setup_tracking():
                return None

            # Start run and log everything
            with mlflow.start_run() as run:
                run_id = run.info.run_id

                self._log_parameters(result, scenario_name, scenario_id)
                self._log_metrics(result)
                self._log_tags(
                    result,
                    scenario_name,
                    scenario_id,
                    agent_name,
                    dataset_case,
                )
                self._log_artifacts(result, scenario_path, scenario_name)

                return run_id

        except Exception as e:
            logger.warning(f"Failed to export to MLflow: {e}")
            return None

    def _setup_tracking(self) -> bool:
        """Configure MLflow tracking URI and experiment.

        Returns:
            True on success, False on failure
        """
        mlflow = self._get_mlflow()
        if mlflow is None:
            return False

        try:
            if self.config.tracking_uri:
                mlflow.set_tracking_uri(self.config.tracking_uri)

            if self.config.experiment:
                mlflow.set_experiment(self.config.experiment)

            return True
        except Exception as e:
            logger.warning(f"Failed to setup MLflow tracking: {e}")
            return False

    def _log_parameters(
        self,
        result: Any,
        scenario_name: str,
        scenario_id: str,
    ) -> None:
        """Log run parameters to MLflow.

        Args:
            result: Run result (RunResult or ScenarioResult or dict)
            scenario_name: Scenario name
            scenario_id: Scenario ID
        """
        mlflow = self._get_mlflow()
        if mlflow is None:
            return

        try:
            # Handle both RunResult and ScenarioResult formats
            if isinstance(result, dict):
                model = result.get("model", "unknown")
            else:
                model = getattr(result, "model", "unknown")

            mlflow.log_params(
                {
                    "scenario_name": scenario_name,
                    "scenario_id": scenario_id,
                    "model": model,
                }
            )
        except Exception as e:
            logger.warning(f"Failed to log parameters to MLflow: {e}")

    def _log_metrics(self, result: Any) -> None:
        """Log all metrics with standardized naming.

        Handles both RunResult (unified) and ScenarioResult (legacy) formats.

        Args:
            result: Run result containing evaluation data
        """
        mlflow = self._get_mlflow()
        if mlflow is None:
            return

        try:
            metrics: dict[str, float] = {}

            # Handle dict format
            if isinstance(result, dict):
                evaluation = result.get("evaluation")
                if evaluation:
                    goals = evaluation.get("goals", [])
                    if goals:
                        metrics.update(build_goal_metrics(goals))
                    metrics.update(
                        build_score_metrics(
                            evaluation.get("total_score", 0),
                            evaluation.get("max_score", 0),
                            evaluation.get("percentage", 0),
                        )
                    )
                # Legacy format - direct score
                elif "score" in result:
                    metrics["score_total"] = float(result.get("score", 0))

                if result.get("latency_ms"):
                    metrics.update(build_timing_metrics(result["latency_ms"]))
                if result.get("input_tokens") or result.get("output_tokens"):
                    metrics.update(
                        build_token_metrics(
                            result.get("input_tokens", 0),
                            result.get("output_tokens", 0),
                        )
                    )
            else:
                # Handle object format (RunResult or ScenarioResult)
                evaluation = getattr(result, "evaluation", None)
                if evaluation:
                    goals = getattr(evaluation, "goals", None)
                    if goals:
                        metrics.update(build_goal_metrics(goals))
                    metrics.update(
                        build_score_metrics(
                            getattr(evaluation, "total_score", 0),
                            getattr(evaluation, "max_score", 0),
                            getattr(evaluation, "percentage", 0),
                        )
                    )
                else:
                    # ScenarioResult format - goal_results and score at top level
                    goal_results = getattr(result, "goal_results", None)
                    if goal_results:
                        metrics.update(build_goal_metrics(goal_results))

                    score = getattr(result, "score", 0) or 0
                    max_score = getattr(result, "max_score", 0) or 0
                    percentage = (score / max_score * 100) if max_score > 0 else 0
                    metrics.update(build_score_metrics(score, max_score, percentage))

                latency = getattr(result, "latency_ms", None)
                if latency:
                    metrics.update(build_timing_metrics(latency))

                input_tokens = getattr(result, "input_tokens", 0) or 0
                output_tokens = getattr(result, "output_tokens", 0) or 0
                if input_tokens or output_tokens:
                    metrics.update(build_token_metrics(input_tokens, output_tokens))

            if metrics:
                mlflow.log_metrics(metrics)

        except Exception as e:
            logger.warning(f"Failed to log metrics to MLflow: {e}")

    def _log_tags(
        self,
        result: Any,
        scenario_name: str,
        scenario_id: str,
        agent_name: str = "default",
        dataset_case: str | None = None,
    ) -> None:
        """Log all tags including custom tags from config.

        Args:
            result: Run result (RunResult, ScenarioResult, or dict)
            scenario_name: Scenario name
            scenario_id: Scenario ID
            agent_name: Agent configuration name
            dataset_case: Optional dataset case identifier
        """
        mlflow = self._get_mlflow()
        if mlflow is None:
            return

        try:
            # Build standard tags
            tags = build_standard_tags(
                result=result,
                scenario_name=scenario_name,
                scenario_id=scenario_id,
                agent_name=agent_name,
                dataset_case=dataset_case,
            )

            # Merge custom tags from config (config tags take precedence)
            tags.update(self.config.tags)

            mlflow.set_tags(tags)

        except Exception as e:
            logger.warning(f"Failed to log tags to MLflow: {e}")

    def _log_artifacts(
        self,
        result: Any,
        scenario_path: Path,
        scenario_name: str,
    ) -> None:
        """Generate and upload artifacts.

        Uploads:
        - scenario.yaml: Original scenario file
        - conversation.json: Full message history
        - summary.txt: Human-readable summary

        Args:
            result: Run result
            scenario_path: Path to scenario YAML
            scenario_name: Scenario name
        """
        mlflow = self._get_mlflow()
        if mlflow is None:
            return

        artifacts_dir = None
        try:
            # Prepare artifacts directory
            artifacts_dir = prepare_artifacts_dir(
                result=result,
                scenario_path=scenario_path,
                scenario_name=scenario_name,
            )

            # Log all artifacts
            mlflow.log_artifacts(str(artifacts_dir))

        except Exception as e:
            logger.warning(f"Failed to log artifacts to MLflow: {e}")

        finally:
            # Cleanup temp directory
            if artifacts_dir and artifacts_dir.exists():
                import shutil

                shutil.rmtree(artifacts_dir, ignore_errors=True)
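The context manager and the exporter above are intended to be used as a pair: start the run, execute the scenario inside it so LLM traces attach to it, then log parameters, metrics, tags, and artifacts while the run is still active. Below is a minimal, illustrative sketch of that flow; how the MLflowConfig is obtained and the run_my_scenario() helper are hypothetical placeholders, not APIs confirmed by this diff (the exporter only shows that it reads config.enabled, config.tracking_uri, config.experiment, and config.tags).

# Illustrative sketch only - not part of the wheel contents.
from pathlib import Path

from sandboxy.mlflow.config import MLflowConfig
from sandboxy.mlflow.exporter import MLflowExporter, mlflow_run_context

config: MLflowConfig = ...  # hypothetical: obtain a resolved MLflowConfig
exporter = MLflowExporter(config)
scenario_path = Path("scenarios/example.yaml")  # hypothetical path

with mlflow_run_context(config, run_name="gpt-4o-baseline") as run_id:
    result = run_my_scenario(scenario_path)  # hypothetical scenario execution
    if run_id is not None:
        # Logs params, metrics, tags, and artifacts to the still-active run;
        # returns False instead of raising if anything goes wrong.
        exporter.log_to_active_run(
            result=result,
            scenario_path=scenario_path,
            scenario_name="Example scenario",
            scenario_id="example-scenario",
        )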
sandboxy/mlflow/metrics.py
ADDED
@@ -0,0 +1,115 @@
"""Metric naming helpers for MLflow integration.

Standardized naming conventions:
- goal_{goal_name}: Individual goal scores (0.0-1.0)
- timing_{phase}_ms: Timing metrics in milliseconds
- tokens_{type}: Token counts (input, output, total)
- score_{category}: Aggregate scores
"""

from __future__ import annotations

import re


def sanitize_metric_name(name: str) -> str:
    """Sanitize a metric name for MLflow compatibility.

    - Converts to lowercase
    - Replaces spaces and special characters with underscores
    - Removes leading/trailing underscores
    - Collapses multiple underscores

    Args:
        name: Raw metric name

    Returns:
        Sanitized metric name safe for MLflow
    """
    # Lowercase
    result = name.lower()
    # Replace spaces and special chars with underscores
    result = re.sub(r"[^a-z0-9_]", "_", result)
    # Collapse multiple underscores
    result = re.sub(r"_+", "_", result)
    # Remove leading/trailing underscores
    result = result.strip("_")
    return result or "unnamed"


def build_goal_metrics(goals: list) -> dict[str, float]:
    """Build metric dict from goal results.

    Args:
        goals: List of GoalResult objects from scenario evaluation

    Returns:
        Dict mapping goal_{goal_name} to score value
    """
    metrics: dict[str, float] = {}
    for goal in goals:
        # Handle both object (GoalResult) and dict formats
        if isinstance(goal, dict):
            name = goal.get("name", goal.get("id", "unnamed"))
            score = goal.get("score")
            if score is None:
                score = 1.0 if goal.get("achieved", False) else 0.0
        else:
            name = getattr(goal, "name", getattr(goal, "id", "unnamed"))
            score = getattr(goal, "score", None)
            if score is None:
                score = 1.0 if getattr(goal, "achieved", False) else 0.0
        key = f"goal_{sanitize_metric_name(name)}"
        metrics[key] = float(score)
    return metrics


def build_timing_metrics(latency_ms: int) -> dict[str, float]:
    """Build timing metric dict.

    Args:
        latency_ms: Total execution time in milliseconds

    Returns:
        Dict with timing_total_ms metric
    """
    return {"timing_total_ms": float(latency_ms)}


def build_token_metrics(input_tokens: int, output_tokens: int) -> dict[str, float]:
    """Build token metric dict.

    Args:
        input_tokens: Number of input tokens
        output_tokens: Number of output tokens

    Returns:
        Dict with tokens_input, tokens_output, tokens_total metrics
    """
    return {
        "tokens_input": float(input_tokens),
        "tokens_output": float(output_tokens),
        "tokens_total": float(input_tokens + output_tokens),
    }


def build_score_metrics(
    total_score: float,
    max_score: float,
    percentage: float,
) -> dict[str, float]:
    """Build score metric dict.

    Args:
        total_score: Sum of achieved scores
        max_score: Maximum possible score
        percentage: Percentage achieved (0-100)

    Returns:
        Dict with score_total, score_max, score_percentage metrics
    """
    return {
        "score_total": total_score,
        "score_max": max_score,
        "score_percentage": percentage,
    }
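For reference, a small worked example of the metric names these helpers produce; the goal dict below is an invented input that matches the dict shape build_goal_metrics accepts (name / score / achieved keys).

from sandboxy.mlflow.metrics import (
    build_goal_metrics,
    build_score_metrics,
    build_token_metrics,
    sanitize_metric_name,
)

sanitize_metric_name("Refund Issued!")
# -> "refund_issued"
build_goal_metrics([{"name": "Refund Issued!", "achieved": True}])
# -> {"goal_refund_issued": 1.0}
build_score_metrics(2.0, 3.0, 66.7)
# -> {"score_total": 2.0, "score_max": 3.0, "score_percentage": 66.7}
build_token_metrics(1200, 350)
# -> {"tokens_input": 1200.0, "tokens_output": 350.0, "tokens_total": 1550.0}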
sandboxy/mlflow/tags.py
ADDED
@@ -0,0 +1,140 @@
"""Tag helpers for MLflow integration.

Standard tags applied to every run:
- sandboxy_version: Package version
- scenario_name: Human-readable scenario name
- scenario_id: Unique scenario identifier
- model_name: Full model name (e.g., openai/gpt-4o)
- model_provider: Provider extracted from model name
- status: success or failed
- agent_name: Agent configuration name

Optional tags:
- commit_hash: Git commit hash (if in repo)
- dataset_case: Dataset case identifier (for benchmarks)
"""

from __future__ import annotations

import subprocess
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from sandboxy.scenarios.unified import RunResult


def get_sandboxy_version() -> str:
    """Get the current Sandboxy package version.

    Returns:
        Version string or "unknown" if not available
    """
    try:
        from sandboxy import __version__

        return __version__
    except (ImportError, AttributeError):
        # Fallback: try to read from package metadata
        try:
            from importlib.metadata import version

            return version("sandboxy")
        except Exception:
            return "unknown"


def parse_model_provider(model: str) -> str:
    """Extract provider from model string.

    Args:
        model: Model identifier (e.g., "openai/gpt-4o", "anthropic/claude-3")

    Returns:
        Provider name or "unknown" if no provider prefix

    Examples:
        >>> parse_model_provider("openai/gpt-4o")
        'openai'
        >>> parse_model_provider("anthropic/claude-3")
        'anthropic'
        >>> parse_model_provider("gpt-4o")
        'unknown'
    """
    if "/" in model:
        return model.split("/", 1)[0]
    return "unknown"


def get_commit_hash() -> str | None:
    """Get current git commit hash (short form).

    Returns:
        8-character commit hash or None if not in git repo
    """
    try:
        result = subprocess.run(
            ["git", "rev-parse", "HEAD"],
            capture_output=True,
            text=True,
            timeout=5,
            check=False,
        )
        if result.returncode == 0:
            return result.stdout.strip()[:8]
    except Exception:
        pass
    return None


def build_standard_tags(
    result: RunResult | dict | object,
    scenario_name: str,
    scenario_id: str,
    agent_name: str = "default",
    dataset_case: str | None = None,
) -> dict[str, str]:
    """Build standard tag dict for an MLflow run.

    Handles both RunResult (unified) and ScenarioResult (legacy) formats.

    Args:
        result: Run result from scenario execution (any format)
        scenario_name: Human-readable scenario name
        scenario_id: Unique scenario identifier
        agent_name: Agent configuration name (default: "default")
        dataset_case: Optional dataset case identifier

    Returns:
        Dict of tag name to tag value
    """
    # Extract fields from various result formats
    if isinstance(result, dict):
        error = result.get("error")
        model = result.get("model", "unknown")
    else:
        error = getattr(result, "error", None)
        model = getattr(result, "model", None) or agent_name

    # Determine status
    status = "failed" if error else "success"

    tags = {
        "sandboxy_version": get_sandboxy_version(),
        "scenario_name": scenario_name,
        "scenario_id": scenario_id,
        "model_name": str(model),
        "model_provider": parse_model_provider(str(model)),
        "status": status,
        "agent_name": agent_name,
    }

    # Optional: commit hash
    commit_hash = get_commit_hash()
    if commit_hash:
        tags["commit_hash"] = commit_hash

    # Optional: dataset case
    if dataset_case:
        tags["dataset_case"] = dataset_case

    return tags
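As an illustration of the resulting tag set, here is a hypothetical call and its approximate output; all input values are invented, and commit_hash is only added when the process runs inside a git repository.

from sandboxy.mlflow.tags import build_standard_tags

tags = build_standard_tags(
    result={"model": "openai/gpt-4o", "error": None},
    scenario_name="Example scenario",
    scenario_id="example-scenario",
    agent_name="support-agent",
    dataset_case="case-07",
)
# Approximately:
# {"sandboxy_version": "0.0.4", "scenario_name": "Example scenario",
#  "scenario_id": "example-scenario", "model_name": "openai/gpt-4o",
#  "model_provider": "openai", "status": "success",
#  "agent_name": "support-agent", "dataset_case": "case-07"}
# plus "commit_hash": "<8-char sha>" when inside a git repo.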