sandboxy 0.0.3-py3-none-any.whl → 0.0.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,439 @@
+"""MLflow exporter for Sandboxy scenario run results.
+
+This module handles exporting scenario run results to MLflow tracking server.
+All methods are designed to be resilient - they log warnings on failure but
+never raise exceptions that would crash the scenario run.
+"""
+
+from __future__ import annotations
+
+import logging
+from collections.abc import Generator
+from contextlib import contextmanager
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+from sandboxy.mlflow.artifacts import prepare_artifacts_dir
+from sandboxy.mlflow.config import MLflowConfig
+from sandboxy.mlflow.metrics import (
+    build_goal_metrics,
+    build_score_metrics,
+    build_timing_metrics,
+    build_token_metrics,
+)
+from sandboxy.mlflow.tags import build_standard_tags
+
+if TYPE_CHECKING:
+    from sandboxy.scenarios.unified import RunResult
+
+logger = logging.getLogger(__name__)
+
+
+@contextmanager
+def mlflow_run_context(
+    config: MLflowConfig,
+    run_name: str | None = None,
+) -> Generator[str | None, None, None]:
+    """Context manager that starts an MLflow run before scenario execution.
+
+    This allows traces from LLM calls to be attached to the run.
+    Use this to wrap scenario execution so traces are connected to the run.
+
+    Args:
+        config: MLflow configuration
+        run_name: Optional name for the run
+
+    Yields:
+        The run ID if successful, None otherwise
+
+    Example:
+        with mlflow_run_context(config, run_name="gpt-4o") as run_id:
+            # Run scenario here - traces will be attached to this run
+            result = run_scenario(...)
+            # Log metrics while the run is still active, e.g. exporter.log_to_active_run(result, ...)
+    """
+    if not config.enabled:
+        yield None
+        return
+
+    try:
+        import mlflow
+
+        # Setup tracking
+        if config.tracking_uri:
+            mlflow.set_tracking_uri(config.tracking_uri)
+        if config.experiment:
+            mlflow.set_experiment(config.experiment)
+
+        # Start run - traces during this context will be attached
+        with mlflow.start_run(run_name=run_name) as run:
+            yield run.info.run_id
+
+    except ImportError:
+        logger.warning("MLflow not installed")
+        yield None
+    except Exception as e:
+        logger.warning(f"Failed to start MLflow run: {e}")
+        yield None
+
+
+class MLflowExporter:
+    """Exports Sandboxy run results to MLflow tracking server.
+
+    This exporter is designed to be resilient - it catches all exceptions
+    and logs warnings instead of crashing the scenario run. MLflow failures
+    should never prevent local results from being saved.
+    """
+
+    def __init__(self, config: MLflowConfig) -> None:
+        """Initialize exporter with resolved configuration.
+
+        Args:
+            config: Resolved MLflowConfig instance
+        """
+        self.config = config
+        self._mlflow = None  # Lazy import
+
+    def _get_mlflow(self):
+        """Lazy import mlflow to avoid import errors when not installed."""
+        if self._mlflow is None:
+            try:
+                import mlflow
+
+                self._mlflow = mlflow
+            except ImportError as e:
+                logger.warning(f"MLflow not installed: {e}")
+                return None
+        return self._mlflow
+
+    def log_to_active_run(
+        self,
+        result: dict[str, Any] | Any,
+        scenario_path: Path,
+        scenario_name: str,
+        scenario_id: str,
+        agent_name: str = "default",
+        dataset_case: str | None = None,
+    ) -> bool:
+        """Log results to the currently active MLflow run.
+
+        Use this with mlflow_run_context() to connect traces to the run.
+        The run should already be started via the context manager.
+
+        Args:
+            result: Completed scenario run result (dict or object)
+            scenario_path: Path to scenario YAML file
+            scenario_name: Human-readable scenario name
+            scenario_id: Unique scenario identifier
+            agent_name: Agent configuration name
+            dataset_case: Optional dataset case identifier
+
+        Returns:
+            True on success, False on failure
+        """
+        if not self.config.enabled:
+            return False
+
+        mlflow = self._get_mlflow()
+        if mlflow is None:
+            return False
+
+        try:
+            self._log_parameters(result, scenario_name, scenario_id)
+            self._log_metrics(result)
+            self._log_tags(
+                result,
+                scenario_name,
+                scenario_id,
+                agent_name,
+                dataset_case,
+            )
+            self._log_artifacts(result, scenario_path, scenario_name)
+            return True
+
+        except Exception as e:
+            logger.warning(f"Failed to log to MLflow run: {e}")
+            return False
+
+    def export(
+        self,
+        result: RunResult,
+        scenario_path: Path,
+        scenario_name: str,
+        scenario_id: str,
+        agent_name: str = "default",
+        dataset_case: str | None = None,
+    ) -> str | None:
+        """Export run result to MLflow (creates a new run).
+
+        NOTE: This creates a NEW run. If you want traces connected to the run,
+        use mlflow_run_context() + log_to_active_run() instead.
+
+        Args:
+            result: Completed scenario run result
+            scenario_path: Path to scenario YAML file
+            scenario_name: Human-readable scenario name
+            scenario_id: Unique scenario identifier
+            agent_name: Agent configuration name
+            dataset_case: Optional dataset case identifier
+
+        Returns:
+            MLflow run ID on success, None on failure
+
+        Note:
+            This method NEVER raises exceptions. All errors are logged
+            as warnings and the method returns None.
+        """
+        if not self.config.enabled:
+            return None
+
+        mlflow = self._get_mlflow()
+        if mlflow is None:
+            return None
+
+        try:
+            # Setup tracking
+            if not self._setup_tracking():
+                return None
+
+            # Start run and log everything
+            with mlflow.start_run() as run:
+                run_id = run.info.run_id
+
+                self._log_parameters(result, scenario_name, scenario_id)
+                self._log_metrics(result)
+                self._log_tags(
+                    result,
+                    scenario_name,
+                    scenario_id,
+                    agent_name,
+                    dataset_case,
+                )
+                self._log_artifacts(result, scenario_path, scenario_name)
+
+            return run_id
+
+        except Exception as e:
+            logger.warning(f"Failed to export to MLflow: {e}")
+            return None
+
+    def _setup_tracking(self) -> bool:
+        """Configure MLflow tracking URI and experiment.
+
+        Returns:
+            True on success, False on failure
+        """
+        mlflow = self._get_mlflow()
+        if mlflow is None:
+            return False
+
+        try:
+            if self.config.tracking_uri:
+                mlflow.set_tracking_uri(self.config.tracking_uri)
+
+            if self.config.experiment:
+                mlflow.set_experiment(self.config.experiment)
+
+            return True
+        except Exception as e:
+            logger.warning(f"Failed to setup MLflow tracking: {e}")
+            return False
+
+    def _log_parameters(
+        self,
+        result: Any,
+        scenario_name: str,
+        scenario_id: str,
+    ) -> None:
+        """Log run parameters to MLflow.
+
+        Args:
+            result: Run result (RunResult or ScenarioResult or dict)
+            scenario_name: Scenario name
+            scenario_id: Scenario ID
+        """
+        mlflow = self._get_mlflow()
+        if mlflow is None:
+            return
+
+        try:
+            # Handle both RunResult and ScenarioResult formats
+            if isinstance(result, dict):
+                model = result.get("model", "unknown")
+            else:
+                model = getattr(result, "model", "unknown")
+
+            mlflow.log_params(
+                {
+                    "scenario_name": scenario_name,
+                    "scenario_id": scenario_id,
+                    "model": model,
+                }
+            )
+        except Exception as e:
+            logger.warning(f"Failed to log parameters to MLflow: {e}")
+
+    def _log_metrics(self, result: Any) -> None:
+        """Log all metrics with standardized naming.
+
+        Handles both RunResult (unified) and ScenarioResult (legacy) formats.
+
+        Args:
+            result: Run result containing evaluation data
+        """
+        mlflow = self._get_mlflow()
+        if mlflow is None:
+            return
+
+        try:
+            metrics: dict[str, float] = {}
+
+            # Handle dict format
+            if isinstance(result, dict):
+                evaluation = result.get("evaluation")
+                if evaluation:
+                    goals = evaluation.get("goals", [])
+                    if goals:
+                        metrics.update(build_goal_metrics(goals))
+                    metrics.update(
+                        build_score_metrics(
+                            evaluation.get("total_score", 0),
+                            evaluation.get("max_score", 0),
+                            evaluation.get("percentage", 0),
+                        )
+                    )
+                # Legacy format - direct score
+                elif "score" in result:
+                    metrics["score_total"] = float(result.get("score", 0))
+
+                if result.get("latency_ms"):
+                    metrics.update(build_timing_metrics(result["latency_ms"]))
+                if result.get("input_tokens") or result.get("output_tokens"):
+                    metrics.update(
+                        build_token_metrics(
+                            result.get("input_tokens", 0),
+                            result.get("output_tokens", 0),
+                        )
+                    )
+            else:
+                # Handle object format (RunResult or ScenarioResult)
+                evaluation = getattr(result, "evaluation", None)
+                if evaluation:
+                    goals = getattr(evaluation, "goals", None)
+                    if goals:
+                        metrics.update(build_goal_metrics(goals))
+                    metrics.update(
+                        build_score_metrics(
+                            getattr(evaluation, "total_score", 0),
+                            getattr(evaluation, "max_score", 0),
+                            getattr(evaluation, "percentage", 0),
+                        )
+                    )
+                else:
+                    # ScenarioResult format - goal_results and score at top level
+                    goal_results = getattr(result, "goal_results", None)
+                    if goal_results:
+                        metrics.update(build_goal_metrics(goal_results))
+
+                    score = getattr(result, "score", 0) or 0
+                    max_score = getattr(result, "max_score", 0) or 0
+                    percentage = (score / max_score * 100) if max_score > 0 else 0
+                    metrics.update(build_score_metrics(score, max_score, percentage))
+
+                latency = getattr(result, "latency_ms", None)
+                if latency:
+                    metrics.update(build_timing_metrics(latency))
+
+                input_tokens = getattr(result, "input_tokens", 0) or 0
+                output_tokens = getattr(result, "output_tokens", 0) or 0
+                if input_tokens or output_tokens:
+                    metrics.update(build_token_metrics(input_tokens, output_tokens))
+
+            if metrics:
+                mlflow.log_metrics(metrics)
+
+        except Exception as e:
+            logger.warning(f"Failed to log metrics to MLflow: {e}")
+
+    def _log_tags(
+        self,
+        result: Any,
+        scenario_name: str,
+        scenario_id: str,
+        agent_name: str = "default",
+        dataset_case: str | None = None,
+    ) -> None:
+        """Log all tags including custom tags from config.
+
+        Args:
+            result: Run result (RunResult, ScenarioResult, or dict)
+            scenario_name: Scenario name
+            scenario_id: Scenario ID
+            agent_name: Agent configuration name
+            dataset_case: Optional dataset case identifier
+        """
+        mlflow = self._get_mlflow()
+        if mlflow is None:
+            return
+
+        try:
+            # Build standard tags
+            tags = build_standard_tags(
+                result=result,
+                scenario_name=scenario_name,
+                scenario_id=scenario_id,
+                agent_name=agent_name,
+                dataset_case=dataset_case,
+            )
+
+            # Merge custom tags from config (config tags take precedence)
+            tags.update(self.config.tags)
+
+            mlflow.set_tags(tags)
+
+        except Exception as e:
+            logger.warning(f"Failed to log tags to MLflow: {e}")
+
+    def _log_artifacts(
+        self,
+        result: Any,
+        scenario_path: Path,
+        scenario_name: str,
+    ) -> None:
+        """Generate and upload artifacts.
+
+        Uploads:
+        - scenario.yaml: Original scenario file
+        - conversation.json: Full message history
+        - summary.txt: Human-readable summary
+
+        Args:
+            result: Run result
+            scenario_path: Path to scenario YAML
+            scenario_name: Scenario name
+        """
+        mlflow = self._get_mlflow()
+        if mlflow is None:
+            return
+
+        artifacts_dir = None
+        try:
+            # Prepare artifacts directory
+            artifacts_dir = prepare_artifacts_dir(
+                result=result,
+                scenario_path=scenario_path,
+                scenario_name=scenario_name,
+            )
+
+            # Log all artifacts
+            mlflow.log_artifacts(str(artifacts_dir))
+
+        except Exception as e:
+            logger.warning(f"Failed to log artifacts to MLflow: {e}")
+
+        finally:
+            # Cleanup temp directory
+            if artifacts_dir and artifacts_dir.exists():
+                import shutil
+
+                shutil.rmtree(artifacts_dir, ignore_errors=True)
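
Taken together, the context manager and the exporter are intended to be used roughly as follows. This is an illustrative sketch rather than package content: the sandboxy.mlflow.export module path and the run_scenario helper are assumed names (the diff does not show file paths), and the MLflowConfig instance is whatever the surrounding application has already resolved.

    from pathlib import Path

    from sandboxy.mlflow.export import MLflowExporter, mlflow_run_context  # module path assumed

    config = ...  # a resolved MLflowConfig (enabled, tracking_uri, experiment, tags)
    exporter = MLflowExporter(config)

    with mlflow_run_context(config, run_name="gpt-4o") as run_id:
        # LLM traces emitted inside this block attach to the active run.
        result = run_scenario(...)  # placeholder for scenario execution

        # Log params, metrics, tags, and artifacts while the run is still active.
        exporter.log_to_active_run(
            result=result,
            scenario_path=Path("scenarios/example.yaml"),
            scenario_name="Example scenario",
            scenario_id="example-scenario",
        )

Because both the context manager and the exporter swallow MLflow errors, a missing or misconfigured tracking server degrades to warnings: run_id is None and log_to_active_run returns False, but the scenario itself still runs.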
@@ -0,0 +1,115 @@
+"""Metric naming helpers for MLflow integration.
+
+Standardized naming conventions:
+- goal_{goal_name}: Individual goal scores (0.0-1.0)
+- timing_{phase}_ms: Timing metrics in milliseconds
+- tokens_{type}: Token counts (input, output, total)
+- score_{category}: Aggregate scores
+"""
+
+from __future__ import annotations
+
+import re
+
+
+def sanitize_metric_name(name: str) -> str:
+    """Sanitize a metric name for MLflow compatibility.
+
+    - Converts to lowercase
+    - Replaces spaces and special characters with underscores
+    - Removes leading/trailing underscores
+    - Collapses multiple underscores
+
+    Args:
+        name: Raw metric name
+
+    Returns:
+        Sanitized metric name safe for MLflow
+    """
+    # Lowercase
+    result = name.lower()
+    # Replace spaces and special chars with underscores
+    result = re.sub(r"[^a-z0-9_]", "_", result)
+    # Collapse multiple underscores
+    result = re.sub(r"_+", "_", result)
+    # Remove leading/trailing underscores
+    result = result.strip("_")
+    return result or "unnamed"
+
+
+def build_goal_metrics(goals: list) -> dict[str, float]:
+    """Build metric dict from goal results.
+
+    Args:
+        goals: List of GoalResult objects from scenario evaluation
+
+    Returns:
+        Dict mapping goal_{goal_name} to score value
+    """
+    metrics: dict[str, float] = {}
+    for goal in goals:
+        # Handle both object (GoalResult) and dict formats
+        if isinstance(goal, dict):
+            name = goal.get("name", goal.get("id", "unnamed"))
+            score = goal.get("score")
+            if score is None:
+                score = 1.0 if goal.get("achieved", False) else 0.0
+        else:
+            name = getattr(goal, "name", getattr(goal, "id", "unnamed"))
+            score = getattr(goal, "score", None)
+            if score is None:
+                score = 1.0 if getattr(goal, "achieved", False) else 0.0
+        key = f"goal_{sanitize_metric_name(name)}"
+        metrics[key] = float(score)
+    return metrics
+
+
+def build_timing_metrics(latency_ms: int) -> dict[str, float]:
+    """Build timing metric dict.
+
+    Args:
+        latency_ms: Total execution time in milliseconds
+
+    Returns:
+        Dict with timing_total_ms metric
+    """
+    return {"timing_total_ms": float(latency_ms)}
+
+
+def build_token_metrics(input_tokens: int, output_tokens: int) -> dict[str, float]:
+    """Build token metric dict.
+
+    Args:
+        input_tokens: Number of input tokens
+        output_tokens: Number of output tokens
+
+    Returns:
+        Dict with tokens_input, tokens_output, tokens_total metrics
+    """
+    return {
+        "tokens_input": float(input_tokens),
+        "tokens_output": float(output_tokens),
+        "tokens_total": float(input_tokens + output_tokens),
+    }
+
+
+def build_score_metrics(
+    total_score: float,
+    max_score: float,
+    percentage: float,
+) -> dict[str, float]:
+    """Build score metric dict.
+
+    Args:
+        total_score: Sum of achieved scores
+        max_score: Maximum possible score
+        percentage: Percentage achieved (0-100)
+
+    Returns:
+        Dict with score_total, score_max, score_percentage metrics
+    """
+    return {
+        "score_total": total_score,
+        "score_max": max_score,
+        "score_percentage": percentage,
+    }
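
For reference, the naming scheme plays out like this; a small self-contained example using the helpers under the sandboxy.mlflow.metrics path declared in the exporter's imports (the input values are illustrative):

    from sandboxy.mlflow.metrics import (
        build_goal_metrics,
        build_score_metrics,
        build_token_metrics,
        sanitize_metric_name,
    )

    sanitize_metric_name("Refund Issued!")
    # -> "refund_issued"

    build_goal_metrics([{"name": "Refund Issued", "achieved": True}])
    # -> {"goal_refund_issued": 1.0}

    build_score_metrics(2.0, 3.0, 66.7)
    # -> {"score_total": 2.0, "score_max": 3.0, "score_percentage": 66.7}

    build_token_metrics(1200, 350)
    # -> {"tokens_input": 1200.0, "tokens_output": 350.0, "tokens_total": 1550.0}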
@@ -0,0 +1,140 @@
+"""Tag helpers for MLflow integration.
+
+Standard tags applied to every run:
+- sandboxy_version: Package version
+- scenario_name: Human-readable scenario name
+- scenario_id: Unique scenario identifier
+- model_name: Full model name (e.g., openai/gpt-4o)
+- model_provider: Provider extracted from model name
+- status: success or failed
+- agent_name: Agent configuration name
+
+Optional tags:
+- commit_hash: Git commit hash (if in repo)
+- dataset_case: Dataset case identifier (for benchmarks)
+"""
+
+from __future__ import annotations
+
+import subprocess
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from sandboxy.scenarios.unified import RunResult
+
+
+def get_sandboxy_version() -> str:
+    """Get the current Sandboxy package version.
+
+    Returns:
+        Version string or "unknown" if not available
+    """
+    try:
+        from sandboxy import __version__
+
+        return __version__
+    except (ImportError, AttributeError):
+        # Fallback: try to read from package metadata
+        try:
+            from importlib.metadata import version
+
+            return version("sandboxy")
+        except Exception:
+            return "unknown"
+
+
+def parse_model_provider(model: str) -> str:
+    """Extract provider from model string.
+
+    Args:
+        model: Model identifier (e.g., "openai/gpt-4o", "anthropic/claude-3")
+
+    Returns:
+        Provider name or "unknown" if no provider prefix
+
+    Examples:
+        >>> parse_model_provider("openai/gpt-4o")
+        'openai'
+        >>> parse_model_provider("anthropic/claude-3")
+        'anthropic'
+        >>> parse_model_provider("gpt-4o")
+        'unknown'
+    """
+    if "/" in model:
+        return model.split("/", 1)[0]
+    return "unknown"
+
+
+def get_commit_hash() -> str | None:
+    """Get current git commit hash (short form).
+
+    Returns:
+        8-character commit hash or None if not in git repo
+    """
+    try:
+        result = subprocess.run(
+            ["git", "rev-parse", "HEAD"],
+            capture_output=True,
+            text=True,
+            timeout=5,
+            check=False,
+        )
+        if result.returncode == 0:
+            return result.stdout.strip()[:8]
+    except Exception:
+        pass
+    return None
+
+
+def build_standard_tags(
+    result: RunResult | dict | object,
+    scenario_name: str,
+    scenario_id: str,
+    agent_name: str = "default",
+    dataset_case: str | None = None,
+) -> dict[str, str]:
+    """Build standard tag dict for an MLflow run.
+
+    Handles both RunResult (unified) and ScenarioResult (legacy) formats.
+
+    Args:
+        result: Run result from scenario execution (any format)
+        scenario_name: Human-readable scenario name
+        scenario_id: Unique scenario identifier
+        agent_name: Agent configuration name (default: "default")
+        dataset_case: Optional dataset case identifier
+
+    Returns:
+        Dict of tag name to tag value
+    """
+    # Extract fields from various result formats
+    if isinstance(result, dict):
+        error = result.get("error")
+        model = result.get("model", "unknown")
+    else:
+        error = getattr(result, "error", None)
+        model = getattr(result, "model", None) or agent_name
+
+    # Determine status
+    status = "failed" if error else "success"
+
+    tags = {
+        "sandboxy_version": get_sandboxy_version(),
+        "scenario_name": scenario_name,
+        "scenario_id": scenario_id,
+        "model_name": str(model),
+        "model_provider": parse_model_provider(str(model)),
+        "status": status,
+        "agent_name": agent_name,
+    }
+
+    # Optional: commit hash
+    commit_hash = get_commit_hash()
+    if commit_hash:
+        tags["commit_hash"] = commit_hash
+
+    # Optional: dataset case
+    if dataset_case:
+        tags["dataset_case"] = dataset_case
+
+    return tags
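
To illustrate the resulting tag set, here is a short sketch calling build_standard_tags on a dict-format result via the sandboxy.mlflow.tags path declared in the exporter's imports; the scenario values are illustrative:

    from sandboxy.mlflow.tags import build_standard_tags

    tags = build_standard_tags(
        result={"model": "openai/gpt-4o", "error": None},
        scenario_name="Example scenario",
        scenario_id="example-scenario",
        agent_name="default",
    )
    # tags now contains, among others:
    #   "model_name": "openai/gpt-4o", "model_provider": "openai",
    #   "status": "success", "agent_name": "default",
    # plus "sandboxy_version", and "commit_hash" when run inside a git checkout.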