sandboxy 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,38 @@
1
+ """MLflow integration for Sandboxy scenario tracking and evaluation."""
2
+
3
+ from sandboxy.mlflow.config import MLflowConfig
4
+
5
+ __all__ = [
6
+ "MLflowConfig",
7
+ "MLflowExporter",
8
+ "mlflow_run_context",
9
+ "enable_tracing",
10
+ "disable_tracing",
11
+ "trace_span",
12
+ ]
13
+
14
+
15
def __getattr__(name: str):
    """Lazy import to avoid mlflow import when not needed.

    Resolves the names advertised in ``__all__`` on first attribute
    access (PEP 562), so importing this package stays cheap unless
    MLflow features are actually used.
    """
    # Exporter-side names live in sandboxy.mlflow.exporter.
    if name in ("MLflowExporter", "mlflow_run_context"):
        from sandboxy.mlflow import exporter

        return getattr(exporter, name)
    # Tracing helpers live in sandboxy.mlflow.tracing.
    if name in ("enable_tracing", "disable_tracing", "trace_span"):
        from sandboxy.mlflow import tracing

        return getattr(tracing, name)
    msg = f"module {__name__!r} has no attribute {name!r}"
    raise AttributeError(msg)
@@ -0,0 +1,184 @@
1
+ """Artifact generation for MLflow integration.
2
+
3
+ Generates human-readable summaries and prepares artifact directories
4
+ for upload to MLflow.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import shutil
11
+ import tempfile
12
+ from datetime import UTC, datetime
13
+ from pathlib import Path
14
+ from typing import TYPE_CHECKING
15
+
16
+ if TYPE_CHECKING:
17
+ from sandboxy.scenarios.unified import RunResult
18
+
19
+
20
+ def generate_summary(
21
+ result: RunResult | dict | object,
22
+ scenario_name: str,
23
+ model: str,
24
+ ) -> str:
25
+ """Generate human-readable summary text.
26
+
27
+ Handles both RunResult (unified) and ScenarioResult (legacy) formats.
28
+
29
+ Args:
30
+ result: Run result from scenario execution (any format)
31
+ scenario_name: Human-readable scenario name
32
+ model: Model identifier
33
+
34
+ Returns:
35
+ Formatted summary text
36
+ """
37
+ # Extract fields from various result formats
38
+ if isinstance(result, dict):
39
+ error = result.get("error")
40
+ evaluation = result.get("evaluation")
41
+ latency_ms = result.get("latency_ms")
42
+ input_tokens = result.get("input_tokens", 0)
43
+ output_tokens = result.get("output_tokens", 0)
44
+ score = result.get("score", 0)
45
+ goals_achieved = result.get("goals_achieved", [])
46
+ else:
47
+ error = getattr(result, "error", None)
48
+ evaluation = getattr(result, "evaluation", None)
49
+ latency_ms = getattr(result, "latency_ms", None)
50
+ input_tokens = getattr(result, "input_tokens", 0) or 0
51
+ output_tokens = getattr(result, "output_tokens", 0) or 0
52
+ score = getattr(result, "score", 0)
53
+ goals_achieved = getattr(result, "goals_achieved", [])
54
+
55
+ # Determine status
56
+ status = "FAILED" if error else "PASSED"
57
+ timestamp = datetime.now(UTC).isoformat()
58
+
59
+ lines = [
60
+ "Sandboxy Run Summary",
61
+ "=" * 20,
62
+ f"Scenario: {scenario_name}",
63
+ f"Model: {model}",
64
+ f"Status: {status}",
65
+ f"Timestamp: {timestamp}",
66
+ "",
67
+ ]
68
+
69
+ # Scores section - handle both unified and legacy formats
70
+ if evaluation:
71
+ if isinstance(evaluation, dict):
72
+ goals = evaluation.get("goals", [])
73
+ total = evaluation.get("total_score", 0)
74
+ max_score = evaluation.get("max_score", 0)
75
+ pct = evaluation.get("percentage", 0)
76
+ else:
77
+ goals = getattr(evaluation, "goals", []) or []
78
+ total = getattr(evaluation, "total_score", 0)
79
+ max_score = getattr(evaluation, "max_score", 0)
80
+ pct = getattr(evaluation, "percentage", 0)
81
+
82
+ if goals:
83
+ lines.append("Scores:")
84
+ for goal in goals:
85
+ if isinstance(goal, dict):
86
+ name = goal.get("name", "unknown")
87
+ goal_score = goal.get("score", 0)
88
+ passed = goal.get("passed", False)
89
+ else:
90
+ name = getattr(goal, "name", "unknown")
91
+ goal_score = getattr(goal, "score", 0)
92
+ passed = getattr(goal, "passed", False)
93
+ check = "✓" if passed else "✗"
94
+ lines.append(f" {name}: {goal_score:.1f} {check}")
95
+
96
+ lines.append(f" score_total: {total:.1f}/{max_score:.1f} ({pct:.1f}%)")
97
+ lines.append("")
98
+ elif score or goals_achieved:
99
+ # Legacy ScenarioResult format
100
+ lines.append("Scores:")
101
+ lines.append(f" Total Score: {score}")
102
+ if goals_achieved:
103
+ lines.append(f" Goals Achieved: {', '.join(goals_achieved)}")
104
+ lines.append("")
105
+
106
+ # Timing section
107
+ if latency_ms:
108
+ lines.append("Timing:")
109
+ lines.append(f" Total: {latency_ms}ms")
110
+ lines.append("")
111
+
112
+ # Tokens section
113
+ if input_tokens or output_tokens:
114
+ total_tokens = input_tokens + output_tokens
115
+ lines.append("Tokens:")
116
+ lines.append(f" Input: {input_tokens}")
117
+ lines.append(f" Output: {output_tokens}")
118
+ lines.append(f" Total: {total_tokens}")
119
+ lines.append("")
120
+
121
+ return "\n".join(lines)
122
+
123
+
124
def _serialize_message(msg: object) -> dict:
    """Convert one history entry into a JSON-serializable dict."""
    if hasattr(msg, "model_dump"):
        return msg.model_dump()  # pydantic v2 models
    if hasattr(msg, "dict"):
        return msg.dict()  # pydantic v1 models
    if isinstance(msg, dict):
        return msg
    # Last resort: preserve at least a string rendering of the message.
    return {"content": str(msg)}


def prepare_artifacts_dir(
    result: RunResult | dict | object,
    scenario_path: Path,
    scenario_name: str,
) -> Path:
    """Create temporary directory with all artifacts.

    Handles both RunResult (unified) and ScenarioResult (legacy) formats.

    Creates:
        {tmpdir}/
        ├── scenario.yaml      # Original scenario file
        ├── conversation.json  # Full message history
        └── summary.txt        # Human-readable summary

    Args:
        result: Run result from scenario execution (any format)
        scenario_path: Path to scenario YAML file
        scenario_name: Human-readable scenario name

    Returns:
        Path to temporary directory (caller must clean up)
    """
    # Create temp directory
    tmpdir = Path(tempfile.mkdtemp(prefix="sandboxy_mlflow_"))

    # Copy scenario YAML (skipped silently when the source file is gone)
    if scenario_path.exists():
        shutil.copy(scenario_path, tmpdir / "scenario.yaml")

    # Get history and model from result (handle both formats)
    if isinstance(result, dict):
        history = result.get("history", []) or result.get("messages", [])
        model = result.get("model", "unknown")
    else:
        history = getattr(result, "history", None) or getattr(result, "messages", [])
        model = getattr(result, "model", None) or "unknown"

    # Write conversation.json; explicit UTF-8 keeps output independent of locale.
    if history:
        serializable_history = [_serialize_message(msg) for msg in history]
        (tmpdir / "conversation.json").write_text(
            json.dumps(serializable_history, indent=2, default=str),
            encoding="utf-8",
        )

    # Write summary.txt. The summary contains non-ASCII check marks (✓/✗),
    # so an explicit UTF-8 encoding avoids UnicodeEncodeError on platforms
    # whose default locale encoding is not UTF-8 (e.g. Windows cp1252).
    summary = generate_summary(result, scenario_name, model)
    (tmpdir / "summary.txt").write_text(summary, encoding="utf-8")

    return tmpdir
@@ -0,0 +1,90 @@
1
+ """MLflow configuration with CLI > YAML > env precedence resolution."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from dataclasses import dataclass, field
7
+ from typing import Any
8
+
9
+
10
@dataclass
class MLflowConfig:
    """Configuration for MLflow export, resolved from multiple sources.

    Resolution precedence (highest to lowest):
    1. CLI flags (--mlflow-export, --no-mlflow, --mlflow-tracking-uri, --mlflow-experiment)
    2. Scenario YAML mlflow block
    3. Environment variables (MLFLOW_TRACKING_URI)
    4. Defaults

    Note: for the tracking URI specifically, the MLFLOW_TRACKING_URI
    environment variable is consulted before the YAML value.
    """

    enabled: bool = False  # whether MLflow export is active for this run
    tracking_uri: str | None = None  # MLflow server URI; None = library default
    experiment: str | None = None  # experiment name to log runs under
    tags: dict[str, str] = field(default_factory=dict)  # extra run tags
    tracing: bool = True  # Enable LLM call tracing by default when MLflow is enabled

    @classmethod
    def resolve(
        cls,
        cli_export: bool = False,
        cli_no_mlflow: bool = False,
        cli_tracking_uri: str | None = None,
        cli_experiment: str | None = None,
        cli_tracing: bool | None = None,
        yaml_config: dict[str, Any] | None = None,
        scenario_name: str = "default",
    ) -> MLflowConfig:
        """Resolve MLflow configuration with CLI > YAML > env precedence.

        Args:
            cli_export: --mlflow-export flag was set
            cli_no_mlflow: --no-mlflow flag was set (force disable)
            cli_tracking_uri: --mlflow-tracking-uri value
            cli_experiment: --mlflow-experiment value
            cli_tracing: --mlflow-tracing flag (None=use default, True=enable, False=disable)
            yaml_config: mlflow block from scenario YAML
            scenario_name: Fallback experiment name (defaults to scenario name)

        Returns:
            Resolved MLflowConfig instance
        """
        # A hard disable from the CLI beats every other source.
        if cli_no_mlflow:
            return cls(enabled=False)

        cfg = yaml_config if yaml_config is not None else {}

        # Export is on when either the CLI flag or the YAML block requests it.
        if not (cli_export or cfg.get("enabled", False)):
            return cls(enabled=False)

        # Tracking URI: CLI beats the env var, which beats YAML.
        uri = cli_tracking_uri
        if not uri:
            uri = os.environ.get("MLFLOW_TRACKING_URI") or cfg.get("tracking_uri")

        # Experiment name: CLI beats YAML, falling back to the scenario name.
        exp = cli_experiment or cfg.get("experiment") or scenario_name

        # Tracing: explicit CLI choice wins; otherwise YAML, defaulting to on.
        trace_on = cfg.get("tracing", True) if cli_tracing is None else cli_tracing

        return cls(
            enabled=True,
            tracking_uri=uri,
            experiment=exp,
            tags=dict(cfg.get("tags", {})),
            tracing=trace_on,
        )