sandboxy 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sandboxy/api/routes/local.py +182 -19
- sandboxy/cli/main.py +530 -174
- sandboxy/mlflow/__init__.py +38 -0
- sandboxy/mlflow/artifacts.py +184 -0
- sandboxy/mlflow/config.py +90 -0
- sandboxy/mlflow/exporter.py +439 -0
- sandboxy/mlflow/metrics.py +115 -0
- sandboxy/mlflow/tags.py +140 -0
- sandboxy/mlflow/tracing.py +126 -0
- sandboxy/scenarios/loader.py +44 -2
- sandboxy/scenarios/runner.py +57 -2
- sandboxy/tools/yaml_tools.py +18 -0
- sandboxy/ui/dist/assets/index-CU06wBqc.js +362 -0
- sandboxy/ui/dist/assets/index-Cgg2wY2m.css +1 -0
- sandboxy/ui/dist/index.html +2 -2
- {sandboxy-0.0.2.dist-info → sandboxy-0.0.4.dist-info}/METADATA +37 -1
- {sandboxy-0.0.2.dist-info → sandboxy-0.0.4.dist-info}/RECORD +20 -13
- sandboxy/ui/dist/assets/index-CgAkYWrJ.css +0 -1
- sandboxy/ui/dist/assets/index-D4zoGFcr.js +0 -347
- {sandboxy-0.0.2.dist-info → sandboxy-0.0.4.dist-info}/WHEEL +0 -0
- {sandboxy-0.0.2.dist-info → sandboxy-0.0.4.dist-info}/entry_points.txt +0 -0
- {sandboxy-0.0.2.dist-info → sandboxy-0.0.4.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""MLflow integration for Sandboxy scenario tracking and evaluation."""
|
|
2
|
+
|
|
3
|
+
from sandboxy.mlflow.config import MLflowConfig
|
|
4
|
+
|
|
5
|
+
# Public API of the package. Only MLflowConfig is imported eagerly above;
# the remaining names are resolved on first access by the module-level
# __getattr__ below, so mlflow itself is only imported when actually used.
__all__ = [
    "MLflowConfig",
    "MLflowExporter",
    "mlflow_run_context",
    "enable_tracing",
    "disable_tracing",
    "trace_span",
]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Lazily exported names and the submodule that defines each one.
# A table keeps the lookup in one place instead of a five-branch if-chain,
# and the submodules (which pull in mlflow transitively) are only imported
# when one of these names is actually accessed.
_LAZY_EXPORTS = {
    "MLflowExporter": "sandboxy.mlflow.exporter",
    "mlflow_run_context": "sandboxy.mlflow.exporter",
    "enable_tracing": "sandboxy.mlflow.tracing",
    "disable_tracing": "sandboxy.mlflow.tracing",
    "trace_span": "sandboxy.mlflow.tracing",
}


def __getattr__(name: str):
    """Lazy import to avoid mlflow import when not needed (PEP 562 hook).

    Args:
        name: Attribute being looked up on this package.

    Returns:
        The attribute resolved from its defining submodule.

    Raises:
        AttributeError: If *name* is not one of the lazily exported names.
    """
    module_path = _LAZY_EXPORTS.get(name)
    if module_path is not None:
        import importlib

        return getattr(importlib.import_module(module_path), name)
    msg = f"module {__name__!r} has no attribute {name!r}"
    raise AttributeError(msg)
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
"""Artifact generation for MLflow integration.
|
|
2
|
+
|
|
3
|
+
Generates human-readable summaries and prepares artifact directories
|
|
4
|
+
for upload to MLflow.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
import shutil
|
|
11
|
+
import tempfile
|
|
12
|
+
from datetime import UTC, datetime
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import TYPE_CHECKING
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from sandboxy.scenarios.unified import RunResult
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def generate_summary(
|
|
21
|
+
result: RunResult | dict | object,
|
|
22
|
+
scenario_name: str,
|
|
23
|
+
model: str,
|
|
24
|
+
) -> str:
|
|
25
|
+
"""Generate human-readable summary text.
|
|
26
|
+
|
|
27
|
+
Handles both RunResult (unified) and ScenarioResult (legacy) formats.
|
|
28
|
+
|
|
29
|
+
Args:
|
|
30
|
+
result: Run result from scenario execution (any format)
|
|
31
|
+
scenario_name: Human-readable scenario name
|
|
32
|
+
model: Model identifier
|
|
33
|
+
|
|
34
|
+
Returns:
|
|
35
|
+
Formatted summary text
|
|
36
|
+
"""
|
|
37
|
+
# Extract fields from various result formats
|
|
38
|
+
if isinstance(result, dict):
|
|
39
|
+
error = result.get("error")
|
|
40
|
+
evaluation = result.get("evaluation")
|
|
41
|
+
latency_ms = result.get("latency_ms")
|
|
42
|
+
input_tokens = result.get("input_tokens", 0)
|
|
43
|
+
output_tokens = result.get("output_tokens", 0)
|
|
44
|
+
score = result.get("score", 0)
|
|
45
|
+
goals_achieved = result.get("goals_achieved", [])
|
|
46
|
+
else:
|
|
47
|
+
error = getattr(result, "error", None)
|
|
48
|
+
evaluation = getattr(result, "evaluation", None)
|
|
49
|
+
latency_ms = getattr(result, "latency_ms", None)
|
|
50
|
+
input_tokens = getattr(result, "input_tokens", 0) or 0
|
|
51
|
+
output_tokens = getattr(result, "output_tokens", 0) or 0
|
|
52
|
+
score = getattr(result, "score", 0)
|
|
53
|
+
goals_achieved = getattr(result, "goals_achieved", [])
|
|
54
|
+
|
|
55
|
+
# Determine status
|
|
56
|
+
status = "FAILED" if error else "PASSED"
|
|
57
|
+
timestamp = datetime.now(UTC).isoformat()
|
|
58
|
+
|
|
59
|
+
lines = [
|
|
60
|
+
"Sandboxy Run Summary",
|
|
61
|
+
"=" * 20,
|
|
62
|
+
f"Scenario: {scenario_name}",
|
|
63
|
+
f"Model: {model}",
|
|
64
|
+
f"Status: {status}",
|
|
65
|
+
f"Timestamp: {timestamp}",
|
|
66
|
+
"",
|
|
67
|
+
]
|
|
68
|
+
|
|
69
|
+
# Scores section - handle both unified and legacy formats
|
|
70
|
+
if evaluation:
|
|
71
|
+
if isinstance(evaluation, dict):
|
|
72
|
+
goals = evaluation.get("goals", [])
|
|
73
|
+
total = evaluation.get("total_score", 0)
|
|
74
|
+
max_score = evaluation.get("max_score", 0)
|
|
75
|
+
pct = evaluation.get("percentage", 0)
|
|
76
|
+
else:
|
|
77
|
+
goals = getattr(evaluation, "goals", []) or []
|
|
78
|
+
total = getattr(evaluation, "total_score", 0)
|
|
79
|
+
max_score = getattr(evaluation, "max_score", 0)
|
|
80
|
+
pct = getattr(evaluation, "percentage", 0)
|
|
81
|
+
|
|
82
|
+
if goals:
|
|
83
|
+
lines.append("Scores:")
|
|
84
|
+
for goal in goals:
|
|
85
|
+
if isinstance(goal, dict):
|
|
86
|
+
name = goal.get("name", "unknown")
|
|
87
|
+
goal_score = goal.get("score", 0)
|
|
88
|
+
passed = goal.get("passed", False)
|
|
89
|
+
else:
|
|
90
|
+
name = getattr(goal, "name", "unknown")
|
|
91
|
+
goal_score = getattr(goal, "score", 0)
|
|
92
|
+
passed = getattr(goal, "passed", False)
|
|
93
|
+
check = "✓" if passed else "✗"
|
|
94
|
+
lines.append(f" {name}: {goal_score:.1f} {check}")
|
|
95
|
+
|
|
96
|
+
lines.append(f" score_total: {total:.1f}/{max_score:.1f} ({pct:.1f}%)")
|
|
97
|
+
lines.append("")
|
|
98
|
+
elif score or goals_achieved:
|
|
99
|
+
# Legacy ScenarioResult format
|
|
100
|
+
lines.append("Scores:")
|
|
101
|
+
lines.append(f" Total Score: {score}")
|
|
102
|
+
if goals_achieved:
|
|
103
|
+
lines.append(f" Goals Achieved: {', '.join(goals_achieved)}")
|
|
104
|
+
lines.append("")
|
|
105
|
+
|
|
106
|
+
# Timing section
|
|
107
|
+
if latency_ms:
|
|
108
|
+
lines.append("Timing:")
|
|
109
|
+
lines.append(f" Total: {latency_ms}ms")
|
|
110
|
+
lines.append("")
|
|
111
|
+
|
|
112
|
+
# Tokens section
|
|
113
|
+
if input_tokens or output_tokens:
|
|
114
|
+
total_tokens = input_tokens + output_tokens
|
|
115
|
+
lines.append("Tokens:")
|
|
116
|
+
lines.append(f" Input: {input_tokens}")
|
|
117
|
+
lines.append(f" Output: {output_tokens}")
|
|
118
|
+
lines.append(f" Total: {total_tokens}")
|
|
119
|
+
lines.append("")
|
|
120
|
+
|
|
121
|
+
return "\n".join(lines)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def prepare_artifacts_dir(
    result: RunResult | dict | object,
    scenario_path: Path,
    scenario_name: str,
) -> Path:
    """Assemble a temporary directory of run artifacts for MLflow upload.

    Works with both RunResult (unified) and ScenarioResult (legacy) formats.

    Layout:
        {tmpdir}/
        ├── scenario.yaml     # copy of the original scenario file
        ├── conversation.json # full message history (only when present)
        └── summary.txt       # human-readable summary

    Args:
        result: Run result from scenario execution (any format)
        scenario_path: Path to scenario YAML file
        scenario_name: Human-readable scenario name

    Returns:
        Path to the temporary directory; the caller is responsible for
        deleting it.
    """
    staging = Path(tempfile.mkdtemp(prefix="sandboxy_mlflow_"))

    # Scenario file — skipped silently if it no longer exists on disk.
    if scenario_path.exists():
        shutil.copy(scenario_path, staging / "scenario.yaml")

    # Pull message history and model id out of either result shape.
    if isinstance(result, dict):
        history = result.get("history", []) or result.get("messages", [])
        model = result.get("model", "unknown")
    else:
        history = getattr(result, "history", None) or getattr(result, "messages", [])
        model = getattr(result, "model", None) or "unknown"

    if history:

        def _as_plain(msg):
            # Normalize one message into something json.dumps can serialize.
            if hasattr(msg, "model_dump"):  # pydantic-v2-style objects
                return msg.model_dump()
            if hasattr(msg, "dict"):  # pydantic-v1-style objects
                return msg.dict()
            if isinstance(msg, dict):
                return msg
            return {"content": str(msg)}

        payload = json.dumps([_as_plain(m) for m in history], indent=2, default=str)
        (staging / "conversation.json").write_text(payload)

    # Human-readable summary, always written.
    (staging / "summary.txt").write_text(generate_summary(result, scenario_name, model))

    return staging
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""MLflow configuration with CLI > YAML > env precedence resolution."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class MLflowConfig:
    """Resolved MLflow export settings.

    Instances are normally produced by :meth:`resolve`, which merges CLI
    flags, the scenario YAML ``mlflow`` block, and environment variables.
    """

    enabled: bool = False
    tracking_uri: str | None = None
    experiment: str | None = None
    tags: dict[str, str] = field(default_factory=dict)
    tracing: bool = True  # Enable LLM call tracing by default when MLflow is enabled

    @classmethod
    def resolve(
        cls,
        cli_export: bool = False,
        cli_no_mlflow: bool = False,
        cli_tracking_uri: str | None = None,
        cli_experiment: str | None = None,
        cli_tracing: bool | None = None,
        yaml_config: dict[str, Any] | None = None,
        scenario_name: str = "default",
    ) -> MLflowConfig:
        """Merge CLI, environment, and YAML settings into one config.

        Precedence per field:
            * enabled: ``--no-mlflow`` force-disables everything; otherwise
              enabled when ``--mlflow-export`` or YAML ``enabled`` is set.
            * tracking_uri: CLI > ``MLFLOW_TRACKING_URI`` env var > YAML.
            * experiment: CLI > YAML > scenario name.
            * tracing: CLI > YAML > default (True).

        Args:
            cli_export: --mlflow-export flag was set
            cli_no_mlflow: --no-mlflow flag was set (force disable)
            cli_tracking_uri: --mlflow-tracking-uri value
            cli_experiment: --mlflow-experiment value
            cli_tracing: --mlflow-tracing flag (None=use default, True=enable, False=disable)
            yaml_config: mlflow block from scenario YAML
            scenario_name: Fallback experiment name (defaults to scenario name)

        Returns:
            Resolved MLflowConfig instance
        """
        # Explicit opt-out beats every other source.
        if cli_no_mlflow:
            return cls(enabled=False)

        yaml_block = yaml_config or {}

        # Not requested anywhere -> disabled config, nothing else to resolve.
        if not (cli_export or yaml_block.get("enabled", False)):
            return cls(enabled=False)

        uri = (
            cli_tracking_uri
            or os.environ.get("MLFLOW_TRACKING_URI")
            or yaml_block.get("tracking_uri")
        )
        exp = cli_experiment or yaml_block.get("experiment") or scenario_name
        # Copy so later mutation of the config never leaks back into the YAML dict.
        custom_tags = dict(yaml_block.get("tags", {}))
        trace_enabled = (
            cli_tracing if cli_tracing is not None else yaml_block.get("tracing", True)
        )

        return cls(
            enabled=True,
            tracking_uri=uri,
            experiment=exp,
            tags=custom_tags,
            tracing=trace_enabled,
        )
|