levelapp 0.1.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- levelapp/__init__.py +0 -0
- levelapp/aspects/__init__.py +8 -0
- levelapp/aspects/loader.py +253 -0
- levelapp/aspects/logger.py +59 -0
- levelapp/aspects/monitor.py +617 -0
- levelapp/aspects/sanitizer.py +168 -0
- levelapp/clients/__init__.py +122 -0
- levelapp/clients/anthropic.py +112 -0
- levelapp/clients/gemini.py +130 -0
- levelapp/clients/groq.py +101 -0
- levelapp/clients/huggingface.py +162 -0
- levelapp/clients/ionos.py +126 -0
- levelapp/clients/mistral.py +106 -0
- levelapp/clients/openai.py +116 -0
- levelapp/comparator/__init__.py +5 -0
- levelapp/comparator/comparator.py +232 -0
- levelapp/comparator/extractor.py +108 -0
- levelapp/comparator/schemas.py +61 -0
- levelapp/comparator/scorer.py +269 -0
- levelapp/comparator/utils.py +136 -0
- levelapp/config/__init__.py +5 -0
- levelapp/config/endpoint.py +199 -0
- levelapp/config/prompts.py +57 -0
- levelapp/core/__init__.py +0 -0
- levelapp/core/base.py +386 -0
- levelapp/core/schemas.py +24 -0
- levelapp/core/session.py +336 -0
- levelapp/endpoint/__init__.py +0 -0
- levelapp/endpoint/client.py +188 -0
- levelapp/endpoint/client_test.py +41 -0
- levelapp/endpoint/manager.py +114 -0
- levelapp/endpoint/parsers.py +119 -0
- levelapp/endpoint/schemas.py +38 -0
- levelapp/endpoint/tester.py +52 -0
- levelapp/evaluator/__init__.py +3 -0
- levelapp/evaluator/evaluator.py +307 -0
- levelapp/metrics/__init__.py +63 -0
- levelapp/metrics/embedding.py +56 -0
- levelapp/metrics/embeddings/__init__.py +0 -0
- levelapp/metrics/embeddings/sentence_transformer.py +30 -0
- levelapp/metrics/embeddings/torch_based.py +56 -0
- levelapp/metrics/exact.py +182 -0
- levelapp/metrics/fuzzy.py +80 -0
- levelapp/metrics/token.py +103 -0
- levelapp/plugins/__init__.py +0 -0
- levelapp/repository/__init__.py +3 -0
- levelapp/repository/filesystem.py +203 -0
- levelapp/repository/firestore.py +291 -0
- levelapp/simulator/__init__.py +3 -0
- levelapp/simulator/schemas.py +116 -0
- levelapp/simulator/simulator.py +531 -0
- levelapp/simulator/utils.py +134 -0
- levelapp/visualization/__init__.py +7 -0
- levelapp/visualization/charts.py +358 -0
- levelapp/visualization/dashboard.py +240 -0
- levelapp/visualization/exporter.py +167 -0
- levelapp/visualization/templates/base.html +158 -0
- levelapp/visualization/templates/comparator_dashboard.html +57 -0
- levelapp/visualization/templates/simulator_dashboard.html +111 -0
- levelapp/workflow/__init__.py +6 -0
- levelapp/workflow/base.py +192 -0
- levelapp/workflow/config.py +96 -0
- levelapp/workflow/context.py +64 -0
- levelapp/workflow/factory.py +42 -0
- levelapp/workflow/registration.py +6 -0
- levelapp/workflow/runtime.py +19 -0
- levelapp-0.1.15.dist-info/METADATA +571 -0
- levelapp-0.1.15.dist-info/RECORD +70 -0
- levelapp-0.1.15.dist-info/WHEEL +4 -0
- levelapp-0.1.15.dist-info/licenses/LICENSE +0 -0
|
@@ -0,0 +1,358 @@
|
|
|
1
|
+
"""levelapp/visualization/charts.py: Chart generation for evaluation results."""
|
|
2
|
+
|
|
3
|
+
import plotly.graph_objects as go
|
|
4
|
+
import plotly.express as px
|
|
5
|
+
from typing import Dict, Any
|
|
6
|
+
from collections import defaultdict
|
|
7
|
+
|
|
8
|
+
from levelapp.simulator.schemas import SimulationResults
|
|
9
|
+
from levelapp.aspects import logger
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ChartGenerator:
    """Generate interactive Plotly charts for evaluation results.

    Every public method returns a ``plotly.graph_objects.Figure`` so callers
    can display, further customise, or export the chart themselves.
    """

    # Keys in ``average_scores`` that are bookkeeping values rather than
    # provider scores; they are excluded from every score chart.
    _NON_SCORE_KEYS = ("processing_time", "guardrail", "metadata")

    def __init__(self, theme: str = "plotly_white"):
        """
        Initialize ChartGenerator with a theme.

        Args:
            theme: Plotly theme name (plotly, plotly_white, plotly_dark,
                ggplot2, seaborn, etc.)
        """
        self.theme = theme
        self._color_palette = px.colors.qualitative.Plotly

    def create_score_trend(
        self, results: SimulationResults, metric: str = "average"
    ) -> go.Figure:
        """
        Create a line chart showing score trends across scripts/attempts.

        Args:
            results: SimulationResults object
            metric: Which metric to plot (default: "average").
                NOTE(review): currently only used in the log message; the
                chart always plots per-script ``average_scores``.

        Returns:
            Plotly Figure object
        """
        logger.info(f"[ChartGenerator] Creating score trend chart for metric: {metric}")

        fig = go.Figure()

        # Collect (script_id, score) pairs per provider. Tracking the x value
        # next to each score keeps every line aligned even when a provider is
        # missing from some scripts; truncating a shared script-id list could
        # silently plot points against the wrong script.
        provider_x = defaultdict(list)
        provider_y = defaultdict(list)

        if results.script_results:
            for script_result in results.script_results:
                script_id = script_result.script_id
                for provider, score in script_result.average_scores.items():
                    if provider not in self._NON_SCORE_KEYS:
                        provider_x[provider].append(script_id)
                        provider_y[provider].append(score)

        # Create line for each provider
        for idx, provider in enumerate(provider_x):
            fig.add_trace(
                go.Scatter(
                    x=provider_x[provider],
                    y=provider_y[provider],
                    mode="lines+markers",
                    name=provider.upper(),
                    line=dict(
                        width=2,
                        color=self._color_palette[idx % len(self._color_palette)],
                    ),
                    marker=dict(size=8),
                )
            )

        fig.update_layout(
            title="Evaluation Score Trends Across Scripts",
            xaxis_title="Script ID",
            yaxis_title="Score",
            template=self.theme,
            hovermode="x unified",
            legend=dict(
                orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1
            ),
        )

        return fig

    def create_provider_comparison(self, results: SimulationResults) -> go.Figure:
        """
        Create a bar chart comparing average scores across providers.

        Args:
            results: SimulationResults object

        Returns:
            Plotly Figure object
        """
        logger.info("[ChartGenerator] Creating provider comparison chart")

        providers = []
        scores = []

        if results.average_scores:
            for provider, score in results.average_scores.items():
                if provider not in self._NON_SCORE_KEYS:
                    providers.append(provider.upper())
                    scores.append(score)

        fig = go.Figure(
            data=[
                go.Bar(
                    x=providers,
                    y=scores,
                    marker_color=self._color_palette[: len(providers)],
                    text=[f"{s:.3f}" for s in scores],
                    textposition="auto",
                )
            ]
        )

        fig.update_layout(
            title="Average Scores by Provider",
            xaxis_title="Provider",
            yaxis_title="Average Score",
            template=self.theme,
            # Scores are assumed normalized to [0, 1].
            yaxis=dict(range=[0, 1]),
        )

        return fig

    def create_score_distribution(
        self, results: SimulationResults, provider: str
    ) -> go.Figure:
        """
        Create a histogram and box plot showing score distribution for a provider.

        Args:
            results: SimulationResults object
            provider: Provider name to analyze

        Returns:
            Plotly Figure object with subplots
        """
        logger.info(
            f"[ChartGenerator] Creating score distribution for provider: {provider}"
        )

        # Imported lazily to keep module import light.
        from plotly.subplots import make_subplots

        # Collect all scores for the provider.
        # NOTE(review): items of ``interaction_results`` are accessed as dicts
        # here, while ``script_results`` items use attribute access in
        # create_score_trend — confirm against SimulationResults' schema.
        scores = []
        if results.interaction_results:
            for script_result in results.interaction_results:
                avg_scores = script_result.get("average_scores", {})
                if provider in avg_scores:
                    scores.append(avg_scores[provider])

        # Histogram on top, box plot below.
        fig = make_subplots(
            rows=2,
            cols=1,
            subplot_titles=("Score Distribution", "Box Plot"),
            vertical_spacing=0.15,
        )

        # Histogram
        fig.add_trace(
            go.Histogram(
                x=scores,
                nbinsx=20,
                name="Distribution",
                marker_color=self._color_palette[0],
            ),
            row=1,
            col=1,
        )

        # Box plot ("sd" overlays the mean and one standard deviation)
        fig.add_trace(
            go.Box(
                x=scores,
                name=provider.upper(),
                marker_color=self._color_palette[1],
                boxmean="sd",
            ),
            row=2,
            col=1,
        )

        fig.update_layout(
            title=f"Score Distribution for {provider.upper()}",
            template=self.theme,
            showlegend=False,
            height=600,
        )

        fig.update_xaxes(title_text="Score", row=2, col=1)
        fig.update_yaxes(title_text="Frequency", row=1, col=1)

        return fig

    def create_metadata_heatmap(self, comparator_results: Dict[str, Any]) -> go.Figure:
        """
        Create a heatmap showing field-level accuracy from comparator results.

        Args:
            comparator_results: Dictionary of comparator evaluation results

        Returns:
            Plotly Figure object
        """
        logger.info("[ChartGenerator] Creating metadata accuracy heatmap")

        fields = []
        scores = []

        for idx, result in comparator_results.items():
            field_name = result.get("field_name", f"Field {idx}")
            set_scores = result.get("set_scores")

            if set_scores is not None:
                # A list of scores is reduced to its first entry.
                score = set_scores[0] if isinstance(set_scores, list) else set_scores
                fields.append(field_name)
                scores.append(float(score) if score is not None else 0.0)

        fig = go.Figure(
            data=go.Heatmap(
                z=[scores],
                x=fields,
                y=["Accuracy"],
                colorscale="RdYlGn",
                text=[[f"{s:.2f}" for s in scores]],
                texttemplate="%{text}",
                textfont={"size": 10},
                colorbar=dict(title="Score"),
            )
        )

        fig.update_layout(
            title="Metadata Field Accuracy Heatmap",
            xaxis_title="Field Name",
            template=self.theme,
            height=300,
        )

        return fig

    def create_interaction_timeline(self, results: SimulationResults) -> go.Figure:
        """
        Create a timeline visualization of interaction performance.

        Args:
            results: SimulationResults object

        Returns:
            Plotly Figure object
        """
        logger.info("[ChartGenerator] Creating interaction timeline")

        fig = go.Figure()

        if results.interaction_results:
            for script_idx, script_result in enumerate(results.interaction_results):
                script_id = script_result.get("script_id", f"Script {script_idx + 1}")
                attempts = script_result.get("attempts", [])

                # One horizontal bar per attempt, colored per script.
                for attempt in attempts:
                    attempt_num = attempt.get("attempt", 1)
                    duration = attempt.get("total_duration", 0)

                    fig.add_trace(
                        go.Bar(
                            x=[duration],
                            y=[f"{script_id} - Attempt {attempt_num}"],
                            orientation="h",
                            name=script_id,
                            showlegend=False,
                            marker_color=self._color_palette[
                                script_idx % len(self._color_palette)
                            ],
                        )
                    )

        fig.update_layout(
            title="Interaction Processing Timeline",
            xaxis_title="Duration (seconds)",
            yaxis_title="Script - Attempt",
            template=self.theme,
            # Grow the figure with the number of scripts so bars stay legible.
            height=max(400, len(results.interaction_results or []) * 60),
        )

        return fig

    def create_summary_metrics(self, results: SimulationResults) -> go.Figure:
        """
        Create a summary metrics card visualization.

        Args:
            results: SimulationResults object

        Returns:
            Plotly Figure object
        """
        logger.info("[ChartGenerator] Creating summary metrics visualization")

        from plotly.subplots import make_subplots

        # Calculate summary metrics
        total_scripts = len(results.interaction_results or [])
        # NOTE(review): the "Avg Score" card is hard-coded to the "openai"
        # provider and shows 0 when that provider is absent — consider using
        # an overall average across providers instead.
        avg_score = (
            results.average_scores.get("openai", 0) if results.average_scores else 0
        )
        total_time = results.elapsed_time

        fig = make_subplots(
            rows=1,
            cols=3,
            subplot_titles=("Total Scripts", "Avg Score", "Total Time (s)"),
            specs=[
                [{"type": "indicator"}, {"type": "indicator"}, {"type": "indicator"}]
            ],
        )

        fig.add_trace(
            go.Indicator(
                mode="number",
                value=total_scripts,
                number={"font": {"size": 40}},
            ),
            row=1,
            col=1,
        )

        # Delta compares the average score against a 0.8 reference target.
        fig.add_trace(
            go.Indicator(
                mode="number+delta",
                value=avg_score,
                number={"font": {"size": 40}},
                delta={"reference": 0.8, "relative": False},
            ),
            row=1,
            col=2,
        )

        fig.add_trace(
            go.Indicator(
                mode="number",
                value=total_time,
                number={"font": {"size": 40}, "suffix": "s"},
            ),
            row=1,
            col=3,
        )

        fig.update_layout(template=self.theme, height=200, showlegend=False)

        return fig
|
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
"""levelapp/visualization/dashboard.py: Dashboard generation for evaluation results."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Dict, Any
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from jinja2 import Environment, FileSystemLoader, select_autoescape
|
|
7
|
+
|
|
8
|
+
from levelapp.simulator.schemas import SimulationResults
|
|
9
|
+
from levelapp.visualization.charts import ChartGenerator
|
|
10
|
+
from levelapp.aspects import logger
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class DashboardGenerator:
    """Generate comprehensive HTML dashboards for evaluation results."""

    def __init__(self, template_dir: str | None = None):
        """
        Initialize DashboardGenerator.

        Args:
            template_dir: Optional custom template directory path
        """
        if template_dir is None:
            # Use default templates directory bundled next to this module.
            template_dir = Path(__file__).parent / "templates"

        self.template_dir = Path(template_dir)
        self.env = Environment(
            loader=FileSystemLoader(str(self.template_dir)),
            autoescape=select_autoescape(["html", "xml"]),
        )
        self.chart_gen = ChartGenerator()

    def generate_simulator_dashboard(
        self,
        results: SimulationResults,
        output_path: str,
        title: str = "Evaluation Dashboard",
    ) -> str:
        """
        Generate a complete HTML dashboard for simulator results.

        Args:
            results: SimulationResults object
            output_path: Path to save the HTML file
            title: Dashboard title

        Returns:
            Absolute path to the generated HTML file
        """
        logger.info(
            f"[DashboardGenerator] Generating simulator dashboard: {output_path}"
        )

        # Generate all charts
        charts = {
            "score_trend": self.chart_gen.create_score_trend(results),
            "provider_comparison": self.chart_gen.create_provider_comparison(results),
            "summary_metrics": self.chart_gen.create_summary_metrics(results),
        }

        # Add distribution charts for each provider; failures are non-fatal
        # so one bad provider does not sink the whole dashboard.
        if results.average_scores:
            for provider in results.average_scores.keys():
                if provider not in ["processing_time", "guardrail", "metadata"]:
                    try:
                        charts[f"distribution_{provider}"] = (
                            self.chart_gen.create_score_distribution(results, provider)
                        )
                    except Exception as e:
                        logger.warning(
                            f"Failed to create distribution chart for {provider}: {e}"
                        )

        # Add timeline chart (also best-effort).
        try:
            charts["timeline"] = self.chart_gen.create_interaction_timeline(results)
        except Exception as e:
            logger.warning(f"Failed to create timeline chart: {e}")

        # Convert charts to HTML; plotly.js is loaded from the CDN.
        chart_htmls = {}
        for name, fig in charts.items():
            chart_htmls[name] = fig.to_html(
                include_plotlyjs="cdn",
                div_id=f"chart_{name}",
                config={"responsive": True},
            )

        # Calculate summary statistics
        summary_stats = self._create_summary_stats(results)

        # Prepare context for template
        context = {
            "title": title,
            "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "summary_stats": summary_stats,
            "charts": chart_htmls,
            "results": results,
            "has_evaluation_summary": bool(results.evaluation_summary),
        }

        # Render template
        html_content = self._render_template("simulator_dashboard.html", context)

        # Save to file
        output_file = Path(output_path)
        output_file.parent.mkdir(parents=True, exist_ok=True)
        output_file.write_text(html_content, encoding="utf-8")

        logger.info(f"[DashboardGenerator] Dashboard saved to: {output_path}")
        return str(output_file.absolute())

    def generate_comparator_dashboard(
        self,
        results: Dict[str, Any],
        output_path: str,
        title: str = "Comparison Dashboard",
    ) -> str:
        """
        Generate a complete HTML dashboard for comparator results.

        Args:
            results: Comparator results dictionary
            output_path: Path to save the HTML file
            title: Dashboard title

        Returns:
            Absolute path to the generated HTML file
        """
        logger.info(
            f"[DashboardGenerator] Generating comparator dashboard: {output_path}"
        )

        # Generate heatmap
        heatmap = self.chart_gen.create_metadata_heatmap(results)

        # Convert to HTML
        chart_html = heatmap.to_html(
            include_plotlyjs="cdn", div_id="chart_heatmap", config={"responsive": True}
        )

        # Prepare context
        context = {
            "title": title,
            "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "chart_html": chart_html,
            "results": results,
        }

        # Render template
        html_content = self._render_template("comparator_dashboard.html", context)

        # Save to file
        output_file = Path(output_path)
        output_file.parent.mkdir(parents=True, exist_ok=True)
        output_file.write_text(html_content, encoding="utf-8")

        logger.info(f"[DashboardGenerator] Dashboard saved to: {output_path}")
        return str(output_file.absolute())

    def _create_summary_stats(self, results: SimulationResults) -> Dict[str, Any]:
        """
        Extract key summary statistics from results.

        Args:
            results: SimulationResults object

        Returns:
            Dictionary of summary statistics
        """
        stats = {
            "total_scripts": len(results.interaction_results or []),
            "total_time": results.elapsed_time,
            "started_at": results.started_at.strftime("%Y-%m-%d %H:%M:%S"),
            "finished_at": results.finished_at.strftime("%Y-%m-%d %H:%M:%S"),
            "average_scores": results.average_scores or {},
            "providers": list(results.average_scores.keys())
            if results.average_scores
            else [],
        }

        # Calculate overall average (excluding non-score metrics)
        score_values = [
            v
            for k, v in (results.average_scores or {}).items()
            if k not in ["processing_time", "guardrail", "metadata"]
        ]
        stats["overall_average"] = (
            sum(score_values) / len(score_values) if score_values else 0.0
        )

        return stats

    def _render_template(self, template_name: str, context: Dict[str, Any]) -> str:
        """
        Render a Jinja2 template with the given context.

        Args:
            template_name: Name of the template file
            context: Template context dictionary

        Returns:
            Rendered HTML string
        """
        try:
            template = self.env.get_template(template_name)
            return template.render(**context)
        except Exception as e:
            logger.error(f"[DashboardGenerator] Template rendering failed: {e}")
            # Return a basic HTML fallback
            return self._create_fallback_html(context)

    def _create_fallback_html(self, context: Dict[str, Any]) -> str:
        """Create a basic HTML fallback if template rendering fails.

        Handles both dashboard types: embeds the ``charts`` mapping used by
        the simulator context AND the single ``chart_html`` string used by
        the comparator context (previously the comparator chart was silently
        dropped because only ``charts`` was read).
        """
        # Gather whatever chart HTML this context carries.
        charts_html = "".join(context.get("charts", {}).values())
        charts_html += context.get("chart_html", "")
        return f"""
        <!DOCTYPE html>
        <html>
        <head>
            <title>{context.get("title", "Evaluation Dashboard")}</title>
            <meta charset="utf-8">
            <meta name="viewport" content="width=device-width, initial-scale=1">
            <style>
                body {{ font-family: Arial, sans-serif; margin: 20px; }}
                .container {{ max-width: 1200px; margin: 0 auto; }}
                h1 {{ color: #333; }}
                .chart {{ margin: 20px 0; }}
            </style>
        </head>
        <body>
            <div class="container">
                <h1>{context.get("title", "Evaluation Dashboard")}</h1>
                <p>Generated at: {context.get("generated_at", "N/A")}</p>
                <div class="charts">
                    {charts_html}
                </div>
            </div>
        </body>
        </html>
        """
|