levelapp 0.1.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- levelapp/__init__.py +0 -0
- levelapp/aspects/__init__.py +8 -0
- levelapp/aspects/loader.py +253 -0
- levelapp/aspects/logger.py +59 -0
- levelapp/aspects/monitor.py +617 -0
- levelapp/aspects/sanitizer.py +168 -0
- levelapp/clients/__init__.py +122 -0
- levelapp/clients/anthropic.py +112 -0
- levelapp/clients/gemini.py +130 -0
- levelapp/clients/groq.py +101 -0
- levelapp/clients/huggingface.py +162 -0
- levelapp/clients/ionos.py +126 -0
- levelapp/clients/mistral.py +106 -0
- levelapp/clients/openai.py +116 -0
- levelapp/comparator/__init__.py +5 -0
- levelapp/comparator/comparator.py +232 -0
- levelapp/comparator/extractor.py +108 -0
- levelapp/comparator/schemas.py +61 -0
- levelapp/comparator/scorer.py +269 -0
- levelapp/comparator/utils.py +136 -0
- levelapp/config/__init__.py +5 -0
- levelapp/config/endpoint.py +199 -0
- levelapp/config/prompts.py +57 -0
- levelapp/core/__init__.py +0 -0
- levelapp/core/base.py +386 -0
- levelapp/core/schemas.py +24 -0
- levelapp/core/session.py +336 -0
- levelapp/endpoint/__init__.py +0 -0
- levelapp/endpoint/client.py +188 -0
- levelapp/endpoint/client_test.py +41 -0
- levelapp/endpoint/manager.py +114 -0
- levelapp/endpoint/parsers.py +119 -0
- levelapp/endpoint/schemas.py +38 -0
- levelapp/endpoint/tester.py +52 -0
- levelapp/evaluator/__init__.py +3 -0
- levelapp/evaluator/evaluator.py +307 -0
- levelapp/metrics/__init__.py +63 -0
- levelapp/metrics/embedding.py +56 -0
- levelapp/metrics/embeddings/__init__.py +0 -0
- levelapp/metrics/embeddings/sentence_transformer.py +30 -0
- levelapp/metrics/embeddings/torch_based.py +56 -0
- levelapp/metrics/exact.py +182 -0
- levelapp/metrics/fuzzy.py +80 -0
- levelapp/metrics/token.py +103 -0
- levelapp/plugins/__init__.py +0 -0
- levelapp/repository/__init__.py +3 -0
- levelapp/repository/filesystem.py +203 -0
- levelapp/repository/firestore.py +291 -0
- levelapp/simulator/__init__.py +3 -0
- levelapp/simulator/schemas.py +116 -0
- levelapp/simulator/simulator.py +531 -0
- levelapp/simulator/utils.py +134 -0
- levelapp/visualization/__init__.py +7 -0
- levelapp/visualization/charts.py +358 -0
- levelapp/visualization/dashboard.py +240 -0
- levelapp/visualization/exporter.py +167 -0
- levelapp/visualization/templates/base.html +158 -0
- levelapp/visualization/templates/comparator_dashboard.html +57 -0
- levelapp/visualization/templates/simulator_dashboard.html +111 -0
- levelapp/workflow/__init__.py +6 -0
- levelapp/workflow/base.py +192 -0
- levelapp/workflow/config.py +96 -0
- levelapp/workflow/context.py +64 -0
- levelapp/workflow/factory.py +42 -0
- levelapp/workflow/registration.py +6 -0
- levelapp/workflow/runtime.py +19 -0
- levelapp-0.1.15.dist-info/METADATA +571 -0
- levelapp-0.1.15.dist-info/RECORD +70 -0
- levelapp-0.1.15.dist-info/WHEEL +4 -0
- levelapp-0.1.15.dist-info/licenses/LICENSE +0 -0
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""levelapp/visualization/exporter.py: Export utilities for visualization results."""
|
|
2
|
+
|
|
3
|
+
import zipfile
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Dict, List, Optional
|
|
6
|
+
import plotly.graph_objects as go
|
|
7
|
+
|
|
8
|
+
from levelapp.simulator.schemas import SimulationResults
|
|
9
|
+
from levelapp.visualization.dashboard import DashboardGenerator
|
|
10
|
+
from levelapp.aspects import logger
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ResultsExporter:
    """Export evaluation results and visualizations to multiple formats."""

    def __init__(self, output_dir: str):
        """
        Initialize ResultsExporter.

        Args:
            output_dir: Directory to save exported files. The directory and
                any missing parents are created if they do not exist.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def export_chart(
        self,
        figure: go.Figure,
        filename: str,
        formats: Optional[List[str]] = None,
    ) -> Dict[str, str]:
        """
        Export a single chart to multiple formats.

        Export is best-effort: a failure for one format is logged and the
        remaining formats are still attempted, so the returned mapping only
        contains the formats that were written successfully.

        Args:
            figure: Plotly Figure object
            filename: Base filename (without extension)
            formats: List of formats to export (html, png, pdf, json).
                Defaults to ``["html", "png"]`` when omitted.

        Returns:
            Dictionary mapping format to the absolute file path
        """
        if formats is None:
            # Resolved per call so no list object is shared between calls.
            formats = ["html", "png"]

        logger.info(
            f"[ResultsExporter] Exporting chart '{filename}' to formats: {formats}"
        )

        exported_files: Dict[str, str] = {}

        for fmt in formats:
            output_path = self.output_dir / f"{filename}.{fmt}"

            try:
                if fmt == "html":
                    figure.write_html(
                        str(output_path),
                        include_plotlyjs="cdn",
                        config={"responsive": True},
                    )
                elif fmt == "png":
                    figure.write_image(str(output_path), width=1200, height=800)
                elif fmt == "pdf":
                    figure.write_image(str(output_path))
                elif fmt == "json":
                    figure.write_json(str(output_path))
                else:
                    logger.warning(f"Unsupported format: {fmt}")
                    continue

                exported_files[fmt] = str(output_path.absolute())
                logger.info(f"[ResultsExporter] Exported to: {output_path}")

            except Exception as e:
                # Deliberate best-effort: one failing format (e.g. a missing
                # static-image backend) must not abort the other exports.
                logger.error(
                    f"[ResultsExporter] Failed to export {filename} as {fmt}: {e}"
                )

        return exported_files

    def export_dashboard(
        self,
        results: SimulationResults,
        formats: Optional[List[str]] = None,
    ) -> Dict[str, str]:
        """
        Export a complete dashboard with all visualizations.

        Args:
            results: SimulationResults object
            formats: List of formats to export. ``"html"`` produces the
                dashboard page; ``"png"`` / ``"pdf"`` export the individual
                charts. Defaults to ``["html"]`` when omitted.

        Returns:
            Dictionary of exported file paths. The dashboard is stored under
            ``"html"``; each successfully exported chart is stored under
            ``"<chart_name>.<format>"`` (e.g. ``"score_trend.png"``).
        """
        if formats is None:
            formats = ["html"]

        logger.info(f"[ResultsExporter] Exporting dashboard to formats: {formats}")

        exported_files: Dict[str, str] = {}
        dashboard_gen = DashboardGenerator()

        # Generate HTML dashboard
        if "html" in formats:
            html_path = self.output_dir / "dashboard.html"
            dashboard_path = dashboard_gen.generate_simulator_dashboard(
                results=results,
                output_path=str(html_path),
                title="Evaluation Dashboard",
            )
            exported_files["html"] = dashboard_path

        # Export individual charts if PNG/PDF requested
        chart_formats = [fmt for fmt in formats if fmt in ("png", "pdf")]
        if chart_formats:
            # Imported lazily so chart generation is only loaded when needed.
            from levelapp.visualization.charts import ChartGenerator

            chart_gen = ChartGenerator()

            charts_to_export = {
                "score_trend": chart_gen.create_score_trend(results),
                "provider_comparison": chart_gen.create_provider_comparison(results),
            }

            for chart_name, figure in charts_to_export.items():
                chart_files = self.export_chart(figure, chart_name, chart_formats)
                # Record the chart paths as well; previously they were dropped,
                # so callers could not locate (or archive) the image files.
                for fmt, path in chart_files.items():
                    exported_files[f"{chart_name}.{fmt}"] = path

        return exported_files

    def create_archive(
        self, files: List[str], archive_name: str = "evaluation_results.zip"
    ) -> str:
        """
        Create a ZIP archive of exported files.

        Files are stored flat, under their base name only. Paths that do not
        exist are skipped with a warning instead of aborting the archive.

        Args:
            files: List of file paths to include
            archive_name: Name of the archive file

        Returns:
            Absolute path to the created archive
        """
        logger.info(f"[ResultsExporter] Creating archive: {archive_name}")

        archive_path = self.output_dir / archive_name

        with zipfile.ZipFile(archive_path, "w", zipfile.ZIP_DEFLATED) as zipf:
            for raw_path in files:
                member = Path(raw_path)
                if not member.exists():
                    logger.warning(
                        f"[ResultsExporter] Skipping missing file: {member}"
                    )
                    continue

                zipf.write(member, member.name)
                logger.info(f"[ResultsExporter] Added to archive: {member.name}")

        logger.info(f"[ResultsExporter] Archive created: {archive_path}")
        return str(archive_path.absolute())

    def export_results_json(
        self, results: SimulationResults, filename: str = "results.json"
    ) -> str:
        """
        Export raw results to JSON file.

        Args:
            results: SimulationResults object
            filename: Output filename

        Returns:
            Absolute path to the JSON file
        """
        logger.info(f"[ResultsExporter] Exporting results to JSON: {filename}")

        output_path = self.output_dir / filename
        output_path.write_text(results.model_dump_json(indent=2), encoding="utf-8")

        logger.info(f"[ResultsExporter] JSON exported to: {output_path}")
        return str(output_path.absolute())
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
<!DOCTYPE html>
|
|
2
|
+
<html lang="en">
|
|
3
|
+
<head>
|
|
4
|
+
<meta charset="UTF-8">
|
|
5
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
6
|
+
<title>{{ title }}</title>
|
|
7
|
+
<style>
|
|
8
|
+
* {
|
|
9
|
+
margin: 0;
|
|
10
|
+
padding: 0;
|
|
11
|
+
box-sizing: border-box;
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
body {
|
|
15
|
+
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
|
|
16
|
+
background: #f5f7fa;
|
|
17
|
+
color: #333;
|
|
18
|
+
line-height: 1.6;
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
.container {
|
|
22
|
+
max-width: 1400px;
|
|
23
|
+
margin: 0 auto;
|
|
24
|
+
padding: 20px;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
header {
|
|
28
|
+
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
|
|
29
|
+
color: white;
|
|
30
|
+
padding: 30px;
|
|
31
|
+
border-radius: 10px;
|
|
32
|
+
margin-bottom: 30px;
|
|
33
|
+
box-shadow: 0 4px 6px rgba(0,0,0,0.1);
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
header h1 {
|
|
37
|
+
font-size: 2.5em;
|
|
38
|
+
margin-bottom: 10px;
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
header .subtitle {
|
|
42
|
+
opacity: 0.9;
|
|
43
|
+
font-size: 1.1em;
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
.summary-cards {
|
|
47
|
+
display: grid;
|
|
48
|
+
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
|
|
49
|
+
gap: 20px;
|
|
50
|
+
margin-bottom: 30px;
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
.card {
|
|
54
|
+
background: white;
|
|
55
|
+
padding: 25px;
|
|
56
|
+
border-radius: 10px;
|
|
57
|
+
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
|
|
58
|
+
transition: transform 0.2s;
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
.card:hover {
|
|
62
|
+
transform: translateY(-5px);
|
|
63
|
+
box-shadow: 0 4px 8px rgba(0,0,0,0.15);
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
.card h3 {
|
|
67
|
+
color: #667eea;
|
|
68
|
+
font-size: 0.9em;
|
|
69
|
+
text-transform: uppercase;
|
|
70
|
+
letter-spacing: 1px;
|
|
71
|
+
margin-bottom: 10px;
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
.card .value {
|
|
75
|
+
font-size: 2.5em;
|
|
76
|
+
font-weight: bold;
|
|
77
|
+
color: #333;
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
.card .label {
|
|
81
|
+
color: #666;
|
|
82
|
+
font-size: 0.9em;
|
|
83
|
+
margin-top: 5px;
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
.chart-section {
|
|
87
|
+
background: white;
|
|
88
|
+
padding: 30px;
|
|
89
|
+
border-radius: 10px;
|
|
90
|
+
margin-bottom: 30px;
|
|
91
|
+
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
.chart-section h2 {
|
|
95
|
+
color: #333;
|
|
96
|
+
margin-bottom: 20px;
|
|
97
|
+
padding-bottom: 10px;
|
|
98
|
+
border-bottom: 2px solid #667eea;
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
.chart-grid {
|
|
102
|
+
display: grid;
|
|
103
|
+
grid-template-columns: repeat(auto-fit, minmax(500px, 1fr));
|
|
104
|
+
gap: 30px;
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
footer {
|
|
108
|
+
text-align: center;
|
|
109
|
+
padding: 20px;
|
|
110
|
+
color: #666;
|
|
111
|
+
font-size: 0.9em;
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
.badge {
|
|
115
|
+
display: inline-block;
|
|
116
|
+
padding: 4px 12px;
|
|
117
|
+
border-radius: 20px;
|
|
118
|
+
font-size: 0.85em;
|
|
119
|
+
font-weight: 600;
|
|
120
|
+
margin: 2px;
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
.badge-success {
|
|
124
|
+
background: #d4edda;
|
|
125
|
+
color: #155724;
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
.badge-info {
|
|
129
|
+
background: #d1ecf1;
|
|
130
|
+
color: #0c5460;
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
@media (max-width: 768px) {
|
|
134
|
+
.chart-grid {
|
|
135
|
+
grid-template-columns: 1fr;
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
header h1 {
|
|
139
|
+
font-size: 1.8em;
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
</style>
|
|
143
|
+
</head>
|
|
144
|
+
<body>
|
|
145
|
+
<div class="container">
|
|
146
|
+
<header>
|
|
147
|
+
<h1>{{ title }}</h1>
|
|
148
|
+
<div class="subtitle">Generated at: {{ generated_at }}</div>
|
|
149
|
+
</header>
|
|
150
|
+
|
|
151
|
+
{% block content %}{% endblock %}
|
|
152
|
+
|
|
153
|
+
<footer>
|
|
154
|
+
<p>Generated by LevelApp Evaluation Framework</p>
|
|
155
|
+
</footer>
|
|
156
|
+
</div>
|
|
157
|
+
</body>
|
|
158
|
+
</html>
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
{% extends "base.html" %}
|
|
2
|
+
|
|
3
|
+
{% block content %}
{# Comparator dashboard body: heatmap, per-field results table, raw JSON dump. #}
<!-- Metadata Heatmap -->
<div class="chart-section">
    <h2>Metadata Field Accuracy</h2>
    {{ chart_html|safe }}
</div>

<!-- Results Table -->
<div class="chart-section">
    <h2>Detailed Comparison Results</h2>
    <div style="overflow-x: auto;">
        <table style="width: 100%; border-collapse: collapse;">
            <thead>
                <tr style="background: #667eea; color: white;">
                    <th style="padding: 12px; text-align: left;">Field Name</th>
                    <th style="padding: 12px; text-align: left;">Entity Metric</th>
                    <th style="padding: 12px; text-align: left;">Set Metric</th>
                    <th style="padding: 12px; text-align: center;">Score</th>
                    <th style="padding: 12px; text-align: center;">Threshold</th>
                </tr>
            </thead>
            <tbody>
                {% for idx, result in results.items() %}
                <tr style="border-bottom: 1px solid #ddd;">
                    <td style="padding: 12px;">{{ result.field_name }}</td>
                    <td style="padding: 12px;">{{ result.entity_metric }}</td>
                    <td style="padding: 12px;">{{ result.set_metric }}</td>
                    <td style="padding: 12px; text-align: center;">
                        {# Take the first entry of a non-empty sequence, otherwise the value itself. #}
                        {% set raw_score = result.set_scores[0] if result.set_scores is iterable and
                        result.set_scores|length > 0 else result.set_scores %}
                        {# Normalise once so the badge comparison and the formatting use the same
                           numeric value; a missing or empty score is shown as 0 instead of failing
                           the render. #}
                        {% set score = raw_score if raw_score is number else 0 %}
                        <span
                            class="badge {% if result.threshold is number and score >= result.threshold %}badge-success{% else %}badge-info{% endif %}">
                            {{ "%.3f"|format(score) }}
                        </span>
                    </td>
                    <td style="padding: 12px; text-align: center;">{{ result.threshold }}</td>
                </tr>
                {% endfor %}
            </tbody>
        </table>
    </div>
</div>

<!-- Raw Data -->
<div class="chart-section">
    <h2>Raw Comparison Data</h2>
    <details>
        <summary style="cursor: pointer; padding: 10px; background: #f8f9fa; border-radius: 5px;">
            Click to view raw JSON data
        </summary>
        <pre
            style="background: #f8f9fa; padding: 20px; border-radius: 5px; overflow-x: auto; margin-top: 10px;">{{ results|tojson(indent=2) }}</pre>
    </details>
</div>
{% endblock %}
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
{% extends "base.html" %}
|
|
2
|
+
|
|
3
|
+
{% block content %}
{# Simulator dashboard body: summary cards, charts, evaluation verdicts, raw JSON dump. #}
{# Entries that are not real providers; filtered once so the count and the badges agree. #}
{% set provider_names = summary_stats.providers|reject('in', ['processing_time', 'guardrail', 'metadata'])|list %}
<!-- Summary Statistics -->
<div class="summary-cards">
    <div class="card">
        <h3>Total Scripts</h3>
        <div class="value">{{ summary_stats.total_scripts }}</div>
        <div class="label">Evaluated</div>
    </div>

    <div class="card">
        <h3>Overall Average</h3>
        <div class="value">{{ "%.3f"|format(summary_stats.overall_average) }}</div>
        <div class="label">Across all providers</div>
    </div>

    <div class="card">
        <h3>Total Time</h3>
        <div class="value">{{ "%.2f"|format(summary_stats.total_time) }}</div>
        <div class="label">Seconds</div>
    </div>

    <div class="card">
        <h3>Providers</h3>
        <div class="value">{{ provider_names|length }}</div>
        <div class="label">
            {% for provider in provider_names %}
            <span class="badge badge-info">{{ provider|upper }}</span>
            {% endfor %}
        </div>
    </div>
</div>

<!-- Summary Metrics Chart -->
{% if charts.summary_metrics %}
<div class="chart-section">
    <h2>Key Metrics</h2>
    {{ charts.summary_metrics|safe }}
</div>
{% endif %}

<!-- Score Trends -->
{% if charts.score_trend %}
<div class="chart-section">
    <h2>Score Trends</h2>
    {{ charts.score_trend|safe }}
</div>
{% endif %}

<!-- Provider Comparison -->
{% if charts.provider_comparison %}
<div class="chart-section">
    <h2>Provider Comparison</h2>
    {{ charts.provider_comparison|safe }}
</div>
{% endif %}

<!-- Distribution Charts -->
<div class="chart-section">
    <h2>Score Distributions</h2>
    <div class="chart-grid">
        {# Every chart whose key starts with "distribution_" is rendered into the grid. #}
        {% for key, chart in charts.items() %}
        {% if key.startswith('distribution_') %}
        {{ chart|safe }}
        {% endif %}
        {% endfor %}
    </div>
</div>

<!-- Timeline -->
{% if charts.timeline %}
<div class="chart-section">
    <h2>Processing Timeline</h2>
    {{ charts.timeline|safe }}
</div>
{% endif %}

<!-- Evaluation Summary -->
{% if has_evaluation_summary and results.evaluation_summary %}
<div class="chart-section">
    <h2>Evaluation Summary</h2>
    {% for provider, verdicts in results.evaluation_summary.items() %}
    <div style="margin-bottom: 20px;">
        <h3 style="color: #667eea;">{{ provider|upper }}</h3>
        <ul style="list-style: none; padding-left: 0;">
            {% for verdict in verdicts %}
            <li style="padding: 8px; background: #f8f9fa; margin: 5px 0; border-left: 3px solid #667eea;">
                {{ verdict }}
            </li>
            {% endfor %}
        </ul>
    </div>
    {% endfor %}
</div>
{% endif %}

<!-- Raw Data Section -->
<div class="chart-section">
    <h2>Detailed Results</h2>
    <details>
        <summary style="cursor: pointer; padding: 10px; background: #f8f9fa; border-radius: 5px;">
            Click to view raw JSON data
        </summary>
        <pre
            style="background: #f8f9fa; padding: 20px; border-radius: 5px; overflow-x: auto; margin-top: 10px;">{{ results.model_dump_json(indent=2) }}</pre>
    </details>
</div>
{% endblock %}
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from pydantic import ValidationError
|
|
5
|
+
from functools import partial
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Dict
|
|
8
|
+
|
|
9
|
+
from levelapp.core.base import BaseProcess
|
|
10
|
+
from levelapp.endpoint.client import EndpointConfig
|
|
11
|
+
from levelapp.endpoint.manager import EndpointConfigManager
|
|
12
|
+
from levelapp.simulator.schemas import ScriptsBatch
|
|
13
|
+
from levelapp.simulator.simulator import ConversationSimulator
|
|
14
|
+
from levelapp.workflow.runtime import WorkflowContext
|
|
15
|
+
from levelapp.aspects.loader import DataLoader
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class BaseWorkflow(ABC):
    """Abstract base class for evaluation workflows.

    Lifecycle: ``setup()`` builds the process, ``load_data()`` loads the
    input data, ``execute()`` / ``aexecute()`` run the process with that data
    as keyword arguments, and ``collect_results()`` returns what it produced.
    """

    def __init__(self, name: str, context: WorkflowContext) -> None:
        """
        Store the workflow name and context; nothing is built yet.

        Args:
            name (str): Name used as a prefix in error messages.
            context (WorkflowContext): The workflow context.
        """
        self.name = name
        self.context = context
        self.process: BaseProcess | None = None
        self._input_data: Any | None = None
        self._results: Any | None = None
        self._initialized: bool = False

    def setup(self) -> None:
        """Validate and initialize workflow-specific settings.

        Calling it again after a successful setup is a no-op.
        """
        if self._initialized:
            return

        self.process = self._setup_process(context=self.context)
        self._initialized = True

    def load_data(self) -> None:
        """Load and preprocess input data.

        Raises:
            RuntimeError: If ``setup()`` has not been called yet.
        """
        if not self._initialized:
            raise RuntimeError(f"[{self.name}] Workflow not initialized. Call setup() first.")
        self._input_data = self._load_input_data(context=self.context)

    def execute(self) -> None:
        """Run the workflow evaluation steps synchronously.

        A coroutine ``process.run`` is driven with ``asyncio.run``, which
        cannot be used from a thread that already has a running event loop;
        use ``aexecute()`` in that situation.

        Raises:
            RuntimeError: If no input data has been loaded.
        """
        if not self._input_data:
            raise RuntimeError(f"[{self.name}] No reference data available.")

        if asyncio.iscoroutinefunction(self.process.run):
            self._results = asyncio.run(self.process.run(**self._input_data))
        else:
            self._results = self.process.run(**self._input_data)

    async def aexecute(self) -> None:
        """Run the workflow evaluation steps from within an event loop.

        A coroutine ``process.run`` is awaited directly; a plain function is
        run in the loop's default executor so it does not block the loop.

        Raises:
            RuntimeError: If no input data has been loaded.
        """
        if not self._input_data:
            raise RuntimeError(f"[{self.name}] No reference data available.")

        if asyncio.iscoroutinefunction(self.process.run):
            self._results = await self.process.run(**self._input_data)
        else:
            loop = asyncio.get_running_loop()
            # run_in_executor only forwards positional arguments, so the
            # keyword arguments are bound with partial. No extra positional
            # argument is passed: everything after the callable is handed to
            # it, and a stray None would become run()'s first argument.
            func = partial(self.process.run, **self._input_data)
            self._results = await loop.run_in_executor(None, func)

    def collect_results(self) -> Any:
        """
        Return unified results structure.

        Returns:
            The results stored by the last execute()/aexecute() call, or
            None if the workflow has not been executed.
        """
        return self._results

    @abstractmethod
    async def test_connection(self, context: Dict[str, Any]) -> Dict[str, Any]:
        """
        Abstract method for testing endpoint connection.

        Args:
            context (Dict[str, Any]): The context (request payload) to test connectivity with.

        Returns:
            The test connectivity result.
        """

    @abstractmethod
    def _setup_process(self, context: WorkflowContext) -> BaseProcess:
        """
        Abstract method for setting up the configured process.

        Args:
            context (WorkflowContext): The workflow context.

        Returns:
            The process that execute()/aexecute() will run.
        """
        raise NotImplementedError

    @abstractmethod
    def _load_input_data(self, context: WorkflowContext) -> Any:
        """
        Abstract method for loading reference data.

        Args:
            context (WorkflowContext): The workflow context.

        Returns:
            The input data; it is unpacked as keyword arguments for
            ``process.run``.
        """
        raise NotImplementedError
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class SimulatorWorkflow(BaseWorkflow):
    """Workflow that runs the conversation simulator against an endpoint."""

    def __init__(self, context: WorkflowContext) -> None:
        """
        Create the workflow under the name "ConversationSimulator".

        Args:
            context (WorkflowContext): The workflow context for the simulation workflow.
        """
        super().__init__(name="ConversationSimulator", context=context)

    def _setup_process(self, context: WorkflowContext) -> BaseProcess:
        """
        Concrete implementation for setting up the simulation workflow.

        Args:
            context (WorkflowContext): The workflow context for the simulation workflow.

        Returns:
            ConversationSimulator instance.
        """
        simulator = ConversationSimulator()
        simulator.setup(
            endpoint_config=context.endpoint,
            evaluators=context.evaluators,
            providers=context.providers,
        )

        return simulator

    async def test_connection(self, context: Dict[str, Any]) -> Dict[str, Any]:
        """
        Runs a connectivity test of the configured endpoint.

        Args:
            context (Dict[str, Any]): The request payload to send for testing.

        Returns:
            The test connectivity result.
        """
        endpoint_cm = EndpointConfigManager()
        endpoint_cm.set_endpoints(endpoints_config=[self.context.endpoint])
        tester = endpoint_cm.get_tester(endpoint_name=self.context.endpoint.name)
        results = await tester.test(context=context)

        return results

    def _load_input_data(self, context: WorkflowContext) -> Dict[str, Any]:
        """
        Concrete implementation for loading the reference data.

        Inline data under ``context.inputs["reference_data"]`` takes
        precedence; otherwise the file named by
        ``context.inputs["reference_data_path"]`` is loaded.

        Args:
            context (WorkflowContext): The workflow context for the simulation workflow.

        Returns:
            Dict[str, Any]: The validated scripts batch under ``test_batch``
            together with the ``attempts`` and ``batch_size`` settings.

        Raises:
            RuntimeError: If the configured path is empty or the data fails
                ScriptsBatch validation.
            FileNotFoundError: If the reference data file does not exist
                (this includes the case where no path key was provided).
        """
        loader = DataLoader()
        if "reference_data" in context.inputs:
            data_config = context.inputs["reference_data"]
        else:
            # NOTE: a missing key falls back to a placeholder path, so it
            # surfaces below as FileNotFoundError; only an explicitly empty
            # value reaches the RuntimeError branch.
            reference_data_path = context.inputs.get("reference_data_path", "no-path-provided")

            if not reference_data_path:
                raise RuntimeError(f"[{self.name}] No reference data available.")

            file_path = Path(reference_data_path)

            if not file_path.exists():
                # Name the path so the failure can be diagnosed from the message.
                raise FileNotFoundError(
                    f"[{self.name}] Reference data file not found: {file_path}"
                )

            data_config = loader.load_raw_data(path=reference_data_path)

        try:
            scripts_batch = ScriptsBatch.model_validate(data_config)

        except ValidationError as e:
            # Chain explicitly so the pydantic error is kept as the cause.
            raise RuntimeError(f"[{self.name}] Validation error: {e}") from e

        evaluation_params = context.config.process.evaluation_params
        attempts = evaluation_params.get("attempts", 1)
        batch_size = evaluation_params.get("batch_size", 5)

        return {"test_batch": scripts_batch, "attempts": attempts, "batch_size": batch_size}
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
class ComparatorWorkflow(BaseWorkflow):
    """Placeholder workflow for the metadata comparator (not implemented yet).

    Every hook raises NotImplementedError. All abstract methods of the base
    class are overridden so the class can be instantiated and fails at the
    unimplemented step rather than at construction time.
    """

    def __init__(self, context: WorkflowContext) -> None:
        """
        Create the workflow under the name "MetadataComparator".

        Args:
            context (WorkflowContext): The workflow context.
        """
        super().__init__(name="MetadataComparator", context=context)

    async def test_connection(self, context: Dict[str, Any]) -> Dict[str, Any]:
        """
        Connectivity test placeholder.

        Args:
            context (Dict[str, Any]): The request payload to send for testing.

        Raises:
            NotImplementedError: Always; not implemented for this workflow.
        """
        raise NotImplementedError

    def _setup_process(self, context: WorkflowContext) -> BaseProcess:
        """
        Process setup placeholder.

        Args:
            context (WorkflowContext): The workflow context.

        Raises:
            NotImplementedError: Always; not implemented for this workflow.
        """
        raise NotImplementedError

    def _load_input_data(self, context: WorkflowContext) -> Any:
        """
        Input loading placeholder.

        Args:
            context (WorkflowContext): The workflow context.

        Raises:
            NotImplementedError: Always; not implemented for this workflow.
        """
        raise NotImplementedError
|