eval-ai-library 0.3.2__py3-none-any.whl → 0.3.10__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release.
This version of eval-ai-library might be problematic.
- {eval_ai_library-0.3.2.dist-info → eval_ai_library-0.3.10.dist-info}/METADATA +379 -1
- {eval_ai_library-0.3.2.dist-info → eval_ai_library-0.3.10.dist-info}/RECORD +12 -8
- eval_ai_library-0.3.10.dist-info/entry_points.txt +2 -0
- eval_lib/__init__.py +11 -1
- eval_lib/cli.py +166 -0
- eval_lib/dashboard_server.py +172 -0
- eval_lib/evaluate.py +24 -1
- eval_lib/html.py +736 -0
- eval_lib/llm_client.py +47 -1
- {eval_ai_library-0.3.2.dist-info → eval_ai_library-0.3.10.dist-info}/WHEEL +0 -0
- {eval_ai_library-0.3.2.dist-info → eval_ai_library-0.3.10.dist-info}/licenses/LICENSE +0 -0
- {eval_ai_library-0.3.2.dist-info → eval_ai_library-0.3.10.dist-info}/top_level.txt +0 -0
eval_lib/dashboard_server.py
ADDED

@@ -0,0 +1,172 @@
+# eval_lib/dashboard_server.py
+
+import json
+from pathlib import Path
+from typing import List, Dict, Any, Optional
+from datetime import datetime
+
+
+class DashboardCache:
+    """Cache to store evaluation results for the dashboard"""
+
+    def __init__(self, cache_dir: str = ".eval_cache"):
+        self.cache_dir = Path(cache_dir)
+        self.cache_dir.mkdir(exist_ok=True)
+        self.cache_file = self.cache_dir / "results.json"
+        self.results_history = []
+        self._load_cache()
+
+    def _load_cache(self):
+        """Load cache from file"""
+        if self.cache_file.exists():
+            try:
+                with open(self.cache_file, 'r', encoding='utf-8') as f:
+                    self.results_history = json.load(f)
+            except Exception as e:
+                print(f"Warning: Could not load cache: {e}")
+                self.results_history = []
+
+    def _save_cache(self):
+        """Save cache to file"""
+        try:
+            with open(self.cache_file, 'w', encoding='utf-8') as f:
+                json.dump(self.results_history, f,
+                          indent=2, ensure_ascii=False)
+        except Exception as e:
+            print(f"Warning: Could not save cache: {e}")
+
+    def add_results(self, results: List[tuple], session_name: Optional[str] = None) -> str:
+        """Add new results to the cache"""
+        import time
+        session_id = session_name or f"session_{int(time.time())}"
+        parsed_data = self._parse_results(results)
+
+        session_data = {
+            'session_id': session_id,
+            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+            'data': parsed_data
+        }
+
+        self.results_history.append(session_data)
+        self._save_cache()
+
+        return session_id
+
+    def get_latest(self) -> Optional[Dict[str, Any]]:
+        """Get latest results"""
+        if self.results_history:
+            return self.results_history[-1]
+        return None
+
+    def get_all(self) -> List[Dict[str, Any]]:
+        """Get all results"""
+        return self.results_history
+
+    def get_by_session(self, session_id: str) -> Optional[Dict[str, Any]]:
+        """Get results by session_id"""
+        for session in self.results_history:
+            if session['session_id'] == session_id:
+                return session
+        return None
+
+    def clear(self):
+        """Clear the cache"""
+        self.results_history = []
+        self._save_cache()
+
+    def _parse_results(self, results: List[tuple]) -> Dict[str, Any]:
+        """Parse raw results into structured format for dashboard"""
+
+        test_cases = []
+        metrics_summary = {}
+        total_cost = 0.0
+
+        for test_idx, test_results in results:
+            for result in test_results:
+                test_case_data = {
+                    'test_index': test_idx,
+                    'input': result.input[:100] + '...' if len(result.input) > 100 else result.input,
+                    'input_full': result.input,
+                    'actual_output': result.actual_output[:200] if result.actual_output else '',
+                    'actual_output_full': result.actual_output,
+                    'expected_output': result.expected_output[:200] if result.expected_output else '',
+                    'expected_output_full': result.expected_output,
+                    'retrieval_context': result.retrieval_context if result.retrieval_context else [],
+                    'metrics': []
+                }
+
+                for metric_data in result.metrics_data:
+                    # Determine model name
+                    if isinstance(metric_data.evaluation_model, str):
+                        model_name = metric_data.evaluation_model
+                    else:
+                        # For CustomLLMClient
+                        try:
+                            model_name = metric_data.evaluation_model.get_model_name()
+                        except:
+                            model_name = str(
+                                type(metric_data.evaluation_model).__name__)
+
+                    test_case_data['metrics'].append({
+                        'name': metric_data.name,
+                        'score': round(metric_data.score, 3),
+                        'success': metric_data.success,
+                        'threshold': metric_data.threshold,
+                        'reason': metric_data.reason[:300] if metric_data.reason else '',
+                        'reason_full': metric_data.reason,
+                        'evaluation_model': model_name,
+                        'evaluation_cost': metric_data.evaluation_cost,
+                        'evaluation_log': metric_data.evaluation_log
+                    })
+
+                    if metric_data.name not in metrics_summary:
+                        metrics_summary[metric_data.name] = {
+                            'scores': [],
+                            'passed': 0,
+                            'failed': 0,
+                            'threshold': metric_data.threshold,
+                            'total_cost': 0.0,
+                            'model': model_name
+                        }
+
+                    metrics_summary[metric_data.name]['scores'].append(
+                        metric_data.score)
+                    if metric_data.success:
+                        metrics_summary[metric_data.name]['passed'] += 1
+                    else:
+                        metrics_summary[metric_data.name]['failed'] += 1
+
+                    if metric_data.evaluation_cost:
+                        total_cost += metric_data.evaluation_cost
+                        metrics_summary[metric_data.name]['total_cost'] += metric_data.evaluation_cost
+
+                test_cases.append(test_case_data)
+
+        for metric_name, data in metrics_summary.items():
+            data['avg_score'] = sum(data['scores']) / \
+                len(data['scores']) if data['scores'] else 0
+            data['success_rate'] = (data['passed'] / (data['passed'] + data['failed'])
+                                    * 100) if (data['passed'] + data['failed']) > 0 else 0
+
+        return {
+            'test_cases': test_cases,
+            'metrics_summary': metrics_summary,
+            'total_cost': total_cost,
+            'total_tests': len(test_cases)
+        }
+
+
+def save_results_to_cache(results: List[tuple], session_name: Optional[str] = None) -> str:
+    """
+    Save evaluation results to cache for dashboard viewing.
+    Cache is always saved to .eval_cache/ in current directory.
+
+    Args:
+        results: Evaluation results from evaluate()
+        session_name: Optional name for the session
+
+    Returns:
+        Session ID
+    """
+    cache = DashboardCache()
+    return cache.add_results(results, session_name)
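The new module persists every evaluation run as a timestamped session in .eval_cache/results.json, and save_results_to_cache() is the thin wrapper that evaluate() calls when show_dashboard is enabled (see the evaluate.py hunks below). A minimal read-back sketch against the class as added above; it assumes it runs from a directory that already contains a populated .eval_cache/:

    from eval_lib.dashboard_server import DashboardCache

    cache = DashboardCache()        # defaults to ./.eval_cache
    latest = cache.get_latest()     # newest session dict, or None
    if latest:
        print(latest['session_id'], latest['timestamp'])
        # per-metric aggregates produced by _parse_results() above
        for name, stats in latest['data']['metrics_summary'].items():
            print(f"{name}: avg={stats['avg_score']:.3f}, "
                  f"pass rate={stats['success_rate']:.1f}%, "
                  f"cost=${stats['total_cost']:.4f}")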
eval_lib/evaluate.py
CHANGED
@@ -68,7 +68,9 @@ def _print_summary(results: List, total_cost: float, total_time: float, passed:
 async def evaluate(
     test_cases: List[EvalTestCase],
     metrics: List[MetricPattern],
-    verbose: bool = True
+    verbose: bool = True,
+    show_dashboard: bool = False,
+    session_name: str = None,
 ) -> List[Tuple[None, List[TestCaseResult]]]:
     """
     Evaluate test cases with multiple metrics.
@@ -77,6 +79,10 @@ async def evaluate(
         test_cases: List of test cases to evaluate
         metrics: List of metrics to apply
         verbose: Enable detailed logging (default: True)
+        show_dashboard: Launch interactive web dashboard (default: False)
+        dashboard_port: Port for dashboard server (default: 14500)
+        session_name: Name for this evaluation session
+        cache_dir: Directory to store cache (default: .eval_cache)
 
     Returns:
         List of evaluation results
@@ -183,6 +189,23 @@ async def evaluate(
     _print_summary(results, total_cost, total_time,
                    total_passed, total_tests)
 
+    if show_dashboard:
+        from eval_lib.dashboard_server import save_results_to_cache
+
+        session_id = save_results_to_cache(results, session_name)
+
+        if verbose:
+            print(f"\n{Colors.BOLD}{Colors.GREEN}{'='*70}{Colors.ENDC}")
+            print(f"{Colors.BOLD}{Colors.GREEN}📊 DASHBOARD{Colors.ENDC}")
+            print(f"{Colors.BOLD}{Colors.GREEN}{'='*70}{Colors.ENDC}")
+            print(
+                f"\n✅ Results saved to cache: {Colors.CYAN}{session_id}{Colors.ENDC}")
+            print(f"\n💡 To view results, run:")
+            print(f"   {Colors.YELLOW}eval-lib dashboard{Colors.ENDC}")
+            print(
+                f"\n   Then open: {Colors.CYAN}http://localhost:14500{Colors.ENDC}")
+            print(f"\n{Colors.BOLD}{Colors.GREEN}{'='*70}{Colors.ENDC}\n")
+
     return results
 
 
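Net effect of the evaluate.py change: when show_dashboard=True, evaluate() writes its results to the cache via save_results_to_cache() and prints instructions for the separate eval-lib dashboard command (served at http://localhost:14500); the dashboard server itself is not started in-process. A hedged usage sketch follows. The EvalTestCase and metric constructors are not part of this diff, so the placeholder lists below must be filled in with real objects:

    import asyncio
    from eval_lib.evaluate import evaluate

    async def main():
        test_cases = []   # populate with EvalTestCase objects
        metrics = []      # populate with metric instances (MetricPattern)
        return await evaluate(
            test_cases,
            metrics,
            verbose=True,
            show_dashboard=True,          # added in this diff: cache results for the dashboard
            session_name="nightly-run",   # optional; defaults to session_<unix timestamp>
        )

    results = asyncio.run(main())
    # then run: eval-lib dashboard   and open http://localhost:14500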