eval-ai-library 0.3.2__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of eval-ai-library might be problematic.

eval_lib/dashboard_server.py ADDED
@@ -0,0 +1,172 @@
+# eval_lib/dashboard_server.py
+
+import json
+from pathlib import Path
+from typing import List, Dict, Any, Optional
+from datetime import datetime
+
+
+class DashboardCache:
+    """Cache to store evaluation results for the dashboard"""
+
+    def __init__(self, cache_dir: str = ".eval_cache"):
+        self.cache_dir = Path(cache_dir)
+        self.cache_dir.mkdir(exist_ok=True)
+        self.cache_file = self.cache_dir / "results.json"
+        self.results_history = []
+        self._load_cache()
+
+    def _load_cache(self):
+        """Load cache from file"""
+        if self.cache_file.exists():
+            try:
+                with open(self.cache_file, 'r', encoding='utf-8') as f:
+                    self.results_history = json.load(f)
+            except Exception as e:
+                print(f"Warning: Could not load cache: {e}")
+                self.results_history = []
+
+    def _save_cache(self):
+        """Save cache to file"""
+        try:
+            with open(self.cache_file, 'w', encoding='utf-8') as f:
+                json.dump(self.results_history, f,
+                          indent=2, ensure_ascii=False)
+        except Exception as e:
+            print(f"Warning: Could not save cache: {e}")
+
+    def add_results(self, results: List[tuple], session_name: Optional[str] = None) -> str:
+        """Add new results to the cache"""
+        import time
+        session_id = session_name or f"session_{int(time.time())}"
+        parsed_data = self._parse_results(results)
+
+        session_data = {
+            'session_id': session_id,
+            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+            'data': parsed_data
+        }
+
+        self.results_history.append(session_data)
+        self._save_cache()
+
+        return session_id
+
+    def get_latest(self) -> Optional[Dict[str, Any]]:
+        """Get latest results"""
+        if self.results_history:
+            return self.results_history[-1]
+        return None
+
+    def get_all(self) -> List[Dict[str, Any]]:
+        """Get all results"""
+        return self.results_history
+
+    def get_by_session(self, session_id: str) -> Optional[Dict[str, Any]]:
+        """Get results by session_id"""
+        for session in self.results_history:
+            if session['session_id'] == session_id:
+                return session
+        return None
+
+    def clear(self):
+        """Clear the cache"""
+        self.results_history = []
+        self._save_cache()
+
+    def _parse_results(self, results: List[tuple]) -> Dict[str, Any]:
+        """Parse raw results into structured format for dashboard"""
+
+        test_cases = []
+        metrics_summary = {}
+        total_cost = 0.0
+
+        for test_idx, test_results in results:
+            for result in test_results:
+                test_case_data = {
+                    'test_index': test_idx,
+                    'input': result.input[:100] + '...' if len(result.input) > 100 else result.input,
+                    'input_full': result.input,
+                    'actual_output': result.actual_output[:200] if result.actual_output else '',
+                    'actual_output_full': result.actual_output,
+                    'expected_output': result.expected_output[:200] if result.expected_output else '',
+                    'expected_output_full': result.expected_output,
+                    'retrieval_context': result.retrieval_context if result.retrieval_context else [],
+                    'metrics': []
+                }
+
+                for metric_data in result.metrics_data:
+                    # Determine model name
+                    if isinstance(metric_data.evaluation_model, str):
+                        model_name = metric_data.evaluation_model
+                    else:
+                        # For CustomLLMClient
+                        try:
+                            model_name = metric_data.evaluation_model.get_model_name()
+                        except:
+                            model_name = str(
+                                type(metric_data.evaluation_model).__name__)
+
+                    test_case_data['metrics'].append({
+                        'name': metric_data.name,
+                        'score': round(metric_data.score, 3),
+                        'success': metric_data.success,
+                        'threshold': metric_data.threshold,
+                        'reason': metric_data.reason[:300] if metric_data.reason else '',
+                        'reason_full': metric_data.reason,
+                        'evaluation_model': model_name,
+                        'evaluation_cost': metric_data.evaluation_cost,
+                        'evaluation_log': metric_data.evaluation_log
+                    })

+                    if metric_data.name not in metrics_summary:
+                        metrics_summary[metric_data.name] = {
+                            'scores': [],
+                            'passed': 0,
+                            'failed': 0,
+                            'threshold': metric_data.threshold,
+                            'total_cost': 0.0,
+                            'model': model_name
+                        }
+
+                    metrics_summary[metric_data.name]['scores'].append(
+                        metric_data.score)
+                    if metric_data.success:
+                        metrics_summary[metric_data.name]['passed'] += 1
+                    else:
+                        metrics_summary[metric_data.name]['failed'] += 1
+
+                    if metric_data.evaluation_cost:
+                        total_cost += metric_data.evaluation_cost
+                        metrics_summary[metric_data.name]['total_cost'] += metric_data.evaluation_cost
+
+                test_cases.append(test_case_data)
+
+        for metric_name, data in metrics_summary.items():
+            data['avg_score'] = sum(data['scores']) / \
+                len(data['scores']) if data['scores'] else 0
+            data['success_rate'] = (data['passed'] / (data['passed'] + data['failed'])
+                                    * 100) if (data['passed'] + data['failed']) > 0 else 0
+
+        return {
+            'test_cases': test_cases,
+            'metrics_summary': metrics_summary,
+            'total_cost': total_cost,
+            'total_tests': len(test_cases)
+        }
+
+
+def save_results_to_cache(results: List[tuple], session_name: Optional[str] = None) -> str:
+    """
+    Save evaluation results to cache for dashboard viewing.
+    Cache is always saved to .eval_cache/ in current directory.
+
+    Args:
+        results: Evaluation results from evaluate()
+        session_name: Optional name for the session
+
+    Returns:
+        Session ID
+    """
+    cache = DashboardCache()
+    return cache.add_results(results, session_name)
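
For orientation, a small usage sketch of the new module above. It only calls names defined in this file (DashboardCache, save_results_to_cache); the SimpleNamespace objects are illustrative stand-ins for the library's result/metric-data objects, whose real constructors are not part of this diff, and the field values are invented for the example.

    # Illustrative stand-ins for the library's result/metric objects (not the real classes)
    from types import SimpleNamespace

    from eval_lib.dashboard_server import DashboardCache, save_results_to_cache

    metric = SimpleNamespace(
        name="AnswerRelevancy", score=0.92, success=True, threshold=0.7,
        reason="On-topic answer", evaluation_model="gpt-4o-mini",
        evaluation_cost=0.0004, evaluation_log={},
    )
    result = SimpleNamespace(
        input="What is the capital of France?",
        actual_output="Paris.", expected_output="Paris",
        retrieval_context=[], metrics_data=[metric],
    )

    # results follow the (test_idx, [result, ...]) shape that _parse_results iterates over
    session_id = save_results_to_cache([(0, [result])], session_name="smoke-test")

    cache = DashboardCache()   # re-reads .eval_cache/results.json on construction
    latest = cache.get_latest()
    print(session_id, latest["data"]["total_tests"])  # -> smoke-test 1

Each cached session is a dict with 'session_id', 'timestamp', and a 'data' payload holding 'test_cases', 'metrics_summary', 'total_cost', and 'total_tests', exactly as built by _parse_results.
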
eval_lib/evaluate.py CHANGED
@@ -68,7 +68,9 @@ def _print_summary(results: List, total_cost: float, total_time: float, passed:
 async def evaluate(
     test_cases: List[EvalTestCase],
     metrics: List[MetricPattern],
-    verbose: bool = True
+    verbose: bool = True,
+    show_dashboard: bool = False,
+    session_name: str = None,
 ) -> List[Tuple[None, List[TestCaseResult]]]:
     """
     Evaluate test cases with multiple metrics.
@@ -77,6 +79,10 @@ async def evaluate(
         test_cases: List of test cases to evaluate
         metrics: List of metrics to apply
         verbose: Enable detailed logging (default: True)
+        show_dashboard: Launch interactive web dashboard (default: False)
+        dashboard_port: Port for dashboard server (default: 14500)
+        session_name: Name for this evaluation session
+        cache_dir: Directory to store cache (default: .eval_cache)
 
     Returns:
         List of evaluation results
@@ -183,6 +189,23 @@ async def evaluate(
     _print_summary(results, total_cost, total_time,
                    total_passed, total_tests)
 
+    if show_dashboard:
+        from eval_lib.dashboard_server import save_results_to_cache
+
+        session_id = save_results_to_cache(results, session_name)
+
+        if verbose:
+            print(f"\n{Colors.BOLD}{Colors.GREEN}{'='*70}{Colors.ENDC}")
+            print(f"{Colors.BOLD}{Colors.GREEN}📊 DASHBOARD{Colors.ENDC}")
+            print(f"{Colors.BOLD}{Colors.GREEN}{'='*70}{Colors.ENDC}")
+            print(
+                f"\n✅ Results saved to cache: {Colors.CYAN}{session_id}{Colors.ENDC}")
+            print(f"\n💡 To view results, run:")
+            print(f" {Colors.YELLOW}eval-lib dashboard{Colors.ENDC}")
+            print(
+                f"\n Then open: {Colors.CYAN}http://localhost:14500{Colors.ENDC}")
+            print(f"\n{Colors.BOLD}{Colors.GREEN}{'='*70}{Colors.ENDC}\n")
+
     return results
 
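
Taken together with the new module, this change wires the cache in behind a flag. A hedged sketch of a call site follows; the test_cases/metrics objects are assumed to be built elsewhere (their constructors are not shown in this diff), and the import paths simply mirror the module names visible above (eval_lib.evaluate, eval_lib.dashboard_server).

    import asyncio

    from eval_lib.evaluate import evaluate
    from eval_lib.dashboard_server import save_results_to_cache

    async def run(test_cases, metrics):
        # New keyword arguments in this version range: show_dashboard and session_name.
        results = await evaluate(
            test_cases=test_cases,
            metrics=metrics,
            verbose=True,
            show_dashboard=True,        # caches results for the `eval-lib dashboard` CLI
            session_name="nightly",     # label stored as the cache session_id
        )
        # Equivalent manual path, as wired up in the hunk above:
        # save_results_to_cache(results, session_name="nightly")
        return results

    # asyncio.run(run(my_test_cases, my_metrics))  # inputs assumed to exist

Note that the added docstring also mentions dashboard_port and cache_dir, but neither appears in the new signature; the port in the printed hint (14500) and the cache directory (.eval_cache) are fixed defaults in this release.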