eval-ai-library 0.3.3__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of eval-ai-library might be problematic.
- {eval_ai_library-0.3.3.dist-info → eval_ai_library-0.3.11.dist-info}/METADATA +166 -1
- {eval_ai_library-0.3.3.dist-info → eval_ai_library-0.3.11.dist-info}/RECORD +11 -7
- eval_ai_library-0.3.11.dist-info/entry_points.txt +2 -0
- eval_lib/__init__.py +9 -1
- eval_lib/cli.py +183 -0
- eval_lib/dashboard_server.py +172 -0
- eval_lib/evaluate.py +24 -1
- eval_lib/html.py +49 -0
- {eval_ai_library-0.3.3.dist-info → eval_ai_library-0.3.11.dist-info}/WHEEL +0 -0
- {eval_ai_library-0.3.3.dist-info → eval_ai_library-0.3.11.dist-info}/licenses/LICENSE +0 -0
- {eval_ai_library-0.3.3.dist-info → eval_ai_library-0.3.11.dist-info}/top_level.txt +0 -0
{eval_ai_library-0.3.3.dist-info → eval_ai_library-0.3.11.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: eval-ai-library
-Version: 0.3.3
+Version: 0.3.11
 Summary: Comprehensive AI Model Evaluation Framework with support for multiple LLM providers
 Author-email: Aleksandr Meshkov <alekslynx90@gmail.com>
 License: MIT

@@ -45,6 +45,7 @@ Requires-Dist: html2text>=2020.1.16
 Requires-Dist: markdown>=3.4.0
 Requires-Dist: pandas>=2.0.0
 Requires-Dist: striprtf>=0.0.26
+Requires-Dist: flask>=3.0.0
 Provides-Extra: dev
 Requires-Dist: pytest>=7.0.0; extra == "dev"
 Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"

@@ -807,6 +808,170 @@ response, cost = await chat_complete(
 )
 ```
 
+## Dashboard
+
+The library includes an interactive web dashboard for visualizing evaluation results. All evaluation results are automatically saved to cache and can be viewed in a beautiful web interface.
+
+### Features
+
+- 📊 **Interactive Charts**: Visual representation of metrics with Chart.js
+- 📈 **Metrics Summary**: Aggregate statistics across all evaluations
+- 🔍 **Detailed View**: Drill down into individual test cases and metric results
+- 💾 **Session History**: Access past evaluation runs
+- 🎨 **Beautiful UI**: Modern, responsive interface with color-coded results
+- 🔄 **Real-time Updates**: Refresh to see new evaluation results
+
+### Starting the Dashboard
+
+The dashboard runs as a separate server that you start once and keep running:
+```bash
+# Start dashboard server (from your project directory)
+eval-lib dashboard
+
+# Custom port if 14500 is busy
+eval-lib dashboard --port 8080
+
+# Custom cache directory
+eval-lib dashboard --cache-dir /path/to/cache
+```
+
+Once started, the dashboard will be available at `http://localhost:14500`
+
+### Saving Results to Dashboard
+
+Enable dashboard cache saving in your evaluation:
+```python
+import asyncio
+from eval_lib import (
+    evaluate,
+    EvalTestCase,
+    AnswerRelevancyMetric,
+    FaithfulnessMetric
+)
+
+async def evaluate_with_dashboard():
+    test_cases = [
+        EvalTestCase(
+            input="What is the capital of France?",
+            actual_output="Paris is the capital.",
+            expected_output="Paris",
+            retrieval_context=["Paris is the capital of France."]
+        )
+    ]
+
+    metrics = [
+        AnswerRelevancyMetric(model="gpt-4o-mini", threshold=0.7),
+        FaithfulnessMetric(model="gpt-4o-mini", threshold=0.8)
+    ]
+
+    # Results are saved to .eval_cache/ for dashboard viewing
+    results = await evaluate(
+        test_cases=test_cases,
+        metrics=metrics,
+        show_dashboard=True,  # ← Enable dashboard cache
+        session_name="My First Evaluation"  # Optional session name
+    )
+
+    return results
+
+asyncio.run(evaluate_with_dashboard())
+```
+
+### Typical Workflow
+
+**Terminal 1 - Start Dashboard (once):**
+```bash
+cd ~/my_project
+eval-lib dashboard
+# Leave this terminal open - dashboard stays running
+```
+
+**Terminal 2 - Run Evaluations (multiple times):**
+```python
+# Run evaluation 1
+results1 = await evaluate(
+    test_cases=test_cases1,
+    metrics=metrics,
+    show_dashboard=True,
+    session_name="Evaluation 1"
+)
+
+# Run evaluation 2
+results2 = await evaluate(
+    test_cases=test_cases2,
+    metrics=metrics,
+    show_dashboard=True,
+    session_name="Evaluation 2"
+)
+
+# All results are cached and viewable in dashboard
+```
+
+**Browser:**
+- Open `http://localhost:14500`
+- Refresh page (F5) to see new evaluation results
+- Switch between different evaluation sessions using the dropdown
+
+### Dashboard Features
+
+**Summary Cards:**
+- Total test cases evaluated
+- Total cost across all evaluations
+- Number of metrics used
+
+**Metrics Overview:**
+- Average scores per metric
+- Pass/fail counts
+- Success rates
+- Model used for evaluation
+- Total cost per metric
+
+**Detailed Results Table:**
+- Test case inputs and outputs
+- Individual metric scores
+- Pass/fail status
+- Click "View Details" for full information including:
+  - Complete input/output/expected output
+  - Full retrieval context
+  - Detailed evaluation reasoning
+  - Complete evaluation logs
+
+**Charts:**
+- Bar chart: Average scores by metric
+- Doughnut chart: Success rate distribution
+
+### Cache Management
+
+Results are stored in `.eval_cache/results.json` in your project directory:
+```bash
+# View cache contents
+cat .eval_cache/results.json
+
+# Clear cache via dashboard
+# Click "Clear Cache" button in dashboard UI
+
+# Or manually delete cache
+rm -rf .eval_cache/
+```
+
+### CLI Commands
+```bash
+# Start dashboard with defaults
+eval-lib dashboard
+
+# Custom port
+eval-lib dashboard --port 8080
+
+# Custom cache directory
+eval-lib dashboard --cache-dir /path/to/project/.eval_cache
+
+# Check library version
+eval-lib version
+
+# Help
+eval-lib help
+```
+
 ## Custom LLM Providers
 
 The library supports custom LLM providers through the `CustomLLMClient` abstract base class. This allows you to integrate any LLM provider, including internal corporate models, locally-hosted models, or custom endpoints.
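The new `flask>=3.0.0` runtime dependency and the two-line `entry_points.txt` listed above are what back the `eval-lib` console command used throughout the added README section. As a quick post-upgrade sanity check, a minimal sketch along these lines can confirm what the wheel actually registers (it assumes Python 3.10+ for the `group=` keyword of `importlib.metadata.entry_points`; the exact entry-point name is inferred from the README's `eval-lib` command and is not shown in this diff):

```python
# Hypothetical post-install check; not part of the package itself.
from importlib.metadata import version, requires, entry_points

print(version("eval-ai-library"))  # expected: 0.3.11
print([r for r in requires("eval-ai-library") if r.lower().startswith("flask")])

# Console scripts registered via entry_points.txt (script name assumed to be "eval-lib")
print([ep for ep in entry_points(group="console_scripts") if "eval" in ep.name])
```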
{eval_ai_library-0.3.3.dist-info → eval_ai_library-0.3.11.dist-info}/RECORD
CHANGED

@@ -1,7 +1,10 @@
-eval_ai_library-0.3.
-eval_lib/__init__.py,sha256=
-eval_lib/
+eval_ai_library-0.3.11.dist-info/licenses/LICENSE,sha256=rK9uLDgWNrCHNdp-Zma_XghDE7Fs0u0kDi3WMcmYx6w,1074
+eval_lib/__init__.py,sha256=CeVKlt01IO3EWykYMX2q91kdCKnJgGB1eFPJSIx4TJU,3204
+eval_lib/cli.py,sha256=cRjEZhDVpRaP8jnGva-Fv1dHfcQ2h8OBAmNxxcXf_ww,5440
+eval_lib/dashboard_server.py,sha256=kVkXihQh7WwoWBxsdt9jADOwCJtuAsjIqw9eaoNpUqI,6768
+eval_lib/evaluate.py,sha256=LEjwPsuuPGpdwes-xXesCKtKlBFFMF5X1CpIGJIrZ20,12630
 eval_lib/evaluation_schema.py,sha256=7IDd_uozqewhh7k0p1hKut_20udvRxxkV6thclxKUg0,1904
+eval_lib/html.py,sha256=N4lSBI1LKuZ3Iqgm_Vjy2F1o1qb0kT0fgXukSYqDido,1709
 eval_lib/llm_client.py,sha256=eeTVhCLR1uYbhqOEOSBt3wWPKuzgzA9v8m0F9f-4Gqg,14910
 eval_lib/metric_pattern.py,sha256=wULgMNDeAqJC_Qjglo7bYzY2eGhA_PmY_hA_qGfg0sI,11730
 eval_lib/price.py,sha256=jbmkkUTxPuXrkSHuaJYPl7jSzfDIzQ9p_swWWs26UJ0,1986

@@ -28,7 +31,8 @@ eval_lib/metrics/faithfulness_metric/faithfulness.py,sha256=OqamlhTOps7d-NOStSIK
 eval_lib/metrics/geval/geval.py,sha256=mNciHXnqU2drOJsWlYmbwftGiKM89-Ykw2f6XneIGBM,10629
 eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py,sha256=4QqYgGMcp6W9Lw-v4s0AlUhMSOKvBOEgnLvhqVXaT9I,4286
 eval_lib/metrics/toxicity_metric/toxicity.py,sha256=rBE1_fvpbCRdBpBep1y1LTIhofKR8GD4Eh76EOYzxL0,4076
-eval_ai_library-0.3.
-eval_ai_library-0.3.
-eval_ai_library-0.3.
-eval_ai_library-0.3.
+eval_ai_library-0.3.11.dist-info/METADATA,sha256=in5pcLVIlycK5X_AaKF7HbzekAB_qEbhwIvm0pWE3X4,47969
+eval_ai_library-0.3.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+eval_ai_library-0.3.11.dist-info/entry_points.txt,sha256=VTDuJiTezDkBLQw1NWcRoOOuZPHqYgOCcVIoYno-L00,47
+eval_ai_library-0.3.11.dist-info/top_level.txt,sha256=uQHpEd2XI0oZgq1eCww9zMvVgDJgwXMWkCD45fYUzEg,9
+eval_ai_library-0.3.11.dist-info/RECORD,,
eval_lib/__init__.py
CHANGED

@@ -7,7 +7,7 @@ A powerful library for evaluating AI models with support for multiple LLM provid
 and a wide range of evaluation metrics for RAG systems and AI agents.
 """
 
-__version__ = "0.3.3"
+__version__ = "0.3.11"
 __author__ = "Aleksandr Meshkov"
 
 # Core evaluation functions

@@ -66,6 +66,10 @@ from eval_lib.agent_metrics import (
     KnowledgeRetentionMetric
 )
 
+from .dashboard_server import (
+    DashboardCache
+)
+
 
 def __getattr__(name):
     """

@@ -136,4 +140,8 @@ __all__ = [
     # Utils
     "score_agg",
     "extract_json_block",
+
+    # Dashboard
+    'start_dashboard',
+    'DashboardCache',
 ]
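Because `DashboardCache` is now re-exported from the package root, cached sessions can be inspected from Python as well as through the web UI. A minimal sketch, assuming a `.eval_cache/` directory already populated by a prior `evaluate(..., show_dashboard=True)` run:

```python
# Minimal sketch: read the dashboard cache programmatically.
from eval_lib import DashboardCache

cache = DashboardCache()      # defaults to .eval_cache/ in the current directory
latest = cache.get_latest()   # most recent session dict, or None

if latest is None:
    print("No cached sessions yet")
else:
    print(latest["session_id"], latest["timestamp"])
    for name, summary in latest["data"]["metrics_summary"].items():
        print(f"{name}: avg={summary['avg_score']:.3f}, "
              f"success rate={summary['success_rate']:.0f}%")
```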
eval_lib/cli.py
ADDED

@@ -0,0 +1,183 @@
+# eval_lib/cli.py
+"""
+Command-line interface for Eval AI Library
+"""
+
+import argparse
+import sys
+from pathlib import Path
+import os
+import json
+
+
+def run_dashboard():
+    """Run dashboard server from CLI"""
+    parser = argparse.ArgumentParser(
+        description='Eval AI Library Dashboard Server',
+        prog='eval-lib dashboard'
+    )
+    parser.add_argument(
+        '--port',
+        type=int,
+        default=14500,
+        help='Port to run dashboard on (default: 14500)'
+    )
+    parser.add_argument(
+        '--host',
+        type=str,
+        default='0.0.0.0',
+        help='Host to bind to (default: 0.0.0.0)'
+    )
+    parser.add_argument(
+        '--cache-dir',
+        type=str,
+        default='.eval_cache',
+        help='Path to cache directory (default: .eval_cache)'
+    )
+
+    args = parser.parse_args(sys.argv[2:])  # Skip 'eval-lib' and 'dashboard'
+
+    # Import here to avoid loading everything for --help
+    from eval_lib.dashboard_server import DashboardCache
+    from eval_lib.html import HTML_TEMPLATE
+    from flask import Flask, render_template_string, jsonify
+
+    # Create cache with custom directory
+    def get_fresh_cache():
+        """Reload cache from disk"""
+        return DashboardCache(cache_dir=args.cache_dir)
+
+    cache = get_fresh_cache()
+
+    print("="*70)
+    print("📊 Eval AI Library - Dashboard Server")
+    print("="*70)
+
+    # Check cache
+    latest = cache.get_latest()
+    if latest:
+        print(f"\n✅ Found cached results:")
+        print(f" Latest session: {latest['session_id']}")
+        print(f" Timestamp: {latest['timestamp']}")
+        print(f" Total sessions: {len(cache.get_all())}")
+    else:
+        print("\n⚠️ No cached results found")
+        print(" Run an evaluation with show_dashboard=True to populate cache")
+
+    print(f"\n🚀 Starting server...")
+    print(f" URL: http://localhost:{args.port}")
+    print(f" Host: {args.host}")
+    print(f" Cache: {Path(args.cache_dir).absolute()}")
+    print(f"\n💡 Keep this terminal open to keep the server running")
+    print(f" Press Ctrl+C to stop\n")
+    print("="*70 + "\n")
+
+    static_folder = os.path.join(os.path.dirname(__file__), 'static')
+
+    app = Flask(__name__, static_folder=static_folder)
+    app.config['WTF_CSRF_ENABLED'] = False
+    app.config['JSON_SORT_KEYS'] = False
+
+    @app.route('/')
+    def index():
+        return render_template_string(HTML_TEMPLATE)
+
+    @app.route('/favicon.ico')
+    def favicon():
+        return '', 204
+
+    @app.after_request
+    def after_request(response):
+        response.headers['Access-Control-Allow-Origin'] = '*'
+        response.headers['Access-Control-Allow-Methods'] = 'GET, POST, OPTIONS'
+        response.headers['Access-Control-Allow-Headers'] = 'Content-Type'
+        return response
+
+    @app.route('/api/latest')
+    def api_latest():
+        cache = get_fresh_cache()
+        latest = cache.get_latest()
+        if latest:
+            json_str = json.dumps(latest, ensure_ascii=False, sort_keys=False)
+            from flask import Response
+            return Response(
+                json_str,
+                mimetype='application/json',
+                headers={'Content-Type': 'application/json; charset=utf-8'}
+            )
+        return jsonify({'error': 'No results available'}), 404
+
+    @app.route('/api/sessions')
+    def api_sessions():
+        cache = get_fresh_cache()
+        sessions = [
+            {
+                'session_id': s['session_id'],
+                'timestamp': s['timestamp'],
+                'total_tests': s['data']['total_tests']
+            }
+            for s in cache.get_all()
+        ]
+        return jsonify(sessions)
+
+    @app.route('/api/session/<session_id>')
+    def api_session(session_id):
+        cache = get_fresh_cache()
+        session = cache.get_by_session(session_id)
+        if session:
+            json_str = json.dumps(session, ensure_ascii=False, sort_keys=False)
+            from flask import Response
+            return Response(
+                json_str,
+                mimetype='application/json',
+                headers={'Content-Type': 'application/json; charset=utf-8'}
+            )
+        return jsonify({'error': 'Session not found'}), 404
+
+    @app.route('/api/clear')
+    def api_clear():
+        cache = get_fresh_cache()
+        cache.clear()
+        return jsonify({'message': 'Cache cleared'})
+
+    try:
+        app.run(
+            host=args.host,
+            port=args.port,
+            debug=False,
+            use_reloader=False,
+            threaded=True
+        )
+    except KeyboardInterrupt:
+        print("\n\n🛑 Dashboard server stopped")
+
+
+def main():
+    """Main CLI entry point"""
+    parser = argparse.ArgumentParser(
+        description='Eval AI Library CLI',
+        usage='eval-lib <command> [options]'
+    )
+    parser.add_argument(
+        'command',
+        help='Command to run (dashboard, version, help)'
+    )
+
+    # Parse only the command
+    args = parser.parse_args(sys.argv[1:2])
+
+    if args.command == 'dashboard':
+        run_dashboard()
+    elif args.command == 'version':
+        from eval_lib import __version__
+        print(f"Eval AI Library v{__version__}")
+    elif args.command == 'help':
+        parser.print_help()
+    else:
+        print(f"Unknown command: {args.command}")
+        print("Available commands: dashboard, version, help")
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()
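The CLI wires the dashboard up as a small Flask app with plain JSON endpoints (`/api/latest`, `/api/sessions`, `/api/session/<session_id>`, `/api/clear`), so cached results can also be pulled by scripts while the server is running. A minimal stdlib-only sketch, assuming the server is up on the default port 14500:

```python
# Minimal sketch: query the running dashboard's JSON API (standard library only).
import json
from urllib.parse import quote
from urllib.request import urlopen

BASE = "http://localhost:14500"

with urlopen(f"{BASE}/api/sessions") as resp:
    sessions = json.load(resp)  # [{'session_id': ..., 'timestamp': ..., 'total_tests': ...}, ...]

for s in sessions:
    print(s["session_id"], s["timestamp"], s["total_tests"])

if sessions:
    session_id = quote(sessions[-1]["session_id"])  # session names may contain spaces
    with urlopen(f"{BASE}/api/session/{session_id}") as resp:
        detail = json.load(resp)
    print("total cost:", detail["data"]["total_cost"])
```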
eval_lib/dashboard_server.py
ADDED

@@ -0,0 +1,172 @@
+# eval_lib/dashboard_server.py
+
+import json
+from pathlib import Path
+from typing import List, Dict, Any, Optional
+from datetime import datetime
+
+
+class DashboardCache:
+    """Cache to store evaluation results for the dashboard"""
+
+    def __init__(self, cache_dir: str = ".eval_cache"):
+        self.cache_dir = Path(cache_dir)
+        self.cache_dir.mkdir(exist_ok=True)
+        self.cache_file = self.cache_dir / "results.json"
+        self.results_history = []
+        self._load_cache()
+
+    def _load_cache(self):
+        """Load cache from file"""
+        if self.cache_file.exists():
+            try:
+                with open(self.cache_file, 'r', encoding='utf-8') as f:
+                    self.results_history = json.load(f)
+            except Exception as e:
+                print(f"Warning: Could not load cache: {e}")
+                self.results_history = []
+
+    def _save_cache(self):
+        """Save cache to file"""
+        try:
+            with open(self.cache_file, 'w', encoding='utf-8') as f:
+                json.dump(self.results_history, f,
+                          indent=2, ensure_ascii=False, sort_keys=False)
+        except Exception as e:
+            print(f"Warning: Could not save cache: {e}")
+
+    def add_results(self, results: List[tuple], session_name: Optional[str] = None) -> str:
+        """Add new results to the cache"""
+        import time
+        session_id = session_name or f"session_{int(time.time())}"
+        parsed_data = self._parse_results(results)
+
+        session_data = {
+            'session_id': session_id,
+            'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+            'data': parsed_data
+        }
+
+        self.results_history.append(session_data)
+        self._save_cache()
+
+        return session_id
+
+    def get_latest(self) -> Optional[Dict[str, Any]]:
+        """Get latest results"""
+        if self.results_history:
+            return self.results_history[-1]
+        return None
+
+    def get_all(self) -> List[Dict[str, Any]]:
+        """Get all results"""
+        return self.results_history
+
+    def get_by_session(self, session_id: str) -> Optional[Dict[str, Any]]:
+        """Get results by session_id"""
+        for session in self.results_history:
+            if session['session_id'] == session_id:
+                return session
+        return None
+
+    def clear(self):
+        """Clear the cache"""
+        self.results_history = []
+        self._save_cache()
+
+    def _parse_results(self, results: List[tuple]) -> Dict[str, Any]:
+        """Parse raw results into structured format for dashboard"""
+
+        test_cases = []
+        metrics_summary = {}
+        total_cost = 0.0
+
+        for test_idx, test_results in results:
+            for result in test_results:
+                test_case_data = {
+                    'test_index': test_idx,
+                    'input': result.input[:100] + '...' if len(result.input) > 100 else result.input,
+                    'input_full': result.input,
+                    'actual_output': result.actual_output[:200] if result.actual_output else '',
+                    'actual_output_full': result.actual_output,
+                    'expected_output': result.expected_output[:200] if result.expected_output else '',
+                    'expected_output_full': result.expected_output,
+                    'retrieval_context': result.retrieval_context if result.retrieval_context else [],
+                    'metrics': []
+                }
+
+                for metric_data in result.metrics_data:
+                    # Determine model name
+                    if isinstance(metric_data.evaluation_model, str):
+                        model_name = metric_data.evaluation_model
+                    else:
+                        # For CustomLLMClient
+                        try:
+                            model_name = metric_data.evaluation_model.get_model_name()
+                        except:
+                            model_name = str(
+                                type(metric_data.evaluation_model).__name__)
+
+                    test_case_data['metrics'].append({
+                        'name': metric_data.name,
+                        'score': round(metric_data.score, 3),
+                        'success': metric_data.success,
+                        'threshold': metric_data.threshold,
+                        'reason': metric_data.reason[:300] if metric_data.reason else '',
+                        'reason_full': metric_data.reason,
+                        'evaluation_model': model_name,
+                        'evaluation_cost': metric_data.evaluation_cost,
+                        'evaluation_log': metric_data.evaluation_log
+                    })
+
+                    if metric_data.name not in metrics_summary:
+                        metrics_summary[metric_data.name] = {
+                            'scores': [],
+                            'passed': 0,
+                            'failed': 0,
+                            'threshold': metric_data.threshold,
+                            'total_cost': 0.0,
+                            'model': model_name
+                        }
+
+                    metrics_summary[metric_data.name]['scores'].append(
+                        metric_data.score)
+                    if metric_data.success:
+                        metrics_summary[metric_data.name]['passed'] += 1
+                    else:
+                        metrics_summary[metric_data.name]['failed'] += 1
+
+                    if metric_data.evaluation_cost:
+                        total_cost += metric_data.evaluation_cost
+                        metrics_summary[metric_data.name]['total_cost'] += metric_data.evaluation_cost
+
+                test_cases.append(test_case_data)
+
+        for metric_name, data in metrics_summary.items():
+            data['avg_score'] = sum(data['scores']) / \
+                len(data['scores']) if data['scores'] else 0
+            data['success_rate'] = (data['passed'] / (data['passed'] + data['failed'])
+                                    * 100) if (data['passed'] + data['failed']) > 0 else 0
+
+        return {
+            'test_cases': test_cases,
+            'metrics_summary': metrics_summary,
+            'total_cost': total_cost,
+            'total_tests': len(test_cases)
+        }
+
+
+def save_results_to_cache(results: List[tuple], session_name: Optional[str] = None) -> str:
+    """
+    Save evaluation results to cache for dashboard viewing.
+    Cache is always saved to .eval_cache/ in current directory.
+
+    Args:
+        results: Evaluation results from evaluate()
+        session_name: Optional name for the session
+
+    Returns:
+        Session ID
+    """
+    cache = DashboardCache()
+    return cache.add_results(results, session_name)
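Since `save_results_to_cache()` is an ordinary module-level helper, results from an `evaluate()` call that was run without `show_dashboard=True` can still be pushed into the dashboard cache afterwards. A minimal sketch, where `results` stands for the return value of a previous `evaluate()` call:

```python
# Minimal sketch: backfill the dashboard cache after the fact.
# `results` is assumed to be the list returned by a prior evaluate() call.
from eval_lib.dashboard_server import save_results_to_cache

session_id = save_results_to_cache(results, session_name="Backfilled run")
print(f"Saved as {session_id}; view it with: eval-lib dashboard")
```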
eval_lib/evaluate.py
CHANGED

@@ -68,7 +68,9 @@ def _print_summary(results: List, total_cost: float, total_time: float, passed:
 async def evaluate(
     test_cases: List[EvalTestCase],
     metrics: List[MetricPattern],
-    verbose: bool = True
+    verbose: bool = True,
+    show_dashboard: bool = False,
+    session_name: str = None,
 ) -> List[Tuple[None, List[TestCaseResult]]]:
     """
     Evaluate test cases with multiple metrics.

@@ -77,6 +79,10 @@ async def evaluate(
         test_cases: List of test cases to evaluate
         metrics: List of metrics to apply
         verbose: Enable detailed logging (default: True)
+        show_dashboard: Launch interactive web dashboard (default: False)
+        dashboard_port: Port for dashboard server (default: 14500)
+        session_name: Name for this evaluation session
+        cache_dir: Directory to store cache (default: .eval_cache)
 
     Returns:
         List of evaluation results

@@ -183,6 +189,23 @@ async def evaluate(
     _print_summary(results, total_cost, total_time,
                    total_passed, total_tests)
 
+    if show_dashboard:
+        from eval_lib.dashboard_server import save_results_to_cache
+
+        session_id = save_results_to_cache(results, session_name)
+
+        if verbose:
+            print(f"\n{Colors.BOLD}{Colors.GREEN}{'='*70}{Colors.ENDC}")
+            print(f"{Colors.BOLD}{Colors.GREEN}📊 DASHBOARD{Colors.ENDC}")
+            print(f"{Colors.BOLD}{Colors.GREEN}{'='*70}{Colors.ENDC}")
+            print(
+                f"\n✅ Results saved to cache: {Colors.CYAN}{session_id}{Colors.ENDC}")
+            print(f"\n💡 To view results, run:")
+            print(f" {Colors.YELLOW}eval-lib dashboard{Colors.ENDC}")
+            print(
+                f"\n Then open: {Colors.CYAN}http://localhost:14500{Colors.ENDC}")
+            print(f"\n{Colors.BOLD}{Colors.GREEN}{'='*70}{Colors.ENDC}\n")
+
     return results
 
 
eval_lib/html.py
ADDED

@@ -0,0 +1,49 @@
+HTML_TEMPLATE = """
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Eval AI Library - Dashboard</title>
+    <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
+    <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
+    <link rel="stylesheet" href="{{ url_for('static', filename='dashboard.css') }}">
+</head>
+<body>
+    <div class="container">
+        <header>
+            <div>
+                <h1>Eval AI Library Dashboard</h1>
+                <div class="timestamp" id="timestamp">Loading...</div>
+            </div>
+            <div class="controls">
+                <select id="sessionSelect" onchange="loadSession()">
+                    <option value="">Loading sessions...</option>
+                </select>
+                <button onclick="refreshData()">Refresh</button>
+                <button class="primary" onclick="clearCache()">Clear Cache</button>
+            </div>
+        </header>
+
+        <div id="content" class="loading">
+            Loading data...
+        </div>
+    </div>
+
+    <!-- Modal for detailed information -->
+    <div id="detailsModal" class="modal">
+        <div class="modal-content">
+            <div class="modal-header">
+                <div class="test-status">
+                    <h2 id="modalTitle">Test Details</h2>
+                </div>
+                <span class="close" onclick="closeModal()">×</span>
+            </div>
+            <div class="modal-body" id="modalBody"></div>
+        </div>
+    </div>
+
+    <script src="{{ url_for('static', filename='dashboard.js') }}"></script>
+</body>
+</html>
+"""
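The template loads `dashboard.css` and `dashboard.js` through Flask's `url_for('static', ...)`, and the CLI points the app's `static_folder` at `eval_lib/static`; the RECORD hunks above do not add any `eval_lib/static/*` entries. A minimal sketch to check whether the installed wheel actually ships those assets (assumes Python 3.9+ for `importlib.resources.files`):

```python
# Minimal sketch: verify the static assets referenced by HTML_TEMPLATE are installed.
from importlib.resources import files

static_dir = files("eval_lib") / "static"
for name in ("dashboard.css", "dashboard.js"):
    print(name, "present:", (static_dir / name).is_file())
```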
{eval_ai_library-0.3.3.dist-info → eval_ai_library-0.3.11.dist-info}/WHEEL: file without changes
{eval_ai_library-0.3.3.dist-info → eval_ai_library-0.3.11.dist-info}/licenses/LICENSE: file without changes
{eval_ai_library-0.3.3.dist-info → eval_ai_library-0.3.11.dist-info}/top_level.txt: file without changes