eval-ai-library 0.3.2__py3-none-any.whl → 0.3.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of eval-ai-library might be problematic.
- {eval_ai_library-0.3.2.dist-info → eval_ai_library-0.3.10.dist-info}/METADATA +379 -1
- {eval_ai_library-0.3.2.dist-info → eval_ai_library-0.3.10.dist-info}/RECORD +12 -8
- eval_ai_library-0.3.10.dist-info/entry_points.txt +2 -0
- eval_lib/__init__.py +11 -1
- eval_lib/cli.py +166 -0
- eval_lib/dashboard_server.py +172 -0
- eval_lib/evaluate.py +24 -1
- eval_lib/html.py +736 -0
- eval_lib/llm_client.py +47 -1
- {eval_ai_library-0.3.2.dist-info → eval_ai_library-0.3.10.dist-info}/WHEEL +0 -0
- {eval_ai_library-0.3.2.dist-info → eval_ai_library-0.3.10.dist-info}/licenses/LICENSE +0 -0
- {eval_ai_library-0.3.2.dist-info → eval_ai_library-0.3.10.dist-info}/top_level.txt +0 -0
{eval_ai_library-0.3.2.dist-info → eval_ai_library-0.3.10.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: eval-ai-library
-Version: 0.3.2
+Version: 0.3.10
 Summary: Comprehensive AI Model Evaluation Framework with support for multiple LLM providers
 Author-email: Aleksandr Meshkov <alekslynx90@gmail.com>
 License: MIT

@@ -45,6 +45,7 @@ Requires-Dist: html2text>=2020.1.16
 Requires-Dist: markdown>=3.4.0
 Requires-Dist: pandas>=2.0.0
 Requires-Dist: striprtf>=0.0.26
+Requires-Dist: flask>=3.0.0
 Provides-Extra: dev
 Requires-Dist: pytest>=7.0.0; extra == "dev"
 Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
@@ -807,6 +808,383 @@ response, cost = await chat_complete(
 )
 ```
 
+## Dashboard
+
+The library includes an interactive web dashboard for visualizing evaluation results. All evaluation results are automatically saved to cache and can be viewed in a beautiful web interface.
+
+### Features
+
+- 📊 **Interactive Charts**: Visual representation of metrics with Chart.js
+- 📈 **Metrics Summary**: Aggregate statistics across all evaluations
+- 🔍 **Detailed View**: Drill down into individual test cases and metric results
+- 💾 **Session History**: Access past evaluation runs
+- 🎨 **Beautiful UI**: Modern, responsive interface with color-coded results
+- 🔄 **Real-time Updates**: Refresh to see new evaluation results
+
+### Starting the Dashboard
+
+The dashboard runs as a separate server that you start once and keep running:
+```bash
+# Start dashboard server (from your project directory)
+eval-lib dashboard
+
+# Custom port if 14500 is busy
+eval-lib dashboard --port 8080
+
+# Custom cache directory
+eval-lib dashboard --cache-dir /path/to/cache
+```
+
+Once started, the dashboard will be available at `http://localhost:14500`
+
+### Saving Results to Dashboard
+
+Enable dashboard cache saving in your evaluation:
+```python
+import asyncio
+from eval_lib import (
+    evaluate,
+    EvalTestCase,
+    AnswerRelevancyMetric,
+    FaithfulnessMetric
+)
+
+async def evaluate_with_dashboard():
+    test_cases = [
+        EvalTestCase(
+            input="What is the capital of France?",
+            actual_output="Paris is the capital.",
+            expected_output="Paris",
+            retrieval_context=["Paris is the capital of France."]
+        )
+    ]
+
+    metrics = [
+        AnswerRelevancyMetric(model="gpt-4o-mini", threshold=0.7),
+        FaithfulnessMetric(model="gpt-4o-mini", threshold=0.8)
+    ]
+
+    # Results are saved to .eval_cache/ for dashboard viewing
+    results = await evaluate(
+        test_cases=test_cases,
+        metrics=metrics,
+        show_dashboard=True,  # ← Enable dashboard cache
+        session_name="My First Evaluation"  # Optional session name
+    )
+
+    return results
+
+asyncio.run(evaluate_with_dashboard())
+```
+
+### Typical Workflow
+
+**Terminal 1 - Start Dashboard (once):**
+```bash
+cd ~/my_project
+eval-lib dashboard
+# Leave this terminal open - dashboard stays running
+```
+
+**Terminal 2 - Run Evaluations (multiple times):**
+```python
+# Run evaluation 1
+results1 = await evaluate(
+    test_cases=test_cases1,
+    metrics=metrics,
+    show_dashboard=True,
+    session_name="Evaluation 1"
+)
+
+# Run evaluation 2
+results2 = await evaluate(
+    test_cases=test_cases2,
+    metrics=metrics,
+    show_dashboard=True,
+    session_name="Evaluation 2"
+)
+
+# All results are cached and viewable in dashboard
+```
+
+**Browser:**
+- Open `http://localhost:14500`
+- Refresh page (F5) to see new evaluation results
+- Switch between different evaluation sessions using the dropdown
+
+### Dashboard Features
+
+**Summary Cards:**
+- Total test cases evaluated
+- Total cost across all evaluations
+- Number of metrics used
+
+**Metrics Overview:**
+- Average scores per metric
+- Pass/fail counts
+- Success rates
+- Model used for evaluation
+- Total cost per metric
+
+**Detailed Results Table:**
+- Test case inputs and outputs
+- Individual metric scores
+- Pass/fail status
+- Click "View Details" for full information including:
+  - Complete input/output/expected output
+  - Full retrieval context
+  - Detailed evaluation reasoning
+  - Complete evaluation logs
+
+**Charts:**
+- Bar chart: Average scores by metric
+- Doughnut chart: Success rate distribution
+
+### Cache Management
+
+Results are stored in `.eval_cache/results.json` in your project directory:
+```bash
+# View cache contents
+cat .eval_cache/results.json
+
+# Clear cache via dashboard
+# Click "Clear Cache" button in dashboard UI
+
+# Or manually delete cache
+rm -rf .eval_cache/
+```
+
+### CLI Commands
+```bash
+# Start dashboard with defaults
+eval-lib dashboard
+
+# Custom port
+eval-lib dashboard --port 8080
+
+# Custom cache directory
+eval-lib dashboard --cache-dir /path/to/project/.eval_cache
+
+# Check library version
+eval-lib version
+
+# Help
+eval-lib help
+```
+
+## Custom LLM Providers
+
+The library supports custom LLM providers through the `CustomLLMClient` abstract base class. This allows you to integrate any LLM provider, including internal corporate models, locally-hosted models, or custom endpoints.
+
+### Creating a Custom Provider
+
+Implement the `CustomLLMClient` interface:
+```python
+from eval_lib import CustomLLMClient
+from typing import Optional
+from openai import AsyncOpenAI
+
+class InternalLLMClient(CustomLLMClient):
+    """Client for internal corporate LLM or custom endpoint"""
+
+    def __init__(
+        self,
+        endpoint: str,
+        model: str,
+        api_key: Optional[str] = None,
+        temperature: float = 0.0
+    ):
+        """
+        Args:
+            endpoint: Your internal LLM endpoint URL (e.g., "https://internal-llm.company.com/v1")
+            model: Model name to use
+            api_key: API key if required (optional for local models)
+            temperature: Default temperature
+        """
+        self.endpoint = endpoint
+        self.model = model
+        self.api_key = api_key or "not-needed"  # Some endpoints don't need auth
+
+        self.client = AsyncOpenAI(
+            api_key=self.api_key,
+            base_url=self.endpoint
+        )
+
+    async def chat_complete(
+        self,
+        messages: list[dict[str, str]],
+        temperature: float
+    ) -> tuple[str, Optional[float]]:
+        """Generate response from internal LLM"""
+        response = await self.client.chat.completions.create(
+            model=self.model,
+            messages=messages,
+            temperature=temperature,
+        )
+        text = response.choices[0].message.content.strip()
+        cost = None  # Internal models typically don't have API costs
+        return text, cost
+
+    def get_model_name(self) -> str:
+        """Return model name for logging"""
+        return f"internal:{self.model}"
+```
+
+### Using Custom Providers
+
+Use your custom provider in any metric:
+```python
+import asyncio
+from eval_lib import (
+    evaluate,
+    EvalTestCase,
+    AnswerRelevancyMetric,
+    FaithfulnessMetric
+)
+
+# Create custom internal LLM client
+internal_llm = InternalLLMClient(
+    endpoint="https://internal-llm.company.com/v1",
+    model="company-gpt-v2",
+    api_key="your-internal-key"  # Optional
+)
+
+# Use in metrics
+test_cases = [
+    EvalTestCase(
+        input="What is the capital of France?",
+        actual_output="Paris is the capital.",
+        expected_output="Paris",
+        retrieval_context=["Paris is the capital of France."]
+    )
+]
+
+metrics = [
+    AnswerRelevancyMetric(
+        model=internal_llm,  # ← Your custom LLM
+        threshold=0.7
+    ),
+    FaithfulnessMetric(
+        model=internal_llm,  # ← Same custom client
+        threshold=0.8
+    )
+]
+
+async def run_evaluation():
+    results = await evaluate(
+        test_cases=test_cases,
+        metrics=metrics,
+        verbose=True
+    )
+    return results
+
+asyncio.run(run_evaluation())
+```
+
+### Mixing Standard and Custom Providers
+
+You can mix standard and custom providers in the same evaluation:
+```python
+# Create custom provider
+internal_llm = InternalLLMClient(
+    endpoint="https://internal-llm.company.com/v1",
+    model="company-model"
+)
+
+# Mix standard OpenAI and custom internal LLM
+metrics = [
+    AnswerRelevancyMetric(
+        model="gpt-4o-mini",  # ← Standard OpenAI
+        threshold=0.7
+    ),
+    FaithfulnessMetric(
+        model=internal_llm,  # ← Custom internal LLM
+        threshold=0.8
+    ),
+    ContextualRelevancyMetric(
+        model="anthropic:claude-sonnet-4-0",  # ← Standard Anthropic
+        threshold=0.7
+    )
+]
+
+results = await evaluate(test_cases=test_cases, metrics=metrics)
+```
+
+### Custom Provider Use Cases
+
+**When to use custom providers:**
+
+1. **Internal Corporate LLMs**: Connect to your company's proprietary models
+2. **Local Models**: Integrate locally-hosted models (vLLM, TGI, LM Studio, Ollama with custom setup)
+3. **Fine-tuned Models**: Use your own fine-tuned models hosted anywhere
+4. **Research Models**: Connect to experimental or research models
+5. **Custom Endpoints**: Any LLM accessible via HTTP endpoint
+
+**Example: Local Model with vLLM**
+```python
+# vLLM server running on localhost:8000
+local_model = InternalLLMClient(
+    endpoint="http://localhost:8000/v1",
+    model="meta-llama/Llama-2-7b-chat",
+    api_key=None  # Local models don't need auth
+)
+
+# Use in evaluation
+metric = AnswerRelevancyMetric(model=local_model, threshold=0.7)
+```
+
+**Example: Corporate Internal Model**
+```python
+# Company's internal LLM with authentication
+company_model = InternalLLMClient(
+    endpoint="https://ai-platform.company.internal/api/v1",
+    model="company-gpt-enterprise",
+    api_key="internal-api-key-here"
+)
+
+# Use in evaluation
+metrics = [
+    AnswerRelevancyMetric(model=company_model, threshold=0.7),
+    FaithfulnessMetric(model=company_model, threshold=0.8)
+]
+```
+
+**Key Requirements:**
+
+1. **`async def chat_complete()`** - Must be async and return `(str, Optional[float])`
+2. **`def get_model_name()`** - Return string identifier for logging
+3. **Error Handling** - Handle connection and API errors appropriately
+4. **Cost** - Return `None` for cost if not applicable (e.g., internal/local models)
+
+### Advanced: Custom Authentication
+
+For custom authentication schemes:
+```python
+class CustomAuthLLMClient(CustomLLMClient):
+    """Client with custom authentication"""
+
+    def __init__(self, endpoint: str, auth_token: str):
+        self.endpoint = endpoint
+        self.headers = {
+            "Authorization": f"Bearer {auth_token}",
+            "X-Custom-Header": "value"
+        }
+        # Use aiohttp or httpx for custom auth
+        import aiohttp
+        self.session = aiohttp.ClientSession(headers=self.headers)
+
+    async def chat_complete(self, messages, temperature):
+        async with self.session.post(
+            f"{self.endpoint}/chat",
+            json={"messages": messages, "temperature": temperature}
+        ) as response:
+            data = await response.json()
+            return data["content"], None
+
+    def get_model_name(self):
+        return "custom-auth-model"
+```
+
 ## Test Data Generation
 
 The library includes a powerful test data generator that can create realistic test cases either from scratch or based on your documents.
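The Cache Management subsection above only shows `cat .eval_cache/results.json`. As a rough illustration of inspecting that cache programmatically — assuming the file simply holds the list of session objects (each with `session_id`, `timestamp`, and `data['total_tests']`) that the `/api/sessions` route in `eval_lib/cli.py` below iterates over — a short sketch:

```python
# Illustrative sketch, not part of the package: list cached dashboard sessions.
# Assumption: .eval_cache/results.json stores the session objects that
# DashboardCache.get_all() returns in eval_lib/cli.py's /api/sessions route.
import json
from pathlib import Path

cache_file = Path(".eval_cache") / "results.json"
if cache_file.exists():
    for s in json.loads(cache_file.read_text()):
        print(s["session_id"], s["timestamp"], s["data"]["total_tests"])
else:
    print("No cached results yet - run evaluate(show_dashboard=True) first")
```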
{eval_ai_library-0.3.2.dist-info → eval_ai_library-0.3.10.dist-info}/RECORD CHANGED

@@ -1,8 +1,11 @@
-eval_ai_library-0.3.
-eval_lib/__init__.py,sha256=
-eval_lib/
+eval_ai_library-0.3.10.dist-info/licenses/LICENSE,sha256=rK9uLDgWNrCHNdp-Zma_XghDE7Fs0u0kDi3WMcmYx6w,1074
+eval_lib/__init__.py,sha256=OMrncAoUbbrJXfaYf8k2wJEGw1e2r9k-s1uXkerZ9mE,3204
+eval_lib/cli.py,sha256=Fvnj6HgCQ3lhx28skweALgHSm3FMEpavQCB3o_sQhtE,4731
+eval_lib/dashboard_server.py,sha256=6ND7ujtzN0PdMyVmJFnKDWrIf4kaodnetLZRPUhYHas,6751
+eval_lib/evaluate.py,sha256=LEjwPsuuPGpdwes-xXesCKtKlBFFMF5X1CpIGJIrZ20,12630
 eval_lib/evaluation_schema.py,sha256=7IDd_uozqewhh7k0p1hKut_20udvRxxkV6thclxKUg0,1904
-eval_lib/
+eval_lib/html.py,sha256=_tBTtwxZpjIwc3TVOyLGDw2VFD77aAeA47JdovoZ0CI,24094
+eval_lib/llm_client.py,sha256=eeTVhCLR1uYbhqOEOSBt3wWPKuzgzA9v8m0F9f-4Gqg,14910
 eval_lib/metric_pattern.py,sha256=wULgMNDeAqJC_Qjglo7bYzY2eGhA_PmY_hA_qGfg0sI,11730
 eval_lib/price.py,sha256=jbmkkUTxPuXrkSHuaJYPl7jSzfDIzQ9p_swWWs26UJ0,1986
 eval_lib/py.typed,sha256=8PjyZ1aVoQpRVvt71muvuq5qE-jTFZkK-GLHkhdebmc,26

@@ -28,7 +31,8 @@ eval_lib/metrics/faithfulness_metric/faithfulness.py,sha256=OqamlhTOps7d-NOStSIK
 eval_lib/metrics/geval/geval.py,sha256=mNciHXnqU2drOJsWlYmbwftGiKM89-Ykw2f6XneIGBM,10629
 eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py,sha256=4QqYgGMcp6W9Lw-v4s0AlUhMSOKvBOEgnLvhqVXaT9I,4286
 eval_lib/metrics/toxicity_metric/toxicity.py,sha256=rBE1_fvpbCRdBpBep1y1LTIhofKR8GD4Eh76EOYzxL0,4076
-eval_ai_library-0.3.
-eval_ai_library-0.3.
-eval_ai_library-0.3.
-eval_ai_library-0.3.
+eval_ai_library-0.3.10.dist-info/METADATA,sha256=pevxrimXqbreKbRwHZ0GBu_VXsfGhles6OMN2SBOJHo,47969
+eval_ai_library-0.3.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+eval_ai_library-0.3.10.dist-info/entry_points.txt,sha256=VTDuJiTezDkBLQw1NWcRoOOuZPHqYgOCcVIoYno-L00,47
+eval_ai_library-0.3.10.dist-info/top_level.txt,sha256=uQHpEd2XI0oZgq1eCww9zMvVgDJgwXMWkCD45fYUzEg,9
+eval_ai_library-0.3.10.dist-info/RECORD,,
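The new `entry_points.txt` (47 bytes) is not expanded in this diff, so its exact contents are an assumption; given the `eval-lib` command used throughout the README and the `main()` entry point in `eval_lib/cli.py` below, it presumably maps a console script to `eval_lib.cli:main`. A small sketch to confirm the mapping on an installed copy:

```python
# Sketch only: look up the console-script entry point after installing the wheel.
# The expected value "eval_lib.cli:main" is inferred, not shown in this diff.
from importlib.metadata import entry_points

for ep in entry_points(group="console_scripts"):  # selectable API, Python 3.10+
    if ep.name == "eval-lib":
        print(ep.name, "->", ep.value)
```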
eval_lib/__init__.py CHANGED

@@ -7,7 +7,7 @@ A powerful library for evaluating AI models with support for multiple LLM providers
 and a wide range of evaluation metrics for RAG systems and AI agents.
 """
 
-__version__ = "0.3.2"
+__version__ = "0.3.10"
 __author__ = "Aleksandr Meshkov"
 
 # Core evaluation functions

@@ -39,6 +39,7 @@ from eval_lib.llm_client import (
     chat_complete,
     get_embeddings,
     LLMDescriptor,
+    CustomLLMClient,
     Provider
 )
 

@@ -65,6 +66,10 @@ from eval_lib.agent_metrics import (
     KnowledgeRetentionMetric
 )
 
+from .dashboard_server import (
+    DashboardCache
+)
+
 
 def __getattr__(name):
     """

@@ -106,6 +111,7 @@ __all__ = [
     "chat_complete",
     "get_embeddings",
     "LLMDescriptor",
+    "CustomLLMClient",
     "Provider",
 
     # RAG Metrics

@@ -134,4 +140,8 @@ __all__ = [
     # Utils
     "score_agg",
     "extract_json_block",
+
+    # Dashboard
+    'start_dashboard',
+    'DashboardCache',
 ]
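Note that only `DashboardCache` is imported eagerly above; `start_dashboard` is added to `__all__` without a top-level import, so it is presumably resolved lazily by the module-level `__getattr__` hook visible at line 69/74 of this diff (PEP 562). A generic sketch of that pattern — not the library's actual `__getattr__` body:

```python
# Generic PEP 562 lazy-export pattern (illustration; the real eval_lib __getattr__ may differ).
def __getattr__(name):
    if name == "start_dashboard":
        # Assumption: start_dashboard lives alongside DashboardCache in dashboard_server.
        from .dashboard_server import start_dashboard
        return start_dashboard
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```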
eval_lib/cli.py ADDED

@@ -0,0 +1,166 @@
+# eval_lib/cli.py
+"""
+Command-line interface for Eval AI Library
+"""
+
+import argparse
+import sys
+from pathlib import Path
+
+
+def run_dashboard():
+    """Run dashboard server from CLI"""
+    parser = argparse.ArgumentParser(
+        description='Eval AI Library Dashboard Server',
+        prog='eval-lib dashboard'
+    )
+    parser.add_argument(
+        '--port',
+        type=int,
+        default=14500,
+        help='Port to run dashboard on (default: 14500)'
+    )
+    parser.add_argument(
+        '--host',
+        type=str,
+        default='0.0.0.0',
+        help='Host to bind to (default: 0.0.0.0)'
+    )
+    parser.add_argument(
+        '--cache-dir',
+        type=str,
+        default='.eval_cache',
+        help='Path to cache directory (default: .eval_cache)'
+    )
+
+    args = parser.parse_args(sys.argv[2:])  # Skip 'eval-lib' and 'dashboard'
+
+    # Import here to avoid loading everything for --help
+    from eval_lib.dashboard_server import DashboardCache
+    from eval_lib.html import HTML_TEMPLATE
+    from flask import Flask, render_template_string, jsonify
+
+    # Create cache with custom directory
+    def get_fresh_cache():
+        """Reload cache from disk"""
+        return DashboardCache(cache_dir=args.cache_dir)
+
+    cache = get_fresh_cache()
+
+    print("="*70)
+    print("📊 Eval AI Library - Dashboard Server")
+    print("="*70)
+
+    # Check cache
+    latest = cache.get_latest()
+    if latest:
+        print(f"\n✅ Found cached results:")
+        print(f"   Latest session: {latest['session_id']}")
+        print(f"   Timestamp: {latest['timestamp']}")
+        print(f"   Total sessions: {len(cache.get_all())}")
+    else:
+        print("\n⚠️ No cached results found")
+        print("   Run an evaluation with show_dashboard=True to populate cache")
+
+    print(f"\n🚀 Starting server...")
+    print(f"   URL: http://localhost:{args.port}")
+    print(f"   Host: {args.host}")
+    print(f"   Cache: {Path(args.cache_dir).absolute()}")
+    print(f"\n💡 Keep this terminal open to keep the server running")
+    print(f"   Press Ctrl+C to stop\n")
+    print("="*70 + "\n")
+
+    app = Flask(__name__)
+    app.config['WTF_CSRF_ENABLED'] = False
+
+    @app.route('/')
+    def index():
+        return render_template_string(HTML_TEMPLATE)
+
+    @app.route('/favicon.ico')
+    def favicon():
+        return '', 204
+
+    @app.after_request
+    def after_request(response):
+        response.headers['Access-Control-Allow-Origin'] = '*'
+        response.headers['Access-Control-Allow-Methods'] = 'GET, POST, OPTIONS'
+        response.headers['Access-Control-Allow-Headers'] = 'Content-Type'
+        return response
+
+    @app.route('/api/latest')
+    def api_latest():
+        cache = get_fresh_cache()
+        latest = cache.get_latest()
+        if latest:
+            return jsonify(latest)
+        return jsonify({'error': 'No results available'}), 404
+
+    @app.route('/api/sessions')
+    def api_sessions():
+        cache = get_fresh_cache()
+        sessions = [
+            {
+                'session_id': s['session_id'],
+                'timestamp': s['timestamp'],
+                'total_tests': s['data']['total_tests']
+            }
+            for s in cache.get_all()
+        ]
+        return jsonify(sessions)
+
+    @app.route('/api/session/<session_id>')
+    def api_session(session_id):
+        cache = get_fresh_cache()
+        session = cache.get_by_session(session_id)
+        if session:
+            return jsonify(session)
+        return jsonify({'error': 'Session not found'}), 404
+
+    @app.route('/api/clear')
+    def api_clear():
+        cache = get_fresh_cache()
+        cache.clear()
+        return jsonify({'message': 'Cache cleared'})
+
+    try:
+        app.run(
+            host=args.host,
+            port=args.port,
+            debug=False,
+            use_reloader=False,
+            threaded=True
+        )
+    except KeyboardInterrupt:
+        print("\n\n🛑 Dashboard server stopped")
+
+
+def main():
+    """Main CLI entry point"""
+    parser = argparse.ArgumentParser(
+        description='Eval AI Library CLI',
+        usage='eval-lib <command> [options]'
+    )
+    parser.add_argument(
+        'command',
+        help='Command to run (dashboard, version, help)'
+    )
+
+    # Parse only the command
+    args = parser.parse_args(sys.argv[1:2])
+
+    if args.command == 'dashboard':
+        run_dashboard()
+    elif args.command == 'version':
+        from eval_lib import __version__
+        print(f"Eval AI Library v{__version__}")
+    elif args.command == 'help':
+        parser.print_help()
+    else:
+        print(f"Unknown command: {args.command}")
+        print("Available commands: dashboard, version, help")
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()
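`eval_lib/dashboard_server.py` (+172 lines) is listed in the RECORD above but not expanded in this diff. Going only by how `cli.py` uses it — `DashboardCache(cache_dir=...)`, `get_latest()`, `get_all()`, `get_by_session()`, `clear()`, with results kept in `.eval_cache/results.json` — the interface looks roughly like the stand-in below. This is a sketch of the contract, not the package's actual implementation.

```python
# Minimal stand-in for the DashboardCache interface that eval_lib/cli.py relies on.
# Assumption: results.json holds a list of {"session_id", "timestamp", "data"} objects,
# newest last; the real eval_lib.dashboard_server.DashboardCache may differ.
import json
from pathlib import Path
from typing import Optional


class DashboardCacheSketch:
    def __init__(self, cache_dir: str = ".eval_cache"):
        self.cache_file = Path(cache_dir) / "results.json"

    def _load(self) -> list[dict]:
        if self.cache_file.exists():
            return json.loads(self.cache_file.read_text())
        return []

    def get_all(self) -> list[dict]:
        return self._load()

    def get_latest(self) -> Optional[dict]:
        sessions = self._load()
        return sessions[-1] if sessions else None

    def get_by_session(self, session_id: str) -> Optional[dict]:
        return next((s for s in self._load() if s["session_id"] == session_id), None)

    def clear(self) -> None:
        if self.cache_file.exists():
            self.cache_file.unlink()
```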