eval-ai-library 0.3.3__tar.gz → 0.3.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of eval-ai-library might be problematic.
- {eval_ai_library-0.3.3/eval_ai_library.egg-info → eval_ai_library-0.3.10}/PKG-INFO +166 -1
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/README.md +164 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10/eval_ai_library.egg-info}/PKG-INFO +166 -1
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_ai_library.egg-info/SOURCES.txt +4 -0
- eval_ai_library-0.3.10/eval_ai_library.egg-info/entry_points.txt +2 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_ai_library.egg-info/requires.txt +1 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/__init__.py +9 -1
- eval_ai_library-0.3.10/eval_lib/cli.py +166 -0
- eval_ai_library-0.3.10/eval_lib/dashboard_server.py +172 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/evaluate.py +24 -1
- eval_ai_library-0.3.10/eval_lib/html.py +736 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/pyproject.toml +8 -1
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/LICENSE +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/MANIFEST.in +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_ai_library.egg-info/dependency_links.txt +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_ai_library.egg-info/top_level.txt +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/agent_metrics/__init__.py +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/agent_metrics/role_adherence_metric/role_adherence.py +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/agent_metrics/task_success_metric/task_success_rate.py +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/agent_metrics/tools_correctness_metric/tool_correctness.py +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/datagenerator/datagenerator.py +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/datagenerator/document_loader.py +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/datagenerator/prompts.py +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/evaluation_schema.py +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/llm_client.py +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/metric_pattern.py +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/metrics/__init__.py +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/metrics/answer_precision_metric/answer_precision.py +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/metrics/bias_metric/bias.py +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/metrics/contextual_precision_metric/contextual_precision.py +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/metrics/contextual_recall_metric/contextual_recall.py +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/metrics/custom_metric/custom_eval.py +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/metrics/faithfulness_metric/faithfulness.py +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/metrics/geval/geval.py +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/metrics/toxicity_metric/toxicity.py +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/price.py +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/py.typed +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/testcases_schema.py +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/utils.py +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/setup.cfg +0 -0
- {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/setup.py +0 -0
{eval_ai_library-0.3.3/eval_ai_library.egg-info → eval_ai_library-0.3.10}/PKG-INFO +166 -1

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: eval-ai-library
-Version: 0.3.3
+Version: 0.3.10
 Summary: Comprehensive AI Model Evaluation Framework with support for multiple LLM providers
 Author-email: Aleksandr Meshkov <alekslynx90@gmail.com>
 License: MIT
@@ -45,6 +45,7 @@ Requires-Dist: html2text>=2020.1.16
 Requires-Dist: markdown>=3.4.0
 Requires-Dist: pandas>=2.0.0
 Requires-Dist: striprtf>=0.0.26
+Requires-Dist: flask>=3.0.0
 Provides-Extra: dev
 Requires-Dist: pytest>=7.0.0; extra == "dev"
 Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
@@ -807,6 +808,170 @@ response, cost = await chat_complete(
 )
 ```

+## Dashboard
+
+The library includes an interactive web dashboard for visualizing evaluation results. All evaluation results are automatically saved to cache and can be viewed in a beautiful web interface.
+
+### Features
+
+- 📊 **Interactive Charts**: Visual representation of metrics with Chart.js
+- 📈 **Metrics Summary**: Aggregate statistics across all evaluations
+- 🔍 **Detailed View**: Drill down into individual test cases and metric results
+- 💾 **Session History**: Access past evaluation runs
+- 🎨 **Beautiful UI**: Modern, responsive interface with color-coded results
+- 🔄 **Real-time Updates**: Refresh to see new evaluation results
+
+### Starting the Dashboard
+
+The dashboard runs as a separate server that you start once and keep running:
+```bash
+# Start dashboard server (from your project directory)
+eval-lib dashboard
+
+# Custom port if 14500 is busy
+eval-lib dashboard --port 8080
+
+# Custom cache directory
+eval-lib dashboard --cache-dir /path/to/cache
+```
+
+Once started, the dashboard will be available at `http://localhost:14500`
+
+### Saving Results to Dashboard
+
+Enable dashboard cache saving in your evaluation:
+```python
+import asyncio
+from eval_lib import (
+    evaluate,
+    EvalTestCase,
+    AnswerRelevancyMetric,
+    FaithfulnessMetric
+)
+
+async def evaluate_with_dashboard():
+    test_cases = [
+        EvalTestCase(
+            input="What is the capital of France?",
+            actual_output="Paris is the capital.",
+            expected_output="Paris",
+            retrieval_context=["Paris is the capital of France."]
+        )
+    ]
+
+    metrics = [
+        AnswerRelevancyMetric(model="gpt-4o-mini", threshold=0.7),
+        FaithfulnessMetric(model="gpt-4o-mini", threshold=0.8)
+    ]
+
+    # Results are saved to .eval_cache/ for dashboard viewing
+    results = await evaluate(
+        test_cases=test_cases,
+        metrics=metrics,
+        show_dashboard=True,  # ← Enable dashboard cache
+        session_name="My First Evaluation"  # Optional session name
+    )
+
+    return results
+
+asyncio.run(evaluate_with_dashboard())
+```
+
+### Typical Workflow
+
+**Terminal 1 - Start Dashboard (once):**
+```bash
+cd ~/my_project
+eval-lib dashboard
+# Leave this terminal open - dashboard stays running
+```
+
+**Terminal 2 - Run Evaluations (multiple times):**
+```python
+# Run evaluation 1
+results1 = await evaluate(
+    test_cases=test_cases1,
+    metrics=metrics,
+    show_dashboard=True,
+    session_name="Evaluation 1"
+)
+
+# Run evaluation 2
+results2 = await evaluate(
+    test_cases=test_cases2,
+    metrics=metrics,
+    show_dashboard=True,
+    session_name="Evaluation 2"
+)
+
+# All results are cached and viewable in dashboard
+```
+
+**Browser:**
+- Open `http://localhost:14500`
+- Refresh page (F5) to see new evaluation results
+- Switch between different evaluation sessions using the dropdown
+
+### Dashboard Features
+
+**Summary Cards:**
+- Total test cases evaluated
+- Total cost across all evaluations
+- Number of metrics used
+
+**Metrics Overview:**
+- Average scores per metric
+- Pass/fail counts
+- Success rates
+- Model used for evaluation
+- Total cost per metric
+
+**Detailed Results Table:**
+- Test case inputs and outputs
+- Individual metric scores
+- Pass/fail status
+- Click "View Details" for full information including:
+  - Complete input/output/expected output
+  - Full retrieval context
+  - Detailed evaluation reasoning
+  - Complete evaluation logs
+
+**Charts:**
+- Bar chart: Average scores by metric
+- Doughnut chart: Success rate distribution
+
+### Cache Management
+
+Results are stored in `.eval_cache/results.json` in your project directory:
+```bash
+# View cache contents
+cat .eval_cache/results.json
+
+# Clear cache via dashboard
+# Click "Clear Cache" button in dashboard UI
+
+# Or manually delete cache
+rm -rf .eval_cache/
+```
+
+### CLI Commands
+```bash
+# Start dashboard with defaults
+eval-lib dashboard
+
+# Custom port
+eval-lib dashboard --port 8080
+
+# Custom cache directory
+eval-lib dashboard --cache-dir /path/to/project/.eval_cache
+
+# Check library version
+eval-lib version
+
+# Help
+eval-lib help
+```
+
 ## Custom LLM Providers

 The library supports custom LLM providers through the `CustomLLMClient` abstract base class. This allows you to integrate any LLM provider, including internal corporate models, locally-hosted models, or custom endpoints.
{eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/README.md +164 -0

@@ -748,6 +748,170 @@ response, cost = await chat_complete(
 )
 ```

+## Dashboard
+
+The library includes an interactive web dashboard for visualizing evaluation results. All evaluation results are automatically saved to cache and can be viewed in a beautiful web interface.
+
+### Features
+
+- 📊 **Interactive Charts**: Visual representation of metrics with Chart.js
+- 📈 **Metrics Summary**: Aggregate statistics across all evaluations
+- 🔍 **Detailed View**: Drill down into individual test cases and metric results
+- 💾 **Session History**: Access past evaluation runs
+- 🎨 **Beautiful UI**: Modern, responsive interface with color-coded results
+- 🔄 **Real-time Updates**: Refresh to see new evaluation results
+
+### Starting the Dashboard
+
+The dashboard runs as a separate server that you start once and keep running:
+```bash
+# Start dashboard server (from your project directory)
+eval-lib dashboard
+
+# Custom port if 14500 is busy
+eval-lib dashboard --port 8080
+
+# Custom cache directory
+eval-lib dashboard --cache-dir /path/to/cache
+```
+
+Once started, the dashboard will be available at `http://localhost:14500`
+
+### Saving Results to Dashboard
+
+Enable dashboard cache saving in your evaluation:
+```python
+import asyncio
+from eval_lib import (
+    evaluate,
+    EvalTestCase,
+    AnswerRelevancyMetric,
+    FaithfulnessMetric
+)
+
+async def evaluate_with_dashboard():
+    test_cases = [
+        EvalTestCase(
+            input="What is the capital of France?",
+            actual_output="Paris is the capital.",
+            expected_output="Paris",
+            retrieval_context=["Paris is the capital of France."]
+        )
+    ]
+
+    metrics = [
+        AnswerRelevancyMetric(model="gpt-4o-mini", threshold=0.7),
+        FaithfulnessMetric(model="gpt-4o-mini", threshold=0.8)
+    ]
+
+    # Results are saved to .eval_cache/ for dashboard viewing
+    results = await evaluate(
+        test_cases=test_cases,
+        metrics=metrics,
+        show_dashboard=True,  # ← Enable dashboard cache
+        session_name="My First Evaluation"  # Optional session name
+    )
+
+    return results
+
+asyncio.run(evaluate_with_dashboard())
+```
+
+### Typical Workflow
+
+**Terminal 1 - Start Dashboard (once):**
+```bash
+cd ~/my_project
+eval-lib dashboard
+# Leave this terminal open - dashboard stays running
+```
+
+**Terminal 2 - Run Evaluations (multiple times):**
+```python
+# Run evaluation 1
+results1 = await evaluate(
+    test_cases=test_cases1,
+    metrics=metrics,
+    show_dashboard=True,
+    session_name="Evaluation 1"
+)
+
+# Run evaluation 2
+results2 = await evaluate(
+    test_cases=test_cases2,
+    metrics=metrics,
+    show_dashboard=True,
+    session_name="Evaluation 2"
+)
+
+# All results are cached and viewable in dashboard
+```
+
+**Browser:**
+- Open `http://localhost:14500`
+- Refresh page (F5) to see new evaluation results
+- Switch between different evaluation sessions using the dropdown
+
+### Dashboard Features
+
+**Summary Cards:**
+- Total test cases evaluated
+- Total cost across all evaluations
+- Number of metrics used
+
+**Metrics Overview:**
+- Average scores per metric
+- Pass/fail counts
+- Success rates
+- Model used for evaluation
+- Total cost per metric
+
+**Detailed Results Table:**
+- Test case inputs and outputs
+- Individual metric scores
+- Pass/fail status
+- Click "View Details" for full information including:
+  - Complete input/output/expected output
+  - Full retrieval context
+  - Detailed evaluation reasoning
+  - Complete evaluation logs
+
+**Charts:**
+- Bar chart: Average scores by metric
+- Doughnut chart: Success rate distribution
+
+### Cache Management
+
+Results are stored in `.eval_cache/results.json` in your project directory:
+```bash
+# View cache contents
+cat .eval_cache/results.json
+
+# Clear cache via dashboard
+# Click "Clear Cache" button in dashboard UI
+
+# Or manually delete cache
+rm -rf .eval_cache/
+```
+
+### CLI Commands
+```bash
+# Start dashboard with defaults
+eval-lib dashboard
+
+# Custom port
+eval-lib dashboard --port 8080
+
+# Custom cache directory
+eval-lib dashboard --cache-dir /path/to/project/.eval_cache
+
+# Check library version
+eval-lib version
+
+# Help
+eval-lib help
+```
+
 ## Custom LLM Providers

 The library supports custom LLM providers through the `CustomLLMClient` abstract base class. This allows you to integrate any LLM provider, including internal corporate models, locally-hosted models, or custom endpoints.
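The README hunk above documents a JSON cache at `.eval_cache/results.json` that backs the dashboard. As a minimal sketch of inspecting that cache without the web UI — the path comes from the documentation added in this release, but the JSON layout is not shown anywhere in this diff, so the code below deliberately assumes nothing about its schema:

```python
import json
from pathlib import Path

# Path documented in the new Dashboard section; the payload schema is not
# part of this diff, so only a generic summary is printed.
cache_file = Path(".eval_cache") / "results.json"

if not cache_file.exists():
    print("No dashboard cache yet - run evaluate(show_dashboard=True) first.")
else:
    data = json.loads(cache_file.read_text())
    if isinstance(data, dict):
        print("Top-level keys:", sorted(data.keys()))
    elif isinstance(data, list):
        print("Cached entries:", len(data))
    else:
        print("Unexpected cache payload of type", type(data).__name__)
```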
{eval_ai_library-0.3.3 → eval_ai_library-0.3.10/eval_ai_library.egg-info}/PKG-INFO +166 -1

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: eval-ai-library
-Version: 0.3.3
+Version: 0.3.10
 Summary: Comprehensive AI Model Evaluation Framework with support for multiple LLM providers
 Author-email: Aleksandr Meshkov <alekslynx90@gmail.com>
 License: MIT
@@ -45,6 +45,7 @@ Requires-Dist: html2text>=2020.1.16
 Requires-Dist: markdown>=3.4.0
 Requires-Dist: pandas>=2.0.0
 Requires-Dist: striprtf>=0.0.26
+Requires-Dist: flask>=3.0.0
 Provides-Extra: dev
 Requires-Dist: pytest>=7.0.0; extra == "dev"
 Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
@@ -807,6 +808,170 @@ response, cost = await chat_complete(
 )
 ```

+## Dashboard
+
+The library includes an interactive web dashboard for visualizing evaluation results. All evaluation results are automatically saved to cache and can be viewed in a beautiful web interface.
+
+### Features
+
+- 📊 **Interactive Charts**: Visual representation of metrics with Chart.js
+- 📈 **Metrics Summary**: Aggregate statistics across all evaluations
+- 🔍 **Detailed View**: Drill down into individual test cases and metric results
+- 💾 **Session History**: Access past evaluation runs
+- 🎨 **Beautiful UI**: Modern, responsive interface with color-coded results
+- 🔄 **Real-time Updates**: Refresh to see new evaluation results
+
+### Starting the Dashboard
+
+The dashboard runs as a separate server that you start once and keep running:
+```bash
+# Start dashboard server (from your project directory)
+eval-lib dashboard
+
+# Custom port if 14500 is busy
+eval-lib dashboard --port 8080
+
+# Custom cache directory
+eval-lib dashboard --cache-dir /path/to/cache
+```
+
+Once started, the dashboard will be available at `http://localhost:14500`
+
+### Saving Results to Dashboard
+
+Enable dashboard cache saving in your evaluation:
+```python
+import asyncio
+from eval_lib import (
+    evaluate,
+    EvalTestCase,
+    AnswerRelevancyMetric,
+    FaithfulnessMetric
+)
+
+async def evaluate_with_dashboard():
+    test_cases = [
+        EvalTestCase(
+            input="What is the capital of France?",
+            actual_output="Paris is the capital.",
+            expected_output="Paris",
+            retrieval_context=["Paris is the capital of France."]
+        )
+    ]
+
+    metrics = [
+        AnswerRelevancyMetric(model="gpt-4o-mini", threshold=0.7),
+        FaithfulnessMetric(model="gpt-4o-mini", threshold=0.8)
+    ]
+
+    # Results are saved to .eval_cache/ for dashboard viewing
+    results = await evaluate(
+        test_cases=test_cases,
+        metrics=metrics,
+        show_dashboard=True,  # ← Enable dashboard cache
+        session_name="My First Evaluation"  # Optional session name
+    )
+
+    return results
+
+asyncio.run(evaluate_with_dashboard())
+```
+
+### Typical Workflow
+
+**Terminal 1 - Start Dashboard (once):**
+```bash
+cd ~/my_project
+eval-lib dashboard
+# Leave this terminal open - dashboard stays running
+```
+
+**Terminal 2 - Run Evaluations (multiple times):**
+```python
+# Run evaluation 1
+results1 = await evaluate(
+    test_cases=test_cases1,
+    metrics=metrics,
+    show_dashboard=True,
+    session_name="Evaluation 1"
+)
+
+# Run evaluation 2
+results2 = await evaluate(
+    test_cases=test_cases2,
+    metrics=metrics,
+    show_dashboard=True,
+    session_name="Evaluation 2"
+)
+
+# All results are cached and viewable in dashboard
+```
+
+**Browser:**
+- Open `http://localhost:14500`
+- Refresh page (F5) to see new evaluation results
+- Switch between different evaluation sessions using the dropdown
+
+### Dashboard Features
+
+**Summary Cards:**
+- Total test cases evaluated
+- Total cost across all evaluations
+- Number of metrics used
+
+**Metrics Overview:**
+- Average scores per metric
+- Pass/fail counts
+- Success rates
+- Model used for evaluation
+- Total cost per metric
+
+**Detailed Results Table:**
+- Test case inputs and outputs
+- Individual metric scores
+- Pass/fail status
+- Click "View Details" for full information including:
+  - Complete input/output/expected output
+  - Full retrieval context
+  - Detailed evaluation reasoning
+  - Complete evaluation logs
+
+**Charts:**
+- Bar chart: Average scores by metric
+- Doughnut chart: Success rate distribution
+
+### Cache Management
+
+Results are stored in `.eval_cache/results.json` in your project directory:
+```bash
+# View cache contents
+cat .eval_cache/results.json
+
+# Clear cache via dashboard
+# Click "Clear Cache" button in dashboard UI
+
+# Or manually delete cache
+rm -rf .eval_cache/
+```
+
+### CLI Commands
+```bash
+# Start dashboard with defaults
+eval-lib dashboard
+
+# Custom port
+eval-lib dashboard --port 8080
+
+# Custom cache directory
+eval-lib dashboard --cache-dir /path/to/project/.eval_cache
+
+# Check library version
+eval-lib version
+
+# Help
+eval-lib help
+```
+
 ## Custom LLM Providers

 The library supports custom LLM providers through the `CustomLLMClient` abstract base class. This allows you to integrate any LLM provider, including internal corporate models, locally-hosted models, or custom endpoints.
{eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_ai_library.egg-info/SOURCES.txt +4 -0

@@ -6,11 +6,15 @@ setup.py
 eval_ai_library.egg-info/PKG-INFO
 eval_ai_library.egg-info/SOURCES.txt
 eval_ai_library.egg-info/dependency_links.txt
+eval_ai_library.egg-info/entry_points.txt
 eval_ai_library.egg-info/requires.txt
 eval_ai_library.egg-info/top_level.txt
 eval_lib/__init__.py
+eval_lib/cli.py
+eval_lib/dashboard_server.py
 eval_lib/evaluate.py
 eval_lib/evaluation_schema.py
+eval_lib/html.py
 eval_lib/llm_client.py
 eval_lib/metric_pattern.py
 eval_lib/price.py
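SOURCES.txt now tracks `entry_points.txt` and `eval_lib/cli.py`, which presumably register the `eval-lib` command used throughout the new README examples. A small sketch for confirming how the console script is wired up on an installed 0.3.10 — it uses only the standard library (Python 3.10+ for the `group=` keyword); that the command is exposed as a `console_scripts` entry point named `eval-lib` is an assumption, since `entry_points.txt` itself is not shown in this section:

```python
from importlib.metadata import entry_points, version

# Version reported by the installed distribution; expected to be 0.3.10.
print(version("eval-ai-library"))

# Look for the console script that entry_points.txt presumably declares.
for ep in entry_points(group="console_scripts"):
    if ep.name == "eval-lib":
        # ep.value is the target callable, e.g. somewhere in eval_lib.cli (assumed).
        print(f"{ep.name} -> {ep.value}")
```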
{eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/__init__.py +9 -1

@@ -7,7 +7,7 @@ A powerful library for evaluating AI models with support for multiple LLM provid
 and a wide range of evaluation metrics for RAG systems and AI agents.
 """

-__version__ = "0.3.3"
+__version__ = "0.3.10"
 __author__ = "Aleksandr Meshkov"

 # Core evaluation functions
@@ -66,6 +66,10 @@ from eval_lib.agent_metrics import (
     KnowledgeRetentionMetric
 )

+from .dashboard_server import (
+    DashboardCache
+)
+

 def __getattr__(name):
     """
@@ -136,4 +140,8 @@ __all__ = [
     # Utils
     "score_agg",
     "extract_json_block",
+
+    # Dashboard
+    'start_dashboard',
+    'DashboardCache',
 ]