eval-ai-library 0.3.3.tar.gz → 0.3.10.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45)
  1. {eval_ai_library-0.3.3/eval_ai_library.egg-info → eval_ai_library-0.3.10}/PKG-INFO +166 -1
  2. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/README.md +164 -0
  3. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10/eval_ai_library.egg-info}/PKG-INFO +166 -1
  4. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_ai_library.egg-info/SOURCES.txt +4 -0
  5. eval_ai_library-0.3.10/eval_ai_library.egg-info/entry_points.txt +2 -0
  6. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_ai_library.egg-info/requires.txt +1 -0
  7. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/__init__.py +9 -1
  8. eval_ai_library-0.3.10/eval_lib/cli.py +166 -0
  9. eval_ai_library-0.3.10/eval_lib/dashboard_server.py +172 -0
  10. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/evaluate.py +24 -1
  11. eval_ai_library-0.3.10/eval_lib/html.py +736 -0
  12. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/pyproject.toml +8 -1
  13. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/LICENSE +0 -0
  14. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/MANIFEST.in +0 -0
  15. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_ai_library.egg-info/dependency_links.txt +0 -0
  16. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_ai_library.egg-info/top_level.txt +0 -0
  17. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/agent_metrics/__init__.py +0 -0
  18. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py +0 -0
  19. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/agent_metrics/role_adherence_metric/role_adherence.py +0 -0
  20. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/agent_metrics/task_success_metric/task_success_rate.py +0 -0
  21. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/agent_metrics/tools_correctness_metric/tool_correctness.py +0 -0
  22. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/datagenerator/datagenerator.py +0 -0
  23. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/datagenerator/document_loader.py +0 -0
  24. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/datagenerator/prompts.py +0 -0
  25. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/evaluation_schema.py +0 -0
  26. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/llm_client.py +0 -0
  27. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/metric_pattern.py +0 -0
  28. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/metrics/__init__.py +0 -0
  29. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/metrics/answer_precision_metric/answer_precision.py +0 -0
  30. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py +0 -0
  31. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/metrics/bias_metric/bias.py +0 -0
  32. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/metrics/contextual_precision_metric/contextual_precision.py +0 -0
  33. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/metrics/contextual_recall_metric/contextual_recall.py +0 -0
  34. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py +0 -0
  35. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/metrics/custom_metric/custom_eval.py +0 -0
  36. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/metrics/faithfulness_metric/faithfulness.py +0 -0
  37. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/metrics/geval/geval.py +0 -0
  38. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py +0 -0
  39. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/metrics/toxicity_metric/toxicity.py +0 -0
  40. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/price.py +0 -0
  41. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/py.typed +0 -0
  42. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/testcases_schema.py +0 -0
  43. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/eval_lib/utils.py +0 -0
  44. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/setup.cfg +0 -0
  45. {eval_ai_library-0.3.3 → eval_ai_library-0.3.10}/setup.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: eval-ai-library
- Version: 0.3.3
+ Version: 0.3.10
  Summary: Comprehensive AI Model Evaluation Framework with support for multiple LLM providers
  Author-email: Aleksandr Meshkov <alekslynx90@gmail.com>
  License: MIT
@@ -45,6 +45,7 @@ Requires-Dist: html2text>=2020.1.16
  Requires-Dist: markdown>=3.4.0
  Requires-Dist: pandas>=2.0.0
  Requires-Dist: striprtf>=0.0.26
+ Requires-Dist: flask>=3.0.0
  Provides-Extra: dev
  Requires-Dist: pytest>=7.0.0; extra == "dev"
  Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
@@ -807,6 +808,170 @@ response, cost = await chat_complete(
  )
  ```

+ ## Dashboard
+
+ The library includes an interactive web dashboard for visualizing evaluation results. All evaluation results are automatically saved to cache and can be viewed in a beautiful web interface.
+
+ ### Features
+
+ - 📊 **Interactive Charts**: Visual representation of metrics with Chart.js
+ - 📈 **Metrics Summary**: Aggregate statistics across all evaluations
+ - 🔍 **Detailed View**: Drill down into individual test cases and metric results
+ - 💾 **Session History**: Access past evaluation runs
+ - 🎨 **Beautiful UI**: Modern, responsive interface with color-coded results
+ - 🔄 **Real-time Updates**: Refresh to see new evaluation results
+
+ ### Starting the Dashboard
+
+ The dashboard runs as a separate server that you start once and keep running:
+ ```bash
+ # Start dashboard server (from your project directory)
+ eval-lib dashboard
+
+ # Custom port if 14500 is busy
+ eval-lib dashboard --port 8080
+
+ # Custom cache directory
+ eval-lib dashboard --cache-dir /path/to/cache
+ ```
+
+ Once started, the dashboard will be available at `http://localhost:14500`
+
+ ### Saving Results to Dashboard
+
+ Enable dashboard cache saving in your evaluation:
+ ```python
+ import asyncio
+ from eval_lib import (
+     evaluate,
+     EvalTestCase,
+     AnswerRelevancyMetric,
+     FaithfulnessMetric
+ )
+
+ async def evaluate_with_dashboard():
+     test_cases = [
+         EvalTestCase(
+             input="What is the capital of France?",
+             actual_output="Paris is the capital.",
+             expected_output="Paris",
+             retrieval_context=["Paris is the capital of France."]
+         )
+     ]
+
+     metrics = [
+         AnswerRelevancyMetric(model="gpt-4o-mini", threshold=0.7),
+         FaithfulnessMetric(model="gpt-4o-mini", threshold=0.8)
+     ]
+
+     # Results are saved to .eval_cache/ for dashboard viewing
+     results = await evaluate(
+         test_cases=test_cases,
+         metrics=metrics,
+         show_dashboard=True,  # ← Enable dashboard cache
+         session_name="My First Evaluation"  # Optional session name
+     )
+
+     return results
+
+ asyncio.run(evaluate_with_dashboard())
+ ```
+
+ ### Typical Workflow
+
+ **Terminal 1 - Start Dashboard (once):**
+ ```bash
+ cd ~/my_project
+ eval-lib dashboard
+ # Leave this terminal open - dashboard stays running
+ ```
+
+ **Terminal 2 - Run Evaluations (multiple times):**
+ ```python
+ # Run evaluation 1
+ results1 = await evaluate(
+     test_cases=test_cases1,
+     metrics=metrics,
+     show_dashboard=True,
+     session_name="Evaluation 1"
+ )
+
+ # Run evaluation 2
+ results2 = await evaluate(
+     test_cases=test_cases2,
+     metrics=metrics,
+     show_dashboard=True,
+     session_name="Evaluation 2"
+ )
+
+ # All results are cached and viewable in dashboard
+ ```
+
+ **Browser:**
+ - Open `http://localhost:14500`
+ - Refresh page (F5) to see new evaluation results
+ - Switch between different evaluation sessions using the dropdown
+
+ ### Dashboard Features
+
+ **Summary Cards:**
+ - Total test cases evaluated
+ - Total cost across all evaluations
+ - Number of metrics used
+
+ **Metrics Overview:**
+ - Average scores per metric
+ - Pass/fail counts
+ - Success rates
+ - Model used for evaluation
+ - Total cost per metric
+
+ **Detailed Results Table:**
+ - Test case inputs and outputs
+ - Individual metric scores
+ - Pass/fail status
+ - Click "View Details" for full information including:
+   - Complete input/output/expected output
+   - Full retrieval context
+   - Detailed evaluation reasoning
+   - Complete evaluation logs
+
+ **Charts:**
+ - Bar chart: Average scores by metric
+ - Doughnut chart: Success rate distribution
+
+ ### Cache Management
+
+ Results are stored in `.eval_cache/results.json` in your project directory:
+ ```bash
+ # View cache contents
+ cat .eval_cache/results.json
+
+ # Clear cache via dashboard
+ # Click "Clear Cache" button in dashboard UI
+
+ # Or manually delete cache
+ rm -rf .eval_cache/
+ ```
+
+ ### CLI Commands
+ ```bash
+ # Start dashboard with defaults
+ eval-lib dashboard
+
+ # Custom port
+ eval-lib dashboard --port 8080
+
+ # Custom cache directory
+ eval-lib dashboard --cache-dir /path/to/project/.eval_cache
+
+ # Check library version
+ eval-lib version
+
+ # Help
+ eval-lib help
+ ```
+
  ## Custom LLM Providers

  The library supports custom LLM providers through the `CustomLLMClient` abstract base class. This allows you to integrate any LLM provider, including internal corporate models, locally-hosted models, or custom endpoints.
@@ -748,6 +748,170 @@ response, cost = await chat_complete(
  )
  ```

+ ## Dashboard
+
+ The library includes an interactive web dashboard for visualizing evaluation results. All evaluation results are automatically saved to cache and can be viewed in a beautiful web interface.
+
+ ### Features
+
+ - 📊 **Interactive Charts**: Visual representation of metrics with Chart.js
+ - 📈 **Metrics Summary**: Aggregate statistics across all evaluations
+ - 🔍 **Detailed View**: Drill down into individual test cases and metric results
+ - 💾 **Session History**: Access past evaluation runs
+ - 🎨 **Beautiful UI**: Modern, responsive interface with color-coded results
+ - 🔄 **Real-time Updates**: Refresh to see new evaluation results
+
+ ### Starting the Dashboard
+
+ The dashboard runs as a separate server that you start once and keep running:
+ ```bash
+ # Start dashboard server (from your project directory)
+ eval-lib dashboard
+
+ # Custom port if 14500 is busy
+ eval-lib dashboard --port 8080
+
+ # Custom cache directory
+ eval-lib dashboard --cache-dir /path/to/cache
+ ```
+
+ Once started, the dashboard will be available at `http://localhost:14500`
+
+ ### Saving Results to Dashboard
+
+ Enable dashboard cache saving in your evaluation:
+ ```python
+ import asyncio
+ from eval_lib import (
+     evaluate,
+     EvalTestCase,
+     AnswerRelevancyMetric,
+     FaithfulnessMetric
+ )
+
+ async def evaluate_with_dashboard():
+     test_cases = [
+         EvalTestCase(
+             input="What is the capital of France?",
+             actual_output="Paris is the capital.",
+             expected_output="Paris",
+             retrieval_context=["Paris is the capital of France."]
+         )
+     ]
+
+     metrics = [
+         AnswerRelevancyMetric(model="gpt-4o-mini", threshold=0.7),
+         FaithfulnessMetric(model="gpt-4o-mini", threshold=0.8)
+     ]
+
+     # Results are saved to .eval_cache/ for dashboard viewing
+     results = await evaluate(
+         test_cases=test_cases,
+         metrics=metrics,
+         show_dashboard=True,  # ← Enable dashboard cache
+         session_name="My First Evaluation"  # Optional session name
+     )
+
+     return results
+
+ asyncio.run(evaluate_with_dashboard())
+ ```
+
+ ### Typical Workflow
+
+ **Terminal 1 - Start Dashboard (once):**
+ ```bash
+ cd ~/my_project
+ eval-lib dashboard
+ # Leave this terminal open - dashboard stays running
+ ```
+
+ **Terminal 2 - Run Evaluations (multiple times):**
+ ```python
+ # Run evaluation 1
+ results1 = await evaluate(
+     test_cases=test_cases1,
+     metrics=metrics,
+     show_dashboard=True,
+     session_name="Evaluation 1"
+ )
+
+ # Run evaluation 2
+ results2 = await evaluate(
+     test_cases=test_cases2,
+     metrics=metrics,
+     show_dashboard=True,
+     session_name="Evaluation 2"
+ )
+
+ # All results are cached and viewable in dashboard
+ ```
+
+ **Browser:**
+ - Open `http://localhost:14500`
+ - Refresh page (F5) to see new evaluation results
+ - Switch between different evaluation sessions using the dropdown
+
+ ### Dashboard Features
+
+ **Summary Cards:**
+ - Total test cases evaluated
+ - Total cost across all evaluations
+ - Number of metrics used
+
+ **Metrics Overview:**
+ - Average scores per metric
+ - Pass/fail counts
+ - Success rates
+ - Model used for evaluation
+ - Total cost per metric
+
+ **Detailed Results Table:**
+ - Test case inputs and outputs
+ - Individual metric scores
+ - Pass/fail status
+ - Click "View Details" for full information including:
+   - Complete input/output/expected output
+   - Full retrieval context
+   - Detailed evaluation reasoning
+   - Complete evaluation logs
+
+ **Charts:**
+ - Bar chart: Average scores by metric
+ - Doughnut chart: Success rate distribution
+
+ ### Cache Management
+
+ Results are stored in `.eval_cache/results.json` in your project directory:
+ ```bash
+ # View cache contents
+ cat .eval_cache/results.json
+
+ # Clear cache via dashboard
+ # Click "Clear Cache" button in dashboard UI
+
+ # Or manually delete cache
+ rm -rf .eval_cache/
+ ```
+
+ ### CLI Commands
+ ```bash
+ # Start dashboard with defaults
+ eval-lib dashboard
+
+ # Custom port
+ eval-lib dashboard --port 8080
+
+ # Custom cache directory
+ eval-lib dashboard --cache-dir /path/to/project/.eval_cache
+
+ # Check library version
+ eval-lib version
+
+ # Help
+ eval-lib help
+ ```
+
  ## Custom LLM Providers

  The library supports custom LLM providers through the `CustomLLMClient` abstract base class. This allows you to integrate any LLM provider, including internal corporate models, locally-hosted models, or custom endpoints.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: eval-ai-library
- Version: 0.3.3
+ Version: 0.3.10
  Summary: Comprehensive AI Model Evaluation Framework with support for multiple LLM providers
  Author-email: Aleksandr Meshkov <alekslynx90@gmail.com>
  License: MIT
@@ -45,6 +45,7 @@ Requires-Dist: html2text>=2020.1.16
  Requires-Dist: markdown>=3.4.0
  Requires-Dist: pandas>=2.0.0
  Requires-Dist: striprtf>=0.0.26
+ Requires-Dist: flask>=3.0.0
  Provides-Extra: dev
  Requires-Dist: pytest>=7.0.0; extra == "dev"
  Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
@@ -807,6 +808,170 @@ response, cost = await chat_complete(
  )
  ```

+ ## Dashboard
+
+ The library includes an interactive web dashboard for visualizing evaluation results. All evaluation results are automatically saved to cache and can be viewed in a beautiful web interface.
+
+ ### Features
+
+ - 📊 **Interactive Charts**: Visual representation of metrics with Chart.js
+ - 📈 **Metrics Summary**: Aggregate statistics across all evaluations
+ - 🔍 **Detailed View**: Drill down into individual test cases and metric results
+ - 💾 **Session History**: Access past evaluation runs
+ - 🎨 **Beautiful UI**: Modern, responsive interface with color-coded results
+ - 🔄 **Real-time Updates**: Refresh to see new evaluation results
+
+ ### Starting the Dashboard
+
+ The dashboard runs as a separate server that you start once and keep running:
+ ```bash
+ # Start dashboard server (from your project directory)
+ eval-lib dashboard
+
+ # Custom port if 14500 is busy
+ eval-lib dashboard --port 8080
+
+ # Custom cache directory
+ eval-lib dashboard --cache-dir /path/to/cache
+ ```
+
+ Once started, the dashboard will be available at `http://localhost:14500`
+
+ ### Saving Results to Dashboard
+
+ Enable dashboard cache saving in your evaluation:
+ ```python
+ import asyncio
+ from eval_lib import (
+     evaluate,
+     EvalTestCase,
+     AnswerRelevancyMetric,
+     FaithfulnessMetric
+ )
+
+ async def evaluate_with_dashboard():
+     test_cases = [
+         EvalTestCase(
+             input="What is the capital of France?",
+             actual_output="Paris is the capital.",
+             expected_output="Paris",
+             retrieval_context=["Paris is the capital of France."]
+         )
+     ]
+
+     metrics = [
+         AnswerRelevancyMetric(model="gpt-4o-mini", threshold=0.7),
+         FaithfulnessMetric(model="gpt-4o-mini", threshold=0.8)
+     ]
+
+     # Results are saved to .eval_cache/ for dashboard viewing
+     results = await evaluate(
+         test_cases=test_cases,
+         metrics=metrics,
+         show_dashboard=True,  # ← Enable dashboard cache
+         session_name="My First Evaluation"  # Optional session name
+     )
+
+     return results
+
+ asyncio.run(evaluate_with_dashboard())
+ ```
+
+ ### Typical Workflow
+
+ **Terminal 1 - Start Dashboard (once):**
+ ```bash
+ cd ~/my_project
+ eval-lib dashboard
+ # Leave this terminal open - dashboard stays running
+ ```
+
+ **Terminal 2 - Run Evaluations (multiple times):**
+ ```python
+ # Run evaluation 1
+ results1 = await evaluate(
+     test_cases=test_cases1,
+     metrics=metrics,
+     show_dashboard=True,
+     session_name="Evaluation 1"
+ )
+
+ # Run evaluation 2
+ results2 = await evaluate(
+     test_cases=test_cases2,
+     metrics=metrics,
+     show_dashboard=True,
+     session_name="Evaluation 2"
+ )
+
+ # All results are cached and viewable in dashboard
+ ```
+
+ **Browser:**
+ - Open `http://localhost:14500`
+ - Refresh page (F5) to see new evaluation results
+ - Switch between different evaluation sessions using the dropdown
+
+ ### Dashboard Features
+
+ **Summary Cards:**
+ - Total test cases evaluated
+ - Total cost across all evaluations
+ - Number of metrics used
+
+ **Metrics Overview:**
+ - Average scores per metric
+ - Pass/fail counts
+ - Success rates
+ - Model used for evaluation
+ - Total cost per metric
+
+ **Detailed Results Table:**
+ - Test case inputs and outputs
+ - Individual metric scores
+ - Pass/fail status
+ - Click "View Details" for full information including:
+   - Complete input/output/expected output
+   - Full retrieval context
+   - Detailed evaluation reasoning
+   - Complete evaluation logs
+
+ **Charts:**
+ - Bar chart: Average scores by metric
+ - Doughnut chart: Success rate distribution
+
+ ### Cache Management
+
+ Results are stored in `.eval_cache/results.json` in your project directory:
+ ```bash
+ # View cache contents
+ cat .eval_cache/results.json
+
+ # Clear cache via dashboard
+ # Click "Clear Cache" button in dashboard UI
+
+ # Or manually delete cache
+ rm -rf .eval_cache/
+ ```
+
+ ### CLI Commands
+ ```bash
+ # Start dashboard with defaults
+ eval-lib dashboard
+
+ # Custom port
+ eval-lib dashboard --port 8080
+
+ # Custom cache directory
+ eval-lib dashboard --cache-dir /path/to/project/.eval_cache
+
+ # Check library version
+ eval-lib version
+
+ # Help
+ eval-lib help
+ ```
+
  ## Custom LLM Providers

  The library supports custom LLM providers through the `CustomLLMClient` abstract base class. This allows you to integrate any LLM provider, including internal corporate models, locally-hosted models, or custom endpoints.
@@ -6,11 +6,15 @@ setup.py
  eval_ai_library.egg-info/PKG-INFO
  eval_ai_library.egg-info/SOURCES.txt
  eval_ai_library.egg-info/dependency_links.txt
+ eval_ai_library.egg-info/entry_points.txt
  eval_ai_library.egg-info/requires.txt
  eval_ai_library.egg-info/top_level.txt
  eval_lib/__init__.py
+ eval_lib/cli.py
+ eval_lib/dashboard_server.py
  eval_lib/evaluate.py
  eval_lib/evaluation_schema.py
+ eval_lib/html.py
  eval_lib/llm_client.py
  eval_lib/metric_pattern.py
  eval_lib/price.py
@@ -0,0 +1,2 @@
+ [console_scripts]
+ eval-lib = eval_lib.cli:main
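
Note: the new `console_scripts` entry maps the `eval-lib` command to `eval_lib.cli:main`, but `eval_lib/cli.py` itself (166 lines) is not shown in this diff. The following is only a minimal sketch, assuming an argparse-style `main()` that exposes the `dashboard`, `version`, and `help` commands documented in the README; the hand-off to the dashboard server is left as a placeholder because its function name is not visible here.

```python
# Hypothetical sketch only - not the cli.py shipped in 0.3.10.
import argparse

try:
    from eval_lib import __version__
except ImportError:  # allow running the sketch without the package installed
    __version__ = "0.3.10"


def main() -> None:
    parser = argparse.ArgumentParser(
        prog="eval-lib",
        description="eval-ai-library command line interface (sketch)",
    )
    subparsers = parser.add_subparsers(dest="command")

    dash = subparsers.add_parser("dashboard", help="start the dashboard server")
    dash.add_argument("--port", type=int, default=14500,
                      help="port to serve on (README default: 14500)")
    dash.add_argument("--cache-dir", default=".eval_cache",
                      help="directory containing cached evaluation results")

    subparsers.add_parser("version", help="print the library version")
    subparsers.add_parser("help", help="show this help message")

    args = parser.parse_args()
    if args.command == "dashboard":
        # The real CLI presumably hands off to eval_lib.dashboard_server here;
        # that function's name is an unknown, so the sketch only echoes the options.
        print(f"Would serve dashboard on port {args.port} from {args.cache_dir}")
    elif args.command == "version":
        print(__version__)
    else:  # "help" or no subcommand
        parser.print_help()


if __name__ == "__main__":
    main()
```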
@@ -20,6 +20,7 @@ html2text>=2020.1.16
  markdown>=3.4.0
  pandas>=2.0.0
  striprtf>=0.0.26
+ flask>=3.0.0

  [dev]
  pytest>=7.0.0
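
Note: Flask becomes a runtime dependency in 0.3.10 because the new `eval_lib/dashboard_server.py` serves the dashboard over HTTP. The actual server (172 lines, plus the markup in `eval_lib/html.py`) is not part of this diff; the snippet below is only an illustrative sketch built from what the README documents (results cached in `.eval_cache/results.json`, default port 14500), with an assumed `/api/results` route name.

```python
# Illustrative sketch only - not the dashboard_server.py shipped in 0.3.10.
import json
from pathlib import Path

from flask import Flask, jsonify

CACHE_FILE = Path(".eval_cache") / "results.json"  # documented cache location

app = Flask(__name__)


@app.route("/api/results")  # route name is an assumption for this sketch
def results():
    """Return cached evaluation results as JSON (empty list if no cache yet)."""
    if CACHE_FILE.exists():
        return jsonify(json.loads(CACHE_FILE.read_text()))
    return jsonify([])


if __name__ == "__main__":
    # 14500 is the documented default dashboard port.
    app.run(port=14500)
```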
@@ -7,7 +7,7 @@ A powerful library for evaluating AI models with support for multiple LLM provid
  and a wide range of evaluation metrics for RAG systems and AI agents.
  """

- __version__ = "0.3.3"
+ __version__ = "0.3.10"
  __author__ = "Aleksandr Meshkov"

  # Core evaluation functions
@@ -66,6 +66,10 @@ from eval_lib.agent_metrics import (
      KnowledgeRetentionMetric
  )

+ from .dashboard_server import (
+     DashboardCache
+ )
+

  def __getattr__(name):
      """
@@ -136,4 +140,8 @@ __all__ = [
      # Utils
      "score_agg",
      "extract_json_block",
+
+     # Dashboard
+     'start_dashboard',
+     'DashboardCache',
  ]
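
Note: the `__init__.py` changes import `DashboardCache` eagerly but add `start_dashboard` only to `__all__`, so the package's existing module-level `__getattr__` presumably resolves it lazily on first access (PEP 562). How 0.3.10 actually wires this up is not visible in the diff; the snippet below is a generic sketch of that lazy-export pattern for a package `__init__.py`, not the library's implementation.

```python
# Generic sketch of lazy exports via a module-level __getattr__ (PEP 562);
# intended to live in a package __init__.py, not the real eval_lib/__init__.py.
from importlib import import_module

__all__ = ["start_dashboard", "DashboardCache"]


def __getattr__(name: str):
    # Defer importing the Flask-based dashboard module until it is first used,
    # so plain `import eval_lib` stays cheap for users who never open the dashboard.
    if name == "start_dashboard":
        return getattr(import_module(".dashboard_server", __package__), "start_dashboard")
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```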