evalscope 0.6.0rc0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. evalscope/backend/opencompass/tasks/eval_datasets.py +1 -1
  2. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +230 -0
  3. evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt +43 -0
  4. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +87 -0
  5. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +36 -0
  6. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +26 -0
  7. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +41 -0
  8. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +60 -0
  9. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +36 -0
  10. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +22 -0
  11. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +35 -0
  12. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
  13. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
  14. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
  15. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
  16. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +34 -0
  17. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +36 -0
  18. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +25 -0
  19. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +7 -0
  20. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +39 -0
  21. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +16 -0
  22. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +24 -0
  23. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +18 -0
  24. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +120 -100
  25. evalscope/backend/rag_eval/utils/clip.py +149 -0
  26. evalscope/backend/rag_eval/utils/embedding.py +183 -0
  27. evalscope/backend/rag_eval/utils/llm.py +72 -0
  28. evalscope/backend/rag_eval/utils/tools.py +63 -0
  29. evalscope/backend/vlm_eval_kit/backend_manager.py +23 -21
  30. evalscope/benchmarks/ceval/samples.jsonl +1 -0
  31. evalscope/benchmarks/cmmlu/samples.jsonl +5 -0
  32. evalscope/benchmarks/mmlu/samples.jsonl +5 -0
  33. evalscope/benchmarks/race/samples.jsonl +5 -0
  34. evalscope/benchmarks/trivia_qa/samples.jsonl +5 -0
  35. evalscope/cli/start_perf.py +8 -11
  36. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  37. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +58485 -0
  38. evalscope/metrics/resources/gpt2-zhcn3-v4.json +1 -0
  39. evalscope/metrics/rouge_metric.py +30 -15
  40. evalscope/perf/arguments.py +179 -0
  41. evalscope/perf/benchmark.py +245 -0
  42. evalscope/perf/http_client.py +127 -711
  43. evalscope/perf/main.py +35 -0
  44. evalscope/perf/plugin/__init__.py +2 -0
  45. evalscope/perf/plugin/api/__init__.py +3 -0
  46. evalscope/perf/{api_plugin_base.py → plugin/api/base.py} +17 -18
  47. evalscope/perf/{custom_api.py → plugin/api/custom_api.py} +25 -19
  48. evalscope/perf/{dashscope_api.py → plugin/api/dashscope_api.py} +28 -14
  49. evalscope/perf/{openai_api.py → plugin/api/openai_api.py} +51 -27
  50. evalscope/perf/plugin/datasets/__init__.py +6 -0
  51. evalscope/perf/{dataset_plugin_base.py → plugin/datasets/base.py} +13 -10
  52. evalscope/perf/plugin/datasets/custom.py +21 -0
  53. evalscope/perf/plugin/datasets/flickr8k.py +51 -0
  54. evalscope/perf/{datasets → plugin/datasets}/line_by_line.py +9 -5
  55. evalscope/perf/plugin/datasets/longalpaca.py +28 -0
  56. evalscope/perf/plugin/datasets/openqa.py +38 -0
  57. evalscope/perf/plugin/datasets/speed_benchmark.py +50 -0
  58. evalscope/perf/plugin/registry.py +54 -0
  59. evalscope/perf/{how_to_analysis_result.py → utils/analysis_result.py} +11 -5
  60. evalscope/perf/utils/benchmark_util.py +135 -0
  61. evalscope/perf/utils/chat_service.py +252 -0
  62. evalscope/perf/utils/db_util.py +200 -0
  63. evalscope/perf/utils/handler.py +46 -0
  64. evalscope/perf/utils/local_server.py +139 -0
  65. evalscope/registry/config/cfg_arena.yaml +77 -0
  66. evalscope/registry/config/cfg_arena_zhihu.yaml +63 -0
  67. evalscope/registry/config/cfg_pairwise_baseline.yaml +83 -0
  68. evalscope/registry/config/cfg_single.yaml +78 -0
  69. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +8 -0
  70. evalscope/registry/data/prompt_template/prompt_templates.jsonl +8 -0
  71. evalscope/registry/data/qa_browser/battle.jsonl +634 -0
  72. evalscope/registry/data/qa_browser/category_mapping.yaml +10 -0
  73. evalscope/registry/data/question.jsonl +80 -0
  74. evalscope/third_party/longbench_write/README.md +118 -0
  75. evalscope/third_party/longbench_write/default_task.json +27 -0
  76. evalscope/third_party/longbench_write/default_task.yaml +24 -0
  77. evalscope/third_party/toolbench_static/README.md +118 -0
  78. evalscope/third_party/toolbench_static/config_default.json +15 -0
  79. evalscope/third_party/toolbench_static/config_default.yaml +12 -0
  80. evalscope/third_party/toolbench_static/requirements.txt +2 -0
  81. evalscope/utils/logger.py +18 -20
  82. evalscope/utils/utils.py +41 -42
  83. evalscope/version.py +2 -2
  84. evalscope-0.7.0.dist-info/LICENSE +203 -0
  85. {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/METADATA +162 -103
  86. {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/RECORD +107 -32
  87. {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/WHEEL +1 -1
  88. {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/top_level.txt +1 -0
  89. tests/cli/__init__.py +1 -0
  90. tests/cli/test_run.py +76 -0
  91. tests/perf/__init__.py +1 -0
  92. tests/perf/test_perf.py +96 -0
  93. tests/rag/__init__.py +0 -0
  94. tests/rag/test_clip_benchmark.py +85 -0
  95. tests/rag/test_mteb.py +136 -0
  96. tests/rag/test_ragas.py +120 -0
  97. tests/swift/__init__.py +1 -0
  98. tests/swift/test_run_swift_eval.py +146 -0
  99. tests/swift/test_run_swift_vlm_eval.py +128 -0
  100. tests/swift/test_run_swift_vlm_jugde_eval.py +157 -0
  101. tests/test_run_all.py +12 -0
  102. tests/vlm/__init__.py +1 -0
  103. tests/vlm/test_vlmeval.py +59 -0
  104. evalscope/perf/_logging.py +0 -32
  105. evalscope/perf/datasets/longalpaca_12k.py +0 -20
  106. evalscope/perf/datasets/openqa.py +0 -22
  107. evalscope/perf/plugin_registry.py +0 -35
  108. evalscope/perf/query_parameters.py +0 -42
  109. evalscope/perf/server_sent_event.py +0 -43
  110. evalscope/preprocess/tokenizers/gpt2_tokenizer.py +0 -221
  111. /evalscope/{perf/datasets → backend/rag_eval/utils}/__init__.py +0 -0
  112. /evalscope/{preprocess/tokenizers → perf/utils}/__init__.py +0 -0
  113. {evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/entry_points.txt +0 -0
  114. {evalscope/preprocess → tests}/__init__.py +0 -0
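Most of the churn in the list above comes from the rewrite of `evalscope/perf`: the flat modules (`plugin_registry.py`, `custom_api.py`, `dashscope_api.py`, `openai_api.py`, `dataset_plugin_base.py`) are replaced by a plugin package with a central registry (`evalscope/perf/plugin/registry.py`) and separate `plugin/api` and `plugin/datasets` subpackages. Below is a rough sketch of the decorator-based registry pattern such a layout typically implies; the names `PluginRegistry`, `api_registry`, and `dataset_registry` are illustrative assumptions, not necessarily the identifiers used in `registry.py`.

```python
# Illustrative sketch only: a name -> class registry of the kind suggested by
# the new evalscope/perf/plugin layout. All identifiers here are assumptions;
# consult evalscope/perf/plugin/registry.py for the real API.
from typing import Callable, Dict, Type


class PluginRegistry:
    """Maps a registered name to a plugin class (API handler or dataset reader)."""

    def __init__(self) -> None:
        self._plugins: Dict[str, Type] = {}

    def register(self, name: str) -> Callable[[Type], Type]:
        """Return a class decorator that records the class under `name`."""
        def decorator(cls: Type) -> Type:
            self._plugins[name] = cls
            return cls
        return decorator

    def get(self, name: str) -> Type:
        """Resolve a plugin class, listing the known names on failure."""
        try:
            return self._plugins[name]
        except KeyError:
            raise ValueError(f'unknown plugin {name!r}; registered: {sorted(self._plugins)}')


api_registry = PluginRegistry()      # e.g. openai / dashscope / custom handlers
dataset_registry = PluginRegistry()  # e.g. openqa / longalpaca / speed_benchmark


@api_registry.register('openai')
class OpenAIApiPlugin:
    """Builds a chat-completions style payload for an OpenAI-compatible endpoint."""

    def build_request(self, prompt: str) -> dict:
        return {'messages': [{'role': 'user', 'content': prompt}]}


# A benchmark driver would resolve plugins by the names given on the CLI:
print(api_registry.get('openai')().build_request('hello'))
```

Under this kind of layout, supporting a new API protocol or dataset format means dropping a module into `plugin/api` or `plugin/datasets` and registering it under a selectable name, rather than editing a hard-coded dispatch table.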
{evalscope-0.6.0rc0.dist-info → evalscope-0.7.0.dist-info}/METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.6.0rc0
+ Version: 0.7.0
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
@@ -15,26 +15,28 @@ Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Requires-Python: >=3.8
  Description-Content-Type: text/markdown
- Requires-Dist: torch
+ License-File: LICENSE
  Requires-Dist: absl-py
  Requires-Dist: accelerate
  Requires-Dist: cachetools
- Requires-Dist: datasets (<=3.0.1,>=3.0.0)
+ Requires-Dist: datasets<=3.0.1,>=3.0.0
  Requires-Dist: editdistance
+ Requires-Dist: jieba
  Requires-Dist: jsonlines
  Requires-Dist: matplotlib
  Requires-Dist: modelscope[framework]
- Requires-Dist: nltk (>=3.9)
+ Requires-Dist: nltk>=3.9
  Requires-Dist: openai
  Requires-Dist: pandas
  Requires-Dist: plotly
- Requires-Dist: pyarrow (<=17.0.0)
+ Requires-Dist: pyarrow<=17.0.0
  Requires-Dist: pympler
  Requires-Dist: pyyaml
  Requires-Dist: regex
  Requires-Dist: requests
  Requires-Dist: requests-toolbelt
- Requires-Dist: rouge-score (>=0.1.0)
+ Requires-Dist: rouge-chinese
+ Requires-Dist: rouge-score>=0.1.0
  Requires-Dist: sacrebleu
  Requires-Dist: scikit-learn
  Requires-Dist: seaborn
@@ -42,83 +44,95 @@ Requires-Dist: sentencepiece
  Requires-Dist: simple-ddl-parser
  Requires-Dist: tabulate
  Requires-Dist: tiktoken
+ Requires-Dist: torch
  Requires-Dist: tqdm
- Requires-Dist: transformers (>=4.33)
+ Requires-Dist: transformers>=4.33
  Requires-Dist: transformers-stream-generator
- Requires-Dist: jieba
- Requires-Dist: rouge-chinese
  Provides-Extra: all
- Requires-Dist: torch ; extra == 'all'
- Requires-Dist: absl-py ; extra == 'all'
- Requires-Dist: accelerate ; extra == 'all'
- Requires-Dist: cachetools ; extra == 'all'
- Requires-Dist: datasets (<=3.0.1,>=3.0.0) ; extra == 'all'
- Requires-Dist: editdistance ; extra == 'all'
- Requires-Dist: jsonlines ; extra == 'all'
- Requires-Dist: matplotlib ; extra == 'all'
- Requires-Dist: modelscope[framework] ; extra == 'all'
- Requires-Dist: nltk (>=3.9) ; extra == 'all'
- Requires-Dist: openai ; extra == 'all'
- Requires-Dist: pandas ; extra == 'all'
- Requires-Dist: plotly ; extra == 'all'
- Requires-Dist: pyarrow (<=17.0.0) ; extra == 'all'
- Requires-Dist: pympler ; extra == 'all'
- Requires-Dist: pyyaml ; extra == 'all'
- Requires-Dist: regex ; extra == 'all'
- Requires-Dist: requests ; extra == 'all'
- Requires-Dist: requests-toolbelt ; extra == 'all'
- Requires-Dist: rouge-score (>=0.1.0) ; extra == 'all'
- Requires-Dist: sacrebleu ; extra == 'all'
- Requires-Dist: scikit-learn ; extra == 'all'
- Requires-Dist: seaborn ; extra == 'all'
- Requires-Dist: sentencepiece ; extra == 'all'
- Requires-Dist: simple-ddl-parser ; extra == 'all'
- Requires-Dist: tabulate ; extra == 'all'
- Requires-Dist: tiktoken ; extra == 'all'
- Requires-Dist: tqdm ; extra == 'all'
- Requires-Dist: transformers (>=4.33) ; extra == 'all'
- Requires-Dist: transformers-stream-generator ; extra == 'all'
- Requires-Dist: jieba ; extra == 'all'
- Requires-Dist: rouge-chinese ; extra == 'all'
- Requires-Dist: ms-opencompass (>=0.1.3) ; extra == 'all'
- Requires-Dist: ms-vlmeval (>=0.0.5) ; extra == 'all'
- Requires-Dist: mteb (==1.19.4) ; extra == 'all'
- Requires-Dist: ragas (==0.2.3) ; extra == 'all'
- Requires-Dist: webdataset (>0.2.0) ; extra == 'all'
+ Requires-Dist: absl-py; extra == "all"
+ Requires-Dist: accelerate; extra == "all"
+ Requires-Dist: cachetools; extra == "all"
+ Requires-Dist: datasets<=3.0.1,>=3.0.0; extra == "all"
+ Requires-Dist: editdistance; extra == "all"
+ Requires-Dist: jieba; extra == "all"
+ Requires-Dist: jsonlines; extra == "all"
+ Requires-Dist: matplotlib; extra == "all"
+ Requires-Dist: modelscope[framework]; extra == "all"
+ Requires-Dist: nltk>=3.9; extra == "all"
+ Requires-Dist: openai; extra == "all"
+ Requires-Dist: pandas; extra == "all"
+ Requires-Dist: plotly; extra == "all"
+ Requires-Dist: pyarrow<=17.0.0; extra == "all"
+ Requires-Dist: pympler; extra == "all"
+ Requires-Dist: pyyaml; extra == "all"
+ Requires-Dist: regex; extra == "all"
+ Requires-Dist: requests; extra == "all"
+ Requires-Dist: requests-toolbelt; extra == "all"
+ Requires-Dist: rouge-chinese; extra == "all"
+ Requires-Dist: rouge-score>=0.1.0; extra == "all"
+ Requires-Dist: sacrebleu; extra == "all"
+ Requires-Dist: scikit-learn; extra == "all"
+ Requires-Dist: seaborn; extra == "all"
+ Requires-Dist: sentencepiece; extra == "all"
+ Requires-Dist: simple-ddl-parser; extra == "all"
+ Requires-Dist: tabulate; extra == "all"
+ Requires-Dist: tiktoken; extra == "all"
+ Requires-Dist: torch; extra == "all"
+ Requires-Dist: tqdm; extra == "all"
+ Requires-Dist: transformers>=4.33; extra == "all"
+ Requires-Dist: transformers-stream-generator; extra == "all"
+ Requires-Dist: ms-opencompass>=0.1.3; extra == "all"
+ Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
+ Requires-Dist: mteb==1.19.4; extra == "all"
+ Requires-Dist: ragas==0.2.5; extra == "all"
+ Requires-Dist: webdataset>0.2.0; extra == "all"
+ Requires-Dist: aiohttp; extra == "all"
+ Requires-Dist: fastapi; extra == "all"
+ Requires-Dist: numpy; extra == "all"
+ Requires-Dist: sse-starlette; extra == "all"
+ Requires-Dist: transformers; extra == "all"
+ Requires-Dist: unicorn; extra == "all"
  Provides-Extra: inner
- Requires-Dist: absl-py ; extra == 'inner'
- Requires-Dist: accelerate ; extra == 'inner'
- Requires-Dist: alibaba-itag-sdk ; extra == 'inner'
- Requires-Dist: dashscope ; extra == 'inner'
- Requires-Dist: editdistance ; extra == 'inner'
- Requires-Dist: jsonlines ; extra == 'inner'
- Requires-Dist: nltk ; extra == 'inner'
- Requires-Dist: openai ; extra == 'inner'
- Requires-Dist: pandas (==1.5.3) ; extra == 'inner'
- Requires-Dist: plotly ; extra == 'inner'
- Requires-Dist: pyarrow ; extra == 'inner'
- Requires-Dist: pyodps ; extra == 'inner'
- Requires-Dist: pyyaml ; extra == 'inner'
- Requires-Dist: regex ; extra == 'inner'
- Requires-Dist: requests (==2.28.1) ; extra == 'inner'
- Requires-Dist: requests-toolbelt (==0.10.1) ; extra == 'inner'
- Requires-Dist: rouge-score ; extra == 'inner'
- Requires-Dist: sacrebleu ; extra == 'inner'
- Requires-Dist: scikit-learn ; extra == 'inner'
- Requires-Dist: seaborn ; extra == 'inner'
- Requires-Dist: simple-ddl-parser ; extra == 'inner'
- Requires-Dist: streamlit ; extra == 'inner'
- Requires-Dist: tqdm ; extra == 'inner'
- Requires-Dist: transformers (<4.43,>=4.33) ; extra == 'inner'
- Requires-Dist: transformers-stream-generator ; extra == 'inner'
+ Requires-Dist: absl-py; extra == "inner"
+ Requires-Dist: accelerate; extra == "inner"
+ Requires-Dist: alibaba-itag-sdk; extra == "inner"
+ Requires-Dist: dashscope; extra == "inner"
+ Requires-Dist: editdistance; extra == "inner"
+ Requires-Dist: jsonlines; extra == "inner"
+ Requires-Dist: nltk; extra == "inner"
+ Requires-Dist: openai; extra == "inner"
+ Requires-Dist: pandas==1.5.3; extra == "inner"
+ Requires-Dist: plotly; extra == "inner"
+ Requires-Dist: pyarrow; extra == "inner"
+ Requires-Dist: pyodps; extra == "inner"
+ Requires-Dist: pyyaml; extra == "inner"
+ Requires-Dist: regex; extra == "inner"
+ Requires-Dist: requests==2.28.1; extra == "inner"
+ Requires-Dist: requests-toolbelt==0.10.1; extra == "inner"
+ Requires-Dist: rouge-score; extra == "inner"
+ Requires-Dist: sacrebleu; extra == "inner"
+ Requires-Dist: scikit-learn; extra == "inner"
+ Requires-Dist: seaborn; extra == "inner"
+ Requires-Dist: simple-ddl-parser; extra == "inner"
+ Requires-Dist: streamlit; extra == "inner"
+ Requires-Dist: tqdm; extra == "inner"
+ Requires-Dist: transformers<4.43,>=4.33; extra == "inner"
+ Requires-Dist: transformers-stream-generator; extra == "inner"
  Provides-Extra: opencompass
- Requires-Dist: ms-opencompass (>=0.1.3) ; extra == 'opencompass'
+ Requires-Dist: ms-opencompass>=0.1.3; extra == "opencompass"
+ Provides-Extra: perf
+ Requires-Dist: aiohttp; extra == "perf"
+ Requires-Dist: fastapi; extra == "perf"
+ Requires-Dist: numpy; extra == "perf"
+ Requires-Dist: sse-starlette; extra == "perf"
+ Requires-Dist: transformers; extra == "perf"
+ Requires-Dist: unicorn; extra == "perf"
  Provides-Extra: rag
- Requires-Dist: mteb (==1.19.4) ; extra == 'rag'
- Requires-Dist: ragas (==0.2.3) ; extra == 'rag'
- Requires-Dist: webdataset (>0.2.0) ; extra == 'rag'
+ Requires-Dist: mteb==1.19.4; extra == "rag"
+ Requires-Dist: ragas==0.2.5; extra == "rag"
+ Requires-Dist: webdataset>0.2.0; extra == "rag"
  Provides-Extra: vlmeval
- Requires-Dist: ms-vlmeval (>=0.0.5) ; extra == 'vlmeval'
+ Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"



@@ -129,16 +143,18 @@ Requires-Dist: ms-vlmeval (>=0.0.5) ; extra == 'vlmeval'
  </p>

  <p align="center">
- <a href="https://badge.fury.io/py/evalscope"><img src="https://badge.fury.io/py/evalscope.svg" alt="PyPI version" height="18"></a>
- <a href="https://pypi.org/project/evalscope"><img alt="PyPI - Downloads" src="https://static.pepy.tech/badge/evalscope">
- </a>
- <a href='https://evalscope.readthedocs.io/en/latest/?badge=latest'>
- <img src='https://readthedocs.org/projects/evalscope-en/badge/?version=latest' alt='Documentation Status' />
- </a>
- <br>
- <a href="https://evalscope.readthedocs.io/en/latest/">📖 Documents</a>
+ <a href="https://badge.fury.io/py/evalscope"><img src="https://badge.fury.io/py/evalscope.svg" alt="PyPI version" height="18"></a>
+ <a href="https://pypi.org/project/evalscope"><img alt="PyPI - Downloads" src="https://static.pepy.tech/badge/evalscope">
+ </a>
+ <a href="https://github.com/modelscope/evalscope/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>
+ <a href='https://evalscope.readthedocs.io/en/latest/?badge=latest'>
+ <img src='https://readthedocs.org/projects/evalscope-en/badge/?version=latest' alt='Documentation Status' />
+ </a>
+ <br>
+ <a href="https://evalscope.readthedocs.io/en/latest/">📖 Documents</a>
  <p>

+ > ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!

  ## 📋 Table of Contents
  - [Introduction](#introduction)
@@ -164,7 +180,7 @@ EvalScope is the official model evaluation and performance benchmarking framework
  The architecture includes the following modules:
  1. **Model Adapter**: The model adapter is used to convert the outputs of specific models into the format required by the framework, supporting both API call models and locally run models.
  2. **Data Adapter**: The data adapter is responsible for converting and processing input data to meet various evaluation needs and formats.
- 3. **Evaluation Backend**:
+ 3. **Evaluation Backend**:
  - **Native**: EvalScope’s own **default evaluation framework**, supporting various evaluation modes, including single model evaluation, arena mode, baseline model comparison mode, etc.
  - **OpenCompass**: Supports [OpenCompass](https://github.com/open-compass/opencompass) as the evaluation backend, providing advanced encapsulation and task simplification, allowing you to submit tasks for evaluation more easily.
  - **VLMEvalKit**: Supports [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) as the evaluation backend, enabling easy initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
@@ -176,6 +192,7 @@ The architecture includes the following modules:


  ## 🎉 News
+ - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
  - 🔥 **[2024.10.31]** The best practice for evaluating Multimodal-RAG has been updated, please check the [📖 Blog](https://evalscope.readthedocs.io/zh-cn/latest/blog/RAG/multimodal_RAG.html#multimodal-rag) for more details.
  - 🔥 **[2024.10.23]** Supports multimodal RAG evaluation, including the assessment of image-text retrieval using [CLIP_Benchmark](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/clip_benchmark.html), and extends [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html) to support end-to-end multimodal metrics evaluation.
  - 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
@@ -210,7 +227,9 @@ We recommend using conda to manage your environment and installing dependencies
  # Additional options
  pip install evalscope[opencompass] # Install OpenCompass backend
  pip install evalscope[vlmeval] # Install VLMEvalKit backend
- pip install evalscope[all] # Install all backends (Native, OpenCompass, VLMEvalKit)
+ pip install evalscope[rag] # Install RAGEval backend
+ pip install evalscope[perf] # Install Perf dependencies
+ pip install evalscope[all] # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
  ```

  > [!WARNING]
@@ -236,7 +255,9 @@ We recommend using conda to manage your environment and installing dependencies
  # Additional options
  pip install -e '.[opencompass]' # Install OpenCompass backend
  pip install -e '.[vlmeval]' # Install VLMEvalKit backend
- pip install -e '.[all]' # Install all backends (Native, OpenCompass, VLMEvalKit)
+ pip install -e '.[rag]' # Install RAGEval backend
+ pip install -e '.[perf]' # Install Perf dependencies
+ pip install -e '.[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
  ```


@@ -245,31 +266,47 @@ We recommend using conda to manage your environment and installing dependencies
  ### 1. Simple Evaluation
  To evaluate a model using default settings on specified datasets, follow the process below:

- #### Install using pip
- You can execute this command from any directory:
+ #### Installation using pip
+
+ You can execute this in any directory:
  ```bash
  python -m evalscope.run \
- --model qwen/Qwen2-0.5B-Instruct \
+ --model Qwen/Qwen2.5-0.5B-Instruct \
  --template-type qwen \
- --datasets arc
+ --datasets gsm8k ceval \
+ --limit 10
  ```

- #### Install from source
- Execute this command in the `evalscope` directory:
+ #### Installation from source
+
+ You need to execute this in the `evalscope` directory:
  ```bash
  python evalscope/run.py \
- --model qwen/Qwen2-0.5B-Instruct \
+ --model Qwen/Qwen2.5-0.5B-Instruct \
  --template-type qwen \
- --datasets arc
+ --datasets gsm8k ceval \
+ --limit 10
  ```

- If prompted with `Do you wish to run the custom code? [y/N]`, please type `y`.
+ > If prompted with `Do you wish to run the custom code? [y/N]`, please type `y`.
+
+ **Results (tested with only 10 samples)**
+ ```text
+ Report table:
+ +-----------------------+--------------------+-----------------+
+ | Model                 | ceval              | gsm8k           |
+ +=======================+====================+=================+
+ | Qwen2.5-0.5B-Instruct | (ceval/acc) 0.5577 | (gsm8k/acc) 0.5 |
+ +-----------------------+--------------------+-----------------+
+ ```


  #### Basic Parameter Descriptions
  - `--model`: Specifies the `model_id` of the model on [ModelScope](https://modelscope.cn/), allowing automatic download. For example, see the [Qwen2-0.5B-Instruct model link](https://modelscope.cn/models/qwen/Qwen2-0.5B-Instruct/summary); you can also use a local path, such as `/path/to/model`.
  - `--template-type`: Specifies the template type corresponding to the model. Refer to the `Default Template` field in the [template table](https://swift.readthedocs.io/en/latest/Instruction/Supported-models-datasets.html#llm) for filling in this field.
  - `--datasets`: The dataset name, allowing multiple datasets to be specified, separated by spaces; these datasets will be automatically downloaded. Refer to the [supported datasets list](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html) for available options.
+ - `--limit`: Maximum number of evaluation samples per dataset; if not specified, all will be evaluated, which is useful for quick validation.
+

  ### 2. Parameterized Evaluation
  If you wish to conduct a more customized evaluation, such as modifying model parameters or dataset parameters, you can use the following commands:
@@ -309,7 +346,7 @@ In addition to the three [basic parameters](#basic-parameter-descriptions), the
  - `--dataset-args`: Evaluation dataset configuration parameters, provided in JSON format, where the key is the dataset name and the value is the parameter; note that these must correspond one-to-one with the values in `--datasets`.
  - `--few_shot_num`: Number of few-shot examples.
  - `--few_shot_random`: Whether to randomly sample few-shot data; if not specified, defaults to `true`.
- - `--limit`: Maximum number of evaluation samples per dataset; if not specified, all will be evaluated, which is useful for quick validation.
+

  ### 3. Use the run_task Function to Submit an Evaluation Task
  Using the `run_task` function to submit an evaluation task requires the same parameters as the command line. You need to pass a dictionary as the parameter, which includes the following fields:
@@ -354,24 +391,46 @@ EvalScope supports using third-party evaluation frameworks to initiate evaluation
  - **RAGEval**: Initiate RAG evaluation tasks through EvalScope, supporting independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html): [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/index.html)
  - **ThirdParty**: Third-party evaluation tasks, such as [ToolBench](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) and [LongBench-Write](https://evalscope.readthedocs.io/en/latest/third_party/longwriter.html).

+
+ ## Model Serving Performance Evaluation
+ A stress testing tool focused on large language models, which can be customized to support various dataset formats and different API protocol formats.
+
+ Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html)
+
+ **Supports wandb for recording results**
+
+ ![wandb sample](https://modelscope.oss-cn-beijing.aliyuncs.com/resource/wandb_sample.png)
+
+ **Supports Speed Benchmark**
+
+ It supports speed testing and provides speed benchmarks similar to those found in the [official Qwen](https://qwen.readthedocs.io/en/latest/benchmark/speed_benchmark.html) reports:
+
+ ```text
+ Speed Benchmark Results:
+ +---------------+-----------------+----------------+
+ | Prompt Tokens | Speed(tokens/s) | GPU Memory(GB) |
+ +---------------+-----------------+----------------+
+ | 1             | 50.69           | 0.97           |
+ | 6144          | 51.36           | 1.23           |
+ | 14336         | 49.93           | 1.59           |
+ | 30720         | 49.56           | 2.34           |
+ +---------------+-----------------+----------------+
+ ```
+
  ## Custom Dataset Evaluation
  EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset.html)

  ## Offline Evaluation
- You can use local dataset to evaluate the model without internet connection.
+ You can use local dataset to evaluate the model without internet connection.

  Refer to: Offline Evaluation [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/offline_evaluation.html)


  ## Arena Mode
- The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.
+ The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.

  Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)

- ## Model Serving Performance Evaluation
- A stress testing tool that focuses on large language models and can be customized to support various data set formats and different API protocol formats.
-
- Refer to : Model Serving Performance Evaluation [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test.html)


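The METADATA diff above quotes the README's note that `run_task` "requires the same parameters as the command line". Below is a minimal sketch of submitting the quick-start evaluation programmatically, assuming the task-dictionary keys simply mirror the documented CLI flags (`--model`, `--template-type`, `--datasets`, `--limit`); check the evalscope documentation for the authoritative schema.

```python
# Minimal sketch: submit the README's quick-start evaluation via run_task.
# Assumption: the task-dict keys mirror the CLI flags shown in the README
# (--model, --template-type, --datasets, --limit); see the evalscope docs
# for the full, authoritative configuration schema.
from evalscope.run import run_task

task_cfg = {
    'model': 'Qwen/Qwen2.5-0.5B-Instruct',  # ModelScope model_id or a local path
    'template_type': 'qwen',                # chat template, as in --template-type
    'datasets': ['gsm8k', 'ceval'],         # datasets to evaluate, as in --datasets
    'limit': 10,                            # cap samples per dataset for a smoke test
}

run_task(task_cfg=task_cfg)
```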