evalscope 0.8.0__py3-none-any.whl β†’ 0.10.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

Note: this release of evalscope has been flagged as potentially problematic.
Files changed (147)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +11 -3
  3. evalscope/backend/base.py +1 -1
  4. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  5. evalscope/backend/rag_eval/utils/clip.py +2 -2
  6. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  7. evalscope/backend/rag_eval/utils/llm.py +1 -1
  8. evalscope/benchmarks/__init__.py +20 -1
  9. evalscope/benchmarks/arc/__init__.py +0 -5
  10. evalscope/benchmarks/arc/arc_adapter.py +24 -102
  11. evalscope/benchmarks/bbh/__init__.py +0 -4
  12. evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
  13. evalscope/benchmarks/benchmark.py +70 -59
  14. evalscope/benchmarks/ceval/__init__.py +0 -5
  15. evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
  16. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  17. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
  18. evalscope/benchmarks/competition_math/__init__.py +0 -5
  19. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  20. evalscope/benchmarks/data_adapter.py +115 -87
  21. evalscope/benchmarks/general_qa/__init__.py +0 -5
  22. evalscope/benchmarks/general_qa/general_qa_adapter.py +24 -80
  23. evalscope/benchmarks/gpqa/__init__.py +0 -0
  24. evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
  25. evalscope/benchmarks/gpqa/gpqa_adapter.py +103 -0
  26. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  27. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +22 -101
  28. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  29. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +33 -99
  30. evalscope/benchmarks/humaneval/__init__.py +0 -4
  31. evalscope/benchmarks/humaneval/humaneval_adapter.py +93 -9
  32. evalscope/benchmarks/ifeval/__init__.py +0 -0
  33. evalscope/benchmarks/ifeval/ifeval_adapter.py +56 -0
  34. evalscope/benchmarks/ifeval/instructions.py +1477 -0
  35. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  36. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  37. evalscope/benchmarks/ifeval/utils.py +134 -0
  38. evalscope/benchmarks/iquiz/__init__.py +0 -0
  39. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  40. evalscope/benchmarks/mmlu/__init__.py +0 -5
  41. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
  42. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  43. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  44. evalscope/benchmarks/race/__init__.py +0 -5
  45. evalscope/benchmarks/race/race_adapter.py +27 -123
  46. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  47. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
  48. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  49. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
  50. evalscope/cli/cli.py +2 -0
  51. evalscope/cli/start_app.py +30 -0
  52. evalscope/collections/__init__.py +3 -0
  53. evalscope/collections/evaluator.py +198 -0
  54. evalscope/collections/sampler.py +138 -0
  55. evalscope/collections/schema.py +126 -0
  56. evalscope/config.py +45 -7
  57. evalscope/constants.py +7 -38
  58. evalscope/evaluator/__init__.py +0 -1
  59. evalscope/evaluator/evaluator.py +89 -121
  60. evalscope/evaluator/rating_eval.py +1 -1
  61. evalscope/evaluator/reviewer/auto_reviewer.py +14 -5
  62. evalscope/metrics/__init__.py +3 -0
  63. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  64. evalscope/metrics/math_accuracy.py +193 -50
  65. evalscope/metrics/metrics.py +18 -6
  66. evalscope/metrics/named_metrics.py +17 -0
  67. evalscope/metrics/rouge_metric.py +13 -8
  68. evalscope/models/__init__.py +14 -1
  69. evalscope/models/base_adapter.py +52 -0
  70. evalscope/models/chat_adapter.py +140 -0
  71. evalscope/models/choice_adapter.py +211 -0
  72. evalscope/{tools/rewrite_eval_results.py β†’ models/custom/dummy_model.py} +1 -1
  73. evalscope/models/custom_adapter.py +67 -0
  74. evalscope/models/local_model.py +74 -0
  75. evalscope/models/model.py +141 -0
  76. evalscope/models/server_adapter.py +111 -0
  77. evalscope/perf/__init__.py +1 -0
  78. evalscope/perf/arguments.py +3 -1
  79. evalscope/perf/benchmark.py +3 -3
  80. evalscope/perf/main.py +5 -7
  81. evalscope/perf/plugin/api/custom_api.py +1 -1
  82. evalscope/perf/plugin/api/openai_api.py +54 -50
  83. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  84. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  85. evalscope/perf/plugin/registry.py +3 -3
  86. evalscope/perf/utils/benchmark_util.py +4 -4
  87. evalscope/perf/utils/db_util.py +66 -22
  88. evalscope/perf/utils/local_server.py +4 -1
  89. evalscope/report/__init__.py +5 -0
  90. evalscope/report/app.py +693 -0
  91. evalscope/report/combinator.py +73 -0
  92. evalscope/report/generator.py +80 -0
  93. evalscope/report/utils.py +133 -0
  94. evalscope/run.py +64 -125
  95. evalscope/run_arena.py +3 -2
  96. evalscope/summarizer.py +15 -27
  97. evalscope/third_party/longbench_write/eval.py +2 -1
  98. evalscope/third_party/longbench_write/longbench_write.py +2 -1
  99. evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
  100. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  101. evalscope/utils/__init__.py +1 -0
  102. evalscope/utils/chat_service.py +6 -5
  103. evalscope/utils/io_utils.py +170 -0
  104. evalscope/utils/logger.py +13 -0
  105. evalscope/utils/model_utils.py +15 -2
  106. evalscope/utils/utils.py +3 -200
  107. evalscope/version.py +2 -2
  108. {evalscope-0.8.0.dist-info β†’ evalscope-0.10.1.dist-info}/METADATA +129 -23
  109. {evalscope-0.8.0.dist-info β†’ evalscope-0.10.1.dist-info}/RECORD +119 -115
  110. tests/cli/test_collection.py +57 -0
  111. tests/cli/test_run.py +57 -7
  112. tests/perf/test_perf.py +3 -2
  113. tests/rag/test_mteb.py +3 -2
  114. tests/vlm/test_vlmeval.py +3 -2
  115. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
  116. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
  117. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
  118. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
  119. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
  120. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
  121. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
  122. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
  123. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
  124. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  125. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  126. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  127. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  128. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
  129. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
  130. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
  131. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
  132. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  133. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
  134. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
  135. evalscope/evaluator/humaneval_evaluator.py +0 -158
  136. evalscope/models/api/__init__.py +0 -3
  137. evalscope/models/dummy_chat_model.py +0 -49
  138. evalscope/models/model_adapter.py +0 -525
  139. evalscope/models/openai_model.py +0 -103
  140. evalscope/tools/__init__.py +0 -1
  141. evalscope/tools/combine_reports.py +0 -135
  142. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  143. /evalscope/{models/api β†’ third_party/longbench_write/tools}/openai_api.py +0 -0
  144. {evalscope-0.8.0.dist-info β†’ evalscope-0.10.1.dist-info}/LICENSE +0 -0
  145. {evalscope-0.8.0.dist-info β†’ evalscope-0.10.1.dist-info}/WHEEL +0 -0
  146. {evalscope-0.8.0.dist-info β†’ evalscope-0.10.1.dist-info}/entry_points.txt +0 -0
  147. {evalscope-0.8.0.dist-info β†’ evalscope-0.10.1.dist-info}/top_level.txt +0 -0
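The file list above shows the shape of the 0.10.x line: new top-level modules (`evalscope/collections`, `evalscope/report`, a refactored `evalscope/models` package) and new benchmarks (`gpqa`, `ifeval`, `iquiz`, `mmlu_pro`). A minimal sketch for picking up the new wheel and confirming the installed version, assuming a standard pip environment with access to the public registry:

```bash
# Upgrade to the release covered by this diff and confirm the version
# reported in the METADATA shown below.
pip install --upgrade "evalscope==0.10.1"
pip show evalscope | grep -E '^(Name|Version):'
```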
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.8.0
+ Version: 0.10.1
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
@@ -84,7 +84,7 @@ Requires-Dist: transformers-stream-generator; extra == "all"
  Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
  Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
  Requires-Dist: mteb==1.19.4; extra == "all"
- Requires-Dist: ragas==0.2.7; extra == "all"
+ Requires-Dist: ragas==0.2.9; extra == "all"
  Requires-Dist: webdataset>0.2.0; extra == "all"
  Requires-Dist: aiohttp; extra == "all"
  Requires-Dist: fastapi; extra == "all"
@@ -92,6 +92,11 @@ Requires-Dist: numpy; extra == "all"
  Requires-Dist: sse-starlette; extra == "all"
  Requires-Dist: transformers; extra == "all"
  Requires-Dist: unicorn; extra == "all"
+ Requires-Dist: gradio>=5.4.0; extra == "all"
+ Requires-Dist: plotly>=5.23.0; extra == "all"
+ Provides-Extra: app
+ Requires-Dist: gradio>=5.4.0; extra == "app"
+ Requires-Dist: plotly>=5.23.0; extra == "app"
  Provides-Extra: inner
  Requires-Dist: absl-py; extra == "inner"
  Requires-Dist: accelerate; extra == "inner"
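The new `app` extra groups the visualization dependencies (`gradio>=5.4.0`, `plotly>=5.23.0`) behind the `evalscope/report/app.py` module added in this release. A minimal install sketch, matching the command in the README changes further below (the quotes keep the shell from expanding the brackets):

```bash
# Install evalscope together with the optional visualization stack
# declared under Provides-Extra: app (gradio, plotly).
pip install 'evalscope[app]'
```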
@@ -129,7 +134,7 @@ Requires-Dist: transformers; extra == "perf"
  Requires-Dist: unicorn; extra == "perf"
  Provides-Extra: rag
  Requires-Dist: mteb==1.19.4; extra == "rag"
- Requires-Dist: ragas==0.2.7; extra == "rag"
+ Requires-Dist: ragas==0.2.9; extra == "rag"
  Requires-Dist: webdataset>0.2.0; extra == "rag"
  Provides-Extra: vlmeval
  Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
@@ -160,14 +165,16 @@ Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
  > ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!

  ## πŸ“‹ Contents
- - [Introduction](#introduction)
- - [News](#News)
- - [Installation](#installation)
- - [Quick Start](#quick-start)
+ - [Introduction](#-introduction)
+ - [News](#-news)
+ - [Installation](#️-installation)
+ - [Quick Start](#-quick-start)
  - [Evaluation Backend](#evaluation-backend)
- - [Custom Dataset Evaluation](#custom-dataset-evaluation)
- - [Model Serving Performance Evaluation](#Model-Serving-Performance-Evaluation)
- - [Arena Mode](#arena-mode)
+ - [Custom Dataset Evaluation](#️-custom-dataset-evaluation)
+ - [Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
+ - [Arena Mode](#-arena-mode)
+ - [Contribution](#️-contribution)
+ - [Roadmap](#-roadmap)


  ## πŸ“ Introduction
@@ -181,6 +188,8 @@ The framework accommodates multiple evaluation scenarios such as end-to-end RAG
  <br>EvalScope Framework.
  </p>

+ <details><summary>Framework Description</summary>
+
  The architecture includes the following modules:
  1. **Model Adapter**: The model adapter is used to convert the outputs of specific models into the format required by the framework, supporting both API call models and locally run models.
  2. **Data Adapter**: The data adapter is responsible for converting and processing input data to meet various evaluation needs and formats.
@@ -194,13 +203,29 @@ The architecture includes the following modules:
  5. **Evaluation Report**: The final generated evaluation report summarizes the model's performance, which can be used for decision-making and further model optimization.
  6. **Visualization**: Visualization results help users intuitively understand evaluation results, facilitating analysis and comparison of different model performances.

+ </details>
+
+ ## ☎ User Groups
+
+ Please scan the QR code below to join our community groups:
+
+ [Discord Group](https://discord.com/invite/D27yfEFVz5) | WeChat Group | DingTalk Group
+ :-------------------------:|:-------------------------:|:-------------------------:
+ <img src="docs/asset/discord_qr.jpg" width="160" height="160"> | <img src="docs/asset/wechat.png" width="160" height="160"> | <img src="docs/asset/dingding.png" width="160" height="160">
+

  ## πŸŽ‰ News
+ - πŸ”₯ **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [πŸ“– Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
+ - πŸ”₯ **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [πŸ“– Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
+ - πŸ”₯πŸ”₯ **[2024.12.31]** Support for adding benchmark evaluations, refer to the [πŸ“– Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [πŸ“– Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
  - πŸ”₯ **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [πŸ“– User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
  - πŸ”₯ **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [πŸ“– User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
  - πŸ”₯ **[2024.10.31]** The best practice for evaluating Multimodal-RAG has been updated, please check the [πŸ“– Blog](https://evalscope.readthedocs.io/zh-cn/latest/blog/RAG/multimodal_RAG.html#multimodal-rag) for more details.
  - πŸ”₯ **[2024.10.23]** Supports multimodal RAG evaluation, including the assessment of image-text retrieval using [CLIP_Benchmark](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/clip_benchmark.html), and extends [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html) to support end-to-end multimodal metrics evaluation.
  - πŸ”₯ **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
+
+ <details><summary>More</summary>
+
  - πŸ”₯ **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [πŸ“– read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
  - πŸ”₯ **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
  - πŸ”₯ **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
@@ -212,7 +237,7 @@ The architecture includes the following modules:
  - πŸ”₯ **[2024.06.13]** EvalScope seamlessly integrates with the fine-tuning framework SWIFT, providing full-chain support from LLM training to evaluation.
  - πŸ”₯ **[2024.06.13]** Integrated the Agent evaluation dataset ToolBench.

-
+ </details>

  ## πŸ› οΈ Installation
  ### Method 1: Install Using pip
@@ -356,15 +381,85 @@ run_task(task_cfg="config.json")
  - `--limit`: Maximum amount of evaluation data for each dataset. If not specified, it defaults to evaluating all data. Can be used for quick validation

  ### Output Results
+ ```text
+ +-----------------------+--------------+-----------------+---------------+---------------+-----+-------+
+ | Model Name            | Dataset Name | Metric Name     | Category Name | Subset Name   | Num | Score |
+ +=======================+==============+=================+===============+===============+=====+=======+
+ | Qwen2.5-0.5B-Instruct | gsm8k        | AverageAccuracy | default       | main          |   5 |   0.4 |
+ +-----------------------+--------------+-----------------+---------------+---------------+-----+-------+
+ | Qwen2.5-0.5B-Instruct | ai2_arc      | AverageAccuracy | default       | ARC-Easy      |   5 |   0.8 |
+ +-----------------------+--------------+-----------------+---------------+---------------+-----+-------+
+ | Qwen2.5-0.5B-Instruct | ai2_arc      | AverageAccuracy | default       | ARC-Challenge |   5 |   0.4 |
+ +-----------------------+--------------+-----------------+---------------+---------------+-----+-------+
  ```
- +-----------------------+-------------------+-----------------+
- | Model                 | ai2_arc           | gsm8k           |
- +=======================+===================+=================+
- | Qwen2.5-0.5B-Instruct | (ai2_arc/acc) 0.6 | (gsm8k/acc) 0.6 |
- +-----------------------+-------------------+-----------------+
+
+ ## πŸ“ˆ Visualization of Evaluation Results
+
+ 1. Install the dependencies required for visualization, including gradio, plotly, etc.
+ ```bash
+ pip install 'evalscope[app]'
  ```

- ## βš™οΈ Complex Evaluation
+ 2. Start the Visualization Service
+
+ Run the following command to start the visualization service.
+ ```bash
+ evalscope app
+ ```
+ You can access the visualization service in the browser if the following output appears.
+ ```text
+ * Running on local URL: http://127.0.0.1:7861
+
+ To create a public link, set `share=True` in `launch()`.
+ ```
+
+ <table>
+ <tr>
+ <td style="text-align: center;">
+ <img src="docs/en/get_started/images/setting.png" alt="Setting" style="width: 75%;" />
+ <p>Setting Interface</p>
+ </td>
+ <td style="text-align: center;">
+ <img src="docs/en/get_started/images/model_compare.png" alt="Model Compare" style="width: 100%;" />
+ <p>Model Comparison</p>
+ </td>
+ </tr>
+ <tr>
+ <td style="text-align: center;">
+ <img src="docs/en/get_started/images/report_overview.png" alt="Report Overview" style="width: 100%;" />
+ <p>Report Overview</p>
+ </td>
+ <td style="text-align: center;">
+ <img src="docs/en/get_started/images/report_details.png" alt="Report Details" style="width: 80%;" />
+ <p>Report Details</p>
+ </td>
+ </tr>
+ </table>
+
+ For more details, refer to: [πŸ“– Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html)
+
+ ## 🌐 Evaluation of Specified Model API
+
+ Specify the model API service address (api_url) and API Key (api_key) to evaluate the deployed model API service. In this case, the `eval-type` parameter must be specified as `service`.
+
+ For example, to launch a model service using [vLLM](https://github.com/vllm-project/vllm):
+
+ ```shell
+ export VLLM_USE_MODELSCOPE=True && python -m vllm.entrypoints.openai.api_server --model Qwen/Qwen2.5-0.5B-Instruct --served-model-name qwen2.5 --trust_remote_code --port 8801
+ ```
+ Then, you can use the following command to evaluate the model API service:
+ ```shell
+ evalscope eval \
+  --model qwen2.5 \
+  --api-url http://127.0.0.1:8801/v1/chat/completions \
+  --api-key EMPTY \
+  --eval-type service \
+  --datasets gsm8k \
+  --limit 10
+ ```
+
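Before pointing `evalscope eval` at a service, it can help to confirm that the endpoint answers OpenAI-style chat requests. A sketch, assuming the vLLM server from the snippet above is running on port 8801 and serving the model under the name `qwen2.5`:

```bash
# Smoke-test the OpenAI-compatible endpoint that --api-url will target.
curl -s http://127.0.0.1:8801/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "qwen2.5",
        "messages": [{"role": "user", "content": "1+1="}]
      }'
```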
+ ## βš™οΈ Custom Parameter Evaluation
+
  For more customized evaluations, such as customizing model parameters or dataset parameters, you can use the following command. The evaluation startup method is the same as simple evaluation. Below shows how to start the evaluation using the `eval` command:

  ```shell
@@ -402,7 +497,7 @@ EvalScope supports using third-party evaluation frameworks to initiate evaluatio
  - **ThirdParty**: Third-party evaluation tasks, such as [ToolBench](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) and [LongBench-Write](https://evalscope.readthedocs.io/en/latest/third_party/longwriter.html).


- ## Model Serving Performance Evaluation
+ ## πŸ“ˆ Model Serving Performance Evaluation
  A stress testing tool focused on large language models, which can be customized to support various dataset formats and different API protocol formats.

  Reference: Performance Testing [πŸ“– User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html)
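The refactored stress-testing tool is exposed as the `evalscope perf` subcommand. A hedged sketch of a basic run against an OpenAI-compatible endpoint; the flag names follow the linked stress-test guide from memory and are not verified against 0.10.1:

```bash
# Illustrative only: send a small batch of requests to an OpenAI-compatible
# service and report throughput/latency; consult the performance-testing
# user guide for the authoritative flag list.
evalscope perf \
  --url "http://127.0.0.1:8801/v1/chat/completions" \
  --api openai \
  --model qwen2.5 \
  --dataset openqa \
  --number 20 \
  --parallel 2
```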
@@ -427,19 +522,32 @@ Speed Benchmark Results:
  +---------------+-----------------+----------------+
  ```

- ## Custom Dataset Evaluation
+ ## πŸ–ŠοΈ Custom Dataset Evaluation
  EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [πŸ“–User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/index.html)


- ## Arena Mode
+ ## 🏟️ Arena Mode
  The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.

  Refer to: Arena Mode [πŸ“– User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)

+ ## πŸ‘·β€β™‚οΈ Contribution

+ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn), is continuously optimizing its benchmark evaluation features! We invite you to refer to the [Contribution Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html) to easily add your own evaluation benchmarks and share your contributions with the community. Let’s work together to support the growth of EvalScope and make our tools even better! Join us now!

+ <a href="https://github.com/modelscope/evalscope/graphs/contributors" target="_blank">
+ <table>
+ <tr>
+ <th colspan="2">
+ <br><img src="https://contrib.rocks/image?repo=modelscope/evalscope"><br><br>
+ </th>
+ </tr>
+ </table>
+ </a>

- ## TO-DO List
+ ## πŸ”œ Roadmap
+ - [ ] Support for better evaluation report visualization
+ - [x] Support for mixed evaluations across multiple datasets
  - [x] RAG evaluation
  - [x] VLM evaluation
  - [x] Agents evaluation
@@ -450,8 +558,6 @@ Refer to: Arena Mode [πŸ“– User Guide](https://evalscope.readthedocs.io/en/lates
  - [ ] GAIA
  - [ ] GPQA
  - [x] MBPP
- - [ ] Auto-reviewer
- - [ ] Qwen-max


  ## Star History