evalscope 1.0.2__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
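
If you want to inspect these changes beyond the summary below, a diff like this can be reproduced locally from the published wheels. The following is a minimal sketch assuming both wheels are available on the package index and that `pip`, `unzip`, and `diff` are installed; directory and output file names are illustrative.

```bash
# Fetch the two published wheels without resolving dependencies
pip download evalscope==1.0.2 --no-deps -d ./wheels
pip download evalscope==1.1.1 --no-deps -d ./wheels

# Unpack each wheel (a wheel is a zip archive) into its own directory
unzip -q ./wheels/evalscope-1.0.2-py3-none-any.whl -d ./old
unzip -q ./wheels/evalscope-1.1.1-py3-none-any.whl -d ./new

# Produce a recursive unified diff of the package contents
diff -ruN ./old ./new > evalscope_1.0.2_to_1.1.1.diff
```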

Potentially problematic release: this version of evalscope might be problematic.

Files changed (176)
  1. evalscope/api/benchmark/__init__.py +8 -1
  2. evalscope/api/benchmark/adapters/__init__.py +1 -0
  3. evalscope/api/benchmark/adapters/default_data_adapter.py +12 -0
  4. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  5. evalscope/api/benchmark/benchmark.py +14 -0
  6. evalscope/api/dataset/dataset.py +21 -0
  7. evalscope/api/dataset/loader.py +6 -2
  8. evalscope/api/mixin/sandbox_mixin.py +32 -54
  9. evalscope/api/model/generate_config.py +6 -0
  10. evalscope/app/ui/multi_model.py +6 -1
  11. evalscope/app/ui/single_model.py +8 -2
  12. evalscope/app/utils/data_utils.py +3 -2
  13. evalscope/app/utils/visualization.py +2 -2
  14. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  15. evalscope/benchmarks/ai2d/ai2d_adapter.py +3 -2
  16. evalscope/benchmarks/bfcl/bfcl_adapter.py +11 -46
  17. evalscope/benchmarks/blink/__init__.py +0 -0
  18. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  19. evalscope/benchmarks/chartqa/__init__.py +0 -0
  20. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  21. evalscope/benchmarks/chartqa/utils.py +38 -0
  22. evalscope/benchmarks/data_collection/data_collection_adapter.py +2 -1
  23. evalscope/benchmarks/docvqa/__init__.py +0 -0
  24. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  25. evalscope/benchmarks/general_arena/general_arena_adapter.py +1 -1
  26. evalscope/benchmarks/general_arena/utils.py +2 -1
  27. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  28. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  29. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +23 -4
  30. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  31. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +158 -0
  32. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  33. evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -1
  34. evalscope/benchmarks/infovqa/__init__.py +0 -0
  35. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  36. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +3 -1
  37. evalscope/benchmarks/math_verse/__init__.py +0 -0
  38. evalscope/benchmarks/math_verse/math_verse_adapter.py +100 -0
  39. evalscope/benchmarks/math_vision/__init__.py +0 -0
  40. evalscope/benchmarks/math_vision/math_vision_adapter.py +111 -0
  41. evalscope/benchmarks/math_vista/math_vista_adapter.py +6 -26
  42. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +2 -2
  43. evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
  44. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -1
  45. evalscope/benchmarks/ner/__init__.py +0 -0
  46. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  47. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  48. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  49. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  50. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  51. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  52. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  53. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  54. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  55. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  56. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  57. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  58. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  59. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  60. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  61. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  62. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  63. evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
  64. evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
  65. evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
  66. evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
  67. evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  68. evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
  69. evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
  70. evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  71. evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  72. evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  73. evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
  74. evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
  75. evalscope/benchmarks/ocr_bench_v2/utils.py +433 -0
  76. evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
  77. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  78. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  79. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  80. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  81. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  82. evalscope/benchmarks/poly_math/__init__.py +0 -0
  83. evalscope/benchmarks/poly_math/poly_math_adapter.py +127 -0
  84. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  85. evalscope/benchmarks/pope/__init__.py +0 -0
  86. evalscope/benchmarks/pope/pope_adapter.py +111 -0
  87. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  88. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  89. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  90. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  91. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +1 -1
  92. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +1 -1
  93. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  94. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  95. evalscope/benchmarks/zerobench/__init__.py +0 -0
  96. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  97. evalscope/constants.py +4 -0
  98. evalscope/evaluator/evaluator.py +72 -79
  99. evalscope/metrics/math_parser.py +14 -0
  100. evalscope/metrics/metric.py +52 -1
  101. evalscope/metrics/metrics.py +16 -0
  102. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  103. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  104. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  105. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  106. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  107. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  108. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  109. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  110. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  111. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  112. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  113. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  114. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  115. evalscope/models/utils/openai.py +4 -0
  116. evalscope/perf/arguments.py +24 -4
  117. evalscope/perf/benchmark.py +74 -89
  118. evalscope/perf/http_client.py +31 -16
  119. evalscope/perf/main.py +15 -2
  120. evalscope/perf/plugin/api/base.py +9 -7
  121. evalscope/perf/plugin/api/custom_api.py +13 -58
  122. evalscope/perf/plugin/api/default_api.py +179 -79
  123. evalscope/perf/plugin/api/openai_api.py +4 -3
  124. evalscope/perf/plugin/datasets/base.py +21 -0
  125. evalscope/perf/plugin/datasets/custom.py +2 -3
  126. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  127. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  128. evalscope/perf/plugin/datasets/openqa.py +2 -4
  129. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  130. evalscope/perf/utils/benchmark_util.py +36 -22
  131. evalscope/perf/utils/db_util.py +14 -19
  132. evalscope/perf/utils/local_server.py +0 -44
  133. evalscope/perf/utils/log_utils.py +21 -6
  134. evalscope/report/__init__.py +11 -2
  135. evalscope/report/combinator.py +52 -2
  136. evalscope/run.py +4 -0
  137. evalscope/utils/function_utils.py +195 -12
  138. evalscope/utils/io_utils.py +74 -0
  139. evalscope/utils/json_schema.py +8 -6
  140. evalscope/utils/logger.py +49 -17
  141. evalscope/utils/multi_choices.py +16 -1
  142. evalscope/utils/ner.py +377 -0
  143. evalscope/version.py +2 -2
  144. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/METADATA +239 -393
  145. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/RECORD +140 -98
  146. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/WHEEL +1 -1
  147. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/top_level.txt +0 -1
  148. tests/__init__.py +0 -1
  149. tests/benchmark/__init__.py +0 -1
  150. tests/benchmark/test_eval.py +0 -429
  151. tests/benchmark/test_image_edit.py +0 -65
  152. tests/benchmark/test_sandbox.py +0 -81
  153. tests/benchmark/test_t2i.py +0 -142
  154. tests/benchmark/test_vlm.py +0 -137
  155. tests/cli/__init__.py +0 -1
  156. tests/cli/test_all.py +0 -269
  157. tests/cli/test_collection.py +0 -99
  158. tests/cli/test_custom.py +0 -268
  159. tests/cli/test_reasoning.py +0 -81
  160. tests/common.py +0 -73
  161. tests/perf/__init__.py +0 -1
  162. tests/perf/test_perf.py +0 -206
  163. tests/rag/test_clip_benchmark.py +0 -87
  164. tests/rag/test_mteb.py +0 -213
  165. tests/rag/test_ragas.py +0 -128
  166. tests/swift/__init__.py +0 -1
  167. tests/swift/test_run_swift_eval.py +0 -146
  168. tests/swift/test_run_swift_vlm_eval.py +0 -128
  169. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  170. tests/test_run_all.py +0 -12
  171. tests/utils.py +0 -13
  172. tests/vlm/__init__.py +0 -1
  173. tests/vlm/test_vlmeval.py +0 -102
  174. {tests/rag → evalscope/benchmarks/aa_lcr}/__init__.py +0 -0
  175. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/entry_points.txt +0 -0
  176. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info/licenses}/LICENSE +0 -0
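
Many of the files listed above add new benchmark adapters (for example `chartqa`, `docvqa`, `ocr_bench`, and the `ner` suite). As an illustration only, and assuming each adapter registers a dataset name matching its directory name (not confirmed by this diff), such a benchmark would be run through the same `evalscope eval` CLI shown in the README diff below:

```bash
# Hypothetical invocation of one of the newly added benchmarks;
# the dataset name "chartqa" is assumed from the adapter's directory name.
evalscope eval \
  --model Qwen/Qwen2.5-0.5B-Instruct \
  --datasets chartqa \
  --limit 5
```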
@@ -1,30 +1,30 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: evalscope
3
- Version: 1.0.2
3
+ Version: 1.1.1
4
4
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
5
- Home-page: https://github.com/modelscope/evalscope
6
5
  Author: ModelScope team
7
6
  Author-email: contact@modelscope.cn
8
7
  License: Apache License 2.0
8
+ Project-URL: Homepage, https://github.com/modelscope/evalscope
9
9
  Keywords: python,llm,evaluation
10
10
  Classifier: Development Status :: 4 - Beta
11
11
  Classifier: Operating System :: OS Independent
12
12
  Classifier: Programming Language :: Python :: 3
13
- Classifier: Programming Language :: Python :: 3.9
14
13
  Classifier: Programming Language :: Python :: 3.10
15
14
  Classifier: Programming Language :: Python :: 3.11
16
15
  Classifier: Programming Language :: Python :: 3.12
17
- Requires-Python: >=3.9
16
+ Classifier: License :: OSI Approved :: Apache Software License
17
+ Requires-Python: >=3.10
18
18
  Description-Content-Type: text/markdown
19
19
  License-File: LICENSE
20
20
  Requires-Dist: colorlog
21
21
  Requires-Dist: datasets==3.6.0
22
- Requires-Dist: docstring-parser
22
+ Requires-Dist: docstring_parser
23
23
  Requires-Dist: dotenv
24
24
  Requires-Dist: jieba
25
25
  Requires-Dist: jsonlines
26
26
  Requires-Dist: langdetect
27
- Requires-Dist: latex2sympy2-extended[antlr4_9_3]
27
+ Requires-Dist: latex2sympy2_extended[antlr4_9_3]
28
28
  Requires-Dist: matplotlib
29
29
  Requires-Dist: modelscope[framework]>=1.27
30
30
  Requires-Dist: nltk>=3.9
@@ -46,45 +46,54 @@ Requires-Dist: tabulate
46
46
  Requires-Dist: tqdm
47
47
  Requires-Dist: transformers>=4.33
48
48
  Requires-Dist: word2number
49
+ Provides-Extra: opencompass
50
+ Requires-Dist: ms-opencompass>=0.1.6; extra == "opencompass"
51
+ Provides-Extra: vlmeval
52
+ Requires-Dist: ms-vlmeval>=0.0.17; extra == "vlmeval"
53
+ Provides-Extra: rag
54
+ Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "rag"
55
+ Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "rag"
56
+ Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "rag"
57
+ Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "rag"
58
+ Requires-Dist: mteb==1.38.20; extra == "rag"
59
+ Requires-Dist: ragas==0.2.14; extra == "rag"
60
+ Requires-Dist: torch; extra == "rag"
61
+ Requires-Dist: webdataset>0.2.0; extra == "rag"
62
+ Provides-Extra: perf
63
+ Requires-Dist: aiohttp; extra == "perf"
64
+ Requires-Dist: fastapi; extra == "perf"
65
+ Requires-Dist: jinja2; extra == "perf"
66
+ Requires-Dist: numpy; extra == "perf"
67
+ Requires-Dist: rich; extra == "perf"
68
+ Requires-Dist: sse_starlette; extra == "perf"
69
+ Requires-Dist: transformers; extra == "perf"
70
+ Requires-Dist: uvicorn; extra == "perf"
71
+ Provides-Extra: app
72
+ Requires-Dist: gradio==5.4.0; extra == "app"
73
+ Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
49
74
  Provides-Extra: aigc
50
75
  Requires-Dist: diffusers; extra == "aigc"
51
76
  Requires-Dist: iopath; extra == "aigc"
52
77
  Requires-Dist: omegaconf; extra == "aigc"
53
- Requires-Dist: open-clip-torch; extra == "aigc"
78
+ Requires-Dist: open_clip_torch; extra == "aigc"
54
79
  Requires-Dist: opencv-python; extra == "aigc"
55
80
  Requires-Dist: peft>=0.17; extra == "aigc"
56
81
  Requires-Dist: torch; extra == "aigc"
57
82
  Requires-Dist: torchvision; extra == "aigc"
83
+ Provides-Extra: sandbox
84
+ Requires-Dist: ms-enclave[docker]; extra == "sandbox"
85
+ Provides-Extra: dev
86
+ Requires-Dist: pytest; extra == "dev"
87
+ Requires-Dist: pytest-cov; extra == "dev"
88
+ Requires-Dist: python-dotenv; extra == "dev"
89
+ Provides-Extra: docs
90
+ Requires-Dist: docutils>=0.16.0; extra == "docs"
91
+ Requires-Dist: myst_parser; extra == "docs"
92
+ Requires-Dist: recommonmark; extra == "docs"
93
+ Requires-Dist: sphinx>=5.3.0; extra == "docs"
94
+ Requires-Dist: sphinx-design; extra == "docs"
95
+ Requires-Dist: sphinxawesome-theme; extra == "docs"
58
96
  Provides-Extra: all
59
- Requires-Dist: colorlog; extra == "all"
60
- Requires-Dist: datasets==3.6.0; extra == "all"
61
- Requires-Dist: docstring-parser; extra == "all"
62
- Requires-Dist: dotenv; extra == "all"
63
- Requires-Dist: jieba; extra == "all"
64
- Requires-Dist: jsonlines; extra == "all"
65
- Requires-Dist: langdetect; extra == "all"
66
- Requires-Dist: latex2sympy2-extended[antlr4_9_3]; extra == "all"
67
- Requires-Dist: matplotlib; extra == "all"
68
- Requires-Dist: modelscope[framework]>=1.27; extra == "all"
69
- Requires-Dist: nltk>=3.9; extra == "all"
70
- Requires-Dist: openai; extra == "all"
71
- Requires-Dist: overrides; extra == "all"
72
- Requires-Dist: pandas; extra == "all"
73
- Requires-Dist: pillow; extra == "all"
74
- Requires-Dist: pydantic; extra == "all"
75
- Requires-Dist: pyyaml>=5.1; extra == "all"
76
- Requires-Dist: requests; extra == "all"
77
- Requires-Dist: rich; extra == "all"
78
- Requires-Dist: rouge-chinese; extra == "all"
79
- Requires-Dist: rouge-score>=0.1.0; extra == "all"
80
- Requires-Dist: sacrebleu; extra == "all"
81
- Requires-Dist: scikit-learn; extra == "all"
82
- Requires-Dist: seaborn; extra == "all"
83
- Requires-Dist: sympy; extra == "all"
84
- Requires-Dist: tabulate; extra == "all"
85
- Requires-Dist: tqdm; extra == "all"
86
- Requires-Dist: transformers>=4.33; extra == "all"
87
- Requires-Dist: word2number; extra == "all"
88
97
  Requires-Dist: ms-opencompass>=0.1.6; extra == "all"
89
98
  Requires-Dist: ms-vlmeval>=0.0.17; extra == "all"
90
99
  Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "all"
@@ -99,7 +108,8 @@ Requires-Dist: aiohttp; extra == "all"
99
108
  Requires-Dist: fastapi; extra == "all"
100
109
  Requires-Dist: jinja2; extra == "all"
101
110
  Requires-Dist: numpy; extra == "all"
102
- Requires-Dist: sse-starlette; extra == "all"
111
+ Requires-Dist: rich; extra == "all"
112
+ Requires-Dist: sse_starlette; extra == "all"
103
113
  Requires-Dist: transformers; extra == "all"
104
114
  Requires-Dist: uvicorn; extra == "all"
105
115
  Requires-Dist: gradio==5.4.0; extra == "all"
@@ -107,46 +117,12 @@ Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
107
117
  Requires-Dist: diffusers; extra == "all"
108
118
  Requires-Dist: iopath; extra == "all"
109
119
  Requires-Dist: omegaconf; extra == "all"
110
- Requires-Dist: open-clip-torch; extra == "all"
120
+ Requires-Dist: open_clip_torch; extra == "all"
111
121
  Requires-Dist: opencv-python; extra == "all"
112
122
  Requires-Dist: peft>=0.17; extra == "all"
123
+ Requires-Dist: torch; extra == "all"
113
124
  Requires-Dist: torchvision; extra == "all"
114
- Provides-Extra: app
115
- Requires-Dist: gradio==5.4.0; extra == "app"
116
- Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
117
- Provides-Extra: dev
118
- Requires-Dist: pytest; extra == "dev"
119
- Requires-Dist: pytest-cov; extra == "dev"
120
- Requires-Dist: python-dotenv; extra == "dev"
121
- Provides-Extra: docs
122
- Requires-Dist: docutils>=0.16.0; extra == "docs"
123
- Requires-Dist: myst-parser; extra == "docs"
124
- Requires-Dist: recommonmark; extra == "docs"
125
- Requires-Dist: sphinx>=5.3.0; extra == "docs"
126
- Requires-Dist: sphinx-design; extra == "docs"
127
- Requires-Dist: sphinxawesome-theme; extra == "docs"
128
- Provides-Extra: opencompass
129
- Requires-Dist: ms-opencompass>=0.1.6; extra == "opencompass"
130
- Provides-Extra: perf
131
- Requires-Dist: aiohttp; extra == "perf"
132
- Requires-Dist: fastapi; extra == "perf"
133
- Requires-Dist: jinja2; extra == "perf"
134
- Requires-Dist: numpy; extra == "perf"
135
- Requires-Dist: rich; extra == "perf"
136
- Requires-Dist: sse-starlette; extra == "perf"
137
- Requires-Dist: transformers; extra == "perf"
138
- Requires-Dist: uvicorn; extra == "perf"
139
- Provides-Extra: rag
140
- Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "rag"
141
- Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "rag"
142
- Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "rag"
143
- Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "rag"
144
- Requires-Dist: mteb==1.38.20; extra == "rag"
145
- Requires-Dist: ragas==0.2.14; extra == "rag"
146
- Requires-Dist: torch; extra == "rag"
147
- Requires-Dist: webdataset>0.2.0; extra == "rag"
148
- Provides-Extra: vlmeval
149
- Requires-Dist: ms-vlmeval>=0.0.17; extra == "vlmeval"
125
+ Dynamic: license-file
150
126
 
151
127
  <p align="center">
152
128
  <br>
@@ -154,13 +130,12 @@ Requires-Dist: ms-vlmeval>=0.0.17; extra == "vlmeval"
154
130
  <br>
155
131
  <p>
156
132
 
157
-
158
133
  <p align="center">
159
134
  <a href="README_zh.md">δΈ­ζ–‡</a> &nbsp | &nbsp English &nbsp
160
135
  </p>
161
136
 
162
137
  <p align="center">
163
- <img src="https://img.shields.io/badge/python-%E2%89%A53.9-5be.svg">
138
+ <img src="https://img.shields.io/badge/python-%E2%89%A53.10-5be.svg">
164
139
  <a href="https://badge.fury.io/py/evalscope"><img src="https://badge.fury.io/py/evalscope.svg" alt="PyPI version" height="18"></a>
165
140
  <a href="https://pypi.org/project/evalscope"><img alt="PyPI - Downloads" src="https://static.pepy.tech/badge/evalscope"></a>
166
141
  <a href="https://github.com/modelscope/evalscope/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>
@@ -168,121 +143,77 @@ Requires-Dist: ms-vlmeval>=0.0.17; extra == "vlmeval"
168
143
  <p>
169
144
 
170
145
  <p align="center">
171
- <a href="https://evalscope.readthedocs.io/zh-cn/latest/"> πŸ“– δΈ­ζ–‡ζ–‡ζ‘£</a> &nbsp | &nbsp <a href="https://evalscope.readthedocs.io/en/latest/"> πŸ“– English Documents</a>
146
+ <a href="https://evalscope.readthedocs.io/zh-cn/latest/"> πŸ“– Chinese Documentation</a> &nbsp | &nbsp <a href="https://evalscope.readthedocs.io/en/latest/"> πŸ“– English Documentation</a>
172
147
  <p>
173
148
 
174
- > ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!
175
-
176
- ## πŸ“‹ Contents
177
- - [πŸ“‹ Contents](#-contents)
178
- - [πŸ“ Introduction](#-introduction)
179
- - [☎ User Groups](#-user-groups)
180
- - [πŸŽ‰ News](#-news)
181
- - [πŸ› οΈ Environment Setup](#️-environment-setup)
182
- - [Method 1. Install via pip](#method-1-install-via-pip)
183
- - [Method 2. Install from source](#method-2-install-from-source)
184
- - [πŸš€ Quick Start](#-quick-start)
185
- - [Method 1. Using Command Line](#method-1-using-command-line)
186
- - [Method 2. Using Python Code](#method-2-using-python-code)
187
- - [Basic Parameter](#basic-parameter)
188
- - [Output Results](#output-results)
189
- - [πŸ“ˆ Visualization of Evaluation Results](#-visualization-of-evaluation-results)
190
- - [🌐 Evaluation of Model API](#-evaluation-of-model-api)
191
- - [βš™οΈ Custom Parameter Evaluation](#️-custom-parameter-evaluation)
192
- - [Parameter Description](#parameter-description)
193
- - [πŸ§ͺ Other Evaluation Backends](#-other-evaluation-backends)
194
- - [πŸ“ˆ Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
195
- - [πŸ–ŠοΈ Custom Dataset Evaluation](#️-custom-dataset-evaluation)
196
- - [βš”οΈ Arena Mode](#️-arena-mode)
197
- - [πŸ‘·β€β™‚οΈ Contribution](#️-contribution)
198
- - [πŸ“š Citation](#-citation)
199
- - [πŸ”œ Roadmap](#-roadmap)
200
- - [⭐ Star History](#-star-history)
201
149
 
150
+ > ⭐ If you like this project, please click the "Star" button in the upper right corner to support us. Your support is our motivation to move forward!
202
151
 
203
152
  ## πŸ“ Introduction
204
153
 
205
- EvalScope is a comprehensive model evaluation and performance benchmarking framework meticulously crafted by the [ModelScope Community](https://modelscope.cn/), offering a one-stop solution for your model assessment needs. Regardless of the type of model you are developing, EvalScope is equipped to cater to your requirements:
154
+ EvalScope is a powerful and easily extensible model evaluation framework created by the [ModelScope Community](https://modelscope.cn/), aiming to provide a one-stop evaluation solution for large model developers.
206
155
 
207
- - 🧠 Large Language Models
208
- - 🎨 Multimodal Models
209
- - πŸ” Embedding Models
210
- - πŸ† Reranker Models
211
- - πŸ–ΌοΈ CLIP Models
212
- - 🎭 AIGC Models (Image-to-Text/Video)
213
- - ...and more!
156
+ Whether you want to evaluate the general capabilities of models, conduct multi-model performance comparisons, or need to stress test models, EvalScope can meet your needs.
214
157
 
215
- EvalScope is not merely an evaluation tool; it is a valuable ally in your model optimization journey:
158
+ ## ✨ Key Features
216
159
 
217
- - πŸ… Equipped with multiple industry-recognized benchmarks and evaluation metrics: MMLU, CMMLU, C-Eval, GSM8K, etc.
218
- - πŸ“Š Model inference performance stress testing: Ensuring your model excels in real-world applications.
219
- - πŸš€ Seamless integration with the [ms-swift](https://github.com/modelscope/ms-swift) training framework, enabling one-click evaluations and providing full-chain support from training to assessment for your model development.
160
+ - **πŸ“š Comprehensive Evaluation Benchmarks**: Built-in multiple industry-recognized evaluation benchmarks including MMLU, C-Eval, GSM8K, and more.
161
+ - **🧩 Multi-modal and Multi-domain Support**: Supports evaluation of various model types including Large Language Models (LLM), Vision Language Models (VLM), Embedding, Reranker, AIGC, and more.
162
+ - **πŸš€ Multi-backend Integration**: Seamlessly integrates multiple evaluation backends including OpenCompass, VLMEvalKit, RAGEval to meet different evaluation needs.
163
+ - **⚑ Inference Performance Testing**: Provides powerful model service stress testing tools, supporting multiple performance metrics such as TTFT, TPOT.
164
+ - **πŸ“Š Interactive Reports**: Provides WebUI visualization interface, supporting multi-dimensional model comparison, report overview and detailed inspection.
165
+ - **βš”οΈ Arena Mode**: Supports multi-model battles (Pairwise Battle), intuitively ranking and evaluating models.
166
+ - **πŸ”§ Highly Extensible**: Developers can easily add custom datasets, models and evaluation metrics.
220
167
 
221
- Below is the overall architecture diagram of EvalScope:
168
+ <details><summary>πŸ›οΈ Overall Architecture</summary>
222
169
 
223
170
  <p align="center">
224
- <img src="https://sail-moe.oss-cn-hangzhou.aliyuncs.com/yunlin/images/evalscope/doc/EvalScope%E6%9E%B6%E6%9E%84%E5%9B%BE.png" width="70%">
225
- <br>EvalScope Framework.
171
+ <img src="https://sail-moe.oss-cn-hangzhou.aliyuncs.com/yunlin/images/evalscope/doc/EvalScope%E6%9E%B6%E6%9E%84%E5%9B%BE.png" style="width: 70%;">
172
+ <br>EvalScope Overall Architecture.
226
173
  </p>
227
174
 
228
- <details><summary>Framework Description</summary>
229
-
230
- The architecture includes the following modules:
231
- 1. Input Layer
232
- - **Model Sources**: API models (OpenAI API), local models (ModelScope)
233
- - **Datasets**: Standard evaluation benchmarks (MMLU/GSM8k, etc.), custom data (MCQ/QA)
175
+ 1. **Input Layer**
176
+ - **Model Sources**: API models (OpenAI API), Local models (ModelScope)
177
+ - **Datasets**: Standard evaluation benchmarks (MMLU/GSM8k etc.), Custom data (MCQ/QA)
234
178
 
235
- 2. Core Functions
236
- - **Multi-backend Evaluation**
237
- - Native backends: Unified evaluation for LLM/VLM/Embedding/T2I models
238
- - Integrated frameworks: OpenCompass/MTEB/VLMEvalKit/RAGAS
179
+ 2. **Core Functions**
180
+ - **Multi-backend Evaluation**: Native backend, OpenCompass, MTEB, VLMEvalKit, RAGAS
181
+ - **Performance Monitoring**: Supports multiple model service APIs and data formats, tracking TTFT/TPOT and other metrics
182
+ - **Tool Extensions**: Integrates Tool-Bench, Needle-in-a-Haystack, etc.
239
183
 
240
- - **Performance Monitoring**
241
- - Model plugins: Supports various model service APIs
242
- - Data plugins: Supports multiple data formats
243
- - Metric tracking: TTFT/TPOP/Stability and other metrics
244
-
245
- - **Tool Extensions**
246
- - Integration: Tool-Bench/Needle-in-a-Haystack/BFCL-v3
247
-
248
- 3. Output Layer
249
- - **Structured Reports**: Supports JSON/Tables/Logs
250
- - **Visualization Platforms**: Supports Gradio/Wandb/SwanLab
184
+ 3. **Output Layer**
185
+ - **Structured Reports**: Supports JSON, Table, Logs
186
+ - **Visualization Platform**: Supports Gradio, Wandb, SwanLab
251
187
 
252
188
  </details>
253
189
 
254
- ## ☎ User Groups
255
-
256
- Please scan the QR code below to join our community groups:
257
-
258
- [Discord Group](https://discord.com/invite/D27yfEFVz5) | WeChat Group | DingTalk Group
259
- :-------------------------:|:-------------------------:|:-------------------------:
260
- <img src="docs/asset/discord_qr.jpg" width="160" height="160"> | <img src="docs/asset/wechat.png" width="160" height="160"> | <img src="docs/asset/dingding.png" width="160" height="160">
261
-
262
-
263
- ## πŸŽ‰ News
190
+ ## πŸŽ‰ What's New
264
191
 
265
192
  > [!IMPORTANT]
266
193
  > **Version 1.0 Refactoring**
267
194
  >
268
195
  > Version 1.0 introduces a major overhaul of the evaluation framework, establishing a new, more modular and extensible API layer under `evalscope/api`. Key improvements include standardized data models for benchmarks, samples, and results; a registry-based design for components such as benchmarks and metrics; and a rewritten core evaluator that orchestrates the new architecture. Existing benchmark adapters have been migrated to this API, resulting in cleaner, more consistent, and easier-to-maintain implementations.
269
196
 
197
+ - πŸ”₯ **[2025.10.21]** Optimized sandbox environment usage in code evaluation, supporting both local and remote operation modes. For details, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/sandbox.html).
198
+ - πŸ”₯ **[2025.10.20]** Added support for evaluation benchmarks including PolyMath, SimpleVQA, MathVerse, MathVision, AA-LCR; optimized evalscope perf performance to align with vLLM Bench. For details, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/vs_vllm_bench.html).
199
+ - πŸ”₯ **[2025.10.14]** Added support for OCRBench, OCRBench-v2, DocVQA, InfoVQA, ChartQA, and BLINK multimodal image-text evaluation benchmarks.
200
+ - πŸ”₯ **[2025.09.22]** Code evaluation benchmarks (HumanEval, LiveCodeBench) now support running in a sandbox environment. To use this feature, please install [ms-enclave](https://github.com/modelscope/ms-enclave) first.
270
201
  - πŸ”₯ **[2025.09.19]** Added support for multimodal image-text evaluation benchmarks including RealWorldQA, AI2D, MMStar, MMBench, and OmniBench, as well as pure text evaluation benchmarks such as Multi-IF, HealthBench, and AMC.
271
- - πŸ”₯ **[2025.09.05]** Added support for vision-language multimodal model evaluation tasks, such as MathVista and MMMU. For more supported datasets, please [refer to the documentation](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/vlm.html).
202
+ - πŸ”₯ **[2025.09.05]** Added support for vision-language multimodal model evaluation tasks, such as MathVista and MMMU. For more supported datasets, please [refer to the documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset/vlm.html).
272
203
  - πŸ”₯ **[2025.09.04]** Added support for image editing task evaluation, including the [GEdit-Bench](https://modelscope.cn/datasets/stepfun-ai/GEdit-Bench) benchmark. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/image_edit.html).
273
204
  - πŸ”₯ **[2025.08.22]** Version 1.0 Refactoring. Breaking changes, please [refer to](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#switching-to-version-v1-0).
274
205
  - πŸ”₯ **[2025.07.18]** The model stress testing now supports randomly generating image-text data for multimodal model evaluation. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#id4).
275
- - πŸ”₯ **[2025.07.16]** Support for [Ο„-bench](https://github.com/sierra-research/tau-bench) has been added, enabling the evaluation of AI Agent performance and reliability in real-world scenarios involving dynamic user and tool interactions. For usage instructions, please refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/llm.html#bench).
206
+ - πŸ”₯ **[2025.07.16]** Support for [Ο„-bench](https://github.com/sierra-research/tau-bench) has been added, enabling the evaluation of AI Agent performance and reliability in real-world scenarios involving dynamic user and tool interactions. For usage instructions, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset/llm.html#bench).
276
207
  - πŸ”₯ **[2025.07.14]** Support for "Humanity's Last Exam" ([Humanity's-Last-Exam](https://modelscope.cn/datasets/cais/hle)), a highly challenging evaluation benchmark. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset/llm.html#humanity-s-last-exam).
277
208
  - πŸ”₯ **[2025.07.03]** Refactored Arena Mode: now supports custom model battles, outputs a model leaderboard, and provides battle result visualization. See [reference](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html) for details.
209
+ <details><summary>More</summary>
210
+
278
211
  - πŸ”₯ **[2025.06.28]** Optimized custom dataset evaluation: now supports evaluation without reference answers. Enhanced LLM judge usage, with built-in modes for "scoring directly without reference answers" and "checking answer consistency with reference answers". See [reference](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/llm.html#qa) for details.
279
- - πŸ”₯ **[2025.06.19]** Added support for the [BFCL-v3](https://modelscope.cn/datasets/AI-ModelScope/bfcl_v3) benchmark, designed to evaluate model function-calling capabilities across various scenarios. For more information, refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html).
212
+ - πŸ”₯ **[2025.06.19]** Added support for the [BFCL-v3](https://modelscope.cn/datasets/AI-ModelScope/bfcl_v3) benchmark, designed to evaluate model function-calling capabilities across various scenarios. For more information, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/bfcl_v3.html).
280
213
  - πŸ”₯ **[2025.06.02]** Added support for the Needle-in-a-Haystack test. Simply specify `needle_haystack` to conduct the test, and a corresponding heatmap will be generated in the `outputs/reports` folder, providing a visual representation of the model's performance. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html) for more details.
281
214
  - πŸ”₯ **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset/index.html).
282
215
  - πŸ”₯ **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
283
216
  - πŸ”₯ **[2025.05.13]** Added support for the [ToolBench-Static](https://modelscope.cn/datasets/AI-ModelScope/ToolBench-Static) dataset to evaluate model's tool-calling capabilities. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) for usage instructions. Also added support for the [DROP](https://modelscope.cn/datasets/AI-ModelScope/DROP/dataPeview) and [Winogrande](https://modelscope.cn/datasets/AI-ModelScope/winogrande_val) benchmarks to assess the reasoning capabilities of models.
284
- <details><summary>More</summary>
285
-
286
217
  - πŸ”₯ **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read πŸ“–](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
287
218
  - πŸ”₯ **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
288
219
  - πŸ”₯ **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
@@ -319,96 +250,71 @@ Please scan the QR code below to join our community groups:
319
250
 
320
251
  </details>
321
252
 
322
- ## πŸ› οΈ Environment Setup
253
+ ## ❀️ Community & Support
323
254
 
324
- ### Method 1. Install via pip
255
+ Welcome to join our community to communicate with other developers and get help.
325
256
 
326
- We recommend using conda to manage your environment and pip to install dependencies. This allows you to use the latest evalscope PyPI package.
257
+ [Discord Group](https://discord.com/invite/D27yfEFVz5) | WeChat Group | DingTalk Group
258
+ :-------------------------:|:-------------------------:|:-------------------------:
259
+ <img src="docs/asset/discord_qr.jpg" width="160" height="160"> | <img src="docs/asset/wechat.png" width="160" height="160"> | <img src="docs/asset/dingding.png" width="160" height="160">
327
260
 
328
- 1. Create a conda environment (optional)
329
- ```shell
330
- # Python 3.10 is recommended
331
- conda create -n evalscope python=3.10
332
261
 
333
- # Activate the conda environment
334
- conda activate evalscope
335
- ```
336
- 2. Install dependencies via pip
337
- ```shell
338
- pip install evalscope
339
- ```
340
- 3. Install additional dependencies (optional)
341
- - To use model service inference benchmarking features, install the perf dependency:
262
+
263
+ ## πŸ› οΈ Environment Setup
264
+
265
+ We recommend using `conda` to create a virtual environment and install with `pip`.
266
+
267
+ 1. **Create and Activate Conda Environment** (Python 3.10 recommended)
342
268
  ```shell
343
- pip install 'evalscope[perf]'
269
+ conda create -n evalscope python=3.10
270
+ conda activate evalscope
344
271
  ```
345
- - To use visualization features, install the app dependency:
272
+
273
+ 2. **Install EvalScope**
274
+
275
+ - **Method 1: Install via PyPI (Recommended)**
276
+ ```shell
277
+ pip install evalscope
278
+ ```
279
+
280
+ - **Method 2: Install from Source (For Development)**
281
+ ```shell
282
+ git clone https://github.com/modelscope/evalscope.git
283
+ cd evalscope
284
+ pip install -e .
285
+ ```
286
+
287
+ 3. **Install Additional Dependencies** (Optional)
288
+ Install corresponding feature extensions according to your needs:
346
289
  ```shell
290
+ # Performance testing
291
+ pip install 'evalscope[perf]'
292
+
293
+ # Visualization App
347
294
  pip install 'evalscope[app]'
348
- ```
349
- - If you need to use other evaluation backends, you can install OpenCompass, VLMEvalKit, or RAGEval as needed:
350
- ```shell
295
+
296
+ # Other evaluation backends
351
297
  pip install 'evalscope[opencompass]'
352
298
  pip install 'evalscope[vlmeval]'
353
299
  pip install 'evalscope[rag]'
354
- ```
355
- - To install all dependencies:
356
- ```shell
300
+
301
+ # Install all dependencies
357
302
  pip install 'evalscope[all]'
358
303
  ```
304
+ > If you installed from source, please replace `evalscope` with `.`, for example `pip install '.[perf]'`.
359
305
 
360
306
  > [!NOTE]
361
- > The project has been renamed to `evalscope`. For version `v0.4.3` or earlier, you can install it with:
362
- > ```shell
363
- > pip install llmuses<=0.4.3
364
- > ```
365
- > Then, import related dependencies using `llmuses`:
366
- > ```python
367
- > from llmuses import ...
368
- > ```
369
-
370
- ### Method 2. Install from source
371
-
372
- Installing from source allows you to use the latest code and makes it easier for further development and debugging.
373
-
374
- 1. Clone the source code
375
- ```shell
376
- git clone https://github.com/modelscope/evalscope.git
377
- ```
378
- 2. Install dependencies
379
- ```shell
380
- cd evalscope/
381
-
382
- pip install -e .
383
- ```
384
- 3. Install additional dependencies
385
- - To use model service inference benchmarking features, install the perf dependency:
386
- ```shell
387
- pip install '.[perf]'
388
- ```
389
- - To use visualization features, install the app dependency:
390
- ```shell
391
- pip install '.[app]'
392
- ```
393
- - If you need to use other evaluation backends, you can install OpenCompass, VLMEvalKit, or RAGEval as needed:
394
- ```shell
395
- pip install '.[opencompass]'
396
- pip install '.[vlmeval]'
397
- pip install '.[rag]'
398
- ```
399
- - To install all dependencies:
400
- ```shell
401
- pip install '.[all]'
402
- ```
307
+ > This project was formerly known as `llmuses`. If you need to use `v0.4.3` or earlier versions, please run `pip install llmuses<=0.4.3` and use `from llmuses import ...` for imports.
403
308
 
404
309
 
405
310
  ## πŸš€ Quick Start
406
311
 
407
- To evaluate a model on specified datasets using default configurations, this framework supports two ways to initiate evaluation tasks: using the command line or using Python code.
312
+ You can start evaluation tasks in two ways: **command line** or **Python code**.
408
313
 
409
314
  ### Method 1. Using Command Line
410
315
 
411
- Execute the `eval` command in any directory:
316
+ Execute the `evalscope eval` command from any directory to start an evaluation. The following command evaluates the `Qwen/Qwen2.5-0.5B-Instruct` model on the `gsm8k` and `arc` datasets, taking only 5 samples from each dataset.
317
+
412
318
  ```bash
413
319
  evalscope eval \
414
320
  --model Qwen/Qwen2.5-0.5B-Instruct \
@@ -418,22 +324,23 @@ evalscope eval \
418
324
 
419
325
  ### Method 2. Using Python Code
420
326
 
421
- When using Python code for evaluation, you need to submit the evaluation task using the `run_task` function, passing a `TaskConfig` as a parameter. It can also be a Python dictionary, yaml file path, or json file path, for example:
422
-
423
- **Using `TaskConfig`**
327
+ Use the `run_task` function and `TaskConfig` object to configure and start evaluation tasks.
424
328
 
425
329
  ```python
426
330
  from evalscope import run_task, TaskConfig
427
331
 
332
+ # Configure evaluation task
428
333
  task_cfg = TaskConfig(
429
334
  model='Qwen/Qwen2.5-0.5B-Instruct',
430
335
  datasets=['gsm8k', 'arc'],
431
336
  limit=5
432
337
  )
433
338
 
434
- run_task(task_cfg=task_cfg)
339
+ # Start evaluation
340
+ run_task(task_cfg)
435
341
  ```
436
- <details><summary>More Startup Methods</summary>
342
+
343
+ <details><summary><b>πŸ’‘ Tip:</b> `run_task` also supports dictionaries, YAML or JSON files as configuration.</summary>
437
344
 
438
345
  **Using Python Dictionary**
439
346
 
@@ -445,13 +352,10 @@ task_cfg = {
445
352
  'datasets': ['gsm8k', 'arc'],
446
353
  'limit': 5
447
354
  }
448
-
449
355
  run_task(task_cfg=task_cfg)
450
356
  ```
451
357
 
452
- **Using `yaml` file**
453
-
454
- `config.yaml`:
358
+ **Using YAML File** (`config.yaml`)
455
359
  ```yaml
456
360
  model: Qwen/Qwen2.5-0.5B-Instruct
457
361
  datasets:
@@ -459,37 +363,15 @@ datasets:
459
363
  - arc
460
364
  limit: 5
461
365
  ```
462
-
463
366
  ```python
464
367
  from evalscope.run import run_task
465
368
 
466
369
  run_task(task_cfg="config.yaml")
467
370
  ```
468
-
469
- **Using `json` file**
470
-
471
- `config.json`:
472
- ```json
473
- {
474
- "model": "Qwen/Qwen2.5-0.5B-Instruct",
475
- "datasets": ["gsm8k", "arc"],
476
- "limit": 5
477
- }
478
- ```
479
-
480
- ```python
481
- from evalscope.run import run_task
482
-
483
- run_task(task_cfg="config.json")
484
- ```
485
371
  </details>
486
372
 
487
- ### Basic Parameter
488
- - `--model`: Specifies the `model_id` of the model in [ModelScope](https://modelscope.cn/), which can be automatically downloaded, e.g., [Qwen/Qwen2.5-0.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-0.5B-Instruct/summary); or use the local path of the model, e.g., `/path/to/model`
489
- - `--datasets`: Dataset names, supports inputting multiple datasets separated by spaces. Datasets will be automatically downloaded from modelscope. For supported datasets, refer to the [Dataset List](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset/index.html)
490
- - `--limit`: Maximum amount of evaluation data for each dataset. If not specified, it defaults to evaluating all data. Can be used for quick validation
491
-
492
373
  ### Output Results
374
+ After evaluation completion, you will see a report in the terminal in the following format:
493
375
  ```text
494
376
  +-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
495
377
  | Model Name | Dataset Name | Metric Name | Category Name | Subset Name | Num | Score |
@@ -502,164 +384,140 @@ run_task(task_cfg="config.json")
502
384
  +-----------------------+----------------+-----------------+-----------------+---------------+-------+---------+
503
385
  ```
504
386
 
505
- ## πŸ“ˆ Visualization of Evaluation Results
506
-
507
- 1. Install the dependencies required for visualization, including gradio, plotly, etc.
508
- ```bash
509
- pip install 'evalscope[app]'
510
- ```
511
-
512
- 2. Start the Visualization Service
513
-
514
- Run the following command to start the visualization service.
515
- ```bash
516
- evalscope app
517
- ```
518
- You can access the visualization service in the browser if the following output appears.
519
- ```text
520
- * Running on local URL: http://127.0.0.1:7861
521
-
522
- To create a public link, set `share=True` in `launch()`.
523
- ```
387
+ ## πŸ“ˆ Advanced Usage
524
388
 
525
- <table>
526
- <tr>
527
- <td style="text-align: center;">
528
- <img src="docs/en/get_started/images/setting.png" alt="Setting" style="width: 75%;" />
529
- <p>Setting Interface</p>
530
- </td>
531
- <td style="text-align: center;">
532
- <img src="docs/en/get_started/images/model_compare.png" alt="Model Compare" style="width: 100%;" />
533
- <p>Model Comparison</p>
534
- </td>
535
- </tr>
536
- <tr>
537
- <td style="text-align: center;">
538
- <img src="docs/en/get_started/images/report_overview.png" alt="Report Overview" style="width: 100%;" />
539
- <p>Report Overview</p>
540
- </td>
541
- <td style="text-align: center;">
542
- <img src="docs/en/get_started/images/report_details.png" alt="Report Details" style="width: 80%;" />
543
- <p>Report Details</p>
544
- </td>
545
- </tr>
546
- </table>
547
-
548
- For more details, refer to: [πŸ“– Visualization of Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html)
389
+ ### Custom Evaluation Parameters
549
390
 
550
- ## 🌐 Evaluation of Model API
551
-
552
- Specify the model API service address (api_url) and API Key (api_key) to evaluate the deployed model API service. In this case, the `eval-type` parameter must be specified as `service`, for example:
553
-
554
- For example, to launch a model service using [vLLM](https://github.com/vllm-project/vllm):
555
-
556
- ```shell
557
- export VLLM_USE_MODELSCOPE=True && python -m vllm.entrypoints.openai.api_server --model Qwen/Qwen2.5-0.5B-Instruct --served-model-name qwen2.5 --trust_remote_code --port 8801
558
- ```
559
- Then, you can use the following command to evaluate the model API service:
560
- ```shell
561
- evalscope eval \
562
- --model qwen2.5 \
563
- --api-url http://127.0.0.1:8801/v1 \
564
- --api-key EMPTY \
565
- --eval-type service \
566
- --datasets gsm8k \
567
- --limit 10
568
- ```
569
-
570
- ## βš™οΈ Custom Parameter Evaluation
571
-
572
- For more customized evaluations, such as customizing model parameters or dataset parameters, you can use the following command. The evaluation startup method is the same as simple evaluation. Below shows how to start the evaluation using the `eval` command:
391
+ You can fine-tune model loading, inference, and dataset configuration through command line parameters.
573
392
 
574
393
  ```shell
575
394
  evalscope eval \
576
395
  --model Qwen/Qwen3-0.6B \
577
396
  --model-args '{"revision": "master", "precision": "torch.float16", "device_map": "auto"}' \
578
- --generation-config '{"do_sample":true,"temperature":0.6,"max_tokens":512,"chat_template_kwargs":{"enable_thinking": false}}' \
397
+ --generation-config '{"do_sample":true,"temperature":0.6,"max_tokens":512}' \
579
398
  --dataset-args '{"gsm8k": {"few_shot_num": 0, "few_shot_random": false}}' \
580
399
  --datasets gsm8k \
581
400
  --limit 10
582
401
  ```
583
402
 
584
- ### Parameter Description
585
- - `--model-args`: Model loading parameters, passed as a JSON string:
586
- - `revision`: Model version
587
- - `precision`: Model precision
588
- - `device_map`: Device allocation for the model
589
- - `--generation-config`: Generation parameters, passed as a JSON string and parsed as a dictionary:
590
- - `do_sample`: Whether to use sampling
591
- - `temperature`: Generation temperature
592
- - `max_tokens`: Maximum length of generated tokens
593
- - `chat_template_kwargs`: Model inference template parameters
594
- - `--dataset-args`: Settings for the evaluation dataset, passed as a JSON string where the key is the dataset name and the value is the parameters. Note that these need to correspond one-to-one with the values in the `--datasets` parameter:
595
- - `few_shot_num`: Number of few-shot examples
596
- - `few_shot_random`: Whether to randomly sample few-shot data; if not set, defaults to `true`
403
+ - `--model-args`: Model loading parameters such as `revision`, `precision`, etc.
404
+ - `--generation-config`: Model generation parameters such as `temperature`, `max_tokens`, etc.
405
+ - `--dataset-args`: Dataset configuration parameters such as `few_shot_num`, etc.
597
406
 
598
- Reference: [Full Parameter Description](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html)
407
+ For details, please refer to [πŸ“– Complete Parameter Guide](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html).
599
408
 
409
+ ### Evaluating Online Model APIs
600
410
 
601
- ## πŸ§ͺ Other Evaluation Backends
602
- EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
603
- - **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
604
- - [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [πŸ“– User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/opencompass_backend.html)
605
- - [VLMEvalKit](https://github.com/open-compass/VLMEvalKit): Initiate VLMEvalKit multimodal evaluation tasks through EvalScope. Supports various multimodal models and datasets, and offers seamless integration with the LLM fine-tuning framework ms-swift. [πŸ“– User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/vlmevalkit_backend.html)
606
- - **RAGEval**: Initiate RAG evaluation tasks through EvalScope, supporting independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html): [πŸ“– User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/index.html)
607
- - **ThirdParty**: Third-party evaluation tasks, such as [ToolBench](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) and [LongBench-Write](https://evalscope.readthedocs.io/en/latest/third_party/longwriter.html).
411
+ EvalScope supports evaluating model services deployed via APIs (such as services deployed with vLLM). Simply specify the service address and API Key.
608
412
 
413
+ 1. **Start Model Service** (using vLLM as example)
414
+ ```shell
415
+ export VLLM_USE_MODELSCOPE=True
416
+ python -m vllm.entrypoints.openai.api_server \
417
+ --model Qwen/Qwen2.5-0.5B-Instruct \
418
+ --served-model-name qwen2.5 \
419
+ --port 8801
420
+ ```
609
421
 
610
- ## πŸ“ˆ Model Serving Performance Evaluation
611
- A stress testing tool focused on large language models, which can be customized to support various dataset formats and different API protocol formats.
422
+ 2. **Run Evaluation**
423
+ ```shell
424
+ evalscope eval \
425
+ --model qwen2.5 \
426
+ --eval-type service \
427
+ --api-url http://127.0.0.1:8801/v1 \
428
+ --api-key EMPTY \
429
+ --datasets gsm8k \
430
+ --limit 10
431
+ ```
612
432
 
613
- Reference: Performance Testing [πŸ“– User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html)
433
+ ### βš”οΈ Arena Mode
614
434
 
615
- **Output example**
435
+ Arena mode evaluates model performance through pairwise battles between models, producing win rates and rankings, which makes it well suited for side-by-side comparison of multiple models.
616
436
 
617
- ![multi_perf](docs/en/user_guides/stress_test/images/multi_perf.png)
437
+ ```text
438
+ # Example evaluation results
439
+ Model WinRate (%) CI (%)
440
+ ------------ ------------- ---------------
441
+ qwen2.5-72b 69.3 (-13.3 / +12.2)
442
+ qwen2.5-7b 50 (+0.0 / +0.0)
443
+ qwen2.5-0.5b 4.7 (-2.5 / +4.4)
444
+ ```
445
+ For details, please refer to [πŸ“– Arena Mode Usage Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html).
618
446
 
447
+ ### πŸ–ŠοΈ Custom Dataset Evaluation
619
448
 
620
- **Supports wandb for recording results**
449
+ EvalScope allows you to easily add and evaluate your own datasets. For details, please refer to [πŸ“– Custom Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/index.html).
621
450
 
622
- ![wandb sample](https://modelscope.oss-cn-beijing.aliyuncs.com/resource/wandb_sample.png)
623
451
 
624
- **Supports swanlab for recording results**
452
+ ## πŸ§ͺ Other Evaluation Backends
453
+ EvalScope supports launching evaluation tasks through third-party evaluation frameworks (we call them "backends") to meet diverse evaluation needs.
625
454
 
626
- ![swanlab sample](https://sail-moe.oss-cn-hangzhou.aliyuncs.com/yunlin/images/evalscope/swanlab.png)
455
+ - **Native**: EvalScope's default evaluation framework with comprehensive functionality.
456
+ - **OpenCompass**: Focuses on text-only evaluation. [πŸ“– Usage Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/opencompass_backend.html)
457
+ - **VLMEvalKit**: Focuses on multi-modal evaluation. [πŸ“– Usage Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/vlmevalkit_backend.html)
458
+ - **RAGEval**: Focuses on RAG evaluation, supporting Embedding and Reranker models. [πŸ“– Usage Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/index.html)
459
+ - **Third-party Evaluation Tools**: Supports evaluation tasks like [ToolBench](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html).
627
460
 
628
- **Supports Speed Benchmark**
461
+ ## ⚑ Inference Performance Evaluation Tool
462
+ EvalScope provides a powerful stress testing tool for evaluating the performance of large language model services.
629
463
 
630
- It supports speed testing and provides speed benchmarks similar to those found in the [official Qwen](https://qwen.readthedocs.io/en/latest/benchmark/speed_benchmark.html) reports:
464
+ - **Key Metrics**: Supports throughput (Tokens/s), first token latency (TTFT), token generation latency (TPOT), etc.
465
+ - **Result Recording**: Supports recording results to `wandb` and `swanlab`.
466
+ - **Speed Benchmarks**: Can generate speed benchmark results similar to official reports.
631
467
 
632
- ```text
633
- Speed Benchmark Results:
634
- +---------------+-----------------+----------------+
635
- | Prompt Tokens | Speed(tokens/s) | GPU Memory(GB) |
636
- +---------------+-----------------+----------------+
637
- | 1 | 50.69 | 0.97 |
638
- | 6144 | 51.36 | 1.23 |
639
- | 14336 | 49.93 | 1.59 |
640
- | 30720 | 49.56 | 2.34 |
641
- +---------------+-----------------+----------------+
642
- ```
468
+ For details, please refer to [πŸ“– Performance Testing Usage Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
643
469
 
644
- ## πŸ–ŠοΈ Custom Dataset Evaluation
645
- EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [πŸ“–User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/index.html)
470
+ Example output is shown below:
471
+ <p align="center">
472
+ <img src="docs/en/user_guides/stress_test/images/multi_perf.png" style="width: 80%;">
473
+ </p>
646
474
 
647
475
 
648
- ## βš”οΈ Arena Mode
476
+ ## πŸ“Š Visualizing Evaluation Results
649
477
 
650
- Arena mode allows you to configure multiple candidate models and specify a baseline model. Evaluation is performed by pairwise battles between each candidate model and the baseline model, with the final output including each model's win rate and ranking. This method is suitable for comparative evaluation among multiple models, providing an intuitive reflection of each model's strengths and weaknesses. Refer to: Arena Mode [πŸ“– User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)
478
+ EvalScope provides a Gradio-based WebUI for interactive analysis and comparison of evaluation results.
651
479
 
652
- ```text
653
- Model WinRate (%) CI (%)
654
- ------------ ------------- ---------------
655
- qwen2.5-72b 69.3 (-13.3 / +12.2)
656
- qwen2.5-7b 50 (+0.0 / +0.0)
657
- qwen2.5-0.5b 4.7 (-2.5 / +4.4)
658
- ```
480
+ 1. **Install Dependencies**
481
+ ```bash
482
+ pip install 'evalscope[app]'
483
+ ```
484
+
485
+ 2. **Start Service**
486
+ ```bash
487
+ evalscope app
488
+ ```
489
+ Visit `http://127.0.0.1:7861` to open the visualization interface.
490
+
491
+ <table>
492
+ <tr>
493
+ <td style="text-align: center;">
494
+ <img src="docs/en/get_started/images/setting.png" alt="Setting" style="width: 85%;" />
495
+ <p>Settings Interface</p>
496
+ </td>
497
+ <td style="text-align: center;">
498
+ <img src="docs/en/get_started/images/model_compare.png" alt="Model Compare" style="width: 100%;" />
499
+ <p>Model Comparison</p>
500
+ </td>
501
+ </tr>
502
+ <tr>
503
+ <td style="text-align: center;">
504
+ <img src="docs/en/get_started/images/report_overview.png" alt="Report Overview" style="width: 100%;" />
505
+ <p>Report Overview</p>
506
+ </td>
507
+ <td style="text-align: center;">
508
+ <img src="docs/en/get_started/images/report_details.png" alt="Report Details" style="width: 85%;" />
509
+ <p>Report Details</p>
510
+ </td>
511
+ </tr>
512
+ </table>
513
+
514
+ For details, please refer to [πŸ“– Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html).
659
515
 
660
- ## πŸ‘·β€β™‚οΈ Contribution
516
+ ## πŸ‘·β€β™‚οΈ Contributing
661
517
 
662
- EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn), is continuously optimizing its benchmark evaluation features! We invite you to refer to the [Contribution Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html) to easily add your own evaluation benchmarks and share your contributions with the community. Let’s work together to support the growth of EvalScope and make our tools even better! Join us now!
518
+ We welcome any contributions from the community! If you want to add new evaluation benchmarks, models, or features, please refer to our [Contributing Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html).
519
+
520
+ Thanks to all developers who have contributed to EvalScope!
663
521
 
664
522
  <a href="https://github.com/modelscope/evalscope/graphs/contributors" target="_blank">
665
523
  <table>
@@ -671,8 +529,10 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
671
529
  </table>
672
530
  </a>
673
531
 
532
+
674
533
  ## πŸ“š Citation
675
534
 
535
+ If you use EvalScope in your research, please cite our work:
676
536
  ```bibtex
677
537
  @misc{evalscope_2024,
678
538
  title={{EvalScope}: Evaluation Framework for Large Models},
@@ -682,20 +542,6 @@ EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn
682
542
  }
683
543
  ```
684
544
 
685
- ## πŸ”œ Roadmap
686
- - [x] Support for better evaluation report visualization
687
- - [x] Support for mixed evaluations across multiple datasets
688
- - [x] RAG evaluation
689
- - [x] VLM evaluation
690
- - [x] Agents evaluation
691
- - [x] vLLM
692
- - [ ] Distributed evaluating
693
- - [x] Multi-modal evaluation
694
- - [ ] Benchmarks
695
- - [x] BFCL-v3
696
- - [x] GPQA
697
- - [x] MBPP
698
-
699
545
 
700
546
  ## ⭐ Star History
701
547