evalscope 0.13.0__py3-none-any.whl → 0.13.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (45) hide show
  1. evalscope/arguments.py +1 -1
  2. evalscope/backend/rag_eval/utils/llm.py +4 -5
  3. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  4. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +109 -0
  5. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  6. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +120 -0
  7. evalscope/benchmarks/arena_hard/utils.py +162 -0
  8. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +2 -5
  9. evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -1
  10. evalscope/benchmarks/data_adapter.py +26 -2
  11. evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
  12. evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -11
  13. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -5
  14. evalscope/benchmarks/live_code_bench/testing_util.py +3 -3
  15. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  16. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +182 -0
  17. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +2 -5
  18. evalscope/collections/evaluator.py +1 -1
  19. evalscope/config.py +6 -3
  20. evalscope/constants.py +1 -0
  21. evalscope/evaluator/evaluator.py +5 -4
  22. evalscope/metrics/llm_judge.py +1 -1
  23. evalscope/models/chat_adapter.py +32 -11
  24. evalscope/models/custom_adapter.py +1 -1
  25. evalscope/perf/arguments.py +19 -46
  26. evalscope/perf/benchmark.py +64 -90
  27. evalscope/perf/main.py +1 -1
  28. evalscope/perf/plugin/api/openai_api.py +4 -2
  29. evalscope/perf/plugin/datasets/__init__.py +1 -0
  30. evalscope/perf/plugin/datasets/openqa.py +6 -11
  31. evalscope/perf/plugin/datasets/random_dataset.py +51 -0
  32. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  33. evalscope/perf/utils/db_util.py +5 -2
  34. evalscope/run.py +14 -2
  35. evalscope/version.py +2 -2
  36. {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/METADATA +42 -78
  37. {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/RECORD +45 -37
  38. tests/cli/test_all.py +33 -24
  39. tests/cli/test_run.py +69 -22
  40. tests/perf/test_perf.py +23 -0
  41. tests/rag/test_ragas.py +4 -1
  42. {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/LICENSE +0 -0
  43. {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/WHEEL +0 -0
  44. {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/entry_points.txt +0 -0
  45. {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: evalscope
3
- Version: 0.13.0
3
+ Version: 0.13.2
4
4
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
5
5
  Home-page: https://github.com/modelscope/evalscope
6
6
  Author: ModelScope team
@@ -16,11 +16,8 @@ Classifier: Programming Language :: Python :: 3.10
16
16
  Requires-Python: >=3.8
17
17
  Description-Content-Type: text/markdown
18
18
  License-File: LICENSE
19
- Requires-Dist: absl-py
20
19
  Requires-Dist: accelerate
21
- Requires-Dist: cachetools
22
20
  Requires-Dist: datasets<=3.2.0,>=3.0.0
23
- Requires-Dist: editdistance
24
21
  Requires-Dist: immutabledict
25
22
  Requires-Dist: jieba
26
23
  Requires-Dist: jsonlines
@@ -31,34 +28,23 @@ Requires-Dist: modelscope[framework]
31
28
  Requires-Dist: nltk>=3.9
32
29
  Requires-Dist: openai
33
30
  Requires-Dist: pandas
34
- Requires-Dist: plotly
35
31
  Requires-Dist: pyarrow
36
- Requires-Dist: pympler
37
32
  Requires-Dist: pyyaml
38
- Requires-Dist: regex
39
33
  Requires-Dist: requests
40
- Requires-Dist: requests-toolbelt
41
34
  Requires-Dist: rouge-chinese
42
35
  Requires-Dist: rouge-score>=0.1.0
43
36
  Requires-Dist: sacrebleu
44
37
  Requires-Dist: scikit-learn
45
38
  Requires-Dist: seaborn
46
- Requires-Dist: sentencepiece
47
- Requires-Dist: simple-ddl-parser
48
39
  Requires-Dist: sympy
49
40
  Requires-Dist: tabulate
50
- Requires-Dist: tiktoken
51
41
  Requires-Dist: torch
52
42
  Requires-Dist: tqdm
53
43
  Requires-Dist: transformers>=4.33
54
- Requires-Dist: transformers-stream-generator
55
44
  Requires-Dist: word2number
56
45
  Provides-Extra: all
57
- Requires-Dist: absl-py; extra == "all"
58
46
  Requires-Dist: accelerate; extra == "all"
59
- Requires-Dist: cachetools; extra == "all"
60
47
  Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
61
- Requires-Dist: editdistance; extra == "all"
62
48
  Requires-Dist: immutabledict; extra == "all"
63
49
  Requires-Dist: jieba; extra == "all"
64
50
  Requires-Dist: jsonlines; extra == "all"
@@ -69,30 +55,26 @@ Requires-Dist: modelscope[framework]; extra == "all"
69
55
  Requires-Dist: nltk>=3.9; extra == "all"
70
56
  Requires-Dist: openai; extra == "all"
71
57
  Requires-Dist: pandas; extra == "all"
72
- Requires-Dist: plotly; extra == "all"
73
58
  Requires-Dist: pyarrow; extra == "all"
74
- Requires-Dist: pympler; extra == "all"
75
59
  Requires-Dist: pyyaml; extra == "all"
76
- Requires-Dist: regex; extra == "all"
77
60
  Requires-Dist: requests; extra == "all"
78
- Requires-Dist: requests-toolbelt; extra == "all"
79
61
  Requires-Dist: rouge-chinese; extra == "all"
80
62
  Requires-Dist: rouge-score>=0.1.0; extra == "all"
81
63
  Requires-Dist: sacrebleu; extra == "all"
82
64
  Requires-Dist: scikit-learn; extra == "all"
83
65
  Requires-Dist: seaborn; extra == "all"
84
- Requires-Dist: sentencepiece; extra == "all"
85
- Requires-Dist: simple-ddl-parser; extra == "all"
86
66
  Requires-Dist: sympy; extra == "all"
87
67
  Requires-Dist: tabulate; extra == "all"
88
- Requires-Dist: tiktoken; extra == "all"
89
68
  Requires-Dist: torch; extra == "all"
90
69
  Requires-Dist: tqdm; extra == "all"
91
70
  Requires-Dist: transformers>=4.33; extra == "all"
92
- Requires-Dist: transformers-stream-generator; extra == "all"
93
71
  Requires-Dist: word2number; extra == "all"
94
72
  Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
95
73
  Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
74
+ Requires-Dist: langchain<0.3.0; extra == "all"
75
+ Requires-Dist: langchain-community<0.3.0; extra == "all"
76
+ Requires-Dist: langchain-core<0.3.0; extra == "all"
77
+ Requires-Dist: langchain-openai<0.3.0; extra == "all"
96
78
  Requires-Dist: mteb==1.19.4; extra == "all"
97
79
  Requires-Dist: ragas==0.2.9; extra == "all"
98
80
  Requires-Dist: webdataset>0.2.0; extra == "all"
@@ -107,32 +89,6 @@ Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
107
89
  Provides-Extra: app
108
90
  Requires-Dist: gradio==5.4.0; extra == "app"
109
91
  Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
110
- Provides-Extra: inner
111
- Requires-Dist: absl-py; extra == "inner"
112
- Requires-Dist: accelerate; extra == "inner"
113
- Requires-Dist: alibaba-itag-sdk; extra == "inner"
114
- Requires-Dist: dashscope; extra == "inner"
115
- Requires-Dist: editdistance; extra == "inner"
116
- Requires-Dist: jsonlines; extra == "inner"
117
- Requires-Dist: nltk; extra == "inner"
118
- Requires-Dist: openai; extra == "inner"
119
- Requires-Dist: pandas==1.5.3; extra == "inner"
120
- Requires-Dist: plotly; extra == "inner"
121
- Requires-Dist: pyarrow; extra == "inner"
122
- Requires-Dist: pyodps; extra == "inner"
123
- Requires-Dist: pyyaml; extra == "inner"
124
- Requires-Dist: regex; extra == "inner"
125
- Requires-Dist: requests==2.28.1; extra == "inner"
126
- Requires-Dist: requests-toolbelt==0.10.1; extra == "inner"
127
- Requires-Dist: rouge-score; extra == "inner"
128
- Requires-Dist: sacrebleu; extra == "inner"
129
- Requires-Dist: scikit-learn; extra == "inner"
130
- Requires-Dist: seaborn; extra == "inner"
131
- Requires-Dist: simple-ddl-parser; extra == "inner"
132
- Requires-Dist: streamlit; extra == "inner"
133
- Requires-Dist: tqdm; extra == "inner"
134
- Requires-Dist: transformers<4.43,>=4.33; extra == "inner"
135
- Requires-Dist: transformers-stream-generator; extra == "inner"
136
92
  Provides-Extra: opencompass
137
93
  Requires-Dist: ms-opencompass>=0.1.4; extra == "opencompass"
138
94
  Provides-Extra: perf
@@ -143,6 +99,10 @@ Requires-Dist: sse-starlette; extra == "perf"
143
99
  Requires-Dist: transformers; extra == "perf"
144
100
  Requires-Dist: unicorn; extra == "perf"
145
101
  Provides-Extra: rag
102
+ Requires-Dist: langchain<0.3.0; extra == "rag"
103
+ Requires-Dist: langchain-community<0.3.0; extra == "rag"
104
+ Requires-Dist: langchain-core<0.3.0; extra == "rag"
105
+ Requires-Dist: langchain-openai<0.3.0; extra == "rag"
146
106
  Requires-Dist: mteb==1.19.4; extra == "rag"
147
107
  Requires-Dist: ragas==0.2.9; extra == "rag"
148
108
  Requires-Dist: webdataset>0.2.0; extra == "rag"
@@ -239,7 +199,9 @@ Please scan the QR code below to join our community groups:
239
199
 
240
200
  ## 🎉 News
241
201
 
242
- - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark. You can use it by specifying `live_code_bench`.
202
+ - 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPreview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
203
+ - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
204
+ - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
243
205
  - 🔥 **[2025.03.11]** Added support for the [SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/SimpleQA/summary) and [Chinese SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/Chinese-SimpleQA/summary) evaluation benchmarks. These are used to assess the factual accuracy of models, and you can specify `simple_qa` and `chinese_simpleqa` for use. Support for specifying a judge model is also available. For more details, refer to the [relevant parameter documentation](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html).
244
206
  - 🔥 **[2025.03.07]** Added support for the [QwQ-32B](https://modelscope.cn/models/Qwen/QwQ-32B/summary) model, evaluate the model's reasoning ability and reasoning efficiency, refer to [📖 Best Practices for QwQ-32B Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html) for more details.
245
207
  - 🔥 **[2025.03.04]** Added support for the [SuperGPQA](https://modelscope.cn/datasets/m-a-p/SuperGPQA/summary) dataset, which covers 13 categories, 72 first-level disciplines, and 285 second-level disciplines, totaling 26,529 questions. You can use it by specifying `super_gpqa`.
@@ -277,23 +239,24 @@ Please scan the QR code below to join our community groups:
277
239
  We recommend using conda to manage your environment and installing dependencies with pip:
278
240
 
279
241
  1. Create a conda environment (optional)
280
- ```shell
281
- # It is recommended to use Python 3.10
282
- conda create -n evalscope python=3.10
283
- # Activate the conda environment
284
- conda activate evalscope
285
- ```
242
+ ```shell
243
+ # It is recommended to use Python 3.10
244
+ conda create -n evalscope python=3.10
245
+ # Activate the conda environment
246
+ conda activate evalscope
247
+ ```
286
248
 
287
249
  2. Install dependencies using pip
288
- ```shell
289
- pip install evalscope # Install Native backend (default)
290
- # Additional options
291
- pip install evalscope[opencompass] # Install OpenCompass backend
292
- pip install evalscope[vlmeval] # Install VLMEvalKit backend
293
- pip install evalscope[rag] # Install RAGEval backend
294
- pip install evalscope[perf] # Install Perf dependencies
295
- pip install evalscope[all] # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
296
- ```
250
+ ```shell
251
+ pip install evalscope # Install Native backend (default)
252
+ # Additional options
253
+ pip install 'evalscope[opencompass]' # Install OpenCompass backend
254
+ pip install 'evalscope[vlmeval]' # Install VLMEvalKit backend
255
+ pip install 'evalscope[rag]' # Install RAGEval backend
256
+ pip install 'evalscope[perf]' # Install dependencies for the model performance testing module
257
+ pip install 'evalscope[app]' # Install dependencies for visualization
258
+ pip install 'evalscope[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
259
+ ```
297
260
 
298
261
  > [!WARNING]
299
262
  > As the project has been renamed to `evalscope`, for versions `v0.4.3` or earlier, you can install using the following command:
@@ -307,21 +270,22 @@ We recommend using conda to manage your environment and installing dependencies
307
270
 
308
271
  ### Method 2: Install from Source
309
272
  1. Download the source code
310
- ```shell
311
- git clone https://github.com/modelscope/evalscope.git
312
- ```
273
+ ```shell
274
+ git clone https://github.com/modelscope/evalscope.git
275
+ ```
313
276
 
314
277
  2. Install dependencies
315
- ```shell
316
- cd evalscope/
317
- pip install -e . # Install Native backend
318
- # Additional options
319
- pip install -e '.[opencompass]' # Install OpenCompass backend
320
- pip install -e '.[vlmeval]' # Install VLMEvalKit backend
321
- pip install -e '.[rag]' # Install RAGEval backend
322
- pip install -e '.[perf]' # Install Perf dependencies
323
- pip install -e '.[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
324
- ```
278
+ ```shell
279
+ cd evalscope/
280
+ pip install -e . # Install Native backend
281
+ # Additional options
282
+ pip install -e '.[opencompass]' # Install OpenCompass backend
283
+ pip install -e '.[vlmeval]' # Install VLMEvalKit backend
284
+ pip install -e '.[rag]' # Install RAGEval backend
285
+ pip install -e '.[perf]' # Install Perf dependencies
286
+ pip install -e '.[app]' # Install visualization dependencies
287
+ pip install -e '.[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
288
+ ```
325
289
 
326
290
 
327
291
  ## 🚀 Quick Start
@@ -1,11 +1,11 @@
1
1
  evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
2
- evalscope/arguments.py,sha256=VhZd7a8PoZK01qFCMEADLINqLYi6njRqRb50iR1l1lo,5241
3
- evalscope/config.py,sha256=9bMV7wf8pM7N5dEj_kJsCq6oM8xobzQDYh0NF8h-j1I,9313
4
- evalscope/constants.py,sha256=ydS8oihksGnvvzvJZw7HGhEeeccHNpJxspB81gAv29Y,3720
5
- evalscope/run.py,sha256=Udz-H503UaMYos0ic3A_npXIbnd4eJLx26q5UEahF-U,5797
2
+ evalscope/arguments.py,sha256=OPYmX_ar7rXFm0ETPuE2hs-knDQtwQ0pFwSazjn3S9Q,5241
3
+ evalscope/config.py,sha256=CkNBE83S335iyu0VRMkblaJw5nGM8pXv4NhK5ySE3cs,9476
4
+ evalscope/constants.py,sha256=Cgzkoz4R3MC3YLtbCM2fmSwF8Z2kuxYdOC8t9FWJj9w,3740
5
+ evalscope/run.py,sha256=LUCdnNzNIfHSWvxu3gxAsHEDX7hT5mcVnV4lSY5h0iA,6007
6
6
  evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
7
7
  evalscope/summarizer.py,sha256=Wnt8Y61PasOkigo8zMpi1qdFzfETjfnDtCDDQ6VwgDw,5867
8
- evalscope/version.py,sha256=a1r1BkZoSpoA_eGXZoXm6WaLayRHhF__TgvE9xG-Whs,119
8
+ evalscope/version.py,sha256=JzXnfz-D9eKhVPZu2TQUPFaTFhRiZ3iK4jcIuxfnQE8,119
9
9
  evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
11
11
  evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
@@ -50,21 +50,26 @@ evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=aP8U9zjIDl26X_
50
50
  evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
51
51
  evalscope/backend/rag_eval/utils/clip.py,sha256=GLHhPCac2AH35AvRLvVqePA1gIMAewHTFmCJCDZzvqU,5015
52
52
  evalscope/backend/rag_eval/utils/embedding.py,sha256=x9HAEfZSSAnT2Tdbf-9a5UmBVagCr__ay5A2nMCPMpg,6258
53
- evalscope/backend/rag_eval/utils/llm.py,sha256=IaNgdQBnURAmtpK5UPDqfCNrtV_J3wu0s4JWQqKedHA,2568
53
+ evalscope/backend/rag_eval/utils/llm.py,sha256=UIfdvkxVViYkIpX-MoM8sAwGEAozzVFyzX-YoFxXC1E,2607
54
54
  evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KBNeXiwQw5SpJA,1529
55
55
  evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
56
56
  evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
57
57
  evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
58
58
  evalscope/benchmarks/__init__.py,sha256=b_SWdV1ZyOqFiwc_9lIjKrIvK1rwnF2cCIF7XN9CN8E,932
59
59
  evalscope/benchmarks/benchmark.py,sha256=a_7Ctz36McuTyBSTYi56jis9pvOdWhg7JVSPFrbxqR4,2535
60
- evalscope/benchmarks/data_adapter.py,sha256=2u9oC4RBHVfEMHKPRu87xM4XOw_RS2Z2fvagNsciEo4,16791
60
+ evalscope/benchmarks/data_adapter.py,sha256=UvbJJTNBvA0aM-xmsaj9jEEsNksn9pTDDr90FfFX2pg,17606
61
61
  evalscope/benchmarks/utils.py,sha256=6kxeBz4w8Fw68AYH05a4ncjgkaUV4bU3eaFVLqOdkMI,1321
62
62
  evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
63
63
  evalscope/benchmarks/aime/aime24_adapter.py,sha256=dBm9yukt4-CByEPUlAPAIN6mL3VkZcI-dw2kz4oQBMo,1715
64
64
  evalscope/benchmarks/aime/aime25_adapter.py,sha256=FB_NufY2V7uYdxVnrY_4y81gyyfYDnvedz1_zHdDWt4,1709
65
+ evalscope/benchmarks/alpaca_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
66
+ evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=2a6wHJSLe89Xh18u1LBkMQEZzfOURiek6o0-k2lCQgM,4065
65
67
  evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
66
68
  evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
67
69
  evalscope/benchmarks/arc/arc_adapter.py,sha256=lkhDz-DYjPQ1vHzo8X4j-0Lq_rBxAnws35_R00pIbNI,6347
70
+ evalscope/benchmarks/arena_hard/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
71
+ evalscope/benchmarks/arena_hard/arena_hard_adapter.py,sha256=bdQfLTWB5pFo4hET0uFqu5zMX9PNQNwdoLoGrL5jCBE,6213
72
+ evalscope/benchmarks/arena_hard/utils.py,sha256=NstI1VR5fTaT-bfXRj0cLqm0DtH8EY4EQHR-K9HJubI,5089
68
73
  evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
69
74
  evalscope/benchmarks/bbh/bbh_adapter.py,sha256=fROpzenrjpEBWtnvM_RL_m0uXPOhXTtYAglJEZbzUdY,8330
70
75
  evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
@@ -98,20 +103,20 @@ evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTP
98
103
  evalscope/benchmarks/ceval/ceval_adapter.py,sha256=E4QobCjSSkMZtPJyaT_XBVxiqEqa1bta1I9aFnaHOqs,11308
99
104
  evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
100
105
  evalscope/benchmarks/chinese_simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
101
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=nKF_a0yc_PbZYjYA_-gJh3ePZIEz5txrhDV4IsTqD4Q,8196
106
+ evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=fYvkJn1UcWM3aqhPMTTtBPVzjTL-Rm_g9UwUJx1FvJc,8106
102
107
  evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
103
108
  evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
104
109
  evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=TTq2jRz46Hqc_D_ZBaiw_OwKub1FZX6w8C7g7COIdGs,10372
105
110
  evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
106
111
  evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
107
112
  evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
108
- evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=F2YCaNDn49X82l06WlLFp2OPFB7nv0ecW40099I9iSE,6871
113
+ evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=lD7sDro0dSWKgYaM_ZgWbBdetxVURpjo_2q1gvVt1XU,6815
109
114
  evalscope/benchmarks/data_collection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
110
- evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=U4M-0MVJS3-z03YW8nafooFJ7x60e5uEpBO5z_c7zk8,2450
115
+ evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=ecNwAE3p2eKIeC4whSUdZpeJ8NgidbSFZbIYtSW26Xo,2394
111
116
  evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
112
117
  evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=ZVGzUuuQ0UTOqQtXE40ZyBeMOSl8saSiFEQ5_siJ-c8,5052
113
118
  evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
114
- evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=ELDdS5T3JZeSWVv1ldawcHzLwAljEWKqakbRMVcBvgw,4741
119
+ evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=KBZDP1T-t7uu8vBLGL_unVdj7rDko3KWBPKqWlw31JQ,4596
115
120
  evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
116
121
  evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
117
122
  evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=UB287DtnbkSQXZsbReFJqmQRwbo672DTCeXXilR_-Vc,4790
@@ -125,7 +130,7 @@ evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0
125
130
  evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
126
131
  evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=UOjakV31J0g7TYbrRls0ItcopWOJu54ucPfaqSJB7Os,5250
127
132
  evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
128
- evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=R7MILWuMglvXr7yWioBxyJ2T4EdEkwRZ1lnvWqZqG28,1922
133
+ evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=xuQ1EK8Af_093qqeOXPIp_iqTWcG5KGOtE6r5hx3958,1858
129
134
  evalscope/benchmarks/ifeval/instructions.py,sha256=oaJ9D_4rvS67BraHBNPpDtFd4TblFAnR4A3YW9HWfuY,56304
130
135
  evalscope/benchmarks/ifeval/instructions_registry.py,sha256=tVUmhuSwnOidLtI8onOAw_gpJ6bi8FL07GiX19hSuo8,7288
131
136
  evalscope/benchmarks/ifeval/instructions_util.py,sha256=vkemXeylJMmgW8LgfQe4cSy2OF-oH_NcSZtzyZDURW4,25780
@@ -140,7 +145,7 @@ evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=8MOECcweL
140
145
  evalscope/benchmarks/live_code_bench/load_utils.py,sha256=5i9wtdPLYR8ckjx5MaYQVC2LFYvjKzR6Fa6UZmeOTRc,2445
141
146
  evalscope/benchmarks/live_code_bench/pass_k_utils.py,sha256=Ktrp_lXdfFzoHtQNQNdGfIl26ySjaPCHm4Zv-dFvRqM,2024
142
147
  evalscope/benchmarks/live_code_bench/prompts.py,sha256=P4KILIAIDT1MKDck0xHYV_6v9820wDZRhxVMazmlL-g,12600
143
- evalscope/benchmarks/live_code_bench/testing_util.py,sha256=EBe0XzY3B4cW5dCjwLksW7o4R1chZwsuFjxkfqVPFI4,28238
148
+ evalscope/benchmarks/live_code_bench/testing_util.py,sha256=s5oa--dOcugcpBmHsbeqnTRTDhdiCNXkIQuRc6EgA8o,28241
144
149
  evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
145
150
  evalscope/benchmarks/math_500/math_500_adapter.py,sha256=SB2eb4Z7DTXdptqirEoctqTdDLEu28s7bLeCAMBmAFo,1923
146
151
  evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
@@ -149,6 +154,8 @@ evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=e__Evar99V9l65FlzT6T594CN4iMgmu
149
154
  evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
150
155
  evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
151
156
  evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=OANfue-fK543drJrDj6V_zDMtySrQEBHPgTsejr-e7U,4226
157
+ evalscope/benchmarks/mmlu_redux/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
158
+ evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py,sha256=ZZMy9exJ8hknr1D6s73sAhHHzBAKcqo7WAmlUtPqpCI,9556
152
159
  evalscope/benchmarks/musr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
153
160
  evalscope/benchmarks/musr/musr_adapter.py,sha256=Po8hcIQiqlFo0AGjcNQe75cpsMNDcfiJaKgZsk33-DY,2442
154
161
  evalscope/benchmarks/process_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -159,7 +166,7 @@ evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO
159
166
  evalscope/benchmarks/race/race_adapter.py,sha256=RD0B-i5dzeNKuhqnWbremgf4tk9jmOO4_eLAiITB1F0,6381
160
167
  evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
161
168
  evalscope/benchmarks/simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
162
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=SrK18xDe4HyUaIPRLVEDtoF4Nc_ms4aFxktEsj8MnnA,9071
169
+ evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=CsRUJ0v1sSUmtO6QWkdzisn9OHN-1JSXB-9ghOuNqgY,8988
163
170
  evalscope/benchmarks/super_gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
164
171
  evalscope/benchmarks/super_gpqa/five_shot_prompt.txt,sha256=vD3RMeQustxY_oWA8IobntjywT8ZUO7Jaub--rElDT4,4718
165
172
  evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=BqNLL8BYnK6tRuIdV6ijL4Uym2SejH_h1BV06XNjSE4,9331
@@ -180,17 +187,17 @@ evalscope/cli/start_eval.py,sha256=MXhVDeaMFd6ny88-gnVtQflH660UaDj240YGYnHccx8,7
180
187
  evalscope/cli/start_perf.py,sha256=5hLi5jWgM9BJPXLd8d9D1zqrcj_5c0KvkfB1DgD4_RU,831
181
188
  evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
182
189
  evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
183
- evalscope/collections/evaluator.py,sha256=okP4_a5vuM-Z0O_4ntauuyn2NeH228JUo_YrbrTqKPM,12741
190
+ evalscope/collections/evaluator.py,sha256=YJy8Dj35XCdCwhNDwZecJkeW1_ZgIOsuRLFzfe3SyV8,12724
184
191
  evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
185
192
  evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
186
193
  evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
187
- evalscope/evaluator/evaluator.py,sha256=yj7ds5WMYqQcRw3B3x11-cajl4DmWsLM_3kO1n2k7OE,19734
194
+ evalscope/evaluator/evaluator.py,sha256=szRQrXH5ILpUljb14lcunuOt185H8Um1paviTokraA4,19845
188
195
  evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
189
196
  evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
190
197
  evalscope/evaluator/reviewer/auto_reviewer.py,sha256=PDz1Nt_qq0oGZBBwek2-M8VBUNLkOkmYObzR8gO7nEc,16624
191
198
  evalscope/metrics/__init__.py,sha256=SWvqzUzdryW5URz6u4fPkP9XSyA09nQ8zBeE8BbchSg,349
192
199
  evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
193
- evalscope/metrics/llm_judge.py,sha256=g9pLMJPNTUyw0sGteblws1_e_KzbRqcbqKcaIzfE_DE,4031
200
+ evalscope/metrics/llm_judge.py,sha256=Di0Q1c6VHLl0nQ_TVOZOOQlMApDIU83HuDPTOV8XrTA,4023
194
201
  evalscope/metrics/math_parser.py,sha256=uTDudn305G3b8-GboWTrDE6OfrEwAW-areHnoGXZ6Is,17302
195
202
  evalscope/metrics/metrics.py,sha256=_YI7RhxlFu_JOgeE3LF9UKu6mJruvyu4FgqVf78Bjb8,13813
196
203
  evalscope/metrics/named_metrics.py,sha256=pSHA2_qdi9B5bDHIh08GYhx63odilSwA_T-95K1Usl0,1380
@@ -201,9 +208,9 @@ evalscope/metrics/resources/gpt2-zhcn3-v4.bpe,sha256=J_K-oSt9usFEw87Av6j7ETn3J48
201
208
  evalscope/metrics/resources/gpt2-zhcn3-v4.json,sha256=WkM4J_FDPPNQwYi0kj5sM5SVjk2_6bci7tqf8dV9p_U,1289341
202
209
  evalscope/models/__init__.py,sha256=i9vcOBMEF_UM7C2gpmh2GsQk3njwqevoQ6A4CnP1fHs,1000
203
210
  evalscope/models/base_adapter.py,sha256=7PbRwfD5PIZCBYVds6ZHI8TBY9C5i2LdPOTu88FJWlY,3414
204
- evalscope/models/chat_adapter.py,sha256=5-yz7L41OdeBO9J_qRkEZcduATrYIMe__UFfh7BzjIc,6277
211
+ evalscope/models/chat_adapter.py,sha256=2XZmdhxnvy4yezPLXNVRbgrs0QkUY2VznEBq5mCYjKs,7106
205
212
  evalscope/models/choice_adapter.py,sha256=fnJdo-FMJ-zvNLbEJGc73odgWXIxtVudL00JIf2vzsA,8239
206
- evalscope/models/custom_adapter.py,sha256=Za52WF1I_YcJkGomJ6s9sP2Fs8DoJ4HHBYBi3iC3WNI,2379
213
+ evalscope/models/custom_adapter.py,sha256=AGztmZ0aT0g2flh4B4NaiZ8LCDg8tT0gVNxmrP5W1mA,2401
207
214
  evalscope/models/local_model.py,sha256=yydggBCLcBAmUWbBhv7o2CA3RbG0DwDZharPdrkbNcg,2628
208
215
  evalscope/models/model.py,sha256=diu4TE1ZFWdynTxsl4DejTNsLdwjxoyj2nsKR-Y8EZE,7343
209
216
  evalscope/models/register.py,sha256=4vX6AfScAzwD7UkncbuejfAiQHznQkK5hvtG6jEUbWo,809
@@ -212,29 +219,30 @@ evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk
212
219
  evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
213
220
  evalscope/models/custom/dummy_model.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
214
221
  evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
215
- evalscope/perf/arguments.py,sha256=u3GNdnOBmiEirtgJLspsLO7qBwHeWLoXd4vlt69jJ-g,9717
216
- evalscope/perf/benchmark.py,sha256=hKN-Nu-x-VTswHP0M6PT3jvduWxN7AJpz34DBrUcafQ,9734
222
+ evalscope/perf/arguments.py,sha256=srDp3JMYIPZxkfua5WHkjq3G8lJlTtxdXKxE_CivoJk,9156
223
+ evalscope/perf/benchmark.py,sha256=qY7zrsZMDBr1fABsShXjgK12tNE7PhzGZdLaUtdtxU8,8318
217
224
  evalscope/perf/http_client.py,sha256=xMakdQkJ2cgIOd-yOmHEW0vbGKTJ0JWhLFt9IFtUP8Q,7473
218
- evalscope/perf/main.py,sha256=aZUrfbz-Pl2xe8AgUL_6rW6n8dX4YAToDw5xPpLtbI4,1278
225
+ evalscope/perf/main.py,sha256=w-yDbl0osaTAMgC-JNPpqIq2LQ7U4c-Ht7Amj8Nbjc8,1278
219
226
  evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
220
227
  evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
221
228
  evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
222
229
  evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqYbndU,2118
223
230
  evalscope/perf/plugin/api/custom_api.py,sha256=ay1AGi4y2opjwyRl0J0A54-vLB-pBj3QBFkzog0KA-g,3787
224
231
  evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
225
- evalscope/perf/plugin/api/openai_api.py,sha256=KQRQMOfQceKQtrvTE-SyhNHcDoGuQ0900yh7r74Hcoo,7560
226
- evalscope/perf/plugin/datasets/__init__.py,sha256=9mz2TnVHhxbEKAS9pLbKMQuIoShNlZpGiRo9e2RQLUs,490
232
+ evalscope/perf/plugin/api/openai_api.py,sha256=DNDmW7jT0Abopw-K73X0PE7Vr2wTSKMBj79hJZTi-K8,7668
233
+ evalscope/perf/plugin/datasets/__init__.py,sha256=Z6Jc0RxJS_z0nBBV1-b0-56Ija60AtQ7I_67gY6ZfdQ,568
227
234
  evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
228
235
  evalscope/perf/plugin/datasets/custom.py,sha256=_GSC5yR90_BjcRjdJqrJT2vHQAzskz5XxYOxngUM2Pg,815
229
236
  evalscope/perf/plugin/datasets/flickr8k.py,sha256=UzAIFIO0m5inWOkWM1mO6wfV2HOuXAqiTxCJ4b0SiZM,1589
230
237
  evalscope/perf/plugin/datasets/line_by_line.py,sha256=IKVZMpKei6XW9DTm9VEssWHE96i1lTqMf0621dA_img,836
231
238
  evalscope/perf/plugin/datasets/longalpaca.py,sha256=2aENqCly_DX1dyNcurYsLFJIvXYFph6jWm7z7XETvMk,1176
232
- evalscope/perf/plugin/datasets/openqa.py,sha256=2pv7yyPSFYTjPhvAGBsHl0eQO8gt7Wk1CaKcfTi3Tnc,1394
233
- evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=ef8MXhT6756y6LsXSpYeWjmwswu2hRXe2BOVS2_OgVM,1968
239
+ evalscope/perf/plugin/datasets/openqa.py,sha256=_aVXs2s8wbmtoB6ZO-pNjUZvBVxRUYdoJDGv5-BumtI,1342
240
+ evalscope/perf/plugin/datasets/random_dataset.py,sha256=wPyY5kk2zKnc8u9uYEl-vQ6BLHeWbdC8EHEAZNFSDeU,2702
241
+ evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=J6q7AF_Re5eHLVejXEw9c1jlk1T1PPmist0yO9UFTPE,2432
234
242
  evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
235
243
  evalscope/perf/utils/analysis_result.py,sha256=ESzaZHGTpr2LoJR3jpOzqMphxSrr79d364ZzD159PmY,1169
236
244
  evalscope/perf/utils/benchmark_util.py,sha256=4TyQ_tE5odcjKDFDueI3jrC0vld6QxmTreOd5_SP4vE,5802
237
- evalscope/perf/utils/db_util.py,sha256=PSBq16uWyzXx0zyoEE4wazWKN19UAA8_GjobS7rTPso,9001
245
+ evalscope/perf/utils/db_util.py,sha256=OAaR9bK4SPfMuk41w1t4d7ljxPDDEZOzcwDn2s9bpz0,9052
238
246
  evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
239
247
  evalscope/perf/utils/local_server.py,sha256=clF8i0UFmaxBBB6gX05KvVCyzSv0xzsAidz0_sLLlAk,4627
240
248
  evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
@@ -310,24 +318,24 @@ evalscope/utils/utils.py,sha256=lGvn94ryIzx-7WLNJeuyehNTmINt0jYIjrjW12woPCs,9730
310
318
  tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
311
319
  tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
312
320
  tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
313
- tests/cli/test_all.py,sha256=1wwXtdjBmWYLhs5TXOJhZBwPm2qd9FYFqQSemXWKNUs,3865
321
+ tests/cli/test_all.py,sha256=tRC4TWaqxEsB6jMsGR7u9RHWHuKzn7Umt2XKY1V8CLU,4035
314
322
  tests/cli/test_collection.py,sha256=V-_M7ngwekMGqPuI16jjJZyAK2XLE4Z6QTn-8B5ykgU,4071
315
- tests/cli/test_run.py,sha256=LKWWxT0jaMLtcIl57vnXEFFlzbJpAplFqqwinvAHN8Y,15047
323
+ tests/cli/test_run.py,sha256=0gD0nPiioieaDOqRZkS5ruIWuiv1B5D456wSSHv9y40,16471
316
324
  tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
317
- tests/perf/test_perf.py,sha256=iB8Mg565SfwPsObdAByHYfZNqN71kUtPW7ucmyiOWo8,3025
325
+ tests/perf/test_perf.py,sha256=mfXTCsD9RaCef3b4CLvm8ErxBUaWzn-EKKhOxD65i3A,3817
318
326
  tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
319
327
  tests/rag/test_clip_benchmark.py,sha256=Ar8Br2CoAFYT2h4zCv_JKMKCGJKbKGYZgNwJ410ZaoU,2597
320
328
  tests/rag/test_mteb.py,sha256=t64FXE-ZsOCLiRJrw-dIDIhKd1OXiaglXaeERs0lOh4,4643
321
- tests/rag/test_ragas.py,sha256=N_mUBIyxdQ1REzjkoI2sBNluKLLmKatLc3VY1o9uPck,3947
329
+ tests/rag/test_ragas.py,sha256=fzpn4zZPeZ04ZdfLmwXbsSjf7WcjPWrGsA6RDNXgIEQ,4011
322
330
  tests/swift/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
323
331
  tests/swift/test_run_swift_eval.py,sha256=JKG-0BwTxkbg-XeiXxujPqnVIM3f2EFaJ_9a7p_R4dk,5748
324
332
  tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZUQhtYO5e0o,4898
325
333
  tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
326
334
  tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
327
335
  tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
328
- evalscope-0.13.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
329
- evalscope-0.13.0.dist-info/METADATA,sha256=0i3SENci2ws_vqdewQAxVUqan-MV1LwJoLLcEZ8ML7w,32870
330
- evalscope-0.13.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
331
- evalscope-0.13.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
332
- evalscope-0.13.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
333
- evalscope-0.13.0.dist-info/RECORD,,
336
+ evalscope-0.13.2.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
337
+ evalscope-0.13.2.dist-info/METADATA,sha256=b7rVRQHN5miovM5qlh4Dozpl8OaxO0rg0ctT-kDZMyY,32399
338
+ evalscope-0.13.2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
339
+ evalscope-0.13.2.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
340
+ evalscope-0.13.2.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
341
+ evalscope-0.13.2.dist-info/RECORD,,
tests/cli/test_all.py CHANGED
@@ -18,31 +18,34 @@ os.environ['LOG_LEVEL'] = 'DEBUG'
18
18
  logger = get_logger()
19
19
 
20
20
  datasets=[
21
- # 'iquiz',
22
- # 'ifeval',
23
- # 'mmlu',
24
- # 'mmlu_pro',
25
- # 'musr',
26
- # 'process_bench',
27
- # 'race',
28
- # 'trivia_qa',
29
- # 'cmmlu',
30
- # 'humaneval',
31
- # 'gsm8k',
32
- # 'bbh',
33
- # 'competition_math',
34
- # 'math_500',
35
- # 'aime24',
36
- # 'gpqa',
37
- # 'arc',
38
- # 'ceval',
39
- # 'hellaswag',
40
- # 'general_mcq',
41
- # 'general_qa',
21
+ 'iquiz',
22
+ 'ifeval',
23
+ 'mmlu',
24
+ 'mmlu_pro',
25
+ 'musr',
26
+ 'process_bench',
27
+ 'race',
28
+ 'trivia_qa',
29
+ 'cmmlu',
30
+ 'humaneval',
31
+ 'gsm8k',
32
+ 'bbh',
33
+ 'competition_math',
34
+ 'math_500',
35
+ 'aime24',
36
+ 'gpqa',
37
+ 'arc',
38
+ 'ceval',
39
+ 'hellaswag',
40
+ 'general_mcq',
41
+ 'general_qa',
42
42
  'super_gpqa',
43
43
  'live_code_bench',
44
+ 'mmlu_redux',
44
45
  'simple_qa',
45
46
  'chinese_simpleqa',
47
+ 'alpaca_eval',
48
+ 'arena_hard',
46
49
  ]
47
50
 
48
51
  dataset_args={
@@ -110,7 +113,13 @@ dataset_args={
110
113
  'start_date': '2024-12-01',
111
114
  'end_date': '2025-01-01'
112
115
  },
113
- }
116
+ },
117
+ 'chinese_simpleqa': {
118
+ 'subset_list': ['中华文化']
119
+ },
120
+ 'mmlu_redux':{
121
+ 'subset_list': ['abstract_algebra']
122
+ },
114
123
  }
115
124
 
116
125
  class TestRun(unittest.TestCase):
@@ -119,13 +128,13 @@ class TestRun(unittest.TestCase):
119
128
  from evalscope.config import TaskConfig
120
129
 
121
130
  task_cfg = TaskConfig(
122
- model='qwen2.5-7b-instruct',
131
+ model='qwen2.5-0.5b-instruct',
123
132
  api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
124
133
  api_key= env.get('DASHSCOPE_API_KEY'),
125
134
  eval_type=EvalType.SERVICE,
126
135
  datasets=datasets,
127
136
  dataset_args=dataset_args,
128
- eval_batch_size=32,
137
+ eval_batch_size=2,
129
138
  limit=2,
130
139
  stream=True,
131
140
  generation_config={