evalscope 0.13.1__py3-none-any.whl → 0.13.2__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.


Files changed (35)
  1. evalscope/arguments.py +1 -1
  2. evalscope/backend/rag_eval/utils/llm.py +4 -5
  3. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  4. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +109 -0
  5. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  6. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +120 -0
  7. evalscope/benchmarks/arena_hard/utils.py +162 -0
  8. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +2 -5
  9. evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -1
  10. evalscope/benchmarks/data_adapter.py +26 -2
  11. evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
  12. evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -11
  13. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -5
  14. evalscope/benchmarks/live_code_bench/testing_util.py +3 -3
  15. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  16. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +182 -0
  17. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +2 -5
  18. evalscope/config.py +1 -1
  19. evalscope/metrics/llm_judge.py +1 -1
  20. evalscope/models/chat_adapter.py +32 -11
  21. evalscope/perf/arguments.py +8 -6
  22. evalscope/perf/benchmark.py +31 -63
  23. evalscope/perf/plugin/api/openai_api.py +4 -2
  24. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  25. evalscope/perf/utils/db_util.py +2 -2
  26. evalscope/version.py +2 -2
  27. {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/METADATA +10 -49
  28. {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/RECORD +35 -28
  29. tests/cli/test_all.py +33 -24
  30. tests/cli/test_run.py +35 -18
  31. tests/rag/test_ragas.py +4 -1
  32. {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/LICENSE +0 -0
  33. {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/WHEEL +0 -0
  34. {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/entry_points.txt +0 -0
  35. {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/top_level.txt +0 -0
{evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.13.1
+ Version: 0.13.2
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
@@ -16,11 +16,8 @@ Classifier: Programming Language :: Python :: 3.10
  Requires-Python: >=3.8
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: absl-py
  Requires-Dist: accelerate
- Requires-Dist: cachetools
  Requires-Dist: datasets<=3.2.0,>=3.0.0
- Requires-Dist: editdistance
  Requires-Dist: immutabledict
  Requires-Dist: jieba
  Requires-Dist: jsonlines
@@ -31,34 +28,23 @@ Requires-Dist: modelscope[framework]
  Requires-Dist: nltk>=3.9
  Requires-Dist: openai
  Requires-Dist: pandas
- Requires-Dist: plotly
  Requires-Dist: pyarrow
- Requires-Dist: pympler
  Requires-Dist: pyyaml
- Requires-Dist: regex
  Requires-Dist: requests
- Requires-Dist: requests-toolbelt
  Requires-Dist: rouge-chinese
  Requires-Dist: rouge-score>=0.1.0
  Requires-Dist: sacrebleu
  Requires-Dist: scikit-learn
  Requires-Dist: seaborn
- Requires-Dist: sentencepiece
- Requires-Dist: simple-ddl-parser
  Requires-Dist: sympy
  Requires-Dist: tabulate
- Requires-Dist: tiktoken
  Requires-Dist: torch
  Requires-Dist: tqdm
  Requires-Dist: transformers>=4.33
- Requires-Dist: transformers-stream-generator
  Requires-Dist: word2number
  Provides-Extra: all
- Requires-Dist: absl-py; extra == "all"
  Requires-Dist: accelerate; extra == "all"
- Requires-Dist: cachetools; extra == "all"
  Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
- Requires-Dist: editdistance; extra == "all"
  Requires-Dist: immutabledict; extra == "all"
  Requires-Dist: jieba; extra == "all"
  Requires-Dist: jsonlines; extra == "all"
@@ -69,30 +55,26 @@ Requires-Dist: modelscope[framework]; extra == "all"
  Requires-Dist: nltk>=3.9; extra == "all"
  Requires-Dist: openai; extra == "all"
  Requires-Dist: pandas; extra == "all"
- Requires-Dist: plotly; extra == "all"
  Requires-Dist: pyarrow; extra == "all"
- Requires-Dist: pympler; extra == "all"
  Requires-Dist: pyyaml; extra == "all"
- Requires-Dist: regex; extra == "all"
  Requires-Dist: requests; extra == "all"
- Requires-Dist: requests-toolbelt; extra == "all"
  Requires-Dist: rouge-chinese; extra == "all"
  Requires-Dist: rouge-score>=0.1.0; extra == "all"
  Requires-Dist: sacrebleu; extra == "all"
  Requires-Dist: scikit-learn; extra == "all"
  Requires-Dist: seaborn; extra == "all"
- Requires-Dist: sentencepiece; extra == "all"
- Requires-Dist: simple-ddl-parser; extra == "all"
  Requires-Dist: sympy; extra == "all"
  Requires-Dist: tabulate; extra == "all"
- Requires-Dist: tiktoken; extra == "all"
  Requires-Dist: torch; extra == "all"
  Requires-Dist: tqdm; extra == "all"
  Requires-Dist: transformers>=4.33; extra == "all"
- Requires-Dist: transformers-stream-generator; extra == "all"
  Requires-Dist: word2number; extra == "all"
  Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
  Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
+ Requires-Dist: langchain<0.3.0; extra == "all"
+ Requires-Dist: langchain-community<0.3.0; extra == "all"
+ Requires-Dist: langchain-core<0.3.0; extra == "all"
+ Requires-Dist: langchain-openai<0.3.0; extra == "all"
  Requires-Dist: mteb==1.19.4; extra == "all"
  Requires-Dist: ragas==0.2.9; extra == "all"
  Requires-Dist: webdataset>0.2.0; extra == "all"
@@ -107,32 +89,6 @@ Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
  Provides-Extra: app
  Requires-Dist: gradio==5.4.0; extra == "app"
  Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
- Provides-Extra: inner
- Requires-Dist: absl-py; extra == "inner"
- Requires-Dist: accelerate; extra == "inner"
- Requires-Dist: alibaba-itag-sdk; extra == "inner"
- Requires-Dist: dashscope; extra == "inner"
- Requires-Dist: editdistance; extra == "inner"
- Requires-Dist: jsonlines; extra == "inner"
- Requires-Dist: nltk; extra == "inner"
- Requires-Dist: openai; extra == "inner"
- Requires-Dist: pandas==1.5.3; extra == "inner"
- Requires-Dist: plotly; extra == "inner"
- Requires-Dist: pyarrow; extra == "inner"
- Requires-Dist: pyodps; extra == "inner"
- Requires-Dist: pyyaml; extra == "inner"
- Requires-Dist: regex; extra == "inner"
- Requires-Dist: requests==2.28.1; extra == "inner"
- Requires-Dist: requests-toolbelt==0.10.1; extra == "inner"
- Requires-Dist: rouge-score; extra == "inner"
- Requires-Dist: sacrebleu; extra == "inner"
- Requires-Dist: scikit-learn; extra == "inner"
- Requires-Dist: seaborn; extra == "inner"
- Requires-Dist: simple-ddl-parser; extra == "inner"
- Requires-Dist: streamlit; extra == "inner"
- Requires-Dist: tqdm; extra == "inner"
- Requires-Dist: transformers<4.43,>=4.33; extra == "inner"
- Requires-Dist: transformers-stream-generator; extra == "inner"
  Provides-Extra: opencompass
  Requires-Dist: ms-opencompass>=0.1.4; extra == "opencompass"
  Provides-Extra: perf
@@ -143,6 +99,10 @@ Requires-Dist: sse-starlette; extra == "perf"
  Requires-Dist: transformers; extra == "perf"
  Requires-Dist: unicorn; extra == "perf"
  Provides-Extra: rag
+ Requires-Dist: langchain<0.3.0; extra == "rag"
+ Requires-Dist: langchain-community<0.3.0; extra == "rag"
+ Requires-Dist: langchain-core<0.3.0; extra == "rag"
+ Requires-Dist: langchain-openai<0.3.0; extra == "rag"
  Requires-Dist: mteb==1.19.4; extra == "rag"
  Requires-Dist: ragas==0.2.9; extra == "rag"
  Requires-Dist: webdataset>0.2.0; extra == "rag"
@@ -239,6 +199,7 @@ Please scan the QR code below to join our community groups:

  ## 🎉 News

+ - 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
  - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
  - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
  - 🔥 **[2025.03.11]** Added support for the [SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/SimpleQA/summary) and [Chinese SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/Chinese-SimpleQA/summary) evaluation benchmarks. These are used to assess the factual accuracy of models, and you can specify `simple_qa` and `chinese_simpleqa` for use. Support for specifying a judge model is also available. For more details, refer to the [relevant parameter documentation](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html).
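The news entries above announce the new AlpacaEval and ArenaHard benchmarks, and the RECORD diff below also shows a new MMLU-Redux adapter. As a rough, hedged sketch of how these are driven, the updated tests/cli/test_run.py and tests/cli/test_all.py in this release wire them through TaskConfig roughly as follows; the model name, endpoint, API key and limits below are placeholders, not values taken from this diff:

```python
# Minimal sketch modeled on tests/cli/test_run.py and tests/cli/test_all.py in this release.
# Model, endpoint and API key are placeholders; the EvalType/JudgeStrategy import path is assumed
# to be evalscope/constants.py, which ships in this wheel.
from evalscope.config import TaskConfig
from evalscope.constants import EvalType, JudgeStrategy
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='qwen2.5-0.5b-instruct',                        # any model served by the endpoint
    api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
    api_key='<DASHSCOPE_API_KEY>',
    eval_type=EvalType.SERVICE,                           # evaluate a served model, as in the tests
    datasets=['alpaca_eval', 'arena_hard', 'mmlu_redux'],  # benchmarks added in 0.13.2
    dataset_args={'mmlu_redux': {'subset_list': ['abstract_algebra']}},
    judge_strategy=JudgeStrategy.AUTO,                    # AlpacaEval/ArenaHard are scored by an LLM judge
    judge_model_args={
        'model_id': 'qwen2.5-7b-instruct',
        'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
        'api_key': '<DASHSCOPE_API_KEY>',
    },
    limit=10,                                             # small sample, as the tests do
)

run_task(task_cfg=task_cfg)
```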
{evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/RECORD CHANGED
@@ -1,11 +1,11 @@
  evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
- evalscope/arguments.py,sha256=VhZd7a8PoZK01qFCMEADLINqLYi6njRqRb50iR1l1lo,5241
- evalscope/config.py,sha256=wLrc8a7z28IFPRaeUzot5HGtSDY_13KR-3kRyFKEGx8,9476
+ evalscope/arguments.py,sha256=OPYmX_ar7rXFm0ETPuE2hs-knDQtwQ0pFwSazjn3S9Q,5241
+ evalscope/config.py,sha256=CkNBE83S335iyu0VRMkblaJw5nGM8pXv4NhK5ySE3cs,9476
  evalscope/constants.py,sha256=Cgzkoz4R3MC3YLtbCM2fmSwF8Z2kuxYdOC8t9FWJj9w,3740
  evalscope/run.py,sha256=LUCdnNzNIfHSWvxu3gxAsHEDX7hT5mcVnV4lSY5h0iA,6007
  evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
  evalscope/summarizer.py,sha256=Wnt8Y61PasOkigo8zMpi1qdFzfETjfnDtCDDQ6VwgDw,5867
- evalscope/version.py,sha256=Y30-zF2dwch3upMc0t5yNNjIgvI-LQQWFhftRQgXvOk,119
+ evalscope/version.py,sha256=JzXnfz-D9eKhVPZu2TQUPFaTFhRiZ3iK4jcIuxfnQE8,119
  evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
  evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
@@ -50,21 +50,26 @@ evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=aP8U9zjIDl26X_
  evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  evalscope/backend/rag_eval/utils/clip.py,sha256=GLHhPCac2AH35AvRLvVqePA1gIMAewHTFmCJCDZzvqU,5015
  evalscope/backend/rag_eval/utils/embedding.py,sha256=x9HAEfZSSAnT2Tdbf-9a5UmBVagCr__ay5A2nMCPMpg,6258
- evalscope/backend/rag_eval/utils/llm.py,sha256=IaNgdQBnURAmtpK5UPDqfCNrtV_J3wu0s4JWQqKedHA,2568
+ evalscope/backend/rag_eval/utils/llm.py,sha256=UIfdvkxVViYkIpX-MoM8sAwGEAozzVFyzX-YoFxXC1E,2607
  evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KBNeXiwQw5SpJA,1529
  evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
  evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
  evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
  evalscope/benchmarks/__init__.py,sha256=b_SWdV1ZyOqFiwc_9lIjKrIvK1rwnF2cCIF7XN9CN8E,932
  evalscope/benchmarks/benchmark.py,sha256=a_7Ctz36McuTyBSTYi56jis9pvOdWhg7JVSPFrbxqR4,2535
- evalscope/benchmarks/data_adapter.py,sha256=2u9oC4RBHVfEMHKPRu87xM4XOw_RS2Z2fvagNsciEo4,16791
+ evalscope/benchmarks/data_adapter.py,sha256=UvbJJTNBvA0aM-xmsaj9jEEsNksn9pTDDr90FfFX2pg,17606
  evalscope/benchmarks/utils.py,sha256=6kxeBz4w8Fw68AYH05a4ncjgkaUV4bU3eaFVLqOdkMI,1321
  evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  evalscope/benchmarks/aime/aime24_adapter.py,sha256=dBm9yukt4-CByEPUlAPAIN6mL3VkZcI-dw2kz4oQBMo,1715
  evalscope/benchmarks/aime/aime25_adapter.py,sha256=FB_NufY2V7uYdxVnrY_4y81gyyfYDnvedz1_zHdDWt4,1709
+ evalscope/benchmarks/alpaca_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=2a6wHJSLe89Xh18u1LBkMQEZzfOURiek6o0-k2lCQgM,4065
  evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
  evalscope/benchmarks/arc/arc_adapter.py,sha256=lkhDz-DYjPQ1vHzo8X4j-0Lq_rBxAnws35_R00pIbNI,6347
+ evalscope/benchmarks/arena_hard/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ evalscope/benchmarks/arena_hard/arena_hard_adapter.py,sha256=bdQfLTWB5pFo4hET0uFqu5zMX9PNQNwdoLoGrL5jCBE,6213
+ evalscope/benchmarks/arena_hard/utils.py,sha256=NstI1VR5fTaT-bfXRj0cLqm0DtH8EY4EQHR-K9HJubI,5089
  evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/benchmarks/bbh/bbh_adapter.py,sha256=fROpzenrjpEBWtnvM_RL_m0uXPOhXTtYAglJEZbzUdY,8330
  evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
@@ -98,20 +103,20 @@ evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTP
  evalscope/benchmarks/ceval/ceval_adapter.py,sha256=E4QobCjSSkMZtPJyaT_XBVxiqEqa1bta1I9aFnaHOqs,11308
  evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
  evalscope/benchmarks/chinese_simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=nKF_a0yc_PbZYjYA_-gJh3ePZIEz5txrhDV4IsTqD4Q,8196
+ evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=fYvkJn1UcWM3aqhPMTTtBPVzjTL-Rm_g9UwUJx1FvJc,8106
  evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
  evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=TTq2jRz46Hqc_D_ZBaiw_OwKub1FZX6w8C7g7COIdGs,10372
  evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
  evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
- evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=F2YCaNDn49X82l06WlLFp2OPFB7nv0ecW40099I9iSE,6871
+ evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=lD7sDro0dSWKgYaM_ZgWbBdetxVURpjo_2q1gvVt1XU,6815
  evalscope/benchmarks/data_collection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=U4M-0MVJS3-z03YW8nafooFJ7x60e5uEpBO5z_c7zk8,2450
+ evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=ecNwAE3p2eKIeC4whSUdZpeJ8NgidbSFZbIYtSW26Xo,2394
  evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=ZVGzUuuQ0UTOqQtXE40ZyBeMOSl8saSiFEQ5_siJ-c8,5052
  evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
- evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=ELDdS5T3JZeSWVv1ldawcHzLwAljEWKqakbRMVcBvgw,4741
+ evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=KBZDP1T-t7uu8vBLGL_unVdj7rDko3KWBPKqWlw31JQ,4596
  evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
  evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=UB287DtnbkSQXZsbReFJqmQRwbo672DTCeXXilR_-Vc,4790
@@ -125,7 +130,7 @@ evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0
  evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
  evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=UOjakV31J0g7TYbrRls0ItcopWOJu54ucPfaqSJB7Os,5250
  evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=R7MILWuMglvXr7yWioBxyJ2T4EdEkwRZ1lnvWqZqG28,1922
+ evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=xuQ1EK8Af_093qqeOXPIp_iqTWcG5KGOtE6r5hx3958,1858
  evalscope/benchmarks/ifeval/instructions.py,sha256=oaJ9D_4rvS67BraHBNPpDtFd4TblFAnR4A3YW9HWfuY,56304
  evalscope/benchmarks/ifeval/instructions_registry.py,sha256=tVUmhuSwnOidLtI8onOAw_gpJ6bi8FL07GiX19hSuo8,7288
  evalscope/benchmarks/ifeval/instructions_util.py,sha256=vkemXeylJMmgW8LgfQe4cSy2OF-oH_NcSZtzyZDURW4,25780
@@ -140,7 +145,7 @@ evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=8MOECcweL
  evalscope/benchmarks/live_code_bench/load_utils.py,sha256=5i9wtdPLYR8ckjx5MaYQVC2LFYvjKzR6Fa6UZmeOTRc,2445
  evalscope/benchmarks/live_code_bench/pass_k_utils.py,sha256=Ktrp_lXdfFzoHtQNQNdGfIl26ySjaPCHm4Zv-dFvRqM,2024
  evalscope/benchmarks/live_code_bench/prompts.py,sha256=P4KILIAIDT1MKDck0xHYV_6v9820wDZRhxVMazmlL-g,12600
- evalscope/benchmarks/live_code_bench/testing_util.py,sha256=EBe0XzY3B4cW5dCjwLksW7o4R1chZwsuFjxkfqVPFI4,28238
+ evalscope/benchmarks/live_code_bench/testing_util.py,sha256=s5oa--dOcugcpBmHsbeqnTRTDhdiCNXkIQuRc6EgA8o,28241
  evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  evalscope/benchmarks/math_500/math_500_adapter.py,sha256=SB2eb4Z7DTXdptqirEoctqTdDLEu28s7bLeCAMBmAFo,1923
  evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
@@ -149,6 +154,8 @@ evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=e__Evar99V9l65FlzT6T594CN4iMgmu
  evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
  evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=OANfue-fK543drJrDj6V_zDMtySrQEBHPgTsejr-e7U,4226
+ evalscope/benchmarks/mmlu_redux/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py,sha256=ZZMy9exJ8hknr1D6s73sAhHHzBAKcqo7WAmlUtPqpCI,9556
  evalscope/benchmarks/musr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  evalscope/benchmarks/musr/musr_adapter.py,sha256=Po8hcIQiqlFo0AGjcNQe75cpsMNDcfiJaKgZsk33-DY,2442
  evalscope/benchmarks/process_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -159,7 +166,7 @@ evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO
  evalscope/benchmarks/race/race_adapter.py,sha256=RD0B-i5dzeNKuhqnWbremgf4tk9jmOO4_eLAiITB1F0,6381
  evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
  evalscope/benchmarks/simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=SrK18xDe4HyUaIPRLVEDtoF4Nc_ms4aFxktEsj8MnnA,9071
+ evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=CsRUJ0v1sSUmtO6QWkdzisn9OHN-1JSXB-9ghOuNqgY,8988
  evalscope/benchmarks/super_gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  evalscope/benchmarks/super_gpqa/five_shot_prompt.txt,sha256=vD3RMeQustxY_oWA8IobntjywT8ZUO7Jaub--rElDT4,4718
  evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=BqNLL8BYnK6tRuIdV6ijL4Uym2SejH_h1BV06XNjSE4,9331
@@ -190,7 +197,7 @@ evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0Fw
  evalscope/evaluator/reviewer/auto_reviewer.py,sha256=PDz1Nt_qq0oGZBBwek2-M8VBUNLkOkmYObzR8gO7nEc,16624
  evalscope/metrics/__init__.py,sha256=SWvqzUzdryW5URz6u4fPkP9XSyA09nQ8zBeE8BbchSg,349
  evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
- evalscope/metrics/llm_judge.py,sha256=g9pLMJPNTUyw0sGteblws1_e_KzbRqcbqKcaIzfE_DE,4031
+ evalscope/metrics/llm_judge.py,sha256=Di0Q1c6VHLl0nQ_TVOZOOQlMApDIU83HuDPTOV8XrTA,4023
  evalscope/metrics/math_parser.py,sha256=uTDudn305G3b8-GboWTrDE6OfrEwAW-areHnoGXZ6Is,17302
  evalscope/metrics/metrics.py,sha256=_YI7RhxlFu_JOgeE3LF9UKu6mJruvyu4FgqVf78Bjb8,13813
  evalscope/metrics/named_metrics.py,sha256=pSHA2_qdi9B5bDHIh08GYhx63odilSwA_T-95K1Usl0,1380
@@ -201,7 +208,7 @@ evalscope/metrics/resources/gpt2-zhcn3-v4.bpe,sha256=J_K-oSt9usFEw87Av6j7ETn3J48
  evalscope/metrics/resources/gpt2-zhcn3-v4.json,sha256=WkM4J_FDPPNQwYi0kj5sM5SVjk2_6bci7tqf8dV9p_U,1289341
  evalscope/models/__init__.py,sha256=i9vcOBMEF_UM7C2gpmh2GsQk3njwqevoQ6A4CnP1fHs,1000
  evalscope/models/base_adapter.py,sha256=7PbRwfD5PIZCBYVds6ZHI8TBY9C5i2LdPOTu88FJWlY,3414
- evalscope/models/chat_adapter.py,sha256=5-yz7L41OdeBO9J_qRkEZcduATrYIMe__UFfh7BzjIc,6277
+ evalscope/models/chat_adapter.py,sha256=2XZmdhxnvy4yezPLXNVRbgrs0QkUY2VznEBq5mCYjKs,7106
  evalscope/models/choice_adapter.py,sha256=fnJdo-FMJ-zvNLbEJGc73odgWXIxtVudL00JIf2vzsA,8239
  evalscope/models/custom_adapter.py,sha256=AGztmZ0aT0g2flh4B4NaiZ8LCDg8tT0gVNxmrP5W1mA,2401
  evalscope/models/local_model.py,sha256=yydggBCLcBAmUWbBhv7o2CA3RbG0DwDZharPdrkbNcg,2628
@@ -212,8 +219,8 @@ evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk
  evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
  evalscope/models/custom/dummy_model.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
  evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- evalscope/perf/arguments.py,sha256=hBR6TXCoLkHRLxrwXacmierfFZhyQaT5hnKAfp-vE6I,8990
- evalscope/perf/benchmark.py,sha256=VYcFhSoZXcLoNXpFYxOFxLbBLv_8Tn74Qklim7vELCM,9889
+ evalscope/perf/arguments.py,sha256=srDp3JMYIPZxkfua5WHkjq3G8lJlTtxdXKxE_CivoJk,9156
+ evalscope/perf/benchmark.py,sha256=qY7zrsZMDBr1fABsShXjgK12tNE7PhzGZdLaUtdtxU8,8318
  evalscope/perf/http_client.py,sha256=xMakdQkJ2cgIOd-yOmHEW0vbGKTJ0JWhLFt9IFtUP8Q,7473
  evalscope/perf/main.py,sha256=w-yDbl0osaTAMgC-JNPpqIq2LQ7U4c-Ht7Amj8Nbjc8,1278
  evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
@@ -222,7 +229,7 @@ evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2m
  evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqYbndU,2118
  evalscope/perf/plugin/api/custom_api.py,sha256=ay1AGi4y2opjwyRl0J0A54-vLB-pBj3QBFkzog0KA-g,3787
  evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
- evalscope/perf/plugin/api/openai_api.py,sha256=KQRQMOfQceKQtrvTE-SyhNHcDoGuQ0900yh7r74Hcoo,7560
+ evalscope/perf/plugin/api/openai_api.py,sha256=DNDmW7jT0Abopw-K73X0PE7Vr2wTSKMBj79hJZTi-K8,7668
  evalscope/perf/plugin/datasets/__init__.py,sha256=Z6Jc0RxJS_z0nBBV1-b0-56Ija60AtQ7I_67gY6ZfdQ,568
  evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
  evalscope/perf/plugin/datasets/custom.py,sha256=_GSC5yR90_BjcRjdJqrJT2vHQAzskz5XxYOxngUM2Pg,815
@@ -231,11 +238,11 @@ evalscope/perf/plugin/datasets/line_by_line.py,sha256=IKVZMpKei6XW9DTm9VEssWHE96
  evalscope/perf/plugin/datasets/longalpaca.py,sha256=2aENqCly_DX1dyNcurYsLFJIvXYFph6jWm7z7XETvMk,1176
  evalscope/perf/plugin/datasets/openqa.py,sha256=_aVXs2s8wbmtoB6ZO-pNjUZvBVxRUYdoJDGv5-BumtI,1342
  evalscope/perf/plugin/datasets/random_dataset.py,sha256=wPyY5kk2zKnc8u9uYEl-vQ6BLHeWbdC8EHEAZNFSDeU,2702
- evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=ef8MXhT6756y6LsXSpYeWjmwswu2hRXe2BOVS2_OgVM,1968
+ evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=J6q7AF_Re5eHLVejXEw9c1jlk1T1PPmist0yO9UFTPE,2432
  evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  evalscope/perf/utils/analysis_result.py,sha256=ESzaZHGTpr2LoJR3jpOzqMphxSrr79d364ZzD159PmY,1169
  evalscope/perf/utils/benchmark_util.py,sha256=4TyQ_tE5odcjKDFDueI3jrC0vld6QxmTreOd5_SP4vE,5802
- evalscope/perf/utils/db_util.py,sha256=hRXixxpNBrACF43reOJV5SoO1vj34cqoNMaTKH_oLLE,9100
+ evalscope/perf/utils/db_util.py,sha256=OAaR9bK4SPfMuk41w1t4d7ljxPDDEZOzcwDn2s9bpz0,9052
  evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
  evalscope/perf/utils/local_server.py,sha256=clF8i0UFmaxBBB6gX05KvVCyzSv0xzsAidz0_sLLlAk,4627
  evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
@@ -311,24 +318,24 @@ evalscope/utils/utils.py,sha256=lGvn94ryIzx-7WLNJeuyehNTmINt0jYIjrjW12woPCs,9730
  tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
  tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
- tests/cli/test_all.py,sha256=1wwXtdjBmWYLhs5TXOJhZBwPm2qd9FYFqQSemXWKNUs,3865
+ tests/cli/test_all.py,sha256=tRC4TWaqxEsB6jMsGR7u9RHWHuKzn7Umt2XKY1V8CLU,4035
  tests/cli/test_collection.py,sha256=V-_M7ngwekMGqPuI16jjJZyAK2XLE4Z6QTn-8B5ykgU,4071
- tests/cli/test_run.py,sha256=Gk8uCT0IjDSf2sf-TXeQFV83ovNzRs4GcAkQ1DhRJEU,15929
+ tests/cli/test_run.py,sha256=0gD0nPiioieaDOqRZkS5ruIWuiv1B5D456wSSHv9y40,16471
  tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  tests/perf/test_perf.py,sha256=mfXTCsD9RaCef3b4CLvm8ErxBUaWzn-EKKhOxD65i3A,3817
  tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  tests/rag/test_clip_benchmark.py,sha256=Ar8Br2CoAFYT2h4zCv_JKMKCGJKbKGYZgNwJ410ZaoU,2597
  tests/rag/test_mteb.py,sha256=t64FXE-ZsOCLiRJrw-dIDIhKd1OXiaglXaeERs0lOh4,4643
- tests/rag/test_ragas.py,sha256=N_mUBIyxdQ1REzjkoI2sBNluKLLmKatLc3VY1o9uPck,3947
+ tests/rag/test_ragas.py,sha256=fzpn4zZPeZ04ZdfLmwXbsSjf7WcjPWrGsA6RDNXgIEQ,4011
  tests/swift/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  tests/swift/test_run_swift_eval.py,sha256=JKG-0BwTxkbg-XeiXxujPqnVIM3f2EFaJ_9a7p_R4dk,5748
  tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZUQhtYO5e0o,4898
  tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
  tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
- evalscope-0.13.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
- evalscope-0.13.1.dist-info/METADATA,sha256=luYebd_U93wnTkXcv_MYPfd9-JRz51DjWB6Bh6phspU,33546
- evalscope-0.13.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
- evalscope-0.13.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
- evalscope-0.13.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
- evalscope-0.13.1.dist-info/RECORD,,
+ evalscope-0.13.2.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
+ evalscope-0.13.2.dist-info/METADATA,sha256=b7rVRQHN5miovM5qlh4Dozpl8OaxO0rg0ctT-kDZMyY,32399
+ evalscope-0.13.2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ evalscope-0.13.2.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
+ evalscope-0.13.2.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
+ evalscope-0.13.2.dist-info/RECORD,,
tests/cli/test_all.py CHANGED
@@ -18,31 +18,34 @@ os.environ['LOG_LEVEL'] = 'DEBUG'
  logger = get_logger()

  datasets=[
- # 'iquiz',
- # 'ifeval',
- # 'mmlu',
- # 'mmlu_pro',
- # 'musr',
- # 'process_bench',
- # 'race',
- # 'trivia_qa',
- # 'cmmlu',
- # 'humaneval',
- # 'gsm8k',
- # 'bbh',
- # 'competition_math',
- # 'math_500',
- # 'aime24',
- # 'gpqa',
- # 'arc',
- # 'ceval',
- # 'hellaswag',
- # 'general_mcq',
- # 'general_qa',
+ 'iquiz',
+ 'ifeval',
+ 'mmlu',
+ 'mmlu_pro',
+ 'musr',
+ 'process_bench',
+ 'race',
+ 'trivia_qa',
+ 'cmmlu',
+ 'humaneval',
+ 'gsm8k',
+ 'bbh',
+ 'competition_math',
+ 'math_500',
+ 'aime24',
+ 'gpqa',
+ 'arc',
+ 'ceval',
+ 'hellaswag',
+ 'general_mcq',
+ 'general_qa',
  'super_gpqa',
  'live_code_bench',
+ 'mmlu_redux',
  'simple_qa',
  'chinese_simpleqa',
+ 'alpaca_eval',
+ 'arena_hard',
  ]

  dataset_args={
@@ -110,7 +113,13 @@ dataset_args={
  'start_date': '2024-12-01',
  'end_date': '2025-01-01'
  },
- }
+ },
+ 'chinese_simpleqa': {
+ 'subset_list': ['中华文化']
+ },
+ 'mmlu_redux':{
+ 'subset_list': ['abstract_algebra']
+ },
  }

  class TestRun(unittest.TestCase):
@@ -119,13 +128,13 @@ class TestRun(unittest.TestCase):
  from evalscope.config import TaskConfig

  task_cfg = TaskConfig(
- model='qwen2.5-7b-instruct',
+ model='qwen2.5-0.5b-instruct',
  api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
  api_key= env.get('DASHSCOPE_API_KEY'),
  eval_type=EvalType.SERVICE,
  datasets=datasets,
  dataset_args=dataset_args,
- eval_batch_size=32,
+ eval_batch_size=2,
  limit=2,
  stream=True,
  generation_config={
tests/cli/test_run.py CHANGED
@@ -207,11 +207,12 @@ class TestRun(unittest.TestCase):
  from evalscope.config import TaskConfig

  task_cfg = TaskConfig(
- model='qwen/Qwen2-0.5B-Instruct',
+ model='Qwen/Qwen2.5-0.5B-Instruct',
  datasets=[
+ 'iquiz',
  # 'math_500',
  # 'aime24',
- 'competition_math'
+ # 'competition_math'
  ],
  dataset_args={
  'competition_math': {
@@ -255,7 +256,7 @@ class TestRun(unittest.TestCase):
  from evalscope.config import TaskConfig

  task_cfg = TaskConfig(
- model='qwen2.5-7b-instruct',
+ model='qwen-plus',
  api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
  api_key= env.get('DASHSCOPE_API_KEY'),
  eval_type=EvalType.SERVICE,
@@ -278,10 +279,11 @@ class TestRun(unittest.TestCase):
  # 'gpqa',
  # 'arc',
  # 'ceval',
- 'hellaswag',
+ # 'hellaswag',
  # 'general_mcq',
- # 'general_qa'
+ 'general_qa'
  # 'super_gpqa',
+ # 'mmlu_redux'
  ],
  dataset_args={
  'mmlu': {
@@ -335,23 +337,26 @@ class TestRun(unittest.TestCase):
  'example', # 评测数据集名称,上述 *_dev.csv 中的 *
  # 'test'
  ],
- 'metric_list': ['AverageBLEU']
+ 'metric_list': ['AverageRouge']
  },
  'super_gpqa': {
  # 'subset_list': ['Philosophy', 'Education'],
  'few_shot_num': 0
- }
+ },
+ 'mmlu_redux':{
+ 'subset_list': ['abstract_algebra']
+ },
  },
  eval_batch_size=32,
  limit=15,
- # debug=True,
+ debug=True,
  stream=False,
  generation_config={
  'temperature': 0,
- 'n': 1,
+ 'n': 2,
  'max_tokens': 4096,
  },
- # use_cache='./outputs/20250212_150525',
+ use_cache='outputs/20250326_202848',
  )

  run_task(task_cfg=task_cfg)
@@ -392,7 +397,7 @@ class TestRun(unittest.TestCase):
  from evalscope.config import TaskConfig

  task_cfg = TaskConfig(
- model='qwq-32b',
+ model='qwen2.5-0.5b-instruct',
  api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
  api_key= env.get('DASHSCOPE_API_KEY'),
  eval_type=EvalType.SERVICE,
@@ -404,10 +409,12 @@ class TestRun(unittest.TestCase):
  # 'gsm8k'
  # 'truthful_qa',
  # 'simple_qa',
- # # 'chinese_simpleqa',
- 'live_code_bench',
- # 'humaneval'
- # 'general_qa'
+ # 'chinese_simpleqa',
+ # 'live_code_bench',
+ # 'humaneval',
+ # 'general_qa',
+ # 'alpaca_eval',
+ 'arena_hard'
  ],
  dataset_args={
  'competition_math': {
@@ -427,20 +434,30 @@ class TestRun(unittest.TestCase):
  # 'test'
  ]
  },
+ 'chinese_simpleqa': {
+ 'subset_list': [
+ '中华文化'
+ ]
+ },
  },
- eval_batch_size=10,
- # limit=5,
+ eval_batch_size=5,
+ limit=10,
  judge_strategy=JudgeStrategy.AUTO,
- judge_worker_num=8,
+ judge_worker_num=5,
  judge_model_args={
  'model_id': 'qwen2.5-7b-instruct',
  'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
  'api_key': env.get('DASHSCOPE_API_KEY'),
+ 'generation_config': {
+ 'temperature': 0.0,
+ 'max_tokens': 4096
+ }
  },
  generation_config={
  'max_new_tokens': 20000,
  'temperature': 0.0,
  'seed': 42,
+ 'n': 1
  },
  timeout=60000,
  stream=True,
tests/rag/test_ragas.py CHANGED
@@ -1,5 +1,8 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
  import os
+ from dotenv import dotenv_values
+
+ env = dotenv_values('.env')
  import unittest

  from evalscope.run import run_task
@@ -63,7 +66,7 @@ class TestRAGAS(unittest.TestCase):
  'eval': {
  'testset_file': 'outputs/testset_chinese_with_answer.json',
  'critic_llm': {
- 'model_name_or_path': 'qwen/Qwen2-7B-Instruct',
+ 'model_name_or_path': 'Qwen/Qwen2.5-7B-Instruct',
  },
  'embeddings': {
  'model_name_or_path': 'AI-ModelScope/m3e-base',