evalscope 0.13.1__py3-none-any.whl → 0.13.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/arguments.py +1 -1
- evalscope/backend/rag_eval/utils/llm.py +4 -5
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +109 -0
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +120 -0
- evalscope/benchmarks/arena_hard/utils.py +162 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +2 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -1
- evalscope/benchmarks/data_adapter.py +26 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -11
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -5
- evalscope/benchmarks/live_code_bench/testing_util.py +3 -3
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +182 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +2 -5
- evalscope/config.py +1 -1
- evalscope/metrics/llm_judge.py +1 -1
- evalscope/models/chat_adapter.py +32 -11
- evalscope/perf/arguments.py +8 -6
- evalscope/perf/benchmark.py +31 -63
- evalscope/perf/plugin/api/openai_api.py +4 -2
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/utils/db_util.py +2 -2
- evalscope/version.py +2 -2
- {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/METADATA +10 -49
- {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/RECORD +35 -28
- tests/cli/test_all.py +33 -24
- tests/cli/test_run.py +35 -18
- tests/rag/test_ragas.py +4 -1
- {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/LICENSE +0 -0
- {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/WHEEL +0 -0
- {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/entry_points.txt +0 -0
- {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: evalscope
|
|
3
|
-
Version: 0.13.
|
|
3
|
+
Version: 0.13.2
|
|
4
4
|
Summary: EvalScope: Lightweight LLMs Evaluation Framework
|
|
5
5
|
Home-page: https://github.com/modelscope/evalscope
|
|
6
6
|
Author: ModelScope team
|
|
@@ -16,11 +16,8 @@ Classifier: Programming Language :: Python :: 3.10
|
|
|
16
16
|
Requires-Python: >=3.8
|
|
17
17
|
Description-Content-Type: text/markdown
|
|
18
18
|
License-File: LICENSE
|
|
19
|
-
Requires-Dist: absl-py
|
|
20
19
|
Requires-Dist: accelerate
|
|
21
|
-
Requires-Dist: cachetools
|
|
22
20
|
Requires-Dist: datasets<=3.2.0,>=3.0.0
|
|
23
|
-
Requires-Dist: editdistance
|
|
24
21
|
Requires-Dist: immutabledict
|
|
25
22
|
Requires-Dist: jieba
|
|
26
23
|
Requires-Dist: jsonlines
|
|
@@ -31,34 +28,23 @@ Requires-Dist: modelscope[framework]
|
|
|
31
28
|
Requires-Dist: nltk>=3.9
|
|
32
29
|
Requires-Dist: openai
|
|
33
30
|
Requires-Dist: pandas
|
|
34
|
-
Requires-Dist: plotly
|
|
35
31
|
Requires-Dist: pyarrow
|
|
36
|
-
Requires-Dist: pympler
|
|
37
32
|
Requires-Dist: pyyaml
|
|
38
|
-
Requires-Dist: regex
|
|
39
33
|
Requires-Dist: requests
|
|
40
|
-
Requires-Dist: requests-toolbelt
|
|
41
34
|
Requires-Dist: rouge-chinese
|
|
42
35
|
Requires-Dist: rouge-score>=0.1.0
|
|
43
36
|
Requires-Dist: sacrebleu
|
|
44
37
|
Requires-Dist: scikit-learn
|
|
45
38
|
Requires-Dist: seaborn
|
|
46
|
-
Requires-Dist: sentencepiece
|
|
47
|
-
Requires-Dist: simple-ddl-parser
|
|
48
39
|
Requires-Dist: sympy
|
|
49
40
|
Requires-Dist: tabulate
|
|
50
|
-
Requires-Dist: tiktoken
|
|
51
41
|
Requires-Dist: torch
|
|
52
42
|
Requires-Dist: tqdm
|
|
53
43
|
Requires-Dist: transformers>=4.33
|
|
54
|
-
Requires-Dist: transformers-stream-generator
|
|
55
44
|
Requires-Dist: word2number
|
|
56
45
|
Provides-Extra: all
|
|
57
|
-
Requires-Dist: absl-py; extra == "all"
|
|
58
46
|
Requires-Dist: accelerate; extra == "all"
|
|
59
|
-
Requires-Dist: cachetools; extra == "all"
|
|
60
47
|
Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
|
|
61
|
-
Requires-Dist: editdistance; extra == "all"
|
|
62
48
|
Requires-Dist: immutabledict; extra == "all"
|
|
63
49
|
Requires-Dist: jieba; extra == "all"
|
|
64
50
|
Requires-Dist: jsonlines; extra == "all"
|
|
@@ -69,30 +55,26 @@ Requires-Dist: modelscope[framework]; extra == "all"
|
|
|
69
55
|
Requires-Dist: nltk>=3.9; extra == "all"
|
|
70
56
|
Requires-Dist: openai; extra == "all"
|
|
71
57
|
Requires-Dist: pandas; extra == "all"
|
|
72
|
-
Requires-Dist: plotly; extra == "all"
|
|
73
58
|
Requires-Dist: pyarrow; extra == "all"
|
|
74
|
-
Requires-Dist: pympler; extra == "all"
|
|
75
59
|
Requires-Dist: pyyaml; extra == "all"
|
|
76
|
-
Requires-Dist: regex; extra == "all"
|
|
77
60
|
Requires-Dist: requests; extra == "all"
|
|
78
|
-
Requires-Dist: requests-toolbelt; extra == "all"
|
|
79
61
|
Requires-Dist: rouge-chinese; extra == "all"
|
|
80
62
|
Requires-Dist: rouge-score>=0.1.0; extra == "all"
|
|
81
63
|
Requires-Dist: sacrebleu; extra == "all"
|
|
82
64
|
Requires-Dist: scikit-learn; extra == "all"
|
|
83
65
|
Requires-Dist: seaborn; extra == "all"
|
|
84
|
-
Requires-Dist: sentencepiece; extra == "all"
|
|
85
|
-
Requires-Dist: simple-ddl-parser; extra == "all"
|
|
86
66
|
Requires-Dist: sympy; extra == "all"
|
|
87
67
|
Requires-Dist: tabulate; extra == "all"
|
|
88
|
-
Requires-Dist: tiktoken; extra == "all"
|
|
89
68
|
Requires-Dist: torch; extra == "all"
|
|
90
69
|
Requires-Dist: tqdm; extra == "all"
|
|
91
70
|
Requires-Dist: transformers>=4.33; extra == "all"
|
|
92
|
-
Requires-Dist: transformers-stream-generator; extra == "all"
|
|
93
71
|
Requires-Dist: word2number; extra == "all"
|
|
94
72
|
Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
|
|
95
73
|
Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
|
|
74
|
+
Requires-Dist: langchain<0.3.0; extra == "all"
|
|
75
|
+
Requires-Dist: langchain-community<0.3.0; extra == "all"
|
|
76
|
+
Requires-Dist: langchain-core<0.3.0; extra == "all"
|
|
77
|
+
Requires-Dist: langchain-openai<0.3.0; extra == "all"
|
|
96
78
|
Requires-Dist: mteb==1.19.4; extra == "all"
|
|
97
79
|
Requires-Dist: ragas==0.2.9; extra == "all"
|
|
98
80
|
Requires-Dist: webdataset>0.2.0; extra == "all"
|
|
@@ -107,32 +89,6 @@ Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
|
|
|
107
89
|
Provides-Extra: app
|
|
108
90
|
Requires-Dist: gradio==5.4.0; extra == "app"
|
|
109
91
|
Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
|
|
110
|
-
Provides-Extra: inner
|
|
111
|
-
Requires-Dist: absl-py; extra == "inner"
|
|
112
|
-
Requires-Dist: accelerate; extra == "inner"
|
|
113
|
-
Requires-Dist: alibaba-itag-sdk; extra == "inner"
|
|
114
|
-
Requires-Dist: dashscope; extra == "inner"
|
|
115
|
-
Requires-Dist: editdistance; extra == "inner"
|
|
116
|
-
Requires-Dist: jsonlines; extra == "inner"
|
|
117
|
-
Requires-Dist: nltk; extra == "inner"
|
|
118
|
-
Requires-Dist: openai; extra == "inner"
|
|
119
|
-
Requires-Dist: pandas==1.5.3; extra == "inner"
|
|
120
|
-
Requires-Dist: plotly; extra == "inner"
|
|
121
|
-
Requires-Dist: pyarrow; extra == "inner"
|
|
122
|
-
Requires-Dist: pyodps; extra == "inner"
|
|
123
|
-
Requires-Dist: pyyaml; extra == "inner"
|
|
124
|
-
Requires-Dist: regex; extra == "inner"
|
|
125
|
-
Requires-Dist: requests==2.28.1; extra == "inner"
|
|
126
|
-
Requires-Dist: requests-toolbelt==0.10.1; extra == "inner"
|
|
127
|
-
Requires-Dist: rouge-score; extra == "inner"
|
|
128
|
-
Requires-Dist: sacrebleu; extra == "inner"
|
|
129
|
-
Requires-Dist: scikit-learn; extra == "inner"
|
|
130
|
-
Requires-Dist: seaborn; extra == "inner"
|
|
131
|
-
Requires-Dist: simple-ddl-parser; extra == "inner"
|
|
132
|
-
Requires-Dist: streamlit; extra == "inner"
|
|
133
|
-
Requires-Dist: tqdm; extra == "inner"
|
|
134
|
-
Requires-Dist: transformers<4.43,>=4.33; extra == "inner"
|
|
135
|
-
Requires-Dist: transformers-stream-generator; extra == "inner"
|
|
136
92
|
Provides-Extra: opencompass
|
|
137
93
|
Requires-Dist: ms-opencompass>=0.1.4; extra == "opencompass"
|
|
138
94
|
Provides-Extra: perf
|
|
@@ -143,6 +99,10 @@ Requires-Dist: sse-starlette; extra == "perf"
|
|
|
143
99
|
Requires-Dist: transformers; extra == "perf"
|
|
144
100
|
Requires-Dist: unicorn; extra == "perf"
|
|
145
101
|
Provides-Extra: rag
|
|
102
|
+
Requires-Dist: langchain<0.3.0; extra == "rag"
|
|
103
|
+
Requires-Dist: langchain-community<0.3.0; extra == "rag"
|
|
104
|
+
Requires-Dist: langchain-core<0.3.0; extra == "rag"
|
|
105
|
+
Requires-Dist: langchain-openai<0.3.0; extra == "rag"
|
|
146
106
|
Requires-Dist: mteb==1.19.4; extra == "rag"
|
|
147
107
|
Requires-Dist: ragas==0.2.9; extra == "rag"
|
|
148
108
|
Requires-Dist: webdataset>0.2.0; extra == "rag"
|
|
@@ -239,6 +199,7 @@ Please scan the QR code below to join our community groups:
|
|
|
239
199
|
|
|
240
200
|
## 🎉 News
|
|
241
201
|
|
|
202
|
+
- 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
|
|
242
203
|
- 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
|
|
243
204
|
- 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
|
|
244
205
|
- 🔥 **[2025.03.11]** Added support for the [SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/SimpleQA/summary) and [Chinese SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/Chinese-SimpleQA/summary) evaluation benchmarks. These are used to assess the factual accuracy of models, and you can specify `simple_qa` and `chinese_simpleqa` for use. Support for specifying a judge model is also available. For more details, refer to the [relevant parameter documentation](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html).
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
|
|
2
|
-
evalscope/arguments.py,sha256=
|
|
3
|
-
evalscope/config.py,sha256=
|
|
2
|
+
evalscope/arguments.py,sha256=OPYmX_ar7rXFm0ETPuE2hs-knDQtwQ0pFwSazjn3S9Q,5241
|
|
3
|
+
evalscope/config.py,sha256=CkNBE83S335iyu0VRMkblaJw5nGM8pXv4NhK5ySE3cs,9476
|
|
4
4
|
evalscope/constants.py,sha256=Cgzkoz4R3MC3YLtbCM2fmSwF8Z2kuxYdOC8t9FWJj9w,3740
|
|
5
5
|
evalscope/run.py,sha256=LUCdnNzNIfHSWvxu3gxAsHEDX7hT5mcVnV4lSY5h0iA,6007
|
|
6
6
|
evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
|
|
7
7
|
evalscope/summarizer.py,sha256=Wnt8Y61PasOkigo8zMpi1qdFzfETjfnDtCDDQ6VwgDw,5867
|
|
8
|
-
evalscope/version.py,sha256=
|
|
8
|
+
evalscope/version.py,sha256=JzXnfz-D9eKhVPZu2TQUPFaTFhRiZ3iK4jcIuxfnQE8,119
|
|
9
9
|
evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
10
|
evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
|
|
11
11
|
evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
|
|
@@ -50,21 +50,26 @@ evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=aP8U9zjIDl26X_
|
|
|
50
50
|
evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
51
51
|
evalscope/backend/rag_eval/utils/clip.py,sha256=GLHhPCac2AH35AvRLvVqePA1gIMAewHTFmCJCDZzvqU,5015
|
|
52
52
|
evalscope/backend/rag_eval/utils/embedding.py,sha256=x9HAEfZSSAnT2Tdbf-9a5UmBVagCr__ay5A2nMCPMpg,6258
|
|
53
|
-
evalscope/backend/rag_eval/utils/llm.py,sha256=
|
|
53
|
+
evalscope/backend/rag_eval/utils/llm.py,sha256=UIfdvkxVViYkIpX-MoM8sAwGEAozzVFyzX-YoFxXC1E,2607
|
|
54
54
|
evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KBNeXiwQw5SpJA,1529
|
|
55
55
|
evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
|
|
56
56
|
evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
|
|
57
57
|
evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
|
|
58
58
|
evalscope/benchmarks/__init__.py,sha256=b_SWdV1ZyOqFiwc_9lIjKrIvK1rwnF2cCIF7XN9CN8E,932
|
|
59
59
|
evalscope/benchmarks/benchmark.py,sha256=a_7Ctz36McuTyBSTYi56jis9pvOdWhg7JVSPFrbxqR4,2535
|
|
60
|
-
evalscope/benchmarks/data_adapter.py,sha256=
|
|
60
|
+
evalscope/benchmarks/data_adapter.py,sha256=UvbJJTNBvA0aM-xmsaj9jEEsNksn9pTDDr90FfFX2pg,17606
|
|
61
61
|
evalscope/benchmarks/utils.py,sha256=6kxeBz4w8Fw68AYH05a4ncjgkaUV4bU3eaFVLqOdkMI,1321
|
|
62
62
|
evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
63
63
|
evalscope/benchmarks/aime/aime24_adapter.py,sha256=dBm9yukt4-CByEPUlAPAIN6mL3VkZcI-dw2kz4oQBMo,1715
|
|
64
64
|
evalscope/benchmarks/aime/aime25_adapter.py,sha256=FB_NufY2V7uYdxVnrY_4y81gyyfYDnvedz1_zHdDWt4,1709
|
|
65
|
+
evalscope/benchmarks/alpaca_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
66
|
+
evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=2a6wHJSLe89Xh18u1LBkMQEZzfOURiek6o0-k2lCQgM,4065
|
|
65
67
|
evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
66
68
|
evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
|
|
67
69
|
evalscope/benchmarks/arc/arc_adapter.py,sha256=lkhDz-DYjPQ1vHzo8X4j-0Lq_rBxAnws35_R00pIbNI,6347
|
|
70
|
+
evalscope/benchmarks/arena_hard/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
71
|
+
evalscope/benchmarks/arena_hard/arena_hard_adapter.py,sha256=bdQfLTWB5pFo4hET0uFqu5zMX9PNQNwdoLoGrL5jCBE,6213
|
|
72
|
+
evalscope/benchmarks/arena_hard/utils.py,sha256=NstI1VR5fTaT-bfXRj0cLqm0DtH8EY4EQHR-K9HJubI,5089
|
|
68
73
|
evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
69
74
|
evalscope/benchmarks/bbh/bbh_adapter.py,sha256=fROpzenrjpEBWtnvM_RL_m0uXPOhXTtYAglJEZbzUdY,8330
|
|
70
75
|
evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
|
|
@@ -98,20 +103,20 @@ evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTP
|
|
|
98
103
|
evalscope/benchmarks/ceval/ceval_adapter.py,sha256=E4QobCjSSkMZtPJyaT_XBVxiqEqa1bta1I9aFnaHOqs,11308
|
|
99
104
|
evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
|
|
100
105
|
evalscope/benchmarks/chinese_simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
101
|
-
evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=
|
|
106
|
+
evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=fYvkJn1UcWM3aqhPMTTtBPVzjTL-Rm_g9UwUJx1FvJc,8106
|
|
102
107
|
evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
103
108
|
evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
|
|
104
109
|
evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=TTq2jRz46Hqc_D_ZBaiw_OwKub1FZX6w8C7g7COIdGs,10372
|
|
105
110
|
evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
|
|
106
111
|
evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
107
112
|
evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
|
|
108
|
-
evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=
|
|
113
|
+
evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=lD7sDro0dSWKgYaM_ZgWbBdetxVURpjo_2q1gvVt1XU,6815
|
|
109
114
|
evalscope/benchmarks/data_collection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
110
|
-
evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=
|
|
115
|
+
evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=ecNwAE3p2eKIeC4whSUdZpeJ8NgidbSFZbIYtSW26Xo,2394
|
|
111
116
|
evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
112
117
|
evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=ZVGzUuuQ0UTOqQtXE40ZyBeMOSl8saSiFEQ5_siJ-c8,5052
|
|
113
118
|
evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
114
|
-
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=
|
|
119
|
+
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=KBZDP1T-t7uu8vBLGL_unVdj7rDko3KWBPKqWlw31JQ,4596
|
|
115
120
|
evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
116
121
|
evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
|
|
117
122
|
evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=UB287DtnbkSQXZsbReFJqmQRwbo672DTCeXXilR_-Vc,4790
|
|
@@ -125,7 +130,7 @@ evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0
|
|
|
125
130
|
evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
|
|
126
131
|
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=UOjakV31J0g7TYbrRls0ItcopWOJu54ucPfaqSJB7Os,5250
|
|
127
132
|
evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
128
|
-
evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=
|
|
133
|
+
evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=xuQ1EK8Af_093qqeOXPIp_iqTWcG5KGOtE6r5hx3958,1858
|
|
129
134
|
evalscope/benchmarks/ifeval/instructions.py,sha256=oaJ9D_4rvS67BraHBNPpDtFd4TblFAnR4A3YW9HWfuY,56304
|
|
130
135
|
evalscope/benchmarks/ifeval/instructions_registry.py,sha256=tVUmhuSwnOidLtI8onOAw_gpJ6bi8FL07GiX19hSuo8,7288
|
|
131
136
|
evalscope/benchmarks/ifeval/instructions_util.py,sha256=vkemXeylJMmgW8LgfQe4cSy2OF-oH_NcSZtzyZDURW4,25780
|
|
@@ -140,7 +145,7 @@ evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=8MOECcweL
|
|
|
140
145
|
evalscope/benchmarks/live_code_bench/load_utils.py,sha256=5i9wtdPLYR8ckjx5MaYQVC2LFYvjKzR6Fa6UZmeOTRc,2445
|
|
141
146
|
evalscope/benchmarks/live_code_bench/pass_k_utils.py,sha256=Ktrp_lXdfFzoHtQNQNdGfIl26ySjaPCHm4Zv-dFvRqM,2024
|
|
142
147
|
evalscope/benchmarks/live_code_bench/prompts.py,sha256=P4KILIAIDT1MKDck0xHYV_6v9820wDZRhxVMazmlL-g,12600
|
|
143
|
-
evalscope/benchmarks/live_code_bench/testing_util.py,sha256=
|
|
148
|
+
evalscope/benchmarks/live_code_bench/testing_util.py,sha256=s5oa--dOcugcpBmHsbeqnTRTDhdiCNXkIQuRc6EgA8o,28241
|
|
144
149
|
evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
145
150
|
evalscope/benchmarks/math_500/math_500_adapter.py,sha256=SB2eb4Z7DTXdptqirEoctqTdDLEu28s7bLeCAMBmAFo,1923
|
|
146
151
|
evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
@@ -149,6 +154,8 @@ evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=e__Evar99V9l65FlzT6T594CN4iMgmu
|
|
|
149
154
|
evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
|
|
150
155
|
evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
151
156
|
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=OANfue-fK543drJrDj6V_zDMtySrQEBHPgTsejr-e7U,4226
|
|
157
|
+
evalscope/benchmarks/mmlu_redux/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
158
|
+
evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py,sha256=ZZMy9exJ8hknr1D6s73sAhHHzBAKcqo7WAmlUtPqpCI,9556
|
|
152
159
|
evalscope/benchmarks/musr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
153
160
|
evalscope/benchmarks/musr/musr_adapter.py,sha256=Po8hcIQiqlFo0AGjcNQe75cpsMNDcfiJaKgZsk33-DY,2442
|
|
154
161
|
evalscope/benchmarks/process_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -159,7 +166,7 @@ evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO
|
|
|
159
166
|
evalscope/benchmarks/race/race_adapter.py,sha256=RD0B-i5dzeNKuhqnWbremgf4tk9jmOO4_eLAiITB1F0,6381
|
|
160
167
|
evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
|
|
161
168
|
evalscope/benchmarks/simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
162
|
-
evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=
|
|
169
|
+
evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=CsRUJ0v1sSUmtO6QWkdzisn9OHN-1JSXB-9ghOuNqgY,8988
|
|
163
170
|
evalscope/benchmarks/super_gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
164
171
|
evalscope/benchmarks/super_gpqa/five_shot_prompt.txt,sha256=vD3RMeQustxY_oWA8IobntjywT8ZUO7Jaub--rElDT4,4718
|
|
165
172
|
evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=BqNLL8BYnK6tRuIdV6ijL4Uym2SejH_h1BV06XNjSE4,9331
|
|
@@ -190,7 +197,7 @@ evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0Fw
|
|
|
190
197
|
evalscope/evaluator/reviewer/auto_reviewer.py,sha256=PDz1Nt_qq0oGZBBwek2-M8VBUNLkOkmYObzR8gO7nEc,16624
|
|
191
198
|
evalscope/metrics/__init__.py,sha256=SWvqzUzdryW5URz6u4fPkP9XSyA09nQ8zBeE8BbchSg,349
|
|
192
199
|
evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
|
|
193
|
-
evalscope/metrics/llm_judge.py,sha256=
|
|
200
|
+
evalscope/metrics/llm_judge.py,sha256=Di0Q1c6VHLl0nQ_TVOZOOQlMApDIU83HuDPTOV8XrTA,4023
|
|
194
201
|
evalscope/metrics/math_parser.py,sha256=uTDudn305G3b8-GboWTrDE6OfrEwAW-areHnoGXZ6Is,17302
|
|
195
202
|
evalscope/metrics/metrics.py,sha256=_YI7RhxlFu_JOgeE3LF9UKu6mJruvyu4FgqVf78Bjb8,13813
|
|
196
203
|
evalscope/metrics/named_metrics.py,sha256=pSHA2_qdi9B5bDHIh08GYhx63odilSwA_T-95K1Usl0,1380
|
|
@@ -201,7 +208,7 @@ evalscope/metrics/resources/gpt2-zhcn3-v4.bpe,sha256=J_K-oSt9usFEw87Av6j7ETn3J48
|
|
|
201
208
|
evalscope/metrics/resources/gpt2-zhcn3-v4.json,sha256=WkM4J_FDPPNQwYi0kj5sM5SVjk2_6bci7tqf8dV9p_U,1289341
|
|
202
209
|
evalscope/models/__init__.py,sha256=i9vcOBMEF_UM7C2gpmh2GsQk3njwqevoQ6A4CnP1fHs,1000
|
|
203
210
|
evalscope/models/base_adapter.py,sha256=7PbRwfD5PIZCBYVds6ZHI8TBY9C5i2LdPOTu88FJWlY,3414
|
|
204
|
-
evalscope/models/chat_adapter.py,sha256=
|
|
211
|
+
evalscope/models/chat_adapter.py,sha256=2XZmdhxnvy4yezPLXNVRbgrs0QkUY2VznEBq5mCYjKs,7106
|
|
205
212
|
evalscope/models/choice_adapter.py,sha256=fnJdo-FMJ-zvNLbEJGc73odgWXIxtVudL00JIf2vzsA,8239
|
|
206
213
|
evalscope/models/custom_adapter.py,sha256=AGztmZ0aT0g2flh4B4NaiZ8LCDg8tT0gVNxmrP5W1mA,2401
|
|
207
214
|
evalscope/models/local_model.py,sha256=yydggBCLcBAmUWbBhv7o2CA3RbG0DwDZharPdrkbNcg,2628
|
|
@@ -212,8 +219,8 @@ evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk
|
|
|
212
219
|
evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
|
|
213
220
|
evalscope/models/custom/dummy_model.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
|
|
214
221
|
evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
215
|
-
evalscope/perf/arguments.py,sha256=
|
|
216
|
-
evalscope/perf/benchmark.py,sha256=
|
|
222
|
+
evalscope/perf/arguments.py,sha256=srDp3JMYIPZxkfua5WHkjq3G8lJlTtxdXKxE_CivoJk,9156
|
|
223
|
+
evalscope/perf/benchmark.py,sha256=qY7zrsZMDBr1fABsShXjgK12tNE7PhzGZdLaUtdtxU8,8318
|
|
217
224
|
evalscope/perf/http_client.py,sha256=xMakdQkJ2cgIOd-yOmHEW0vbGKTJ0JWhLFt9IFtUP8Q,7473
|
|
218
225
|
evalscope/perf/main.py,sha256=w-yDbl0osaTAMgC-JNPpqIq2LQ7U4c-Ht7Amj8Nbjc8,1278
|
|
219
226
|
evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
|
|
@@ -222,7 +229,7 @@ evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2m
|
|
|
222
229
|
evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqYbndU,2118
|
|
223
230
|
evalscope/perf/plugin/api/custom_api.py,sha256=ay1AGi4y2opjwyRl0J0A54-vLB-pBj3QBFkzog0KA-g,3787
|
|
224
231
|
evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
|
|
225
|
-
evalscope/perf/plugin/api/openai_api.py,sha256=
|
|
232
|
+
evalscope/perf/plugin/api/openai_api.py,sha256=DNDmW7jT0Abopw-K73X0PE7Vr2wTSKMBj79hJZTi-K8,7668
|
|
226
233
|
evalscope/perf/plugin/datasets/__init__.py,sha256=Z6Jc0RxJS_z0nBBV1-b0-56Ija60AtQ7I_67gY6ZfdQ,568
|
|
227
234
|
evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
|
|
228
235
|
evalscope/perf/plugin/datasets/custom.py,sha256=_GSC5yR90_BjcRjdJqrJT2vHQAzskz5XxYOxngUM2Pg,815
|
|
@@ -231,11 +238,11 @@ evalscope/perf/plugin/datasets/line_by_line.py,sha256=IKVZMpKei6XW9DTm9VEssWHE96
|
|
|
231
238
|
evalscope/perf/plugin/datasets/longalpaca.py,sha256=2aENqCly_DX1dyNcurYsLFJIvXYFph6jWm7z7XETvMk,1176
|
|
232
239
|
evalscope/perf/plugin/datasets/openqa.py,sha256=_aVXs2s8wbmtoB6ZO-pNjUZvBVxRUYdoJDGv5-BumtI,1342
|
|
233
240
|
evalscope/perf/plugin/datasets/random_dataset.py,sha256=wPyY5kk2zKnc8u9uYEl-vQ6BLHeWbdC8EHEAZNFSDeU,2702
|
|
234
|
-
evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=
|
|
241
|
+
evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=J6q7AF_Re5eHLVejXEw9c1jlk1T1PPmist0yO9UFTPE,2432
|
|
235
242
|
evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
236
243
|
evalscope/perf/utils/analysis_result.py,sha256=ESzaZHGTpr2LoJR3jpOzqMphxSrr79d364ZzD159PmY,1169
|
|
237
244
|
evalscope/perf/utils/benchmark_util.py,sha256=4TyQ_tE5odcjKDFDueI3jrC0vld6QxmTreOd5_SP4vE,5802
|
|
238
|
-
evalscope/perf/utils/db_util.py,sha256=
|
|
245
|
+
evalscope/perf/utils/db_util.py,sha256=OAaR9bK4SPfMuk41w1t4d7ljxPDDEZOzcwDn2s9bpz0,9052
|
|
239
246
|
evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
|
|
240
247
|
evalscope/perf/utils/local_server.py,sha256=clF8i0UFmaxBBB6gX05KvVCyzSv0xzsAidz0_sLLlAk,4627
|
|
241
248
|
evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
@@ -311,24 +318,24 @@ evalscope/utils/utils.py,sha256=lGvn94ryIzx-7WLNJeuyehNTmINt0jYIjrjW12woPCs,9730
|
|
|
311
318
|
tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
312
319
|
tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
|
|
313
320
|
tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
314
|
-
tests/cli/test_all.py,sha256=
|
|
321
|
+
tests/cli/test_all.py,sha256=tRC4TWaqxEsB6jMsGR7u9RHWHuKzn7Umt2XKY1V8CLU,4035
|
|
315
322
|
tests/cli/test_collection.py,sha256=V-_M7ngwekMGqPuI16jjJZyAK2XLE4Z6QTn-8B5ykgU,4071
|
|
316
|
-
tests/cli/test_run.py,sha256=
|
|
323
|
+
tests/cli/test_run.py,sha256=0gD0nPiioieaDOqRZkS5ruIWuiv1B5D456wSSHv9y40,16471
|
|
317
324
|
tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
318
325
|
tests/perf/test_perf.py,sha256=mfXTCsD9RaCef3b4CLvm8ErxBUaWzn-EKKhOxD65i3A,3817
|
|
319
326
|
tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
320
327
|
tests/rag/test_clip_benchmark.py,sha256=Ar8Br2CoAFYT2h4zCv_JKMKCGJKbKGYZgNwJ410ZaoU,2597
|
|
321
328
|
tests/rag/test_mteb.py,sha256=t64FXE-ZsOCLiRJrw-dIDIhKd1OXiaglXaeERs0lOh4,4643
|
|
322
|
-
tests/rag/test_ragas.py,sha256=
|
|
329
|
+
tests/rag/test_ragas.py,sha256=fzpn4zZPeZ04ZdfLmwXbsSjf7WcjPWrGsA6RDNXgIEQ,4011
|
|
323
330
|
tests/swift/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
324
331
|
tests/swift/test_run_swift_eval.py,sha256=JKG-0BwTxkbg-XeiXxujPqnVIM3f2EFaJ_9a7p_R4dk,5748
|
|
325
332
|
tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZUQhtYO5e0o,4898
|
|
326
333
|
tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
|
|
327
334
|
tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
328
335
|
tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
|
|
329
|
-
evalscope-0.13.
|
|
330
|
-
evalscope-0.13.
|
|
331
|
-
evalscope-0.13.
|
|
332
|
-
evalscope-0.13.
|
|
333
|
-
evalscope-0.13.
|
|
334
|
-
evalscope-0.13.
|
|
336
|
+
evalscope-0.13.2.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
|
|
337
|
+
evalscope-0.13.2.dist-info/METADATA,sha256=b7rVRQHN5miovM5qlh4Dozpl8OaxO0rg0ctT-kDZMyY,32399
|
|
338
|
+
evalscope-0.13.2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
|
339
|
+
evalscope-0.13.2.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
|
|
340
|
+
evalscope-0.13.2.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
|
|
341
|
+
evalscope-0.13.2.dist-info/RECORD,,
|
tests/cli/test_all.py
CHANGED
|
@@ -18,31 +18,34 @@ os.environ['LOG_LEVEL'] = 'DEBUG'
|
|
|
18
18
|
logger = get_logger()
|
|
19
19
|
|
|
20
20
|
datasets=[
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
21
|
+
'iquiz',
|
|
22
|
+
'ifeval',
|
|
23
|
+
'mmlu',
|
|
24
|
+
'mmlu_pro',
|
|
25
|
+
'musr',
|
|
26
|
+
'process_bench',
|
|
27
|
+
'race',
|
|
28
|
+
'trivia_qa',
|
|
29
|
+
'cmmlu',
|
|
30
|
+
'humaneval',
|
|
31
|
+
'gsm8k',
|
|
32
|
+
'bbh',
|
|
33
|
+
'competition_math',
|
|
34
|
+
'math_500',
|
|
35
|
+
'aime24',
|
|
36
|
+
'gpqa',
|
|
37
|
+
'arc',
|
|
38
|
+
'ceval',
|
|
39
|
+
'hellaswag',
|
|
40
|
+
'general_mcq',
|
|
41
|
+
'general_qa',
|
|
42
42
|
'super_gpqa',
|
|
43
43
|
'live_code_bench',
|
|
44
|
+
'mmlu_redux',
|
|
44
45
|
'simple_qa',
|
|
45
46
|
'chinese_simpleqa',
|
|
47
|
+
'alpaca_eval',
|
|
48
|
+
'arena_hard',
|
|
46
49
|
]
|
|
47
50
|
|
|
48
51
|
dataset_args={
|
|
@@ -110,7 +113,13 @@ dataset_args={
|
|
|
110
113
|
'start_date': '2024-12-01',
|
|
111
114
|
'end_date': '2025-01-01'
|
|
112
115
|
},
|
|
113
|
-
}
|
|
116
|
+
},
|
|
117
|
+
'chinese_simpleqa': {
|
|
118
|
+
'subset_list': ['中华文化']
|
|
119
|
+
},
|
|
120
|
+
'mmlu_redux':{
|
|
121
|
+
'subset_list': ['abstract_algebra']
|
|
122
|
+
},
|
|
114
123
|
}
|
|
115
124
|
|
|
116
125
|
class TestRun(unittest.TestCase):
|
|
@@ -119,13 +128,13 @@ class TestRun(unittest.TestCase):
|
|
|
119
128
|
from evalscope.config import TaskConfig
|
|
120
129
|
|
|
121
130
|
task_cfg = TaskConfig(
|
|
122
|
-
model='qwen2.5-
|
|
131
|
+
model='qwen2.5-0.5b-instruct',
|
|
123
132
|
api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
124
133
|
api_key= env.get('DASHSCOPE_API_KEY'),
|
|
125
134
|
eval_type=EvalType.SERVICE,
|
|
126
135
|
datasets=datasets,
|
|
127
136
|
dataset_args=dataset_args,
|
|
128
|
-
eval_batch_size=
|
|
137
|
+
eval_batch_size=2,
|
|
129
138
|
limit=2,
|
|
130
139
|
stream=True,
|
|
131
140
|
generation_config={
|
tests/cli/test_run.py
CHANGED
|
@@ -207,11 +207,12 @@ class TestRun(unittest.TestCase):
|
|
|
207
207
|
from evalscope.config import TaskConfig
|
|
208
208
|
|
|
209
209
|
task_cfg = TaskConfig(
|
|
210
|
-
model='
|
|
210
|
+
model='Qwen/Qwen2.5-0.5B-Instruct',
|
|
211
211
|
datasets=[
|
|
212
|
+
'iquiz',
|
|
212
213
|
# 'math_500',
|
|
213
214
|
# 'aime24',
|
|
214
|
-
'competition_math'
|
|
215
|
+
# 'competition_math'
|
|
215
216
|
],
|
|
216
217
|
dataset_args={
|
|
217
218
|
'competition_math': {
|
|
@@ -255,7 +256,7 @@ class TestRun(unittest.TestCase):
|
|
|
255
256
|
from evalscope.config import TaskConfig
|
|
256
257
|
|
|
257
258
|
task_cfg = TaskConfig(
|
|
258
|
-
model='
|
|
259
|
+
model='qwen-plus',
|
|
259
260
|
api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
260
261
|
api_key= env.get('DASHSCOPE_API_KEY'),
|
|
261
262
|
eval_type=EvalType.SERVICE,
|
|
@@ -278,10 +279,11 @@ class TestRun(unittest.TestCase):
|
|
|
278
279
|
# 'gpqa',
|
|
279
280
|
# 'arc',
|
|
280
281
|
# 'ceval',
|
|
281
|
-
'hellaswag',
|
|
282
|
+
# 'hellaswag',
|
|
282
283
|
# 'general_mcq',
|
|
283
|
-
|
|
284
|
+
'general_qa'
|
|
284
285
|
# 'super_gpqa',
|
|
286
|
+
# 'mmlu_redux'
|
|
285
287
|
],
|
|
286
288
|
dataset_args={
|
|
287
289
|
'mmlu': {
|
|
@@ -335,23 +337,26 @@ class TestRun(unittest.TestCase):
|
|
|
335
337
|
'example', # 评测数据集名称,上述 *_dev.csv 中的 *
|
|
336
338
|
# 'test'
|
|
337
339
|
],
|
|
338
|
-
'metric_list': ['
|
|
340
|
+
'metric_list': ['AverageRouge']
|
|
339
341
|
},
|
|
340
342
|
'super_gpqa': {
|
|
341
343
|
# 'subset_list': ['Philosophy', 'Education'],
|
|
342
344
|
'few_shot_num': 0
|
|
343
|
-
}
|
|
345
|
+
},
|
|
346
|
+
'mmlu_redux':{
|
|
347
|
+
'subset_list': ['abstract_algebra']
|
|
348
|
+
},
|
|
344
349
|
},
|
|
345
350
|
eval_batch_size=32,
|
|
346
351
|
limit=15,
|
|
347
|
-
|
|
352
|
+
debug=True,
|
|
348
353
|
stream=False,
|
|
349
354
|
generation_config={
|
|
350
355
|
'temperature': 0,
|
|
351
|
-
'n':
|
|
356
|
+
'n': 2,
|
|
352
357
|
'max_tokens': 4096,
|
|
353
358
|
},
|
|
354
|
-
|
|
359
|
+
use_cache='outputs/20250326_202848',
|
|
355
360
|
)
|
|
356
361
|
|
|
357
362
|
run_task(task_cfg=task_cfg)
|
|
@@ -392,7 +397,7 @@ class TestRun(unittest.TestCase):
|
|
|
392
397
|
from evalscope.config import TaskConfig
|
|
393
398
|
|
|
394
399
|
task_cfg = TaskConfig(
|
|
395
|
-
model='
|
|
400
|
+
model='qwen2.5-0.5b-instruct',
|
|
396
401
|
api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
397
402
|
api_key= env.get('DASHSCOPE_API_KEY'),
|
|
398
403
|
eval_type=EvalType.SERVICE,
|
|
@@ -404,10 +409,12 @@ class TestRun(unittest.TestCase):
|
|
|
404
409
|
# 'gsm8k'
|
|
405
410
|
# 'truthful_qa',
|
|
406
411
|
# 'simple_qa',
|
|
407
|
-
#
|
|
408
|
-
'live_code_bench',
|
|
409
|
-
# 'humaneval'
|
|
410
|
-
# 'general_qa'
|
|
412
|
+
# 'chinese_simpleqa',
|
|
413
|
+
# 'live_code_bench',
|
|
414
|
+
# 'humaneval',
|
|
415
|
+
# 'general_qa',
|
|
416
|
+
# 'alpaca_eval',
|
|
417
|
+
'arena_hard'
|
|
411
418
|
],
|
|
412
419
|
dataset_args={
|
|
413
420
|
'competition_math': {
|
|
@@ -427,20 +434,30 @@ class TestRun(unittest.TestCase):
|
|
|
427
434
|
# 'test'
|
|
428
435
|
]
|
|
429
436
|
},
|
|
437
|
+
'chinese_simpleqa': {
|
|
438
|
+
'subset_list': [
|
|
439
|
+
'中华文化'
|
|
440
|
+
]
|
|
441
|
+
},
|
|
430
442
|
},
|
|
431
|
-
eval_batch_size=
|
|
432
|
-
|
|
443
|
+
eval_batch_size=5,
|
|
444
|
+
limit=10,
|
|
433
445
|
judge_strategy=JudgeStrategy.AUTO,
|
|
434
|
-
judge_worker_num=
|
|
446
|
+
judge_worker_num=5,
|
|
435
447
|
judge_model_args={
|
|
436
448
|
'model_id': 'qwen2.5-7b-instruct',
|
|
437
449
|
'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
438
450
|
'api_key': env.get('DASHSCOPE_API_KEY'),
|
|
451
|
+
'generation_config': {
|
|
452
|
+
'temperature': 0.0,
|
|
453
|
+
'max_tokens': 4096
|
|
454
|
+
}
|
|
439
455
|
},
|
|
440
456
|
generation_config={
|
|
441
457
|
'max_new_tokens': 20000,
|
|
442
458
|
'temperature': 0.0,
|
|
443
459
|
'seed': 42,
|
|
460
|
+
'n': 1
|
|
444
461
|
},
|
|
445
462
|
timeout=60000,
|
|
446
463
|
stream=True,
|
tests/rag/test_ragas.py
CHANGED
|
@@ -1,5 +1,8 @@
|
|
|
1
1
|
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
2
|
import os
|
|
3
|
+
from dotenv import dotenv_values
|
|
4
|
+
|
|
5
|
+
env = dotenv_values('.env')
|
|
3
6
|
import unittest
|
|
4
7
|
|
|
5
8
|
from evalscope.run import run_task
|
|
@@ -63,7 +66,7 @@ class TestRAGAS(unittest.TestCase):
|
|
|
63
66
|
'eval': {
|
|
64
67
|
'testset_file': 'outputs/testset_chinese_with_answer.json',
|
|
65
68
|
'critic_llm': {
|
|
66
|
-
'model_name_or_path': '
|
|
69
|
+
'model_name_or_path': 'Qwen/Qwen2.5-7B-Instruct',
|
|
67
70
|
},
|
|
68
71
|
'embeddings': {
|
|
69
72
|
'model_name_or_path': 'AI-ModelScope/m3e-base',
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|