evalscope 0.13.0__py3-none-any.whl → 0.13.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope has been flagged as potentially problematic by the registry scanner.
- evalscope/arguments.py +1 -1
- evalscope/backend/rag_eval/utils/llm.py +4 -5
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +109 -0
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +120 -0
- evalscope/benchmarks/arena_hard/utils.py +162 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +2 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -1
- evalscope/benchmarks/data_adapter.py +26 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -11
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -5
- evalscope/benchmarks/live_code_bench/testing_util.py +3 -3
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +182 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +2 -5
- evalscope/collections/evaluator.py +1 -1
- evalscope/config.py +6 -3
- evalscope/constants.py +1 -0
- evalscope/evaluator/evaluator.py +5 -4
- evalscope/metrics/llm_judge.py +1 -1
- evalscope/models/chat_adapter.py +32 -11
- evalscope/models/custom_adapter.py +1 -1
- evalscope/perf/arguments.py +19 -46
- evalscope/perf/benchmark.py +64 -90
- evalscope/perf/main.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +4 -2
- evalscope/perf/plugin/datasets/__init__.py +1 -0
- evalscope/perf/plugin/datasets/openqa.py +6 -11
- evalscope/perf/plugin/datasets/random_dataset.py +51 -0
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/utils/db_util.py +5 -2
- evalscope/run.py +14 -2
- evalscope/version.py +2 -2
- {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/METADATA +42 -78
- {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/RECORD +45 -37
- tests/cli/test_all.py +33 -24
- tests/cli/test_run.py +69 -22
- tests/perf/test_perf.py +23 -0
- tests/rag/test_ragas.py +4 -1
- {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/LICENSE +0 -0
- {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/WHEEL +0 -0
- {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/entry_points.txt +0 -0
- {evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/top_level.txt +0 -0
{evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/METADATA
CHANGED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.13.0
+Version: 0.13.2
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -16,11 +16,8 @@ Classifier: Programming Language :: Python :: 3.10
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: absl-py
 Requires-Dist: accelerate
-Requires-Dist: cachetools
 Requires-Dist: datasets<=3.2.0,>=3.0.0
-Requires-Dist: editdistance
 Requires-Dist: immutabledict
 Requires-Dist: jieba
 Requires-Dist: jsonlines
@@ -31,34 +28,23 @@ Requires-Dist: modelscope[framework]
 Requires-Dist: nltk>=3.9
 Requires-Dist: openai
 Requires-Dist: pandas
-Requires-Dist: plotly
 Requires-Dist: pyarrow
-Requires-Dist: pympler
 Requires-Dist: pyyaml
-Requires-Dist: regex
 Requires-Dist: requests
-Requires-Dist: requests-toolbelt
 Requires-Dist: rouge-chinese
 Requires-Dist: rouge-score>=0.1.0
 Requires-Dist: sacrebleu
 Requires-Dist: scikit-learn
 Requires-Dist: seaborn
-Requires-Dist: sentencepiece
-Requires-Dist: simple-ddl-parser
 Requires-Dist: sympy
 Requires-Dist: tabulate
-Requires-Dist: tiktoken
 Requires-Dist: torch
 Requires-Dist: tqdm
 Requires-Dist: transformers>=4.33
-Requires-Dist: transformers-stream-generator
 Requires-Dist: word2number
 Provides-Extra: all
-Requires-Dist: absl-py; extra == "all"
 Requires-Dist: accelerate; extra == "all"
-Requires-Dist: cachetools; extra == "all"
 Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
-Requires-Dist: editdistance; extra == "all"
 Requires-Dist: immutabledict; extra == "all"
 Requires-Dist: jieba; extra == "all"
 Requires-Dist: jsonlines; extra == "all"
@@ -69,30 +55,26 @@ Requires-Dist: modelscope[framework]; extra == "all"
 Requires-Dist: nltk>=3.9; extra == "all"
 Requires-Dist: openai; extra == "all"
 Requires-Dist: pandas; extra == "all"
-Requires-Dist: plotly; extra == "all"
 Requires-Dist: pyarrow; extra == "all"
-Requires-Dist: pympler; extra == "all"
 Requires-Dist: pyyaml; extra == "all"
-Requires-Dist: regex; extra == "all"
 Requires-Dist: requests; extra == "all"
-Requires-Dist: requests-toolbelt; extra == "all"
 Requires-Dist: rouge-chinese; extra == "all"
 Requires-Dist: rouge-score>=0.1.0; extra == "all"
 Requires-Dist: sacrebleu; extra == "all"
 Requires-Dist: scikit-learn; extra == "all"
 Requires-Dist: seaborn; extra == "all"
-Requires-Dist: sentencepiece; extra == "all"
-Requires-Dist: simple-ddl-parser; extra == "all"
 Requires-Dist: sympy; extra == "all"
 Requires-Dist: tabulate; extra == "all"
-Requires-Dist: tiktoken; extra == "all"
 Requires-Dist: torch; extra == "all"
 Requires-Dist: tqdm; extra == "all"
 Requires-Dist: transformers>=4.33; extra == "all"
-Requires-Dist: transformers-stream-generator; extra == "all"
 Requires-Dist: word2number; extra == "all"
 Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
 Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
+Requires-Dist: langchain<0.3.0; extra == "all"
+Requires-Dist: langchain-community<0.3.0; extra == "all"
+Requires-Dist: langchain-core<0.3.0; extra == "all"
+Requires-Dist: langchain-openai<0.3.0; extra == "all"
 Requires-Dist: mteb==1.19.4; extra == "all"
 Requires-Dist: ragas==0.2.9; extra == "all"
 Requires-Dist: webdataset>0.2.0; extra == "all"
@@ -107,32 +89,6 @@ Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
 Provides-Extra: app
 Requires-Dist: gradio==5.4.0; extra == "app"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
-Provides-Extra: inner
-Requires-Dist: absl-py; extra == "inner"
-Requires-Dist: accelerate; extra == "inner"
-Requires-Dist: alibaba-itag-sdk; extra == "inner"
-Requires-Dist: dashscope; extra == "inner"
-Requires-Dist: editdistance; extra == "inner"
-Requires-Dist: jsonlines; extra == "inner"
-Requires-Dist: nltk; extra == "inner"
-Requires-Dist: openai; extra == "inner"
-Requires-Dist: pandas==1.5.3; extra == "inner"
-Requires-Dist: plotly; extra == "inner"
-Requires-Dist: pyarrow; extra == "inner"
-Requires-Dist: pyodps; extra == "inner"
-Requires-Dist: pyyaml; extra == "inner"
-Requires-Dist: regex; extra == "inner"
-Requires-Dist: requests==2.28.1; extra == "inner"
-Requires-Dist: requests-toolbelt==0.10.1; extra == "inner"
-Requires-Dist: rouge-score; extra == "inner"
-Requires-Dist: sacrebleu; extra == "inner"
-Requires-Dist: scikit-learn; extra == "inner"
-Requires-Dist: seaborn; extra == "inner"
-Requires-Dist: simple-ddl-parser; extra == "inner"
-Requires-Dist: streamlit; extra == "inner"
-Requires-Dist: tqdm; extra == "inner"
-Requires-Dist: transformers<4.43,>=4.33; extra == "inner"
-Requires-Dist: transformers-stream-generator; extra == "inner"
 Provides-Extra: opencompass
 Requires-Dist: ms-opencompass>=0.1.4; extra == "opencompass"
 Provides-Extra: perf
@@ -143,6 +99,10 @@ Requires-Dist: sse-starlette; extra == "perf"
 Requires-Dist: transformers; extra == "perf"
 Requires-Dist: unicorn; extra == "perf"
 Provides-Extra: rag
+Requires-Dist: langchain<0.3.0; extra == "rag"
+Requires-Dist: langchain-community<0.3.0; extra == "rag"
+Requires-Dist: langchain-core<0.3.0; extra == "rag"
+Requires-Dist: langchain-openai<0.3.0; extra == "rag"
 Requires-Dist: mteb==1.19.4; extra == "rag"
 Requires-Dist: ragas==0.2.9; extra == "rag"
 Requires-Dist: webdataset>0.2.0; extra == "rag"
@@ -239,7 +199,9 @@ Please scan the QR code below to join our community groups:

 ## 🎉 News

-- 🔥 **[2025.03.
+- 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
+- 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
+- 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
 - 🔥 **[2025.03.11]** Added support for the [SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/SimpleQA/summary) and [Chinese SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/Chinese-SimpleQA/summary) evaluation benchmarks. These are used to assess the factual accuracy of models, and you can specify `simple_qa` and `chinese_simpleqa` for use. Support for specifying a judge model is also available. For more details, refer to the [relevant parameter documentation](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html).
 - 🔥 **[2025.03.07]** Added support for the [QwQ-32B](https://modelscope.cn/models/Qwen/QwQ-32B/summary) model, evaluate the model's reasoning ability and reasoning efficiency, refer to [📖 Best Practices for QwQ-32B Evaluation](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html) for more details.
 - 🔥 **[2025.03.04]** Added support for the [SuperGPQA](https://modelscope.cn/datasets/m-a-p/SuperGPQA/summary) dataset, which covers 13 categories, 72 first-level disciplines, and 285 second-level disciplines, totaling 26,529 questions. You can use it by specifying `super_gpqa`.
@@ -277,23 +239,24 @@ Please scan the QR code below to join our community groups:
 We recommend using conda to manage your environment and installing dependencies with pip:

 1. Create a conda environment (optional)
-
-
-
-
-
-
+```shell
+# It is recommended to use Python 3.10
+conda create -n evalscope python=3.10
+# Activate the conda environment
+conda activate evalscope
+```

 2. Install dependencies using pip
-
-
-
-
-
-
-
-
-
+```shell
+pip install evalscope # Install Native backend (default)
+# Additional options
+pip install 'evalscope[opencompass]' # Install OpenCompass backend
+pip install 'evalscope[vlmeval]' # Install VLMEvalKit backend
+pip install 'evalscope[rag]' # Install RAGEval backend
+pip install 'evalscope[perf]' # Install dependencies for the model performance testing module
+pip install 'evalscope[app]' # Install dependencies for visualization
+pip install 'evalscope[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
+```

 > [!WARNING]
 > As the project has been renamed to `evalscope`, for versions `v0.4.3` or earlier, you can install using the following command:
@@ -307,21 +270,22 @@ We recommend using conda to manage your environment and installing dependencies

 ### Method 2: Install from Source
 1. Download the source code
-
-
-
+```shell
+git clone https://github.com/modelscope/evalscope.git
+```

 2. Install dependencies
-
-
-
-
-
-
-
-
-
-
+```shell
+cd evalscope/
+pip install -e . # Install Native backend
+# Additional options
+pip install -e '.[opencompass]' # Install OpenCompass backend
+pip install -e '.[vlmeval]' # Install VLMEvalKit backend
+pip install -e '.[rag]' # Install RAGEval backend
+pip install -e '.[perf]' # Install Perf dependencies
+pip install -e '.[app]' # Install visualization dependencies
+pip install -e '.[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
+```


 ## 🚀 Quick Start
```
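The release notes above introduce `alpaca_eval`, `arena_hard`, and `mmlu_redux` as newly supported benchmarks, and the updated tests/cli/test_all.py further down exercises them through the service-evaluation path. Below is a minimal sketch of that same pattern, assuming an OpenAI-compatible endpoint; the model name, URL, and API key are placeholders rather than values taken from this diff.

```python
# Hedged sketch: run the benchmarks added in 0.13.2 via the TaskConfig/run_task
# entry points used by tests/cli/test_all.py. Endpoint and credentials are placeholders.
from evalscope.config import TaskConfig
from evalscope.constants import EvalType
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='qwen2.5-0.5b-instruct',          # any model served behind the endpoint
    api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
    api_key='YOUR_API_KEY',
    eval_type=EvalType.SERVICE,             # evaluate a remote OpenAI-compatible service
    datasets=['mmlu_redux', 'alpaca_eval', 'arena_hard'],
    dataset_args={'mmlu_redux': {'subset_list': ['abstract_algebra']}},
    limit=2,                                # small smoke-test sample, as in the tests
)

run_task(task_cfg)
```

Note that `alpaca_eval` and `arena_hard` score responses with an LLM judge, so a judge model also needs to be configured as described in the parameter documentation linked in the news entries.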
{evalscope-0.13.0.dist-info → evalscope-0.13.2.dist-info}/RECORD
CHANGED

```diff
@@ -1,11 +1,11 @@
 evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
-evalscope/arguments.py,sha256=
-evalscope/config.py,sha256=
-evalscope/constants.py,sha256=
-evalscope/run.py,sha256=
+evalscope/arguments.py,sha256=OPYmX_ar7rXFm0ETPuE2hs-knDQtwQ0pFwSazjn3S9Q,5241
+evalscope/config.py,sha256=CkNBE83S335iyu0VRMkblaJw5nGM8pXv4NhK5ySE3cs,9476
+evalscope/constants.py,sha256=Cgzkoz4R3MC3YLtbCM2fmSwF8Z2kuxYdOC8t9FWJj9w,3740
+evalscope/run.py,sha256=LUCdnNzNIfHSWvxu3gxAsHEDX7hT5mcVnV4lSY5h0iA,6007
 evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
 evalscope/summarizer.py,sha256=Wnt8Y61PasOkigo8zMpi1qdFzfETjfnDtCDDQ6VwgDw,5867
-evalscope/version.py,sha256=
+evalscope/version.py,sha256=JzXnfz-D9eKhVPZu2TQUPFaTFhRiZ3iK4jcIuxfnQE8,119
 evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
 evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
@@ -50,21 +50,26 @@ evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=aP8U9zjIDl26X_
 evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/backend/rag_eval/utils/clip.py,sha256=GLHhPCac2AH35AvRLvVqePA1gIMAewHTFmCJCDZzvqU,5015
 evalscope/backend/rag_eval/utils/embedding.py,sha256=x9HAEfZSSAnT2Tdbf-9a5UmBVagCr__ay5A2nMCPMpg,6258
-evalscope/backend/rag_eval/utils/llm.py,sha256=
+evalscope/backend/rag_eval/utils/llm.py,sha256=UIfdvkxVViYkIpX-MoM8sAwGEAozzVFyzX-YoFxXC1E,2607
 evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KBNeXiwQw5SpJA,1529
 evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
 evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
 evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
 evalscope/benchmarks/__init__.py,sha256=b_SWdV1ZyOqFiwc_9lIjKrIvK1rwnF2cCIF7XN9CN8E,932
 evalscope/benchmarks/benchmark.py,sha256=a_7Ctz36McuTyBSTYi56jis9pvOdWhg7JVSPFrbxqR4,2535
-evalscope/benchmarks/data_adapter.py,sha256=
+evalscope/benchmarks/data_adapter.py,sha256=UvbJJTNBvA0aM-xmsaj9jEEsNksn9pTDDr90FfFX2pg,17606
 evalscope/benchmarks/utils.py,sha256=6kxeBz4w8Fw68AYH05a4ncjgkaUV4bU3eaFVLqOdkMI,1321
 evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/aime/aime24_adapter.py,sha256=dBm9yukt4-CByEPUlAPAIN6mL3VkZcI-dw2kz4oQBMo,1715
 evalscope/benchmarks/aime/aime25_adapter.py,sha256=FB_NufY2V7uYdxVnrY_4y81gyyfYDnvedz1_zHdDWt4,1709
+evalscope/benchmarks/alpaca_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=2a6wHJSLe89Xh18u1LBkMQEZzfOURiek6o0-k2lCQgM,4065
 evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
 evalscope/benchmarks/arc/arc_adapter.py,sha256=lkhDz-DYjPQ1vHzo8X4j-0Lq_rBxAnws35_R00pIbNI,6347
+evalscope/benchmarks/arena_hard/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+evalscope/benchmarks/arena_hard/arena_hard_adapter.py,sha256=bdQfLTWB5pFo4hET0uFqu5zMX9PNQNwdoLoGrL5jCBE,6213
+evalscope/benchmarks/arena_hard/utils.py,sha256=NstI1VR5fTaT-bfXRj0cLqm0DtH8EY4EQHR-K9HJubI,5089
 evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/benchmarks/bbh/bbh_adapter.py,sha256=fROpzenrjpEBWtnvM_RL_m0uXPOhXTtYAglJEZbzUdY,8330
 evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
@@ -98,20 +103,20 @@ evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTP
 evalscope/benchmarks/ceval/ceval_adapter.py,sha256=E4QobCjSSkMZtPJyaT_XBVxiqEqa1bta1I9aFnaHOqs,11308
 evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
 evalscope/benchmarks/chinese_simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=
+evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=fYvkJn1UcWM3aqhPMTTtBPVzjTL-Rm_g9UwUJx1FvJc,8106
 evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
 evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=TTq2jRz46Hqc_D_ZBaiw_OwKub1FZX6w8C7g7COIdGs,10372
 evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
 evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
-evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=
+evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=lD7sDro0dSWKgYaM_ZgWbBdetxVURpjo_2q1gvVt1XU,6815
 evalscope/benchmarks/data_collection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=
+evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=ecNwAE3p2eKIeC4whSUdZpeJ8NgidbSFZbIYtSW26Xo,2394
 evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=ZVGzUuuQ0UTOqQtXE40ZyBeMOSl8saSiFEQ5_siJ-c8,5052
 evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=
+evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=KBZDP1T-t7uu8vBLGL_unVdj7rDko3KWBPKqWlw31JQ,4596
 evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
 evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=UB287DtnbkSQXZsbReFJqmQRwbo672DTCeXXilR_-Vc,4790
@@ -125,7 +130,7 @@ evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0
 evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
 evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=UOjakV31J0g7TYbrRls0ItcopWOJu54ucPfaqSJB7Os,5250
 evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=
+evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=xuQ1EK8Af_093qqeOXPIp_iqTWcG5KGOtE6r5hx3958,1858
 evalscope/benchmarks/ifeval/instructions.py,sha256=oaJ9D_4rvS67BraHBNPpDtFd4TblFAnR4A3YW9HWfuY,56304
 evalscope/benchmarks/ifeval/instructions_registry.py,sha256=tVUmhuSwnOidLtI8onOAw_gpJ6bi8FL07GiX19hSuo8,7288
 evalscope/benchmarks/ifeval/instructions_util.py,sha256=vkemXeylJMmgW8LgfQe4cSy2OF-oH_NcSZtzyZDURW4,25780
@@ -140,7 +145,7 @@ evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=8MOECcweL
 evalscope/benchmarks/live_code_bench/load_utils.py,sha256=5i9wtdPLYR8ckjx5MaYQVC2LFYvjKzR6Fa6UZmeOTRc,2445
 evalscope/benchmarks/live_code_bench/pass_k_utils.py,sha256=Ktrp_lXdfFzoHtQNQNdGfIl26ySjaPCHm4Zv-dFvRqM,2024
 evalscope/benchmarks/live_code_bench/prompts.py,sha256=P4KILIAIDT1MKDck0xHYV_6v9820wDZRhxVMazmlL-g,12600
-evalscope/benchmarks/live_code_bench/testing_util.py,sha256=
+evalscope/benchmarks/live_code_bench/testing_util.py,sha256=s5oa--dOcugcpBmHsbeqnTRTDhdiCNXkIQuRc6EgA8o,28241
 evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/math_500/math_500_adapter.py,sha256=SB2eb4Z7DTXdptqirEoctqTdDLEu28s7bLeCAMBmAFo,1923
 evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
@@ -149,6 +154,8 @@ evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=e__Evar99V9l65FlzT6T594CN4iMgmu
 evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
 evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=OANfue-fK543drJrDj6V_zDMtySrQEBHPgTsejr-e7U,4226
+evalscope/benchmarks/mmlu_redux/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py,sha256=ZZMy9exJ8hknr1D6s73sAhHHzBAKcqo7WAmlUtPqpCI,9556
 evalscope/benchmarks/musr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/musr/musr_adapter.py,sha256=Po8hcIQiqlFo0AGjcNQe75cpsMNDcfiJaKgZsk33-DY,2442
 evalscope/benchmarks/process_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -159,7 +166,7 @@ evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO
 evalscope/benchmarks/race/race_adapter.py,sha256=RD0B-i5dzeNKuhqnWbremgf4tk9jmOO4_eLAiITB1F0,6381
 evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
 evalscope/benchmarks/simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=
+evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=CsRUJ0v1sSUmtO6QWkdzisn9OHN-1JSXB-9ghOuNqgY,8988
 evalscope/benchmarks/super_gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/super_gpqa/five_shot_prompt.txt,sha256=vD3RMeQustxY_oWA8IobntjywT8ZUO7Jaub--rElDT4,4718
 evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=BqNLL8BYnK6tRuIdV6ijL4Uym2SejH_h1BV06XNjSE4,9331
@@ -180,17 +187,17 @@ evalscope/cli/start_eval.py,sha256=MXhVDeaMFd6ny88-gnVtQflH660UaDj240YGYnHccx8,7
 evalscope/cli/start_perf.py,sha256=5hLi5jWgM9BJPXLd8d9D1zqrcj_5c0KvkfB1DgD4_RU,831
 evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
 evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
-evalscope/collections/evaluator.py,sha256=
+evalscope/collections/evaluator.py,sha256=YJy8Dj35XCdCwhNDwZecJkeW1_ZgIOsuRLFzfe3SyV8,12724
 evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
 evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
 evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
-evalscope/evaluator/evaluator.py,sha256=
+evalscope/evaluator/evaluator.py,sha256=szRQrXH5ILpUljb14lcunuOt185H8Um1paviTokraA4,19845
 evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
 evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/evaluator/reviewer/auto_reviewer.py,sha256=PDz1Nt_qq0oGZBBwek2-M8VBUNLkOkmYObzR8gO7nEc,16624
 evalscope/metrics/__init__.py,sha256=SWvqzUzdryW5URz6u4fPkP9XSyA09nQ8zBeE8BbchSg,349
 evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
-evalscope/metrics/llm_judge.py,sha256=
+evalscope/metrics/llm_judge.py,sha256=Di0Q1c6VHLl0nQ_TVOZOOQlMApDIU83HuDPTOV8XrTA,4023
 evalscope/metrics/math_parser.py,sha256=uTDudn305G3b8-GboWTrDE6OfrEwAW-areHnoGXZ6Is,17302
 evalscope/metrics/metrics.py,sha256=_YI7RhxlFu_JOgeE3LF9UKu6mJruvyu4FgqVf78Bjb8,13813
 evalscope/metrics/named_metrics.py,sha256=pSHA2_qdi9B5bDHIh08GYhx63odilSwA_T-95K1Usl0,1380
@@ -201,9 +208,9 @@ evalscope/metrics/resources/gpt2-zhcn3-v4.bpe,sha256=J_K-oSt9usFEw87Av6j7ETn3J48
 evalscope/metrics/resources/gpt2-zhcn3-v4.json,sha256=WkM4J_FDPPNQwYi0kj5sM5SVjk2_6bci7tqf8dV9p_U,1289341
 evalscope/models/__init__.py,sha256=i9vcOBMEF_UM7C2gpmh2GsQk3njwqevoQ6A4CnP1fHs,1000
 evalscope/models/base_adapter.py,sha256=7PbRwfD5PIZCBYVds6ZHI8TBY9C5i2LdPOTu88FJWlY,3414
-evalscope/models/chat_adapter.py,sha256=
+evalscope/models/chat_adapter.py,sha256=2XZmdhxnvy4yezPLXNVRbgrs0QkUY2VznEBq5mCYjKs,7106
 evalscope/models/choice_adapter.py,sha256=fnJdo-FMJ-zvNLbEJGc73odgWXIxtVudL00JIf2vzsA,8239
-evalscope/models/custom_adapter.py,sha256=
+evalscope/models/custom_adapter.py,sha256=AGztmZ0aT0g2flh4B4NaiZ8LCDg8tT0gVNxmrP5W1mA,2401
 evalscope/models/local_model.py,sha256=yydggBCLcBAmUWbBhv7o2CA3RbG0DwDZharPdrkbNcg,2628
 evalscope/models/model.py,sha256=diu4TE1ZFWdynTxsl4DejTNsLdwjxoyj2nsKR-Y8EZE,7343
 evalscope/models/register.py,sha256=4vX6AfScAzwD7UkncbuejfAiQHznQkK5hvtG6jEUbWo,809
@@ -212,29 +219,30 @@ evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk
 evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
 evalscope/models/custom/dummy_model.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
 evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalscope/perf/arguments.py,sha256=
-evalscope/perf/benchmark.py,sha256=
+evalscope/perf/arguments.py,sha256=srDp3JMYIPZxkfua5WHkjq3G8lJlTtxdXKxE_CivoJk,9156
+evalscope/perf/benchmark.py,sha256=qY7zrsZMDBr1fABsShXjgK12tNE7PhzGZdLaUtdtxU8,8318
 evalscope/perf/http_client.py,sha256=xMakdQkJ2cgIOd-yOmHEW0vbGKTJ0JWhLFt9IFtUP8Q,7473
-evalscope/perf/main.py,sha256=
+evalscope/perf/main.py,sha256=w-yDbl0osaTAMgC-JNPpqIq2LQ7U4c-Ht7Amj8Nbjc8,1278
 evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
 evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
 evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
 evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqYbndU,2118
 evalscope/perf/plugin/api/custom_api.py,sha256=ay1AGi4y2opjwyRl0J0A54-vLB-pBj3QBFkzog0KA-g,3787
 evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
-evalscope/perf/plugin/api/openai_api.py,sha256=
-evalscope/perf/plugin/datasets/__init__.py,sha256=
+evalscope/perf/plugin/api/openai_api.py,sha256=DNDmW7jT0Abopw-K73X0PE7Vr2wTSKMBj79hJZTi-K8,7668
+evalscope/perf/plugin/datasets/__init__.py,sha256=Z6Jc0RxJS_z0nBBV1-b0-56Ija60AtQ7I_67gY6ZfdQ,568
 evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
 evalscope/perf/plugin/datasets/custom.py,sha256=_GSC5yR90_BjcRjdJqrJT2vHQAzskz5XxYOxngUM2Pg,815
 evalscope/perf/plugin/datasets/flickr8k.py,sha256=UzAIFIO0m5inWOkWM1mO6wfV2HOuXAqiTxCJ4b0SiZM,1589
 evalscope/perf/plugin/datasets/line_by_line.py,sha256=IKVZMpKei6XW9DTm9VEssWHE96i1lTqMf0621dA_img,836
 evalscope/perf/plugin/datasets/longalpaca.py,sha256=2aENqCly_DX1dyNcurYsLFJIvXYFph6jWm7z7XETvMk,1176
-evalscope/perf/plugin/datasets/openqa.py,sha256=
-evalscope/perf/plugin/datasets/
+evalscope/perf/plugin/datasets/openqa.py,sha256=_aVXs2s8wbmtoB6ZO-pNjUZvBVxRUYdoJDGv5-BumtI,1342
+evalscope/perf/plugin/datasets/random_dataset.py,sha256=wPyY5kk2zKnc8u9uYEl-vQ6BLHeWbdC8EHEAZNFSDeU,2702
+evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=J6q7AF_Re5eHLVejXEw9c1jlk1T1PPmist0yO9UFTPE,2432
 evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/perf/utils/analysis_result.py,sha256=ESzaZHGTpr2LoJR3jpOzqMphxSrr79d364ZzD159PmY,1169
 evalscope/perf/utils/benchmark_util.py,sha256=4TyQ_tE5odcjKDFDueI3jrC0vld6QxmTreOd5_SP4vE,5802
-evalscope/perf/utils/db_util.py,sha256=
+evalscope/perf/utils/db_util.py,sha256=OAaR9bK4SPfMuk41w1t4d7ljxPDDEZOzcwDn2s9bpz0,9052
 evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
 evalscope/perf/utils/local_server.py,sha256=clF8i0UFmaxBBB6gX05KvVCyzSv0xzsAidz0_sLLlAk,4627
 evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
@@ -310,24 +318,24 @@ evalscope/utils/utils.py,sha256=lGvn94ryIzx-7WLNJeuyehNTmINt0jYIjrjW12woPCs,9730
 tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
 tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-tests/cli/test_all.py,sha256=
+tests/cli/test_all.py,sha256=tRC4TWaqxEsB6jMsGR7u9RHWHuKzn7Umt2XKY1V8CLU,4035
 tests/cli/test_collection.py,sha256=V-_M7ngwekMGqPuI16jjJZyAK2XLE4Z6QTn-8B5ykgU,4071
-tests/cli/test_run.py,sha256=
+tests/cli/test_run.py,sha256=0gD0nPiioieaDOqRZkS5ruIWuiv1B5D456wSSHv9y40,16471
 tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-tests/perf/test_perf.py,sha256=
+tests/perf/test_perf.py,sha256=mfXTCsD9RaCef3b4CLvm8ErxBUaWzn-EKKhOxD65i3A,3817
 tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/rag/test_clip_benchmark.py,sha256=Ar8Br2CoAFYT2h4zCv_JKMKCGJKbKGYZgNwJ410ZaoU,2597
 tests/rag/test_mteb.py,sha256=t64FXE-ZsOCLiRJrw-dIDIhKd1OXiaglXaeERs0lOh4,4643
-tests/rag/test_ragas.py,sha256=
+tests/rag/test_ragas.py,sha256=fzpn4zZPeZ04ZdfLmwXbsSjf7WcjPWrGsA6RDNXgIEQ,4011
 tests/swift/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 tests/swift/test_run_swift_eval.py,sha256=JKG-0BwTxkbg-XeiXxujPqnVIM3f2EFaJ_9a7p_R4dk,5748
 tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZUQhtYO5e0o,4898
 tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
 tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
-evalscope-0.13.
-evalscope-0.13.
-evalscope-0.13.
-evalscope-0.13.
-evalscope-0.13.
-evalscope-0.13.
+evalscope-0.13.2.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
+evalscope-0.13.2.dist-info/METADATA,sha256=b7rVRQHN5miovM5qlh4Dozpl8OaxO0rg0ctT-kDZMyY,32399
+evalscope-0.13.2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+evalscope-0.13.2.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
+evalscope-0.13.2.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
+evalscope-0.13.2.dist-info/RECORD,,
```
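Among the files added above is evalscope/perf/plugin/datasets/random_dataset.py, the plugin behind the random-prompt stress testing mentioned in the release news. Below is a hedged sketch of driving it programmatically, mirroring the pattern used by tests/perf/test_perf.py; the endpoint, model, tokenizer path, and parameter values are placeholders, and the exact option names should be verified against the stress-test user guide linked in the news entry.

```python
# Hedged sketch: run the perf stress test with the new 'random' dataset plugin.
# URL, model and tokenizer path are placeholders; option names follow the
# stress-test user guide and may differ slightly between versions.
from evalscope.perf.main import run_perf_benchmark

task_cfg = {
    'url': 'http://127.0.0.1:8801/v1/chat/completions',  # OpenAI-compatible endpoint
    'api': 'openai',
    'model': 'qwen2.5-0.5b-instruct',
    'dataset': 'random',                                 # build prompts from random token ids
    'tokenizer_path': 'Qwen/Qwen2.5-0.5B-Instruct',      # tokenizer used to assemble prompts
    'min_prompt_length': 1024,                           # target prompt length range
    'max_prompt_length': 1024,
    'max_tokens': 128,                                   # cap generated output length
    'number': 20,                                        # total requests to send
    'parallel': 2,                                       # concurrent requests
}

run_perf_benchmark(task_cfg)
```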
tests/cli/test_all.py
CHANGED
```diff
@@ -18,31 +18,34 @@ os.environ['LOG_LEVEL'] = 'DEBUG'
 logger = get_logger()

 datasets=[
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    'iquiz',
+    'ifeval',
+    'mmlu',
+    'mmlu_pro',
+    'musr',
+    'process_bench',
+    'race',
+    'trivia_qa',
+    'cmmlu',
+    'humaneval',
+    'gsm8k',
+    'bbh',
+    'competition_math',
+    'math_500',
+    'aime24',
+    'gpqa',
+    'arc',
+    'ceval',
+    'hellaswag',
+    'general_mcq',
+    'general_qa',
     'super_gpqa',
     'live_code_bench',
+    'mmlu_redux',
     'simple_qa',
     'chinese_simpleqa',
+    'alpaca_eval',
+    'arena_hard',
 ]

 dataset_args={
@@ -110,7 +113,13 @@ dataset_args={
             'start_date': '2024-12-01',
             'end_date': '2025-01-01'
         },
-    }
+    },
+    'chinese_simpleqa': {
+        'subset_list': ['中华文化']
+    },
+    'mmlu_redux':{
+        'subset_list': ['abstract_algebra']
+    },
 }

 class TestRun(unittest.TestCase):
@@ -119,13 +128,13 @@ class TestRun(unittest.TestCase):
         from evalscope.config import TaskConfig

         task_cfg = TaskConfig(
-            model='qwen2.5-
+            model='qwen2.5-0.5b-instruct',
             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
             api_key= env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
             datasets=datasets,
             dataset_args=dataset_args,
-            eval_batch_size=
+            eval_batch_size=2,
             limit=2,
             stream=True,
             generation_config={
```