evalscope 0.10.1__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/arguments.py +1 -0
- evalscope/benchmarks/aime24/__init__.py +0 -0
- evalscope/benchmarks/aime24/aime24_adapter.py +49 -0
- evalscope/benchmarks/arc/arc_adapter.py +5 -7
- evalscope/benchmarks/bbh/bbh_adapter.py +17 -9
- evalscope/benchmarks/benchmark.py +2 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
- evalscope/benchmarks/competition_math/competition_math_adapter.py +34 -23
- evalscope/benchmarks/data_adapter.py +18 -12
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +129 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -6
- evalscope/benchmarks/gpqa/gpqa_adapter.py +26 -8
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +8 -13
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
- evalscope/benchmarks/ifeval/ifeval_adapter.py +14 -13
- evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +49 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +27 -15
- evalscope/benchmarks/race/race_adapter.py +3 -3
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -8
- evalscope/collections/evaluator.py +103 -39
- evalscope/collections/sampler.py +2 -1
- evalscope/collections/schema.py +1 -2
- evalscope/config.py +1 -0
- evalscope/evaluator/evaluator.py +78 -64
- evalscope/metrics/math_parser.py +526 -0
- evalscope/metrics/metrics.py +16 -1
- evalscope/metrics/named_metrics.py +31 -7
- evalscope/models/chat_adapter.py +69 -49
- evalscope/models/choice_adapter.py +52 -45
- evalscope/models/custom_adapter.py +2 -2
- evalscope/models/local_model.py +4 -0
- evalscope/models/server_adapter.py +28 -34
- evalscope/report/app.py +30 -15
- evalscope/run.py +10 -7
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/io_utils.py +1 -1
- evalscope/version.py +2 -2
- {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/METADATA +14 -5
- {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/RECORD +53 -46
- tests/cli/test_run.py +93 -16
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/metrics/math_accuracy.py +0 -200
- {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/LICENSE +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/WHEEL +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: evalscope
|
|
3
|
-
Version: 0.10.1
|
|
3
|
+
Version: 0.11.0
|
|
4
4
|
Summary: EvalScope: Lightweight LLMs Evaluation Framework
|
|
5
5
|
Home-page: https://github.com/modelscope/evalscope
|
|
6
6
|
Author: ModelScope team
|
|
@@ -19,10 +19,12 @@ License-File: LICENSE
|
|
|
19
19
|
Requires-Dist: absl-py
|
|
20
20
|
Requires-Dist: accelerate
|
|
21
21
|
Requires-Dist: cachetools
|
|
22
|
-
Requires-Dist: datasets<=3.0
|
|
22
|
+
Requires-Dist: datasets<=3.2.0,>=3.0.0
|
|
23
23
|
Requires-Dist: editdistance
|
|
24
24
|
Requires-Dist: jieba
|
|
25
25
|
Requires-Dist: jsonlines
|
|
26
|
+
Requires-Dist: langdetect
|
|
27
|
+
Requires-Dist: latex2sympy2
|
|
26
28
|
Requires-Dist: matplotlib
|
|
27
29
|
Requires-Dist: modelscope[framework]
|
|
28
30
|
Requires-Dist: nltk>=3.9
|
|
@@ -42,20 +44,24 @@ Requires-Dist: scikit-learn
|
|
|
42
44
|
Requires-Dist: seaborn
|
|
43
45
|
Requires-Dist: sentencepiece
|
|
44
46
|
Requires-Dist: simple-ddl-parser
|
|
47
|
+
Requires-Dist: sympy
|
|
45
48
|
Requires-Dist: tabulate
|
|
46
49
|
Requires-Dist: tiktoken
|
|
47
50
|
Requires-Dist: torch
|
|
48
51
|
Requires-Dist: tqdm
|
|
49
52
|
Requires-Dist: transformers>=4.33
|
|
50
53
|
Requires-Dist: transformers-stream-generator
|
|
54
|
+
Requires-Dist: word2number
|
|
51
55
|
Provides-Extra: all
|
|
52
56
|
Requires-Dist: absl-py; extra == "all"
|
|
53
57
|
Requires-Dist: accelerate; extra == "all"
|
|
54
58
|
Requires-Dist: cachetools; extra == "all"
|
|
55
|
-
Requires-Dist: datasets<=3.0
|
|
59
|
+
Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
|
|
56
60
|
Requires-Dist: editdistance; extra == "all"
|
|
57
61
|
Requires-Dist: jieba; extra == "all"
|
|
58
62
|
Requires-Dist: jsonlines; extra == "all"
|
|
63
|
+
Requires-Dist: langdetect; extra == "all"
|
|
64
|
+
Requires-Dist: latex2sympy2; extra == "all"
|
|
59
65
|
Requires-Dist: matplotlib; extra == "all"
|
|
60
66
|
Requires-Dist: modelscope[framework]; extra == "all"
|
|
61
67
|
Requires-Dist: nltk>=3.9; extra == "all"
|
|
@@ -75,12 +81,14 @@ Requires-Dist: scikit-learn; extra == "all"
|
|
|
75
81
|
Requires-Dist: seaborn; extra == "all"
|
|
76
82
|
Requires-Dist: sentencepiece; extra == "all"
|
|
77
83
|
Requires-Dist: simple-ddl-parser; extra == "all"
|
|
84
|
+
Requires-Dist: sympy; extra == "all"
|
|
78
85
|
Requires-Dist: tabulate; extra == "all"
|
|
79
86
|
Requires-Dist: tiktoken; extra == "all"
|
|
80
87
|
Requires-Dist: torch; extra == "all"
|
|
81
88
|
Requires-Dist: tqdm; extra == "all"
|
|
82
89
|
Requires-Dist: transformers>=4.33; extra == "all"
|
|
83
90
|
Requires-Dist: transformers-stream-generator; extra == "all"
|
|
91
|
+
Requires-Dist: word2number; extra == "all"
|
|
84
92
|
Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
|
|
85
93
|
Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
|
|
86
94
|
Requires-Dist: mteb==1.19.4; extra == "all"
|
|
@@ -92,10 +100,10 @@ Requires-Dist: numpy; extra == "all"
|
|
|
92
100
|
Requires-Dist: sse-starlette; extra == "all"
|
|
93
101
|
Requires-Dist: transformers; extra == "all"
|
|
94
102
|
Requires-Dist: unicorn; extra == "all"
|
|
95
|
-
Requires-Dist: gradio
|
|
103
|
+
Requires-Dist: gradio==5.4.0; extra == "all"
|
|
96
104
|
Requires-Dist: plotly>=5.23.0; extra == "all"
|
|
97
105
|
Provides-Extra: app
|
|
98
|
-
Requires-Dist: gradio
|
|
106
|
+
Requires-Dist: gradio==5.4.0; extra == "app"
|
|
99
107
|
Requires-Dist: plotly>=5.23.0; extra == "app"
|
|
100
108
|
Provides-Extra: inner
|
|
101
109
|
Requires-Dist: absl-py; extra == "inner"
|
|
@@ -215,6 +223,7 @@ Please scan the QR code below to join our community groups:
|
|
|
215
223
|
|
|
216
224
|
|
|
217
225
|
## 🎉 News
|
|
226
|
+
- 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets, refer to [best practice](https://evalscope.readthedocs.io/zh-cn/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
|
|
218
227
|
- 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
|
|
219
228
|
- 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
|
|
220
229
|
- 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
|
|
2
|
-
evalscope/arguments.py,sha256=
|
|
3
|
-
evalscope/config.py,sha256=
|
|
2
|
+
evalscope/arguments.py,sha256=r8gOMX6i8dWMl_WXLsBdHla7cuauBAyv9apky9VxLsE,4598
|
|
3
|
+
evalscope/config.py,sha256=D7C_K0f0xsfzFUSNSJJUTz3n9tmA6zLDbf8pZ_9ltpw,8600
|
|
4
4
|
evalscope/constants.py,sha256=bkcDVbB4Pr1Qxz83qefcWjEetVGiHTcx3m84WX14ASI,3330
|
|
5
|
-
evalscope/run.py,sha256=
|
|
5
|
+
evalscope/run.py,sha256=qfMqVWlUiXEiIJ665p3-IYWknhIeNZkCJe3Yn07Y74U,5692
|
|
6
6
|
evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
|
|
7
7
|
evalscope/summarizer.py,sha256=Wnt8Y61PasOkigo8zMpi1qdFzfETjfnDtCDDQ6VwgDw,5867
|
|
8
|
-
evalscope/version.py,sha256=
|
|
8
|
+
evalscope/version.py,sha256=h6YAZAgeAreWmKtpfr4D6BEvnWZxb1bka9hrpYOO0l8,119
|
|
9
9
|
evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
10
|
evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
|
|
11
11
|
evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
|
|
@@ -56,13 +56,15 @@ evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4ty
|
|
|
56
56
|
evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
|
|
57
57
|
evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
|
|
58
58
|
evalscope/benchmarks/__init__.py,sha256=b_SWdV1ZyOqFiwc_9lIjKrIvK1rwnF2cCIF7XN9CN8E,932
|
|
59
|
-
evalscope/benchmarks/benchmark.py,sha256=
|
|
60
|
-
evalscope/benchmarks/data_adapter.py,sha256=
|
|
59
|
+
evalscope/benchmarks/benchmark.py,sha256=IY2xYmNR58aYnZK7rnUDONWiLQopo_ZifGS2SfN2L-Q,2422
|
|
60
|
+
evalscope/benchmarks/data_adapter.py,sha256=xCBvJe4ubgpP1J8ElcWAJwF6B5CSrBEv_uMwQzlUaLY,12540
|
|
61
|
+
evalscope/benchmarks/aime24/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
62
|
+
evalscope/benchmarks/aime24/aime24_adapter.py,sha256=FYH8NsT1nis3VoBMzRM_ueOsGNXjOKZCa6J_wpUM3RQ,1772
|
|
61
63
|
evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
62
64
|
evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
|
|
63
|
-
evalscope/benchmarks/arc/arc_adapter.py,sha256=
|
|
65
|
+
evalscope/benchmarks/arc/arc_adapter.py,sha256=vfwAy01LA141qn1lsSyZmEIGWbbhOCRMOGoSM-K2z6M,6490
|
|
64
66
|
evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
65
|
-
evalscope/benchmarks/bbh/bbh_adapter.py,sha256=
|
|
67
|
+
evalscope/benchmarks/bbh/bbh_adapter.py,sha256=37wY3r1qW5qdjyKF-8n7UIM0IVcpaQugMb5Rkjbppxg,8524
|
|
66
68
|
evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
|
|
67
69
|
evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt,sha256=sfo-2iOeVzB0OGgd7NSQFELTGDTsr2DQ3u-g0ivI-sM,3653
|
|
68
70
|
evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt,sha256=UJBsc3Mwz8TZngdWH_NFlhhNbLhNHK6FvW9FHcS8H5g,1167
|
|
@@ -91,55 +93,60 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt
|
|
|
91
93
|
evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
|
|
92
94
|
evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
|
|
93
95
|
evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
94
|
-
evalscope/benchmarks/ceval/ceval_adapter.py,sha256=
|
|
96
|
+
evalscope/benchmarks/ceval/ceval_adapter.py,sha256=Qz2oNGw0H_4FtfY-Izdxv9fgwxScJksyvwzeQw-aVyo,11374
|
|
95
97
|
evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
|
|
96
|
-
evalscope/benchmarks/ceval/samples.jsonl,sha256=dyWhGAdt4eq6Amgu2Ykx8RevUJVFtbhGFSTbDAeUgHc,448
|
|
97
98
|
evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
98
99
|
evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
|
|
99
|
-
evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=
|
|
100
|
+
evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=1RmhI0SNxHK-Fz-iTIR76zeBRDLlm0m6_7rJywqk3Rk,10446
|
|
100
101
|
evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
|
|
101
102
|
evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
102
103
|
evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
|
|
103
|
-
evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=
|
|
104
|
+
evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=i0E4TNajMVcWT8lc5haIjKvdmHuI5qzgpssIm5Fw7bs,7413
|
|
105
|
+
evalscope/benchmarks/data_collection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
106
|
+
evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=o3Q6ke-RLx4qUbF5FgASZogv3-kCJ6qpK43F_LARU3Y,2496
|
|
107
|
+
evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
108
|
+
evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=vDHgsWpsIZQWNadl3mI8M3rDKkvPM2N2KAkW-8aeOHY,5130
|
|
104
109
|
evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
105
|
-
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=
|
|
110
|
+
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=fu14ZzGYyg2MEdJbxZGBoIbais6xA9Um2BEAJTvBZZM,3823
|
|
106
111
|
evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
107
112
|
evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
|
|
108
|
-
evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=
|
|
113
|
+
evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=tiy8Cn1ZmNKjVg8lqNAxWBbsKp8h0uiDNpWuHfcID0A,4689
|
|
109
114
|
evalscope/benchmarks/gsm8k/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
110
115
|
evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTvt5hpXg,4233
|
|
111
|
-
evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=
|
|
116
|
+
evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=4qtMX_SfqkXRMgGLOA6tNGMK9EkITWbjLlJT9gWbT20,10664
|
|
112
117
|
evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
113
118
|
evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
|
|
114
|
-
evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=
|
|
119
|
+
evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=qArX2umdrYJZkDA9i3XGBGljCton99v5Yss9be9iZYw,6269
|
|
115
120
|
evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
116
121
|
evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
|
|
117
|
-
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=
|
|
122
|
+
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=onacZB_6SF9239Ly-U70__WYsinS9iWpnf3oiYMNxKc,5164
|
|
118
123
|
evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
119
|
-
evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=
|
|
124
|
+
evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=3HsAdNj5JJGCFA17sPXi-59yv-pfcB0UeXKdY_mQcwU,2015
|
|
120
125
|
evalscope/benchmarks/ifeval/instructions.py,sha256=oaJ9D_4rvS67BraHBNPpDtFd4TblFAnR4A3YW9HWfuY,56304
|
|
121
126
|
evalscope/benchmarks/ifeval/instructions_registry.py,sha256=tVUmhuSwnOidLtI8onOAw_gpJ6bi8FL07GiX19hSuo8,7288
|
|
122
127
|
evalscope/benchmarks/ifeval/instructions_util.py,sha256=vkemXeylJMmgW8LgfQe4cSy2OF-oH_NcSZtzyZDURW4,25780
|
|
123
128
|
evalscope/benchmarks/ifeval/utils.py,sha256=TKrM1m2qDCUauahogItDdICf4mDk0OjasSxgnxjt2KY,4517
|
|
124
129
|
evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
125
|
-
evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=
|
|
130
|
+
evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=nv4mzKOPp1YPcr6e7daZuZyQ3jRNNG6PUzi38REuwSk,2356
|
|
131
|
+
evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
132
|
+
evalscope/benchmarks/math_500/math_500_adapter.py,sha256=mBzsllop5sTHw-uK04FjhEWDiEDjDaNUFDUBIVN7Xgg,1742
|
|
126
133
|
evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
127
134
|
evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
|
|
128
|
-
evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256
|
|
135
|
+
evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=pmT1y9dbWJcZK3U6hkXa3-lBDABx7DhQ7oHc3O-Nkg0,11769
|
|
129
136
|
evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
|
|
130
137
|
evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
131
|
-
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=
|
|
138
|
+
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=Fdrj26MfYmPzio2tI23WTcofrwD69_m41mkVpvlxzVU,4815
|
|
132
139
|
evalscope/benchmarks/race/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
133
140
|
evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO61nU,3608
|
|
134
|
-
evalscope/benchmarks/race/race_adapter.py,sha256=
|
|
141
|
+
evalscope/benchmarks/race/race_adapter.py,sha256=dC9I-3T9UFh2OVpmWKRmSszPOlFZAZ40xOPa4zN3daI,6661
|
|
135
142
|
evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
|
|
136
143
|
evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
137
144
|
evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
|
|
138
145
|
evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI2IicPzRsTHqb5A,3070
|
|
139
|
-
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=
|
|
146
|
+
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=GVuJT-Xz4ugVtcUSTRxcBgViHVowcqJf3yVsotcZoZI,5062
|
|
140
147
|
evalscope/benchmarks/truthful_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
141
148
|
evalscope/benchmarks/truthful_qa/truthful_qa.py,sha256=A4abSL6WMRcXez8flxsHy-0ZFyRg-moq9rTeOA1TalY,6909
|
|
142
|
-
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=
|
|
149
|
+
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=6rT1zuQh0nLuYymcchO-cMP98EY0vWizbfTfnUERWgo,12905
|
|
143
150
|
evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
144
151
|
evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
|
|
145
152
|
evalscope/cli/cli.py,sha256=w_dtXljur9s5lmTn6LbbFL_viTPQB1WAEzhYcId09Og,729
|
|
@@ -148,19 +155,19 @@ evalscope/cli/start_eval.py,sha256=2lyD2WSQ0DnP6T31VvTimQ-6POnwxeEP9GLPFnT7Tfo,7
|
|
|
148
155
|
evalscope/cli/start_perf.py,sha256=lEHJBSpzNsO4KGlWfQc-EfZGXq1M_FpOwtRxRdb4fso,813
|
|
149
156
|
evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
|
|
150
157
|
evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
|
|
151
|
-
evalscope/collections/evaluator.py,sha256=
|
|
152
|
-
evalscope/collections/sampler.py,sha256=
|
|
153
|
-
evalscope/collections/schema.py,sha256=
|
|
158
|
+
evalscope/collections/evaluator.py,sha256=FJx3KGdLi0-TIqWC_067HEmA4P298BKdwHIrbcai46M,12065
|
|
159
|
+
evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
|
|
160
|
+
evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
|
|
154
161
|
evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
|
|
155
|
-
evalscope/evaluator/evaluator.py,sha256=
|
|
162
|
+
evalscope/evaluator/evaluator.py,sha256=E0NiP5O56WbF8eiUmw9IY2ouotRog9H-2SRyTzZld0I,17569
|
|
156
163
|
evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
|
|
157
164
|
evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
158
165
|
evalscope/evaluator/reviewer/auto_reviewer.py,sha256=PDz1Nt_qq0oGZBBwek2-M8VBUNLkOkmYObzR8gO7nEc,16624
|
|
159
166
|
evalscope/metrics/__init__.py,sha256=yzuZjXufrPqVhzNTNaJLJwhs7-Sgb-iNG0I3BdOX7Tg,291
|
|
160
167
|
evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
|
|
161
|
-
evalscope/metrics/
|
|
162
|
-
evalscope/metrics/metrics.py,sha256=
|
|
163
|
-
evalscope/metrics/named_metrics.py,sha256=
|
|
168
|
+
evalscope/metrics/math_parser.py,sha256=uTDudn305G3b8-GboWTrDE6OfrEwAW-areHnoGXZ6Is,17302
|
|
169
|
+
evalscope/metrics/metrics.py,sha256=r4FHyEvvFhMu0vAHBw-ByFefObDBC3DQdr53klSk6Wk,13325
|
|
170
|
+
evalscope/metrics/named_metrics.py,sha256=SeBXmgWyK4y4tKiGKro3k-CZU1OShuKe6qxwpT3tizY,1313
|
|
164
171
|
evalscope/metrics/rouge_metric.py,sha256=zhIUqenSuxnORR9tamLQBGjFwP91Zei2UiLtcOyseVM,4639
|
|
165
172
|
evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
|
|
166
173
|
evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=Kq6AObenmLVQ5tN3NgN042a6mgRFQmRO21-ohd9mSa8,11972
|
|
@@ -168,12 +175,12 @@ evalscope/metrics/resources/gpt2-zhcn3-v4.bpe,sha256=J_K-oSt9usFEw87Av6j7ETn3J48
|
|
|
168
175
|
evalscope/metrics/resources/gpt2-zhcn3-v4.json,sha256=WkM4J_FDPPNQwYi0kj5sM5SVjk2_6bci7tqf8dV9p_U,1289341
|
|
169
176
|
evalscope/models/__init__.py,sha256=pafIEbJq_2DrYjQbgI0SNVxywNYOxvqwk7Dr1P7KEwk,923
|
|
170
177
|
evalscope/models/base_adapter.py,sha256=fT3i8c9jRmz_VBcUYMMmXrlCM6JWcixPdgak5yT6Wkw,2177
|
|
171
|
-
evalscope/models/chat_adapter.py,sha256=
|
|
172
|
-
evalscope/models/choice_adapter.py,sha256=
|
|
173
|
-
evalscope/models/custom_adapter.py,sha256=
|
|
174
|
-
evalscope/models/local_model.py,sha256=
|
|
178
|
+
evalscope/models/chat_adapter.py,sha256=nOrNDuvuNKkTcW9zNcR_EIqbzkqK5PFws-5YsSxBR9E,6120
|
|
179
|
+
evalscope/models/choice_adapter.py,sha256=jj_6KB1BAsvv4Yufn2bM2tCiLovFUum2368lseogmb8,8036
|
|
180
|
+
evalscope/models/custom_adapter.py,sha256=Ed_MGEcZxKK4mkXTpUY4GXTsayprHzIEOC1L9gqwjf4,2284
|
|
181
|
+
evalscope/models/local_model.py,sha256=s0YVX9Djqazusk7qzSpWQB76jGGuzJxqQlZzomsCFsk,2621
|
|
175
182
|
evalscope/models/model.py,sha256=diu4TE1ZFWdynTxsl4DejTNsLdwjxoyj2nsKR-Y8EZE,7343
|
|
176
|
-
evalscope/models/server_adapter.py,sha256=
|
|
183
|
+
evalscope/models/server_adapter.py,sha256=iVJuUJlHGVGxnlrDMnbHZ8WQ4OR2HK5HrXH4obD2_cg,4173
|
|
177
184
|
evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk77ksNQ,102
|
|
178
185
|
evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
|
|
179
186
|
evalscope/models/custom/dummy_model.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
|
|
@@ -225,7 +232,7 @@ evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHM
|
|
|
225
232
|
evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
|
|
226
233
|
evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
|
|
227
234
|
evalscope/report/__init__.py,sha256=0Wes3ot2hy9s-WwZaBztst8qkNrXkOF-Hwa1WW1e8lY,260
|
|
228
|
-
evalscope/report/app.py,sha256=
|
|
235
|
+
evalscope/report/app.py,sha256=adP1rVVOxYMbCTdopV3FKWBhUzB7t1AXcDOxW4Ct56g,26647
|
|
229
236
|
evalscope/report/combinator.py,sha256=bi6nvTbMrzraZ8kUZ6mIMikk8-qEIVYUhdaH4RE1Tg8,2653
|
|
230
237
|
evalscope/report/generator.py,sha256=2DULY9W8QCUxdtyfNjo8XAP_YxI1LgR95jknK__kYPU,3600
|
|
231
238
|
evalscope/report/utils.py,sha256=DRlbjbqHEmM8rGlA4pwtlHFhOZtyUzcqiS-mejfIDkU,4584
|
|
@@ -258,9 +265,9 @@ evalscope/third_party/toolbench_static/llm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1
|
|
|
258
265
|
evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=usmVelh0ogBlCtSUL0dqp89w2mAqH1Ptv9MURVoGrc8,1209
|
|
259
266
|
evalscope/utils/__init__.py,sha256=jLVoGryuqUh4Km9QWWQBzpqkcVNRK0MbwNaSgckqdiU,139
|
|
260
267
|
evalscope/utils/arena_utils.py,sha256=Gf8VpH4C_oF2Abif_QeL0rAP6tvTzsc0gglpdNkUE48,7155
|
|
261
|
-
evalscope/utils/chat_service.py,sha256=
|
|
268
|
+
evalscope/utils/chat_service.py,sha256=eZ8uyVeVFpXZo_uvRFyVhnFyJpL14zcn9UA6K4Ax5J4,8676
|
|
262
269
|
evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9figgPN4C97c,2933
|
|
263
|
-
evalscope/utils/io_utils.py,sha256=
|
|
270
|
+
evalscope/utils/io_utils.py,sha256=Tjdgen1FsAA4ArqiUzu734L0Px5NuiS0GKRRiGIzxSA,4192
|
|
264
271
|
evalscope/utils/logger.py,sha256=49F2WDi1g_o8aW8Z29wOt9YHE9LDqkHIgb-d8TVybJY,3635
|
|
265
272
|
evalscope/utils/model_utils.py,sha256=PK7pKNY8ovtGZHNRvDpZ-d8zBHMOkxd6fRVkM8VF06I,736
|
|
266
273
|
evalscope/utils/utils.py,sha256=a6a2vDDxqlj7nY8xynkKkWs_ZPXEU2UMwvxp0JEpHjg,9686
|
|
@@ -268,7 +275,7 @@ tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
|
268
275
|
tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
|
|
269
276
|
tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
270
277
|
tests/cli/test_collection.py,sha256=gx3GySIAPNaLUSf3D3Q3V0WZc21BPdNthIbECHQN0TI,3026
|
|
271
|
-
tests/cli/test_run.py,sha256=
|
|
278
|
+
tests/cli/test_run.py,sha256=gtId2SF1LlDCIn4S_WKRpAyTig_pWOhY8yto4P5B1EY,8303
|
|
272
279
|
tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
273
280
|
tests/perf/test_perf.py,sha256=iB8Mg565SfwPsObdAByHYfZNqN71kUtPW7ucmyiOWo8,3025
|
|
274
281
|
tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -281,9 +288,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
|
|
|
281
288
|
tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
|
|
282
289
|
tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
283
290
|
tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
|
|
284
|
-
evalscope-0.
|
|
285
|
-
evalscope-0.
|
|
286
|
-
evalscope-0.
|
|
287
|
-
evalscope-0.
|
|
288
|
-
evalscope-0.
|
|
289
|
-
evalscope-0.
|
|
291
|
+
evalscope-0.11.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
|
|
292
|
+
evalscope-0.11.0.dist-info/METADATA,sha256=GL8Ybyby65DYg8jxjxzdcFYvXBhKzE7eRFIBRiJ0-hc,29584
|
|
293
|
+
evalscope-0.11.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
|
294
|
+
evalscope-0.11.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
|
|
295
|
+
evalscope-0.11.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
|
|
296
|
+
evalscope-0.11.0.dist-info/RECORD,,
|
tests/cli/test_run.py
CHANGED
|
@@ -73,16 +73,18 @@ class TestRun(unittest.TestCase):
|
|
|
73
73
|
def test_run_task(self):
|
|
74
74
|
task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct',
|
|
75
75
|
'datasets': [
|
|
76
|
-
'mmlu_pro',
|
|
76
|
+
# 'mmlu_pro',
|
|
77
77
|
# 'bbh',
|
|
78
|
-
'hellaswag',
|
|
78
|
+
# 'hellaswag',
|
|
79
79
|
# 'gsm8k',
|
|
80
|
-
# 'arc'
|
|
80
|
+
# 'arc',
|
|
81
81
|
# 'race',
|
|
82
|
+
'ifeval',
|
|
82
83
|
# 'truthful_qa',
|
|
83
84
|
# 'trivia_qa',
|
|
84
85
|
],
|
|
85
|
-
'limit':
|
|
86
|
+
'limit': 2,
|
|
87
|
+
'eval_batch_size': 2,
|
|
86
88
|
'debug': True}
|
|
87
89
|
run_task(task_cfg=task_cfg)
|
|
88
90
|
|
|
@@ -93,9 +95,9 @@ class TestRun(unittest.TestCase):
|
|
|
93
95
|
|
|
94
96
|
task_cfg = TaskConfig(
|
|
95
97
|
model='qwen/Qwen2-0.5B-Instruct',
|
|
96
|
-
datasets=['
|
|
98
|
+
datasets=['general_mcq', 'general_qa'], # 数据格式,选择题格式固定为 'ceval'
|
|
97
99
|
dataset_args={
|
|
98
|
-
'
|
|
100
|
+
'general_mcq': {
|
|
99
101
|
'local_path': 'custom_eval/text/mcq', # 自定义数据集路径
|
|
100
102
|
'subset_list': [
|
|
101
103
|
'example' # 评测数据集名称,上述 *_dev.csv 中的 *
|
|
@@ -117,8 +119,17 @@ class TestRun(unittest.TestCase):
|
|
|
117
119
|
|
|
118
120
|
task_cfg = TaskConfig(
|
|
119
121
|
model='qwen/Qwen2-0.5B-Instruct',
|
|
120
|
-
datasets=[
|
|
121
|
-
|
|
122
|
+
datasets=[
|
|
123
|
+
# 'math_500',
|
|
124
|
+
# 'aime24',
|
|
125
|
+
'competition_math'
|
|
126
|
+
],
|
|
127
|
+
dataset_args={
|
|
128
|
+
'competition_math': {
|
|
129
|
+
'subset_list': ['Level 4', 'Level 5']
|
|
130
|
+
}
|
|
131
|
+
},
|
|
132
|
+
limit=5
|
|
122
133
|
)
|
|
123
134
|
|
|
124
135
|
run_task(task_cfg=task_cfg)
|
|
@@ -128,12 +139,12 @@ class TestRun(unittest.TestCase):
|
|
|
128
139
|
from evalscope.config import TaskConfig
|
|
129
140
|
|
|
130
141
|
task_cfg = TaskConfig(
|
|
131
|
-
model='Qwen2.5-
|
|
142
|
+
model='Qwen2.5-0.5B-Instruct',
|
|
132
143
|
api_url='http://127.0.0.1:8801/v1/chat/completions',
|
|
133
144
|
api_key='EMPTY',
|
|
134
145
|
eval_type=EvalType.SERVICE,
|
|
135
146
|
datasets=[
|
|
136
|
-
'iquiz',
|
|
147
|
+
# 'iquiz',
|
|
137
148
|
# 'ifeval',
|
|
138
149
|
# 'mmlu',
|
|
139
150
|
# 'mmlu_pro',
|
|
@@ -141,25 +152,91 @@ class TestRun(unittest.TestCase):
|
|
|
141
152
|
# 'trivia_qa',
|
|
142
153
|
# 'cmmlu',
|
|
143
154
|
# 'humaneval',
|
|
144
|
-
# 'competition_math',
|
|
145
155
|
# 'gsm8k',
|
|
156
|
+
# 'bbh',
|
|
157
|
+
'competition_math',
|
|
158
|
+
'math_500',
|
|
159
|
+
'aime24',
|
|
160
|
+
'gpqa',
|
|
146
161
|
# 'arc',
|
|
147
162
|
# 'ceval',
|
|
148
|
-
# 'bbh',
|
|
149
163
|
# 'hellaswag',
|
|
150
164
|
],
|
|
151
165
|
dataset_args={
|
|
166
|
+
'mmlu': {
|
|
167
|
+
'subset_list': ['elementary_mathematics'],
|
|
168
|
+
'few_shot_num': 0
|
|
169
|
+
},
|
|
170
|
+
'mmlu_pro': {
|
|
171
|
+
'subset_list': ['math'],
|
|
172
|
+
'few_shot_num': 0
|
|
173
|
+
},
|
|
152
174
|
'ceval': {
|
|
153
175
|
'subset_list': [
|
|
154
|
-
'computer_network', 'operating_system', 'computer_architecture'
|
|
155
|
-
]
|
|
156
|
-
|
|
176
|
+
'computer_network', 'operating_system', 'computer_architecture'
|
|
177
|
+
],
|
|
178
|
+
'few_shot_num': 0
|
|
179
|
+
},
|
|
180
|
+
'cmmlu': {
|
|
181
|
+
'subset_list': ['elementary_chinese'],
|
|
182
|
+
'few_shot_num': 0
|
|
183
|
+
},
|
|
184
|
+
'bbh': {
|
|
185
|
+
'subset_list': ['word_sorting', 'movie_recommendation'],
|
|
186
|
+
},
|
|
187
|
+
'gpqa': {
|
|
188
|
+
'subset_list': ['gpqa_diamond'],
|
|
189
|
+
'few_shot_num': 0
|
|
190
|
+
},
|
|
191
|
+
'humaneval': {
|
|
192
|
+
'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
|
|
193
|
+
},
|
|
194
|
+
'competition_math': {
|
|
195
|
+
'subset_list': ['Level 1']
|
|
196
|
+
},
|
|
197
|
+
},
|
|
198
|
+
eval_batch_size=5,
|
|
199
|
+
limit=10,
|
|
200
|
+
debug=True,
|
|
201
|
+
generation_config={
|
|
202
|
+
'temperature': 0.7,
|
|
203
|
+
'n': 5
|
|
157
204
|
},
|
|
158
|
-
|
|
205
|
+
use_cache='/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250212_150525'
|
|
159
206
|
)
|
|
160
207
|
|
|
161
208
|
run_task(task_cfg=task_cfg)
|
|
162
209
|
|
|
163
210
|
|
|
211
|
+
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
212
|
+
def test_run_batch_eval(self):
|
|
213
|
+
from evalscope.config import TaskConfig
|
|
214
|
+
|
|
215
|
+
task_cfg = TaskConfig(
|
|
216
|
+
model='LLM-Research/Llama-3.2-1B-Instruct',
|
|
217
|
+
datasets=[
|
|
218
|
+
# 'math_500',
|
|
219
|
+
# 'aime24',
|
|
220
|
+
# 'competition_math'
|
|
221
|
+
# 'arc',
|
|
222
|
+
'gsm8k'
|
|
223
|
+
# 'truthful_qa'
|
|
224
|
+
],
|
|
225
|
+
dataset_args={
|
|
226
|
+
'competition_math': {
|
|
227
|
+
'subset_list': ['Level 4', 'Level 5']
|
|
228
|
+
}
|
|
229
|
+
},
|
|
230
|
+
eval_batch_size=2,
|
|
231
|
+
limit=5,
|
|
232
|
+
generation_config={
|
|
233
|
+
'max_new_tokens': 2048,
|
|
234
|
+
'temperature': 0.7,
|
|
235
|
+
'num_return_sequences': 2,
|
|
236
|
+
}
|
|
237
|
+
)
|
|
238
|
+
|
|
239
|
+
run_task(task_cfg=task_cfg)
|
|
240
|
+
|
|
164
241
|
if __name__ == '__main__':
|
|
165
242
|
unittest.main()
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{'id': 0, 'question': '下列关于税法基本原则的表述中,不正确的是____。', 'A': '税收法定原则包括税收要件法定原则和税务合法性原则', 'B': '税收公平原则源于法律上的平等性原则', 'C': '税收效率原则包含经济效率和行政效率两个方面', 'D': '税务机关按法定程序依法征税,可以自由做出减征、停征或免征税款的决定', 'answer': 'D', 'explanation': ''}
|