evalscope 0.10.1__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (55) hide show
  1. evalscope/arguments.py +1 -0
  2. evalscope/benchmarks/aime24/__init__.py +0 -0
  3. evalscope/benchmarks/aime24/aime24_adapter.py +49 -0
  4. evalscope/benchmarks/arc/arc_adapter.py +5 -7
  5. evalscope/benchmarks/bbh/bbh_adapter.py +17 -9
  6. evalscope/benchmarks/benchmark.py +2 -2
  7. evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
  8. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
  9. evalscope/benchmarks/competition_math/competition_math_adapter.py +34 -23
  10. evalscope/benchmarks/data_adapter.py +18 -12
  11. evalscope/benchmarks/data_collection/__init__.py +0 -0
  12. evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
  13. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  14. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +129 -0
  15. evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -6
  16. evalscope/benchmarks/gpqa/gpqa_adapter.py +26 -8
  17. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +8 -13
  18. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
  19. evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
  20. evalscope/benchmarks/ifeval/ifeval_adapter.py +14 -13
  21. evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
  22. evalscope/benchmarks/math_500/__init__.py +0 -0
  23. evalscope/benchmarks/math_500/math_500_adapter.py +49 -0
  24. evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
  25. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +27 -15
  26. evalscope/benchmarks/race/race_adapter.py +3 -3
  27. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
  28. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -8
  29. evalscope/collections/evaluator.py +103 -39
  30. evalscope/collections/sampler.py +2 -1
  31. evalscope/collections/schema.py +1 -2
  32. evalscope/config.py +1 -0
  33. evalscope/evaluator/evaluator.py +78 -64
  34. evalscope/metrics/math_parser.py +526 -0
  35. evalscope/metrics/metrics.py +16 -1
  36. evalscope/metrics/named_metrics.py +31 -7
  37. evalscope/models/chat_adapter.py +69 -49
  38. evalscope/models/choice_adapter.py +52 -45
  39. evalscope/models/custom_adapter.py +2 -2
  40. evalscope/models/local_model.py +4 -0
  41. evalscope/models/server_adapter.py +28 -34
  42. evalscope/report/app.py +30 -15
  43. evalscope/run.py +10 -7
  44. evalscope/utils/chat_service.py +2 -2
  45. evalscope/utils/io_utils.py +1 -1
  46. evalscope/version.py +2 -2
  47. {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/METADATA +14 -5
  48. {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/RECORD +53 -46
  49. tests/cli/test_run.py +93 -16
  50. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  51. evalscope/metrics/math_accuracy.py +0 -200
  52. {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/LICENSE +0 -0
  53. {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/WHEEL +0 -0
  54. {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/entry_points.txt +0 -0
  55. {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: evalscope
3
- Version: 0.10.1
3
+ Version: 0.11.0
4
4
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
5
5
  Home-page: https://github.com/modelscope/evalscope
6
6
  Author: ModelScope team
@@ -19,10 +19,12 @@ License-File: LICENSE
19
19
  Requires-Dist: absl-py
20
20
  Requires-Dist: accelerate
21
21
  Requires-Dist: cachetools
22
- Requires-Dist: datasets<=3.0.1,>=3.0.0
22
+ Requires-Dist: datasets<=3.2.0,>=3.0.0
23
23
  Requires-Dist: editdistance
24
24
  Requires-Dist: jieba
25
25
  Requires-Dist: jsonlines
26
+ Requires-Dist: langdetect
27
+ Requires-Dist: latex2sympy2
26
28
  Requires-Dist: matplotlib
27
29
  Requires-Dist: modelscope[framework]
28
30
  Requires-Dist: nltk>=3.9
@@ -42,20 +44,24 @@ Requires-Dist: scikit-learn
42
44
  Requires-Dist: seaborn
43
45
  Requires-Dist: sentencepiece
44
46
  Requires-Dist: simple-ddl-parser
47
+ Requires-Dist: sympy
45
48
  Requires-Dist: tabulate
46
49
  Requires-Dist: tiktoken
47
50
  Requires-Dist: torch
48
51
  Requires-Dist: tqdm
49
52
  Requires-Dist: transformers>=4.33
50
53
  Requires-Dist: transformers-stream-generator
54
+ Requires-Dist: word2number
51
55
  Provides-Extra: all
52
56
  Requires-Dist: absl-py; extra == "all"
53
57
  Requires-Dist: accelerate; extra == "all"
54
58
  Requires-Dist: cachetools; extra == "all"
55
- Requires-Dist: datasets<=3.0.1,>=3.0.0; extra == "all"
59
+ Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
56
60
  Requires-Dist: editdistance; extra == "all"
57
61
  Requires-Dist: jieba; extra == "all"
58
62
  Requires-Dist: jsonlines; extra == "all"
63
+ Requires-Dist: langdetect; extra == "all"
64
+ Requires-Dist: latex2sympy2; extra == "all"
59
65
  Requires-Dist: matplotlib; extra == "all"
60
66
  Requires-Dist: modelscope[framework]; extra == "all"
61
67
  Requires-Dist: nltk>=3.9; extra == "all"
@@ -75,12 +81,14 @@ Requires-Dist: scikit-learn; extra == "all"
75
81
  Requires-Dist: seaborn; extra == "all"
76
82
  Requires-Dist: sentencepiece; extra == "all"
77
83
  Requires-Dist: simple-ddl-parser; extra == "all"
84
+ Requires-Dist: sympy; extra == "all"
78
85
  Requires-Dist: tabulate; extra == "all"
79
86
  Requires-Dist: tiktoken; extra == "all"
80
87
  Requires-Dist: torch; extra == "all"
81
88
  Requires-Dist: tqdm; extra == "all"
82
89
  Requires-Dist: transformers>=4.33; extra == "all"
83
90
  Requires-Dist: transformers-stream-generator; extra == "all"
91
+ Requires-Dist: word2number; extra == "all"
84
92
  Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
85
93
  Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
86
94
  Requires-Dist: mteb==1.19.4; extra == "all"
@@ -92,10 +100,10 @@ Requires-Dist: numpy; extra == "all"
92
100
  Requires-Dist: sse-starlette; extra == "all"
93
101
  Requires-Dist: transformers; extra == "all"
94
102
  Requires-Dist: unicorn; extra == "all"
95
- Requires-Dist: gradio>=5.4.0; extra == "all"
103
+ Requires-Dist: gradio==5.4.0; extra == "all"
96
104
  Requires-Dist: plotly>=5.23.0; extra == "all"
97
105
  Provides-Extra: app
98
- Requires-Dist: gradio>=5.4.0; extra == "app"
106
+ Requires-Dist: gradio==5.4.0; extra == "app"
99
107
  Requires-Dist: plotly>=5.23.0; extra == "app"
100
108
  Provides-Extra: inner
101
109
  Requires-Dist: absl-py; extra == "inner"
@@ -215,6 +223,7 @@ Please scan the QR code below to join our community groups:
215
223
 
216
224
 
217
225
  ## 🎉 News
226
+ - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including the AIME24, MATH-500, and GPQA-Diamond datasets; refer to the [best practice](https://evalscope.readthedocs.io/zh-cn/latest/best_practice/deepseek_r1_distill.html) guide for details. Also added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
218
227
  - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
219
228
  - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
220
229
  - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
@@ -1,11 +1,11 @@
1
1
  evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
2
- evalscope/arguments.py,sha256=v6IyhjgBACDkapnZYi6DeBI1aZxRVA-mx7KR1j72lYs,4493
3
- evalscope/config.py,sha256=4klkNziKT4r8a4Z1imkiY16-S8iER1BYPMOG4nJg9lU,8571
2
+ evalscope/arguments.py,sha256=r8gOMX6i8dWMl_WXLsBdHla7cuauBAyv9apky9VxLsE,4598
3
+ evalscope/config.py,sha256=D7C_K0f0xsfzFUSNSJJUTz3n9tmA6zLDbf8pZ_9ltpw,8600
4
4
  evalscope/constants.py,sha256=bkcDVbB4Pr1Qxz83qefcWjEetVGiHTcx3m84WX14ASI,3330
5
- evalscope/run.py,sha256=KKZBy2hr8_BscE0ZR1rN9U7iPc1eZYeeInfXe3EY7lA,5718
5
+ evalscope/run.py,sha256=qfMqVWlUiXEiIJ665p3-IYWknhIeNZkCJe3Yn07Y74U,5692
6
6
  evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
7
7
  evalscope/summarizer.py,sha256=Wnt8Y61PasOkigo8zMpi1qdFzfETjfnDtCDDQ6VwgDw,5867
8
- evalscope/version.py,sha256=Bt6Ke7m38AQOnf3xTgdKX-eFqm09Gu5GYEjTkjPrPEk,119
8
+ evalscope/version.py,sha256=h6YAZAgeAreWmKtpfr4D6BEvnWZxb1bka9hrpYOO0l8,119
9
9
  evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
11
11
  evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
@@ -56,13 +56,15 @@ evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4ty
56
56
  evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
57
57
  evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
58
58
  evalscope/benchmarks/__init__.py,sha256=b_SWdV1ZyOqFiwc_9lIjKrIvK1rwnF2cCIF7XN9CN8E,932
59
- evalscope/benchmarks/benchmark.py,sha256=SFDjyxd4t4KEcLBP82zE_KCJ_wXuv8J3XFzIR4M9fFI,2419
60
- evalscope/benchmarks/data_adapter.py,sha256=Aaspp5dR1aINXAopm0y7LHeMwJbmYXfy5bNm9DpagRo,12051
59
+ evalscope/benchmarks/benchmark.py,sha256=IY2xYmNR58aYnZK7rnUDONWiLQopo_ZifGS2SfN2L-Q,2422
60
+ evalscope/benchmarks/data_adapter.py,sha256=xCBvJe4ubgpP1J8ElcWAJwF6B5CSrBEv_uMwQzlUaLY,12540
61
+ evalscope/benchmarks/aime24/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
62
+ evalscope/benchmarks/aime24/aime24_adapter.py,sha256=FYH8NsT1nis3VoBMzRM_ueOsGNXjOKZCa6J_wpUM3RQ,1772
61
63
  evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
62
64
  evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
63
- evalscope/benchmarks/arc/arc_adapter.py,sha256=TdDB3lazJNdUt2bBo1G7zaOAN6YkKXdcgMui1ygQj3Y,6591
65
+ evalscope/benchmarks/arc/arc_adapter.py,sha256=vfwAy01LA141qn1lsSyZmEIGWbbhOCRMOGoSM-K2z6M,6490
64
66
  evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
65
- evalscope/benchmarks/bbh/bbh_adapter.py,sha256=pkgIEr_4QyzngUcs0j4oOscFljGoYZcCAS861Afnt_0,8316
67
+ evalscope/benchmarks/bbh/bbh_adapter.py,sha256=37wY3r1qW5qdjyKF-8n7UIM0IVcpaQugMb5Rkjbppxg,8524
66
68
  evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
67
69
  evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt,sha256=sfo-2iOeVzB0OGgd7NSQFELTGDTsr2DQ3u-g0ivI-sM,3653
68
70
  evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt,sha256=UJBsc3Mwz8TZngdWH_NFlhhNbLhNHK6FvW9FHcS8H5g,1167
@@ -91,55 +93,60 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt
91
93
  evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
92
94
  evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
93
95
  evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
94
- evalscope/benchmarks/ceval/ceval_adapter.py,sha256=2PvM5cvviyVNeFGnz-ymYVhEyPoea52OL_dg7dwVzQQ,11429
96
+ evalscope/benchmarks/ceval/ceval_adapter.py,sha256=Qz2oNGw0H_4FtfY-Izdxv9fgwxScJksyvwzeQw-aVyo,11374
95
97
  evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
96
- evalscope/benchmarks/ceval/samples.jsonl,sha256=dyWhGAdt4eq6Amgu2Ykx8RevUJVFtbhGFSTbDAeUgHc,448
97
98
  evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
98
99
  evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
99
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=O6FIsJDgg4OiHZSafaDq7jZ2gubWumPMhkdVb8WN-D8,10526
100
+ evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=1RmhI0SNxHK-Fz-iTIR76zeBRDLlm0m6_7rJywqk3Rk,10446
100
101
  evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
101
102
  evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
102
103
  evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
103
- evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=ns2WPbqkR52rRKo244WoAeAO9VOESEl_sHCPhym2DnM,6768
104
+ evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=i0E4TNajMVcWT8lc5haIjKvdmHuI5qzgpssIm5Fw7bs,7413
105
+ evalscope/benchmarks/data_collection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
106
+ evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=o3Q6ke-RLx4qUbF5FgASZogv3-kCJ6qpK43F_LARU3Y,2496
107
+ evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
108
+ evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=vDHgsWpsIZQWNadl3mI8M3rDKkvPM2N2KAkW-8aeOHY,5130
104
109
  evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
105
- evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=1MQXl3Wf_Dnzn7_7BSTu7RT6BOfhhiVyAnqECawxyfM,3899
110
+ evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=fu14ZzGYyg2MEdJbxZGBoIbais6xA9Um2BEAJTvBZZM,3823
106
111
  evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
107
112
  evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
108
- evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=Z5TtgPCCT8AVmFCMVIVmfhqe51CyCTaLSYTiev7smPw,4232
113
+ evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=tiy8Cn1ZmNKjVg8lqNAxWBbsKp8h0uiDNpWuHfcID0A,4689
109
114
  evalscope/benchmarks/gsm8k/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
110
115
  evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTvt5hpXg,4233
111
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=9DuNos8xCOVFOUSJ04LAoBRVPbtqgR4XmOVk6r8ADU8,11114
116
+ evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=4qtMX_SfqkXRMgGLOA6tNGMK9EkITWbjLlJT9gWbT20,10664
112
117
  evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
113
118
  evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
114
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=p7Nu-1B2mgbjfth1IhkMSWEC0TxOtD6tp_bOWeeRjts,6332
119
+ evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=qArX2umdrYJZkDA9i3XGBGljCton99v5Yss9be9iZYw,6269
115
120
  evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
116
121
  evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
117
- evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=mjWkJqeRM1JVlrLXaCz1qscneLhYySZt8cgdXZSmJWY,5215
122
+ evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=onacZB_6SF9239Ly-U70__WYsinS9iWpnf3oiYMNxKc,5164
118
123
  evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
119
- evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=Jx04TddVZE1gk4wXyljhtt3CLo-7Ux_RcLLMlTV-Nhg,2024
124
+ evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=3HsAdNj5JJGCFA17sPXi-59yv-pfcB0UeXKdY_mQcwU,2015
120
125
  evalscope/benchmarks/ifeval/instructions.py,sha256=oaJ9D_4rvS67BraHBNPpDtFd4TblFAnR4A3YW9HWfuY,56304
121
126
  evalscope/benchmarks/ifeval/instructions_registry.py,sha256=tVUmhuSwnOidLtI8onOAw_gpJ6bi8FL07GiX19hSuo8,7288
122
127
  evalscope/benchmarks/ifeval/instructions_util.py,sha256=vkemXeylJMmgW8LgfQe4cSy2OF-oH_NcSZtzyZDURW4,25780
123
128
  evalscope/benchmarks/ifeval/utils.py,sha256=TKrM1m2qDCUauahogItDdICf4mDk0OjasSxgnxjt2KY,4517
124
129
  evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
125
- evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=gByj-11KGRTQk2wF1UwNACl8i1svBAEDaj-KJm1XEmw,2387
130
+ evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=nv4mzKOPp1YPcr6e7daZuZyQ3jRNNG6PUzi38REuwSk,2356
131
+ evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
132
+ evalscope/benchmarks/math_500/math_500_adapter.py,sha256=mBzsllop5sTHw-uK04FjhEWDiEDjDaNUFDUBIVN7Xgg,1742
126
133
  evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
127
134
  evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
128
- evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=-ONQW0EPAPXFPIpH_Y6zRE-t9j5dT7yABgAU8wxIH4M,11829
135
+ evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=pmT1y9dbWJcZK3U6hkXa3-lBDABx7DhQ7oHc3O-Nkg0,11769
129
136
  evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
130
137
  evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
131
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=9Mg7AKb2YL7aCilsXNA5_f1JmETfXQd1kOvLkGcKFEA,4372
138
+ evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=Fdrj26MfYmPzio2tI23WTcofrwD69_m41mkVpvlxzVU,4815
132
139
  evalscope/benchmarks/race/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
133
140
  evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO61nU,3608
134
- evalscope/benchmarks/race/race_adapter.py,sha256=9uyQLDA9kVKGu0XhwcBoMyxcgUh3jqWXRO5DahRqUpg,6678
141
+ evalscope/benchmarks/race/race_adapter.py,sha256=dC9I-3T9UFh2OVpmWKRmSszPOlFZAZ40xOPa4zN3daI,6661
135
142
  evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
136
143
  evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
137
144
  evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
138
145
  evalscope/benchmarks/trivia_qa/trivia_qa.py,sha256=eekxaXppMLb5tCQqNLOw2MaWlYDhI2IicPzRsTHqb5A,3070
139
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=e-jrcCvl8fbPzWCOYKq_sbl4XCulsPzAECGtvTPE-rM,5106
146
+ evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=GVuJT-Xz4ugVtcUSTRxcBgViHVowcqJf3yVsotcZoZI,5062
140
147
  evalscope/benchmarks/truthful_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
141
148
  evalscope/benchmarks/truthful_qa/truthful_qa.py,sha256=A4abSL6WMRcXez8flxsHy-0ZFyRg-moq9rTeOA1TalY,6909
142
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=tCVO0RTD_S7z1ky7su5z67dnpgbsEtcH5j0vCpfvUV8,12908
149
+ evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=6rT1zuQh0nLuYymcchO-cMP98EY0vWizbfTfnUERWgo,12905
143
150
  evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
144
151
  evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
145
152
  evalscope/cli/cli.py,sha256=w_dtXljur9s5lmTn6LbbFL_viTPQB1WAEzhYcId09Og,729
@@ -148,19 +155,19 @@ evalscope/cli/start_eval.py,sha256=2lyD2WSQ0DnP6T31VvTimQ-6POnwxeEP9GLPFnT7Tfo,7
148
155
  evalscope/cli/start_perf.py,sha256=lEHJBSpzNsO4KGlWfQc-EfZGXq1M_FpOwtRxRdb4fso,813
149
156
  evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
150
157
  evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
151
- evalscope/collections/evaluator.py,sha256=_XaLn_cSKvAW96aNwaaPbrBDPl9qn0VrsTjID_y7SpM,8910
152
- evalscope/collections/sampler.py,sha256=6Tp0jN7bJQqG-7AQ2UDPDur6O5aC_nl0N-OV9HfuE9Q,4769
153
- evalscope/collections/schema.py,sha256=Ns47HXt7Ym4sPdPyxStxALHUid2cW7kWhqvw_jK_p-4,4172
158
+ evalscope/collections/evaluator.py,sha256=FJx3KGdLi0-TIqWC_067HEmA4P298BKdwHIrbcai46M,12065
159
+ evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
160
+ evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
154
161
  evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
155
- evalscope/evaluator/evaluator.py,sha256=0IOuWQ4KgWuMisNmFqh4-id3d1Kkbkf4JW-6hVz7tqU,16638
162
+ evalscope/evaluator/evaluator.py,sha256=E0NiP5O56WbF8eiUmw9IY2ouotRog9H-2SRyTzZld0I,17569
156
163
  evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
157
164
  evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
158
165
  evalscope/evaluator/reviewer/auto_reviewer.py,sha256=PDz1Nt_qq0oGZBBwek2-M8VBUNLkOkmYObzR8gO7nEc,16624
159
166
  evalscope/metrics/__init__.py,sha256=yzuZjXufrPqVhzNTNaJLJwhs7-Sgb-iNG0I3BdOX7Tg,291
160
167
  evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
161
- evalscope/metrics/math_accuracy.py,sha256=a0L_YT70bsJYn5_POICJyj6ZVFbHek1ly6j_ssV9Xsc,5585
162
- evalscope/metrics/metrics.py,sha256=H02Hhj9Me2qzUjSzdV57i5Gj6xP_w5kbuPcuPpejlI0,12860
163
- evalscope/metrics/named_metrics.py,sha256=j-y-d5EJ4FJzOxlIKobKIMUNu--nzAIIc2j0TvDfFb0,574
168
+ evalscope/metrics/math_parser.py,sha256=uTDudn305G3b8-GboWTrDE6OfrEwAW-areHnoGXZ6Is,17302
169
+ evalscope/metrics/metrics.py,sha256=r4FHyEvvFhMu0vAHBw-ByFefObDBC3DQdr53klSk6Wk,13325
170
+ evalscope/metrics/named_metrics.py,sha256=SeBXmgWyK4y4tKiGKro3k-CZU1OShuKe6qxwpT3tizY,1313
164
171
  evalscope/metrics/rouge_metric.py,sha256=zhIUqenSuxnORR9tamLQBGjFwP91Zei2UiLtcOyseVM,4639
165
172
  evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
166
173
  evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=Kq6AObenmLVQ5tN3NgN042a6mgRFQmRO21-ohd9mSa8,11972
@@ -168,12 +175,12 @@ evalscope/metrics/resources/gpt2-zhcn3-v4.bpe,sha256=J_K-oSt9usFEw87Av6j7ETn3J48
168
175
  evalscope/metrics/resources/gpt2-zhcn3-v4.json,sha256=WkM4J_FDPPNQwYi0kj5sM5SVjk2_6bci7tqf8dV9p_U,1289341
169
176
  evalscope/models/__init__.py,sha256=pafIEbJq_2DrYjQbgI0SNVxywNYOxvqwk7Dr1P7KEwk,923
170
177
  evalscope/models/base_adapter.py,sha256=fT3i8c9jRmz_VBcUYMMmXrlCM6JWcixPdgak5yT6Wkw,2177
171
- evalscope/models/chat_adapter.py,sha256=9DIMwacjrR647pYVKgeYn090ZKBVHmMD_mf3Gz2vdw0,5461
172
- evalscope/models/choice_adapter.py,sha256=Zb-UUFpF2tpMGuGH_wFleMxpSb__-SuN1cMF7yj25aI,7661
173
- evalscope/models/custom_adapter.py,sha256=uj4kbBCwhrXjvSq9f6HgTJ5yJ9FJpvs1k5-9Ekm9RmA,2272
174
- evalscope/models/local_model.py,sha256=EBclVq5tqUFNOZebRlNnZSvzwtSun7FsZRf2tx0cMt0,2486
178
+ evalscope/models/chat_adapter.py,sha256=nOrNDuvuNKkTcW9zNcR_EIqbzkqK5PFws-5YsSxBR9E,6120
179
+ evalscope/models/choice_adapter.py,sha256=jj_6KB1BAsvv4Yufn2bM2tCiLovFUum2368lseogmb8,8036
180
+ evalscope/models/custom_adapter.py,sha256=Ed_MGEcZxKK4mkXTpUY4GXTsayprHzIEOC1L9gqwjf4,2284
181
+ evalscope/models/local_model.py,sha256=s0YVX9Djqazusk7qzSpWQB76jGGuzJxqQlZzomsCFsk,2621
175
182
  evalscope/models/model.py,sha256=diu4TE1ZFWdynTxsl4DejTNsLdwjxoyj2nsKR-Y8EZE,7343
176
- evalscope/models/server_adapter.py,sha256=VGk_nTwkLWO7Ln7lV_KSaIBzlSRZzyIs_bWDeJ_pOho,4469
183
+ evalscope/models/server_adapter.py,sha256=iVJuUJlHGVGxnlrDMnbHZ8WQ4OR2HK5HrXH4obD2_cg,4173
177
184
  evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk77ksNQ,102
178
185
  evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
179
186
  evalscope/models/custom/dummy_model.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
@@ -225,7 +232,7 @@ evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHM
225
232
  evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
226
233
  evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
227
234
  evalscope/report/__init__.py,sha256=0Wes3ot2hy9s-WwZaBztst8qkNrXkOF-Hwa1WW1e8lY,260
228
- evalscope/report/app.py,sha256=kB4CCrAoIONRc37Np8B3QsLxJBD_j2Sw2xtfR1FgfC0,26087
235
+ evalscope/report/app.py,sha256=adP1rVVOxYMbCTdopV3FKWBhUzB7t1AXcDOxW4Ct56g,26647
229
236
  evalscope/report/combinator.py,sha256=bi6nvTbMrzraZ8kUZ6mIMikk8-qEIVYUhdaH4RE1Tg8,2653
230
237
  evalscope/report/generator.py,sha256=2DULY9W8QCUxdtyfNjo8XAP_YxI1LgR95jknK__kYPU,3600
231
238
  evalscope/report/utils.py,sha256=DRlbjbqHEmM8rGlA4pwtlHFhOZtyUzcqiS-mejfIDkU,4584
@@ -258,9 +265,9 @@ evalscope/third_party/toolbench_static/llm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1
258
265
  evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=usmVelh0ogBlCtSUL0dqp89w2mAqH1Ptv9MURVoGrc8,1209
259
266
  evalscope/utils/__init__.py,sha256=jLVoGryuqUh4Km9QWWQBzpqkcVNRK0MbwNaSgckqdiU,139
260
267
  evalscope/utils/arena_utils.py,sha256=Gf8VpH4C_oF2Abif_QeL0rAP6tvTzsc0gglpdNkUE48,7155
261
- evalscope/utils/chat_service.py,sha256=Kh3hEUW_HF158a0QqHbWepHIHRQFJgUM-jCDAcQ_maw,8674
268
+ evalscope/utils/chat_service.py,sha256=eZ8uyVeVFpXZo_uvRFyVhnFyJpL14zcn9UA6K4Ax5J4,8676
262
269
  evalscope/utils/completion_parsers.py,sha256=YWHkLkSfURTcUjNNlCL6PPDICd4F2Ns9figgPN4C97c,2933
263
- evalscope/utils/io_utils.py,sha256=vm6uJBBqx4fc7jsHGbwNQ6Hbx7XYhjT1Q2dQ7aHjDD0,4172
270
+ evalscope/utils/io_utils.py,sha256=Tjdgen1FsAA4ArqiUzu734L0Px5NuiS0GKRRiGIzxSA,4192
264
271
  evalscope/utils/logger.py,sha256=49F2WDi1g_o8aW8Z29wOt9YHE9LDqkHIgb-d8TVybJY,3635
265
272
  evalscope/utils/model_utils.py,sha256=PK7pKNY8ovtGZHNRvDpZ-d8zBHMOkxd6fRVkM8VF06I,736
266
273
  evalscope/utils/utils.py,sha256=a6a2vDDxqlj7nY8xynkKkWs_ZPXEU2UMwvxp0JEpHjg,9686
@@ -268,7 +275,7 @@ tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
268
275
  tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
269
276
  tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
270
277
  tests/cli/test_collection.py,sha256=gx3GySIAPNaLUSf3D3Q3V0WZc21BPdNthIbECHQN0TI,3026
271
- tests/cli/test_run.py,sha256=aywruYPPweMEHaBOynf0G3liKBKMH_H_e4Znq2PcaR4,5821
278
+ tests/cli/test_run.py,sha256=gtId2SF1LlDCIn4S_WKRpAyTig_pWOhY8yto4P5B1EY,8303
272
279
  tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
273
280
  tests/perf/test_perf.py,sha256=iB8Mg565SfwPsObdAByHYfZNqN71kUtPW7ucmyiOWo8,3025
274
281
  tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -281,9 +288,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
281
288
  tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
282
289
  tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
283
290
  tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
284
- evalscope-0.10.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
285
- evalscope-0.10.1.dist-info/METADATA,sha256=-HQt66q9NaZvcNwiXgLW87aduUogXKaHYz6JokxtEXc,28975
286
- evalscope-0.10.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
287
- evalscope-0.10.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
288
- evalscope-0.10.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
289
- evalscope-0.10.1.dist-info/RECORD,,
291
+ evalscope-0.11.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
292
+ evalscope-0.11.0.dist-info/METADATA,sha256=GL8Ybyby65DYg8jxjxzdcFYvXBhKzE7eRFIBRiJ0-hc,29584
293
+ evalscope-0.11.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
294
+ evalscope-0.11.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
295
+ evalscope-0.11.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
296
+ evalscope-0.11.0.dist-info/RECORD,,
tests/cli/test_run.py CHANGED
@@ -73,16 +73,18 @@ class TestRun(unittest.TestCase):
73
73
  def test_run_task(self):
74
74
  task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct',
75
75
  'datasets': [
76
- 'mmlu_pro',
76
+ # 'mmlu_pro',
77
77
  # 'bbh',
78
- 'hellaswag',
78
+ # 'hellaswag',
79
79
  # 'gsm8k',
80
- # 'arc'
80
+ # 'arc',
81
81
  # 'race',
82
+ 'ifeval',
82
83
  # 'truthful_qa',
83
84
  # 'trivia_qa',
84
85
  ],
85
- 'limit': 20,
86
+ 'limit': 2,
87
+ 'eval_batch_size': 2,
86
88
  'debug': True}
87
89
  run_task(task_cfg=task_cfg)
88
90
 
@@ -93,9 +95,9 @@ class TestRun(unittest.TestCase):
93
95
 
94
96
  task_cfg = TaskConfig(
95
97
  model='qwen/Qwen2-0.5B-Instruct',
96
- datasets=['ceval', 'general_qa'], # 数据格式,选择题格式固定为 'ceval'
98
+ datasets=['general_mcq', 'general_qa'], # 数据格式,选择题格式固定为 'ceval'
97
99
  dataset_args={
98
- 'ceval': {
100
+ 'general_mcq': {
99
101
  'local_path': 'custom_eval/text/mcq', # 自定义数据集路径
100
102
  'subset_list': [
101
103
  'example' # 评测数据集名称,上述 *_dev.csv 中的 *
@@ -117,8 +119,17 @@ class TestRun(unittest.TestCase):
117
119
 
118
120
  task_cfg = TaskConfig(
119
121
  model='qwen/Qwen2-0.5B-Instruct',
120
- datasets=['humaneval'],
121
- limit=2
122
+ datasets=[
123
+ # 'math_500',
124
+ # 'aime24',
125
+ 'competition_math'
126
+ ],
127
+ dataset_args={
128
+ 'competition_math': {
129
+ 'subset_list': ['Level 4', 'Level 5']
130
+ }
131
+ },
132
+ limit=5
122
133
  )
123
134
 
124
135
  run_task(task_cfg=task_cfg)
@@ -128,12 +139,12 @@ class TestRun(unittest.TestCase):
128
139
  from evalscope.config import TaskConfig
129
140
 
130
141
  task_cfg = TaskConfig(
131
- model='Qwen2.5-7B-Instruct',
142
+ model='Qwen2.5-0.5B-Instruct',
132
143
  api_url='http://127.0.0.1:8801/v1/chat/completions',
133
144
  api_key='EMPTY',
134
145
  eval_type=EvalType.SERVICE,
135
146
  datasets=[
136
- 'iquiz',
147
+ # 'iquiz',
137
148
  # 'ifeval',
138
149
  # 'mmlu',
139
150
  # 'mmlu_pro',
@@ -141,25 +152,91 @@ class TestRun(unittest.TestCase):
141
152
  # 'trivia_qa',
142
153
  # 'cmmlu',
143
154
  # 'humaneval',
144
- # 'competition_math',
145
155
  # 'gsm8k',
156
+ # 'bbh',
157
+ 'competition_math',
158
+ 'math_500',
159
+ 'aime24',
160
+ 'gpqa',
146
161
  # 'arc',
147
162
  # 'ceval',
148
- # 'bbh',
149
163
  # 'hellaswag',
150
164
  ],
151
165
  dataset_args={
166
+ 'mmlu': {
167
+ 'subset_list': ['elementary_mathematics'],
168
+ 'few_shot_num': 0
169
+ },
170
+ 'mmlu_pro': {
171
+ 'subset_list': ['math'],
172
+ 'few_shot_num': 0
173
+ },
152
174
  'ceval': {
153
175
  'subset_list': [
154
- 'computer_network', 'operating_system', 'computer_architecture', 'college_programming'
155
- ]
156
- }
176
+ 'computer_network', 'operating_system', 'computer_architecture'
177
+ ],
178
+ 'few_shot_num': 0
179
+ },
180
+ 'cmmlu': {
181
+ 'subset_list': ['elementary_chinese'],
182
+ 'few_shot_num': 0
183
+ },
184
+ 'bbh': {
185
+ 'subset_list': ['word_sorting', 'movie_recommendation'],
186
+ },
187
+ 'gpqa': {
188
+ 'subset_list': ['gpqa_diamond'],
189
+ 'few_shot_num': 0
190
+ },
191
+ 'humaneval': {
192
+ 'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
193
+ },
194
+ 'competition_math': {
195
+ 'subset_list': ['Level 1']
196
+ },
197
+ },
198
+ eval_batch_size=5,
199
+ limit=10,
200
+ debug=True,
201
+ generation_config={
202
+ 'temperature': 0.7,
203
+ 'n': 5
157
204
  },
158
- # limit=10
205
+ use_cache='/mnt/data/data/user/maoyunlin.myl/eval-scope/outputs/20250212_150525'
159
206
  )
160
207
 
161
208
  run_task(task_cfg=task_cfg)
162
209
 
163
210
 
211
+ @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
212
+ def test_run_batch_eval(self):
213
+ from evalscope.config import TaskConfig
214
+
215
+ task_cfg = TaskConfig(
216
+ model='LLM-Research/Llama-3.2-1B-Instruct',
217
+ datasets=[
218
+ # 'math_500',
219
+ # 'aime24',
220
+ # 'competition_math'
221
+ # 'arc',
222
+ 'gsm8k'
223
+ # 'truthful_qa'
224
+ ],
225
+ dataset_args={
226
+ 'competition_math': {
227
+ 'subset_list': ['Level 4', 'Level 5']
228
+ }
229
+ },
230
+ eval_batch_size=2,
231
+ limit=5,
232
+ generation_config={
233
+ 'max_new_tokens': 2048,
234
+ 'temperature': 0.7,
235
+ 'num_return_sequences': 2,
236
+ }
237
+ )
238
+
239
+ run_task(task_cfg=task_cfg)
240
+
164
241
  if __name__ == '__main__':
165
242
  unittest.main()
@@ -1 +0,0 @@
1
- {'id': 0, 'question': '下列关于税法基本原则的表述中,不正确的是____。', 'A': '税收法定原则包括税收要件法定原则和税务合法性原则', 'B': '税收公平原则源于法律上的平等性原则', 'C': '税收效率原则包含经济效率和行政效率两个方面', 'D': '税务机关按法定程序依法征税,可以自由做出减征、停征或免征税款的决定', 'answer': 'D', 'explanation': ''}