evalscope 0.12.1__py3-none-any.whl → 0.13.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/arguments.py +6 -1
- evalscope/benchmarks/arc/arc_adapter.py +3 -3
- evalscope/benchmarks/benchmark.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +2 -1
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +168 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +2 -1
- evalscope/benchmarks/data_adapter.py +32 -4
- evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -4
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +20 -24
- evalscope/benchmarks/humaneval/humaneval_adapter.py +8 -5
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +193 -0
- evalscope/benchmarks/live_code_bench/execute_utils.py +267 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +90 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +71 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +721 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -2
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +148 -1
- evalscope/benchmarks/super_gpqa/utils.py +0 -5
- evalscope/collections/evaluator.py +4 -4
- evalscope/config.py +11 -3
- evalscope/constants.py +8 -0
- evalscope/evaluator/evaluator.py +56 -17
- evalscope/metrics/llm_judge.py +104 -0
- evalscope/models/custom_adapter.py +1 -1
- evalscope/perf/arguments.py +11 -40
- evalscope/perf/benchmark.py +39 -28
- evalscope/perf/http_client.py +9 -1
- evalscope/perf/main.py +2 -1
- evalscope/perf/plugin/datasets/__init__.py +1 -0
- evalscope/perf/plugin/datasets/openqa.py +6 -11
- evalscope/perf/plugin/datasets/random_dataset.py +51 -0
- evalscope/perf/utils/db_util.py +3 -0
- evalscope/run.py +15 -3
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/version.py +2 -2
- {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/METADATA +56 -38
- {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/RECORD +50 -36
- tests/cli/test_all.py +144 -0
- tests/cli/test_collection.py +27 -1
- tests/cli/test_run.py +103 -11
- tests/perf/test_perf.py +23 -0
- {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/LICENSE +0 -0
- {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/WHEEL +0 -0
- {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/top_level.txt +0 -0
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
|
|
2
|
-
evalscope/arguments.py,sha256=
|
|
3
|
-
evalscope/config.py,sha256=
|
|
4
|
-
evalscope/constants.py,sha256=
|
|
5
|
-
evalscope/run.py,sha256=
|
|
2
|
+
evalscope/arguments.py,sha256=VhZd7a8PoZK01qFCMEADLINqLYi6njRqRb50iR1l1lo,5241
|
|
3
|
+
evalscope/config.py,sha256=wLrc8a7z28IFPRaeUzot5HGtSDY_13KR-3kRyFKEGx8,9476
|
|
4
|
+
evalscope/constants.py,sha256=Cgzkoz4R3MC3YLtbCM2fmSwF8Z2kuxYdOC8t9FWJj9w,3740
|
|
5
|
+
evalscope/run.py,sha256=LUCdnNzNIfHSWvxu3gxAsHEDX7hT5mcVnV4lSY5h0iA,6007
|
|
6
6
|
evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
|
|
7
7
|
evalscope/summarizer.py,sha256=Wnt8Y61PasOkigo8zMpi1qdFzfETjfnDtCDDQ6VwgDw,5867
|
|
8
|
-
evalscope/version.py,sha256=
|
|
8
|
+
evalscope/version.py,sha256=Y30-zF2dwch3upMc0t5yNNjIgvI-LQQWFhftRQgXvOk,119
|
|
9
9
|
evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
10
|
evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
|
|
11
11
|
evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
|
|
@@ -56,15 +56,15 @@ evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4ty
|
|
|
56
56
|
evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
|
|
57
57
|
evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
|
|
58
58
|
evalscope/benchmarks/__init__.py,sha256=b_SWdV1ZyOqFiwc_9lIjKrIvK1rwnF2cCIF7XN9CN8E,932
|
|
59
|
-
evalscope/benchmarks/benchmark.py,sha256=
|
|
60
|
-
evalscope/benchmarks/data_adapter.py,sha256=
|
|
59
|
+
evalscope/benchmarks/benchmark.py,sha256=a_7Ctz36McuTyBSTYi56jis9pvOdWhg7JVSPFrbxqR4,2535
|
|
60
|
+
evalscope/benchmarks/data_adapter.py,sha256=2u9oC4RBHVfEMHKPRu87xM4XOw_RS2Z2fvagNsciEo4,16791
|
|
61
61
|
evalscope/benchmarks/utils.py,sha256=6kxeBz4w8Fw68AYH05a4ncjgkaUV4bU3eaFVLqOdkMI,1321
|
|
62
62
|
evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
63
63
|
evalscope/benchmarks/aime/aime24_adapter.py,sha256=dBm9yukt4-CByEPUlAPAIN6mL3VkZcI-dw2kz4oQBMo,1715
|
|
64
64
|
evalscope/benchmarks/aime/aime25_adapter.py,sha256=FB_NufY2V7uYdxVnrY_4y81gyyfYDnvedz1_zHdDWt4,1709
|
|
65
65
|
evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
66
66
|
evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
|
|
67
|
-
evalscope/benchmarks/arc/arc_adapter.py,sha256=
|
|
67
|
+
evalscope/benchmarks/arc/arc_adapter.py,sha256=lkhDz-DYjPQ1vHzo8X4j-0Lq_rBxAnws35_R00pIbNI,6347
|
|
68
68
|
evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
69
69
|
evalscope/benchmarks/bbh/bbh_adapter.py,sha256=fROpzenrjpEBWtnvM_RL_m0uXPOhXTtYAglJEZbzUdY,8330
|
|
70
70
|
evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
|
|
@@ -95,11 +95,13 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt
|
|
|
95
95
|
evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
|
|
96
96
|
evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
|
|
97
97
|
evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
98
|
-
evalscope/benchmarks/ceval/ceval_adapter.py,sha256=
|
|
98
|
+
evalscope/benchmarks/ceval/ceval_adapter.py,sha256=E4QobCjSSkMZtPJyaT_XBVxiqEqa1bta1I9aFnaHOqs,11308
|
|
99
99
|
evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
|
|
100
|
+
evalscope/benchmarks/chinese_simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
101
|
+
evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=nKF_a0yc_PbZYjYA_-gJh3ePZIEz5txrhDV4IsTqD4Q,8196
|
|
100
102
|
evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
101
103
|
evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
|
|
102
|
-
evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=
|
|
104
|
+
evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=TTq2jRz46Hqc_D_ZBaiw_OwKub1FZX6w8C7g7COIdGs,10372
|
|
103
105
|
evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
|
|
104
106
|
evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
105
107
|
evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
|
|
@@ -109,7 +111,7 @@ evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=U4M-0MVJS
|
|
|
109
111
|
evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
110
112
|
evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=ZVGzUuuQ0UTOqQtXE40ZyBeMOSl8saSiFEQ5_siJ-c8,5052
|
|
111
113
|
evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
112
|
-
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=
|
|
114
|
+
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=ELDdS5T3JZeSWVv1ldawcHzLwAljEWKqakbRMVcBvgw,4741
|
|
113
115
|
evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
114
116
|
evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
|
|
115
117
|
evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=UB287DtnbkSQXZsbReFJqmQRwbo672DTCeXXilR_-Vc,4790
|
|
@@ -118,10 +120,10 @@ evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTv
|
|
|
118
120
|
evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=ZZZ-9oja53IwiU33Kjm7NTk4MbFGWyvonhnHrn_3Na8,10557
|
|
119
121
|
evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
120
122
|
evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
|
|
121
|
-
evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=
|
|
123
|
+
evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=QYZZuxbjkKxAjxuoWn0M5WgusO55vzeAcyKnWUMow3M,5871
|
|
122
124
|
evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
123
125
|
evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
|
|
124
|
-
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=
|
|
126
|
+
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=UOjakV31J0g7TYbrRls0ItcopWOJu54ucPfaqSJB7Os,5250
|
|
125
127
|
evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
126
128
|
evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=R7MILWuMglvXr7yWioBxyJ2T4EdEkwRZ1lnvWqZqG28,1922
|
|
127
129
|
evalscope/benchmarks/ifeval/instructions.py,sha256=oaJ9D_4rvS67BraHBNPpDtFd4TblFAnR4A3YW9HWfuY,56304
|
|
@@ -130,11 +132,20 @@ evalscope/benchmarks/ifeval/instructions_util.py,sha256=vkemXeylJMmgW8LgfQe4cSy2
|
|
|
130
132
|
evalscope/benchmarks/ifeval/utils.py,sha256=TKrM1m2qDCUauahogItDdICf4mDk0OjasSxgnxjt2KY,4517
|
|
131
133
|
evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
132
134
|
evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=16whmFkJt9fLbei9d-kmjnWB_5y5vsiX9tK5kSuxDw8,2449
|
|
135
|
+
evalscope/benchmarks/live_code_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
136
|
+
evalscope/benchmarks/live_code_bench/evaluate_utils.py,sha256=rOWaG8PV4AGIRhS_gqwxEhphEVe1Cqg57Eudwm5HTjI,6820
|
|
137
|
+
evalscope/benchmarks/live_code_bench/execute_utils.py,sha256=MreaMLI0IicNZawpfqcyoRLt67EZ3CJvmxxRTYwhAbU,7397
|
|
138
|
+
evalscope/benchmarks/live_code_bench/extract_utils.py,sha256=ZcQ8y741uawPo6I_1_XglR3eqJFDNrqc8fILKZupVRs,2375
|
|
139
|
+
evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=8MOECcweLG465JFgUzP20QlKyBAO90oFHhH7Z77FuUY,3521
|
|
140
|
+
evalscope/benchmarks/live_code_bench/load_utils.py,sha256=5i9wtdPLYR8ckjx5MaYQVC2LFYvjKzR6Fa6UZmeOTRc,2445
|
|
141
|
+
evalscope/benchmarks/live_code_bench/pass_k_utils.py,sha256=Ktrp_lXdfFzoHtQNQNdGfIl26ySjaPCHm4Zv-dFvRqM,2024
|
|
142
|
+
evalscope/benchmarks/live_code_bench/prompts.py,sha256=P4KILIAIDT1MKDck0xHYV_6v9820wDZRhxVMazmlL-g,12600
|
|
143
|
+
evalscope/benchmarks/live_code_bench/testing_util.py,sha256=EBe0XzY3B4cW5dCjwLksW7o4R1chZwsuFjxkfqVPFI4,28238
|
|
133
144
|
evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
134
145
|
evalscope/benchmarks/math_500/math_500_adapter.py,sha256=SB2eb4Z7DTXdptqirEoctqTdDLEu28s7bLeCAMBmAFo,1923
|
|
135
146
|
evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
136
147
|
evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
|
|
137
|
-
evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=
|
|
148
|
+
evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=e__Evar99V9l65FlzT6T594CN4iMgmuVhjujQAm4po4,11662
|
|
138
149
|
evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
|
|
139
150
|
evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
140
151
|
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=OANfue-fK543drJrDj6V_zDMtySrQEBHPgTsejr-e7U,4226
|
|
@@ -148,11 +159,11 @@ evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO
|
|
|
148
159
|
evalscope/benchmarks/race/race_adapter.py,sha256=RD0B-i5dzeNKuhqnWbremgf4tk9jmOO4_eLAiITB1F0,6381
|
|
149
160
|
evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
|
|
150
161
|
evalscope/benchmarks/simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
151
|
-
evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=
|
|
162
|
+
evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=SrK18xDe4HyUaIPRLVEDtoF4Nc_ms4aFxktEsj8MnnA,9071
|
|
152
163
|
evalscope/benchmarks/super_gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
153
164
|
evalscope/benchmarks/super_gpqa/five_shot_prompt.txt,sha256=vD3RMeQustxY_oWA8IobntjywT8ZUO7Jaub--rElDT4,4718
|
|
154
165
|
evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=BqNLL8BYnK6tRuIdV6ijL4Uym2SejH_h1BV06XNjSE4,9331
|
|
155
|
-
evalscope/benchmarks/super_gpqa/utils.py,sha256=
|
|
166
|
+
evalscope/benchmarks/super_gpqa/utils.py,sha256=ftYPP9ODvLBlQSd9ltACx9iRIvjB8u1bg4AtgcJ4JAI,3360
|
|
156
167
|
evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt,sha256=y7hR9SmoR_YqoEWtT8N9JpZOpeJIlg0cDGDgYw6R6hM,237
|
|
157
168
|
evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
158
169
|
evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
|
|
@@ -169,16 +180,17 @@ evalscope/cli/start_eval.py,sha256=MXhVDeaMFd6ny88-gnVtQflH660UaDj240YGYnHccx8,7
|
|
|
169
180
|
evalscope/cli/start_perf.py,sha256=5hLi5jWgM9BJPXLd8d9D1zqrcj_5c0KvkfB1DgD4_RU,831
|
|
170
181
|
evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
|
|
171
182
|
evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
|
|
172
|
-
evalscope/collections/evaluator.py,sha256=
|
|
183
|
+
evalscope/collections/evaluator.py,sha256=YJy8Dj35XCdCwhNDwZecJkeW1_ZgIOsuRLFzfe3SyV8,12724
|
|
173
184
|
evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
|
|
174
185
|
evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
|
|
175
186
|
evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
|
|
176
|
-
evalscope/evaluator/evaluator.py,sha256=
|
|
187
|
+
evalscope/evaluator/evaluator.py,sha256=szRQrXH5ILpUljb14lcunuOt185H8Um1paviTokraA4,19845
|
|
177
188
|
evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
|
|
178
189
|
evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
179
190
|
evalscope/evaluator/reviewer/auto_reviewer.py,sha256=PDz1Nt_qq0oGZBBwek2-M8VBUNLkOkmYObzR8gO7nEc,16624
|
|
180
191
|
evalscope/metrics/__init__.py,sha256=SWvqzUzdryW5URz6u4fPkP9XSyA09nQ8zBeE8BbchSg,349
|
|
181
192
|
evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
|
|
193
|
+
evalscope/metrics/llm_judge.py,sha256=g9pLMJPNTUyw0sGteblws1_e_KzbRqcbqKcaIzfE_DE,4031
|
|
182
194
|
evalscope/metrics/math_parser.py,sha256=uTDudn305G3b8-GboWTrDE6OfrEwAW-areHnoGXZ6Is,17302
|
|
183
195
|
evalscope/metrics/metrics.py,sha256=_YI7RhxlFu_JOgeE3LF9UKu6mJruvyu4FgqVf78Bjb8,13813
|
|
184
196
|
evalscope/metrics/named_metrics.py,sha256=pSHA2_qdi9B5bDHIh08GYhx63odilSwA_T-95K1Usl0,1380
|
|
@@ -191,7 +203,7 @@ evalscope/models/__init__.py,sha256=i9vcOBMEF_UM7C2gpmh2GsQk3njwqevoQ6A4CnP1fHs,
|
|
|
191
203
|
evalscope/models/base_adapter.py,sha256=7PbRwfD5PIZCBYVds6ZHI8TBY9C5i2LdPOTu88FJWlY,3414
|
|
192
204
|
evalscope/models/chat_adapter.py,sha256=5-yz7L41OdeBO9J_qRkEZcduATrYIMe__UFfh7BzjIc,6277
|
|
193
205
|
evalscope/models/choice_adapter.py,sha256=fnJdo-FMJ-zvNLbEJGc73odgWXIxtVudL00JIf2vzsA,8239
|
|
194
|
-
evalscope/models/custom_adapter.py,sha256=
|
|
206
|
+
evalscope/models/custom_adapter.py,sha256=AGztmZ0aT0g2flh4B4NaiZ8LCDg8tT0gVNxmrP5W1mA,2401
|
|
195
207
|
evalscope/models/local_model.py,sha256=yydggBCLcBAmUWbBhv7o2CA3RbG0DwDZharPdrkbNcg,2628
|
|
196
208
|
evalscope/models/model.py,sha256=diu4TE1ZFWdynTxsl4DejTNsLdwjxoyj2nsKR-Y8EZE,7343
|
|
197
209
|
evalscope/models/register.py,sha256=4vX6AfScAzwD7UkncbuejfAiQHznQkK5hvtG6jEUbWo,809
|
|
@@ -200,10 +212,10 @@ evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk
|
|
|
200
212
|
evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
|
|
201
213
|
evalscope/models/custom/dummy_model.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
|
|
202
214
|
evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
203
|
-
evalscope/perf/arguments.py,sha256=
|
|
204
|
-
evalscope/perf/benchmark.py,sha256=
|
|
205
|
-
evalscope/perf/http_client.py,sha256=
|
|
206
|
-
evalscope/perf/main.py,sha256=
|
|
215
|
+
evalscope/perf/arguments.py,sha256=hBR6TXCoLkHRLxrwXacmierfFZhyQaT5hnKAfp-vE6I,8990
|
|
216
|
+
evalscope/perf/benchmark.py,sha256=VYcFhSoZXcLoNXpFYxOFxLbBLv_8Tn74Qklim7vELCM,9889
|
|
217
|
+
evalscope/perf/http_client.py,sha256=xMakdQkJ2cgIOd-yOmHEW0vbGKTJ0JWhLFt9IFtUP8Q,7473
|
|
218
|
+
evalscope/perf/main.py,sha256=w-yDbl0osaTAMgC-JNPpqIq2LQ7U4c-Ht7Amj8Nbjc8,1278
|
|
207
219
|
evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
|
|
208
220
|
evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
|
|
209
221
|
evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
|
|
@@ -211,18 +223,19 @@ evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqY
|
|
|
211
223
|
evalscope/perf/plugin/api/custom_api.py,sha256=ay1AGi4y2opjwyRl0J0A54-vLB-pBj3QBFkzog0KA-g,3787
|
|
212
224
|
evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
|
|
213
225
|
evalscope/perf/plugin/api/openai_api.py,sha256=KQRQMOfQceKQtrvTE-SyhNHcDoGuQ0900yh7r74Hcoo,7560
|
|
214
|
-
evalscope/perf/plugin/datasets/__init__.py,sha256=
|
|
226
|
+
evalscope/perf/plugin/datasets/__init__.py,sha256=Z6Jc0RxJS_z0nBBV1-b0-56Ija60AtQ7I_67gY6ZfdQ,568
|
|
215
227
|
evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
|
|
216
228
|
evalscope/perf/plugin/datasets/custom.py,sha256=_GSC5yR90_BjcRjdJqrJT2vHQAzskz5XxYOxngUM2Pg,815
|
|
217
229
|
evalscope/perf/plugin/datasets/flickr8k.py,sha256=UzAIFIO0m5inWOkWM1mO6wfV2HOuXAqiTxCJ4b0SiZM,1589
|
|
218
230
|
evalscope/perf/plugin/datasets/line_by_line.py,sha256=IKVZMpKei6XW9DTm9VEssWHE96i1lTqMf0621dA_img,836
|
|
219
231
|
evalscope/perf/plugin/datasets/longalpaca.py,sha256=2aENqCly_DX1dyNcurYsLFJIvXYFph6jWm7z7XETvMk,1176
|
|
220
|
-
evalscope/perf/plugin/datasets/openqa.py,sha256=
|
|
232
|
+
evalscope/perf/plugin/datasets/openqa.py,sha256=_aVXs2s8wbmtoB6ZO-pNjUZvBVxRUYdoJDGv5-BumtI,1342
|
|
233
|
+
evalscope/perf/plugin/datasets/random_dataset.py,sha256=wPyY5kk2zKnc8u9uYEl-vQ6BLHeWbdC8EHEAZNFSDeU,2702
|
|
221
234
|
evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=ef8MXhT6756y6LsXSpYeWjmwswu2hRXe2BOVS2_OgVM,1968
|
|
222
235
|
evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
223
236
|
evalscope/perf/utils/analysis_result.py,sha256=ESzaZHGTpr2LoJR3jpOzqMphxSrr79d364ZzD159PmY,1169
|
|
224
237
|
evalscope/perf/utils/benchmark_util.py,sha256=4TyQ_tE5odcjKDFDueI3jrC0vld6QxmTreOd5_SP4vE,5802
|
|
225
|
-
evalscope/perf/utils/db_util.py,sha256=
|
|
238
|
+
evalscope/perf/utils/db_util.py,sha256=hRXixxpNBrACF43reOJV5SoO1vj34cqoNMaTKH_oLLE,9100
|
|
226
239
|
evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
|
|
227
240
|
evalscope/perf/utils/local_server.py,sha256=clF8i0UFmaxBBB6gX05KvVCyzSv0xzsAidz0_sLLlAk,4627
|
|
228
241
|
evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
@@ -257,7 +270,7 @@ evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u
|
|
|
257
270
|
evalscope/third_party/longbench_write/default_task.json,sha256=d_NPShtW10Mc02U3pAuxX9hXd09tZw7QJAr1SvrECcM,694
|
|
258
271
|
evalscope/third_party/longbench_write/default_task.yaml,sha256=YjU8EeyH9UtM8e7_fhrwJNChQdszOAcrKmOi--Awvhk,578
|
|
259
272
|
evalscope/third_party/longbench_write/eval.py,sha256=39McZSDHL7bA5Dg-BSyZ4EiAF1nfTiYJAnx5FqbNYok,11265
|
|
260
|
-
evalscope/third_party/longbench_write/infer.py,sha256=
|
|
273
|
+
evalscope/third_party/longbench_write/infer.py,sha256=32t90zTll6SXH7Wx8QnRFMs6ZUwvpbgYNuawCByzwR0,4971
|
|
261
274
|
evalscope/third_party/longbench_write/longbench_write.py,sha256=nIR1toB1hvUXR7Lrs3xcY9wqaI-bjeADg_Oscf3HdaY,3991
|
|
262
275
|
evalscope/third_party/longbench_write/utils.py,sha256=nd-YslsOyNGAuyBfAWb2pnTMaGLMQ58lbnJJdrCndeI,815
|
|
263
276
|
evalscope/third_party/longbench_write/resources/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
@@ -298,10 +311,11 @@ evalscope/utils/utils.py,sha256=lGvn94ryIzx-7WLNJeuyehNTmINt0jYIjrjW12woPCs,9730
|
|
|
298
311
|
tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
299
312
|
tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
|
|
300
313
|
tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
301
|
-
tests/cli/
|
|
302
|
-
tests/cli/
|
|
314
|
+
tests/cli/test_all.py,sha256=1wwXtdjBmWYLhs5TXOJhZBwPm2qd9FYFqQSemXWKNUs,3865
|
|
315
|
+
tests/cli/test_collection.py,sha256=V-_M7ngwekMGqPuI16jjJZyAK2XLE4Z6QTn-8B5ykgU,4071
|
|
316
|
+
tests/cli/test_run.py,sha256=Gk8uCT0IjDSf2sf-TXeQFV83ovNzRs4GcAkQ1DhRJEU,15929
|
|
303
317
|
tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
304
|
-
tests/perf/test_perf.py,sha256=
|
|
318
|
+
tests/perf/test_perf.py,sha256=mfXTCsD9RaCef3b4CLvm8ErxBUaWzn-EKKhOxD65i3A,3817
|
|
305
319
|
tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
306
320
|
tests/rag/test_clip_benchmark.py,sha256=Ar8Br2CoAFYT2h4zCv_JKMKCGJKbKGYZgNwJ410ZaoU,2597
|
|
307
321
|
tests/rag/test_mteb.py,sha256=t64FXE-ZsOCLiRJrw-dIDIhKd1OXiaglXaeERs0lOh4,4643
|
|
@@ -312,9 +326,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
|
|
|
312
326
|
tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
|
|
313
327
|
tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
314
328
|
tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
|
|
315
|
-
evalscope-0.
|
|
316
|
-
evalscope-0.
|
|
317
|
-
evalscope-0.
|
|
318
|
-
evalscope-0.
|
|
319
|
-
evalscope-0.
|
|
320
|
-
evalscope-0.
|
|
329
|
+
evalscope-0.13.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
|
|
330
|
+
evalscope-0.13.1.dist-info/METADATA,sha256=luYebd_U93wnTkXcv_MYPfd9-JRz51DjWB6Bh6phspU,33546
|
|
331
|
+
evalscope-0.13.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
|
332
|
+
evalscope-0.13.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
|
|
333
|
+
evalscope-0.13.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
|
|
334
|
+
evalscope-0.13.1.dist-info/RECORD,,
|
tests/cli/test_all.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
|
+
from dotenv import dotenv_values
|
|
3
|
+
|
|
4
|
+
env = dotenv_values('.env')
|
|
5
|
+
|
|
6
|
+
import os
|
|
7
|
+
import subprocess
|
|
8
|
+
import unittest
|
|
9
|
+
|
|
10
|
+
from evalscope.config import TaskConfig
|
|
11
|
+
from evalscope.constants import EvalType, JudgeStrategy, OutputType
|
|
12
|
+
from evalscope.run import run_task
|
|
13
|
+
from evalscope.utils import is_module_installed, test_level_list
|
|
14
|
+
from evalscope.utils.logger import get_logger
|
|
15
|
+
|
|
16
|
+
os.environ['LOG_LEVEL'] = 'DEBUG'
|
|
17
|
+
|
|
18
|
+
logger = get_logger()
|
|
19
|
+
|
|
20
|
+
datasets=[
|
|
21
|
+
# 'iquiz',
|
|
22
|
+
# 'ifeval',
|
|
23
|
+
# 'mmlu',
|
|
24
|
+
# 'mmlu_pro',
|
|
25
|
+
# 'musr',
|
|
26
|
+
# 'process_bench',
|
|
27
|
+
# 'race',
|
|
28
|
+
# 'trivia_qa',
|
|
29
|
+
# 'cmmlu',
|
|
30
|
+
# 'humaneval',
|
|
31
|
+
# 'gsm8k',
|
|
32
|
+
# 'bbh',
|
|
33
|
+
# 'competition_math',
|
|
34
|
+
# 'math_500',
|
|
35
|
+
# 'aime24',
|
|
36
|
+
# 'gpqa',
|
|
37
|
+
# 'arc',
|
|
38
|
+
# 'ceval',
|
|
39
|
+
# 'hellaswag',
|
|
40
|
+
# 'general_mcq',
|
|
41
|
+
# 'general_qa',
|
|
42
|
+
'super_gpqa',
|
|
43
|
+
'live_code_bench',
|
|
44
|
+
'simple_qa',
|
|
45
|
+
'chinese_simpleqa',
|
|
46
|
+
]
|
|
47
|
+
|
|
48
|
+
dataset_args={
|
|
49
|
+
'mmlu': {
|
|
50
|
+
'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
|
|
51
|
+
'few_shot_num': 0
|
|
52
|
+
},
|
|
53
|
+
'mmlu_pro': {
|
|
54
|
+
'subset_list': ['math', 'health'],
|
|
55
|
+
'few_shot_num': 4
|
|
56
|
+
},
|
|
57
|
+
'ceval': {
|
|
58
|
+
'subset_list': [
|
|
59
|
+
'computer_network', 'operating_system', 'computer_architecture'
|
|
60
|
+
],
|
|
61
|
+
'few_shot_num': 0
|
|
62
|
+
},
|
|
63
|
+
'cmmlu': {
|
|
64
|
+
'subset_list': ['elementary_chinese'],
|
|
65
|
+
'few_shot_num': 0
|
|
66
|
+
},
|
|
67
|
+
'bbh': {
|
|
68
|
+
'subset_list': ['word_sorting', 'movie_recommendation'],
|
|
69
|
+
},
|
|
70
|
+
'gpqa': {
|
|
71
|
+
'subset_list': ['gpqa_diamond'],
|
|
72
|
+
'few_shot_num': 0,
|
|
73
|
+
},
|
|
74
|
+
'humaneval': {
|
|
75
|
+
'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
|
|
76
|
+
},
|
|
77
|
+
'competition_math': {
|
|
78
|
+
'subset_list': ['Level 1']
|
|
79
|
+
},
|
|
80
|
+
'math_500': {
|
|
81
|
+
'subset_list': ['Level 1']
|
|
82
|
+
},
|
|
83
|
+
'process_bench': {
|
|
84
|
+
'subset_list': ['gsm8k'],
|
|
85
|
+
},
|
|
86
|
+
'musr': {
|
|
87
|
+
'subset_list': ['murder_mysteries']
|
|
88
|
+
},
|
|
89
|
+
'general_mcq': {
|
|
90
|
+
'local_path': 'custom_eval/text/mcq', # 自定义数据集路径
|
|
91
|
+
'subset_list': [
|
|
92
|
+
'example' # 评测数据集名称,上述 *_dev.csv 中的 *
|
|
93
|
+
],
|
|
94
|
+
},
|
|
95
|
+
'general_qa': {
|
|
96
|
+
'local_path': 'custom_eval/text/qa', # 自定义数据集路径
|
|
97
|
+
'subset_list': [
|
|
98
|
+
'example', # 评测数据集名称,上述 *_dev.csv 中的 *
|
|
99
|
+
# 'test'
|
|
100
|
+
],
|
|
101
|
+
'metric_list': ['AverageBLEU']
|
|
102
|
+
},
|
|
103
|
+
'super_gpqa': {
|
|
104
|
+
'subset_list': ['Philosophy', 'Education'],
|
|
105
|
+
'few_shot_num': 0
|
|
106
|
+
},
|
|
107
|
+
'live_code_bench': {
|
|
108
|
+
'subset_list': ['v4_v5'],
|
|
109
|
+
'extra_params': {
|
|
110
|
+
'start_date': '2024-12-01',
|
|
111
|
+
'end_date': '2025-01-01'
|
|
112
|
+
},
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
class TestRun(unittest.TestCase):
|
|
117
|
+
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
118
|
+
def test_benchmarks(self):
|
|
119
|
+
from evalscope.config import TaskConfig
|
|
120
|
+
|
|
121
|
+
task_cfg = TaskConfig(
|
|
122
|
+
model='qwen2.5-7b-instruct',
|
|
123
|
+
api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
124
|
+
api_key= env.get('DASHSCOPE_API_KEY'),
|
|
125
|
+
eval_type=EvalType.SERVICE,
|
|
126
|
+
datasets=datasets,
|
|
127
|
+
dataset_args=dataset_args,
|
|
128
|
+
eval_batch_size=32,
|
|
129
|
+
limit=2,
|
|
130
|
+
stream=True,
|
|
131
|
+
generation_config={
|
|
132
|
+
'temperature': 0,
|
|
133
|
+
'n': 1,
|
|
134
|
+
'max_tokens': 4096,
|
|
135
|
+
},
|
|
136
|
+
judge_strategy=JudgeStrategy.AUTO,
|
|
137
|
+
judge_model_args={
|
|
138
|
+
'model_id': 'qwen2.5-7b-instruct',
|
|
139
|
+
'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
140
|
+
'api_key': env.get('DASHSCOPE_API_KEY'),
|
|
141
|
+
}
|
|
142
|
+
)
|
|
143
|
+
|
|
144
|
+
run_task(task_cfg=task_cfg)
|
tests/cli/test_collection.py
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
import json
|
|
2
|
+
import os
|
|
2
3
|
import unittest
|
|
3
4
|
|
|
4
5
|
from evalscope.collections import CollectionSchema, DatasetInfo, WeightedSampler
|
|
5
|
-
from evalscope.constants import EvalType
|
|
6
|
+
from evalscope.constants import EvalType, JudgeStrategy
|
|
6
7
|
from evalscope.utils.io_utils import dump_jsonl_data
|
|
7
8
|
from evalscope.utils.utils import test_level_list
|
|
8
9
|
|
|
@@ -55,3 +56,28 @@ class TestCollection(unittest.TestCase):
|
|
|
55
56
|
}},
|
|
56
57
|
)
|
|
57
58
|
run_task(task_cfg=task_cfg)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
62
|
+
def test_evaluate_collection_with_judge(self):
|
|
63
|
+
from evalscope import TaskConfig, run_task
|
|
64
|
+
|
|
65
|
+
task_cfg = TaskConfig(
|
|
66
|
+
model='qwen2.5-7b-instruct',
|
|
67
|
+
api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
68
|
+
api_key= os.getenv('DASHSCOPE_API_KEY'),
|
|
69
|
+
eval_type=EvalType.SERVICE,
|
|
70
|
+
datasets=['data_collection'],
|
|
71
|
+
dataset_args={'data_collection': {
|
|
72
|
+
'local_path': 'outputs/mixed_data_test.jsonl'
|
|
73
|
+
# 'local_path': 'outputs/weighted_mixed_data.jsonl'
|
|
74
|
+
}},
|
|
75
|
+
limit=10,
|
|
76
|
+
judge_strategy=JudgeStrategy.LLM_RECALL,
|
|
77
|
+
judge_model_args={
|
|
78
|
+
'model_id': 'qwen2.5-7b-instruct',
|
|
79
|
+
'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
80
|
+
'api_key': os.getenv('DASHSCOPE_API_KEY'),
|
|
81
|
+
}
|
|
82
|
+
)
|
|
83
|
+
run_task(task_cfg=task_cfg)
|
tests/cli/test_run.py
CHANGED
|
@@ -1,11 +1,14 @@
|
|
|
1
1
|
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
|
+
from dotenv import dotenv_values
|
|
3
|
+
|
|
4
|
+
env = dotenv_values('.env')
|
|
5
|
+
|
|
2
6
|
import os
|
|
3
7
|
import subprocess
|
|
4
|
-
import torch
|
|
5
8
|
import unittest
|
|
6
9
|
|
|
7
10
|
from evalscope.config import TaskConfig
|
|
8
|
-
from evalscope.constants import EvalType, OutputType
|
|
11
|
+
from evalscope.constants import EvalType, JudgeStrategy, OutputType
|
|
9
12
|
from evalscope.run import run_task
|
|
10
13
|
from evalscope.utils import is_module_installed, test_level_list
|
|
11
14
|
from evalscope.utils.logger import get_logger
|
|
@@ -200,7 +203,7 @@ class TestRun(unittest.TestCase):
|
|
|
200
203
|
print(res)
|
|
201
204
|
|
|
202
205
|
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
203
|
-
def
|
|
206
|
+
def test_run_one_task(self):
|
|
204
207
|
from evalscope.config import TaskConfig
|
|
205
208
|
|
|
206
209
|
task_cfg = TaskConfig(
|
|
@@ -220,14 +223,41 @@ class TestRun(unittest.TestCase):
|
|
|
220
223
|
|
|
221
224
|
run_task(task_cfg=task_cfg)
|
|
222
225
|
|
|
226
|
+
|
|
227
|
+
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
228
|
+
def test_run_task_loop(self):
|
|
229
|
+
os.environ['CUDA_VISIBLE_DEVICES'] = '2'
|
|
230
|
+
from evalscope.config import TaskConfig
|
|
231
|
+
|
|
232
|
+
task_cfg1 = TaskConfig(
|
|
233
|
+
model='Qwen/Qwen2.5-0.5B-Instruct',
|
|
234
|
+
model_id='model1',
|
|
235
|
+
datasets=['iquiz'],
|
|
236
|
+
limit=10
|
|
237
|
+
)
|
|
238
|
+
task_cfg2 = TaskConfig(
|
|
239
|
+
model='Qwen/Qwen2.5-0.5B-Instruct',
|
|
240
|
+
model_id='model2',
|
|
241
|
+
datasets=['iquiz'],
|
|
242
|
+
limit=10
|
|
243
|
+
)
|
|
244
|
+
task_cfg3 = TaskConfig(
|
|
245
|
+
model='Qwen/Qwen2.5-0.5B-Instruct',
|
|
246
|
+
model_id='model3',
|
|
247
|
+
datasets=['iquiz'],
|
|
248
|
+
limit=10
|
|
249
|
+
)
|
|
250
|
+
|
|
251
|
+
run_task(task_cfg=[task_cfg1, task_cfg2, task_cfg3])
|
|
252
|
+
|
|
223
253
|
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
224
254
|
def test_run_server_model(self):
|
|
225
255
|
from evalscope.config import TaskConfig
|
|
226
256
|
|
|
227
257
|
task_cfg = TaskConfig(
|
|
228
|
-
model='
|
|
229
|
-
api_url='
|
|
230
|
-
api_key='
|
|
258
|
+
model='qwen2.5-7b-instruct',
|
|
259
|
+
api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
260
|
+
api_key= env.get('DASHSCOPE_API_KEY'),
|
|
231
261
|
eval_type=EvalType.SERVICE,
|
|
232
262
|
datasets=[
|
|
233
263
|
# 'iquiz',
|
|
@@ -245,17 +275,17 @@ class TestRun(unittest.TestCase):
|
|
|
245
275
|
# 'competition_math',
|
|
246
276
|
# 'math_500',
|
|
247
277
|
# 'aime24',
|
|
248
|
-
'gpqa',
|
|
278
|
+
# 'gpqa',
|
|
249
279
|
# 'arc',
|
|
250
|
-
'ceval',
|
|
251
|
-
|
|
280
|
+
# 'ceval',
|
|
281
|
+
'hellaswag',
|
|
252
282
|
# 'general_mcq',
|
|
253
283
|
# 'general_qa'
|
|
254
284
|
# 'super_gpqa',
|
|
255
285
|
],
|
|
256
286
|
dataset_args={
|
|
257
287
|
'mmlu': {
|
|
258
|
-
'subset_list': ['elementary_mathematics'],
|
|
288
|
+
'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
|
|
259
289
|
'few_shot_num': 0
|
|
260
290
|
},
|
|
261
291
|
'mmlu_pro': {
|
|
@@ -313,7 +343,7 @@ class TestRun(unittest.TestCase):
|
|
|
313
343
|
}
|
|
314
344
|
},
|
|
315
345
|
eval_batch_size=32,
|
|
316
|
-
limit=
|
|
346
|
+
limit=15,
|
|
317
347
|
# debug=True,
|
|
318
348
|
stream=False,
|
|
319
349
|
generation_config={
|
|
@@ -357,5 +387,67 @@ class TestRun(unittest.TestCase):
|
|
|
357
387
|
|
|
358
388
|
run_task(task_cfg=task_cfg)
|
|
359
389
|
|
|
390
|
+
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
391
|
+
def test_run_judge_model(self):
|
|
392
|
+
from evalscope.config import TaskConfig
|
|
393
|
+
|
|
394
|
+
task_cfg = TaskConfig(
|
|
395
|
+
model='qwq-32b',
|
|
396
|
+
api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
397
|
+
api_key= env.get('DASHSCOPE_API_KEY'),
|
|
398
|
+
eval_type=EvalType.SERVICE,
|
|
399
|
+
datasets=[
|
|
400
|
+
# 'math_500',
|
|
401
|
+
# 'aime24',
|
|
402
|
+
# 'competition_math',
|
|
403
|
+
# 'arc',
|
|
404
|
+
# 'gsm8k'
|
|
405
|
+
# 'truthful_qa',
|
|
406
|
+
# 'simple_qa',
|
|
407
|
+
# # 'chinese_simpleqa',
|
|
408
|
+
'live_code_bench',
|
|
409
|
+
# 'humaneval'
|
|
410
|
+
# 'general_qa'
|
|
411
|
+
],
|
|
412
|
+
dataset_args={
|
|
413
|
+
'competition_math': {
|
|
414
|
+
'subset_list': ['Level 4']
|
|
415
|
+
},
|
|
416
|
+
'live_code_bench': {
|
|
417
|
+
'extra_params': {
|
|
418
|
+
'start_date': '2024-08-01',
|
|
419
|
+
'end_date': '2025-02-28'
|
|
420
|
+
},
|
|
421
|
+
'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/code_generation_lite'
|
|
422
|
+
},
|
|
423
|
+
'general_qa': {
|
|
424
|
+
'local_path': 'custom_eval/text/qa', # 自定义数据集路径
|
|
425
|
+
'subset_list': [
|
|
426
|
+
'example', # 评测数据集名称,上述 *_dev.csv 中的 *
|
|
427
|
+
# 'test'
|
|
428
|
+
]
|
|
429
|
+
},
|
|
430
|
+
},
|
|
431
|
+
eval_batch_size=10,
|
|
432
|
+
# limit=5,
|
|
433
|
+
judge_strategy=JudgeStrategy.AUTO,
|
|
434
|
+
judge_worker_num=8,
|
|
435
|
+
judge_model_args={
|
|
436
|
+
'model_id': 'qwen2.5-7b-instruct',
|
|
437
|
+
'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
438
|
+
'api_key': env.get('DASHSCOPE_API_KEY'),
|
|
439
|
+
},
|
|
440
|
+
generation_config={
|
|
441
|
+
'max_new_tokens': 20000,
|
|
442
|
+
'temperature': 0.0,
|
|
443
|
+
'seed': 42,
|
|
444
|
+
},
|
|
445
|
+
timeout=60000,
|
|
446
|
+
stream=True,
|
|
447
|
+
# use_cache='outputs/20250320_143658'
|
|
448
|
+
)
|
|
449
|
+
|
|
450
|
+
run_task(task_cfg=task_cfg)
|
|
451
|
+
|
|
360
452
|
if __name__ == '__main__':
|
|
361
453
|
unittest.main()
|
tests/perf/test_perf.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
2
|
import os
|
|
3
|
+
from dotenv import dotenv_values
|
|
3
4
|
|
|
5
|
+
env = dotenv_values('.env')
|
|
4
6
|
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
|
|
5
7
|
import unittest
|
|
6
8
|
|
|
@@ -96,6 +98,27 @@ class TestPerf(unittest.TestCase):
|
|
|
96
98
|
}
|
|
97
99
|
run_perf_benchmark(task_cfg)
|
|
98
100
|
|
|
101
|
+
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
102
|
+
def test_run_perf_local_random(self):
|
|
103
|
+
from evalscope.perf.arguments import Arguments
|
|
104
|
+
task_cfg = Arguments(
|
|
105
|
+
parallel=20,
|
|
106
|
+
model='Qwen2.5-0.5B-Instruct',
|
|
107
|
+
url='http://127.0.0.1:8801/v1/chat/completions',
|
|
108
|
+
api='openai',
|
|
109
|
+
dataset='random',
|
|
110
|
+
min_tokens=1024,
|
|
111
|
+
max_tokens=1024,
|
|
112
|
+
prefix_length=0,
|
|
113
|
+
min_prompt_length=1024,
|
|
114
|
+
max_prompt_length=1024,
|
|
115
|
+
number=40,
|
|
116
|
+
tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
|
|
117
|
+
seed=None,
|
|
118
|
+
debug= True,
|
|
119
|
+
)
|
|
120
|
+
run_perf_benchmark(task_cfg)
|
|
121
|
+
|
|
99
122
|
|
|
100
123
|
if __name__ == '__main__':
|
|
101
124
|
unittest.main(buffer=False)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|