evalscope 0.12.1__py3-none-any.whl → 0.13.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope has been flagged as possibly problematic; review the release details before upgrading.

Files changed (50):
  1. evalscope/arguments.py +6 -1
  2. evalscope/benchmarks/arc/arc_adapter.py +3 -3
  3. evalscope/benchmarks/benchmark.py +3 -2
  4. evalscope/benchmarks/ceval/ceval_adapter.py +2 -1
  5. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  6. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +168 -0
  7. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +2 -1
  8. evalscope/benchmarks/data_adapter.py +32 -4
  9. evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -4
  10. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +20 -24
  11. evalscope/benchmarks/humaneval/humaneval_adapter.py +8 -5
  12. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  13. evalscope/benchmarks/live_code_bench/evaluate_utils.py +193 -0
  14. evalscope/benchmarks/live_code_bench/execute_utils.py +267 -0
  15. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  16. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +90 -0
  17. evalscope/benchmarks/live_code_bench/load_utils.py +71 -0
  18. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  19. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  20. evalscope/benchmarks/live_code_bench/testing_util.py +721 -0
  21. evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -2
  22. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +148 -1
  23. evalscope/benchmarks/super_gpqa/utils.py +0 -5
  24. evalscope/collections/evaluator.py +4 -4
  25. evalscope/config.py +11 -3
  26. evalscope/constants.py +8 -0
  27. evalscope/evaluator/evaluator.py +56 -17
  28. evalscope/metrics/llm_judge.py +104 -0
  29. evalscope/models/custom_adapter.py +1 -1
  30. evalscope/perf/arguments.py +11 -40
  31. evalscope/perf/benchmark.py +39 -28
  32. evalscope/perf/http_client.py +9 -1
  33. evalscope/perf/main.py +2 -1
  34. evalscope/perf/plugin/datasets/__init__.py +1 -0
  35. evalscope/perf/plugin/datasets/openqa.py +6 -11
  36. evalscope/perf/plugin/datasets/random_dataset.py +51 -0
  37. evalscope/perf/utils/db_util.py +3 -0
  38. evalscope/run.py +15 -3
  39. evalscope/third_party/longbench_write/infer.py +1 -1
  40. evalscope/version.py +2 -2
  41. {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/METADATA +56 -38
  42. {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/RECORD +50 -36
  43. tests/cli/test_all.py +144 -0
  44. tests/cli/test_collection.py +27 -1
  45. tests/cli/test_run.py +103 -11
  46. tests/perf/test_perf.py +23 -0
  47. {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/LICENSE +0 -0
  48. {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/WHEEL +0 -0
  49. {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/entry_points.txt +0 -0
  50. {evalscope-0.12.1.dist-info → evalscope-0.13.1.dist-info}/top_level.txt +0 -0
@@ -1,11 +1,11 @@
1
1
  evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
2
- evalscope/arguments.py,sha256=QT3f_oBDl1jXl68rgHVBsOxWeJTw1zXFmm7Zu1VRMQU,4826
3
- evalscope/config.py,sha256=eQ_r94W_uQiF9ZWN-k84KxrT85E3YiJklDuM5mIKt_s,9124
4
- evalscope/constants.py,sha256=l6xkVknVybi3frXaftksRZNaCFcw9ZJZ8ORJeWDJEaQ,3615
5
- evalscope/run.py,sha256=ae6WsKllRt5xanRRFJWSBkVEjCf-Lgx35nlLyqOxctU,5785
2
+ evalscope/arguments.py,sha256=VhZd7a8PoZK01qFCMEADLINqLYi6njRqRb50iR1l1lo,5241
3
+ evalscope/config.py,sha256=wLrc8a7z28IFPRaeUzot5HGtSDY_13KR-3kRyFKEGx8,9476
4
+ evalscope/constants.py,sha256=Cgzkoz4R3MC3YLtbCM2fmSwF8Z2kuxYdOC8t9FWJj9w,3740
5
+ evalscope/run.py,sha256=LUCdnNzNIfHSWvxu3gxAsHEDX7hT5mcVnV4lSY5h0iA,6007
6
6
  evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
7
7
  evalscope/summarizer.py,sha256=Wnt8Y61PasOkigo8zMpi1qdFzfETjfnDtCDDQ6VwgDw,5867
8
- evalscope/version.py,sha256=KVyRitFqvCQM-1iaU2VOfx7rh9IDqOUGstYhQ6DLAI4,119
8
+ evalscope/version.py,sha256=Y30-zF2dwch3upMc0t5yNNjIgvI-LQQWFhftRQgXvOk,119
9
9
  evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
11
11
  evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
@@ -56,15 +56,15 @@ evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4ty
56
56
  evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
57
57
  evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
58
58
  evalscope/benchmarks/__init__.py,sha256=b_SWdV1ZyOqFiwc_9lIjKrIvK1rwnF2cCIF7XN9CN8E,932
59
- evalscope/benchmarks/benchmark.py,sha256=AByXFsuia3lqCLFsPRt95UR7SxwEuAGpeuKBVjb7jLE,2463
60
- evalscope/benchmarks/data_adapter.py,sha256=JwptQHL4DbcZ_Ll0kJ0QL8rgK2ZVFftyAXiUWKcrvL4,15532
59
+ evalscope/benchmarks/benchmark.py,sha256=a_7Ctz36McuTyBSTYi56jis9pvOdWhg7JVSPFrbxqR4,2535
60
+ evalscope/benchmarks/data_adapter.py,sha256=2u9oC4RBHVfEMHKPRu87xM4XOw_RS2Z2fvagNsciEo4,16791
61
61
  evalscope/benchmarks/utils.py,sha256=6kxeBz4w8Fw68AYH05a4ncjgkaUV4bU3eaFVLqOdkMI,1321
62
62
  evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
63
63
  evalscope/benchmarks/aime/aime24_adapter.py,sha256=dBm9yukt4-CByEPUlAPAIN6mL3VkZcI-dw2kz4oQBMo,1715
64
64
  evalscope/benchmarks/aime/aime25_adapter.py,sha256=FB_NufY2V7uYdxVnrY_4y81gyyfYDnvedz1_zHdDWt4,1709
65
65
  evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
66
66
  evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
67
- evalscope/benchmarks/arc/arc_adapter.py,sha256=8ksPc6IM266NE7F9Bo-Y9SRZZM-tlCKPfLbJg3VEq9w,6269
67
+ evalscope/benchmarks/arc/arc_adapter.py,sha256=lkhDz-DYjPQ1vHzo8X4j-0Lq_rBxAnws35_R00pIbNI,6347
68
68
  evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
69
69
  evalscope/benchmarks/bbh/bbh_adapter.py,sha256=fROpzenrjpEBWtnvM_RL_m0uXPOhXTtYAglJEZbzUdY,8330
70
70
  evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
@@ -95,11 +95,13 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt
95
95
  evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
96
96
  evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
97
97
  evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
98
- evalscope/benchmarks/ceval/ceval_adapter.py,sha256=B3nO0WmqSyH-LlicqreIPWrxXgVPt1rrp3ndc7YRYiE,11157
98
+ evalscope/benchmarks/ceval/ceval_adapter.py,sha256=E4QobCjSSkMZtPJyaT_XBVxiqEqa1bta1I9aFnaHOqs,11308
99
99
  evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
100
+ evalscope/benchmarks/chinese_simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
101
+ evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=nKF_a0yc_PbZYjYA_-gJh3ePZIEz5txrhDV4IsTqD4Q,8196
100
102
  evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
101
103
  evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
102
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=zNaYSelcGZulgFLQXp2eD56_QOFRkaXHknfy_VWJciA,10230
104
+ evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=TTq2jRz46Hqc_D_ZBaiw_OwKub1FZX6w8C7g7COIdGs,10372
103
105
  evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
104
106
  evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
105
107
  evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
@@ -109,7 +111,7 @@ evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=U4M-0MVJS
109
111
  evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
110
112
  evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=ZVGzUuuQ0UTOqQtXE40ZyBeMOSl8saSiFEQ5_siJ-c8,5052
111
113
  evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
112
- evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=wnKUIVc1UvnjI5XGOHf5aCx0H0xTKoZZWAD-Q8AJNAE,4686
114
+ evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=ELDdS5T3JZeSWVv1ldawcHzLwAljEWKqakbRMVcBvgw,4741
113
115
  evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
114
116
  evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
115
117
  evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=UB287DtnbkSQXZsbReFJqmQRwbo672DTCeXXilR_-Vc,4790
@@ -118,10 +120,10 @@ evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTv
118
120
  evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=ZZZ-9oja53IwiU33Kjm7NTk4MbFGWyvonhnHrn_3Na8,10557
119
121
  evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
120
122
  evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
121
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=2CnrIapK51l4bQyFKWWqmOaeBSpkIlq2asetWcp24gs,6057
123
+ evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=QYZZuxbjkKxAjxuoWn0M5WgusO55vzeAcyKnWUMow3M,5871
122
124
  evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
123
125
  evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
124
- evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=YK4u3JG_Ub4vP-xnsrf-lMheIBdCgFWmirhPUch3biU,5120
126
+ evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=UOjakV31J0g7TYbrRls0ItcopWOJu54ucPfaqSJB7Os,5250
125
127
  evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
126
128
  evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=R7MILWuMglvXr7yWioBxyJ2T4EdEkwRZ1lnvWqZqG28,1922
127
129
  evalscope/benchmarks/ifeval/instructions.py,sha256=oaJ9D_4rvS67BraHBNPpDtFd4TblFAnR4A3YW9HWfuY,56304
@@ -130,11 +132,20 @@ evalscope/benchmarks/ifeval/instructions_util.py,sha256=vkemXeylJMmgW8LgfQe4cSy2
130
132
  evalscope/benchmarks/ifeval/utils.py,sha256=TKrM1m2qDCUauahogItDdICf4mDk0OjasSxgnxjt2KY,4517
131
133
  evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
132
134
  evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=16whmFkJt9fLbei9d-kmjnWB_5y5vsiX9tK5kSuxDw8,2449
135
+ evalscope/benchmarks/live_code_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
136
+ evalscope/benchmarks/live_code_bench/evaluate_utils.py,sha256=rOWaG8PV4AGIRhS_gqwxEhphEVe1Cqg57Eudwm5HTjI,6820
137
+ evalscope/benchmarks/live_code_bench/execute_utils.py,sha256=MreaMLI0IicNZawpfqcyoRLt67EZ3CJvmxxRTYwhAbU,7397
138
+ evalscope/benchmarks/live_code_bench/extract_utils.py,sha256=ZcQ8y741uawPo6I_1_XglR3eqJFDNrqc8fILKZupVRs,2375
139
+ evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=8MOECcweLG465JFgUzP20QlKyBAO90oFHhH7Z77FuUY,3521
140
+ evalscope/benchmarks/live_code_bench/load_utils.py,sha256=5i9wtdPLYR8ckjx5MaYQVC2LFYvjKzR6Fa6UZmeOTRc,2445
141
+ evalscope/benchmarks/live_code_bench/pass_k_utils.py,sha256=Ktrp_lXdfFzoHtQNQNdGfIl26ySjaPCHm4Zv-dFvRqM,2024
142
+ evalscope/benchmarks/live_code_bench/prompts.py,sha256=P4KILIAIDT1MKDck0xHYV_6v9820wDZRhxVMazmlL-g,12600
143
+ evalscope/benchmarks/live_code_bench/testing_util.py,sha256=EBe0XzY3B4cW5dCjwLksW7o4R1chZwsuFjxkfqVPFI4,28238
133
144
  evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
134
145
  evalscope/benchmarks/math_500/math_500_adapter.py,sha256=SB2eb4Z7DTXdptqirEoctqTdDLEu28s7bLeCAMBmAFo,1923
135
146
  evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
136
147
  evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
137
- evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=RMZoHAApVOpD3_NeHLcsiM7SpglKpfrGSUhBWPgdAVE,11525
148
+ evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=e__Evar99V9l65FlzT6T594CN4iMgmuVhjujQAm4po4,11662
138
149
  evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
139
150
  evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
140
151
  evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=OANfue-fK543drJrDj6V_zDMtySrQEBHPgTsejr-e7U,4226
@@ -148,11 +159,11 @@ evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO
148
159
  evalscope/benchmarks/race/race_adapter.py,sha256=RD0B-i5dzeNKuhqnWbremgf4tk9jmOO4_eLAiITB1F0,6381
149
160
  evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
150
161
  evalscope/benchmarks/simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
151
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=FZwXN78X2fV3Dchop_UuFAhNFkwWs12qJlIczgvvrJ8,477
162
+ evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=SrK18xDe4HyUaIPRLVEDtoF4Nc_ms4aFxktEsj8MnnA,9071
152
163
  evalscope/benchmarks/super_gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
153
164
  evalscope/benchmarks/super_gpqa/five_shot_prompt.txt,sha256=vD3RMeQustxY_oWA8IobntjywT8ZUO7Jaub--rElDT4,4718
154
165
  evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=BqNLL8BYnK6tRuIdV6ijL4Uym2SejH_h1BV06XNjSE4,9331
155
- evalscope/benchmarks/super_gpqa/utils.py,sha256=uhANVnoIaH8-QuzjcVuyVB-8aGOMy94XKUF-TFemY_Q,3578
166
+ evalscope/benchmarks/super_gpqa/utils.py,sha256=ftYPP9ODvLBlQSd9ltACx9iRIvjB8u1bg4AtgcJ4JAI,3360
156
167
  evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt,sha256=y7hR9SmoR_YqoEWtT8N9JpZOpeJIlg0cDGDgYw6R6hM,237
157
168
  evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
158
169
  evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
@@ -169,16 +180,17 @@ evalscope/cli/start_eval.py,sha256=MXhVDeaMFd6ny88-gnVtQflH660UaDj240YGYnHccx8,7
169
180
  evalscope/cli/start_perf.py,sha256=5hLi5jWgM9BJPXLd8d9D1zqrcj_5c0KvkfB1DgD4_RU,831
170
181
  evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
171
182
  evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
172
- evalscope/collections/evaluator.py,sha256=Zi3uRZhSRIimYye_apZWL6VOiHqaM5znbFA4TBvqSbg,12761
183
+ evalscope/collections/evaluator.py,sha256=YJy8Dj35XCdCwhNDwZecJkeW1_ZgIOsuRLFzfe3SyV8,12724
173
184
  evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
174
185
  evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
175
186
  evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
176
- evalscope/evaluator/evaluator.py,sha256=VIiw1eI46UOsFWNd7schD4ah_Q5ll0crl2sRmGIRmig,17649
187
+ evalscope/evaluator/evaluator.py,sha256=szRQrXH5ILpUljb14lcunuOt185H8Um1paviTokraA4,19845
177
188
  evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
178
189
  evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
179
190
  evalscope/evaluator/reviewer/auto_reviewer.py,sha256=PDz1Nt_qq0oGZBBwek2-M8VBUNLkOkmYObzR8gO7nEc,16624
180
191
  evalscope/metrics/__init__.py,sha256=SWvqzUzdryW5URz6u4fPkP9XSyA09nQ8zBeE8BbchSg,349
181
192
  evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
193
+ evalscope/metrics/llm_judge.py,sha256=g9pLMJPNTUyw0sGteblws1_e_KzbRqcbqKcaIzfE_DE,4031
182
194
  evalscope/metrics/math_parser.py,sha256=uTDudn305G3b8-GboWTrDE6OfrEwAW-areHnoGXZ6Is,17302
183
195
  evalscope/metrics/metrics.py,sha256=_YI7RhxlFu_JOgeE3LF9UKu6mJruvyu4FgqVf78Bjb8,13813
184
196
  evalscope/metrics/named_metrics.py,sha256=pSHA2_qdi9B5bDHIh08GYhx63odilSwA_T-95K1Usl0,1380
@@ -191,7 +203,7 @@ evalscope/models/__init__.py,sha256=i9vcOBMEF_UM7C2gpmh2GsQk3njwqevoQ6A4CnP1fHs,
191
203
  evalscope/models/base_adapter.py,sha256=7PbRwfD5PIZCBYVds6ZHI8TBY9C5i2LdPOTu88FJWlY,3414
192
204
  evalscope/models/chat_adapter.py,sha256=5-yz7L41OdeBO9J_qRkEZcduATrYIMe__UFfh7BzjIc,6277
193
205
  evalscope/models/choice_adapter.py,sha256=fnJdo-FMJ-zvNLbEJGc73odgWXIxtVudL00JIf2vzsA,8239
194
- evalscope/models/custom_adapter.py,sha256=Za52WF1I_YcJkGomJ6s9sP2Fs8DoJ4HHBYBi3iC3WNI,2379
206
+ evalscope/models/custom_adapter.py,sha256=AGztmZ0aT0g2flh4B4NaiZ8LCDg8tT0gVNxmrP5W1mA,2401
195
207
  evalscope/models/local_model.py,sha256=yydggBCLcBAmUWbBhv7o2CA3RbG0DwDZharPdrkbNcg,2628
196
208
  evalscope/models/model.py,sha256=diu4TE1ZFWdynTxsl4DejTNsLdwjxoyj2nsKR-Y8EZE,7343
197
209
  evalscope/models/register.py,sha256=4vX6AfScAzwD7UkncbuejfAiQHznQkK5hvtG6jEUbWo,809
@@ -200,10 +212,10 @@ evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk
200
212
  evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
201
213
  evalscope/models/custom/dummy_model.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
202
214
  evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
203
- evalscope/perf/arguments.py,sha256=u3GNdnOBmiEirtgJLspsLO7qBwHeWLoXd4vlt69jJ-g,9717
204
- evalscope/perf/benchmark.py,sha256=qNgDNseW8N0beuAB_4-JVtTdHs7ZaJEHK5XnkMU9vRU,9618
205
- evalscope/perf/http_client.py,sha256=eoRPaBTCVC4DpgH4tnc-31_h_2PVkWUwCLWK6_TTkhM,7282
206
- evalscope/perf/main.py,sha256=SUMz8S2XPL8JaSL1-vy8qkrb34d5vp6DfQdwIGOUXTk,1277
215
+ evalscope/perf/arguments.py,sha256=hBR6TXCoLkHRLxrwXacmierfFZhyQaT5hnKAfp-vE6I,8990
216
+ evalscope/perf/benchmark.py,sha256=VYcFhSoZXcLoNXpFYxOFxLbBLv_8Tn74Qklim7vELCM,9889
217
+ evalscope/perf/http_client.py,sha256=xMakdQkJ2cgIOd-yOmHEW0vbGKTJ0JWhLFt9IFtUP8Q,7473
218
+ evalscope/perf/main.py,sha256=w-yDbl0osaTAMgC-JNPpqIq2LQ7U4c-Ht7Amj8Nbjc8,1278
207
219
  evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
208
220
  evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
209
221
  evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
@@ -211,18 +223,19 @@ evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqY
211
223
  evalscope/perf/plugin/api/custom_api.py,sha256=ay1AGi4y2opjwyRl0J0A54-vLB-pBj3QBFkzog0KA-g,3787
212
224
  evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
213
225
  evalscope/perf/plugin/api/openai_api.py,sha256=KQRQMOfQceKQtrvTE-SyhNHcDoGuQ0900yh7r74Hcoo,7560
214
- evalscope/perf/plugin/datasets/__init__.py,sha256=9mz2TnVHhxbEKAS9pLbKMQuIoShNlZpGiRo9e2RQLUs,490
226
+ evalscope/perf/plugin/datasets/__init__.py,sha256=Z6Jc0RxJS_z0nBBV1-b0-56Ija60AtQ7I_67gY6ZfdQ,568
215
227
  evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
216
228
  evalscope/perf/plugin/datasets/custom.py,sha256=_GSC5yR90_BjcRjdJqrJT2vHQAzskz5XxYOxngUM2Pg,815
217
229
  evalscope/perf/plugin/datasets/flickr8k.py,sha256=UzAIFIO0m5inWOkWM1mO6wfV2HOuXAqiTxCJ4b0SiZM,1589
218
230
  evalscope/perf/plugin/datasets/line_by_line.py,sha256=IKVZMpKei6XW9DTm9VEssWHE96i1lTqMf0621dA_img,836
219
231
  evalscope/perf/plugin/datasets/longalpaca.py,sha256=2aENqCly_DX1dyNcurYsLFJIvXYFph6jWm7z7XETvMk,1176
220
- evalscope/perf/plugin/datasets/openqa.py,sha256=2pv7yyPSFYTjPhvAGBsHl0eQO8gt7Wk1CaKcfTi3Tnc,1394
232
+ evalscope/perf/plugin/datasets/openqa.py,sha256=_aVXs2s8wbmtoB6ZO-pNjUZvBVxRUYdoJDGv5-BumtI,1342
233
+ evalscope/perf/plugin/datasets/random_dataset.py,sha256=wPyY5kk2zKnc8u9uYEl-vQ6BLHeWbdC8EHEAZNFSDeU,2702
221
234
  evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=ef8MXhT6756y6LsXSpYeWjmwswu2hRXe2BOVS2_OgVM,1968
222
235
  evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
223
236
  evalscope/perf/utils/analysis_result.py,sha256=ESzaZHGTpr2LoJR3jpOzqMphxSrr79d364ZzD159PmY,1169
224
237
  evalscope/perf/utils/benchmark_util.py,sha256=4TyQ_tE5odcjKDFDueI3jrC0vld6QxmTreOd5_SP4vE,5802
225
- evalscope/perf/utils/db_util.py,sha256=PSBq16uWyzXx0zyoEE4wazWKN19UAA8_GjobS7rTPso,9001
238
+ evalscope/perf/utils/db_util.py,sha256=hRXixxpNBrACF43reOJV5SoO1vj34cqoNMaTKH_oLLE,9100
226
239
  evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
227
240
  evalscope/perf/utils/local_server.py,sha256=clF8i0UFmaxBBB6gX05KvVCyzSv0xzsAidz0_sLLlAk,4627
228
241
  evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
@@ -257,7 +270,7 @@ evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u
257
270
  evalscope/third_party/longbench_write/default_task.json,sha256=d_NPShtW10Mc02U3pAuxX9hXd09tZw7QJAr1SvrECcM,694
258
271
  evalscope/third_party/longbench_write/default_task.yaml,sha256=YjU8EeyH9UtM8e7_fhrwJNChQdszOAcrKmOi--Awvhk,578
259
272
  evalscope/third_party/longbench_write/eval.py,sha256=39McZSDHL7bA5Dg-BSyZ4EiAF1nfTiYJAnx5FqbNYok,11265
260
- evalscope/third_party/longbench_write/infer.py,sha256=bFsOp--8Qn6qQ-NpdLY0bennQGQl5TMGEngvGda8k7g,4937
273
+ evalscope/third_party/longbench_write/infer.py,sha256=32t90zTll6SXH7Wx8QnRFMs6ZUwvpbgYNuawCByzwR0,4971
261
274
  evalscope/third_party/longbench_write/longbench_write.py,sha256=nIR1toB1hvUXR7Lrs3xcY9wqaI-bjeADg_Oscf3HdaY,3991
262
275
  evalscope/third_party/longbench_write/utils.py,sha256=nd-YslsOyNGAuyBfAWb2pnTMaGLMQ58lbnJJdrCndeI,815
263
276
  evalscope/third_party/longbench_write/resources/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
@@ -298,10 +311,11 @@ evalscope/utils/utils.py,sha256=lGvn94ryIzx-7WLNJeuyehNTmINt0jYIjrjW12woPCs,9730
298
311
  tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
299
312
  tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
300
313
  tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
301
- tests/cli/test_collection.py,sha256=-CrcAiZVtsY7mXUNVlRjhFWEgmPL5k1dH9PjNhKzYdU,3028
302
- tests/cli/test_run.py,sha256=flwZZ1PyMnrxy5f36mdUeGSO_ANpr2588dw1zHVQYJY,12735
314
+ tests/cli/test_all.py,sha256=1wwXtdjBmWYLhs5TXOJhZBwPm2qd9FYFqQSemXWKNUs,3865
315
+ tests/cli/test_collection.py,sha256=V-_M7ngwekMGqPuI16jjJZyAK2XLE4Z6QTn-8B5ykgU,4071
316
+ tests/cli/test_run.py,sha256=Gk8uCT0IjDSf2sf-TXeQFV83ovNzRs4GcAkQ1DhRJEU,15929
303
317
  tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
304
- tests/perf/test_perf.py,sha256=iB8Mg565SfwPsObdAByHYfZNqN71kUtPW7ucmyiOWo8,3025
318
+ tests/perf/test_perf.py,sha256=mfXTCsD9RaCef3b4CLvm8ErxBUaWzn-EKKhOxD65i3A,3817
305
319
  tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
306
320
  tests/rag/test_clip_benchmark.py,sha256=Ar8Br2CoAFYT2h4zCv_JKMKCGJKbKGYZgNwJ410ZaoU,2597
307
321
  tests/rag/test_mteb.py,sha256=t64FXE-ZsOCLiRJrw-dIDIhKd1OXiaglXaeERs0lOh4,4643
@@ -312,9 +326,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
312
326
  tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
313
327
  tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
314
328
  tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
315
- evalscope-0.12.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
316
- evalscope-0.12.1.dist-info/METADATA,sha256=jdU1I5E3YNc8PLfY0NYYDTKiXzTE4HYtX5J6OUPkQ_s,31337
317
- evalscope-0.12.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
318
- evalscope-0.12.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
319
- evalscope-0.12.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
320
- evalscope-0.12.1.dist-info/RECORD,,
329
+ evalscope-0.13.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
330
+ evalscope-0.13.1.dist-info/METADATA,sha256=luYebd_U93wnTkXcv_MYPfd9-JRz51DjWB6Bh6phspU,33546
331
+ evalscope-0.13.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
332
+ evalscope-0.13.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
333
+ evalscope-0.13.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
334
+ evalscope-0.13.1.dist-info/RECORD,,
tests/cli/test_all.py ADDED
@@ -0,0 +1,144 @@
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ from dotenv import dotenv_values
3
+
4
+ env = dotenv_values('.env')
5
+
6
+ import os
7
+ import subprocess
8
+ import unittest
9
+
10
+ from evalscope.config import TaskConfig
11
+ from evalscope.constants import EvalType, JudgeStrategy, OutputType
12
+ from evalscope.run import run_task
13
+ from evalscope.utils import is_module_installed, test_level_list
14
+ from evalscope.utils.logger import get_logger
15
+
16
+ os.environ['LOG_LEVEL'] = 'DEBUG'
17
+
18
+ logger = get_logger()
19
+
20
+ datasets=[
21
+ # 'iquiz',
22
+ # 'ifeval',
23
+ # 'mmlu',
24
+ # 'mmlu_pro',
25
+ # 'musr',
26
+ # 'process_bench',
27
+ # 'race',
28
+ # 'trivia_qa',
29
+ # 'cmmlu',
30
+ # 'humaneval',
31
+ # 'gsm8k',
32
+ # 'bbh',
33
+ # 'competition_math',
34
+ # 'math_500',
35
+ # 'aime24',
36
+ # 'gpqa',
37
+ # 'arc',
38
+ # 'ceval',
39
+ # 'hellaswag',
40
+ # 'general_mcq',
41
+ # 'general_qa',
42
+ 'super_gpqa',
43
+ 'live_code_bench',
44
+ 'simple_qa',
45
+ 'chinese_simpleqa',
46
+ ]
47
+
48
+ dataset_args={
49
+ 'mmlu': {
50
+ 'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
51
+ 'few_shot_num': 0
52
+ },
53
+ 'mmlu_pro': {
54
+ 'subset_list': ['math', 'health'],
55
+ 'few_shot_num': 4
56
+ },
57
+ 'ceval': {
58
+ 'subset_list': [
59
+ 'computer_network', 'operating_system', 'computer_architecture'
60
+ ],
61
+ 'few_shot_num': 0
62
+ },
63
+ 'cmmlu': {
64
+ 'subset_list': ['elementary_chinese'],
65
+ 'few_shot_num': 0
66
+ },
67
+ 'bbh': {
68
+ 'subset_list': ['word_sorting', 'movie_recommendation'],
69
+ },
70
+ 'gpqa': {
71
+ 'subset_list': ['gpqa_diamond'],
72
+ 'few_shot_num': 0,
73
+ },
74
+ 'humaneval': {
75
+ 'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
76
+ },
77
+ 'competition_math': {
78
+ 'subset_list': ['Level 1']
79
+ },
80
+ 'math_500': {
81
+ 'subset_list': ['Level 1']
82
+ },
83
+ 'process_bench': {
84
+ 'subset_list': ['gsm8k'],
85
+ },
86
+ 'musr': {
87
+ 'subset_list': ['murder_mysteries']
88
+ },
89
+ 'general_mcq': {
90
+ 'local_path': 'custom_eval/text/mcq', # 自定义数据集路径
91
+ 'subset_list': [
92
+ 'example' # 评测数据集名称,上述 *_dev.csv 中的 *
93
+ ],
94
+ },
95
+ 'general_qa': {
96
+ 'local_path': 'custom_eval/text/qa', # 自定义数据集路径
97
+ 'subset_list': [
98
+ 'example', # 评测数据集名称,上述 *_dev.csv 中的 *
99
+ # 'test'
100
+ ],
101
+ 'metric_list': ['AverageBLEU']
102
+ },
103
+ 'super_gpqa': {
104
+ 'subset_list': ['Philosophy', 'Education'],
105
+ 'few_shot_num': 0
106
+ },
107
+ 'live_code_bench': {
108
+ 'subset_list': ['v4_v5'],
109
+ 'extra_params': {
110
+ 'start_date': '2024-12-01',
111
+ 'end_date': '2025-01-01'
112
+ },
113
+ }
114
+ }
115
+
116
+ class TestRun(unittest.TestCase):
117
+ @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
118
+ def test_benchmarks(self):
119
+ from evalscope.config import TaskConfig
120
+
121
+ task_cfg = TaskConfig(
122
+ model='qwen2.5-7b-instruct',
123
+ api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
124
+ api_key= env.get('DASHSCOPE_API_KEY'),
125
+ eval_type=EvalType.SERVICE,
126
+ datasets=datasets,
127
+ dataset_args=dataset_args,
128
+ eval_batch_size=32,
129
+ limit=2,
130
+ stream=True,
131
+ generation_config={
132
+ 'temperature': 0,
133
+ 'n': 1,
134
+ 'max_tokens': 4096,
135
+ },
136
+ judge_strategy=JudgeStrategy.AUTO,
137
+ judge_model_args={
138
+ 'model_id': 'qwen2.5-7b-instruct',
139
+ 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
140
+ 'api_key': env.get('DASHSCOPE_API_KEY'),
141
+ }
142
+ )
143
+
144
+ run_task(task_cfg=task_cfg)
@@ -1,8 +1,9 @@
1
1
  import json
2
+ import os
2
3
  import unittest
3
4
 
4
5
  from evalscope.collections import CollectionSchema, DatasetInfo, WeightedSampler
5
- from evalscope.constants import EvalType
6
+ from evalscope.constants import EvalType, JudgeStrategy
6
7
  from evalscope.utils.io_utils import dump_jsonl_data
7
8
  from evalscope.utils.utils import test_level_list
8
9
 
@@ -55,3 +56,28 @@ class TestCollection(unittest.TestCase):
55
56
  }},
56
57
  )
57
58
  run_task(task_cfg=task_cfg)
59
+
60
+
61
+ @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
62
+ def test_evaluate_collection_with_judge(self):
63
+ from evalscope import TaskConfig, run_task
64
+
65
+ task_cfg = TaskConfig(
66
+ model='qwen2.5-7b-instruct',
67
+ api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
68
+ api_key= os.getenv('DASHSCOPE_API_KEY'),
69
+ eval_type=EvalType.SERVICE,
70
+ datasets=['data_collection'],
71
+ dataset_args={'data_collection': {
72
+ 'local_path': 'outputs/mixed_data_test.jsonl'
73
+ # 'local_path': 'outputs/weighted_mixed_data.jsonl'
74
+ }},
75
+ limit=10,
76
+ judge_strategy=JudgeStrategy.LLM_RECALL,
77
+ judge_model_args={
78
+ 'model_id': 'qwen2.5-7b-instruct',
79
+ 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
80
+ 'api_key': os.getenv('DASHSCOPE_API_KEY'),
81
+ }
82
+ )
83
+ run_task(task_cfg=task_cfg)
tests/cli/test_run.py CHANGED
@@ -1,11 +1,14 @@
1
1
  # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ from dotenv import dotenv_values
3
+
4
+ env = dotenv_values('.env')
5
+
2
6
  import os
3
7
  import subprocess
4
- import torch
5
8
  import unittest
6
9
 
7
10
  from evalscope.config import TaskConfig
8
- from evalscope.constants import EvalType, OutputType
11
+ from evalscope.constants import EvalType, JudgeStrategy, OutputType
9
12
  from evalscope.run import run_task
10
13
  from evalscope.utils import is_module_installed, test_level_list
11
14
  from evalscope.utils.logger import get_logger
@@ -200,7 +203,7 @@ class TestRun(unittest.TestCase):
200
203
  print(res)
201
204
 
202
205
  @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
203
- def test_run_humaneval(self):
206
+ def test_run_one_task(self):
204
207
  from evalscope.config import TaskConfig
205
208
 
206
209
  task_cfg = TaskConfig(
@@ -220,14 +223,41 @@ class TestRun(unittest.TestCase):
220
223
 
221
224
  run_task(task_cfg=task_cfg)
222
225
 
226
+
227
+ @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
228
+ def test_run_task_loop(self):
229
+ os.environ['CUDA_VISIBLE_DEVICES'] = '2'
230
+ from evalscope.config import TaskConfig
231
+
232
+ task_cfg1 = TaskConfig(
233
+ model='Qwen/Qwen2.5-0.5B-Instruct',
234
+ model_id='model1',
235
+ datasets=['iquiz'],
236
+ limit=10
237
+ )
238
+ task_cfg2 = TaskConfig(
239
+ model='Qwen/Qwen2.5-0.5B-Instruct',
240
+ model_id='model2',
241
+ datasets=['iquiz'],
242
+ limit=10
243
+ )
244
+ task_cfg3 = TaskConfig(
245
+ model='Qwen/Qwen2.5-0.5B-Instruct',
246
+ model_id='model3',
247
+ datasets=['iquiz'],
248
+ limit=10
249
+ )
250
+
251
+ run_task(task_cfg=[task_cfg1, task_cfg2, task_cfg3])
252
+
223
253
  @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
224
254
  def test_run_server_model(self):
225
255
  from evalscope.config import TaskConfig
226
256
 
227
257
  task_cfg = TaskConfig(
228
- model='Qwen2.5-0.5B-Instruct',
229
- api_url='http://127.0.0.1:8801/v1',
230
- api_key='EMPTY',
258
+ model='qwen2.5-7b-instruct',
259
+ api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
260
+ api_key= env.get('DASHSCOPE_API_KEY'),
231
261
  eval_type=EvalType.SERVICE,
232
262
  datasets=[
233
263
  # 'iquiz',
@@ -245,17 +275,17 @@ class TestRun(unittest.TestCase):
245
275
  # 'competition_math',
246
276
  # 'math_500',
247
277
  # 'aime24',
248
- 'gpqa',
278
+ # 'gpqa',
249
279
  # 'arc',
250
- 'ceval',
251
- # 'hellaswag',
280
+ # 'ceval',
281
+ 'hellaswag',
252
282
  # 'general_mcq',
253
283
  # 'general_qa'
254
284
  # 'super_gpqa',
255
285
  ],
256
286
  dataset_args={
257
287
  'mmlu': {
258
- 'subset_list': ['elementary_mathematics'],
288
+ 'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
259
289
  'few_shot_num': 0
260
290
  },
261
291
  'mmlu_pro': {
@@ -313,7 +343,7 @@ class TestRun(unittest.TestCase):
313
343
  }
314
344
  },
315
345
  eval_batch_size=32,
316
- limit=10,
346
+ limit=15,
317
347
  # debug=True,
318
348
  stream=False,
319
349
  generation_config={
@@ -357,5 +387,67 @@ class TestRun(unittest.TestCase):
357
387
 
358
388
  run_task(task_cfg=task_cfg)
359
389
 
390
+ @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
391
+ def test_run_judge_model(self):
392
+ from evalscope.config import TaskConfig
393
+
394
+ task_cfg = TaskConfig(
395
+ model='qwq-32b',
396
+ api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
397
+ api_key= env.get('DASHSCOPE_API_KEY'),
398
+ eval_type=EvalType.SERVICE,
399
+ datasets=[
400
+ # 'math_500',
401
+ # 'aime24',
402
+ # 'competition_math',
403
+ # 'arc',
404
+ # 'gsm8k'
405
+ # 'truthful_qa',
406
+ # 'simple_qa',
407
+ # # 'chinese_simpleqa',
408
+ 'live_code_bench',
409
+ # 'humaneval'
410
+ # 'general_qa'
411
+ ],
412
+ dataset_args={
413
+ 'competition_math': {
414
+ 'subset_list': ['Level 4']
415
+ },
416
+ 'live_code_bench': {
417
+ 'extra_params': {
418
+ 'start_date': '2024-08-01',
419
+ 'end_date': '2025-02-28'
420
+ },
421
+ 'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/code_generation_lite'
422
+ },
423
+ 'general_qa': {
424
+ 'local_path': 'custom_eval/text/qa', # 自定义数据集路径
425
+ 'subset_list': [
426
+ 'example', # 评测数据集名称,上述 *_dev.csv 中的 *
427
+ # 'test'
428
+ ]
429
+ },
430
+ },
431
+ eval_batch_size=10,
432
+ # limit=5,
433
+ judge_strategy=JudgeStrategy.AUTO,
434
+ judge_worker_num=8,
435
+ judge_model_args={
436
+ 'model_id': 'qwen2.5-7b-instruct',
437
+ 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
438
+ 'api_key': env.get('DASHSCOPE_API_KEY'),
439
+ },
440
+ generation_config={
441
+ 'max_new_tokens': 20000,
442
+ 'temperature': 0.0,
443
+ 'seed': 42,
444
+ },
445
+ timeout=60000,
446
+ stream=True,
447
+ # use_cache='outputs/20250320_143658'
448
+ )
449
+
450
+ run_task(task_cfg=task_cfg)
451
+
360
452
  if __name__ == '__main__':
361
453
  unittest.main()
tests/perf/test_perf.py CHANGED
@@ -1,6 +1,8 @@
1
1
  # Copyright (c) Alibaba, Inc. and its affiliates.
2
2
  import os
3
+ from dotenv import dotenv_values
3
4
 
5
+ env = dotenv_values('.env')
4
6
  os.environ['CUDA_VISIBLE_DEVICES'] = '0'
5
7
  import unittest
6
8
 
@@ -96,6 +98,27 @@ class TestPerf(unittest.TestCase):
96
98
  }
97
99
  run_perf_benchmark(task_cfg)
98
100
 
101
+ @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
102
+ def test_run_perf_local_random(self):
103
+ from evalscope.perf.arguments import Arguments
104
+ task_cfg = Arguments(
105
+ parallel=20,
106
+ model='Qwen2.5-0.5B-Instruct',
107
+ url='http://127.0.0.1:8801/v1/chat/completions',
108
+ api='openai',
109
+ dataset='random',
110
+ min_tokens=1024,
111
+ max_tokens=1024,
112
+ prefix_length=0,
113
+ min_prompt_length=1024,
114
+ max_prompt_length=1024,
115
+ number=40,
116
+ tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
117
+ seed=None,
118
+ debug= True,
119
+ )
120
+ run_perf_benchmark(task_cfg)
121
+
99
122
 
100
123
  if __name__ == '__main__':
101
124
  unittest.main(buffer=False)