evalscope 0.12.1__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff compares the contents of two package versions that were publicly released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.

Potentially problematic release.


Files changed (43)
  1. evalscope/arguments.py +6 -1
  2. evalscope/benchmarks/arc/arc_adapter.py +3 -3
  3. evalscope/benchmarks/benchmark.py +3 -2
  4. evalscope/benchmarks/ceval/ceval_adapter.py +2 -1
  5. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  6. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +168 -0
  7. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +2 -1
  8. evalscope/benchmarks/data_adapter.py +32 -4
  9. evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -4
  10. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +20 -24
  11. evalscope/benchmarks/humaneval/humaneval_adapter.py +8 -5
  12. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  13. evalscope/benchmarks/live_code_bench/evaluate_utils.py +193 -0
  14. evalscope/benchmarks/live_code_bench/execute_utils.py +267 -0
  15. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  16. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +90 -0
  17. evalscope/benchmarks/live_code_bench/load_utils.py +71 -0
  18. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  19. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  20. evalscope/benchmarks/live_code_bench/testing_util.py +721 -0
  21. evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -2
  22. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +148 -1
  23. evalscope/benchmarks/super_gpqa/utils.py +0 -5
  24. evalscope/collections/evaluator.py +3 -3
  25. evalscope/config.py +6 -1
  26. evalscope/constants.py +7 -0
  27. evalscope/evaluator/evaluator.py +51 -13
  28. evalscope/metrics/llm_judge.py +104 -0
  29. evalscope/perf/benchmark.py +5 -0
  30. evalscope/perf/http_client.py +9 -1
  31. evalscope/perf/main.py +1 -0
  32. evalscope/run.py +1 -1
  33. evalscope/third_party/longbench_write/infer.py +1 -1
  34. evalscope/version.py +2 -2
  35. {evalscope-0.12.1.dist-info → evalscope-0.13.0.dist-info}/METADATA +25 -10
  36. {evalscope-0.12.1.dist-info → evalscope-0.13.0.dist-info}/RECORD +43 -30
  37. tests/cli/test_all.py +144 -0
  38. tests/cli/test_collection.py +27 -1
  39. tests/cli/test_run.py +72 -10
  40. {evalscope-0.12.1.dist-info → evalscope-0.13.0.dist-info}/LICENSE +0 -0
  41. {evalscope-0.12.1.dist-info → evalscope-0.13.0.dist-info}/WHEEL +0 -0
  42. {evalscope-0.12.1.dist-info → evalscope-0.13.0.dist-info}/entry_points.txt +0 -0
  43. {evalscope-0.12.1.dist-info → evalscope-0.13.0.dist-info}/top_level.txt +0 -0
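The functional changes behind these file counts are the new LLM-as-judge support (the added evalscope/metrics/llm_judge.py plus the JudgeStrategy additions in evalscope/constants.py) and two new benchmarks, chinese_simple_qa and live_code_bench. A minimal sketch of how they surface through TaskConfig, adapted from the tests/cli/test_all.py added in this release (the model name, DashScope endpoint, and API-key handling are the test's own choices, not requirements):

# Sketch based on tests/cli/test_all.py from this release; values are illustrative.
import os

from evalscope.config import TaskConfig
from evalscope.constants import EvalType, JudgeStrategy
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='qwen2.5-7b-instruct',
    api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
    api_key=os.getenv('DASHSCOPE_API_KEY'),
    eval_type=EvalType.SERVICE,
    datasets=['live_code_bench', 'chinese_simpleqa'],  # benchmarks added in 0.13.0
    dataset_args={
        'live_code_bench': {
            'subset_list': ['v4_v5'],
            'extra_params': {'start_date': '2024-12-01', 'end_date': '2025-01-01'},
        },
    },
    limit=2,
    judge_strategy=JudgeStrategy.AUTO,  # new judge plumbing in 0.13.0
    judge_model_args={
        'model_id': 'qwen2.5-7b-instruct',
        'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
        'api_key': os.getenv('DASHSCOPE_API_KEY'),
    },
)
run_task(task_cfg=task_cfg)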
{evalscope-0.12.1.dist-info → evalscope-0.13.0.dist-info}/RECORD CHANGED
@@ -1,11 +1,11 @@
  evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
- evalscope/arguments.py,sha256=QT3f_oBDl1jXl68rgHVBsOxWeJTw1zXFmm7Zu1VRMQU,4826
- evalscope/config.py,sha256=eQ_r94W_uQiF9ZWN-k84KxrT85E3YiJklDuM5mIKt_s,9124
- evalscope/constants.py,sha256=l6xkVknVybi3frXaftksRZNaCFcw9ZJZ8ORJeWDJEaQ,3615
- evalscope/run.py,sha256=ae6WsKllRt5xanRRFJWSBkVEjCf-Lgx35nlLyqOxctU,5785
+ evalscope/arguments.py,sha256=VhZd7a8PoZK01qFCMEADLINqLYi6njRqRb50iR1l1lo,5241
+ evalscope/config.py,sha256=9bMV7wf8pM7N5dEj_kJsCq6oM8xobzQDYh0NF8h-j1I,9313
+ evalscope/constants.py,sha256=ydS8oihksGnvvzvJZw7HGhEeeccHNpJxspB81gAv29Y,3720
+ evalscope/run.py,sha256=Udz-H503UaMYos0ic3A_npXIbnd4eJLx26q5UEahF-U,5797
  evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
  evalscope/summarizer.py,sha256=Wnt8Y61PasOkigo8zMpi1qdFzfETjfnDtCDDQ6VwgDw,5867
- evalscope/version.py,sha256=KVyRitFqvCQM-1iaU2VOfx7rh9IDqOUGstYhQ6DLAI4,119
+ evalscope/version.py,sha256=a1r1BkZoSpoA_eGXZoXm6WaLayRHhF__TgvE9xG-Whs,119
  evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
  evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
@@ -56,15 +56,15 @@ evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4ty
  evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
  evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
  evalscope/benchmarks/__init__.py,sha256=b_SWdV1ZyOqFiwc_9lIjKrIvK1rwnF2cCIF7XN9CN8E,932
- evalscope/benchmarks/benchmark.py,sha256=AByXFsuia3lqCLFsPRt95UR7SxwEuAGpeuKBVjb7jLE,2463
- evalscope/benchmarks/data_adapter.py,sha256=JwptQHL4DbcZ_Ll0kJ0QL8rgK2ZVFftyAXiUWKcrvL4,15532
+ evalscope/benchmarks/benchmark.py,sha256=a_7Ctz36McuTyBSTYi56jis9pvOdWhg7JVSPFrbxqR4,2535
+ evalscope/benchmarks/data_adapter.py,sha256=2u9oC4RBHVfEMHKPRu87xM4XOw_RS2Z2fvagNsciEo4,16791
  evalscope/benchmarks/utils.py,sha256=6kxeBz4w8Fw68AYH05a4ncjgkaUV4bU3eaFVLqOdkMI,1321
  evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  evalscope/benchmarks/aime/aime24_adapter.py,sha256=dBm9yukt4-CByEPUlAPAIN6mL3VkZcI-dw2kz4oQBMo,1715
  evalscope/benchmarks/aime/aime25_adapter.py,sha256=FB_NufY2V7uYdxVnrY_4y81gyyfYDnvedz1_zHdDWt4,1709
  evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
- evalscope/benchmarks/arc/arc_adapter.py,sha256=8ksPc6IM266NE7F9Bo-Y9SRZZM-tlCKPfLbJg3VEq9w,6269
+ evalscope/benchmarks/arc/arc_adapter.py,sha256=lkhDz-DYjPQ1vHzo8X4j-0Lq_rBxAnws35_R00pIbNI,6347
  evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/benchmarks/bbh/bbh_adapter.py,sha256=fROpzenrjpEBWtnvM_RL_m0uXPOhXTtYAglJEZbzUdY,8330
  evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
@@ -95,11 +95,13 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt
  evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
  evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
  evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
- evalscope/benchmarks/ceval/ceval_adapter.py,sha256=B3nO0WmqSyH-LlicqreIPWrxXgVPt1rrp3ndc7YRYiE,11157
+ evalscope/benchmarks/ceval/ceval_adapter.py,sha256=E4QobCjSSkMZtPJyaT_XBVxiqEqa1bta1I9aFnaHOqs,11308
  evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
+ evalscope/benchmarks/chinese_simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=nKF_a0yc_PbZYjYA_-gJh3ePZIEz5txrhDV4IsTqD4Q,8196
  evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=zNaYSelcGZulgFLQXp2eD56_QOFRkaXHknfy_VWJciA,10230
+ evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=TTq2jRz46Hqc_D_ZBaiw_OwKub1FZX6w8C7g7COIdGs,10372
  evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
  evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
@@ -109,7 +111,7 @@ evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=U4M-0MVJS
  evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=ZVGzUuuQ0UTOqQtXE40ZyBeMOSl8saSiFEQ5_siJ-c8,5052
  evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
- evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=wnKUIVc1UvnjI5XGOHf5aCx0H0xTKoZZWAD-Q8AJNAE,4686
+ evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=ELDdS5T3JZeSWVv1ldawcHzLwAljEWKqakbRMVcBvgw,4741
  evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
  evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=UB287DtnbkSQXZsbReFJqmQRwbo672DTCeXXilR_-Vc,4790
@@ -118,10 +120,10 @@ evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTv
  evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=ZZZ-9oja53IwiU33Kjm7NTk4MbFGWyvonhnHrn_3Na8,10557
  evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=2CnrIapK51l4bQyFKWWqmOaeBSpkIlq2asetWcp24gs,6057
+ evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=QYZZuxbjkKxAjxuoWn0M5WgusO55vzeAcyKnWUMow3M,5871
  evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
- evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=YK4u3JG_Ub4vP-xnsrf-lMheIBdCgFWmirhPUch3biU,5120
+ evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=UOjakV31J0g7TYbrRls0ItcopWOJu54ucPfaqSJB7Os,5250
  evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=R7MILWuMglvXr7yWioBxyJ2T4EdEkwRZ1lnvWqZqG28,1922
  evalscope/benchmarks/ifeval/instructions.py,sha256=oaJ9D_4rvS67BraHBNPpDtFd4TblFAnR4A3YW9HWfuY,56304
@@ -130,11 +132,20 @@ evalscope/benchmarks/ifeval/instructions_util.py,sha256=vkemXeylJMmgW8LgfQe4cSy2
  evalscope/benchmarks/ifeval/utils.py,sha256=TKrM1m2qDCUauahogItDdICf4mDk0OjasSxgnxjt2KY,4517
  evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=16whmFkJt9fLbei9d-kmjnWB_5y5vsiX9tK5kSuxDw8,2449
+ evalscope/benchmarks/live_code_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ evalscope/benchmarks/live_code_bench/evaluate_utils.py,sha256=rOWaG8PV4AGIRhS_gqwxEhphEVe1Cqg57Eudwm5HTjI,6820
+ evalscope/benchmarks/live_code_bench/execute_utils.py,sha256=MreaMLI0IicNZawpfqcyoRLt67EZ3CJvmxxRTYwhAbU,7397
+ evalscope/benchmarks/live_code_bench/extract_utils.py,sha256=ZcQ8y741uawPo6I_1_XglR3eqJFDNrqc8fILKZupVRs,2375
+ evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=8MOECcweLG465JFgUzP20QlKyBAO90oFHhH7Z77FuUY,3521
+ evalscope/benchmarks/live_code_bench/load_utils.py,sha256=5i9wtdPLYR8ckjx5MaYQVC2LFYvjKzR6Fa6UZmeOTRc,2445
+ evalscope/benchmarks/live_code_bench/pass_k_utils.py,sha256=Ktrp_lXdfFzoHtQNQNdGfIl26ySjaPCHm4Zv-dFvRqM,2024
+ evalscope/benchmarks/live_code_bench/prompts.py,sha256=P4KILIAIDT1MKDck0xHYV_6v9820wDZRhxVMazmlL-g,12600
+ evalscope/benchmarks/live_code_bench/testing_util.py,sha256=EBe0XzY3B4cW5dCjwLksW7o4R1chZwsuFjxkfqVPFI4,28238
  evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  evalscope/benchmarks/math_500/math_500_adapter.py,sha256=SB2eb4Z7DTXdptqirEoctqTdDLEu28s7bLeCAMBmAFo,1923
  evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
- evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=RMZoHAApVOpD3_NeHLcsiM7SpglKpfrGSUhBWPgdAVE,11525
+ evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=e__Evar99V9l65FlzT6T594CN4iMgmuVhjujQAm4po4,11662
  evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
  evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=OANfue-fK543drJrDj6V_zDMtySrQEBHPgTsejr-e7U,4226
@@ -148,11 +159,11 @@ evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO
  evalscope/benchmarks/race/race_adapter.py,sha256=RD0B-i5dzeNKuhqnWbremgf4tk9jmOO4_eLAiITB1F0,6381
  evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
  evalscope/benchmarks/simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=FZwXN78X2fV3Dchop_UuFAhNFkwWs12qJlIczgvvrJ8,477
+ evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=SrK18xDe4HyUaIPRLVEDtoF4Nc_ms4aFxktEsj8MnnA,9071
  evalscope/benchmarks/super_gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  evalscope/benchmarks/super_gpqa/five_shot_prompt.txt,sha256=vD3RMeQustxY_oWA8IobntjywT8ZUO7Jaub--rElDT4,4718
  evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=BqNLL8BYnK6tRuIdV6ijL4Uym2SejH_h1BV06XNjSE4,9331
- evalscope/benchmarks/super_gpqa/utils.py,sha256=uhANVnoIaH8-QuzjcVuyVB-8aGOMy94XKUF-TFemY_Q,3578
+ evalscope/benchmarks/super_gpqa/utils.py,sha256=ftYPP9ODvLBlQSd9ltACx9iRIvjB8u1bg4AtgcJ4JAI,3360
  evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt,sha256=y7hR9SmoR_YqoEWtT8N9JpZOpeJIlg0cDGDgYw6R6hM,237
  evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
@@ -169,16 +180,17 @@ evalscope/cli/start_eval.py,sha256=MXhVDeaMFd6ny88-gnVtQflH660UaDj240YGYnHccx8,7
  evalscope/cli/start_perf.py,sha256=5hLi5jWgM9BJPXLd8d9D1zqrcj_5c0KvkfB1DgD4_RU,831
  evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
  evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
- evalscope/collections/evaluator.py,sha256=Zi3uRZhSRIimYye_apZWL6VOiHqaM5znbFA4TBvqSbg,12761
+ evalscope/collections/evaluator.py,sha256=okP4_a5vuM-Z0O_4ntauuyn2NeH228JUo_YrbrTqKPM,12741
  evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
  evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
  evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
- evalscope/evaluator/evaluator.py,sha256=VIiw1eI46UOsFWNd7schD4ah_Q5ll0crl2sRmGIRmig,17649
+ evalscope/evaluator/evaluator.py,sha256=yj7ds5WMYqQcRw3B3x11-cajl4DmWsLM_3kO1n2k7OE,19734
  evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
  evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  evalscope/evaluator/reviewer/auto_reviewer.py,sha256=PDz1Nt_qq0oGZBBwek2-M8VBUNLkOkmYObzR8gO7nEc,16624
  evalscope/metrics/__init__.py,sha256=SWvqzUzdryW5URz6u4fPkP9XSyA09nQ8zBeE8BbchSg,349
  evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
+ evalscope/metrics/llm_judge.py,sha256=g9pLMJPNTUyw0sGteblws1_e_KzbRqcbqKcaIzfE_DE,4031
  evalscope/metrics/math_parser.py,sha256=uTDudn305G3b8-GboWTrDE6OfrEwAW-areHnoGXZ6Is,17302
  evalscope/metrics/metrics.py,sha256=_YI7RhxlFu_JOgeE3LF9UKu6mJruvyu4FgqVf78Bjb8,13813
  evalscope/metrics/named_metrics.py,sha256=pSHA2_qdi9B5bDHIh08GYhx63odilSwA_T-95K1Usl0,1380
@@ -201,9 +213,9 @@ evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAd
  evalscope/models/custom/dummy_model.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
  evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  evalscope/perf/arguments.py,sha256=u3GNdnOBmiEirtgJLspsLO7qBwHeWLoXd4vlt69jJ-g,9717
- evalscope/perf/benchmark.py,sha256=qNgDNseW8N0beuAB_4-JVtTdHs7ZaJEHK5XnkMU9vRU,9618
- evalscope/perf/http_client.py,sha256=eoRPaBTCVC4DpgH4tnc-31_h_2PVkWUwCLWK6_TTkhM,7282
- evalscope/perf/main.py,sha256=SUMz8S2XPL8JaSL1-vy8qkrb34d5vp6DfQdwIGOUXTk,1277
+ evalscope/perf/benchmark.py,sha256=hKN-Nu-x-VTswHP0M6PT3jvduWxN7AJpz34DBrUcafQ,9734
+ evalscope/perf/http_client.py,sha256=xMakdQkJ2cgIOd-yOmHEW0vbGKTJ0JWhLFt9IFtUP8Q,7473
+ evalscope/perf/main.py,sha256=aZUrfbz-Pl2xe8AgUL_6rW6n8dX4YAToDw5xPpLtbI4,1278
  evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
  evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
  evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
@@ -257,7 +269,7 @@ evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u
  evalscope/third_party/longbench_write/default_task.json,sha256=d_NPShtW10Mc02U3pAuxX9hXd09tZw7QJAr1SvrECcM,694
  evalscope/third_party/longbench_write/default_task.yaml,sha256=YjU8EeyH9UtM8e7_fhrwJNChQdszOAcrKmOi--Awvhk,578
  evalscope/third_party/longbench_write/eval.py,sha256=39McZSDHL7bA5Dg-BSyZ4EiAF1nfTiYJAnx5FqbNYok,11265
- evalscope/third_party/longbench_write/infer.py,sha256=bFsOp--8Qn6qQ-NpdLY0bennQGQl5TMGEngvGda8k7g,4937
+ evalscope/third_party/longbench_write/infer.py,sha256=32t90zTll6SXH7Wx8QnRFMs6ZUwvpbgYNuawCByzwR0,4971
  evalscope/third_party/longbench_write/longbench_write.py,sha256=nIR1toB1hvUXR7Lrs3xcY9wqaI-bjeADg_Oscf3HdaY,3991
  evalscope/third_party/longbench_write/utils.py,sha256=nd-YslsOyNGAuyBfAWb2pnTMaGLMQ58lbnJJdrCndeI,815
  evalscope/third_party/longbench_write/resources/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
@@ -298,8 +310,9 @@ evalscope/utils/utils.py,sha256=lGvn94ryIzx-7WLNJeuyehNTmINt0jYIjrjW12woPCs,9730
  tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
  tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
- tests/cli/test_collection.py,sha256=-CrcAiZVtsY7mXUNVlRjhFWEgmPL5k1dH9PjNhKzYdU,3028
- tests/cli/test_run.py,sha256=flwZZ1PyMnrxy5f36mdUeGSO_ANpr2588dw1zHVQYJY,12735
+ tests/cli/test_all.py,sha256=1wwXtdjBmWYLhs5TXOJhZBwPm2qd9FYFqQSemXWKNUs,3865
+ tests/cli/test_collection.py,sha256=V-_M7ngwekMGqPuI16jjJZyAK2XLE4Z6QTn-8B5ykgU,4071
+ tests/cli/test_run.py,sha256=LKWWxT0jaMLtcIl57vnXEFFlzbJpAplFqqwinvAHN8Y,15047
  tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  tests/perf/test_perf.py,sha256=iB8Mg565SfwPsObdAByHYfZNqN71kUtPW7ucmyiOWo8,3025
  tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -312,9 +325,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
  tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
  tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
  tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
- evalscope-0.12.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
- evalscope-0.12.1.dist-info/METADATA,sha256=jdU1I5E3YNc8PLfY0NYYDTKiXzTE4HYtX5J6OUPkQ_s,31337
- evalscope-0.12.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
- evalscope-0.12.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
- evalscope-0.12.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
- evalscope-0.12.1.dist-info/RECORD,,
+ evalscope-0.13.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
+ evalscope-0.13.0.dist-info/METADATA,sha256=0i3SENci2ws_vqdewQAxVUqan-MV1LwJoLLcEZ8ML7w,32870
+ evalscope-0.13.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+ evalscope-0.13.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
+ evalscope-0.13.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
+ evalscope-0.13.0.dist-info/RECORD,,
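For anyone spot-checking the RECORD entries above, each line stores the file path, the urlsafe-base64-encoded SHA-256 digest (without '=' padding), and the file size in bytes. A small sketch for recomputing an entry from an unpacked wheel (the file path is just an example):

import base64
import hashlib

def record_entry(path: str) -> str:
    # Rebuild a "<path>,sha256=<digest>,<size>" line in the wheel RECORD format.
    data = open(path, 'rb').read()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b'=').decode()
    return f'{path},sha256={digest},{len(data)}'

# Compare against the corresponding "+" line in the RECORD diff above.
print(record_entry('evalscope/version.py'))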
tests/cli/test_all.py ADDED
@@ -0,0 +1,144 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from dotenv import dotenv_values
+
+ env = dotenv_values('.env')
+
+ import os
+ import subprocess
+ import unittest
+
+ from evalscope.config import TaskConfig
+ from evalscope.constants import EvalType, JudgeStrategy, OutputType
+ from evalscope.run import run_task
+ from evalscope.utils import is_module_installed, test_level_list
+ from evalscope.utils.logger import get_logger
+
+ os.environ['LOG_LEVEL'] = 'DEBUG'
+
+ logger = get_logger()
+
+ datasets=[
+     # 'iquiz',
+     # 'ifeval',
+     # 'mmlu',
+     # 'mmlu_pro',
+     # 'musr',
+     # 'process_bench',
+     # 'race',
+     # 'trivia_qa',
+     # 'cmmlu',
+     # 'humaneval',
+     # 'gsm8k',
+     # 'bbh',
+     # 'competition_math',
+     # 'math_500',
+     # 'aime24',
+     # 'gpqa',
+     # 'arc',
+     # 'ceval',
+     # 'hellaswag',
+     # 'general_mcq',
+     # 'general_qa',
+     'super_gpqa',
+     'live_code_bench',
+     'simple_qa',
+     'chinese_simpleqa',
+ ]
+
+ dataset_args={
+     'mmlu': {
+         'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
+         'few_shot_num': 0
+     },
+     'mmlu_pro': {
+         'subset_list': ['math', 'health'],
+         'few_shot_num': 4
+     },
+     'ceval': {
+         'subset_list': [
+             'computer_network', 'operating_system', 'computer_architecture'
+         ],
+         'few_shot_num': 0
+     },
+     'cmmlu': {
+         'subset_list': ['elementary_chinese'],
+         'few_shot_num': 0
+     },
+     'bbh': {
+         'subset_list': ['word_sorting', 'movie_recommendation'],
+     },
+     'gpqa': {
+         'subset_list': ['gpqa_diamond'],
+         'few_shot_num': 0,
+     },
+     'humaneval': {
+         'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
+     },
+     'competition_math': {
+         'subset_list': ['Level 1']
+     },
+     'math_500': {
+         'subset_list': ['Level 1']
+     },
+     'process_bench': {
+         'subset_list': ['gsm8k'],
+     },
+     'musr': {
+         'subset_list': ['murder_mysteries']
+     },
+     'general_mcq': {
+         'local_path': 'custom_eval/text/mcq',  # path to the custom dataset
+         'subset_list': [
+             'example'  # evaluation dataset name: the * in the *_dev.csv files above
+         ],
+     },
+     'general_qa': {
+         'local_path': 'custom_eval/text/qa',  # path to the custom dataset
+         'subset_list': [
+             'example',  # evaluation dataset name: the * in the *_dev.csv files above
+             # 'test'
+         ],
+         'metric_list': ['AverageBLEU']
+     },
+     'super_gpqa': {
+         'subset_list': ['Philosophy', 'Education'],
+         'few_shot_num': 0
+     },
+     'live_code_bench': {
+         'subset_list': ['v4_v5'],
+         'extra_params': {
+             'start_date': '2024-12-01',
+             'end_date': '2025-01-01'
+         },
+     }
+ }
+
+ class TestRun(unittest.TestCase):
+     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+     def test_benchmarks(self):
+         from evalscope.config import TaskConfig
+
+         task_cfg = TaskConfig(
+             model='qwen2.5-7b-instruct',
+             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+             api_key= env.get('DASHSCOPE_API_KEY'),
+             eval_type=EvalType.SERVICE,
+             datasets=datasets,
+             dataset_args=dataset_args,
+             eval_batch_size=32,
+             limit=2,
+             stream=True,
+             generation_config={
+                 'temperature': 0,
+                 'n': 1,
+                 'max_tokens': 4096,
+             },
+             judge_strategy=JudgeStrategy.AUTO,
+             judge_model_args={
+                 'model_id': 'qwen2.5-7b-instruct',
+                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                 'api_key': env.get('DASHSCOPE_API_KEY'),
+             }
+         )
+
+         run_task(task_cfg=task_cfg)
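Note that the new test module reads DASHSCOPE_API_KEY from a local .env file via python-dotenv (dotenv_values('.env')), so running it requires that file (or the variable exported in the environment) plus access to the DashScope-compatible endpoint it targets.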
tests/cli/test_collection.py CHANGED
@@ -1,8 +1,9 @@
  import json
+ import os
  import unittest

  from evalscope.collections import CollectionSchema, DatasetInfo, WeightedSampler
- from evalscope.constants import EvalType
+ from evalscope.constants import EvalType, JudgeStrategy
  from evalscope.utils.io_utils import dump_jsonl_data
  from evalscope.utils.utils import test_level_list

@@ -55,3 +56,28 @@ class TestCollection(unittest.TestCase):
              }},
          )
          run_task(task_cfg=task_cfg)
+
+
+     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+     def test_evaluate_collection_with_judge(self):
+         from evalscope import TaskConfig, run_task
+
+         task_cfg = TaskConfig(
+             model='qwen2.5-7b-instruct',
+             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+             api_key= os.getenv('DASHSCOPE_API_KEY'),
+             eval_type=EvalType.SERVICE,
+             datasets=['data_collection'],
+             dataset_args={'data_collection': {
+                 'local_path': 'outputs/mixed_data_test.jsonl'
+                 # 'local_path': 'outputs/weighted_mixed_data.jsonl'
+             }},
+             limit=10,
+             judge_strategy=JudgeStrategy.LLM_RECALL,
+             judge_model_args={
+                 'model_id': 'qwen2.5-7b-instruct',
+                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                 'api_key': os.getenv('DASHSCOPE_API_KEY'),
+             }
+         )
+         run_task(task_cfg=task_cfg)
tests/cli/test_run.py CHANGED
@@ -1,11 +1,14 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
+ from dotenv import dotenv_values
+
+ env = dotenv_values('.env')
+
  import os
  import subprocess
- import torch
  import unittest

  from evalscope.config import TaskConfig
- from evalscope.constants import EvalType, OutputType
+ from evalscope.constants import EvalType, JudgeStrategy, OutputType
  from evalscope.run import run_task
  from evalscope.utils import is_module_installed, test_level_list
  from evalscope.utils.logger import get_logger
@@ -225,9 +228,9 @@
          from evalscope.config import TaskConfig

          task_cfg = TaskConfig(
-             model='Qwen2.5-0.5B-Instruct',
-             api_url='http://127.0.0.1:8801/v1',
-             api_key='EMPTY',
+             model='qwen2.5-7b-instruct',
+             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+             api_key= env.get('DASHSCOPE_API_KEY'),
              eval_type=EvalType.SERVICE,
              datasets=[
                  # 'iquiz',
@@ -245,17 +248,17 @@
                  # 'competition_math',
                  # 'math_500',
                  # 'aime24',
-                 'gpqa',
+                 # 'gpqa',
                  # 'arc',
-                 'ceval',
-                 # 'hellaswag',
+                 # 'ceval',
+                 'hellaswag',
                  # 'general_mcq',
                  # 'general_qa'
                  # 'super_gpqa',
              ],
              dataset_args={
                  'mmlu': {
-                     'subset_list': ['elementary_mathematics'],
+                     'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
                      'few_shot_num': 0
                  },
                  'mmlu_pro': {
@@ -313,7 +316,7 @@
                  }
              },
              eval_batch_size=32,
-             limit=10,
+             limit=15,
              # debug=True,
              stream=False,
              generation_config={
@@ -357,5 +360,64 @@

          run_task(task_cfg=task_cfg)

+     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
+     def test_run_judge_model(self):
+         from evalscope.config import TaskConfig
+
+         task_cfg = TaskConfig(
+             model='qwen2.5-7b-instruct',
+             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+             api_key= env.get('DASHSCOPE_API_KEY'),
+             eval_type=EvalType.SERVICE,
+             datasets=[
+                 # 'math_500',
+                 'aime24',
+                 # 'competition_math',
+                 # 'arc',
+                 # 'gsm8k'
+                 # 'truthful_qa',
+                 # 'simple_qa',
+                 # # 'chinese_simpleqa',
+                 # 'live_code_bench',
+                 # 'humaneval'
+                 # 'general_qa'
+             ],
+             dataset_args={
+                 'competition_math': {
+                     'subset_list': ['Level 4']
+                 },
+                 'live_code_bench': {
+                     'subset_list': ['v4_v5'],
+                     'extra_params': {
+                         'start_date': '2024-12-01',
+                         'end_date': '2025-01-01'
+                     },
+                     'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/code_generation_lite'
+                 },
+                 'general_qa': {
+                     'local_path': 'custom_eval/text/qa',  # path to the custom dataset
+                     'subset_list': [
+                         'example',  # evaluation dataset name: the * in the *_dev.csv files above
+                         # 'test'
+                     ]
+                 },
+             },
+             eval_batch_size=5,
+             limit=5,
+             judge_strategy=JudgeStrategy.AUTO,
+             judge_model_args={
+                 'model_id': 'qwen2.5-7b-instruct',
+                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                 'api_key': env.get('DASHSCOPE_API_KEY'),
+             },
+             generation_config={
+                 'max_new_tokens': 2048,
+                 'temperature': 0.0,
+                 'seed': 42,
+             }
+         )
+
+         run_task(task_cfg=task_cfg)
+
  if __name__ == '__main__':
      unittest.main()