evalscope 0.12.1__py3-none-any.whl → 0.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/arguments.py +6 -1
- evalscope/benchmarks/arc/arc_adapter.py +3 -3
- evalscope/benchmarks/benchmark.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +2 -1
- evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +168 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +2 -1
- evalscope/benchmarks/data_adapter.py +32 -4
- evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -4
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +20 -24
- evalscope/benchmarks/humaneval/humaneval_adapter.py +8 -5
- evalscope/benchmarks/live_code_bench/__init__.py +0 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +193 -0
- evalscope/benchmarks/live_code_bench/execute_utils.py +267 -0
- evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +90 -0
- evalscope/benchmarks/live_code_bench/load_utils.py +71 -0
- evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
- evalscope/benchmarks/live_code_bench/prompts.py +207 -0
- evalscope/benchmarks/live_code_bench/testing_util.py +721 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -2
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +148 -1
- evalscope/benchmarks/super_gpqa/utils.py +0 -5
- evalscope/collections/evaluator.py +3 -3
- evalscope/config.py +6 -1
- evalscope/constants.py +7 -0
- evalscope/evaluator/evaluator.py +51 -13
- evalscope/metrics/llm_judge.py +104 -0
- evalscope/perf/benchmark.py +5 -0
- evalscope/perf/http_client.py +9 -1
- evalscope/perf/main.py +1 -0
- evalscope/run.py +1 -1
- evalscope/third_party/longbench_write/infer.py +1 -1
- evalscope/version.py +2 -2
- {evalscope-0.12.1.dist-info → evalscope-0.13.0.dist-info}/METADATA +25 -10
- {evalscope-0.12.1.dist-info → evalscope-0.13.0.dist-info}/RECORD +43 -30
- tests/cli/test_all.py +144 -0
- tests/cli/test_collection.py +27 -1
- tests/cli/test_run.py +72 -10
- {evalscope-0.12.1.dist-info → evalscope-0.13.0.dist-info}/LICENSE +0 -0
- {evalscope-0.12.1.dist-info → evalscope-0.13.0.dist-info}/WHEEL +0 -0
- {evalscope-0.12.1.dist-info → evalscope-0.13.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.12.1.dist-info → evalscope-0.13.0.dist-info}/top_level.txt +0 -0
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
|
|
2
|
-
evalscope/arguments.py,sha256=
|
|
3
|
-
evalscope/config.py,sha256=
|
|
4
|
-
evalscope/constants.py,sha256=
|
|
5
|
-
evalscope/run.py,sha256=
|
|
2
|
+
evalscope/arguments.py,sha256=VhZd7a8PoZK01qFCMEADLINqLYi6njRqRb50iR1l1lo,5241
|
|
3
|
+
evalscope/config.py,sha256=9bMV7wf8pM7N5dEj_kJsCq6oM8xobzQDYh0NF8h-j1I,9313
|
|
4
|
+
evalscope/constants.py,sha256=ydS8oihksGnvvzvJZw7HGhEeeccHNpJxspB81gAv29Y,3720
|
|
5
|
+
evalscope/run.py,sha256=Udz-H503UaMYos0ic3A_npXIbnd4eJLx26q5UEahF-U,5797
|
|
6
6
|
evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
|
|
7
7
|
evalscope/summarizer.py,sha256=Wnt8Y61PasOkigo8zMpi1qdFzfETjfnDtCDDQ6VwgDw,5867
|
|
8
|
-
evalscope/version.py,sha256=
|
|
8
|
+
evalscope/version.py,sha256=a1r1BkZoSpoA_eGXZoXm6WaLayRHhF__TgvE9xG-Whs,119
|
|
9
9
|
evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
10
|
evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
|
|
11
11
|
evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
|
|
@@ -56,15 +56,15 @@ evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4ty
|
|
|
56
56
|
evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ZB0wYzPBPyIM0zjfp5C71GcjAgKxKoWxF7RrhuDQBYM,5931
|
|
57
57
|
evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
|
|
58
58
|
evalscope/benchmarks/__init__.py,sha256=b_SWdV1ZyOqFiwc_9lIjKrIvK1rwnF2cCIF7XN9CN8E,932
|
|
59
|
-
evalscope/benchmarks/benchmark.py,sha256=
|
|
60
|
-
evalscope/benchmarks/data_adapter.py,sha256=
|
|
59
|
+
evalscope/benchmarks/benchmark.py,sha256=a_7Ctz36McuTyBSTYi56jis9pvOdWhg7JVSPFrbxqR4,2535
|
|
60
|
+
evalscope/benchmarks/data_adapter.py,sha256=2u9oC4RBHVfEMHKPRu87xM4XOw_RS2Z2fvagNsciEo4,16791
|
|
61
61
|
evalscope/benchmarks/utils.py,sha256=6kxeBz4w8Fw68AYH05a4ncjgkaUV4bU3eaFVLqOdkMI,1321
|
|
62
62
|
evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
63
63
|
evalscope/benchmarks/aime/aime24_adapter.py,sha256=dBm9yukt4-CByEPUlAPAIN6mL3VkZcI-dw2kz4oQBMo,1715
|
|
64
64
|
evalscope/benchmarks/aime/aime25_adapter.py,sha256=FB_NufY2V7uYdxVnrY_4y81gyyfYDnvedz1_zHdDWt4,1709
|
|
65
65
|
evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
66
66
|
evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
|
|
67
|
-
evalscope/benchmarks/arc/arc_adapter.py,sha256=
|
|
67
|
+
evalscope/benchmarks/arc/arc_adapter.py,sha256=lkhDz-DYjPQ1vHzo8X4j-0Lq_rBxAnws35_R00pIbNI,6347
|
|
68
68
|
evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
69
69
|
evalscope/benchmarks/bbh/bbh_adapter.py,sha256=fROpzenrjpEBWtnvM_RL_m0uXPOhXTtYAglJEZbzUdY,8330
|
|
70
70
|
evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
|
|
@@ -95,11 +95,13 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt
|
|
|
95
95
|
evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
|
|
96
96
|
evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
|
|
97
97
|
evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
98
|
-
evalscope/benchmarks/ceval/ceval_adapter.py,sha256=
|
|
98
|
+
evalscope/benchmarks/ceval/ceval_adapter.py,sha256=E4QobCjSSkMZtPJyaT_XBVxiqEqa1bta1I9aFnaHOqs,11308
|
|
99
99
|
evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
|
|
100
|
+
evalscope/benchmarks/chinese_simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
101
|
+
evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=nKF_a0yc_PbZYjYA_-gJh3ePZIEz5txrhDV4IsTqD4Q,8196
|
|
100
102
|
evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
101
103
|
evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
|
|
102
|
-
evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=
|
|
104
|
+
evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=TTq2jRz46Hqc_D_ZBaiw_OwKub1FZX6w8C7g7COIdGs,10372
|
|
103
105
|
evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
|
|
104
106
|
evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
105
107
|
evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
|
|
@@ -109,7 +111,7 @@ evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=U4M-0MVJS
|
|
|
109
111
|
evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
110
112
|
evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=ZVGzUuuQ0UTOqQtXE40ZyBeMOSl8saSiFEQ5_siJ-c8,5052
|
|
111
113
|
evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
112
|
-
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=
|
|
114
|
+
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=ELDdS5T3JZeSWVv1ldawcHzLwAljEWKqakbRMVcBvgw,4741
|
|
113
115
|
evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
114
116
|
evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
|
|
115
117
|
evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=UB287DtnbkSQXZsbReFJqmQRwbo672DTCeXXilR_-Vc,4790
|
|
@@ -118,10 +120,10 @@ evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTv
|
|
|
118
120
|
evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=ZZZ-9oja53IwiU33Kjm7NTk4MbFGWyvonhnHrn_3Na8,10557
|
|
119
121
|
evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
120
122
|
evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
|
|
121
|
-
evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=
|
|
123
|
+
evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=QYZZuxbjkKxAjxuoWn0M5WgusO55vzeAcyKnWUMow3M,5871
|
|
122
124
|
evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
123
125
|
evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
|
|
124
|
-
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=
|
|
126
|
+
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=UOjakV31J0g7TYbrRls0ItcopWOJu54ucPfaqSJB7Os,5250
|
|
125
127
|
evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
126
128
|
evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=R7MILWuMglvXr7yWioBxyJ2T4EdEkwRZ1lnvWqZqG28,1922
|
|
127
129
|
evalscope/benchmarks/ifeval/instructions.py,sha256=oaJ9D_4rvS67BraHBNPpDtFd4TblFAnR4A3YW9HWfuY,56304
|
|
@@ -130,11 +132,20 @@ evalscope/benchmarks/ifeval/instructions_util.py,sha256=vkemXeylJMmgW8LgfQe4cSy2
|
|
|
130
132
|
evalscope/benchmarks/ifeval/utils.py,sha256=TKrM1m2qDCUauahogItDdICf4mDk0OjasSxgnxjt2KY,4517
|
|
131
133
|
evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
132
134
|
evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=16whmFkJt9fLbei9d-kmjnWB_5y5vsiX9tK5kSuxDw8,2449
|
|
135
|
+
evalscope/benchmarks/live_code_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
136
|
+
evalscope/benchmarks/live_code_bench/evaluate_utils.py,sha256=rOWaG8PV4AGIRhS_gqwxEhphEVe1Cqg57Eudwm5HTjI,6820
|
|
137
|
+
evalscope/benchmarks/live_code_bench/execute_utils.py,sha256=MreaMLI0IicNZawpfqcyoRLt67EZ3CJvmxxRTYwhAbU,7397
|
|
138
|
+
evalscope/benchmarks/live_code_bench/extract_utils.py,sha256=ZcQ8y741uawPo6I_1_XglR3eqJFDNrqc8fILKZupVRs,2375
|
|
139
|
+
evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=8MOECcweLG465JFgUzP20QlKyBAO90oFHhH7Z77FuUY,3521
|
|
140
|
+
evalscope/benchmarks/live_code_bench/load_utils.py,sha256=5i9wtdPLYR8ckjx5MaYQVC2LFYvjKzR6Fa6UZmeOTRc,2445
|
|
141
|
+
evalscope/benchmarks/live_code_bench/pass_k_utils.py,sha256=Ktrp_lXdfFzoHtQNQNdGfIl26ySjaPCHm4Zv-dFvRqM,2024
|
|
142
|
+
evalscope/benchmarks/live_code_bench/prompts.py,sha256=P4KILIAIDT1MKDck0xHYV_6v9820wDZRhxVMazmlL-g,12600
|
|
143
|
+
evalscope/benchmarks/live_code_bench/testing_util.py,sha256=EBe0XzY3B4cW5dCjwLksW7o4R1chZwsuFjxkfqVPFI4,28238
|
|
133
144
|
evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
134
145
|
evalscope/benchmarks/math_500/math_500_adapter.py,sha256=SB2eb4Z7DTXdptqirEoctqTdDLEu28s7bLeCAMBmAFo,1923
|
|
135
146
|
evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
136
147
|
evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
|
|
137
|
-
evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=
|
|
148
|
+
evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=e__Evar99V9l65FlzT6T594CN4iMgmuVhjujQAm4po4,11662
|
|
138
149
|
evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
|
|
139
150
|
evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
140
151
|
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=OANfue-fK543drJrDj6V_zDMtySrQEBHPgTsejr-e7U,4226
|
|
@@ -148,11 +159,11 @@ evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO
|
|
|
148
159
|
evalscope/benchmarks/race/race_adapter.py,sha256=RD0B-i5dzeNKuhqnWbremgf4tk9jmOO4_eLAiITB1F0,6381
|
|
149
160
|
evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
|
|
150
161
|
evalscope/benchmarks/simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
151
|
-
evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=
|
|
162
|
+
evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=SrK18xDe4HyUaIPRLVEDtoF4Nc_ms4aFxktEsj8MnnA,9071
|
|
152
163
|
evalscope/benchmarks/super_gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
153
164
|
evalscope/benchmarks/super_gpqa/five_shot_prompt.txt,sha256=vD3RMeQustxY_oWA8IobntjywT8ZUO7Jaub--rElDT4,4718
|
|
154
165
|
evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=BqNLL8BYnK6tRuIdV6ijL4Uym2SejH_h1BV06XNjSE4,9331
|
|
155
|
-
evalscope/benchmarks/super_gpqa/utils.py,sha256=
|
|
166
|
+
evalscope/benchmarks/super_gpqa/utils.py,sha256=ftYPP9ODvLBlQSd9ltACx9iRIvjB8u1bg4AtgcJ4JAI,3360
|
|
156
167
|
evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt,sha256=y7hR9SmoR_YqoEWtT8N9JpZOpeJIlg0cDGDgYw6R6hM,237
|
|
157
168
|
evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
158
169
|
evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
|
|
@@ -169,16 +180,17 @@ evalscope/cli/start_eval.py,sha256=MXhVDeaMFd6ny88-gnVtQflH660UaDj240YGYnHccx8,7
|
|
|
169
180
|
evalscope/cli/start_perf.py,sha256=5hLi5jWgM9BJPXLd8d9D1zqrcj_5c0KvkfB1DgD4_RU,831
|
|
170
181
|
evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
|
|
171
182
|
evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
|
|
172
|
-
evalscope/collections/evaluator.py,sha256=
|
|
183
|
+
evalscope/collections/evaluator.py,sha256=okP4_a5vuM-Z0O_4ntauuyn2NeH228JUo_YrbrTqKPM,12741
|
|
173
184
|
evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
|
|
174
185
|
evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
|
|
175
186
|
evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
|
|
176
|
-
evalscope/evaluator/evaluator.py,sha256=
|
|
187
|
+
evalscope/evaluator/evaluator.py,sha256=yj7ds5WMYqQcRw3B3x11-cajl4DmWsLM_3kO1n2k7OE,19734
|
|
177
188
|
evalscope/evaluator/rating_eval.py,sha256=uo0uj9z_TDsxdYlT8WIfNZhFLAfRkW9zn_wlu-F72O0,5575
|
|
178
189
|
evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
179
190
|
evalscope/evaluator/reviewer/auto_reviewer.py,sha256=PDz1Nt_qq0oGZBBwek2-M8VBUNLkOkmYObzR8gO7nEc,16624
|
|
180
191
|
evalscope/metrics/__init__.py,sha256=SWvqzUzdryW5URz6u4fPkP9XSyA09nQ8zBeE8BbchSg,349
|
|
181
192
|
evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
|
|
193
|
+
evalscope/metrics/llm_judge.py,sha256=g9pLMJPNTUyw0sGteblws1_e_KzbRqcbqKcaIzfE_DE,4031
|
|
182
194
|
evalscope/metrics/math_parser.py,sha256=uTDudn305G3b8-GboWTrDE6OfrEwAW-areHnoGXZ6Is,17302
|
|
183
195
|
evalscope/metrics/metrics.py,sha256=_YI7RhxlFu_JOgeE3LF9UKu6mJruvyu4FgqVf78Bjb8,13813
|
|
184
196
|
evalscope/metrics/named_metrics.py,sha256=pSHA2_qdi9B5bDHIh08GYhx63odilSwA_T-95K1Usl0,1380
|
|
@@ -201,9 +213,9 @@ evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAd
|
|
|
201
213
|
evalscope/models/custom/dummy_model.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
|
|
202
214
|
evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
203
215
|
evalscope/perf/arguments.py,sha256=u3GNdnOBmiEirtgJLspsLO7qBwHeWLoXd4vlt69jJ-g,9717
|
|
204
|
-
evalscope/perf/benchmark.py,sha256=
|
|
205
|
-
evalscope/perf/http_client.py,sha256=
|
|
206
|
-
evalscope/perf/main.py,sha256=
|
|
216
|
+
evalscope/perf/benchmark.py,sha256=hKN-Nu-x-VTswHP0M6PT3jvduWxN7AJpz34DBrUcafQ,9734
|
|
217
|
+
evalscope/perf/http_client.py,sha256=xMakdQkJ2cgIOd-yOmHEW0vbGKTJ0JWhLFt9IFtUP8Q,7473
|
|
218
|
+
evalscope/perf/main.py,sha256=aZUrfbz-Pl2xe8AgUL_6rW6n8dX4YAToDw5xPpLtbI4,1278
|
|
207
219
|
evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
|
|
208
220
|
evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
|
|
209
221
|
evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
|
|
@@ -257,7 +269,7 @@ evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u
|
|
|
257
269
|
evalscope/third_party/longbench_write/default_task.json,sha256=d_NPShtW10Mc02U3pAuxX9hXd09tZw7QJAr1SvrECcM,694
|
|
258
270
|
evalscope/third_party/longbench_write/default_task.yaml,sha256=YjU8EeyH9UtM8e7_fhrwJNChQdszOAcrKmOi--Awvhk,578
|
|
259
271
|
evalscope/third_party/longbench_write/eval.py,sha256=39McZSDHL7bA5Dg-BSyZ4EiAF1nfTiYJAnx5FqbNYok,11265
|
|
260
|
-
evalscope/third_party/longbench_write/infer.py,sha256=
|
|
272
|
+
evalscope/third_party/longbench_write/infer.py,sha256=32t90zTll6SXH7Wx8QnRFMs6ZUwvpbgYNuawCByzwR0,4971
|
|
261
273
|
evalscope/third_party/longbench_write/longbench_write.py,sha256=nIR1toB1hvUXR7Lrs3xcY9wqaI-bjeADg_Oscf3HdaY,3991
|
|
262
274
|
evalscope/third_party/longbench_write/utils.py,sha256=nd-YslsOyNGAuyBfAWb2pnTMaGLMQ58lbnJJdrCndeI,815
|
|
263
275
|
evalscope/third_party/longbench_write/resources/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
@@ -298,8 +310,9 @@ evalscope/utils/utils.py,sha256=lGvn94ryIzx-7WLNJeuyehNTmINt0jYIjrjW12woPCs,9730
|
|
|
298
310
|
tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
299
311
|
tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
|
|
300
312
|
tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
301
|
-
tests/cli/
|
|
302
|
-
tests/cli/
|
|
313
|
+
tests/cli/test_all.py,sha256=1wwXtdjBmWYLhs5TXOJhZBwPm2qd9FYFqQSemXWKNUs,3865
|
|
314
|
+
tests/cli/test_collection.py,sha256=V-_M7ngwekMGqPuI16jjJZyAK2XLE4Z6QTn-8B5ykgU,4071
|
|
315
|
+
tests/cli/test_run.py,sha256=LKWWxT0jaMLtcIl57vnXEFFlzbJpAplFqqwinvAHN8Y,15047
|
|
303
316
|
tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
304
317
|
tests/perf/test_perf.py,sha256=iB8Mg565SfwPsObdAByHYfZNqN71kUtPW7ucmyiOWo8,3025
|
|
305
318
|
tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -312,9 +325,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZU
|
|
|
312
325
|
tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
|
|
313
326
|
tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
314
327
|
tests/vlm/test_vlmeval.py,sha256=nzWXjw49SlxXgDnYS9N5JSFtcUp8xPOW2YNNzupvtt4,1806
|
|
315
|
-
evalscope-0.
|
|
316
|
-
evalscope-0.
|
|
317
|
-
evalscope-0.
|
|
318
|
-
evalscope-0.
|
|
319
|
-
evalscope-0.
|
|
320
|
-
evalscope-0.
|
|
328
|
+
evalscope-0.13.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
|
|
329
|
+
evalscope-0.13.0.dist-info/METADATA,sha256=0i3SENci2ws_vqdewQAxVUqan-MV1LwJoLLcEZ8ML7w,32870
|
|
330
|
+
evalscope-0.13.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
|
331
|
+
evalscope-0.13.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
|
|
332
|
+
evalscope-0.13.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
|
|
333
|
+
evalscope-0.13.0.dist-info/RECORD,,
|
tests/cli/test_all.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
|
+
from dotenv import dotenv_values
|
|
3
|
+
|
|
4
|
+
env = dotenv_values('.env')
|
|
5
|
+
|
|
6
|
+
import os
|
|
7
|
+
import subprocess
|
|
8
|
+
import unittest
|
|
9
|
+
|
|
10
|
+
from evalscope.config import TaskConfig
|
|
11
|
+
from evalscope.constants import EvalType, JudgeStrategy, OutputType
|
|
12
|
+
from evalscope.run import run_task
|
|
13
|
+
from evalscope.utils import is_module_installed, test_level_list
|
|
14
|
+
from evalscope.utils.logger import get_logger
|
|
15
|
+
|
|
16
|
+
os.environ['LOG_LEVEL'] = 'DEBUG'
|
|
17
|
+
|
|
18
|
+
logger = get_logger()
|
|
19
|
+
|
|
20
|
+
datasets=[
|
|
21
|
+
# 'iquiz',
|
|
22
|
+
# 'ifeval',
|
|
23
|
+
# 'mmlu',
|
|
24
|
+
# 'mmlu_pro',
|
|
25
|
+
# 'musr',
|
|
26
|
+
# 'process_bench',
|
|
27
|
+
# 'race',
|
|
28
|
+
# 'trivia_qa',
|
|
29
|
+
# 'cmmlu',
|
|
30
|
+
# 'humaneval',
|
|
31
|
+
# 'gsm8k',
|
|
32
|
+
# 'bbh',
|
|
33
|
+
# 'competition_math',
|
|
34
|
+
# 'math_500',
|
|
35
|
+
# 'aime24',
|
|
36
|
+
# 'gpqa',
|
|
37
|
+
# 'arc',
|
|
38
|
+
# 'ceval',
|
|
39
|
+
# 'hellaswag',
|
|
40
|
+
# 'general_mcq',
|
|
41
|
+
# 'general_qa',
|
|
42
|
+
'super_gpqa',
|
|
43
|
+
'live_code_bench',
|
|
44
|
+
'simple_qa',
|
|
45
|
+
'chinese_simpleqa',
|
|
46
|
+
]
|
|
47
|
+
|
|
48
|
+
dataset_args={
|
|
49
|
+
'mmlu': {
|
|
50
|
+
'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
|
|
51
|
+
'few_shot_num': 0
|
|
52
|
+
},
|
|
53
|
+
'mmlu_pro': {
|
|
54
|
+
'subset_list': ['math', 'health'],
|
|
55
|
+
'few_shot_num': 4
|
|
56
|
+
},
|
|
57
|
+
'ceval': {
|
|
58
|
+
'subset_list': [
|
|
59
|
+
'computer_network', 'operating_system', 'computer_architecture'
|
|
60
|
+
],
|
|
61
|
+
'few_shot_num': 0
|
|
62
|
+
},
|
|
63
|
+
'cmmlu': {
|
|
64
|
+
'subset_list': ['elementary_chinese'],
|
|
65
|
+
'few_shot_num': 0
|
|
66
|
+
},
|
|
67
|
+
'bbh': {
|
|
68
|
+
'subset_list': ['word_sorting', 'movie_recommendation'],
|
|
69
|
+
},
|
|
70
|
+
'gpqa': {
|
|
71
|
+
'subset_list': ['gpqa_diamond'],
|
|
72
|
+
'few_shot_num': 0,
|
|
73
|
+
},
|
|
74
|
+
'humaneval': {
|
|
75
|
+
'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
|
|
76
|
+
},
|
|
77
|
+
'competition_math': {
|
|
78
|
+
'subset_list': ['Level 1']
|
|
79
|
+
},
|
|
80
|
+
'math_500': {
|
|
81
|
+
'subset_list': ['Level 1']
|
|
82
|
+
},
|
|
83
|
+
'process_bench': {
|
|
84
|
+
'subset_list': ['gsm8k'],
|
|
85
|
+
},
|
|
86
|
+
'musr': {
|
|
87
|
+
'subset_list': ['murder_mysteries']
|
|
88
|
+
},
|
|
89
|
+
'general_mcq': {
|
|
90
|
+
'local_path': 'custom_eval/text/mcq', # 自定义数据集路径
|
|
91
|
+
'subset_list': [
|
|
92
|
+
'example' # 评测数据集名称,上述 *_dev.csv 中的 *
|
|
93
|
+
],
|
|
94
|
+
},
|
|
95
|
+
'general_qa': {
|
|
96
|
+
'local_path': 'custom_eval/text/qa', # 自定义数据集路径
|
|
97
|
+
'subset_list': [
|
|
98
|
+
'example', # 评测数据集名称,上述 *_dev.csv 中的 *
|
|
99
|
+
# 'test'
|
|
100
|
+
],
|
|
101
|
+
'metric_list': ['AverageBLEU']
|
|
102
|
+
},
|
|
103
|
+
'super_gpqa': {
|
|
104
|
+
'subset_list': ['Philosophy', 'Education'],
|
|
105
|
+
'few_shot_num': 0
|
|
106
|
+
},
|
|
107
|
+
'live_code_bench': {
|
|
108
|
+
'subset_list': ['v4_v5'],
|
|
109
|
+
'extra_params': {
|
|
110
|
+
'start_date': '2024-12-01',
|
|
111
|
+
'end_date': '2025-01-01'
|
|
112
|
+
},
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
class TestRun(unittest.TestCase):
|
|
117
|
+
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
118
|
+
def test_benchmarks(self):
|
|
119
|
+
from evalscope.config import TaskConfig
|
|
120
|
+
|
|
121
|
+
task_cfg = TaskConfig(
|
|
122
|
+
model='qwen2.5-7b-instruct',
|
|
123
|
+
api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
124
|
+
api_key= env.get('DASHSCOPE_API_KEY'),
|
|
125
|
+
eval_type=EvalType.SERVICE,
|
|
126
|
+
datasets=datasets,
|
|
127
|
+
dataset_args=dataset_args,
|
|
128
|
+
eval_batch_size=32,
|
|
129
|
+
limit=2,
|
|
130
|
+
stream=True,
|
|
131
|
+
generation_config={
|
|
132
|
+
'temperature': 0,
|
|
133
|
+
'n': 1,
|
|
134
|
+
'max_tokens': 4096,
|
|
135
|
+
},
|
|
136
|
+
judge_strategy=JudgeStrategy.AUTO,
|
|
137
|
+
judge_model_args={
|
|
138
|
+
'model_id': 'qwen2.5-7b-instruct',
|
|
139
|
+
'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
140
|
+
'api_key': env.get('DASHSCOPE_API_KEY'),
|
|
141
|
+
}
|
|
142
|
+
)
|
|
143
|
+
|
|
144
|
+
run_task(task_cfg=task_cfg)
|
tests/cli/test_collection.py
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
import json
|
|
2
|
+
import os
|
|
2
3
|
import unittest
|
|
3
4
|
|
|
4
5
|
from evalscope.collections import CollectionSchema, DatasetInfo, WeightedSampler
|
|
5
|
-
from evalscope.constants import EvalType
|
|
6
|
+
from evalscope.constants import EvalType, JudgeStrategy
|
|
6
7
|
from evalscope.utils.io_utils import dump_jsonl_data
|
|
7
8
|
from evalscope.utils.utils import test_level_list
|
|
8
9
|
|
|
@@ -55,3 +56,28 @@ class TestCollection(unittest.TestCase):
|
|
|
55
56
|
}},
|
|
56
57
|
)
|
|
57
58
|
run_task(task_cfg=task_cfg)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
62
|
+
def test_evaluate_collection_with_judge(self):
|
|
63
|
+
from evalscope import TaskConfig, run_task
|
|
64
|
+
|
|
65
|
+
task_cfg = TaskConfig(
|
|
66
|
+
model='qwen2.5-7b-instruct',
|
|
67
|
+
api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
68
|
+
api_key= os.getenv('DASHSCOPE_API_KEY'),
|
|
69
|
+
eval_type=EvalType.SERVICE,
|
|
70
|
+
datasets=['data_collection'],
|
|
71
|
+
dataset_args={'data_collection': {
|
|
72
|
+
'local_path': 'outputs/mixed_data_test.jsonl'
|
|
73
|
+
# 'local_path': 'outputs/weighted_mixed_data.jsonl'
|
|
74
|
+
}},
|
|
75
|
+
limit=10,
|
|
76
|
+
judge_strategy=JudgeStrategy.LLM_RECALL,
|
|
77
|
+
judge_model_args={
|
|
78
|
+
'model_id': 'qwen2.5-7b-instruct',
|
|
79
|
+
'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
80
|
+
'api_key': os.getenv('DASHSCOPE_API_KEY'),
|
|
81
|
+
}
|
|
82
|
+
)
|
|
83
|
+
run_task(task_cfg=task_cfg)
|
tests/cli/test_run.py
CHANGED
|
@@ -1,11 +1,14 @@
|
|
|
1
1
|
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
|
+
from dotenv import dotenv_values
|
|
3
|
+
|
|
4
|
+
env = dotenv_values('.env')
|
|
5
|
+
|
|
2
6
|
import os
|
|
3
7
|
import subprocess
|
|
4
|
-
import torch
|
|
5
8
|
import unittest
|
|
6
9
|
|
|
7
10
|
from evalscope.config import TaskConfig
|
|
8
|
-
from evalscope.constants import EvalType, OutputType
|
|
11
|
+
from evalscope.constants import EvalType, JudgeStrategy, OutputType
|
|
9
12
|
from evalscope.run import run_task
|
|
10
13
|
from evalscope.utils import is_module_installed, test_level_list
|
|
11
14
|
from evalscope.utils.logger import get_logger
|
|
@@ -225,9 +228,9 @@ class TestRun(unittest.TestCase):
|
|
|
225
228
|
from evalscope.config import TaskConfig
|
|
226
229
|
|
|
227
230
|
task_cfg = TaskConfig(
|
|
228
|
-
model='
|
|
229
|
-
api_url='
|
|
230
|
-
api_key='
|
|
231
|
+
model='qwen2.5-7b-instruct',
|
|
232
|
+
api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
233
|
+
api_key= env.get('DASHSCOPE_API_KEY'),
|
|
231
234
|
eval_type=EvalType.SERVICE,
|
|
232
235
|
datasets=[
|
|
233
236
|
# 'iquiz',
|
|
@@ -245,17 +248,17 @@ class TestRun(unittest.TestCase):
|
|
|
245
248
|
# 'competition_math',
|
|
246
249
|
# 'math_500',
|
|
247
250
|
# 'aime24',
|
|
248
|
-
'gpqa',
|
|
251
|
+
# 'gpqa',
|
|
249
252
|
# 'arc',
|
|
250
|
-
'ceval',
|
|
251
|
-
|
|
253
|
+
# 'ceval',
|
|
254
|
+
'hellaswag',
|
|
252
255
|
# 'general_mcq',
|
|
253
256
|
# 'general_qa'
|
|
254
257
|
# 'super_gpqa',
|
|
255
258
|
],
|
|
256
259
|
dataset_args={
|
|
257
260
|
'mmlu': {
|
|
258
|
-
'subset_list': ['elementary_mathematics'],
|
|
261
|
+
'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
|
|
259
262
|
'few_shot_num': 0
|
|
260
263
|
},
|
|
261
264
|
'mmlu_pro': {
|
|
@@ -313,7 +316,7 @@ class TestRun(unittest.TestCase):
|
|
|
313
316
|
}
|
|
314
317
|
},
|
|
315
318
|
eval_batch_size=32,
|
|
316
|
-
limit=
|
|
319
|
+
limit=15,
|
|
317
320
|
# debug=True,
|
|
318
321
|
stream=False,
|
|
319
322
|
generation_config={
|
|
@@ -357,5 +360,64 @@ class TestRun(unittest.TestCase):
|
|
|
357
360
|
|
|
358
361
|
run_task(task_cfg=task_cfg)
|
|
359
362
|
|
|
363
|
+
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
364
|
+
def test_run_judge_model(self):
|
|
365
|
+
from evalscope.config import TaskConfig
|
|
366
|
+
|
|
367
|
+
task_cfg = TaskConfig(
|
|
368
|
+
model='qwen2.5-7b-instruct',
|
|
369
|
+
api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
370
|
+
api_key= env.get('DASHSCOPE_API_KEY'),
|
|
371
|
+
eval_type=EvalType.SERVICE,
|
|
372
|
+
datasets=[
|
|
373
|
+
# 'math_500',
|
|
374
|
+
'aime24',
|
|
375
|
+
# 'competition_math',
|
|
376
|
+
# 'arc',
|
|
377
|
+
# 'gsm8k'
|
|
378
|
+
# 'truthful_qa',
|
|
379
|
+
# 'simple_qa',
|
|
380
|
+
# # 'chinese_simpleqa',
|
|
381
|
+
# 'live_code_bench',
|
|
382
|
+
# 'humaneval'
|
|
383
|
+
# 'general_qa'
|
|
384
|
+
],
|
|
385
|
+
dataset_args={
|
|
386
|
+
'competition_math': {
|
|
387
|
+
'subset_list': ['Level 4']
|
|
388
|
+
},
|
|
389
|
+
'live_code_bench': {
|
|
390
|
+
'subset_list': ['v4_v5'],
|
|
391
|
+
'extra_params': {
|
|
392
|
+
'start_date': '2024-12-01',
|
|
393
|
+
'end_date': '2025-01-01'
|
|
394
|
+
},
|
|
395
|
+
'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/code_generation_lite'
|
|
396
|
+
},
|
|
397
|
+
'general_qa': {
|
|
398
|
+
'local_path': 'custom_eval/text/qa', # 自定义数据集路径
|
|
399
|
+
'subset_list': [
|
|
400
|
+
'example', # 评测数据集名称,上述 *_dev.csv 中的 *
|
|
401
|
+
# 'test'
|
|
402
|
+
]
|
|
403
|
+
},
|
|
404
|
+
},
|
|
405
|
+
eval_batch_size=5,
|
|
406
|
+
limit=5,
|
|
407
|
+
judge_strategy=JudgeStrategy.AUTO,
|
|
408
|
+
judge_model_args={
|
|
409
|
+
'model_id': 'qwen2.5-7b-instruct',
|
|
410
|
+
'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
411
|
+
'api_key': env.get('DASHSCOPE_API_KEY'),
|
|
412
|
+
},
|
|
413
|
+
generation_config={
|
|
414
|
+
'max_new_tokens': 2048,
|
|
415
|
+
'temperature': 0.0,
|
|
416
|
+
'seed': 42,
|
|
417
|
+
}
|
|
418
|
+
)
|
|
419
|
+
|
|
420
|
+
run_task(task_cfg=task_cfg)
|
|
421
|
+
|
|
360
422
|
if __name__ == '__main__':
|
|
361
423
|
unittest.main()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|