evalscope 0.17.0__py3-none-any.whl → 0.17.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
- evalscope/benchmarks/data_adapter.py +9 -4
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +2 -1
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +118 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
- evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
- evalscope/benchmarks/utils.py +1 -0
- evalscope/constants.py +5 -21
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +5 -3
- evalscope/metrics/__init__.py +3 -1
- evalscope/metrics/completion_parsers.py +7 -0
- evalscope/metrics/llm_judge.py +6 -5
- evalscope/metrics/metrics.py +19 -7
- evalscope/models/__init__.py +4 -8
- evalscope/models/adapters/__init__.py +4 -9
- evalscope/models/adapters/base_adapter.py +4 -0
- evalscope/models/adapters/bfcl_adapter.py +2 -0
- evalscope/models/adapters/chat_adapter.py +3 -0
- evalscope/models/adapters/choice_adapter.py +4 -0
- evalscope/models/adapters/custom_adapter.py +7 -3
- evalscope/models/adapters/server_adapter.py +2 -0
- evalscope/models/adapters/t2i_adapter.py +3 -0
- evalscope/models/adapters/tau_bench_adapter.py +189 -0
- evalscope/models/register.py +0 -14
- evalscope/perf/arguments.py +13 -0
- evalscope/perf/benchmark.py +38 -39
- evalscope/perf/http_client.py +30 -86
- evalscope/perf/main.py +2 -2
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +22 -4
- evalscope/perf/plugin/api/custom_api.py +212 -55
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +105 -0
- evalscope/perf/plugin/api/openai_api.py +17 -19
- evalscope/perf/plugin/datasets/__init__.py +10 -7
- evalscope/perf/plugin/datasets/base.py +22 -1
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +4 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +2 -1
- evalscope/perf/plugin/datasets/random_dataset.py +15 -4
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/benchmark_util.py +14 -20
- evalscope/perf/utils/db_util.py +79 -61
- evalscope/utils/io_utils.py +10 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/METADATA +54 -34
- {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/RECORD +65 -58
- tests/cli/test_all.py +18 -2
- tests/cli/test_run.py +25 -37
- tests/perf/test_perf.py +29 -2
- evalscope/models/model.py +0 -189
- {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
- {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
- {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
|
|
2
2
|
evalscope/arguments.py,sha256=QkxE8eGSryiyo9uDiNQNZUI3l_hGPYmhVz1-KHgtB6E,6044
|
|
3
3
|
evalscope/config.py,sha256=1YfHXlIyYH70FQfi8TiUtpUH3VIRCh5YcbaayKZo5s4,6781
|
|
4
|
-
evalscope/constants.py,sha256=
|
|
4
|
+
evalscope/constants.py,sha256=Tc74W89SxeeEzISDzO5IoxSo9A_F0LqjH0mOrcAYJXc,3737
|
|
5
5
|
evalscope/run.py,sha256=dL1deJ0J1RHW6X6ZStXzAVL7NwbjW6McfdOMkCpWrtc,7012
|
|
6
6
|
evalscope/summarizer.py,sha256=ZLFDHmi0Bgo18ouQsxuUl9vmIES9zkoapLLWRLhy19Q,5911
|
|
7
|
-
evalscope/version.py,sha256=
|
|
7
|
+
evalscope/version.py,sha256=wsTu-_Fq9Dmfg7bXg6eDVtNwZA5ui-MZ6IPs4EhytAc,119
|
|
8
8
|
evalscope/app/__init__.py,sha256=HWLXld_JXcBDsdL4L_4E8JsKyuBwwPUSwlejKnZ3HKc,579
|
|
9
9
|
evalscope/app/app.py,sha256=8mSBp8qUCCmqupV4FEPMPdT9jL-bYu4DdH2qj8P0ktk,776
|
|
10
10
|
evalscope/app/arguments.py,sha256=1wHTLeFx1G94cKXYOeOVe_wTiOY2D929UctIRGOtRaQ,699
|
|
@@ -69,9 +69,9 @@ evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4ty
|
|
|
69
69
|
evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=jlwM13Ty-Ax6AeMsNlo9xIBupNFgnceYuXtCmh0hNTQ,6160
|
|
70
70
|
evalscope/benchmarks/__init__.py,sha256=NVd_VvmkY36LxdHNmgeogSBwMFfWoLJAZF8vDg-CoFc,1308
|
|
71
71
|
evalscope/benchmarks/benchmark.py,sha256=uZ_-Y_wPhy6TxufWiElF4BwEWN93azT1JHtGRW8tR-w,2633
|
|
72
|
-
evalscope/benchmarks/data_adapter.py,sha256=
|
|
72
|
+
evalscope/benchmarks/data_adapter.py,sha256=UI4HpnJNYo18GXRiU0HwNUxjRfoSXlCB-xEBIGs2ckg,23914
|
|
73
73
|
evalscope/benchmarks/filters.py,sha256=x_NX40uWMmUsVrAGHCeeV2e63HZZFugWUgdUhk64ivM,1523
|
|
74
|
-
evalscope/benchmarks/utils.py,sha256=
|
|
74
|
+
evalscope/benchmarks/utils.py,sha256=mIk8n6zVMICQ5JWMyEwUqwlkxva4L-oD5SZzpIKw1sI,1851
|
|
75
75
|
evalscope/benchmarks/aigc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
76
76
|
evalscope/benchmarks/aigc/t2i/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
77
77
|
evalscope/benchmarks/aigc/t2i/base.py,sha256=4GFAvceT1Gpt5teDLRCZi62RwvPazuhG3zwft3gN3X4,2102
|
|
@@ -121,7 +121,7 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt
|
|
|
121
121
|
evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
|
|
122
122
|
evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
|
|
123
123
|
evalscope/benchmarks/bfcl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
124
|
-
evalscope/benchmarks/bfcl/bfcl_adapter.py,sha256=
|
|
124
|
+
evalscope/benchmarks/bfcl/bfcl_adapter.py,sha256=ThDOYrJY_RdXMLSC1S9lP-8zYd1syZWpcrXXV1ZPLVs,10100
|
|
125
125
|
evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
126
126
|
evalscope/benchmarks/ceval/ceval_adapter.py,sha256=V_TC_E0lKXaFcV_qIdrg2_iddmGJ4um8iIdaXVaK_EM,11146
|
|
127
127
|
evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
|
|
@@ -149,9 +149,9 @@ evalscope/benchmarks/general_arena/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQe
|
|
|
149
149
|
evalscope/benchmarks/general_arena/general_arena_adapter.py,sha256=j2aDzikz9obxvrR-damdvSCXR0rfjEo-OzX8vujj2N0,19887
|
|
150
150
|
evalscope/benchmarks/general_arena/utils.py,sha256=u0q4FNIOFka1_gC344OCvBXUz89Ah6M8asjIXbNSweM,7188
|
|
151
151
|
evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
152
|
-
evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=
|
|
152
|
+
evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=cPN-p0tndjocQYqfc6OFkT5k8KL7kkVklmOtps-F08Y,5391
|
|
153
153
|
evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
154
|
-
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=
|
|
154
|
+
evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=dpIGe635CoW4ejVohVwcarBxSckqvlnxcJ2ElpRlQ9o,5669
|
|
155
155
|
evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
156
156
|
evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
|
|
157
157
|
evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=J6RfxpUT1l8Jj3vT_Vtsn1z8MKCg32XTlKn_eihCI50,5071
|
|
@@ -161,9 +161,11 @@ evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=IBMdsvQ1w45_raCiACTBm7DVHtOYf
|
|
|
161
161
|
evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
162
162
|
evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
|
|
163
163
|
evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=kgHz-n8_93J8DdR7XBlzfM2KDRoKcvg80h6CCjWv_Xk,6191
|
|
164
|
+
evalscope/benchmarks/hle/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
165
|
+
evalscope/benchmarks/hle/hle_adapter.py,sha256=ts38e-AqtUcbfc6VqRtWLacZDh7KzSm4rj7xKm9vTFc,4445
|
|
164
166
|
evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
165
167
|
evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
|
|
166
|
-
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=
|
|
168
|
+
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=ZqNG3L8yMY44B7HleUjlSbVG-GLk9RBsvaGWOm2fQVw,4788
|
|
167
169
|
evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
168
170
|
evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=2oStqiTD4w2f2n0kbjcbg7GJQfKCsHFieokQcNndWb4,2041
|
|
169
171
|
evalscope/benchmarks/ifeval/instructions.py,sha256=oaJ9D_4rvS67BraHBNPpDtFd4TblFAnR4A3YW9HWfuY,56304
|
|
@@ -186,7 +188,7 @@ evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5N
|
|
|
186
188
|
evalscope/benchmarks/math_500/math_500_adapter.py,sha256=Oc9XnBgMAjEerYAk3GtY2TTKm1QH_UI896kUuW2_a5Y,2324
|
|
187
189
|
evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
188
190
|
evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
|
|
189
|
-
evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=
|
|
191
|
+
evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=Rhi-J6oGWawRVBk38ZgXk8-XrZ7wL8sf4zrncU73jgs,12111
|
|
190
192
|
evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
|
|
191
193
|
evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
192
194
|
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=uglOOZBZfQBIuJOG7iT4THk2LNcfHQoakxQDpS4jB1U,4554
|
|
@@ -211,8 +213,10 @@ evalscope/benchmarks/super_gpqa/five_shot_prompt.txt,sha256=CQxRszzUrSIygOSd1G10
|
|
|
211
213
|
evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=ce99v28wkhlGnfmihwpv3ikTqy3aumT8Jzm1LGxz-ck,10147
|
|
212
214
|
evalscope/benchmarks/super_gpqa/utils.py,sha256=ftYPP9ODvLBlQSd9ltACx9iRIvjB8u1bg4AtgcJ4JAI,3360
|
|
213
215
|
evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt,sha256=XZb0CN83YbfH2dF-iIV-ciNLbIb3ON220qHe7zf8KF0,247
|
|
216
|
+
evalscope/benchmarks/tau_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
217
|
+
evalscope/benchmarks/tau_bench/tau_bench_adapter.py,sha256=5_VgRUtEjeZ-8gRZj4cnwwso1GUqf2GB49AlI4xqyDM,4221
|
|
214
218
|
evalscope/benchmarks/tool_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
215
|
-
evalscope/benchmarks/tool_bench/tool_bench_adapter.py,sha256=
|
|
219
|
+
evalscope/benchmarks/tool_bench/tool_bench_adapter.py,sha256=fy6Hb84cm6s-pOoQXmT-N8D1OUYVGCuq77-2xwM_WLA,3093
|
|
216
220
|
evalscope/benchmarks/tool_bench/utils.py,sha256=led0d-Pa3rvmWkSWhEnZWP00fceudgESq5HXAQzJGls,7042
|
|
217
221
|
evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
218
222
|
evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
|
|
@@ -234,13 +238,13 @@ evalscope/collections/__init__.py,sha256=3v7tVLcJk86FeNBrxw3pWhu_lcpKYrnT_dDACCe
|
|
|
234
238
|
evalscope/collections/evaluator.py,sha256=RJ337S0sy8dsV25I2OAxeWgSx_HrmXTyuuHKSt9vQtM,17474
|
|
235
239
|
evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
|
|
236
240
|
evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
|
|
237
|
-
evalscope/evaluator/__init__.py,sha256=
|
|
238
|
-
evalscope/evaluator/evaluator.py,sha256=
|
|
239
|
-
evalscope/metrics/__init__.py,sha256=
|
|
240
|
-
evalscope/metrics/completion_parsers.py,sha256=
|
|
241
|
-
evalscope/metrics/llm_judge.py,sha256=
|
|
241
|
+
evalscope/evaluator/__init__.py,sha256=XqPnEp5MvfRwC5M5cEeOAC0-MMEPxBIESqiSa3YMBgo,84
|
|
242
|
+
evalscope/evaluator/evaluator.py,sha256=HKEF2k0S_dJR8cF9lrqf_W4diXbb6H3L81pD6XcmLiA,22481
|
|
243
|
+
evalscope/metrics/__init__.py,sha256=CH3bNyRx9dJ3gOqNwKDlaZ7zan4MShM0h8SnzarjokU,1851
|
|
244
|
+
evalscope/metrics/completion_parsers.py,sha256=56ZNzOfNU0O1ba9fs9Cyi4Vk_YUmcgWUbxW0SJ2KrlU,8974
|
|
245
|
+
evalscope/metrics/llm_judge.py,sha256=1hPFnGc3Szszqo21O618a7mxOgkdba3KsbZ66vvTbSA,8380
|
|
242
246
|
evalscope/metrics/math_parser.py,sha256=JtOkj28XOtwoUACXOXLzCeRYz0rx0tBsQLQDU8cbC20,17311
|
|
243
|
-
evalscope/metrics/metrics.py,sha256=
|
|
247
|
+
evalscope/metrics/metrics.py,sha256=OLfvEljGbQnv-bBiFD-GR2On4mpZ0xhKxiKkjZfoDX8,14268
|
|
244
248
|
evalscope/metrics/named_metrics.py,sha256=PrzU_1mGTeRFxVJFT1aXxIOiS7MnNoWyZsb8uCRVDeE,2278
|
|
245
249
|
evalscope/metrics/rouge_metric.py,sha256=bqvSotuDdC0MEKmt8v6y6tBTBx0S3Ma-tfF-cMCckA4,4645
|
|
246
250
|
evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
|
|
@@ -346,46 +350,49 @@ evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.p
|
|
|
346
350
|
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py,sha256=LqMHlUTy2LEzoVwjALtrAw0UYmzIuHnFjQiVmn5nv-I,605
|
|
347
351
|
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py,sha256=d4HInkL_Phk0Bgg2cWaOvhsPa6lkqDeovFW86PL0I18,6371
|
|
348
352
|
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py,sha256=XzebAHBAjOpkIMZm43dd55PESgmyq_J45Ji6bogYR3s,11204
|
|
349
|
-
evalscope/models/__init__.py,sha256=
|
|
353
|
+
evalscope/models/__init__.py,sha256=x0Sna8mbujdOVqIYSGwIULbiPOue_Ifp-2JElSZsuMs,1481
|
|
350
354
|
evalscope/models/local_model.py,sha256=UWsmZlWpT8JNGjijzZQKirvq4YywBkKOS9G-U2cuxAw,4115
|
|
351
|
-
evalscope/models/
|
|
352
|
-
evalscope/models/
|
|
353
|
-
evalscope/models/adapters/
|
|
354
|
-
evalscope/models/adapters/
|
|
355
|
-
evalscope/models/adapters/
|
|
356
|
-
evalscope/models/adapters/
|
|
357
|
-
evalscope/models/adapters/
|
|
358
|
-
evalscope/models/adapters/
|
|
359
|
-
evalscope/models/adapters/
|
|
360
|
-
evalscope/models/adapters/
|
|
355
|
+
evalscope/models/register.py,sha256=G35J6BULFWwuqZO_rTkKBru1llZAyfPztcAASp_cb8M,1257
|
|
356
|
+
evalscope/models/adapters/__init__.py,sha256=WRaZsHlnz0MvGg9Jq565-XJjED-4cAyu4KbmrOhrHO4,688
|
|
357
|
+
evalscope/models/adapters/base_adapter.py,sha256=P4aicNmz1nsX9QLY9t4c6OIQPzIYfOhcrqjlAjR-ENY,3477
|
|
358
|
+
evalscope/models/adapters/bfcl_adapter.py,sha256=cG0vsQ3H2pmabo6tC0Y5Gonw0ng5-RFljDyRBMSj6xE,10422
|
|
359
|
+
evalscope/models/adapters/chat_adapter.py,sha256=epxA_on9ipsak8Lnkweh9en2AjVm5G0L1ARXYmDEEbk,8026
|
|
360
|
+
evalscope/models/adapters/choice_adapter.py,sha256=wIXnDcgnKaIMdhToaqy6fidhuZDpEz2vhxIB_V9u3Z8,8203
|
|
361
|
+
evalscope/models/adapters/custom_adapter.py,sha256=W8DIBiMWvHHcc0Mn9Frjj1YbpHRi7w-UQVJDiU2PakU,2400
|
|
362
|
+
evalscope/models/adapters/server_adapter.py,sha256=W6SXrPy-hZXpnISDjupu_j7bnmt-cP55sDojPXThitc,9701
|
|
363
|
+
evalscope/models/adapters/t2i_adapter.py,sha256=d6OviQFi_uN8PPXKrFpivk5Awm1O6wd_Gii8t3hVahY,2806
|
|
364
|
+
evalscope/models/adapters/tau_bench_adapter.py,sha256=jYGaj2L2wxtEiTdiSwZdY1XNkSzm6os7IvkxgK4msR0,6889
|
|
361
365
|
evalscope/models/custom/__init__.py,sha256=MZylegALg1HerOYtp-qbzu4Wb6PW3JbrxwONHU-PAVs,131
|
|
362
366
|
evalscope/models/custom/custom_model.py,sha256=rBccFVpCIfTGt9cgXLcxeUWc7w1sTRtbTO5w5qqQIQE,1405
|
|
363
367
|
evalscope/models/custom/dummy_model.py,sha256=WpfrS3kvwRRdyThx9baaJ5vodYYh29VGRKsGKMWFflI,3124
|
|
364
368
|
evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
365
|
-
evalscope/perf/arguments.py,sha256=
|
|
366
|
-
evalscope/perf/benchmark.py,sha256=
|
|
367
|
-
evalscope/perf/http_client.py,sha256
|
|
368
|
-
evalscope/perf/main.py,sha256=
|
|
369
|
-
evalscope/perf/plugin/__init__.py,sha256=
|
|
370
|
-
evalscope/perf/plugin/registry.py,sha256=
|
|
371
|
-
evalscope/perf/plugin/api/__init__.py,sha256=
|
|
372
|
-
evalscope/perf/plugin/api/base.py,sha256=
|
|
373
|
-
evalscope/perf/plugin/api/custom_api.py,sha256=
|
|
374
|
-
evalscope/perf/plugin/api/dashscope_api.py,sha256=
|
|
375
|
-
evalscope/perf/plugin/api/
|
|
376
|
-
evalscope/perf/plugin/
|
|
377
|
-
evalscope/perf/plugin/datasets/
|
|
378
|
-
evalscope/perf/plugin/datasets/
|
|
379
|
-
evalscope/perf/plugin/datasets/
|
|
380
|
-
evalscope/perf/plugin/datasets/
|
|
381
|
-
evalscope/perf/plugin/datasets/
|
|
382
|
-
evalscope/perf/plugin/datasets/
|
|
383
|
-
evalscope/perf/plugin/datasets/
|
|
369
|
+
evalscope/perf/arguments.py,sha256=lG2IOOzxg29pdnF6IobzPcqEcYqopulFpVU2QzRaEJA,11429
|
|
370
|
+
evalscope/perf/benchmark.py,sha256=ZVmsSeKDUKkApt3y5tIMMFZAyAj3UNVT7JPp1fh5mhE,7880
|
|
371
|
+
evalscope/perf/http_client.py,sha256=l_OKL80kTP6sM_PEBvsJ1_TejYJdUQnE2UlB-ud1WQM,4588
|
|
372
|
+
evalscope/perf/main.py,sha256=WZbBgFhIj9KqxzC7_NZxDlou019_EXatsHRt5vqDhFg,3439
|
|
373
|
+
evalscope/perf/plugin/__init__.py,sha256=Ztj4h1_JYJqbbWkeuDTj5aTRyGQf5Woc4xEIyjcokVU,94
|
|
374
|
+
evalscope/perf/plugin/registry.py,sha256=GhLe-h1rGzya2bgIUaV5VymQIaHqI7h5SG_i4PoGAm8,1967
|
|
375
|
+
evalscope/perf/plugin/api/__init__.py,sha256=7RsGdYTSfnW6iVpveEzNu8v4x8Yc8H-Kk39DqOHMrd4,152
|
|
376
|
+
evalscope/perf/plugin/api/base.py,sha256=9cX4xwTzy5ycnWqmQqRGMLasTEX6jVlobtADkh1KwXE,2782
|
|
377
|
+
evalscope/perf/plugin/api/custom_api.py,sha256=f8rUixcV9mTxoYyabu3wedEC4YVB70Yw6Az1NpfeWPQ,10375
|
|
378
|
+
evalscope/perf/plugin/api/dashscope_api.py,sha256=Miv2pzMa6sxZyYYJhCzcbOI_QHuZx7tazKpb6Not7ck,3627
|
|
379
|
+
evalscope/perf/plugin/api/default_api.py,sha256=kjuHQ-zRHe5WU4ofSzWBpWbIxBQBOh_ucu1z2g62gWg,4315
|
|
380
|
+
evalscope/perf/plugin/api/openai_api.py,sha256=Mt_VedJUaCH3g-oVSJ_fsGcPk0KkspSzIMkrkih2Zb0,7777
|
|
381
|
+
evalscope/perf/plugin/datasets/__init__.py,sha256=qzeQ9BrJhiJJm1wHaFeOQkvXXdSd15Ucspbn5zjs-6Q,495
|
|
382
|
+
evalscope/perf/plugin/datasets/base.py,sha256=-3Ihnp2hYvZyPnP8Gh2Pu8ovlLNFHyZnNgRu3WHG4d0,2714
|
|
383
|
+
evalscope/perf/plugin/datasets/custom.py,sha256=UuOk8xYfSYyyYZL3U4grUjtfQhWHHZeAEC63n_4Siuw,1376
|
|
384
|
+
evalscope/perf/plugin/datasets/flickr8k.py,sha256=IXz5uu5SlqF1l_tJ_ITr2vx_R_d7gxWzqPuyEOx7rYo,1043
|
|
385
|
+
evalscope/perf/plugin/datasets/kontext_bench.py,sha256=XjKzr7nMzI3cfk83IH0PH1TNJaQMRXUpACnzFfP2n6g,1091
|
|
386
|
+
evalscope/perf/plugin/datasets/line_by_line.py,sha256=c3ydW4GqxkG0vl2g64jG0vBMql2FuFPyWh3mgkIh9Do,987
|
|
387
|
+
evalscope/perf/plugin/datasets/longalpaca.py,sha256=VnMjdHl_JV3NmZ6wRxVlJ99e8PYSjQTcVxoTkl21Ei0,1327
|
|
388
|
+
evalscope/perf/plugin/datasets/openqa.py,sha256=33AR419IrH-FxZRjjcYdAIEZXaX4TKEoirVVfX--N9I,1493
|
|
389
|
+
evalscope/perf/plugin/datasets/random_dataset.py,sha256=NNAXvgFPkLDOSpYNex1DyE4X-ELtQRm13_oBooO30j8,3514
|
|
390
|
+
evalscope/perf/plugin/datasets/random_vl_dataset.py,sha256=F3yA9Ih3YO895lZKCo3i85LeKTzjvGcvhzc8UNN-gUI,3240
|
|
384
391
|
evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=J6q7AF_Re5eHLVejXEw9c1jlk1T1PPmist0yO9UFTPE,2432
|
|
385
392
|
evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
386
393
|
evalscope/perf/utils/analysis_result.py,sha256=aoT7JD2zAzBeuZUfncKhJ2odX_7KnymwOmNB1Upam2c,935
|
|
387
|
-
evalscope/perf/utils/benchmark_util.py,sha256=
|
|
388
|
-
evalscope/perf/utils/db_util.py,sha256=
|
|
394
|
+
evalscope/perf/utils/benchmark_util.py,sha256=7bHpa5oaqcPJX7DSUkzK9assoFSHC27Q7-QylUOiklQ,7136
|
|
395
|
+
evalscope/perf/utils/db_util.py,sha256=TCdmoEx5iScL6h8wzucPojPwn6J1wTmQqX4sVk-ilHo,11630
|
|
389
396
|
evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
|
|
390
397
|
evalscope/perf/utils/local_server.py,sha256=RL9rGd5tEniZ0aErhHcbVXMX22YmujfE11T3j37VL8k,4684
|
|
391
398
|
evalscope/perf/utils/log_utils.py,sha256=NWSK_ITG4yoVx5GMLbIRGDoXSs90s7X3mftdm37Os2U,1666
|
|
@@ -434,7 +441,7 @@ evalscope/utils/argument_utils.py,sha256=D7qOH85wf7LKh_cJ2X51OEaL7CMaddydmHZkfoY
|
|
|
434
441
|
evalscope/utils/chat_service.py,sha256=U2jtrkOa2asRp16Zam0zIi_38mCyWQqql_L6JSwii4I,8749
|
|
435
442
|
evalscope/utils/deprecation_utils.py,sha256=WyeiLWSi5ti6FkuMbhimcPPUB43paa1FZ5-JOAWNFZI,1344
|
|
436
443
|
evalscope/utils/import_utils.py,sha256=BSdp7RQSZu67129TBbtJvMWU0CfCFu864K31eiM3pr8,2975
|
|
437
|
-
evalscope/utils/io_utils.py,sha256=
|
|
444
|
+
evalscope/utils/io_utils.py,sha256=2eEkLx4jhekgIV4vYL8yTN0PT6dbHUERMBZwmvxuiEc,7109
|
|
438
445
|
evalscope/utils/logger.py,sha256=Q2IeV_0jxz8L34b5GddPeCKXVh0UClbuhjyLe5Wtj7M,3648
|
|
439
446
|
evalscope/utils/model_utils.py,sha256=F1_WBHvBehWqrTd6kPtKICeeYucaZn5H0Gc3cCplYB8,2329
|
|
440
447
|
tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
@@ -443,12 +450,12 @@ tests/utils.py,sha256=Fgm0CU6ilZjCGOfOMJH-Trxy0UIAGbhvy0Ijy_zDGUk,323
|
|
|
443
450
|
tests/aigc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
444
451
|
tests/aigc/test_t2i.py,sha256=XtVknpwlVMb6FSw3_WMFxMq0gZX6iG-ffdSQkcW2Fzw,3856
|
|
445
452
|
tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
446
|
-
tests/cli/test_all.py,sha256=
|
|
453
|
+
tests/cli/test_all.py,sha256=IT0mxjiuHCC0PpT4z3oN1Bbr_0viMcm8GnShZ02kp8w,6333
|
|
447
454
|
tests/cli/test_collection.py,sha256=bXWzccH822Y2B1Ed251U6TE8G_osI6MXYNxzmfv9kBI,4197
|
|
448
455
|
tests/cli/test_custom.py,sha256=0YE-TCAeaQMRVRFla_TIvTd8d0USvvsSeqvYAD3NDNg,8796
|
|
449
|
-
tests/cli/test_run.py,sha256=
|
|
456
|
+
tests/cli/test_run.py,sha256=YKX2XCHPxnStgzzP67U90RV9r1MC2GM3JoGQqfZKqrI,17324
|
|
450
457
|
tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
451
|
-
tests/perf/test_perf.py,sha256=
|
|
458
|
+
tests/perf/test_perf.py,sha256=AEWvpN3ID6s-9MEoaZjQqUM8VVsqgk_v9KX8pDgvozA,5864
|
|
452
459
|
tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
453
460
|
tests/rag/test_clip_benchmark.py,sha256=13pcY3gYHNQh2KfEHCqtCSqiOcbngSJ1BlVZzI58JCE,2694
|
|
454
461
|
tests/rag/test_mteb.py,sha256=fdNQIyUEzE7puPCKw5QhCHTEu7hz-ieHeq1xCWGh6IM,7246
|
|
@@ -459,9 +466,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=RwrKkc1WHEZxetM11cGL81G4faKCn7SYn4
|
|
|
459
466
|
tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=UAUtOCQ72xbm8s-sov3cBEpYVDy189wpB-qOL3KoU7M,6053
|
|
460
467
|
tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
461
468
|
tests/vlm/test_vlmeval.py,sha256=EDQRkYfSyOICUwo_tm3p-puaE_xdFmqOPkrt5etxsqM,3307
|
|
462
|
-
evalscope-0.17.
|
|
463
|
-
evalscope-0.17.
|
|
464
|
-
evalscope-0.17.
|
|
465
|
-
evalscope-0.17.
|
|
466
|
-
evalscope-0.17.
|
|
467
|
-
evalscope-0.17.
|
|
469
|
+
evalscope-0.17.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
|
|
470
|
+
evalscope-0.17.1.dist-info/METADATA,sha256=1PRiimjOBZgSWjvT3iL4VcvdaWk8v3fGp9xCXLpM1Dw,38469
|
|
471
|
+
evalscope-0.17.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
|
472
|
+
evalscope-0.17.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
|
|
473
|
+
evalscope-0.17.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
|
|
474
|
+
evalscope-0.17.1.dist-info/RECORD,,
|
tests/cli/test_all.py
CHANGED
|
@@ -39,7 +39,7 @@ datasets=[
|
|
|
39
39
|
'general_mcq',
|
|
40
40
|
'general_qa',
|
|
41
41
|
'super_gpqa',
|
|
42
|
-
'live_code_bench',
|
|
42
|
+
# 'live_code_bench',
|
|
43
43
|
'mmlu_redux',
|
|
44
44
|
'simple_qa',
|
|
45
45
|
'chinese_simpleqa',
|
|
@@ -53,8 +53,13 @@ datasets=[
|
|
|
53
53
|
'docmath',
|
|
54
54
|
'needle_haystack',
|
|
55
55
|
'bfcl_v3',
|
|
56
|
+
'hle',
|
|
57
|
+
'tau_bench',
|
|
56
58
|
]
|
|
57
59
|
|
|
60
|
+
# Reverse the datasets list to ensure the order is from most recent to oldest
|
|
61
|
+
datasets.reverse()
|
|
62
|
+
|
|
58
63
|
dataset_args={
|
|
59
64
|
'mmlu': {
|
|
60
65
|
'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
|
|
@@ -132,7 +137,18 @@ dataset_args={
|
|
|
132
137
|
},
|
|
133
138
|
'bfcl_v3':{
|
|
134
139
|
'subset_list': ['simple', 'multiple']
|
|
135
|
-
}
|
|
140
|
+
},
|
|
141
|
+
'hle': {
|
|
142
|
+
'subset_list': ['Math', 'Other'],
|
|
143
|
+
},
|
|
144
|
+
'tau_bench': {
|
|
145
|
+
'extra_params': {
|
|
146
|
+
'user_model': 'qwen-plus',
|
|
147
|
+
'api_key': env.get('DASHSCOPE_API_KEY'),
|
|
148
|
+
'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
149
|
+
},
|
|
150
|
+
'subset_list': ['airline'],
|
|
151
|
+
},
|
|
136
152
|
}
|
|
137
153
|
|
|
138
154
|
class TestRun(unittest.TestCase):
|
tests/cli/test_run.py
CHANGED
|
@@ -259,14 +259,14 @@ class TestRun(unittest.TestCase):
|
|
|
259
259
|
api_key= env.get('DASHSCOPE_API_KEY'),
|
|
260
260
|
eval_type=EvalType.SERVICE,
|
|
261
261
|
datasets=[
|
|
262
|
-
'iquiz',
|
|
262
|
+
# 'iquiz',
|
|
263
263
|
# 'ifeval',
|
|
264
264
|
# 'mmlu',
|
|
265
265
|
# 'mmlu_pro',
|
|
266
266
|
# 'musr',
|
|
267
267
|
# 'process_bench',
|
|
268
268
|
# 'race',
|
|
269
|
-
|
|
269
|
+
'trivia_qa',
|
|
270
270
|
# 'cmmlu',
|
|
271
271
|
# 'humaneval',
|
|
272
272
|
# 'gsm8k',
|
|
@@ -289,6 +289,8 @@ class TestRun(unittest.TestCase):
|
|
|
289
289
|
# 'frames',
|
|
290
290
|
# 'bfcl_v3',
|
|
291
291
|
# 'truthful_qa',
|
|
292
|
+
# 'tau_bench',
|
|
293
|
+
# 'hle'
|
|
292
294
|
],
|
|
293
295
|
dataset_args={
|
|
294
296
|
'mmlu': {
|
|
@@ -297,7 +299,7 @@ class TestRun(unittest.TestCase):
|
|
|
297
299
|
},
|
|
298
300
|
'mmlu_pro': {
|
|
299
301
|
'subset_list': ['math', 'health'],
|
|
300
|
-
'few_shot_num':
|
|
302
|
+
'few_shot_num': 0
|
|
301
303
|
},
|
|
302
304
|
'ceval': {
|
|
303
305
|
'subset_list': [
|
|
@@ -360,13 +362,23 @@ class TestRun(unittest.TestCase):
|
|
|
360
362
|
# 'is_fc_model': False,
|
|
361
363
|
}
|
|
362
364
|
},
|
|
365
|
+
'tau_bench': {
|
|
366
|
+
'extra_params': {
|
|
367
|
+
'user_model': 'qwen-plus',
|
|
368
|
+
'api_key': env.get('DASHSCOPE_API_KEY'),
|
|
369
|
+
'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
370
|
+
}
|
|
371
|
+
},
|
|
372
|
+
'hle': {
|
|
373
|
+
'subset_list': ['Math', 'Other'],
|
|
374
|
+
},
|
|
363
375
|
},
|
|
364
|
-
eval_batch_size=
|
|
365
|
-
limit=
|
|
376
|
+
eval_batch_size=10,
|
|
377
|
+
limit=10,
|
|
366
378
|
# debug=True,
|
|
367
379
|
stream=True,
|
|
368
380
|
generation_config={
|
|
369
|
-
'temperature': 0,
|
|
381
|
+
'temperature': 0.6,
|
|
370
382
|
'n': 1,
|
|
371
383
|
'max_tokens': 4096,
|
|
372
384
|
# 'extra_headers':{'key': 'value'},
|
|
@@ -377,35 +389,6 @@ class TestRun(unittest.TestCase):
|
|
|
377
389
|
run_task(task_cfg=task_cfg)
|
|
378
390
|
|
|
379
391
|
|
|
380
|
-
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
381
|
-
def test_run_batch_eval(self):
|
|
382
|
-
from evalscope.config import TaskConfig
|
|
383
|
-
|
|
384
|
-
task_cfg = TaskConfig(
|
|
385
|
-
model='LLM-Research/Llama-3.2-1B-Instruct',
|
|
386
|
-
datasets=[
|
|
387
|
-
# 'math_500',
|
|
388
|
-
# 'aime24',
|
|
389
|
-
# 'competition_math'
|
|
390
|
-
# 'arc',
|
|
391
|
-
'gsm8k'
|
|
392
|
-
# 'truthful_qa'
|
|
393
|
-
],
|
|
394
|
-
dataset_args={
|
|
395
|
-
'competition_math': {
|
|
396
|
-
'subset_list': ['Level 4', 'Level 5']
|
|
397
|
-
}
|
|
398
|
-
},
|
|
399
|
-
eval_batch_size=2,
|
|
400
|
-
limit=5,
|
|
401
|
-
generation_config={
|
|
402
|
-
'max_new_tokens': 2048,
|
|
403
|
-
'temperature': 0.7,
|
|
404
|
-
'num_return_sequences': 2,
|
|
405
|
-
}
|
|
406
|
-
)
|
|
407
|
-
|
|
408
|
-
run_task(task_cfg=task_cfg)
|
|
409
392
|
|
|
410
393
|
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
411
394
|
def test_run_judge_model(self):
|
|
@@ -417,7 +400,7 @@ class TestRun(unittest.TestCase):
|
|
|
417
400
|
api_key= env.get('DASHSCOPE_API_KEY'),
|
|
418
401
|
eval_type=EvalType.SERVICE,
|
|
419
402
|
datasets=[
|
|
420
|
-
'math_500',
|
|
403
|
+
# 'math_500',
|
|
421
404
|
# 'aime24',
|
|
422
405
|
# 'competition_math',
|
|
423
406
|
# 'arc',
|
|
@@ -434,6 +417,7 @@ class TestRun(unittest.TestCase):
|
|
|
434
417
|
# 'docmath',
|
|
435
418
|
# 'needle_haystack',
|
|
436
419
|
# 'ifeval',
|
|
420
|
+
'hle'
|
|
437
421
|
],
|
|
438
422
|
dataset_args={
|
|
439
423
|
'needle_haystack': {
|
|
@@ -466,7 +450,10 @@ class TestRun(unittest.TestCase):
|
|
|
466
450
|
},
|
|
467
451
|
'frames': {
|
|
468
452
|
'local_path': '/root/.cache/modelscope/hub/datasets/iic/frames'
|
|
469
|
-
}
|
|
453
|
+
},
|
|
454
|
+
'hle': {
|
|
455
|
+
'subset_list': ['Math', 'Other'],
|
|
456
|
+
},
|
|
470
457
|
},
|
|
471
458
|
eval_batch_size=10,
|
|
472
459
|
limit=3,
|
|
@@ -489,6 +476,7 @@ class TestRun(unittest.TestCase):
|
|
|
489
476
|
},
|
|
490
477
|
timeout=60000,
|
|
491
478
|
stream=True,
|
|
479
|
+
use_cache='outputs/20250714_150626'
|
|
492
480
|
# analysis_report=True,
|
|
493
481
|
# debug=True,
|
|
494
482
|
# use_cache='outputs/20250616_161931'
|
tests/perf/test_perf.py
CHANGED
|
@@ -35,9 +35,9 @@ class TestPerf(unittest.TestCase):
|
|
|
35
35
|
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
36
36
|
def test_run_perf_stream(self):
|
|
37
37
|
task_cfg = {
|
|
38
|
-
'url': 'http://127.0.0.1:
|
|
38
|
+
'url': 'http://127.0.0.1:8801/v1/chat/completions',
|
|
39
39
|
'parallel': 1,
|
|
40
|
-
'model': '
|
|
40
|
+
'model': 'Qwen2.5-0.5B-Instruct',
|
|
41
41
|
'number': 15,
|
|
42
42
|
'api': 'openai',
|
|
43
43
|
'dataset': 'openqa',
|
|
@@ -145,5 +145,32 @@ class TestPerf(unittest.TestCase):
|
|
|
145
145
|
print(metrics_result)
|
|
146
146
|
print(percentile_result)
|
|
147
147
|
|
|
148
|
+
@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
|
|
149
|
+
def test_run_perf_random_vl(self):
|
|
150
|
+
from evalscope.perf.arguments import Arguments
|
|
151
|
+
task_cfg = Arguments(
|
|
152
|
+
parallel=[1, 2],
|
|
153
|
+
number=[2, 4],
|
|
154
|
+
model='qwen-vl-max',
|
|
155
|
+
url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
|
|
156
|
+
api_key=env.get('DASHSCOPE_API_KEY'),
|
|
157
|
+
api='openai',
|
|
158
|
+
dataset='kontext_bench',
|
|
159
|
+
min_tokens=100,
|
|
160
|
+
max_tokens=100,
|
|
161
|
+
prefix_length=0,
|
|
162
|
+
min_prompt_length=100,
|
|
163
|
+
max_prompt_length=100,
|
|
164
|
+
image_height=512,
|
|
165
|
+
image_width=512,
|
|
166
|
+
image_num=2,
|
|
167
|
+
tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
|
|
168
|
+
seed=None,
|
|
169
|
+
extra_args={'ignore_eos': True}
|
|
170
|
+
)
|
|
171
|
+
metrics_result, percentile_result = run_perf_benchmark(task_cfg)
|
|
172
|
+
print(metrics_result)
|
|
173
|
+
print(percentile_result)
|
|
174
|
+
|
|
148
175
|
if __name__ == '__main__':
|
|
149
176
|
unittest.main(buffer=False)
|
evalscope/models/model.py
DELETED
|
@@ -1,189 +0,0 @@
|
|
|
1
|
-
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
|
-
import os
|
|
3
|
-
import time
|
|
4
|
-
from abc import ABC, abstractmethod
|
|
5
|
-
from typing import Any, List
|
|
6
|
-
|
|
7
|
-
from evalscope.utils.logger import get_logger
|
|
8
|
-
|
|
9
|
-
logger = get_logger()
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
class BaseModel(ABC):
|
|
13
|
-
|
|
14
|
-
def __init__(self, model_cfg: dict, **kwargs):
|
|
15
|
-
"""
|
|
16
|
-
Base model class.
|
|
17
|
-
|
|
18
|
-
Args:
|
|
19
|
-
model_cfg (dict): The model configuration. Depending on the specific model. Example:
|
|
20
|
-
{'model_id': 'modelscope/Llama-2-7b-chat-ms', 'revision': 'v1.0.0'}
|
|
21
|
-
|
|
22
|
-
**kwargs: kwargs
|
|
23
|
-
"""
|
|
24
|
-
self.model_cfg: dict = model_cfg
|
|
25
|
-
self.kwargs = kwargs
|
|
26
|
-
|
|
27
|
-
@abstractmethod
|
|
28
|
-
def predict(self, *args, **kwargs) -> Any:
|
|
29
|
-
"""
|
|
30
|
-
Model prediction func.
|
|
31
|
-
"""
|
|
32
|
-
raise NotImplementedError
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
class ChatBaseModel(BaseModel):
|
|
36
|
-
|
|
37
|
-
def __init__(self, model_cfg: dict, **kwargs):
|
|
38
|
-
"""
|
|
39
|
-
Chat base model class. Depending on the specific model.
|
|
40
|
-
|
|
41
|
-
Args:
|
|
42
|
-
model_cfg (dict):
|
|
43
|
-
{'model_id': 'modelscope/Llama-2-7b-chat-ms', 'revision': 'v1.0.0', 'device_map': 'auto'}
|
|
44
|
-
|
|
45
|
-
**kwargs: kwargs
|
|
46
|
-
"""
|
|
47
|
-
super(ChatBaseModel, self).__init__(model_cfg=model_cfg, **kwargs)
|
|
48
|
-
|
|
49
|
-
@abstractmethod
|
|
50
|
-
def predict(self, inputs: dict, **kwargs) -> dict:
|
|
51
|
-
"""
|
|
52
|
-
Model prediction func. The inputs and outputs are compatible with OpenAI Chat Completions APIs.
|
|
53
|
-
Refer to: https://platform.openai.com/docs/guides/gpt/chat-completions-api
|
|
54
|
-
|
|
55
|
-
# TODO: follow latest OpenAI API
|
|
56
|
-
|
|
57
|
-
Args:
|
|
58
|
-
inputs (dict): The input prompts and history. Input format:
|
|
59
|
-
{'messages': [
|
|
60
|
-
{'role': 'system', 'content': 'You are a helpful assistant.'},
|
|
61
|
-
{'role': 'user', 'content': 'Who won the world series in 2020?'},
|
|
62
|
-
{'role': 'assistant', 'content': 'The Los Angeles Dodgers won the World Series in 2020.'},
|
|
63
|
-
]
|
|
64
|
-
'history': [
|
|
65
|
-
{'role': 'system', 'content': 'Hello'},
|
|
66
|
-
{'role': 'user', 'content': 'Hi'}]
|
|
67
|
-
}
|
|
68
|
-
|
|
69
|
-
kwargs (dict): Could be inference configuration. Default: None.
|
|
70
|
-
cfg format: {'max_length': 1024}
|
|
71
|
-
|
|
72
|
-
Returns: The result format:
|
|
73
|
-
{
|
|
74
|
-
'choices': [
|
|
75
|
-
{
|
|
76
|
-
'index': 0,
|
|
77
|
-
'message': {
|
|
78
|
-
'content': 'The 2020 World Series was played in Texas at Globe Life Field in Arlington.',
|
|
79
|
-
'role': 'assistant'
|
|
80
|
-
}
|
|
81
|
-
}
|
|
82
|
-
],
|
|
83
|
-
'created': 1677664795,
|
|
84
|
-
# For models on the ModelScope or HuggingFace, concat model_id and revision with "-".
|
|
85
|
-
'model': 'gpt-3.5-turbo-0613',
|
|
86
|
-
'object': 'chat.completion',
|
|
87
|
-
'usage': {
|
|
88
|
-
'completion_tokens': 17,
|
|
89
|
-
'prompt_tokens': 57,
|
|
90
|
-
'total_tokens': 74
|
|
91
|
-
}
|
|
92
|
-
}
|
|
93
|
-
"""
|
|
94
|
-
raise NotImplementedError
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
# TODO: Remove this class after refactoring all models
|
|
98
|
-
class OpenAIModel(ChatBaseModel):
|
|
99
|
-
"""
|
|
100
|
-
APIs of OpenAI models.
|
|
101
|
-
Available models: gpt-3.5-turbo, gpt-4
|
|
102
|
-
"""
|
|
103
|
-
|
|
104
|
-
MAX_RETRIES = 3
|
|
105
|
-
|
|
106
|
-
def __init__(self, model_cfg: dict, **kwargs):
|
|
107
|
-
super(OpenAIModel, self).__init__(model_cfg=model_cfg, **kwargs)
|
|
108
|
-
|
|
109
|
-
openai_api_key = os.environ.get('OPENAI_API_KEY', None)
|
|
110
|
-
self.api_key = self.model_cfg.get('api_key', openai_api_key)
|
|
111
|
-
|
|
112
|
-
if not self.api_key:
|
|
113
|
-
logger.error('OpenAI API key is not provided, please set it in environment variable OPENAI_API_KEY')
|
|
114
|
-
# raise ValueError(
|
|
115
|
-
# 'OpenAI API key is not provided, '
|
|
116
|
-
# 'please set it in environment variable OPENAI_API_KEY')
|
|
117
|
-
|
|
118
|
-
def predict(self, model_id: str, inputs: dict, **kwargs) -> dict:
|
|
119
|
-
|
|
120
|
-
sys_prompt: str = inputs.get('sys_prompt', '')
|
|
121
|
-
user_prompt: str = inputs.get('user_prompt', '')
|
|
122
|
-
|
|
123
|
-
# model_id: str = kwargs.get('model_id', '')
|
|
124
|
-
temperature: float = kwargs.pop('temperature', 0.2)
|
|
125
|
-
max_tokens: int = kwargs.pop('max_tokens', 1024)
|
|
126
|
-
mode: str = kwargs.pop('mode', 'chat.completion')
|
|
127
|
-
|
|
128
|
-
logger.info(f'Using OpenAI model_id: {model_id}')
|
|
129
|
-
|
|
130
|
-
res = self._predict(
|
|
131
|
-
model_id=model_id,
|
|
132
|
-
sys_prompt=sys_prompt,
|
|
133
|
-
user_prompt=user_prompt,
|
|
134
|
-
temperature=temperature,
|
|
135
|
-
max_tokens=max_tokens,
|
|
136
|
-
mode=mode)
|
|
137
|
-
|
|
138
|
-
return res
|
|
139
|
-
|
|
140
|
-
def _predict(
|
|
141
|
-
self,
|
|
142
|
-
model_id,
|
|
143
|
-
sys_prompt,
|
|
144
|
-
user_prompt,
|
|
145
|
-
temperature,
|
|
146
|
-
max_tokens,
|
|
147
|
-
mode: str = 'chat.completion',
|
|
148
|
-
) -> dict:
|
|
149
|
-
import openai
|
|
150
|
-
|
|
151
|
-
res = {}
|
|
152
|
-
openai.api_key = self.api_key
|
|
153
|
-
|
|
154
|
-
for i in range(self.MAX_RETRIES):
|
|
155
|
-
try:
|
|
156
|
-
if mode == 'chat.completion':
|
|
157
|
-
resp = openai.ChatCompletion.create(
|
|
158
|
-
model=model_id,
|
|
159
|
-
messages=[{
|
|
160
|
-
'role': 'system',
|
|
161
|
-
'content': sys_prompt
|
|
162
|
-
}, {
|
|
163
|
-
'role': 'user',
|
|
164
|
-
'content': user_prompt
|
|
165
|
-
}],
|
|
166
|
-
temperature=temperature,
|
|
167
|
-
max_tokens=max_tokens)
|
|
168
|
-
|
|
169
|
-
if resp:
|
|
170
|
-
ans_text = resp['choices'][0]['message']['content']
|
|
171
|
-
model_id = resp['model']
|
|
172
|
-
else:
|
|
173
|
-
logger.warning(f'OpenAI GPT API call failed: got empty response '
|
|
174
|
-
f'for input {sys_prompt} {user_prompt}')
|
|
175
|
-
ans_text = ''
|
|
176
|
-
model_id = ''
|
|
177
|
-
|
|
178
|
-
res['ans_text'] = ans_text
|
|
179
|
-
res['model_id'] = model_id
|
|
180
|
-
else:
|
|
181
|
-
raise ValueError(f'Invalid mode: {mode}')
|
|
182
|
-
|
|
183
|
-
return res
|
|
184
|
-
|
|
185
|
-
except Exception as e:
|
|
186
|
-
logger.warning(f'OpenAI API call failed: {e}')
|
|
187
|
-
time.sleep(3)
|
|
188
|
-
logger.error(f'OpenAI API call failed after {self.MAX_RETRIES} retries')
|
|
189
|
-
return res
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|