evalscope 0.17.0__py3-none-any.whl → 0.17.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. See the package's registry page for more details.

Files changed (66)
  1. evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
  2. evalscope/benchmarks/data_adapter.py +9 -4
  3. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -1
  4. evalscope/benchmarks/general_qa/general_qa_adapter.py +2 -1
  5. evalscope/benchmarks/hle/__init__.py +0 -0
  6. evalscope/benchmarks/hle/hle_adapter.py +118 -0
  7. evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
  8. evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
  9. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  10. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
  11. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
  12. evalscope/benchmarks/utils.py +1 -0
  13. evalscope/constants.py +5 -21
  14. evalscope/evaluator/__init__.py +1 -1
  15. evalscope/evaluator/evaluator.py +5 -3
  16. evalscope/metrics/__init__.py +3 -1
  17. evalscope/metrics/completion_parsers.py +7 -0
  18. evalscope/metrics/llm_judge.py +6 -5
  19. evalscope/metrics/metrics.py +19 -7
  20. evalscope/models/__init__.py +4 -8
  21. evalscope/models/adapters/__init__.py +4 -9
  22. evalscope/models/adapters/base_adapter.py +4 -0
  23. evalscope/models/adapters/bfcl_adapter.py +2 -0
  24. evalscope/models/adapters/chat_adapter.py +3 -0
  25. evalscope/models/adapters/choice_adapter.py +4 -0
  26. evalscope/models/adapters/custom_adapter.py +7 -3
  27. evalscope/models/adapters/server_adapter.py +2 -0
  28. evalscope/models/adapters/t2i_adapter.py +3 -0
  29. evalscope/models/adapters/tau_bench_adapter.py +189 -0
  30. evalscope/models/register.py +0 -14
  31. evalscope/perf/arguments.py +13 -0
  32. evalscope/perf/benchmark.py +38 -39
  33. evalscope/perf/http_client.py +30 -86
  34. evalscope/perf/main.py +2 -2
  35. evalscope/perf/plugin/__init__.py +3 -2
  36. evalscope/perf/plugin/api/__init__.py +4 -3
  37. evalscope/perf/plugin/api/base.py +22 -4
  38. evalscope/perf/plugin/api/custom_api.py +212 -55
  39. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  40. evalscope/perf/plugin/api/default_api.py +105 -0
  41. evalscope/perf/plugin/api/openai_api.py +17 -19
  42. evalscope/perf/plugin/datasets/__init__.py +10 -7
  43. evalscope/perf/plugin/datasets/base.py +22 -1
  44. evalscope/perf/plugin/datasets/custom.py +2 -1
  45. evalscope/perf/plugin/datasets/flickr8k.py +4 -27
  46. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  47. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  48. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  49. evalscope/perf/plugin/datasets/openqa.py +2 -1
  50. evalscope/perf/plugin/datasets/random_dataset.py +15 -4
  51. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  52. evalscope/perf/plugin/registry.py +36 -16
  53. evalscope/perf/utils/benchmark_util.py +14 -20
  54. evalscope/perf/utils/db_util.py +79 -61
  55. evalscope/utils/io_utils.py +10 -0
  56. evalscope/version.py +2 -2
  57. {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/METADATA +54 -34
  58. {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/RECORD +65 -58
  59. tests/cli/test_all.py +18 -2
  60. tests/cli/test_run.py +25 -37
  61. tests/perf/test_perf.py +29 -2
  62. evalscope/models/model.py +0 -189
  63. {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
  64. {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
  65. {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
  66. {evalscope-0.17.0.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
@@ -1,10 +1,10 @@
1
1
  evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
2
2
  evalscope/arguments.py,sha256=QkxE8eGSryiyo9uDiNQNZUI3l_hGPYmhVz1-KHgtB6E,6044
3
3
  evalscope/config.py,sha256=1YfHXlIyYH70FQfi8TiUtpUH3VIRCh5YcbaayKZo5s4,6781
4
- evalscope/constants.py,sha256=1CYghe0fGccyiVgzMIHd2HIb6lOo9fmB-8pH_l99iI4,4014
4
+ evalscope/constants.py,sha256=Tc74W89SxeeEzISDzO5IoxSo9A_F0LqjH0mOrcAYJXc,3737
5
5
  evalscope/run.py,sha256=dL1deJ0J1RHW6X6ZStXzAVL7NwbjW6McfdOMkCpWrtc,7012
6
6
  evalscope/summarizer.py,sha256=ZLFDHmi0Bgo18ouQsxuUl9vmIES9zkoapLLWRLhy19Q,5911
7
- evalscope/version.py,sha256=IZr-isfEmPkZ2eTCGlS5vvkiE5fMlg3HeXpgVmjGGJY,119
7
+ evalscope/version.py,sha256=wsTu-_Fq9Dmfg7bXg6eDVtNwZA5ui-MZ6IPs4EhytAc,119
8
8
  evalscope/app/__init__.py,sha256=HWLXld_JXcBDsdL4L_4E8JsKyuBwwPUSwlejKnZ3HKc,579
9
9
  evalscope/app/app.py,sha256=8mSBp8qUCCmqupV4FEPMPdT9jL-bYu4DdH2qj8P0ktk,776
10
10
  evalscope/app/arguments.py,sha256=1wHTLeFx1G94cKXYOeOVe_wTiOY2D929UctIRGOtRaQ,699
@@ -69,9 +69,9 @@ evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4ty
69
69
  evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=jlwM13Ty-Ax6AeMsNlo9xIBupNFgnceYuXtCmh0hNTQ,6160
70
70
  evalscope/benchmarks/__init__.py,sha256=NVd_VvmkY36LxdHNmgeogSBwMFfWoLJAZF8vDg-CoFc,1308
71
71
  evalscope/benchmarks/benchmark.py,sha256=uZ_-Y_wPhy6TxufWiElF4BwEWN93azT1JHtGRW8tR-w,2633
72
- evalscope/benchmarks/data_adapter.py,sha256=t_IOA6hvPrF_mrAzwgS-HP1aRQ_sI-3s9oSpRxmtFLg,23475
72
+ evalscope/benchmarks/data_adapter.py,sha256=UI4HpnJNYo18GXRiU0HwNUxjRfoSXlCB-xEBIGs2ckg,23914
73
73
  evalscope/benchmarks/filters.py,sha256=x_NX40uWMmUsVrAGHCeeV2e63HZZFugWUgdUhk64ivM,1523
74
- evalscope/benchmarks/utils.py,sha256=37Pn9SxRqF0WoLR7LcGJF9xASh4VxcUL93v03WHmnh8,1813
74
+ evalscope/benchmarks/utils.py,sha256=mIk8n6zVMICQ5JWMyEwUqwlkxva4L-oD5SZzpIKw1sI,1851
75
75
  evalscope/benchmarks/aigc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
76
76
  evalscope/benchmarks/aigc/t2i/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
77
77
  evalscope/benchmarks/aigc/t2i/base.py,sha256=4GFAvceT1Gpt5teDLRCZi62RwvPazuhG3zwft3gN3X4,2102
@@ -121,7 +121,7 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt
121
121
  evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
122
122
  evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
123
123
  evalscope/benchmarks/bfcl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
124
- evalscope/benchmarks/bfcl/bfcl_adapter.py,sha256=MQPlfMvTQYHA4EP5g7eNzXDs4A4QvgYOiGC458Z39q4,10080
124
+ evalscope/benchmarks/bfcl/bfcl_adapter.py,sha256=ThDOYrJY_RdXMLSC1S9lP-8zYd1syZWpcrXXV1ZPLVs,10100
125
125
  evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
126
126
  evalscope/benchmarks/ceval/ceval_adapter.py,sha256=V_TC_E0lKXaFcV_qIdrg2_iddmGJ4um8iIdaXVaK_EM,11146
127
127
  evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
@@ -149,9 +149,9 @@ evalscope/benchmarks/general_arena/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQe
149
149
  evalscope/benchmarks/general_arena/general_arena_adapter.py,sha256=j2aDzikz9obxvrR-damdvSCXR0rfjEo-OzX8vujj2N0,19887
150
150
  evalscope/benchmarks/general_arena/utils.py,sha256=u0q4FNIOFka1_gC344OCvBXUz89Ah6M8asjIXbNSweM,7188
151
151
  evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
152
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=B5zMiywH06NWOZsxAOwP-4aE3DbJB3Oyi9tlbM2BEHU,5181
152
+ evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=cPN-p0tndjocQYqfc6OFkT5k8KL7kkVklmOtps-F08Y,5391
153
153
  evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
154
- evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=qdiH_XoWy3JNrBaUxl9S0bH16k0gXcx8dexZQflH74o,5443
154
+ evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=dpIGe635CoW4ejVohVwcarBxSckqvlnxcJ2ElpRlQ9o,5669
155
155
  evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
156
156
  evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
157
157
  evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=J6RfxpUT1l8Jj3vT_Vtsn1z8MKCg32XTlKn_eihCI50,5071
@@ -161,9 +161,11 @@ evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=IBMdsvQ1w45_raCiACTBm7DVHtOYf
161
161
  evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
162
162
  evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
163
163
  evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=kgHz-n8_93J8DdR7XBlzfM2KDRoKcvg80h6CCjWv_Xk,6191
164
+ evalscope/benchmarks/hle/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
165
+ evalscope/benchmarks/hle/hle_adapter.py,sha256=ts38e-AqtUcbfc6VqRtWLacZDh7KzSm4rj7xKm9vTFc,4445
164
166
  evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
165
167
  evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
166
- evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=O6muXpiBrQ9RGSglnl3gS0yO6BSkQtXASMR9yXUfhEE,5515
168
+ evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=ZqNG3L8yMY44B7HleUjlSbVG-GLk9RBsvaGWOm2fQVw,4788
167
169
  evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
168
170
  evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=2oStqiTD4w2f2n0kbjcbg7GJQfKCsHFieokQcNndWb4,2041
169
171
  evalscope/benchmarks/ifeval/instructions.py,sha256=oaJ9D_4rvS67BraHBNPpDtFd4TblFAnR4A3YW9HWfuY,56304
@@ -186,7 +188,7 @@ evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5N
186
188
  evalscope/benchmarks/math_500/math_500_adapter.py,sha256=Oc9XnBgMAjEerYAk3GtY2TTKm1QH_UI896kUuW2_a5Y,2324
187
189
  evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
188
190
  evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
189
- evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=58z0BQDYq1TKTzFVDbVdpJVlBOv0pJtuAu7uS8gVwbA,12111
191
+ evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=Rhi-J6oGWawRVBk38ZgXk8-XrZ7wL8sf4zrncU73jgs,12111
190
192
  evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
191
193
  evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
192
194
  evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=uglOOZBZfQBIuJOG7iT4THk2LNcfHQoakxQDpS4jB1U,4554
@@ -211,8 +213,10 @@ evalscope/benchmarks/super_gpqa/five_shot_prompt.txt,sha256=CQxRszzUrSIygOSd1G10
211
213
  evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=ce99v28wkhlGnfmihwpv3ikTqy3aumT8Jzm1LGxz-ck,10147
212
214
  evalscope/benchmarks/super_gpqa/utils.py,sha256=ftYPP9ODvLBlQSd9ltACx9iRIvjB8u1bg4AtgcJ4JAI,3360
213
215
  evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt,sha256=XZb0CN83YbfH2dF-iIV-ciNLbIb3ON220qHe7zf8KF0,247
216
+ evalscope/benchmarks/tau_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
217
+ evalscope/benchmarks/tau_bench/tau_bench_adapter.py,sha256=5_VgRUtEjeZ-8gRZj4cnwwso1GUqf2GB49AlI4xqyDM,4221
214
218
  evalscope/benchmarks/tool_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
215
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py,sha256=_QNncuCCMhhjsWzB934sYF-k010fKUdhhAOWrJ9LKDA,2813
219
+ evalscope/benchmarks/tool_bench/tool_bench_adapter.py,sha256=fy6Hb84cm6s-pOoQXmT-N8D1OUYVGCuq77-2xwM_WLA,3093
216
220
  evalscope/benchmarks/tool_bench/utils.py,sha256=led0d-Pa3rvmWkSWhEnZWP00fceudgESq5HXAQzJGls,7042
217
221
  evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
218
222
  evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
@@ -234,13 +238,13 @@ evalscope/collections/__init__.py,sha256=3v7tVLcJk86FeNBrxw3pWhu_lcpKYrnT_dDACCe
234
238
  evalscope/collections/evaluator.py,sha256=RJ337S0sy8dsV25I2OAxeWgSx_HrmXTyuuHKSt9vQtM,17474
235
239
  evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
236
240
  evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
237
- evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
238
- evalscope/evaluator/evaluator.py,sha256=0vxXyNt1v2wjzUXwC_7nc_-3wmT18yQ-QKDA9DDVXpA,22441
239
- evalscope/metrics/__init__.py,sha256=LlqXdOPiWTTAzxuKdUwTYO0KgN3Zh1zs18H2sM_5o5I,1709
240
- evalscope/metrics/completion_parsers.py,sha256=LJnHD_ea7SLfRVXIAHuzeJx0mAgbpzmf3VYcQRL-CdA,8733
241
- evalscope/metrics/llm_judge.py,sha256=9DCT4p9llNSVUc-K6inrHGpHVdReHlEz_kJQyRezvz8,8268
241
+ evalscope/evaluator/__init__.py,sha256=XqPnEp5MvfRwC5M5cEeOAC0-MMEPxBIESqiSa3YMBgo,84
242
+ evalscope/evaluator/evaluator.py,sha256=HKEF2k0S_dJR8cF9lrqf_W4diXbb6H3L81pD6XcmLiA,22481
243
+ evalscope/metrics/__init__.py,sha256=CH3bNyRx9dJ3gOqNwKDlaZ7zan4MShM0h8SnzarjokU,1851
244
+ evalscope/metrics/completion_parsers.py,sha256=56ZNzOfNU0O1ba9fs9Cyi4Vk_YUmcgWUbxW0SJ2KrlU,8974
245
+ evalscope/metrics/llm_judge.py,sha256=1hPFnGc3Szszqo21O618a7mxOgkdba3KsbZ66vvTbSA,8380
242
246
  evalscope/metrics/math_parser.py,sha256=JtOkj28XOtwoUACXOXLzCeRYz0rx0tBsQLQDU8cbC20,17311
243
- evalscope/metrics/metrics.py,sha256=f-KFVBJi6hOm9K7dFJSPCQDe5opEOzeb0z1YvhkKXb0,13797
247
+ evalscope/metrics/metrics.py,sha256=OLfvEljGbQnv-bBiFD-GR2On4mpZ0xhKxiKkjZfoDX8,14268
244
248
  evalscope/metrics/named_metrics.py,sha256=PrzU_1mGTeRFxVJFT1aXxIOiS7MnNoWyZsb8uCRVDeE,2278
245
249
  evalscope/metrics/rouge_metric.py,sha256=bqvSotuDdC0MEKmt8v6y6tBTBx0S3Ma-tfF-cMCckA4,4645
246
250
  evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
@@ -346,46 +350,49 @@ evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.p
346
350
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py,sha256=LqMHlUTy2LEzoVwjALtrAw0UYmzIuHnFjQiVmn5nv-I,605
347
351
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py,sha256=d4HInkL_Phk0Bgg2cWaOvhsPa6lkqDeovFW86PL0I18,6371
348
352
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py,sha256=XzebAHBAjOpkIMZm43dd55PESgmyq_J45Ji6bogYR3s,11204
349
- evalscope/models/__init__.py,sha256=yB4NuKvSd3Jd4GRQvJeGPxwigd8RJErdop5PzSQhsMY,1565
353
+ evalscope/models/__init__.py,sha256=x0Sna8mbujdOVqIYSGwIULbiPOue_Ifp-2JElSZsuMs,1481
350
354
  evalscope/models/local_model.py,sha256=UWsmZlWpT8JNGjijzZQKirvq4YywBkKOS9G-U2cuxAw,4115
351
- evalscope/models/model.py,sha256=MxvJAUNkuT7IA3bchnmJDur_YCKj9ShOD2Uq40dBcGc,6308
352
- evalscope/models/register.py,sha256=WiylzfL-vb6Bl3H3_RdIaBabVOAc9tiuhsQzYJDVzTg,1948
353
- evalscope/models/adapters/__init__.py,sha256=zmldx8yC_KTI8NDRcxNLyPzv19wc57UvOVvzwyuYnG4,647
354
- evalscope/models/adapters/base_adapter.py,sha256=TfINK84g4mqmHcnqvvHmk-MXRN2Pkan4yVlVd4j0nVY,3166
355
- evalscope/models/adapters/bfcl_adapter.py,sha256=KtreuJ21X1lcUGGhVgW3U62p3P65_oydMdBPtE5um-I,10332
356
- evalscope/models/adapters/chat_adapter.py,sha256=PAClyBL_nQ1I1kmjeeZ3sdC-y5ZmfFj8rjCigh_vr40,7885
357
- evalscope/models/adapters/choice_adapter.py,sha256=4fuz3MFEqK8ln4mMs3goMCdRPBwYmmgN70HTdr_sW_U,8005
358
- evalscope/models/adapters/custom_adapter.py,sha256=w8cD0b3xgcdhSZelcat67CGJnALOfz5IALzURnLjab8,2275
359
- evalscope/models/adapters/server_adapter.py,sha256=4fyC7fM_L_hn1SDqqDJAvMjEvBHVbTNF7xZHrO9bnhI,9616
360
- evalscope/models/adapters/t2i_adapter.py,sha256=xkMRyZ61yTiJfmULK-p9du4nNox41pkHiV2CTFBO3qM,2659
355
+ evalscope/models/register.py,sha256=G35J6BULFWwuqZO_rTkKBru1llZAyfPztcAASp_cb8M,1257
356
+ evalscope/models/adapters/__init__.py,sha256=WRaZsHlnz0MvGg9Jq565-XJjED-4cAyu4KbmrOhrHO4,688
357
+ evalscope/models/adapters/base_adapter.py,sha256=P4aicNmz1nsX9QLY9t4c6OIQPzIYfOhcrqjlAjR-ENY,3477
358
+ evalscope/models/adapters/bfcl_adapter.py,sha256=cG0vsQ3H2pmabo6tC0Y5Gonw0ng5-RFljDyRBMSj6xE,10422
359
+ evalscope/models/adapters/chat_adapter.py,sha256=epxA_on9ipsak8Lnkweh9en2AjVm5G0L1ARXYmDEEbk,8026
360
+ evalscope/models/adapters/choice_adapter.py,sha256=wIXnDcgnKaIMdhToaqy6fidhuZDpEz2vhxIB_V9u3Z8,8203
361
+ evalscope/models/adapters/custom_adapter.py,sha256=W8DIBiMWvHHcc0Mn9Frjj1YbpHRi7w-UQVJDiU2PakU,2400
362
+ evalscope/models/adapters/server_adapter.py,sha256=W6SXrPy-hZXpnISDjupu_j7bnmt-cP55sDojPXThitc,9701
363
+ evalscope/models/adapters/t2i_adapter.py,sha256=d6OviQFi_uN8PPXKrFpivk5Awm1O6wd_Gii8t3hVahY,2806
364
+ evalscope/models/adapters/tau_bench_adapter.py,sha256=jYGaj2L2wxtEiTdiSwZdY1XNkSzm6os7IvkxgK4msR0,6889
361
365
  evalscope/models/custom/__init__.py,sha256=MZylegALg1HerOYtp-qbzu4Wb6PW3JbrxwONHU-PAVs,131
362
366
  evalscope/models/custom/custom_model.py,sha256=rBccFVpCIfTGt9cgXLcxeUWc7w1sTRtbTO5w5qqQIQE,1405
363
367
  evalscope/models/custom/dummy_model.py,sha256=WpfrS3kvwRRdyThx9baaJ5vodYYh29VGRKsGKMWFflI,3124
364
368
  evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
365
- evalscope/perf/arguments.py,sha256=4YNmgTl4c76dvcL1GqxHsfRrs5cx6pvsT-6ss7weRC4,10415
366
- evalscope/perf/benchmark.py,sha256=cjUpJ3SRnZVBs_H24yqLh4WG_hcCADrniLG1VsmByb8,7901
367
- evalscope/perf/http_client.py,sha256=-c3-N7bxKsj3d5DVsKSaYA3XAHJDzZgoqZBbhuDYIGk,7419
368
- evalscope/perf/main.py,sha256=UdtzFWG5M9VDeuM2EvD6pqRNw5EubRadU74K-PnCLpU,3400
369
- evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
370
- evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
371
- evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
372
- evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqYbndU,2118
373
- evalscope/perf/plugin/api/custom_api.py,sha256=ssE4J8AynA0n5SnXSQyk7K5Co3dwUN6Opph08clZna0,3785
374
- evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
375
- evalscope/perf/plugin/api/openai_api.py,sha256=PmjBfIzzSuzcKiVOUeA2aPxihV0dZEzFlgmbrD2isME,7773
376
- evalscope/perf/plugin/datasets/__init__.py,sha256=Z6Jc0RxJS_z0nBBV1-b0-56Ija60AtQ7I_67gY6ZfdQ,568
377
- evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
378
- evalscope/perf/plugin/datasets/custom.py,sha256=-meul2hRmYvYAo--c_EtCnItRi5DvN7xxFOpq6vqdts,1346
379
- evalscope/perf/plugin/datasets/flickr8k.py,sha256=MbJKEB0XqZE0nDEenwYs0FLH9QL658Vn9uQmUH4hPvk,1605
380
- evalscope/perf/plugin/datasets/line_by_line.py,sha256=AqZYG6tVL3BIGnzh_2Tev8lDYezJG_1gqJY8bSNQl3Q,957
381
- evalscope/perf/plugin/datasets/longalpaca.py,sha256=XelLris0-c3StLInQ-Oav4jqGcXPNfJxEDeYvaetEbI,1297
382
- evalscope/perf/plugin/datasets/openqa.py,sha256=4Pnx5duFJzoiTUfZCbcK7LO8f-skmcpYNUUrtNR_UUc,1463
383
- evalscope/perf/plugin/datasets/random_dataset.py,sha256=SIlsjAE_Stknfr6o1CBFvANBGCSgSExFbscLwSM_Gmk,2958
369
+ evalscope/perf/arguments.py,sha256=lG2IOOzxg29pdnF6IobzPcqEcYqopulFpVU2QzRaEJA,11429
370
+ evalscope/perf/benchmark.py,sha256=ZVmsSeKDUKkApt3y5tIMMFZAyAj3UNVT7JPp1fh5mhE,7880
371
+ evalscope/perf/http_client.py,sha256=l_OKL80kTP6sM_PEBvsJ1_TejYJdUQnE2UlB-ud1WQM,4588
372
+ evalscope/perf/main.py,sha256=WZbBgFhIj9KqxzC7_NZxDlou019_EXatsHRt5vqDhFg,3439
373
+ evalscope/perf/plugin/__init__.py,sha256=Ztj4h1_JYJqbbWkeuDTj5aTRyGQf5Woc4xEIyjcokVU,94
374
+ evalscope/perf/plugin/registry.py,sha256=GhLe-h1rGzya2bgIUaV5VymQIaHqI7h5SG_i4PoGAm8,1967
375
+ evalscope/perf/plugin/api/__init__.py,sha256=7RsGdYTSfnW6iVpveEzNu8v4x8Yc8H-Kk39DqOHMrd4,152
376
+ evalscope/perf/plugin/api/base.py,sha256=9cX4xwTzy5ycnWqmQqRGMLasTEX6jVlobtADkh1KwXE,2782
377
+ evalscope/perf/plugin/api/custom_api.py,sha256=f8rUixcV9mTxoYyabu3wedEC4YVB70Yw6Az1NpfeWPQ,10375
378
+ evalscope/perf/plugin/api/dashscope_api.py,sha256=Miv2pzMa6sxZyYYJhCzcbOI_QHuZx7tazKpb6Not7ck,3627
379
+ evalscope/perf/plugin/api/default_api.py,sha256=kjuHQ-zRHe5WU4ofSzWBpWbIxBQBOh_ucu1z2g62gWg,4315
380
+ evalscope/perf/plugin/api/openai_api.py,sha256=Mt_VedJUaCH3g-oVSJ_fsGcPk0KkspSzIMkrkih2Zb0,7777
381
+ evalscope/perf/plugin/datasets/__init__.py,sha256=qzeQ9BrJhiJJm1wHaFeOQkvXXdSd15Ucspbn5zjs-6Q,495
382
+ evalscope/perf/plugin/datasets/base.py,sha256=-3Ihnp2hYvZyPnP8Gh2Pu8ovlLNFHyZnNgRu3WHG4d0,2714
383
+ evalscope/perf/plugin/datasets/custom.py,sha256=UuOk8xYfSYyyYZL3U4grUjtfQhWHHZeAEC63n_4Siuw,1376
384
+ evalscope/perf/plugin/datasets/flickr8k.py,sha256=IXz5uu5SlqF1l_tJ_ITr2vx_R_d7gxWzqPuyEOx7rYo,1043
385
+ evalscope/perf/plugin/datasets/kontext_bench.py,sha256=XjKzr7nMzI3cfk83IH0PH1TNJaQMRXUpACnzFfP2n6g,1091
386
+ evalscope/perf/plugin/datasets/line_by_line.py,sha256=c3ydW4GqxkG0vl2g64jG0vBMql2FuFPyWh3mgkIh9Do,987
387
+ evalscope/perf/plugin/datasets/longalpaca.py,sha256=VnMjdHl_JV3NmZ6wRxVlJ99e8PYSjQTcVxoTkl21Ei0,1327
388
+ evalscope/perf/plugin/datasets/openqa.py,sha256=33AR419IrH-FxZRjjcYdAIEZXaX4TKEoirVVfX--N9I,1493
389
+ evalscope/perf/plugin/datasets/random_dataset.py,sha256=NNAXvgFPkLDOSpYNex1DyE4X-ELtQRm13_oBooO30j8,3514
390
+ evalscope/perf/plugin/datasets/random_vl_dataset.py,sha256=F3yA9Ih3YO895lZKCo3i85LeKTzjvGcvhzc8UNN-gUI,3240
384
391
  evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=J6q7AF_Re5eHLVejXEw9c1jlk1T1PPmist0yO9UFTPE,2432
385
392
  evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
386
393
  evalscope/perf/utils/analysis_result.py,sha256=aoT7JD2zAzBeuZUfncKhJ2odX_7KnymwOmNB1Upam2c,935
387
- evalscope/perf/utils/benchmark_util.py,sha256=pv__38XjxkTqOfcREnod40WxeMJe4okDuVcYjyySDtg,7258
388
- evalscope/perf/utils/db_util.py,sha256=xqrXZapP_WwUdzkgFBTh3LDBWzr_UoU8v13rOjQ8TT4,9876
394
+ evalscope/perf/utils/benchmark_util.py,sha256=7bHpa5oaqcPJX7DSUkzK9assoFSHC27Q7-QylUOiklQ,7136
395
+ evalscope/perf/utils/db_util.py,sha256=TCdmoEx5iScL6h8wzucPojPwn6J1wTmQqX4sVk-ilHo,11630
389
396
  evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
390
397
  evalscope/perf/utils/local_server.py,sha256=RL9rGd5tEniZ0aErhHcbVXMX22YmujfE11T3j37VL8k,4684
391
398
  evalscope/perf/utils/log_utils.py,sha256=NWSK_ITG4yoVx5GMLbIRGDoXSs90s7X3mftdm37Os2U,1666
@@ -434,7 +441,7 @@ evalscope/utils/argument_utils.py,sha256=D7qOH85wf7LKh_cJ2X51OEaL7CMaddydmHZkfoY
434
441
  evalscope/utils/chat_service.py,sha256=U2jtrkOa2asRp16Zam0zIi_38mCyWQqql_L6JSwii4I,8749
435
442
  evalscope/utils/deprecation_utils.py,sha256=WyeiLWSi5ti6FkuMbhimcPPUB43paa1FZ5-JOAWNFZI,1344
436
443
  evalscope/utils/import_utils.py,sha256=BSdp7RQSZu67129TBbtJvMWU0CfCFu864K31eiM3pr8,2975
437
- evalscope/utils/io_utils.py,sha256=xvFvVu3Hy2HJFvemvREdFh6en7SNmfrsnikK-Mj-q6Q,6828
444
+ evalscope/utils/io_utils.py,sha256=2eEkLx4jhekgIV4vYL8yTN0PT6dbHUERMBZwmvxuiEc,7109
438
445
  evalscope/utils/logger.py,sha256=Q2IeV_0jxz8L34b5GddPeCKXVh0UClbuhjyLe5Wtj7M,3648
439
446
  evalscope/utils/model_utils.py,sha256=F1_WBHvBehWqrTd6kPtKICeeYucaZn5H0Gc3cCplYB8,2329
440
447
  tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
@@ -443,12 +450,12 @@ tests/utils.py,sha256=Fgm0CU6ilZjCGOfOMJH-Trxy0UIAGbhvy0Ijy_zDGUk,323
443
450
  tests/aigc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
444
451
  tests/aigc/test_t2i.py,sha256=XtVknpwlVMb6FSw3_WMFxMq0gZX6iG-ffdSQkcW2Fzw,3856
445
452
  tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
446
- tests/cli/test_all.py,sha256=GmY6g-EQCb_RJY4R76MeF9pvgYyzQHBxwn7Y_9BMwns,5866
453
+ tests/cli/test_all.py,sha256=IT0mxjiuHCC0PpT4z3oN1Bbr_0viMcm8GnShZ02kp8w,6333
447
454
  tests/cli/test_collection.py,sha256=bXWzccH822Y2B1Ed251U6TE8G_osI6MXYNxzmfv9kBI,4197
448
455
  tests/cli/test_custom.py,sha256=0YE-TCAeaQMRVRFla_TIvTd8d0USvvsSeqvYAD3NDNg,8796
449
- tests/cli/test_run.py,sha256=Al8-CZeoWZH-c-YIg6qUIKtSIfRdzlEBsgVsl-WMosk,17570
456
+ tests/cli/test_run.py,sha256=YKX2XCHPxnStgzzP67U90RV9r1MC2GM3JoGQqfZKqrI,17324
450
457
  tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
451
- tests/perf/test_perf.py,sha256=u82U7QYkA5JvR-iw0f4MNpWuEQOYid2g9cQ11ma7NAU,4844
458
+ tests/perf/test_perf.py,sha256=AEWvpN3ID6s-9MEoaZjQqUM8VVsqgk_v9KX8pDgvozA,5864
452
459
  tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
453
460
  tests/rag/test_clip_benchmark.py,sha256=13pcY3gYHNQh2KfEHCqtCSqiOcbngSJ1BlVZzI58JCE,2694
454
461
  tests/rag/test_mteb.py,sha256=fdNQIyUEzE7puPCKw5QhCHTEu7hz-ieHeq1xCWGh6IM,7246
@@ -459,9 +466,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=RwrKkc1WHEZxetM11cGL81G4faKCn7SYn4
459
466
  tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=UAUtOCQ72xbm8s-sov3cBEpYVDy189wpB-qOL3KoU7M,6053
460
467
  tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
461
468
  tests/vlm/test_vlmeval.py,sha256=EDQRkYfSyOICUwo_tm3p-puaE_xdFmqOPkrt5etxsqM,3307
462
- evalscope-0.17.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
463
- evalscope-0.17.0.dist-info/METADATA,sha256=EwH2JsfnfzkG-OSyvu-8lGjQ0aU6lsTuHMUwC_RDDTU,36893
464
- evalscope-0.17.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
465
- evalscope-0.17.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
466
- evalscope-0.17.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
467
- evalscope-0.17.0.dist-info/RECORD,,
469
+ evalscope-0.17.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
470
+ evalscope-0.17.1.dist-info/METADATA,sha256=1PRiimjOBZgSWjvT3iL4VcvdaWk8v3fGp9xCXLpM1Dw,38469
471
+ evalscope-0.17.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
472
+ evalscope-0.17.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
473
+ evalscope-0.17.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
474
+ evalscope-0.17.1.dist-info/RECORD,,
tests/cli/test_all.py CHANGED
@@ -39,7 +39,7 @@ datasets=[
39
39
  'general_mcq',
40
40
  'general_qa',
41
41
  'super_gpqa',
42
- 'live_code_bench',
42
+ # 'live_code_bench',
43
43
  'mmlu_redux',
44
44
  'simple_qa',
45
45
  'chinese_simpleqa',
@@ -53,8 +53,13 @@ datasets=[
53
53
  'docmath',
54
54
  'needle_haystack',
55
55
  'bfcl_v3',
56
+ 'hle',
57
+ 'tau_bench',
56
58
  ]
57
59
 
60
+ # Reverse the datasets list to ensure the order is from most recent to oldest
61
+ datasets.reverse()
62
+
58
63
  dataset_args={
59
64
  'mmlu': {
60
65
  'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
@@ -132,7 +137,18 @@ dataset_args={
132
137
  },
133
138
  'bfcl_v3':{
134
139
  'subset_list': ['simple', 'multiple']
135
- }
140
+ },
141
+ 'hle': {
142
+ 'subset_list': ['Math', 'Other'],
143
+ },
144
+ 'tau_bench': {
145
+ 'extra_params': {
146
+ 'user_model': 'qwen-plus',
147
+ 'api_key': env.get('DASHSCOPE_API_KEY'),
148
+ 'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
149
+ },
150
+ 'subset_list': ['airline'],
151
+ },
136
152
  }
137
153
 
138
154
  class TestRun(unittest.TestCase):
tests/cli/test_run.py CHANGED
@@ -259,14 +259,14 @@ class TestRun(unittest.TestCase):
259
259
  api_key= env.get('DASHSCOPE_API_KEY'),
260
260
  eval_type=EvalType.SERVICE,
261
261
  datasets=[
262
- 'iquiz',
262
+ # 'iquiz',
263
263
  # 'ifeval',
264
264
  # 'mmlu',
265
265
  # 'mmlu_pro',
266
266
  # 'musr',
267
267
  # 'process_bench',
268
268
  # 'race',
269
- # 'trivia_qa',
269
+ 'trivia_qa',
270
270
  # 'cmmlu',
271
271
  # 'humaneval',
272
272
  # 'gsm8k',
@@ -289,6 +289,8 @@ class TestRun(unittest.TestCase):
289
289
  # 'frames',
290
290
  # 'bfcl_v3',
291
291
  # 'truthful_qa',
292
+ # 'tau_bench',
293
+ # 'hle'
292
294
  ],
293
295
  dataset_args={
294
296
  'mmlu': {
@@ -297,7 +299,7 @@ class TestRun(unittest.TestCase):
297
299
  },
298
300
  'mmlu_pro': {
299
301
  'subset_list': ['math', 'health'],
300
- 'few_shot_num': 4
302
+ 'few_shot_num': 0
301
303
  },
302
304
  'ceval': {
303
305
  'subset_list': [
@@ -360,13 +362,23 @@ class TestRun(unittest.TestCase):
360
362
  # 'is_fc_model': False,
361
363
  }
362
364
  },
365
+ 'tau_bench': {
366
+ 'extra_params': {
367
+ 'user_model': 'qwen-plus',
368
+ 'api_key': env.get('DASHSCOPE_API_KEY'),
369
+ 'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
370
+ }
371
+ },
372
+ 'hle': {
373
+ 'subset_list': ['Math', 'Other'],
374
+ },
363
375
  },
364
- eval_batch_size=1,
365
- limit=5,
376
+ eval_batch_size=10,
377
+ limit=10,
366
378
  # debug=True,
367
379
  stream=True,
368
380
  generation_config={
369
- 'temperature': 0,
381
+ 'temperature': 0.6,
370
382
  'n': 1,
371
383
  'max_tokens': 4096,
372
384
  # 'extra_headers':{'key': 'value'},
@@ -377,35 +389,6 @@ class TestRun(unittest.TestCase):
377
389
  run_task(task_cfg=task_cfg)
378
390
 
379
391
 
380
- @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
381
- def test_run_batch_eval(self):
382
- from evalscope.config import TaskConfig
383
-
384
- task_cfg = TaskConfig(
385
- model='LLM-Research/Llama-3.2-1B-Instruct',
386
- datasets=[
387
- # 'math_500',
388
- # 'aime24',
389
- # 'competition_math'
390
- # 'arc',
391
- 'gsm8k'
392
- # 'truthful_qa'
393
- ],
394
- dataset_args={
395
- 'competition_math': {
396
- 'subset_list': ['Level 4', 'Level 5']
397
- }
398
- },
399
- eval_batch_size=2,
400
- limit=5,
401
- generation_config={
402
- 'max_new_tokens': 2048,
403
- 'temperature': 0.7,
404
- 'num_return_sequences': 2,
405
- }
406
- )
407
-
408
- run_task(task_cfg=task_cfg)
409
392
 
410
393
  @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
411
394
  def test_run_judge_model(self):
@@ -417,7 +400,7 @@ class TestRun(unittest.TestCase):
417
400
  api_key= env.get('DASHSCOPE_API_KEY'),
418
401
  eval_type=EvalType.SERVICE,
419
402
  datasets=[
420
- 'math_500',
403
+ # 'math_500',
421
404
  # 'aime24',
422
405
  # 'competition_math',
423
406
  # 'arc',
@@ -434,6 +417,7 @@ class TestRun(unittest.TestCase):
434
417
  # 'docmath',
435
418
  # 'needle_haystack',
436
419
  # 'ifeval',
420
+ 'hle'
437
421
  ],
438
422
  dataset_args={
439
423
  'needle_haystack': {
@@ -466,7 +450,10 @@ class TestRun(unittest.TestCase):
466
450
  },
467
451
  'frames': {
468
452
  'local_path': '/root/.cache/modelscope/hub/datasets/iic/frames'
469
- }
453
+ },
454
+ 'hle': {
455
+ 'subset_list': ['Math', 'Other'],
456
+ },
470
457
  },
471
458
  eval_batch_size=10,
472
459
  limit=3,
@@ -489,6 +476,7 @@ class TestRun(unittest.TestCase):
489
476
  },
490
477
  timeout=60000,
491
478
  stream=True,
479
+ use_cache='outputs/20250714_150626'
492
480
  # analysis_report=True,
493
481
  # debug=True,
494
482
  # use_cache='outputs/20250616_161931'
tests/perf/test_perf.py CHANGED
@@ -35,9 +35,9 @@ class TestPerf(unittest.TestCase):
35
35
  @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
36
36
  def test_run_perf_stream(self):
37
37
  task_cfg = {
38
- 'url': 'http://127.0.0.1:8000/v1/chat/completions',
38
+ 'url': 'http://127.0.0.1:8801/v1/chat/completions',
39
39
  'parallel': 1,
40
- 'model': 'qwen2.5',
40
+ 'model': 'Qwen2.5-0.5B-Instruct',
41
41
  'number': 15,
42
42
  'api': 'openai',
43
43
  'dataset': 'openqa',
@@ -145,5 +145,32 @@ class TestPerf(unittest.TestCase):
145
145
  print(metrics_result)
146
146
  print(percentile_result)
147
147
 
148
+ @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
149
+ def test_run_perf_random_vl(self):
150
+ from evalscope.perf.arguments import Arguments
151
+ task_cfg = Arguments(
152
+ parallel=[1, 2],
153
+ number=[2, 4],
154
+ model='qwen-vl-max',
155
+ url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
156
+ api_key=env.get('DASHSCOPE_API_KEY'),
157
+ api='openai',
158
+ dataset='kontext_bench',
159
+ min_tokens=100,
160
+ max_tokens=100,
161
+ prefix_length=0,
162
+ min_prompt_length=100,
163
+ max_prompt_length=100,
164
+ image_height=512,
165
+ image_width=512,
166
+ image_num=2,
167
+ tokenizer_path='Qwen/Qwen2.5-0.5B-Instruct',
168
+ seed=None,
169
+ extra_args={'ignore_eos': True}
170
+ )
171
+ metrics_result, percentile_result = run_perf_benchmark(task_cfg)
172
+ print(metrics_result)
173
+ print(percentile_result)
174
+
148
175
  if __name__ == '__main__':
149
176
  unittest.main(buffer=False)
evalscope/models/model.py DELETED
@@ -1,189 +0,0 @@
1
- # Copyright (c) Alibaba, Inc. and its affiliates.
2
- import os
3
- import time
4
- from abc import ABC, abstractmethod
5
- from typing import Any, List
6
-
7
- from evalscope.utils.logger import get_logger
8
-
9
- logger = get_logger()
10
-
11
-
12
- class BaseModel(ABC):
13
-
14
- def __init__(self, model_cfg: dict, **kwargs):
15
- """
16
- Base model class.
17
-
18
- Args:
19
- model_cfg (dict): The model configuration. Depending on the specific model. Example:
20
- {'model_id': 'modelscope/Llama-2-7b-chat-ms', 'revision': 'v1.0.0'}
21
-
22
- **kwargs: kwargs
23
- """
24
- self.model_cfg: dict = model_cfg
25
- self.kwargs = kwargs
26
-
27
- @abstractmethod
28
- def predict(self, *args, **kwargs) -> Any:
29
- """
30
- Model prediction func.
31
- """
32
- raise NotImplementedError
33
-
34
-
35
- class ChatBaseModel(BaseModel):
36
-
37
- def __init__(self, model_cfg: dict, **kwargs):
38
- """
39
- Chat base model class. Depending on the specific model.
40
-
41
- Args:
42
- model_cfg (dict):
43
- {'model_id': 'modelscope/Llama-2-7b-chat-ms', 'revision': 'v1.0.0', 'device_map': 'auto'}
44
-
45
- **kwargs: kwargs
46
- """
47
- super(ChatBaseModel, self).__init__(model_cfg=model_cfg, **kwargs)
48
-
49
- @abstractmethod
50
- def predict(self, inputs: dict, **kwargs) -> dict:
51
- """
52
- Model prediction func. The inputs and outputs are compatible with OpenAI Chat Completions APIs.
53
- Refer to: https://platform.openai.com/docs/guides/gpt/chat-completions-api
54
-
55
- # TODO: follow latest OpenAI API
56
-
57
- Args:
58
- inputs (dict): The input prompts and history. Input format:
59
- {'messages': [
60
- {'role': 'system', 'content': 'You are a helpful assistant.'},
61
- {'role': 'user', 'content': 'Who won the world series in 2020?'},
62
- {'role': 'assistant', 'content': 'The Los Angeles Dodgers won the World Series in 2020.'},
63
- ]
64
- 'history': [
65
- {'role': 'system', 'content': 'Hello'},
66
- {'role': 'user', 'content': 'Hi'}]
67
- }
68
-
69
- kwargs (dict): Could be inference configuration. Default: None.
70
- cfg format: {'max_length': 1024}
71
-
72
- Returns: The result format:
73
- {
74
- 'choices': [
75
- {
76
- 'index': 0,
77
- 'message': {
78
- 'content': 'The 2020 World Series was played in Texas at Globe Life Field in Arlington.',
79
- 'role': 'assistant'
80
- }
81
- }
82
- ],
83
- 'created': 1677664795,
84
- # For models on the ModelScope or HuggingFace, concat model_id and revision with "-".
85
- 'model': 'gpt-3.5-turbo-0613',
86
- 'object': 'chat.completion',
87
- 'usage': {
88
- 'completion_tokens': 17,
89
- 'prompt_tokens': 57,
90
- 'total_tokens': 74
91
- }
92
- }
93
- """
94
- raise NotImplementedError
95
-
96
-
97
- # TODO: Remove this class after refactoring all models
98
- class OpenAIModel(ChatBaseModel):
99
- """
100
- APIs of OpenAI models.
101
- Available models: gpt-3.5-turbo, gpt-4
102
- """
103
-
104
- MAX_RETRIES = 3
105
-
106
- def __init__(self, model_cfg: dict, **kwargs):
107
- super(OpenAIModel, self).__init__(model_cfg=model_cfg, **kwargs)
108
-
109
- openai_api_key = os.environ.get('OPENAI_API_KEY', None)
110
- self.api_key = self.model_cfg.get('api_key', openai_api_key)
111
-
112
- if not self.api_key:
113
- logger.error('OpenAI API key is not provided, please set it in environment variable OPENAI_API_KEY')
114
- # raise ValueError(
115
- # 'OpenAI API key is not provided, '
116
- # 'please set it in environment variable OPENAI_API_KEY')
117
-
118
- def predict(self, model_id: str, inputs: dict, **kwargs) -> dict:
119
-
120
- sys_prompt: str = inputs.get('sys_prompt', '')
121
- user_prompt: str = inputs.get('user_prompt', '')
122
-
123
- # model_id: str = kwargs.get('model_id', '')
124
- temperature: float = kwargs.pop('temperature', 0.2)
125
- max_tokens: int = kwargs.pop('max_tokens', 1024)
126
- mode: str = kwargs.pop('mode', 'chat.completion')
127
-
128
- logger.info(f'Using OpenAI model_id: {model_id}')
129
-
130
- res = self._predict(
131
- model_id=model_id,
132
- sys_prompt=sys_prompt,
133
- user_prompt=user_prompt,
134
- temperature=temperature,
135
- max_tokens=max_tokens,
136
- mode=mode)
137
-
138
- return res
139
-
140
- def _predict(
141
- self,
142
- model_id,
143
- sys_prompt,
144
- user_prompt,
145
- temperature,
146
- max_tokens,
147
- mode: str = 'chat.completion',
148
- ) -> dict:
149
- import openai
150
-
151
- res = {}
152
- openai.api_key = self.api_key
153
-
154
- for i in range(self.MAX_RETRIES):
155
- try:
156
- if mode == 'chat.completion':
157
- resp = openai.ChatCompletion.create(
158
- model=model_id,
159
- messages=[{
160
- 'role': 'system',
161
- 'content': sys_prompt
162
- }, {
163
- 'role': 'user',
164
- 'content': user_prompt
165
- }],
166
- temperature=temperature,
167
- max_tokens=max_tokens)
168
-
169
- if resp:
170
- ans_text = resp['choices'][0]['message']['content']
171
- model_id = resp['model']
172
- else:
173
- logger.warning(f'OpenAI GPT API call failed: got empty response '
174
- f'for input {sys_prompt} {user_prompt}')
175
- ans_text = ''
176
- model_id = ''
177
-
178
- res['ans_text'] = ans_text
179
- res['model_id'] = model_id
180
- else:
181
- raise ValueError(f'Invalid mode: {mode}')
182
-
183
- return res
184
-
185
- except Exception as e:
186
- logger.warning(f'OpenAI API call failed: {e}')
187
- time.sleep(3)
188
- logger.error(f'OpenAI API call failed after {self.MAX_RETRIES} retries')
189
- return res