evalscope 1.0.1__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (87) hide show
  1. evalscope/api/benchmark/adapters/default_data_adapter.py +6 -4
  2. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  3. evalscope/api/benchmark/adapters/text2image_adapter.py +5 -4
  4. evalscope/api/benchmark/adapters/vision_language_adapter.py +3 -1
  5. evalscope/api/benchmark/benchmark.py +27 -2
  6. evalscope/api/benchmark/meta.py +3 -0
  7. evalscope/api/evaluator/evaluator.py +5 -0
  8. evalscope/api/evaluator/state.py +5 -0
  9. evalscope/api/messages/chat_message.py +6 -1
  10. evalscope/api/mixin/__init__.py +1 -0
  11. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  12. evalscope/api/mixin/sandbox_mixin.py +204 -0
  13. evalscope/api/model/generate_config.py +0 -3
  14. evalscope/api/model/model.py +1 -1
  15. evalscope/api/tool/tool_info.py +1 -1
  16. evalscope/arguments.py +6 -0
  17. evalscope/benchmarks/ai2d/__init__.py +0 -0
  18. evalscope/benchmarks/ai2d/ai2d_adapter.py +53 -0
  19. evalscope/benchmarks/amc/__init__.py +0 -0
  20. evalscope/benchmarks/amc/amc_adapter.py +46 -0
  21. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  22. evalscope/benchmarks/bfcl/bfcl_adapter.py +141 -2
  23. evalscope/benchmarks/bfcl/generation.py +7 -7
  24. evalscope/benchmarks/drop/drop_adapter.py +1 -1
  25. evalscope/benchmarks/healthbench/__init__.py +0 -0
  26. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  27. evalscope/benchmarks/healthbench/utils.py +102 -0
  28. evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
  29. evalscope/benchmarks/humaneval/utils.py +235 -0
  30. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  31. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
  32. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  33. evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
  34. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  35. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
  36. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  37. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  38. evalscope/benchmarks/mm_star/__init__.py +0 -0
  39. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  40. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +4 -9
  41. evalscope/benchmarks/multi_if/__init__.py +0 -0
  42. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  43. evalscope/benchmarks/multi_if/metrics.py +120 -0
  44. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  45. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -4
  46. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  47. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  48. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  49. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  50. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  51. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  52. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  53. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +6 -1
  54. evalscope/config.py +24 -1
  55. evalscope/constants.py +3 -0
  56. evalscope/evaluator/evaluator.py +25 -7
  57. evalscope/metrics/metric.py +27 -2
  58. evalscope/models/model_apis.py +10 -8
  59. evalscope/models/utils/openai.py +1 -2
  60. evalscope/perf/arguments.py +2 -0
  61. evalscope/perf/plugin/api/base.py +2 -2
  62. evalscope/perf/plugin/api/default_api.py +7 -7
  63. evalscope/perf/plugin/api/openai_api.py +83 -19
  64. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  65. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  66. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  67. evalscope/perf/utils/benchmark_util.py +1 -2
  68. evalscope/report/combinator.py +0 -25
  69. evalscope/report/report.py +8 -4
  70. evalscope/run.py +1 -1
  71. evalscope/utils/function_utils.py +41 -0
  72. evalscope/utils/import_utils.py +63 -13
  73. evalscope/utils/io_utils.py +19 -11
  74. evalscope/utils/json_schema.py +23 -2
  75. evalscope/utils/logger.py +19 -0
  76. evalscope/utils/model_utils.py +1 -1
  77. evalscope/version.py +2 -2
  78. {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/METADATA +6 -10
  79. {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/RECORD +87 -59
  80. tests/benchmark/test_eval.py +51 -7
  81. tests/benchmark/test_sandbox.py +81 -0
  82. tests/benchmark/test_vlm.py +60 -3
  83. tests/perf/test_perf.py +40 -12
  84. {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/LICENSE +0 -0
  85. {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/WHEEL +0 -0
  86. {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/entry_points.txt +0 -0
  87. {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/top_level.txt +0 -0
@@ -1,47 +1,48 @@
1
1
  evalscope/__init__.py,sha256=oivLvqwNw2JlB-h-Z8_525IpfKcYEkS51F59tEfpy5w,445
2
- evalscope/arguments.py,sha256=OthHwNhG9VrP7_CYocmjZ4iVyG5LJbzO0FhseoLBalk,5663
3
- evalscope/config.py,sha256=NVFXbU0kVof2V8Bnjs-O2FEPdlXx3rZuoHcttm1THbM,10564
4
- evalscope/constants.py,sha256=cbkKHmEcJHF9T0m4yREx08__tulj6MV59im2RW-pR3c,3433
5
- evalscope/run.py,sha256=1JjqSky3Fm3v1tOE9pgR7alODoSNWa4ZdoLTWFLgjRE,6510
2
+ evalscope/arguments.py,sha256=jKAF47PsqXRioU21gRHw9hxJnfR31z_X7c__glRY5ns,6257
3
+ evalscope/config.py,sha256=S2N11-AxQkT7lVffpjXdtpT4QpnSP6th-c8I-501mwM,11507
4
+ evalscope/constants.py,sha256=W3E4Jp-x6qxvPOYtU9bNlzlERFvSAA_3F007apIwUlU,3601
5
+ evalscope/run.py,sha256=A9_7pR3FiA-It46A3Mqk7ce6fQy548p0ux2QUugj2hI,6531
6
6
  evalscope/summarizer.py,sha256=HUDJ1zKi22uNst3AUfX67Z0sHzeZy-4S8sYyvxJnBzc,5901
7
- evalscope/version.py,sha256=5Jk88EAyvBpPzsQaFYKGjukIwF3tVCXIrarT94bYsCQ,118
7
+ evalscope/version.py,sha256=H_zHGJkiB6equdW6Jo4F_hhdLYKZqriowav05O5_CeY,118
8
8
  evalscope/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
9
  evalscope/api/registry.py,sha256=Qk0KMGDbt-iI0-OfoJZbOtxt76qreAVWh36HOoQAKM4,5448
10
10
  evalscope/api/benchmark/__init__.py,sha256=9xcTxpcQ6HhZ0QDwEIZhAT5IjybzaJ60VGLcmaFE5dU,188
11
- evalscope/api/benchmark/benchmark.py,sha256=q5hmEH845DfmvEB1NvlHM1b-oCCMpatIamT-2ubudbM,10088
12
- evalscope/api/benchmark/meta.py,sha256=G6Q5E1JwO-CpEwsjhMrXHExlVRUF1Ah5Nz21vkP8IV0,4218
11
+ evalscope/api/benchmark/benchmark.py,sha256=gqAM81SeGb_Q0rA6Q-LFpnNkOUiwOj43aRWECtCxAOE,10832
12
+ evalscope/api/benchmark/meta.py,sha256=N4u8NQjkjIw-xaf6KFnb6C8JDKB0DLbsXyXblDqIpvE,4304
13
13
  evalscope/api/benchmark/adapters/__init__.py,sha256=uLt_GiU4s-_6Rjgmr4OUTtE7dvEX-ZIQ403fd6oNuxA,264
14
- evalscope/api/benchmark/adapters/default_data_adapter.py,sha256=Y8wzOxq3qpbE2lgLZyXHxoLUxjlmbS-N6ByObrBwOvc,27977
14
+ evalscope/api/benchmark/adapters/default_data_adapter.py,sha256=WS4Pm0pk51Se196Ho31FmOqGyOajTtUGbbjWD9U7UwU,28064
15
15
  evalscope/api/benchmark/adapters/image_edit_adapter.py,sha256=06V-_A8RKuMNYMt7-vaXn2qBa9LIZgfFO_6PUuhAkh0,3052
16
- evalscope/api/benchmark/adapters/multi_choice_adapter.py,sha256=wp_6Kws3GoBk_mSzQP8Nr40osFf3iPJpntkANYAuIcc,2979
17
- evalscope/api/benchmark/adapters/text2image_adapter.py,sha256=4mccYHKB-9iyOZ0uwkTi2TgC76KIJpcu_4hnfbU5NMc,6434
18
- evalscope/api/benchmark/adapters/vision_language_adapter.py,sha256=N9LPh5tTGkvRYzp4giI0La0u4xzrHcJGhdTY9jiNCxY,219
16
+ evalscope/api/benchmark/adapters/multi_choice_adapter.py,sha256=auqLNvF50Or9bo3LOmQLXHfFaTTCTqvQzZog3glInng,3062
17
+ evalscope/api/benchmark/adapters/text2image_adapter.py,sha256=jO64hwjQexIv-MTyHH0Ffp_6p--9TKufOmX_U39mAnE,6385
18
+ evalscope/api/benchmark/adapters/vision_language_adapter.py,sha256=5d7ITkeosikb7u0ag0WkMaZ0SAYGkR_wKM9NP495GKk,280
19
19
  evalscope/api/dataset/__init__.py,sha256=RHFMzwfONEqmmn3vRtxyN3r29mipDUUUSEDhuwm0YpQ,147
20
20
  evalscope/api/dataset/dataset.py,sha256=9bwSx89zgOOBRQkRPVv-B5Yi30A6J1MLtekQSqwsy9g,11328
21
21
  evalscope/api/dataset/loader.py,sha256=t7KLH5ltLUumhiPIyYJzk6zn2iKLx-D2gIIoMhKdnhc,9714
22
22
  evalscope/api/dataset/utils.py,sha256=3E0ikqr6QWV_lX0d3Z4F4xFuVTcwbeDPgCvJY7v83Bc,4935
23
23
  evalscope/api/evaluator/__init__.py,sha256=-Ure6X4GlE7VYSNWSZ_DpjbUBGa5irVTymLENEHTYqY,138
24
24
  evalscope/api/evaluator/cache.py,sha256=a_M2ouUjtkMr5m3wRbmsE8ETP_aacxbm0d38yY5RljM,13244
25
- evalscope/api/evaluator/evaluator.py,sha256=SGW4RIKc79IlUP5FisrEycJlqORcaYxyIP5eabaSfeU,1600
26
- evalscope/api/evaluator/state.py,sha256=OyZUtQw9Wd6X8MA2mtmTGn74ReBq1x-JfWwV_TT99UY,8892
25
+ evalscope/api/evaluator/evaluator.py,sha256=xMF4w2qiQ7NNgOhSKs9Vd4VZ33SCDwTTJ82lDhaj1FQ,1734
26
+ evalscope/api/evaluator/state.py,sha256=Elz2cmbvOOqvOaEOAMatxgk4BdjqDZB3XKTaL4iqJLI,9039
27
27
  evalscope/api/filter/__init__.py,sha256=5eWKjT-dAiz8nE0S6WnU6plqjXZHYn7CJOgFiHSoovM,66
28
28
  evalscope/api/filter/filter.py,sha256=fsPddaHE5wwFIXgUWITFqlYXqdh6vx3QqcEf3rSXKVI,2068
29
29
  evalscope/api/messages/__init__.py,sha256=UKZ9VVCt7NPrcZXv_1e8MZ8mOWu0eLRvMIXykpJPZ9I,378
30
- evalscope/api/messages/chat_message.py,sha256=LZ3Yv_Ts5ASCfrq2y_zecpY3IN5lzHsRbaxz8WRQgD8,9698
30
+ evalscope/api/messages/chat_message.py,sha256=D88TklSAWOaG21EBDVDoRPwzVCqzEGbVW4sA8Af4axc,10053
31
31
  evalscope/api/messages/content.py,sha256=gUBUeK60BUhkwoulyzKL6q0iMt3VLlah9onLG1XVrWY,2772
32
32
  evalscope/api/messages/utils.py,sha256=uqlEbYEoUKpXLW8tQtP-cY5Miq7W0Xl6a98j55u6m6E,1266
33
33
  evalscope/api/metric/__init__.py,sha256=Cj2F8eiVny5uNtfPXKwQDq2owlHVKNzfr-COLYMEox4,106
34
34
  evalscope/api/metric/metric.py,sha256=XkjBqpZbFYynhTIH8WawfPmItbDQ6jWufE_ox9zDPCU,1568
35
35
  evalscope/api/metric/scorer.py,sha256=dczSQwkRmPk1uvNCMGT5G6nYbwWTcpwsZtyYXWkrJII,3749
36
- evalscope/api/mixin/__init__.py,sha256=DpHdR7t9d-HUzBXxwsW3t5MxM4kgoThQ4WF8s8EuSBY,43
37
- evalscope/api/mixin/llm_judge_mixin.py,sha256=KPNH41IL7md5XEYqC2ZbmnYm4tIrV-MgxpfKOWbYsMc,5624
36
+ evalscope/api/mixin/__init__.py,sha256=xBuoTuao5o_EFThgeeeWI87x64Q12aJttsaZc8gak_c,83
37
+ evalscope/api/mixin/llm_judge_mixin.py,sha256=ECVDfxCeAEkymFssD7xKhIDcct2qgQTqGnbijXk9leE,5675
38
+ evalscope/api/mixin/sandbox_mixin.py,sha256=uKqBtTtttKwrUArY-CTMDdFHjRBOR7Kl1sxaGHe-S2Q,7653
38
39
  evalscope/api/model/__init__.py,sha256=YxKdz1IKUt6eYoC7nx81yD2BtyiWQDvaoTcc8O9lvoE,286
39
- evalscope/api/model/generate_config.py,sha256=SyUNlZhcoBpLlMK8esu1XQs61SSPN_D5QN8TRUcnroI,7760
40
- evalscope/api/model/model.py,sha256=HecfGqaaB201n7I1pZ5Q4_aVC-xLA93uxdGgoreRYFw,12771
40
+ evalscope/api/model/generate_config.py,sha256=wQeDknXb49yBKSRL9rlIyerPobGXqU-A4hL1vySNGPo,7656
41
+ evalscope/api/model/model.py,sha256=c7YVbYYk47MHWwPjoB66xWjgmHdUGTOSOdtIsLcJfyc,12782
41
42
  evalscope/api/model/model_output.py,sha256=NeN6bLtAvg_3fTirewWfdP-_x4SJXa9pGuRpyXJY3B8,9333
42
43
  evalscope/api/tool/__init__.py,sha256=bEaW5ryY-erLcl2zMoDJNgiaBqlSPAL0jQ5daUHvvrw,272
43
44
  evalscope/api/tool/tool_call.py,sha256=WqMnw69L_yhQWycENZ7azPRhxRidhmrMcYAy7UTIqvg,2836
44
- evalscope/api/tool/tool_info.py,sha256=aqquWQRWWx7fPItIwiubiz2VRe2TLl_Jmn1ArIlngbw,5716
45
+ evalscope/api/tool/tool_info.py,sha256=FQOBqxKZ6Qb4f40iRH1mLg64cEhu1_-9Rn-f5iUrD2w,5733
45
46
  evalscope/api/tool/utils.py,sha256=IWFzM6WspzBmNPicXn6b7KS6Y-1I-ErsK9fua4cb53Y,2324
46
47
  evalscope/app/__init__.py,sha256=HWLXld_JXcBDsdL4L_4E8JsKyuBwwPUSwlejKnZ3HKc,579
47
48
  evalscope/app/app.py,sha256=EaBWorA87ZmyIHovIE3styHWEVFsu_F70pTmP4-5zTQ,836
@@ -107,18 +108,22 @@ evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KB
107
108
  evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
108
109
  evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=jlwM13Ty-Ax6AeMsNlo9xIBupNFgnceYuXtCmh0hNTQ,6160
109
110
  evalscope/benchmarks/__init__.py,sha256=WHR4ej9Tqa2N9CyIaUWXS8EnHZtcujaNeg9hf8GT31Y,1182
111
+ evalscope/benchmarks/ai2d/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
112
+ evalscope/benchmarks/ai2d/ai2d_adapter.py,sha256=3GBNV4cNv9bBLJRdG_uA9qNhuN6qAEutHl8d-rsFpFU,2018
110
113
  evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
111
114
  evalscope/benchmarks/aime/aime24_adapter.py,sha256=HTlriHoHzlm1Rf3KAiGRLs8sx6Gyf6s7RGtOjk_hGS4,1767
112
115
  evalscope/benchmarks/aime/aime25_adapter.py,sha256=ZOE_6Zhg1MatWJSu2Zq372nKUODYtNFZimS1MJRFz5A,1591
113
116
  evalscope/benchmarks/alpaca_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
114
117
  evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=sjaWcK8WH1XY0kzm5eHsq_7J62EJocAf4gRV_UB8ZBE,4971
118
+ evalscope/benchmarks/amc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
119
+ evalscope/benchmarks/amc/amc_adapter.py,sha256=NzLPOmj3fJhPw6gVrB8KtxEbqwUqQ923vXHnLWEfdiU,1418
115
120
  evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
116
121
  evalscope/benchmarks/arc/arc_adapter.py,sha256=GASZmoJ-PpzBG70cBdABZA5uVqoyosjV-jf9WShK7L8,1622
117
122
  evalscope/benchmarks/arena_hard/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
118
123
  evalscope/benchmarks/arena_hard/arena_hard_adapter.py,sha256=Ddn_hVO1PvNQ_kNknXfdJCz1AVnXZEdGWq4gX1_Qqow,7275
119
124
  evalscope/benchmarks/arena_hard/utils.py,sha256=23xCd7_ksrM4xMJBp7N2ZwpUpq1zpoQFjLm1oBcdgQY,5559
120
125
  evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
121
- evalscope/benchmarks/bbh/bbh_adapter.py,sha256=GcvgwBhIw7OG-ljWQ_urVOoWlrFjrBy1LAZ-Atm02Dw,5570
126
+ evalscope/benchmarks/bbh/bbh_adapter.py,sha256=lRI-DfdFkyg4ylW4d-6CUfiNqlF7K_IoTjzJz3jYTUs,6346
122
127
  evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
123
128
  evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt,sha256=sfo-2iOeVzB0OGgd7NSQFELTGDTsr2DQ3u-g0ivI-sM,3653
124
129
  evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt,sha256=UJBsc3Mwz8TZngdWH_NFlhhNbLhNHK6FvW9FHcS8H5g,1167
@@ -147,8 +152,8 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt
147
152
  evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
148
153
  evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
149
154
  evalscope/benchmarks/bfcl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
150
- evalscope/benchmarks/bfcl/bfcl_adapter.py,sha256=N_AVgdfI4DXph0n3U1bChP9AQLx3_-ogAInFE-4EGig,10972
151
- evalscope/benchmarks/bfcl/generation.py,sha256=gOYzwTNEi2G0zykKdsx42Pc0Ql8iPD6RoX3MRbUhMJo,8698
155
+ evalscope/benchmarks/bfcl/bfcl_adapter.py,sha256=WzpL7XWDdx-EvbLluIOiMlADTO42CYs0IwQFvIfhTI0,18402
156
+ evalscope/benchmarks/bfcl/generation.py,sha256=c6lNjo-VTSUrVg-pqyPSucrbCKBOdBSyN0aR5AAtE4A,8701
152
157
  evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
153
158
  evalscope/benchmarks/ceval/ceval_adapter.py,sha256=4FLPgY-UtqINafnNxfOsE9AwS6GFXFCUGOBI-4EZUGk,8503
154
159
  evalscope/benchmarks/chinese_simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -163,7 +168,7 @@ evalscope/benchmarks/docmath/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
163
168
  evalscope/benchmarks/docmath/docmath_adapter.py,sha256=-mel6hA-x_e7fV0uOHdX5BpoQEVyQ5VqwIwEqSNDpnc,4623
164
169
  evalscope/benchmarks/docmath/utils.py,sha256=d6Yjoa5q91kjr1SdVPVBndzDaUzMlO_GfEqMtUXXr0s,7707
165
170
  evalscope/benchmarks/drop/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
166
- evalscope/benchmarks/drop/drop_adapter.py,sha256=PyvZ1WOdHQ0u0_JpuP97_yQsCUbzGcYsJf3bWKbakzg,9968
171
+ evalscope/benchmarks/drop/drop_adapter.py,sha256=Jbbr5O_Y5LI_vT_RskRQVKxGkiIraX_uXP7fYaZ5eZs,9995
167
172
  evalscope/benchmarks/drop/utils.py,sha256=Z9PHrNnRfGqFHCLONg5SWKARp1eTJlHFc_bU46t_YrM,1344
168
173
  evalscope/benchmarks/frames/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
169
174
  evalscope/benchmarks/frames/frames_adapter.py,sha256=w1kRya7w5omt95HHE6AzbzYVhyTT5r521676d_xJ6Vg,5514
@@ -180,12 +185,16 @@ evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=zWK2hhyKw5n8K30YvMjSm6XMwyrireO
180
185
  evalscope/benchmarks/gpqa/prompt.py,sha256=b1Gw2D5dEdhvLYymPfcvGKJdHrIzpiZkOwURKSxiQJg,5576
181
186
  evalscope/benchmarks/gsm8k/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
182
187
  evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=W4vTXsC7iHN1AgvpaCf1Rj7y2O8QczIluucnpSC5aYo,2636
188
+ evalscope/benchmarks/healthbench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
189
+ evalscope/benchmarks/healthbench/healthbench_adapter.py,sha256=1sL7i9yhORH4xiFWB9puPKWNZZFJGZFAlKdlzHp-fiw,13228
190
+ evalscope/benchmarks/healthbench/utils.py,sha256=M8SnOEhlqXWm03CFE6CAtbMiu6MqdGgVczAv-LPjA7Y,3683
183
191
  evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
184
192
  evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=tAe63NfV5ljUm1f4RTSFxWOVKBUhk3Cc0EGzF5uYLK4,2041
185
193
  evalscope/benchmarks/hle/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
186
194
  evalscope/benchmarks/hle/hle_adapter.py,sha256=4YVmETL9mEiLxF4vWRjePLyFaxelax6nOaqoAH5ZxmU,6389
187
195
  evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
188
- evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=5x2pnkbI9ZPPOyrRBsJ5ZcOCGJr8OR7qXLgVlY6eJxs,5825
196
+ evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=uLs3UHSALS3YHt0qzBismrIqdEUgbEalQbjC0CU7ym4,4085
197
+ evalscope/benchmarks/humaneval/utils.py,sha256=rPnc_JuSjNg9aV7UMUwsLrDlm-ufj64GNIBCWBeuRcM,6517
189
198
  evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
190
199
  evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=55FQwJ0_eDijppkVVlM5XCXzgRFmjH1SvGMItGsvn6o,2769
191
200
  evalscope/benchmarks/ifeval/instructions.py,sha256=HXnn1JgU3dpYltqIovFAn02DxkYOGw337kLMlOfJxJE,56048
@@ -200,19 +209,26 @@ evalscope/benchmarks/image_edit/gedit/vie_prompts.py,sha256=qVXWQyVUwZxEasDjVmYB
200
209
  evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
201
210
  evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=mNHA_Fuj_gAdOEoR7oChnGmErf1czqwnk8Zk-jRhBys,1304
202
211
  evalscope/benchmarks/live_code_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
203
- evalscope/benchmarks/live_code_bench/evaluate_utils.py,sha256=wgx8RDbkXi2Mlt-aK_6o4VcoPb7I3eL8z8h8JW4SnEo,6510
212
+ evalscope/benchmarks/live_code_bench/evaluate_utils.py,sha256=maN8qHmDHJpexPeB0qwZoXJ5zrqPbJDYVRptqvXI9d4,6827
204
213
  evalscope/benchmarks/live_code_bench/extract_utils.py,sha256=ZcQ8y741uawPo6I_1_XglR3eqJFDNrqc8fILKZupVRs,2375
205
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=tl7nGLDUgmNtyR4faE0aoW11OgLhsx7ZdKmONGDlQnQ,5203
214
+ evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=qnprJTv7zWA9aq6Lw4aDoall---kiivR0oDo3uSO2mI,6399
206
215
  evalscope/benchmarks/live_code_bench/load_utils.py,sha256=fEzWz_fUGwi5Ncum5PNVF9jFcuDwGgs7Vt_10YKBE2Q,2087
207
216
  evalscope/benchmarks/live_code_bench/pass_k_utils.py,sha256=Ktrp_lXdfFzoHtQNQNdGfIl26ySjaPCHm4Zv-dFvRqM,2024
208
217
  evalscope/benchmarks/live_code_bench/prompts.py,sha256=P4KILIAIDT1MKDck0xHYV_6v9820wDZRhxVMazmlL-g,12600
218
+ evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py,sha256=7DDx46EwtoR776vWjofJl1zaYCLdmeq8cF3fhDGdZgA,7424
209
219
  evalscope/benchmarks/live_code_bench/testing_util.py,sha256=TuoOTciC-hz3FTeDzsQB_THH3Be9UOP2XMrax-4sXkM,17282
210
220
  evalscope/benchmarks/maritime_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
211
221
  evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py,sha256=Rx7iZ5JaEo73YwIzhm78gMDQ6gqcErbnWWXHxXM6BcU,2379
212
222
  evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
213
- evalscope/benchmarks/math_500/math_500_adapter.py,sha256=uuxjmqftY_r-hJBCjfBgYUELrBaB86MG8dIu2wTikgI,1848
223
+ evalscope/benchmarks/math_500/math_500_adapter.py,sha256=hn7SQhoIHKuH-2A_nGUhQPRw2gl2G-kZldc9ueY0G3A,1802
214
224
  evalscope/benchmarks/math_vista/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
215
225
  evalscope/benchmarks/math_vista/math_vista_adapter.py,sha256=Mu9BpH0rDNM0yMrGws4SEOnXy2NTSIKwyLs5t4nAP-s,5842
226
+ evalscope/benchmarks/minerva_math/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
227
+ evalscope/benchmarks/minerva_math/minerva_math_adapter.py,sha256=jyT9_D4w8PTtLBN3Kn10_CnssH_mPuRNnn9rek_zUEs,1655
228
+ evalscope/benchmarks/mm_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
229
+ evalscope/benchmarks/mm_bench/mm_bench_adapter.py,sha256=py0DakGQX1JE2rqYjYN9w_-H0DtQ-YqG5k2s_UzbxxU,4372
230
+ evalscope/benchmarks/mm_star/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
231
+ evalscope/benchmarks/mm_star/mm_star_adapter.py,sha256=oamLv6U2-JAK5mdVLkUgYxkOahxQkQYMRKAyu_xPAUE,2818
216
232
  evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
217
233
  evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=2NT3QbfPzajUTFZ0tBCl6PRrtFtAr5jPZNQRW2Idlno,5947
218
234
  evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -222,16 +238,27 @@ evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py,sha256=m_37OIFrJB4ZIvtbDJ_
222
238
  evalscope/benchmarks/mmmu/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
223
239
  evalscope/benchmarks/mmmu/mmmu_adapter.py,sha256=C7UM6HvomcA_Srf7771S0CaUvifBX63i161XaacraGQ,6038
224
240
  evalscope/benchmarks/mmmu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
225
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py,sha256=a7rZV4WMPxeBdfwanmUjsB8yG1rwNXCsWCoqzOq-dd4,4901
241
+ evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py,sha256=banPS1nDt9bQ95urKbSZnR-hBTw23eL9MSrHt_0ZLp0,4725
242
+ evalscope/benchmarks/multi_if/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
243
+ evalscope/benchmarks/multi_if/ifeval.py,sha256=7y2rnJ4q1_DVA7I9mUnF7TBpu7Kez0X_Xhl-AJInzWk,87949
244
+ evalscope/benchmarks/multi_if/metrics.py,sha256=LWnhQw25cRNMReJ_xJ7Fx7WYHcT9i2FG1FUjYOuQDrI,4291
245
+ evalscope/benchmarks/multi_if/multi_if_adapter.py,sha256=I3_YPPUuRbrs9Gt3Qjhx9RM5Vu2gDFnheDcGu-oe840,5924
226
246
  evalscope/benchmarks/musr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
227
247
  evalscope/benchmarks/musr/musr_adapter.py,sha256=kx6bckj7Nijl4Wysuj-mKYdy0hIRDJho8yVTup403Hc,1473
228
248
  evalscope/benchmarks/needle_haystack/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
229
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py,sha256=mO8zhdCpoWPtlBk9GSzgcP25vEoQLYGwUM1QfcQ4iSE,17151
249
+ evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py,sha256=GYaswWPwYI3aV5HSpcuBTgW9-HDtf2xzNZg0WrsI0Yo,17033
230
250
  evalscope/benchmarks/needle_haystack/utils.py,sha256=k8WDigqt5LgzHw6DtaYsLtb3BJL0FTZS9JOyJCpoPq8,2935
251
+ evalscope/benchmarks/olympiad_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
252
+ evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py,sha256=zePVmGjmyuwCWVb4h1PIQKAIFqBehwRwO2WOD0KX_ik,6565
253
+ evalscope/benchmarks/olympiad_bench/utils.py,sha256=w7vEZcT3vCVq8_DSMgAjZPpVFVHStJPJYsPkrs-yOFM,21412
254
+ evalscope/benchmarks/omni_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
255
+ evalscope/benchmarks/omni_bench/omni_bench_adapter.py,sha256=IJkRSokQC6MF_pN46Yofr_NaZaNt1XZFX1PUBmX4-qA,3651
231
256
  evalscope/benchmarks/process_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
232
257
  evalscope/benchmarks/process_bench/process_bench_adapter.py,sha256=XN3F6NH7mF4ibwGX5nI01sqEHz05UQFnBAyfAe14QYE,6174
233
258
  evalscope/benchmarks/race/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
234
259
  evalscope/benchmarks/race/race_adapter.py,sha256=KibT9gHpIOZhTcWihG0dUDAX4gAHa2g1WdGPOcEP9OY,1705
260
+ evalscope/benchmarks/real_world_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
261
+ evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py,sha256=J2u0J9d31uvkoz9nBI9tCMqG27hmYwdLQPPef9jx_pg,2788
235
262
  evalscope/benchmarks/simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
236
263
  evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=_duveAliSaPUqVSLQ2TtSv5sfwvFFy7t-MgIIokQ24s,9017
237
264
  evalscope/benchmarks/super_gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -240,7 +267,7 @@ evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=SPqpBebiHj_oyEqU94p
240
267
  evalscope/benchmarks/super_gpqa/utils.py,sha256=OK_oT-DnWNssITEwu_Zc3Ty5v21n0IaJQYftK2cpwmQ,3401
241
268
  evalscope/benchmarks/tau_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
242
269
  evalscope/benchmarks/tau_bench/generation.py,sha256=d7J5xrxEI-0BYxdSuxdDavcR7f1ipBdpQsKZzwyzGds,5190
243
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py,sha256=1Dj5r9zMuLJ59wHusEcHVTszBE8BVhAK8lNZzBBzKT8,6375
270
+ evalscope/benchmarks/tau_bench/tau_bench_adapter.py,sha256=47wA0ia6gezA3nqvUpd4Pb8f5alCrBKEt7GOxJFupow,6464
244
271
  evalscope/benchmarks/text2image/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
245
272
  evalscope/benchmarks/text2image/evalmuse_adapter.py,sha256=g-Wc1qTg-xWLTjiZPo8zmQud75ac-8mBpYRxOHfiO0g,3024
246
273
  evalscope/benchmarks/text2image/genai_bench_adapter.py,sha256=1GDB3gS9zwrfb9C83LQdQyN7bvvqeYuu5ulJ9Igmi2k,1876
@@ -268,14 +295,14 @@ evalscope/collections/__init__.py,sha256=x05hFLrjGsdtuHtc6PyQXHNuucVdYaBN9ZrM8gB
268
295
  evalscope/collections/sampler.py,sha256=086pzXQO4CO_QYCd10z149Sjh6sBpRBeIHf5OTLOVu8,4896
269
296
  evalscope/collections/schema.py,sha256=yzAlnH0O7iiWB4UnkFXI_Dvxcsq9hDgl0aGK2OpyBY8,4158
270
297
  evalscope/evaluator/__init__.py,sha256=KzYmVTfU-1pdX7va7l3B1-5QKWG07hj1B7rYkMmxitY,91
271
- evalscope/evaluator/evaluator.py,sha256=mkq85ieBRSc5X2FFxijomb2jD3YDKR6UelKFVP6WT8Y,13592
298
+ evalscope/evaluator/evaluator.py,sha256=o99m1CF7xuc3Qn2M25AhWulothZxICwZgZiWxSbynTc,14435
272
299
  evalscope/filters/__init__.py,sha256=AsXwKYDjGhFsJvtj036PRjMOPsHGt-CRicnHTtM_qA4,51
273
300
  evalscope/filters/extraction.py,sha256=KLFr_3XYsrv0PTvmXy0ugj2sqv2ZOWJFV7G_MmGjTHk,4146
274
301
  evalscope/filters/selection.py,sha256=yiJu2JjXDH_lgfEtB9umkGcA3zpo3zvnyoq2mKrXbnw,1609
275
302
  evalscope/metrics/__init__.py,sha256=1giVHESSjn98uBiAvYm5uLsmRQwmf9NHPSt7OT_QJss,1615
276
303
  evalscope/metrics/llm_judge.py,sha256=XukhH9PQtIZAcbjJlOmOD9ye3ngRv_IGKKJE9jhheOE,8653
277
304
  evalscope/metrics/math_parser.py,sha256=BMfautQtNNiF9f2DIEfO6SXSn_GYhzaddAjGWG10MJA,17257
278
- evalscope/metrics/metric.py,sha256=6la8Nq2E_brArDcNwkbRX3ECef0AAE3IrBCfUVE7UKc,10176
305
+ evalscope/metrics/metric.py,sha256=CabKKEbw_DptyH1ZQju7WzjB47fWUKdOhFB1ROpUC-4,10871
279
306
  evalscope/metrics/metrics.py,sha256=VxAggzEfaLKxWcXyuve8QbEBwV2W71udVyt0gynzGec,14134
280
307
  evalscope/metrics/rouge_metric.py,sha256=bqvSotuDdC0MEKmt8v6y6tBTBx0S3Ma-tfF-cMCckA4,4645
281
308
  evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
@@ -384,47 +411,47 @@ evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugmen
384
411
  evalscope/models/__init__.py,sha256=RmW2S31BHBhMN49_VVF_5PJAk-TsuZQkuF2ALShbhAw,556
385
412
  evalscope/models/image_edit_model.py,sha256=oVjGgebnFu3ZXBJLNn62rJ65fcJR7DlG4qEVxisPJ2Y,4104
386
413
  evalscope/models/mockllm.py,sha256=t1fFAHkEb1n_atOCfnGteCX3DWp774lnWcHzi5lBjwM,2511
387
- evalscope/models/model_apis.py,sha256=qzoksjHJHE8CLoNT0UlnFVkmeS7ufguiAtaxZSC5Djc,1957
414
+ evalscope/models/model_apis.py,sha256=ZkZ_nfbeAFJnCndRvRIRLcbmJFTMhGRBi-WfMu0uZKE,1922
388
415
  evalscope/models/modelscope.py,sha256=jSFkho_Ir2py54y_Bwj9jpCoY2mMKkZ8ORzne-ldAIE,15806
389
416
  evalscope/models/openai_compatible.py,sha256=2uK78nDhWwgph7hcIiMc3NHRbIwvswRDM9o9ENahj4k,4659
390
417
  evalscope/models/text2image_model.py,sha256=Sdiyw6vewjVTiXK8RFEh1pohOhDge80EoIWYpnLjr5Y,3929
391
- evalscope/models/utils/openai.py,sha256=xnnpPKWAsqqEscOQr0WJjr7gHUa9POs55Bs1Zv6MXNQ,28182
418
+ evalscope/models/utils/openai.py,sha256=0DzuvTQYFEqcTp6sVtB2VZY7xeyWcOS0I6votqWegUg,28130
392
419
  evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
393
- evalscope/perf/arguments.py,sha256=lG2IOOzxg29pdnF6IobzPcqEcYqopulFpVU2QzRaEJA,11429
420
+ evalscope/perf/arguments.py,sha256=FmwVE4gC09B8nLd0sdczeEA9b5ztv4kwhOvLuby4wI8,11695
394
421
  evalscope/perf/benchmark.py,sha256=nSJr8lQvHDYiG33tNhkYaVOYONjhJ2wUb1x5RlUiXRY,7968
395
422
  evalscope/perf/http_client.py,sha256=4Ov1Cwi7gMgO05ZmazwyfYjUGAQNGWn7nbfl1ljRNh4,4610
396
423
  evalscope/perf/main.py,sha256=WZbBgFhIj9KqxzC7_NZxDlou019_EXatsHRt5vqDhFg,3439
397
424
  evalscope/perf/plugin/__init__.py,sha256=Ztj4h1_JYJqbbWkeuDTj5aTRyGQf5Woc4xEIyjcokVU,94
398
425
  evalscope/perf/plugin/registry.py,sha256=GhLe-h1rGzya2bgIUaV5VymQIaHqI7h5SG_i4PoGAm8,1967
399
426
  evalscope/perf/plugin/api/__init__.py,sha256=7RsGdYTSfnW6iVpveEzNu8v4x8Yc8H-Kk39DqOHMrd4,152
400
- evalscope/perf/plugin/api/base.py,sha256=9cX4xwTzy5ycnWqmQqRGMLasTEX6jVlobtADkh1KwXE,2782
427
+ evalscope/perf/plugin/api/base.py,sha256=RRZVk9MFuBwb9PFPTklFhQ_RTihg_E7W_LR26-ldPIA,2782
401
428
  evalscope/perf/plugin/api/custom_api.py,sha256=VYJO2lUt9EKdWz6zeYCfvdI0MqfcsIgcKvxqvY5C-3k,10376
402
429
  evalscope/perf/plugin/api/dashscope_api.py,sha256=Miv2pzMa6sxZyYYJhCzcbOI_QHuZx7tazKpb6Not7ck,3627
403
- evalscope/perf/plugin/api/default_api.py,sha256=kjuHQ-zRHe5WU4ofSzWBpWbIxBQBOh_ucu1z2g62gWg,4315
404
- evalscope/perf/plugin/api/openai_api.py,sha256=oewwOPhv0BLdC7n3BUngpVrDYst5wMrBEPhN8oGMKNU,7703
430
+ evalscope/perf/plugin/api/default_api.py,sha256=qvMIjbe_rM13cDHcFCwjtCsjc11qE80Yg7LypaSNTXc,4251
431
+ evalscope/perf/plugin/api/openai_api.py,sha256=a6w4C_voza61trHskHaWNPFr2x2zhRVwIXdiNnMH81E,10570
405
432
  evalscope/perf/plugin/datasets/__init__.py,sha256=qzeQ9BrJhiJJm1wHaFeOQkvXXdSd15Ucspbn5zjs-6Q,495
406
433
  evalscope/perf/plugin/datasets/base.py,sha256=-3Ihnp2hYvZyPnP8Gh2Pu8ovlLNFHyZnNgRu3WHG4d0,2714
407
434
  evalscope/perf/plugin/datasets/custom.py,sha256=yoRHTvTGAglaZ-mmRkPjYNMG7uZYuT1_KrBxnl2i0qg,1385
408
- evalscope/perf/plugin/datasets/flickr8k.py,sha256=M-w1UjOMkA6Uh9v-SURDrm1YCL-m1Cn1u1cIcEJFDpY,1044
409
- evalscope/perf/plugin/datasets/kontext_bench.py,sha256=-KsoXS7nAd6hzN4oCe85zcLkZQT-1IGWQFThuuvE7vo,1092
435
+ evalscope/perf/plugin/datasets/flickr8k.py,sha256=nhHiGNhXX-2c17NQ5q5Q7FgV2hB8XVeeAP8dKkboyHE,1033
436
+ evalscope/perf/plugin/datasets/kontext_bench.py,sha256=cN70hiBX1940IWvNWZG9YGE4vO1yj41Bo7bqmOWusoQ,1081
410
437
  evalscope/perf/plugin/datasets/line_by_line.py,sha256=F4ppdjKKLzFNf_16h6S-6nAU4lOfOFI2-tPgIeZDTMA,996
411
438
  evalscope/perf/plugin/datasets/longalpaca.py,sha256=JjPGYP8NdPmP48wff2fL5IZQfajXL5qhZBvKmZxtfW4,1336
412
439
  evalscope/perf/plugin/datasets/openqa.py,sha256=5PqqiIvNTLlRrPb8PWqMGQyWRb6LuIqipYn67-xd-dY,1519
413
440
  evalscope/perf/plugin/datasets/random_dataset.py,sha256=NNAXvgFPkLDOSpYNex1DyE4X-ELtQRm13_oBooO30j8,3514
414
- evalscope/perf/plugin/datasets/random_vl_dataset.py,sha256=F3yA9Ih3YO895lZKCo3i85LeKTzjvGcvhzc8UNN-gUI,3240
441
+ evalscope/perf/plugin/datasets/random_vl_dataset.py,sha256=e6exWQnupWkTDNwt2MmEK-hccuxEDmWLJRMM70onKi0,3230
415
442
  evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=J6q7AF_Re5eHLVejXEw9c1jlk1T1PPmist0yO9UFTPE,2432
416
443
  evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
417
444
  evalscope/perf/utils/analysis_result.py,sha256=aoT7JD2zAzBeuZUfncKhJ2odX_7KnymwOmNB1Upam2c,935
418
- evalscope/perf/utils/benchmark_util.py,sha256=V91JwpiR66tOz3N5RPp3Es29M9BghdCHj_Czb0FBekI,7274
445
+ evalscope/perf/utils/benchmark_util.py,sha256=A5d--rCElabDOl6Aaxqnu0fNR5c763YZwKIHBSeTK00,7294
419
446
  evalscope/perf/utils/db_util.py,sha256=HAISq6M7xCD2gjUEqqfbK3FjBxA-tvr_n-751tU9ypo,11634
420
447
  evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
421
448
  evalscope/perf/utils/local_server.py,sha256=_lSPlNEnOmPA_DtREgPS_vj2w_7D8PPSpypXbb0YfJM,4880
422
449
  evalscope/perf/utils/log_utils.py,sha256=NWSK_ITG4yoVx5GMLbIRGDoXSs90s7X3mftdm37Os2U,1666
423
450
  evalscope/perf/utils/rich_display.py,sha256=AQmXv1EuA1-IGgco-Jy1NLOmTKv4eBFH2K4QS8OoGVo,8206
424
451
  evalscope/report/__init__.py,sha256=DTigCg9fkU_zGNDqIaZy3CWYbrlvODvCxCTVqSx6ano,875
425
- evalscope/report/combinator.py,sha256=MAiOCj_q5mXm8-3lARvCSG12jUVEdJ8VcoEHJapoWzo,4134
452
+ evalscope/report/combinator.py,sha256=Xzlhs7kwfI6cgs7rngxhvsur0bCJkrM0tAy6isq2VME,3235
426
453
  evalscope/report/generator.py,sha256=t2R3WGa4SowTRUPOgITtyTR4QDiJ6i3FH__byDKZU8Y,4959
427
- evalscope/report/report.py,sha256=KxboijAVNENxYHjiwyyqW_aQZ0F2CyJ6MbqUJTRHJMs,8273
454
+ evalscope/report/report.py,sha256=lEBD_E_RJiydFTaGFNLIMTFxNrqv8QcLZb_iuUg5HB0,8479
428
455
  evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
429
456
  evalscope/third_party/longbench_write/README.md,sha256=1yLKeSVIcihpoc4KXr8NpK86JbcLssCPx76aOKdPbYI,5431
430
457
  evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u7z6OI79czoBlP8lX4PVo,126
@@ -464,12 +491,12 @@ evalscope/utils/__init__.py,sha256=5OH8cOoX3YKMKUu0dMRvwzckXligIbUV-1jjJNXlpGI,2
464
491
  evalscope/utils/argument_utils.py,sha256=D7qOH85wf7LKh_cJ2X51OEaL7CMaddydmHZkfoYpvLk,1952
465
492
  evalscope/utils/chat_service.py,sha256=sSki2pKGQP3UjcIf_lbO06afI-vsaUAqglwX__wUDEw,8766
466
493
  evalscope/utils/deprecation_utils.py,sha256=aDv3HFNcJFZ7rxNgALQP0-ITO8L23HC_RX-C_m2i34Y,1610
467
- evalscope/utils/function_utils.py,sha256=a752Z4Xb1rznnLJU9g5Pxqd3r_XzfLzAkdcjSX0kOVc,650
468
- evalscope/utils/import_utils.py,sha256=b6N2x5kB_TMCkSKBlBZ5kL-x-eo_B_DWRQKtsxYL-WM,3808
469
- evalscope/utils/io_utils.py,sha256=q26SU80VvLi1e--KDbMmIjuw3ex_WEWzkgLkmsK9n1g,11191
470
- evalscope/utils/json_schema.py,sha256=MLCS8cSLXF83UPebBaVWDfXJnf0qXsXnr-bIRG88cI4,7485
471
- evalscope/utils/logger.py,sha256=SPhhXo9gyZtWDYDLumII2CEmwHsaW8Bu1IjK5UqWrKQ,5273
472
- evalscope/utils/model_utils.py,sha256=rzEnlwWgupkH1vmmv-tL9-udpwHuiQlZhbX9fXPEcZg,2434
494
+ evalscope/utils/function_utils.py,sha256=E-AIzx_PKrZDGl1cBvlvqNvMa8yM2WUJ2wh73PNBXrQ,1887
495
+ evalscope/utils/import_utils.py,sha256=S0WQ3gt4zpwJHjGcyC-604pWWExg3JV7f3wzoOH-tuo,5794
496
+ evalscope/utils/io_utils.py,sha256=79F0p7dFxA84tIVSL_C4piJgeQQtVUfb2R_Xcd8v_cE,11615
497
+ evalscope/utils/json_schema.py,sha256=ZExvQA-SI6SxWBx_hCmuQ2RRqwGKuywy4sTotvd2hH0,8288
498
+ evalscope/utils/logger.py,sha256=roFk4Su4aJwsF0s-uYc5-tABnghwYPX3gpkA5QUGzK8,5675
499
+ evalscope/utils/model_utils.py,sha256=mdtYoHhUdfpxUtnS52XZjNdO3uSK4yeIBHT3aDU7s-A,2455
473
500
  evalscope/utils/multi_choices.py,sha256=OxBER7amWpoRY0Z-o39rDmCNK6wpr1HQm9mMHpWLgp0,9524
474
501
  evalscope/utils/url_utils.py,sha256=9HcFt9uZNbOJR3ADUFQ_dBFKziHV6H66Df7HYs1M4Po,1757
475
502
  tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
@@ -477,17 +504,18 @@ tests/common.py,sha256=BB136KcGaEfdWqMwApa48K0CTSGmOCUZ0FYDqpfYnAA,2423
477
504
  tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
478
505
  tests/utils.py,sha256=Fgm0CU6ilZjCGOfOMJH-Trxy0UIAGbhvy0Ijy_zDGUk,323
479
506
  tests/benchmark/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
480
- tests/benchmark/test_eval.py,sha256=Grms3aMWQONexSsSvOSxkoURHLJ2Z0SqBjrcVWDoMRs,12455
507
+ tests/benchmark/test_eval.py,sha256=vSAvhiCKxHpjHdGhZn8l0qzPSiG1ZZafz_M06B_a8_Y,13827
481
508
  tests/benchmark/test_image_edit.py,sha256=z3z7psMRFynpVgUAFoH--ieeGXzb9cHkrq3tT_sCZo8,2165
509
+ tests/benchmark/test_sandbox.py,sha256=bHyX8ammdn7EsEbN80cIzDNhQZlJD3Ssoj9l4efF7rI,2968
482
510
  tests/benchmark/test_t2i.py,sha256=fciaGsOrkOpT4WQlsnmjrqw6qolCzI0DGyWQAJkM-Es,4513
483
- tests/benchmark/test_vlm.py,sha256=k2DC0zWO2TtVSf-MP-n-wGwfk9MWKKd6hZzkC4nlUO0,2541
511
+ tests/benchmark/test_vlm.py,sha256=gn0ledf_yPY1IhCyCtiqT_dTVPUVZ3NVPr9yzsC_UZQ,4501
484
512
  tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
485
513
  tests/cli/test_all.py,sha256=1omOXC1lBphBLm0hTf5HNstlF_bwi16dYyr00gvaCTM,7301
486
514
  tests/cli/test_collection.py,sha256=lGz3YUS_0gM6_HjQLe26OfBAkHOPOEDWMO-UyP58GN8,4455
487
515
  tests/cli/test_custom.py,sha256=9z_N7Re712xI62TqVSTBdzB_iFFEUb55wcWIcGvJb84,9254
488
516
  tests/cli/test_reasoning.py,sha256=rU181LLoKbFCpNPFCIZULxEgsJ2PYswel2pP2EsjEmo,2696
489
517
  tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
490
- tests/perf/test_perf.py,sha256=yqm3abB5ZdNPKaJkvzMvfcz-Cz_o2RxUZ3ZnqgRb-tQ,5937
518
+ tests/perf/test_perf.py,sha256=ugYNEyU32ctryPFa_6fr8aQYxfHJMymdKnKKEHM9Ajc,6174
491
519
  tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
492
520
  tests/rag/test_clip_benchmark.py,sha256=qpSLgmHMGcYTnxP7AI__y-ii5_tu_fCSht6p3TBetkA,2650
493
521
  tests/rag/test_mteb.py,sha256=fdNQIyUEzE7puPCKw5QhCHTEu7hz-ieHeq1xCWGh6IM,7246
@@ -498,9 +526,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=RwrKkc1WHEZxetM11cGL81G4faKCn7SYn4
498
526
  tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=UAUtOCQ72xbm8s-sov3cBEpYVDy189wpB-qOL3KoU7M,6053
499
527
  tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
500
528
  tests/vlm/test_vlmeval.py,sha256=EDQRkYfSyOICUwo_tm3p-puaE_xdFmqOPkrt5etxsqM,3307
501
- evalscope-1.0.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
502
- evalscope-1.0.1.dist-info/METADATA,sha256=2XzuX9tVYzONuLHVq2WsQ_uaWImGVwiY2IPAJhpNEOA,40287
503
- evalscope-1.0.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
504
- evalscope-1.0.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
505
- evalscope-1.0.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
506
- evalscope-1.0.1.dist-info/RECORD,,
529
+ evalscope-1.0.2.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
530
+ evalscope-1.0.2.dist-info/METADATA,sha256=vZciS7qNosSJOdwyRSxsCyVqvw8hyqKS84yKjlbxwzw,40305
531
+ evalscope-1.0.2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
532
+ evalscope-1.0.2.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
533
+ evalscope-1.0.2.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
534
+ evalscope-1.0.2.dist-info/RECORD,,
@@ -33,12 +33,13 @@ class TestNativeBenchmark(TestBenchmark):
33
33
  'judge_strategy': JudgeStrategy.AUTO,
34
34
  'judge_worker_num': 5,
35
35
  'judge_model_args': {
36
- 'model_id': 'qwen2.5-72b-instruct',
36
+ 'model_id': 'qwen3-235b-a22b',
37
37
  'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
38
38
  'api_key': env.get('DASHSCOPE_API_KEY'),
39
39
  'generation_config': {
40
40
  'temperature': 0.0,
41
41
  'max_tokens': 4096,
42
+ 'extra_body': {'enable_thinking': False}
42
43
  }
43
44
  },
44
45
  'debug': True,
@@ -65,6 +66,14 @@ class TestNativeBenchmark(TestBenchmark):
65
66
  }
66
67
  self._run_dataset_test('mmlu', use_mock=True, dataset_args=dataset_args)
67
68
 
69
+ def test_mmlu_reasoning(self):
70
+ """Test MMLU reasoning dataset."""
71
+ dataset_args = {
72
+ 'few_shot_num': 0,
73
+ 'subset_list': ['abstract_algebra', 'computer_security']
74
+ }
75
+ self._run_dataset_test('mmlu', dataset_args=dataset_args, model='qwen3-0.6b', stream=True)
76
+
68
77
  def test_mmlu_pro(self):
69
78
  """Test MMLU-Pro reasoning dataset."""
70
79
  dataset_args = {
@@ -203,6 +212,7 @@ class TestNativeBenchmark(TestBenchmark):
203
212
  def test_bbh(self):
204
213
  dataset_args = {
205
214
  'subset_list': ['temporal_sequences', 'navigate'],
215
+ 'few_shot_num': 0,
206
216
  }
207
217
  self._run_dataset_test('bbh', dataset_args=dataset_args)
208
218
 
@@ -317,20 +327,21 @@ class TestNativeBenchmark(TestBenchmark):
317
327
  def test_humaneval(self):
318
328
  """Test HumanEval dataset."""
319
329
  dataset_args = {
320
- 'metric_list': ['Pass@1', 'Pass@2', 'Pass@5']
330
+ 'metric_list': ['Pass@1']
321
331
  }
322
- self._run_dataset_test('humaneval', dataset_args, repeats=5)
332
+ self._run_dataset_test('humaneval', dataset_args)
323
333
 
324
334
  def test_live_code_bench(self):
325
335
  """Test LiveCodeBench dataset."""
326
336
  dataset_args = {
327
- 'subset_list': ['v6'],
337
+ 'subset_list': ['v5'],
338
+ 'review_timeout': 6,
328
339
  'extra_params': {
329
340
  'start_date': '2024-08-01',
330
341
  'end_date': '2025-02-28'
331
342
  },
332
343
  }
333
- self._run_dataset_test('live_code_bench', dataset_args, judge_worker_num=1)
344
+ self._run_dataset_test('live_code_bench', dataset_args, limit=20, use_cache='outputs/20250918_200232', rerun_review=True)
334
345
 
335
346
  def test_tool_bench(self):
336
347
  """Test ToolBench dataset."""
@@ -339,13 +350,18 @@ class TestNativeBenchmark(TestBenchmark):
339
350
  def test_bfcl(self):
340
351
  """Test BFCL dataset."""
341
352
  dataset_args = {
342
- 'subset_list': ['simple', 'live_multiple', 'multi_turn_base'],
353
+ 'subset_list': [
354
+ # 'simple',
355
+ # 'live_multiple',
356
+ # 'multi_turn_base',
357
+ 'multi_turn_miss_func'
358
+ ],
343
359
  'extra_params': {
344
360
  'is_fc_model': True,
345
361
  'underscore_to_dot': True
346
362
  }
347
363
  }
348
- self._run_dataset_test('bfcl_v3', dataset_args, model='qwq-plus', stream=True)
364
+ self._run_dataset_test('bfcl_v3', dataset_args, model='qwen-plus', limit=30, eval_batch_size=5)
349
365
 
350
366
  def test_tau_bench(self):
351
367
  dataset_args = {
@@ -378,6 +394,34 @@ class TestNativeBenchmark(TestBenchmark):
378
394
  }
379
395
  self._run_dataset_test('data_collection', dataset_args)
380
396
 
397
+ def test_multi_if(self):
398
+ dataset_args = {
399
+ 'subset_list': ['English', 'Chinese'],
400
+ 'few_shot_num': 0,
401
+ }
402
+ self._run_dataset_test('multi_if', dataset_args, limit=5)
403
+
404
+ def test_healthbench(self):
405
+ dataset_args = {
406
+ 'subset_list': ['health_data_tasks'],
407
+ 'extra_params': {
408
+ 'version': 'Hard'
409
+ }
410
+ }
411
+ self._run_dataset_test('health_bench', dataset_args, limit=5)
412
+
413
+
414
+ def test_amc(self):
415
+ dataset_args = {
416
+ 'subset_list': ['amc22'],
417
+ }
418
+ self._run_dataset_test('amc', dataset_args)
419
+
420
+ def test_minerva_math(self):
421
+ dataset_args = {
422
+ 'subset_list': ['default'],
423
+ }
424
+ self._run_dataset_test('minerva_math', dataset_args)
381
425
 
382
426
  if __name__ == '__main__':
383
427
  # Run specific test: python -m unittest test_eval.TestBenchmark.test_gsm8k
@@ -0,0 +1,81 @@
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ from dotenv import dotenv_values
3
+
4
+ env = dotenv_values('.env')
5
+
6
+ import unittest
7
+
8
+ from evalscope.constants import EvalType, JudgeStrategy, OutputType
9
+ from evalscope.utils.logger import get_logger
10
+ from tests.common import TestBenchmark
11
+
12
+ logger = get_logger()
13
+
14
+
15
+ class TestCodeBenchmark(TestBenchmark):
16
+ """Benchmark evaluation test cases."""
17
+
18
+ def setUp(self):
19
+ """Setup common test configuration."""
20
+ self.base_config = {
21
+ 'model': 'qwen-plus',
22
+ 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
23
+ 'api_key': env.get('DASHSCOPE_API_KEY'),
24
+ 'eval_type': EvalType.SERVICE,
25
+ 'eval_batch_size': 5,
26
+ 'limit': 5,
27
+ 'generation_config': {
28
+ 'max_tokens': 4096,
29
+ 'temperature': 0.0,
30
+ 'seed': 42,
31
+ 'parallel_tool_calls': True
32
+ },
33
+ 'judge_strategy': JudgeStrategy.AUTO,
34
+ 'judge_worker_num': 5,
35
+ 'judge_model_args': {
36
+ 'model_id': 'qwen2.5-72b-instruct',
37
+ 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
38
+ 'api_key': env.get('DASHSCOPE_API_KEY'),
39
+ 'generation_config': {
40
+ 'temperature': 0.0,
41
+ 'max_tokens': 4096,
42
+ }
43
+ },
44
+ 'use_sandbox': True,
45
+ 'sandbox_type': 'docker',
46
+ 'debug': True,
47
+ }
48
+
49
+ def test_humaneval(self):
50
+ """Test Humaneval dataset."""
51
+ self._run_dataset_test('humaneval', limit=5)
52
+
53
+ def test_humaneval_remote_sandbox(self):
54
+ """Test Humaneval dataset with remote sandbox manager."""
55
+ sandbox_manager_config = {'base_url': 'http://localhost:8000'}
56
+ self._run_dataset_test('humaneval', limit=5, sandbox_manager_config=sandbox_manager_config)
57
+
58
+ def test_live_code_bench(self):
59
+ """Test Live Code Bench dataset."""
60
+ dataset_args = {
61
+ 'subset_list': ['v5'],
62
+ 'review_timeout': 6,
63
+ 'extra_params': {
64
+ 'start_date': '2024-08-01',
65
+ 'end_date': '2025-02-28'
66
+ },
67
+ }
68
+ self._run_dataset_test('live_code_bench', limit=5, dataset_args=dataset_args, use_cache='outputs/20250918_200232', rerun_review=True)
69
+
70
+ def test_live_code_bench_remote_sandbox(self):
71
+ """Test Live Code Bench dataset."""
72
+ dataset_args = {
73
+ 'subset_list': ['v5'],
74
+ 'review_timeout': 6,
75
+ 'extra_params': {
76
+ 'start_date': '2024-08-01',
77
+ 'end_date': '2025-02-28'
78
+ },
79
+ }
80
+ sandbox_manager_config = {'base_url': 'http://localhost:8000'}
81
+ self._run_dataset_test('live_code_bench', limit=20, dataset_args=dataset_args, sandbox_manager_config=sandbox_manager_config, use_cache='outputs/20250918_200232_2', rerun_review=True)
@@ -25,7 +25,7 @@ class TestVLMBenchmark(TestBenchmark):
25
25
  'eval_batch_size': 5,
26
26
  'limit': 5,
27
27
  'generation_config': {
28
- 'max_tokens': 4096,
28
+ 'max_tokens': 2048,
29
29
  'temperature': 0.0,
30
30
  'seed': 42,
31
31
  'parallel_tool_calls': True
@@ -72,9 +72,66 @@ class TestVLMBenchmark(TestBenchmark):
72
72
  }
73
73
  self._run_dataset_test('mmmu_pro', dataset_args=dataset_args, limit=10)
74
74
 
75
- def test_qwen3_collection(self):
75
+ def test_qwen3_vl_collection(self):
76
76
  dataset_args = {
77
77
  'dataset_id': 'outputs/qwen3_vl_test.jsonl',
78
78
  'shuffle': True,
79
79
  }
80
- self._run_dataset_test('data_collection', dataset_args)
80
+ self._run_dataset_test('data_collection', dataset_args, limit=100)
81
+
82
+ def test_real_world_qa(self):
83
+ dataset_args = {
84
+ 'subset_list': ['default']
85
+ }
86
+ self._run_dataset_test('real_world_qa', dataset_args=dataset_args, limit=10)
87
+
88
+ def test_ai2d(self):
89
+ dataset_args = {
90
+ 'subset_list': ['default']
91
+ }
92
+ self._run_dataset_test('ai2d', dataset_args=dataset_args)
93
+
94
+ def test_cc_bench(self):
95
+ dataset_args = {
96
+ 'subset_list': ['cc']
97
+ }
98
+ self._run_dataset_test('cc_bench', dataset_args=dataset_args)
99
+
100
+ def test_mm_bench(self):
101
+ dataset_args = {
102
+ 'subset_list': ['cn', 'en']
103
+ }
104
+ self._run_dataset_test('mm_bench', dataset_args=dataset_args)
105
+
106
+ def test_mm_star(self):
107
+ dataset_args = {
108
+ # 'subset_list': ['val']
109
+ }
110
+ self._run_dataset_test('mm_star', dataset_args=dataset_args)
111
+
112
+ def test_omni_bench(self):
113
+ dataset_args = {
114
+ 'extra_params': {
115
+ 'use_image': True, # Whether to use image input, if False, use text alternative image content.
116
+ 'use_audio': True, # Whether to use audio input, if False, use text alternative audio content.
117
+ }
118
+ }
119
+ self._run_dataset_test('omni_bench', dataset_args=dataset_args, model='qwen-omni-turbo')
120
+
121
+ def test_olympiad_bench(self):
122
+ dataset_args = {
123
+ 'subset_list': [
124
+ # 'OE_MM_maths_en_COMP',
125
+ # 'OE_MM_maths_zh_CEE',
126
+ # 'OE_MM_maths_zh_COMP',
127
+ # 'OE_MM_physics_en_COMP',
128
+ # 'OE_MM_physics_zh_CEE',
129
+ # 'OE_TO_maths_en_COMP',
130
+ # 'OE_TO_maths_zh_CEE',
131
+ # 'OE_TO_maths_zh_COMP',
132
+ # 'OE_TO_physics_en_COMP',
133
+ # 'OE_TO_physics_zh_CEE',
134
+ 'TP_TO_maths_zh_CEE',
135
+ ]
136
+ }
137
+ self._run_dataset_test('olympiad_bench', dataset_args=dataset_args)