evalscope 1.0.1__py3-none-any.whl → 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic. Click here for more details.
- evalscope/api/benchmark/adapters/default_data_adapter.py +6 -4
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/text2image_adapter.py +5 -4
- evalscope/api/benchmark/adapters/vision_language_adapter.py +3 -1
- evalscope/api/benchmark/benchmark.py +27 -2
- evalscope/api/benchmark/meta.py +3 -0
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +5 -0
- evalscope/api/messages/chat_message.py +6 -1
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +204 -0
- evalscope/api/model/generate_config.py +0 -3
- evalscope/api/model/model.py +1 -1
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/arguments.py +6 -0
- evalscope/benchmarks/ai2d/__init__.py +0 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +53 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +46 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/bfcl_adapter.py +141 -2
- evalscope/benchmarks/bfcl/generation.py +7 -7
- evalscope/benchmarks/drop/drop_adapter.py +1 -1
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +4 -9
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -4
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +6 -1
- evalscope/config.py +24 -1
- evalscope/constants.py +3 -0
- evalscope/evaluator/evaluator.py +25 -7
- evalscope/metrics/metric.py +27 -2
- evalscope/models/model_apis.py +10 -8
- evalscope/models/utils/openai.py +1 -2
- evalscope/perf/arguments.py +2 -0
- evalscope/perf/plugin/api/base.py +2 -2
- evalscope/perf/plugin/api/default_api.py +7 -7
- evalscope/perf/plugin/api/openai_api.py +83 -19
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/report/combinator.py +0 -25
- evalscope/report/report.py +8 -4
- evalscope/run.py +1 -1
- evalscope/utils/function_utils.py +41 -0
- evalscope/utils/import_utils.py +63 -13
- evalscope/utils/io_utils.py +19 -11
- evalscope/utils/json_schema.py +23 -2
- evalscope/utils/logger.py +19 -0
- evalscope/utils/model_utils.py +1 -1
- evalscope/version.py +2 -2
- {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/METADATA +6 -10
- {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/RECORD +87 -59
- tests/benchmark/test_eval.py +51 -7
- tests/benchmark/test_sandbox.py +81 -0
- tests/benchmark/test_vlm.py +60 -3
- tests/perf/test_perf.py +40 -12
- {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/LICENSE +0 -0
- {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/WHEEL +0 -0
- {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/top_level.txt +0 -0
|
@@ -1,47 +1,48 @@
|
|
|
1
1
|
evalscope/__init__.py,sha256=oivLvqwNw2JlB-h-Z8_525IpfKcYEkS51F59tEfpy5w,445
|
|
2
|
-
evalscope/arguments.py,sha256=
|
|
3
|
-
evalscope/config.py,sha256=
|
|
4
|
-
evalscope/constants.py,sha256=
|
|
5
|
-
evalscope/run.py,sha256=
|
|
2
|
+
evalscope/arguments.py,sha256=jKAF47PsqXRioU21gRHw9hxJnfR31z_X7c__glRY5ns,6257
|
|
3
|
+
evalscope/config.py,sha256=S2N11-AxQkT7lVffpjXdtpT4QpnSP6th-c8I-501mwM,11507
|
|
4
|
+
evalscope/constants.py,sha256=W3E4Jp-x6qxvPOYtU9bNlzlERFvSAA_3F007apIwUlU,3601
|
|
5
|
+
evalscope/run.py,sha256=A9_7pR3FiA-It46A3Mqk7ce6fQy548p0ux2QUugj2hI,6531
|
|
6
6
|
evalscope/summarizer.py,sha256=HUDJ1zKi22uNst3AUfX67Z0sHzeZy-4S8sYyvxJnBzc,5901
|
|
7
|
-
evalscope/version.py,sha256=
|
|
7
|
+
evalscope/version.py,sha256=H_zHGJkiB6equdW6Jo4F_hhdLYKZqriowav05O5_CeY,118
|
|
8
8
|
evalscope/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
9
|
evalscope/api/registry.py,sha256=Qk0KMGDbt-iI0-OfoJZbOtxt76qreAVWh36HOoQAKM4,5448
|
|
10
10
|
evalscope/api/benchmark/__init__.py,sha256=9xcTxpcQ6HhZ0QDwEIZhAT5IjybzaJ60VGLcmaFE5dU,188
|
|
11
|
-
evalscope/api/benchmark/benchmark.py,sha256=
|
|
12
|
-
evalscope/api/benchmark/meta.py,sha256=
|
|
11
|
+
evalscope/api/benchmark/benchmark.py,sha256=gqAM81SeGb_Q0rA6Q-LFpnNkOUiwOj43aRWECtCxAOE,10832
|
|
12
|
+
evalscope/api/benchmark/meta.py,sha256=N4u8NQjkjIw-xaf6KFnb6C8JDKB0DLbsXyXblDqIpvE,4304
|
|
13
13
|
evalscope/api/benchmark/adapters/__init__.py,sha256=uLt_GiU4s-_6Rjgmr4OUTtE7dvEX-ZIQ403fd6oNuxA,264
|
|
14
|
-
evalscope/api/benchmark/adapters/default_data_adapter.py,sha256=
|
|
14
|
+
evalscope/api/benchmark/adapters/default_data_adapter.py,sha256=WS4Pm0pk51Se196Ho31FmOqGyOajTtUGbbjWD9U7UwU,28064
|
|
15
15
|
evalscope/api/benchmark/adapters/image_edit_adapter.py,sha256=06V-_A8RKuMNYMt7-vaXn2qBa9LIZgfFO_6PUuhAkh0,3052
|
|
16
|
-
evalscope/api/benchmark/adapters/multi_choice_adapter.py,sha256=
|
|
17
|
-
evalscope/api/benchmark/adapters/text2image_adapter.py,sha256=
|
|
18
|
-
evalscope/api/benchmark/adapters/vision_language_adapter.py,sha256=
|
|
16
|
+
evalscope/api/benchmark/adapters/multi_choice_adapter.py,sha256=auqLNvF50Or9bo3LOmQLXHfFaTTCTqvQzZog3glInng,3062
|
|
17
|
+
evalscope/api/benchmark/adapters/text2image_adapter.py,sha256=jO64hwjQexIv-MTyHH0Ffp_6p--9TKufOmX_U39mAnE,6385
|
|
18
|
+
evalscope/api/benchmark/adapters/vision_language_adapter.py,sha256=5d7ITkeosikb7u0ag0WkMaZ0SAYGkR_wKM9NP495GKk,280
|
|
19
19
|
evalscope/api/dataset/__init__.py,sha256=RHFMzwfONEqmmn3vRtxyN3r29mipDUUUSEDhuwm0YpQ,147
|
|
20
20
|
evalscope/api/dataset/dataset.py,sha256=9bwSx89zgOOBRQkRPVv-B5Yi30A6J1MLtekQSqwsy9g,11328
|
|
21
21
|
evalscope/api/dataset/loader.py,sha256=t7KLH5ltLUumhiPIyYJzk6zn2iKLx-D2gIIoMhKdnhc,9714
|
|
22
22
|
evalscope/api/dataset/utils.py,sha256=3E0ikqr6QWV_lX0d3Z4F4xFuVTcwbeDPgCvJY7v83Bc,4935
|
|
23
23
|
evalscope/api/evaluator/__init__.py,sha256=-Ure6X4GlE7VYSNWSZ_DpjbUBGa5irVTymLENEHTYqY,138
|
|
24
24
|
evalscope/api/evaluator/cache.py,sha256=a_M2ouUjtkMr5m3wRbmsE8ETP_aacxbm0d38yY5RljM,13244
|
|
25
|
-
evalscope/api/evaluator/evaluator.py,sha256=
|
|
26
|
-
evalscope/api/evaluator/state.py,sha256=
|
|
25
|
+
evalscope/api/evaluator/evaluator.py,sha256=xMF4w2qiQ7NNgOhSKs9Vd4VZ33SCDwTTJ82lDhaj1FQ,1734
|
|
26
|
+
evalscope/api/evaluator/state.py,sha256=Elz2cmbvOOqvOaEOAMatxgk4BdjqDZB3XKTaL4iqJLI,9039
|
|
27
27
|
evalscope/api/filter/__init__.py,sha256=5eWKjT-dAiz8nE0S6WnU6plqjXZHYn7CJOgFiHSoovM,66
|
|
28
28
|
evalscope/api/filter/filter.py,sha256=fsPddaHE5wwFIXgUWITFqlYXqdh6vx3QqcEf3rSXKVI,2068
|
|
29
29
|
evalscope/api/messages/__init__.py,sha256=UKZ9VVCt7NPrcZXv_1e8MZ8mOWu0eLRvMIXykpJPZ9I,378
|
|
30
|
-
evalscope/api/messages/chat_message.py,sha256=
|
|
30
|
+
evalscope/api/messages/chat_message.py,sha256=D88TklSAWOaG21EBDVDoRPwzVCqzEGbVW4sA8Af4axc,10053
|
|
31
31
|
evalscope/api/messages/content.py,sha256=gUBUeK60BUhkwoulyzKL6q0iMt3VLlah9onLG1XVrWY,2772
|
|
32
32
|
evalscope/api/messages/utils.py,sha256=uqlEbYEoUKpXLW8tQtP-cY5Miq7W0Xl6a98j55u6m6E,1266
|
|
33
33
|
evalscope/api/metric/__init__.py,sha256=Cj2F8eiVny5uNtfPXKwQDq2owlHVKNzfr-COLYMEox4,106
|
|
34
34
|
evalscope/api/metric/metric.py,sha256=XkjBqpZbFYynhTIH8WawfPmItbDQ6jWufE_ox9zDPCU,1568
|
|
35
35
|
evalscope/api/metric/scorer.py,sha256=dczSQwkRmPk1uvNCMGT5G6nYbwWTcpwsZtyYXWkrJII,3749
|
|
36
|
-
evalscope/api/mixin/__init__.py,sha256=
|
|
37
|
-
evalscope/api/mixin/llm_judge_mixin.py,sha256=
|
|
36
|
+
evalscope/api/mixin/__init__.py,sha256=xBuoTuao5o_EFThgeeeWI87x64Q12aJttsaZc8gak_c,83
|
|
37
|
+
evalscope/api/mixin/llm_judge_mixin.py,sha256=ECVDfxCeAEkymFssD7xKhIDcct2qgQTqGnbijXk9leE,5675
|
|
38
|
+
evalscope/api/mixin/sandbox_mixin.py,sha256=uKqBtTtttKwrUArY-CTMDdFHjRBOR7Kl1sxaGHe-S2Q,7653
|
|
38
39
|
evalscope/api/model/__init__.py,sha256=YxKdz1IKUt6eYoC7nx81yD2BtyiWQDvaoTcc8O9lvoE,286
|
|
39
|
-
evalscope/api/model/generate_config.py,sha256=
|
|
40
|
-
evalscope/api/model/model.py,sha256=
|
|
40
|
+
evalscope/api/model/generate_config.py,sha256=wQeDknXb49yBKSRL9rlIyerPobGXqU-A4hL1vySNGPo,7656
|
|
41
|
+
evalscope/api/model/model.py,sha256=c7YVbYYk47MHWwPjoB66xWjgmHdUGTOSOdtIsLcJfyc,12782
|
|
41
42
|
evalscope/api/model/model_output.py,sha256=NeN6bLtAvg_3fTirewWfdP-_x4SJXa9pGuRpyXJY3B8,9333
|
|
42
43
|
evalscope/api/tool/__init__.py,sha256=bEaW5ryY-erLcl2zMoDJNgiaBqlSPAL0jQ5daUHvvrw,272
|
|
43
44
|
evalscope/api/tool/tool_call.py,sha256=WqMnw69L_yhQWycENZ7azPRhxRidhmrMcYAy7UTIqvg,2836
|
|
44
|
-
evalscope/api/tool/tool_info.py,sha256=
|
|
45
|
+
evalscope/api/tool/tool_info.py,sha256=FQOBqxKZ6Qb4f40iRH1mLg64cEhu1_-9Rn-f5iUrD2w,5733
|
|
45
46
|
evalscope/api/tool/utils.py,sha256=IWFzM6WspzBmNPicXn6b7KS6Y-1I-ErsK9fua4cb53Y,2324
|
|
46
47
|
evalscope/app/__init__.py,sha256=HWLXld_JXcBDsdL4L_4E8JsKyuBwwPUSwlejKnZ3HKc,579
|
|
47
48
|
evalscope/app/app.py,sha256=EaBWorA87ZmyIHovIE3styHWEVFsu_F70pTmP4-5zTQ,836
|
|
@@ -107,18 +108,22 @@ evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KB
|
|
|
107
108
|
evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
|
|
108
109
|
evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=jlwM13Ty-Ax6AeMsNlo9xIBupNFgnceYuXtCmh0hNTQ,6160
|
|
109
110
|
evalscope/benchmarks/__init__.py,sha256=WHR4ej9Tqa2N9CyIaUWXS8EnHZtcujaNeg9hf8GT31Y,1182
|
|
111
|
+
evalscope/benchmarks/ai2d/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
112
|
+
evalscope/benchmarks/ai2d/ai2d_adapter.py,sha256=3GBNV4cNv9bBLJRdG_uA9qNhuN6qAEutHl8d-rsFpFU,2018
|
|
110
113
|
evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
111
114
|
evalscope/benchmarks/aime/aime24_adapter.py,sha256=HTlriHoHzlm1Rf3KAiGRLs8sx6Gyf6s7RGtOjk_hGS4,1767
|
|
112
115
|
evalscope/benchmarks/aime/aime25_adapter.py,sha256=ZOE_6Zhg1MatWJSu2Zq372nKUODYtNFZimS1MJRFz5A,1591
|
|
113
116
|
evalscope/benchmarks/alpaca_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
114
117
|
evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=sjaWcK8WH1XY0kzm5eHsq_7J62EJocAf4gRV_UB8ZBE,4971
|
|
118
|
+
evalscope/benchmarks/amc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
119
|
+
evalscope/benchmarks/amc/amc_adapter.py,sha256=NzLPOmj3fJhPw6gVrB8KtxEbqwUqQ923vXHnLWEfdiU,1418
|
|
115
120
|
evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
116
121
|
evalscope/benchmarks/arc/arc_adapter.py,sha256=GASZmoJ-PpzBG70cBdABZA5uVqoyosjV-jf9WShK7L8,1622
|
|
117
122
|
evalscope/benchmarks/arena_hard/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
118
123
|
evalscope/benchmarks/arena_hard/arena_hard_adapter.py,sha256=Ddn_hVO1PvNQ_kNknXfdJCz1AVnXZEdGWq4gX1_Qqow,7275
|
|
119
124
|
evalscope/benchmarks/arena_hard/utils.py,sha256=23xCd7_ksrM4xMJBp7N2ZwpUpq1zpoQFjLm1oBcdgQY,5559
|
|
120
125
|
evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
121
|
-
evalscope/benchmarks/bbh/bbh_adapter.py,sha256=
|
|
126
|
+
evalscope/benchmarks/bbh/bbh_adapter.py,sha256=lRI-DfdFkyg4ylW4d-6CUfiNqlF7K_IoTjzJz3jYTUs,6346
|
|
122
127
|
evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
|
|
123
128
|
evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt,sha256=sfo-2iOeVzB0OGgd7NSQFELTGDTsr2DQ3u-g0ivI-sM,3653
|
|
124
129
|
evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt,sha256=UJBsc3Mwz8TZngdWH_NFlhhNbLhNHK6FvW9FHcS8H5g,1167
|
|
@@ -147,8 +152,8 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt
|
|
|
147
152
|
evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
|
|
148
153
|
evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
|
|
149
154
|
evalscope/benchmarks/bfcl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
150
|
-
evalscope/benchmarks/bfcl/bfcl_adapter.py,sha256=
|
|
151
|
-
evalscope/benchmarks/bfcl/generation.py,sha256=
|
|
155
|
+
evalscope/benchmarks/bfcl/bfcl_adapter.py,sha256=WzpL7XWDdx-EvbLluIOiMlADTO42CYs0IwQFvIfhTI0,18402
|
|
156
|
+
evalscope/benchmarks/bfcl/generation.py,sha256=c6lNjo-VTSUrVg-pqyPSucrbCKBOdBSyN0aR5AAtE4A,8701
|
|
152
157
|
evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
153
158
|
evalscope/benchmarks/ceval/ceval_adapter.py,sha256=4FLPgY-UtqINafnNxfOsE9AwS6GFXFCUGOBI-4EZUGk,8503
|
|
154
159
|
evalscope/benchmarks/chinese_simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -163,7 +168,7 @@ evalscope/benchmarks/docmath/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
|
|
|
163
168
|
evalscope/benchmarks/docmath/docmath_adapter.py,sha256=-mel6hA-x_e7fV0uOHdX5BpoQEVyQ5VqwIwEqSNDpnc,4623
|
|
164
169
|
evalscope/benchmarks/docmath/utils.py,sha256=d6Yjoa5q91kjr1SdVPVBndzDaUzMlO_GfEqMtUXXr0s,7707
|
|
165
170
|
evalscope/benchmarks/drop/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
166
|
-
evalscope/benchmarks/drop/drop_adapter.py,sha256=
|
|
171
|
+
evalscope/benchmarks/drop/drop_adapter.py,sha256=Jbbr5O_Y5LI_vT_RskRQVKxGkiIraX_uXP7fYaZ5eZs,9995
|
|
167
172
|
evalscope/benchmarks/drop/utils.py,sha256=Z9PHrNnRfGqFHCLONg5SWKARp1eTJlHFc_bU46t_YrM,1344
|
|
168
173
|
evalscope/benchmarks/frames/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
169
174
|
evalscope/benchmarks/frames/frames_adapter.py,sha256=w1kRya7w5omt95HHE6AzbzYVhyTT5r521676d_xJ6Vg,5514
|
|
@@ -180,12 +185,16 @@ evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=zWK2hhyKw5n8K30YvMjSm6XMwyrireO
|
|
|
180
185
|
evalscope/benchmarks/gpqa/prompt.py,sha256=b1Gw2D5dEdhvLYymPfcvGKJdHrIzpiZkOwURKSxiQJg,5576
|
|
181
186
|
evalscope/benchmarks/gsm8k/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
182
187
|
evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=W4vTXsC7iHN1AgvpaCf1Rj7y2O8QczIluucnpSC5aYo,2636
|
|
188
|
+
evalscope/benchmarks/healthbench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
189
|
+
evalscope/benchmarks/healthbench/healthbench_adapter.py,sha256=1sL7i9yhORH4xiFWB9puPKWNZZFJGZFAlKdlzHp-fiw,13228
|
|
190
|
+
evalscope/benchmarks/healthbench/utils.py,sha256=M8SnOEhlqXWm03CFE6CAtbMiu6MqdGgVczAv-LPjA7Y,3683
|
|
183
191
|
evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
184
192
|
evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=tAe63NfV5ljUm1f4RTSFxWOVKBUhk3Cc0EGzF5uYLK4,2041
|
|
185
193
|
evalscope/benchmarks/hle/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
186
194
|
evalscope/benchmarks/hle/hle_adapter.py,sha256=4YVmETL9mEiLxF4vWRjePLyFaxelax6nOaqoAH5ZxmU,6389
|
|
187
195
|
evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
188
|
-
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=
|
|
196
|
+
evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=uLs3UHSALS3YHt0qzBismrIqdEUgbEalQbjC0CU7ym4,4085
|
|
197
|
+
evalscope/benchmarks/humaneval/utils.py,sha256=rPnc_JuSjNg9aV7UMUwsLrDlm-ufj64GNIBCWBeuRcM,6517
|
|
189
198
|
evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
190
199
|
evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=55FQwJ0_eDijppkVVlM5XCXzgRFmjH1SvGMItGsvn6o,2769
|
|
191
200
|
evalscope/benchmarks/ifeval/instructions.py,sha256=HXnn1JgU3dpYltqIovFAn02DxkYOGw337kLMlOfJxJE,56048
|
|
@@ -200,19 +209,26 @@ evalscope/benchmarks/image_edit/gedit/vie_prompts.py,sha256=qVXWQyVUwZxEasDjVmYB
|
|
|
200
209
|
evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
201
210
|
evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=mNHA_Fuj_gAdOEoR7oChnGmErf1czqwnk8Zk-jRhBys,1304
|
|
202
211
|
evalscope/benchmarks/live_code_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
203
|
-
evalscope/benchmarks/live_code_bench/evaluate_utils.py,sha256=
|
|
212
|
+
evalscope/benchmarks/live_code_bench/evaluate_utils.py,sha256=maN8qHmDHJpexPeB0qwZoXJ5zrqPbJDYVRptqvXI9d4,6827
|
|
204
213
|
evalscope/benchmarks/live_code_bench/extract_utils.py,sha256=ZcQ8y741uawPo6I_1_XglR3eqJFDNrqc8fILKZupVRs,2375
|
|
205
|
-
evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=
|
|
214
|
+
evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=qnprJTv7zWA9aq6Lw4aDoall---kiivR0oDo3uSO2mI,6399
|
|
206
215
|
evalscope/benchmarks/live_code_bench/load_utils.py,sha256=fEzWz_fUGwi5Ncum5PNVF9jFcuDwGgs7Vt_10YKBE2Q,2087
|
|
207
216
|
evalscope/benchmarks/live_code_bench/pass_k_utils.py,sha256=Ktrp_lXdfFzoHtQNQNdGfIl26ySjaPCHm4Zv-dFvRqM,2024
|
|
208
217
|
evalscope/benchmarks/live_code_bench/prompts.py,sha256=P4KILIAIDT1MKDck0xHYV_6v9820wDZRhxVMazmlL-g,12600
|
|
218
|
+
evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py,sha256=7DDx46EwtoR776vWjofJl1zaYCLdmeq8cF3fhDGdZgA,7424
|
|
209
219
|
evalscope/benchmarks/live_code_bench/testing_util.py,sha256=TuoOTciC-hz3FTeDzsQB_THH3Be9UOP2XMrax-4sXkM,17282
|
|
210
220
|
evalscope/benchmarks/maritime_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
211
221
|
evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py,sha256=Rx7iZ5JaEo73YwIzhm78gMDQ6gqcErbnWWXHxXM6BcU,2379
|
|
212
222
|
evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
213
|
-
evalscope/benchmarks/math_500/math_500_adapter.py,sha256=
|
|
223
|
+
evalscope/benchmarks/math_500/math_500_adapter.py,sha256=hn7SQhoIHKuH-2A_nGUhQPRw2gl2G-kZldc9ueY0G3A,1802
|
|
214
224
|
evalscope/benchmarks/math_vista/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
215
225
|
evalscope/benchmarks/math_vista/math_vista_adapter.py,sha256=Mu9BpH0rDNM0yMrGws4SEOnXy2NTSIKwyLs5t4nAP-s,5842
|
|
226
|
+
evalscope/benchmarks/minerva_math/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
227
|
+
evalscope/benchmarks/minerva_math/minerva_math_adapter.py,sha256=jyT9_D4w8PTtLBN3Kn10_CnssH_mPuRNnn9rek_zUEs,1655
|
|
228
|
+
evalscope/benchmarks/mm_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
229
|
+
evalscope/benchmarks/mm_bench/mm_bench_adapter.py,sha256=py0DakGQX1JE2rqYjYN9w_-H0DtQ-YqG5k2s_UzbxxU,4372
|
|
230
|
+
evalscope/benchmarks/mm_star/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
231
|
+
evalscope/benchmarks/mm_star/mm_star_adapter.py,sha256=oamLv6U2-JAK5mdVLkUgYxkOahxQkQYMRKAyu_xPAUE,2818
|
|
216
232
|
evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
217
233
|
evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=2NT3QbfPzajUTFZ0tBCl6PRrtFtAr5jPZNQRW2Idlno,5947
|
|
218
234
|
evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -222,16 +238,27 @@ evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py,sha256=m_37OIFrJB4ZIvtbDJ_
|
|
|
222
238
|
evalscope/benchmarks/mmmu/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
223
239
|
evalscope/benchmarks/mmmu/mmmu_adapter.py,sha256=C7UM6HvomcA_Srf7771S0CaUvifBX63i161XaacraGQ,6038
|
|
224
240
|
evalscope/benchmarks/mmmu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
225
|
-
evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py,sha256=
|
|
241
|
+
evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py,sha256=banPS1nDt9bQ95urKbSZnR-hBTw23eL9MSrHt_0ZLp0,4725
|
|
242
|
+
evalscope/benchmarks/multi_if/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
243
|
+
evalscope/benchmarks/multi_if/ifeval.py,sha256=7y2rnJ4q1_DVA7I9mUnF7TBpu7Kez0X_Xhl-AJInzWk,87949
|
|
244
|
+
evalscope/benchmarks/multi_if/metrics.py,sha256=LWnhQw25cRNMReJ_xJ7Fx7WYHcT9i2FG1FUjYOuQDrI,4291
|
|
245
|
+
evalscope/benchmarks/multi_if/multi_if_adapter.py,sha256=I3_YPPUuRbrs9Gt3Qjhx9RM5Vu2gDFnheDcGu-oe840,5924
|
|
226
246
|
evalscope/benchmarks/musr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
227
247
|
evalscope/benchmarks/musr/musr_adapter.py,sha256=kx6bckj7Nijl4Wysuj-mKYdy0hIRDJho8yVTup403Hc,1473
|
|
228
248
|
evalscope/benchmarks/needle_haystack/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
229
|
-
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py,sha256=
|
|
249
|
+
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py,sha256=GYaswWPwYI3aV5HSpcuBTgW9-HDtf2xzNZg0WrsI0Yo,17033
|
|
230
250
|
evalscope/benchmarks/needle_haystack/utils.py,sha256=k8WDigqt5LgzHw6DtaYsLtb3BJL0FTZS9JOyJCpoPq8,2935
|
|
251
|
+
evalscope/benchmarks/olympiad_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
252
|
+
evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py,sha256=zePVmGjmyuwCWVb4h1PIQKAIFqBehwRwO2WOD0KX_ik,6565
|
|
253
|
+
evalscope/benchmarks/olympiad_bench/utils.py,sha256=w7vEZcT3vCVq8_DSMgAjZPpVFVHStJPJYsPkrs-yOFM,21412
|
|
254
|
+
evalscope/benchmarks/omni_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
255
|
+
evalscope/benchmarks/omni_bench/omni_bench_adapter.py,sha256=IJkRSokQC6MF_pN46Yofr_NaZaNt1XZFX1PUBmX4-qA,3651
|
|
231
256
|
evalscope/benchmarks/process_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
232
257
|
evalscope/benchmarks/process_bench/process_bench_adapter.py,sha256=XN3F6NH7mF4ibwGX5nI01sqEHz05UQFnBAyfAe14QYE,6174
|
|
233
258
|
evalscope/benchmarks/race/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
234
259
|
evalscope/benchmarks/race/race_adapter.py,sha256=KibT9gHpIOZhTcWihG0dUDAX4gAHa2g1WdGPOcEP9OY,1705
|
|
260
|
+
evalscope/benchmarks/real_world_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
261
|
+
evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py,sha256=J2u0J9d31uvkoz9nBI9tCMqG27hmYwdLQPPef9jx_pg,2788
|
|
235
262
|
evalscope/benchmarks/simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
236
263
|
evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=_duveAliSaPUqVSLQ2TtSv5sfwvFFy7t-MgIIokQ24s,9017
|
|
237
264
|
evalscope/benchmarks/super_gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -240,7 +267,7 @@ evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=SPqpBebiHj_oyEqU94p
|
|
|
240
267
|
evalscope/benchmarks/super_gpqa/utils.py,sha256=OK_oT-DnWNssITEwu_Zc3Ty5v21n0IaJQYftK2cpwmQ,3401
|
|
241
268
|
evalscope/benchmarks/tau_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
242
269
|
evalscope/benchmarks/tau_bench/generation.py,sha256=d7J5xrxEI-0BYxdSuxdDavcR7f1ipBdpQsKZzwyzGds,5190
|
|
243
|
-
evalscope/benchmarks/tau_bench/tau_bench_adapter.py,sha256=
|
|
270
|
+
evalscope/benchmarks/tau_bench/tau_bench_adapter.py,sha256=47wA0ia6gezA3nqvUpd4Pb8f5alCrBKEt7GOxJFupow,6464
|
|
244
271
|
evalscope/benchmarks/text2image/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
245
272
|
evalscope/benchmarks/text2image/evalmuse_adapter.py,sha256=g-Wc1qTg-xWLTjiZPo8zmQud75ac-8mBpYRxOHfiO0g,3024
|
|
246
273
|
evalscope/benchmarks/text2image/genai_bench_adapter.py,sha256=1GDB3gS9zwrfb9C83LQdQyN7bvvqeYuu5ulJ9Igmi2k,1876
|
|
@@ -268,14 +295,14 @@ evalscope/collections/__init__.py,sha256=x05hFLrjGsdtuHtc6PyQXHNuucVdYaBN9ZrM8gB
|
|
|
268
295
|
evalscope/collections/sampler.py,sha256=086pzXQO4CO_QYCd10z149Sjh6sBpRBeIHf5OTLOVu8,4896
|
|
269
296
|
evalscope/collections/schema.py,sha256=yzAlnH0O7iiWB4UnkFXI_Dvxcsq9hDgl0aGK2OpyBY8,4158
|
|
270
297
|
evalscope/evaluator/__init__.py,sha256=KzYmVTfU-1pdX7va7l3B1-5QKWG07hj1B7rYkMmxitY,91
|
|
271
|
-
evalscope/evaluator/evaluator.py,sha256=
|
|
298
|
+
evalscope/evaluator/evaluator.py,sha256=o99m1CF7xuc3Qn2M25AhWulothZxICwZgZiWxSbynTc,14435
|
|
272
299
|
evalscope/filters/__init__.py,sha256=AsXwKYDjGhFsJvtj036PRjMOPsHGt-CRicnHTtM_qA4,51
|
|
273
300
|
evalscope/filters/extraction.py,sha256=KLFr_3XYsrv0PTvmXy0ugj2sqv2ZOWJFV7G_MmGjTHk,4146
|
|
274
301
|
evalscope/filters/selection.py,sha256=yiJu2JjXDH_lgfEtB9umkGcA3zpo3zvnyoq2mKrXbnw,1609
|
|
275
302
|
evalscope/metrics/__init__.py,sha256=1giVHESSjn98uBiAvYm5uLsmRQwmf9NHPSt7OT_QJss,1615
|
|
276
303
|
evalscope/metrics/llm_judge.py,sha256=XukhH9PQtIZAcbjJlOmOD9ye3ngRv_IGKKJE9jhheOE,8653
|
|
277
304
|
evalscope/metrics/math_parser.py,sha256=BMfautQtNNiF9f2DIEfO6SXSn_GYhzaddAjGWG10MJA,17257
|
|
278
|
-
evalscope/metrics/metric.py,sha256=
|
|
305
|
+
evalscope/metrics/metric.py,sha256=CabKKEbw_DptyH1ZQju7WzjB47fWUKdOhFB1ROpUC-4,10871
|
|
279
306
|
evalscope/metrics/metrics.py,sha256=VxAggzEfaLKxWcXyuve8QbEBwV2W71udVyt0gynzGec,14134
|
|
280
307
|
evalscope/metrics/rouge_metric.py,sha256=bqvSotuDdC0MEKmt8v6y6tBTBx0S3Ma-tfF-cMCckA4,4645
|
|
281
308
|
evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
|
|
@@ -384,47 +411,47 @@ evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugmen
|
|
|
384
411
|
evalscope/models/__init__.py,sha256=RmW2S31BHBhMN49_VVF_5PJAk-TsuZQkuF2ALShbhAw,556
|
|
385
412
|
evalscope/models/image_edit_model.py,sha256=oVjGgebnFu3ZXBJLNn62rJ65fcJR7DlG4qEVxisPJ2Y,4104
|
|
386
413
|
evalscope/models/mockllm.py,sha256=t1fFAHkEb1n_atOCfnGteCX3DWp774lnWcHzi5lBjwM,2511
|
|
387
|
-
evalscope/models/model_apis.py,sha256=
|
|
414
|
+
evalscope/models/model_apis.py,sha256=ZkZ_nfbeAFJnCndRvRIRLcbmJFTMhGRBi-WfMu0uZKE,1922
|
|
388
415
|
evalscope/models/modelscope.py,sha256=jSFkho_Ir2py54y_Bwj9jpCoY2mMKkZ8ORzne-ldAIE,15806
|
|
389
416
|
evalscope/models/openai_compatible.py,sha256=2uK78nDhWwgph7hcIiMc3NHRbIwvswRDM9o9ENahj4k,4659
|
|
390
417
|
evalscope/models/text2image_model.py,sha256=Sdiyw6vewjVTiXK8RFEh1pohOhDge80EoIWYpnLjr5Y,3929
|
|
391
|
-
evalscope/models/utils/openai.py,sha256=
|
|
418
|
+
evalscope/models/utils/openai.py,sha256=0DzuvTQYFEqcTp6sVtB2VZY7xeyWcOS0I6votqWegUg,28130
|
|
392
419
|
evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
393
|
-
evalscope/perf/arguments.py,sha256=
|
|
420
|
+
evalscope/perf/arguments.py,sha256=FmwVE4gC09B8nLd0sdczeEA9b5ztv4kwhOvLuby4wI8,11695
|
|
394
421
|
evalscope/perf/benchmark.py,sha256=nSJr8lQvHDYiG33tNhkYaVOYONjhJ2wUb1x5RlUiXRY,7968
|
|
395
422
|
evalscope/perf/http_client.py,sha256=4Ov1Cwi7gMgO05ZmazwyfYjUGAQNGWn7nbfl1ljRNh4,4610
|
|
396
423
|
evalscope/perf/main.py,sha256=WZbBgFhIj9KqxzC7_NZxDlou019_EXatsHRt5vqDhFg,3439
|
|
397
424
|
evalscope/perf/plugin/__init__.py,sha256=Ztj4h1_JYJqbbWkeuDTj5aTRyGQf5Woc4xEIyjcokVU,94
|
|
398
425
|
evalscope/perf/plugin/registry.py,sha256=GhLe-h1rGzya2bgIUaV5VymQIaHqI7h5SG_i4PoGAm8,1967
|
|
399
426
|
evalscope/perf/plugin/api/__init__.py,sha256=7RsGdYTSfnW6iVpveEzNu8v4x8Yc8H-Kk39DqOHMrd4,152
|
|
400
|
-
evalscope/perf/plugin/api/base.py,sha256=
|
|
427
|
+
evalscope/perf/plugin/api/base.py,sha256=RRZVk9MFuBwb9PFPTklFhQ_RTihg_E7W_LR26-ldPIA,2782
|
|
401
428
|
evalscope/perf/plugin/api/custom_api.py,sha256=VYJO2lUt9EKdWz6zeYCfvdI0MqfcsIgcKvxqvY5C-3k,10376
|
|
402
429
|
evalscope/perf/plugin/api/dashscope_api.py,sha256=Miv2pzMa6sxZyYYJhCzcbOI_QHuZx7tazKpb6Not7ck,3627
|
|
403
|
-
evalscope/perf/plugin/api/default_api.py,sha256=
|
|
404
|
-
evalscope/perf/plugin/api/openai_api.py,sha256=
|
|
430
|
+
evalscope/perf/plugin/api/default_api.py,sha256=qvMIjbe_rM13cDHcFCwjtCsjc11qE80Yg7LypaSNTXc,4251
|
|
431
|
+
evalscope/perf/plugin/api/openai_api.py,sha256=a6w4C_voza61trHskHaWNPFr2x2zhRVwIXdiNnMH81E,10570
|
|
405
432
|
evalscope/perf/plugin/datasets/__init__.py,sha256=qzeQ9BrJhiJJm1wHaFeOQkvXXdSd15Ucspbn5zjs-6Q,495
|
|
406
433
|
evalscope/perf/plugin/datasets/base.py,sha256=-3Ihnp2hYvZyPnP8Gh2Pu8ovlLNFHyZnNgRu3WHG4d0,2714
|
|
407
434
|
evalscope/perf/plugin/datasets/custom.py,sha256=yoRHTvTGAglaZ-mmRkPjYNMG7uZYuT1_KrBxnl2i0qg,1385
|
|
408
|
-
evalscope/perf/plugin/datasets/flickr8k.py,sha256=
|
|
409
|
-
evalscope/perf/plugin/datasets/kontext_bench.py,sha256
|
|
435
|
+
evalscope/perf/plugin/datasets/flickr8k.py,sha256=nhHiGNhXX-2c17NQ5q5Q7FgV2hB8XVeeAP8dKkboyHE,1033
|
|
436
|
+
evalscope/perf/plugin/datasets/kontext_bench.py,sha256=cN70hiBX1940IWvNWZG9YGE4vO1yj41Bo7bqmOWusoQ,1081
|
|
410
437
|
evalscope/perf/plugin/datasets/line_by_line.py,sha256=F4ppdjKKLzFNf_16h6S-6nAU4lOfOFI2-tPgIeZDTMA,996
|
|
411
438
|
evalscope/perf/plugin/datasets/longalpaca.py,sha256=JjPGYP8NdPmP48wff2fL5IZQfajXL5qhZBvKmZxtfW4,1336
|
|
412
439
|
evalscope/perf/plugin/datasets/openqa.py,sha256=5PqqiIvNTLlRrPb8PWqMGQyWRb6LuIqipYn67-xd-dY,1519
|
|
413
440
|
evalscope/perf/plugin/datasets/random_dataset.py,sha256=NNAXvgFPkLDOSpYNex1DyE4X-ELtQRm13_oBooO30j8,3514
|
|
414
|
-
evalscope/perf/plugin/datasets/random_vl_dataset.py,sha256=
|
|
441
|
+
evalscope/perf/plugin/datasets/random_vl_dataset.py,sha256=e6exWQnupWkTDNwt2MmEK-hccuxEDmWLJRMM70onKi0,3230
|
|
415
442
|
evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=J6q7AF_Re5eHLVejXEw9c1jlk1T1PPmist0yO9UFTPE,2432
|
|
416
443
|
evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
417
444
|
evalscope/perf/utils/analysis_result.py,sha256=aoT7JD2zAzBeuZUfncKhJ2odX_7KnymwOmNB1Upam2c,935
|
|
418
|
-
evalscope/perf/utils/benchmark_util.py,sha256=
|
|
445
|
+
evalscope/perf/utils/benchmark_util.py,sha256=A5d--rCElabDOl6Aaxqnu0fNR5c763YZwKIHBSeTK00,7294
|
|
419
446
|
evalscope/perf/utils/db_util.py,sha256=HAISq6M7xCD2gjUEqqfbK3FjBxA-tvr_n-751tU9ypo,11634
|
|
420
447
|
evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
|
|
421
448
|
evalscope/perf/utils/local_server.py,sha256=_lSPlNEnOmPA_DtREgPS_vj2w_7D8PPSpypXbb0YfJM,4880
|
|
422
449
|
evalscope/perf/utils/log_utils.py,sha256=NWSK_ITG4yoVx5GMLbIRGDoXSs90s7X3mftdm37Os2U,1666
|
|
423
450
|
evalscope/perf/utils/rich_display.py,sha256=AQmXv1EuA1-IGgco-Jy1NLOmTKv4eBFH2K4QS8OoGVo,8206
|
|
424
451
|
evalscope/report/__init__.py,sha256=DTigCg9fkU_zGNDqIaZy3CWYbrlvODvCxCTVqSx6ano,875
|
|
425
|
-
evalscope/report/combinator.py,sha256=
|
|
452
|
+
evalscope/report/combinator.py,sha256=Xzlhs7kwfI6cgs7rngxhvsur0bCJkrM0tAy6isq2VME,3235
|
|
426
453
|
evalscope/report/generator.py,sha256=t2R3WGa4SowTRUPOgITtyTR4QDiJ6i3FH__byDKZU8Y,4959
|
|
427
|
-
evalscope/report/report.py,sha256=
|
|
454
|
+
evalscope/report/report.py,sha256=lEBD_E_RJiydFTaGFNLIMTFxNrqv8QcLZb_iuUg5HB0,8479
|
|
428
455
|
evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
429
456
|
evalscope/third_party/longbench_write/README.md,sha256=1yLKeSVIcihpoc4KXr8NpK86JbcLssCPx76aOKdPbYI,5431
|
|
430
457
|
evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u7z6OI79czoBlP8lX4PVo,126
|
|
@@ -464,12 +491,12 @@ evalscope/utils/__init__.py,sha256=5OH8cOoX3YKMKUu0dMRvwzckXligIbUV-1jjJNXlpGI,2
|
|
|
464
491
|
evalscope/utils/argument_utils.py,sha256=D7qOH85wf7LKh_cJ2X51OEaL7CMaddydmHZkfoYpvLk,1952
|
|
465
492
|
evalscope/utils/chat_service.py,sha256=sSki2pKGQP3UjcIf_lbO06afI-vsaUAqglwX__wUDEw,8766
|
|
466
493
|
evalscope/utils/deprecation_utils.py,sha256=aDv3HFNcJFZ7rxNgALQP0-ITO8L23HC_RX-C_m2i34Y,1610
|
|
467
|
-
evalscope/utils/function_utils.py,sha256=
|
|
468
|
-
evalscope/utils/import_utils.py,sha256=
|
|
469
|
-
evalscope/utils/io_utils.py,sha256=
|
|
470
|
-
evalscope/utils/json_schema.py,sha256=
|
|
471
|
-
evalscope/utils/logger.py,sha256=
|
|
472
|
-
evalscope/utils/model_utils.py,sha256=
|
|
494
|
+
evalscope/utils/function_utils.py,sha256=E-AIzx_PKrZDGl1cBvlvqNvMa8yM2WUJ2wh73PNBXrQ,1887
|
|
495
|
+
evalscope/utils/import_utils.py,sha256=S0WQ3gt4zpwJHjGcyC-604pWWExg3JV7f3wzoOH-tuo,5794
|
|
496
|
+
evalscope/utils/io_utils.py,sha256=79F0p7dFxA84tIVSL_C4piJgeQQtVUfb2R_Xcd8v_cE,11615
|
|
497
|
+
evalscope/utils/json_schema.py,sha256=ZExvQA-SI6SxWBx_hCmuQ2RRqwGKuywy4sTotvd2hH0,8288
|
|
498
|
+
evalscope/utils/logger.py,sha256=roFk4Su4aJwsF0s-uYc5-tABnghwYPX3gpkA5QUGzK8,5675
|
|
499
|
+
evalscope/utils/model_utils.py,sha256=mdtYoHhUdfpxUtnS52XZjNdO3uSK4yeIBHT3aDU7s-A,2455
|
|
473
500
|
evalscope/utils/multi_choices.py,sha256=OxBER7amWpoRY0Z-o39rDmCNK6wpr1HQm9mMHpWLgp0,9524
|
|
474
501
|
evalscope/utils/url_utils.py,sha256=9HcFt9uZNbOJR3ADUFQ_dBFKziHV6H66Df7HYs1M4Po,1757
|
|
475
502
|
tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
@@ -477,17 +504,18 @@ tests/common.py,sha256=BB136KcGaEfdWqMwApa48K0CTSGmOCUZ0FYDqpfYnAA,2423
|
|
|
477
504
|
tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
|
|
478
505
|
tests/utils.py,sha256=Fgm0CU6ilZjCGOfOMJH-Trxy0UIAGbhvy0Ijy_zDGUk,323
|
|
479
506
|
tests/benchmark/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
480
|
-
tests/benchmark/test_eval.py,sha256=
|
|
507
|
+
tests/benchmark/test_eval.py,sha256=vSAvhiCKxHpjHdGhZn8l0qzPSiG1ZZafz_M06B_a8_Y,13827
|
|
481
508
|
tests/benchmark/test_image_edit.py,sha256=z3z7psMRFynpVgUAFoH--ieeGXzb9cHkrq3tT_sCZo8,2165
|
|
509
|
+
tests/benchmark/test_sandbox.py,sha256=bHyX8ammdn7EsEbN80cIzDNhQZlJD3Ssoj9l4efF7rI,2968
|
|
482
510
|
tests/benchmark/test_t2i.py,sha256=fciaGsOrkOpT4WQlsnmjrqw6qolCzI0DGyWQAJkM-Es,4513
|
|
483
|
-
tests/benchmark/test_vlm.py,sha256=
|
|
511
|
+
tests/benchmark/test_vlm.py,sha256=gn0ledf_yPY1IhCyCtiqT_dTVPUVZ3NVPr9yzsC_UZQ,4501
|
|
484
512
|
tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
485
513
|
tests/cli/test_all.py,sha256=1omOXC1lBphBLm0hTf5HNstlF_bwi16dYyr00gvaCTM,7301
|
|
486
514
|
tests/cli/test_collection.py,sha256=lGz3YUS_0gM6_HjQLe26OfBAkHOPOEDWMO-UyP58GN8,4455
|
|
487
515
|
tests/cli/test_custom.py,sha256=9z_N7Re712xI62TqVSTBdzB_iFFEUb55wcWIcGvJb84,9254
|
|
488
516
|
tests/cli/test_reasoning.py,sha256=rU181LLoKbFCpNPFCIZULxEgsJ2PYswel2pP2EsjEmo,2696
|
|
489
517
|
tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
490
|
-
tests/perf/test_perf.py,sha256=
|
|
518
|
+
tests/perf/test_perf.py,sha256=ugYNEyU32ctryPFa_6fr8aQYxfHJMymdKnKKEHM9Ajc,6174
|
|
491
519
|
tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
492
520
|
tests/rag/test_clip_benchmark.py,sha256=qpSLgmHMGcYTnxP7AI__y-ii5_tu_fCSht6p3TBetkA,2650
|
|
493
521
|
tests/rag/test_mteb.py,sha256=fdNQIyUEzE7puPCKw5QhCHTEu7hz-ieHeq1xCWGh6IM,7246
|
|
@@ -498,9 +526,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=RwrKkc1WHEZxetM11cGL81G4faKCn7SYn4
|
|
|
498
526
|
tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=UAUtOCQ72xbm8s-sov3cBEpYVDy189wpB-qOL3KoU7M,6053
|
|
499
527
|
tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
|
|
500
528
|
tests/vlm/test_vlmeval.py,sha256=EDQRkYfSyOICUwo_tm3p-puaE_xdFmqOPkrt5etxsqM,3307
|
|
501
|
-
evalscope-1.0.
|
|
502
|
-
evalscope-1.0.
|
|
503
|
-
evalscope-1.0.
|
|
504
|
-
evalscope-1.0.
|
|
505
|
-
evalscope-1.0.
|
|
506
|
-
evalscope-1.0.
|
|
529
|
+
evalscope-1.0.2.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
|
|
530
|
+
evalscope-1.0.2.dist-info/METADATA,sha256=vZciS7qNosSJOdwyRSxsCyVqvw8hyqKS84yKjlbxwzw,40305
|
|
531
|
+
evalscope-1.0.2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
|
532
|
+
evalscope-1.0.2.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
|
|
533
|
+
evalscope-1.0.2.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
|
|
534
|
+
evalscope-1.0.2.dist-info/RECORD,,
|
tests/benchmark/test_eval.py
CHANGED
|
@@ -33,12 +33,13 @@ class TestNativeBenchmark(TestBenchmark):
|
|
|
33
33
|
'judge_strategy': JudgeStrategy.AUTO,
|
|
34
34
|
'judge_worker_num': 5,
|
|
35
35
|
'judge_model_args': {
|
|
36
|
-
'model_id': '
|
|
36
|
+
'model_id': 'qwen3-235b-a22b',
|
|
37
37
|
'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
38
38
|
'api_key': env.get('DASHSCOPE_API_KEY'),
|
|
39
39
|
'generation_config': {
|
|
40
40
|
'temperature': 0.0,
|
|
41
41
|
'max_tokens': 4096,
|
|
42
|
+
'extra_body': {'enable_thinking': False}
|
|
42
43
|
}
|
|
43
44
|
},
|
|
44
45
|
'debug': True,
|
|
@@ -65,6 +66,14 @@ class TestNativeBenchmark(TestBenchmark):
|
|
|
65
66
|
}
|
|
66
67
|
self._run_dataset_test('mmlu', use_mock=True, dataset_args=dataset_args)
|
|
67
68
|
|
|
69
|
+
def test_mmlu_reasoning(self):
|
|
70
|
+
"""Test MMLU reasoning dataset."""
|
|
71
|
+
dataset_args = {
|
|
72
|
+
'few_shot_num': 0,
|
|
73
|
+
'subset_list': ['abstract_algebra', 'computer_security']
|
|
74
|
+
}
|
|
75
|
+
self._run_dataset_test('mmlu', dataset_args=dataset_args, model='qwen3-0.6b', stream=True)
|
|
76
|
+
|
|
68
77
|
def test_mmlu_pro(self):
|
|
69
78
|
"""Test MMLU-Pro reasoning dataset."""
|
|
70
79
|
dataset_args = {
|
|
@@ -203,6 +212,7 @@ class TestNativeBenchmark(TestBenchmark):
|
|
|
203
212
|
def test_bbh(self):
|
|
204
213
|
dataset_args = {
|
|
205
214
|
'subset_list': ['temporal_sequences', 'navigate'],
|
|
215
|
+
'few_shot_num': 0,
|
|
206
216
|
}
|
|
207
217
|
self._run_dataset_test('bbh', dataset_args=dataset_args)
|
|
208
218
|
|
|
@@ -317,20 +327,21 @@ class TestNativeBenchmark(TestBenchmark):
|
|
|
317
327
|
def test_humaneval(self):
|
|
318
328
|
"""Test HumanEval dataset."""
|
|
319
329
|
dataset_args = {
|
|
320
|
-
'metric_list': ['Pass@1'
|
|
330
|
+
'metric_list': ['Pass@1']
|
|
321
331
|
}
|
|
322
|
-
self._run_dataset_test('humaneval', dataset_args
|
|
332
|
+
self._run_dataset_test('humaneval', dataset_args)
|
|
323
333
|
|
|
324
334
|
def test_live_code_bench(self):
|
|
325
335
|
"""Test LiveCodeBench dataset."""
|
|
326
336
|
dataset_args = {
|
|
327
|
-
'subset_list': ['
|
|
337
|
+
'subset_list': ['v5'],
|
|
338
|
+
'review_timeout': 6,
|
|
328
339
|
'extra_params': {
|
|
329
340
|
'start_date': '2024-08-01',
|
|
330
341
|
'end_date': '2025-02-28'
|
|
331
342
|
},
|
|
332
343
|
}
|
|
333
|
-
self._run_dataset_test('live_code_bench', dataset_args,
|
|
344
|
+
self._run_dataset_test('live_code_bench', dataset_args, limit=20, use_cache='outputs/20250918_200232', rerun_review=True)
|
|
334
345
|
|
|
335
346
|
def test_tool_bench(self):
|
|
336
347
|
"""Test ToolBench dataset."""
|
|
@@ -339,13 +350,18 @@ class TestNativeBenchmark(TestBenchmark):
|
|
|
339
350
|
def test_bfcl(self):
|
|
340
351
|
"""Test BFCL dataset."""
|
|
341
352
|
dataset_args = {
|
|
342
|
-
'subset_list': [
|
|
353
|
+
'subset_list': [
|
|
354
|
+
# 'simple',
|
|
355
|
+
# 'live_multiple',
|
|
356
|
+
# 'multi_turn_base',
|
|
357
|
+
'multi_turn_miss_func'
|
|
358
|
+
],
|
|
343
359
|
'extra_params': {
|
|
344
360
|
'is_fc_model': True,
|
|
345
361
|
'underscore_to_dot': True
|
|
346
362
|
}
|
|
347
363
|
}
|
|
348
|
-
self._run_dataset_test('bfcl_v3', dataset_args, model='
|
|
364
|
+
self._run_dataset_test('bfcl_v3', dataset_args, model='qwen-plus', limit=30, eval_batch_size=5)
|
|
349
365
|
|
|
350
366
|
def test_tau_bench(self):
|
|
351
367
|
dataset_args = {
|
|
@@ -378,6 +394,34 @@ class TestNativeBenchmark(TestBenchmark):
|
|
|
378
394
|
}
|
|
379
395
|
self._run_dataset_test('data_collection', dataset_args)
|
|
380
396
|
|
|
397
|
+
def test_multi_if(self):
|
|
398
|
+
dataset_args = {
|
|
399
|
+
'subset_list': ['English', 'Chinese'],
|
|
400
|
+
'few_shot_num': 0,
|
|
401
|
+
}
|
|
402
|
+
self._run_dataset_test('multi_if', dataset_args, limit=5)
|
|
403
|
+
|
|
404
|
+
def test_healthbench(self):
|
|
405
|
+
dataset_args = {
|
|
406
|
+
'subset_list': ['health_data_tasks'],
|
|
407
|
+
'extra_params': {
|
|
408
|
+
'version': 'Hard'
|
|
409
|
+
}
|
|
410
|
+
}
|
|
411
|
+
self._run_dataset_test('health_bench', dataset_args, limit=5)
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
def test_amc(self):
|
|
415
|
+
dataset_args = {
|
|
416
|
+
'subset_list': ['amc22'],
|
|
417
|
+
}
|
|
418
|
+
self._run_dataset_test('amc', dataset_args)
|
|
419
|
+
|
|
420
|
+
def test_minerva_math(self):
|
|
421
|
+
dataset_args = {
|
|
422
|
+
'subset_list': ['default'],
|
|
423
|
+
}
|
|
424
|
+
self._run_dataset_test('minerva_math', dataset_args)
|
|
381
425
|
|
|
382
426
|
if __name__ == '__main__':
|
|
383
427
|
# Run specific test: python -m unittest test_eval.TestBenchmark.test_gsm8k
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# Copyright (c) Alibaba, Inc. and its affiliates.
|
|
2
|
+
from dotenv import dotenv_values
|
|
3
|
+
|
|
4
|
+
env = dotenv_values('.env')
|
|
5
|
+
|
|
6
|
+
import unittest
|
|
7
|
+
|
|
8
|
+
from evalscope.constants import EvalType, JudgeStrategy, OutputType
|
|
9
|
+
from evalscope.utils.logger import get_logger
|
|
10
|
+
from tests.common import TestBenchmark
|
|
11
|
+
|
|
12
|
+
logger = get_logger()
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class TestCodeBenchmark(TestBenchmark):
|
|
16
|
+
"""Benchmark evaluation test cases."""
|
|
17
|
+
|
|
18
|
+
def setUp(self):
|
|
19
|
+
"""Setup common test configuration."""
|
|
20
|
+
self.base_config = {
|
|
21
|
+
'model': 'qwen-plus',
|
|
22
|
+
'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
23
|
+
'api_key': env.get('DASHSCOPE_API_KEY'),
|
|
24
|
+
'eval_type': EvalType.SERVICE,
|
|
25
|
+
'eval_batch_size': 5,
|
|
26
|
+
'limit': 5,
|
|
27
|
+
'generation_config': {
|
|
28
|
+
'max_tokens': 4096,
|
|
29
|
+
'temperature': 0.0,
|
|
30
|
+
'seed': 42,
|
|
31
|
+
'parallel_tool_calls': True
|
|
32
|
+
},
|
|
33
|
+
'judge_strategy': JudgeStrategy.AUTO,
|
|
34
|
+
'judge_worker_num': 5,
|
|
35
|
+
'judge_model_args': {
|
|
36
|
+
'model_id': 'qwen2.5-72b-instruct',
|
|
37
|
+
'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
|
|
38
|
+
'api_key': env.get('DASHSCOPE_API_KEY'),
|
|
39
|
+
'generation_config': {
|
|
40
|
+
'temperature': 0.0,
|
|
41
|
+
'max_tokens': 4096,
|
|
42
|
+
}
|
|
43
|
+
},
|
|
44
|
+
'use_sandbox': True,
|
|
45
|
+
'sandbox_type': 'docker',
|
|
46
|
+
'debug': True,
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
def test_humaneval(self):
|
|
50
|
+
"""Test Humaneval dataset."""
|
|
51
|
+
self._run_dataset_test('humaneval', limit=5)
|
|
52
|
+
|
|
53
|
+
def test_humaneval_remote_sandbox(self):
|
|
54
|
+
"""Test Humaneval dataset with remote sandbox manager."""
|
|
55
|
+
sandbox_manager_config = {'base_url': 'http://localhost:8000'}
|
|
56
|
+
self._run_dataset_test('humaneval', limit=5, sandbox_manager_config=sandbox_manager_config)
|
|
57
|
+
|
|
58
|
+
def test_live_code_bench(self):
|
|
59
|
+
"""Test Live Code Bench dataset."""
|
|
60
|
+
dataset_args = {
|
|
61
|
+
'subset_list': ['v5'],
|
|
62
|
+
'review_timeout': 6,
|
|
63
|
+
'extra_params': {
|
|
64
|
+
'start_date': '2024-08-01',
|
|
65
|
+
'end_date': '2025-02-28'
|
|
66
|
+
},
|
|
67
|
+
}
|
|
68
|
+
self._run_dataset_test('live_code_bench', limit=5, dataset_args=dataset_args, use_cache='outputs/20250918_200232', rerun_review=True)
|
|
69
|
+
|
|
70
|
+
def test_live_code_bench_remote_sandbox(self):
|
|
71
|
+
"""Test Live Code Bench dataset."""
|
|
72
|
+
dataset_args = {
|
|
73
|
+
'subset_list': ['v5'],
|
|
74
|
+
'review_timeout': 6,
|
|
75
|
+
'extra_params': {
|
|
76
|
+
'start_date': '2024-08-01',
|
|
77
|
+
'end_date': '2025-02-28'
|
|
78
|
+
},
|
|
79
|
+
}
|
|
80
|
+
sandbox_manager_config = {'base_url': 'http://localhost:8000'}
|
|
81
|
+
self._run_dataset_test('live_code_bench', limit=20, dataset_args=dataset_args, sandbox_manager_config=sandbox_manager_config, use_cache='outputs/20250918_200232_2', rerun_review=True)
|
tests/benchmark/test_vlm.py
CHANGED
|
@@ -25,7 +25,7 @@ class TestVLMBenchmark(TestBenchmark):
|
|
|
25
25
|
'eval_batch_size': 5,
|
|
26
26
|
'limit': 5,
|
|
27
27
|
'generation_config': {
|
|
28
|
-
'max_tokens':
|
|
28
|
+
'max_tokens': 2048,
|
|
29
29
|
'temperature': 0.0,
|
|
30
30
|
'seed': 42,
|
|
31
31
|
'parallel_tool_calls': True
|
|
@@ -72,9 +72,66 @@ class TestVLMBenchmark(TestBenchmark):
|
|
|
72
72
|
}
|
|
73
73
|
self._run_dataset_test('mmmu_pro', dataset_args=dataset_args, limit=10)
|
|
74
74
|
|
|
75
|
-
def
|
|
75
|
+
def test_qwen3_vl_collection(self):
|
|
76
76
|
dataset_args = {
|
|
77
77
|
'dataset_id': 'outputs/qwen3_vl_test.jsonl',
|
|
78
78
|
'shuffle': True,
|
|
79
79
|
}
|
|
80
|
-
self._run_dataset_test('data_collection', dataset_args)
|
|
80
|
+
self._run_dataset_test('data_collection', dataset_args, limit=100)
|
|
81
|
+
|
|
82
|
+
def test_real_world_qa(self):
|
|
83
|
+
dataset_args = {
|
|
84
|
+
'subset_list': ['default']
|
|
85
|
+
}
|
|
86
|
+
self._run_dataset_test('real_world_qa', dataset_args=dataset_args, limit=10)
|
|
87
|
+
|
|
88
|
+
def test_ai2d(self):
|
|
89
|
+
dataset_args = {
|
|
90
|
+
'subset_list': ['default']
|
|
91
|
+
}
|
|
92
|
+
self._run_dataset_test('ai2d', dataset_args=dataset_args)
|
|
93
|
+
|
|
94
|
+
def test_cc_bench(self):
|
|
95
|
+
dataset_args = {
|
|
96
|
+
'subset_list': ['cc']
|
|
97
|
+
}
|
|
98
|
+
self._run_dataset_test('cc_bench', dataset_args=dataset_args)
|
|
99
|
+
|
|
100
|
+
def test_mm_bench(self):
|
|
101
|
+
dataset_args = {
|
|
102
|
+
'subset_list': ['cn', 'en']
|
|
103
|
+
}
|
|
104
|
+
self._run_dataset_test('mm_bench', dataset_args=dataset_args)
|
|
105
|
+
|
|
106
|
+
def test_mm_star(self):
|
|
107
|
+
dataset_args = {
|
|
108
|
+
# 'subset_list': ['val']
|
|
109
|
+
}
|
|
110
|
+
self._run_dataset_test('mm_star', dataset_args=dataset_args)
|
|
111
|
+
|
|
112
|
+
def test_omni_bench(self):
|
|
113
|
+
dataset_args = {
|
|
114
|
+
'extra_params': {
|
|
115
|
+
'use_image': True, # Whether to use image input, if False, use text alternative image content.
|
|
116
|
+
'use_audio': True, # Whether to use audio input, if False, use text alternative audio content.
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
self._run_dataset_test('omni_bench', dataset_args=dataset_args, model='qwen-omni-turbo')
|
|
120
|
+
|
|
121
|
+
def test_olympiad_bench(self):
|
|
122
|
+
dataset_args = {
|
|
123
|
+
'subset_list': [
|
|
124
|
+
# 'OE_MM_maths_en_COMP',
|
|
125
|
+
# 'OE_MM_maths_zh_CEE',
|
|
126
|
+
# 'OE_MM_maths_zh_COMP',
|
|
127
|
+
# 'OE_MM_physics_en_COMP',
|
|
128
|
+
# 'OE_MM_physics_zh_CEE',
|
|
129
|
+
# 'OE_TO_maths_en_COMP',
|
|
130
|
+
# 'OE_TO_maths_zh_CEE',
|
|
131
|
+
# 'OE_TO_maths_zh_COMP',
|
|
132
|
+
# 'OE_TO_physics_en_COMP',
|
|
133
|
+
# 'OE_TO_physics_zh_CEE',
|
|
134
|
+
'TP_TO_maths_zh_CEE',
|
|
135
|
+
]
|
|
136
|
+
}
|
|
137
|
+
self._run_dataset_test('olympiad_bench', dataset_args=dataset_args)
|