evalscope 1.0.0__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. Click here for more details.

Files changed (97) hide show
  1. evalscope/api/benchmark/__init__.py +1 -1
  2. evalscope/api/benchmark/adapters/__init__.py +2 -0
  3. evalscope/api/benchmark/adapters/default_data_adapter.py +1 -0
  4. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  5. evalscope/api/benchmark/adapters/text2image_adapter.py +7 -6
  6. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  7. evalscope/api/benchmark/benchmark.py +35 -0
  8. evalscope/api/benchmark/meta.py +6 -0
  9. evalscope/api/dataset/dataset.py +6 -6
  10. evalscope/api/dataset/loader.py +2 -1
  11. evalscope/api/evaluator/cache.py +24 -1
  12. evalscope/api/evaluator/state.py +12 -1
  13. evalscope/api/messages/__init__.py +1 -0
  14. evalscope/api/messages/chat_message.py +47 -2
  15. evalscope/api/metric/scorer.py +15 -7
  16. evalscope/api/mixin/__init__.py +0 -1
  17. evalscope/api/model/generate_config.py +1 -3
  18. evalscope/api/model/model.py +4 -1
  19. evalscope/app/app.py +3 -0
  20. evalscope/app/ui/single_model.py +3 -3
  21. evalscope/app/utils/data_utils.py +7 -7
  22. evalscope/app/utils/env_utils.py +12 -0
  23. evalscope/app/utils/text_utils.py +14 -12
  24. evalscope/arguments.py +2 -4
  25. evalscope/backend/opencompass/backend_manager.py +0 -2
  26. evalscope/backend/rag_eval/utils/embedding.py +9 -1
  27. evalscope/benchmarks/bfcl/bfcl_adapter.py +2 -6
  28. evalscope/benchmarks/bfcl/generation.py +2 -2
  29. evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
  30. evalscope/benchmarks/data_collection/data_collection_adapter.py +23 -19
  31. evalscope/benchmarks/frames/frames_adapter.py +2 -1
  32. evalscope/benchmarks/general_arena/general_arena_adapter.py +5 -1
  33. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  34. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  35. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  36. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  37. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  38. evalscope/benchmarks/mmmu/__init__.py +0 -0
  39. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  40. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  41. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  42. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +5 -1
  43. evalscope/benchmarks/tau_bench/generation.py +1 -1
  44. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +15 -19
  45. evalscope/benchmarks/text2image/__init__.py +0 -0
  46. evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
  47. evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
  48. evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
  49. evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
  50. evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
  51. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
  52. evalscope/cli/start_app.py +7 -1
  53. evalscope/cli/start_perf.py +7 -1
  54. evalscope/config.py +72 -13
  55. evalscope/constants.py +8 -0
  56. evalscope/evaluator/evaluator.py +6 -4
  57. evalscope/metrics/llm_judge.py +19 -7
  58. evalscope/models/image_edit_model.py +125 -0
  59. evalscope/models/model_apis.py +20 -0
  60. evalscope/models/openai_compatible.py +3 -0
  61. evalscope/models/text2image_model.py +2 -2
  62. evalscope/models/utils/openai.py +7 -4
  63. evalscope/perf/benchmark.py +2 -0
  64. evalscope/perf/utils/benchmark_util.py +8 -5
  65. evalscope/perf/utils/local_server.py +3 -0
  66. evalscope/report/__init__.py +0 -1
  67. evalscope/report/generator.py +8 -87
  68. evalscope/run.py +9 -5
  69. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  70. evalscope/utils/chat_service.py +1 -1
  71. evalscope/utils/import_utils.py +23 -1
  72. evalscope/utils/io_utils.py +42 -1
  73. evalscope/utils/model_utils.py +4 -3
  74. evalscope/utils/multi_choices.py +23 -6
  75. evalscope/version.py +2 -2
  76. {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/METADATA +12 -15
  77. {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/RECORD +94 -80
  78. tests/benchmark/test_eval.py +30 -31
  79. tests/benchmark/test_image_edit.py +65 -0
  80. tests/benchmark/test_vlm.py +80 -0
  81. tests/cli/test_all.py +83 -43
  82. tests/cli/test_collection.py +8 -5
  83. tests/cli/test_reasoning.py +81 -0
  84. tests/common.py +73 -0
  85. tests/perf/test_perf.py +4 -2
  86. tests/rag/test_clip_benchmark.py +0 -3
  87. evalscope/api/mixin/dataset_mixin.py +0 -105
  88. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
  89. tests/aigc/__init__.py +0 -1
  90. /evalscope/benchmarks/{aigc → image_edit}/__init__.py +0 -0
  91. /evalscope/benchmarks/{aigc/i2i → image_edit/gedit}/__init__.py +0 -0
  92. /evalscope/benchmarks/{aigc/t2i → math_vista}/__init__.py +0 -0
  93. {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  94. {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  95. {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  96. {evalscope-1.0.0.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  97. /tests/{aigc → benchmark}/test_t2i.py +0 -0
@@ -1,66 +1,68 @@
1
1
  evalscope/__init__.py,sha256=oivLvqwNw2JlB-h-Z8_525IpfKcYEkS51F59tEfpy5w,445
2
- evalscope/arguments.py,sha256=3mYv_kPerYxxI426GifY5mUmy8CSUaaFy7mdskPFKgY,5881
3
- evalscope/config.py,sha256=CRwJgcPnHp2mBmVA3IihsBrt6gGP0AIXqgBIwxIYAUM,8160
4
- evalscope/constants.py,sha256=mtKSLlr92e6c4ze0-FdZrNE1mp1VlFcE42KMD3DwnK8,3239
5
- evalscope/run.py,sha256=sksjcOsI1Q_0Jzgvs470_bkcKWA1zH6qJj_ZJgEGeMM,6281
2
+ evalscope/arguments.py,sha256=OthHwNhG9VrP7_CYocmjZ4iVyG5LJbzO0FhseoLBalk,5663
3
+ evalscope/config.py,sha256=NVFXbU0kVof2V8Bnjs-O2FEPdlXx3rZuoHcttm1THbM,10564
4
+ evalscope/constants.py,sha256=cbkKHmEcJHF9T0m4yREx08__tulj6MV59im2RW-pR3c,3433
5
+ evalscope/run.py,sha256=1JjqSky3Fm3v1tOE9pgR7alODoSNWa4ZdoLTWFLgjRE,6510
6
6
  evalscope/summarizer.py,sha256=HUDJ1zKi22uNst3AUfX67Z0sHzeZy-4S8sYyvxJnBzc,5901
7
- evalscope/version.py,sha256=OOygMFcZzcXVtNof6kBqsKsaK1Y03SJIHehdMdcqyT0,118
7
+ evalscope/version.py,sha256=5Jk88EAyvBpPzsQaFYKGjukIwF3tVCXIrarT94bYsCQ,118
8
8
  evalscope/api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
9
  evalscope/api/registry.py,sha256=Qk0KMGDbt-iI0-OfoJZbOtxt76qreAVWh36HOoQAKM4,5448
10
- evalscope/api/benchmark/__init__.py,sha256=PW--qkFb5b6DlKKNwhI873hXsaoP3IAgdHayQQYfKt8,147
11
- evalscope/api/benchmark/benchmark.py,sha256=OyQElOvIjeYw6BlYG6OZ9RqX1Gaz5f9Ka_JuUZPj-_A,9099
12
- evalscope/api/benchmark/meta.py,sha256=-zhwrUPbhFCI35ugE5jBhElpiZeSNZ8XP83pdgArqIc,4026
13
- evalscope/api/benchmark/adapters/__init__.py,sha256=qtjE9gXnx4SiITtEl2ii_IYoKBKXMVO7ntabexSXLD4,156
14
- evalscope/api/benchmark/adapters/default_data_adapter.py,sha256=YtiPzjZ2IB4spvNwnJ9Lg7bTWjBXJjQ-7druKi5xlNc,27913
10
+ evalscope/api/benchmark/__init__.py,sha256=9xcTxpcQ6HhZ0QDwEIZhAT5IjybzaJ60VGLcmaFE5dU,188
11
+ evalscope/api/benchmark/benchmark.py,sha256=q5hmEH845DfmvEB1NvlHM1b-oCCMpatIamT-2ubudbM,10088
12
+ evalscope/api/benchmark/meta.py,sha256=G6Q5E1JwO-CpEwsjhMrXHExlVRUF1Ah5Nz21vkP8IV0,4218
13
+ evalscope/api/benchmark/adapters/__init__.py,sha256=uLt_GiU4s-_6Rjgmr4OUTtE7dvEX-ZIQ403fd6oNuxA,264
14
+ evalscope/api/benchmark/adapters/default_data_adapter.py,sha256=Y8wzOxq3qpbE2lgLZyXHxoLUxjlmbS-N6ByObrBwOvc,27977
15
+ evalscope/api/benchmark/adapters/image_edit_adapter.py,sha256=06V-_A8RKuMNYMt7-vaXn2qBa9LIZgfFO_6PUuhAkh0,3052
15
16
  evalscope/api/benchmark/adapters/multi_choice_adapter.py,sha256=wp_6Kws3GoBk_mSzQP8Nr40osFf3iPJpntkANYAuIcc,2979
16
- evalscope/api/benchmark/adapters/text2image_adapter.py,sha256=Q0G0VI8zaxggLO7Q0S6G0ZnK3nTwo7rngXqpaY_cVgo,6281
17
+ evalscope/api/benchmark/adapters/text2image_adapter.py,sha256=4mccYHKB-9iyOZ0uwkTi2TgC76KIJpcu_4hnfbU5NMc,6434
18
+ evalscope/api/benchmark/adapters/vision_language_adapter.py,sha256=N9LPh5tTGkvRYzp4giI0La0u4xzrHcJGhdTY9jiNCxY,219
17
19
  evalscope/api/dataset/__init__.py,sha256=RHFMzwfONEqmmn3vRtxyN3r29mipDUUUSEDhuwm0YpQ,147
18
- evalscope/api/dataset/dataset.py,sha256=rqVQxnEKfNDGowSxTwV9HSDRtmjUfu4oCEIc_y1ExkY,11333
19
- evalscope/api/dataset/loader.py,sha256=7BHtxlV3smrV5R_RNifvMNmH9o_uSSKmngSM-ewDQ70,9668
20
+ evalscope/api/dataset/dataset.py,sha256=9bwSx89zgOOBRQkRPVv-B5Yi30A6J1MLtekQSqwsy9g,11328
21
+ evalscope/api/dataset/loader.py,sha256=t7KLH5ltLUumhiPIyYJzk6zn2iKLx-D2gIIoMhKdnhc,9714
20
22
  evalscope/api/dataset/utils.py,sha256=3E0ikqr6QWV_lX0d3Z4F4xFuVTcwbeDPgCvJY7v83Bc,4935
21
23
  evalscope/api/evaluator/__init__.py,sha256=-Ure6X4GlE7VYSNWSZ_DpjbUBGa5irVTymLENEHTYqY,138
22
- evalscope/api/evaluator/cache.py,sha256=Hovka77enKYTxomeTnH-e1vPzozRiZ4CnHunHEWATiA,12546
24
+ evalscope/api/evaluator/cache.py,sha256=a_M2ouUjtkMr5m3wRbmsE8ETP_aacxbm0d38yY5RljM,13244
23
25
  evalscope/api/evaluator/evaluator.py,sha256=SGW4RIKc79IlUP5FisrEycJlqORcaYxyIP5eabaSfeU,1600
24
- evalscope/api/evaluator/state.py,sha256=vLTrICWWqcK9asfPJFB0JfOGXZnVKmeLcvBACtvEfX4,8543
26
+ evalscope/api/evaluator/state.py,sha256=OyZUtQw9Wd6X8MA2mtmTGn74ReBq1x-JfWwV_TT99UY,8892
25
27
  evalscope/api/filter/__init__.py,sha256=5eWKjT-dAiz8nE0S6WnU6plqjXZHYn7CJOgFiHSoovM,66
26
28
  evalscope/api/filter/filter.py,sha256=fsPddaHE5wwFIXgUWITFqlYXqdh6vx3QqcEf3rSXKVI,2068
27
- evalscope/api/messages/__init__.py,sha256=31jIVA2zSwDgAcOdOd9pmj6-w-U27izxf2Pz-1cMvbk,352
28
- evalscope/api/messages/chat_message.py,sha256=XFalZ4e7Z-V3bbABMMsDvad0UiYcIz8kzrp3Muyzqfg,7698
29
+ evalscope/api/messages/__init__.py,sha256=UKZ9VVCt7NPrcZXv_1e8MZ8mOWu0eLRvMIXykpJPZ9I,378
30
+ evalscope/api/messages/chat_message.py,sha256=LZ3Yv_Ts5ASCfrq2y_zecpY3IN5lzHsRbaxz8WRQgD8,9698
29
31
  evalscope/api/messages/content.py,sha256=gUBUeK60BUhkwoulyzKL6q0iMt3VLlah9onLG1XVrWY,2772
30
32
  evalscope/api/messages/utils.py,sha256=uqlEbYEoUKpXLW8tQtP-cY5Miq7W0Xl6a98j55u6m6E,1266
31
33
  evalscope/api/metric/__init__.py,sha256=Cj2F8eiVny5uNtfPXKwQDq2owlHVKNzfr-COLYMEox4,106
32
34
  evalscope/api/metric/metric.py,sha256=XkjBqpZbFYynhTIH8WawfPmItbDQ6jWufE_ox9zDPCU,1568
33
- evalscope/api/metric/scorer.py,sha256=9IATvlJbp47b2iAn5KNO2v2tQIa1lqRlVaXWXce2iN8,3309
34
- evalscope/api/mixin/__init__.py,sha256=PagRD_Dz93Tsl-5YKQMZQAodx867Ow06P8uPQSBx4KM,89
35
- evalscope/api/mixin/dataset_mixin.py,sha256=ZJMcX3J4L0uNC_GkDwndSRjytxlbgldDeFIRfVCPCks,4395
35
+ evalscope/api/metric/scorer.py,sha256=dczSQwkRmPk1uvNCMGT5G6nYbwWTcpwsZtyYXWkrJII,3749
36
+ evalscope/api/mixin/__init__.py,sha256=DpHdR7t9d-HUzBXxwsW3t5MxM4kgoThQ4WF8s8EuSBY,43
36
37
  evalscope/api/mixin/llm_judge_mixin.py,sha256=KPNH41IL7md5XEYqC2ZbmnYm4tIrV-MgxpfKOWbYsMc,5624
37
38
  evalscope/api/model/__init__.py,sha256=YxKdz1IKUt6eYoC7nx81yD2BtyiWQDvaoTcc8O9lvoE,286
38
- evalscope/api/model/generate_config.py,sha256=QMOgi9PUhvdkHzuP5DdOWUX6dOUPh4lqJd1d-0w7XGE,7852
39
- evalscope/api/model/model.py,sha256=YjXI6rIGhaUZavhQ0O77XdCCMruNy-iSIehP8gPcN4k,12662
39
+ evalscope/api/model/generate_config.py,sha256=SyUNlZhcoBpLlMK8esu1XQs61SSPN_D5QN8TRUcnroI,7760
40
+ evalscope/api/model/model.py,sha256=HecfGqaaB201n7I1pZ5Q4_aVC-xLA93uxdGgoreRYFw,12771
40
41
  evalscope/api/model/model_output.py,sha256=NeN6bLtAvg_3fTirewWfdP-_x4SJXa9pGuRpyXJY3B8,9333
41
42
  evalscope/api/tool/__init__.py,sha256=bEaW5ryY-erLcl2zMoDJNgiaBqlSPAL0jQ5daUHvvrw,272
42
43
  evalscope/api/tool/tool_call.py,sha256=WqMnw69L_yhQWycENZ7azPRhxRidhmrMcYAy7UTIqvg,2836
43
44
  evalscope/api/tool/tool_info.py,sha256=aqquWQRWWx7fPItIwiubiz2VRe2TLl_Jmn1ArIlngbw,5716
44
45
  evalscope/api/tool/utils.py,sha256=IWFzM6WspzBmNPicXn6b7KS6Y-1I-ErsK9fua4cb53Y,2324
45
46
  evalscope/app/__init__.py,sha256=HWLXld_JXcBDsdL4L_4E8JsKyuBwwPUSwlejKnZ3HKc,579
46
- evalscope/app/app.py,sha256=8mSBp8qUCCmqupV4FEPMPdT9jL-bYu4DdH2qj8P0ktk,776
47
+ evalscope/app/app.py,sha256=EaBWorA87ZmyIHovIE3styHWEVFsu_F70pTmP4-5zTQ,836
47
48
  evalscope/app/arguments.py,sha256=1wHTLeFx1G94cKXYOeOVe_wTiOY2D929UctIRGOtRaQ,699
48
49
  evalscope/app/constants.py,sha256=oG6tZ618zJcCnwZ5THnYL0gWTPDb5XKrnmdrWxY3Z4Q,385
49
50
  evalscope/app/ui/__init__.py,sha256=IBxyQ2H-kSHoHJmXWDR8QMermvsMbiu673PQbXP_FnE,616
50
51
  evalscope/app/ui/app_ui.py,sha256=wLrQ4VM7BnzvaYmPAk8NH9t5BaWooHFJcgmAOOd2I1w,2032
51
52
  evalscope/app/ui/multi_model.py,sha256=fO8z-ZFucWtgaKmuQ50AkUp4BoYOFqOkxeTBUUAK0bM,15122
52
53
  evalscope/app/ui/sidebar.py,sha256=JA0QbG2iPStK-lFy6x_AjOHlQdesmgXoS0OYJUJ_Wyg,1339
53
- evalscope/app/ui/single_model.py,sha256=7HjfmufZm7wXNtT-ZKnQ4PgX-I_tX5og-s30leX_Xr4,9487
54
+ evalscope/app/ui/single_model.py,sha256=1rgYrJOO75fJG2pa74tzEocO_91jXOAKFQAUViBcYFk,9459
54
55
  evalscope/app/ui/visualization.py,sha256=jXFX_-7woQkcAiQkPAIRwVv1kdRdXonn9IvmB8yzPDU,1102
55
- evalscope/app/utils/data_utils.py,sha256=H4XYWgynmkY0ENU6FFSmrgse3aq3Is11jRRf-_nCvKw,7408
56
+ evalscope/app/utils/data_utils.py,sha256=m7Z0Us_josUFseI8VJpIp8QaYeLnu91E2HCZ8WSB07E,7396
57
+ evalscope/app/utils/env_utils.py,sha256=2pmz4uNun-XNP6TqM6Oe576XopweEClhBaIdWO--kd0,382
56
58
  evalscope/app/utils/localization.py,sha256=rWEviBmcnhIpAA-cG8djbbUA6p1Y358c0dxge5Pqi1U,6131
57
- evalscope/app/utils/text_utils.py,sha256=lZy-sXccv24KyjvOGpZSQYMfM4XSKz3qcriOhsiYAdY,3499
59
+ evalscope/app/utils/text_utils.py,sha256=-K-hRPMZ29Yqjhzd-391gPaD4B4wUuIg71PfbLnGJ38,3754
58
60
  evalscope/app/utils/visualization.py,sha256=dwEXbGfY7vFysnL0HmrHS2BEWaJkg-dZ9ayDlRhdvv4,3559
59
61
  evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
60
62
  evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
61
63
  evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
62
64
  evalscope/backend/opencompass/api_meta_template.py,sha256=OGH0lGJmBFKHs-6u6RPCov13_ArO63E6pV-aX1WVljU,1707
63
- evalscope/backend/opencompass/backend_manager.py,sha256=fxTERjtndDoxfjFDxULYc1XZ24lY9HNkNLsmqhkpZtw,10500
65
+ evalscope/backend/opencompass/backend_manager.py,sha256=q_5ABnnJb14T2L2bKY2y-ErJ9K4_65Rpl0a-h3hZ4TM,10337
64
66
  evalscope/backend/opencompass/tasks/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
65
67
  evalscope/backend/opencompass/tasks/eval_api.py,sha256=ZaGdUbEOtAW5VX3ZXmpHIttg_QrID34EnBTylD3uvos,1152
66
68
  evalscope/backend/opencompass/tasks/eval_datasets.py,sha256=JHSq4EnPJgv4sRJJplLH80EqE3ghtkn2k8HnV6DaDew,5406
@@ -99,21 +101,12 @@ evalscope/backend/rag_eval/ragas/tasks/testset_generation.py,sha256=XMWW8ucN7ojR
99
101
  evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=dZAjsfiR839INO3nbb9psLn-eL4sZOzpU6JMdtJUXtw,1895
100
102
  evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
101
103
  evalscope/backend/rag_eval/utils/clip.py,sha256=GLHhPCac2AH35AvRLvVqePA1gIMAewHTFmCJCDZzvqU,5015
102
- evalscope/backend/rag_eval/utils/embedding.py,sha256=QR9ewFMTV35JEWl0nCw4gWxlg8UodosMxRTH-JghMJY,9388
104
+ evalscope/backend/rag_eval/utils/embedding.py,sha256=nuwBsiXPAwZisEmg3V4fWekd2tqp5mWRVb_fxNB1zTg,9867
103
105
  evalscope/backend/rag_eval/utils/llm.py,sha256=1OH-985iIDtCOlCtzGmHu6GT_l1vJe7Iv-WyltQbcSc,2451
104
106
  evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KBNeXiwQw5SpJA,1529
105
107
  evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
106
108
  evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=jlwM13Ty-Ax6AeMsNlo9xIBupNFgnceYuXtCmh0hNTQ,6160
107
109
  evalscope/benchmarks/__init__.py,sha256=WHR4ej9Tqa2N9CyIaUWXS8EnHZtcujaNeg9hf8GT31Y,1182
108
- evalscope/benchmarks/aigc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
109
- evalscope/benchmarks/aigc/i2i/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
110
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py,sha256=QOen4eJ1wE_KOrXk-JDDifDbn6ulqLTgVC61a3TSEYA,1665
111
- evalscope/benchmarks/aigc/t2i/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
112
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py,sha256=3kRMglG82RXRiA-Hucj7o_O4hrrDaqJxExbmyohANQE,2898
113
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py,sha256=CkJFoQJzF5tR46hr0X0Wu1VJ57uBr28BiUr3WT-5X2c,1840
114
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py,sha256=nOZ8Lk_sRNiPK-d4a6hdmZ8mM40uIvpu5vlLF8Mb44s,1341
115
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py,sha256=Pr2_YW31-DIiklSkR5bGuwEBQWyBQleRiRAR7L7MoH4,1460
116
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py,sha256=OuOO-txcE5ZQHRZj78XGUOBfxJoPZpL3K0k_P9X4kL4,752
117
110
  evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
118
111
  evalscope/benchmarks/aime/aime24_adapter.py,sha256=HTlriHoHzlm1Rf3KAiGRLs8sx6Gyf6s7RGtOjk_hGS4,1767
119
112
  evalscope/benchmarks/aime/aime25_adapter.py,sha256=ZOE_6Zhg1MatWJSu2Zq372nKUODYtNFZimS1MJRFz5A,1591
@@ -154,10 +147,10 @@ evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt
154
147
  evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt,sha256=s_x6u5MLeKpuAHZj3GNQqY1I8vWqQIfJasOp9XcM7Ck,2945
155
148
  evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt,sha256=qfTZafCzNiz9ULBaDlfy_LISL617NyH5Nc0-nO0K0LE,2164
156
149
  evalscope/benchmarks/bfcl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
157
- evalscope/benchmarks/bfcl/bfcl_adapter.py,sha256=FH1y0ROEypq9beIX41FVzBS7zcCl_qUTxcG0N0lYiV4,11092
158
- evalscope/benchmarks/bfcl/generation.py,sha256=kf4BrNC24nHkwVysxJPUiFU55pUkev-7y5Op2Ws8GyI,8704
150
+ evalscope/benchmarks/bfcl/bfcl_adapter.py,sha256=N_AVgdfI4DXph0n3U1bChP9AQLx3_-ogAInFE-4EGig,10972
151
+ evalscope/benchmarks/bfcl/generation.py,sha256=gOYzwTNEi2G0zykKdsx42Pc0Ql8iPD6RoX3MRbUhMJo,8698
159
152
  evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
160
- evalscope/benchmarks/ceval/ceval_adapter.py,sha256=j1WkDrEUiNo6MOu-Kt3NzD9DBgye20JMbMTlQMQw39o,8560
153
+ evalscope/benchmarks/ceval/ceval_adapter.py,sha256=4FLPgY-UtqINafnNxfOsE9AwS6GFXFCUGOBI-4EZUGk,8503
161
154
  evalscope/benchmarks/chinese_simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
162
155
  evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=OWzRlSGswV24V-heLqqo7GQzpJp01TZ0DhFHq0iUP9A,8238
163
156
  evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
@@ -165,7 +158,7 @@ evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=P0VPAL5T2V_zj0q7im0FdDoq_W5ri
165
158
  evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
166
159
  evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=NOqckeyuabH_nwaxL5IWmH887UO5rvBKA2jx7qb9fNs,2226
167
160
  evalscope/benchmarks/data_collection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
168
- evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=Ut5sIcTiJQGaFEgE7gM9q3bsDeFKXg_2sDzxcOqchsE,8451
161
+ evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=eetF21dN55e0MNPxTaiDbkPZDidt4cX2decQjC_deJI,8676
169
162
  evalscope/benchmarks/docmath/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
170
163
  evalscope/benchmarks/docmath/docmath_adapter.py,sha256=-mel6hA-x_e7fV0uOHdX5BpoQEVyQ5VqwIwEqSNDpnc,4623
171
164
  evalscope/benchmarks/docmath/utils.py,sha256=d6Yjoa5q91kjr1SdVPVBndzDaUzMlO_GfEqMtUXXr0s,7707
@@ -173,10 +166,10 @@ evalscope/benchmarks/drop/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
173
166
  evalscope/benchmarks/drop/drop_adapter.py,sha256=PyvZ1WOdHQ0u0_JpuP97_yQsCUbzGcYsJf3bWKbakzg,9968
174
167
  evalscope/benchmarks/drop/utils.py,sha256=Z9PHrNnRfGqFHCLONg5SWKARp1eTJlHFc_bU46t_YrM,1344
175
168
  evalscope/benchmarks/frames/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
176
- evalscope/benchmarks/frames/frames_adapter.py,sha256=hqFBJsf6fxwDmkE0ZQnkELiEP0dp2cUodKn7kUbTuqM,5479
169
+ evalscope/benchmarks/frames/frames_adapter.py,sha256=w1kRya7w5omt95HHE6AzbzYVhyTT5r521676d_xJ6Vg,5514
177
170
  evalscope/benchmarks/frames/utils.py,sha256=gULWM6Rwv5bTSSWcDYp-iSIoWj8r5VtbQakhRzHJq8A,1172
178
171
  evalscope/benchmarks/general_arena/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
179
- evalscope/benchmarks/general_arena/general_arena_adapter.py,sha256=UkIjxPSr-qbrloNf2KhDhZClvnes7qP7Bcu1XTXxsKU,21553
172
+ evalscope/benchmarks/general_arena/general_arena_adapter.py,sha256=DzJaokqZwR2L8HDiahss8EbQ3vcsMXkzkMghxU-uAOo,21639
180
173
  evalscope/benchmarks/general_arena/utils.py,sha256=zS4l1RKwvl0Z9Mk7kth9WVQGHTgE_aNDZa_XNy9tGyM,6874
181
174
  evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
182
175
  evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=7VKg_EzXkRvoWpR7h8qB4sVVb1eZHCGcPk-X_NMS5tE,2062
@@ -197,8 +190,13 @@ evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMp
197
190
  evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=55FQwJ0_eDijppkVVlM5XCXzgRFmjH1SvGMItGsvn6o,2769
198
191
  evalscope/benchmarks/ifeval/instructions.py,sha256=HXnn1JgU3dpYltqIovFAn02DxkYOGw337kLMlOfJxJE,56048
199
192
  evalscope/benchmarks/ifeval/instructions_registry.py,sha256=3UXzVLgKwk_cf-2aG2tozjqYgvqm5Mj3ZRRb8rI-ucU,7262
200
- evalscope/benchmarks/ifeval/instructions_util.py,sha256=vkemXeylJMmgW8LgfQe4cSy2OF-oH_NcSZtzyZDURW4,25780
193
+ evalscope/benchmarks/ifeval/instructions_util.py,sha256=Zl9Q6xwtZtIkXLoVwz7oifSEyvbDGETljKHgc4tk6TM,25730
201
194
  evalscope/benchmarks/ifeval/utils.py,sha256=MQt-b4K6uqU9H5TAM6Gxyz46r6XRBOgDsgdnwB0veg0,4470
195
+ evalscope/benchmarks/image_edit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
196
+ evalscope/benchmarks/image_edit/gedit/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
197
+ evalscope/benchmarks/image_edit/gedit/gedit_adapter.py,sha256=a6hhRbnGCvMEMsbnSbczjXd4vHfMVEnFfP459FCF_Mc,5250
198
+ evalscope/benchmarks/image_edit/gedit/utils.py,sha256=UN0z9Dafs8d8lEXqxin321d8smiS3H9p3gyLkZFPFNg,14735
199
+ evalscope/benchmarks/image_edit/gedit/vie_prompts.py,sha256=qVXWQyVUwZxEasDjVmYBk30_JI4gnvHacMOmMsA4wcI,22056
202
200
  evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
203
201
  evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=mNHA_Fuj_gAdOEoR7oChnGmErf1czqwnk8Zk-jRhBys,1304
204
202
  evalscope/benchmarks/live_code_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -213,16 +211,22 @@ evalscope/benchmarks/maritime_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQ
213
211
  evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py,sha256=Rx7iZ5JaEo73YwIzhm78gMDQ6gqcErbnWWXHxXM6BcU,2379
214
212
  evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
215
213
  evalscope/benchmarks/math_500/math_500_adapter.py,sha256=uuxjmqftY_r-hJBCjfBgYUELrBaB86MG8dIu2wTikgI,1848
214
+ evalscope/benchmarks/math_vista/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
215
+ evalscope/benchmarks/math_vista/math_vista_adapter.py,sha256=Mu9BpH0rDNM0yMrGws4SEOnXy2NTSIKwyLs5t4nAP-s,5842
216
216
  evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
217
217
  evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=2NT3QbfPzajUTFZ0tBCl6PRrtFtAr5jPZNQRW2Idlno,5947
218
218
  evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
219
219
  evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=GtIyUubUg6Q6Ydh1Adj0-32OdiwcsF-u-NQ0U-4AnQA,3891
220
220
  evalscope/benchmarks/mmlu_redux/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
221
221
  evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py,sha256=m_37OIFrJB4ZIvtbDJ_m9P9mA2QtrNjGfbbVo15awJg,7402
222
+ evalscope/benchmarks/mmmu/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
223
+ evalscope/benchmarks/mmmu/mmmu_adapter.py,sha256=C7UM6HvomcA_Srf7771S0CaUvifBX63i161XaacraGQ,6038
224
+ evalscope/benchmarks/mmmu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
225
+ evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py,sha256=a7rZV4WMPxeBdfwanmUjsB8yG1rwNXCsWCoqzOq-dd4,4901
222
226
  evalscope/benchmarks/musr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
223
227
  evalscope/benchmarks/musr/musr_adapter.py,sha256=kx6bckj7Nijl4Wysuj-mKYdy0hIRDJho8yVTup403Hc,1473
224
228
  evalscope/benchmarks/needle_haystack/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
225
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py,sha256=GRJrJ7O0OZlIMH-FyghcA54xNfBSYjPd-0TgtMw7vHA,17048
229
+ evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py,sha256=mO8zhdCpoWPtlBk9GSzgcP25vEoQLYGwUM1QfcQ4iSE,17151
226
230
  evalscope/benchmarks/needle_haystack/utils.py,sha256=k8WDigqt5LgzHw6DtaYsLtb3BJL0FTZS9JOyJCpoPq8,2935
227
231
  evalscope/benchmarks/process_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
228
232
  evalscope/benchmarks/process_bench/process_bench_adapter.py,sha256=XN3F6NH7mF4ibwGX5nI01sqEHz05UQFnBAyfAe14QYE,6174
@@ -235,8 +239,14 @@ evalscope/benchmarks/super_gpqa/prompt.py,sha256=wQ8Y4NAvQJRhPS7gsrUBBzeM_UCHsHO
235
239
  evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=SPqpBebiHj_oyEqU94p9NSqhVkO0KeXQYcBmpfH81nM,6888
236
240
  evalscope/benchmarks/super_gpqa/utils.py,sha256=OK_oT-DnWNssITEwu_Zc3Ty5v21n0IaJQYftK2cpwmQ,3401
237
241
  evalscope/benchmarks/tau_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
238
- evalscope/benchmarks/tau_bench/generation.py,sha256=SankPe87Zi85CGlSKWZyOYo6Q4gRN22I7fkl3ef547U,5165
239
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py,sha256=2Gjvc8RVavcIHQPDXI5d6zxpeHqghRnOhlzwWjGKc24,6438
242
+ evalscope/benchmarks/tau_bench/generation.py,sha256=d7J5xrxEI-0BYxdSuxdDavcR7f1ipBdpQsKZzwyzGds,5190
243
+ evalscope/benchmarks/tau_bench/tau_bench_adapter.py,sha256=1Dj5r9zMuLJ59wHusEcHVTszBE8BVhAK8lNZzBBzKT8,6375
244
+ evalscope/benchmarks/text2image/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
245
+ evalscope/benchmarks/text2image/evalmuse_adapter.py,sha256=g-Wc1qTg-xWLTjiZPo8zmQud75ac-8mBpYRxOHfiO0g,3024
246
+ evalscope/benchmarks/text2image/genai_bench_adapter.py,sha256=1GDB3gS9zwrfb9C83LQdQyN7bvvqeYuu5ulJ9Igmi2k,1876
247
+ evalscope/benchmarks/text2image/general_t2i_adapter.py,sha256=CHy9ufvrVHc_5WkGVR_F-5wfLQVFtxwubZOfdpx9rd8,1354
248
+ evalscope/benchmarks/text2image/hpdv2_adapter.py,sha256=8-vWCV21eo_e9EbxDB5mGw2cFzD4OUQPLB66FvlO9W4,1781
249
+ evalscope/benchmarks/text2image/tifa_adapter.py,sha256=4CcprucAe25UpTZRV3Qgb-8jbeNHtXNRWHw8RiYvfJA,784
240
250
  evalscope/benchmarks/tool_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
241
251
  evalscope/benchmarks/tool_bench/tool_bench_adapter.py,sha256=BHsesDDELEINdbWSR3WKCQGZ6MqWc2LiOZA3MbTp2_s,3805
242
252
  evalscope/benchmarks/tool_bench/utils.py,sha256=led0d-Pa3rvmWkSWhEnZWP00fceudgESq5HXAQzJGls,7042
@@ -244,26 +254,26 @@ evalscope/benchmarks/trivia_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0
244
254
  evalscope/benchmarks/trivia_qa/samples.jsonl,sha256=1isBD62PGhCiNbzQa-GFrHHL4XLHIkojWfgSvn7ktf8,3445
245
255
  evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py,sha256=oZAiCmBpZbBAgzAKPfddaJWMckIyaoRM7fB2XJ5EoQU,2614
246
256
  evalscope/benchmarks/truthful_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
247
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=LkmJFWzLpk0ryUf_XVZvEBIVTcSJ2a4pB9bh7k0DIJI,3519
257
+ evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py,sha256=W7ESUAcLsHwbssiiSCQNUeQcqx6JEeW7FSQiBFycS24,3512
248
258
  evalscope/benchmarks/winogrande/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
249
259
  evalscope/benchmarks/winogrande/winogrande_adapter.py,sha256=LWm6qZd3pJbtpcERq7WPK3adwY3uVm4wiUgfyEI_uHE,1310
250
260
  evalscope/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
251
261
  evalscope/cli/base.py,sha256=m1DFlF16L0Lyrn0YNuFj8ByGjVJIoI0jKzAoodIXjRk,404
252
262
  evalscope/cli/cli.py,sha256=qXQ6k9GBkRy2dmBxM24tbVP42bQDyM6G7kkc32LdpCA,860
253
- evalscope/cli/start_app.py,sha256=dV63nvBYEUl2sGeVxoUH4IJBXJSLecaq293i3alBWxo,794
263
+ evalscope/cli/start_app.py,sha256=LqJ3cSBY8FsM_JjInw4jlpitjaVoIZscUShMpDRPbro,1030
254
264
  evalscope/cli/start_eval.py,sha256=MXhVDeaMFd6ny88-gnVtQflH660UaDj240YGYnHccx8,775
255
- evalscope/cli/start_perf.py,sha256=5hLi5jWgM9BJPXLd8d9D1zqrcj_5c0KvkfB1DgD4_RU,831
265
+ evalscope/cli/start_perf.py,sha256=V8DwVPXTGmyDPma7Yk_pJbLb4iVkDj6Y3qPGHV03sE0,1082
256
266
  evalscope/cli/start_server.py,sha256=01iDaEwLx59xRUrrZ_nhQE-QjUE1Rk5d43uMQ_4owbI,3677
257
267
  evalscope/collections/__init__.py,sha256=x05hFLrjGsdtuHtc6PyQXHNuucVdYaBN9ZrM8gBiJWg,720
258
268
  evalscope/collections/sampler.py,sha256=086pzXQO4CO_QYCd10z149Sjh6sBpRBeIHf5OTLOVu8,4896
259
269
  evalscope/collections/schema.py,sha256=yzAlnH0O7iiWB4UnkFXI_Dvxcsq9hDgl0aGK2OpyBY8,4158
260
270
  evalscope/evaluator/__init__.py,sha256=KzYmVTfU-1pdX7va7l3B1-5QKWG07hj1B7rYkMmxitY,91
261
- evalscope/evaluator/evaluator.py,sha256=IY0LElXZXfe2HW1v99dKkN3qhyzo0WO4aR8OyxUny3A,13545
271
+ evalscope/evaluator/evaluator.py,sha256=mkq85ieBRSc5X2FFxijomb2jD3YDKR6UelKFVP6WT8Y,13592
262
272
  evalscope/filters/__init__.py,sha256=AsXwKYDjGhFsJvtj036PRjMOPsHGt-CRicnHTtM_qA4,51
263
273
  evalscope/filters/extraction.py,sha256=KLFr_3XYsrv0PTvmXy0ugj2sqv2ZOWJFV7G_MmGjTHk,4146
264
274
  evalscope/filters/selection.py,sha256=yiJu2JjXDH_lgfEtB9umkGcA3zpo3zvnyoq2mKrXbnw,1609
265
275
  evalscope/metrics/__init__.py,sha256=1giVHESSjn98uBiAvYm5uLsmRQwmf9NHPSt7OT_QJss,1615
266
- evalscope/metrics/llm_judge.py,sha256=xNmchB6ZDlrQnxv-Vit_zcQjeAH-C0o3m4bF4OFDRCI,8174
276
+ evalscope/metrics/llm_judge.py,sha256=XukhH9PQtIZAcbjJlOmOD9ye3ngRv_IGKKJE9jhheOE,8653
267
277
  evalscope/metrics/math_parser.py,sha256=BMfautQtNNiF9f2DIEfO6SXSn_GYhzaddAjGWG10MJA,17257
268
278
  evalscope/metrics/metric.py,sha256=6la8Nq2E_brArDcNwkbRX3ECef0AAE3IrBCfUVE7UKc,10176
269
279
  evalscope/metrics/metrics.py,sha256=VxAggzEfaLKxWcXyuve8QbEBwV2W71udVyt0gynzGec,14134
@@ -372,15 +382,16 @@ evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_proce
372
382
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py,sha256=d4HInkL_Phk0Bgg2cWaOvhsPa6lkqDeovFW86PL0I18,6371
373
383
  evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py,sha256=Epk72q5iTdzRbuzOR669QqAUMgrFFngAU8Z3Qy9KLbM,11209
374
384
  evalscope/models/__init__.py,sha256=RmW2S31BHBhMN49_VVF_5PJAk-TsuZQkuF2ALShbhAw,556
385
+ evalscope/models/image_edit_model.py,sha256=oVjGgebnFu3ZXBJLNn62rJ65fcJR7DlG4qEVxisPJ2Y,4104
375
386
  evalscope/models/mockllm.py,sha256=t1fFAHkEb1n_atOCfnGteCX3DWp774lnWcHzi5lBjwM,2511
376
- evalscope/models/model_apis.py,sha256=-yj2cY0Z0Ku6ZTnFqpoxArHLJyoUdq4YA_ChLo3_xw4,1176
387
+ evalscope/models/model_apis.py,sha256=qzoksjHJHE8CLoNT0UlnFVkmeS7ufguiAtaxZSC5Djc,1957
377
388
  evalscope/models/modelscope.py,sha256=jSFkho_Ir2py54y_Bwj9jpCoY2mMKkZ8ORzne-ldAIE,15806
378
- evalscope/models/openai_compatible.py,sha256=8WlWtu7EWr3Y5e5ErpeLQ7ZKfN4HXkFN3gV_jl5p1NM,4528
379
- evalscope/models/text2image_model.py,sha256=-NMLtZuT7L86HpkMpsz_gNaA5Z9_6p1MYzmjYZN6mvo,3929
380
- evalscope/models/utils/openai.py,sha256=u343L0OVqv4NbVSICSzwfWXh3QEyIIIv9ZWrBzW6IGk,28013
389
+ evalscope/models/openai_compatible.py,sha256=2uK78nDhWwgph7hcIiMc3NHRbIwvswRDM9o9ENahj4k,4659
390
+ evalscope/models/text2image_model.py,sha256=Sdiyw6vewjVTiXK8RFEh1pohOhDge80EoIWYpnLjr5Y,3929
391
+ evalscope/models/utils/openai.py,sha256=xnnpPKWAsqqEscOQr0WJjr7gHUa9POs55Bs1Zv6MXNQ,28182
381
392
  evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
382
393
  evalscope/perf/arguments.py,sha256=lG2IOOzxg29pdnF6IobzPcqEcYqopulFpVU2QzRaEJA,11429
383
- evalscope/perf/benchmark.py,sha256=w6gb5ofGAXxBdp6hejowKgXu0rv1xfTqhg5VYBTcGc0,7885
394
+ evalscope/perf/benchmark.py,sha256=nSJr8lQvHDYiG33tNhkYaVOYONjhJ2wUb1x5RlUiXRY,7968
384
395
  evalscope/perf/http_client.py,sha256=4Ov1Cwi7gMgO05ZmazwyfYjUGAQNGWn7nbfl1ljRNh4,4610
385
396
  evalscope/perf/main.py,sha256=WZbBgFhIj9KqxzC7_NZxDlou019_EXatsHRt5vqDhFg,3439
386
397
  evalscope/perf/plugin/__init__.py,sha256=Ztj4h1_JYJqbbWkeuDTj5aTRyGQf5Woc4xEIyjcokVU,94
@@ -404,15 +415,15 @@ evalscope/perf/plugin/datasets/random_vl_dataset.py,sha256=F3yA9Ih3YO895lZKCo3i8
404
415
  evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=J6q7AF_Re5eHLVejXEw9c1jlk1T1PPmist0yO9UFTPE,2432
405
416
  evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
406
417
  evalscope/perf/utils/analysis_result.py,sha256=aoT7JD2zAzBeuZUfncKhJ2odX_7KnymwOmNB1Upam2c,935
407
- evalscope/perf/utils/benchmark_util.py,sha256=91hZabkrDnv-0hgmqHi5TmyCT1ztbGbhu3Y5pB6vgRU,7157
418
+ evalscope/perf/utils/benchmark_util.py,sha256=V91JwpiR66tOz3N5RPp3Es29M9BghdCHj_Czb0FBekI,7274
408
419
  evalscope/perf/utils/db_util.py,sha256=HAISq6M7xCD2gjUEqqfbK3FjBxA-tvr_n-751tU9ypo,11634
409
420
  evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
410
- evalscope/perf/utils/local_server.py,sha256=Bp4pWmjZS6CDlumedea_XRsAqWuoCbmr3z4TPOk2vEc,4768
421
+ evalscope/perf/utils/local_server.py,sha256=_lSPlNEnOmPA_DtREgPS_vj2w_7D8PPSpypXbb0YfJM,4880
411
422
  evalscope/perf/utils/log_utils.py,sha256=NWSK_ITG4yoVx5GMLbIRGDoXSs90s7X3mftdm37Os2U,1666
412
423
  evalscope/perf/utils/rich_display.py,sha256=AQmXv1EuA1-IGgco-Jy1NLOmTKv4eBFH2K4QS8OoGVo,8206
413
- evalscope/report/__init__.py,sha256=qpiOJkM4PO8l9X9ZPUsqBFBfNOzYVEkYfwisfli9bJE,907
424
+ evalscope/report/__init__.py,sha256=DTigCg9fkU_zGNDqIaZy3CWYbrlvODvCxCTVqSx6ano,875
414
425
  evalscope/report/combinator.py,sha256=MAiOCj_q5mXm8-3lARvCSG12jUVEdJ8VcoEHJapoWzo,4134
415
- evalscope/report/generator.py,sha256=_ovCzV7E5SfFWLeKIW6CotSqiqiJ8qkNQU5UlPGscSM,8041
426
+ evalscope/report/generator.py,sha256=t2R3WGa4SowTRUPOgITtyTR4QDiJ6i3FH__byDKZU8Y,4959
416
427
  evalscope/report/report.py,sha256=KxboijAVNENxYHjiwyyqW_aQZ0F2CyJ6MbqUJTRHJMs,8273
417
428
  evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
418
429
  evalscope/third_party/longbench_write/README.md,sha256=1yLKeSVIcihpoc4KXr8NpK86JbcLssCPx76aOKdPbYI,5431
@@ -448,34 +459,37 @@ evalscope/third_party/toolbench_static/infer.py,sha256=rsADLhEd2IBcC6EI9aD7hSJmo
448
459
  evalscope/third_party/toolbench_static/requirements.txt,sha256=OW91Z8hfzh7yQUYgP1Di_E6DgNgGoGP1UcvnqrdCR68,22
449
460
  evalscope/third_party/toolbench_static/toolbench_static.py,sha256=xE__eXvSwHmmSh1tXNvyBo6MCO4mDlYTbIYl9OGEfNI,2120
450
461
  evalscope/third_party/toolbench_static/llm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
451
- evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=GITEbyiER10Zi-ZWpSqYCdAsiVtNeGK24hvR3kmYn2s,2689
462
+ evalscope/third_party/toolbench_static/llm/swift_infer.py,sha256=hy0JpjSEkCJh3z5ZnY8gGfdJ2ajkS5zRl-2ZQq6Gu8A,2527
452
463
  evalscope/utils/__init__.py,sha256=5OH8cOoX3YKMKUu0dMRvwzckXligIbUV-1jjJNXlpGI,2231
453
464
  evalscope/utils/argument_utils.py,sha256=D7qOH85wf7LKh_cJ2X51OEaL7CMaddydmHZkfoYpvLk,1952
454
- evalscope/utils/chat_service.py,sha256=47VmV4NdiYazfSAPww5wPf53L5avw6VETYgyGZyGvGc,8758
465
+ evalscope/utils/chat_service.py,sha256=sSki2pKGQP3UjcIf_lbO06afI-vsaUAqglwX__wUDEw,8766
455
466
  evalscope/utils/deprecation_utils.py,sha256=aDv3HFNcJFZ7rxNgALQP0-ITO8L23HC_RX-C_m2i34Y,1610
456
467
  evalscope/utils/function_utils.py,sha256=a752Z4Xb1rznnLJU9g5Pxqd3r_XzfLzAkdcjSX0kOVc,650
457
- evalscope/utils/import_utils.py,sha256=BSdp7RQSZu67129TBbtJvMWU0CfCFu864K31eiM3pr8,2975
458
- evalscope/utils/io_utils.py,sha256=elAFpyb5FGwV3AZxapkXqZmjtJCSvZZVe0QQEH4nxWM,10020
468
+ evalscope/utils/import_utils.py,sha256=b6N2x5kB_TMCkSKBlBZ5kL-x-eo_B_DWRQKtsxYL-WM,3808
469
+ evalscope/utils/io_utils.py,sha256=q26SU80VvLi1e--KDbMmIjuw3ex_WEWzkgLkmsK9n1g,11191
459
470
  evalscope/utils/json_schema.py,sha256=MLCS8cSLXF83UPebBaVWDfXJnf0qXsXnr-bIRG88cI4,7485
460
471
  evalscope/utils/logger.py,sha256=SPhhXo9gyZtWDYDLumII2CEmwHsaW8Bu1IjK5UqWrKQ,5273
461
- evalscope/utils/model_utils.py,sha256=q0mmcfUJVks21NHP8awTQk_1q6ruupjzIBN_Xo3wt40,2394
462
- evalscope/utils/multi_choices.py,sha256=ZEpN8LcZfXhhuATeMZx_uEnMg3l981J_OdSL90iFoZQ,8951
472
+ evalscope/utils/model_utils.py,sha256=rzEnlwWgupkH1vmmv-tL9-udpwHuiQlZhbX9fXPEcZg,2434
473
+ evalscope/utils/multi_choices.py,sha256=OxBER7amWpoRY0Z-o39rDmCNK6wpr1HQm9mMHpWLgp0,9524
463
474
  evalscope/utils/url_utils.py,sha256=9HcFt9uZNbOJR3ADUFQ_dBFKziHV6H66Df7HYs1M4Po,1757
464
475
  tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
476
+ tests/common.py,sha256=BB136KcGaEfdWqMwApa48K0CTSGmOCUZ0FYDqpfYnAA,2423
465
477
  tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
466
478
  tests/utils.py,sha256=Fgm0CU6ilZjCGOfOMJH-Trxy0UIAGbhvy0Ijy_zDGUk,323
467
- tests/aigc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
468
- tests/aigc/test_t2i.py,sha256=fciaGsOrkOpT4WQlsnmjrqw6qolCzI0DGyWQAJkM-Es,4513
469
479
  tests/benchmark/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
470
- tests/benchmark/test_eval.py,sha256=fHAr4h2YjqIVk-FHp93HUZvRZ1fvlVFd1EUeRwrIwYw,12559
480
+ tests/benchmark/test_eval.py,sha256=Grms3aMWQONexSsSvOSxkoURHLJ2Z0SqBjrcVWDoMRs,12455
481
+ tests/benchmark/test_image_edit.py,sha256=z3z7psMRFynpVgUAFoH--ieeGXzb9cHkrq3tT_sCZo8,2165
482
+ tests/benchmark/test_t2i.py,sha256=fciaGsOrkOpT4WQlsnmjrqw6qolCzI0DGyWQAJkM-Es,4513
483
+ tests/benchmark/test_vlm.py,sha256=k2DC0zWO2TtVSf-MP-n-wGwfk9MWKKd6hZzkC4nlUO0,2541
471
484
  tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
472
- tests/cli/test_all.py,sha256=a3G0LMgQx3M97uy0GfX1DFxbA7zWofkxgtwT8PMorQI,6268
473
- tests/cli/test_collection.py,sha256=OUm2_Qt0zkQehPTAmUaGRNBes8ewr7wYfE0E-gUe1J8,4386
485
+ tests/cli/test_all.py,sha256=1omOXC1lBphBLm0hTf5HNstlF_bwi16dYyr00gvaCTM,7301
486
+ tests/cli/test_collection.py,sha256=lGz3YUS_0gM6_HjQLe26OfBAkHOPOEDWMO-UyP58GN8,4455
474
487
  tests/cli/test_custom.py,sha256=9z_N7Re712xI62TqVSTBdzB_iFFEUb55wcWIcGvJb84,9254
488
+ tests/cli/test_reasoning.py,sha256=rU181LLoKbFCpNPFCIZULxEgsJ2PYswel2pP2EsjEmo,2696
475
489
  tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
476
- tests/perf/test_perf.py,sha256=AEWvpN3ID6s-9MEoaZjQqUM8VVsqgk_v9KX8pDgvozA,5864
490
+ tests/perf/test_perf.py,sha256=yqm3abB5ZdNPKaJkvzMvfcz-Cz_o2RxUZ3ZnqgRb-tQ,5937
477
491
  tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
478
- tests/rag/test_clip_benchmark.py,sha256=YmfezEzqBrl9-Ga2pG4YXs0ARcD5gWmuzINjY08tPpM,2695
492
+ tests/rag/test_clip_benchmark.py,sha256=qpSLgmHMGcYTnxP7AI__y-ii5_tu_fCSht6p3TBetkA,2650
479
493
  tests/rag/test_mteb.py,sha256=fdNQIyUEzE7puPCKw5QhCHTEu7hz-ieHeq1xCWGh6IM,7246
480
494
  tests/rag/test_ragas.py,sha256=5qozXvPFIb67T-igJv87ijlOgkPnqgkkBVXu6Ht4D0A,4554
481
495
  tests/swift/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
@@ -484,9 +498,9 @@ tests/swift/test_run_swift_vlm_eval.py,sha256=RwrKkc1WHEZxetM11cGL81G4faKCn7SYn4
484
498
  tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=UAUtOCQ72xbm8s-sov3cBEpYVDy189wpB-qOL3KoU7M,6053
485
499
  tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
486
500
  tests/vlm/test_vlmeval.py,sha256=EDQRkYfSyOICUwo_tm3p-puaE_xdFmqOPkrt5etxsqM,3307
487
- evalscope-1.0.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
488
- evalscope-1.0.0.dist-info/METADATA,sha256=FKr7sZCbyX_HxicgCX5rHrZz19STzLSK1Tgmm0CrWlg,39723
489
- evalscope-1.0.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
490
- evalscope-1.0.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
491
- evalscope-1.0.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
492
- evalscope-1.0.0.dist-info/RECORD,,
501
+ evalscope-1.0.1.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
502
+ evalscope-1.0.1.dist-info/METADATA,sha256=2XzuX9tVYzONuLHVq2WsQ_uaWImGVwiY2IPAJhpNEOA,40287
503
+ evalscope-1.0.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
504
+ evalscope-1.0.1.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
505
+ evalscope-1.0.1.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
506
+ evalscope-1.0.1.dist-info/RECORD,,
@@ -4,17 +4,15 @@ from dotenv import dotenv_values
4
4
  env = dotenv_values('.env')
5
5
 
6
6
  import unittest
7
- from unittest import TestCase
8
7
 
9
- from evalscope.config import TaskConfig
10
8
  from evalscope.constants import EvalType, JudgeStrategy, OutputType
11
- from evalscope.run import run_task
12
9
  from evalscope.utils.logger import get_logger
10
+ from tests.common import TestBenchmark
13
11
 
14
12
  logger = get_logger()
15
13
 
16
14
 
17
- class TestBenchmark(TestCase):
15
+ class TestNativeBenchmark(TestBenchmark):
18
16
  """Benchmark evaluation test cases."""
19
17
 
20
18
  def setUp(self):
@@ -46,27 +44,6 @@ class TestBenchmark(TestCase):
46
44
  'debug': True,
47
45
  }
48
46
 
49
- def _run_dataset_test(self, dataset_name, dataset_args=None, use_mock=False, **config_overrides):
50
- """Helper method to run test for a specific dataset."""
51
- config = self.base_config.copy()
52
- config['datasets'] = [dataset_name]
53
-
54
- if use_mock:
55
- config['eval_type'] = EvalType.MOCK_LLM
56
-
57
- # 应用配置覆盖
58
- config.update(config_overrides)
59
-
60
- if dataset_args:
61
- config['dataset_args'] = {dataset_name: dataset_args}
62
-
63
- task_cfg = TaskConfig(**config)
64
- run_task(task_cfg=task_cfg)
65
-
66
- def _run_dataset_load_test(self, dataset_name, dataset_args=None):
67
- """Helper method to test dataset loading."""
68
-
69
- self._run_dataset_test(dataset_name, dataset_args, use_mock=True, limit=None)
70
47
 
71
48
  # Math & Reasoning datasets
72
49
  def test_gsm8k(self):
@@ -84,7 +61,7 @@ class TestBenchmark(TestCase):
84
61
  """Test MMLU reasoning dataset."""
85
62
  dataset_args = {
86
63
  'few_shot_num': 0,
87
- # 'subset_list': ['abstract_algebra', 'computer_security']
64
+ 'subset_list': ['abstract_algebra', 'computer_security']
88
65
  }
89
66
  self._run_dataset_test('mmlu', use_mock=True, dataset_args=dataset_args)
90
67
 
@@ -116,7 +93,11 @@ class TestBenchmark(TestCase):
116
93
  def test_math_500(self):
117
94
  """Test MATH 500 dataset."""
118
95
  # self._run_dataset_load_test('math_500')
119
- self._run_dataset_test('math_500')
96
+ dataset_args = {
97
+ 'subset_list': ['Level 1', 'Level 2'],
98
+ 'few_shot_num': 0,
99
+ }
100
+ self._run_dataset_test('math_500', dataset_args=dataset_args)
120
101
 
121
102
  def test_aime24(self):
122
103
  """Test AIME 2024 dataset."""
@@ -364,21 +345,39 @@ class TestBenchmark(TestCase):
364
345
  'underscore_to_dot': True
365
346
  }
366
347
  }
367
- self._run_dataset_test('bfcl_v3', dataset_args)
348
+ self._run_dataset_test('bfcl_v3', dataset_args, model='qwq-plus', stream=True)
368
349
 
369
350
  def test_tau_bench(self):
370
351
  dataset_args = {
352
+ 'subset_list': [
353
+ 'airline',
354
+ 'retail'
355
+ ],
371
356
  'extra_params': {
372
357
  'user_model': 'qwen-plus',
373
358
  'api_key': env.get('DASHSCOPE_API_KEY'),
374
359
  'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
375
360
  'generation_config': {
376
- 'temperature': 0.7,
377
- 'max_new_tokens': 1024
361
+ 'temperature': 0.0,
362
+ 'max_tokens': 12000,
363
+ 'stream': True
378
364
  }
379
365
  }
380
366
  }
381
- self._run_dataset_test('tau_bench', dataset_args, limit=1)
367
+ self._run_dataset_test('tau_bench', dataset_args, limit=5, model='qwq-plus', stream=True)
368
+
369
+ def test_r1_collection(self):
370
+ dataset_args = {
371
+ 'dataset_id': 'evalscope/R1-Distill-Math-Test-v2'
372
+ }
373
+ self._run_dataset_test('data_collection', dataset_args)
374
+
375
+ def test_qwen3_collection(self):
376
+ dataset_args = {
377
+ 'dataset_id': 'evalscope/Qwen3-Test-Collection'
378
+ }
379
+ self._run_dataset_test('data_collection', dataset_args)
380
+
382
381
 
383
382
  if __name__ == '__main__':
384
383
  # Run specific test: python -m unittest test_eval.TestBenchmark.test_gsm8k
@@ -0,0 +1,65 @@
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ from dotenv import dotenv_values
3
+
4
+ env = dotenv_values('.env')
5
+
6
+ import unittest
7
+
8
+ from evalscope.constants import EvalType, JudgeStrategy, ModelTask
9
+ from evalscope.utils.logger import get_logger
10
+ from tests.common import TestBenchmark
11
+
12
+ logger = get_logger()
13
+
14
+
15
+ class TestImageEditBenchmark(TestBenchmark):
16
+ def setUp(self):
17
+ """Setup common test configuration."""
18
+ self.base_config = {
19
+ 'model': 'Qwen/Qwen-Image-Edit',
20
+ 'model_args':{
21
+ 'precision': 'bfloat16',
22
+ 'device_map': 'cuda:2'
23
+ },
24
+ 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
25
+ 'api_key': env.get('DASHSCOPE_API_KEY'),
26
+ 'model_task': ModelTask.IMAGE_GENERATION,
27
+ 'eval_type': EvalType.IMAGE_EDITING,
28
+ 'eval_batch_size': 1,
29
+ 'limit': 5,
30
+ 'generation_config': {
31
+ 'true_cfg_scale': 4.0,
32
+ 'num_inference_steps': 50,
33
+ 'negative_prompt': ' ',
34
+ },
35
+ 'judge_strategy': JudgeStrategy.AUTO,
36
+ 'judge_worker_num': 5,
37
+ 'judge_model_args': {
38
+ 'model_id': 'qwen2.5-vl-72b-instruct',
39
+ 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
40
+ 'api_key': env.get('DASHSCOPE_API_KEY'),
41
+ 'generation_config': {
42
+ 'temperature': 0.0,
43
+ 'max_tokens': 4096,
44
+ }
45
+ },
46
+ 'debug': True,
47
+ }
48
+
49
+ def test_gedit(self):
50
+ """Test GEdit dataset."""
51
+ dataset_args = {
52
+ 'extra_params':{
53
+ 'language': 'cn',
54
+ }
55
+ }
56
+ self._run_dataset_test('gedit', dataset_args=dataset_args, use_cache='outputs/20250829_150058')
57
+
58
+ def test_gedit_local(self):
59
+ dataset_args = {
60
+ 'extra_params':{
61
+ 'language': 'cn',
62
+ 'local_file': 'outputs/example_edit.jsonl',
63
+ }
64
+ }
65
+ self._run_dataset_test('gedit', dataset_args=dataset_args, model=None, model_id='offline_model')
@@ -0,0 +1,80 @@
1
+ # Copyright (c) Alibaba, Inc. and its affiliates.
2
+ from dotenv import dotenv_values
3
+
4
+ env = dotenv_values('.env')
5
+
6
+ import unittest
7
+
8
+ from evalscope.constants import EvalType, JudgeStrategy, OutputType
9
+ from evalscope.utils.logger import get_logger
10
+ from tests.common import TestBenchmark
11
+
12
+ logger = get_logger()
13
+
14
+
15
+ class TestVLMBenchmark(TestBenchmark):
16
+ """Benchmark evaluation test cases."""
17
+
18
+ def setUp(self):
19
+ """Setup common test configuration."""
20
+ self.base_config = {
21
+ 'model': 'qwen-vl-plus',
22
+ 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
23
+ 'api_key': env.get('DASHSCOPE_API_KEY'),
24
+ 'eval_type': EvalType.SERVICE,
25
+ 'eval_batch_size': 5,
26
+ 'limit': 5,
27
+ 'generation_config': {
28
+ 'max_tokens': 4096,
29
+ 'temperature': 0.0,
30
+ 'seed': 42,
31
+ 'parallel_tool_calls': True
32
+ },
33
+ 'judge_strategy': JudgeStrategy.AUTO,
34
+ 'judge_worker_num': 5,
35
+ 'judge_model_args': {
36
+ 'model_id': 'qwen2.5-72b-instruct',
37
+ 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
38
+ 'api_key': env.get('DASHSCOPE_API_KEY'),
39
+ 'generation_config': {
40
+ 'temperature': 0.0,
41
+ 'max_tokens': 4096,
42
+ }
43
+ },
44
+ 'debug': True,
45
+ }
46
+
47
+ def test_mmmu(self):
48
+ dataset_args = {
49
+ 'subset_list':[
50
+ 'Accounting',
51
+ 'Agriculture',
52
+ # 'Architecture_and_Engineering'
53
+ ]
54
+ }
55
+ self._run_dataset_test('mmmu', dataset_args=dataset_args)
56
+
57
+ def test_math_vista(self):
58
+ dataset_args = {
59
+ 'subset_list': ['default']
60
+ }
61
+ self._run_dataset_test('math_vista', dataset_args=dataset_args)
62
+
63
+ def test_mmmu_pro(self):
64
+ dataset_args = {
65
+ 'subset_list':[
66
+ 'Accounting',
67
+ # 'Agriculture',
68
+ ],
69
+ 'extra_params': {
70
+ 'dataset_format': 'standard (4 options)', # 'standard (4 options)', 'standard (10 options)', 'vision'
71
+ },
72
+ }
73
+ self._run_dataset_test('mmmu_pro', dataset_args=dataset_args, limit=10)
74
+
75
+ def test_qwen3_collection(self):
76
+ dataset_args = {
77
+ 'dataset_id': 'outputs/qwen3_vl_test.jsonl',
78
+ 'shuffle': True,
79
+ }
80
+ self._run_dataset_test('data_collection', dataset_args)