evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of evalscope might be problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
tests/benchmark/test_eval.py
@@ -0,0 +1,385 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from dotenv import dotenv_values
+
+env = dotenv_values('.env')
+
+import unittest
+
+from evalscope.constants import EvalType, JudgeStrategy, OutputType
+from evalscope.utils.logger import get_logger
+from tests.common import TestBenchmark
+
+logger = get_logger()
+
+
+class TestNativeBenchmark(TestBenchmark):
+    """Benchmark evaluation test cases."""
+
+    def setUp(self):
+        """Setup common test configuration."""
+        self.base_config = {
+            'model': 'qwen-plus',
+            'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+            'api_key': env.get('DASHSCOPE_API_KEY'),
+            'eval_type': EvalType.SERVICE,
+            'eval_batch_size': 5,
+            'limit': 5,
+            'generation_config': {
+                'max_tokens': 4096,
+                'temperature': 0.0,
+                'seed': 42,
+                'parallel_tool_calls': True
+            },
+            'judge_strategy': JudgeStrategy.AUTO,
+            'judge_worker_num': 5,
+            'judge_model_args': {
+                'model_id': 'qwen2.5-72b-instruct',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 4096,
+                }
+            },
+            'debug': True,
+        }
+
+
+    # Math & Reasoning datasets
+    def test_gsm8k(self):
+        """Test GSM8K math reasoning dataset."""
+        self._run_dataset_test('gsm8k')
+
+    def test_gsm8k_local(self):
+        """Test GSM8K math reasoning dataset with local path."""
+        dataset_args = {
+            'local_path': 'data/gsm8k',
+        }
+        self._run_dataset_test('gsm8k', dataset_args=dataset_args, use_mock=True)
+
+    def test_mmlu(self):
+        """Test MMLU reasoning dataset."""
+        dataset_args = {
+            'few_shot_num': 0,
+            'subset_list': ['abstract_algebra', 'computer_security']
+        }
+        self._run_dataset_test('mmlu', use_mock=True, dataset_args=dataset_args)
+
+    def test_mmlu_pro(self):
+        """Test MMLU-Pro reasoning dataset."""
+        dataset_args = {
+            'few_shot_num': 2,
+            'subset_list': ['computer science', 'math']
+        }
+        self._run_dataset_test('mmlu_pro', use_mock=False, dataset_args=dataset_args, repeats=2)
+
+    def test_mmlu_redux(self):
+        """Test MMLU-Redux reasoning dataset."""
+        dataset_args = {
+            'subset_list': ['abstract_algebra', 'computer_security'],
+        }
+        # self._run_dataset_load_test('mmlu_redux', dataset_args)
+        self._run_dataset_test('mmlu_redux', dataset_args=dataset_args)
+
+    def test_cmmlu(self):
+        """Test C-MMLU reasoning dataset."""
+        dataset_args = {
+            'subset_list': ['agronomy', 'computer_security'],
+            'few_shot_num': 0,
+        }
+        # self._run_dataset_load_test('cmmlu')
+        self._run_dataset_test('cmmlu', dataset_args=dataset_args)
+
+    def test_math_500(self):
+        """Test MATH 500 dataset."""
+        # self._run_dataset_load_test('math_500')
+        dataset_args = {
+            'subset_list': ['Level 1', 'Level 2'],
+            'few_shot_num': 0,
+        }
+        self._run_dataset_test('math_500', dataset_args=dataset_args)
+
+    def test_aime24(self):
+        """Test AIME 2024 dataset."""
+        self._run_dataset_test('aime24')
+
+    def test_aime25(self):
+        """Test AIME 2025 dataset."""
+        self._run_dataset_test('aime25')
+
+    def test_competition_math(self):
+        """Test Competition Math dataset."""
+        dataset_args = {
+            'subset_list': ['Level 4']
+        }
+        self._run_dataset_test('competition_math', dataset_args)
+
+    # Knowledge & QA datasets
+    def test_arc(self):
+        """Test ARC dataset."""
+        # self._run_dataset_load_test('arc')
+        dataset_args = {
+            'subset_list': ['ARC-Easy', 'ARC-Challenge'],
+            'few_shot_num': 2,
+        }
+        self._run_dataset_test('arc', dataset_args=dataset_args)
+
+    def test_ceval(self):
+        """Test CEval dataset."""
+        dataset_args = {
+            'subset_list': ['logic', 'law'],
+            # 'few_shot_num': 0,
+        }
+        # self._run_dataset_load_test('ceval')
+        self._run_dataset_test('ceval', dataset_args=dataset_args)
+
+    def test_super_gpqa(self):
+        """Test Super GPQA dataset."""
+        # self._run_dataset_load_test('super_gpqa')
+
+        dataset_args = {
+            'subset_list': ['History', 'Psychology'],
+            'few_shot_num': 0,
+        }
+        self._run_dataset_test('super_gpqa', dataset_args=dataset_args, ignore_errors=True)
+
+    def test_gpqa(self):
+        """Test GPQA dataset."""
+        # self._run_dataset_load_test('gpqa_diamond')
+        dataset_args = {
+            'few_shot_num': 0,
+        }
+        self._run_dataset_test('gpqa_diamond', dataset_args=dataset_args, ignore_errors=True)
+
+    def test_iquiz(self):
+        """Test IQuiz dataset."""
+        dataset_args = {
+            'subset_list': ['IQ', 'EQ'],
+            'few_shot_num': 0,
+        }
+        self._run_dataset_test('iquiz', dataset_args=dataset_args)
+
+    def test_maritime_bench(self):
+        """Test MaritimeBench dataset."""
+        dataset_args = {
+            'subset_list': ['default'],
+            'few_shot_num': 0,
+        }
+        self._run_dataset_test('maritime_bench', dataset_args=dataset_args)
+
+    def test_musr(self):
+        """Test MuSR dataset."""
+        dataset_args = {
+            'subset_list': ['murder_mysteries', 'object_placements', 'team_allocation'],
+            'few_shot_num': 0,
+        }
+        self._run_dataset_test('musr', dataset_args=dataset_args)
+
+    def test_hellaswag(self):
+        """Test HellaSwag dataset."""
+        self._run_dataset_test('hellaswag')
+
+    def test_truthful_qa(self):
+        """Test TruthfulQA dataset."""
+        dataset_args = {
+            'extra_params': {
+                'multiple_correct': True
+            }
+        }
+        self._run_dataset_test('truthful_qa', dataset_args=dataset_args)
+
+    def test_trivia_qa(self):
+        """Test TriviaQA dataset."""
+        self._run_dataset_test('trivia_qa')
+
+    def test_race(self):
+        """Test RACE dataset."""
+        self._run_dataset_test('race')
+
+    def test_winogrande(self):
+        """Test winogrande"""
+        self._run_dataset_test('winogrande')
+
+    def test_bbh(self):
+        dataset_args = {
+            'subset_list': ['temporal_sequences', 'navigate'],
+        }
+        self._run_dataset_test('bbh', dataset_args=dataset_args)
+
+    def test_simple_qa(self):
+        """Test SimpleQA dataset."""
+        self._run_dataset_test('simple_qa')
+
+    def test_chinese_simpleqa(self):
+        """Test Chinese SimpleQA dataset."""
+        dataset_args = {
+            'subset_list': ['中华文化']
+        }
+        self._run_dataset_test('chinese_simpleqa', dataset_args)
+
+    # Code datasets
+    def test_live_code_bench(self):
+        """Test LiveCodeBench dataset."""
+        dataset_args = {
+            'extra_params': {
+                'start_date': '2024-08-01',
+                'end_date': '2025-02-28'
+            },
+            'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/code_generation_lite'
+        }
+        self._run_dataset_test('live_code_bench', dataset_args)
+
+    def test_humaneval(self):
+        """Test HumanEval dataset."""
+        self._run_dataset_test('humaneval')
+
+    # Custom & specialized datasets
+    def test_general_qa(self):
+        """Test custom general QA dataset."""
+        dataset_args = {
+            'local_path': 'custom_eval/text/qa',
+            'subset_list': ['example']
+        }
+        self._run_dataset_test('general_qa', dataset_args)
+
+    def test_general_mcq(self):
+        """Test custom general MCQ dataset."""
+        dataset_args = {
+            'local_path': 'custom_eval/text/mcq',
+            'subset_list': ['example']
+        }
+        self._run_dataset_test('general_mcq', dataset_args)
+
+    def test_alpaca_eval(self):
+        """Test AlpacaEval dataset."""
+        self._run_dataset_test('alpaca_eval')
+
+    def test_arena_hard(self):
+        """Test Arena Hard dataset."""
+        self._run_dataset_test('arena_hard', use_cache='outputs/20250818_211353')
+
+    def test_frames(self):
+        """Test Frames dataset."""
+        dataset_args = {
+            # 'local_path': '/root/.cache/modelscope/hub/datasets/iic/frames'
+        }
+        self._run_dataset_test('frames', dataset_args)
+
+    def test_docmath(self):
+        """Test DocMath dataset."""
+        self._run_dataset_test('docmath')
+
+    def test_drop(self):
+        """Test DROP dataset."""
+        dataset_args = {
+            'few_shot_num': 3,
+        }
+        self._run_dataset_test('drop', dataset_args=dataset_args)
+
+    def test_ifeval(self):
+        """Test IFEval dataset."""
+        self._run_dataset_test('ifeval')
+
+    def test_needle_haystack(self):
+        """Test Needle in Haystack dataset."""
+        dataset_args = {
+            'subset_list': ['english'],
+            'extra_params': {
+                'context_lengths_max': 10000,
+                'context_lengths_num_intervals': 5,
+                'document_depth_percent_intervals': 5,
+                'show_score': True,
+            }
+        }
+        self._run_dataset_test('needle_haystack', dataset_args)
+
+    def test_ifeval(self):
+        """Test IFEval dataset."""
+        self._run_dataset_test('ifeval')
+
+    def test_hle(self):
+        """Test HLE dataset."""
+        dataset_args = {
+            'subset_list': ['Math', 'Other'],
+            'extra_params': {
+                'include_multi_modal': False
+            }
+        }
+        self._run_dataset_test('hle', dataset_args)
+
+    def test_process_bench(self):
+        """Test ProcessBench dataset."""
+        dataset_args = {
+            'subset_list': ['gsm8k', 'math'],
+        }
+        self._run_dataset_test('process_bench', dataset_args, use_cache='outputs/20250819_161844')
+
+    def test_humaneval(self):
+        """Test HumanEval dataset."""
+        dataset_args = {
+            'metric_list': ['Pass@1', 'Pass@2', 'Pass@5']
+        }
+        self._run_dataset_test('humaneval', dataset_args, repeats=5)
+
+    def test_live_code_bench(self):
+        """Test LiveCodeBench dataset."""
+        dataset_args = {
+            'subset_list': ['v6'],
+            'extra_params': {
+                'start_date': '2024-08-01',
+                'end_date': '2025-02-28'
+            },
+        }
+        self._run_dataset_test('live_code_bench', dataset_args, judge_worker_num=1)
+
+    def test_tool_bench(self):
+        """Test ToolBench dataset."""
+        self._run_dataset_test('tool_bench')
+
+    def test_bfcl(self):
+        """Test BFCL dataset."""
+        dataset_args = {
+            'subset_list': ['simple', 'live_multiple', 'multi_turn_base'],
+            'extra_params': {
+                'is_fc_model': True,
+                'underscore_to_dot': True
+            }
+        }
+        self._run_dataset_test('bfcl_v3', dataset_args, model='qwq-plus', stream=True)
+
+    def test_tau_bench(self):
+        dataset_args = {
+            'subset_list': [
+                'airline',
+                'retail'
+            ],
+            'extra_params': {
+                'user_model': 'qwen-plus',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+                'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 12000,
+                    'stream': True
+                }
+            }
+        }
+        self._run_dataset_test('tau_bench', dataset_args, limit=5, model='qwq-plus', stream=True)
+
+    def test_r1_collection(self):
+        dataset_args = {
+            'dataset_id': 'evalscope/R1-Distill-Math-Test-v2'
+        }
+        self._run_dataset_test('data_collection', dataset_args)
+
+    def test_qwen3_collection(self):
+        dataset_args = {
+            'dataset_id': 'evalscope/Qwen3-Test-Collection'
+        }
+        self._run_dataset_test('data_collection', dataset_args)
+
+
+if __name__ == '__main__':
+    # Run specific test: python -m unittest test_eval.TestBenchmark.test_gsm8k
+    # Run all tests: python -m unittest test_eval.TestBenchmark
+    unittest.main()
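
The trailing comments above reference test_eval.TestBenchmark, while the concrete class defined in this file is TestNativeBenchmark (TestBenchmark is the shared base imported from tests.common), and the module now lives under tests/benchmark/. A minimal sketch of invoking one of these cases programmatically, assuming the repository root is on sys.path and .env provides DASHSCOPE_API_KEY (both assumptions, not part of the release):

# Sketch only: run a single benchmark case from tests/benchmark/test_eval.py.
# Assumes the evalscope repository root is the working directory and that
# .env contains DASHSCOPE_API_KEY, which the tests read via dotenv_values('.env').
import unittest

suite = unittest.TestLoader().loadTestsFromName(
    'tests.benchmark.test_eval.TestNativeBenchmark.test_gsm8k'
)
unittest.TextTestRunner(verbosity=2).run(suite)
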
tests/benchmark/test_image_edit.py
@@ -0,0 +1,65 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from dotenv import dotenv_values
+
+env = dotenv_values('.env')
+
+import unittest
+
+from evalscope.constants import EvalType, JudgeStrategy, ModelTask
+from evalscope.utils.logger import get_logger
+from tests.common import TestBenchmark
+
+logger = get_logger()
+
+
+class TestImageEditBenchmark(TestBenchmark):
+    def setUp(self):
+        """Setup common test configuration."""
+        self.base_config = {
+            'model': 'Qwen/Qwen-Image-Edit',
+            'model_args': {
+                'precision': 'bfloat16',
+                'device_map': 'cuda:2'
+            },
+            'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+            'api_key': env.get('DASHSCOPE_API_KEY'),
+            'model_task': ModelTask.IMAGE_GENERATION,
+            'eval_type': EvalType.IMAGE_EDITING,
+            'eval_batch_size': 1,
+            'limit': 5,
+            'generation_config': {
+                'true_cfg_scale': 4.0,
+                'num_inference_steps': 50,
+                'negative_prompt': ' ',
+            },
+            'judge_strategy': JudgeStrategy.AUTO,
+            'judge_worker_num': 5,
+            'judge_model_args': {
+                'model_id': 'qwen2.5-vl-72b-instruct',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 4096,
+                }
+            },
+            'debug': True,
+        }
+
+    def test_gedit(self):
+        """Test GEdit dataset."""
+        dataset_args = {
+            'extra_params': {
+                'language': 'cn',
+            }
+        }
+        self._run_dataset_test('gedit', dataset_args=dataset_args, use_cache='outputs/20250829_150058')
+
+    def test_gedit_local(self):
+        dataset_args = {
+            'extra_params': {
+                'language': 'cn',
+                'local_file': 'outputs/example_edit.jsonl',
+            }
+        }
+        self._run_dataset_test('gedit', dataset_args=dataset_args, model=None, model_id='offline_model')
tests/{aigc → benchmark}/test_t2i.py
@@ -25,11 +25,12 @@ class TestRun(unittest.TestCase):
             datasets=[
                 'general_t2i'
             ],
+            model_task=ModelTask.IMAGE_GENERATION,  # must be IMAGE_GENERATION
             dataset_args={
                 'general_t2i': {
                     'metric_list': [
-                        # 'PickScore',
-                        'CLIPScore',
+                        'PickScore',
+                        # 'CLIPScore',
                         # 'HPSv2Score',
                         # 'HPSv2.1Score',
                         # 'BLIPv2Score',
@@ -45,6 +46,23 @@ class TestRun(unittest.TestCase):
 
         run_task(task_cfg=task_cfg)
 
+    def test_run_local_evalmuse(self):
+        from evalscope import TaskConfig, run_task
+
+        task_cfg = TaskConfig(
+            model_id='T2I-Model',  # for display only; no real model ID is needed at runtime
+            model_task=ModelTask.IMAGE_GENERATION,
+            datasets=[
+                'evalmuse',  # use the EvalMuse benchmark
+            ],
+            dataset_args={
+                'evalmuse': {
+                    'dataset_id': 'data/example.jsonl',  # path to the constructed jsonl file
+                }
+            },
+        )
+
+        run_task(task_cfg=task_cfg)
 
     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_benchmark(self):
@@ -66,13 +84,13 @@ class TestRun(unittest.TestCase):
             dataset_args={
                 'tifa160': {
                     'metric_list': [
-                        'PickScore',
+                        # 'PickScore',
                         # 'CLIPScore',
                         # 'HPSv2Score',
                         # 'BLIPv2Score',
                         # 'ImageRewardScore',
                         # 'VQAScore',
-                        # 'FGA_BLIP2Score',
+                        'FGA_BLIP2Score',
                     ]
                 }
             },
tests/benchmark/test_vlm.py
@@ -0,0 +1,80 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from dotenv import dotenv_values
+
+env = dotenv_values('.env')
+
+import unittest
+
+from evalscope.constants import EvalType, JudgeStrategy, OutputType
+from evalscope.utils.logger import get_logger
+from tests.common import TestBenchmark
+
+logger = get_logger()
+
+
+class TestVLMBenchmark(TestBenchmark):
+    """Benchmark evaluation test cases."""
+
+    def setUp(self):
+        """Setup common test configuration."""
+        self.base_config = {
+            'model': 'qwen-vl-plus',
+            'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+            'api_key': env.get('DASHSCOPE_API_KEY'),
+            'eval_type': EvalType.SERVICE,
+            'eval_batch_size': 5,
+            'limit': 5,
+            'generation_config': {
+                'max_tokens': 4096,
+                'temperature': 0.0,
+                'seed': 42,
+                'parallel_tool_calls': True
+            },
+            'judge_strategy': JudgeStrategy.AUTO,
+            'judge_worker_num': 5,
+            'judge_model_args': {
+                'model_id': 'qwen2.5-72b-instruct',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 4096,
+                }
+            },
+            'debug': True,
+        }
+
+    def test_mmmu(self):
+        dataset_args = {
+            'subset_list': [
+                'Accounting',
+                'Agriculture',
+                # 'Architecture_and_Engineering'
+            ]
+        }
+        self._run_dataset_test('mmmu', dataset_args=dataset_args)
+
+    def test_math_vista(self):
+        dataset_args = {
+            'subset_list': ['default']
+        }
+        self._run_dataset_test('math_vista', dataset_args=dataset_args)
+
+    def test_mmmu_pro(self):
+        dataset_args = {
+            'subset_list': [
+                'Accounting',
+                # 'Agriculture',
+            ],
+            'extra_params': {
+                'dataset_format': 'standard (4 options)',  # 'standard (4 options)', 'standard (10 options)', 'vision'
+            },
+        }
+        self._run_dataset_test('mmmu_pro', dataset_args=dataset_args, limit=10)
+
+    def test_qwen3_collection(self):
+        dataset_args = {
+            'dataset_id': 'outputs/qwen3_vl_test.jsonl',
+            'shuffle': True,
+        }
+        self._run_dataset_test('data_collection', dataset_args)
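
Each of the new test modules shown above loads its DashScope credentials with dotenv_values('.env'). A minimal sketch of preparing that file, assuming a placeholder value (only the DASHSCOPE_API_KEY variable name comes from the tests):

# Sketch only: create the .env file read by dotenv_values('.env') in the tests above.
# The key name is taken from the tests; the value is a placeholder, not a real credential.
from pathlib import Path

Path('.env').write_text('DASHSCOPE_API_KEY=sk-your-key-here\n')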