evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
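
The file list above reflects the 1.0.x restructuring: a new evalscope/api package (benchmark adapters, dataset loaders, evaluator, model APIs), benchmarks rewritten as adapters against it, the aigc/t2i adapters replaced by evalscope/benchmarks/text2image, and a mock evaluation path used by the new tests. Below is a minimal sketch of the configuration style exercised by the added tests (tests/common.py, tests/cli/test_all.py); TaskConfig, run_task, EvalType and JudgeStrategy are taken from that test code, while the dataset choice and limits are illustrative only.

# Minimal sketch based on the added test code in this diff (tests/common.py,
# tests/cli/test_all.py). API names come from those tests; the dataset choice
# and limits below are illustrative, not taken from package documentation.
from dotenv import dotenv_values

from evalscope.config import TaskConfig
from evalscope.constants import EvalType, JudgeStrategy
from evalscope.run import run_task

env = dotenv_values('.env')
api_key = env.get('DASHSCOPE_API_KEY')

task_cfg = TaskConfig(
    model='qwen-plus',
    api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
    api_key=api_key,
    # The new tests fall back to the mock backend when no API key is set.
    eval_type=EvalType.SERVICE if api_key else EvalType.MOCK_LLM,
    datasets=['gsm8k'],
    eval_batch_size=5,
    limit=5,
    generation_config={'max_tokens': 4096, 'temperature': 0.0, 'seed': 42},
    judge_strategy=JudgeStrategy.AUTO,
)
run_task(task_cfg=task_cfg)
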
tests/cli/test_all.py CHANGED
@@ -17,44 +17,44 @@ os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
  logger = get_logger()
 
  datasets=[
- 'iquiz',
- 'ifeval',
- 'mmlu',
- 'mmlu_pro',
- 'musr',
- 'process_bench',
- 'race',
- 'trivia_qa',
- 'cmmlu',
- 'humaneval',
- 'gsm8k',
- 'bbh',
- 'competition_math',
- 'math_500',
- 'aime24',
- 'gpqa',
- 'arc',
- 'ceval',
- 'hellaswag',
- 'general_mcq',
- 'general_qa',
- 'super_gpqa',
- # 'live_code_bench',
- 'mmlu_redux',
- 'simple_qa',
- 'chinese_simpleqa',
- 'alpaca_eval',
- 'arena_hard',
- 'maritime_bench',
- 'drop',
- 'winogrande',
- 'tool_bench',
- 'frames',
- 'docmath',
- 'needle_haystack',
- 'bfcl_v3',
- 'hle',
- 'tau_bench',
+ 'iquiz',
+ 'ifeval',
+ 'mmlu',
+ 'mmlu_pro',
+ 'musr',
+ 'process_bench',
+ 'race',
+ 'trivia_qa',
+ 'cmmlu',
+ 'humaneval',
+ 'gsm8k',
+ 'bbh',
+ 'competition_math',
+ 'math_500',
+ 'aime24',
+ 'gpqa_diamond',
+ 'arc',
+ 'ceval',
+ 'hellaswag',
+ 'general_mcq',
+ 'general_qa',
+ 'super_gpqa',
+ # 'live_code_bench',
+ 'mmlu_redux',
+ 'simple_qa',
+ 'chinese_simpleqa',
+ 'alpaca_eval',
+ 'arena_hard',
+ 'maritime_bench',
+ 'drop',
+ 'winogrande',
+ 'tool_bench',
+ 'frames',
+ 'docmath',
+ 'needle_haystack',
+ 'bfcl_v3',
+ 'hle',
+ 'tau_bench',
  ]
 
  # Reverse the datasets list to ensure the order is from most recent to oldest
@@ -82,8 +82,7 @@ dataset_args={
  'bbh': {
  'subset_list': ['word_sorting', 'movie_recommendation'],
  },
- 'gpqa': {
- 'subset_list': ['gpqa_diamond'],
+ 'gpqa_diamond': {
  'few_shot_num': 0,
  },
  'humaneval': {
@@ -112,8 +111,7 @@ dataset_args={
  'subset_list': [
  'example',  # evaluation dataset name, i.e. the * in the *_dev.csv files above
  # 'test'
- ],
- 'metric_list': ['AverageBLEU']
+ ]
  },
  'super_gpqa': {
  'subset_list': ['Philosophy', 'Education'],
@@ -152,7 +150,6 @@ dataset_args={
  }
 
  class TestRun(unittest.TestCase):
- @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
  def test_benchmarks(self):
  from evalscope.config import TaskConfig
 
@@ -182,19 +179,60 @@ class TestRun(unittest.TestCase):
 
  run_task(task_cfg=task_cfg)
 
+ def test_vlm_benchmark(self):
+ from evalscope.config import TaskConfig
+
+ task_cfg = TaskConfig(
+ model='qwen-vl-plus',
+ api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+ api_key= env.get('DASHSCOPE_API_KEY'),
+ eval_type=EvalType.SERVICE,
+ datasets=[
+ 'mmmu',
+ # 'math_vista',
+ ],
+ dataset_args={
+ 'mmmu': {
+ 'subset_list': ['Accounting']
+ },
+ 'math_vista': {
+ 'subset_list': ['default']
+ }
+ },
+ eval_batch_size=1,
+ limit=1,
+ stream=True,
+ generation_config={
+ 'temperature': 0,
+ 'n': 1,
+ 'max_tokens': 4096,
+ 'image_height': 512,
+ 'image_width': 512,
+ 'image_num': 2,
+ },
+ judge_worker_num=5,
+ judge_strategy=JudgeStrategy.AUTO,
+ judge_model_args={
+ 'model_id': 'qwen2.5-72b-instruct',
+ 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+ 'api_key': env.get('DASHSCOPE_API_KEY'),
+ }
+ )
+
+ run_task(task_cfg=task_cfg)
 
- @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
  def test_ci_lite(self):
  from evalscope.config import TaskConfig
 
+ api_key = env.get('DASHSCOPE_API_KEY')
+
  task_cfg = TaskConfig(
  model='qwen-plus',
  api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
- api_key= env.get('DASHSCOPE_API_KEY'),
- eval_type=EvalType.SERVICE,
+ api_key=api_key,
+ eval_type=EvalType.SERVICE if api_key else EvalType.MOCK_LLM,
  datasets=[
  'general_mcq',
- 'general_qa',
  'iquiz',
  ],
  dataset_args={
tests/cli/test_collection.py CHANGED
@@ -1,3 +1,6 @@
+ from dotenv import dotenv_values
+
+ env = dotenv_values('.env')
  import json
  import os
  import unittest
@@ -15,7 +18,6 @@ class TestCollection(unittest.TestCase):
  CollectionSchema(name='math', datasets=[
  CollectionSchema(name='generation', datasets=[
  DatasetInfo(name='gsm8k', weight=1, task_type='math', tags=['en', 'math']),
- DatasetInfo(name='competition_math', weight=1, task_type='math', tags=['en', 'math']),
  ]),
  CollectionSchema(name='multiple_choice', datasets=[
  DatasetInfo(name='cmmlu', weight=2, task_type='math', tags=['zh', 'math'], args={'subset_list': ['college_mathematics', 'high_school_mathematics']}),
@@ -45,15 +47,25 @@ class TestCollection(unittest.TestCase):
  from evalscope import TaskConfig, run_task
 
  task_cfg = TaskConfig(
- model='Qwen2.5-0.5B-Instruct',
- api_url='http://127.0.0.1:8801/v1/chat/completions',
- api_key='EMPTY',
+ model='qwen-plus',
+ api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+ api_key=env.get('DASHSCOPE_API_KEY'),
  eval_type=EvalType.SERVICE,
  datasets=['data_collection'],
- dataset_args={'data_collection': {
- 'local_path': 'outputs/mixed_data_test.jsonl'
- # 'local_path': 'outputs/weighted_mixed_data.jsonl'
- }},
+ dataset_args={
+ 'data_collection': {
+ # 'local_path': 'outputs/test_mix.jsonl'
+ 'local_path': 'outputs/mixed_data_test.jsonl',
+ 'shuffle': True,
+ }
+ },
+ eval_batch_size=5,
+ generation_config = {
+ 'max_tokens': 10000,
+ 'temperature': 0.0,
+ },
+ limit=10,
+ # use_cache='outputs/20250822_161804'
  )
  run_task(task_cfg=task_cfg)
 
tests/cli/test_custom.py CHANGED
@@ -10,7 +10,7 @@ import subprocess
  import unittest
 
  from evalscope.config import TaskConfig
- from evalscope.constants import EvalStage, EvalType, JudgeStrategy, OutputType
+ from evalscope.constants import EvalType, JudgeStrategy, OutputType
  from evalscope.run import run_task
  from evalscope.utils.import_utils import is_module_installed
  from evalscope.utils.logger import get_logger
@@ -120,7 +120,7 @@ class TestRunCustom(unittest.TestCase):
  from evalscope.config import TaskConfig
 
  task_cfg = TaskConfig(
- model='qwen2.5-72b-instruct',
+ model='qwen2.5-7b-instruct',
  api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
  api_key= env.get('DASHSCOPE_API_KEY'),
  eval_type=EvalType.SERVICE,
@@ -132,7 +132,7 @@ class TestRunCustom(unittest.TestCase):
  'dataset_id': 'custom_eval/text/qa',
  'subset_list': [
  'arena',
- 'example'
+ # 'example'
  ],
  }
  },
@@ -147,7 +147,7 @@ class TestRunCustom(unittest.TestCase):
  },
  ignore_errors=False,
  judge_model_args={
- 'model_id': 'qwen2.5-72b-instruct',
+ 'model_id': 'qwen2.5-7b-instruct',
  'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
  'api_key': env.get('DASHSCOPE_API_KEY'),
  'generation_config': {
@@ -155,9 +155,19 @@ class TestRunCustom(unittest.TestCase):
  'max_tokens': 4096
  },
  'score_type': 'numeric',
+ 'prompt_template': """Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response.
+ Begin your evaluation by providing a short explanation. Be as objective as possible.
+ After providing your explanation, you must rate the response on a scale of 0 (worst) to 100 (best) by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\"
+
+ [Question]
+ {question}
+
+ [Response]
+ {pred}
+ """
  },
  judge_worker_num=5,
- judge_strategy=JudgeStrategy.AUTO,
+ judge_strategy=JudgeStrategy.LLM,
  )
 
  run_task(task_cfg=task_cfg)
@@ -203,8 +213,9 @@ class TestRunCustom(unittest.TestCase):
  },
  'score_type': 'pattern',
  },
- judge_worker_num=5,
- judge_strategy=JudgeStrategy.LLM,
+ judge_worker_num=1,
+ judge_strategy=JudgeStrategy.LLM_RECALL,
+ use_cache='outputs/20250818_170420'
  )
 
  run_task(task_cfg=task_cfg)
@@ -223,20 +234,16 @@ class TestRunCustom(unittest.TestCase):
  'general_arena': {
  'extra_params':{
  'models':[
- {
- 'name': 'qwen2.5-0.5b',
- 'report_path': 'outputs/20250702_140354/reports/qwen2.5-0.5b-instruct'
- },
  {
  'name': 'qwen2.5-7b',
- 'report_path': 'outputs/20250702_140702/reports/qwen2.5-7b-instruct'
+ 'report_path': 'outputs/20250819_165034/reports/qwen2.5-7b-instruct'
  },
  {
  'name': 'qwen2.5-72b',
- 'report_path': 'outputs/20250702_140802/reports/qwen2.5-72b-instruct'
+ 'report_path': 'outputs/20250819_164926/reports/qwen2.5-72b-instruct'
  }
  ],
- 'baseline': 'qwen2.5-7b'
+ 'baseline': 'qwen2.5-72b'
  }
  }
  },
@@ -255,7 +262,7 @@ class TestRunCustom(unittest.TestCase):
  },
  },
  judge_worker_num=5,
- use_cache='outputs/20250702_165727'
+ # use_cache='outputs/20250819_173546'
  )
 
  run_task(task_cfg=task_cfg)
tests/cli/test_reasoning.py ADDED
@@ -0,0 +1,81 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from dotenv import dotenv_values
+
+ env = dotenv_values('.env')
+
+ import unittest
+ from unittest import TestCase
+
+ from evalscope.config import TaskConfig
+ from evalscope.constants import EvalType, JudgeStrategy, OutputType
+ from evalscope.run import run_task
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ class TestReasoning(TestCase):
+ """Benchmark evaluation test cases."""
+
+ def setUp(self):
+ """Setup common test configuration."""
+ self.base_config = {
+ 'model': 'Qwen3-0.6B',
+ 'api_url': 'http://0.0.0.0:8801/v1',
+ 'api_key': env.get('DASHSCOPE_API_KEY'),
+ 'eval_type': EvalType.SERVICE,
+ 'eval_batch_size': 5,
+ 'limit': 5,
+ 'generation_config': {
+ 'max_tokens': 4096,
+ 'temperature': 0.0,
+ 'seed': 42,
+ 'parallel_tool_calls': True,
+ 'extra_body':{'chat_template_kwargs': {'enable_thinking': False}}  # disable thinking mode
+ },
+ 'judge_strategy': JudgeStrategy.AUTO,
+ 'judge_worker_num': 5,
+ 'judge_model_args': {
+ 'model_id': 'qwen2.5-72b-instruct',
+ 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+ 'api_key': env.get('DASHSCOPE_API_KEY'),
+ 'generation_config': {
+ 'temperature': 0.0,
+ 'max_tokens': 4096,
+ }
+ },
+ 'debug': True,
+ }
+
+ def _run_dataset_test(self, dataset_name, dataset_args=None, use_mock=False, **config_overrides):
+ """Helper method to run test for a specific dataset."""
+ config = self.base_config.copy()
+ config['datasets'] = [dataset_name]
+
+ if use_mock:
+ config['eval_type'] = EvalType.MOCK_LLM
+
+ # apply config overrides
+ config.update(config_overrides)
+
+ if dataset_args:
+ config['dataset_args'] = {dataset_name: dataset_args}
+
+ task_cfg = TaskConfig(**config)
+ run_task(task_cfg=task_cfg)
+
+ def _run_dataset_load_test(self, dataset_name, dataset_args=None):
+ """Helper method to test dataset loading."""
+
+ self._run_dataset_test(dataset_name, dataset_args, use_mock=True, limit=None)
+
+ # Math & Reasoning datasets
+ def test_gsm8k(self):
+ """Test GSM8K math reasoning dataset."""
+ self._run_dataset_test('gsm8k')
+
+
+ if __name__ == '__main__':
+ # Run specific test: python -m unittest test_eval.TestBenchmark.test_gsm8k
+ # Run all tests: python -m unittest test_eval.TestBenchmark
+ unittest.main()
tests/common.py ADDED
@@ -0,0 +1,73 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from dotenv import dotenv_values
+
+ env = dotenv_values('.env')
+
+ import unittest
+ from unittest import TestCase
+
+ from evalscope.config import TaskConfig
+ from evalscope.constants import EvalType, JudgeStrategy
+ from evalscope.run import run_task
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ class TestBenchmark(TestCase):
+ """Benchmark evaluation test cases."""
+
+ def setUp(self):
+ """Setup common test configuration."""
+ self.base_config = {
+ 'model': 'qwen-plus',
+ 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+ 'api_key': env.get('DASHSCOPE_API_KEY'),
+ 'eval_type': EvalType.SERVICE,
+ 'eval_batch_size': 5,
+ 'limit': 5,
+ 'generation_config': {
+ 'max_tokens': 4096,
+ 'temperature': 0.0,
+ 'seed': 42,
+ 'parallel_tool_calls': True
+ },
+ 'judge_strategy': JudgeStrategy.AUTO,
+ 'judge_worker_num': 5,
+ 'judge_model_args': {
+ 'model_id': 'qwen2.5-72b-instruct',
+ 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+ 'api_key': env.get('DASHSCOPE_API_KEY'),
+ 'generation_config': {
+ 'temperature': 0.0,
+ 'max_tokens': 4096,
+ }
+ },
+ 'debug': True,
+ }
+
+ def _run_dataset_test(self, dataset_name, dataset_args=None, use_mock=False, **config_overrides):
+ """Helper method to run test for a specific dataset."""
+ config = self.base_config.copy()
+ config['datasets'] = [dataset_name]
+
+ if not env.get('DASHSCOPE_API_KEY'):
+ use_mock = True
+ logger.warning('DASHSCOPE_API_KEY is not set. Using mock evaluation.')
+
+ if use_mock:
+ config['eval_type'] = EvalType.MOCK_LLM
+
+ # apply config overrides
+ config.update(config_overrides)
+
+ if dataset_args:
+ config['dataset_args'] = {dataset_name: dataset_args}
+
+ task_cfg = TaskConfig(**config)
+ run_task(task_cfg=task_cfg)
+
+ def _run_dataset_load_test(self, dataset_name, dataset_args=None):
+ """Helper method to test dataset loading."""
+
+ self._run_dataset_test(dataset_name, dataset_args, use_mock=True, limit=None)
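
The TestBenchmark helper above is presumably what the new benchmark tests build on (e.g. tests/benchmark/test_eval.py, +385 lines in this diff, not shown here). A hedged sketch of how a concrete test case would use it follows; the subclass name and the tests.common import path are assumptions, while the helper methods and dataset names come from code visible in this diff.

# Hedged sketch: driving the shared TestBenchmark helper from a concrete test.
# The class name TestMathBenchmarks and the `from tests.common import ...`
# path are assumptions; only _run_dataset_test/_run_dataset_load_test and the
# dataset names appear in code shown in this diff.
import unittest

from tests.common import TestBenchmark


class TestMathBenchmarks(TestBenchmark):

    def test_gsm8k(self):
        # Full run through TaskConfig/run_task (mocked if no API key is set).
        self._run_dataset_test('gsm8k')

    def test_gpqa_diamond_load(self):
        # Load-only check: forces EvalType.MOCK_LLM and removes the limit.
        self._run_dataset_load_test('gpqa_diamond', {'few_shot_num': 0})


if __name__ == '__main__':
    unittest.main()
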
tests/perf/test_perf.py CHANGED
@@ -1,9 +1,7 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
- import os
  from dotenv import dotenv_values
 
  env = dotenv_values('.env')
- os.environ['CUDA_VISIBLE_DEVICES'] = '0'
  import unittest
 
  from evalscope.perf.main import run_perf_benchmark
@@ -123,6 +121,10 @@ class TestPerf(unittest.TestCase):
 
  @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
  def test_run_perf_multi_parallel(self):
+ if not env.get('DASHSCOPE_API_KEY'):
+ self.skipTest('DASHSCOPE_API_KEY is not set.')
+ return
+
  from evalscope.perf.arguments import Arguments
  task_cfg = Arguments(
  parallel=[1, 2],
tests/rag/test_clip_benchmark.py CHANGED
@@ -1,7 +1,5 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
-
  import os
- # os.environ["CUDA_VISIBLE_DEVICES"] = "1"
  import subprocess
  import unittest
 
evalscope/benchmarks/aigc/t2i/base.py DELETED
@@ -1,56 +0,0 @@
- from typing import List, Optional, Union
-
- from evalscope.benchmarks import DataAdapter
- from evalscope.metrics import mean, metric_registry
- from evalscope.utils.logger import get_logger
-
- logger = get_logger()
-
-
- class T2IBaseAdapter(DataAdapter):
-
- def __init__(self, **kwargs):
-
- super().__init__(**kwargs)
-
- logger.info(f'Initializing metrics: {self.metric_list}')
- self.metrics = {m: metric_registry.get(m).object() for m in self.metric_list}
-
- def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
- # dummy prompt for general t2i
- return self.gen_prompt_data(prompt=input_d.get('prompt', ''), id=input_d.get('id', 0))
-
- def get_gold_answer(self, input_d: dict) -> str:
- # dummy gold answer for general t2i
- return input_d.get('prompt', '')
-
- def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
- # dummy parse pred result for general t2i
- return result or raw_input_d.get('image_path', '')
-
- def match(self, gold: str, pred: str) -> dict:
- # dummy match for general t2i
- # pred is the image path, gold is the prompt
- res = {}
- for metric_name, metric_func in self.metrics.items():
- score = metric_func(images=[pred], texts=[gold])[0][0]
- if isinstance(score, dict):
- for k, v in score.items():
- res[f'{metric_name}_{k}'] = v.cpu().item()
- else:
- res[metric_name] = score.cpu().item()  # Updated to use score.cpu().item()
- return res
-
- def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
- """
- compute weighted mean of the bleu score of all samples
-
- Args:
- review_res_list: [score1, score2, ...]
-
- Returns:
- avg_res: List[dict]
-
- """
- items = super().compute_dict_metric(review_res_list, **kwargs)
- return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py DELETED
@@ -1,78 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- import os.path
- from collections import defaultdict
- from typing import List, Optional, Union
-
- from evalscope.benchmarks import Benchmark
- from evalscope.constants import OutputType
- from evalscope.metrics import mean
- from evalscope.utils.io_utils import jsonl_to_list
- from evalscope.utils.logger import get_logger
- from .base import T2IBaseAdapter
-
- logger = get_logger()
-
-
- @Benchmark.register(
- name='evalmuse',
- dataset_id='AI-ModelScope/T2V-Eval-Prompts',
- model_adapter=OutputType.IMAGE_GENERATION,
- output_types=[OutputType.IMAGE_GENERATION],
- subset_list=['EvalMuse'],
- metric_list=['FGA_BLIP2Score'],
- few_shot_num=0,
- train_split=None,
- eval_split='test',
- )
- class EvalMuseAdapter(T2IBaseAdapter):
-
- def __init__(self, **kwargs):
- super().__init__(**kwargs)
-
- def load(self, **kwargs) -> dict:
- if os.path.isfile(self.dataset_id):
- data_list = jsonl_to_list(self.dataset_id)
- data_dict = {self.subset_list[0]: {'test': data_list}}
- return data_dict
- else:
- return super().load(**kwargs)
-
- def get_gold_answer(self, input_d: dict) -> dict:
- # return prompt and elements dict
- return {'prompt': input_d.get('prompt'), 'tags': input_d.get('tags', {})}
-
- def match(self, gold: dict, pred: str) -> dict:
- # dummy match for general t2i
- # pred is the image path, gold is the prompt
- res = {}
- for metric_name, metric_func in self.metrics.items():
- if metric_name == 'FGA_BLIP2Score':
- # For FGA_BLIP2Score, we need to pass the dictionary
- score = metric_func(images=[pred], texts=[gold])[0][0]
- else:
- score = metric_func(images=[pred], texts=[gold['prompt']])[0][0]
- if isinstance(score, dict):
- for k, v in score.items():
- res[f'{metric_name}:{k}'] = v.cpu().item()
- else:
- res[metric_name] = score.cpu().item()
- return res
-
- def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
- """
- compute weighted mean of the bleu score of all samples
- """
- items = super().compute_dict_metric(review_res_list, **kwargs)
- # add statistics for each metric
- new_items = defaultdict(list)
- for metric_name, value_list in items.items():
- if 'FGA_BLIP2Score' in metric_name and '(' in metric_name:  # FGA_BLIP2Score element score
- metrics_prefix = metric_name.split(':')[0]
- category = metric_name.rpartition('(')[-1].split(')')[0]
- category = category.split('-')[0].lower()  # remove the suffix if exists
- new_items[f'{metrics_prefix}:{category}'].extend(value_list)
- else:
- new_items[metric_name].extend(value_list)
-
- # calculate mean for each metric
- return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in new_items.items()]