evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
tests/aigc/test_t2i.py CHANGED
@@ -25,11 +25,12 @@ class TestRun(unittest.TestCase):
             datasets=[
                 'general_t2i'
             ],
+            model_task=ModelTask.IMAGE_GENERATION,  # must be IMAGE_GENERATION
             dataset_args={
                 'general_t2i': {
                     'metric_list': [
-                        # 'PickScore',
-                        'CLIPScore',
+                        'PickScore',
+                        # 'CLIPScore',
                         # 'HPSv2Score',
                         # 'HPSv2.1Score',
                         # 'BLIPv2Score',
@@ -45,6 +46,23 @@ class TestRun(unittest.TestCase):
 
         run_task(task_cfg=task_cfg)
 
+    def test_run_local_evalmuse(self):
+        from evalscope import TaskConfig, run_task
+
+        task_cfg = TaskConfig(
+            model_id='T2I-Model',  # for display only; no model ID needs to be specified at runtime
+            model_task=ModelTask.IMAGE_GENERATION,
+            datasets=[
+                'evalmuse',  # use the EvalMuse benchmark
+            ],
+            dataset_args={
+                'evalmuse': {
+                    'dataset_id': 'data/example.jsonl',  # path to the prepared JSONL file
+                }
+            },
+        )
+
+        run_task(task_cfg=task_cfg)
 
     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_benchmark(self):
@@ -66,13 +84,13 @@ class TestRun(unittest.TestCase):
             dataset_args={
                 'tifa160': {
                     'metric_list': [
-                        'PickScore',
+                        # 'PickScore',
                         # 'CLIPScore',
                         # 'HPSv2Score',
                         # 'BLIPv2Score',
                         # 'ImageRewardScore',
                         # 'VQAScore',
-                        # 'FGA_BLIP2Score',
+                        'FGA_BLIP2Score',
                     ]
                 }
             },
tests/benchmark/__init__.py ADDED
@@ -0,0 +1 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
tests/benchmark/test_eval.py ADDED
@@ -0,0 +1,386 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from dotenv import dotenv_values
+
+env = dotenv_values('.env')
+
+import unittest
+from unittest import TestCase
+
+from evalscope.config import TaskConfig
+from evalscope.constants import EvalType, JudgeStrategy, OutputType
+from evalscope.run import run_task
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class TestBenchmark(TestCase):
+    """Benchmark evaluation test cases."""
+
+    def setUp(self):
+        """Setup common test configuration."""
+        self.base_config = {
+            'model': 'qwen-plus',
+            'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+            'api_key': env.get('DASHSCOPE_API_KEY'),
+            'eval_type': EvalType.SERVICE,
+            'eval_batch_size': 5,
+            'limit': 5,
+            'generation_config': {
+                'max_tokens': 4096,
+                'temperature': 0.0,
+                'seed': 42,
+                'parallel_tool_calls': True
+            },
+            'judge_strategy': JudgeStrategy.AUTO,
+            'judge_worker_num': 5,
+            'judge_model_args': {
+                'model_id': 'qwen2.5-72b-instruct',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 4096,
+                }
+            },
+            'debug': True,
+        }
+
+    def _run_dataset_test(self, dataset_name, dataset_args=None, use_mock=False, **config_overrides):
+        """Helper method to run test for a specific dataset."""
+        config = self.base_config.copy()
+        config['datasets'] = [dataset_name]
+
+        if use_mock:
+            config['eval_type'] = EvalType.MOCK_LLM
+
+        # apply configuration overrides
+        config.update(config_overrides)
+
+        if dataset_args:
+            config['dataset_args'] = {dataset_name: dataset_args}
+
+        task_cfg = TaskConfig(**config)
+        run_task(task_cfg=task_cfg)
+
+    def _run_dataset_load_test(self, dataset_name, dataset_args=None):
+        """Helper method to test dataset loading."""
+
+        self._run_dataset_test(dataset_name, dataset_args, use_mock=True, limit=None)
+
+    # Math & Reasoning datasets
+    def test_gsm8k(self):
+        """Test GSM8K math reasoning dataset."""
+        self._run_dataset_test('gsm8k')
+
+    def test_gsm8k_local(self):
+        """Test GSM8K math reasoning dataset with local path."""
+        dataset_args = {
+            'local_path': 'data/gsm8k',
+        }
+        self._run_dataset_test('gsm8k', dataset_args=dataset_args, use_mock=True)
+
+    def test_mmlu(self):
+        """Test MMLU reasoning dataset."""
+        dataset_args = {
+            'few_shot_num': 0,
+            # 'subset_list': ['abstract_algebra', 'computer_security']
+        }
+        self._run_dataset_test('mmlu', use_mock=True, dataset_args=dataset_args)
+
+    def test_mmlu_pro(self):
+        """Test MMLU-Pro reasoning dataset."""
+        dataset_args = {
+            'few_shot_num': 2,
+            'subset_list': ['computer science', 'math']
+        }
+        self._run_dataset_test('mmlu_pro', use_mock=False, dataset_args=dataset_args, repeats=2)
+
+    def test_mmlu_redux(self):
+        """Test MMLU-Redux reasoning dataset."""
+        dataset_args = {
+            'subset_list': ['abstract_algebra', 'computer_security'],
+        }
+        # self._run_dataset_load_test('mmlu_redux', dataset_args)
+        self._run_dataset_test('mmlu_redux', dataset_args=dataset_args)
+
+    def test_cmmlu(self):
+        """Test C-MMLU reasoning dataset."""
+        dataset_args = {
+            'subset_list': ['agronomy', 'computer_security'],
+            'few_shot_num': 0,
+        }
+        # self._run_dataset_load_test('cmmlu')
+        self._run_dataset_test('cmmlu', dataset_args=dataset_args)
+
+    def test_math_500(self):
+        """Test MATH 500 dataset."""
+        # self._run_dataset_load_test('math_500')
+        self._run_dataset_test('math_500')
+
+    def test_aime24(self):
+        """Test AIME 2024 dataset."""
+        self._run_dataset_test('aime24')
+
+    def test_aime25(self):
+        """Test AIME 2025 dataset."""
+        self._run_dataset_test('aime25')
+
+    def test_competition_math(self):
+        """Test Competition Math dataset."""
+        dataset_args = {
+            'subset_list': ['Level 4']
+        }
+        self._run_dataset_test('competition_math', dataset_args)
+
+    # Knowledge & QA datasets
+    def test_arc(self):
+        """Test ARC dataset."""
+        # self._run_dataset_load_test('arc')
+        dataset_args = {
+            'subset_list': ['ARC-Easy', 'ARC-Challenge'],
+            'few_shot_num': 2,
+        }
+        self._run_dataset_test('arc', dataset_args=dataset_args)
+
+    def test_ceval(self):
+        """Test CEval dataset."""
+        dataset_args = {
+            'subset_list': ['logic', 'law'],
+            # 'few_shot_num': 0,
+        }
+        # self._run_dataset_load_test('ceval')
+        self._run_dataset_test('ceval', dataset_args=dataset_args)
+
+    def test_super_gpqa(self):
+        """Test Super GPQA dataset."""
+        # self._run_dataset_load_test('super_gpqa')
+
+        dataset_args = {
+            'subset_list': ['History', 'Psychology'],
+            'few_shot_num': 0,
+        }
+        self._run_dataset_test('super_gpqa', dataset_args=dataset_args, ignore_errors=True)
+
+    def test_gpqa(self):
+        """Test GPQA dataset."""
+        # self._run_dataset_load_test('gpqa_diamond')
+        dataset_args = {
+            'few_shot_num': 0,
+        }
+        self._run_dataset_test('gpqa_diamond', dataset_args=dataset_args, ignore_errors=True)
+
+    def test_iquiz(self):
+        """Test IQuiz dataset."""
+        dataset_args = {
+            'subset_list': ['IQ', 'EQ'],
+            'few_shot_num': 0,
+        }
+        self._run_dataset_test('iquiz', dataset_args=dataset_args)
+
+    def test_maritime_bench(self):
+        """Test MaritimeBench dataset."""
+        dataset_args = {
+            'subset_list': ['default'],
+            'few_shot_num': 0,
+        }
+        self._run_dataset_test('maritime_bench', dataset_args=dataset_args)
+
+    def test_musr(self):
+        """Test MuSR dataset."""
+        dataset_args = {
+            'subset_list': ['murder_mysteries', 'object_placements', 'team_allocation'],
+            'few_shot_num': 0,
+        }
+        self._run_dataset_test('musr', dataset_args=dataset_args)
+
+    def test_hellaswag(self):
+        """Test HellaSwag dataset."""
+        self._run_dataset_test('hellaswag')
+
+    def test_truthful_qa(self):
+        """Test TruthfulQA dataset."""
+        dataset_args = {
+            'extra_params': {
+                'multiple_correct': True
+            }
+        }
+        self._run_dataset_test('truthful_qa', dataset_args=dataset_args)
+
+    def test_trivia_qa(self):
+        """Test TriviaQA dataset."""
+        self._run_dataset_test('trivia_qa')
+
+    def test_race(self):
+        """Test RACE dataset."""
+        self._run_dataset_test('race')
+
+    def test_winogrande(self):
+        """Test winogrande"""
+        self._run_dataset_test('winogrande')
+
+    def test_bbh(self):
+        dataset_args = {
+            'subset_list': ['temporal_sequences', 'navigate'],
+        }
+        self._run_dataset_test('bbh', dataset_args=dataset_args)
+
+    def test_simple_qa(self):
+        """Test SimpleQA dataset."""
+        self._run_dataset_test('simple_qa')
+
+    def test_chinese_simpleqa(self):
+        """Test Chinese SimpleQA dataset."""
+        dataset_args = {
+            'subset_list': ['中华文化']
+        }
+        self._run_dataset_test('chinese_simpleqa', dataset_args)
+
+    # Code datasets
+    def test_live_code_bench(self):
+        """Test LiveCodeBench dataset."""
+        dataset_args = {
+            'extra_params': {
+                'start_date': '2024-08-01',
+                'end_date': '2025-02-28'
+            },
+            'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/code_generation_lite'
+        }
+        self._run_dataset_test('live_code_bench', dataset_args)
+
+    def test_humaneval(self):
+        """Test HumanEval dataset."""
+        self._run_dataset_test('humaneval')
+
+    # Custom & specialized datasets
+    def test_general_qa(self):
+        """Test custom general QA dataset."""
+        dataset_args = {
+            'local_path': 'custom_eval/text/qa',
+            'subset_list': ['example']
+        }
+        self._run_dataset_test('general_qa', dataset_args)
+
+    def test_general_mcq(self):
+        """Test custom general MCQ dataset."""
+        dataset_args = {
+            'local_path': 'custom_eval/text/mcq',
+            'subset_list': ['example']
+        }
+        self._run_dataset_test('general_mcq', dataset_args)
+
+    def test_alpaca_eval(self):
+        """Test AlpacaEval dataset."""
+        self._run_dataset_test('alpaca_eval')
+
+    def test_arena_hard(self):
+        """Test Arena Hard dataset."""
+        self._run_dataset_test('arena_hard', use_cache='outputs/20250818_211353')
+
+    def test_frames(self):
+        """Test Frames dataset."""
+        dataset_args = {
+            # 'local_path': '/root/.cache/modelscope/hub/datasets/iic/frames'
+        }
+        self._run_dataset_test('frames', dataset_args)
+
+    def test_docmath(self):
+        """Test DocMath dataset."""
+        self._run_dataset_test('docmath')
+
+    def test_drop(self):
+        """Test DROP dataset."""
+        dataset_args = {
+            'few_shot_num': 3,
+        }
+        self._run_dataset_test('drop', dataset_args=dataset_args)
+
+    def test_ifeval(self):
+        """Test IFEval dataset."""
+        self._run_dataset_test('ifeval')
+
+    def test_needle_haystack(self):
+        """Test Needle in Haystack dataset."""
+        dataset_args = {
+            'subset_list': ['english'],
+            'extra_params': {
+                'context_lengths_max': 10000,
+                'context_lengths_num_intervals': 5,
+                'document_depth_percent_intervals': 5,
+                'show_score': True,
+            }
+        }
+        self._run_dataset_test('needle_haystack', dataset_args)
+
+    def test_ifeval(self):
+        """Test IFEval dataset."""
+        self._run_dataset_test('ifeval')
+
+    def test_hle(self):
+        """Test HLE dataset."""
+        dataset_args = {
+            'subset_list': ['Math', 'Other'],
+            'extra_params': {
+                'include_multi_modal': False
+            }
+        }
+        self._run_dataset_test('hle', dataset_args)
+
+    def test_process_bench(self):
+        """Test ProcessBench dataset."""
+        dataset_args = {
+            'subset_list': ['gsm8k', 'math'],
+        }
+        self._run_dataset_test('process_bench', dataset_args, use_cache='outputs/20250819_161844')
+
+    def test_humaneval(self):
+        """Test HumanEval dataset."""
+        dataset_args = {
+            'metric_list': ['Pass@1', 'Pass@2', 'Pass@5']
+        }
+        self._run_dataset_test('humaneval', dataset_args, repeats=5)
+
+    def test_live_code_bench(self):
+        """Test LiveCodeBench dataset."""
+        dataset_args = {
+            'subset_list': ['v6'],
+            'extra_params': {
+                'start_date': '2024-08-01',
+                'end_date': '2025-02-28'
+            },
+        }
+        self._run_dataset_test('live_code_bench', dataset_args, judge_worker_num=1)
+
+    def test_tool_bench(self):
+        """Test ToolBench dataset."""
+        self._run_dataset_test('tool_bench')
+
+    def test_bfcl(self):
+        """Test BFCL dataset."""
+        dataset_args = {
+            'subset_list': ['simple', 'live_multiple', 'multi_turn_base'],
+            'extra_params': {
+                'is_fc_model': True,
+                'underscore_to_dot': True
+            }
+        }
+        self._run_dataset_test('bfcl_v3', dataset_args)
+
+    def test_tau_bench(self):
+        dataset_args = {
+            'extra_params': {
+                'user_model': 'qwen-plus',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+                'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'generation_config': {
+                    'temperature': 0.7,
+                    'max_new_tokens': 1024
+                }
+            }
+        }
+        self._run_dataset_test('tau_bench', dataset_args, limit=1)
+
+if __name__ == '__main__':
+    # Run specific test: python -m unittest test_eval.TestBenchmark.test_gsm8k
+    # Run all tests: python -m unittest test_eval.TestBenchmark
+    unittest.main()
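For orientation, every case in the new tests/benchmark/test_eval.py reduces to one pattern: build a TaskConfig pointing at an OpenAI-compatible endpoint and hand it to run_task. A minimal sketch of that pattern follows; the model name, endpoint, and API key are placeholders for illustration, not values taken from this diff.

    # Minimal sketch of the pattern exercised by tests/benchmark/test_eval.py
    # (endpoint, model name, and key are placeholders).
    from evalscope.config import TaskConfig
    from evalscope.constants import EvalType
    from evalscope.run import run_task

    task_cfg = TaskConfig(
        model='my-model',                             # served model name (placeholder)
        api_url='https://example.com/v1',             # OpenAI-compatible endpoint (placeholder)
        api_key='sk-...',                             # credential (placeholder)
        eval_type=EvalType.SERVICE,                   # evaluate a remote service rather than local weights
        datasets=['gsm8k'],                           # any registered benchmark
        dataset_args={'gsm8k': {'few_shot_num': 0}},  # per-dataset overrides
        eval_batch_size=5,
        limit=5,                                      # small sample for a smoke test
        generation_config={'max_tokens': 4096, 'temperature': 0.0},
    )
    run_task(task_cfg=task_cfg)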
tests/cli/test_all.py CHANGED
@@ -32,7 +32,7 @@ datasets=[
     'competition_math',
     'math_500',
     'aime24',
-    'gpqa',
+    'gpqa_diamond',
     'arc',
     'ceval',
     'hellaswag',
@@ -82,8 +82,7 @@ dataset_args={
     'bbh': {
         'subset_list': ['word_sorting', 'movie_recommendation'],
     },
-    'gpqa': {
-        'subset_list': ['gpqa_diamond'],
+    'gpqa_diamond': {
         'few_shot_num': 0,
     },
     'humaneval': {
@@ -112,8 +111,7 @@ dataset_args={
         'subset_list': [
             'example',  # evaluation dataset name, i.e. the * in the *_dev.csv files above
             # 'test'
-        ],
-        'metric_list': ['AverageBLEU']
+        ]
     },
     'super_gpqa': {
         'subset_list': ['Philosophy', 'Education'],
tests/cli/test_collection.py CHANGED
@@ -1,3 +1,6 @@
+from dotenv import dotenv_values
+
+env = dotenv_values('.env')
 import json
 import os
 import unittest
@@ -15,7 +18,6 @@ class TestCollection(unittest.TestCase):
         CollectionSchema(name='math', datasets=[
             CollectionSchema(name='generation', datasets=[
                 DatasetInfo(name='gsm8k', weight=1, task_type='math', tags=['en', 'math']),
-                DatasetInfo(name='competition_math', weight=1, task_type='math', tags=['en', 'math']),
             ]),
             CollectionSchema(name='multiple_choice', datasets=[
                 DatasetInfo(name='cmmlu', weight=2, task_type='math', tags=['zh', 'math'], args={'subset_list': ['college_mathematics', 'high_school_mathematics']}),
@@ -45,15 +47,22 @@ class TestCollection(unittest.TestCase):
         from evalscope import TaskConfig, run_task
 
         task_cfg = TaskConfig(
-            model='Qwen2.5-0.5B-Instruct',
-            api_url='http://127.0.0.1:8801/v1/chat/completions',
-            api_key='EMPTY',
+            model='qwen-plus',
+            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+            api_key=env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
             datasets=['data_collection'],
             dataset_args={'data_collection': {
                 'local_path': 'outputs/mixed_data_test.jsonl'
                 # 'local_path': 'outputs/weighted_mixed_data.jsonl'
             }},
+            eval_batch_size=5,
+            generation_config = {
+                'max_tokens': 10000,
+                'temperature': 0.0,
+            },
+            limit=50,
+            # use_cache='outputs/20250822_161804'
         )
         run_task(task_cfg=task_cfg)
 
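The CollectionSchema lines kept as context above are the data-mixing API this test exercises. A minimal sketch of assembling such a schema, assuming CollectionSchema and DatasetInfo are exposed from evalscope.collections as the test implies; weights, tags, and subset lists are illustrative values only.

    # Sketch of the collection-schema pattern visible in tests/cli/test_collection.py
    # (illustrative weights, tags, and subsets).
    from evalscope.collections import CollectionSchema, DatasetInfo

    schema = CollectionSchema(name='math', datasets=[
        CollectionSchema(name='generation', datasets=[
            DatasetInfo(name='gsm8k', weight=1, task_type='math', tags=['en', 'math']),
        ]),
        CollectionSchema(name='multiple_choice', datasets=[
            DatasetInfo(name='cmmlu', weight=2, task_type='math', tags=['zh', 'math'],
                        args={'subset_list': ['college_mathematics']}),
        ]),
    ])

The mixed JSONL the task later loads via dataset_args={'data_collection': {'local_path': ...}} appears to be sampled from such a schema elsewhere in the same test module.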
tests/cli/test_custom.py CHANGED
@@ -10,7 +10,7 @@ import subprocess
 import unittest
 
 from evalscope.config import TaskConfig
-from evalscope.constants import EvalStage, EvalType, JudgeStrategy, OutputType
+from evalscope.constants import EvalType, JudgeStrategy, OutputType
 from evalscope.run import run_task
 from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
@@ -120,7 +120,7 @@ class TestRunCustom(unittest.TestCase):
         from evalscope.config import TaskConfig
 
         task_cfg = TaskConfig(
-            model='qwen2.5-72b-instruct',
+            model='qwen2.5-7b-instruct',
             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
             api_key= env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
@@ -132,7 +132,7 @@ class TestRunCustom(unittest.TestCase):
                     'dataset_id': 'custom_eval/text/qa',
                     'subset_list': [
                         'arena',
-                        'example'
+                        # 'example'
                     ],
                 }
             },
@@ -147,7 +147,7 @@ class TestRunCustom(unittest.TestCase):
             },
             ignore_errors=False,
             judge_model_args={
-                'model_id': 'qwen2.5-72b-instruct',
+                'model_id': 'qwen2.5-7b-instruct',
                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                 'api_key': env.get('DASHSCOPE_API_KEY'),
                 'generation_config': {
@@ -155,9 +155,19 @@ class TestRunCustom(unittest.TestCase):
                     'max_tokens': 4096
                 },
                 'score_type': 'numeric',
+                'prompt_template': """Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response.
+Begin your evaluation by providing a short explanation. Be as objective as possible.
+After providing your explanation, you must rate the response on a scale of 0 (worst) to 100 (best) by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\"
+
+[Question]
+{question}
+
+[Response]
+{pred}
+"""
             },
             judge_worker_num=5,
-            judge_strategy=JudgeStrategy.AUTO,
+            judge_strategy=JudgeStrategy.LLM,
         )
 
         run_task(task_cfg=task_cfg)
@@ -203,8 +213,9 @@ class TestRunCustom(unittest.TestCase):
                 },
                 'score_type': 'pattern',
             },
-            judge_worker_num=5,
-            judge_strategy=JudgeStrategy.LLM,
+            judge_worker_num=1,
+            judge_strategy=JudgeStrategy.LLM_RECALL,
+            use_cache='outputs/20250818_170420'
         )
 
         run_task(task_cfg=task_cfg)
@@ -223,20 +234,16 @@ class TestRunCustom(unittest.TestCase):
                 'general_arena': {
                     'extra_params':{
                         'models':[
-                            {
-                                'name': 'qwen2.5-0.5b',
-                                'report_path': 'outputs/20250702_140354/reports/qwen2.5-0.5b-instruct'
-                            },
                             {
                                 'name': 'qwen2.5-7b',
-                                'report_path': 'outputs/20250702_140702/reports/qwen2.5-7b-instruct'
+                                'report_path': 'outputs/20250819_165034/reports/qwen2.5-7b-instruct'
                            },
                            {
                                 'name': 'qwen2.5-72b',
-                                'report_path': 'outputs/20250702_140802/reports/qwen2.5-72b-instruct'
+                                'report_path': 'outputs/20250819_164926/reports/qwen2.5-72b-instruct'
                            }
                        ],
-                        'baseline': 'qwen2.5-7b'
+                        'baseline': 'qwen2.5-72b'
                    }
                }
            },
@@ -255,7 +262,7 @@ class TestRunCustom(unittest.TestCase):
                },
            },
            judge_worker_num=5,
-           use_cache='outputs/20250702_165727'
+           # use_cache='outputs/20250819_173546'
        )
 
        run_task(task_cfg=task_cfg)
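The judge settings touched above (judge_model_args, judge_strategy, and the 0-100 prompt_template with {question} and {pred} slots) form a self-contained pattern. A hedged sketch of wiring an LLM judge into a task is below; the model IDs, endpoint, and key are placeholders.

    # Sketch of the LLM-judge configuration pattern shown in tests/cli/test_custom.py
    # (model IDs, endpoint, and key are placeholders).
    from evalscope.config import TaskConfig
    from evalscope.constants import EvalType, JudgeStrategy
    from evalscope.run import run_task

    task_cfg = TaskConfig(
        model='my-model',                      # model under evaluation (placeholder)
        api_url='https://example.com/v1',
        api_key='sk-...',
        eval_type=EvalType.SERVICE,
        datasets=['general_qa'],
        judge_strategy=JudgeStrategy.LLM,      # always score with the judge model
        judge_worker_num=5,
        judge_model_args={
            'model_id': 'my-judge-model',      # placeholder judge model
            'api_url': 'https://example.com/v1',
            'api_key': 'sk-...',
            'generation_config': {'temperature': 0.0, 'max_tokens': 4096},
            'score_type': 'numeric',           # parse a numeric rating from the judge output
        },
    )
    run_task(task_cfg=task_cfg)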
tests/rag/test_clip_benchmark.py CHANGED
@@ -1,6 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
 import os
+
 # os.environ["CUDA_VISIBLE_DEVICES"] = "1"
 import subprocess
 import unittest
evalscope/benchmarks/aigc/t2i/base.py DELETED
@@ -1,56 +0,0 @@
-from typing import List, Optional, Union
-
-from evalscope.benchmarks import DataAdapter
-from evalscope.metrics import mean, metric_registry
-from evalscope.utils.logger import get_logger
-
-logger = get_logger()
-
-
-class T2IBaseAdapter(DataAdapter):
-
-    def __init__(self, **kwargs):
-
-        super().__init__(**kwargs)
-
-        logger.info(f'Initializing metrics: {self.metric_list}')
-        self.metrics = {m: metric_registry.get(m).object() for m in self.metric_list}
-
-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-        # dummy prompt for general t2i
-        return self.gen_prompt_data(prompt=input_d.get('prompt', ''), id=input_d.get('id', 0))
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        # dummy gold answer for general t2i
-        return input_d.get('prompt', '')
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
-        # dummy parse pred result for general t2i
-        return result or raw_input_d.get('image_path', '')
-
-    def match(self, gold: str, pred: str) -> dict:
-        # dummy match for general t2i
-        # pred is the image path, gold is the prompt
-        res = {}
-        for metric_name, metric_func in self.metrics.items():
-            score = metric_func(images=[pred], texts=[gold])[0][0]
-            if isinstance(score, dict):
-                for k, v in score.items():
-                    res[f'{metric_name}_{k}'] = v.cpu().item()
-            else:
-                res[metric_name] = score.cpu().item()  # Updated to use score.cpu().item()
-        return res
-
-    def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
-        """
-        compute weighted mean of the bleu score of all samples
-
-        Args:
-            review_res_list: [score1, score2, ...]
-
-        Returns:
-            avg_res: List[dict]
-
-        """
-        items = super().compute_dict_metric(review_res_list, **kwargs)
-        return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]