evalscope 0.14.0__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (178)
  1. evalscope/arguments.py +2 -1
  2. evalscope/benchmarks/__init__.py +2 -2
  3. evalscope/benchmarks/aigc/__init__.py +0 -0
  4. evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
  5. evalscope/benchmarks/aigc/t2i/base.py +56 -0
  6. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +77 -0
  7. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +58 -0
  8. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +58 -0
  9. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +57 -0
  10. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +37 -0
  11. evalscope/benchmarks/aime/aime24_adapter.py +1 -1
  12. evalscope/benchmarks/aime/aime25_adapter.py +4 -4
  13. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -2
  14. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  15. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -3
  16. evalscope/benchmarks/ceval/ceval_adapter.py +2 -2
  17. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -3
  18. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +1 -1
  19. evalscope/benchmarks/competition_math/competition_math_adapter.py +1 -2
  20. evalscope/benchmarks/data_adapter.py +16 -9
  21. evalscope/benchmarks/data_collection/data_collection_adapter.py +6 -4
  22. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -2
  23. evalscope/benchmarks/live_code_bench/evaluate_utils.py +16 -21
  24. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
  25. evalscope/benchmarks/live_code_bench/testing_util.py +6 -3
  26. evalscope/benchmarks/math_500/math_500_adapter.py +1 -1
  27. evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -1
  28. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -2
  29. evalscope/benchmarks/utils.py +7 -16
  30. evalscope/cli/start_app.py +1 -1
  31. evalscope/collections/evaluator.py +16 -4
  32. evalscope/config.py +7 -3
  33. evalscope/constants.py +11 -0
  34. evalscope/evaluator/evaluator.py +2 -2
  35. evalscope/evaluator/reviewer/auto_reviewer.py +1 -1
  36. evalscope/metrics/__init__.py +49 -4
  37. evalscope/metrics/llm_judge.py +1 -1
  38. evalscope/metrics/named_metrics.py +13 -0
  39. evalscope/metrics/t2v_metrics/__init__.py +66 -0
  40. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  41. evalscope/metrics/t2v_metrics/constants.py +12 -0
  42. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  43. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  44. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  45. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  46. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  47. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +132 -0
  48. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +286 -0
  49. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +114 -0
  50. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +86 -0
  51. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +85 -0
  52. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  53. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  54. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +84 -0
  55. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +97 -0
  56. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +171 -0
  57. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  58. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +80 -0
  59. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +73 -0
  60. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  61. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  62. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  63. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  64. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  65. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +300 -0
  66. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  67. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +82 -0
  68. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  69. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +218 -0
  70. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +150 -0
  71. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  72. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  73. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  74. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +22 -0
  75. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +188 -0
  76. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +106 -0
  77. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +307 -0
  78. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  79. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  80. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +191 -0
  81. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +318 -0
  82. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  83. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  84. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  85. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  86. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  87. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  88. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  89. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  90. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  91. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  92. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  93. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  94. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  95. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  96. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  97. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  98. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  99. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  100. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  101. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  102. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  103. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  104. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  105. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +208 -0
  106. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  107. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1093 -0
  108. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  109. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  110. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  111. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +452 -0
  112. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +364 -0
  113. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +755 -0
  114. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +273 -0
  115. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +880 -0
  116. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1844 -0
  117. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +81 -0
  118. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +56 -0
  119. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  120. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  121. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  122. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +185 -0
  123. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +178 -0
  124. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +112 -0
  125. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  126. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +344 -0
  127. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +858 -0
  128. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +271 -0
  129. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +503 -0
  130. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1270 -0
  131. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +473 -0
  132. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +31 -0
  133. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  134. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  135. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +392 -0
  136. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +127 -0
  137. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +17 -0
  138. evalscope/metrics/t2v_metrics/score.py +78 -0
  139. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  140. evalscope/models/__init__.py +50 -14
  141. evalscope/models/adapters/__init__.py +17 -0
  142. evalscope/models/{base_adapter.py → adapters/base_adapter.py} +17 -17
  143. evalscope/models/{chat_adapter.py → adapters/chat_adapter.py} +10 -7
  144. evalscope/models/{choice_adapter.py → adapters/choice_adapter.py} +2 -6
  145. evalscope/models/{custom_adapter.py → adapters/custom_adapter.py} +2 -4
  146. evalscope/models/{server_adapter.py → adapters/server_adapter.py} +1 -3
  147. evalscope/models/adapters/t2i_adapter.py +76 -0
  148. evalscope/models/custom/__init__.py +2 -1
  149. evalscope/models/custom/dummy_model.py +11 -13
  150. evalscope/models/local_model.py +82 -33
  151. evalscope/models/model.py +2 -42
  152. evalscope/models/register.py +26 -0
  153. evalscope/perf/plugin/datasets/flickr8k.py +2 -1
  154. evalscope/perf/utils/benchmark_util.py +2 -2
  155. evalscope/perf/utils/db_util.py +8 -2
  156. evalscope/report/__init__.py +1 -0
  157. evalscope/report/app.py +117 -67
  158. evalscope/report/app_arguments.py +11 -0
  159. evalscope/report/generator.py +1 -1
  160. evalscope/run.py +3 -3
  161. evalscope/third_party/thinkbench/eval.py +19 -7
  162. evalscope/utils/chat_service.py +2 -2
  163. evalscope/utils/import_utils.py +66 -0
  164. evalscope/utils/utils.py +12 -4
  165. evalscope/version.py +2 -2
  166. {evalscope-0.14.0.dist-info → evalscope-0.15.0.dist-info}/METADATA +18 -1
  167. {evalscope-0.14.0.dist-info → evalscope-0.15.0.dist-info}/RECORD +175 -63
  168. tests/aigc/__init__.py +1 -0
  169. tests/aigc/test_t2i.py +87 -0
  170. tests/cli/test_run.py +11 -5
  171. tests/perf/test_perf.py +2 -1
  172. evalscope/metrics/code_metric.py +0 -98
  173. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  174. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  175. {evalscope-0.14.0.dist-info → evalscope-0.15.0.dist-info}/LICENSE +0 -0
  176. {evalscope-0.14.0.dist-info → evalscope-0.15.0.dist-info}/WHEEL +0 -0
  177. {evalscope-0.14.0.dist-info → evalscope-0.15.0.dist-info}/entry_points.txt +0 -0
  178. {evalscope-0.14.0.dist-info → evalscope-0.15.0.dist-info}/top_level.txt +0 -0
evalscope/arguments.py CHANGED
@@ -1,7 +1,7 @@
  import argparse
  import json

- from evalscope.constants import EvalBackend, EvalStage, EvalType, JudgeStrategy, OutputType
+ from evalscope.constants import EvalBackend, EvalStage, EvalType, JudgeStrategy, ModelTask, OutputType


  class ParseStrArgsAction(argparse.Action):
@@ -35,6 +35,7 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--model', type=str, required=False, help='The model id on modelscope, or local model dir.')
      parser.add_argument('--model-id', type=str, required=False, help='The model id for model name in report.')
      parser.add_argument('--model-args', type=str, action=ParseStrArgsAction, help='The model args, should be a string.')
+     parser.add_argument('--model-task', type=str, default=ModelTask.TEXT_GENERATION, choices=[ModelTask.TEXT_GENERATION, ModelTask.IMAGE_GENERATION], help='The model task for model id.')  # noqa: E501

      # Template-related arguments
      parser.add_argument('--template-type', type=str, required=False, help='Deprecated, will be removed in v1.0.0.')
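
Note on the new flag above: the hunk does not show the string values behind ModelTask.TEXT_GENERATION and ModelTask.IMAGE_GENERATION, so the minimal sketch below assumes the lowercase names 'text_generation' and 'image_generation' purely to illustrate how --model-task parses; it mirrors the add_argument call with plain strings and is not the package's actual wiring.

import argparse

# Stand-in for the new argument (string choices assumed; the real code
# passes the ModelTask constants shown in the hunk above).
parser = argparse.ArgumentParser()
parser.add_argument(
    '--model-task',
    type=str,
    default='text_generation',
    choices=['text_generation', 'image_generation'],
    help='The model task for model id.')

args = parser.parse_args(['--model-task', 'image_generation'])
print(args.model_task)  # -> image_generation
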
evalscope/benchmarks/__init__.py CHANGED
@@ -10,8 +10,8 @@ from evalscope.utils import get_logger
  logger = get_logger()

  # Using glob to find all files matching the pattern
- pattern = os.path.join(os.path.dirname(__file__), '*', '*_adapter.py')
- files = glob.glob(pattern, recursive=False)
+ pattern = os.path.join(os.path.dirname(__file__), '*', '**', '*_adapter.py')
+ files = glob.glob(pattern, recursive=True)

  for file_path in files:
      if file_path.endswith('.py') and not os.path.basename(file_path).startswith('_'):
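
For context, a small sketch of what the widened pattern matches (the path below is illustrative and assumes it is run next to a checkout). With recursive=True, '**' matches zero or more directories, so the existing single-level '<bench>/<bench>_adapter.py' files still match while nested adapters such as the new aigc/t2i ones are discovered as well.

import glob
import os

base = 'evalscope/benchmarks'  # illustrative path to the package directory
pattern = os.path.join(base, '*', '**', '*_adapter.py')

# Matches e.g. 'evalscope/benchmarks/aime/aime24_adapter.py' (zero extra dirs)
# and 'evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py' (one extra dir).
for path in glob.glob(pattern, recursive=True):
    print(path)
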
evalscope/benchmarks/aigc/__init__.py ADDED (empty file)
evalscope/benchmarks/aigc/t2i/__init__.py ADDED (empty file)
evalscope/benchmarks/aigc/t2i/base.py ADDED
@@ -0,0 +1,56 @@
+ from typing import List, Optional, Union
+
+ from evalscope.benchmarks import DataAdapter
+ from evalscope.metrics import mean, metric_registry
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ class T2IBaseAdapter(DataAdapter):
+
+     def __init__(self, **kwargs):
+
+         super().__init__(**kwargs)
+
+         logger.info(f'Initializing metrics: {self.metric_list}')
+         self.metrics = {m: metric_registry.get(m).object() for m in self.metric_list}
+
+     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+         # dummy prompt for general t2i
+         return self.gen_prompt_data(prompt=input_d.get('prompt', ''), id=input_d.get('id', 0))
+
+     def get_gold_answer(self, input_d: dict) -> str:
+         # dummy gold answer for general t2i
+         return input_d.get('prompt', '')
+
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+         # dummy parse pred result for general t2i
+         return result or raw_input_d.get('image_path', '')
+
+     def match(self, gold: str, pred: str) -> dict:
+         # dummy match for general t2i
+         # pred is the image path, gold is the prompt
+         res = {}
+         for metric_name, metric_func in self.metrics.items():
+             score = metric_func(images=[pred], texts=[gold])[0][0]
+             if isinstance(score, dict):
+                 for k, v in score.items():
+                     res[f'{metric_name}_{k}'] = v.cpu().item()
+             else:
+                 res[metric_name] = score.cpu().item()  # Updated to use score.cpu().item()
+         return res
+
+     def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
+         """
+         compute weighted mean of the bleu score of all samples
+
+         Args:
+             review_res_list: [score1, score2, ...]
+
+         Returns:
+             avg_res: List[dict]
+
+         """
+         items = super().compute_dict_metric(review_res_list, **kwargs)
+         return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py ADDED
@@ -0,0 +1,77 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ import os.path
+ from collections import defaultdict
+ from typing import List, Optional, Union
+
+ from evalscope.benchmarks import Benchmark
+ from evalscope.constants import OutputType
+ from evalscope.metrics import mean
+ from evalscope.utils.io_utils import jsonl_to_list
+ from evalscope.utils.logger import get_logger
+ from .base import T2IBaseAdapter
+
+ logger = get_logger()
+
+
+ @Benchmark.register(
+     name='evalmuse',
+     dataset_id='AI-ModelScope/T2V-Eval-Prompts',
+     model_adapter=OutputType.IMAGE_GENERATION,
+     output_types=[OutputType.IMAGE_GENERATION],
+     subset_list=['EvalMuse'],
+     metric_list=['FGA_BLIP2Score'],
+     few_shot_num=0,
+     train_split=None,
+     eval_split='test',
+ )
+ class EvalMuseAdapter(T2IBaseAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+     def load(self, **kwargs) -> dict:
+         if os.path.isfile(self.dataset_id):
+             data_list = jsonl_to_list(self.dataset_id)
+             data_dict = {self.subset_list[0]: {'test': data_list}}
+             return data_dict
+         else:
+             return super().load(**kwargs)
+
+     def get_gold_answer(self, input_d: dict) -> dict:
+         # return prompt and elements dict
+         return {'prompt': input_d.get('prompt'), 'tags': input_d.get('tags', {})}
+
+     def match(self, gold: dict, pred: str) -> dict:
+         # dummy match for general t2i
+         # pred is the image path, gold is the prompt
+         res = {}
+         for metric_name, metric_func in self.metrics.items():
+             if metric_name == 'FGA_BLIP2Score':
+                 # For FGA_BLIP2Score, we need to pass the dictionary
+                 score = metric_func(images=[pred], texts=[gold])[0][0]
+             else:
+                 score = metric_func(images=[pred], texts=[gold['prompt']])[0][0]
+             if isinstance(score, dict):
+                 for k, v in score.items():
+                     res[f'{metric_name}:{k}'] = v.cpu().item()
+             else:
+                 res[metric_name] = score.cpu().item()
+         return res
+
+     def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
+         """
+         compute weighted mean of the bleu score of all samples
+         """
+         items = super().compute_dict_metric(review_res_list, **kwargs)
+         # add statistics for each metric
+         new_items = defaultdict(list)
+         for metric_name, value_list in items.items():
+             if 'FGA_BLIP2Score' in metric_name and '(' in metric_name:  # FGA_BLIP2Score element score
+                 metrics_prefix = metric_name.split(':')[0]
+                 category = metric_name.rpartition('(')[-1].split(')')[0]
+                 new_items[f'{metrics_prefix}:{category}'].extend(value_list)
+             else:
+                 new_items[metric_name].extend(value_list)
+
+         # calculate mean for each metric
+         return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in new_items.items()]
evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py ADDED
@@ -0,0 +1,58 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ import os.path
+ from collections import defaultdict
+ from typing import List, Optional, Union
+
+ from evalscope.benchmarks import Benchmark
+ from evalscope.constants import OutputType
+ from evalscope.utils.io_utils import jsonl_to_list
+ from evalscope.utils.logger import get_logger
+ from .base import T2IBaseAdapter
+
+ logger = get_logger()
+
+
+ @Benchmark.register(
+     name='genai_bench',
+     dataset_id='AI-ModelScope/T2V-Eval-Prompts',
+     model_adapter=OutputType.IMAGE_GENERATION,
+     output_types=[OutputType.IMAGE_GENERATION],
+     subset_list=['GenAI-Bench-1600'],
+     metric_list=['VQAScore'],
+     few_shot_num=0,
+     train_split=None,
+     eval_split='test',
+ )
+ class GenAIBenchAdapter(T2IBaseAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+     def load(self, **kwargs) -> dict:
+         if os.path.isfile(self.dataset_id):
+             data_list = jsonl_to_list(self.dataset_id)
+             data_dict = {self.subset_list[0]: {'test': data_list}}
+             return data_dict
+         else:
+             return super().load(**kwargs)
+
+     def get_gold_answer(self, input_d: dict) -> dict:
+         # return prompt and elements dict
+         return {'prompt': input_d.get('prompt'), 'tags': input_d.get('tags', {})}
+
+     def match(self, gold: dict, pred: str) -> dict:
+         # dummy match for general t2i
+         # pred is the image path, gold is the prompt
+         res = {}
+         for metric_name, metric_func in self.metrics.items():
+             score = metric_func(images=[pred], texts=[gold['prompt']])[0][0]
+
+             res[metric_name] = score.cpu().item()
+
+             # fine-granular metrics
+             if gold['tags'].get('advanced'):
+                 res[f'{metric_name}_advanced'] = score.cpu().item()
+             else:
+                 res[f'{metric_name}_basic'] = score.cpu().item()
+
+         return res
evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py ADDED
@@ -0,0 +1,58 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ import os.path
+ from collections import defaultdict
+ from typing import List, Optional, Union
+
+ from evalscope.benchmarks import Benchmark
+ from evalscope.constants import OutputType
+ from evalscope.utils.io_utils import jsonl_to_list
+ from evalscope.utils.logger import get_logger
+ from .base import T2IBaseAdapter
+
+ logger = get_logger()
+
+
+ @Benchmark.register(
+     name='general_t2i',
+     dataset_id='general_t2i',
+     model_adapter=OutputType.IMAGE_GENERATION,
+     output_types=[OutputType.IMAGE_GENERATION],
+     subset_list=['default'],
+     metric_list=['PickScore'],
+     few_shot_num=0,
+     train_split=None,
+     eval_split='test',
+ )
+ class GeneralT2IAdapter(T2IBaseAdapter):
+
+     def __init__(self, **kwargs):
+
+         super().__init__(**kwargs)
+
+     def load(self, dataset_name_or_path: str = None, subset_list: list = None, **kwargs) -> dict:
+         dataset_name_or_path = dataset_name_or_path or self.dataset_id
+         subset_list = subset_list or self.subset_list
+
+         data_file_dict = defaultdict(str)
+         data_list = []
+
+         # get data file path and subset name
+         if os.path.isdir(dataset_name_or_path):
+             for subset_name in subset_list:
+                 data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
+         elif os.path.isfile(dataset_name_or_path):
+             cur_subset_name = os.path.splitext(os.path.basename(dataset_name_or_path))[0]
+             data_file_dict[cur_subset_name] = dataset_name_or_path
+         else:
+             raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
+
+         # load data from local disk
+         try:
+             for subset_name, file_path in data_file_dict.items():
+                 data_list.extend(jsonl_to_list(file_path))
+         except Exception as e:
+             raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')
+
+         data_dict = {subset_name: {'test': data_list} for subset_name in data_file_dict.keys()}
+
+         return data_dict
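
Since general_t2i reads plain JSONL from local disk, a dataset only needs the fields consumed by T2IBaseAdapter above ('prompt', optionally 'id'), and when a directory is passed as dataset_id the file name must match a registered subset (e.g. default.jsonl). A minimal sketch with made-up prompts:

import json

rows = [
    {'id': 0, 'prompt': 'a red bicycle leaning against a brick wall'},
    {'id': 1, 'prompt': 'two cats sleeping on a windowsill at sunset'},
]

# One JSON object per line; 'default.jsonl' matches the registered subset_list.
with open('default.jsonl', 'w', encoding='utf-8') as f:
    for row in rows:
        f.write(json.dumps(row, ensure_ascii=False) + '\n')
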
evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py ADDED
@@ -0,0 +1,57 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ import os.path
+ from collections import defaultdict
+ from typing import List, Optional, Union
+
+ from evalscope.benchmarks import Benchmark
+ from evalscope.constants import OutputType
+ from evalscope.utils.io_utils import jsonl_to_list
+ from evalscope.utils.logger import get_logger
+ from .base import T2IBaseAdapter
+
+ logger = get_logger()
+
+
+ @Benchmark.register(
+     name='hpdv2',
+     dataset_id='AI-ModelScope/T2V-Eval-Prompts',
+     model_adapter=OutputType.IMAGE_GENERATION,
+     output_types=[OutputType.IMAGE_GENERATION],
+     subset_list=['HPDv2'],
+     metric_list=['HPSv2.1Score'],
+     few_shot_num=0,
+     train_split=None,
+     eval_split='test',
+ )
+ class HPDv2Adapter(T2IBaseAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+     def load(self, **kwargs) -> dict:
+         if os.path.isfile(self.dataset_id):
+             data_list = jsonl_to_list(self.dataset_id)
+             data_dict = {self.subset_list[0]: {'test': data_list}}
+             return data_dict
+         else:
+             return super().load(**kwargs)
+
+     def get_gold_answer(self, input_d: dict) -> dict:
+         # return prompt and elements dict
+         return {'prompt': input_d.get('prompt'), 'tags': input_d.get('tags', {})}
+
+     def match(self, gold: dict, pred: str) -> dict:
+         # dummy match for general t2i
+         # pred is the image path, gold is the prompt
+         res = {}
+         for metric_name, metric_func in self.metrics.items():
+             score = metric_func(images=[pred], texts=[gold['prompt']])[0][0]
+
+             res[metric_name] = score.cpu().item()
+
+             # fine-granular metrics
+             category = gold['tags'].get('category')
+             if category:
+                 res[f'{metric_name}_{category}'] = score.cpu().item()
+
+         return res
evalscope/benchmarks/aigc/t2i/tifa_adapter.py ADDED
@@ -0,0 +1,37 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ import os.path
+ from collections import defaultdict
+ from typing import List, Optional, Union
+
+ from evalscope.benchmarks import Benchmark
+ from evalscope.constants import OutputType
+ from evalscope.utils.io_utils import jsonl_to_list
+ from evalscope.utils.logger import get_logger
+ from .base import T2IBaseAdapter
+
+ logger = get_logger()
+
+
+ @Benchmark.register(
+     name='tifa160',
+     dataset_id='AI-ModelScope/T2V-Eval-Prompts',
+     model_adapter=OutputType.IMAGE_GENERATION,
+     output_types=[OutputType.IMAGE_GENERATION],
+     subset_list=['TIFA-160'],
+     metric_list=['PickScore'],
+     few_shot_num=0,
+     train_split=None,
+     eval_split='test',
+ )
+ class TIFA_Adapter(T2IBaseAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+     def load(self, **kwargs) -> dict:
+         if os.path.isfile(self.dataset_id):
+             data_list = jsonl_to_list(self.dataset_id)
+             data_dict = {self.subset_list[0]: {'test': data_list}}
+             return data_dict
+         else:
+             return super().load(**kwargs)
evalscope/benchmarks/aime/aime24_adapter.py CHANGED
@@ -1,6 +1,6 @@
  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import OutputType
- from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
+ from evalscope.metrics import extract_answer, math_equal, strip_answer_string
  from evalscope.utils.logger import get_logger

  # flake8: noqa
evalscope/benchmarks/aime/aime25_adapter.py CHANGED
@@ -1,6 +1,6 @@
  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import OutputType
- from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
+ from evalscope.metrics import extract_answer, math_equal, strip_answer_string
  from evalscope.utils.logger import get_logger

  # flake8: noqa
@@ -11,12 +11,12 @@ logger = get_logger()
  @Benchmark.register(
      name='aime25',
      pretty_name='AIME-2025',
-     dataset_id='TIGER-Lab/AIME25',
-     subset_list=['default'],
+     dataset_id='opencompass/AIME2025',
+     subset_list=['AIME2025-I', 'AIME2025-II'],
      metric_list=['AveragePass@1'],
      few_shot_num=0,
      train_split=None,
-     eval_split='train',  # Only train set is available
+     eval_split='test',  # Only train set is available
      prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
  )
  class AIME25Adapter(DataAdapter):
evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py CHANGED
@@ -3,8 +3,7 @@ from collections import defaultdict
  from typing import Any, List

  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.metrics import Metric, mean, metric_registry
- from evalscope.metrics.llm_judge import LLMJudge
+ from evalscope.metrics import LLMJudge, Metric, mean, metric_registry
  from evalscope.utils.logger import get_logger

  # flake8: noqa
evalscope/benchmarks/arc/arc_adapter.py CHANGED
@@ -18,7 +18,7 @@ logger = get_logger()
      name='arc',
      pretty_name='ARC',
      dataset_id='modelscope/ai2_arc',
-     model_adapter=OutputType.MULTIPLE_CHOICE,
+     model_adapter=OutputType.GENERATION,
      output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
      subset_list=['ARC-Easy', 'ARC-Challenge'],
      metric_list=['AverageAccuracy'],
evalscope/benchmarks/arena_hard/arena_hard_adapter.py CHANGED
@@ -3,9 +3,7 @@ from collections import defaultdict
  from typing import Any, List

  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import AnswerKeys
- from evalscope.metrics import Metric, mean, metric_registry
- from evalscope.metrics.llm_judge import LLMJudge
+ from evalscope.metrics import LLMJudge, Metric, mean, metric_registry
  from evalscope.utils.logger import get_logger

  # flake8: noqa
evalscope/benchmarks/ceval/ceval_adapter.py CHANGED
@@ -4,7 +4,7 @@ import os

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType, OutputType
- from evalscope.metrics.metrics import exact_match
+ from evalscope.metrics import exact_match
  from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger

@@ -127,7 +127,7 @@ SUBJECT_MAPPING = {
      name='ceval',
      pretty_name='C-Eval',
      dataset_id='modelscope/ceval-exam',
-     model_adapter=OutputType.MULTIPLE_CHOICE,
+     model_adapter=OutputType.GENERATION,
      output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
      subset_list=SUBSET_LIST,
      metric_list=['AverageAccuracy'],
evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py CHANGED
@@ -1,10 +1,8 @@
  import re
- from collections import defaultdict
  from typing import Any, List

  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.metrics import Metric, mean, metric_registry
- from evalscope.metrics.llm_judge import LLMJudge
+ from evalscope.metrics import LLMJudge, Metric, mean, metric_registry
  from evalscope.utils.logger import get_logger

  # flake8: noqa
evalscope/benchmarks/cmmlu/cmmlu_adapter.py CHANGED
@@ -104,7 +104,7 @@ SUBJECT_MAPPING = {
      name='cmmlu',
      pretty_name='C-MMLU',
      dataset_id='modelscope/cmmlu',
-     model_adapter=OutputType.MULTIPLE_CHOICE,
+     model_adapter=OutputType.GENERATION,
      output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
      subset_list=SUBSET_LIST,
      metric_list=['AverageAccuracy'],
evalscope/benchmarks/competition_math/competition_math_adapter.py CHANGED
@@ -6,8 +6,7 @@ import os
  from collections import defaultdict

  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import AnswerKeys
- from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
+ from evalscope.metrics import extract_answer, math_equal, strip_answer_string
  from evalscope.utils.logger import get_logger

  # flake8: noqa
evalscope/benchmarks/data_adapter.py CHANGED
@@ -3,12 +3,11 @@ import os.path
  import random
  from abc import ABC, abstractmethod
  from collections import defaultdict
- from typing import Any, List, Optional, Union
+ from typing import Any, Dict, List, Optional, Union

  from evalscope.benchmarks.utils import PromptData, preprocess_decorator
  from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
- from evalscope.metrics.llm_judge import LLMJudge
- from evalscope.metrics.named_metrics import metric_registry
+ from evalscope.metrics import LLMJudge, metric_registry
  from evalscope.report import Report, ReportGenerator
  from evalscope.utils.logger import get_logger

@@ -24,6 +23,7 @@ class DataAdapter(ABC):
                   subset_list: list,
                   metric_list: List[str],
                   llm_as_a_judge: bool = False,
+                  output_types: Optional[List[str]] = None,
                   few_shot_num: Optional[int] = 0,
                   train_split: Optional[str] = None,
                   eval_split: Optional[str] = None,
@@ -63,6 +63,7 @@
          self.query_template = query_template
          self.pretty_name = pretty_name
          self.config_kwargs = kwargs
+         self.output_types = output_types or [model_adapter]
          self.llm_as_a_judge = llm_as_a_judge
          self.category_map = kwargs.get('category_map', {})
          self.choices = kwargs.get('choices', None)
@@ -190,7 +191,7 @@
          if self.few_shot_num and self.few_shot_num < 0:
              raise ValueError(f'Invalid shot_num: {self.few_shot_num} for few-shot evaluation.')

-         logger.info(f'Use default settings: '
+         logger.info(f'Use settings: '
                      f'> few_shot_num: {self.few_shot_num}, '
                      f'> few_shot_split: {self.train_split}, '
                      f'> target_eval_split: {self.eval_split}')
@@ -245,7 +246,8 @@
              res_list.append({'metric_name': metric_name, 'score': metric_func(review_res), 'num': len(review_res)})
          return res_list

-     def compute_dict_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
+     def compute_dict_metric(self, review_res_list: Union[List[dict], List[List[dict]]],
+                             **kwargs) -> Dict[str, List[float]]:
          """
          compute weighted mean of the bleu score of all samples

@@ -253,7 +255,7 @@
              review_res_list: [score1, score2, ...]

          Returns:
-             avg_res: List[dict]
+             avg_res: Dict[str, List[float]]

          """
          if isinstance(review_res_list[0], list):
@@ -318,11 +320,16 @@
                           prompt: str,
                           system_prompt: Optional[str] = None,
                           choices: Optional[List[str]] = None,
+                          index: Optional[Union[int, str]] = None,
+                          id: Optional[Union[int, str]] = None,
                           **kwargs) -> dict:
-         if not isinstance(prompt, list):
-             prompt = [prompt]
+         data = [prompt] if not isinstance(prompt, list) else prompt
          prompt_data = PromptData(
-             data=prompt, multi_choices=choices or self.choices, system_prompt=system_prompt or self.system_prompt)
+             data=data,
+             multi_choices=choices or self.choices,
+             system_prompt=system_prompt or self.system_prompt,
+             index=index or 0,
+             id=id)
          return prompt_data.to_dict()

      def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
evalscope/benchmarks/data_collection/data_collection_adapter.py CHANGED
@@ -48,14 +48,16 @@ class DataCollectionAdapter(DataAdapter):
          if len(dataset) == 0:
              raise ValueError(f'Local dataset is empty: {dataset_name_or_path}')
          else:
-             from modelscope.msdatasets import MsDataset
+             from modelscope import dataset_snapshot_download

              # Load dataset from remote
              logger.info(f'Loading dataset from {datasets_hub}: > dataset_name: {dataset_name_or_path}')

-             dataset = MsDataset.load(dataset_name=dataset_name_or_path, cache_dir=work_dir, hub=datasets_hub, **kwargs)
-
-             dataset = dataset[self.eval_split].to_list()
+             dataset_path = dataset_snapshot_download(
+                 dataset_name_or_path, cache_dir=work_dir, allow_file_pattern='*.jsonl')
+             # find the jsonl file
+             dataset_files = [os.path.join(dataset_path, f) for f in os.listdir(dataset_path) if f.endswith('.jsonl')]
+             dataset = jsonl_to_list(dataset_files[0])

          return dataset

evalscope/benchmarks/general_mcq/general_mcq_adapter.py CHANGED
@@ -4,7 +4,7 @@ import os

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType, OutputType
- from evalscope.metrics.metrics import exact_match
+ from evalscope.metrics import exact_match
  from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger

@@ -17,7 +17,7 @@ logger = get_logger()
      name='general_mcq',
      pretty_name='General MCQ',
      dataset_id='general_mcq',
-     model_adapter=OutputType.MULTIPLE_CHOICE,
+     model_adapter=OutputType.GENERATION,
      output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
      subset_list=['default'],
      metric_list=['AverageAccuracy'],