evalscope 0.14.0__py3-none-any.whl → 0.15.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (181)
  1. evalscope/arguments.py +2 -1
  2. evalscope/benchmarks/__init__.py +2 -2
  3. evalscope/benchmarks/aigc/__init__.py +0 -0
  4. evalscope/benchmarks/aigc/t2i/__init__.py +0 -0
  5. evalscope/benchmarks/aigc/t2i/base.py +56 -0
  6. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +77 -0
  7. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +58 -0
  8. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +58 -0
  9. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +57 -0
  10. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +37 -0
  11. evalscope/benchmarks/aime/aime24_adapter.py +1 -1
  12. evalscope/benchmarks/aime/aime25_adapter.py +4 -4
  13. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -2
  14. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  15. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -3
  16. evalscope/benchmarks/ceval/ceval_adapter.py +2 -2
  17. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -3
  18. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +1 -1
  19. evalscope/benchmarks/competition_math/competition_math_adapter.py +1 -2
  20. evalscope/benchmarks/data_adapter.py +16 -9
  21. evalscope/benchmarks/data_collection/data_collection_adapter.py +6 -4
  22. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +2 -2
  23. evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -3
  24. evalscope/benchmarks/live_code_bench/evaluate_utils.py +16 -21
  25. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
  26. evalscope/benchmarks/live_code_bench/testing_util.py +6 -3
  27. evalscope/benchmarks/math_500/math_500_adapter.py +1 -1
  28. evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -1
  29. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -2
  30. evalscope/benchmarks/utils.py +7 -16
  31. evalscope/cli/start_app.py +1 -1
  32. evalscope/collections/evaluator.py +16 -4
  33. evalscope/config.py +7 -3
  34. evalscope/constants.py +11 -0
  35. evalscope/evaluator/evaluator.py +9 -3
  36. evalscope/evaluator/reviewer/auto_reviewer.py +1 -1
  37. evalscope/metrics/__init__.py +49 -4
  38. evalscope/metrics/llm_judge.py +1 -1
  39. evalscope/metrics/named_metrics.py +13 -0
  40. evalscope/metrics/t2v_metrics/__init__.py +66 -0
  41. evalscope/metrics/t2v_metrics/clipscore.py +14 -0
  42. evalscope/metrics/t2v_metrics/constants.py +12 -0
  43. evalscope/metrics/t2v_metrics/itmscore.py +14 -0
  44. evalscope/metrics/t2v_metrics/models/__init__.py +0 -0
  45. evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py +30 -0
  46. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/__init__.py +0 -0
  47. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py +6 -0
  48. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +132 -0
  49. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +286 -0
  50. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +114 -0
  51. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +86 -0
  52. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +85 -0
  53. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +62 -0
  54. evalscope/metrics/t2v_metrics/models/itmscore_models/__init__.py +26 -0
  55. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +84 -0
  56. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +97 -0
  57. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +171 -0
  58. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/__init__.py +0 -0
  59. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +80 -0
  60. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +73 -0
  61. evalscope/metrics/t2v_metrics/models/model.py +45 -0
  62. evalscope/metrics/t2v_metrics/models/utils.py +25 -0
  63. evalscope/metrics/t2v_metrics/models/vqascore_models/__init__.py +22 -0
  64. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/__init__.py +0 -0
  65. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/__init__.py +1 -0
  66. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +300 -0
  67. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/builder.py +12 -0
  68. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +82 -0
  69. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_projector/builder.py +50 -0
  70. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +218 -0
  71. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +150 -0
  72. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/__init__.py +26 -0
  73. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +465 -0
  74. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +141 -0
  75. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +22 -0
  76. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +188 -0
  77. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +106 -0
  78. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +307 -0
  79. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +416 -0
  80. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +8 -0
  81. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +191 -0
  82. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +318 -0
  83. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/default.yaml +10 -0
  84. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml +42 -0
  85. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml +42 -0
  86. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml +42 -0
  87. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_coco.yaml +36 -0
  88. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml +43 -0
  89. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml +43 -0
  90. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml +43 -0
  91. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml +43 -0
  92. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain.yaml +36 -0
  93. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml +42 -0
  94. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_no_prefix.yaml +42 -0
  95. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_iter_80k_total_100k_prefix.yaml +42 -0
  96. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml +43 -0
  97. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml +42 -0
  98. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml +42 -0
  99. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml +42 -0
  100. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml +37 -0
  101. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna13b.yaml +43 -0
  102. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/blip2/blip2_vicuna7b.yaml +43 -0
  103. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config.json +21 -0
  104. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_config_albef.json +22 -0
  105. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/configs/models/med_large_config.json +21 -0
  106. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +208 -0
  107. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/base_model.py +231 -0
  108. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +1093 -0
  109. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/__init__.py +0 -0
  110. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2.py +211 -0
  111. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_image_text_matching.py +109 -0
  112. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +452 -0
  113. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +364 -0
  114. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +755 -0
  115. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +273 -0
  116. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +880 -0
  117. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +1844 -0
  118. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +81 -0
  119. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +56 -0
  120. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_caption.py +212 -0
  121. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_classification.py +164 -0
  122. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_feature_extractor.py +202 -0
  123. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +185 -0
  124. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +178 -0
  125. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +112 -0
  126. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_pretrain.py +371 -0
  127. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +344 -0
  128. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +858 -0
  129. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +271 -0
  130. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +503 -0
  131. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +1270 -0
  132. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +473 -0
  133. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +31 -0
  134. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/base_processor.py +27 -0
  135. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/blip_processors.py +233 -0
  136. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +392 -0
  137. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +127 -0
  138. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +17 -0
  139. evalscope/metrics/t2v_metrics/score.py +78 -0
  140. evalscope/metrics/t2v_metrics/vqascore.py +14 -0
  141. evalscope/models/__init__.py +50 -14
  142. evalscope/models/adapters/__init__.py +17 -0
  143. evalscope/models/{base_adapter.py → adapters/base_adapter.py} +17 -17
  144. evalscope/models/{chat_adapter.py → adapters/chat_adapter.py} +10 -7
  145. evalscope/models/{choice_adapter.py → adapters/choice_adapter.py} +2 -6
  146. evalscope/models/{custom_adapter.py → adapters/custom_adapter.py} +2 -4
  147. evalscope/models/{server_adapter.py → adapters/server_adapter.py} +1 -3
  148. evalscope/models/adapters/t2i_adapter.py +76 -0
  149. evalscope/models/custom/__init__.py +2 -1
  150. evalscope/models/custom/dummy_model.py +11 -13
  151. evalscope/models/local_model.py +82 -33
  152. evalscope/models/model.py +2 -42
  153. evalscope/models/register.py +26 -0
  154. evalscope/perf/benchmark.py +4 -3
  155. evalscope/perf/main.py +4 -2
  156. evalscope/perf/plugin/datasets/flickr8k.py +2 -1
  157. evalscope/perf/utils/benchmark_util.py +2 -2
  158. evalscope/perf/utils/db_util.py +16 -8
  159. evalscope/report/__init__.py +1 -0
  160. evalscope/report/app.py +117 -67
  161. evalscope/report/app_arguments.py +11 -0
  162. evalscope/report/generator.py +1 -1
  163. evalscope/run.py +3 -3
  164. evalscope/third_party/thinkbench/eval.py +19 -7
  165. evalscope/utils/chat_service.py +2 -2
  166. evalscope/utils/import_utils.py +66 -0
  167. evalscope/utils/utils.py +12 -4
  168. evalscope/version.py +2 -2
  169. {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/METADATA +20 -3
  170. {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/RECORD +178 -66
  171. tests/aigc/__init__.py +1 -0
  172. tests/aigc/test_t2i.py +87 -0
  173. tests/cli/test_run.py +20 -7
  174. tests/perf/test_perf.py +6 -3
  175. evalscope/metrics/code_metric.py +0 -98
  176. evalscope/metrics/resources/gpt2-zhcn3-v4.bpe +0 -58485
  177. evalscope/metrics/resources/gpt2-zhcn3-v4.json +0 -1
  178. {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/LICENSE +0 -0
  179. {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/WHEEL +0 -0
  180. {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/entry_points.txt +0 -0
  181. {evalscope-0.14.0.dist-info → evalscope-0.15.1.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/live_code_bench/evaluate_utils.py CHANGED
@@ -2,7 +2,6 @@ import json
  import multiprocessing
  import numpy as np
  from collections import defaultdict
- from concurrent.futures import ProcessPoolExecutor, as_completed

  from evalscope.utils.logger import get_logger
  from .pass_k_utils import compute_metrics_from_results
@@ -31,7 +30,10 @@ def codegen_check_correctness(sample, generation, timeout, debug=True):
  args=(sample, generation, debug, result, metadata_list, timeout),
  )
  p.start()
- p.join(timeout=(timeout + 1) * len(json.loads(sample['input_output'])['inputs']) + 5)
+ global_timeout = (timeout + 1) * len(json.loads(sample['input_output'])['inputs'])
+ if debug:
+ logger.info(f'global timeout = {global_timeout}')
+ p.join(timeout=global_timeout)
  if p.is_alive():
  p.kill()
  if not result:
@@ -39,7 +41,7 @@ def codegen_check_correctness(sample, generation, timeout, debug=True):
  # consider that all tests failed
  result = [[-1 for i in range(len(in_outs['inputs']))]]
  if debug:
- logger.info('global timeout')
+ logger.info('global timeout occured: alarm went off')
  return result[0], metadata_list[0]


@@ -99,7 +101,7 @@ def evaluate_generations(
  samples_list: list,
  generations_list: list[list[str]],
  debug: bool = False,
- num_process_evaluate: int = 16, # This parameter will be unused
+ num_process_evaluate: int = 16, # This parameter will be unused
  timeout=6,
  ):
  """We take the list of code generations and try to compile them and the run
@@ -117,26 +119,19 @@ def evaluate_generations(
  [-2] = compile error, [-1] = runtime error [False] = failed test
  case [True] = passed test case
  """
+ results = {}
+ metadata = {}

- # generations are code generations in the same order of the dataset
-
- inputs = [[(generations_list[index], samples_list[index], debug, timeout), index]
- for index in range(len(generations_list))]
-
- with ProcessPoolExecutor(max_workers=1 if debug else num_process_evaluate) as executor:
- futures = {
- executor.submit(evaluate_generations_by_problem, problem_generations, sample, debug, timeout): index
- for (problem_generations, sample, debug, timeout), index in inputs
- }
+ for index in range(len(generations_list)):
+ problem_generations = generations_list[index]
+ sample = samples_list[index]

- results = {}
- metadata = {}
- for future in as_completed(futures):
- index = futures[future]
- results[index], metadata[index] = future.result()
+ result, meta = evaluate_generations_by_problem(problem_generations, sample, debug, timeout)
+ results[index] = result
+ metadata[index] = meta

- assert len(results) == len(inputs), f'results = {len(results)} inputs = {len(inputs)} {results=}'
- # results = {i: r for r, (_, i) in zip(results, inputs)}
+ assert len(results) == len(
+ generations_list), f'results = {len(results)} inputs = {len(generations_list)} {results=}'

  return results, metadata

evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py CHANGED
@@ -18,7 +18,8 @@ logger = get_logger()
  extra_params={
  'start_date': None,
  'end_date': None,
- 'timeout': 6
+ 'timeout': 6,
+ 'debug': False
  },
  system_prompt=
  'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.', # noqa: E501
@@ -33,6 +34,7 @@ class LiveCodeBenchAdapter(DataAdapter):
  extra_params = kwargs.get('extra_params', {})

  self.timeout = extra_params.get('timeout', 6)
+ self.debug = extra_params.get('debug', False)
  self.start_date = extra_params.get('start_date')
  self.end_date = extra_params.get('end_date')

@@ -84,5 +86,6 @@ class LiveCodeBenchAdapter(DataAdapter):
  k_list=[1],
  num_process_evaluate=1,
  timeout=self.timeout,
+ debug=self.debug,
  )
  return metrics['pass@1'] / 100 # convert to point scale
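
The adapter reads the new `debug` flag (alongside `timeout`) from `extra_params` and forwards it to the test runner. A minimal sketch of how these options might be wired up, assuming the usual top-level `TaskConfig`/`run_task` entry points and `dataset_args` plumbing; the model id is a placeholder:

```python
from evalscope import TaskConfig, run_task  # assumed top-level exports

task = TaskConfig(
    model='Qwen/Qwen2.5-Coder-7B-Instruct',   # placeholder model id
    datasets=['live_code_bench'],
    dataset_args={
        'live_code_bench': {
            'extra_params': {
                'start_date': None,
                'end_date': None,
                'timeout': 6,     # per-test timeout in seconds
                'debug': False,   # new in this release: verbose timeout logging
            },
        },
    },
)
run_task(task)
```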
evalscope/benchmarks/live_code_bench/testing_util.py CHANGED
@@ -12,6 +12,7 @@ import time
  from datetime import datetime
  from decimal import Decimal
  from enum import Enum
+ from functools import partial
  from io import StringIO
  # from pyext import RuntimeModule
  from types import ModuleType
@@ -46,8 +47,9 @@ class TimeoutException(Exception):
  pass


- def timeout_handler(signum, frame):
- logger.info('timeout occured: alarm went off')
+ def timeout_handler(debug, signum, frame):
+ if debug:
+ logger.info('timeout occured: alarm went off')
  raise TimeoutException


@@ -381,7 +383,8 @@ def run_test(sample, test=None, debug=False, timeout=6):
  if test(generated_code) is not None it'll try to run the code.
  otherwise it'll just return an input and output pair.
  """
- signal.signal(signal.SIGALRM, timeout_handler)
+ timeout_handler_wrapper = partial(timeout_handler, debug)
+ signal.signal(signal.SIGALRM, timeout_handler_wrapper)

  # Disable functionalities that can make destructive changes to the test.
  # max memory is set to 4GB
@@ -1,5 +1,5 @@
  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
+ from evalscope.metrics import extract_answer, math_equal, strip_answer_string
  from evalscope.utils.logger import get_logger

  # flake8: noqa
evalscope/benchmarks/mmlu/mmlu_adapter.py CHANGED
@@ -137,7 +137,7 @@ SUBJECT_MAPPING = {
  name='mmlu',
  pretty_name='MMLU',
  dataset_id='modelscope/mmlu',
- model_adapter=OutputType.MULTIPLE_CHOICE,
+ model_adapter=OutputType.GENERATION,
  output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
  subset_list=SUBSET_LIST,
  metric_list=['AverageAccuracy'],
@@ -263,6 +263,8 @@ class MMLUAdapter(DataAdapter):

  if include_answer:
  example += f"\nAnswer: {input_d['target']}\n\n"
+ else:
+ example += '\nAnswer: \n\n'

  return example
@@ -3,8 +3,7 @@ from collections import defaultdict
  from typing import Any, List

  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.metrics import Metric, mean, metric_registry
- from evalscope.metrics.llm_judge import LLMJudge
+ from evalscope.metrics import LLMJudge, Metric, mean, metric_registry
  from evalscope.utils.logger import get_logger

  # flake8: noqa
evalscope/benchmarks/utils.py CHANGED
@@ -1,6 +1,6 @@
- from dataclasses import dataclass
+ from dataclasses import asdict, dataclass
  from functools import wraps
- from typing import Dict, List, Optional
+ from typing import Dict, List, Optional, Union

  from evalscope.constants import EvalType
  from evalscope.utils.filters import Filter
@@ -9,30 +9,21 @@ from evalscope.utils.filters import Filter
  @dataclass
  class PromptData:
  data: List[str]
- index: Optional[int] = 0
+ index: Optional[Union[int, str]] = 0
  system_prompt: Optional[str] = None
  multi_choices: Optional[List[str]] = None
+ id: Optional[str] = None

  def to_dict(self) -> Dict:
- if self.multi_choices is None:
- return {
- 'data': self.data,
- 'index': self.index,
- 'system_prompt': self.system_prompt,
- }
- else:
- return {
- 'data': self.data,
- 'index': self.index,
- 'system_prompt': self.system_prompt,
- 'multi_choices': self.multi_choices,
- }
+ return {k: v for k, v in asdict(self).items() if v is not None}


  def preprocess_decorator(func):

  @wraps(func)
  def wrapper(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT):
+ if result is None:
+ result = ''
  filters = self.config_kwargs.get('filters', None)
  if filters:
  # Apply filters to the result
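
The rewritten `to_dict` drops every field that is still `None` via `dataclasses.asdict`, which also covers the new `id` field without special-casing. A standalone illustration of that behavior (not tied to evalscope internals):

```python
from dataclasses import asdict, dataclass
from typing import Dict, List, Optional, Union


@dataclass
class PromptData:
    data: List[str]
    index: Optional[Union[int, str]] = 0
    system_prompt: Optional[str] = None
    multi_choices: Optional[List[str]] = None
    id: Optional[str] = None

    def to_dict(self) -> Dict:
        # Only None values are dropped; index=0 is kept because 0 is not None.
        return {k: v for k, v in asdict(self).items() if v is not None}


print(PromptData(data=['What is 1 + 1?']).to_dict())
# -> {'data': ['What is 1 + 1?'], 'index': 0}
```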
evalscope/cli/start_app.py CHANGED
@@ -21,7 +21,7 @@ class StartAppCMD(CLICommand):
  def define_args(parsers: ArgumentParser):
  """ define args for create pipeline template command.
  """
- from evalscope.report.app import add_argument
+ from evalscope.report import add_argument

  parser = parsers.add_parser(StartAppCMD.name)
  add_argument(parser)
evalscope/collections/evaluator.py CHANGED
@@ -1,8 +1,10 @@
  import json
  import os
  import pandas as pd
+ import random
  from collections import defaultdict
  from concurrent.futures import ThreadPoolExecutor, as_completed
+ from copy import deepcopy
  from tabulate import tabulate
  from tqdm import tqdm
  from typing import List
@@ -10,7 +12,7 @@ from typing import List
  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.collections.sampler import DatasetEntry
  from evalscope.config import TaskConfig
- from evalscope.constants import AnswerKeys, DumpMode, EvalType
+ from evalscope.constants import AnswerKeys, DataCollection, DumpMode, EvalType
  from evalscope.evaluator import Evaluator
  from evalscope.models import initialize_model_adapter
  from evalscope.report import ReportGenerator
@@ -67,9 +69,10 @@ class EvaluatorCollection:
  def load(self) -> tuple[list[DatasetEntry], str]:
  dataset_name = os.path.splitext(os.path.basename(self.data_adapter.dataset_id))[0]
  raw_dataset = self.data_adapter.load()
- # limit the dataset
+ # random limit the dataset
  if self.task_cfg.limit:
- raw_dataset = raw_dataset[:self.task_cfg.limit]
+ raw_dataset = random.sample(raw_dataset,
+ self.task_cfg.limit) if len(raw_dataset) > self.task_cfg.limit else raw_dataset
  # index dataset
  datasets = []
  for sample in raw_dataset:
@@ -95,10 +98,17 @@ class EvaluatorCollection:

  def _initialize_evaluators(self):
  evaluators = {}
+ # load dataset args
+ dataset_args = deepcopy(self.task_cfg.dataset_args)
+ common_args = dataset_args.get(DataCollection.NAME, {})
  for dataset_name in self.dataset_name_map.keys():
  benchmark = Benchmark.get(dataset_name)
  model_adapter = initialize_model_adapter(self.task_cfg, benchmark, self.model)
- data_adapter = benchmark.get_data_adapter()
+ # update dataset args
+ cur_dataset_args = dataset_args.get(dataset_name, {})
+ cur_dataset_args.update(common_args)
+ # get data adapter
+ data_adapter = benchmark.get_data_adapter(cur_dataset_args)
  evaluators[dataset_name] = SimpleEvaluator(dataset_name, data_adapter, model_adapter, self.task_cfg,
  self.outputs)
  return evaluators
@@ -185,12 +195,14 @@ class EvaluatorCollection:
  index = answer.get(AnswerKeys.INDEX)
  answer_dict[index] = answer
  indices.add(index)
+
  data = []
  for sample in self.dataset:
  if sample.index not in indices:
  data.append(sample)
  data_map = self._init_name_map(data)

+ logger.info(f'Reuse from {pred_file_path}. Loaded {len(indices)} samples, remain {len(data)} samples.')
  return answer_dict, data, data_map
  return answer_dict, self.dataset, self.dataset_name_map
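
In `_initialize_evaluators`, collection-level dataset args are merged into each benchmark's own args last, so they take precedence. A standalone sketch of that merge order; the `'data_collection'` key stands in for `DataCollection.NAME`, whose actual value is not shown in this diff:

```python
from copy import deepcopy

dataset_args = {
    'data_collection': {'few_shot_num': 0},                      # collection-wide args (key name assumed)
    'gsm8k': {'few_shot_num': 4, 'local_path': '/data/gsm8k'},   # per-benchmark args
}
common_args = deepcopy(dataset_args).get('data_collection', {})
cur_dataset_args = dataset_args.get('gsm8k', {})
cur_dataset_args.update(common_args)   # collection-wide args win on conflicts
print(cur_dataset_args)  # {'few_shot_num': 0, 'local_path': '/data/gsm8k'}
```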
 
evalscope/config.py CHANGED
@@ -4,13 +4,12 @@ import copy
  import json
  import os
  from argparse import Namespace
- from collections import OrderedDict
  from dataclasses import dataclass, field
  from typing import Dict, List, Optional, Union

  from evalscope.constants import (DEFAULT_DATASET_CACHE_DIR, DEFAULT_WORK_DIR, EvalBackend, EvalStage, EvalType, HubType,
- JudgeStrategy, OutputType)
- from evalscope.models.custom import CustomModel
+ JudgeStrategy, ModelTask, OutputType)
+ from evalscope.models import CustomModel, DummyCustomModel
  from evalscope.utils import gen_hash
  from evalscope.utils.io_utils import dict_to_yaml, json_to_dict, yaml_to_dict
  from evalscope.utils.logger import get_logger
@@ -36,6 +35,7 @@ class TaskConfig:
  model: Union[str, 'CustomModel', None] = None
  model_id: Optional[str] = None
  model_args: Optional[Dict] = field(default_factory=lambda: DEFAULT_MODEL_ARGS | {})
+ model_task: Optional[str] = ModelTask.TEXT_GENERATION

  # Template-related arguments
  template_type: Optional[str] = None # Deprecated, will be removed in v1.0.0.
@@ -79,6 +79,10 @@ class TaskConfig:
  judge_model_args: Optional[Dict] = field(default_factory=lambda: {})

  def __post_init__(self):
+ if self.model is None:
+ self.model = DummyCustomModel()
+ self.eval_type = EvalType.CUSTOM
+
  if (not self.model_id) and self.model:
  if isinstance(self.model, CustomModel):
  self.model_id = self.model.config.get('model_id', 'custom_model')
evalscope/constants.py CHANGED
@@ -1,4 +1,9 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
+ # flake8: noqa
+ import os
+
+ os.environ['MODELSCOPE_LOG_LEVEL'] = '40' # Set default log level to ERROR
+
  from modelscope.utils.constant import DEFAULT_REPOSITORY_REVISION
  from modelscope.utils.file_utils import get_dataset_cache_root, get_model_cache_root

@@ -145,6 +150,7 @@ class OutputType:
  GENERATION = 'generation' # for text generation tasks and general tasks
  MULTIPLE_CHOICE = 'multiple_choice_logits' # for multiple choice tasks
  CONTINUOUS = 'continuous_logits' # for continuous tasks
+ IMAGE_GENERATION = 'image_generation' # for image generation tasks


  class EvalBackend:
@@ -164,3 +170,8 @@ class JudgeStrategy:
  RULE = 'rule'
  LLM = 'llm'
  LLM_RECALL = 'llm_recall'
+
+
+ class ModelTask:
+ TEXT_GENERATION = 'text_generation'
+ IMAGE_GENERATION = 'image_generation'
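
The new `ModelTask` constants pair with the `model_task` field and the `DummyCustomModel` fallback added to `TaskConfig` above. A minimal sketch of the resulting defaults; the dataset name is illustrative:

```python
from evalscope.config import TaskConfig
from evalscope.constants import EvalType, ModelTask

cfg = TaskConfig(datasets=['general_qa'])   # no model supplied
# __post_init__ substitutes a DummyCustomModel and marks the run as a custom eval.
assert cfg.eval_type == EvalType.CUSTOM
assert cfg.model_task == ModelTask.TEXT_GENERATION   # new default field
```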
evalscope/evaluator/evaluator.py CHANGED
@@ -66,7 +66,7 @@ class Evaluator(object):
  if self.task_cfg.judge_strategy == JudgeStrategy.RULE:
  self.judge = None
  else:
- from evalscope.metrics.llm_judge import LLMJudge
+ from evalscope.metrics import LLMJudge
  self.judge = LLMJudge(**self.task_cfg.judge_model_args)

  def load_dataset(self):
@@ -281,7 +281,7 @@ class Evaluator(object):
  os.makedirs(os.path.dirname(review_file_path), exist_ok=True)

  if self.use_cache and os.path.exists(review_file_path):
- logger.warning(f'Ignore use_cache={self.use_cache}, updating the review file: {review_file_path} ...')
+ logger.info(f'Updating the review file: {review_file_path} ...')
  os.remove(review_file_path)

  def process_single_review(answer_d):
@@ -317,6 +317,8 @@ class Evaluator(object):
  """

  review_res_list = []
+ max_choices = max(
+ len(review_d[AnswerKeys.CHOICES]) for review_d in reviews_list if review_d[ReviewKeys.REVIEWED])
  for review_d in reviews_list:
  if not review_d[ReviewKeys.REVIEWED]:
  logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
@@ -325,10 +327,14 @@ class Evaluator(object):
  if len(review_d[AnswerKeys.CHOICES]) == 0:
  logger.warning(f'No choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
  continue
- elif len(review_d[AnswerKeys.CHOICES]) == 1:
+ elif len(review_d[AnswerKeys.CHOICES]) == 1 and max_choices == 1:
  review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
  else:
  review_res = [choice[ReviewKeys.REVIEW][ReviewKeys.RESULT] for choice in review_d[AnswerKeys.CHOICES]]
+ if len(review_d[AnswerKeys.CHOICES]) < max_choices:
+ logger.warning(
+ f'Less choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}, '
+ f'max_choices is {max_choices}, but only {len(review_d[AnswerKeys.CHOICES])} choices found')

  review_res_list.append(review_res)
evalscope/evaluator/reviewer/auto_reviewer.py CHANGED
@@ -11,7 +11,7 @@ from functools import partial
  from typing import Any, List, Tuple

  from evalscope.constants import ArenaMode, EvalConfigKeys, FnCompletionParser, PositionBiasMitigation
- from evalscope.models.model import OpenAIModel
+ from evalscope.models import OpenAIModel
  from evalscope.utils import completion_parsers, random_seeded_choice
  from evalscope.utils.arena_utils import get_battle_pairs, merge_ques_ans, shuffle_pairwise_preferences
  from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list
evalscope/metrics/__init__.py CHANGED
@@ -1,5 +1,50 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
- from evalscope.metrics.metrics import (bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean,
- simple_f1_score, weighted_mean)
- from evalscope.metrics.named_metrics import *
- from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
+ from typing import TYPE_CHECKING
+
+ from evalscope.utils.import_utils import _LazyModule
+
+ if TYPE_CHECKING:
+ from .llm_judge import LLMJudge
+ from .math_parser import extract_answer, math_equal, strip_answer_string
+ from .metrics import (bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean, simple_f1_score,
+ weighted_mean)
+ from .named_metrics import Metric, metric_registry
+ from .rouge_metric import compute_rouge_score_one_sample_zh
+
+ else:
+ _import_structure = {
+ 'metrics': [
+ 'bleu_ngram_one_sample',
+ 'exact_match',
+ 'macro_mean',
+ 'mean',
+ 'micro_mean',
+ 'simple_f1_score',
+ 'weighted_mean',
+ ],
+ 'named_metrics': [
+ 'Metric',
+ 'metric_registry',
+ ],
+ 'rouge_metric': [
+ 'compute_rouge_score_one_sample_zh',
+ ],
+ 'llm_judge': [
+ 'LLMJudge',
+ ],
+ 'math_parser': [
+ 'extract_answer',
+ 'math_equal',
+ 'strip_answer_string',
+ ],
+ }
+
+ import sys
+
+ sys.modules[__name__] = _LazyModule(
+ __name__,
+ globals()['__file__'],
+ _import_structure,
+ module_spec=__spec__,
+ extra_objects={},
+ )
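
With the `_LazyModule` indirection, `evalscope.metrics` submodules are imported only when their exported names are first accessed. A minimal sketch, assuming the package and its optional dependencies are installed:

```python
import evalscope.metrics as metrics

# Attribute access triggers the real submodule import on demand.
judge_cls = metrics.LLMJudge        # loads evalscope.metrics.llm_judge
extract = metrics.extract_answer    # loads evalscope.metrics.math_parser
print(judge_cls.__name__, extract.__module__)
```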
evalscope/metrics/llm_judge.py CHANGED
@@ -54,7 +54,7 @@ class LLMJudge:
  self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
  self.generation_config = generation_config

- from evalscope.models.server_adapter import ServerModelAdapter
+ from evalscope.models import ServerModelAdapter

  # Initialize ServerModelAdapter
  self.server_adapter = ServerModelAdapter(api_url=self.api_url, model_id=self.model_id, api_key=self.api_key)
evalscope/metrics/named_metrics.py CHANGED
@@ -3,6 +3,8 @@ from functools import partial
  from typing import Callable, Dict

  from evalscope.metrics.metrics import mean, pass_at_k, weighted_mean
+ from evalscope.metrics.t2v_metrics import (blip2_score, clip_flant5_score, clip_score, fga_blip2_score, hpsv2_1_score,
+ hpsv2_score, image_reward_score, mps_score, pick_score)


  @dataclass
@@ -40,3 +42,14 @@ metric_registry.register(Metric(name='WeightedAverageBLEU', object=weighted_mean
  metric_registry.register(Metric(name='AveragePass@1', object=mean))
  for k in range(1, 17):
  metric_registry.register(Metric(name=f'Pass@{k}', object=partial(pass_at_k, k=k)))
+
+ # t2v_metrics
+ metric_registry.register(Metric(name='VQAScore', object=clip_flant5_score))
+ metric_registry.register(Metric(name='PickScore', object=pick_score))
+ metric_registry.register(Metric(name='CLIPScore', object=clip_score))
+ metric_registry.register(Metric(name='BLIPv2Score', object=blip2_score))
+ metric_registry.register(Metric(name='HPSv2Score', object=hpsv2_score))
+ metric_registry.register(Metric(name='HPSv2.1Score', object=hpsv2_1_score))
+ metric_registry.register(Metric(name='ImageRewardScore', object=image_reward_score))
+ metric_registry.register(Metric(name='FGA_BLIP2Score', object=fga_blip2_score))
+ metric_registry.register(Metric(name='MPS', object=mps_score))
evalscope/metrics/t2v_metrics/__init__.py ADDED
@@ -0,0 +1,66 @@
+ from __future__ import absolute_import, division, print_function
+
+ from .clipscore import CLIPScore, list_all_clipscore_models
+ from .constants import CACHE_DIR
+ from .itmscore import ITMScore, list_all_itmscore_models
+ from .vqascore import VQAScore, list_all_vqascore_models
+
+
+ def list_all_models():
+ return list_all_vqascore_models() + list_all_clipscore_models() + list_all_itmscore_models()
+
+
+ def get_score_model(model='clip-flant5-xxl', device='cuda', cache_dir=CACHE_DIR, **kwargs):
+ if model in list_all_vqascore_models():
+ return VQAScore(model, device=device, cache_dir=cache_dir, **kwargs)
+ elif model in list_all_clipscore_models():
+ return CLIPScore(model, device=device, cache_dir=cache_dir, **kwargs)
+ elif model in list_all_itmscore_models():
+ return ITMScore(model, device=device, cache_dir=cache_dir, **kwargs)
+ else:
+ raise NotImplementedError()
+
+
+ def clip_flant5_score():
+ clip_flant5_score = VQAScore(model='clip-flant5-xxl')
+ return clip_flant5_score
+
+
+ def pick_score():
+ pick_score = CLIPScore(model='pickscore-v1')
+ return pick_score
+
+
+ def clip_score():
+ clip_score = CLIPScore(model='openai:ViT-L-14-336')
+ return clip_score
+
+
+ def blip2_score():
+ blip_itm_score = ITMScore(model='blip2-itm')
+ return blip_itm_score
+
+
+ def hpsv2_score():
+ hpsv2_score = CLIPScore(model='hpsv2')
+ return hpsv2_score
+
+
+ def hpsv2_1_score():
+ hpsv2_1_score = CLIPScore(model='hpsv2.1')
+ return hpsv2_1_score
+
+
+ def image_reward_score():
+ image_reward_score = ITMScore(model='image-reward-v1')
+ return image_reward_score
+
+
+ def fga_blip2_score():
+ fga_blip2_score = ITMScore(model='fga_blip2')
+ return fga_blip2_score
+
+
+ def mps_score():
+ mps_score = CLIPScore(model='mps')
+ return mps_score
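
The factory functions above return ready-to-use scorers; `get_score_model` dispatches by model name across the VQA, CLIP, and ITM families. A hedged usage sketch: only `get_score_model`'s signature comes from this diff, the `images=`/`texts=` call convention is assumed from the upstream t2v_metrics project, and weights are downloaded on first use:

```python
from evalscope.metrics.t2v_metrics import get_score_model

scorer = get_score_model(model='pickscore-v1', device='cuda')
# Call convention assumed from upstream t2v_metrics; paths and prompt are illustrative.
scores = scorer(images=['outputs/sample_0.png'],
                texts=['a red bicycle leaning against a brick wall'])
print(scores)
```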
evalscope/metrics/t2v_metrics/clipscore.py ADDED
@@ -0,0 +1,14 @@
+ from typing import List
+
+ from .constants import CACHE_DIR
+ from .models.clipscore_models import get_clipscore_model, list_all_clipscore_models
+ from .score import Score
+
+
+ class CLIPScore(Score):
+
+ def prepare_scoremodel(self, model='openai:ViT-L/14', device='cuda', cache_dir=CACHE_DIR):
+ return get_clipscore_model(model, device=device, cache_dir=cache_dir)
+
+ def list_all_models(self) -> List[str]:
+ return list_all_clipscore_models()
evalscope/metrics/t2v_metrics/constants.py ADDED
@@ -0,0 +1,12 @@
+ import os
+ from modelscope.utils.file_utils import get_model_cache_root
+
+ CACHE_DIR = get_model_cache_root()
+ os.environ['TORCH_HOME'] = CACHE_DIR # set timm cache dir
+
+ # For CLIP-FlanT5
+ CONTEXT_LEN = 2048
+ SYSTEM_MSG = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions."
+ IGNORE_INDEX = -100
+ IMAGE_TOKEN_INDEX = -200
+ DEFAULT_IMAGE_TOKEN = '<image>'
evalscope/metrics/t2v_metrics/itmscore.py ADDED
@@ -0,0 +1,14 @@
+ from typing import List
+
+ from .constants import CACHE_DIR
+ from .models.itmscore_models import get_itmscore_model, list_all_itmscore_models
+ from .score import Score
+
+
+ class ITMScore(Score):
+
+ def prepare_scoremodel(self, model='blip2-itm', device='cuda', cache_dir=CACHE_DIR):
+ return get_itmscore_model(model, device=device, cache_dir=cache_dir)
+
+ def list_all_models(self) -> List[str]:
+ return list_all_itmscore_models()
evalscope/metrics/t2v_metrics/models/__init__.py (file without changes)
evalscope/metrics/t2v_metrics/models/clipscore_models/__init__.py ADDED
@@ -0,0 +1,30 @@
+ from ...constants import CACHE_DIR
+ from .clip_model import CLIP_MODELS, CLIPScoreModel
+ from .hpsv2_model import HPSV2_MODELS, HPSV2ScoreModel
+ from .mps_model import MPS_MODELS, MPSModel
+ from .pickscore_model import PICKSCORE_MODELS, PickScoreModel
+
+ ALL_CLIP_MODELS = [
+ CLIP_MODELS,
+ HPSV2_MODELS,
+ PICKSCORE_MODELS,
+ MPS_MODELS,
+ ]
+
+
+ def list_all_clipscore_models():
+ return [model for models in ALL_CLIP_MODELS for model in models]
+
+
+ def get_clipscore_model(model_name, device='cuda', cache_dir=CACHE_DIR):
+ assert model_name in list_all_clipscore_models()
+ if model_name in CLIP_MODELS:
+ return CLIPScoreModel(model_name, device=device, cache_dir=cache_dir)
+ elif model_name in HPSV2_MODELS:
+ return HPSV2ScoreModel(model_name, device=device, cache_dir=cache_dir)
+ elif model_name in PICKSCORE_MODELS:
+ return PickScoreModel(model_name, device=device, cache_dir=cache_dir)
+ elif model_name in MPS_MODELS:
+ return MPSModel(model_name, device=device, cache_dir=cache_dir)
+ else:
+ raise NotImplementedError()
evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/base_model.py ADDED
@@ -0,0 +1,6 @@
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class BaseModelConfig:
+ pass