evalscope-0.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. evalscope/__init__.py +3 -0
  2. evalscope/backend/__init__.py +3 -0
  3. evalscope/backend/base.py +27 -0
  4. evalscope/backend/opencompass/__init__.py +3 -0
  5. evalscope/backend/opencompass/api_meta_template.py +64 -0
  6. evalscope/backend/opencompass/backend_manager.py +247 -0
  7. evalscope/backend/opencompass/tasks/__init__.py +1 -0
  8. evalscope/backend/opencompass/tasks/eval_api.py +30 -0
  9. evalscope/backend/opencompass/tasks/eval_datasets.py +71 -0
  10. evalscope/backend/vlm_eval_kit/__init__.py +1 -0
  11. evalscope/backend/vlm_eval_kit/backend_manager.py +153 -0
  12. evalscope/benchmarks/__init__.py +4 -0
  13. evalscope/benchmarks/arc/__init__.py +5 -0
  14. evalscope/benchmarks/arc/ai2_arc.py +148 -0
  15. evalscope/benchmarks/arc/arc_adapter.py +231 -0
  16. evalscope/benchmarks/bbh/__init__.py +6 -0
  17. evalscope/benchmarks/bbh/bbh_adapter.py +308 -0
  18. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +23 -0
  19. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +25 -0
  20. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +33 -0
  21. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +37 -0
  22. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +72 -0
  23. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +44 -0
  24. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +78 -0
  25. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +28 -0
  26. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +37 -0
  27. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +37 -0
  28. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +37 -0
  29. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +42 -0
  30. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +25 -0
  31. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +43 -0
  32. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +37 -0
  33. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +41 -0
  34. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +63 -0
  35. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +44 -0
  36. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +40 -0
  37. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +30 -0
  38. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +10 -0
  39. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +77 -0
  40. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +40 -0
  41. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +40 -0
  42. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +40 -0
  43. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +28 -0
  44. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +17 -0
  45. evalscope/benchmarks/benchmark.py +65 -0
  46. evalscope/benchmarks/ceval/__init__.py +5 -0
  47. evalscope/benchmarks/ceval/ceval_adapter.py +340 -0
  48. evalscope/benchmarks/ceval/ceval_exam.py +159 -0
  49. evalscope/benchmarks/cmmlu/__init__.py +5 -0
  50. evalscope/benchmarks/cmmlu/cmmlu.py +166 -0
  51. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +369 -0
  52. evalscope/benchmarks/competition_math/__init__.py +5 -0
  53. evalscope/benchmarks/competition_math/competition_math.py +88 -0
  54. evalscope/benchmarks/competition_math/competition_math_adapter.py +470 -0
  55. evalscope/benchmarks/data_adapter.py +263 -0
  56. evalscope/benchmarks/general_qa/__init__.py +5 -0
  57. evalscope/benchmarks/general_qa/general_qa_adapter.py +186 -0
  58. evalscope/benchmarks/gsm8k/__init__.py +5 -0
  59. evalscope/benchmarks/gsm8k/gsm8k.py +127 -0
  60. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +236 -0
  61. evalscope/benchmarks/hellaswag/__init__.py +5 -0
  62. evalscope/benchmarks/hellaswag/hellaswag.py +116 -0
  63. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +222 -0
  64. evalscope/benchmarks/humaneval/__init__.py +5 -0
  65. evalscope/benchmarks/humaneval/humaneval.py +82 -0
  66. evalscope/benchmarks/humaneval/humaneval_adapter.py +21 -0
  67. evalscope/benchmarks/mmlu/__init__.py +5 -0
  68. evalscope/benchmarks/mmlu/mmlu.py +174 -0
  69. evalscope/benchmarks/mmlu/mmlu_adapter.py +375 -0
  70. evalscope/benchmarks/race/__init__.py +5 -0
  71. evalscope/benchmarks/race/race.py +118 -0
  72. evalscope/benchmarks/race/race_adapter.py +229 -0
  73. evalscope/benchmarks/trivia_qa/__init__.py +5 -0
  74. evalscope/benchmarks/trivia_qa/trivia_qa.py +104 -0
  75. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +207 -0
  76. evalscope/benchmarks/truthful_qa/__init__.py +5 -0
  77. evalscope/benchmarks/truthful_qa/truthful_qa.py +167 -0
  78. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +351 -0
  79. evalscope/cache.py +98 -0
  80. evalscope/cli/__init__.py +1 -0
  81. evalscope/cli/base.py +20 -0
  82. evalscope/cli/cli.py +26 -0
  83. evalscope/cli/start_perf.py +37 -0
  84. evalscope/cli/start_server.py +138 -0
  85. evalscope/config.py +165 -0
  86. evalscope/constants.py +150 -0
  87. evalscope/evaluator/__init__.py +3 -0
  88. evalscope/evaluator/evaluator.py +689 -0
  89. evalscope/evaluator/rating_eval.py +178 -0
  90. evalscope/evaluator/reviewer/__init__.py +1 -0
  91. evalscope/evaluator/reviewer/auto_reviewer.py +411 -0
  92. evalscope/metrics/__init__.py +1 -0
  93. evalscope/metrics/bundled_rouge_score/__init__.py +14 -0
  94. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +342 -0
  95. evalscope/metrics/code_metric.py +104 -0
  96. evalscope/metrics/math_accuracy.py +60 -0
  97. evalscope/metrics/metrics.py +405 -0
  98. evalscope/metrics/rouge_metric.py +129 -0
  99. evalscope/models/__init__.py +4 -0
  100. evalscope/models/custom/__init__.py +4 -0
  101. evalscope/models/custom/custom_model.py +53 -0
  102. evalscope/models/dummy_chat_model.py +50 -0
  103. evalscope/models/model.py +88 -0
  104. evalscope/models/model_adapter.py +586 -0
  105. evalscope/models/openai_model.py +103 -0
  106. evalscope/models/template.py +1446 -0
  107. evalscope/perf/__init__.py +0 -0
  108. evalscope/perf/_logging.py +32 -0
  109. evalscope/perf/api_plugin_base.py +60 -0
  110. evalscope/perf/custom_api.py +87 -0
  111. evalscope/perf/dashscope_api.py +84 -0
  112. evalscope/perf/dataset_plugin_base.py +64 -0
  113. evalscope/perf/datasets/__init__.py +0 -0
  114. evalscope/perf/datasets/line_by_line.py +18 -0
  115. evalscope/perf/datasets/longalpaca_12k.py +20 -0
  116. evalscope/perf/datasets/openqa.py +22 -0
  117. evalscope/perf/how_to_analysis_result.py +24 -0
  118. evalscope/perf/http_client.py +756 -0
  119. evalscope/perf/openai_api.py +130 -0
  120. evalscope/perf/plugin_registry.py +35 -0
  121. evalscope/perf/query_parameters.py +42 -0
  122. evalscope/perf/server_sent_event.py +43 -0
  123. evalscope/preprocess/__init__.py +1 -0
  124. evalscope/preprocess/tokenizers/__init__.py +0 -0
  125. evalscope/preprocess/tokenizers/gpt2_tokenizer.py +221 -0
  126. evalscope/registry/__init__.py +1 -0
  127. evalscope/registry/tasks/arc.yaml +29 -0
  128. evalscope/registry/tasks/bbh.yaml +27 -0
  129. evalscope/registry/tasks/bbh_mini.yaml +27 -0
  130. evalscope/registry/tasks/ceval.yaml +27 -0
  131. evalscope/registry/tasks/ceval_mini.yaml +27 -0
  132. evalscope/registry/tasks/cmmlu.yaml +27 -0
  133. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +28 -0
  134. evalscope/registry/tasks/general_qa.yaml +27 -0
  135. evalscope/registry/tasks/gsm8k.yaml +29 -0
  136. evalscope/registry/tasks/mmlu.yaml +29 -0
  137. evalscope/registry/tasks/mmlu_mini.yaml +27 -0
  138. evalscope/run.py +404 -0
  139. evalscope/run_arena.py +204 -0
  140. evalscope/run_ms.py +140 -0
  141. evalscope/summarizer.py +144 -0
  142. evalscope/third_party/__init__.py +1 -0
  143. evalscope/third_party/toolbench_static/__init__.py +3 -0
  144. evalscope/third_party/toolbench_static/eval.py +219 -0
  145. evalscope/third_party/toolbench_static/infer.py +278 -0
  146. evalscope/third_party/toolbench_static/llm/__init__.py +1 -0
  147. evalscope/third_party/toolbench_static/llm/swift_infer.py +45 -0
  148. evalscope/third_party/toolbench_static/toolbench_static.py +50 -0
  149. evalscope/tools/__init__.py +1 -0
  150. evalscope/tools/combine_reports.py +140 -0
  151. evalscope/tools/gen_mmlu_subject_mapping.py +90 -0
  152. evalscope/tools/rewrite_eval_results.py +95 -0
  153. evalscope/utils/__init__.py +4 -0
  154. evalscope/utils/arena_utils.py +247 -0
  155. evalscope/utils/completion_parsers.py +87 -0
  156. evalscope/utils/logger.py +64 -0
  157. evalscope/utils/task_cfg_parser.py +10 -0
  158. evalscope/utils/task_utils.py +19 -0
  159. evalscope/utils/utils.py +625 -0
  160. evalscope/version.py +4 -0
  161. evalscope-0.5.0.dist-info/METADATA +566 -0
  162. evalscope-0.5.0.dist-info/RECORD +165 -0
  163. evalscope-0.5.0.dist-info/WHEEL +5 -0
  164. evalscope-0.5.0.dist-info/entry_points.txt +3 -0
  165. evalscope-0.5.0.dist-info/top_level.txt +1 -0
evalscope/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ from .version import __release_datetime__, __version__
evalscope/backend/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ from evalscope.backend.opencompass.backend_manager import OpenCompassBackendManager
evalscope/backend/base.py ADDED
@@ -0,0 +1,27 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from typing import Union
+
+ from evalscope.utils import yaml_to_dict
+
+
+ class BackendManager:
+     def __init__(self, config: Union[str, dict], **kwargs):
+         """
+         BackendManager is the base class for the evaluation backend manager.
+         It provides basic configuration parsing, command generation, task submission, and result fetching.
+
+         config: str or dict, the configuration of the evaluation backend.
+             Either a path to a YAML configuration file, or a dictionary.
+         """
+         if isinstance(config, str):
+             self.config_d = yaml_to_dict(config)
+         else:
+             self.config_d = config
+
+         self.kwargs = kwargs
+
+     def run(self, *args, **kwargs):
+         """
+         Run the evaluation backend.
+         """
+         raise NotImplementedError
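
A concrete backend subclasses BackendManager and overrides run(); a minimal sketch follows (the DummyBackendManager name is hypothetical, not part of the package):

    from evalscope.backend.base import BackendManager

    class DummyBackendManager(BackendManager):
        # Hypothetical subclass: the base __init__ parses a dict or YAML path into self.config_d
        def run(self, *args, **kwargs):
            print(f'Running with config: {self.config_d}')

    DummyBackendManager(config={'datasets': ['mmlu']}).run()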
evalscope/backend/opencompass/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ from evalscope.backend.opencompass.backend_manager import OpenCompassBackendManager
evalscope/backend/opencompass/api_meta_template.py ADDED
@@ -0,0 +1,64 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from typing import Dict, Any, List
+
+ """
+ The API meta template for OpenCompass.
+
+ See more details in the OpenCompass documentation: https://opencompass.org.cn/doc
+ Search for `meta template` in the documentation.
+ """
+
+
+ class MetaTemplateType:
+
+     default_api_meta_template_oc = 'default-api-meta-template-oc'
+
+     @classmethod
+     def get_template_name_list(cls) -> List[str]:
+         res = []
+         for k in cls.__dict__.keys():
+             if k.startswith('__') or k == 'get_template_name_list':
+                 continue
+             res.append(cls.__dict__[k])
+         return res
+
+
+ TEMPLATE_MAPPING: Dict[str, Dict[str, Any]] = {}
+
+
+ def register_template(name: str,
+                       template: Dict[str, Any],
+                       exists_ok: bool = False):
+     if not exists_ok and name in TEMPLATE_MAPPING:
+         raise ValueError(f"The `{name}` has already been registered in the TEMPLATE_MAPPING.")
+
+     TEMPLATE_MAPPING[name] = template
+
+
+ def get_template(name: str) -> Dict[str, Any]:
+     if name not in TEMPLATE_MAPPING:
+         raise ValueError(f"The `{name}` has not been registered in the TEMPLATE_MAPPING.")
+
+     return TEMPLATE_MAPPING[name]
+
+
+ # Default API meta template for OpenCompass
+ register_template(
+     name=MetaTemplateType.default_api_meta_template_oc,
+     template=dict(
+         round=[
+             dict(role='HUMAN', api_role='HUMAN'),
+             dict(role='BOT', api_role='BOT', generate=True)
+         ],
+         reserved_roles=[
+             dict(role='SYSTEM', api_role='SYSTEM'),
+         ],
+     )
+ )
+
+
+ if __name__ == '__main__':
+     res = MetaTemplateType.get_template_name_list()
+     print(res)
+
+     print(get_template(MetaTemplateType.default_api_meta_template_oc))
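
The register_template/get_template pair also supports user-defined templates; a minimal sketch, where the `my-api-meta-template` name and its role layout are hypothetical but mirror the round/reserved_roles structure registered above:

    from evalscope.backend.opencompass.api_meta_template import register_template, get_template

    # Hypothetical custom template, following the default round/reserved_roles shape
    register_template(
        name='my-api-meta-template',
        template=dict(
            round=[
                dict(role='HUMAN', api_role='HUMAN'),
                dict(role='BOT', api_role='BOT', generate=True),
            ],
            reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
        ),
    )

    print(get_template('my-api-meta-template'))

A model config can then reference a template by name through its 'meta_template' key, which OpenCompassBackendManager.run() below resolves via get_template().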
evalscope/backend/opencompass/backend_manager.py ADDED
@@ -0,0 +1,247 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from enum import Enum
+ from typing import Optional, Union
+ import subprocess
+ from dataclasses import asdict
+ import tempfile
+
+ from evalscope.utils import is_module_installed, get_module_path, get_valid_list
+ from evalscope.backend.base import BackendManager
+ from evalscope.backend.opencompass.api_meta_template import get_template
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ class CmdMode(Enum):
+
+     # The basic mode is to run the command directly,
+     # e.g. `python -m run --models model1 model2 --datasets dataset1 dataset2`
+     BASIC = 'basic'
+
+     # The script mode is to run the command with a script,
+     # e.g. `python -m run your_config_script.py`
+     SCRIPT = 'script'
+
+
+ class RunMode(Enum):
+
+     # The command mode is to run the command directly with the command line.
+     CMD = 'cmd'
+
+     # The function mode is to run the command with a function call -- run_task().
+     FUNCTION = 'function'
+
+
+ class OpenCompassBackendManager(BackendManager):
+
+     def __init__(self, config: Union[str, dict], **kwargs):
+         """
+         The backend manager for OpenCompass.
+
+         Args:
+             config: Union[str, dict], the configuration YAML file or the configuration dictionary.
+                 attributes:
+                     datasets: list, the datasets.
+                     models: list, the models.
+                     work_dir (Optional): str, the working directory. Defaults to None, which means the current directory.
+                     dry_run (Optional): bool, the dry-run flag. Defaults to False.
+                     debug (Optional): bool, the debug flag. Defaults to False.
+                     reuse (Optional): str, reuse previous outputs & results. Defaults to None.
+                     generation_kwargs (Optional): dict, the generation config. Defaults to {}.
+                     limit (Optional): int or float or str, the limit on the number of examples. Defaults to None.
+                         If limit is a string, it should be in the format of '[start:end]'.
+
+             example:
+                 # TODO: add demo config
+                 config = dict(
+                     datasets=[mmlu, ceval],
+                     models=[...],
+                     ...
+                 )
+
+             **kwargs: the keyword arguments.
+         """
+
+         self._check_env()
+         super().__init__(config, **kwargs)
+
+         from opencompass.cli.arguments import Arguments as OpenCompassArguments
+         self.args = OpenCompassArguments(**self.config_d)
+
+     @property
+     def cmd(self):
+         return self.get_cmd()
+
+     @staticmethod
+     def _check_env():
+         if is_module_installed('opencompass'):
+             logger.info('Please make sure you have installed the `ms-opencompass`: `pip install ms-opencompass`')
+         else:
+             raise ModuleNotFoundError('Please install the `ms-opencompass` first: `pip install ms-opencompass`')
+
+     @staticmethod
+     def get_restore_arg(arg_name: str, arg_val: bool):
+         if arg_val:
+             return f'--{arg_name}'
+         else:
+             return ''
+
+     @staticmethod
+     def get_arg_with_default(arg_name: str, arg_val: Optional[str] = None):
+         if arg_val:
+             return f'--{arg_name} {arg_val}'
+         else:
+             return ''
+
+     def load_task_template(self):
+         """
+         Load the initial OpenCompass task template from the task config file.
+
+         Returns:
+             (mmengine.config.config.Config), the initial task template config.
+         """
+         from opencompass.utils.run import get_config_from_arg
+
+         template_config_path = get_module_path('evalscope.backend.opencompass.tasks.eval_api')
+         self.args.config = template_config_path
+         return get_config_from_arg(self.args)
+
+     @staticmethod
+     def list_datasets(return_details: bool = False):
+         from opencompass.utils.run import get_config_from_arg
+         from dataclasses import dataclass
+
+         @dataclass
+         class TempArgs:
+             config: str
+             accelerator: str = None
+
+         template_config_path = get_module_path('evalscope.backend.opencompass.tasks.eval_api')
+         template_cfg = get_config_from_arg(TempArgs(config=template_config_path))
+
+         # e.g. ['mmlu', 'ceval', 'openai_humaneval', ...]
+         dataset_show_names = list(set([_dataset['dataset_name'] for _dataset in template_cfg.datasets]))
+
+         if return_details:
+             return dataset_show_names, template_cfg.datasets
+         else:
+             return dataset_show_names
+
+     def get_task_args(self):
+         return self.args
+
+     def get_cmd(self, cmd_mode: str = CmdMode.BASIC):
+
+         if cmd_mode == CmdMode.BASIC:
+             assert self.args.datasets, 'The datasets are required.'
+             assert self.args.models, 'The models are required.'
+
+             cmd_str = f'python -m run_oc ' \
+                       f'--models {" ".join(self.args.models)} ' \
+                       f'--datasets {" ".join(self.args.datasets)} ' \
+                       f'{self.get_restore_arg("dry-run", self.args.dry_run)} ' \
+                       f'{self.get_arg_with_default("work-dir", self.args.work_dir)}'
+
+         elif cmd_mode == CmdMode.SCRIPT:
+             assert self.args.config, 'The script file is required.'
+             cmd_str = f'python -m run_oc {self.args.config}'
+         else:
+             raise ValueError(f'Unsupported command mode: {cmd_mode}')
+
+         return cmd_str
+
+     def run(self, run_mode: str = RunMode.FUNCTION):
+         """
+         The entry function to run the OpenCompass task.
+
+         Args:
+             run_mode: str, the running mode, e.g. 'function' or 'cmd'.
+
+         Returns:
+             None
+         """
+         if run_mode == RunMode.FUNCTION:
+             from opencompass.cli.main import run_task
+             from opencompass.cli.arguments import ApiModelConfig
+
+             assert isinstance(self.args.models, list) and len(self.args.models) > 0, 'The models are required.'
+
+             tmp_model_d: dict = self.args.models[0]
+             assert 'path' in tmp_model_d and 'openai_api_base' in tmp_model_d, \
+                 f"Got invalid model config: {tmp_model_d}. \nTo get valid format: " \
+                 "{'path': 'qwen-7b-chat', 'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions'}"
+
+             # Get valid datasets
+             dataset_names = self.args.datasets  # e.g. ['mmlu', 'ceval']
+             dataset_names_all, real_dataset_all = self.list_datasets(return_details=True)
+
+             if not dataset_names:
+                 logger.warning(f'No datasets are specified in the config. Use all the datasets: {dataset_names_all}')
+                 valid_dataset_names = dataset_names_all
+             else:
+                 valid_dataset_names, invalid_dataset_names = get_valid_list(dataset_names, dataset_names_all)
+                 if len(invalid_dataset_names) > 0:
+                     logger.error(f'Invalid datasets: {invalid_dataset_names}, '
+                                  f'refer to the following list to get proper dataset name: {dataset_names_all}')
+                 assert len(valid_dataset_names) > 0, f'No valid datasets. ' \
+                     f'To get the valid datasets, please refer to {dataset_names_all}'
+
+             valid_datasets = [_dataset for _dataset in real_dataset_all if _dataset['dataset_name'] in valid_dataset_names]
+             for _dataset in valid_datasets:
+                 _dataset.pop('dataset_name')
+                 _dataset['reader_cfg']['test_range'] = self.args.limit
+
+             # Get valid models
+             models = []
+             for model_d in self.args.models:
+                 # model_d: {'path': 'qwen-7b-chat',
+                 #           'meta_template': 'default-api-meta-template-oc',  # Optional
+                 #           'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions'}
+                 # Note: 'meta_template' can be a dict or a string, default is None
+
+                 if 'meta_template' in model_d and isinstance(model_d['meta_template'], str):
+                     model_d['meta_template'] = get_template(model_d['meta_template'])
+
+                 # Use the 'path' value as the model 'abbr'
+                 model_d['abbr'] = model_d['path']
+
+                 model_config = ApiModelConfig(**model_d)
+                 models.append(asdict(model_config))
+
+             # Load the initial task template and override configs
+             template_cfg = self.load_task_template()
+             template_cfg.datasets = valid_datasets
+             template_cfg.models = models
+
+             # Dump task config to a temporary file
+             tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.py', mode='w')
+             template_cfg.dump(tmp_file.name)
+             # logger.info(f'The task config is dumped to: {tmp_file.name}')
+             self.args.config = tmp_file.name
+
+             # Submit the task
+             logger.info(f'*** Run task with config: {self.args.config} \n')
+             run_task(self.args)
+
+         # TODO: add more arguments for the command line
+         elif run_mode == RunMode.CMD:
+             subprocess.run(self.cmd, text=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+         else:
+             raise ValueError(f'Unsupported run mode: {run_mode}')
+
+
+ if __name__ == '__main__':
+
+     # OpenCompassBackendManager.list_datasets()
+     # ['mmlu', 'WSC', 'DRCD', 'chid', 'gsm8k', 'AX_g', 'BoolQ', 'cmnli', 'ARC_e', 'ocnli_fc', 'summedits', 'MultiRC', 'GaokaoBench', 'obqa', 'math', 'agieval', 'hellaswag', 'RTE', 'race', 'flores', 'ocnli', 'strategyqa', 'triviaqa', 'WiC', 'COPA', 'commonsenseqa', 'piqa', 'nq', 'mbpp', 'csl', 'Xsum', 'CB', 'tnews', 'ARC_c', 'afqmc', 'eprstmt', 'ReCoRD', 'bbh', 'TheoremQA', 'CMRC', 'AX_b', 'siqa', 'storycloze', 'humaneval', 'cluewsc', 'winogrande', 'lambada', 'ceval', 'bustm', 'C3', 'lcsts']
+
+     # 'meta_template': 'default-api-meta-template-oc',
+     # models: llama3-8b-instruct, qwen-7b-chat
+     oc_backend_manager = OpenCompassBackendManager(
+         config={'datasets': ['mmlu', 'ceval', 'ARC_c', 'gsm8k'],
+                 'models': [{'path': 'llama3-8b-instruct', 'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions'}],
+                 'limit': 5
+                 }
+     )
+     oc_backend_manager.run()
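
Besides the function mode shown in the `__main__` block, the manager can emit and execute a shell command; a minimal sketch, assuming string-form model names are acceptable to the underlying Arguments dataclass in CMD mode (the model name below is a placeholder):

    from evalscope.backend.opencompass.backend_manager import OpenCompassBackendManager, RunMode

    # Hypothetical string-form model names; get_cmd() joins models and datasets into a CLI string
    manager = OpenCompassBackendManager(config={'datasets': ['mmlu', 'gsm8k'],
                                                'models': ['qwen-7b-chat']})
    print(manager.cmd)  # e.g. 'python -m run_oc --models qwen-7b-chat --datasets mmlu gsm8k ...'
    manager.run(run_mode=RunMode.CMD)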
evalscope/backend/opencompass/tasks/__init__.py ADDED
@@ -0,0 +1 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
evalscope/backend/opencompass/tasks/eval_api.py ADDED
@@ -0,0 +1,30 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from mmengine.config import read_base
+ from opencompass.partitioners import NaivePartitioner
+ from opencompass.runners import LocalRunner
+ from opencompass.tasks import OpenICLInferTask
+
+
+ with read_base():
+     from evalscope.backend.opencompass.tasks.eval_datasets import datasets
+
+ # 1. Get datasets
+ # Note: evaluation in the OpenAI API format needs a special humaneval postprocessor
+ for _dataset in datasets:
+     if _dataset['path'] == 'openai_humaneval':
+         from opencompass.datasets.humaneval import humaneval_gpt_postprocess
+         _dataset['eval_cfg']['pred_postprocessor']['type'] = humaneval_gpt_postprocess
+
+
+ # 2. Get models: placeholder only; the real model information is filled in from the command line
+ # See more templates in `opencompass.cli.arguments.ApiModelConfig`
+ models = []
+
+ # 3. Get infer config
+ infer = dict(
+     partitioner=dict(type=NaivePartitioner),
+     runner=dict(
+         type=LocalRunner,
+         max_num_workers=4,
+         task=dict(type=OpenICLInferTask)),
+ )
evalscope/backend/opencompass/tasks/eval_datasets.py ADDED
@@ -0,0 +1,71 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from mmengine.config import read_base
+
+ with read_base():
+     from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets
+     from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
+     from opencompass.configs.datasets.agieval.agieval_gen_64afd3 import agieval_datasets
+     from opencompass.configs.datasets.GaokaoBench.GaokaoBench_gen_5cfe9e import GaokaoBench_datasets
+     from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
+     from opencompass.configs.datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
+     from opencompass.configs.datasets.CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets
+     from opencompass.configs.datasets.CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
+     from opencompass.configs.datasets.CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
+     from opencompass.configs.datasets.CLUE_afqmc.CLUE_afqmc_gen_901306 import afqmc_datasets
+     from opencompass.configs.datasets.CLUE_cmnli.CLUE_cmnli_gen_1abf97 import cmnli_datasets
+     from opencompass.configs.datasets.CLUE_ocnli.CLUE_ocnli_gen_c4cb6c import ocnli_datasets
+     from opencompass.configs.datasets.FewCLUE_bustm.FewCLUE_bustm_gen_634f41 import bustm_datasets
+     from opencompass.configs.datasets.FewCLUE_chid.FewCLUE_chid_gen_0a29a2 import chid_datasets
+     from opencompass.configs.datasets.FewCLUE_cluewsc.FewCLUE_cluewsc_gen_c68933 import cluewsc_datasets
+     from opencompass.configs.datasets.FewCLUE_csl.FewCLUE_csl_gen_28b223 import csl_datasets
+     from opencompass.configs.datasets.FewCLUE_eprstmt.FewCLUE_eprstmt_gen_740ea0 import eprstmt_datasets
+     from opencompass.configs.datasets.FewCLUE_ocnli_fc.FewCLUE_ocnli_fc_gen_f97a97 import ocnli_fc_datasets
+     from opencompass.configs.datasets.FewCLUE_tnews.FewCLUE_tnews_gen_b90e4a import tnews_datasets
+     from opencompass.configs.datasets.lcsts.lcsts_gen_8ee1fe import lcsts_datasets
+     from opencompass.configs.datasets.lambada.lambada_gen_217e11 import lambada_datasets
+     from opencompass.configs.datasets.storycloze.storycloze_gen_7f656a import storycloze_datasets
+     from opencompass.configs.datasets.SuperGLUE_AX_b.SuperGLUE_AX_b_gen_4dfefa import AX_b_datasets
+     from opencompass.configs.datasets.SuperGLUE_AX_g.SuperGLUE_AX_g_gen_68aac7 import AX_g_datasets
+     from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_gen_883d50 import BoolQ_datasets
+     from opencompass.configs.datasets.SuperGLUE_CB.SuperGLUE_CB_gen_854c6c import CB_datasets
+     from opencompass.configs.datasets.SuperGLUE_COPA.SuperGLUE_COPA_gen_91ca53 import COPA_datasets
+     from opencompass.configs.datasets.SuperGLUE_MultiRC.SuperGLUE_MultiRC_gen_27071f import MultiRC_datasets
+     from opencompass.configs.datasets.SuperGLUE_RTE.SuperGLUE_RTE_gen_68aac7 import RTE_datasets
+     from opencompass.configs.datasets.SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets
+     from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
+     from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets
+     from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
+     from opencompass.configs.datasets.Xsum.Xsum_gen_31397e import Xsum_datasets
+     from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
+     from opencompass.configs.datasets.summedits.summedits_gen_315438 import summedits_datasets
+     from opencompass.configs.datasets.math.math_gen_265cce import math_datasets
+     from opencompass.configs.datasets.hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets
+     from opencompass.configs.datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
+     from opencompass.configs.datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
+     from opencompass.configs.datasets.piqa.piqa_gen_1194eb import piqa_datasets
+     from opencompass.configs.datasets.siqa.siqa_gen_e78df3 import siqa_datasets
+     from opencompass.configs.datasets.strategyqa.strategyqa_gen_1180a7 import strategyqa_datasets
+     from opencompass.configs.datasets.winogrande.deprecated_winogrande_gen_a9ede5 import winogrande_datasets
+     from opencompass.configs.datasets.obqa.obqa_gen_9069e4 import obqa_datasets
+     from opencompass.configs.datasets.nq.nq_gen_c788f6 import nq_datasets
+     from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
+     from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import bbh_datasets
+
+     # Note: to be supported
+     # from opencompass.configs.datasets.flores.flores_gen_806ede import flores_datasets
+     # from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets
+     # from opencompass.configs.datasets.commonsenseqa.commonsenseqa_gen_c946f2 import commonsenseqa_datasets
+
+
+ datasets = []
+ _locals = {k: v for k, v in locals().items() if k.endswith('_datasets')}
+
+ for k, v in _locals.items():
+     for _dataset in v:
+         _dataset['dataset_name'] = k.replace('_datasets', '')
+         datasets.append(_dataset)
+
+
+ if __name__ == '__main__':
+     for _dataset in datasets:
+         print(_dataset)
evalscope/backend/vlm_eval_kit/__init__.py ADDED
@@ -0,0 +1 @@
+ from evalscope.backend.vlm_eval_kit.backend_manager import VLMEvalKitBackendManager
evalscope/backend/vlm_eval_kit/backend_manager.py ADDED
@@ -0,0 +1,153 @@
+ from typing import Optional, Union
+ from evalscope.utils import is_module_installed, get_module_path, get_valid_list, yaml_to_dict, json_to_dict
+ from evalscope.backend.base import BackendManager
+ from evalscope.utils.logger import get_logger
+ from functools import partial
+ import subprocess
+ from dataclasses import dataclass
+ import copy
+
+ logger = get_logger()
+
+
+ class ExecutionMode:
+
+     # The command mode is to run the command directly with the command line.
+     CMD = 'cmd'
+
+     # The function mode is to run the command with a function call -- run_task().
+     FUNCTION = 'function'
+
+
+ class VLMEvalKitBackendManager(BackendManager):
+     def __init__(self, config: Union[str, dict], **kwargs):
+         """BackendManager for the VLM Evaluation Kit
+
+         Args:
+             config (Union[str, dict]): the configuration YAML file or the configuration dictionary
+         """
+         self._check_env()
+         super().__init__(config, **kwargs)
+         from vlmeval.utils.arguments import Arguments as VLMEvalArguments
+         self.args = VLMEvalArguments(**self.config_d)
+
+         self.valid_models = self.list_supported_VLMs()
+         self.valid_model_names = list(self.valid_models.keys())
+         self.valid_datasets = self.list_supported_datasets()
+
+         self._check_valid()
+
+     def _check_valid(self):
+         # Ensure that both model and datasets are provided
+         if not self.args.data or not self.args.model:
+             raise ValueError('** Args: Please provide model and datasets. **')
+
+         # Check datasets
+         valid_datasets, invalid_datasets = get_valid_list(self.args.data, self.valid_datasets)
+         assert len(invalid_datasets) == 0, f'Invalid datasets: {invalid_datasets}, ' \
+             f'refer to the following list to get proper dataset name: {self.valid_datasets}'
+
+         # Check model
+         if isinstance(self.args.model[0], dict):
+             model_names = [model['name'] for model in self.args.model]
+             valid_model_names, invalid_model_names = get_valid_list(model_names, self.valid_model_names)
+             assert len(invalid_model_names) == 0, f'Invalid models: {invalid_model_names}, ' \
+                 f'refer to the following list to get proper model name: {self.valid_model_names}'
+
+             # set model_cfg
+             new_model_names = []
+             for model_cfg in self.args.model:
+                 model_name = model_cfg['name']
+                 model_class = self.valid_models[model_name]
+                 if model_name == 'CustomAPIModel':
+                     model_type = model_cfg['type']
+                     self.valid_models.update({
+                         model_type: partial(model_class,
+                                             model=model_type,
+                                             **model_cfg)
+                     })
+                     new_model_names.append(model_type)
+                 else:
+                     remain_cfg = copy.deepcopy(model_cfg)
+                     del remain_cfg['name']  # remove the unused arg
+
+                     self.valid_models[model_name] = partial(model_class, **remain_cfg)
+                     new_model_names.append(model_name)
+
+             self.args.model = new_model_names
+
+         elif isinstance(self.args.model[0], str):
+             valid_model_names, invalid_model_names = get_valid_list(self.args.model, self.valid_model_names)
+             assert len(invalid_model_names) == 0, f'Invalid models: {invalid_model_names}, ' \
+                 f'refer to the following list to get proper model name: {self.valid_model_names}'
+
+     @property
+     def cmd(self):
+         return self.get_cmd()
+
+     @staticmethod
+     def list_supported_VLMs():
+         from vlmeval.config import supported_VLM
+         return supported_VLM
+
+     @staticmethod
+     def list_supported_datasets():
+         from vlmeval.dataset import SUPPORTED_DATASETS
+         return SUPPORTED_DATASETS
+
+     @staticmethod
+     def _check_env():
+         if is_module_installed('vlmeval'):
+             logger.info('Please make sure you have installed the `ms-vlmeval`: `pip install ms-vlmeval`')
+         else:
+             raise ModuleNotFoundError('Please install the `ms-vlmeval` first: `pip install ms-vlmeval`')
+
+     @staticmethod
+     def get_restore_arg(arg_name: str, arg_val: bool):
+         if arg_val:
+             return f'--{arg_name}'
+         else:
+             return ''
+
+     @staticmethod
+     def get_arg_with_default(arg_name: str, arg_val: Optional[str] = None):
+         if arg_val:
+             return f'--{arg_name} {arg_val}'
+         else:
+             return ''
+
+     def get_cmd(self):
+         assert self.args.data, 'The datasets are required.'
+         assert self.args.model, 'The models are required.'
+
+         cmd_str = f'python -m vlmeval ' \
+                   f'--model {" ".join(self.args.model)} ' \
+                   f'--data {" ".join(self.args.data)} ' \
+                   f'{self.get_restore_arg("verbose", self.args.verbose)} ' \
+                   f'{self.get_restore_arg("ignore", self.args.ignore)} ' \
+                   f'{self.get_restore_arg("rerun", self.args.rerun)} ' \
+                   f'{self.get_arg_with_default("work-dir", self.args.work_dir)} ' \
+                   f'{self.get_arg_with_default("limit", self.args.limit)} ' \
+                   f'{self.get_arg_with_default("mode", self.args.mode)} ' \
+                   f'{self.get_arg_with_default("nproc", self.args.nproc)} ' \
+                   f'{self.get_arg_with_default("judge", self.args.judge)} ' \
+                   f'{self.get_arg_with_default("retry", self.args.retry)} '
+
+         return cmd_str
+
+     def run(self, run_mode: str = ExecutionMode.FUNCTION):
+         if run_mode == ExecutionMode.CMD:
+             logger.info(f'** Run command: {self.cmd}')
+             try:
+                 subprocess.run(self.cmd, check=True, text=True, shell=True)
+             except subprocess.CalledProcessError as e:
+                 logger.error(f'** Run command failed: {e.stderr}')
+                 raise
+
+         elif run_mode == ExecutionMode.FUNCTION:
+             from vlmeval.run import run_task
+             logger.info(f'*** Run task with config: {self.args} \n')
+             run_task(self.args)
+
+         else:
+             raise NotImplementedError
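
By analogy with the OpenCompass example, a VLMEvalKit run is configured with `data` and `model` lists; in the sketch below the 'CustomAPIModel' entry follows the dict shape `_check_valid()` handles, while the dataset name, model type, and extra API kwargs are placeholders:

    from evalscope.backend.vlm_eval_kit.backend_manager import VLMEvalKitBackendManager

    # 'CustomAPIModel' configs carry a 'type' (registered as the model name) plus extra kwargs
    manager = VLMEvalKitBackendManager(
        config={'data': ['MMBench_DEV_EN'],  # placeholder dataset name
                'model': [{'name': 'CustomAPIModel',
                           'type': 'qwen-vl-chat',
                           'api_base': 'http://127.0.0.1:8000/v1/chat/completions'}]})
    manager.run()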
evalscope/benchmarks/__init__.py ADDED
@@ -0,0 +1,4 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ from evalscope.benchmarks.benchmark import Benchmark
+ from evalscope.benchmarks.data_adapter import DataAdapter
evalscope/benchmarks/arc/__init__.py ADDED
@@ -0,0 +1,5 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ from evalscope.benchmarks.arc.arc_adapter import ARCAdapter, DATASET_ID, SUBSET_LIST
+ from evalscope.benchmarks.arc.arc_adapter import ARCAdapter as DataAdapterClass
+ from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass  # noqa
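
The DataAdapterClass/ModelAdapterClass aliases give each benchmark package a uniform interface, so adapters can be resolved from the module name alone; a minimal illustrative sketch (the dynamic lookup itself is not evalscope code):

    import importlib

    # Each benchmark __init__ exports the same two aliases bound to different concrete classes
    benchmark = importlib.import_module('evalscope.benchmarks.arc')
    data_adapter_cls = benchmark.DataAdapterClass    # ARCAdapter
    model_adapter_cls = benchmark.ModelAdapterClass  # MultiChoiceModelAdapter
    print(data_adapter_cls.__name__, model_adapter_cls.__name__)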