evalscope 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. evalscope/__init__.py +3 -0
  2. evalscope/backend/__init__.py +3 -0
  3. evalscope/backend/base.py +27 -0
  4. evalscope/backend/opencompass/__init__.py +3 -0
  5. evalscope/backend/opencompass/api_meta_template.py +64 -0
  6. evalscope/backend/opencompass/backend_manager.py +247 -0
  7. evalscope/backend/opencompass/tasks/__init__.py +1 -0
  8. evalscope/backend/opencompass/tasks/eval_api.py +30 -0
  9. evalscope/backend/opencompass/tasks/eval_datasets.py +71 -0
  10. evalscope/backend/vlm_eval_kit/__init__.py +1 -0
  11. evalscope/backend/vlm_eval_kit/backend_manager.py +153 -0
  12. evalscope/benchmarks/__init__.py +4 -0
  13. evalscope/benchmarks/arc/__init__.py +5 -0
  14. evalscope/benchmarks/arc/ai2_arc.py +148 -0
  15. evalscope/benchmarks/arc/arc_adapter.py +231 -0
  16. evalscope/benchmarks/bbh/__init__.py +6 -0
  17. evalscope/benchmarks/bbh/bbh_adapter.py +308 -0
  18. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +23 -0
  19. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +25 -0
  20. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +33 -0
  21. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +37 -0
  22. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +72 -0
  23. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +44 -0
  24. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +78 -0
  25. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +28 -0
  26. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +37 -0
  27. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +37 -0
  28. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +37 -0
  29. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +42 -0
  30. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +25 -0
  31. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +43 -0
  32. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +37 -0
  33. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +41 -0
  34. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +63 -0
  35. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +44 -0
  36. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +40 -0
  37. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +30 -0
  38. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +10 -0
  39. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +77 -0
  40. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +40 -0
  41. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +40 -0
  42. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +40 -0
  43. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +28 -0
  44. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +17 -0
  45. evalscope/benchmarks/benchmark.py +65 -0
  46. evalscope/benchmarks/ceval/__init__.py +5 -0
  47. evalscope/benchmarks/ceval/ceval_adapter.py +340 -0
  48. evalscope/benchmarks/ceval/ceval_exam.py +159 -0
  49. evalscope/benchmarks/cmmlu/__init__.py +5 -0
  50. evalscope/benchmarks/cmmlu/cmmlu.py +166 -0
  51. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +369 -0
  52. evalscope/benchmarks/competition_math/__init__.py +5 -0
  53. evalscope/benchmarks/competition_math/competition_math.py +88 -0
  54. evalscope/benchmarks/competition_math/competition_math_adapter.py +470 -0
  55. evalscope/benchmarks/data_adapter.py +263 -0
  56. evalscope/benchmarks/general_qa/__init__.py +5 -0
  57. evalscope/benchmarks/general_qa/general_qa_adapter.py +186 -0
  58. evalscope/benchmarks/gsm8k/__init__.py +5 -0
  59. evalscope/benchmarks/gsm8k/gsm8k.py +127 -0
  60. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +236 -0
  61. evalscope/benchmarks/hellaswag/__init__.py +5 -0
  62. evalscope/benchmarks/hellaswag/hellaswag.py +116 -0
  63. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +222 -0
  64. evalscope/benchmarks/humaneval/__init__.py +5 -0
  65. evalscope/benchmarks/humaneval/humaneval.py +82 -0
  66. evalscope/benchmarks/humaneval/humaneval_adapter.py +21 -0
  67. evalscope/benchmarks/mmlu/__init__.py +5 -0
  68. evalscope/benchmarks/mmlu/mmlu.py +174 -0
  69. evalscope/benchmarks/mmlu/mmlu_adapter.py +375 -0
  70. evalscope/benchmarks/race/__init__.py +5 -0
  71. evalscope/benchmarks/race/race.py +118 -0
  72. evalscope/benchmarks/race/race_adapter.py +229 -0
  73. evalscope/benchmarks/trivia_qa/__init__.py +5 -0
  74. evalscope/benchmarks/trivia_qa/trivia_qa.py +104 -0
  75. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +207 -0
  76. evalscope/benchmarks/truthful_qa/__init__.py +5 -0
  77. evalscope/benchmarks/truthful_qa/truthful_qa.py +167 -0
  78. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +351 -0
  79. evalscope/cache.py +98 -0
  80. evalscope/cli/__init__.py +1 -0
  81. evalscope/cli/base.py +20 -0
  82. evalscope/cli/cli.py +26 -0
  83. evalscope/cli/start_perf.py +37 -0
  84. evalscope/cli/start_server.py +138 -0
  85. evalscope/config.py +165 -0
  86. evalscope/constants.py +150 -0
  87. evalscope/evaluator/__init__.py +3 -0
  88. evalscope/evaluator/evaluator.py +689 -0
  89. evalscope/evaluator/rating_eval.py +178 -0
  90. evalscope/evaluator/reviewer/__init__.py +1 -0
  91. evalscope/evaluator/reviewer/auto_reviewer.py +411 -0
  92. evalscope/metrics/__init__.py +1 -0
  93. evalscope/metrics/bundled_rouge_score/__init__.py +14 -0
  94. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +342 -0
  95. evalscope/metrics/code_metric.py +104 -0
  96. evalscope/metrics/math_accuracy.py +60 -0
  97. evalscope/metrics/metrics.py +405 -0
  98. evalscope/metrics/rouge_metric.py +129 -0
  99. evalscope/models/__init__.py +4 -0
  100. evalscope/models/custom/__init__.py +4 -0
  101. evalscope/models/custom/custom_model.py +53 -0
  102. evalscope/models/dummy_chat_model.py +50 -0
  103. evalscope/models/model.py +88 -0
  104. evalscope/models/model_adapter.py +586 -0
  105. evalscope/models/openai_model.py +103 -0
  106. evalscope/models/template.py +1446 -0
  107. evalscope/perf/__init__.py +0 -0
  108. evalscope/perf/_logging.py +32 -0
  109. evalscope/perf/api_plugin_base.py +60 -0
  110. evalscope/perf/custom_api.py +87 -0
  111. evalscope/perf/dashscope_api.py +84 -0
  112. evalscope/perf/dataset_plugin_base.py +64 -0
  113. evalscope/perf/datasets/__init__.py +0 -0
  114. evalscope/perf/datasets/line_by_line.py +18 -0
  115. evalscope/perf/datasets/longalpaca_12k.py +20 -0
  116. evalscope/perf/datasets/openqa.py +22 -0
  117. evalscope/perf/how_to_analysis_result.py +24 -0
  118. evalscope/perf/http_client.py +756 -0
  119. evalscope/perf/openai_api.py +130 -0
  120. evalscope/perf/plugin_registry.py +35 -0
  121. evalscope/perf/query_parameters.py +42 -0
  122. evalscope/perf/server_sent_event.py +43 -0
  123. evalscope/preprocess/__init__.py +1 -0
  124. evalscope/preprocess/tokenizers/__init__.py +0 -0
  125. evalscope/preprocess/tokenizers/gpt2_tokenizer.py +221 -0
  126. evalscope/registry/__init__.py +1 -0
  127. evalscope/registry/tasks/arc.yaml +29 -0
  128. evalscope/registry/tasks/bbh.yaml +27 -0
  129. evalscope/registry/tasks/bbh_mini.yaml +27 -0
  130. evalscope/registry/tasks/ceval.yaml +27 -0
  131. evalscope/registry/tasks/ceval_mini.yaml +27 -0
  132. evalscope/registry/tasks/cmmlu.yaml +27 -0
  133. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +28 -0
  134. evalscope/registry/tasks/general_qa.yaml +27 -0
  135. evalscope/registry/tasks/gsm8k.yaml +29 -0
  136. evalscope/registry/tasks/mmlu.yaml +29 -0
  137. evalscope/registry/tasks/mmlu_mini.yaml +27 -0
  138. evalscope/run.py +404 -0
  139. evalscope/run_arena.py +204 -0
  140. evalscope/run_ms.py +140 -0
  141. evalscope/summarizer.py +144 -0
  142. evalscope/third_party/__init__.py +1 -0
  143. evalscope/third_party/toolbench_static/__init__.py +3 -0
  144. evalscope/third_party/toolbench_static/eval.py +219 -0
  145. evalscope/third_party/toolbench_static/infer.py +278 -0
  146. evalscope/third_party/toolbench_static/llm/__init__.py +1 -0
  147. evalscope/third_party/toolbench_static/llm/swift_infer.py +45 -0
  148. evalscope/third_party/toolbench_static/toolbench_static.py +50 -0
  149. evalscope/tools/__init__.py +1 -0
  150. evalscope/tools/combine_reports.py +140 -0
  151. evalscope/tools/gen_mmlu_subject_mapping.py +90 -0
  152. evalscope/tools/rewrite_eval_results.py +95 -0
  153. evalscope/utils/__init__.py +4 -0
  154. evalscope/utils/arena_utils.py +247 -0
  155. evalscope/utils/completion_parsers.py +87 -0
  156. evalscope/utils/logger.py +64 -0
  157. evalscope/utils/task_cfg_parser.py +10 -0
  158. evalscope/utils/task_utils.py +19 -0
  159. evalscope/utils/utils.py +625 -0
  160. evalscope/version.py +4 -0
  161. evalscope-0.5.0.dist-info/METADATA +566 -0
  162. evalscope-0.5.0.dist-info/RECORD +165 -0
  163. evalscope-0.5.0.dist-info/WHEEL +5 -0
  164. evalscope-0.5.0.dist-info/entry_points.txt +3 -0
  165. evalscope-0.5.0.dist-info/top_level.txt +1 -0
evalscope/run_arena.py ADDED
@@ -0,0 +1,204 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ # flake8: noqa
+
+ import argparse
+ import os
+ from pathlib import Path
+ import torch
+ from tqdm import tqdm
+
+ from evalscope.constants import EvalConfigKeys
+ from evalscope.evaluator.rating_eval import RatingEvaluate
+ from evalscope.models.model_adapter import ChatGenerationModelAdapter
+ from evalscope.utils import get_obj_from_cfg, yaml_to_dict, jsonl_to_list, dump_jsonl_data
+ from evalscope.utils.logger import get_logger
+ from modelscope.utils.hf_util import GenerationConfig
+
+ logger = get_logger()
+
+ WORK_DIR = Path(__file__).absolute().parent
+
+
+ class ArenaWorkflow:
+
+     def __init__(self, cfg_file: str, **kwargs):
+
+         self.cfg_dict = yaml_to_dict(os.path.join(WORK_DIR, cfg_file))
+         logger.info(f'**Arena Config: {self.cfg_dict}')
+
+         self.question_file: str = os.path.join(WORK_DIR, self.cfg_dict.get('question_file'))
+         self.answers_gen: dict = self.cfg_dict.get('answers_gen', {})
+         self.reviews_gen: dict = self.cfg_dict.get('reviews_gen', {})
+         self.reviewer_cfg: dict = ArenaWorkflow._get_obj_from_cfg(self.reviews_gen.get('reviewer', {}))
+
+         self.prompt_file = os.path.join(WORK_DIR, self.reviews_gen.get('prompt_file'))
+         self.review_file = os.path.join(WORK_DIR, self.reviews_gen.get('review_file'))
+
+         self.rating_gen: dict = self.cfg_dict.get('rating_gen', {})
+         self.report_file: str = os.path.join(WORK_DIR, self.rating_gen.get('report_file'))
+
+     @staticmethod
+     def _get_obj_from_cfg(obj_cfg: dict):
+         cls_ref = obj_cfg.get(EvalConfigKeys.CLASS_REF, None)
+         if not cls_ref:
+             logger.warning(
+                 f'Class reference is not specified in config: {obj_cfg}')
+             return obj_cfg
+
+         cls = get_obj_from_cfg(cls_ref)
+         obj_cfg[EvalConfigKeys.CLASS_REF] = cls
+
+         return obj_cfg
+
+     def _predict_answers(self,
+                          model_id_or_path: str,
+                          model_revision: str,
+                          precision: torch.dtype,
+                          generation_config: GenerationConfig,
+                          template_type: str) -> list:
+
+         # TODO: multi-task to be supported
+         model_adapter = ChatGenerationModelAdapter(model_id=model_id_or_path,
+                                                    model_revision=model_revision,
+                                                    torch_dtype=precision,
+                                                    generation_config=generation_config,
+                                                    template_type=template_type)
+         res_list = []
+         questions_list = jsonl_to_list(self.question_file)
+         for data_d in tqdm(questions_list, total=len(questions_list), desc=f'Predicting(answers):'):
+             # {"question_id": 1, "text": "How can I improve my time management skills?", "category": "generic"}
+             text = data_d.get('text', None)
+             if not text:
+                 logger.warning(f'Invalid question: {data_d}')
+                 continue
+             prompt = f'Question: {text}\n\nAnswer:'
+             inputs = {'data': [prompt]}
+             res_d: dict = model_adapter.predict(inputs=inputs)
+             ans_text: str = res_d['choices'][0]['message']['content']
+
+             ans = {
+                 'question_id': data_d['question_id'],
+                 'text': data_d['text'],
+                 'category': data_d['category'],
+                 'model_id': model_id_or_path,
+                 'metadata': {},
+                 'answer': ans_text,
+             }
+             res_list.append(ans)
+
+         return res_list
+
+     def get_answers(self):
+         for model_name, cfg_d in self.answers_gen.items():
+             enable = cfg_d.get(EvalConfigKeys.ENABLE, True)
+             if not enable:
+                 logger.warning(
+                     f'Skip model {model_name} because it is not enabled.')
+                 continue
+
+             model_id_or_path = cfg_d.get(EvalConfigKeys.MODEL_ID_OR_PATH)
+             model_revision = cfg_d.get(EvalConfigKeys.MODEL_REVISION, None)
+             precision = cfg_d.get(EvalConfigKeys.PRECISION, torch.float16)
+             precision = eval(precision) if isinstance(precision, str) else precision
+             generation_config = cfg_d.get(EvalConfigKeys.GENERATION_CONFIG, {})
+             generation_config = GenerationConfig(**generation_config)
+             ans_output_file = os.path.join(WORK_DIR, cfg_d.get(EvalConfigKeys.OUTPUT_FILE))
+             template_type = cfg_d.get(EvalConfigKeys.TEMPLATE_TYPE)
+
+             answers_list = self._predict_answers(model_id_or_path=model_id_or_path,
+                                                  model_revision=model_revision,
+                                                  precision=precision,
+                                                  generation_config=generation_config,
+                                                  template_type=template_type)
+
+             dump_jsonl_data(answers_list, ans_output_file)
+             logger.info(f'Answers generated by model {model_name} and saved to {ans_output_file}')
+
+     def get_reviews(self, dry_run: bool = False):
+         enable = self.reviews_gen.get(EvalConfigKeys.ENABLE, True)
+         if enable:
+             reviewer_cls = self.reviewer_cfg.get(EvalConfigKeys.CLASS_REF)
+             if not reviewer_cls:
+                 logger.warning('Skip reviews generation because class reference is not specified.')
+                 return
+             reviewer_args = self.reviewer_cfg.get(EvalConfigKeys.CLASS_ARGS, {})
+             target_answers = self.reviews_gen.get('target_answers')
+             if target_answers is None:
+                 # Get all answers from answers_gen config if target_answers is None
+                 target_answers = [item[EvalConfigKeys.OUTPUT_FILE] for item in self.answers_gen.values()]
+             target_answers = [os.path.join(WORK_DIR, item) for item in target_answers]
+             target_answers = [file_path for file_path in target_answers if os.path.exists(file_path)]
+
+             baseline_file = self.reviews_gen.get('baseline_file', None)
+             if baseline_file:
+                 baseline_file = os.path.join(WORK_DIR, baseline_file)
+
+             reference_file = self.reviews_gen.get('reference_file', None)
+             if reference_file:
+                 reference_file = os.path.join(WORK_DIR, reference_file)
+
+             cache_file = self.reviews_gen.get('cache_file', None)
+             if cache_file:
+                 cache_file = os.path.join(WORK_DIR, cache_file)
+
+             input_kwargs = dict(
+                 prompt_file=self.prompt_file,
+                 answer_file_list=target_answers,
+                 review_result_file=self.review_file,
+                 baseline_file=baseline_file,
+                 reference_file=reference_file,
+                 reviewer_args=reviewer_args,
+                 cache_file=cache_file)
+
+             reviewer_obj = reviewer_cls(**input_kwargs)
+             reviewer_obj.run(dry_run=dry_run)
+             logger.info(f'Reviews generated by the reviewer and saved to {self.review_file}')
+
+         else:
+             logger.warning('Skip reviews generation because it is not enabled.')
+
+     def get_rating_results(self):
+         enable = self.rating_gen.get(EvalConfigKeys.ENABLE, True)
+         if enable:
+             report_file = os.path.join(WORK_DIR, self.rating_gen.get('report_file'))
+             metrics = self.rating_gen.get('metrics', ['elo'])
+             baseline_model = self.rating_gen.get(
+                 'baseline_model') if metrics[0] == 'pairwise' else None
+             ae = RatingEvaluate(metrics=metrics, baseline_model=baseline_model)
+             res_list = ae.run(self.review_file)
+             rating_df = res_list[0]
+             logger.info(f'Rating results:\n{rating_df.to_csv()}')
+             rating_df.to_csv(report_file, index=True)
+             logger.info(f'Rating results are saved to {report_file}')
+         else:
+             logger.warning('Skip rating because it is not enabled.')
+
+     def run(self, dry_run: bool = False):
+
+         # Get all answers
+         self.get_answers()
+
+         # Get all reviews
+         self.get_reviews(dry_run=dry_run)
+
+         # Get rating results
+         self.get_rating_results()
+
+         logger.info('*** Arena workflow is finished. ***')
+
+
+ def main():
+
+     # Usage: python evalscope/run_arena.py -c /path/to/xxx_cfg_arena.yaml
+
+     parser = argparse.ArgumentParser(description='LLMs evaluations with arena mode.')
+     parser.add_argument('-c', '--cfg-file', required=True)
+     parser.add_argument('--dry-run', action='store_true', default=False)
+     args = parser.parse_args()
+
+     arena_workflow = ArenaWorkflow(cfg_file=args.cfg_file)
+     arena_workflow.run(dry_run=args.dry_run)
+
+
+ if __name__ == '__main__':
+     main()
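Note on configuration: ArenaWorkflow resolves every file path in the config relative to the evalscope package directory (WORK_DIR) and reads a handful of top-level keys from the YAML. A minimal sketch of that structure as a Python dict follows; the file paths are hypothetical, and the per-model fields are left as a comment because their exact key strings come from EvalConfigKeys constants not shown in this diff.

# Sketch only: the top-level keys below are the ones ArenaWorkflow reads above;
# the relative paths are hypothetical and get joined onto WORK_DIR at runtime.
arena_cfg = {
    'question_file': 'registry/data/question.jsonl',      # hypothetical path
    'answers_gen': {
        'my-model': {
            # Per-model fields follow EvalConfigKeys: enable flag, model id or path,
            # revision, precision, generation config, output file, template type.
        },
    },
    'reviews_gen': {
        'reviewer': {},                                    # class_ref/class_args, resolved via _get_obj_from_cfg
        'prompt_file': 'registry/data/prompt.jsonl',       # hypothetical path
        'review_file': 'registry/data/reviews.jsonl',      # hypothetical path
        'target_answers': None,                            # None -> collect every answers_gen output file
        'baseline_file': None,
        'reference_file': None,
        'cache_file': None,
    },
    'rating_gen': {
        'report_file': 'registry/data/rating_report.csv',  # hypothetical path
        'metrics': ['elo'],                                 # 'pairwise' additionally needs 'baseline_model'
    },
}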
evalscope/run_ms.py ADDED
@@ -0,0 +1,140 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ # flake8: noqa
+
+ import argparse
+ import torch
+
+ from evalscope.benchmarks.ceval import DATASET_ID as CEVAL_EXAM
+ from evalscope.benchmarks.mmlu import DATASET_ID as MMLU
+ from evalscope.benchmarks.hellaswag import DATASET_ID as HELLA_SWAG
+ from evalscope.benchmarks.arc import DATASET_ID as ARC
+ from evalscope.benchmarks.truthful_qa import DATASET_ID as TRUTHFUL_QA
+ from evalscope.constants import DEFAULT_ROOT_CACHE_DIR
+ from evalscope.evaluator import Evaluator
+ from evalscope.models.model_adapter import MultiChoiceModelAdapter, ContinuationLogitsModelAdapter
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ # TODO: add more precision
+ MODEL_PRECISION_MAP = {'fp16': torch.float16, 'fp32': torch.float32, 'bf16': torch.bfloat16}
+
+ """
+ Run evaluation process for ModelScope Leaderboard.
+ """
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description='Run evaluation on a model')
+
+     parser.add_argument('--model', help='Model id from modelscope or huggingface.', required=True)
+     parser.add_argument('--revision', help='Model revision.', required=False, default=None)
+     parser.add_argument('--precision', help='Model precision.', default='bf16')
+     parser.add_argument('--work-dir', help='root work cache dir.', default=None)
+     parser.add_argument('--outputs-dir', help='Outputs dir.', default='outputs')
+     parser.add_argument('--datasets-dir', help='Datasets dir.', default=DEFAULT_ROOT_CACHE_DIR)
+     parser.add_argument('--device-map', help='device map.', default='auto')
+     parser.add_argument('--max-eval-size', type=int, help='Max evaluation samples num for each subset', default=None)
+     parser.add_argument('--dataset-id', help='Dataset id on modelscope', required=False, default=None)
+
+     parser.add_argument('--debug',
+                         help='Debug mode, will print information for debugging.',
+                         action='store_true',
+                         default=False)
+     parser.add_argument('--dry-run',
+                         help='Dry run in single processing mode.',
+                         action='store_true',
+                         default=False)
+     parser.add_argument('--mem-cache',
+                         help='To use memory cache or not.',
+                         action='store_true',
+                         default=False)
+
+     args = parser.parse_args()
+
+     return args
+
+
+ def main():
+     args = parse_args()
+     logger.info(args)
+
+     # Customize your target datasets here
+     all_benchmarks = [CEVAL_EXAM, MMLU, ARC, HELLA_SWAG, TRUTHFUL_QA]
+
+     dataset_id = args.dataset_id
+     if dataset_id is None:
+         datasets = all_benchmarks
+     elif dataset_id in all_benchmarks:
+         datasets = [dataset_id]
+     else:
+         raise ValueError(f'Unknown dataset: {dataset_id}, Supported datasets: {all_benchmarks}')
+
+     # Get model instance
+     if args.dry_run:
+         from evalscope.models.dummy_chat_model import DummyChatModel
+         model_adapter = DummyChatModel(model_cfg=dict())  # TODO
+         model_id: str = 'dummy'
+         model_revision: str = 'v1.0.0'
+         model_precision = MODEL_PRECISION_MAP.get(args.precision, torch.bfloat16)
+     else:
+         model_id: str = args.model
+         model_revision: str = args.revision
+         model_precision = MODEL_PRECISION_MAP.get(args.precision, torch.bfloat16)
+
+         model_adapter = MultiChoiceModelAdapter(model_id=model_id,
+                                                 device_map=args.device_map,
+                                                 torch_dtype=model_precision,
+                                                 model_revision=model_revision,)
+
+     # Evaluate on each dataset
+     for dataset_name in datasets:
+         if dataset_name == CEVAL_EXAM:
+             from evalscope.benchmarks.ceval import CEVALAdapter
+             data_adapter = CEVALAdapter()
+         elif dataset_name == MMLU:
+             from evalscope.benchmarks.mmlu import MMLUAdapter
+             data_adapter = MMLUAdapter()
+         elif dataset_name == ARC:
+             from evalscope.benchmarks.arc import ARCAdapter
+             data_adapter = ARCAdapter()
+         elif dataset_name == HELLA_SWAG:
+             # Note: HellaSwag should run few-shot eval
+             from evalscope.benchmarks.hellaswag import HellaSwagAdapter
+             data_adapter = HellaSwagAdapter()
+         elif dataset_name == TRUTHFUL_QA:
+             from evalscope.benchmarks.truthful_qa import TruthfulQaAdapter
+             data_adapter = TruthfulQaAdapter()
+
+         # TODO: add more datasets here
+         else:
+             raise ValueError(f'Unknown dataset: {dataset_name}')
+
+         # TODO: add mapping
+         if dataset_name in {TRUTHFUL_QA, HELLA_SWAG} and not args.dry_run:
+             model_adapter = ContinuationLogitsModelAdapter(model_id=model_id,
+                                                            device_map=args.device_map,
+                                                            torch_dtype=model_precision,
+                                                            model_revision=model_revision, )
+
+         root_work_dir = args.work_dir if args.work_dir is not None else DEFAULT_ROOT_CACHE_DIR
+         evaluator = Evaluator(dataset_name_or_path=dataset_name,
+                               subset_list=None,
+                               data_adapter=data_adapter,
+                               model_adapter=model_adapter,
+                               use_cache=args.mem_cache,
+                               root_cache_dir=root_work_dir,
+                               outputs_dir=args.outputs_dir,
+                               is_custom_outputs_dir=True,
+                               datasets_dir=args.datasets_dir, )
+
+         infer_cfg = dict(max_length=2048, limit=args.max_eval_size)
+         evaluator.eval(infer_cfg=infer_cfg, debug=args.debug)
+
+
+ if __name__ == '__main__':
+     main()
+
+ # Usage:
+ # python evalscope/run_ms.py --model ZhipuAI/chatglm2-6b --precision fp16 --dry-run --dataset-id modelscope/mmlu --max-eval-size 10
+
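The '# TODO: add mapping' note in the loop above hints at replacing the if/elif dispatch with a lookup table. One possible sketch, using only adapter classes and DATASET_ID constants already imported in run_ms.py; the table itself is an assumption, not part of the released package:

from evalscope.benchmarks.arc import ARCAdapter, DATASET_ID as ARC
from evalscope.benchmarks.ceval import CEVALAdapter, DATASET_ID as CEVAL_EXAM
from evalscope.benchmarks.hellaswag import HellaSwagAdapter, DATASET_ID as HELLA_SWAG
from evalscope.benchmarks.mmlu import MMLUAdapter, DATASET_ID as MMLU
from evalscope.benchmarks.truthful_qa import TruthfulQaAdapter, DATASET_ID as TRUTHFUL_QA

# Hypothetical dataset-id -> adapter-class table; adapters are still constructed
# lazily, one per dataset actually being evaluated.
DATASET_ADAPTERS = {
    CEVAL_EXAM: CEVALAdapter,
    MMLU: MMLUAdapter,
    ARC: ARCAdapter,
    HELLA_SWAG: HellaSwagAdapter,
    TRUTHFUL_QA: TruthfulQaAdapter,
}


def build_data_adapter(dataset_name: str):
    # Mirrors the error message of the original else branch for unknown datasets.
    try:
        return DATASET_ADAPTERS[dataset_name]()
    except KeyError:
        raise ValueError(f'Unknown dataset: {dataset_name}')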
evalscope/summarizer.py ADDED
@@ -0,0 +1,144 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ import json
+ import os
+ import glob
+ from typing import List, Union
+
+ from evalscope.config import TaskConfig
+ from evalscope.constants import OutputsStructure
+ from evalscope.tools.combine_reports import gen_table
+ from evalscope.utils import process_outputs_structure, yaml_to_dict, EvalBackend, json_to_dict, get_latest_folder_path, \
+     csv_to_list
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ class Summarizer:
+
+     @staticmethod
+     def get_report(outputs_dir: str) -> List[dict]:
+         res_list: list = []
+
+         outputs_structure: dict = process_outputs_structure(outputs_dir, is_make=False)
+         reports_dir: str = outputs_structure.get(OutputsStructure.REPORTS_DIR)
+         if reports_dir is None:
+             raise ValueError(f'No reports directory in {outputs_dir}')
+
+         report_files: list = glob.glob(os.path.join(reports_dir, '*.json'))
+         for report_file in report_files:
+             with open(report_file, 'r') as f:
+                 res_list.append(json.load(f))
+
+         report_table: str = gen_table([reports_dir])
+         logger.info(f'*** Report table ***\n{report_table}')
+
+         return res_list
+
+     @staticmethod
+     def get_report_from_cfg(task_cfg: Union[str, List[str], TaskConfig, List[TaskConfig], dict]) -> List[dict]:
+         """
+         Get report from cfg file.
+
+         Args:
+             task_cfg: task cfg file path. Refer to evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml
+
+         Returns:
+             list: list of report dict.
+             A report dict is an overall report on a benchmark for a specific model.
+         """
+         final_res_list: List[dict] = []
+         candidate_task_cfgs: List[dict] = []
+
+         if isinstance(task_cfg, dict):
+             candidate_task_cfgs = [task_cfg]
+         elif isinstance(task_cfg, str):
+             task_cfg: dict = yaml_to_dict(task_cfg)
+             candidate_task_cfgs = [task_cfg]
+         elif isinstance(task_cfg, TaskConfig):
+             task_cfg: dict = task_cfg.to_dict()
+             candidate_task_cfgs = [task_cfg]
+         elif isinstance(task_cfg, list):
+             for task_cfg_item in task_cfg:
+                 if isinstance(task_cfg_item, str):
+                     task_cfg_item: dict = yaml_to_dict(task_cfg_item)
+                 elif isinstance(task_cfg_item, TaskConfig):
+                     task_cfg_item: dict = task_cfg_item.to_dict()
+                 candidate_task_cfgs.append(task_cfg_item)
+         else:
+             raise ValueError(f'Invalid task_cfg: {task_cfg}')
+
+         for candidate_task in candidate_task_cfgs:
+             logger.info(f'**Loading task cfg for summarizer: {candidate_task}')
+             eval_backend = candidate_task.get('eval_backend') or EvalBackend.NATIVE.value
+
+             if eval_backend == EvalBackend.NATIVE.value:
+                 outputs_dir: str = candidate_task.get('outputs')
+                 if outputs_dir is None:
+                     raise ValueError(f'No outputs_dir in {task_cfg}')
+                 outputs_dir: str = os.path.expanduser(outputs_dir)
+                 res_list: list = Summarizer.get_report(outputs_dir=outputs_dir)
+                 final_res_list.extend(res_list)
+
+             elif eval_backend == EvalBackend.OPEN_COMPASS.value:
+                 eval_config = Summarizer.parse_eval_config(candidate_task)
+
+                 work_dir = eval_config.get('work_dir') or 'outputs/default'
+                 if not os.path.exists(work_dir):
+                     raise ValueError(f'work_dir {work_dir} does not exist.')
+
+                 res_folder_path = get_latest_folder_path(work_dir=work_dir)
+                 summary_files = glob.glob(os.path.join(res_folder_path, 'summary', '*.csv'))
+                 if len(summary_files) == 0:
+                     raise ValueError(f'No summary files in {res_folder_path}')
+
+                 summary_file_path = summary_files[0]
+                 # Example: [{'dataset': 'gsm8k', 'version': '1d7fe4', 'metric': 'accuracy', 'mode': 'gen', 'qwen-7b-chat': '53.98'}
+                 summary_res: List[dict] = csv_to_list(file_path=summary_file_path)
+                 final_res_list.extend(summary_res)
+             elif eval_backend == EvalBackend.VLM_EVAL_KIT.value:
+                 eval_config = Summarizer.parse_eval_config(candidate_task)
+
+                 work_dir = eval_config.get('work_dir') or 'outputs/default'
+                 if not os.path.exists(work_dir):
+                     raise ValueError(f'work_dir {work_dir} does not exist.')
+
+                 # TODO: parse summary files: acc.csv, score.csv, score.json for different models
+                 for model in eval_config['model']:
+                     if model['name'] == 'CustomAPIModel':
+                         model_name = model['type']
+                     else:
+                         model_name = model['name']
+                     summary_files = glob.glob(os.path.join(work_dir, model_name, '*.csv'))
+                     for summary_file_path in summary_files:
+                         summary_res: dict = csv_to_list(file_path=summary_file_path)[0]
+                         file_name = os.path.basename(summary_file_path).split('.')[0]
+                         final_res_list.append({file_name: summary_res})
+
+             elif eval_backend == EvalBackend.THIRD_PARTY.value:
+                 raise ValueError(f'*** The summarizer for Third party evaluation backend is not supported yet ***')
+             else:
+                 raise ValueError(f'Invalid eval_backend: {eval_backend}')
+
+         return final_res_list
+
+     @staticmethod
+     def parse_eval_config(candidate_task):
+         eval_config: Union[str, dict] = candidate_task.get('eval_config')
+         assert eval_config is not None, 'Please provide eval_config for specific evaluation backend.'
+
+         if isinstance(eval_config, str):
+             if eval_config.endswith('.yaml'):
+                 eval_config: dict = yaml_to_dict(eval_config)
+             elif eval_config.endswith('.json'):
+                 eval_config: dict = json_to_dict(eval_config)
+             else:
+                 raise ValueError(f'Invalid eval_config: {eval_config}')
+         return eval_config
+
+
+ if __name__ == '__main__':
+     cfg_file = 'registry/tasks/eval_qwen-7b-chat_v100.yaml'
+     report_list = Summarizer.get_report_from_cfg(cfg_file)
+
+     print(report_list)
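Besides a YAML path, get_report_from_cfg also accepts an in-memory dict (or a TaskConfig, or a list of either). A minimal sketch of the dict form for the native backend, assuming a hypothetical outputs directory produced by an earlier evalscope run (it must contain the reports folder expected by process_outputs_structure):

from evalscope.summarizer import Summarizer

# 'eval_backend' defaults to the native backend when omitted; the outputs path
# below is hypothetical and must point at an existing run directory.
task_cfg = {'outputs': './outputs/eval_20240101_000000'}
reports = Summarizer.get_report_from_cfg(task_cfg)
print(reports)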
evalscope/third_party/__init__.py ADDED
@@ -0,0 +1 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
evalscope/third_party/toolbench_static/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ from evalscope.third_party.toolbench_static.toolbench_static import run_task