evalscope-0.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +3 -0
- evalscope/backend/__init__.py +3 -0
- evalscope/backend/base.py +27 -0
- evalscope/backend/opencompass/__init__.py +3 -0
- evalscope/backend/opencompass/api_meta_template.py +64 -0
- evalscope/backend/opencompass/backend_manager.py +247 -0
- evalscope/backend/opencompass/tasks/__init__.py +1 -0
- evalscope/backend/opencompass/tasks/eval_api.py +30 -0
- evalscope/backend/opencompass/tasks/eval_datasets.py +71 -0
- evalscope/backend/vlm_eval_kit/__init__.py +1 -0
- evalscope/backend/vlm_eval_kit/backend_manager.py +153 -0
- evalscope/benchmarks/__init__.py +4 -0
- evalscope/benchmarks/arc/__init__.py +5 -0
- evalscope/benchmarks/arc/ai2_arc.py +148 -0
- evalscope/benchmarks/arc/arc_adapter.py +231 -0
- evalscope/benchmarks/bbh/__init__.py +6 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +308 -0
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +23 -0
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +25 -0
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +33 -0
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +72 -0
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +44 -0
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +78 -0
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +28 -0
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +42 -0
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +25 -0
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +43 -0
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +41 -0
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +63 -0
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +44 -0
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +30 -0
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +10 -0
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +77 -0
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +28 -0
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +17 -0
- evalscope/benchmarks/benchmark.py +65 -0
- evalscope/benchmarks/ceval/__init__.py +5 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +340 -0
- evalscope/benchmarks/ceval/ceval_exam.py +159 -0
- evalscope/benchmarks/cmmlu/__init__.py +5 -0
- evalscope/benchmarks/cmmlu/cmmlu.py +166 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +369 -0
- evalscope/benchmarks/competition_math/__init__.py +5 -0
- evalscope/benchmarks/competition_math/competition_math.py +88 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +470 -0
- evalscope/benchmarks/data_adapter.py +263 -0
- evalscope/benchmarks/general_qa/__init__.py +5 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +186 -0
- evalscope/benchmarks/gsm8k/__init__.py +5 -0
- evalscope/benchmarks/gsm8k/gsm8k.py +127 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +236 -0
- evalscope/benchmarks/hellaswag/__init__.py +5 -0
- evalscope/benchmarks/hellaswag/hellaswag.py +116 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +222 -0
- evalscope/benchmarks/humaneval/__init__.py +5 -0
- evalscope/benchmarks/humaneval/humaneval.py +82 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +21 -0
- evalscope/benchmarks/mmlu/__init__.py +5 -0
- evalscope/benchmarks/mmlu/mmlu.py +174 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +375 -0
- evalscope/benchmarks/race/__init__.py +5 -0
- evalscope/benchmarks/race/race.py +118 -0
- evalscope/benchmarks/race/race_adapter.py +229 -0
- evalscope/benchmarks/trivia_qa/__init__.py +5 -0
- evalscope/benchmarks/trivia_qa/trivia_qa.py +104 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +207 -0
- evalscope/benchmarks/truthful_qa/__init__.py +5 -0
- evalscope/benchmarks/truthful_qa/truthful_qa.py +167 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +351 -0
- evalscope/cache.py +98 -0
- evalscope/cli/__init__.py +1 -0
- evalscope/cli/base.py +20 -0
- evalscope/cli/cli.py +26 -0
- evalscope/cli/start_perf.py +37 -0
- evalscope/cli/start_server.py +138 -0
- evalscope/config.py +165 -0
- evalscope/constants.py +150 -0
- evalscope/evaluator/__init__.py +3 -0
- evalscope/evaluator/evaluator.py +689 -0
- evalscope/evaluator/rating_eval.py +178 -0
- evalscope/evaluator/reviewer/__init__.py +1 -0
- evalscope/evaluator/reviewer/auto_reviewer.py +411 -0
- evalscope/metrics/__init__.py +1 -0
- evalscope/metrics/bundled_rouge_score/__init__.py +14 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +342 -0
- evalscope/metrics/code_metric.py +104 -0
- evalscope/metrics/math_accuracy.py +60 -0
- evalscope/metrics/metrics.py +405 -0
- evalscope/metrics/rouge_metric.py +129 -0
- evalscope/models/__init__.py +4 -0
- evalscope/models/custom/__init__.py +4 -0
- evalscope/models/custom/custom_model.py +53 -0
- evalscope/models/dummy_chat_model.py +50 -0
- evalscope/models/model.py +88 -0
- evalscope/models/model_adapter.py +586 -0
- evalscope/models/openai_model.py +103 -0
- evalscope/models/template.py +1446 -0
- evalscope/perf/__init__.py +0 -0
- evalscope/perf/_logging.py +32 -0
- evalscope/perf/api_plugin_base.py +60 -0
- evalscope/perf/custom_api.py +87 -0
- evalscope/perf/dashscope_api.py +84 -0
- evalscope/perf/dataset_plugin_base.py +64 -0
- evalscope/perf/datasets/__init__.py +0 -0
- evalscope/perf/datasets/line_by_line.py +18 -0
- evalscope/perf/datasets/longalpaca_12k.py +20 -0
- evalscope/perf/datasets/openqa.py +22 -0
- evalscope/perf/how_to_analysis_result.py +24 -0
- evalscope/perf/http_client.py +756 -0
- evalscope/perf/openai_api.py +130 -0
- evalscope/perf/plugin_registry.py +35 -0
- evalscope/perf/query_parameters.py +42 -0
- evalscope/perf/server_sent_event.py +43 -0
- evalscope/preprocess/__init__.py +1 -0
- evalscope/preprocess/tokenizers/__init__.py +0 -0
- evalscope/preprocess/tokenizers/gpt2_tokenizer.py +221 -0
- evalscope/registry/__init__.py +1 -0
- evalscope/registry/tasks/arc.yaml +29 -0
- evalscope/registry/tasks/bbh.yaml +27 -0
- evalscope/registry/tasks/bbh_mini.yaml +27 -0
- evalscope/registry/tasks/ceval.yaml +27 -0
- evalscope/registry/tasks/ceval_mini.yaml +27 -0
- evalscope/registry/tasks/cmmlu.yaml +27 -0
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +28 -0
- evalscope/registry/tasks/general_qa.yaml +27 -0
- evalscope/registry/tasks/gsm8k.yaml +29 -0
- evalscope/registry/tasks/mmlu.yaml +29 -0
- evalscope/registry/tasks/mmlu_mini.yaml +27 -0
- evalscope/run.py +404 -0
- evalscope/run_arena.py +204 -0
- evalscope/run_ms.py +140 -0
- evalscope/summarizer.py +144 -0
- evalscope/third_party/__init__.py +1 -0
- evalscope/third_party/toolbench_static/__init__.py +3 -0
- evalscope/third_party/toolbench_static/eval.py +219 -0
- evalscope/third_party/toolbench_static/infer.py +278 -0
- evalscope/third_party/toolbench_static/llm/__init__.py +1 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +45 -0
- evalscope/third_party/toolbench_static/toolbench_static.py +50 -0
- evalscope/tools/__init__.py +1 -0
- evalscope/tools/combine_reports.py +140 -0
- evalscope/tools/gen_mmlu_subject_mapping.py +90 -0
- evalscope/tools/rewrite_eval_results.py +95 -0
- evalscope/utils/__init__.py +4 -0
- evalscope/utils/arena_utils.py +247 -0
- evalscope/utils/completion_parsers.py +87 -0
- evalscope/utils/logger.py +64 -0
- evalscope/utils/task_cfg_parser.py +10 -0
- evalscope/utils/task_utils.py +19 -0
- evalscope/utils/utils.py +625 -0
- evalscope/version.py +4 -0
- evalscope-0.5.0.dist-info/METADATA +566 -0
- evalscope-0.5.0.dist-info/RECORD +165 -0
- evalscope-0.5.0.dist-info/WHEEL +5 -0
- evalscope-0.5.0.dist-info/entry_points.txt +3 -0
- evalscope-0.5.0.dist-info/top_level.txt +1 -0
evalscope/run_arena.py
ADDED
@@ -0,0 +1,204 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# flake8: noqa
+
+import argparse
+import os
+from pathlib import Path
+import torch
+from tqdm import tqdm
+
+from evalscope.constants import EvalConfigKeys
+from evalscope.evaluator.rating_eval import RatingEvaluate
+from evalscope.models.model_adapter import ChatGenerationModelAdapter
+from evalscope.utils import get_obj_from_cfg, yaml_to_dict, jsonl_to_list, dump_jsonl_data
+from evalscope.utils.logger import get_logger
+from modelscope.utils.hf_util import GenerationConfig
+
+logger = get_logger()
+
+WORK_DIR = Path(__file__).absolute().parent
+
+
+class ArenaWorkflow:
+
+    def __init__(self, cfg_file: str, **kwargs):
+
+        self.cfg_dict = yaml_to_dict(os.path.join(WORK_DIR, cfg_file))
+        logger.info(f'**Arena Config: {self.cfg_dict}')
+
+        self.question_file: str = os.path.join(WORK_DIR, self.cfg_dict.get('question_file'))
+        self.answers_gen: dict = self.cfg_dict.get('answers_gen', {})
+        self.reviews_gen: dict = self.cfg_dict.get('reviews_gen', {})
+        self.reviewer_cfg: dict = ArenaWorkflow._get_obj_from_cfg(self.reviews_gen.get('reviewer', {}))
+
+        self.prompt_file = os.path.join(WORK_DIR, self.reviews_gen.get('prompt_file'))
+        self.review_file = os.path.join(WORK_DIR, self.reviews_gen.get('review_file'))
+
+        self.rating_gen: dict = self.cfg_dict.get('rating_gen', {})
+        self.report_file: str = os.path.join(WORK_DIR, self.rating_gen.get('report_file'))
+
+    @staticmethod
+    def _get_obj_from_cfg(obj_cfg: dict):
+        cls_ref = obj_cfg.get(EvalConfigKeys.CLASS_REF, None)
+        if not cls_ref:
+            logger.warning(
+                f'Class reference is not specified in config: {obj_cfg}')
+            return obj_cfg
+
+        cls = get_obj_from_cfg(cls_ref)
+        obj_cfg[EvalConfigKeys.CLASS_REF] = cls
+
+        return obj_cfg
+
+    def _predict_answers(self,
+                         model_id_or_path: str,
+                         model_revision: str,
+                         precision: torch.dtype,
+                         generation_config: GenerationConfig,
+                         template_type: str) -> list:
+
+        # TODO: multi-task to be supported
+        model_adapter = ChatGenerationModelAdapter(model_id=model_id_or_path,
+                                                   model_revision=model_revision,
+                                                   torch_dtype=precision,
+                                                   generation_config=generation_config,
+                                                   template_type=template_type)
+        res_list = []
+        questions_list = jsonl_to_list(self.question_file)
+        for data_d in tqdm(questions_list, total=len(questions_list), desc=f'Predicting(answers):'):
+            # {"question_id": 1, "text": "How can I improve my time management skills?", "category": "generic"}
+            text = data_d.get('text', None)
+            if not text:
+                logger.warning(f'Invalid question: {data_d}')
+                continue
+            prompt = f'Question: {text}\n\nAnswer:'
+            inputs = {'data': [prompt]}
+            res_d: dict = model_adapter.predict(inputs=inputs)
+            ans_text: str = res_d['choices'][0]['message']['content']
+
+            ans = {
+                'question_id': data_d['question_id'],
+                'text': data_d['text'],
+                'category': data_d['category'],
+                'model_id': model_id_or_path,
+                'metadata': {},
+                'answer': ans_text,
+            }
+            res_list.append(ans)
+
+        return res_list
+
+    def get_answers(self):
+        for model_name, cfg_d in self.answers_gen.items():
+            enable = cfg_d.get(EvalConfigKeys.ENABLE, True)
+            if not enable:
+                logger.warning(
+                    f'Skip model {model_name} because it is not enabled.')
+                continue
+
+            model_id_or_path = cfg_d.get(EvalConfigKeys.MODEL_ID_OR_PATH)
+            model_revision = cfg_d.get(EvalConfigKeys.MODEL_REVISION, None)
+            precision = cfg_d.get(EvalConfigKeys.PRECISION, torch.float16)
+            precision = eval(precision) if isinstance(precision, str) else precision
+            generation_config = cfg_d.get(EvalConfigKeys.GENERATION_CONFIG, {})
+            generation_config = GenerationConfig(**generation_config)
+            ans_output_file = os.path.join(WORK_DIR, cfg_d.get(EvalConfigKeys.OUTPUT_FILE))
+            template_type = cfg_d.get(EvalConfigKeys.TEMPLATE_TYPE)
+
+            answers_list = self._predict_answers(model_id_or_path=model_id_or_path,
+                                                 model_revision=model_revision,
+                                                 precision=precision,
+                                                 generation_config=generation_config,
+                                                 template_type=template_type)
+
+            dump_jsonl_data(answers_list, ans_output_file)
+            logger.info(f'Answers generated by model {model_name} and saved to {ans_output_file}')
+
+    def get_reviews(self, dry_run: bool = False):
+        enable = self.reviews_gen.get(EvalConfigKeys.ENABLE, True)
+        if enable:
+            reviewer_cls = self.reviewer_cfg.get(EvalConfigKeys.CLASS_REF)
+            if not reviewer_cls:
+                logger.warning('Skip reviews generation because class reference is not specified.')
+                return
+            reviewer_args = self.reviewer_cfg.get(EvalConfigKeys.CLASS_ARGS, {})
+            target_answers = self.reviews_gen.get('target_answers')
+            if target_answers is None:
+                # Get all answers from answers_gen config if target_answers is None
+                target_answers = [item[EvalConfigKeys.OUTPUT_FILE] for item in self.answers_gen.values()]
+            target_answers = [os.path.join(WORK_DIR, item) for item in target_answers]
+            target_answers = [file_path for file_path in target_answers if os.path.exists(file_path)]
+
+            baseline_file = self.reviews_gen.get('baseline_file', None)
+            if baseline_file:
+                baseline_file = os.path.join(WORK_DIR, baseline_file)
+
+            reference_file = self.reviews_gen.get('reference_file', None)
+            if reference_file:
+                reference_file = os.path.join(WORK_DIR, reference_file)
+
+            cache_file = self.reviews_gen.get('cache_file', None)
+            if cache_file:
+                cache_file = os.path.join(WORK_DIR, cache_file)
+
+            input_kwargs = dict(
+                prompt_file=self.prompt_file,
+                answer_file_list=target_answers,
+                review_result_file=self.review_file,
+                baseline_file=baseline_file,
+                reference_file=reference_file,
+                reviewer_args=reviewer_args,
+                cache_file=cache_file)
+
+            reviewer_obj = reviewer_cls(**input_kwargs)
+            reviewer_obj.run(dry_run=dry_run)
+            logger.info(f'Reviews with generated by reviewer and saved to {self.review_file}')
+
+        else:
+            logger.warning('Skip reviews generation because it is not enabled.')
+
+    def get_rating_results(self):
+        enable = self.rating_gen.get(EvalConfigKeys.ENABLE, True)
+        if enable:
+            report_file = os.path.join(WORK_DIR, self.rating_gen.get('report_file'))
+            metrics = self.rating_gen.get('metrics', ['elo'])
+            baseline_model = self.rating_gen.get(
+                'baseline_model') if metrics[0] == 'pairwise' else None
+            ae = RatingEvaluate(metrics=metrics, baseline_model=baseline_model)
+            res_list = ae.run(self.review_file)
+            rating_df = res_list[0]
+            logger.info(f'Rating results:\n{rating_df.to_csv()}')
+            rating_df.to_csv(report_file, index=True)
+            logger.info(f'Rating results are saved to {report_file}')
+        else:
+            logger.warning('Skip rating because it is not enabled.')
+
+    def run(self, dry_run: bool = False):
+
+        # Get all answers
+        self.get_answers()
+
+        # Get all reviews
+        self.get_reviews(dry_run=dry_run)
+
+        # Get rating results
+        self.get_rating_results()
+
+        logger.info('*** Arena workflow is finished. ***')
+
+
+def main():
+
+    # Usage: python evalscope/run_arena.py -c /path/to/xxx_cfg_arena.yaml
+
+    parser = argparse.ArgumentParser(description='LLMs evaluations with arena mode.')
+    parser.add_argument('-c', '--cfg-file', required=True)
+    parser.add_argument('--dry-run', action='store_true', default=False)
+    args = parser.parse_args()
+
+    arena_workflow = ArenaWorkflow(cfg_file=args.cfg_file)
+    arena_workflow.run(dry_run=args.dry_run)
+
+
+if __name__ == '__main__':
+    main()
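For orientation, here is a minimal sketch of driving the arena workflow programmatically instead of through the `main()` entry point above. The config path is hypothetical; as in `ArenaWorkflow.__init__`, it is resolved relative to WORK_DIR (the installed evalscope package directory), and the YAML it points at is expected to define the `question_file`, `answers_gen`, `reviews_gen` and `rating_gen` sections read above.

from evalscope.run_arena import ArenaWorkflow

# Hypothetical config path, resolved against WORK_DIR by ArenaWorkflow.__init__.
workflow = ArenaWorkflow(cfg_file='registry/config/cfg_arena.yaml')
# dry_run is only forwarded to the review stage (get_reviews); answers and ratings run normally.
workflow.run(dry_run=True)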
evalscope/run_ms.py
ADDED
@@ -0,0 +1,140 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# flake8: noqa
+
+import argparse
+import torch
+
+from evalscope.benchmarks.ceval import DATASET_ID as CEVAL_EXAM
+from evalscope.benchmarks.mmlu import DATASET_ID as MMLU
+from evalscope.benchmarks.hellaswag import DATASET_ID as HELLA_SWAG
+from evalscope.benchmarks.arc import DATASET_ID as ARC
+from evalscope.benchmarks.truthful_qa import DATASET_ID as TRUTHFUL_QA
+from evalscope.constants import DEFAULT_ROOT_CACHE_DIR
+from evalscope.evaluator import Evaluator
+from evalscope.models.model_adapter import MultiChoiceModelAdapter, ContinuationLogitsModelAdapter
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+# TODO: add more precision
+MODEL_PRECISION_MAP = {'fp16': torch.float16, 'fp32': torch.float32, 'bf16': torch.bfloat16}
+
+"""
+Run evaluation process for ModelScope Leaderboard.
+"""
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Run evaluation on a model')
+
+    parser.add_argument('--model', help='Model id from modelscope or huggingface.', required=True)
+    parser.add_argument('--revision', help='Model revision.', required=False, default=None)
+    parser.add_argument('--precision', help='Model precision.', default='bf16')
+    parser.add_argument('--work-dir', help='root work cache dir.', default=None)
+    parser.add_argument('--outputs-dir', help='Outputs dir.', default='outputs')
+    parser.add_argument('--datasets-dir', help='Datasets dir.', default=DEFAULT_ROOT_CACHE_DIR)
+    parser.add_argument('--device-map', help='device map.', default='auto')
+    parser.add_argument('--max-eval-size', type=int, help='Max evaluation samples num for each subset', default=None)
+    parser.add_argument('--dataset-id', help='Dataset id on modelscope', required=False, default=None)
+
+    parser.add_argument('--debug',
+                        help='Debug mode, will print information for debugging.',
+                        action='store_true',
+                        default=False)
+    parser.add_argument('--dry-run',
+                        help='Dry run in single processing mode.',
+                        action='store_true',
+                        default=False)
+    parser.add_argument('--mem-cache',
+                        help='To use memory cache or not.',
+                        action='store_true',
+                        default=False)
+
+    args = parser.parse_args()
+
+    return args
+
+
+def main():
+    args = parse_args()
+    logger.info(args)
+
+    # Customize your target datasets here
+    all_benchmarks = [CEVAL_EXAM, MMLU, ARC, HELLA_SWAG, TRUTHFUL_QA]
+
+    dataset_id = args.dataset_id
+    if dataset_id is None:
+        datasets = all_benchmarks
+    elif dataset_id in all_benchmarks:
+        datasets = [dataset_id]
+    else:
+        raise ValueError(f'Unknown dataset: {dataset_id}, Supported datasets: {all_benchmarks}')
+
+    # Get model instance
+    if args.dry_run:
+        from evalscope.models.dummy_chat_model import DummyChatModel
+        model_adapter = DummyChatModel(model_cfg=dict())  # TODO
+        model_id: str = 'dummy'
+        model_revision: str = 'v1.0.0'
+        model_precision = MODEL_PRECISION_MAP.get(args.precision, torch.bfloat16)
+    else:
+        model_id: str = args.model
+        model_revision: str = args.revision
+        model_precision = MODEL_PRECISION_MAP.get(args.precision, torch.bfloat16)
+
+        model_adapter = MultiChoiceModelAdapter(model_id=model_id,
+                                                device_map=args.device_map,
+                                                torch_dtype=model_precision,
+                                                model_revision=model_revision,)
+
+    # Evaluate on each dataset
+    for dataset_name in datasets:
+        if dataset_name == CEVAL_EXAM:
+            from evalscope.benchmarks.ceval import CEVALAdapter
+            data_adapter = CEVALAdapter()
+        elif dataset_name == MMLU:
+            from evalscope.benchmarks.mmlu import MMLUAdapter
+            data_adapter = MMLUAdapter()
+        elif dataset_name == ARC:
+            from evalscope.benchmarks.arc import ARCAdapter
+            data_adapter = ARCAdapter()
+        elif dataset_name == HELLA_SWAG:
+            # Note: HellaSwag should run few-shot eval
+            from evalscope.benchmarks.hellaswag import HellaSwagAdapter
+            data_adapter = HellaSwagAdapter()
+        elif dataset_name == TRUTHFUL_QA:
+            from evalscope.benchmarks.truthful_qa import TruthfulQaAdapter
+            data_adapter = TruthfulQaAdapter()
+
+        # TODO: add more datasets here
+        else:
+            raise ValueError(f'Unknown dataset: {dataset_name}')
+
+        # TODO: add mapping
+        if dataset_name in {TRUTHFUL_QA, HELLA_SWAG} and not args.dry_run:
+            model_adapter = ContinuationLogitsModelAdapter(model_id=model_id,
+                                                           device_map=args.device_map,
+                                                           torch_dtype=model_precision,
+                                                           model_revision=model_revision, )
+
+        root_work_dir = args.work_dir if args.work_dir is not None else DEFAULT_ROOT_CACHE_DIR
+        evaluator = Evaluator(dataset_name_or_path=dataset_name,
+                              subset_list=None,
+                              data_adapter=data_adapter,
+                              model_adapter=model_adapter,
+                              use_cache=args.mem_cache,
+                              root_cache_dir=root_work_dir,
+                              outputs_dir=args.outputs_dir,
+                              is_custom_outputs_dir=True,
+                              datasets_dir=args.datasets_dir, )
+
+        infer_cfg = dict(max_length=2048, limit=args.max_eval_size)
+        evaluator.eval(infer_cfg=infer_cfg, debug=args.debug)
+
+
+if __name__ == '__main__':
+    main()
+
+# Usage:
+# python evalscope/run_ms.py --model ZhipuAI/chatglm2-6b --precision fp16 --dry-run --dataset-id modelscope/mmlu --limit 10
+
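As a companion to the script above, the sketch below wires up a single benchmark (MMLU) with the same building blocks run_ms.py uses inside its dataset loop. The model id is borrowed from the usage note; the precision, revision, and output locations are illustrative assumptions, not package defaults.

import torch

from evalscope.benchmarks.mmlu import DATASET_ID as MMLU, MMLUAdapter
from evalscope.constants import DEFAULT_ROOT_CACHE_DIR
from evalscope.evaluator import Evaluator
from evalscope.models.model_adapter import MultiChoiceModelAdapter

# Build the multiple-choice adapter, mirroring the non-dry-run branch of main().
model_adapter = MultiChoiceModelAdapter(model_id='ZhipuAI/chatglm2-6b',  # example model id
                                        device_map='auto',
                                        torch_dtype=torch.float16,
                                        model_revision=None)

evaluator = Evaluator(dataset_name_or_path=MMLU,
                      subset_list=None,
                      data_adapter=MMLUAdapter(),
                      model_adapter=model_adapter,
                      use_cache=False,
                      root_cache_dir=DEFAULT_ROOT_CACHE_DIR,
                      outputs_dir='outputs',
                      is_custom_outputs_dir=True,
                      datasets_dir=DEFAULT_ROOT_CACHE_DIR)

# limit caps evaluated samples per subset, as --max-eval-size does in the CLI.
evaluator.eval(infer_cfg=dict(max_length=2048, limit=10), debug=False)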
evalscope/summarizer.py
ADDED
@@ -0,0 +1,144 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import json
+import os
+import glob
+from typing import List, Union
+
+from evalscope.config import TaskConfig
+from evalscope.constants import OutputsStructure
+from evalscope.tools.combine_reports import gen_table
+from evalscope.utils import process_outputs_structure, yaml_to_dict, EvalBackend, json_to_dict, get_latest_folder_path, \
+    csv_to_list
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class Summarizer:
+
+    @staticmethod
+    def get_report(outputs_dir: str) -> List[dict]:
+        res_list: list = []
+
+        outputs_structure: dict = process_outputs_structure(outputs_dir, is_make=False)
+        reports_dir: str = outputs_structure.get(OutputsStructure.REPORTS_DIR)
+        if reports_dir is None:
+            raise ValueError(f'No reports directory in {outputs_dir}')
+
+        report_files: list = glob.glob(os.path.join(reports_dir, '*.json'))
+        for report_file in report_files:
+            with open(report_file, 'r') as f:
+                res_list.append(json.load(f))
+
+        report_table: str = gen_table([reports_dir])
+        logger.info(f'*** Report table ***\n{report_table}')
+
+        return res_list
+
+    @staticmethod
+    def get_report_from_cfg(task_cfg: Union[str, List[str], TaskConfig, List[TaskConfig], dict]) -> List[dict]:
+        """
+        Get report from cfg file.
+
+        Args:
+            task_cfg: task cfg file path. refer to evalscope/tasks/eval_qwen-7b-chat_v100.yaml
+
+        Returns:
+            list: list of report dict.
+            A report dict is overall report on a benchmark for specific model.
+        """
+        final_res_list: List[dict] = []
+        candidate_task_cfgs: List[dict] = []
+
+        if isinstance(task_cfg, dict):
+            candidate_task_cfgs = [task_cfg]
+        elif isinstance(task_cfg, str):
+            task_cfg: dict = yaml_to_dict(task_cfg)
+            candidate_task_cfgs = [task_cfg]
+        elif isinstance(task_cfg, TaskConfig):
+            task_cfg: dict = task_cfg.to_dict()
+            candidate_task_cfgs = [task_cfg]
+        elif isinstance(task_cfg, list):
+            for task_cfg_item in task_cfg:
+                if isinstance(task_cfg_item, str):
+                    task_cfg_item: dict = yaml_to_dict(task_cfg_item)
+                elif isinstance(task_cfg_item, TaskConfig):
+                    task_cfg_item: dict = task_cfg_item.to_dict()
+                candidate_task_cfgs.append(task_cfg_item)
+        else:
+            raise ValueError(f'Invalid task_cfg: {task_cfg}')
+
+        for candidate_task in candidate_task_cfgs:
+            logger.info(f'**Loading task cfg for summarizer: {candidate_task}')
+            eval_backend = candidate_task.get('eval_backend') or EvalBackend.NATIVE.value
+
+            if eval_backend == EvalBackend.NATIVE.value:
+                outputs_dir: str = candidate_task.get('outputs')
+                outputs_dir: str = os.path.expanduser(outputs_dir)
+                if outputs_dir is None:
+                    raise ValueError(f'No outputs_dir in {task_cfg}')
+                res_list: list = Summarizer.get_report(outputs_dir=outputs_dir)
+                final_res_list.extend(res_list)
+
+            elif eval_backend == EvalBackend.OPEN_COMPASS.value:
+                eval_config = Summarizer.parse_eval_config(candidate_task)
+
+                work_dir = eval_config.get('work_dir') or 'outputs/default'
+                if not os.path.exists(work_dir):
+                    raise ValueError(f'work_dir {work_dir} does not exist.')
+
+                res_folder_path = get_latest_folder_path(work_dir=work_dir)
+                summary_files = glob.glob(os.path.join(res_folder_path, 'summary', '*.csv'))
+                if len(summary_files) == 0:
+                    raise ValueError(f'No summary files in {res_folder_path}')
+
+                summary_file_path = summary_files[0]
+                # Example: [{'dataset': 'gsm8k', 'version': '1d7fe4', 'metric': 'accuracy', 'mode': 'gen', 'qwen-7b-chat': '53.98'}
+                summary_res: List[dict] = csv_to_list(file_path=summary_file_path)
+                final_res_list.extend(summary_res)
+            elif eval_backend == EvalBackend.VLM_EVAL_KIT.value:
+                eval_config = Summarizer.parse_eval_config(candidate_task)
+
+                work_dir = eval_config.get('work_dir') or 'outputs/default'
+                if not os.path.exists(work_dir):
+                    raise ValueError(f'work_dir {work_dir} does not exist.')
+
+                # TODO: parse summary files: acc.csv, score.csv, score.json for different models
+                for model in eval_config['model']:
+                    if model['name'] == 'CustomAPIModel':
+                        model_name = model['type']
+                    else:
+                        model_name = model['name']
+                    summary_files = glob.glob(os.path.join(work_dir, model_name, '*.csv'))
+                    for summary_file_path in summary_files:
+                        summary_res: dict = csv_to_list(file_path=summary_file_path)[0]
+                        file_name = os.path.basename(summary_file_path).split('.')[0]
+                        final_res_list.append({file_name: summary_res})
+
+            elif eval_backend == EvalBackend.THIRD_PARTY.value:
+                raise ValueError(f'*** The summarizer for Third party evaluation backend is not supported yet ***')
+            else:
+                raise ValueError(f'Invalid eval_backend: {eval_backend}')
+
+        return final_res_list
+
+    @staticmethod
+    def parse_eval_config(candidate_task):
+        eval_config: Union[str, dict] = candidate_task.get('eval_config')
+        assert eval_config is not None, 'Please provide eval_config for specific evaluation backend.'
+
+        if isinstance(eval_config, str):
+            if eval_config.endswith('.yaml'):
+                eval_config: dict = yaml_to_dict(eval_config)
+            elif eval_config.endswith('.json'):
+                eval_config: dict = json_to_dict(eval_config)
+            else:
+                raise ValueError(f'Invalid eval_config: {eval_config}')
+        return eval_config
+
+
+if __name__ == '__main__':
+    cfg_file = 'registry/tasks/eval_qwen-7b-chat_v100.yaml'
+    report_list = Summarizer.get_report_from_cfg(cfg_file)
+
+    print(report_list)
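A minimal usage sketch for the native backend with a dict-style task config: leaving out the 'eval_backend' key makes get_report_from_cfg fall back to EvalBackend.NATIVE, and the 'outputs' value below is a hypothetical directory produced by an earlier native evaluation run (it must contain the reports folder that get_report scans for *.json files).

from evalscope.summarizer import Summarizer

task_cfg = {
    # 'eval_backend' omitted -> defaults to EvalBackend.NATIVE in get_report_from_cfg.
    'outputs': 'outputs/eval_20240601_000000',  # hypothetical outputs dir from an earlier run
}

for report in Summarizer.get_report_from_cfg(task_cfg):
    print(report)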
@@ -0,0 +1 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.