evalscope 0.8.2__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- evalscope/__init__.py +2 -0
- evalscope/arguments.py +11 -3
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/__init__.py +20 -1
- evalscope/benchmarks/arc/__init__.py +0 -5
- evalscope/benchmarks/arc/arc_adapter.py +24 -102
- evalscope/benchmarks/bbh/__init__.py +0 -4
- evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
- evalscope/benchmarks/benchmark.py +70 -59
- evalscope/benchmarks/ceval/__init__.py +0 -5
- evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
- evalscope/benchmarks/cmmlu/__init__.py +0 -5
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
- evalscope/benchmarks/competition_math/__init__.py +0 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
- evalscope/benchmarks/data_adapter.py +115 -87
- evalscope/benchmarks/general_qa/__init__.py +0 -5
- evalscope/benchmarks/general_qa/general_qa_adapter.py +23 -79
- evalscope/benchmarks/gsm8k/__init__.py +0 -4
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +21 -101
- evalscope/benchmarks/hellaswag/__init__.py +0 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +32 -99
- evalscope/benchmarks/humaneval/__init__.py +0 -4
- evalscope/benchmarks/humaneval/humaneval_adapter.py +18 -120
- evalscope/benchmarks/ifeval/__init__.py +0 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
- evalscope/benchmarks/ifeval/instructions.py +1478 -0
- evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
- evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
- evalscope/benchmarks/ifeval/utils.py +134 -0
- evalscope/benchmarks/iquiz/__init__.py +0 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
- evalscope/benchmarks/mmlu/__init__.py +0 -5
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
- evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
- evalscope/benchmarks/race/__init__.py +0 -5
- evalscope/benchmarks/race/race_adapter.py +26 -123
- evalscope/benchmarks/trivia_qa/__init__.py +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
- evalscope/benchmarks/truthful_qa/__init__.py +0 -5
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +29 -0
- evalscope/collections/__init__.py +3 -0
- evalscope/collections/evaluator.py +198 -0
- evalscope/collections/sampler.py +138 -0
- evalscope/collections/schema.py +126 -0
- evalscope/config.py +7 -5
- evalscope/constants.py +9 -26
- evalscope/evaluator/evaluator.py +87 -121
- evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
- evalscope/metrics/__init__.py +3 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/math_accuracy.py +193 -50
- evalscope/metrics/metrics.py +18 -6
- evalscope/metrics/named_metrics.py +17 -0
- evalscope/metrics/rouge_metric.py +13 -8
- evalscope/models/__init__.py +14 -1
- evalscope/models/base_adapter.py +52 -0
- evalscope/models/chat_adapter.py +138 -0
- evalscope/models/choice_adapter.py +211 -0
- evalscope/models/custom_adapter.py +67 -0
- evalscope/models/local_model.py +74 -0
- evalscope/models/model.py +141 -0
- evalscope/models/server_adapter.py +111 -0
- evalscope/perf/__init__.py +1 -0
- evalscope/perf/main.py +0 -1
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +1 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/longalpaca.py +1 -1
- evalscope/report/__init__.py +5 -0
- evalscope/report/app.py +506 -0
- evalscope/report/combinator.py +73 -0
- evalscope/report/generator.py +80 -0
- evalscope/report/utils.py +133 -0
- evalscope/run.py +48 -72
- evalscope/run_arena.py +1 -1
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/chat_service.py +5 -4
- evalscope/utils/io_utils.py +8 -0
- evalscope/utils/logger.py +5 -0
- evalscope/utils/model_utils.py +15 -2
- evalscope/utils/utils.py +3 -25
- evalscope/version.py +2 -2
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/METADATA +115 -21
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/RECORD +99 -78
- tests/cli/test_collection.py +57 -0
- tests/cli/test_run.py +52 -1
- tests/rag/test_mteb.py +3 -2
- evalscope/models/api/__init__.py +0 -3
- evalscope/models/dummy_chat_model.py +0 -49
- evalscope/models/model_adapter.py +0 -525
- evalscope/models/openai_model.py +0 -103
- evalscope/tools/__init__.py +0 -1
- evalscope/tools/combine_reports.py +0 -133
- evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
- /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
- /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
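Taken together, the listing shows the 0.8.2 `evalscope.tools` helpers and the old monolithic model modules (`model_adapter.py`, `openai_model.py`, `dummy_chat_model.py`) being replaced by the new `evalscope.report` package and the split adapters under `evalscope.models`. A minimal sketch of the import migration, based only on the hunks reproduced below (the removed import lines are partially truncated in this diff, so the 0.8.2 side is approximate):

# 0.8.2 (removed in 0.10.0):
from evalscope.tools.combine_reports import gen_table

# 0.10.0 (as added in the evaluator.py and auto_reviewer.py hunks below):
from evalscope.report import Report, gen_table
from evalscope.models import BaseModelAdapter, CustomModelAdapter
from evalscope.models.model import OpenAIModel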
evalscope/evaluator/evaluator.py
CHANGED

@@ -10,10 +10,9 @@ from typing import Any, Dict, List, Optional, Union
 
 from evalscope.benchmarks import DataAdapter
 from evalscope.config import TaskConfig
-from evalscope.constants import
-
-from evalscope.
-from evalscope.tools.combine_reports import gen_table
+from evalscope.constants import AnswerKeys, DumpMode, EvalStage, ReviewKeys
+from evalscope.models import BaseModelAdapter, CustomModelAdapter
+from evalscope.report import Report, gen_table
 from evalscope.utils import dict_torch_dtype_to_str, gen_hash
 from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
 from evalscope.utils.logger import get_logger

@@ -30,73 +29,63 @@ class Evaluator(object):
             if the dataset is a local path, e.g. /path/to/your_dataset_name,
             then the task name will be the basename of the path, which is `your_dataset_name`.
         data_adapter: DataAdapter, the data adapter for the dataset.
-        subset_list: list, the subset list for the dataset.
         model_adapter: BaseModelAdapter, the model adapter for the model.
-
-
-        datasets_dir: str, the datasets dir. Default: DEFAULT_ROOT_CACHE_DIR
-        datasets_hub: str, the datasets hub. `Local`, `ModelScope` or `HuggingFace`. Default: 'ModelScope'
-        stage: str, the stage of evaluation. `all` or `infer` or `review`. Default: 'all'
-        eval_type: str, the evaluation type. `checkpoint` or `service` or `custom`. Default: 'checkpoint'
-        overall_task_cfg: dict, the overall task config. Default: None
+        outputs: OutputsStructure, the outputs dir. Default: None
+        task_cfg: TaskConfig, the overall task config. Default: None
         **kwargs: kwargs.
     """
 
     def __init__(self,
                  dataset_name_or_path: str,
                  data_adapter: DataAdapter,
-
-
-
-                 outputs: Optional[OutputsStructure] = None,
-                 datasets_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
-                 datasets_hub: Optional[str] = HubType.MODELSCOPE,
-                 stage: Optional[str] = EvalStage.ALL,
-                 eval_type: Optional[str] = EvalType.CHECKPOINT,
-                 overall_task_cfg: Optional[TaskConfig] = None,
+                 model_adapter: BaseModelAdapter,
+                 outputs: OutputsStructure = None,
+                 task_cfg: TaskConfig = None,
                  **kwargs):
 
+        self.dataset_name = data_adapter.name
         self.dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
-        self.
-        self.model_name = overall_task_cfg.model_id
+        self.model_name = task_cfg.model_id
         self.custom_task_name = f'{self.model_name}_{self.dataset_name}'
 
-        self.datasets_dir = os.path.expanduser(datasets_dir)
-        self.kwargs = kwargs
         self.data_adapter = data_adapter
        self.model_adapter = model_adapter
-        self.
-        self.
-        self.
-        self.
-
-
-
-        self.model_cfg = self.model_adapter.model_cfg
-
+        self.model_cfg = model_adapter.model_cfg
+        self.eval_type = task_cfg.eval_type
+        self.dataset_hub = task_cfg.dataset_hub
+        self.stage = task_cfg.stage
+        self.use_cache = task_cfg.use_cache
+        self.task_cfg = task_cfg
         # Deal with the output paths
         self.outputs_structure = outputs
 
-
-        self.dataset = self.data_adapter.load(
-            dataset_name_or_path=dataset_name_or_path,
-            subset_list=subset_list,
-            work_dir=self.datasets_dir,
-            datasets_hub=datasets_hub,
-            **kwargs)
-
-        # Get prompts from dataset
-        # TODO: support sampler
-        self.prompts = self.data_adapter.gen_prompts(data_dict=self.dataset)
-        del self.dataset
-
-    def _pred_answer(self, input_d: dict, infer_cfg: dict, subset_name: str, answer_id: str = None) -> dict:
+        self.kwargs = kwargs
 
-
-
-
+    def load_dataset(self):
+        dataset = self.data_adapter.load(
+            dataset_name_or_path=self.dataset_name_or_path,
+            subset_list=self.data_adapter.subset_list,
+            work_dir=os.path.expanduser(self.task_cfg.dataset_dir),
+            datasets_hub=self.dataset_hub,
+            **self.kwargs)
 
-
+        # Get prompts from dataset
+        prompts = self.data_adapter.gen_prompts(data_dict=dataset)
+        return prompts
+
+    def _generate_answer_id(self, model_cfg, input_d, infer_cfg):
+        model_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(model_cfg).items())), ensure_ascii=False)
+        input_prompt_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(input_d).items())), ensure_ascii=False)
+        infer_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
+        return 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
+
+    def _process_answer(self, answer_d, input_d, subset_name, answer_id):
+        answer_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
+        answer_d[AnswerKeys.ANSWER_ID] = answer_id
+        answer_d[AnswerKeys.SUBSET_NAME] = subset_name
+        answer_d[AnswerKeys.RAW_INPUT] = input_d[AnswerKeys.RAW_INPUT]
+        answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
+        return answer_d
 
     def get_answers(self,
                     subset_name: str,

@@ -147,57 +136,24 @@
             resp_answers_list: List[Dict[str, Any]] = self.model_adapter.predict(
                 inputs=prompts_list, infer_cfg=infer_cfg)
 
-
-
-
-
-
-                # Gen answer_id (concat: model_cfg + input_prompt + infer_cfg)
-                model_cfg_str = json.dumps(
-                    OrderedDict(sorted(dict_torch_dtype_to_str(self.model_adapter.model_cfg).items())),
-                    ensure_ascii=False)
-                input_prompt_str = json.dumps(
-                    OrderedDict(sorted(dict_torch_dtype_to_str(in_d).items())), ensure_ascii=False)
-                infer_cfg_str = json.dumps(
-                    OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
-                answer_id = 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
-
-                resp_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
-                resp_d[AnswerKeys.ANSWER_ID] = answer_id
-                resp_d[AnswerKeys.SUBSET_NAME] = subset_name
-                resp_d[AnswerKeys.RAW_INPUT] = in_d[AnswerKeys.RAW_INPUT]
-                resp_d[AnswerKeys.ORIGIN_PROMPT] = in_d
-
-                answers_list.append(resp_d)
-                dump_jsonl_data(resp_d, pred_file_path, dump_mode=DumpMode.APPEND)
+            for input_prompt, answer_d in zip(prompts_list, resp_answers_list):
+                answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+                processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
+                answers_list.append(processed_answer)
+                dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)
 
         else:
             for input_prompt in tqdm(prompts_list, total=len(prompts_list), desc=f'Predicting({subset_name}): '):
-
-
-
-                    OrderedDict(sorted(dict_torch_dtype_to_str(self.model_adapter.model_cfg).items())),
-                    ensure_ascii=False)
-                input_prompt_str = json.dumps(
-                    OrderedDict(sorted(dict_torch_dtype_to_str(input_prompt).items())), ensure_ascii=False)
-                infer_cfg_str = json.dumps(
-                    OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
-                answer_id = 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
-
-                # Get answers
-                answer_d: dict = self._pred_answer(
-                    input_d=input_prompt, infer_cfg=infer_cfg, subset_name=subset_name, answer_id=answer_id)
-
-                answer_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
-                answer_d[AnswerKeys.RAW_INPUT] = input_prompt[AnswerKeys.RAW_INPUT]
-                answer_d[AnswerKeys.ORIGIN_PROMPT] = input_prompt
+                answer_d: dict = self.model_adapter.predict(inputs=input_prompt, infer_cfg=infer_cfg)
+                answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+                processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
 
                 if debug:
                     logger.info(f'**input_prompt: {json.dumps(input_prompt, ensure_ascii=False)} \n')
-                    logger.info(f'**predicted ans: {json.dumps(
+                    logger.info(f'**predicted ans: {json.dumps(processed_answer, ensure_ascii=False)} \n')
 
-                answers_list.append(
-                dump_jsonl_data(
+                answers_list.append(processed_answer)
+                dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)
 
         logger.info(f'Dump predictions to {pred_file_path}.')
         return answers_list

@@ -241,6 +197,19 @@ class Evaluator(object):
 
         return review_res
 
+    def _generate_review_id(self, answer_d):
+        # Gen review_id (concat: answer_id + reviewer_spec)
+        answer_id = answer_d[AnswerKeys.ANSWER_ID]
+        reviewer_spec = {
+            'metric': [metric.name for metric in self.data_adapter.metric_list],
+            'reviewer': ['Evaluator'],
+            'revision': ['default']
+        }
+        reviewer_spec_str = json.dumps(
+            OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
+        review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)
+        return review_id, reviewer_spec
+
     def get_reviews(self, subset_name: str, answers_list: List[dict], debug: bool = False, **kwargs) -> list:
         """
         Get reviews from answers.

@@ -264,19 +233,7 @@
             logger.warning(f'Ignore use_cache={self.use_cache}, updating the review file: {review_file_path} ...')
 
         for answer_d in tqdm(answers_list, total=len(answers_list), desc=f'Reviewing({subset_name}): '):
-
-            # Gen review_id (concat: answer_id + reviewer_spec)
-            answer_id = answer_d[AnswerKeys.ANSWER_ID]
-
-            reviewer_spec: dict = {
-                'metric': [metric_d['name'] for metric_d in self.data_adapter.metric_list],
-                'reviewer': ['Evaluator'],
-                'revision': ['default']
-            }
-            reviewer_spec_str = json.dumps(
-                OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
-            review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)
-
+            review_id, reviewer_spec = self._generate_review_id(answer_d)
             # Get review
             review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
 

@@ -284,13 +241,12 @@
                 logger.info(review_d)
 
             reviews_list.append(review_d)
-
             # Dump reviews
             dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND)
 
         return reviews_list
 
-    def compute_metrics(self, reviews_list: List[dict]) ->
+    def compute_metrics(self, reviews_list: List[dict]) -> List[dict]:
         """
         To compute metrics from reviews_list for each subset.
         It is required to rewrite this method to support your own evaluator.

@@ -308,28 +264,37 @@
                 logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
                 continue
 
-
+            if len(review_d[AnswerKeys.CHOICES]) == 0:
+                logger.warning(f'No choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
+                continue
+            elif len(review_d[AnswerKeys.CHOICES]) == 1:
+                review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
+            else:
+                review_res = [choice[ReviewKeys.REVIEW][ReviewKeys.RESULT] for choice in review_d[AnswerKeys.CHOICES]]
+
             review_res_list.append(review_res)
 
-        metric_score:
+        metric_score: List[dict] = self.data_adapter.compute_metric(review_res_list=review_res_list)
 
         return metric_score
 
-    def dump_report(self, reviews_score_all: dict, use_table: bool = True):
+    def dump_report(self, reviews_score_all: List[dict], use_table: bool = True):
         """
         Get report for total reviews of specific dataset.
         It is required to rewrite this method to support your own evaluator.
 
         Args:
-
+            reviews_score_all: reviews score list. Generated by func self.data_adapter.compute_metric().
             use_table: whether to generate table for reports. Default to True.
 
         Returns: None
         """
         # Get report map
-        report_map:
-            subset_score_map=reviews_score_all,
-
+        report_map: Report = self.data_adapter.gen_report(
+            subset_score_map=reviews_score_all,
+            report_name=self.custom_task_name,
+            model_name=self.model_name,
+            dataset_name=self.dataset_name)
 
         # Dump report
         report_path: str = os.path.join(self.outputs_structure.reports_dir, self.model_name,

@@ -338,7 +303,7 @@
 
         # Write report
         with open(report_path, 'w') as f:
-            f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
+            f.write(json.dumps(report_map.to_dict(), ensure_ascii=False, indent=4))
         logger.info(f'Dump report: {report_path} \n')
 
         # Make table

@@ -380,7 +345,8 @@
         stage_answers_dict = {}
         stage_reviews_dict = {}
 
-
+        prompts = self.load_dataset()
+        for subset_name, prompts_list in prompts.items():
             limit = kwargs.get('limit', len(prompts_list))
             prompts_list = prompts_list[:limit]
 

@@ -394,7 +360,7 @@
                 subset_name=subset_name, answers_list=answers_list, debug=debug, **kwargs)
 
             metric_res = self.compute_metrics(reviews_list=reviews_list)
-            reviews_score_all[subset_name] =
+            reviews_score_all[subset_name] = metric_res
             stage_reviews_dict[subset_name] = reviews_list
 
             if self.stage == EvalStage.INFER:
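The refactor above replaces the inline answer-id construction with a dedicated `_generate_answer_id` helper: the model config, the input prompt, and the inference config are each JSON-serialized with sorted keys, concatenated, and hashed, so identical inputs always map to the same id. A rough standalone sketch of that idea (the real `gen_hash` and `dict_torch_dtype_to_str` helpers are not shown in this diff, so an MD5 digest stands in for the former and the torch-dtype normalization step is omitted):

import hashlib
import json
from collections import OrderedDict


def gen_hash(text: str) -> str:
    # Stand-in for evalscope.utils.gen_hash, whose implementation is not shown
    # in this diff; an MD5 hex digest is assumed here purely for illustration.
    return hashlib.md5(text.encode('utf-8')).hexdigest()


def generate_answer_id(model_cfg: dict, input_d: dict, infer_cfg: dict) -> str:
    # Mirrors the new Evaluator._generate_answer_id above: each dict is serialized
    # with sorted keys so the resulting id is deterministic and cache-friendly.
    parts = [
        json.dumps(OrderedDict(sorted(d.items())), ensure_ascii=False)
        for d in (model_cfg, input_d, infer_cfg)
    ]
    return 'answer-' + gen_hash(''.join(parts))


print(generate_answer_id({'model_id': 'qwen2-7b'}, {'data': ['1+1=?']}, {'max_new_tokens': 64}))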

evalscope/evaluator/reviewer/auto_reviewer.py
CHANGED

@@ -8,10 +8,10 @@ import sys
 import time
 from abc import ABC, abstractmethod
 from functools import partial
-from typing import Any, List
+from typing import Any, List, Tuple
 
 from evalscope.constants import ArenaMode, EvalConfigKeys, FnCompletionParser, PositionBiasMitigation
-from evalscope.models.
+from evalscope.models.model import OpenAIModel
 from evalscope.utils import completion_parsers, random_seeded_choice
 from evalscope.utils.arena_utils import get_battle_pairs, merge_ques_ans, shuffle_pairwise_preferences
 from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list

@@ -240,7 +240,15 @@ class AutoReviewerGpt4(BaseReviewer):
                 review_text=review_text)
         return review_result
 
-    def _get_review_pair(self,
+    def _get_review_pair(self,
+                         model_a,
+                         model_b,
+                         question,
+                         category,
+                         ans1,
+                         ans2,
+                         dry_run=False,
+                         **kwargs) -> Tuple[str, Any]:
         input_msg = dict(ques=question, category=category, ans1=ans1, ans2=ans2)
 
         if self.reference_list:

@@ -263,7 +271,7 @@ class AutoReviewerGpt4(BaseReviewer):
             result = (result, None)
         return review_text, *result
 
-    def _get_review_single(self, model, question, category, answer, dry_run=False, **kwargs) ->
+    def _get_review_single(self, model, question, category, answer, dry_run=False, **kwargs) -> Tuple[str, Any]:
         input_msg = dict(ques=question, category=category, ans1=answer)
 
         if self.reference_list:
evalscope/metrics/__init__.py
CHANGED

@@ -1 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+from evalscope.metrics.metrics import bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean, weighted_mean
+from evalscope.metrics.named_metrics import *
+from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
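With these re-exports, the individual metric helpers become importable from the package root rather than from their submodules; for example (names taken from the hunk above; call signatures are not part of this diff):

from evalscope.metrics import bleu_ngram_one_sample, exact_match, mean, weighted_mean
from evalscope.metrics import compute_rouge_score_one_sample_zh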

@@ -55,7 +55,7 @@ try:
         os.system(f'wget --timeout=10 --tries=3 -P {nltk_dir} {punkt_tab_url}')
         os.system(f'unzip {punkt_path} -d {nltk_dir}')
     else:
-        logger.
+        logger.debug(f'{punkt_path} already exists, skipping download')
 except Exception as e:
     logger.error(f'Try to download punkt_tab.zip for nltk failed: {e}')
 
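The hunk above only lowers the log level of the skip message; the surrounding logic still fetches punkt_tab.zip with wget/unzip when it is missing. As a side note, the same tokenizer data can also be pulled through NLTK's own downloader on machines with access to the NLTK index; this is an alternative sketch, not part of evalscope, and assumes NLTK >= 3.9 where punkt_tab is published:

import nltk

try:
    # LookupError is raised when the tokenizer data is not installed locally.
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    # Fetch punkt_tab from the NLTK data index instead of the wget fallback above.
    nltk.download('punkt_tab')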

evalscope/metrics/math_accuracy.py
CHANGED

@@ -1,57 +1,200 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-import re
-from collections import defaultdict
-from tqdm import tqdm
 
-from
+# Adapted from https://github.com/EleutherAI/lm-evaluation-harness/blob/master/lm_eval/tasks/hendrycks_math.py
+def is_equiv(str1, str2, verbose=False):
+    if str1 is None and str2 is None:
+        print('WARNING: Both None')
+        return True
+    if str1 is None or str2 is None:
+        return False
 
+    try:
+        ss1 = strip_string(str1)
+        ss2 = strip_string(str2)
+        if verbose:
+            print(ss1, ss2)
+        return ss1 == ss2
+    except Exception:
+        return str1 == str2
 
-
-
-    if
-
+
+def remove_boxed(s):
+    if '\\boxed ' in s:
+        left = '\\boxed '
+        assert s[:len(left)] == left
+        return s[len(left):]
+
+    left = '\\boxed{'
+
+    assert s[:len(left)] == left
+    assert s[-1] == '}'
+
+    return s[len(left):-1]
+
+
+def last_boxed_only_string(string):
+    idx = string.rfind('\\boxed')
+    if '\\boxed ' in string:
+        return '\\boxed ' + string.split('\\boxed ')[-1].split('$')[0]
+    if idx < 0:
+        idx = string.rfind('\\fbox')
+        if idx < 0:
+            return None
+
+    i = idx
+    right_brace_idx = None
+    num_left_braces_open = 0
+    while i < len(string):
+        if string[i] == '{':
+            num_left_braces_open += 1
+        if string[i] == '}':
+            num_left_braces_open -= 1
+            if num_left_braces_open == 0:
+                right_brace_idx = i
+                break
+        i += 1
+
+    if right_brace_idx is None:
+        retval = None
     else:
-
-
-
-
-
-
-
-
-
-
-
-
-
+        retval = string[idx:right_brace_idx + 1]
+
+    return retval
+
+
+def fix_fracs(string):
+    substrs = string.split('\\frac')
+    new_str = substrs[0]
+    if len(substrs) > 1:
+        substrs = substrs[1:]
+        for substr in substrs:
+            new_str += '\\frac'
+            if substr[0] == '{':
+                new_str += substr
+            else:
+                try:
+                    assert len(substr) >= 2
+                except AssertionError:
+                    return string
+                a = substr[0]
+                b = substr[1]
+                if b != '{':
+                    if len(substr) > 2:
+                        post_substr = substr[2:]
+                        new_str += '{' + a + '}{' + b + '}' + post_substr
+                    else:
+                        new_str += '{' + a + '}{' + b + '}'
+                else:
+                    if len(substr) > 2:
+                        post_substr = substr[2:]
+                        new_str += '{' + a + '}' + b + post_substr
+                    else:
+                        new_str += '{' + a + '}' + b
+    string = new_str
+    return string
+
+
+def fix_a_slash_b(string):
+    if len(string.split('/')) != 2:
+        return string
+    a = string.split('/')[0]
+    b = string.split('/')[1]
+    try:
+        a = int(a)
+        b = int(b)
+        assert string == '{}/{}'.format(a, b)
+        new_string = '\\frac{' + str(a) + '}{' + str(b) + '}'
+        return new_string
+    except AssertionError:
+        return string
+
+
+def remove_right_units(string):
+    # "\\text{ " only ever occurs (at least in the val set) when describing units
+    if '\\text{ ' in string:
+        splits = string.split('\\text{ ')
+        assert len(splits) == 2
+        return splits[0]
     else:
-        return
-
-
-def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        return string
+
+
+def fix_sqrt(string):
+    if '\\sqrt' not in string:
+        return string
+    splits = string.split('\\sqrt')
+    new_string = splits[0]
+    for split in splits[1:]:
+        if split[0] != '{':
+            a = split[0]
+            new_substr = '\\sqrt{' + a + '}' + split[1:]
+        else:
+            new_substr = '\\sqrt' + split
+        new_string += new_substr
+    return new_string
+
+
+def strip_string(string):
+    # linebreaks
+    string = string.replace('\n', '')
+
+    # remove inverse spaces
+    string = string.replace('\\!', '')
+
+    # replace \\ with \
+    string = string.replace('\\\\', '\\')
+
+    # replace tfrac and dfrac with frac
+    string = string.replace('tfrac', 'frac')
+    string = string.replace('dfrac', 'frac')
+
+    # remove \left and \right
+    string = string.replace('\\left', '')
+    string = string.replace('\\right', '')
+
+    # Remove circ (degrees)
+    string = string.replace('^{\\circ}', '')
+    string = string.replace('^\\circ', '')
+
+    # remove dollar signs
+    string = string.replace('\\$', '')
+
+    # remove units (on the right)
+    string = remove_right_units(string)
+
+    # remove percentage
+    string = string.replace('\\%', '')
+    string = string.replace('\%', '') # noqa: W605
+
+    # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
+    string = string.replace(' .', ' 0.')
+    string = string.replace('{.', '{0.')
+    # if empty, return empty string
+    if len(string) == 0:
+        return string
+    if string[0] == '.':
+        string = '0' + string
+
+    # to consider: get rid of e.g. "k = " or "q = " at beginning
+    if len(string.split('=')) == 2:
+        if len(string.split('=')[0]) <= 2:
+            string = string.split('=')[1]
+
+    # fix sqrt3 --> sqrt{3}
+    string = fix_sqrt(string)
+
+    # remove spaces
+    string = string.replace(' ', '')
+
+    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b} # noqa: E501
+    string = fix_fracs(string)
+
+    # manually change 0.5 --> \frac{1}{2}
+    if string == '0.5':
+        string = '\\frac{1}{2}'
+
+    # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
+    string = fix_a_slash_b(string)
+
+    return string