evalscope-0.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +3 -0
- evalscope/backend/__init__.py +3 -0
- evalscope/backend/base.py +27 -0
- evalscope/backend/opencompass/__init__.py +3 -0
- evalscope/backend/opencompass/api_meta_template.py +64 -0
- evalscope/backend/opencompass/backend_manager.py +247 -0
- evalscope/backend/opencompass/tasks/__init__.py +1 -0
- evalscope/backend/opencompass/tasks/eval_api.py +30 -0
- evalscope/backend/opencompass/tasks/eval_datasets.py +71 -0
- evalscope/backend/vlm_eval_kit/__init__.py +1 -0
- evalscope/backend/vlm_eval_kit/backend_manager.py +153 -0
- evalscope/benchmarks/__init__.py +4 -0
- evalscope/benchmarks/arc/__init__.py +5 -0
- evalscope/benchmarks/arc/ai2_arc.py +148 -0
- evalscope/benchmarks/arc/arc_adapter.py +231 -0
- evalscope/benchmarks/bbh/__init__.py +6 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +308 -0
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +23 -0
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +25 -0
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +33 -0
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +72 -0
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +44 -0
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +78 -0
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +28 -0
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +42 -0
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +25 -0
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +43 -0
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +41 -0
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +63 -0
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +44 -0
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +30 -0
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +10 -0
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +77 -0
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +28 -0
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +17 -0
- evalscope/benchmarks/benchmark.py +65 -0
- evalscope/benchmarks/ceval/__init__.py +5 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +340 -0
- evalscope/benchmarks/ceval/ceval_exam.py +159 -0
- evalscope/benchmarks/cmmlu/__init__.py +5 -0
- evalscope/benchmarks/cmmlu/cmmlu.py +166 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +369 -0
- evalscope/benchmarks/competition_math/__init__.py +5 -0
- evalscope/benchmarks/competition_math/competition_math.py +88 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +470 -0
- evalscope/benchmarks/data_adapter.py +263 -0
- evalscope/benchmarks/general_qa/__init__.py +5 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +186 -0
- evalscope/benchmarks/gsm8k/__init__.py +5 -0
- evalscope/benchmarks/gsm8k/gsm8k.py +127 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +236 -0
- evalscope/benchmarks/hellaswag/__init__.py +5 -0
- evalscope/benchmarks/hellaswag/hellaswag.py +116 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +222 -0
- evalscope/benchmarks/humaneval/__init__.py +5 -0
- evalscope/benchmarks/humaneval/humaneval.py +82 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +21 -0
- evalscope/benchmarks/mmlu/__init__.py +5 -0
- evalscope/benchmarks/mmlu/mmlu.py +174 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +375 -0
- evalscope/benchmarks/race/__init__.py +5 -0
- evalscope/benchmarks/race/race.py +118 -0
- evalscope/benchmarks/race/race_adapter.py +229 -0
- evalscope/benchmarks/trivia_qa/__init__.py +5 -0
- evalscope/benchmarks/trivia_qa/trivia_qa.py +104 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +207 -0
- evalscope/benchmarks/truthful_qa/__init__.py +5 -0
- evalscope/benchmarks/truthful_qa/truthful_qa.py +167 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +351 -0
- evalscope/cache.py +98 -0
- evalscope/cli/__init__.py +1 -0
- evalscope/cli/base.py +20 -0
- evalscope/cli/cli.py +26 -0
- evalscope/cli/start_perf.py +37 -0
- evalscope/cli/start_server.py +138 -0
- evalscope/config.py +165 -0
- evalscope/constants.py +150 -0
- evalscope/evaluator/__init__.py +3 -0
- evalscope/evaluator/evaluator.py +689 -0
- evalscope/evaluator/rating_eval.py +178 -0
- evalscope/evaluator/reviewer/__init__.py +1 -0
- evalscope/evaluator/reviewer/auto_reviewer.py +411 -0
- evalscope/metrics/__init__.py +1 -0
- evalscope/metrics/bundled_rouge_score/__init__.py +14 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +342 -0
- evalscope/metrics/code_metric.py +104 -0
- evalscope/metrics/math_accuracy.py +60 -0
- evalscope/metrics/metrics.py +405 -0
- evalscope/metrics/rouge_metric.py +129 -0
- evalscope/models/__init__.py +4 -0
- evalscope/models/custom/__init__.py +4 -0
- evalscope/models/custom/custom_model.py +53 -0
- evalscope/models/dummy_chat_model.py +50 -0
- evalscope/models/model.py +88 -0
- evalscope/models/model_adapter.py +586 -0
- evalscope/models/openai_model.py +103 -0
- evalscope/models/template.py +1446 -0
- evalscope/perf/__init__.py +0 -0
- evalscope/perf/_logging.py +32 -0
- evalscope/perf/api_plugin_base.py +60 -0
- evalscope/perf/custom_api.py +87 -0
- evalscope/perf/dashscope_api.py +84 -0
- evalscope/perf/dataset_plugin_base.py +64 -0
- evalscope/perf/datasets/__init__.py +0 -0
- evalscope/perf/datasets/line_by_line.py +18 -0
- evalscope/perf/datasets/longalpaca_12k.py +20 -0
- evalscope/perf/datasets/openqa.py +22 -0
- evalscope/perf/how_to_analysis_result.py +24 -0
- evalscope/perf/http_client.py +756 -0
- evalscope/perf/openai_api.py +130 -0
- evalscope/perf/plugin_registry.py +35 -0
- evalscope/perf/query_parameters.py +42 -0
- evalscope/perf/server_sent_event.py +43 -0
- evalscope/preprocess/__init__.py +1 -0
- evalscope/preprocess/tokenizers/__init__.py +0 -0
- evalscope/preprocess/tokenizers/gpt2_tokenizer.py +221 -0
- evalscope/registry/__init__.py +1 -0
- evalscope/registry/tasks/arc.yaml +29 -0
- evalscope/registry/tasks/bbh.yaml +27 -0
- evalscope/registry/tasks/bbh_mini.yaml +27 -0
- evalscope/registry/tasks/ceval.yaml +27 -0
- evalscope/registry/tasks/ceval_mini.yaml +27 -0
- evalscope/registry/tasks/cmmlu.yaml +27 -0
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +28 -0
- evalscope/registry/tasks/general_qa.yaml +27 -0
- evalscope/registry/tasks/gsm8k.yaml +29 -0
- evalscope/registry/tasks/mmlu.yaml +29 -0
- evalscope/registry/tasks/mmlu_mini.yaml +27 -0
- evalscope/run.py +404 -0
- evalscope/run_arena.py +204 -0
- evalscope/run_ms.py +140 -0
- evalscope/summarizer.py +144 -0
- evalscope/third_party/__init__.py +1 -0
- evalscope/third_party/toolbench_static/__init__.py +3 -0
- evalscope/third_party/toolbench_static/eval.py +219 -0
- evalscope/third_party/toolbench_static/infer.py +278 -0
- evalscope/third_party/toolbench_static/llm/__init__.py +1 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +45 -0
- evalscope/third_party/toolbench_static/toolbench_static.py +50 -0
- evalscope/tools/__init__.py +1 -0
- evalscope/tools/combine_reports.py +140 -0
- evalscope/tools/gen_mmlu_subject_mapping.py +90 -0
- evalscope/tools/rewrite_eval_results.py +95 -0
- evalscope/utils/__init__.py +4 -0
- evalscope/utils/arena_utils.py +247 -0
- evalscope/utils/completion_parsers.py +87 -0
- evalscope/utils/logger.py +64 -0
- evalscope/utils/task_cfg_parser.py +10 -0
- evalscope/utils/task_utils.py +19 -0
- evalscope/utils/utils.py +625 -0
- evalscope/version.py +4 -0
- evalscope-0.5.0.dist-info/METADATA +566 -0
- evalscope-0.5.0.dist-info/RECORD +165 -0
- evalscope-0.5.0.dist-info/WHEEL +5 -0
- evalscope-0.5.0.dist-info/entry_points.txt +3 -0
- evalscope-0.5.0.dist-info/top_level.txt +1 -0
evalscope/evaluator/evaluator.py

@@ -0,0 +1,689 @@

# Copyright (c) Alibaba, Inc. and its affiliates.

import os
import time
import json
import re
from copy import deepcopy
from collections import OrderedDict

from tqdm import tqdm
from typing import Optional, List, Any, Union, Dict

from evalscope.benchmarks import DataAdapter
from evalscope.constants import DEFAULT_ROOT_CACHE_DIR, OutputsStructure, AnswerKeys, ReviewKeys, EvalStage
from evalscope.models.model_adapter import BaseModelAdapter, CustomModelAdapter
from evalscope.tools.combine_reports import gen_table
from evalscope.utils import gen_hash, dict_torch_dtype_to_str, dump_jsonl_data, process_outputs_structure, \
    normalize_score, dict_to_yaml, jsonl_to_list
from evalscope.utils.logger import get_logger

logger = get_logger()


class Evaluator(object):

    """
    The evaluator for model on datasets.

    Args:
        dataset_name_or_path: str, the dataset name or path.
            if the dataset is a local path, e.g. /path/to/your_dataset_name,
            then the task name will be the basename of the path, which is `your_dataset_name`.
        data_adapter: DataAdapter, the data adapter for the dataset.
        subset_list: list, the subset list for the dataset.
        model_adapter: BaseModelAdapter, the model adapter for the model.
        use_cache: bool, whether to use local cache. Default: True
        mem_cache_method: str, the memory cache method. Default: 'ttl' (deprecated)
        root_cache_dir: str, the root cache dir. Default: DEFAULT_ROOT_CACHE_DIR
        outputs_dir: str, the outputs dir. Default: ''
        is_custom_outputs_dir: bool, whether to use custom outputs dir. Default: False (deprecated)
        datasets_dir: str, the datasets dir. Default: DEFAULT_ROOT_CACHE_DIR
        datasets_hub: str, the datasets hub. `Local`, `ModelScope` or `HuggingFace`. Default: 'ModelScope'
        stage: str, the stage of evaluation. `all` or `infer` or `review`. Default: 'all'
        eval_type: str, the evaluation type. `checkpoint` or `service` or `custom`. Default: 'checkpoint'
        overall_task_cfg: dict, the overall task config. Default: None
        **kwargs: kwargs.
    """

    def __init__(self,
                 dataset_name_or_path: str,
                 data_adapter: DataAdapter,
                 subset_list: Optional[list] = None,
                 model_adapter: Optional[BaseModelAdapter] = None,
                 use_cache: bool = True,
                 mem_cache_method: str = 'ttl',
                 root_cache_dir: Optional[str] = DEFAULT_ROOT_CACHE_DIR,
                 outputs_dir: Optional[str] = '',
                 is_custom_outputs_dir: bool = False,
                 datasets_dir: Optional[str] = DEFAULT_ROOT_CACHE_DIR,
                 datasets_hub: Optional[str] = 'ModelScope',
                 stage: Optional[str] = 'all',  # refer to evalscope.constants.EvalStage
                 eval_type: Optional[str] = 'checkpoint',  # `checkpoint` or `service` or `custom`
                 overall_task_cfg: Optional[dict] = None,
                 **kwargs):

        self.dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
        self.custom_task_name: str = None
        if os.path.exists(self.dataset_name_or_path):
            self.custom_task_name = os.path.basename(self.dataset_name_or_path.rstrip(os.sep))

        self.root_cache_dir = os.path.expanduser(root_cache_dir)
        self.datasets_dir = os.path.expanduser(datasets_dir)
        self.kwargs = kwargs
        self.data_adapter = data_adapter
        self.model_adapter = model_adapter
        self.eval_type = eval_type
        self.stage = stage
        self.use_cache = use_cache
        self.overall_task_cfg = overall_task_cfg
        if isinstance(self.model_adapter, CustomModelAdapter):
            self.overall_task_cfg.update({'custom_config': self.model_adapter.custom_model.config})

        self.model_cfg = self.model_adapter.model_cfg
        self.model_id = self.model_cfg['model_id']
        self.model_revision = self.model_cfg.get('revision', None)
        self.model_revision_str = self.model_revision if self.model_revision is not None else 'none'

        # Get default outputs_dir
        # TODO: refactor outputs_dir, del timestamp concat
        # if not is_custom_outputs_dir:
        #     outputs_dir = make_outputs_dir(work_dir=outputs_dir,
        #                                    model_id=self.model_id,
        #                                    model_revision=self.model_revision_str)

        self.outputs_dir = os.path.expanduser(outputs_dir)

        # Deal with the output paths
        self.outputs_structure = process_outputs_structure(self.outputs_dir)

        # Load dataset
        self.dataset = self.data_adapter.load(dataset_name_or_path=dataset_name_or_path,
                                              subset_list=subset_list,
                                              work_dir=self.datasets_dir,
                                              datasets_hub=datasets_hub,
                                              **kwargs)

        # Get prompts from dataset
        self.prompts = self.data_adapter.gen_prompts(data_dict=self.dataset)
        del self.dataset

        # Init memory cache
        # TODO: refactor mem cache manager
        # mem_cache_file_name = self.dataset_name_or_path.replace('/', '_') + \
        #                       '_' + self.model_id.replace('/', '_') + \
        #                       '_' + self.model_revision_str + \
        #                       '_cache.pkl'
        # self.mem_cache_path = os.path.join(self.root_cache_dir, 'mem_cache', mem_cache_file_name)

        # Note: mem_cache is deprecated, use `use_cache` instead
        self.mem_cache = None
        self.mem_cache_method = mem_cache_method
        # if self.use_cache:
        #     self.mem_cache = init_mem_cache(method=self.mem_cache_method, cache_file_path=self.mem_cache_path)
        #     logger.info(f'** Using memory cache with size: {len(self.mem_cache)}')

    def _pred_answer(self,
                     input_d: dict,
                     infer_cfg: dict,
                     subset_name: str,
                     answer_id: str = None) -> dict:

        # Get answer from memory cache
        if self.mem_cache is not None:
            if answer_id in self.mem_cache:
                logger.info(f'** Reusing answer `{answer_id}` in memory cache.')
                return self.mem_cache[answer_id]

        ans: dict = self.model_adapter.predict(inputs=input_d, infer_cfg=infer_cfg)
        ans[AnswerKeys.ANSWER_ID] = answer_id
        ans[AnswerKeys.SUBSET_NAME] = subset_name

        if self.mem_cache is not None:
            self.mem_cache[answer_id] = ans

        return ans

    def get_answers(self,
                    subset_name: str,
                    prompts_list: List[dict],
                    infer_cfg: dict = None,
                    debug: bool = False,
                    **kwargs) -> list:
        """
        Get answers from model inference.
        It is required to rewrite this method to support your own evaluator.

        Args:
            subset_name: subset name for benchmark.
            prompts_list: prompts list.
            infer_cfg: model inference config.
                Attributes:
                    do_sample: bool, whether to use sampling.
                    top_k: int, the number of highest probability vocabulary tokens to keep for top-k-filtering.
                    top_p: float, if set to float < 1, only the most probable tokens with probabilities to add.
                    temperature: float, the value used to module the next token probabilities.
                    num_beams: int, number of beams for beam search. 1 means no beam search.
                    max_length: int, the max length of the sequence to be generated.
                    max_new_tokens: int, the max number of new tokens to be generated.
                    repetition_penalty: float, the parameter for repetition penalty. 1.0 means no penalty.
            debug: whether to run in debug mode.
            **kwargs: kwargs.

        Returns: The list of answers.
        """
        assert self.data_adapter is not None, 'data_adapter must be provided when calling func get_answers() !'
        assert self.model_adapter is not None, 'model must be provided when calling func get_answers() !'

        answers_list = []
        pred_dir: str = self.outputs_structure.get(OutputsStructure.PREDICTIONS_DIR)

        if self.custom_task_name:
            pred_file_name: str = self.custom_task_name + '_' + subset_name + '.jsonl'
        else:
            pred_file_name: str = self.dataset_name_or_path.replace(os.sep, '_') + '_' + subset_name + '.jsonl'

        pred_file_path: str = os.path.join(pred_dir, pred_file_name)

        if self.use_cache and os.path.exists(pred_file_path):
            answers_list = jsonl_to_list(pred_file_path)
            logger.info(f'** Reusing predictions from {pred_file_path}, got {len(answers_list)} answers.')

            return answers_list

        if isinstance(self.model_adapter, CustomModelAdapter):
            # Batch inference for custom model

            resp_answers_list: List[Dict[str, Any]] = self.model_adapter.predict(inputs=prompts_list,
                                                                                 infer_cfg=infer_cfg)

            assert len(prompts_list) == len(resp_answers_list), \
                f'Length of prompts_list({len(prompts_list)}) != Length of resp_answers_list({len(resp_answers_list)})'

            for in_d, resp_d in zip(prompts_list, resp_answers_list):

                # Gen answer_id (concat: model_cfg + input_prompt + infer_cfg)
                model_cfg_str = json.dumps(
                    OrderedDict(sorted(dict_torch_dtype_to_str(self.model_adapter.model_cfg).items())),
                    ensure_ascii=False)
                input_prompt_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(in_d).items())),
                                              ensure_ascii=False)
                infer_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())),
                                           ensure_ascii=False)
                answer_id = 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)

                resp_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
                resp_d[AnswerKeys.ANSWER_ID] = answer_id
                resp_d[AnswerKeys.SUBSET_NAME] = subset_name
                resp_d[AnswerKeys.RAW_INPUT] = in_d[AnswerKeys.RAW_INPUT]
                resp_d[AnswerKeys.ORIGIN_PROMPT] = in_d

                answers_list.append(resp_d)

        else:
            for input_prompt in tqdm(prompts_list, total=len(prompts_list), desc=f'Predicting({subset_name}): '):

                # Gen answer_id (concat: model_cfg + input_prompt + infer_cfg)
                model_cfg_str = json.dumps(
                    OrderedDict(sorted(dict_torch_dtype_to_str(self.model_adapter.model_cfg).items())),
                    ensure_ascii=False)
                input_prompt_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(input_prompt).items())),
                                              ensure_ascii=False)
                infer_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())),
                                           ensure_ascii=False)
                answer_id = 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)

                # Get answers
                answer_d: dict = self._pred_answer(input_d=input_prompt,
                                                   infer_cfg=infer_cfg,
                                                   subset_name=subset_name,
                                                   answer_id=answer_id)

                answer_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
                answer_d[AnswerKeys.RAW_INPUT] = input_prompt[AnswerKeys.RAW_INPUT]
                answer_d[AnswerKeys.ORIGIN_PROMPT] = input_prompt

                if debug:
                    logger.debug(f'**input_prompt: {json.dumps(input_prompt, ensure_ascii=False)} \n')
                    logger.debug(f'**predicted ans: {json.dumps(answer_d, ensure_ascii=False)} \n')

                answers_list.append(answer_d)

        if len(answers_list) == 0:
            logger.error(f'** Got empty predictions on subset {subset_name} of dataset: {self.dataset_name_or_path}')

        # Dump answers
        os.makedirs(pred_dir, exist_ok=True)
        dump_jsonl_data(answers_list, pred_file_path)

        return answers_list

    def _get_review(self,
                    answer_d: dict,
                    review_id: str = None,
                    reviewer_spec: dict = None) -> dict:

        # Get review from memory cache
        if self.mem_cache is not None:
            if review_id in self.mem_cache:
                logger.info(f'** Reusing review `{review_id}` in memory cache.')
                return self.mem_cache[review_id]

        if reviewer_spec is None:
            reviewer_spec = {}

        review_res = deepcopy(answer_d)
        choices = review_res[AnswerKeys.CHOICES]
        if len(choices) == 0:
            review_res[ReviewKeys.REVIEWED] = False
            review_res[ReviewKeys.REVIEW_ID] = None
            review_res[ReviewKeys.REVIEWER_SPEC] = reviewer_spec
            review_res[ReviewKeys.REVIEW_TIME] = time.time()
            return review_res

        rev_choices = []
        for choice in choices:
            raw_input_d: dict = review_res[AnswerKeys.RAW_INPUT]
            answer_content = choice[ReviewKeys.MESSAGE][ReviewKeys.CONTENT]
            answer_content = self.data_adapter.parse_pred_result(result=answer_content,
                                                                 raw_input_d=raw_input_d,
                                                                 eval_type=self.eval_type)
            gold_content = self.data_adapter.get_gold_answer(raw_input_d)

            review_result = self.data_adapter.match(gold_content, answer_content)
            choice[ReviewKeys.REVIEW] = {ReviewKeys.GOLD: gold_content,
                                         ReviewKeys.PRED: answer_content,
                                         ReviewKeys.RESULT: review_result}

            rev_choices.append(choice)

        review_res[AnswerKeys.CHOICES] = rev_choices
        review_res[ReviewKeys.REVIEWED] = True
        review_res[ReviewKeys.REVIEW_ID] = review_id
        review_res[ReviewKeys.REVIEWER_SPEC] = reviewer_spec
        review_res[ReviewKeys.REVIEW_TIME] = time.time()

        if self.mem_cache is not None:
            self.mem_cache[review_id] = review_res

        return review_res

    def get_reviews(self, subset_name: str, answers_list: List[dict], debug: bool = False, **kwargs) -> list:
        """
        Get reviews from answers.
        It is required to rewrite this method to support your own evaluator.

        Args:
            subset_name: subset name of benchmark
            answers_list: inference results list.
            debug: whether to run in debug mode.
            **kwargs: kwargs.

        Returns: reviews list.
        """
        reviews_list = []

        review_dir: str = self.outputs_structure.get(OutputsStructure.REVIEWS_DIR)
        if self.custom_task_name:
            review_file_name: str = self.custom_task_name + '_' + subset_name + '.jsonl'
        else:
            review_file_name: str = self.dataset_name_or_path.replace(os.sep, '_') + '_' + subset_name + '.jsonl'
        review_file_path: str = os.path.join(review_dir, review_file_name)

        if self.use_cache and os.path.exists(review_file_path):
            logger.warning(f'** Ignore use_cache={self.use_cache}, updating the review file: {review_file_path} ...')

        for answer_d in tqdm(answers_list, total=len(answers_list), desc=f'Reviewing({subset_name}): '):

            # Gen review_id (concat: answer_id + reviewer_spec)
            answer_id = answer_d[AnswerKeys.ANSWER_ID]

            reviewer_spec: dict = {'metric': [metric_d['name'] for metric_d in self.data_adapter.metric_list],
                                   'reviewer': ['Evaluator'],
                                   'revision': ['default']}
            reviewer_spec_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())),
                                           ensure_ascii=False)
            review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)

            # Get review
            review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)

            if debug:
                logger.debug(review_d)

            reviews_list.append(review_d)

        # Dump reviews
        os.makedirs(review_dir, exist_ok=True)
        dump_jsonl_data(reviews_list, review_file_path)

        return reviews_list

    def compute_metrics(self, reviews_list: List[dict]) -> Any:
        """
        To compute metrics from reviews_list for each subset.
        It is required to rewrite this method to support your own evaluator.

        Args:
            reviews_list: reviews list.

        Returns:
            The metric result. Depends on the metric function in data_adapter.
        """

        review_res_list = []
        for review_d in reviews_list:
            if not review_d[ReviewKeys.REVIEWED]:
                logger.warning(f'** Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
                continue

            review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
            review_res_list.append(review_res)

        metric_score: Union[float, dict] = self.data_adapter.compute_metric(review_res_list=review_res_list)

        return metric_score

    def dump_report(self, report_map: dict, use_table: bool = True):
        """
        Get report for total reviews of specific dataset.
        It is required to rewrite this method to support your own evaluator.

        Args:
            report_map: report dict. Generated by func self.data_adapter.gen_report().
            use_table: whether to generate table for reports. Default to True.

        Returns: None
        """

        # Dump report
        report_dir: str = self.outputs_structure[OutputsStructure.REPORTS_DIR]

        if self.custom_task_name:
            report_file_name: str = self.custom_task_name + '.json'
        else:
            report_file_name: str = self.dataset_name_or_path.replace(os.sep, '_') + '.json'

        os.makedirs(report_dir, exist_ok=True)
        report_path: str = os.path.join(report_dir, report_file_name)
        with open(report_path, 'w') as f:
            f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
        # logger.info(f'** Dump report to {report_path} \n')
        logger.info(f'** Dump report: {report_file_name} \n')

        if use_table:
            try:
                # Make table
                report_table: str = gen_table([report_dir])
                logger.info(f'** Report table: \n {report_table} \n')
            except:
                logger.error('Failed to generate report table.')

    # def save_cache(self):
    #     if self.mem_cache is not None:
    #         logger.info(f'** Saving memory cache with size: {len(self.mem_cache)}')
    #         Cache.save(cache=self.mem_cache, path=self.mem_cache_path)

    # def clear_cache(self):
    #     """
    #     Clear memory cache.
    #
    #     Returns: None
    #     """
    #     if self.mem_cache is not None:
    #         cache_len = len(self.mem_cache)
    #         self.mem_cache.clear()
    #         logger.info(f'** Memory cache cleared, length changed: {cache_len} -> {len(self.mem_cache)}')

    def eval(self,
             infer_cfg: dict = None,
             debug: bool = False,
             **kwargs) -> dict:
        """
        Evaluate the model on the specific benchmark. Streaming & parallel mode is supported.
        It is required to rewrite this method to support your own evaluator.

        The evaluation process is as follows:
            1. Get the input samples from the dataset (benchmarks on the ModelScope or HuggingFace).
            2. Get the input prompts from dataset with specific data adapter.
            3. Get answers with model inference.
            4. Get reviews with metric function (or reviewers).
            5. Generate report from review results.

        Args:
            infer_cfg: The config for model inference.
            debug: Whether to run in debug mode. Default: False.

        Returns:
            Dict of results. Depends on the stage of evaluation.

            stage == 'all': return the report_map
            stage == 'infer': return the answers_map
            stage == 'review': return the reviews_map
        """

        logger.info(f'**** Start evaluating on dataset {self.dataset_name_or_path} ****')

        reviews_score_all = {}  # {subset_name: (score, num)}
        stage_answers_dict = {}
        stage_reviews_dict = {}

        for subset_name, prompts_list in self.prompts.items():
            limit = infer_cfg.get('limit', len(prompts_list))
            prompts_list = prompts_list[:limit]

            answers_list: list = self.get_answers(subset_name=subset_name,
                                                  prompts_list=prompts_list,
                                                  infer_cfg=infer_cfg,
                                                  debug=debug,
                                                  **kwargs)
            if self.stage == EvalStage.INFER:
                stage_answers_dict[subset_name] = answers_list
                continue

            reviews_list: list = self.get_reviews(subset_name=subset_name,
                                                  answers_list=answers_list,
                                                  debug=debug,
                                                  **kwargs)

            metric_res = self.compute_metrics(reviews_list=reviews_list)
            reviews_score_all[subset_name] = (metric_res, len(reviews_list))
            stage_reviews_dict[subset_name] = reviews_list

        if self.stage == EvalStage.INFER:
            return stage_answers_dict

        if self.stage == EvalStage.REVIEW:
            return stage_reviews_dict

        # Generate report
        report_map: dict = self.data_adapter.gen_report(subset_score_map=reviews_score_all,
                                                        report_name=self.custom_task_name)
        self.dump_report(report_map=report_map)

        # Dump overall task config
        overall_task_cfg_file: str = os.path.join(self.outputs_structure.get(OutputsStructure.CONFIGS_DIR),
                                                  'task_output_config.yaml')
        overall_task_cfg_file = os.path.abspath(overall_task_cfg_file)

        # TODO: check the robustness of dump yaml
        try:
            logger.info(f'** Dump overall task config to {overall_task_cfg_file}')
            logger.info(f'** The overall task config:\n {self.overall_task_cfg}')
            if 'model' in self.overall_task_cfg and not isinstance(self.overall_task_cfg['model'], str):
                self.overall_task_cfg['model'] = None
                logger.info(f'>> Overwrite overall_task_cfg for `model` due to it is not a string')
            if 'model_args' in self.overall_task_cfg and self.overall_task_cfg.get('model_args') is not None:
                self.overall_task_cfg['model_args'].update({'precision': str(self.overall_task_cfg['model_args']['precision'])})
                logger.info(f'>> Overwrite overall_task_cfg for `model_args.precision` due to it is not a string')

            dict_to_yaml(self.overall_task_cfg, overall_task_cfg_file)
        except Exception as e:
            logger.warning(f'Failed to dump overall task config: {e}')

        # Note: deprecated
        # self.save_cache()
        # self.clear_cache()

        logger.info(f'\n**** Evaluation finished on {self.dataset_name_or_path} ****\n')

        return report_map


class HumanevalEvaluator(object):

    def __init__(self,
                 problem_file: str,
                 model_id: str,
                 model_revision: str,
                 model_adapter: BaseModelAdapter,
                 outputs_dir: Optional[str] = '',
                 is_custom_outputs_dir: bool = False,
                 k: List[int] = [1, 10, 100],
                 n_workers: int = 4,
                 timeout: float = 3.0,):
        try:
            from human_eval.evaluation import evaluate_functional_correctness
            from human_eval.data import read_problems, write_jsonl
        except ImportError:
            raise ImportError('Please install human_eval:'
                              'https://github.com/openai/human-eval/tree/master#installation , '
                              'Note that you need to enable the execution code in the human_eval/execution.py first.')

        self.problem_file = problem_file
        self.k = k
        self.num_workers = n_workers
        self.timeout = timeout
        self.model_adapter = model_adapter

        self.read_problems_func = read_problems
        self.write_jsonl_func = write_jsonl
        self.eval_func = evaluate_functional_correctness

        # {'task_id': {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}, ...}
        self.problems = self.read_problems_func(self.problem_file)

        # Get default outputs_dir
        model_revision_str: str = model_revision if model_revision is not None else 'none'
        # if not is_custom_outputs_dir:
        #     outputs_dir = make_outputs_dir(work_dir=outputs_dir,
        #                                    model_id=model_id,
        #                                    model_revision=model_revision_str)
        self.outputs_dir = os.path.expanduser(outputs_dir)

        # Deal with the output paths
        self.outputs_structure = process_outputs_structure(self.outputs_dir)

    def get_answers(self, infer_cfg: dict) -> List[dict]:
        ans_list: list = []
        system_prompt: str = 'Complete the following python code:\n'
        for task_id, data_d in tqdm(self.problems.items(), total=len(self.problems), desc='Predicting(problems)'):
            prompt: str = system_prompt + data_d['prompt']
            inputs: dict = {'data': [prompt]}
            # pred_res: dict = self.model_adapter.predict(inputs)

            pred_res: dict = self.model_adapter.predict(inputs=inputs, infer_cfg=infer_cfg)

            pred_ans: str = pred_res['choices'][0]['message']['content']
            pred_ans = self._postprocess(pred_ans)

            ans_list.append({'task_id': task_id, 'completion': pred_ans})

        return ans_list

    def eval(self, infer_cfg: dict, **kwargs):

        # predict
        ans_list: list = self.get_answers(infer_cfg)
        ans_out_file: str = os.path.join(self.outputs_structure.get(OutputsStructure.PREDICTIONS_DIR),
                                         'human_eval_predictions.jsonl')

        self.write_jsonl_func(filename=ans_out_file, data=ans_list)
        # logger.info(f'** Dump predictions to {ans_out_file} successfully.')
        logger.info('** Dump predictions successfully.')

        # evaluate results: e.g. {'pass@1': 0.333, 'pass@10': 0.111}
        results = self.eval_func(sample_file=ans_out_file,
                                 k=self.k,
                                 n_workers=self.num_workers,
                                 timeout=self.timeout,
                                 problem_file=self.problem_file)

        # output: report
        report_map: dict = self.gen_report(results=results)
        report_dir: str = self.outputs_structure.get(OutputsStructure.REPORTS_DIR)
        report_file: str = os.path.join(report_dir, 'human_eval_report.json')

        with open(report_file, 'w') as f:
            f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
        # logger.info(f'** Dump report to {report_file} \n')
        logger.info(f'** Dump report \n')

        try:
            # Make table
            report_table: str = gen_table([report_dir])
            logger.info(f'** Report table: \n {report_table} \n')
        except:
            logger.error('Failed to generate report table.')

    def gen_report(self, results: dict) -> dict:
        """
        Generate report from evaluation results.

        Returns:
            {
                "name":"ARC-Challenge",
                "metric":"WeightedAverageAccuracy",
                "score":0.3389,
                "category":[
                    {
                        "name":"DEFAULT",
                        "score":0.3389,
                        "subset":[
                            {
                                "name":"ARC-Challenge",
                                "score":0.3389
                            },
                        ]
                    }
                ],
                "total_num":100
            }
        """
        results = {k: normalize_score(score=v) for k, v in results.items()}

        category_d = dict(name='DEFAULT',
                          score=results,
                          subset=[])

        res_map = dict(name='HumanEval',
                       metric='pass@k',
                       score=results,
                       category=[category_d],
                       total_num=len(self.problems))

        return res_map

    @classmethod
    def _postprocess(cls, text: str) -> str:
        if '```' in text:
            blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
            if len(blocks) == 0:
                text = text.split('```')[1]  # fall back to default strategy
            else:
                text = blocks[0]  # fetch the first code block
                if not text.startswith('\n'):  # in case starting with ```python
                    text = text[max(text.find('\n') + 1, 0):]
        if text.strip().startswith('from') or text.strip().startswith('import'):
            def_idx = text.find('def')
            if def_idx != -1:
                text = text[max(text.find('\n', def_idx) + 1, 0):]
        text = text.split('\n\n')[0]
        if text.strip().startswith('def'):
            text = '\n'.join(text.split('\n')[1:])
        if not text.startswith('    '):
            if text.startswith(' '):
                text = '    ' + text.lstrip()
            else:
                text = '\n'.join(['    ' + line for line in text.split('\n')])
        return text
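
For orientation, a minimal usage sketch of the Evaluator flow documented in the docstrings above (this sketch is not part of the wheel contents). The `data_adapter` and `model_adapter` arguments are hypothetical placeholders for concrete DataAdapter and BaseModelAdapter instances, which evalscope's own entry points (e.g. evalscope/run.py) normally construct; the dataset name, output paths, and infer_cfg values are illustrative rather than package defaults.

# Hypothetical driver for the Evaluator above; adapters and values are illustrative only.
from evalscope.evaluator.evaluator import Evaluator


def run_subset_eval(data_adapter, model_adapter) -> dict:
    # data_adapter / model_adapter are assumed to be concrete DataAdapter and
    # BaseModelAdapter instances built elsewhere (e.g. by evalscope's run.py).
    evaluator = Evaluator(
        dataset_name_or_path='arc',      # benchmark name or a local dataset path
        data_adapter=data_adapter,
        model_adapter=model_adapter,
        stage='all',                     # 'all' -> infer, review and report
        eval_type='checkpoint',
        outputs_dir='./outputs',
        overall_task_cfg={},             # dumped to configs/task_output_config.yaml
    )
    # infer_cfg keys follow the get_answers() docstring; 'limit' truncates each subset.
    return evaluator.eval(infer_cfg={'max_new_tokens': 512, 'limit': 10})

With stage='infer' the same eval() call returns the per-subset answers instead of a report map, as described in its docstring.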