evalscope-0.5.0-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. evalscope/__init__.py +3 -0
  2. evalscope/backend/__init__.py +3 -0
  3. evalscope/backend/base.py +27 -0
  4. evalscope/backend/opencompass/__init__.py +3 -0
  5. evalscope/backend/opencompass/api_meta_template.py +64 -0
  6. evalscope/backend/opencompass/backend_manager.py +247 -0
  7. evalscope/backend/opencompass/tasks/__init__.py +1 -0
  8. evalscope/backend/opencompass/tasks/eval_api.py +30 -0
  9. evalscope/backend/opencompass/tasks/eval_datasets.py +71 -0
  10. evalscope/backend/vlm_eval_kit/__init__.py +1 -0
  11. evalscope/backend/vlm_eval_kit/backend_manager.py +153 -0
  12. evalscope/benchmarks/__init__.py +4 -0
  13. evalscope/benchmarks/arc/__init__.py +5 -0
  14. evalscope/benchmarks/arc/ai2_arc.py +148 -0
  15. evalscope/benchmarks/arc/arc_adapter.py +231 -0
  16. evalscope/benchmarks/bbh/__init__.py +6 -0
  17. evalscope/benchmarks/bbh/bbh_adapter.py +308 -0
  18. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +23 -0
  19. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +25 -0
  20. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +33 -0
  21. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +37 -0
  22. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +72 -0
  23. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +44 -0
  24. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +78 -0
  25. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +28 -0
  26. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +37 -0
  27. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +37 -0
  28. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +37 -0
  29. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +42 -0
  30. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +25 -0
  31. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +43 -0
  32. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +37 -0
  33. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +41 -0
  34. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +63 -0
  35. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +44 -0
  36. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +40 -0
  37. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +30 -0
  38. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +10 -0
  39. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +77 -0
  40. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +40 -0
  41. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +40 -0
  42. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +40 -0
  43. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +28 -0
  44. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +17 -0
  45. evalscope/benchmarks/benchmark.py +65 -0
  46. evalscope/benchmarks/ceval/__init__.py +5 -0
  47. evalscope/benchmarks/ceval/ceval_adapter.py +340 -0
  48. evalscope/benchmarks/ceval/ceval_exam.py +159 -0
  49. evalscope/benchmarks/cmmlu/__init__.py +5 -0
  50. evalscope/benchmarks/cmmlu/cmmlu.py +166 -0
  51. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +369 -0
  52. evalscope/benchmarks/competition_math/__init__.py +5 -0
  53. evalscope/benchmarks/competition_math/competition_math.py +88 -0
  54. evalscope/benchmarks/competition_math/competition_math_adapter.py +470 -0
  55. evalscope/benchmarks/data_adapter.py +263 -0
  56. evalscope/benchmarks/general_qa/__init__.py +5 -0
  57. evalscope/benchmarks/general_qa/general_qa_adapter.py +186 -0
  58. evalscope/benchmarks/gsm8k/__init__.py +5 -0
  59. evalscope/benchmarks/gsm8k/gsm8k.py +127 -0
  60. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +236 -0
  61. evalscope/benchmarks/hellaswag/__init__.py +5 -0
  62. evalscope/benchmarks/hellaswag/hellaswag.py +116 -0
  63. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +222 -0
  64. evalscope/benchmarks/humaneval/__init__.py +5 -0
  65. evalscope/benchmarks/humaneval/humaneval.py +82 -0
  66. evalscope/benchmarks/humaneval/humaneval_adapter.py +21 -0
  67. evalscope/benchmarks/mmlu/__init__.py +5 -0
  68. evalscope/benchmarks/mmlu/mmlu.py +174 -0
  69. evalscope/benchmarks/mmlu/mmlu_adapter.py +375 -0
  70. evalscope/benchmarks/race/__init__.py +5 -0
  71. evalscope/benchmarks/race/race.py +118 -0
  72. evalscope/benchmarks/race/race_adapter.py +229 -0
  73. evalscope/benchmarks/trivia_qa/__init__.py +5 -0
  74. evalscope/benchmarks/trivia_qa/trivia_qa.py +104 -0
  75. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +207 -0
  76. evalscope/benchmarks/truthful_qa/__init__.py +5 -0
  77. evalscope/benchmarks/truthful_qa/truthful_qa.py +167 -0
  78. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +351 -0
  79. evalscope/cache.py +98 -0
  80. evalscope/cli/__init__.py +1 -0
  81. evalscope/cli/base.py +20 -0
  82. evalscope/cli/cli.py +26 -0
  83. evalscope/cli/start_perf.py +37 -0
  84. evalscope/cli/start_server.py +138 -0
  85. evalscope/config.py +165 -0
  86. evalscope/constants.py +150 -0
  87. evalscope/evaluator/__init__.py +3 -0
  88. evalscope/evaluator/evaluator.py +689 -0
  89. evalscope/evaluator/rating_eval.py +178 -0
  90. evalscope/evaluator/reviewer/__init__.py +1 -0
  91. evalscope/evaluator/reviewer/auto_reviewer.py +411 -0
  92. evalscope/metrics/__init__.py +1 -0
  93. evalscope/metrics/bundled_rouge_score/__init__.py +14 -0
  94. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +342 -0
  95. evalscope/metrics/code_metric.py +104 -0
  96. evalscope/metrics/math_accuracy.py +60 -0
  97. evalscope/metrics/metrics.py +405 -0
  98. evalscope/metrics/rouge_metric.py +129 -0
  99. evalscope/models/__init__.py +4 -0
  100. evalscope/models/custom/__init__.py +4 -0
  101. evalscope/models/custom/custom_model.py +53 -0
  102. evalscope/models/dummy_chat_model.py +50 -0
  103. evalscope/models/model.py +88 -0
  104. evalscope/models/model_adapter.py +586 -0
  105. evalscope/models/openai_model.py +103 -0
  106. evalscope/models/template.py +1446 -0
  107. evalscope/perf/__init__.py +0 -0
  108. evalscope/perf/_logging.py +32 -0
  109. evalscope/perf/api_plugin_base.py +60 -0
  110. evalscope/perf/custom_api.py +87 -0
  111. evalscope/perf/dashscope_api.py +84 -0
  112. evalscope/perf/dataset_plugin_base.py +64 -0
  113. evalscope/perf/datasets/__init__.py +0 -0
  114. evalscope/perf/datasets/line_by_line.py +18 -0
  115. evalscope/perf/datasets/longalpaca_12k.py +20 -0
  116. evalscope/perf/datasets/openqa.py +22 -0
  117. evalscope/perf/how_to_analysis_result.py +24 -0
  118. evalscope/perf/http_client.py +756 -0
  119. evalscope/perf/openai_api.py +130 -0
  120. evalscope/perf/plugin_registry.py +35 -0
  121. evalscope/perf/query_parameters.py +42 -0
  122. evalscope/perf/server_sent_event.py +43 -0
  123. evalscope/preprocess/__init__.py +1 -0
  124. evalscope/preprocess/tokenizers/__init__.py +0 -0
  125. evalscope/preprocess/tokenizers/gpt2_tokenizer.py +221 -0
  126. evalscope/registry/__init__.py +1 -0
  127. evalscope/registry/tasks/arc.yaml +29 -0
  128. evalscope/registry/tasks/bbh.yaml +27 -0
  129. evalscope/registry/tasks/bbh_mini.yaml +27 -0
  130. evalscope/registry/tasks/ceval.yaml +27 -0
  131. evalscope/registry/tasks/ceval_mini.yaml +27 -0
  132. evalscope/registry/tasks/cmmlu.yaml +27 -0
  133. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +28 -0
  134. evalscope/registry/tasks/general_qa.yaml +27 -0
  135. evalscope/registry/tasks/gsm8k.yaml +29 -0
  136. evalscope/registry/tasks/mmlu.yaml +29 -0
  137. evalscope/registry/tasks/mmlu_mini.yaml +27 -0
  138. evalscope/run.py +404 -0
  139. evalscope/run_arena.py +204 -0
  140. evalscope/run_ms.py +140 -0
  141. evalscope/summarizer.py +144 -0
  142. evalscope/third_party/__init__.py +1 -0
  143. evalscope/third_party/toolbench_static/__init__.py +3 -0
  144. evalscope/third_party/toolbench_static/eval.py +219 -0
  145. evalscope/third_party/toolbench_static/infer.py +278 -0
  146. evalscope/third_party/toolbench_static/llm/__init__.py +1 -0
  147. evalscope/third_party/toolbench_static/llm/swift_infer.py +45 -0
  148. evalscope/third_party/toolbench_static/toolbench_static.py +50 -0
  149. evalscope/tools/__init__.py +1 -0
  150. evalscope/tools/combine_reports.py +140 -0
  151. evalscope/tools/gen_mmlu_subject_mapping.py +90 -0
  152. evalscope/tools/rewrite_eval_results.py +95 -0
  153. evalscope/utils/__init__.py +4 -0
  154. evalscope/utils/arena_utils.py +247 -0
  155. evalscope/utils/completion_parsers.py +87 -0
  156. evalscope/utils/logger.py +64 -0
  157. evalscope/utils/task_cfg_parser.py +10 -0
  158. evalscope/utils/task_utils.py +19 -0
  159. evalscope/utils/utils.py +625 -0
  160. evalscope/version.py +4 -0
  161. evalscope-0.5.0.dist-info/METADATA +566 -0
  162. evalscope-0.5.0.dist-info/RECORD +165 -0
  163. evalscope-0.5.0.dist-info/WHEEL +5 -0
  164. evalscope-0.5.0.dist-info/entry_points.txt +3 -0
  165. evalscope-0.5.0.dist-info/top_level.txt +1 -0
evalscope/evaluator/evaluator.py
@@ -0,0 +1,689 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os
+import time
+import json
+import re
+from copy import deepcopy
+from collections import OrderedDict
+
+from tqdm import tqdm
+from typing import Optional, List, Any, Union, Dict
+
+from evalscope.benchmarks import DataAdapter
+from evalscope.constants import DEFAULT_ROOT_CACHE_DIR, OutputsStructure, AnswerKeys, ReviewKeys, EvalStage
+from evalscope.models.model_adapter import BaseModelAdapter, CustomModelAdapter
+from evalscope.tools.combine_reports import gen_table
+from evalscope.utils import gen_hash, dict_torch_dtype_to_str, dump_jsonl_data, process_outputs_structure, \
+    normalize_score, dict_to_yaml, jsonl_to_list
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class Evaluator(object):
+
+    """
+    The evaluator for model on datasets.
+
+    Args:
+        dataset_name_or_path: str, the dataset name or path.
+            if the dataset is a local path, e.g. /path/to/your_dataset_name,
+            then the task name will be the basename of the path, which is `your_dataset_name`.
+        data_adapter: DataAdapter, the data adapter for the dataset.
+        subset_list: list, the subset list for the dataset.
+        model_adapter: BaseModelAdapter, the model adapter for the model.
+        use_cache: bool, whether to use local cache. Default: True
+        mem_cache_method: str, the memory cache method. Default: 'ttl' (deprecated)
+        root_cache_dir: str, the root cache dir. Default: DEFAULT_ROOT_CACHE_DIR
+        outputs_dir: str, the outputs dir. Default: ''
+        is_custom_outputs_dir: bool, whether to use custom outputs dir. Default: False (deprecated)
+        datasets_dir: str, the datasets dir. Default: DEFAULT_ROOT_CACHE_DIR
+        datasets_hub: str, the datasets hub. `Local`, `ModelScope` or `HuggingFace`. Default: 'ModelScope'
+        stage: str, the stage of evaluation. `all` or `infer` or `review`. Default: 'all'
+        eval_type: str, the evaluation type. `checkpoint` or `service` or `custom`. Default: 'checkpoint'
+        overall_task_cfg: dict, the overall task config. Default: None
+        **kwargs: kwargs.
+    """
+
+    def __init__(self,
+                 dataset_name_or_path: str,
+                 data_adapter: DataAdapter,
+                 subset_list: Optional[list] = None,
+                 model_adapter: Optional[BaseModelAdapter] = None,
+                 use_cache: bool = True,
+                 mem_cache_method: str = 'ttl',
+                 root_cache_dir: Optional[str] = DEFAULT_ROOT_CACHE_DIR,
+                 outputs_dir: Optional[str] = '',
+                 is_custom_outputs_dir: bool = False,
+                 datasets_dir: Optional[str] = DEFAULT_ROOT_CACHE_DIR,
+                 datasets_hub: Optional[str] = 'ModelScope',
+                 stage: Optional[str] = 'all',  # refer to evalscope.constants.EvalStage
+                 eval_type: Optional[str] = 'checkpoint',  # `checkpoint` or `service` or `custom`
+                 overall_task_cfg: Optional[dict] = None,
+                 **kwargs):
+
+        self.dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
+        self.custom_task_name: str = None
+        if os.path.exists(self.dataset_name_or_path):
+            self.custom_task_name = os.path.basename(self.dataset_name_or_path.rstrip(os.sep))
+
+        self.root_cache_dir = os.path.expanduser(root_cache_dir)
+        self.datasets_dir = os.path.expanduser(datasets_dir)
+        self.kwargs = kwargs
+        self.data_adapter = data_adapter
+        self.model_adapter = model_adapter
+        self.eval_type = eval_type
+        self.stage = stage
+        self.use_cache = use_cache
+        self.overall_task_cfg = overall_task_cfg
+        if isinstance(self.model_adapter, CustomModelAdapter):
+            self.overall_task_cfg.update({'custom_config': self.model_adapter.custom_model.config})
+
+        self.model_cfg = self.model_adapter.model_cfg
+        self.model_id = self.model_cfg['model_id']
+        self.model_revision = self.model_cfg.get('revision', None)
+        self.model_revision_str = self.model_revision if self.model_revision is not None else 'none'
+
+        # Get default outputs_dir
+        # TODO: refactor outputs_dir, del timestamp concat
+        # if not is_custom_outputs_dir:
+        #     outputs_dir = make_outputs_dir(work_dir=outputs_dir,
+        #                                    model_id=self.model_id,
+        #                                    model_revision=self.model_revision_str)
+
+        self.outputs_dir = os.path.expanduser(outputs_dir)
+
+        # Deal with the output paths
+        self.outputs_structure = process_outputs_structure(self.outputs_dir)
+
+        # Load dataset
+        self.dataset = self.data_adapter.load(dataset_name_or_path=dataset_name_or_path,
+                                              subset_list=subset_list,
+                                              work_dir=self.datasets_dir,
+                                              datasets_hub=datasets_hub,
+                                              **kwargs)
+
+        # Get prompts from dataset
+        self.prompts = self.data_adapter.gen_prompts(data_dict=self.dataset)
+        del self.dataset
+
+        # Init memory cache
+        # TODO: refactor mem cache manager
+        # mem_cache_file_name = self.dataset_name_or_path.replace('/', '_') + \
+        #     '_' + self.model_id.replace('/', '_') + \
+        #     '_' + self.model_revision_str + \
+        #     '_cache.pkl'
+        # self.mem_cache_path = os.path.join(self.root_cache_dir, 'mem_cache', mem_cache_file_name)
+
+        # Note: mem_cache is deprecated, use `use_cache` instead
+        self.mem_cache = None
+        self.mem_cache_method = mem_cache_method
+        # if self.use_cache:
+        #     self.mem_cache = init_mem_cache(method=self.mem_cache_method, cache_file_path=self.mem_cache_path)
+        #     logger.info(f'** Using memory cache with size: {len(self.mem_cache)}')
+
+    def _pred_answer(self,
+                     input_d: dict,
+                     infer_cfg: dict,
+                     subset_name: str,
+                     answer_id: str = None) -> dict:
+
+        # Get answer from memory cache
+        if self.mem_cache is not None:
+            if answer_id in self.mem_cache:
+                logger.info(f'** Reusing answer `{answer_id}` in memory cache.')
+                return self.mem_cache[answer_id]
+
+        ans: dict = self.model_adapter.predict(inputs=input_d, infer_cfg=infer_cfg)
+        ans[AnswerKeys.ANSWER_ID] = answer_id
+        ans[AnswerKeys.SUBSET_NAME] = subset_name
+
+        if self.mem_cache is not None:
+            self.mem_cache[answer_id] = ans
+
+        return ans
+
+    def get_answers(self,
+                    subset_name: str,
+                    prompts_list: List[dict],
+                    infer_cfg: dict = None,
+                    debug: bool = False,
+                    **kwargs) -> list:
+        """
+        Get answers from model inference.
+        It is required to rewrite this method to support your own evaluator.
+
+        Args:
+            subset_name: subset name for benchmark.
+            prompts_list: prompts list.
+            infer_cfg: model inference config.
+                Attributes:
+                    do_sample: bool, whether to use sampling.
+                    top_k: int, the number of highest probability vocabulary tokens to keep for top-k-filtering.
+                    top_p: float, if set to float < 1, only the most probable tokens with probabilities to add.
+                    temperature: float, the value used to module the next token probabilities.
+                    num_beams: int, number of beams for beam search. 1 means no beam search.
+                    max_length: int, the max length of the sequence to be generated.
+                    max_new_tokens: int, the max number of new tokens to be generated.
+                    repetition_penalty: float, the parameter for repetition penalty. 1.0 means no penalty.
+            debug: whether to run in debug mode.
+            **kwargs: kwargs.
+
+        Returns: The list of answers.
+        """
+        assert self.data_adapter is not None, 'data_adapter must be provided when calling func get_answers() !'
+        assert self.model_adapter is not None, 'model must be provided when calling func get_answers() !'
+
+        answers_list = []
+        pred_dir: str = self.outputs_structure.get(OutputsStructure.PREDICTIONS_DIR)
+
+        if self.custom_task_name:
+            pred_file_name: str = self.custom_task_name + '_' + subset_name + '.jsonl'
+        else:
+            pred_file_name: str = self.dataset_name_or_path.replace(os.sep, '_') + '_' + subset_name + '.jsonl'
+
+        pred_file_path: str = os.path.join(pred_dir, pred_file_name)
+
+        if self.use_cache and os.path.exists(pred_file_path):
+            answers_list = jsonl_to_list(pred_file_path)
+            logger.info(f'** Reusing predictions from {pred_file_path}, got {len(answers_list)} answers.')
+
+            return answers_list
+
+        if isinstance(self.model_adapter, CustomModelAdapter):
+            # Batch inference for custom model
+
+            resp_answers_list: List[Dict[str, Any]] = self.model_adapter.predict(inputs=prompts_list,
+                                                                                 infer_cfg=infer_cfg)
+
+            assert len(prompts_list) == len(resp_answers_list), \
+                f'Length of prompts_list({len(prompts_list)}) != Length of resp_answers_list({len(resp_answers_list)})'
+
+            for in_d, resp_d in zip(prompts_list, resp_answers_list):
+
+                # Gen answer_id (concat: model_cfg + input_prompt + infer_cfg)
+                model_cfg_str = json.dumps(
+                    OrderedDict(sorted(dict_torch_dtype_to_str(self.model_adapter.model_cfg).items())),
+                    ensure_ascii=False)
+                input_prompt_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(in_d).items())),
+                                              ensure_ascii=False)
+                infer_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())),
+                                           ensure_ascii=False)
+                answer_id = 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
+
+                resp_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
+                resp_d[AnswerKeys.ANSWER_ID] = answer_id
+                resp_d[AnswerKeys.SUBSET_NAME] = subset_name
+                resp_d[AnswerKeys.RAW_INPUT] = in_d[AnswerKeys.RAW_INPUT]
+                resp_d[AnswerKeys.ORIGIN_PROMPT] = in_d
+
+                answers_list.append(resp_d)
+
+        else:
+            for input_prompt in tqdm(prompts_list, total=len(prompts_list), desc=f'Predicting({subset_name}): '):
+
+                # Gen answer_id (concat: model_cfg + input_prompt + infer_cfg)
+                model_cfg_str = json.dumps(
+                    OrderedDict(sorted(dict_torch_dtype_to_str(self.model_adapter.model_cfg).items())),
+                    ensure_ascii=False)
+                input_prompt_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(input_prompt).items())),
+                                              ensure_ascii=False)
+                infer_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())),
+                                           ensure_ascii=False)
+                answer_id = 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
+
+                # Get answers
+                answer_d: dict = self._pred_answer(input_d=input_prompt,
+                                                   infer_cfg=infer_cfg,
+                                                   subset_name=subset_name,
+                                                   answer_id=answer_id)
+
+                answer_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
+                answer_d[AnswerKeys.RAW_INPUT] = input_prompt[AnswerKeys.RAW_INPUT]
+                answer_d[AnswerKeys.ORIGIN_PROMPT] = input_prompt
+
+                if debug:
+                    logger.debug(f'**input_prompt: {json.dumps(input_prompt, ensure_ascii=False)} \n')
+                    logger.debug(f'**predicted ans: {json.dumps(answer_d, ensure_ascii=False)} \n')
+
+                answers_list.append(answer_d)
+
+        if len(answers_list) == 0:
+            logger.error(f'** Got empty predictions on subset {subset_name} of dataset: {self.dataset_name_or_path}')
+
+        # Dump answers
+        os.makedirs(pred_dir, exist_ok=True)
+        dump_jsonl_data(answers_list, pred_file_path)
+
+        return answers_list
+
+    def _get_review(self,
+                    answer_d: dict,
+                    review_id: str = None,
+                    reviewer_spec: dict = None) -> dict:
+
+        # Get review from memory cache
+        if self.mem_cache is not None:
+            if review_id in self.mem_cache:
+                logger.info(f'** Reusing review `{review_id}` in memory cache.')
+                return self.mem_cache[review_id]
+
+        if reviewer_spec is None:
+            reviewer_spec = {}
+
+        review_res = deepcopy(answer_d)
+        choices = review_res[AnswerKeys.CHOICES]
+        if len(choices) == 0:
+            review_res[ReviewKeys.REVIEWED] = False
+            review_res[ReviewKeys.REVIEW_ID] = None
+            review_res[ReviewKeys.REVIEWER_SPEC] = reviewer_spec
+            review_res[ReviewKeys.REVIEW_TIME] = time.time()
+            return review_res
+
+        rev_choices = []
+        for choice in choices:
+            raw_input_d: dict = review_res[AnswerKeys.RAW_INPUT]
+            answer_content = choice[ReviewKeys.MESSAGE][ReviewKeys.CONTENT]
+            answer_content = self.data_adapter.parse_pred_result(result=answer_content,
+                                                                 raw_input_d=raw_input_d,
+                                                                 eval_type=self.eval_type)
+            gold_content = self.data_adapter.get_gold_answer(raw_input_d)
+
+            review_result = self.data_adapter.match(gold_content, answer_content)
+            choice[ReviewKeys.REVIEW] = {ReviewKeys.GOLD: gold_content,
+                                         ReviewKeys.PRED: answer_content,
+                                         ReviewKeys.RESULT: review_result}
+
+            rev_choices.append(choice)
+
+        review_res[AnswerKeys.CHOICES] = rev_choices
+        review_res[ReviewKeys.REVIEWED] = True
+        review_res[ReviewKeys.REVIEW_ID] = review_id
+        review_res[ReviewKeys.REVIEWER_SPEC] = reviewer_spec
+        review_res[ReviewKeys.REVIEW_TIME] = time.time()
+
+        if self.mem_cache is not None:
+            self.mem_cache[review_id] = review_res
+
+        return review_res
+
+    def get_reviews(self, subset_name: str, answers_list: List[dict], debug: bool = False, **kwargs) -> list:
+        """
+        Get reviews from answers.
+        It is required to rewrite this method to support your own evaluator.
+
+        Args:
+            subset_name: subset name of benchmark
+            answers_list: inference results list.
+            debug: whether to run in debug mode.
+            **kwargs: kwargs.
+
+        Returns: reviews list.
+        """
+        reviews_list = []
+
+        review_dir: str = self.outputs_structure.get(OutputsStructure.REVIEWS_DIR)
+        if self.custom_task_name:
+            review_file_name: str = self.custom_task_name + '_' + subset_name + '.jsonl'
+        else:
+            review_file_name: str = self.dataset_name_or_path.replace(os.sep, '_') + '_' + subset_name + '.jsonl'
+        review_file_path: str = os.path.join(review_dir, review_file_name)
+
+        if self.use_cache and os.path.exists(review_file_path):
+            logger.warning(f'** Ignore use_cache={self.use_cache}, updating the review file: {review_file_path} ...')
+
+        for answer_d in tqdm(answers_list, total=len(answers_list), desc=f'Reviewing({subset_name}): '):
+
+            # Gen review_id (concat: answer_id + reviewer_spec)
+            answer_id = answer_d[AnswerKeys.ANSWER_ID]
+
+            reviewer_spec: dict = {'metric': [metric_d['name'] for metric_d in self.data_adapter.metric_list],
+                                   'reviewer': ['Evaluator'],
+                                   'revision': ['default']}
+            reviewer_spec_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())),
+                                           ensure_ascii=False)
+            review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)
+
+            # Get review
+            review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
+
+            if debug:
+                logger.debug(review_d)
+
+            reviews_list.append(review_d)
+
+        # Dump reviews
+        os.makedirs(review_dir, exist_ok=True)
+        dump_jsonl_data(reviews_list, review_file_path)
+
+        return reviews_list
+
+
362
+ def compute_metrics(self, reviews_list: List[dict]) -> Any:
363
+ """
364
+ To compute metrics from reviews_list for each subset.
365
+ It is required to rewrite this method to support your own evaluator.
366
+
367
+ Args:
368
+ reviews_list: reviews list.
369
+
370
+ Returns:
371
+ The metric result. Depends on the metric function in data_adapter.
372
+ """
373
+
374
+ review_res_list = []
375
+ for review_d in reviews_list:
376
+ if not review_d[ReviewKeys.REVIEWED]:
377
+ logger.warning(f'** Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
378
+ continue
379
+
380
+ review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
381
+ review_res_list.append(review_res)
382
+
383
+ metric_score: Union[float, dict] = self.data_adapter.compute_metric(review_res_list=review_res_list)
384
+
385
+ return metric_score
386
+
387
+    def dump_report(self, report_map: dict, use_table: bool = True):
+        """
+        Get report for total reviews of specific dataset.
+        It is required to rewrite this method to support your own evaluator.
+
+        Args:
+            report_map: report dict. Generated by func self.data_adapter.gen_report().
+            use_table: whether to generate table for reports. Default to True.
+
+        Returns: None
+        """
+
+        # Dump report
+        report_dir: str = self.outputs_structure[OutputsStructure.REPORTS_DIR]
+
+        if self.custom_task_name:
+            report_file_name: str = self.custom_task_name + '.json'
+        else:
+            report_file_name: str = self.dataset_name_or_path.replace(os.sep, '_') + '.json'
+
+        os.makedirs(report_dir, exist_ok=True)
+        report_path: str = os.path.join(report_dir, report_file_name)
+        with open(report_path, 'w') as f:
+            f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
+        # logger.info(f'** Dump report to {report_path} \n')
+        logger.info(f'** Dump report: {report_file_name} \n')
+
+        if use_table:
+            try:
+                # Make table
+                report_table: str = gen_table([report_dir])
+                logger.info(f'** Report table: \n {report_table} \n')
+            except:
+                logger.error('Failed to generate report table.')
+
+    # def save_cache(self):
+    #     if self.mem_cache is not None:
+    #         logger.info(f'** Saving memory cache with size: {len(self.mem_cache)}')
+    #         Cache.save(cache=self.mem_cache, path=self.mem_cache_path)
+
+    # def clear_cache(self):
+    #     """
+    #     Clear memory cache.
+    #
+    #     Returns: None
+    #     """
+    #     if self.mem_cache is not None:
+    #         cache_len = len(self.mem_cache)
+    #         self.mem_cache.clear()
+    #         logger.info(f'** Memory cache cleared, length changed: {cache_len} -> {len(self.mem_cache)}')
+
+    def eval(self,
+             infer_cfg: dict = None,
+             debug: bool = False,
+             **kwargs) -> dict:
+        """
+        Evaluate the model on the specific benchmark. Streaming & parallel mode is supported.
+        It is required to rewrite this method to support your own evaluator.
+
+        The evaluation process is as follows:
+            1. Get the input samples from the dataset (benchmarks on the ModelScope or HuggingFace).
+            2. Get the input prompts from dataset with specific data adapter.
+            3. Get answers with model inference.
+            4. Get reviews with metric function (or reviewers).
+            5. Generate report from review results.
+
+        Args:
+            infer_cfg: The config for model inference.
+            debug: Whether to run in debug mode. Default: False.
+
+        Returns:
+            Dict of results. Depends on the stage of evaluation.
+
+            stage == 'all': return the report_map
+            stage == 'infer': return the answers_map
+            stage == 'review': return the reviews_map
+        """
+
+        logger.info(f'**** Start evaluating on dataset {self.dataset_name_or_path} ****')
+
+        reviews_score_all = {}  # {subset_name: (score, num)}
+        stage_answers_dict = {}
+        stage_reviews_dict = {}
+
+        for subset_name, prompts_list in self.prompts.items():
+            limit = infer_cfg.get('limit', len(prompts_list))
+            prompts_list = prompts_list[:limit]
+
+            answers_list: list = self.get_answers(subset_name=subset_name,
+                                                  prompts_list=prompts_list,
+                                                  infer_cfg=infer_cfg,
+                                                  debug=debug,
+                                                  **kwargs)
+            if self.stage == EvalStage.INFER:
+                stage_answers_dict[subset_name] = answers_list
+                continue
+
+            reviews_list: list = self.get_reviews(subset_name=subset_name,
+                                                  answers_list=answers_list,
+                                                  debug=debug,
+                                                  **kwargs)
+
+            metric_res = self.compute_metrics(reviews_list=reviews_list)
+            reviews_score_all[subset_name] = (metric_res, len(reviews_list))
+            stage_reviews_dict[subset_name] = reviews_list
+
+        if self.stage == EvalStage.INFER:
+            return stage_answers_dict
+
+        if self.stage == EvalStage.REVIEW:
+            return stage_reviews_dict
+
+        # Generate report
+        report_map: dict = self.data_adapter.gen_report(subset_score_map=reviews_score_all,
+                                                        report_name=self.custom_task_name)
+        self.dump_report(report_map=report_map)
+
+        # Dump overall task config
+        overall_task_cfg_file: str = os.path.join(self.outputs_structure.get(OutputsStructure.CONFIGS_DIR),
+                                                  'task_output_config.yaml')
+        overall_task_cfg_file = os.path.abspath(overall_task_cfg_file)
+
+        # TODO: check the robustness of dump yaml
+        try:
+            logger.info(f'** Dump overall task config to {overall_task_cfg_file}')
+            logger.info(f'** The overall task config:\n {self.overall_task_cfg}')
+            if 'model' in self.overall_task_cfg and not isinstance(self.overall_task_cfg['model'], str):
+                self.overall_task_cfg['model'] = None
+                logger.info(f'>> Overwrite overall_task_cfg for `model` due to it is not a string')
+            if 'model_args' in self.overall_task_cfg and self.overall_task_cfg.get('model_args') is not None:
+                self.overall_task_cfg['model_args'].update({'precision': str(self.overall_task_cfg['model_args']['precision'])})
+                logger.info(f'>> Overwrite overall_task_cfg for `model_args.precision` due to it is not a string')
+
+            dict_to_yaml(self.overall_task_cfg, overall_task_cfg_file)
+        except Exception as e:
+            logger.warning(f'Failed to dump overall task config: {e}')
+
+        # Note: deprecated
+        # self.save_cache()
+        # self.clear_cache()
+
+        logger.info(f'\n**** Evaluation finished on {self.dataset_name_or_path} ****\n')
+
+        return report_map
+
+
+class HumanevalEvaluator(object):
+
+    def __init__(self,
+                 problem_file: str,
+                 model_id: str,
+                 model_revision: str,
+                 model_adapter: BaseModelAdapter,
+                 outputs_dir: Optional[str] = '',
+                 is_custom_outputs_dir: bool = False,
+                 k: List[int] = [1, 10, 100],
+                 n_workers: int = 4,
+                 timeout: float = 3.0,):
+        try:
+            from human_eval.evaluation import evaluate_functional_correctness
+            from human_eval.data import read_problems, write_jsonl
+        except ImportError:
+            raise ImportError('Please install human_eval:'
+                              'https://github.com/openai/human-eval/tree/master#installation , '
+                              'Note that you need to enable the execution code in the human_eval/execution.py first.')
+
+        self.problem_file = problem_file
+        self.k = k
+        self.num_workers = n_workers
+        self.timeout = timeout
+        self.model_adapter = model_adapter
+
+        self.read_problems_func = read_problems
+        self.write_jsonl_func = write_jsonl
+        self.eval_func = evaluate_functional_correctness
+
+        # {'task_id': {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}, ...}
+        self.problems = self.read_problems_func(self.problem_file)
+
+        # Get default outputs_dir
+        model_revision_str: str = model_revision if model_revision is not None else 'none'
+        # if not is_custom_outputs_dir:
+        #     outputs_dir = make_outputs_dir(work_dir=outputs_dir,
+        #                                    model_id=model_id,
+        #                                    model_revision=model_revision_str)
+        self.outputs_dir = os.path.expanduser(outputs_dir)
+
+        # Deal with the output paths
+        self.outputs_structure = process_outputs_structure(self.outputs_dir)
+
+    def get_answers(self, infer_cfg: dict) -> List[dict]:
+        ans_list: list = []
+        system_prompt: str = 'Complete the following python code:\n'
+        for task_id, data_d in tqdm(self.problems.items(), total=len(self.problems), desc='Predicting(problems)'):
+            prompt: str = system_prompt + data_d['prompt']
+            inputs: dict = {'data': [prompt]}
+            # pred_res: dict = self.model_adapter.predict(inputs)
+
+            pred_res: dict = self.model_adapter.predict(inputs=inputs, infer_cfg=infer_cfg)
+
+            pred_ans: str = pred_res['choices'][0]['message']['content']
+            pred_ans = self._postprocess(pred_ans)
+
+            ans_list.append({'task_id': task_id, 'completion': pred_ans})
+
+        return ans_list
+
+    def eval(self, infer_cfg: dict, **kwargs):
+
+        # predict
+        ans_list: list = self.get_answers(infer_cfg)
+        ans_out_file: str = os.path.join(self.outputs_structure.get(OutputsStructure.PREDICTIONS_DIR),
+                                         'human_eval_predictions.jsonl')
+
+        self.write_jsonl_func(filename=ans_out_file, data=ans_list)
+        # logger.info(f'** Dump predictions to {ans_out_file} successfully.')
+        logger.info('** Dump predictions successfully.')
+
+        # evaluate results: e.g. {'pass@1': 0.333, 'pass@10': 0.111}
+        results = self.eval_func(sample_file=ans_out_file,
+                                 k=self.k,
+                                 n_workers=self.num_workers,
+                                 timeout=self.timeout,
+                                 problem_file=self.problem_file)
+
+        # output: report
+        report_map: dict = self.gen_report(results=results)
+        report_dir: str = self.outputs_structure.get(OutputsStructure.REPORTS_DIR)
+        report_file: str = os.path.join(report_dir, 'human_eval_report.json')
+
+        with open(report_file, 'w') as f:
+            f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
+        # logger.info(f'** Dump report to {report_file} \n')
+        logger.info(f'** Dump report \n')
+
+        try:
+            # Make table
+            report_table: str = gen_table([report_dir])
+            logger.info(f'** Report table: \n {report_table} \n')
+        except:
+            logger.error('Failed to generate report table.')
+
+    def gen_report(self, results: dict) -> dict:
+        """
+        Generate report from evaluation results.
+
+        Returns:
+            {
+                "name":"ARC-Challenge",
+                "metric":"WeightedAverageAccuracy",
+                "score":0.3389,
+                "category":[
+                    {
+                        "name":"DEFAULT",
+                        "score":0.3389,
+                        "subset":[
+                            {
+                                "name":"ARC-Challenge",
+                                "score":0.3389
+                            },
+                        ]
+                    }
+                ],
+                "total_num":100
+            }
+        """
+        results = {k: normalize_score(score=v) for k, v in results.items()}
+
+        category_d = dict(name='DEFAULT',
+                          score=results,
+                          subset=[])
+
+        res_map = dict(name='HumanEval',
+                       metric='pass@k',
+                       score=results,
+                       category=[category_d],
+                       total_num=len(self.problems))
+
+        return res_map
+
+    @classmethod
+    def _postprocess(cls, text: str) -> str:
+        if '```' in text:
+            blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
+            if len(blocks) == 0:
+                text = text.split('```')[1]  # fall back to default strategy
+            else:
+                text = blocks[0]  # fetch the first code block
+                if not text.startswith('\n'):  # in case starting with ```python
+                    text = text[max(text.find('\n') + 1, 0):]
+        if text.strip().startswith('from') or text.strip().startswith('import'):
+            def_idx = text.find('def')
+            if def_idx != -1:
+                text = text[max(text.find('\n', def_idx) + 1, 0):]
+        text = text.split('\n\n')[0]
+        if text.strip().startswith('def'):
+            text = '\n'.join(text.split('\n')[1:])
+        if not text.startswith('    '):
+            if text.startswith(' '):
+                text = '    ' + text.lstrip()
+            else:
+                text = '\n'.join(['    ' + line for line in text.split('\n')])
+        return text
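
Note: the docstrings in the diff above describe the run loop (load dataset via a DataAdapter, generate prompts, predict with a model adapter, review against gold answers, dump a report). As a quick orientation, here is a minimal usage sketch assembled only from the signatures visible in this file; the two adapter objects are placeholders, since concrete DataAdapter / BaseModelAdapter construction lives in other modules of this wheel (e.g. the *_adapter.py files listed above), and the infer_cfg values are illustrative, not package defaults.

# Minimal usage sketch for the Evaluator defined in this diff (assumptions noted inline).
from evalscope.benchmarks import DataAdapter
from evalscope.evaluator.evaluator import Evaluator
from evalscope.models.model_adapter import BaseModelAdapter

data_adapter: DataAdapter = ...        # placeholder: a concrete adapter, e.g. from one of the *_adapter.py modules
model_adapter: BaseModelAdapter = ...  # placeholder: a checkpoint/service adapter built elsewhere in the package

evaluator = Evaluator(
    dataset_name_or_path='arc',        # or a local path; its basename then becomes the task name
    data_adapter=data_adapter,
    model_adapter=model_adapter,
    datasets_hub='ModelScope',         # `Local`, `ModelScope` or `HuggingFace`
    stage='all',                       # 'all' | 'infer' | 'review' (see EvalStage)
    eval_type='checkpoint',            # 'checkpoint' | 'service' | 'custom'
    overall_task_cfg={},               # dumped to configs/task_output_config.yaml at the end of eval()
)

# infer_cfg keys follow the get_answers() docstring; eval() additionally reads an
# optional 'limit' key to truncate each subset's prompt list.
report_map = evaluator.eval(infer_cfg={'do_sample': False, 'max_new_tokens': 512, 'limit': 10})
print(report_map)  # with stage='all', eval() returns the generated report dict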