evalscope 0.8.2__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff compares the contents of two publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
Files changed (106)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +11 -3
  3. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  4. evalscope/backend/rag_eval/utils/llm.py +1 -1
  5. evalscope/benchmarks/__init__.py +20 -1
  6. evalscope/benchmarks/arc/__init__.py +0 -5
  7. evalscope/benchmarks/arc/arc_adapter.py +24 -102
  8. evalscope/benchmarks/bbh/__init__.py +0 -4
  9. evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
  10. evalscope/benchmarks/benchmark.py +70 -59
  11. evalscope/benchmarks/ceval/__init__.py +0 -5
  12. evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
  13. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  14. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
  15. evalscope/benchmarks/competition_math/__init__.py +0 -5
  16. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  17. evalscope/benchmarks/data_adapter.py +115 -87
  18. evalscope/benchmarks/general_qa/__init__.py +0 -5
  19. evalscope/benchmarks/general_qa/general_qa_adapter.py +23 -79
  20. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  21. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +21 -101
  22. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  23. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +32 -99
  24. evalscope/benchmarks/humaneval/__init__.py +0 -4
  25. evalscope/benchmarks/humaneval/humaneval_adapter.py +18 -120
  26. evalscope/benchmarks/ifeval/__init__.py +0 -0
  27. evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
  28. evalscope/benchmarks/ifeval/instructions.py +1478 -0
  29. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  30. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  31. evalscope/benchmarks/ifeval/utils.py +134 -0
  32. evalscope/benchmarks/iquiz/__init__.py +0 -0
  33. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  34. evalscope/benchmarks/mmlu/__init__.py +0 -5
  35. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
  36. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  37. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  38. evalscope/benchmarks/race/__init__.py +0 -5
  39. evalscope/benchmarks/race/race_adapter.py +26 -123
  40. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  41. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
  42. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  43. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
  44. evalscope/cli/cli.py +2 -0
  45. evalscope/cli/start_app.py +29 -0
  46. evalscope/collections/__init__.py +3 -0
  47. evalscope/collections/evaluator.py +198 -0
  48. evalscope/collections/sampler.py +138 -0
  49. evalscope/collections/schema.py +126 -0
  50. evalscope/config.py +7 -5
  51. evalscope/constants.py +9 -26
  52. evalscope/evaluator/evaluator.py +87 -121
  53. evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
  54. evalscope/metrics/__init__.py +3 -0
  55. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  56. evalscope/metrics/math_accuracy.py +193 -50
  57. evalscope/metrics/metrics.py +18 -6
  58. evalscope/metrics/named_metrics.py +17 -0
  59. evalscope/metrics/rouge_metric.py +13 -8
  60. evalscope/models/__init__.py +14 -1
  61. evalscope/models/base_adapter.py +52 -0
  62. evalscope/models/chat_adapter.py +138 -0
  63. evalscope/models/choice_adapter.py +211 -0
  64. evalscope/models/custom_adapter.py +67 -0
  65. evalscope/models/local_model.py +74 -0
  66. evalscope/models/model.py +141 -0
  67. evalscope/models/server_adapter.py +111 -0
  68. evalscope/perf/__init__.py +1 -0
  69. evalscope/perf/main.py +0 -1
  70. evalscope/perf/plugin/api/custom_api.py +1 -1
  71. evalscope/perf/plugin/api/openai_api.py +1 -1
  72. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  73. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  74. evalscope/report/__init__.py +5 -0
  75. evalscope/report/app.py +506 -0
  76. evalscope/report/combinator.py +73 -0
  77. evalscope/report/generator.py +80 -0
  78. evalscope/report/utils.py +133 -0
  79. evalscope/run.py +48 -72
  80. evalscope/run_arena.py +1 -1
  81. evalscope/summarizer.py +1 -1
  82. evalscope/utils/__init__.py +1 -1
  83. evalscope/utils/chat_service.py +5 -4
  84. evalscope/utils/io_utils.py +8 -0
  85. evalscope/utils/logger.py +5 -0
  86. evalscope/utils/model_utils.py +15 -2
  87. evalscope/utils/utils.py +3 -25
  88. evalscope/version.py +2 -2
  89. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/METADATA +115 -21
  90. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/RECORD +99 -78
  91. tests/cli/test_collection.py +57 -0
  92. tests/cli/test_run.py +52 -1
  93. tests/rag/test_mteb.py +3 -2
  94. evalscope/models/api/__init__.py +0 -3
  95. evalscope/models/dummy_chat_model.py +0 -49
  96. evalscope/models/model_adapter.py +0 -525
  97. evalscope/models/openai_model.py +0 -103
  98. evalscope/tools/__init__.py +0 -1
  99. evalscope/tools/combine_reports.py +0 -133
  100. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  101. /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
  102. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  103. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
  104. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
  105. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
  106. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
@@ -10,10 +10,9 @@ from typing import Any, Dict, List, Optional, Union
 
 from evalscope.benchmarks import DataAdapter
 from evalscope.config import TaskConfig
-from evalscope.constants import (DEFAULT_DATASET_CACHE_DIR, AnswerKeys, DumpMode, EvalStage, EvalType, HubType,
-                                 ReviewKeys)
-from evalscope.models.model_adapter import BaseModelAdapter, CustomModelAdapter
-from evalscope.tools.combine_reports import gen_table
+from evalscope.constants import AnswerKeys, DumpMode, EvalStage, ReviewKeys
+from evalscope.models import BaseModelAdapter, CustomModelAdapter
+from evalscope.report import Report, gen_table
 from evalscope.utils import dict_torch_dtype_to_str, gen_hash
 from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
 from evalscope.utils.logger import get_logger
@@ -30,73 +29,63 @@ class Evaluator(object):
             if the dataset is a local path, e.g. /path/to/your_dataset_name,
             then the task name will be the basename of the path, which is `your_dataset_name`.
         data_adapter: DataAdapter, the data adapter for the dataset.
-        subset_list: list, the subset list for the dataset.
         model_adapter: BaseModelAdapter, the model adapter for the model.
-        use_cache: str, path to local cache. Default: None
-        outputs_dir: OutputsStructure, the outputs dir. Default: None
-        datasets_dir: str, the datasets dir. Default: DEFAULT_ROOT_CACHE_DIR
-        datasets_hub: str, the datasets hub. `Local`, `ModelScope` or `HuggingFace`. Default: 'ModelScope'
-        stage: str, the stage of evaluation. `all` or `infer` or `review`. Default: 'all'
-        eval_type: str, the evaluation type. `checkpoint` or `service` or `custom`. Default: 'checkpoint'
-        overall_task_cfg: dict, the overall task config. Default: None
+        outputs: OutputsStructure, the outputs dir. Default: None
+        task_cfg: TaskConfig, the overall task config. Default: None
         **kwargs: kwargs.
     """
 
     def __init__(self,
                  dataset_name_or_path: str,
                  data_adapter: DataAdapter,
-                 subset_list: Optional[list] = None,
-                 model_adapter: Optional[BaseModelAdapter] = None,
-                 use_cache: Optional[str] = None,
-                 outputs: Optional[OutputsStructure] = None,
-                 datasets_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
-                 datasets_hub: Optional[str] = HubType.MODELSCOPE,
-                 stage: Optional[str] = EvalStage.ALL,
-                 eval_type: Optional[str] = EvalType.CHECKPOINT,
-                 overall_task_cfg: Optional[TaskConfig] = None,
+                 model_adapter: BaseModelAdapter,
+                 outputs: OutputsStructure = None,
+                 task_cfg: TaskConfig = None,
                  **kwargs):
 
+        self.dataset_name = data_adapter.name
         self.dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
-        self.dataset_name = os.path.basename(self.dataset_name_or_path.rstrip(os.sep)).split('.')[0]
-        self.model_name = overall_task_cfg.model_id
+        self.model_name = task_cfg.model_id
        self.custom_task_name = f'{self.model_name}_{self.dataset_name}'
 
-        self.datasets_dir = os.path.expanduser(datasets_dir)
-        self.kwargs = kwargs
         self.data_adapter = data_adapter
         self.model_adapter = model_adapter
-        self.eval_type = eval_type
-        self.stage = stage
-        self.use_cache = use_cache
-        self.overall_task_cfg = overall_task_cfg
-        if isinstance(self.model_adapter, CustomModelAdapter):
-            self.overall_task_cfg.model_args = self.model_adapter.custom_model.config
-
-        self.model_cfg = self.model_adapter.model_cfg
-
+        self.model_cfg = model_adapter.model_cfg
+        self.eval_type = task_cfg.eval_type
+        self.dataset_hub = task_cfg.dataset_hub
+        self.stage = task_cfg.stage
+        self.use_cache = task_cfg.use_cache
+        self.task_cfg = task_cfg
         # Deal with the output paths
         self.outputs_structure = outputs
 
-        # Load dataset
-        self.dataset = self.data_adapter.load(
-            dataset_name_or_path=dataset_name_or_path,
-            subset_list=subset_list,
-            work_dir=self.datasets_dir,
-            datasets_hub=datasets_hub,
-            **kwargs)
-
-        # Get prompts from dataset
-        # TODO: support sampler
-        self.prompts = self.data_adapter.gen_prompts(data_dict=self.dataset)
-        del self.dataset
-
-    def _pred_answer(self, input_d: dict, infer_cfg: dict, subset_name: str, answer_id: str = None) -> dict:
+        self.kwargs = kwargs
 
-        ans: dict = self.model_adapter.predict(inputs=input_d, infer_cfg=infer_cfg)
-        ans[AnswerKeys.ANSWER_ID] = answer_id
-        ans[AnswerKeys.SUBSET_NAME] = subset_name
+    def load_dataset(self):
+        dataset = self.data_adapter.load(
+            dataset_name_or_path=self.dataset_name_or_path,
+            subset_list=self.data_adapter.subset_list,
+            work_dir=os.path.expanduser(self.task_cfg.dataset_dir),
+            datasets_hub=self.dataset_hub,
+            **self.kwargs)
 
-        return ans
+        # Get prompts from dataset
+        prompts = self.data_adapter.gen_prompts(data_dict=dataset)
+        return prompts
+
+    def _generate_answer_id(self, model_cfg, input_d, infer_cfg):
+        model_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(model_cfg).items())), ensure_ascii=False)
+        input_prompt_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(input_d).items())), ensure_ascii=False)
+        infer_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
+        return 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
+
+    def _process_answer(self, answer_d, input_d, subset_name, answer_id):
+        answer_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
+        answer_d[AnswerKeys.ANSWER_ID] = answer_id
+        answer_d[AnswerKeys.SUBSET_NAME] = subset_name
+        answer_d[AnswerKeys.RAW_INPUT] = input_d[AnswerKeys.RAW_INPUT]
+        answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
+        return answer_d
 
     def get_answers(self,
                     subset_name: str,
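The `_generate_answer_id` helper added above makes the prediction cache key explicit: the model config, the input prompt, and the inference config are each serialized with sorted keys and the concatenation is hashed. Below is a minimal standalone sketch of that scheme, not evalscope's implementation: `gen_hash` is not shown in this diff (SHA-256 stands in for it), `dict_torch_dtype_to_str` is assumed to only make torch dtypes JSON-serializable, and the example configs are made up.

```python
import hashlib
import json
from collections import OrderedDict


def stable_dumps(d: dict) -> str:
    # Mirrors the json.dumps(OrderedDict(sorted(...)), ensure_ascii=False) calls above,
    # minus the dict_torch_dtype_to_str pass (assumed to only stringify torch dtypes).
    return json.dumps(OrderedDict(sorted(d.items())), ensure_ascii=False)


def make_answer_id(model_cfg: dict, input_d: dict, infer_cfg: dict) -> str:
    # SHA-256 is a stand-in for evalscope's gen_hash, which is not part of this diff.
    payload = stable_dumps(model_cfg) + stable_dumps(input_d) + stable_dumps(infer_cfg)
    return 'answer-' + hashlib.sha256(payload.encode('utf-8')).hexdigest()


if __name__ == '__main__':
    # Illustrative values only; the real dicts come from the model adapter and task config.
    model_cfg = {'model_id': 'my-model', 'torch_dtype': 'bfloat16'}
    input_d = {'prompt': 'What is 2 + 2?'}
    infer_cfg = {'max_new_tokens': 512, 'temperature': 0.0}
    # The same inputs always reproduce the same id, which is what makes the
    # dumped predictions reusable when use_cache is set.
    print(make_answer_id(model_cfg, input_d, infer_cfg))
```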
@@ -147,57 +136,24 @@ class Evaluator(object):
             resp_answers_list: List[Dict[str, Any]] = self.model_adapter.predict(
                 inputs=prompts_list, infer_cfg=infer_cfg)
 
-            assert len(prompts_list) == len(resp_answers_list), \
-                f'Length of prompts_list({len(prompts_list)}) != Length of resp_answers_list({len(resp_answers_list)})'
-
-            for in_d, resp_d in zip(prompts_list, resp_answers_list):
-
-                # Gen answer_id (concat: model_cfg + input_prompt + infer_cfg)
-                model_cfg_str = json.dumps(
-                    OrderedDict(sorted(dict_torch_dtype_to_str(self.model_adapter.model_cfg).items())),
-                    ensure_ascii=False)
-                input_prompt_str = json.dumps(
-                    OrderedDict(sorted(dict_torch_dtype_to_str(in_d).items())), ensure_ascii=False)
-                infer_cfg_str = json.dumps(
-                    OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
-                answer_id = 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
-
-                resp_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
-                resp_d[AnswerKeys.ANSWER_ID] = answer_id
-                resp_d[AnswerKeys.SUBSET_NAME] = subset_name
-                resp_d[AnswerKeys.RAW_INPUT] = in_d[AnswerKeys.RAW_INPUT]
-                resp_d[AnswerKeys.ORIGIN_PROMPT] = in_d
-
-                answers_list.append(resp_d)
-                dump_jsonl_data(resp_d, pred_file_path, dump_mode=DumpMode.APPEND)
+            for input_prompt, answer_d in zip(prompts_list, resp_answers_list):
+                answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+                processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
+                answers_list.append(processed_answer)
+                dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)
 
         else:
             for input_prompt in tqdm(prompts_list, total=len(prompts_list), desc=f'Predicting({subset_name}): '):
-
-                # Gen answer_id (concat: model_cfg + input_prompt + infer_cfg)
-                model_cfg_str = json.dumps(
-                    OrderedDict(sorted(dict_torch_dtype_to_str(self.model_adapter.model_cfg).items())),
-                    ensure_ascii=False)
-                input_prompt_str = json.dumps(
-                    OrderedDict(sorted(dict_torch_dtype_to_str(input_prompt).items())), ensure_ascii=False)
-                infer_cfg_str = json.dumps(
-                    OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
-                answer_id = 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
-
-                # Get answers
-                answer_d: dict = self._pred_answer(
-                    input_d=input_prompt, infer_cfg=infer_cfg, subset_name=subset_name, answer_id=answer_id)
-
-                answer_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
-                answer_d[AnswerKeys.RAW_INPUT] = input_prompt[AnswerKeys.RAW_INPUT]
-                answer_d[AnswerKeys.ORIGIN_PROMPT] = input_prompt
+                answer_d: dict = self.model_adapter.predict(inputs=input_prompt, infer_cfg=infer_cfg)
+                answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+                processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
 
                 if debug:
                     logger.info(f'**input_prompt: {json.dumps(input_prompt, ensure_ascii=False)} \n')
-                    logger.info(f'**predicted ans: {json.dumps(answer_d, ensure_ascii=False)} \n')
+                    logger.info(f'**predicted ans: {json.dumps(processed_answer, ensure_ascii=False)} \n')
 
-                answers_list.append(answer_d)
-                dump_jsonl_data(answer_d, pred_file_path, dump_mode=DumpMode.APPEND)
+                answers_list.append(processed_answer)
+                dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)
 
         logger.info(f'Dump predictions to {pred_file_path}.')
         return answers_list
@@ -241,6 +197,19 @@ class Evaluator(object):
 
         return review_res
 
+    def _generate_review_id(self, answer_d):
+        # Gen review_id (concat: answer_id + reviewer_spec)
+        answer_id = answer_d[AnswerKeys.ANSWER_ID]
+        reviewer_spec = {
+            'metric': [metric.name for metric in self.data_adapter.metric_list],
+            'reviewer': ['Evaluator'],
+            'revision': ['default']
+        }
+        reviewer_spec_str = json.dumps(
+            OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
+        review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)
+        return review_id, reviewer_spec
+
     def get_reviews(self, subset_name: str, answers_list: List[dict], debug: bool = False, **kwargs) -> list:
         """
         Get reviews from answers.
@@ -264,19 +233,7 @@
             logger.warning(f'Ignore use_cache={self.use_cache}, updating the review file: {review_file_path} ...')
 
         for answer_d in tqdm(answers_list, total=len(answers_list), desc=f'Reviewing({subset_name}): '):
-
-            # Gen review_id (concat: answer_id + reviewer_spec)
-            answer_id = answer_d[AnswerKeys.ANSWER_ID]
-
-            reviewer_spec: dict = {
-                'metric': [metric_d['name'] for metric_d in self.data_adapter.metric_list],
-                'reviewer': ['Evaluator'],
-                'revision': ['default']
-            }
-            reviewer_spec_str = json.dumps(
-                OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
-            review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)
-
+            review_id, reviewer_spec = self._generate_review_id(answer_d)
             # Get review
             review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
 
@@ -284,13 +241,12 @@
                 logger.info(review_d)
 
             reviews_list.append(review_d)
-
             # Dump reviews
             dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND)
 
         return reviews_list
 
-    def compute_metrics(self, reviews_list: List[dict]) -> Any:
+    def compute_metrics(self, reviews_list: List[dict]) -> List[dict]:
         """
         To compute metrics from reviews_list for each subset.
         It is required to rewrite this method to support your own evaluator.
@@ -308,28 +264,37 @@
                 logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
                 continue
 
-            review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
+            if len(review_d[AnswerKeys.CHOICES]) == 0:
+                logger.warning(f'No choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
+                continue
+            elif len(review_d[AnswerKeys.CHOICES]) == 1:
+                review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
+            else:
+                review_res = [choice[ReviewKeys.REVIEW][ReviewKeys.RESULT] for choice in review_d[AnswerKeys.CHOICES]]
+
             review_res_list.append(review_res)
 
-        metric_score: Union[float, dict] = self.data_adapter.compute_metric(review_res_list=review_res_list)
+        metric_score: List[dict] = self.data_adapter.compute_metric(review_res_list=review_res_list)
 
         return metric_score
 
-    def dump_report(self, reviews_score_all: dict, use_table: bool = True):
+    def dump_report(self, reviews_score_all: List[dict], use_table: bool = True):
         """
         Get report for total reviews of specific dataset.
         It is required to rewrite this method to support your own evaluator.
 
         Args:
-            report_map: report dict. Generated by func self.data_adapter.gen_report().
+            reviews_score_all: reviews score list. Generated by func self.data_adapter.compute_metric().
             use_table: whether to generate table for reports. Default to True.
 
         Returns: None
         """
         # Get report map
-        report_map: dict = self.data_adapter.gen_report(
-            subset_score_map=reviews_score_all, report_name=self.custom_task_name)
-        report_map.update(dict(model_name=self.model_name, dataset_name=self.dataset_name))
+        report_map: Report = self.data_adapter.gen_report(
+            subset_score_map=reviews_score_all,
+            report_name=self.custom_task_name,
+            model_name=self.model_name,
+            dataset_name=self.dataset_name)
 
         # Dump report
         report_path: str = os.path.join(self.outputs_structure.reports_dir, self.model_name,
@@ -338,7 +303,7 @@
 
         # Write report
         with open(report_path, 'w') as f:
-            f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
+            f.write(json.dumps(report_map.to_dict(), ensure_ascii=False, indent=4))
         logger.info(f'Dump report: {report_path} \n')
 
         # Make table
@@ -380,7 +345,8 @@
         stage_answers_dict = {}
         stage_reviews_dict = {}
 
-        for subset_name, prompts_list in self.prompts.items():
+        prompts = self.load_dataset()
+        for subset_name, prompts_list in prompts.items():
             limit = kwargs.get('limit', len(prompts_list))
             prompts_list = prompts_list[:limit]
 
@@ -394,7 +360,7 @@
                 subset_name=subset_name, answers_list=answers_list, debug=debug, **kwargs)
 
             metric_res = self.compute_metrics(reviews_list=reviews_list)
-            reviews_score_all[subset_name] = (metric_res, len(reviews_list))
+            reviews_score_all[subset_name] = metric_res
             stage_reviews_dict[subset_name] = reviews_list
 
         if self.stage == EvalStage.INFER:
@@ -8,10 +8,10 @@ import sys
 import time
 from abc import ABC, abstractmethod
 from functools import partial
-from typing import Any, List
+from typing import Any, List, Tuple
 
 from evalscope.constants import ArenaMode, EvalConfigKeys, FnCompletionParser, PositionBiasMitigation
-from evalscope.models.openai_model import OpenAIModel
+from evalscope.models.model import OpenAIModel
 from evalscope.utils import completion_parsers, random_seeded_choice
 from evalscope.utils.arena_utils import get_battle_pairs, merge_ques_ans, shuffle_pairwise_preferences
 from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list
@@ -240,7 +240,15 @@ class AutoReviewerGpt4(BaseReviewer):
            review_text=review_text)
         return review_result
 
-    def _get_review_pair(self, model_a, model_b, question, category, ans1, ans2, dry_run=False, **kwargs) -> (str, Any):
+    def _get_review_pair(self,
+                         model_a,
+                         model_b,
+                         question,
+                         category,
+                         ans1,
+                         ans2,
+                         dry_run=False,
+                         **kwargs) -> Tuple[str, Any]:
         input_msg = dict(ques=question, category=category, ans1=ans1, ans2=ans2)
 
         if self.reference_list:
@@ -263,7 +271,7 @@
             result = (result, None)
         return review_text, *result
 
-    def _get_review_single(self, model, question, category, answer, dry_run=False, **kwargs) -> (str, Any):
+    def _get_review_single(self, model, question, category, answer, dry_run=False, **kwargs) -> Tuple[str, Any]:
         input_msg = dict(ques=question, category=category, ans1=answer)
 
         if self.reference_list:
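The two signature changes above are typing fixes rather than behavioural ones: `-> (str, Any)` is evaluated as an ordinary tuple object, not a type annotation, so static checkers cannot interpret it, while `Tuple[str, Any]` (now imported at the top of the file) is the form they understand. A small self-contained illustration, independent of evalscope:

```python
from typing import Any, Tuple


def old_style() -> (str, Any):  # legal syntax, but the annotation is just the tuple (str, Any)
    return 'review text', None


def new_style() -> Tuple[str, Any]:  # what type checkers expect for "returns a (str, Any) pair"
    return 'review text', None


print(old_style.__annotations__['return'])  # (<class 'str'>, typing.Any)
print(new_style.__annotations__['return'])  # typing.Tuple[str, typing.Any]
```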
@@ -1 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+from evalscope.metrics.metrics import bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean, weighted_mean
+from evalscope.metrics.named_metrics import *
+from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
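With these re-exports, the metric helpers resolve from the `evalscope.metrics` package itself rather than its submodules. A minimal sketch of the new import surface (only the names shown in this hunk are assumed):

```python
# Previously these helpers had to be imported from evalscope.metrics.metrics and
# evalscope.metrics.rouge_metric directly; after this change the package re-exports them.
from evalscope.metrics import (bleu_ngram_one_sample, compute_rouge_score_one_sample_zh, exact_match, macro_mean,
                               mean, micro_mean, weighted_mean)
```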
@@ -55,7 +55,7 @@ try:
         os.system(f'wget --timeout=10 --tries=3 -P {nltk_dir} {punkt_tab_url}')
         os.system(f'unzip {punkt_path} -d {nltk_dir}')
     else:
-        logger.info(f'{punkt_path} already exists, skipping download')
+        logger.debug(f'{punkt_path} already exists, skipping download')
 except Exception as e:
     logger.error(f'Try to download punkt_tab.zip for nltk failed: {e}')
 
@@ -1,57 +1,200 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-import re
-from collections import defaultdict
-from tqdm import tqdm
 
-from evalscope.constants import MetricsConstant
+# Adapted from https://github.com/EleutherAI/lm-evaluation-harness/blob/master/lm_eval/tasks/hendrycks_math.py
+def is_equiv(str1, str2, verbose=False):
+    if str1 is None and str2 is None:
+        print('WARNING: Both None')
+        return True
+    if str1 is None or str2 is None:
+        return False
 
+    try:
+        ss1 = strip_string(str1)
+        ss2 = strip_string(str2)
+        if verbose:
+            print(ss1, ss2)
+        return ss1 == ss2
+    except Exception:
+        return str1 == str2
 
-def get_last_number(s):
-    match = re.search(r'[-+]?\d*\.\d+|\d+', s[::-1])
-    if match:
-        last_digit = match.group()[::-1]
+
+def remove_boxed(s):
+    if '\\boxed ' in s:
+        left = '\\boxed '
+        assert s[:len(left)] == left
+        return s[len(left):]
+
+    left = '\\boxed{'
+
+    assert s[:len(left)] == left
+    assert s[-1] == '}'
+
+    return s[len(left):-1]
+
+
+def last_boxed_only_string(string):
+    idx = string.rfind('\\boxed')
+    if '\\boxed ' in string:
+        return '\\boxed ' + string.split('\\boxed ')[-1].split('$')[0]
+    if idx < 0:
+        idx = string.rfind('\\fbox')
+        if idx < 0:
+            return None
+
+    i = idx
+    right_brace_idx = None
+    num_left_braces_open = 0
+    while i < len(string):
+        if string[i] == '{':
+            num_left_braces_open += 1
+        if string[i] == '}':
+            num_left_braces_open -= 1
+            if num_left_braces_open == 0:
+                right_brace_idx = i
+                break
+        i += 1
+
+    if right_brace_idx is None:
+        retval = None
     else:
-        last_digit = -100000
-    return float(last_digit)
-
-
-def compute_math_accuracy_one_sample(predict, reference):
-    if isinstance(predict, list):
-        predict = predict[0]
-    if isinstance(reference, list):
-        reference = reference[0]
-    predict_number = get_last_number(predict)
-    reference_number = get_last_number(reference)
-    if abs(predict_number - reference_number) <= MetricsConstant.EPSILON:
-        return 1
+        retval = string[idx:right_brace_idx + 1]
+
+    return retval
+
+
+def fix_fracs(string):
+    substrs = string.split('\\frac')
+    new_str = substrs[0]
+    if len(substrs) > 1:
+        substrs = substrs[1:]
+        for substr in substrs:
+            new_str += '\\frac'
+            if substr[0] == '{':
+                new_str += substr
+            else:
+                try:
+                    assert len(substr) >= 2
+                except AssertionError:
+                    return string
+                a = substr[0]
+                b = substr[1]
+                if b != '{':
+                    if len(substr) > 2:
+                        post_substr = substr[2:]
+                        new_str += '{' + a + '}{' + b + '}' + post_substr
+                    else:
+                        new_str += '{' + a + '}{' + b + '}'
+                else:
+                    if len(substr) > 2:
+                        post_substr = substr[2:]
+                        new_str += '{' + a + '}' + b + post_substr
+                    else:
+                        new_str += '{' + a + '}' + b
+    string = new_str
+    return string
+
+
+def fix_a_slash_b(string):
+    if len(string.split('/')) != 2:
+        return string
+    a = string.split('/')[0]
+    b = string.split('/')[1]
+    try:
+        a = int(a)
+        b = int(b)
+        assert string == '{}/{}'.format(a, b)
+        new_string = '\\frac{' + str(a) + '}{' + str(b) + '}'
+        return new_string
+    except AssertionError:
+        return string
+
+
+def remove_right_units(string):
+    # "\\text{ " only ever occurs (at least in the val set) when describing units
+    if '\\text{ ' in string:
+        splits = string.split('\\text{ ')
+        assert len(splits) == 2
+        return splits[0]
     else:
-        return 0
-
-
-def compute_math_accuracy(predict_l, reference_l):
-    assert len(predict_l) == len(reference_l)
-    if len(predict_l) == 0:
-        return 0
-    total_cnt = len(predict_l)
-    correct_cnt = 0
-    for predict, reference in zip(predict_l, reference_l):
-        correct_cnt += compute_math_accuracy_one_sample(predict, reference)
-    return {'math accuracy': correct_cnt / total_cnt}
-
-
-def run_math_eval(data_l, md_level=2):
-    print(f"{'#' * md_level} Math Eval(math accuracy)")
-    for data in tqdm(data_l):
-        data['math_accuracy'] = compute_math_accuracy_one_sample(data['gen'], data['target'])
-    task_data_d = defaultdict(list)
-    for data in data_l:
-        for task in data['task_tags']:
-            task_data_d[task].append(data)
-    correct_cnt = sum([data['math_accuracy'] for data in data_l])
-    print(f'[total], count: {len(data_l)}, math accuracy: '
-          f'{correct_cnt / len(data_l) * 100:0.2f}%')
-    for task in task_data_d.keys():
-        correct_cnt = sum([data['math_accuracy'] for data in task_data_d[task]])
-        print(f'[{task}], count: {len(task_data_d[task])}, math accuracy: '
-              f'{correct_cnt / len(task_data_d[task]) * 100:0.2f}%')
+        return string
+
+
+def fix_sqrt(string):
+    if '\\sqrt' not in string:
+        return string
+    splits = string.split('\\sqrt')
+    new_string = splits[0]
+    for split in splits[1:]:
+        if split[0] != '{':
+            a = split[0]
+            new_substr = '\\sqrt{' + a + '}' + split[1:]
+        else:
+            new_substr = '\\sqrt' + split
+        new_string += new_substr
+    return new_string
+
+
+def strip_string(string):
+    # linebreaks
+    string = string.replace('\n', '')
+
+    # remove inverse spaces
+    string = string.replace('\\!', '')
+
+    # replace \\ with \
+    string = string.replace('\\\\', '\\')
+
+    # replace tfrac and dfrac with frac
+    string = string.replace('tfrac', 'frac')
+    string = string.replace('dfrac', 'frac')
+
+    # remove \left and \right
+    string = string.replace('\\left', '')
+    string = string.replace('\\right', '')
+
+    # Remove circ (degrees)
+    string = string.replace('^{\\circ}', '')
+    string = string.replace('^\\circ', '')
+
+    # remove dollar signs
+    string = string.replace('\\$', '')
+
+    # remove units (on the right)
+    string = remove_right_units(string)
+
+    # remove percentage
+    string = string.replace('\\%', '')
+    string = string.replace('\%', '')  # noqa: W605
+
+    # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
+    string = string.replace(' .', ' 0.')
+    string = string.replace('{.', '{0.')
+    # if empty, return empty string
+    if len(string) == 0:
+        return string
+    if string[0] == '.':
+        string = '0' + string
+
+    # to consider: get rid of e.g. "k = " or "q = " at beginning
+    if len(string.split('=')) == 2:
+        if len(string.split('=')[0]) <= 2:
+            string = string.split('=')[1]
+
+    # fix sqrt3 --> sqrt{3}
+    string = fix_sqrt(string)
+
+    # remove spaces
+    string = string.replace(' ', '')
+
+    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b} # noqa: E501
+    string = fix_fracs(string)
+
+    # manually change 0.5 --> \frac{1}{2}
+    if string == '0.5':
+        string = '\\frac{1}{2}'
+
+    # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
+    string = fix_a_slash_b(string)
+
+    return string
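The rewritten module drops the old last-number heuristic in favour of the MATH answer-normalization helpers adapted from the EleutherAI harness: `last_boxed_only_string`/`remove_boxed` extract the final `\boxed{...}` answer, `strip_string` canonicalizes the LaTeX, and `is_equiv` compares the canonical forms. A minimal usage sketch, assuming the functions above are in scope (this hunk appears to be evalscope/metrics/math_accuracy.py); the example strings are made up:

```python
completion = r'The area of the triangle is $\boxed{\frac{1}{2}}$.'
reference = r'\boxed{0.5}'

# Pull out the last \boxed{...} span and strip the wrapper.
pred = remove_boxed(last_boxed_only_string(completion))  # '\frac{1}{2}'
gold = remove_boxed(last_boxed_only_string(reference))   # '0.5'

# strip_string rewrites '0.5' as '\frac{1}{2}', so both sides normalize to the
# same canonical form and the comparison succeeds.
print(is_equiv(pred, gold))  # True
```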