evalscope 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (59)
  1. evalscope/arguments.py +1 -0
  2. evalscope/benchmarks/aime24/__init__.py +0 -0
  3. evalscope/benchmarks/aime24/aime24_adapter.py +49 -0
  4. evalscope/benchmarks/arc/arc_adapter.py +5 -7
  5. evalscope/benchmarks/bbh/bbh_adapter.py +17 -9
  6. evalscope/benchmarks/benchmark.py +2 -2
  7. evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
  8. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
  9. evalscope/benchmarks/competition_math/competition_math_adapter.py +34 -23
  10. evalscope/benchmarks/data_adapter.py +18 -12
  11. evalscope/benchmarks/data_collection/__init__.py +0 -0
  12. evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
  13. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  14. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +129 -0
  15. evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -6
  16. evalscope/benchmarks/gpqa/__init__.py +0 -0
  17. evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
  18. evalscope/benchmarks/gpqa/gpqa_adapter.py +121 -0
  19. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +8 -13
  20. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
  21. evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
  22. evalscope/benchmarks/ifeval/ifeval_adapter.py +14 -14
  23. evalscope/benchmarks/ifeval/instructions.py +3 -4
  24. evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
  25. evalscope/benchmarks/math_500/__init__.py +0 -0
  26. evalscope/benchmarks/math_500/math_500_adapter.py +49 -0
  27. evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
  28. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +27 -15
  29. evalscope/benchmarks/race/race_adapter.py +3 -3
  30. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
  31. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -8
  32. evalscope/cli/start_app.py +3 -2
  33. evalscope/collections/evaluator.py +103 -39
  34. evalscope/collections/sampler.py +2 -1
  35. evalscope/collections/schema.py +1 -2
  36. evalscope/config.py +1 -0
  37. evalscope/evaluator/evaluator.py +78 -64
  38. evalscope/metrics/math_parser.py +526 -0
  39. evalscope/metrics/metrics.py +16 -1
  40. evalscope/metrics/named_metrics.py +31 -7
  41. evalscope/models/chat_adapter.py +69 -47
  42. evalscope/models/choice_adapter.py +52 -45
  43. evalscope/models/custom_adapter.py +2 -2
  44. evalscope/models/local_model.py +4 -0
  45. evalscope/models/server_adapter.py +28 -34
  46. evalscope/report/app.py +298 -96
  47. evalscope/run.py +10 -7
  48. evalscope/utils/chat_service.py +2 -2
  49. evalscope/utils/io_utils.py +1 -1
  50. evalscope/version.py +2 -2
  51. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/METADATA +20 -11
  52. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/RECORD +57 -47
  53. tests/cli/test_run.py +93 -16
  54. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  55. evalscope/metrics/math_accuracy.py +0 -200
  56. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/LICENSE +0 -0
  57. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/WHEEL +0 -0
  58. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/entry_points.txt +0 -0
  59. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/top_level.txt +0 -0
evalscope/evaluator/evaluator.py

@@ -3,15 +3,16 @@
 import json
 import os
 import time
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from copy import deepcopy
 from tqdm import tqdm
 from typing import Any, Dict, List, Optional, Union

 from evalscope.benchmarks import DataAdapter
 from evalscope.config import TaskConfig
-from evalscope.constants import AnswerKeys, DumpMode, EvalStage, ReviewKeys
+from evalscope.constants import AnswerKeys, DumpMode, EvalStage, EvalType, ReviewKeys
-from evalscope.models import BaseModelAdapter, CustomModelAdapter
+from evalscope.models import BaseModelAdapter
 from evalscope.report import Report, gen_table
 from evalscope.utils import dict_torch_dtype_to_str, gen_hash
 from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
@@ -36,7 +37,6 @@ class Evaluator(object):
     """

     def __init__(self,
-                 dataset_name_or_path: str,
                  data_adapter: DataAdapter,
                  model_adapter: BaseModelAdapter,
                  outputs: OutputsStructure = None,
@@ -44,7 +44,7 @@ class Evaluator(object):
                  **kwargs):

        self.dataset_name = data_adapter.name
-       self.dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
+       self.dataset_name_or_path = os.path.expanduser(data_adapter.dataset_id)
        self.model_name = task_cfg.model_id
        self.custom_task_name = f'{self.model_name}_{self.dataset_name}'

@@ -63,15 +63,20 @@ class Evaluator(object):

    def load_dataset(self):
        dataset = self.data_adapter.load(
-           dataset_name_or_path=self.dataset_name_or_path,
-           subset_list=self.data_adapter.subset_list,
-           work_dir=os.path.expanduser(self.task_cfg.dataset_dir),
-           datasets_hub=self.dataset_hub,
-           **self.kwargs)
+           work_dir=os.path.expanduser(self.task_cfg.dataset_dir), datasets_hub=self.dataset_hub, **self.kwargs)

        # Get prompts from dataset
        prompts = self.data_adapter.gen_prompts(data_dict=dataset)
-       return prompts
+
+       # Limit and index prompts
+       limited_prompts = defaultdict(list)
+       for subset_name, prompts_list in prompts.items():
+           limit = self.task_cfg.limit or len(prompts_list)
+           for index, prompt in enumerate(prompts_list[:limit]):
+               prompt['index'] = index
+               limited_prompts[subset_name].append(prompt)
+
+       return limited_prompts

    def _generate_answer_id(self, model_cfg, input_d, infer_cfg):
        model_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(model_cfg).items())), ensure_ascii=False)
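
The limit-and-index step above is new in 0.11.0: `load_dataset()` now applies `task_cfg.limit` itself and stamps every prompt with its position inside its subset. A minimal stand-alone sketch of that logic (the helper name and the sample prompts are made up for illustration; the real code operates on the dicts produced by `gen_prompts()`):

```python
# Sketch of the new limit-and-index behaviour, outside the Evaluator class.
from collections import defaultdict
from typing import Dict, List, Optional


def limit_and_index(prompts: Dict[str, List[dict]], limit: Optional[int] = None) -> Dict[str, List[dict]]:
    """Truncate each subset to `limit` prompts and tag each prompt with its position."""
    limited = defaultdict(list)
    for subset_name, prompts_list in prompts.items():
        n = limit or len(prompts_list)
        for index, prompt in enumerate(prompts_list[:n]):
            prompt['index'] = index
            limited[subset_name].append(prompt)
    return limited


prompts = {'default': [{'data': ['Q1']}, {'data': ['Q2']}, {'data': ['Q3']}]}
print(limit_and_index(prompts, limit=2))
# {'default': [{'data': ['Q1'], 'index': 0}, {'data': ['Q2'], 'index': 1}]}
```

The `index` field is what the new `filter_answer()` below uses to resume from a cached prediction file without assuming the file is in prompt order.
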
@@ -87,12 +92,38 @@ class Evaluator(object):
        answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
        return answer_d

-   def get_answers(self,
-                   subset_name: str,
-                   prompts_list: List[dict],
-                   infer_cfg: dict = None,
-                   debug: bool = False,
-                   **kwargs) -> list:
+   def _get_answer(self, input_prompts, subset_name, infer_cfg) -> List[dict]:
+       answers_list = []
+       answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
+       for answer_d, input_prompt in zip(answer_ds, input_prompts):
+           answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+           processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
+           answers_list.append(processed_answer)
+       return answers_list
+
+   @staticmethod
+   def filter_answer(use_cache, prompts_list, pred_file_path) -> dict:
+       # Filter prompts that have been answered
+       answers_list = []
+       if not use_cache or not os.path.exists(pred_file_path):
+           return answers_list, prompts_list
+
+       def get_answered_indices(answers_list: List[Dict]) -> List[int]:
+           indices = [answer[AnswerKeys.ORIGIN_PROMPT].get('index') for answer in answers_list]
+
+           if all(index is None for index in indices):
+               return list(range(len(answers_list)))
+
+           return [index for index in indices if index is not None]
+
+       answers_list = jsonl_to_list(pred_file_path)
+       answered_indices = set(get_answered_indices(answers_list))
+       logger.info(f'Reusing predictions from {pred_file_path}, got {len(answered_indices)} answers.')
+
+       prompts = [prompt for i, prompt in enumerate(prompts_list) if i not in answered_indices]
+       return answers_list, prompts
+
+   def get_answers(self, subset_name: str, prompts_list: List[dict], infer_cfg: dict = None, **kwargs) -> list:
        """
        Get answers from model inference.
        It is required to rewrite this method to support your own evaluator.
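
`filter_answer()` replaces the old "assume the cache file is in prompt order" logic: it reads the cached JSONL predictions, collects the `index` stored on each cached answer's origin prompt, and only re-runs prompts whose index is missing (falling back to positional matching when no cached answer carries an index at all). A rough illustration with hypothetical cache contents; the literal key `'origin_prompt'` stands in for `AnswerKeys.ORIGIN_PROMPT`:

```python
# Hypothetical cached predictions: two of four prompts already answered.
cached = [
    {'origin_prompt': {'index': 0, 'data': ['Q1']}, 'answer': '...'},
    {'origin_prompt': {'index': 2, 'data': ['Q3']}, 'answer': '...'},
]
prompts_list = [{'index': i, 'data': [f'Q{i + 1}']} for i in range(4)]

answered = {a['origin_prompt'].get('index') for a in cached}
remaining = [p for i, p in enumerate(prompts_list) if i not in answered]
print([p['index'] for p in remaining])  # [1, 3] -> only unanswered prompts are re-run
```
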
@@ -110,7 +141,6 @@ class Evaluator(object):
                    max_length: int, the max length of the sequence to be generated.
                    max_new_tokens: int, the max number of new tokens to be generated.
                    repetition_penalty: float, the parameter for repetition penalty. 1.0 means no penalty.
-           debug: whether to run in debug mode.
            **kwargs: kwargs.

        Returns: The list of answers.
@@ -119,41 +149,35 @@ class Evaluator(object):
        assert self.model_adapter is not None, 'model must be provided when calling func get_answers() !'
        assert len(prompts_list) > 0, 'prompts_list must not be empty when calling func get_answers() !'

-       answers_list = []
        pred_file_name = self.dataset_name + '_' + subset_name + '.jsonl'
        pred_file_path = os.path.join(self.outputs_structure.predictions_dir, self.model_name, pred_file_name)
        os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)

-       if self.use_cache and os.path.exists(pred_file_path):
-           answers_list = jsonl_to_list(pred_file_path)
-           logger.info(f'Reusing predictions from {pred_file_path}, got {len(answers_list)} answers.')
-           # Note: assume prediction in order of prompts_list
-           prompts_list = prompts_list[len(answers_list):]
-
-       if isinstance(self.model_adapter, CustomModelAdapter):
-           # Batch inference for custom model
-
-           resp_answers_list: List[Dict[str, Any]] = self.model_adapter.predict(
-               inputs=prompts_list, infer_cfg=infer_cfg)
-
-           for input_prompt, answer_d in zip(prompts_list, resp_answers_list):
-               answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
-               processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
-               answers_list.append(processed_answer)
-               dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)
-
+       answers_list, prompts_list = Evaluator.filter_answer(self.use_cache, prompts_list, pred_file_path)
+
+       eval_batch_size = self.task_cfg.eval_batch_size
+       if self.task_cfg.eval_type == EvalType.SERVICE:
+           with tqdm(total=len(prompts_list), desc=f'Predicting({subset_name}): ') as pbar:
+               with ThreadPoolExecutor(max_workers=eval_batch_size) as executor:
+                   futures = []
+                   for input_prompt in prompts_list:
+                       futures.append(executor.submit(self._get_answer, [input_prompt], subset_name, infer_cfg))
+                   for future in as_completed(futures):
+                       answer_ds: List[dict] = future.result()
+                       answers_list.extend(answer_ds)
+                       dump_jsonl_data(answer_ds, pred_file_path, dump_mode=DumpMode.APPEND)
+                       pbar.update(len(answer_ds))
        else:
-           for input_prompt in tqdm(prompts_list, total=len(prompts_list), desc=f'Predicting({subset_name}): '):
-               answer_d: dict = self.model_adapter.predict(inputs=input_prompt, infer_cfg=infer_cfg)
-               answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
-               processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
-
-               if debug:
-                   logger.info(f'**input_prompt: {json.dumps(input_prompt, ensure_ascii=False)} \n')
-                   logger.info(f'**predicted ans: {json.dumps(processed_answer, ensure_ascii=False)} \n')
-
-               answers_list.append(processed_answer)
-               dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)
+           batch_prompts_list = [
+               prompts_list[i:i + eval_batch_size] for i in range(0, len(prompts_list), eval_batch_size)
+           ]
+           with tqdm(total=len(prompts_list), desc=f'Predicting({subset_name}): ') as pbar:
+               for batch_prompts in batch_prompts_list:
+                   answer_ds: List[dict] = self._get_answer(
+                       input_prompts=batch_prompts, subset_name=subset_name, infer_cfg=infer_cfg)
+                   answers_list.extend(answer_ds)
+                   dump_jsonl_data(answer_ds, pred_file_path, dump_mode=DumpMode.APPEND)
+                   pbar.update(len(batch_prompts))

        logger.info(f'Dump predictions to {pred_file_path}.')
        return answers_list
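
This hunk is the core change of the release: when `eval_type` is `service`, `get_answers()` now fans each prompt out to a thread pool of `eval_batch_size` workers and appends answers as they complete, instead of calling the API one request at a time; local models instead receive contiguous batches of size `eval_batch_size`. A self-contained sketch of the service-mode pattern, with `fake_predict` standing in for `model_adapter.predict()`:

```python
import time
from concurrent.futures import ThreadPoolExecutor, as_completed


def fake_predict(prompt: dict) -> dict:
    """Stand-in for a single OpenAI-compatible API call."""
    time.sleep(0.1)
    return {'origin_prompt': prompt, 'answer': f"echo:{prompt['data'][0]}"}


prompts = [{'index': i, 'data': [f'Q{i}']} for i in range(8)]
eval_batch_size = 4  # plays the role of task_cfg.eval_batch_size

answers = []
with ThreadPoolExecutor(max_workers=eval_batch_size) as executor:
    futures = [executor.submit(fake_predict, p) for p in prompts]
    for future in as_completed(futures):
        answers.append(future.result())  # completion order, not submission order

print(len(answers))  # 8
```

Because answers are appended to the JSONL file in completion order, the cache is no longer guaranteed to follow prompt order, which is exactly why the `index` tagging in `load_dataset()` and the index-based `filter_answer()` above are needed.
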
@@ -200,17 +224,13 @@ class Evaluator(object):
    def _generate_review_id(self, answer_d):
        # Gen review_id (concat: answer_id + reviewer_spec)
        answer_id = answer_d[AnswerKeys.ANSWER_ID]
-       reviewer_spec = {
-           'metric': [metric.name for metric in self.data_adapter.metric_list],
-           'reviewer': ['Evaluator'],
-           'revision': ['default']
-       }
+       reviewer_spec = {'metric': self.data_adapter.metric_list, 'reviewer': ['Evaluator'], 'revision': ['default']}
        reviewer_spec_str = json.dumps(
            OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
        review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)
        return review_id, reviewer_spec

-   def get_reviews(self, subset_name: str, answers_list: List[dict], debug: bool = False, **kwargs) -> list:
+   def get_reviews(self, subset_name: str, answers_list: List[dict], **kwargs) -> list:
        """
        Get reviews from answers.
        It is required to rewrite this method to support your own evaluator.
@@ -218,7 +238,6 @@ class Evaluator(object):
        Args:
            subset_name: subset name of benchmark
            answers_list: inference results list.
-           debug: whether to run in debug mode.
            **kwargs: kwargs.

        Returns: reviews list.
@@ -237,8 +256,7 @@ class Evaluator(object):
            # Get review
            review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)

-           if debug:
-               logger.info(review_d)
+           logger.debug(review_d)

            reviews_list.append(review_d)
            # Dump reviews
@@ -315,7 +333,7 @@ class Evaluator(object):
            logger.error('Failed to generate report table.')
        return report_map

-   def eval(self, infer_cfg: dict = None, debug: bool = False, **kwargs) -> dict:
+   def eval(self, **kwargs) -> dict:
        """
        Evaluate the model on the specific benchmark. Streaming & parallel mode is supported.
        It is required to rewrite this method to support your own evaluator.
@@ -329,7 +347,6 @@ class Evaluator(object):

        Args:
            infer_cfg: The config for model inference.
-           debug: Whether to run in debug mode. Default: False.

        Returns:
            Dict of results. Depends on the stage of evaluation.
@@ -347,17 +364,14 @@ class Evaluator(object):

        prompts = self.load_dataset()
        for subset_name, prompts_list in prompts.items():
-           limit = kwargs.get('limit', len(prompts_list))
-           prompts_list = prompts_list[:limit]

            answers_list: list = self.get_answers(
-               subset_name=subset_name, prompts_list=prompts_list, infer_cfg=infer_cfg, debug=debug, **kwargs)
+               subset_name=subset_name, prompts_list=prompts_list, infer_cfg=self.task_cfg.generation_config, **kwargs)
            if self.stage == EvalStage.INFER:
                stage_answers_dict[subset_name] = answers_list
                continue

-           reviews_list: list = self.get_reviews(
-               subset_name=subset_name, answers_list=answers_list, debug=debug, **kwargs)
+           reviews_list: list = self.get_reviews(subset_name=subset_name, answers_list=answers_list, **kwargs)

            metric_res = self.compute_metrics(reviews_list=reviews_list)
            reviews_score_all[subset_name] = metric_res
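
Taken together, run-time options that used to be passed into `Evaluator.eval()` (`infer_cfg`, `debug`, `limit`) now come from the `TaskConfig`: `generation_config` feeds `infer_cfg`, `limit` is applied inside `load_dataset()`, and `eval_batch_size` controls the thread pool or batch size. A hedged usage sketch against the 0.11.0 fields visible in this diff, assuming the top-level `TaskConfig`/`run_task` entry points; the model id, endpoint, key and generation values are placeholders:

```python
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='my-model',                                      # placeholder model id
    api_url='http://127.0.0.1:8000/v1/chat/completions',   # placeholder endpoint
    api_key='EMPTY',                                       # placeholder key
    eval_type='service',         # routes get_answers() through the ThreadPoolExecutor path
    datasets=['gsm8k'],
    limit=10,                    # now applied in load_dataset(), not a kwarg of eval()
    eval_batch_size=4,           # max_workers for service eval / batch size for local models
    generation_config={'temperature': 0.0, 'max_tokens': 512},  # becomes infer_cfg
)

run_task(task_cfg=task_cfg)
```
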