evalscope 0.12.0__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (85)
  1. evalscope/arguments.py +6 -1
  2. evalscope/benchmarks/aime/aime24_adapter.py +3 -3
  3. evalscope/benchmarks/aime/aime25_adapter.py +3 -3
  4. evalscope/benchmarks/arc/arc_adapter.py +15 -18
  5. evalscope/benchmarks/bbh/bbh_adapter.py +6 -6
  6. evalscope/benchmarks/benchmark.py +12 -11
  7. evalscope/benchmarks/ceval/ceval_adapter.py +12 -16
  8. evalscope/benchmarks/chinese_simple_qa/__init__.py +0 -0
  9. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +168 -0
  10. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +13 -17
  11. evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -3
  12. evalscope/benchmarks/data_adapter.py +59 -21
  13. evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
  14. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +9 -12
  15. evalscope/benchmarks/general_qa/general_qa_adapter.py +30 -15
  16. evalscope/benchmarks/gpqa/gpqa_adapter.py +12 -7
  17. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -3
  18. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -31
  19. evalscope/benchmarks/humaneval/humaneval_adapter.py +10 -7
  20. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -3
  21. evalscope/benchmarks/iquiz/iquiz_adapter.py +9 -5
  22. evalscope/benchmarks/live_code_bench/__init__.py +0 -0
  23. evalscope/benchmarks/live_code_bench/evaluate_utils.py +193 -0
  24. evalscope/benchmarks/live_code_bench/execute_utils.py +267 -0
  25. evalscope/benchmarks/live_code_bench/extract_utils.py +70 -0
  26. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +90 -0
  27. evalscope/benchmarks/live_code_bench/load_utils.py +71 -0
  28. evalscope/benchmarks/live_code_bench/pass_k_utils.py +56 -0
  29. evalscope/benchmarks/live_code_bench/prompts.py +207 -0
  30. evalscope/benchmarks/live_code_bench/testing_util.py +721 -0
  31. evalscope/benchmarks/math_500/math_500_adapter.py +2 -6
  32. evalscope/benchmarks/mmlu/mmlu_adapter.py +13 -17
  33. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +9 -5
  34. evalscope/benchmarks/musr/musr_adapter.py +8 -5
  35. evalscope/benchmarks/process_bench/process_bench_adapter.py +8 -5
  36. evalscope/benchmarks/race/race_adapter.py +12 -16
  37. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  38. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +167 -0
  39. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  40. evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +89 -0
  41. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +191 -0
  42. evalscope/benchmarks/super_gpqa/utils.py +85 -0
  43. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +3 -0
  44. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -4
  45. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +6 -13
  46. evalscope/benchmarks/utils.py +43 -0
  47. evalscope/collections/evaluator.py +14 -5
  48. evalscope/config.py +15 -2
  49. evalscope/constants.py +14 -0
  50. evalscope/evaluator/evaluator.py +51 -13
  51. evalscope/metrics/llm_judge.py +104 -0
  52. evalscope/metrics/named_metrics.py +1 -0
  53. evalscope/models/__init__.py +2 -1
  54. evalscope/models/base_adapter.py +25 -5
  55. evalscope/models/chat_adapter.py +3 -0
  56. evalscope/models/choice_adapter.py +4 -0
  57. evalscope/models/custom_adapter.py +2 -0
  58. evalscope/models/register.py +28 -0
  59. evalscope/models/server_adapter.py +35 -8
  60. evalscope/perf/arguments.py +13 -7
  61. evalscope/perf/benchmark.py +5 -0
  62. evalscope/perf/http_client.py +15 -5
  63. evalscope/perf/main.py +1 -0
  64. evalscope/perf/utils/analysis_result.py +1 -1
  65. evalscope/report/app.py +3 -0
  66. evalscope/report/combinator.py +2 -2
  67. evalscope/run.py +6 -5
  68. evalscope/third_party/longbench_write/infer.py +1 -1
  69. evalscope/third_party/thinkbench/eval.py +220 -55
  70. evalscope/third_party/thinkbench/infer.py +37 -7
  71. evalscope/third_party/thinkbench/tools/llm.py +1 -0
  72. evalscope/third_party/toolbench_static/llm/swift_infer.py +50 -20
  73. evalscope/utils/chat_service.py +1 -0
  74. evalscope/utils/filters.py +59 -0
  75. evalscope/utils/logger.py +3 -3
  76. evalscope/version.py +2 -2
  77. {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/METADATA +31 -12
  78. {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/RECORD +85 -62
  79. tests/cli/test_all.py +144 -0
  80. tests/cli/test_collection.py +28 -2
  81. tests/cli/test_run.py +201 -32
  82. {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/LICENSE +0 -0
  83. {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/WHEEL +0 -0
  84. {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/entry_points.txt +0 -0
  85. {evalscope-0.12.0.dist-info → evalscope-0.13.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/data_adapter.py
@@ -5,7 +5,9 @@ from abc import ABC, abstractmethod
 from collections import defaultdict
 from typing import Any, List, Optional, Union
 
+from evalscope.benchmarks.utils import PromptData, preprocess_decorator
 from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
+from evalscope.metrics.llm_judge import LLMJudge
 from evalscope.metrics.named_metrics import metric_registry
 from evalscope.report import Report, ReportGenerator
 from evalscope.utils.logger import get_logger
@@ -18,8 +20,10 @@ class DataAdapter(ABC):
     def __init__(self,
                  name: str,
                  dataset_id: str,
+                 model_adapter: str,
                  subset_list: list,
                  metric_list: List[str],
+                 llm_as_a_judge: bool = False,
                  few_shot_num: Optional[int] = 0,
                  train_split: Optional[str] = None,
                  eval_split: Optional[str] = None,
@@ -48,6 +52,7 @@ class DataAdapter(ABC):
         """
         self.name = name
         self.dataset_id = dataset_id
+        self.model_adapter = model_adapter
         self.subset_list = subset_list
         self.metric_list = metric_list
         self.few_shot_num = few_shot_num
@@ -58,7 +63,17 @@ class DataAdapter(ABC):
         self.query_template = query_template
         self.pretty_name = pretty_name
         self.config_kwargs = kwargs
+        self.llm_as_a_judge = llm_as_a_judge
         self.category_map = kwargs.get('category_map', {})
+        self.choices = kwargs.get('choices', None)
+
+    def __init_subclass__(cls, **kwargs):
+        super().__init_subclass__(**kwargs)
+
+        # find and decorate parse_pred_result method
+        if hasattr(cls, 'parse_pred_result'):
+            original_method = cls.parse_pred_result
+            cls.parse_pred_result = preprocess_decorator(original_method)
 
     def load(self,
              dataset_name_or_path: str = None,
@@ -78,11 +93,17 @@ class DataAdapter(ABC):
 
         # Try to load dataset from local disk
         if os.path.exists(dataset_name_or_path):
-            data_dict = self.load_from_disk(dataset_name_or_path, subset_list, work_dir, **kwargs)
+            logger.info(f'Loading dataset from local disk: {dataset_name_or_path}')
+            trust_remote_code = kwargs.pop('trust_remote_code', False)
+            data_dict = self.load_from_disk(
+                dataset_name_or_path, subset_list, work_dir, trust_remote_code=trust_remote_code, **kwargs)
         else:
-            data_dict = self.load_from_hub(dataset_name_or_path, subset_list, work_dir, **kwargs)
-            if len(data_dict) == 0 or len(next(iter(data_dict.values()))) == 0:
-                raise ValueError(f'Local dataset is empty: {dataset_name_or_path}')
+            logger.info(f'Loading dataset from hub: {dataset_name_or_path}')
+            trust_remote_code = kwargs.pop('trust_remote_code', True)
+            data_dict = self.load_from_hub(
+                dataset_name_or_path, subset_list, work_dir, trust_remote_code=trust_remote_code, **kwargs)
+        if len(data_dict) == 0:
+            raise ValueError(f'Dataset is empty: {dataset_name_or_path}')
         return data_dict
 
     def load_from_hub(self, dataset_name_or_path: str, subset_list: list, work_dir: str, **kwargs) -> dict:
@@ -91,8 +112,7 @@ class DataAdapter(ABC):
         datasets_hub: str = kwargs.pop('datasets_hub', HubType.MODELSCOPE)
         split_as_subset: bool = kwargs.pop('split_as_subset', False)
         # Load dataset from remote
-        logger.info(
-            f'Loading dataset from {datasets_hub}: > dataset_name: {dataset_name_or_path} > subsets: {subset_list}')
+        logger.info(f'Loading dataset: dataset_name: {dataset_name_or_path} > subsets: {subset_list}')
 
         data_dict = {}
         split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
@@ -133,21 +153,7 @@ class DataAdapter(ABC):
        If you want to support local dataset, please rewrite this method in xxx_data_adapter.
         Use modelscope.msdatasets.MsDataset.load to load the dataset from local by default.
         """
-        from modelscope.msdatasets import MsDataset
-
-        logger.info(f'Loading dataset from work_dir: {work_dir}: > dataset_name: {dataset_name_or_path} > \
-            subsets: {subset_list}')
-        data_dict = {}
-        subset_list = subset_list or self.subset_list
-        split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
-        for sub_name in subset_list:
-            data_dict[sub_name] = {}
-            # e.g. train: few-shot, test: target dataset to evaluate
-            for split in split_list:
-                dataset = MsDataset.load(
-                    dataset_name=dataset_name_or_path, subset_name=sub_name, split=split, cache_dir=work_dir, **kwargs)
-                data_dict[sub_name].update({split: dataset})
-        return data_dict
+        return self.load_from_hub(dataset_name_or_path, subset_list, work_dir, **kwargs)
 
     def reformat_subset(self, data_dict: dict, subset_key: str, format: str = '{}') -> dict:
         """
@@ -285,6 +291,12 @@ class DataAdapter(ABC):
         kwargs['metric_list'] = self.metric_list
         return ReportGenerator.gen_report(subset_score_map, report_name, **kwargs)
 
+    def gen_prompt_data(self, prompt: str, **kwargs) -> dict:
+        if not isinstance(prompt, list):
+            prompt = [prompt]
+        prompt_data = PromptData(data=prompt, multi_choices=self.choices, system_prompt=self.system_prompt)
+        return prompt_data.to_dict()
+
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
         """
         Generate model prompt from raw input, unify the prompt format for different datasets.
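For adapter authors, gen_prompt_data replaces the hand-rolled {'data': ..., 'multi_choices': ..., 'system_prompt': ...} dicts that individual benchmarks used to return from gen_prompt (see the per-benchmark hunks below). A minimal sketch of the equivalent direct call, assuming PromptData accepts only the three fields visible in this diff:

# Sketch only: mirrors what DataAdapter.gen_prompt_data does in 0.13.0.
from evalscope.benchmarks.utils import PromptData

prompt_data = PromptData(
    data=['What is 2 + 2?'],             # prompts are always wrapped in a list
    multi_choices=['A', 'B', 'C', 'D'],  # taken from self.choices; None for free-form tasks
    system_prompt=None,                  # taken from self.system_prompt
)
payload = prompt_data.to_dict()  # expected shape: {'data': [...], 'multi_choices': [...], 'system_prompt': None}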
evalscope/benchmarks/data_adapter.py (continued)
@@ -348,3 +360,29 @@ class DataAdapter(ABC):
             The match result. Usually a score (float) for chat/multiple-choice-questions.
         """
         raise NotImplementedError
+
+    def llm_match(self, gold: Any, pred: Any, judge: Optional[LLMJudge] = None, **kwargs) -> float:
+        """
+        Use LLM as a judge to evaluate the predicted answer against the gold answer.
+
+        Args:
+            gold (Any): The golden answer.
+            pred (Any): The predicted answer.
+
+        Returns:
+            The match result as a float score between 0 and 1.
+        """
+        # Default judge handling
+        if judge is None:
+            logger.warning('No judge LLM provided, please specify a judge LLM in the config.')
+            return 0
+
+        # Extract question from raw_input if available
+        raw_input = kwargs.get('raw_input', {})
+        question_keys = ['question', 'prompt', 'query', 'problem']
+        question = next((raw_input.get(key) for key in question_keys if raw_input.get(key)), None)
+
+        # Request judge and obtain score
+        prompt = judge.build_prompt(pred, gold, question)
+        score = judge(prompt)
+        return judge.get_score(score)
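Taken together, the DataAdapter changes above move benchmark registration from concrete model-adapter classes to OutputType strings and centralize prompt packing and judge-based scoring in the base class. The sketch below (not part of this diff) shows how a hypothetical adapter could combine the new pieces; the benchmark name, dataset id and field names are illustrative, and how the evaluator injects an LLMJudge into llm_match is handled outside these hunks (see the evaluator.py and metrics/llm_judge.py entries in the file list).

# Hypothetical adapter, assembled only from the APIs visible in this diff.
from typing import Any

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import EvalType, OutputType
from evalscope.metrics import exact_match


@Benchmark.register(
    name='my_custom_qa',                  # illustrative benchmark name
    pretty_name='MyCustomQA',
    dataset_id='path/to/my_custom_qa',    # illustrative dataset id
    model_adapter=OutputType.GENERATION,  # an OutputType string instead of a model adapter class
    subset_list=['default'],
    metric_list=['AverageAccuracy'],
    few_shot_num=0,
    train_split=None,
    eval_split='test',
    prompt_template='{query}',
)
class MyCustomQAAdapter(DataAdapter):

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
        # gen_prompt_data() replaces the hand-built
        # {'data': [...], 'multi_choices': ..., 'system_prompt': ...} dicts.
        return self.gen_prompt_data(self.prompt_template.format(query=input_d['question']))

    def get_gold_answer(self, input_d: dict) -> str:
        return input_d['answer']

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        # __init_subclass__ wraps this method with preprocess_decorator automatically.
        return result.strip()

    def match(self, gold: str, pred: str) -> float:
        return exact_match(gold=gold, pred=pred)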
evalscope/benchmarks/data_collection/data_collection_adapter.py
@@ -15,7 +15,6 @@ logger = get_logger()
 @Benchmark.register(
     name='data_collection',
     dataset_id='',  # dataset_id need to be set
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
     metric_list=['AverageAccuracy'],
     few_shot_num=0,
evalscope/benchmarks/general_mcq/general_mcq_adapter.py
@@ -3,9 +3,8 @@ import csv
 import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics.metrics import exact_match
-from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.logger import get_logger
 
@@ -16,8 +15,10 @@ logger = get_logger()
 
 @Benchmark.register(
     name='general_mcq',
+    pretty_name='General MCQ',
     dataset_id='general_mcq',
-    model_adapter=MultiChoiceModelAdapter,
+    model_adapter=OutputType.MULTIPLE_CHOICE,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['default'],
     metric_list=['AverageAccuracy'],
     few_shot_num=0,
@@ -27,11 +28,11 @@ logger = get_logger()
     query_template='问题:{question}\n{choices}\n答案: {answer}\n\n')
 class GeneralMCQAdapter(DataAdapter):
 
-    choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
-
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
+        self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
+
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
         for subset_name in subset_list:
@@ -85,7 +86,7 @@ class GeneralMCQAdapter(DataAdapter):
 
         full_prompt = self.prompt_template.format(query=context)
 
-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(full_prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -103,14 +104,10 @@ class GeneralMCQAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if eval_type == EvalType.CHECKPOINT:
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
-        elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
-        elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option_with_choices(result, self.choices)
         else:
-            raise ValueError(f'Invalid eval_type: {eval_type}')
+            return ResponseParser.parse_first_option_with_choices(text=result, options=self.choices)
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
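The parse path now keys off the configured model_adapter rather than eval_type: multiple-choice adapters return the option letter as-is, while generation adapters run the completion through ResponseParser. A small illustrative call for the generation branch (the exact extraction heuristics live in ResponseParser, so the result shown in the comment is the expected behaviour, not a guarantee):

# Illustrative only: the generation-mode branch of parse_pred_result.
from evalscope.utils import ResponseParser

choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
completion = 'Let me think step by step... The answer is C.'

# Expected to pull the first option letter ('C') out of the free-form completion;
# see ResponseParser.parse_first_option_with_choices for the actual rules.
pred = ResponseParser.parse_first_option_with_choices(text=completion, options=choices)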
evalscope/benchmarks/general_qa/general_qa_adapter.py
@@ -1,12 +1,10 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import glob
 import os.path
 from collections import defaultdict
 from typing import List
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.metrics import bleu_ngram_one_sample, compute_rouge_score_one_sample_zh, mean
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
@@ -16,9 +14,8 @@ logger = get_logger()
 @Benchmark.register(
     name='general_qa',
     dataset_id='general_qa',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
-    metric_list=['AverageBLEU'],
+    metric_list=['AverageBLEU', 'AverageRouge'],
     few_shot_num=0,
     train_split=None,
     eval_split='test',
@@ -31,18 +28,31 @@ class GeneralQAAdapter(DataAdapter):
 
         super().__init__(**kwargs)
 
-    def load(self, **kwargs) -> dict:
+    def load(self, dataset_name_or_path: str = None, subset_list: list = None, **kwargs) -> dict:
+        dataset_name_or_path = dataset_name_or_path or self.dataset_id
+        subset_list = subset_list or self.subset_list
 
-        data_file_list = glob.glob(os.path.join(self.dataset_id, '*.jsonl'))
+        data_file_dict = defaultdict(str)
         data_list = []
 
+        # get data file path and subset name
+        if os.path.isdir(dataset_name_or_path):
+            for subset_name in subset_list:
+                data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
+        elif os.path.isfile(dataset_name_or_path):
+            cur_subset_name = os.path.basename(dataset_name_or_path).split('.')[0]
+            data_file_dict[cur_subset_name] = dataset_name_or_path
+        else:
+            raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
+
+        # load data from local disk
         try:
-            for file_path in data_file_list:
+            for subset_name, file_path in data_file_dict.items():
                 data_list.extend(jsonl_to_list(file_path))
         except Exception as e:
             raise ValueError(f'Failed to load data from {self.dataset_id}, got error: {e}')
 
-        data_dict = {'default': {'test': data_list}}
+        data_dict = {subset_name: {'test': data_list} for subset_name in data_file_dict.keys()}
 
         return data_dict
 
@@ -65,7 +75,7 @@ class GeneralQAAdapter(DataAdapter):
 
         query = input_d.get('question', '') or input_d.get('query', '')
         prompt = self.prompt_template.format(query=query)
-        return {'data': [prompt], 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
         """
@@ -100,10 +110,12 @@ class GeneralQAAdapter(DataAdapter):
 
         """
         res = dict()
-        rouge_dict = compute_rouge_score_one_sample_zh([pred], [gold])
-        bleu_dict = bleu_ngram_one_sample(pred, gold)
-        res.update(rouge_dict)
-        res.update(bleu_dict)
+        if 'AverageRouge' in self.metric_list:
+            rouge_dict = compute_rouge_score_one_sample_zh([pred], [gold])
+            res.update(rouge_dict)
+        if 'AverageBLEU' in self.metric_list:
+            bleu_dict = bleu_ngram_one_sample(pred, gold)
+            res.update(bleu_dict)
         return res
 
     def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
@@ -119,7 +131,10 @@ class GeneralQAAdapter(DataAdapter):
         """
         items = defaultdict(list)
         for scores in review_res_list:
-            for k, v in scores.items():
-                items[k].append(v)
+            if isinstance(scores, dict):
+                for k, v in scores.items():
+                    items[k].append(v)
+            else:
+                items['AverageAccuracy'].append(scores)
         # items = [(score, 1.0) for score in review_res_list]
        return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
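The general_qa loader now maps JSONL files to subsets instead of globbing everything into a single 'default' subset: a directory resolves each name in subset_list to '<name>.jsonl', while a single file becomes a subset named after its basename. A standalone sketch of that resolution logic, mirroring the hunk above (illustrative helper, not an evalscope API):

# Sketch of the new subset resolution in GeneralQAAdapter.load.
import os
from collections import defaultdict


def resolve_subsets(dataset_name_or_path: str, subset_list: list) -> dict:
    data_file_dict = defaultdict(str)
    if os.path.isdir(dataset_name_or_path):
        # e.g. data/qa/ with subset_list=['history', 'science']
        #   -> {'history': 'data/qa/history.jsonl', 'science': 'data/qa/science.jsonl'}
        for subset_name in subset_list:
            data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
    elif os.path.isfile(dataset_name_or_path):
        # e.g. data/qa/history.jsonl -> {'history': 'data/qa/history.jsonl'}
        cur_subset_name = os.path.basename(dataset_name_or_path).split('.')[0]
        data_file_dict[cur_subset_name] = dataset_name_or_path
    else:
        raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
    return dict(data_file_dict)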
evalscope/benchmarks/gpqa/gpqa_adapter.py
@@ -3,15 +3,16 @@ import random
 import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.models import ChatGenerationModelAdapter
 
 
 @Benchmark.register(
     name='gpqa',
+    pretty_name='GPQA',
     dataset_id='modelscope/gpqa',
-    model_adapter=ChatGenerationModelAdapter,
+    model_adapter=OutputType.GENERATION,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['gpqa_extended', 'gpqa_main', 'gpqa_diamond'],
     metric_list=['AveragePass@1'],
     few_shot_num=5,
@@ -27,8 +28,9 @@ class GPQAAdapter(DataAdapter):
         self.choices = ['A', 'B', 'C', 'D']
         if self.few_shot_num and self.few_shot_num > 0:
             self.prompt_prefix = 'Here are some example questions from experts. Answer the final question yourself, following the format of the previous questions exactly.\n'  # noqa: E501
-            self.prompt_prefix += open(os.path.join(os.path.dirname(__file__), 'chain_of_thought.txt'),
-                                       'r').read() + '\nQuestion: '
+            self.prompt_prefix += open(
+                os.path.join(os.path.dirname(__file__), 'chain_of_thought.txt'), 'r',
+                encoding='utf-8').read() + '\nQuestion: '
         else:
             self.prompt_prefix = 'What is the correct answer to this question:'
 
@@ -50,7 +52,7 @@ class GPQAAdapter(DataAdapter):
         query = self.prompt_prefix + f"{input_d['Question']}\n{self.__form_options(processed_input_d['choices'])}"  # noqa: E501
 
         prompt = self.prompt_template.format(query=query)
-        return {'data': [prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(prompt)
 
     def __process_input(self, input_d: dict) -> dict:
 
@@ -94,7 +96,10 @@ class GPQAAdapter(DataAdapter):
         """
         Parse the predicted result and extract proper answer.
         """
-        return GPQAAdapter.get_multiple_choice_answer(result)
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
+            return result
+        else:
+            return GPQAAdapter.get_multiple_choice_answer(result)
 
     def match(self, gold: str, pred: str) -> float:
         """
evalscope/benchmarks/gsm8k/gsm8k_adapter.py
@@ -6,7 +6,6 @@ import os
 import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
@@ -15,8 +14,8 @@ logger = get_logger()
 
 @Benchmark.register(
     name='gsm8k',
+    pretty_name='GSM8K',
     dataset_id='modelscope/gsm8k',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['main'],
     metric_list=['AverageAccuracy'],
     few_shot_num=4,
@@ -76,7 +75,7 @@ class GSM8KAdapter(DataAdapter):
 
         full_prompt = context + self.prompt_template.format(query=input_d['question'])
 
-        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(full_prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Extract the gold answer from the input dict.
evalscope/benchmarks/hellaswag/hellaswag_adapter.py
@@ -4,9 +4,8 @@ import os
 import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.models import ContinuationLogitsModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 from evalscope.utils.utils import ResponseParser
@@ -18,20 +17,19 @@ logger = get_logger()
 
 @Benchmark.register(
     name='hellaswag',
+    pretty_name='HellaSwag',
     dataset_id='modelscope/hellaswag',
-    model_adapter=ContinuationLogitsModelAdapter,
+    model_adapter=OutputType.MULTIPLE_CHOICE,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['default'],
     metric_list=['AverageAccuracy'],
     few_shot_num=0,
     train_split='train',
     eval_split='validation',
-    prompt_template=
-    'Respond with the index of sentence that makes the most sense, chose from 0, 1, 2, 3, derive your final answer as `The answer is ...`.',  # noqa: E501
+    prompt_template='{query}',  # noqa: E501
 )
 class HellaSwagAdapter(DataAdapter):
 
-    choices = ['0', '1', '2', '3']
-
     def __init__(self, **kwargs):
 
         few_shot_num = kwargs.get('few_shot_num', 0)
@@ -40,6 +38,7 @@ class HellaSwagAdapter(DataAdapter):
             kwargs['few_shot_num'] = 0
 
         super().__init__(**kwargs)
+        self.choices = ['A', 'B', 'C', 'D']
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -85,15 +84,14 @@ class HellaSwagAdapter(DataAdapter):
             self._generate_prompt(input_d=sample, endings=endings, include_answer=True) for sample in few_shot_list
         ]
         context: str = '\n'.join(few_shot_prompts) + '\n'
-        context += self._generate_prompt(input_d=input_d, endings=endings, include_answer=False)
-
-        ctx_continuation_pair_list = [(context.strip(), ' ' + cont.strip()) for cont in endings]
+        query = context.strip() + self._generate_prompt(input_d=input_d, endings=endings, include_answer=False)
 
-        return {'data': ctx_continuation_pair_list, 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
+        full_prompt = self.prompt_template.format(query=query)
+        return self.gen_prompt_data(full_prompt)
 
     def get_gold_answer(self, input_d: dict) -> str:
-        # Get the gold choice
-        return input_d['label']
+        # Get the gold choice from the label
+        return self.choices[int(input_d['label'])]
 
     def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
         """
@@ -107,34 +105,22 @@ class HellaSwagAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if eval_type == EvalType.CHECKPOINT:
-            # answer: in the form of [-2.3, -4.5, ...], len of self.choices
-            result = np.array(result)
-            endings: list = [self._preprocess(ending) for ending in raw_input_d['endings']]
-            completion_len = np.array([float(len(i)) for i in endings])
-            best_choice_idx = np.argmax(result / completion_len)
-
-            return str(best_choice_idx)
-        elif eval_type == EvalType.SERVICE:
-            return ResponseParser.parse_first_option(result)
-        elif eval_type == EvalType.CUSTOM:
-            return ResponseParser.parse_first_option(result)
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
+            return result
         else:
-            raise ValueError(f'Invalid eval_type: {eval_type}')
+            return ResponseParser.parse_first_option(result)
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=str(gold), pred=str(pred))
 
-    @classmethod
-    def _preprocess(cls, text):
+    def _preprocess(self, text):
         text = text.strip()
         text = text.replace(' [title]', '. ')
         text = re.sub('\\[.*?\\]', '', text)
         text = text.replace('  ', ' ')
         return text
 
-    @classmethod
-    def _generate_prompt(cls, input_d: dict, endings: list, include_answer=True) -> str:
+    def _generate_prompt(self, input_d: dict, endings: list, include_answer=True) -> str:
         """
         Generate prompt for HellaSwag dataset.
 
@@ -148,7 +134,13 @@ class HellaSwagAdapter(DataAdapter):
         """
 
         ctx = input_d['ctx_a'] + ' ' + input_d['ctx_b'].capitalize()
-        example: str = cls._preprocess(input_d['activity_label'] + ': ' + ctx)
+        # example: str = cls._preprocess(input_d['activity_label'] + ': ' + ctx)
+        example: str = self._preprocess(ctx)
+
+        example += '\nQuestion: Which ending makes the most sense?'
+        for i, ending in enumerate(endings):
+            example += f'\n{self.choices[i]}. {ending}'
+        example += '\nYou may choose from A, B, C, D. Derive your final answer as `The answer is ...`.'
 
         if include_answer:
             example += '{}\n\n'.format(endings[int(input_d['label'])])
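HellaSwag therefore moves from continuation-logits scoring to a plain A-D multiple-choice prompt. A rough rendering of what _generate_prompt now produces for a single sample, following the format strings in the hunk above (the context and endings below are made-up illustration text, not dataset content):

# Illustrative rendering of the new HellaSwag prompt format (no few-shot).
ctx = 'A man is sitting on a roof. He'
endings = [
    'is using wrap to wrap a pair of skis.',
    'is ripping level tiles off.',
    "is holding a rubik's cube.",
    'starts pulling up roofing on a roof.',
]

example = ctx + '\nQuestion: Which ending makes the most sense?'
for letter, ending in zip(['A', 'B', 'C', 'D'], endings):
    example += f'\n{letter}. {ending}'
example += '\nYou may choose from A, B, C, D. Derive your final answer as `The answer is ...`.'
print(example)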
evalscope/benchmarks/humaneval/humaneval_adapter.py
@@ -2,7 +2,6 @@
 import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -13,14 +12,18 @@ logger = get_logger()
 
 @Benchmark.register(
     name='humaneval',
+    pretty_name='HumanEval',
     dataset_id='modelscope/humaneval',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['openai_humaneval'],
     metric_list=['Pass@1'],
     few_shot_num=0,
     train_split=None,
     eval_split='test',
     prompt_template='Complete the following python code:\n{query}',
+    extra_params={
+        'num_workers': 4,
+        'timeout': 4
+    },
 )
 class HumanevalAdapter(DataAdapter):
     """
@@ -35,17 +38,17 @@ class HumanevalAdapter(DataAdapter):
             raise ImportError('Please install human_eval:'
                               'https://github.com/openai/human-eval/tree/master#installation , '
                               'Note that you need to enable the execution code in the human_eval/execution.py first.')
+        super().__init__(**kwargs)
 
+        extra_params = kwargs.get('extra_params', {})
         self.k = [1]
-        self.num_workers = 4
-        self.timeout = 4.0
+        self.num_workers = extra_params.get('num_workers', 4)
+        self.timeout = extra_params.get('timeout', 4)
 
         self.read_problems_func = stream_jsonl
         self.write_jsonl_func = write_jsonl
         self.eval_func = check_correctness
 
-        super().__init__(**kwargs)
-
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
         for subset_name in subset_list:
@@ -66,7 +69,7 @@ class HumanevalAdapter(DataAdapter):
         query = input_d['prompt']
         full_prompt = self.prompt_template.format(query=query)
 
-        return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(full_prompt)
 
     @classmethod
     def _postprocess(cls, text: str) -> str:
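The previously hard-coded sandbox settings are now exposed as extra_params with the defaults shown above (num_workers=4, timeout=4). A hedged usage sketch, assuming these can be overridden per benchmark through dataset_args in the task config; the model id is purely illustrative and the exact forwarding of extra_params should be verified against the evalscope docs for your version:

# Hypothetical configuration sketch; field names follow evalscope's TaskConfig,
# but whether extra_params is forwarded this way is an assumption, not confirmed by this diff.
from evalscope.config import TaskConfig
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-Coder-7B-Instruct',  # illustrative model id
    datasets=['humaneval'],
    dataset_args={
        'humaneval': {
            'extra_params': {
                'num_workers': 8,  # parallel execution workers
                'timeout': 10,     # per-sample execution timeout in seconds
            }
        }
    },
)
run_task(task_cfg=task_cfg)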
evalscope/benchmarks/ifeval/ifeval_adapter.py
@@ -5,13 +5,12 @@ from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.benchmarks.ifeval.utils import process_results
 from evalscope.constants import EvalType
 from evalscope.metrics import Metric, mean, metric_registry
-from evalscope.models import ChatGenerationModelAdapter
 
 
 @Benchmark.register(
     name='ifeval',
+    pretty_name='IFEval',
     dataset_id='opencompass/ifeval',
-    model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
     metric_list=[
         'prompt_level_strict_acc',
@@ -36,7 +35,7 @@ class IFEvalAdapter(DataAdapter):
         metric_registry.register(Metric(name='inst_level_loose_acc', object=mean))
 
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
-        return {'data': [input_d['prompt']], 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(input_d['prompt'])
 
     def get_gold_answer(self, input_d: dict) -> str:
         return input_d
evalscope/benchmarks/iquiz/iquiz_adapter.py
@@ -1,14 +1,15 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.constants import EvalType
+from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.utils import ResponseParser
 
 
 @Benchmark.register(
     name='iquiz',
+    pretty_name='IQuiz',
     dataset_id='AI-ModelScope/IQuiz',
-    model_adapter=ChatGenerationModelAdapter,
+    model_adapter=OutputType.GENERATION,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
     subset_list=['IQ', 'EQ'],
     metric_list=['AverageAccuracy'],
     few_shot_num=0,
@@ -36,7 +37,7 @@ class IQuizAdapter(DataAdapter):
         """
         prompt = f"问题: {input_d['question']}\n"
         prompt += self.__form_options(input_d['choices'])
-        return {'data': [prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
+        return self.gen_prompt_data(prompt)
 
     def __form_options(self, options: list):
         option_str = '选项:\n'
@@ -54,7 +55,10 @@ class IQuizAdapter(DataAdapter):
         """
         Parse the predicted result and extract proper answer.
         """
-        return ResponseParser.parse_first_option_with_choices(result, self.choices)
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
+            return result
+        else:
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)
 
     def match(self, gold: str, pred: str) -> float:
         """