evalscope 0.9.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.
Files changed (69)
  1. evalscope/arguments.py +1 -0
  2. evalscope/benchmarks/arc/arc_adapter.py +3 -5
  3. evalscope/benchmarks/bbh/bbh_adapter.py +3 -3
  4. evalscope/benchmarks/benchmark.py +1 -1
  5. evalscope/benchmarks/ceval/ceval_adapter.py +5 -82
  6. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +5 -79
  7. evalscope/benchmarks/competition_math/competition_math_adapter.py +4 -4
  8. evalscope/benchmarks/data_adapter.py +69 -70
  9. evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -63
  10. evalscope/benchmarks/gpqa/__init__.py +0 -0
  11. evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
  12. evalscope/benchmarks/gpqa/gpqa_adapter.py +103 -0
  13. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +4 -5
  14. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +12 -6
  15. evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -4
  16. evalscope/benchmarks/ifeval/__init__.py +0 -0
  17. evalscope/benchmarks/ifeval/ifeval_adapter.py +56 -0
  18. evalscope/benchmarks/ifeval/instructions.py +1477 -0
  19. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  20. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  21. evalscope/benchmarks/ifeval/utils.py +134 -0
  22. evalscope/benchmarks/iquiz/__init__.py +0 -0
  23. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  24. evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -84
  25. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +2 -2
  26. evalscope/benchmarks/race/race_adapter.py +4 -73
  27. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -6
  28. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -57
  29. evalscope/cli/cli.py +2 -0
  30. evalscope/cli/start_app.py +30 -0
  31. evalscope/collections/evaluator.py +82 -62
  32. evalscope/collections/sampler.py +47 -41
  33. evalscope/collections/schema.py +14 -10
  34. evalscope/constants.py +4 -0
  35. evalscope/evaluator/evaluator.py +22 -13
  36. evalscope/metrics/__init__.py +2 -5
  37. evalscope/metrics/metrics.py +11 -2
  38. evalscope/metrics/named_metrics.py +17 -0
  39. evalscope/models/chat_adapter.py +2 -0
  40. evalscope/models/server_adapter.py +11 -4
  41. evalscope/perf/__init__.py +1 -0
  42. evalscope/perf/main.py +0 -1
  43. evalscope/perf/plugin/api/custom_api.py +1 -1
  44. evalscope/perf/plugin/api/openai_api.py +1 -1
  45. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  46. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  47. evalscope/report/__init__.py +5 -0
  48. evalscope/report/app.py +693 -0
  49. evalscope/report/combinator.py +73 -0
  50. evalscope/report/generator.py +80 -0
  51. evalscope/report/utils.py +133 -0
  52. evalscope/run.py +16 -11
  53. evalscope/summarizer.py +1 -1
  54. evalscope/utils/chat_service.py +1 -1
  55. evalscope/utils/logger.py +1 -0
  56. evalscope/utils/model_utils.py +5 -2
  57. evalscope/version.py +2 -2
  58. {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/METADATA +84 -7
  59. {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/RECORD +66 -51
  60. tests/cli/test_collection.py +11 -7
  61. tests/cli/test_run.py +13 -4
  62. evalscope/tools/__init__.py +0 -1
  63. evalscope/tools/combine_reports.py +0 -133
  64. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  65. /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
  66. {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/LICENSE +0 -0
  67. {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/WHEEL +0 -0
  68. {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/entry_points.txt +0 -0
  69. {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/top_level.txt +0 -0
@@ -1,13 +1,11 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
  import glob
- import json
  import os.path
  from collections import defaultdict
- from typing import Any, Optional
+ from typing import List

  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.metrics import (WeightedAverageBLEU, bleu_ngram_one_sample, compute_rouge_score_one_sample_zh,
-                                weighted_mean)
+ from evalscope.metrics import AverageBLEU, bleu_ngram_one_sample, compute_rouge_score_one_sample_zh, mean
  from evalscope.models import ChatGenerationModelAdapter
  from evalscope.utils.io_utils import jsonl_to_list
  from evalscope.utils.logger import get_logger
@@ -20,7 +18,7 @@ logger = get_logger()
      dataset_id='general_qa',
      model_adapter=ChatGenerationModelAdapter,
      subset_list=['default'],
-     metric_list=[WeightedAverageBLEU],
+     metric_list=[AverageBLEU],
      few_shot_num=0,
      train_split=None,
      eval_split='test',
@@ -68,7 +66,7 @@ class GeneralQAAdapter(DataAdapter):

          # if len(history) > 0:
          #     prompt = '\n'.join(history) + '\n' + prompt
-         return {'data': [prompt]}
+         return {'data': [prompt], 'system_prompt': self.prompt_template}

      def get_gold_answer(self, input_d: dict) -> str:
          """
@@ -92,14 +90,14 @@ class GeneralQAAdapter(DataAdapter):
          """
          return result

-     def match(self, gold: str, pred: str) -> float:
+     def match(self, gold: str, pred: str) -> dict:
          """
          Args:
              gold: str
              pred: str

          Returns:
-             bleu_score: float
+             bleu_score: dict

          """
          res = dict()
@@ -107,10 +105,9 @@ class GeneralQAAdapter(DataAdapter):
          bleu_dict = bleu_ngram_one_sample(pred, gold)
          res.update(rouge_dict)
          res.update(bleu_dict)
-         # return bleu(item)
          return res

-     def compute_metric(self, review_res_list: list) -> float:
+     def compute_metric(self, review_res_list: List[dict]) -> List[dict]:
          """
          compute weighted mean of the bleu score of all samples

@@ -118,62 +115,12 @@ class GeneralQAAdapter(DataAdapter):
              review_res_list: [score1, score2, ...]

          Returns:
-             avg_res: float
+             avg_res: List[dict]

          """
          items = defaultdict(list)
          for scores in review_res_list:
              for k, v in scores.items():
-                 items[k].append((v, 1.0))
+                 items[k].append(v)
          # items = [(score, 1.0) for score in review_res_list]
-         res = {k: weighted_mean(v) for k, v in items.items()}
-         # return weighted_mean(items)
-         return res
-
-     def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-         """
-         Args:
-             subset_score_map: {subset_name: (score_dict, num), ...}
-             report_name: str, the user-defined report name.
-
-         Returns:
-         {
-             "name":"GeneralQA",
-             "metric":"WeightedAverageBLEU",
-             "score":0.399,
-             "category":[
-                 {
-                     "name":"DEFAULT",
-                     "score":0.399,
-                     "subset":[
-                         {
-                             "name":"default",
-                             "score":0.399
-                         },
-                     ]
-                 }
-             ],
-             "total_num":10
-         }
-         """
-         total_num: int = sum([num for _, num in subset_score_map.values()])
-         # weighted_avg_bleu: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-         cate_avg_list = [{
-             'name': subset_name,
-             'score': score_dict
-         } for subset_name, (score_dict, _) in subset_score_map.items()]
-         total_avg_list = defaultdict(float)
-         for score_dict, num in subset_score_map.values():
-             for metric, score in score_dict.items():
-                 total_avg_list[metric] += score * num / total_num
-
-         category_d = dict(name='DEFAULT', score=total_avg_list, subset=cate_avg_list)
-
-         res_map = dict(
-             name=report_name or 'general_qa',
-             metric=self.metric_list[0]['name'],
-             score=total_avg_list,
-             category=[category_d],
-             total_num=total_num)
-
-         return res_map
+         return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
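
The rewritten compute_metric above no longer weights each sample; it simply pools every per-sample score under its metric name and averages it, returning one entry per metric. A standalone sketch of that aggregation shape (the score keys 'bleu-1' and 'rouge-l' are illustrative, and statistics.mean stands in for evalscope's mean helper):

    # Minimal sketch, not taken from the diff: how the new compute_metric
    # turns per-sample score dicts into per-metric summary entries.
    from collections import defaultdict
    from statistics import mean

    def compute_metric(review_res_list):
        items = defaultdict(list)
        for scores in review_res_list:        # each sample contributes a dict of metric scores
            for k, v in scores.items():
                items[k].append(v)
        return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]

    print(compute_metric([{'bleu-1': 0.5, 'rouge-l': 0.4},
                          {'bleu-1': 0.7, 'rouge-l': 0.6}]))
    # -> [{'metric_name': 'bleu-1', 'score': 0.6, 'num': 2},
    #     {'metric_name': 'rouge-l', 'score': 0.5, 'num': 2}]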
File without changes
@@ -0,0 +1,81 @@
+ Question: In a given population, 1 out of every 400 people has a cancer caused by a completely recessive allele, b. Assuming the population is in Hardy-Weinberg equilibrium, which of the following is the expected proportion of individuals who carry the b allele but are not expected to develop the cancer?
+ Choices:
+ (A) 1/400
+ (B) 19/400
+ (C) 20/400
+ (D) 38/400
+ Let's think step by step:
+ The expected proportion of individuals who carry the b allele but are not expected to develop the cancer equals to the frequency of heterozygous allele in the given population.
+ According to the Hardy-Weinberg equation p∧2 + 2pq + q∧2 = 1, where p is the frequency of dominant allele frequency, q is the frequency of recessive allele frequency, p∧2 is the frequency of the homozygous dominant allele, q∧2 is the frequency of the recessive allele, and 2pq is the frequency of the heterozygous allele.
+ Given that q∧2=1/400, hence, q=0.05 and p=1-q=0.95.
+ The frequency of the heterozygous allele is 2pq=2*0.05*0.95=38/400.
+ The correct answer is (D)
+ Question: A Fe pellet of 0.056 g is first dissolved in 10 mL of hydrobromic acid HBr (0.1 M). The resulting solution is then titrated by KMnO4 (0.02 M). How many equivalence points are there?
+ Choices:
+ (A) Two points, 25 ml and 35 ml
+ (B) One point, 25 mL
+ (C) One point, 10 ml
+ (D) Two points, 25 ml and 30 ml
+ Let's think step by step:
+ HBr will react with Fe to produce Fe2+. MnO4- will first react with Fe2+ then Br-.
+ Two equivalence points will exist 25 ml and 35 ml.
+ HBr will react with Fe to produce Fe2+. MnO4- will first react with Fe2+ then Br-.
+ Two equivalence points will exist 25 ml and 35 ml.
+ In the beaker there is Fe2+ and Br-.
+ When considering titration with two analytes one will have to consider which reaction will occur first.
+ Since it is a redox titration consider the reduction potential of:
+ E0 (Br2 /Br- ) = 1.09 V E0 (MnO4-/ Mn2+) = 1.49 V E0 (Fe3+/Fe2+) =0.77 V
+ [Fe2+]=m/MV=0.1M.
+ Reaction 1: MnO4- + 5Fe2+ + 8H+ → Mn2+ + 5Fe3+ + 4H2O
+ Reaction 2: 2MnO4- + 10Br- + 16H+ → 2Mn2+ + 5Br2 + 8H2O
+ So MnO4- will first react with Fe2+ with a stoichiometry of 1:5 so Veq1 will be 10 ml.
+ Then when Fe2+ is used up, MnO4- will react with Br- with a stoichiometry of 2:10 then V added will be 25 ml so Veq2=25+10=35 ml.
+ The correct answer is (A)
+ Question: Consider a quantum mechanical system containing a particle of mass $m$ moving in an istropic three dimensional potential of the form $V(r) = 1/2 m \omega^2 r^2$ corresponding to the acted force obeying Hooke’s law. Here, $\omega$ is the angular frequency of oscillation and $r$ is the radial distance of the particle from the origin in spherical polar coordinate. What is the value of energy of the third excited state, and how many linearly independent eigenfunctions are possible for the same energy eigenvalue?
+ Choices:
+ (A) 11 \pi^2 \hbar^2 / (2m r^2), 3
+ (B) (9/2) \hbar \omega , 10
+ (C) 11 \pi^2 \hbar^2 / (2m r^2), 10
+ (D) (9/2) \hbar \omega, 3
+ Let's think step by step:
+ This problem is nothing but the three dimensional simple harmonic oscillator (SHO) problem.
+ The energy spectrum of three dimensional SHO is $E_n= (n+3/2)\hbar \omega$ where $n=0,1,2,3….$.
+ For third excited state n=3.
+ 3+3/2=6/2+3/2=9/2.
+ Thus the corresponding energy is $(9/2)\hbar \omega$.
+ The degeneracy of the state is $g_n= (n+1)(n+2)/2$.
+ For n=3, degeneracy is (3+1)*(3+2)/2=4*5/2=10.
+ The correct answer is (B)
+ Question: "Your overhear two chemists talking to each other as they leave a synthetic organic chemistry lab. One asks the other "So, how did it go?" The second chemist replies, "Not well - my compounds are on top of each other." What is the second chemist most likely referring to?"
+ Choices:
+ (A) The compounds they are working with have similar polarities.
+ (B) The compounds they are working with have similar boiling points.
+ (C) The compounds they are working with are bonding to each other through non-covalent/van der Waals interactions.
+ (D) The compounds they are working with have similar optical rotations.
+ Let's think step by step:
+ "On top of each other" commonly refers to two compounds that have similar Rf values on chromatography (a common operation in synthetic chemistry).
+ Similar Rf values arise for compounds with similar polarities.
+ The correct answer is (A)
+ Question: Two people are playing the following game. A fair coin is tossed into the air. Person A says that in a single toss of the coin, the tail will come. So it's like the first shot or the third shot or the fifth shot. Person B says that the coin will come with a double toss. So like the second, fourth, sixth or eighth shot. Imagine this game played forever. What is the probability that person A wins this game?
+ Choices:
+ (A) 1/2
+ (B) 1/4
+ (C) 2/3
+ (D) 1/8
+ Let's think step by step:
+ When finding the correct answer, the probability of playing forever and the coin's single-point toss will be calculated.
+ For example, a tail may appear on the first shot.
+ This probability is 1/2. if the first toss doesn't come up, it shouldn't come to the second roll either, because the second throw is an even number.
+ So it can come in the third shot.
+ This is (1/2)(1/2)(1/2).
+ So (1/2)^3=1/8.
+ Or it could come on the fifth shot.
+ This is (1/2)^5=1/32.
+ This is actually a geometric series that goes on forever.
+ We can write this series as follows.
+ (1/2) + (1/2)^3 + (1/2)^5 + (1/2)^7 + ……….
+ The solution for this series is as follows : a1/(1-r) where a1 is the first number and r is the sequence or r= a2/a1 or a3/a2 etc.
+ a1=1/2
+ r=(1/2)^2=1/4
+ So a1/(1-r)=(1/2)/(1-1/4)=(1/2)/(3/4)=2/3.
+ The correct answer is (C)
@@ -0,0 +1,103 @@
+ import os
+ import random
+ import re
+
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.constants import AnswerKeys, EvalType
+ from evalscope.metrics import Pass1, exact_match
+ from evalscope.models import ChatGenerationModelAdapter
+ from evalscope.utils.utils import ResponseParser
+
+
+ @Benchmark.register(
+     name='gpqa',
+     dataset_id='modelscope/gpqa',
+     model_adapter=ChatGenerationModelAdapter,
+     subset_list=['gpqa_extended', 'gpqa_main', 'gpqa_diamond'],
+     metric_list=[Pass1],
+     few_shot_num=5,
+     train_split='train',
+     eval_split='train',  # only have train split
+     prompt_template='',
+ )
+ class GPQAAdapter(DataAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+         self.choices = ['A', 'B', 'C', 'D']
+         if self.few_shot_num and self.few_shot_num > 0:
+             self.prompt_prefix = 'Here are some example questions from experts. Answer the final question yourself, following the format of the previous questions exactly.\n'  # noqa: E501
+             self.prompt_prefix += open(os.path.join(os.path.dirname(__file__), 'chain_of_thought.txt'),
+                                        'r').read() + '\nQuestion: '
+         else:
+             self.prompt_prefix = 'What is the correct answer to this question:'
+
+     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+         """
+         Generate model prompt from input data.
+         example:
+         {
+             "question":"Two people are playing the following game. A fair coin is tossed into the air. Person A says that in a single toss of the coin, the tail will come. So it's like the first shot or the third shot or the fifth shot. Person B says that the coin will come with a double toss. So like the second, fourth, sixth or eighth shot. Imagine this game played forever. What is the probability that person A wins this game?",
+             "choice1":"1/2",
+             "choice2":"1/4",
+             "choice3":"2/3",
+             "choice4":"1/8",
+             "answer":"C",
+         }
+         """  # noqa: E501
+         processed_input_d = self.__process_input(input_d)
+         input_d['answer'] = processed_input_d['answer']  # add answer to input_d for answer extraction
+         prompt = self.prompt_prefix + f"{input_d['Question']}\n{self.__form_options(processed_input_d['choices'])}Let's think step by step: "  # noqa: E501
+
+         return {'data': [prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
+
+     def __process_input(self, input_d: dict) -> dict:
+
+         def preprocess(text):
+             if text is None:
+                 return ' '
+             text = text.strip()
+             text = text.replace(' [title]', '. ')
+             text = re.sub('\\[.*?\\]', '', text)
+             text = text.replace('  ', ' ')
+             return text
+
+         choices = [
+             preprocess(input_d['Incorrect Answer 1']),
+             preprocess(input_d['Incorrect Answer 2']),
+             preprocess(input_d['Incorrect Answer 3']),
+             preprocess(input_d['Correct Answer']),
+         ]
+         random.shuffle(choices)
+         correct_answer_index = choices.index(preprocess(input_d['Correct Answer']))
+
+         out_doc = {
+             'choices': [choices[0], choices[1], choices[2], choices[3]],
+             'answer': f'{chr(65 + correct_answer_index)}',
+         }
+         return out_doc
+
+     def __form_options(self, options: list):
+         option_str = 'Choices:\n'
+         for opt, choice in zip(options, self.choices):
+             option_str += f'({choice}) {opt}' + '\n'
+         return option_str
+
+     def get_gold_answer(self, input_d: dict) -> str:
+         """
+         Parse the raw input labels (gold).
+         """
+         return input_d['answer']
+
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+         """
+         Parse the predicted result and extract proper answer.
+         """
+         return ResponseParser.parse_first_option_with_choices(result, self.choices)
+
+     def match(self, gold: str, pred: str) -> float:
+         """
+         Match the gold answer and the predicted answer.
+         """
+         return exact_match(gold=gold, pred=pred)
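
The heart of the new GPQA adapter is __process_input, which shuffles the three distractors together with the correct answer and records the letter the correct answer lands on; __form_options then renders the shuffled list as lettered choices. A self-contained sketch of that shuffle-and-relabel step, using a made-up record in the dataset's column layout:

    # Sketch of the GPQA shuffle-and-relabel step shown above (the record is invented).
    import random

    record = {
        'Question': 'What is the probability that person A wins this game?',
        'Correct Answer': '2/3',
        'Incorrect Answer 1': '1/2',
        'Incorrect Answer 2': '1/4',
        'Incorrect Answer 3': '1/8',
    }

    choices = [record['Incorrect Answer 1'], record['Incorrect Answer 2'],
               record['Incorrect Answer 3'], record['Correct Answer']]
    random.shuffle(choices)                                          # randomize option order per sample
    gold_letter = chr(65 + choices.index(record['Correct Answer']))  # letter the correct option landed on

    option_block = 'Choices:\n' + ''.join(f'({l}) {c}\n' for l, c in zip('ABCD', choices))
    print(option_block + f'Gold: {gold_letter}')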
@@ -6,7 +6,7 @@ import os
  import re

  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.metrics import WeightedAverageAccuracy
+ from evalscope.metrics import AverageAccuracy
  from evalscope.models import ChatGenerationModelAdapter
  from evalscope.utils.io_utils import jsonl_to_list
  from evalscope.utils.logger import get_logger
@@ -19,7 +19,7 @@ logger = get_logger()
      dataset_id='modelscope/gsm8k',
      model_adapter=ChatGenerationModelAdapter,
      subset_list=['main'],
-     metric_list=[WeightedAverageAccuracy],
+     metric_list=[AverageAccuracy],
      few_shot_num=4,
      train_split='train',
      eval_split='test',
@@ -33,7 +33,7 @@ class GSM8KAdapter(DataAdapter):

      Args:
          subset_list (list): Subset list for the dataset. Default: ['main']
-         metric_list (list): Metric list for the dataset. Default: [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
+         metric_list (list): Metric list for the dataset. Default: [{'name': 'AverageAccuracy', 'object': mean}]
          few_shot_num (int): Number of few-shot examples. Default: 4
          train_split (str): Train split name. Default: 'train'
          eval_split (str): The target eval split name. Default: 'test'
@@ -75,9 +75,8 @@ class GSM8KAdapter(DataAdapter):
          use_fewshot = self.few_shot_num > 0

          full_prompt = self._generate_prompt(input_d, few_shot_list=few_shot_list, use_fewshot=use_fewshot)
-         full_prompt = f'{self.prompt_template}\n{full_prompt}' if self.prompt_template else full_prompt

-         return {'data': [full_prompt]}
+         return {'data': [full_prompt], 'system_prompt': self.prompt_template}

      def get_gold_answer(self, input_d: dict) -> str:
          # Extract the gold answer from the input dict.
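
As in the other adapters touched by this release, gen_prompt no longer prepends prompt_template to the user prompt; it returns the template under a 'system_prompt' key. How the receiving model adapter consumes that key is not shown in this diff; the following is only an illustrative sketch of what a chat-style consumer could do with the returned dict (build_messages is a hypothetical helper, not evalscope's API):

    # Illustrative only: turning the dict returned by gen_prompt into chat messages.
    def build_messages(prompt_d: dict) -> list:
        messages = []
        if prompt_d.get('system_prompt'):   # an empty template yields no system message
            messages.append({'role': 'system', 'content': prompt_d['system_prompt']})
        messages.append({'role': 'user', 'content': prompt_d['data'][0]})
        return messages

    print(build_messages({'data': ['Janet has 3 apples...'], 'system_prompt': ''}))
    # -> [{'role': 'user', 'content': 'Janet has 3 apples...'}]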
@@ -5,10 +5,11 @@ import re

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import WeightedAverageAccuracy, exact_match
+ from evalscope.metrics import AverageAccuracy, exact_match
  from evalscope.models import ContinuationLogitsModelAdapter
  from evalscope.utils.io_utils import jsonl_to_list
  from evalscope.utils.logger import get_logger
+ from evalscope.utils.utils import ResponseParser

  # flake8: noqa

@@ -20,11 +21,12 @@ logger = get_logger()
      dataset_id='modelscope/hellaswag',
      model_adapter=ContinuationLogitsModelAdapter,
      subset_list=['default'],
-     metric_list=[WeightedAverageAccuracy],
+     metric_list=[AverageAccuracy],
      few_shot_num=0,
      train_split='train',
      eval_split='validation',
-     prompt_template='',
+     prompt_template=
+     'Respond with the index of sentence that makes the most sense, chose from 0, 1, 2, 3, derive your final answer as `The answer is ...`.', # noqa: E501
  )
  class HellaSwagAdapter(DataAdapter):

@@ -87,7 +89,11 @@ class HellaSwagAdapter(DataAdapter):

          ctx_continuation_pair_list = [(context.strip(), ' ' + cont.strip()) for cont in endings]

-         return {'data': ctx_continuation_pair_list, 'multi_choices': self.choices}
+         return {
+             'data': ctx_continuation_pair_list,
+             'multi_choices': self.choices,
+             'system_prompt': self.prompt_template
+         }

      def get_gold_answer(self, input_d: dict) -> str:
          # Get the gold choice
@@ -114,9 +120,9 @@ class HellaSwagAdapter(DataAdapter):

              return str(best_choice_idx)
          elif eval_type == EvalType.SERVICE:
-             return result # TODO: to be supported !
+             return ResponseParser.parse_first_option(result)
          elif eval_type == EvalType.CUSTOM:
-             return result # TODO: to be supported !
+             return ResponseParser.parse_first_option(result)
          else:
              raise ValueError(f'Invalid eval_type: {eval_type}')

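
The SERVICE and CUSTOM branches now feed the raw completion through ResponseParser.parse_first_option rather than returning it unparsed. The parser's implementation is not part of this diff; below is a simplified regex sketch of the kind of extraction such a parser performs (the real evalscope.utils.utils.ResponseParser may differ):

    # Simplified sketch of a first-option parser; not evalscope's actual implementation.
    import re

    def parse_first_option(text: str) -> str:
        # Prefer an explicit "The answer is X" statement, as the new prompt_template requests.
        m = re.search(r'answer is\s*\(?([0-3A-D])\)?', text, flags=re.IGNORECASE)
        if m:
            return m.group(1)
        # Otherwise fall back to the first standalone option token in the response.
        m = re.search(r'\b([0-3A-D])\b', text)
        return m.group(1) if m else text

    print(parse_first_option('Let me think... The answer is 2.'))  # -> '2'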
@@ -1,6 +1,5 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
  import re
- from typing import List

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.metrics import Pass1
@@ -22,7 +21,7 @@ logger = get_logger()
      few_shot_num=0,
      train_split=None,
      eval_split='test',
-     prompt_template='Complete the following python code:\n',
+     prompt_template='',
  )
  class HumanevalAdapter(DataAdapter):
      """
@@ -66,9 +65,9 @@ class HumanevalAdapter(DataAdapter):
          {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}
          """
          full_prompt = input_d['prompt']
-         full_prompt = f'{self.prompt_template}\n{full_prompt}' if self.prompt_template else full_prompt
+         full_prompt = f'Complete the following python code:\n{full_prompt}' if self.prompt_template else full_prompt

-         return {'data': [full_prompt]}
+         return {'data': [full_prompt], 'system_prompt': self.prompt_template}

      @classmethod
      def _postprocess(cls, text: str) -> str:
File without changes
@@ -0,0 +1,56 @@
+ from collections import defaultdict
+ from typing import Any, Dict, List
+
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.benchmarks.ifeval.utils import agg_inst_level_acc, process_results
+ from evalscope.constants import EvalType
+ from evalscope.metrics import Metric, mean
+ from evalscope.models import ChatGenerationModelAdapter
+
+
+ @Benchmark.register(
+     name='ifeval',
+     dataset_id='opencompass/ifeval',
+     model_adapter=ChatGenerationModelAdapter,
+     subset_list=['default'],
+     metric_list=[
+         Metric(name='prompt_level_strict_acc', object=mean),
+         Metric(name='inst_level_strict_acc', object=agg_inst_level_acc),
+         Metric(name='prompt_level_loose_acc', object=mean),
+         Metric(name='inst_level_loose_acc', object=agg_inst_level_acc),
+     ],
+     few_shot_num=0,
+     train_split=None,
+     eval_split='train',
+     prompt_template='',
+ )
+ class IFEvalAdapter(DataAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+         return {'data': [input_d['prompt']], 'system_prompt': self.prompt_template}
+
+     def get_gold_answer(self, input_d: dict) -> str:
+         return input_d
+
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+         return result
+
+     def match(self, gold: Any, pred: Any) -> Dict:
+         return process_results(gold, [pred])
+
+     def compute_metric(self, review_res_list: List[dict]) -> Any:
+         # aggregate review results
+         res_dict = defaultdict(list)
+         for res in review_res_list:
+             for k, v in res.items():
+                 res_dict[k].append(v)
+
+         metrics = []
+         for metric in self.metric_list:
+             metric_name = metric.name
+             pred_value = res_dict[metric_name]
+             metrics.append({'metric_name': metric_name, 'score': metric.object(pred_value), 'num': len(pred_value)})
+         return metrics
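
The IFEval adapter reports two prompt-level and two instruction-level accuracies: the prompt-level metrics aggregate with mean, while the instruction-level ones use agg_inst_level_acc from benchmarks/ifeval/utils.py, which this diff does not show. A hedged sketch of a flatten-then-average aggregator of that kind (the actual helper may be implemented differently):

    # Hedged sketch of an instruction-level aggregator; evalscope's real
    # agg_inst_level_acc may differ.
    from typing import List

    def agg_inst_level_acc(items: List[List[bool]]) -> float:
        flat = [ok for sample in items for ok in sample]   # one flag per instruction, pooled across prompts
        return sum(flat) / len(flat) if flat else 0.0

    # prompt_level_*_acc, by contrast, is a plain mean of one flag per prompt.
    print(agg_inst_level_acc([[True, False], [True, True, True]]))  # -> 0.8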