evalscope 0.8.2__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +11 -3
  3. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  4. evalscope/backend/rag_eval/utils/llm.py +1 -1
  5. evalscope/benchmarks/__init__.py +20 -1
  6. evalscope/benchmarks/arc/__init__.py +0 -5
  7. evalscope/benchmarks/arc/arc_adapter.py +24 -102
  8. evalscope/benchmarks/bbh/__init__.py +0 -4
  9. evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
  10. evalscope/benchmarks/benchmark.py +70 -59
  11. evalscope/benchmarks/ceval/__init__.py +0 -5
  12. evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
  13. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  14. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
  15. evalscope/benchmarks/competition_math/__init__.py +0 -5
  16. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  17. evalscope/benchmarks/data_adapter.py +115 -87
  18. evalscope/benchmarks/general_qa/__init__.py +0 -5
  19. evalscope/benchmarks/general_qa/general_qa_adapter.py +23 -79
  20. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  21. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +21 -101
  22. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  23. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +32 -99
  24. evalscope/benchmarks/humaneval/__init__.py +0 -4
  25. evalscope/benchmarks/humaneval/humaneval_adapter.py +18 -120
  26. evalscope/benchmarks/ifeval/__init__.py +0 -0
  27. evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
  28. evalscope/benchmarks/ifeval/instructions.py +1478 -0
  29. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  30. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  31. evalscope/benchmarks/ifeval/utils.py +134 -0
  32. evalscope/benchmarks/iquiz/__init__.py +0 -0
  33. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  34. evalscope/benchmarks/mmlu/__init__.py +0 -5
  35. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
  36. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  37. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  38. evalscope/benchmarks/race/__init__.py +0 -5
  39. evalscope/benchmarks/race/race_adapter.py +26 -123
  40. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  41. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
  42. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  43. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
  44. evalscope/cli/cli.py +2 -0
  45. evalscope/cli/start_app.py +29 -0
  46. evalscope/collections/__init__.py +3 -0
  47. evalscope/collections/evaluator.py +198 -0
  48. evalscope/collections/sampler.py +138 -0
  49. evalscope/collections/schema.py +126 -0
  50. evalscope/config.py +7 -5
  51. evalscope/constants.py +9 -26
  52. evalscope/evaluator/evaluator.py +87 -121
  53. evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
  54. evalscope/metrics/__init__.py +3 -0
  55. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  56. evalscope/metrics/math_accuracy.py +193 -50
  57. evalscope/metrics/metrics.py +18 -6
  58. evalscope/metrics/named_metrics.py +17 -0
  59. evalscope/metrics/rouge_metric.py +13 -8
  60. evalscope/models/__init__.py +14 -1
  61. evalscope/models/base_adapter.py +52 -0
  62. evalscope/models/chat_adapter.py +138 -0
  63. evalscope/models/choice_adapter.py +211 -0
  64. evalscope/models/custom_adapter.py +67 -0
  65. evalscope/models/local_model.py +74 -0
  66. evalscope/models/model.py +141 -0
  67. evalscope/models/server_adapter.py +111 -0
  68. evalscope/perf/__init__.py +1 -0
  69. evalscope/perf/main.py +0 -1
  70. evalscope/perf/plugin/api/custom_api.py +1 -1
  71. evalscope/perf/plugin/api/openai_api.py +1 -1
  72. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  73. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  74. evalscope/report/__init__.py +5 -0
  75. evalscope/report/app.py +506 -0
  76. evalscope/report/combinator.py +73 -0
  77. evalscope/report/generator.py +80 -0
  78. evalscope/report/utils.py +133 -0
  79. evalscope/run.py +48 -72
  80. evalscope/run_arena.py +1 -1
  81. evalscope/summarizer.py +1 -1
  82. evalscope/utils/__init__.py +1 -1
  83. evalscope/utils/chat_service.py +5 -4
  84. evalscope/utils/io_utils.py +8 -0
  85. evalscope/utils/logger.py +5 -0
  86. evalscope/utils/model_utils.py +15 -2
  87. evalscope/utils/utils.py +3 -25
  88. evalscope/version.py +2 -2
  89. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/METADATA +115 -21
  90. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/RECORD +99 -78
  91. tests/cli/test_collection.py +57 -0
  92. tests/cli/test_run.py +52 -1
  93. tests/rag/test_mteb.py +3 -2
  94. evalscope/models/api/__init__.py +0 -3
  95. evalscope/models/dummy_chat_model.py +0 -49
  96. evalscope/models/model_adapter.py +0 -525
  97. evalscope/models/openai_model.py +0 -103
  98. evalscope/tools/__init__.py +0 -1
  99. evalscope/tools/combine_reports.py +0 -133
  100. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  101. /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
  102. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  103. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
  104. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
  105. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
  106. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
--- a/evalscope/benchmarks/hellaswag/hellaswag_adapter.py
+++ b/evalscope/benchmarks/hellaswag/hellaswag_adapter.py
@@ -3,54 +3,43 @@ import numpy as np
 import os
 import re
 
-from evalscope.benchmarks.data_adapter import DataAdapter
-from evalscope.metrics.metrics import exact_match, weighted_mean
-from evalscope.utils import normalize_score
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType
+from evalscope.metrics import AverageAccuracy, exact_match
+from evalscope.models import ContinuationLogitsModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
+from evalscope.utils.utils import ResponseParser
 
 # flake8: noqa
 
 logger = get_logger()
 
-DATASET_ID = 'modelscope/hellaswag'
-SUBSET_LIST = ['default']
-
 
+@Benchmark.register(
+    name='hellaswag',
+    dataset_id='modelscope/hellaswag',
+    model_adapter=ContinuationLogitsModelAdapter,
+    subset_list=['default'],
+    metric_list=[AverageAccuracy],
+    few_shot_num=0,
+    train_split='train',
+    eval_split='validation',
+    prompt_template=
+    'Respond with the index of sentence that makes the most sense, chose from 0, 1, 2, 3, derive your final answer as `The answer is ...`.',  # noqa: E501
+)
 class HellaSwagAdapter(DataAdapter):
 
     choices = ['0', '1', '2', '3']
 
-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = 'train',
-                 eval_split: str = 'validation',
-                 **kwargs):
-
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-        if few_shot_num is None:
-            # Use 0-shot by default
-            logger.info(f'Set 0-shot examples by system for HellaSwag.')
-            few_shot_num = 0
+    def __init__(self, **kwargs):
 
+        few_shot_num = kwargs.get('few_shot_num', 0)
         if few_shot_num != 0:
             logger.warning(f'few_shot_num should be 0 for HellaSwag, but got {few_shot_num}. Use 0-shot by default.')
-            few_shot_num = 0
+            kwargs['few_shot_num'] = 0
 
-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            **kwargs)
+        super().__init__(**kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -100,13 +89,17 @@ class HellaSwagAdapter(DataAdapter):
 
         ctx_continuation_pair_list = [(context.strip(), ' ' + cont.strip()) for cont in endings]
 
-        return {'data': ctx_continuation_pair_list, 'multi_choices': self.choices}
+        return {
+            'data': ctx_continuation_pair_list,
+            'multi_choices': self.choices,
+            'system_prompt': self.prompt_template
+        }
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
         return input_d['label']
 
-    def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+    def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
         """
         Parse the model output to get the answer. Could be the best choice index.
 
@@ -118,7 +111,7 @@ class HellaSwagAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if eval_type == 'checkpoint':
+        if eval_type == EvalType.CHECKPOINT:
             # answer: in the form of [-2.3, -4.5, ...], len of self.choices
             result = np.array(result)
             endings: list = [self._preprocess(ending) for ending in raw_input_d['endings']]
@@ -126,76 +119,16 @@ class HellaSwagAdapter(DataAdapter):
             best_choice_idx = np.argmax(result / completion_len)
 
             return str(best_choice_idx)
-        elif eval_type == 'service':
-            return result  # TODO: to be supported !
-        elif eval_type == 'custom':
-            return result  # TODO: to be supported !
+        elif eval_type == EvalType.SERVICE:
+            return ResponseParser.parse_first_option(result)
+        elif eval_type == EvalType.CUSTOM:
+            return ResponseParser.parse_first_option(result)
         else:
             raise ValueError(f'Invalid eval_type: {eval_type}')
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=str(gold), pred=str(pred))
 
-    def compute_metric(self, review_res_list: list) -> float:
-        """
-        Compute evaluation result by specific metric.
-
-        Args:
-            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-        Returns:
-            The metric score.
-        """
-        items = [(score, 1.0) for score in review_res_list]
-        return weighted_mean(items)
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate the report for the model output.
-
-        Args:
-            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-            report_name: The user-defined report name.
-
-        Returns: A dict of metric calculation results. The format is like:
-        {
-            "name":"HellaSwag",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.4128,
-                    "subset":[
-                        {
-                            "name":"default",
-                            "score":0.5632
-                        },
-                    ]
-                }
-            ],
-            "total_num":7800
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{
-            'name': subset_name,
-            'score': normalize_score(score=score)
-        } for subset_name, (score, _) in subset_score_map.items()]
-
-        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'hellaswag',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
-
     @classmethod
     def _preprocess(cls, text):
         text = text.strip()
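
Reading note: the hunk above shows the 0.10.0 adapter pattern in one place. The per-benchmark DATASET_ID/SUBSET_LIST constants, the constructor plumbing, and the hand-written compute_metric/gen_report methods are replaced by a @Benchmark.register decorator plus a named metric object, with report assembly presumably moving to the new evalscope/report package listed above. Below is a minimal sketch of how a custom benchmark could plug into that registry, using only imports and parameters that appear in this diff; the benchmark name 'my_qa', the dataset id 'my_org/my_qa', and the 'question'/'answer' field names are hypothetical placeholders, not evalscope APIs.

# Minimal sketch of a custom benchmark under the 0.10.0 registration pattern.
# 'my_qa', 'my_org/my_qa', and the 'question'/'answer' fields are hypothetical.
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import EvalType
from evalscope.metrics import AverageAccuracy, exact_match
from evalscope.models import ChatGenerationModelAdapter


@Benchmark.register(
    name='my_qa',                              # hypothetical benchmark name
    dataset_id='my_org/my_qa',                 # hypothetical dataset id
    model_adapter=ChatGenerationModelAdapter,  # generation-style adapter, as in humaneval
    subset_list=['default'],
    metric_list=[AverageAccuracy],             # replaces the old metric dicts
    few_shot_num=0,
    train_split=None,
    eval_split='test',
    prompt_template='',
)
class MyQAAdapter(DataAdapter):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)  # defaults now come from the decorator

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        # same 'data'/'system_prompt' prompt dict used by the adapters in this diff
        return {'data': [input_d['question']], 'system_prompt': self.prompt_template}

    def get_gold_answer(self, input_d: dict) -> str:
        return input_d['answer']

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        return result.strip()

    def match(self, gold: str, pred: str) -> float:
        return exact_match(gold=str(gold), pred=str(pred))

Per-sample scores from match() are then aggregated by the registered metric (here AverageAccuracy), which appears to be why compute_metric and gen_report are deleted from the adapters in this release.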
--- a/evalscope/benchmarks/humaneval/__init__.py
+++ b/evalscope/benchmarks/humaneval/__init__.py
@@ -1,5 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.humaneval.humaneval_adapter import DATASET_ID, SUBSET_LIST
-from evalscope.benchmarks.humaneval.humaneval_adapter import HumanevalAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa
--- a/evalscope/benchmarks/humaneval/humaneval_adapter.py
+++ b/evalscope/benchmarks/humaneval/humaneval_adapter.py
@@ -1,38 +1,34 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import json
-import os
 import re
-from tqdm import tqdm
-from typing import List
 
-from evalscope.benchmarks.data_adapter import DataAdapter
-from evalscope.metrics.metrics import weighted_mean
-from evalscope.tools.combine_reports import gen_table
-from evalscope.utils import normalize_score
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics import Pass1
+from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
-DATASET_ID = 'modelscope/humaneval'
-SUBSET_LIST = ['openai_humaneval']
-
 # Example:
 # {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"} # noqa
 
 
+@Benchmark.register(
+    name='humaneval',
+    dataset_id='modelscope/humaneval',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['openai_humaneval'],
+    metric_list=[Pass1],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template='',
+)
 class HumanevalAdapter(DataAdapter):
     """
     A placeholder for humaneval adapter, see HumanevalEvaluator for implementation.
     """
 
-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = None,
-                 eval_split: str = 'test',
-                 prompt_template: str = 'Complete the following python code:\n',
-                 **kwargs):
+    def __init__(self, **kwargs):
         try:
             from human_eval.data import stream_jsonl, write_jsonl
             from human_eval.evaluation import check_correctness
@@ -41,29 +37,15 @@ class HumanevalAdapter(DataAdapter):
                 'https://github.com/openai/human-eval/tree/master#installation , '
                 'Note that you need to enable the execution code in the human_eval/execution.py first.')
 
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
         self.k = [1]
         self.num_workers = 4
         self.timeout = 4.0
-        self.outputs = kwargs.get('outputs', None)
 
         self.read_problems_func = stream_jsonl
         self.write_jsonl_func = write_jsonl
         self.eval_func = check_correctness
 
-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            prompt_template=prompt_template,
-            **kwargs)
+        super().__init__(**kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -83,80 +65,9 @@ class HumanevalAdapter(DataAdapter):
             {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}
         """
         full_prompt = input_d['prompt']
-        full_prompt = f'{self.prompt_template}\n{full_prompt}' if self.prompt_template else full_prompt
-
-        return {'data': [full_prompt]}
-
-    def get_answers(self, infer_cfg: dict) -> List[dict]:
-        ans_list: list = []
-        system_prompt: str = ''
-        for task_id, data_d in tqdm(self.problems.items(), total=len(self.problems), desc='Predicting(problems)'):
-            prompt: str = system_prompt + data_d['prompt']
-            inputs: dict = {'data': [prompt]}
-
-            pred_res: dict = self.model_adapter.predict(inputs=inputs, infer_cfg=infer_cfg)
-
-            pred_ans: str = pred_res['choices'][0]['message']['content']
-            pred_ans = self._postprocess(pred_ans)
-
-            ans_list.append({'task_id': task_id, 'completion': pred_ans})
-
-        return ans_list
-
-    def eval(self, infer_cfg: dict, **kwargs):
-
-        # predict
-        ans_list: list = self.get_answers(infer_cfg)
-        ans_out_file: str = os.path.join(self.outputs_structure.predictions_dir, 'human_eval_predictions.jsonl')
+        full_prompt = f'Complete the following python code:\n{full_prompt}' if self.prompt_template else full_prompt
 
-        self.write_jsonl_func(filename=ans_out_file, data=ans_list)
-        # logger.info(f'** Dump predictions to {ans_out_file} successfully.')
-        logger.info('** Dump predictions successfully.')
-
-        # evaluate results: e.g. {'pass@1': 0.333, 'pass@10': 0.111}
-        results = self.eval_func(
-            sample_file=ans_out_file,
-            k=self.k,
-            n_workers=self.num_workers,
-            timeout=self.timeout,
-            problem_file=self.problem_file)
-
-        # output: report
-        report_map: dict = self.gen_report(results=results)
-        report_dir: str = self.outputs_structure.reports_dir
-        report_file: str = os.path.join(report_dir, 'human_eval_report.json')
-
-        with open(report_file, 'w') as f:
-            f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
-        # logger.info(f'** Dump report to {report_file} \n')
-        logger.info('** Dump report \n')
-
-        try:
-            # Make table
-            report_table: str = gen_table([report_dir])
-            logger.info(f'** Report table: \n {report_table} \n')
-        except Exception:
-            logger.error('Failed to generate report table.')
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{
-            'name': subset_name,
-            'score': normalize_score(score=score)
-        } for subset_name, (score, _) in subset_score_map.items()]
-
-        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'HumanEval',
-            metric='pass@1',
-            score=weighted_avg_acc,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
+        return {'data': [full_prompt], 'system_prompt': self.prompt_template}
 
     @classmethod
     def _postprocess(cls, text: str) -> str:
@@ -182,19 +93,6 @@ class HumanevalAdapter(DataAdapter):
         text = '\n'.join([' ' + line for line in text.split('\n')])
         return text
 
-    def compute_metric(self, review_res_list: list) -> float:
-        """
-        Compute evaluation result by specific metric.
-
-        Args:
-            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-        Returns:
-            The metric score.
-        """
-        items = [(score, 1.0) for score in review_res_list]
-        return weighted_mean(items)
-
     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
         return self._postprocess(result)
 
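Side note: the Pass1 metric registered above corresponds to pass@1 from the HumanEval paper (Chen et al., 2021), computed over completions checked by the human-eval package's check_correctness. The standalone sketch below shows the standard unbiased pass@k estimator only to make the metric concrete; evalscope's own Pass1 implementation is not visible in this diff and may differ in detail.

# Standard unbiased pass@k estimator from the HumanEval paper; Pass1 is the
# k=1 case. Shown for illustration only; not evalscope's exact implementation.
import numpy as np


def pass_at_k(n: int, c: int, k: int) -> float:
    """n: completions sampled per task, c: correct completions, k: k in pass@k."""
    if n - c < k:
        return 1.0
    # 1 - C(n-c, k) / C(n, k), evaluated as a numerically stable product
    return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))


# With a single completion per task (n=1, k=1), pass@1 reduces to the fraction
# of tasks whose completion passes its unit tests:
outcomes = [True, False, True, True]  # hypothetical per-task check results
print(sum(pass_at_k(1, int(ok), 1) for ok in outcomes) / len(outcomes))  # 0.75
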
evalscope/benchmarks/ifeval/__init__.py: File without changes
--- /dev/null
+++ b/evalscope/benchmarks/ifeval/ifeval_adapter.py
@@ -0,0 +1,57 @@
+from collections import defaultdict
+from typing import Any, Dict, List
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.benchmarks.ifeval.utils import agg_inst_level_acc, process_results
+from evalscope.constants import EvalType
+from evalscope.metrics import Metric, mean
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils.utils import normalize_score
+
+
+@Benchmark.register(
+    name='ifeval',
+    dataset_id='opencompass/ifeval',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['default'],
+    metric_list=[
+        Metric(name='prompt_level_strict_acc', object=mean),
+        Metric(name='inst_level_strict_acc', object=agg_inst_level_acc),
+        Metric(name='prompt_level_loose_acc', object=mean),
+        Metric(name='inst_level_loose_acc', object=agg_inst_level_acc),
+    ],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='train',
+    prompt_template='',
+)
+class IFEvalAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+        return {'data': [input_d['prompt']], 'system_prompt': self.prompt_template}
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        return input_d
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        return result
+
+    def match(self, gold: Any, pred: Any) -> Dict:
+        return process_results(gold, [pred])
+
+    def compute_metric(self, review_res_list: List[dict]) -> Any:
+        # aggregate review results
+        res_dict = defaultdict(list)
+        for res in review_res_list:
+            for k, v in res.items():
+                res_dict[k].append(v)
+
+        metrics = []
+        for metric in self.metric_list:
+            metric_name = metric.name
+            pred_value = res_dict[metric_name]
+            metrics.append({'metric_name': metric_name, 'score': metric.object(pred_value), 'num': len(pred_value)})
+        return metrics
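
Side note: IFEvalAdapter overrides compute_metric because match() returns four scores per sample instead of one. The standalone sketch below mimics that aggregation with made-up review results; it assumes, without having seen evalscope/benchmarks/ifeval/utils.py, that prompt-level values are 0/1 per prompt and that agg_inst_level_acc flattens per-instruction booleans before averaging, so treat the helper here as an illustrative stand-in rather than evalscope's implementation.

# Standalone mimic of IFEvalAdapter.compute_metric with made-up review results.
# statistics.mean and the flatten-and-average helper stand in for
# evalscope.metrics.mean and ifeval.utils.agg_inst_level_acc (assumed behavior).
from collections import defaultdict
from statistics import mean


def agg_inst_level_acc(items):
    # assumption: flatten per-prompt instruction booleans, then average
    flat = [x for per_prompt in items for x in per_prompt]
    return sum(flat) / len(flat) if flat else 0.0


review_res_list = [  # one dict per evaluated prompt, as returned by match()
    {'prompt_level_strict_acc': 1, 'inst_level_strict_acc': [True, True],
     'prompt_level_loose_acc': 1, 'inst_level_loose_acc': [True, True]},
    {'prompt_level_strict_acc': 0, 'inst_level_strict_acc': [True, False],
     'prompt_level_loose_acc': 1, 'inst_level_loose_acc': [True, True]},
]

res_dict = defaultdict(list)
for res in review_res_list:
    for k, v in res.items():
        res_dict[k].append(v)

aggregators = {
    'prompt_level_strict_acc': mean,              # 0.5
    'inst_level_strict_acc': agg_inst_level_acc,  # 0.75
    'prompt_level_loose_acc': mean,               # 1.0
    'inst_level_loose_acc': agg_inst_level_acc,   # 1.0
}
for name, agg in aggregators.items():
    values = res_dict[name]
    print({'metric_name': name, 'score': agg(values), 'num': len(values)})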