evalscope 0.8.2__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +2 -0
- evalscope/arguments.py +11 -3
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/__init__.py +20 -1
- evalscope/benchmarks/arc/__init__.py +0 -5
- evalscope/benchmarks/arc/arc_adapter.py +24 -102
- evalscope/benchmarks/bbh/__init__.py +0 -4
- evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
- evalscope/benchmarks/benchmark.py +70 -59
- evalscope/benchmarks/ceval/__init__.py +0 -5
- evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
- evalscope/benchmarks/cmmlu/__init__.py +0 -5
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
- evalscope/benchmarks/competition_math/__init__.py +0 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
- evalscope/benchmarks/data_adapter.py +115 -87
- evalscope/benchmarks/general_qa/__init__.py +0 -5
- evalscope/benchmarks/general_qa/general_qa_adapter.py +23 -79
- evalscope/benchmarks/gsm8k/__init__.py +0 -4
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +21 -101
- evalscope/benchmarks/hellaswag/__init__.py +0 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +32 -99
- evalscope/benchmarks/humaneval/__init__.py +0 -4
- evalscope/benchmarks/humaneval/humaneval_adapter.py +18 -120
- evalscope/benchmarks/ifeval/__init__.py +0 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
- evalscope/benchmarks/ifeval/instructions.py +1478 -0
- evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
- evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
- evalscope/benchmarks/ifeval/utils.py +134 -0
- evalscope/benchmarks/iquiz/__init__.py +0 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
- evalscope/benchmarks/mmlu/__init__.py +0 -5
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
- evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
- evalscope/benchmarks/race/__init__.py +0 -5
- evalscope/benchmarks/race/race_adapter.py +26 -123
- evalscope/benchmarks/trivia_qa/__init__.py +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
- evalscope/benchmarks/truthful_qa/__init__.py +0 -5
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +29 -0
- evalscope/collections/__init__.py +3 -0
- evalscope/collections/evaluator.py +198 -0
- evalscope/collections/sampler.py +138 -0
- evalscope/collections/schema.py +126 -0
- evalscope/config.py +7 -5
- evalscope/constants.py +9 -26
- evalscope/evaluator/evaluator.py +87 -121
- evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
- evalscope/metrics/__init__.py +3 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/math_accuracy.py +193 -50
- evalscope/metrics/metrics.py +18 -6
- evalscope/metrics/named_metrics.py +17 -0
- evalscope/metrics/rouge_metric.py +13 -8
- evalscope/models/__init__.py +14 -1
- evalscope/models/base_adapter.py +52 -0
- evalscope/models/chat_adapter.py +138 -0
- evalscope/models/choice_adapter.py +211 -0
- evalscope/models/custom_adapter.py +67 -0
- evalscope/models/local_model.py +74 -0
- evalscope/models/model.py +141 -0
- evalscope/models/server_adapter.py +111 -0
- evalscope/perf/__init__.py +1 -0
- evalscope/perf/main.py +0 -1
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +1 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/longalpaca.py +1 -1
- evalscope/report/__init__.py +5 -0
- evalscope/report/app.py +506 -0
- evalscope/report/combinator.py +73 -0
- evalscope/report/generator.py +80 -0
- evalscope/report/utils.py +133 -0
- evalscope/run.py +48 -72
- evalscope/run_arena.py +1 -1
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/chat_service.py +5 -4
- evalscope/utils/io_utils.py +8 -0
- evalscope/utils/logger.py +5 -0
- evalscope/utils/model_utils.py +15 -2
- evalscope/utils/utils.py +3 -25
- evalscope/version.py +2 -2
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/METADATA +115 -21
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/RECORD +99 -78
- tests/cli/test_collection.py +57 -0
- tests/cli/test_run.py +52 -1
- tests/rag/test_mteb.py +3 -2
- evalscope/models/api/__init__.py +0 -3
- evalscope/models/dummy_chat_model.py +0 -49
- evalscope/models/model_adapter.py +0 -525
- evalscope/models/openai_model.py +0 -103
- evalscope/tools/__init__.py +0 -1
- evalscope/tools/combine_reports.py +0 -133
- evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
- /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
- /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/hellaswag/hellaswag_adapter.py

@@ -3,54 +3,43 @@ import numpy as np
 import os
 import re
 
-from evalscope.benchmarks
-from evalscope.
-from evalscope.
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType
+from evalscope.metrics import AverageAccuracy, exact_match
+from evalscope.models import ContinuationLogitsModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
+from evalscope.utils.utils import ResponseParser
 
 # flake8: noqa
 
 logger = get_logger()
 
-DATASET_ID = 'modelscope/hellaswag'
-SUBSET_LIST = ['default']
-
 
+@Benchmark.register(
+    name='hellaswag',
+    dataset_id='modelscope/hellaswag',
+    model_adapter=ContinuationLogitsModelAdapter,
+    subset_list=['default'],
+    metric_list=[AverageAccuracy],
+    few_shot_num=0,
+    train_split='train',
+    eval_split='validation',
+    prompt_template=
+    'Respond with the index of sentence that makes the most sense, chose from 0, 1, 2, 3, derive your final answer as `The answer is ...`.',  # noqa: E501
+)
 class HellaSwagAdapter(DataAdapter):
 
     choices = ['0', '1', '2', '3']
 
-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = 'train',
-                 eval_split: str = 'validation',
-                 **kwargs):
-
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-        if few_shot_num is None:
-            # Use 0-shot by default
-            logger.info(f'Set 0-shot examples by system for HellaSwag.')
-            few_shot_num = 0
+    def __init__(self, **kwargs):
 
+        few_shot_num = kwargs.get('few_shot_num', 0)
         if few_shot_num != 0:
             logger.warning(f'few_shot_num should be 0 for HellaSwag, but got {few_shot_num}. Use 0-shot by default.')
-            few_shot_num = 0
+            kwargs['few_shot_num'] = 0
 
-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            **kwargs)
+        super().__init__(**kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}

@@ -100,13 +89,17 @@ class HellaSwagAdapter(DataAdapter):
 
         ctx_continuation_pair_list = [(context.strip(), ' ' + cont.strip()) for cont in endings]
 
-        return {
+        return {
+            'data': ctx_continuation_pair_list,
+            'multi_choices': self.choices,
+            'system_prompt': self.prompt_template
+        }
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
         return input_d['label']
 
-    def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str =
+    def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
         """
         Parse the model output to get the answer. Could be the best choice index.
 

@@ -118,7 +111,7 @@ class HellaSwagAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if eval_type ==
+        if eval_type == EvalType.CHECKPOINT:
             # answer: in the form of [-2.3, -4.5, ...], len of self.choices
             result = np.array(result)
             endings: list = [self._preprocess(ending) for ending in raw_input_d['endings']]

@@ -126,76 +119,16 @@ class HellaSwagAdapter(DataAdapter):
             best_choice_idx = np.argmax(result / completion_len)
 
             return str(best_choice_idx)
-        elif eval_type ==
-            return result
-        elif eval_type ==
-            return result
+        elif eval_type == EvalType.SERVICE:
+            return ResponseParser.parse_first_option(result)
+        elif eval_type == EvalType.CUSTOM:
+            return ResponseParser.parse_first_option(result)
         else:
             raise ValueError(f'Invalid eval_type: {eval_type}')
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=str(gold), pred=str(pred))
 
-    def compute_metric(self, review_res_list: list) -> float:
-        """
-        Compute evaluation result by specific metric.
-
-        Args:
-            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-        Returns:
-            The metric score.
-        """
-        items = [(score, 1.0) for score in review_res_list]
-        return weighted_mean(items)
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate the report for the model output.
-
-        Args:
-            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-            report_name: The user-defined report name.
-
-        Returns: A dict of metric calculation results. The format is like:
-        {
-            "name":"HellaSwag",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.4128,
-                    "subset":[
-                        {
-                            "name":"default",
-                            "score":0.5632
-                        },
-                    ]
-                }
-            ],
-            "total_num":7800
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{
-            'name': subset_name,
-            'score': normalize_score(score=score)
-        } for subset_name, (score, _) in subset_score_map.items()]
-
-        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'hellaswag',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
-
     @classmethod
     def _preprocess(cls, text):
         text = text.strip()
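
The hellaswag hunk above shows the core of the 0.8.2 → 0.10.0 refactor: module-level DATASET_ID/SUBSET_LIST constants and long __init__ signatures give way to a @Benchmark.register(...) decorator plus a **kwargs constructor, with dataset, splits, metrics and the model adapter declared in one place. A minimal sketch of how a custom adapter might plug into that registry follows; the benchmark name, dataset id, field names ('question', 'answer') and prompt wording are illustrative assumptions, while the decorator fields and the gen_prompt/get_gold_answer/parse_pred_result/match hooks come from the hunks in this diff.

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.metrics import AverageAccuracy, exact_match
from evalscope.models import ChatGenerationModelAdapter


@Benchmark.register(
    name='my_qa',                       # illustrative benchmark name, not part of this release
    dataset_id='my-org/my-qa-dataset',  # hypothetical dataset id
    model_adapter=ChatGenerationModelAdapter,
    subset_list=['default'],
    metric_list=[AverageAccuracy],
    few_shot_num=0,
    train_split=None,
    eval_split='test',
    prompt_template='Answer the question concisely.',
)
class MyQAAdapter(DataAdapter):

    def __init__(self, **kwargs):
        # Benchmark-level defaults come from the decorator; callers may override them via kwargs.
        super().__init__(**kwargs)

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        # Same payload shape as the refactored adapters: a list of prompts plus a system prompt.
        return {'data': [input_d['question']], 'system_prompt': self.prompt_template}

    def get_gold_answer(self, input_d: dict) -> str:
        return input_d['answer']

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
        return result.strip()

    def match(self, gold: str, pred: str) -> float:
        return exact_match(gold=gold, pred=pred)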
evalscope/benchmarks/humaneval/__init__.py

@@ -1,5 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.humaneval.humaneval_adapter import DATASET_ID, SUBSET_LIST
-from evalscope.benchmarks.humaneval.humaneval_adapter import HumanevalAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa
evalscope/benchmarks/humaneval/humaneval_adapter.py

@@ -1,38 +1,34 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import json
-import os
 import re
-from tqdm import tqdm
-from typing import List
 
-from evalscope.benchmarks
-from evalscope.metrics
-from evalscope.
-from evalscope.utils import normalize_score
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics import Pass1
+from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
-DATASET_ID = 'modelscope/humaneval'
-SUBSET_LIST = ['openai_humaneval']
-
 # Example:
 # {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"} # noqa
 
 
+@Benchmark.register(
+    name='humaneval',
+    dataset_id='modelscope/humaneval',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['openai_humaneval'],
+    metric_list=[Pass1],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template='',
+)
 class HumanevalAdapter(DataAdapter):
     """
     A placeholder for humaneval adapter, see HumanevalEvaluator for implementation.
     """
 
-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = None,
-                 eval_split: str = 'test',
-                 prompt_template: str = 'Complete the following python code:\n',
-                 **kwargs):
+    def __init__(self, **kwargs):
         try:
             from human_eval.data import stream_jsonl, write_jsonl
             from human_eval.evaluation import check_correctness

@@ -41,29 +37,15 @@ class HumanevalAdapter(DataAdapter):
                 'https://github.com/openai/human-eval/tree/master#installation , '
                 'Note that you need to enable the execution code in the human_eval/execution.py first.')
 
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
         self.k = [1]
         self.num_workers = 4
         self.timeout = 4.0
-        self.outputs = kwargs.get('outputs', None)
 
         self.read_problems_func = stream_jsonl
        self.write_jsonl_func = write_jsonl
         self.eval_func = check_correctness
 
-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            prompt_template=prompt_template,
-            **kwargs)
+        super().__init__(**kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}

@@ -83,80 +65,9 @@ class HumanevalAdapter(DataAdapter):
         {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}
         """
         full_prompt = input_d['prompt']
-        full_prompt = f'
-
-        return {'data': [full_prompt]}
-
-    def get_answers(self, infer_cfg: dict) -> List[dict]:
-        ans_list: list = []
-        system_prompt: str = ''
-        for task_id, data_d in tqdm(self.problems.items(), total=len(self.problems), desc='Predicting(problems)'):
-            prompt: str = system_prompt + data_d['prompt']
-            inputs: dict = {'data': [prompt]}
-
-            pred_res: dict = self.model_adapter.predict(inputs=inputs, infer_cfg=infer_cfg)
-
-            pred_ans: str = pred_res['choices'][0]['message']['content']
-            pred_ans = self._postprocess(pred_ans)
-
-            ans_list.append({'task_id': task_id, 'completion': pred_ans})
-
-        return ans_list
-
-    def eval(self, infer_cfg: dict, **kwargs):
-
-        # predict
-        ans_list: list = self.get_answers(infer_cfg)
-        ans_out_file: str = os.path.join(self.outputs_structure.predictions_dir, 'human_eval_predictions.jsonl')
+        full_prompt = f'Complete the following python code:\n{full_prompt}' if self.prompt_template else full_prompt
 
-        self.
-        # logger.info(f'** Dump predictions to {ans_out_file} successfully.')
-        logger.info('** Dump predictions successfully.')
-
-        # evaluate results: e.g. {'pass@1': 0.333, 'pass@10': 0.111}
-        results = self.eval_func(
-            sample_file=ans_out_file,
-            k=self.k,
-            n_workers=self.num_workers,
-            timeout=self.timeout,
-            problem_file=self.problem_file)
-
-        # output: report
-        report_map: dict = self.gen_report(results=results)
-        report_dir: str = self.outputs_structure.reports_dir
-        report_file: str = os.path.join(report_dir, 'human_eval_report.json')
-
-        with open(report_file, 'w') as f:
-            f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
-        # logger.info(f'** Dump report to {report_file} \n')
-        logger.info('** Dump report \n')
-
-        try:
-            # Make table
-            report_table: str = gen_table([report_dir])
-            logger.info(f'** Report table: \n {report_table} \n')
-        except Exception:
-            logger.error('Failed to generate report table.')
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{
-            'name': subset_name,
-            'score': normalize_score(score=score)
-        } for subset_name, (score, _) in subset_score_map.items()]
-
-        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'HumanEval',
-            metric='pass@1',
-            score=weighted_avg_acc,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
+        return {'data': [full_prompt], 'system_prompt': self.prompt_template}
 
     @classmethod
     def _postprocess(cls, text: str) -> str:

@@ -182,19 +93,6 @@ class HumanevalAdapter(DataAdapter):
         text = '\n'.join([' ' + line for line in text.split('\n')])
         return text
 
-    def compute_metric(self, review_res_list: list) -> float:
-        """
-        Compute evaluation result by specific metric.
-
-        Args:
-            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-        Returns:
-            The metric score.
-        """
-        items = [(score, 1.0) for score in review_res_list]
-        return weighted_mean(items)
-
     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
         return self._postprocess(result)
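
One detail worth noting across the humaneval and hellaswag hunks is the payload contract that gen_prompt now follows: a 'data' list with one entry per request plus a 'system_prompt' carrying the benchmark-level prompt_template (hellaswag additionally passes 'multi_choices'). A small sketch of the two shapes; the field names come from the hunks above, the concrete values are placeholders.

# Generation-style benchmark (humaneval): a single code prompt, empty system prompt in 0.10.0.
humaneval_payload = {
    'data': ['def add(a, b):\n    """Return the sum of a and b."""\n'],
    'system_prompt': '',
}

# Continuation-logits benchmark (hellaswag): (context, continuation) pairs plus the choice labels.
hellaswag_payload = {
    'data': [('A man sits on a roof.', ' He starts pulling up roofing.')],
    'multi_choices': ['0', '1', '2', '3'],
    'system_prompt': 'Respond with the index of sentence that makes the most sense, ...',
}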
evalscope/benchmarks/ifeval/__init__.py: File without changes
evalscope/benchmarks/ifeval/ifeval_adapter.py

@@ -0,0 +1,57 @@
+from collections import defaultdict
+from typing import Any, Dict, List
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.benchmarks.ifeval.utils import agg_inst_level_acc, process_results
+from evalscope.constants import EvalType
+from evalscope.metrics import Metric, mean
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils.utils import normalize_score
+
+
+@Benchmark.register(
+    name='ifeval',
+    dataset_id='opencompass/ifeval',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['default'],
+    metric_list=[
+        Metric(name='prompt_level_strict_acc', object=mean),
+        Metric(name='inst_level_strict_acc', object=agg_inst_level_acc),
+        Metric(name='prompt_level_loose_acc', object=mean),
+        Metric(name='inst_level_loose_acc', object=agg_inst_level_acc),
+    ],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='train',
+    prompt_template='',
+)
+class IFEvalAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+        return {'data': [input_d['prompt']], 'system_prompt': self.prompt_template}
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        return input_d
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        return result
+
+    def match(self, gold: Any, pred: Any) -> Dict:
+        return process_results(gold, [pred])
+
+    def compute_metric(self, review_res_list: List[dict]) -> Any:
+        # aggregate review results
+        res_dict = defaultdict(list)
+        for res in review_res_list:
+            for k, v in res.items():
+                res_dict[k].append(v)
+
+        metrics = []
+        for metric in self.metric_list:
+            metric_name = metric.name
+            pred_value = res_dict[metric_name]
+            metrics.append({'metric_name': metric_name, 'score': metric.object(pred_value), 'num': len(pred_value)})
+        return metrics