evalscope 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/arguments.py +1 -0
- evalscope/benchmarks/arc/arc_adapter.py +3 -5
- evalscope/benchmarks/bbh/bbh_adapter.py +3 -3
- evalscope/benchmarks/benchmark.py +1 -1
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +5 -79
- evalscope/benchmarks/competition_math/competition_math_adapter.py +4 -4
- evalscope/benchmarks/data_adapter.py +69 -70
- evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -63
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +4 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +12 -6
- evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -4
- evalscope/benchmarks/ifeval/__init__.py +0 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
- evalscope/benchmarks/ifeval/instructions.py +1478 -0
- evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
- evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
- evalscope/benchmarks/ifeval/utils.py +134 -0
- evalscope/benchmarks/iquiz/__init__.py +0 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -84
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +2 -2
- evalscope/benchmarks/race/race_adapter.py +4 -73
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -6
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -57
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +29 -0
- evalscope/collections/evaluator.py +82 -62
- evalscope/collections/sampler.py +47 -41
- evalscope/collections/schema.py +14 -10
- evalscope/constants.py +4 -0
- evalscope/evaluator/evaluator.py +22 -13
- evalscope/metrics/__init__.py +2 -5
- evalscope/metrics/metrics.py +11 -2
- evalscope/metrics/named_metrics.py +17 -0
- evalscope/models/server_adapter.py +11 -4
- evalscope/perf/__init__.py +1 -0
- evalscope/perf/main.py +0 -1
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +1 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/longalpaca.py +1 -1
- evalscope/report/__init__.py +5 -0
- evalscope/report/app.py +506 -0
- evalscope/report/combinator.py +73 -0
- evalscope/report/generator.py +80 -0
- evalscope/report/utils.py +133 -0
- evalscope/run.py +16 -11
- evalscope/summarizer.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/logger.py +1 -0
- evalscope/utils/model_utils.py +5 -2
- evalscope/version.py +2 -2
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/METADATA +84 -7
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/RECORD +62 -50
- tests/cli/test_collection.py +11 -7
- tests/cli/test_run.py +13 -4
- evalscope/tools/__init__.py +0 -1
- evalscope/tools/combine_reports.py +0 -133
- evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
- /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/general_qa/general_qa_adapter.py

```diff
@@ -1,13 +1,11 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import glob
-import json
 import os.path
 from collections import defaultdict
-from typing import
+from typing import List
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics import
-    weighted_mean)
+from evalscope.metrics import AverageBLEU, bleu_ngram_one_sample, compute_rouge_score_one_sample_zh, mean
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
@@ -20,7 +18,7 @@ logger = get_logger()
     dataset_id='general_qa',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
-    metric_list=[
+    metric_list=[AverageBLEU],
     few_shot_num=0,
     train_split=None,
     eval_split='test',
@@ -68,7 +66,7 @@ class GeneralQAAdapter(DataAdapter):
 
         # if len(history) > 0:
         #     prompt = '\n'.join(history) + '\n' + prompt
-        return {'data': [prompt]}
+        return {'data': [prompt], 'system_prompt': self.prompt_template}
 
     def get_gold_answer(self, input_d: dict) -> str:
         """
@@ -92,14 +90,14 @@ class GeneralQAAdapter(DataAdapter):
         """
         return result
 
-    def match(self, gold: str, pred: str) ->
+    def match(self, gold: str, pred: str) -> dict:
         """
         Args:
             gold: str
             pred: str
 
         Returns:
-            bleu_score:
+            bleu_score: dict
 
         """
         res = dict()
@@ -107,10 +105,9 @@ class GeneralQAAdapter(DataAdapter):
         bleu_dict = bleu_ngram_one_sample(pred, gold)
         res.update(rouge_dict)
         res.update(bleu_dict)
-        # return bleu(item)
         return res
 
-    def compute_metric(self, review_res_list:
+    def compute_metric(self, review_res_list: List[dict]) -> List[dict]:
         """
         compute weighted mean of the bleu score of all samples
 
@@ -118,62 +115,12 @@ class GeneralQAAdapter(DataAdapter):
             review_res_list: [score1, score2, ...]
 
         Returns:
-            avg_res:
+            avg_res: List[dict]
 
         """
         items = defaultdict(list)
         for scores in review_res_list:
             for k, v in scores.items():
-                items[k].append(
+                items[k].append(v)
         # items = [(score, 1.0) for score in review_res_list]
-
-        # return weighted_mean(items)
-        return res
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Args:
-            subset_score_map: {subset_name: (score_dict, num), ...}
-            report_name: str, the user-defined report name.
-
-        Returns:
-        {
-            "name":"GeneralQA",
-            "metric":"WeightedAverageBLEU",
-            "score":0.399,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.399,
-                    "subset":[
-                        {
-                            "name":"default",
-                            "score":0.399
-                        },
-                    ]
-                }
-            ],
-            "total_num":10
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        # weighted_avg_bleu: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        cate_avg_list = [{
-            'name': subset_name,
-            'score': score_dict
-        } for subset_name, (score_dict, _) in subset_score_map.items()]
-        total_avg_list = defaultdict(float)
-        for score_dict, num in subset_score_map.values():
-            for metric, score in score_dict.items():
-                total_avg_list[metric] += score * num / total_num
-
-        category_d = dict(name='DEFAULT', score=total_avg_list, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'general_qa',
-            metric=self.metric_list[0]['name'],
-            score=total_avg_list,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
+        return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
```
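For context, the removal of `gen_report` means `GeneralQAAdapter.compute_metric` now returns one entry per metric name, averaged over samples, presumably so the new report code under `evalscope/report/` can handle multiple metrics per subset. A minimal, self-contained sketch of that aggregation pattern, using `statistics.mean` as a stand-in for evalscope's `mean` helper and made-up sample scores:

```python
# Minimal sketch of the aggregation now done in GeneralQAAdapter.compute_metric:
# group per-sample score dicts by metric name, then average each group.
# statistics.mean stands in for evalscope.metrics.mean (an assumption).
from collections import defaultdict
from statistics import mean
from typing import Dict, List


def aggregate_scores(review_res_list: List[Dict[str, float]]) -> List[dict]:
    items = defaultdict(list)
    for scores in review_res_list:
        for name, value in scores.items():
            items[name].append(value)
    return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]


# Two fake samples, each scored with a ROUGE and a BLEU value (illustrative numbers only).
samples = [
    {'rouge-l-f': 0.41, 'bleu-4': 0.18},
    {'rouge-l-f': 0.55, 'bleu-4': 0.22},
]
print(aggregate_scores(samples))
# -> rouge-l-f ≈ 0.48 and bleu-4 ≈ 0.20, each with num=2
```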
evalscope/benchmarks/gsm8k/gsm8k_adapter.py

```diff
@@ -6,7 +6,7 @@ import os
 import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics import
+from evalscope.metrics import AverageAccuracy
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
@@ -19,7 +19,7 @@ logger = get_logger()
     dataset_id='modelscope/gsm8k',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=['main'],
-    metric_list=[
+    metric_list=[AverageAccuracy],
     few_shot_num=4,
     train_split='train',
     eval_split='test',
@@ -33,7 +33,7 @@ class GSM8KAdapter(DataAdapter):
 
         Args:
             subset_list (list): Subset list for the dataset. Default: ['main']
-            metric_list (list): Metric list for the dataset. Default: [{'name': '
+            metric_list (list): Metric list for the dataset. Default: [{'name': 'AverageAccuracy', 'object': mean}]
            few_shot_num (int): Number of few-shot examples. Default: 4
             train_split (str): Train split name. Default: 'train'
             eval_split (str): The target eval split name. Default: 'test'
@@ -75,9 +75,8 @@ class GSM8KAdapter(DataAdapter):
         use_fewshot = self.few_shot_num > 0
 
         full_prompt = self._generate_prompt(input_d, few_shot_list=few_shot_list, use_fewshot=use_fewshot)
-        full_prompt = f'{self.prompt_template}\n{full_prompt}' if self.prompt_template else full_prompt
 
-        return {'data': [full_prompt]}
+        return {'data': [full_prompt], 'system_prompt': self.prompt_template}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Extract the gold answer from the input dict.
```
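Note that `gen_prompt` no longer prepends `prompt_template` to the question; the template now travels separately under the `system_prompt` key. How the model adapters consume that key is not part of this hunk (the relevant changes are in `evalscope/models/server_adapter.py` in the file list above), so the following is only a hypothetical sketch of the intended split between system and user content, with a made-up `build_chat_messages` helper:

```python
# Hypothetical illustration only: how a chat-style adapter *might* consume the
# dict returned by gen_prompt() after this change. The real consumption logic
# is not shown in this diff.
from typing import Dict, List


def build_chat_messages(prompt_d: Dict) -> List[Dict[str, str]]:
    """Turn {'data': [...], 'system_prompt': ...} into OpenAI-style chat messages."""
    messages = []
    system_prompt = prompt_d.get('system_prompt')
    if system_prompt:  # only emit a system turn when a template is configured
        messages.append({'role': 'system', 'content': system_prompt})
    for user_prompt in prompt_d['data']:
        messages.append({'role': 'user', 'content': user_prompt})
    return messages


# Example: the few-shot prompt stays in 'data'; the template is no longer
# string-concatenated onto it.
print(build_chat_messages({'data': ['Question: 2 + 2 = ?'], 'system_prompt': ''}))
```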
evalscope/benchmarks/hellaswag/hellaswag_adapter.py

```diff
@@ -5,10 +5,11 @@ import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
+from evalscope.metrics import AverageAccuracy, exact_match
 from evalscope.models import ContinuationLogitsModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
+from evalscope.utils.utils import ResponseParser
 
 # flake8: noqa
 
@@ -20,11 +21,12 @@ logger = get_logger()
     dataset_id='modelscope/hellaswag',
     model_adapter=ContinuationLogitsModelAdapter,
     subset_list=['default'],
-    metric_list=[
+    metric_list=[AverageAccuracy],
     few_shot_num=0,
     train_split='train',
     eval_split='validation',
-    prompt_template=
+    prompt_template=
+    'Respond with the index of sentence that makes the most sense, chose from 0, 1, 2, 3, derive your final answer as `The answer is ...`.',  # noqa: E501
 )
 class HellaSwagAdapter(DataAdapter):
 
@@ -87,7 +89,11 @@ class HellaSwagAdapter(DataAdapter):
 
         ctx_continuation_pair_list = [(context.strip(), ' ' + cont.strip()) for cont in endings]
 
-        return {
+        return {
+            'data': ctx_continuation_pair_list,
+            'multi_choices': self.choices,
+            'system_prompt': self.prompt_template
+        }
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -114,9 +120,9 @@ class HellaSwagAdapter(DataAdapter):
 
             return str(best_choice_idx)
         elif eval_type == EvalType.SERVICE:
-            return result
+            return ResponseParser.parse_first_option(result)
         elif eval_type == EvalType.CUSTOM:
-            return result
+            return ResponseParser.parse_first_option(result)
         else:
             raise ValueError(f'Invalid eval_type: {eval_type}')
 
```
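`ResponseParser.parse_first_option` replaces the pass-through of raw service/custom responses; its real implementation lives in `evalscope/utils/utils.py` and is not shown in this diff. The sketch below is a hypothetical approximation of what such a parser could do for the HellaSwag prompt above (prefer an explicit `The answer is ...` conclusion, otherwise take the first standalone option index):

```python
# Hypothetical stand-in for ResponseParser.parse_first_option; the actual
# evalscope implementation is not part of this diff.
import re


def parse_first_option(text: str, options: str = '0123') -> str:
    """Return the first option index mentioned in a free-form model reply."""
    # Prefer an explicit "The answer is X" style conclusion if present.
    concluded = re.search(r'answer is\s*\(?([0-3])\)?', text, flags=re.IGNORECASE)
    if concluded:
        return concluded.group(1)
    # Otherwise fall back to the first standalone option character.
    first = re.search(rf'\b([{options}])\b', text)
    return first.group(1) if first else text.strip()


print(parse_first_option('Sentence 2 fits best. The answer is 2.'))  # -> '2'
print(parse_first_option('I would go with option 1 here.'))          # -> '1'
```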
evalscope/benchmarks/humaneval/humaneval_adapter.py

```diff
@@ -1,6 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import re
-from typing import List
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.metrics import Pass1
@@ -22,7 +21,7 @@ logger = get_logger()
     few_shot_num=0,
     train_split=None,
     eval_split='test',
-    prompt_template='
+    prompt_template='',
 )
 class HumanevalAdapter(DataAdapter):
     """
@@ -66,9 +65,9 @@ class HumanevalAdapter(DataAdapter):
             {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}
         """
         full_prompt = input_d['prompt']
-        full_prompt = f'
+        full_prompt = f'Complete the following python code:\n{full_prompt}' if self.prompt_template else full_prompt
 
-        return {'data': [full_prompt]}
+        return {'data': [full_prompt], 'system_prompt': self.prompt_template}
 
     @classmethod
     def _postprocess(cls, text: str) -> str:
```
evalscope/benchmarks/ifeval/__init__.py

File without changes.
evalscope/benchmarks/ifeval/ifeval_adapter.py

```diff
@@ -0,0 +1,57 @@
+from collections import defaultdict
+from typing import Any, Dict, List
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.benchmarks.ifeval.utils import agg_inst_level_acc, process_results
+from evalscope.constants import EvalType
+from evalscope.metrics import Metric, mean
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils.utils import normalize_score
+
+
+@Benchmark.register(
+    name='ifeval',
+    dataset_id='opencompass/ifeval',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['default'],
+    metric_list=[
+        Metric(name='prompt_level_strict_acc', object=mean),
+        Metric(name='inst_level_strict_acc', object=agg_inst_level_acc),
+        Metric(name='prompt_level_loose_acc', object=mean),
+        Metric(name='inst_level_loose_acc', object=agg_inst_level_acc),
+    ],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='train',
+    prompt_template='',
+)
+class IFEvalAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+        return {'data': [input_d['prompt']], 'system_prompt': self.prompt_template}
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        return input_d
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        return result
+
+    def match(self, gold: Any, pred: Any) -> Dict:
+        return process_results(gold, [pred])
+
+    def compute_metric(self, review_res_list: List[dict]) -> Any:
+        # aggregate review results
+        res_dict = defaultdict(list)
+        for res in review_res_list:
+            for k, v in res.items():
+                res_dict[k].append(v)
+
+        metrics = []
+        for metric in self.metric_list:
+            metric_name = metric.name
+            pred_value = res_dict[metric_name]
+            metrics.append({'metric_name': metric_name, 'score': metric.object(pred_value), 'num': len(pred_value)})
+        return metrics
```
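`compute_metric` pairs each registered `Metric` with its own aggregation callable: `mean` for the prompt-level accuracies and `agg_inst_level_acc` (from `evalscope/benchmarks/ifeval/utils.py`, not shown in this diff) for the instruction-level ones. The sketch below mirrors that loop with stand-in aggregators; treating `agg_inst_level_acc` as flatten-then-average is an assumption, not the library's code:

```python
# Sketch of the per-metric aggregation loop in IFEvalAdapter.compute_metric,
# with stand-in aggregators and fabricated review results.
from collections import defaultdict, namedtuple
from statistics import mean

Metric = namedtuple('Metric', ['name', 'object'])  # stand-in for evalscope.metrics.Metric


def agg_inst_level_acc(values):
    # Assumption: flatten the per-sample instruction-level booleans and average them.
    flat = [x for sample in values for x in sample]
    return sum(flat) / len(flat) if flat else 0.0


metric_list = [
    Metric('prompt_level_strict_acc', mean),
    Metric('inst_level_strict_acc', agg_inst_level_acc),
]

# Two fake per-sample review results shaped like the dicts produced by match().
review_res_list = [
    {'prompt_level_strict_acc': 1, 'inst_level_strict_acc': [True, True]},
    {'prompt_level_strict_acc': 0, 'inst_level_strict_acc': [True, False, False]},
]

res_dict = defaultdict(list)
for res in review_res_list:
    for k, v in res.items():
        res_dict[k].append(v)

for metric in metric_list:
    values = res_dict[metric.name]
    print({'metric_name': metric.name, 'score': metric.object(values), 'num': len(values)})
# -> prompt_level_strict_acc = 0.5 over 2 prompts, inst_level_strict_acc = 0.6 over 2 samples
```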