evalscope 0.9.0__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/arguments.py +1 -0
- evalscope/benchmarks/arc/arc_adapter.py +3 -5
- evalscope/benchmarks/bbh/bbh_adapter.py +3 -3
- evalscope/benchmarks/benchmark.py +1 -1
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +5 -79
- evalscope/benchmarks/competition_math/competition_math_adapter.py +4 -4
- evalscope/benchmarks/data_adapter.py +69 -70
- evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -63
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +103 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +4 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +12 -6
- evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -4
- evalscope/benchmarks/ifeval/__init__.py +0 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +56 -0
- evalscope/benchmarks/ifeval/instructions.py +1477 -0
- evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
- evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
- evalscope/benchmarks/ifeval/utils.py +134 -0
- evalscope/benchmarks/iquiz/__init__.py +0 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -84
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +2 -2
- evalscope/benchmarks/race/race_adapter.py +4 -73
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -6
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -57
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +30 -0
- evalscope/collections/evaluator.py +82 -62
- evalscope/collections/sampler.py +47 -41
- evalscope/collections/schema.py +14 -10
- evalscope/constants.py +4 -0
- evalscope/evaluator/evaluator.py +22 -13
- evalscope/metrics/__init__.py +2 -5
- evalscope/metrics/metrics.py +11 -2
- evalscope/metrics/named_metrics.py +17 -0
- evalscope/models/chat_adapter.py +2 -0
- evalscope/models/server_adapter.py +11 -4
- evalscope/perf/__init__.py +1 -0
- evalscope/perf/main.py +0 -1
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +1 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/longalpaca.py +1 -1
- evalscope/report/__init__.py +5 -0
- evalscope/report/app.py +693 -0
- evalscope/report/combinator.py +73 -0
- evalscope/report/generator.py +80 -0
- evalscope/report/utils.py +133 -0
- evalscope/run.py +16 -11
- evalscope/summarizer.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/logger.py +1 -0
- evalscope/utils/model_utils.py +5 -2
- evalscope/version.py +2 -2
- {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/METADATA +84 -7
- {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/RECORD +66 -51
- tests/cli/test_collection.py +11 -7
- tests/cli/test_run.py +13 -4
- evalscope/tools/__init__.py +0 -1
- evalscope/tools/combine_reports.py +0 -133
- evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
- /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/LICENSE +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/WHEEL +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/top_level.txt +0 -0
evalscope/collections/evaluator.py
CHANGED

@@ -4,13 +4,15 @@ import pandas as pd
 from collections import defaultdict
 from tabulate import tabulate
 from tqdm import tqdm
+from typing import List
 
 from evalscope.benchmarks import Benchmark
 from evalscope.collections.sampler import DatasetEntry
 from evalscope.config import TaskConfig
-from evalscope.constants import
+from evalscope.constants import DataCollection, DumpMode
 from evalscope.evaluator import Evaluator
 from evalscope.models import get_local_model, initialize_model_adapter
+from evalscope.report import ReportGenerator
 from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
 from evalscope.utils.logger import get_logger
 
@@ -38,6 +40,12 @@ class SimpleEvaluator(Evaluator):
         review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
         return review_d
 
+    def get_score(self, review_d) -> float:
+        metric_score: List[dict] = self.compute_metrics(reviews_list=[review_d])
+        # use the first metric by default
+        score = metric_score[0]['score']
+        return score
+
 
 class EvaluatorCollection:
 
@@ -45,16 +53,18 @@ class EvaluatorCollection:
         self.task_cfg = task_cfg
         self.outputs = outputs
         self.model = get_local_model(task_cfg)
-        self.dataset = self.load()
+        self.dataset, self.dataset_name = self.load()
         self.dataset_name_map, self.dataset_id_map = self._parse_dataset()
         self.evaluators = self._initialize_evaluators()
 
-    def load(self) -> list[DatasetEntry]:
-
+    def load(self) -> tuple[list[DatasetEntry], str]:
+        dataset_path = self.task_cfg.dataset_args[DataCollection.NAME]['local_path']
+        dataset_name = os.path.basename(dataset_path).split('.')[0]
+        raw_dataset = jsonl_to_list(dataset_path)
         datasets = []
         for sample in raw_dataset:
             datasets.append(DatasetEntry(**sample))
-        return datasets
+        return datasets, dataset_name
 
     def _parse_dataset(self):
         dataset_name_map = defaultdict(lambda: defaultdict(list))
@@ -75,65 +85,80 @@
                                       self.outputs)
         return evaluators
 
-    def get_report(self,
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        # Helper function for aggregation and sorting
+    def get_report(self, scores):
+
+        def get_dataframe(scores):
+            data = []
+            for dataset_name, data_map in self.dataset_name_map.items():
+                for subset_name, ids in data_map.items():
+                    for _id in ids:
+                        row_data: DatasetEntry = self.dataset_id_map[_id]
+                        score = scores[_id]
+                        data.append(
+                            dict(
+                                task_type=row_data.task_type,
+                                categories=tuple(row_data.categories),
+                                dataset_name=dataset_name,
+                                subset_name=subset_name,
+                                tags=row_data.tags,
+                                score=score))
+            return pd.DataFrame(data)
+
         def aggregate_and_sort(df, group_by_cols):
+            # aggregate by group_by_cols, and calculate average_score and count
             report_df = df.groupby(group_by_cols) \
                 .agg(average_score=('score', 'mean'), count=('score', 'size')) \
                 .reset_index()
-
-            # Round average_score to 4 decimal places
             report_df['average_score'] = report_df['average_score'].round(4)
-
             report_df = report_df.sort_values(by='count', ascending=False) \
                 .to_dict(orient='records')
             return report_df
 
-
+        df = get_dataframe(scores)
+
+        # multi-level aggregation
         subset_report_df = aggregate_and_sort(df, ['task_type', 'dataset_name', 'subset_name'])
        dataset_report_df = aggregate_and_sort(df, ['task_type', 'dataset_name'])
         task_report_df = aggregate_and_sort(df, ['task_type'])
-        tag_report_df = aggregate_and_sort(df_exploded, ['tags'])
 
-        #
-
+        # explode tags to multiple rows
+        df_exploded_tags = df.explode('tags')
+        tag_report_df = aggregate_and_sort(df_exploded_tags, ['tags'])
+
+        # process multi-level categories
+        df_categories = df.copy()
+        # multi-level aggregation for categories
+        max_depth = df_categories['categories'].apply(len).max()
+        for level in range(max_depth):
+            df_categories[f'category{level}'] = df_categories['categories'].apply(lambda x: x[level]
+                                                                                  if len(x) > level else '')
+        category_report_df = aggregate_and_sort(df_categories, [f'category{level}' for level in range(max_depth)])
+
+        # convert to dict format
+        report_dict = {
             'subset_level': subset_report_df,
             'dataset_level': dataset_report_df,
             'task_level': task_report_df,
-            'tag_level': tag_report_df
+            'tag_level': tag_report_df,
+            'category_level': category_report_df,
         }
 
-        #
-        for level, data in
+        # record report
+        for level, data in report_dict.items():
             table = tabulate(data, headers='keys', tablefmt='pretty', showindex=False)
             logger.info(f'{level} Report:\n{table}')
 
-
-
+        report = ReportGenerator.gen_collection_report(df, self.dataset_name, self.task_cfg.model_id)
+        # save report to JSON file
+        report_file_path = os.path.join(self.outputs.reports_dir, self.task_cfg.model_id, f'{self.dataset_name}.json')
+        os.makedirs(os.path.dirname(report_file_path), exist_ok=True)
         with open(report_file_path, 'w', encoding='utf-8') as f:
-            json.dump(report, f, ensure_ascii=False, indent=4)
+            json.dump(report.to_dict(), f, ensure_ascii=False, indent=4)
 
     def get_answers(self):
-        pred_file_path = os.path.join(self.outputs.predictions_dir,
+        pred_file_path = os.path.join(self.outputs.predictions_dir, self.task_cfg.model_id,
+                                      f'{self.dataset_name}.jsonl')
+        os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)
         answers = defaultdict(dict)
         for sample in tqdm(self.dataset, desc='Getting answers'):
             evaluator = self.evaluators[sample.dataset_name]
@@ -143,36 +168,31 @@ class EvaluatorCollection:
         return answers
 
     def get_reviews(self, answers):
-        review_file_path = os.path.join(self.outputs.reviews_dir,
+        review_file_path = os.path.join(self.outputs.reviews_dir, self.task_cfg.model_id)
+        os.makedirs(review_file_path, exist_ok=True)
         reviews = defaultdict(dict)
         for sample in tqdm(self.dataset, desc='Getting reviews'):
             evaluator = self.evaluators[sample.dataset_name]
             review_d = evaluator.get_review(answers[sample.index])
             reviews[sample.index] = review_d
-            dump_jsonl_data(
+            dump_jsonl_data(
+                review_d,
+                os.path.join(review_file_path, f'{self.dataset_name}_{sample.dataset_name}_{sample.subset_name}.jsonl'),
+                dump_mode=DumpMode.APPEND)
         return reviews
 
-
-
-
+    def get_scores(self, reviews) -> float:
+        scores = defaultdict(dict)
+        for sample in tqdm(self.dataset, desc='Getting scores'):
+            evaluator = self.evaluators[sample.dataset_name]
+            review_d = reviews[sample.index]
+            score = evaluator.get_score(review_d)
+            scores[sample.index] = score
+
+        return scores
 
     def eval(self, **kwargs):
         answers = self.get_answers()
         reviews = self.get_reviews(answers)
-        self.
-
-
-if __name__ == '__main__':
-    task_cfg = TaskConfig(
-        model='qwen2.5',
-        api_url='http://127.0.0.1:8801/v1/chat/completions',
-        api_key='EMPTY',
-        eval_type=EvalType.SERVICE,
-        datasets=['data_collection'],
-        dataset_args={'data_collection': {
-            'local_path': 'outputs/mixed_data.jsonl'
-        }},
-    )
-
-    evaluator_collection = EvaluatorCollection(task_cfg)
-    evaluator_collection.eval()
+        scores = self.get_scores(reviews)
+        self.get_report(scores)
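For orientation, a minimal usage sketch reconstructed from the `__main__` example that this diff removes from evaluator.py. The endpoint, model name and jsonl path are placeholders; in 0.10.1 the collection evaluation is driven through EvaluatorCollection.eval(), which now chains get_answers() -> get_reviews() -> get_scores() -> get_report().

# Sketch only: values below are illustrative, taken from the removed example.
from evalscope.config import TaskConfig
from evalscope.constants import EvalType

task_cfg = TaskConfig(
    model='qwen2.5',
    api_url='http://127.0.0.1:8801/v1/chat/completions',
    api_key='EMPTY',
    eval_type=EvalType.SERVICE,
    datasets=['data_collection'],
    dataset_args={'data_collection': {'local_path': 'outputs/mixed_data.jsonl'}},
)

# EvaluatorCollection(task_cfg, outputs).eval() runs the four stages listed above;
# `outputs` is the OutputsStructure holding the predictions/reviews/reports dirs.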
evalscope/collections/sampler.py
CHANGED

@@ -12,7 +12,8 @@ class DatasetEntry:
     index: int = 0
     prompt: dict = field(default_factory=dict)
     tags: List[str] = field(default_factory=list)
-
+    categories: List[str] = field(default_factory=list)
+    task_type: str = ''
     weight: float = 0.0
     dataset_name: str = ''
     subset_name: str = ''
@@ -21,30 +22,30 @@
 # Define an abstract base class for Samplers
 class Sampler(ABC):
 
-    def __init__(self, schema: CollectionSchema
+    def __init__(self, schema: CollectionSchema):
         self.schema = schema
-        self.count = count
 
     @abstractmethod
     def sample(self) -> List[dict]:
         raise NotImplementedError
 
-    def
+    def _sample_dataset(self, dataset: DatasetInfo, count: int) -> List[DatasetEntry]:
         all_data = []
-
-
-        for
-
-
-
-
-
-
-
-
-
-
+        data_dict = dataset.get_data()
+        for subset_name, subset_data in data_dict.items():
+            for prompt in subset_data:
+                all_data.append(
+                    DatasetEntry(
+                        prompt=prompt,
+                        tags=dataset.tags,
+                        categories=dataset.hierarchy,
+                        task_type=dataset.task_type,
+                        weight=dataset.weight,
+                        dataset_name=dataset.name,
+                        subset_name=subset_name,
+                    ))
+        sampled_data = random.choices(all_data, k=count)
+        return sampled_data
 
     def _update_index(self, all_data: List[DatasetEntry]) -> List[dict]:
         result = []
@@ -59,21 +60,19 @@ class WeightedSampler(Sampler):
     Weighted sampler, according to the weight of each dataset, sample data from each dataset.
     """
 
-    def sample(self) -> List[dict]:
+    def sample(self, count: int) -> List[dict]:
         dataset_info_list = self.schema.flatten()
-        all_data = self._collect_dataset_data(dataset_info_list)
-
-        remaining_count = self.count
         sampled_data = []
+        remaining_count = count
 
         for i, dataset in enumerate(tqdm(dataset_info_list, desc='Sampling data')):
             if i == len(dataset_info_list) - 1:
                 dataset_sample_count = remaining_count
             else:
-                dataset_sample_count = int(dataset.weight *
+                dataset_sample_count = int(dataset.weight * count)
             remaining_count -= dataset_sample_count
 
-            sampled_data.extend(
+            sampled_data.extend(self._sample_dataset(dataset, dataset_sample_count))
 
         return self._update_index(sampled_data)
 
@@ -83,16 +82,20 @@ class UniformSampler(Sampler):
     Uniform sampler, sample data from each dataset with the same number of samples.
     """
 
-    def sample(self) -> List[dict]:
+    def sample(self, count: int) -> List[dict]:
         dataset_info_list = self.schema.flatten()
-        all_data = self._collect_dataset_data(dataset_info_list)
-
         num_datasets = len(dataset_info_list)
-
+        remaining_count = count
         sampled_data = []
 
-        for
-
+        for i, dataset in enumerate(tqdm(dataset_info_list, desc='Sampling data')):
+            if i == len(dataset_info_list) - 1:
+                dataset_sample_count = remaining_count
+            else:
+                dataset_sample_count = count // num_datasets
+            remaining_count -= dataset_sample_count
+
+            sampled_data.extend(self._sample_dataset(dataset, dataset_sample_count))
 
         return self._update_index(sampled_data)
 
@@ -102,18 +105,21 @@ class StratifiedSampler(Sampler):
     Stratified sampler, sample data from each dataset according to the number of samples of each dataset.
     """
 
-    def sample(self) -> List[dict]:
+    def sample(self, count: int) -> List[dict]:
         dataset_info_list = self.schema.flatten()
-        all_data = self._collect_dataset_data(dataset_info_list)
 
         total_samples = sum(len(dataset.get_data()) for dataset in dataset_info_list)
+        remaining_count = count
         sampled_data = []
 
-        for dataset in tqdm(dataset_info_list, desc='Sampling data'):
-
-
-
+        for i, dataset in enumerate(tqdm(dataset_info_list, desc='Sampling data')):
+            if i == len(dataset_info_list) - 1:
+                dataset_sample_count = remaining_count
+            else:
+                dataset_sample_count = int((len(dataset.get_data()) / total_samples) * count)
+            remaining_count -= dataset_sample_count
 
+            sampled_data.extend(self._sample_dataset(dataset, dataset_sample_count))
         return self._update_index(sampled_data)
 
 
@@ -122,11 +128,11 @@ if __name__ == '__main__':
 
     schema = CollectionSchema.from_json('outputs/schema.json')
     print(schema.to_dict())
-    mixed_data = WeightedSampler(schema
+    mixed_data = WeightedSampler(schema).sample(10)
     dump_jsonl_data(mixed_data, 'outputs/weighted_mixed_data.jsonl')
 
-    mixed_data = UniformSampler(schema, 100).sample()
-    dump_jsonl_data(mixed_data, 'outputs/uniform_mixed_data.jsonl')
+    # mixed_data = UniformSampler(schema, 100).sample()
+    # dump_jsonl_data(mixed_data, 'outputs/uniform_mixed_data.jsonl')
 
-    mixed_data = StratifiedSampler(schema, 100).sample()
-    dump_jsonl_data(mixed_data, 'outputs/stratified_mixed_data.jsonl')
+    # mixed_data = StratifiedSampler(schema, 100).sample()
+    # dump_jsonl_data(mixed_data, 'outputs/stratified_mixed_data.jsonl')
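The sampler API change is the key point of this file: the sample count moves from the constructor to sample(). A short sketch mirroring the updated `__main__` block (file paths are illustrative):

from evalscope.collections.sampler import WeightedSampler
from evalscope.collections.schema import CollectionSchema
from evalscope.utils.io_utils import dump_jsonl_data

schema = CollectionSchema.from_json('outputs/schema.json')  # illustrative path
# samplers now hold only the schema; the count is passed to sample()
mixed_data = WeightedSampler(schema).sample(100)
dump_jsonl_data(mixed_data, 'outputs/mixed_data.jsonl')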
evalscope/collections/schema.py
CHANGED

@@ -11,6 +11,7 @@ class DatasetInfo:
     task_type: str = ''
     tags: List[str] = field(default_factory=list)
     args: dict = field(default_factory=dict)
+    hierarchy: List[str] = field(default_factory=list)
 
     def get_data(self) -> dict:
         from evalscope.benchmarks import Benchmark
@@ -34,18 +35,15 @@ def flatten_weight(collection: 'CollectionSchema', base_weight=1):
         dataset.weight = current_weight
 
 
-def
+def flatten_name(collection: 'CollectionSchema', parent_names=None):
     if parent_names is None:
         parent_names = []
     current_names = parent_names + [collection.name]
     for dataset in collection.datasets:
         if isinstance(dataset, CollectionSchema):
-
+            flatten_name(dataset, current_names)
         else:
-
-            for name in current_names:
-                if name not in dataset.tags:
-                    dataset.tags.append(name)
+            dataset.hierarchy = current_names.copy()
 
 
 def flatten_datasets(collection: 'CollectionSchema') -> List[DatasetInfo]:
@@ -100,7 +98,7 @@ class CollectionSchema:
 
     def flatten(self) -> List[DatasetInfo]:
         collection = copy.deepcopy(self)
-
+        flatten_name(collection)
         flatten_weight(collection)
         return flatten_datasets(collection)
 
@@ -109,8 +107,12 @@ if __name__ == '__main__':
     schema = CollectionSchema(
         name='reasoning',
         datasets=[
-
-
+            CollectionSchema(name='english', datasets=[
+                DatasetInfo(name='arc', weight=1, tags=['en']),
+            ]),
+            CollectionSchema(
+                name='chinese',
+                datasets=[DatasetInfo(name='ceval', weight=1, tags=['zh'], args={'subset_list': ['logic']})])
         ])
     print(schema)
     print(schema.flatten())
@@ -118,5 +120,7 @@ if __name__ == '__main__':
 
     schema = CollectionSchema.from_json('outputs/schema.json')
     print(schema)
+    # 打印扁平化后的结果
     for dataset in schema.flatten():
-        print(dataset)
+        print(f'Dataset: {dataset.name}')
+        print(f"Hierarchy: {' -> '.join(dataset.hierarchy)}")
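A small sketch of what the new hierarchy field ends up holding: flatten() now runs flatten_name(), which records each dataset's path through the nested schemas instead of appending those names to tags. Based on the `__main__` example above:

from evalscope.collections.schema import CollectionSchema, DatasetInfo

schema = CollectionSchema(
    name='reasoning',
    datasets=[CollectionSchema(name='english', datasets=[DatasetInfo(name='arc', weight=1, tags=['en'])])])
for dataset in schema.flatten():
    # each leaf DatasetInfo carries the chain of collection names it sits under
    print(dataset.name, dataset.hierarchy)  # expected: arc ['reasoning', 'english']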
evalscope/constants.py
CHANGED
evalscope/evaluator/evaluator.py
CHANGED

@@ -12,7 +12,7 @@ from evalscope.benchmarks import DataAdapter
 from evalscope.config import TaskConfig
 from evalscope.constants import AnswerKeys, DumpMode, EvalStage, ReviewKeys
 from evalscope.models import BaseModelAdapter, CustomModelAdapter
-from evalscope.
+from evalscope.report import Report, gen_table
 from evalscope.utils import dict_torch_dtype_to_str, gen_hash
 from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
 from evalscope.utils.logger import get_logger
@@ -43,8 +43,8 @@ class Evaluator(object):
                  task_cfg: TaskConfig = None,
                  **kwargs):
 
+        self.dataset_name = data_adapter.name
         self.dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
-        self.dataset_name = os.path.basename(self.dataset_name_or_path.rstrip(os.sep)).split('.')[0]
         self.model_name = task_cfg.model_id
         self.custom_task_name = f'{self.model_name}_{self.dataset_name}'
 
@@ -201,7 +201,7 @@ class Evaluator(object):
         # Gen review_id (concat: answer_id + reviewer_spec)
         answer_id = answer_d[AnswerKeys.ANSWER_ID]
         reviewer_spec = {
-            'metric': [
+            'metric': [metric.name for metric in self.data_adapter.metric_list],
             'reviewer': ['Evaluator'],
             'revision': ['default']
         }
@@ -246,7 +246,7 @@ class Evaluator(object):
 
         return reviews_list
 
-    def compute_metrics(self, reviews_list: List[dict]) ->
+    def compute_metrics(self, reviews_list: List[dict]) -> List[dict]:
         """
         To compute metrics from reviews_list for each subset.
         It is required to rewrite this method to support your own evaluator.
@@ -264,28 +264,37 @@
                 logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
                 continue
 
-
+            if len(review_d[AnswerKeys.CHOICES]) == 0:
+                logger.warning(f'No choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
+                continue
+            elif len(review_d[AnswerKeys.CHOICES]) == 1:
+                review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
+            else:
+                review_res = [choice[ReviewKeys.REVIEW][ReviewKeys.RESULT] for choice in review_d[AnswerKeys.CHOICES]]
+
             review_res_list.append(review_res)
 
-        metric_score:
+        metric_score: List[dict] = self.data_adapter.compute_metric(review_res_list=review_res_list)
 
         return metric_score
 
-    def dump_report(self, reviews_score_all: dict, use_table: bool = True):
+    def dump_report(self, reviews_score_all: List[dict], use_table: bool = True):
         """
         Get report for total reviews of specific dataset.
         It is required to rewrite this method to support your own evaluator.
 
         Args:
-
+            reviews_score_all: reviews score list. Generated by func self.data_adapter.compute_metric().
             use_table: whether to generate table for reports. Default to True.
 
         Returns: None
         """
         # Get report map
-        report_map:
-            subset_score_map=reviews_score_all,
-
+        report_map: Report = self.data_adapter.gen_report(
+            subset_score_map=reviews_score_all,
+            report_name=self.custom_task_name,
+            model_name=self.model_name,
+            dataset_name=self.dataset_name)
 
         # Dump report
         report_path: str = os.path.join(self.outputs_structure.reports_dir, self.model_name,
@@ -294,7 +303,7 @@ class Evaluator(object):
 
         # Write report
         with open(report_path, 'w') as f:
-            f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
+            f.write(json.dumps(report_map.to_dict(), ensure_ascii=False, indent=4))
         logger.info(f'Dump report: {report_path} \n')
 
         # Make table
@@ -351,7 +360,7 @@ class Evaluator(object):
                 subset_name=subset_name, answers_list=answers_list, debug=debug, **kwargs)
 
             metric_res = self.compute_metrics(reviews_list=reviews_list)
-            reviews_score_all[subset_name] =
+            reviews_score_all[subset_name] = metric_res
             stage_reviews_dict[subset_name] = reviews_list
 
             if self.stage == EvalStage.INFER:
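Since data_adapter.metric_list now holds Metric objects (defined in evalscope/metrics/named_metrics.py further down), the reviewer_spec is built from their .name attributes. A small self-contained illustration using two of the named metrics; metric_list here stands in for a data adapter's metric_list:

from evalscope.metrics import AverageAccuracy, Pass1

metric_list = [AverageAccuracy, Pass1]  # stand-in for data_adapter.metric_list
reviewer_spec = {
    'metric': [metric.name for metric in metric_list],
    'reviewer': ['Evaluator'],
    'revision': ['default'],
}
print(reviewer_spec['metric'])  # ['AverageAccuracy', 'Pass@1']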
evalscope/metrics/__init__.py
CHANGED

@@ -1,7 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-from evalscope.metrics.metrics import bleu_ngram_one_sample, exact_match, weighted_mean
+from evalscope.metrics.metrics import bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean, weighted_mean
+from evalscope.metrics.named_metrics import *
 from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
-
-WeightedAverageAccuracy = {'name': 'WeightedAverageAccuracy', 'object': weighted_mean}
-WeightedAverageBLEU = {'name': 'WeightedAverageBLEU', 'object': weighted_mean}
-Pass1 = {'name': 'Pass@1', 'object': weighted_mean}
evalscope/metrics/metrics.py
CHANGED

@@ -1,6 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Copyright (c) EleutherAI. and its affiliates.
 # Copyright (c) OpenAI. and its affiliates.
+
 import itertools
 import math
 import numpy as np
@@ -8,7 +9,7 @@ import random
 import sacrebleu
 from collections import defaultdict
 from collections.abc import Iterable
-from typing import Dict, List, Union
+from typing import TYPE_CHECKING, Dict, List, Union
 
 
 def mean(arr):
@@ -103,12 +104,20 @@ def perplexity(items):
     return math.exp(-mean(items))
 
 
-def weighted_mean(items) -> float:
+def weighted_mean(items: List) -> float:
     # e.g. [(0,1), (0.5,1), (1,1)]
     a, b = zip(*items)
     return sum(a) / sum(b)
 
 
+def micro_mean(items):
+    return sum([item.score * item.num for item in items]) / sum([item.num for item in items])
+
+
+def macro_mean(items):
+    return sum([item.score for item in items]) / len(items)
+
+
 def weighted_perplexity(items):
     return math.exp(-weighted_mean(items))
 

evalscope/metrics/named_metrics.py
ADDED

@@ -0,0 +1,17 @@
+from dataclasses import dataclass, field
+from typing import Callable
+
+from evalscope.metrics.metrics import mean, weighted_mean
+
+
+@dataclass
+class Metric:
+    name: str = 'default_metric'
+    object: Callable = field(default_factory=lambda: mean)
+
+
+AverageAccuracy = Metric(name='AverageAccuracy', object=mean)
+WeightedAverageAccuracy = Metric(name='WeightedAverageAccuracy', object=weighted_mean)
+AverageBLEU = Metric(name='AverageBLEU', object=mean)
+WeightedAverageBLEU = Metric(name='WeightedAverageBLEU', object=weighted_mean)
+Pass1 = Metric(name='Pass@1', object=mean)
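The new micro_mean and macro_mean expect items that expose .score and .num attributes (for example per-subset results): micro_mean weights each item by its sample count, macro_mean averages the scores directly. A self-contained illustration, where Subset is just a stand-in type:

from collections import namedtuple

from evalscope.metrics import macro_mean, micro_mean

Subset = namedtuple('Subset', ['score', 'num'])
subsets = [Subset(score=0.9, num=100), Subset(score=0.5, num=20)]
print(micro_mean(subsets))  # (0.9 * 100 + 0.5 * 20) / 120 ≈ 0.833
print(macro_mean(subsets))  # (0.9 + 0.5) / 2 = 0.7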
evalscope/models/chat_adapter.py
CHANGED

@@ -76,6 +76,8 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
             # For base model, use the query as the input
             formatted_prompt = query
 
+        logger.debug(f'formatted_prompt: {formatted_prompt}')
+
         inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=True).to(self.device)
         input_ids = inputs['input_ids']
 
evalscope/models/server_adapter.py
CHANGED

@@ -46,9 +46,13 @@ class ServerModelAdapter(BaseModelAdapter):
             query = inputs
             system_prompt = None
         elif isinstance(inputs, dict):
-
-
-
+            data: list = inputs['data']
+            if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
+                query = '\n'.join(''.join(item) for item in data)
+                system_prompt = inputs.get('system_prompt', None)
+            else:
+                query = data[0]
+                system_prompt = inputs.get('system_prompt', None)
         elif isinstance(inputs, list):
             query = '\n'.join(inputs)
             system_prompt = None
@@ -76,10 +80,13 @@ class ServerModelAdapter(BaseModelAdapter):
     def make_request(self, content: dict, infer_cfg: dict = {}) -> dict:
         """Make request to remote API."""
         # Format request JSON according to OpenAI API format
+        do_sample = infer_cfg.get('do_sample', False)
+        temperature = infer_cfg.get('temperature', 0.0) if do_sample else 0.0
+
         request_json = {
             **content, 'model': self.model_id,
             'max_tokens': infer_cfg.get('max_tokens', 2048),
-            'temperature':
+            'temperature': temperature,
             'top_p': infer_cfg.get('top_p', 1.0),
             'n': infer_cfg.get('num_return_sequences', 1),
             'stop': infer_cfg.get('stop', None)
evalscope/perf/__init__.py
CHANGED

@@ -0,0 +1 @@
+from evalscope.perf.main import run_perf_benchmark