evalscope 0.8.2__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- evalscope/__init__.py +2 -0
- evalscope/arguments.py +11 -3
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/__init__.py +20 -1
- evalscope/benchmarks/arc/__init__.py +0 -5
- evalscope/benchmarks/arc/arc_adapter.py +24 -102
- evalscope/benchmarks/bbh/__init__.py +0 -4
- evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
- evalscope/benchmarks/benchmark.py +70 -59
- evalscope/benchmarks/ceval/__init__.py +0 -5
- evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
- evalscope/benchmarks/cmmlu/__init__.py +0 -5
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
- evalscope/benchmarks/competition_math/__init__.py +0 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
- evalscope/benchmarks/data_adapter.py +115 -87
- evalscope/benchmarks/general_qa/__init__.py +0 -5
- evalscope/benchmarks/general_qa/general_qa_adapter.py +23 -79
- evalscope/benchmarks/gsm8k/__init__.py +0 -4
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +21 -101
- evalscope/benchmarks/hellaswag/__init__.py +0 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +32 -99
- evalscope/benchmarks/humaneval/__init__.py +0 -4
- evalscope/benchmarks/humaneval/humaneval_adapter.py +18 -120
- evalscope/benchmarks/ifeval/__init__.py +0 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
- evalscope/benchmarks/ifeval/instructions.py +1478 -0
- evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
- evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
- evalscope/benchmarks/ifeval/utils.py +134 -0
- evalscope/benchmarks/iquiz/__init__.py +0 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
- evalscope/benchmarks/mmlu/__init__.py +0 -5
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
- evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
- evalscope/benchmarks/race/__init__.py +0 -5
- evalscope/benchmarks/race/race_adapter.py +26 -123
- evalscope/benchmarks/trivia_qa/__init__.py +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
- evalscope/benchmarks/truthful_qa/__init__.py +0 -5
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +29 -0
- evalscope/collections/__init__.py +3 -0
- evalscope/collections/evaluator.py +198 -0
- evalscope/collections/sampler.py +138 -0
- evalscope/collections/schema.py +126 -0
- evalscope/config.py +7 -5
- evalscope/constants.py +9 -26
- evalscope/evaluator/evaluator.py +87 -121
- evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
- evalscope/metrics/__init__.py +3 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/math_accuracy.py +193 -50
- evalscope/metrics/metrics.py +18 -6
- evalscope/metrics/named_metrics.py +17 -0
- evalscope/metrics/rouge_metric.py +13 -8
- evalscope/models/__init__.py +14 -1
- evalscope/models/base_adapter.py +52 -0
- evalscope/models/chat_adapter.py +138 -0
- evalscope/models/choice_adapter.py +211 -0
- evalscope/models/custom_adapter.py +67 -0
- evalscope/models/local_model.py +74 -0
- evalscope/models/model.py +141 -0
- evalscope/models/server_adapter.py +111 -0
- evalscope/perf/__init__.py +1 -0
- evalscope/perf/main.py +0 -1
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +1 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/longalpaca.py +1 -1
- evalscope/report/__init__.py +5 -0
- evalscope/report/app.py +506 -0
- evalscope/report/combinator.py +73 -0
- evalscope/report/generator.py +80 -0
- evalscope/report/utils.py +133 -0
- evalscope/run.py +48 -72
- evalscope/run_arena.py +1 -1
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/chat_service.py +5 -4
- evalscope/utils/io_utils.py +8 -0
- evalscope/utils/logger.py +5 -0
- evalscope/utils/model_utils.py +15 -2
- evalscope/utils/utils.py +3 -25
- evalscope/version.py +2 -2
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/METADATA +115 -21
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/RECORD +99 -78
- tests/cli/test_collection.py +57 -0
- tests/cli/test_run.py +52 -1
- tests/rag/test_mteb.py +3 -2
- evalscope/models/api/__init__.py +0 -3
- evalscope/models/dummy_chat_model.py +0 -49
- evalscope/models/model_adapter.py +0 -525
- evalscope/models/openai_model.py +0 -103
- evalscope/tools/__init__.py +0 -1
- evalscope/tools/combine_reports.py +0 -133
- evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
- /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
- /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
evalscope/collections/evaluator.py
ADDED
@@ -0,0 +1,198 @@
+import json
+import os
+import pandas as pd
+from collections import defaultdict
+from tabulate import tabulate
+from tqdm import tqdm
+from typing import List
+
+from evalscope.benchmarks import Benchmark
+from evalscope.collections.sampler import DatasetEntry
+from evalscope.config import TaskConfig
+from evalscope.constants import DataCollection, DumpMode
+from evalscope.evaluator import Evaluator
+from evalscope.models import get_local_model, initialize_model_adapter
+from evalscope.report import ReportGenerator
+from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class SimpleEvaluator(Evaluator):
+
+    def __init__(self, dataset_name, data_adapter, model_adapter, task_cfg, outputs):
+        super().__init__(
+            dataset_name_or_path=dataset_name,
+            data_adapter=data_adapter,
+            model_adapter=model_adapter,
+            task_cfg=task_cfg,
+            outputs=outputs)
+
+    def get_answer(self, input_prompt, subset_name, infer_cfg) -> dict:
+        answer_d: dict = self.model_adapter.predict(inputs=input_prompt, infer_cfg=infer_cfg)
+        answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+        processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
+        return processed_answer
+
+    def get_review(self, answer_d) -> dict:
+        review_id, reviewer_spec = self._generate_review_id(answer_d)
+        review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
+        return review_d
+
+    def get_score(self, review_d) -> float:
+        metric_score: List[dict] = self.compute_metrics(reviews_list=[review_d])
+        # use the first metric by default
+        score = metric_score[0]['score']
+        return score
+
+
+class EvaluatorCollection:
+
+    def __init__(self, task_cfg: TaskConfig, outputs: OutputsStructure):
+        self.task_cfg = task_cfg
+        self.outputs = outputs
+        self.model = get_local_model(task_cfg)
+        self.dataset, self.dataset_name = self.load()
+        self.dataset_name_map, self.dataset_id_map = self._parse_dataset()
+        self.evaluators = self._initialize_evaluators()
+
+    def load(self) -> tuple[list[DatasetEntry], str]:
+        dataset_path = self.task_cfg.dataset_args[DataCollection.NAME]['local_path']
+        dataset_name = os.path.basename(dataset_path).split('.')[0]
+        raw_dataset = jsonl_to_list(dataset_path)
+        datasets = []
+        for sample in raw_dataset:
+            datasets.append(DatasetEntry(**sample))
+        return datasets, dataset_name
+
+    def _parse_dataset(self):
+        dataset_name_map = defaultdict(lambda: defaultdict(list))
+        dataset_id_map = {}
+        for sample in self.dataset:
+            dataset_name, subset_name = sample.dataset_name, sample.subset_name
+            dataset_name_map[dataset_name][subset_name].append(sample.index)
+            dataset_id_map[sample.index] = sample
+        return dataset_name_map, dataset_id_map
+
+    def _initialize_evaluators(self):
+        evaluators = {}
+        for dataset_name in self.dataset_name_map.keys():
+            benchmark = Benchmark.get(dataset_name)
+            data_adapter = benchmark.get_data_adapter()
+            model_adapter = initialize_model_adapter(self.task_cfg, benchmark.model_adapter, self.model)
+            evaluators[dataset_name] = SimpleEvaluator(dataset_name, data_adapter, model_adapter, self.task_cfg,
+                                                       self.outputs)
+        return evaluators
+
+    def get_report(self, scores):
+
+        def get_dataframe(scores):
+            data = []
+            for dataset_name, data_map in self.dataset_name_map.items():
+                for subset_name, ids in data_map.items():
+                    for _id in ids:
+                        row_data: DatasetEntry = self.dataset_id_map[_id]
+                        score = scores[_id]
+                        data.append(
+                            dict(
+                                task_type=row_data.task_type,
+                                categories=tuple(row_data.categories),
+                                dataset_name=dataset_name,
+                                subset_name=subset_name,
+                                tags=row_data.tags,
+                                score=score))
+            return pd.DataFrame(data)
+
+        def aggregate_and_sort(df, group_by_cols):
+            # aggregate by group_by_cols, and calculate average_score and count
+            report_df = df.groupby(group_by_cols) \
+                .agg(average_score=('score', 'mean'), count=('score', 'size')) \
+                .reset_index()
+            report_df['average_score'] = report_df['average_score'].round(4)
+            report_df = report_df.sort_values(by='count', ascending=False) \
+                .to_dict(orient='records')
+            return report_df
+
+        df = get_dataframe(scores)
+
+        # multi-level aggregation
+        subset_report_df = aggregate_and_sort(df, ['task_type', 'dataset_name', 'subset_name'])
+        dataset_report_df = aggregate_and_sort(df, ['task_type', 'dataset_name'])
+        task_report_df = aggregate_and_sort(df, ['task_type'])
+
+        # explode tags to multiple rows
+        df_exploded_tags = df.explode('tags')
+        tag_report_df = aggregate_and_sort(df_exploded_tags, ['tags'])
+
+        # process multi-level categories
+        df_categories = df.copy()
+        # multi-level aggregation for categories
+        max_depth = df_categories['categories'].apply(len).max()
+        for level in range(max_depth):
+            df_categories[f'category{level}'] = df_categories['categories'].apply(lambda x: x[level]
+                                                                                  if len(x) > level else '')
+        category_report_df = aggregate_and_sort(df_categories, [f'category{level}' for level in range(max_depth)])
+
+        # convert to dict format
+        report_dict = {
+            'subset_level': subset_report_df,
+            'dataset_level': dataset_report_df,
+            'task_level': task_report_df,
+            'tag_level': tag_report_df,
+            'category_level': category_report_df,
+        }
+
+        # record report
+        for level, data in report_dict.items():
+            table = tabulate(data, headers='keys', tablefmt='pretty', showindex=False)
+            logger.info(f'{level} Report:\n{table}')
+
+        report = ReportGenerator.gen_collection_report(df, self.dataset_name, self.task_cfg.model_id)
+        # save report to JSON file
+        report_file_path = os.path.join(self.outputs.reports_dir, self.task_cfg.model_id, f'{self.dataset_name}.json')
+        os.makedirs(os.path.dirname(report_file_path), exist_ok=True)
+        with open(report_file_path, 'w', encoding='utf-8') as f:
+            json.dump(report.to_dict(), f, ensure_ascii=False, indent=4)
+
+    def get_answers(self):
+        pred_file_path = os.path.join(self.outputs.predictions_dir, self.task_cfg.model_id,
+                                      f'{self.dataset_name}.jsonl')
+        os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)
+        answers = defaultdict(dict)
+        for sample in tqdm(self.dataset, desc='Getting answers'):
+            evaluator = self.evaluators[sample.dataset_name]
+            answer_d = evaluator.get_answer(sample.prompt, sample.subset_name, self.task_cfg.generation_config)
+            answers[sample.index] = answer_d
+            dump_jsonl_data(answer_d, pred_file_path, dump_mode=DumpMode.APPEND)
+        return answers
+
+    def get_reviews(self, answers):
+        review_file_path = os.path.join(self.outputs.reviews_dir, self.task_cfg.model_id)
+        os.makedirs(review_file_path, exist_ok=True)
+        reviews = defaultdict(dict)
+        for sample in tqdm(self.dataset, desc='Getting reviews'):
+            evaluator = self.evaluators[sample.dataset_name]
+            review_d = evaluator.get_review(answers[sample.index])
+            reviews[sample.index] = review_d
+            dump_jsonl_data(
+                review_d,
+                os.path.join(review_file_path, f'{self.dataset_name}_{sample.dataset_name}_{sample.subset_name}.jsonl'),
+                dump_mode=DumpMode.APPEND)
+        return reviews
+
+    def get_scores(self, reviews) -> float:
+        scores = defaultdict(dict)
+        for sample in tqdm(self.dataset, desc='Getting scores'):
+            evaluator = self.evaluators[sample.dataset_name]
+            review_d = reviews[sample.index]
+            score = evaluator.get_score(review_d)
+            scores[sample.index] = score
+
+        return scores
+
+    def eval(self, **kwargs):
+        answers = self.get_answers()
+        reviews = self.get_reviews(answers)
+        scores = self.get_scores(reviews)
+        self.get_report(scores)
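EvaluatorCollection.eval() above chains the four stages (answers → reviews → scores → report) over a mixed-data JSONL produced by the samplers below. A minimal usage sketch, assuming the updated run_task entry point routes a dataset named 'data_collection' to EvaluatorCollection (the run.py changes are not expanded in this diff) and using a placeholder model id:

from evalscope.config import TaskConfig
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # placeholder model id
    datasets=['data_collection'],  # DataCollection.NAME
    dataset_args={'data_collection': {'local_path': 'outputs/weighted_mixed_data.jsonl'}},
)
run_task(task_cfg=task_cfg)  # expected to end up in EvaluatorCollection(task_cfg, outputs).eval()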
evalscope/collections/sampler.py
ADDED
@@ -0,0 +1,138 @@
+import random
+from abc import ABC, abstractmethod
+from dataclasses import asdict, dataclass, field
+from tqdm import tqdm
+from typing import List, Optional
+
+from evalscope.collections.schema import CollectionSchema, DatasetInfo
+
+
+@dataclass
+class DatasetEntry:
+    index: int = 0
+    prompt: dict = field(default_factory=dict)
+    tags: List[str] = field(default_factory=list)
+    categories: List[str] = field(default_factory=list)
+    task_type: str = ''
+    weight: float = 0.0
+    dataset_name: str = ''
+    subset_name: str = ''
+
+
+# Define an abstract base class for Samplers
+class Sampler(ABC):
+
+    def __init__(self, schema: CollectionSchema):
+        self.schema = schema
+
+    @abstractmethod
+    def sample(self) -> List[dict]:
+        raise NotImplementedError
+
+    def _sample_dataset(self, dataset: DatasetInfo, count: int) -> List[DatasetEntry]:
+        all_data = []
+        data_dict = dataset.get_data()
+        for subset_name, subset_data in data_dict.items():
+            for prompt in subset_data:
+                all_data.append(
+                    DatasetEntry(
+                        prompt=prompt,
+                        tags=dataset.tags,
+                        categories=dataset.hierarchy,
+                        task_type=dataset.task_type,
+                        weight=dataset.weight,
+                        dataset_name=dataset.name,
+                        subset_name=subset_name,
+                    ))
+        sampled_data = random.choices(all_data, k=count)
+        return sampled_data
+
+    def _update_index(self, all_data: List[DatasetEntry]) -> List[dict]:
+        result = []
+        for i, entry in enumerate(all_data):
+            entry.index = i
+            result.append(asdict(entry))
+        return result
+
+
+class WeightedSampler(Sampler):
+    """
+    Weighted sampler, according to the weight of each dataset, sample data from each dataset.
+    """
+
+    def sample(self, count: int) -> List[dict]:
+        dataset_info_list = self.schema.flatten()
+        sampled_data = []
+        remaining_count = count
+
+        for i, dataset in enumerate(tqdm(dataset_info_list, desc='Sampling data')):
+            if i == len(dataset_info_list) - 1:
+                dataset_sample_count = remaining_count
+            else:
+                dataset_sample_count = int(dataset.weight * count)
+                remaining_count -= dataset_sample_count
+
+            sampled_data.extend(self._sample_dataset(dataset, dataset_sample_count))
+
+        return self._update_index(sampled_data)
+
+
+class UniformSampler(Sampler):
+    """
+    Uniform sampler, sample data from each dataset with the same number of samples.
+    """
+
+    def sample(self, count: int) -> List[dict]:
+        dataset_info_list = self.schema.flatten()
+        num_datasets = len(dataset_info_list)
+        remaining_count = count
+        sampled_data = []
+
+        for i, dataset in enumerate(tqdm(dataset_info_list, desc='Sampling data')):
+            if i == len(dataset_info_list) - 1:
+                dataset_sample_count = remaining_count
+            else:
+                dataset_sample_count = count // num_datasets
+                remaining_count -= dataset_sample_count
+
+            sampled_data.extend(self._sample_dataset(dataset, dataset_sample_count))
+
+        return self._update_index(sampled_data)
+
+
+class StratifiedSampler(Sampler):
+    """
+    Stratified sampler, sample data from each dataset according to the number of samples of each dataset.
+    """
+
+    def sample(self, count: int) -> List[dict]:
+        dataset_info_list = self.schema.flatten()
+
+        total_samples = sum(len(dataset.get_data()) for dataset in dataset_info_list)
+        remaining_count = count
+        sampled_data = []
+
+        for i, dataset in enumerate(tqdm(dataset_info_list, desc='Sampling data')):
+            if i == len(dataset_info_list) - 1:
+                dataset_sample_count = remaining_count
+            else:
+                dataset_sample_count = int((len(dataset.get_data()) / total_samples) * count)
+                remaining_count -= dataset_sample_count
+
+            sampled_data.extend(self._sample_dataset(dataset, dataset_sample_count))
+        return self._update_index(sampled_data)
+
+
+if __name__ == '__main__':
+    from evalscope.utils.io_utils import dump_jsonl_data
+
+    schema = CollectionSchema.from_json('outputs/schema.json')
+    print(schema.to_dict())
+    mixed_data = WeightedSampler(schema).sample(10)
+    dump_jsonl_data(mixed_data, 'outputs/weighted_mixed_data.jsonl')
+
+    # mixed_data = UniformSampler(schema, 100).sample()
+    # dump_jsonl_data(mixed_data, 'outputs/uniform_mixed_data.jsonl')
+
+    # mixed_data = StratifiedSampler(schema, 100).sample()
+    # dump_jsonl_data(mixed_data, 'outputs/stratified_mixed_data.jsonl')
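To make the budget split in the samplers concrete: every dataset except the last receives int(weight * count) (or count // num_datasets, or the stratified share) and the final dataset absorbs the remainder, so the per-dataset counts always sum to count. A standalone sketch (not part of the diff) that mirrors the WeightedSampler allocation with hypothetical weights:

# Hypothetical flattened weights, e.g. from CollectionSchema.flatten()
weights = [0.5, 0.3, 0.2]
count = 101

allocation = []
remaining = count
for i, w in enumerate(weights):
    if i == len(weights) - 1:
        n = remaining  # last dataset absorbs rounding leftovers
    else:
        n = int(w * count)  # same truncation as WeightedSampler.sample
        remaining -= n
    allocation.append(n)

print(allocation)  # [50, 30, 21] -> sums to 101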
evalscope/collections/schema.py
ADDED
@@ -0,0 +1,126 @@
+import copy
+import json
+from dataclasses import asdict, dataclass, field
+from typing import List, Union
+
+
+@dataclass
+class DatasetInfo:
+    name: str
+    weight: float = 1.0  # sample weight in each collection
+    task_type: str = ''
+    tags: List[str] = field(default_factory=list)
+    args: dict = field(default_factory=dict)
+    hierarchy: List[str] = field(default_factory=list)
+
+    def get_data(self) -> dict:
+        from evalscope.benchmarks import Benchmark
+
+        benchmark_meta = Benchmark.get(self.name)
+
+        data_adapter = benchmark_meta.get_data_adapter(config=self.args)
+        data_dict = data_adapter.load(
+            dataset_name_or_path=benchmark_meta.dataset_id, subset_list=benchmark_meta.subset_list)
+        prompts = data_adapter.gen_prompts(data_dict)
+        return prompts
+
+
+def flatten_weight(collection: 'CollectionSchema', base_weight=1):
+    total_weight = sum(dataset.weight for dataset in collection.datasets)
+    for dataset in collection.datasets:
+        current_weight = dataset.weight / total_weight * base_weight
+        if isinstance(dataset, CollectionSchema):
+            flatten_weight(dataset, current_weight)
+        else:
+            dataset.weight = current_weight
+
+
+def flatten_name(collection: 'CollectionSchema', parent_names=None):
+    if parent_names is None:
+        parent_names = []
+    current_names = parent_names + [collection.name]
+    for dataset in collection.datasets:
+        if isinstance(dataset, CollectionSchema):
+            flatten_name(dataset, current_names)
+        else:
+            dataset.hierarchy = current_names.copy()
+
+
+def flatten_datasets(collection: 'CollectionSchema') -> List[DatasetInfo]:
+    flat_datasets = []
+    for dataset in collection.datasets:
+        if isinstance(dataset, CollectionSchema):
+            flat_datasets.extend(flatten_datasets(dataset))
+        else:
+            flat_datasets.append(dataset)
+    return flat_datasets
+
+
+@dataclass
+class CollectionSchema:
+    name: str
+    weight: float = 1.0
+    datasets: List[Union[DatasetInfo, 'CollectionSchema']] = field(default_factory=list)
+
+    def __str__(self):
+        return json.dumps(self.to_dict(), ensure_ascii=False, indent=4)
+
+    def to_dict(self):
+        return {
+            'name':
+            self.name,
+            'weight':
+            self.weight,
+            'datasets':
+            [asdict(dataset) if isinstance(dataset, DatasetInfo) else dataset.to_dict() for dataset in self.datasets],
+        }
+
+    @classmethod
+    def from_dict(cls, data):
+        instance = cls(name=data.get('name', ''), weight=data.get('weight', 1))
+        for dataset in data.get('datasets', []):
+            if 'datasets' in dataset:
+                instance.datasets.append(CollectionSchema.from_dict(dataset))
+            else:
+                instance.datasets.append(DatasetInfo(**dataset))
+        return instance
+
+    def dump_json(self, file_path):
+        d = self.to_dict()
+        with open(file_path, 'w') as f:
+            json.dump(d, f, ensure_ascii=False, indent=4)
+
+    @classmethod
+    def from_json(cls, file_path):
+        with open(file_path, 'r') as f:
+            data = json.load(f)
+        return cls.from_dict(data)
+
+    def flatten(self) -> List[DatasetInfo]:
+        collection = copy.deepcopy(self)
+        flatten_name(collection)
+        flatten_weight(collection)
+        return flatten_datasets(collection)
+
+
+if __name__ == '__main__':
+    schema = CollectionSchema(
+        name='reasoning',
+        datasets=[
+            CollectionSchema(name='english', datasets=[
+                DatasetInfo(name='arc', weight=1, tags=['en']),
+            ]),
+            CollectionSchema(
+                name='chinese',
+                datasets=[DatasetInfo(name='ceval', weight=1, tags=['zh'], args={'subset_list': ['logic']})])
+        ])
+    print(schema)
+    print(schema.flatten())
+    schema.dump_json('outputs/schema.json')
+
+    schema = CollectionSchema.from_json('outputs/schema.json')
+    print(schema)
+    # print the flattened result
+    for dataset in schema.flatten():
+        print(f'Dataset: {dataset.name}')
+        print(f"Hierarchy: {' -> '.join(dataset.hierarchy)}")
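CollectionSchema.flatten() first records each leaf's hierarchy, then normalizes weights level by level: a child's weight is divided by the sum of its siblings' weights and scaled by the parent's share, so the flattened leaf weights sum to 1. A small illustration with hypothetical dataset weights:

from evalscope.collections.schema import CollectionSchema, DatasetInfo

# Hypothetical schema: 'math' carries twice the weight of 'knowledge'.
schema = CollectionSchema(name='mix', datasets=[
    CollectionSchema(name='math', weight=2, datasets=[
        DatasetInfo(name='gsm8k', weight=3),
        DatasetInfo(name='competition_math', weight=1),
    ]),
    CollectionSchema(name='knowledge', weight=1, datasets=[
        DatasetInfo(name='mmlu', weight=1),
    ]),
])

for info in schema.flatten():
    print(info.name, round(info.weight, 3), info.hierarchy)
# gsm8k            ~0.5   ['mix', 'math']       (2/3 * 3/4)
# competition_math ~0.167 ['mix', 'math']       (2/3 * 1/4)
# mmlu             ~0.333 ['mix', 'knowledge']  (1/3 * 1)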
evalscope/config.py
CHANGED
@@ -31,7 +31,7 @@ DEFAULT_GENERATION_CONFIG = {
 @dataclass
 class TaskConfig:
     # Model-related arguments
-    model: Union[str, CustomModel, None] = None
+    model: Union[str, 'CustomModel', None] = None
     model_id: Optional[str] = None
     model_args: Optional[Dict] = field(default_factory=lambda: DEFAULT_MODEL_ARGS | {})

@@ -40,8 +40,8 @@ class TaskConfig:
     chat_template: Optional[str] = None

     # Dataset-related arguments
-    datasets:
-    dataset_args:
+    datasets: List[str] = field(default_factory=list)
+    dataset_args: Dict = field(default_factory=dict)
     dataset_dir: str = DEFAULT_DATASET_CACHE_DIR
     dataset_hub: str = HubType.MODELSCOPE

@@ -64,7 +64,9 @@ class TaskConfig:
     # Debug and runtime mode arguments
     debug: bool = False
     dry_run: bool = False
-    seed: int = 42
+    seed: Optional[int] = 42
+    api_url: Optional[str] = None  # Only used for server model
+    api_key: Optional[str] = 'EMPTY'  # Only used for server model

     def __post_init__(self):
         if (not self.model_id) and self.model:
@@ -74,7 +76,6 @@ class TaskConfig:
             self.model_id = os.path.basename(self.model).rstrip(os.sep)

     def to_dict(self):
-        # Note: to avoid serialization error for some model instance
         return self.__dict__

     def __str__(self):
@@ -130,6 +131,7 @@ class TaskConfig:
                continue

            task.model = custom_model
+           task.model_args = custom_model.config
            task.model_id = type(custom_model).__name__
            res_list.append(task)

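The new api_url/api_key fields back the server-model path added in this release (see EvalType.SERVICE below). A hedged sketch of how a served OpenAI-compatible endpoint might be configured; it assumes TaskConfig also exposes an eval_type field (not visible in these hunks), and the endpoint, key, and model name are placeholders:

from evalscope.config import TaskConfig

task_cfg = TaskConfig(
    model='qwen2.5-7b-instruct',  # placeholder name of the deployed model
    eval_type='service',  # assumed field, matching EvalType.SERVICE
    api_url='http://127.0.0.1:8000/v1/chat/completions',  # placeholder endpoint
    api_key='EMPTY',
    datasets=['gsm8k'],
)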
evalscope/constants.py
CHANGED
@@ -135,34 +135,17 @@ class EvalStage:
 class EvalType:

     CUSTOM = 'custom'
-    CHECKPOINT = 'checkpoint'
+    CHECKPOINT = 'checkpoint'  # native model checkpoint
+    SERVICE = 'service'  # model service


 class EvalBackend:
+    NATIVE = 'Native'
+    OPEN_COMPASS = 'OpenCompass'
+    VLM_EVAL_KIT = 'VLMEvalKit'
+    RAG_EVAL = 'RAGEval'
+    THIRD_PARTY = 'ThirdParty'

-    class _Backend:
-        # compatible with old version, set 'value'

-
-
-
-        @property
-        def value(self):
-            return self._value
-
-        def __str__(self):
-            return self._value
-
-        def __repr__(self):
-            return f"'{self._value}'"
-
-        def __eq__(self, other):
-            if isinstance(other, str):
-                return self._value == other
-            return NotImplemented
-
-    NATIVE = _Backend('Native')
-    OPEN_COMPASS = _Backend('OpenCompass')
-    VLM_EVAL_KIT = _Backend('VLMEvalKit')
-    RAG_EVAL = _Backend('RAGEval')
-    THIRD_PARTY = _Backend('ThirdParty')
+class DataCollection:
+    NAME = 'data_collection'