evalscope 0.8.0__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/__init__.py +2 -0
- evalscope/arguments.py +11 -3
- evalscope/backend/base.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
- evalscope/backend/rag_eval/utils/clip.py +2 -2
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/__init__.py +20 -1
- evalscope/benchmarks/arc/__init__.py +0 -5
- evalscope/benchmarks/arc/arc_adapter.py +24 -102
- evalscope/benchmarks/bbh/__init__.py +0 -4
- evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
- evalscope/benchmarks/benchmark.py +70 -59
- evalscope/benchmarks/ceval/__init__.py +0 -5
- evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
- evalscope/benchmarks/cmmlu/__init__.py +0 -5
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
- evalscope/benchmarks/competition_math/__init__.py +0 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
- evalscope/benchmarks/data_adapter.py +115 -87
- evalscope/benchmarks/general_qa/__init__.py +0 -5
- evalscope/benchmarks/general_qa/general_qa_adapter.py +24 -80
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +103 -0
- evalscope/benchmarks/gsm8k/__init__.py +0 -4
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +22 -101
- evalscope/benchmarks/hellaswag/__init__.py +0 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +33 -99
- evalscope/benchmarks/humaneval/__init__.py +0 -4
- evalscope/benchmarks/humaneval/humaneval_adapter.py +93 -9
- evalscope/benchmarks/ifeval/__init__.py +0 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +56 -0
- evalscope/benchmarks/ifeval/instructions.py +1477 -0
- evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
- evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
- evalscope/benchmarks/ifeval/utils.py +134 -0
- evalscope/benchmarks/iquiz/__init__.py +0 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
- evalscope/benchmarks/mmlu/__init__.py +0 -5
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
- evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
- evalscope/benchmarks/race/__init__.py +0 -5
- evalscope/benchmarks/race/race_adapter.py +27 -123
- evalscope/benchmarks/trivia_qa/__init__.py +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
- evalscope/benchmarks/truthful_qa/__init__.py +0 -5
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +30 -0
- evalscope/collections/__init__.py +3 -0
- evalscope/collections/evaluator.py +198 -0
- evalscope/collections/sampler.py +138 -0
- evalscope/collections/schema.py +126 -0
- evalscope/config.py +45 -7
- evalscope/constants.py +7 -38
- evalscope/evaluator/__init__.py +0 -1
- evalscope/evaluator/evaluator.py +89 -121
- evalscope/evaluator/rating_eval.py +1 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +14 -5
- evalscope/metrics/__init__.py +3 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/math_accuracy.py +193 -50
- evalscope/metrics/metrics.py +18 -6
- evalscope/metrics/named_metrics.py +17 -0
- evalscope/metrics/rouge_metric.py +13 -8
- evalscope/models/__init__.py +14 -1
- evalscope/models/base_adapter.py +52 -0
- evalscope/models/chat_adapter.py +140 -0
- evalscope/models/choice_adapter.py +211 -0
- evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +1 -1
- evalscope/models/custom_adapter.py +67 -0
- evalscope/models/local_model.py +74 -0
- evalscope/models/model.py +141 -0
- evalscope/models/server_adapter.py +111 -0
- evalscope/perf/__init__.py +1 -0
- evalscope/perf/arguments.py +3 -1
- evalscope/perf/benchmark.py +3 -3
- evalscope/perf/main.py +5 -7
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +54 -50
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/longalpaca.py +1 -1
- evalscope/perf/plugin/registry.py +3 -3
- evalscope/perf/utils/benchmark_util.py +4 -4
- evalscope/perf/utils/db_util.py +66 -22
- evalscope/perf/utils/local_server.py +4 -1
- evalscope/report/__init__.py +5 -0
- evalscope/report/app.py +693 -0
- evalscope/report/combinator.py +73 -0
- evalscope/report/generator.py +80 -0
- evalscope/report/utils.py +133 -0
- evalscope/run.py +64 -125
- evalscope/run_arena.py +3 -2
- evalscope/summarizer.py +15 -27
- evalscope/third_party/longbench_write/eval.py +2 -1
- evalscope/third_party/longbench_write/longbench_write.py +2 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +1 -0
- evalscope/utils/chat_service.py +6 -5
- evalscope/utils/io_utils.py +170 -0
- evalscope/utils/logger.py +13 -0
- evalscope/utils/model_utils.py +15 -2
- evalscope/utils/utils.py +3 -200
- evalscope/version.py +2 -2
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/METADATA +129 -23
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/RECORD +119 -115
- tests/cli/test_collection.py +57 -0
- tests/cli/test_run.py +57 -7
- tests/perf/test_perf.py +3 -2
- tests/rag/test_mteb.py +3 -2
- tests/vlm/test_vlmeval.py +3 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
- evalscope/evaluator/humaneval_evaluator.py +0 -158
- evalscope/models/api/__init__.py +0 -3
- evalscope/models/dummy_chat_model.py +0 -49
- evalscope/models/model_adapter.py +0 -525
- evalscope/models/openai_model.py +0 -103
- evalscope/tools/__init__.py +0 -1
- evalscope/tools/combine_reports.py +0 -135
- evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
- /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/LICENSE +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/WHEEL +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/top_level.txt +0 -0
evalscope/collections/evaluator.py
ADDED
@@ -0,0 +1,198 @@
+import json
+import os
+import pandas as pd
+from collections import defaultdict
+from tabulate import tabulate
+from tqdm import tqdm
+from typing import List
+
+from evalscope.benchmarks import Benchmark
+from evalscope.collections.sampler import DatasetEntry
+from evalscope.config import TaskConfig
+from evalscope.constants import DataCollection, DumpMode
+from evalscope.evaluator import Evaluator
+from evalscope.models import get_local_model, initialize_model_adapter
+from evalscope.report import ReportGenerator
+from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class SimpleEvaluator(Evaluator):
+
+    def __init__(self, dataset_name, data_adapter, model_adapter, task_cfg, outputs):
+        super().__init__(
+            dataset_name_or_path=dataset_name,
+            data_adapter=data_adapter,
+            model_adapter=model_adapter,
+            task_cfg=task_cfg,
+            outputs=outputs)
+
+    def get_answer(self, input_prompt, subset_name, infer_cfg) -> dict:
+        answer_d: dict = self.model_adapter.predict(inputs=input_prompt, infer_cfg=infer_cfg)
+        answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+        processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
+        return processed_answer
+
+    def get_review(self, answer_d) -> dict:
+        review_id, reviewer_spec = self._generate_review_id(answer_d)
+        review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
+        return review_d
+
+    def get_score(self, review_d) -> float:
+        metric_score: List[dict] = self.compute_metrics(reviews_list=[review_d])
+        # use the first metric by default
+        score = metric_score[0]['score']
+        return score
+
+
+class EvaluatorCollection:
+
+    def __init__(self, task_cfg: TaskConfig, outputs: OutputsStructure):
+        self.task_cfg = task_cfg
+        self.outputs = outputs
+        self.model = get_local_model(task_cfg)
+        self.dataset, self.dataset_name = self.load()
+        self.dataset_name_map, self.dataset_id_map = self._parse_dataset()
+        self.evaluators = self._initialize_evaluators()
+
+    def load(self) -> tuple[list[DatasetEntry], str]:
+        dataset_path = self.task_cfg.dataset_args[DataCollection.NAME]['local_path']
+        dataset_name = os.path.basename(dataset_path).split('.')[0]
+        raw_dataset = jsonl_to_list(dataset_path)
+        datasets = []
+        for sample in raw_dataset:
+            datasets.append(DatasetEntry(**sample))
+        return datasets, dataset_name
+
+    def _parse_dataset(self):
+        dataset_name_map = defaultdict(lambda: defaultdict(list))
+        dataset_id_map = {}
+        for sample in self.dataset:
+            dataset_name, subset_name = sample.dataset_name, sample.subset_name
+            dataset_name_map[dataset_name][subset_name].append(sample.index)
+            dataset_id_map[sample.index] = sample
+        return dataset_name_map, dataset_id_map
+
+    def _initialize_evaluators(self):
+        evaluators = {}
+        for dataset_name in self.dataset_name_map.keys():
+            benchmark = Benchmark.get(dataset_name)
+            data_adapter = benchmark.get_data_adapter()
+            model_adapter = initialize_model_adapter(self.task_cfg, benchmark.model_adapter, self.model)
+            evaluators[dataset_name] = SimpleEvaluator(dataset_name, data_adapter, model_adapter, self.task_cfg,
+                                                       self.outputs)
+        return evaluators
+
+    def get_report(self, scores):
+
+        def get_dataframe(scores):
+            data = []
+            for dataset_name, data_map in self.dataset_name_map.items():
+                for subset_name, ids in data_map.items():
+                    for _id in ids:
+                        row_data: DatasetEntry = self.dataset_id_map[_id]
+                        score = scores[_id]
+                        data.append(
+                            dict(
+                                task_type=row_data.task_type,
+                                categories=tuple(row_data.categories),
+                                dataset_name=dataset_name,
+                                subset_name=subset_name,
+                                tags=row_data.tags,
+                                score=score))
+            return pd.DataFrame(data)
+
+        def aggregate_and_sort(df, group_by_cols):
+            # aggregate by group_by_cols, and calculate average_score and count
+            report_df = df.groupby(group_by_cols) \
+                .agg(average_score=('score', 'mean'), count=('score', 'size')) \
+                .reset_index()
+            report_df['average_score'] = report_df['average_score'].round(4)
+            report_df = report_df.sort_values(by='count', ascending=False) \
+                .to_dict(orient='records')
+            return report_df
+
+        df = get_dataframe(scores)
+
+        # multi-level aggregation
+        subset_report_df = aggregate_and_sort(df, ['task_type', 'dataset_name', 'subset_name'])
+        dataset_report_df = aggregate_and_sort(df, ['task_type', 'dataset_name'])
+        task_report_df = aggregate_and_sort(df, ['task_type'])
+
+        # explode tags to multiple rows
+        df_exploded_tags = df.explode('tags')
+        tag_report_df = aggregate_and_sort(df_exploded_tags, ['tags'])
+
+        # process multi-level categories
+        df_categories = df.copy()
+        # multi-level aggregation for categories
+        max_depth = df_categories['categories'].apply(len).max()
+        for level in range(max_depth):
+            df_categories[f'category{level}'] = df_categories['categories'].apply(lambda x: x[level]
+                                                                                  if len(x) > level else '')
+        category_report_df = aggregate_and_sort(df_categories, [f'category{level}' for level in range(max_depth)])
+
+        # convert to dict format
+        report_dict = {
+            'subset_level': subset_report_df,
+            'dataset_level': dataset_report_df,
+            'task_level': task_report_df,
+            'tag_level': tag_report_df,
+            'category_level': category_report_df,
+        }
+
+        # record report
+        for level, data in report_dict.items():
+            table = tabulate(data, headers='keys', tablefmt='pretty', showindex=False)
+            logger.info(f'{level} Report:\n{table}')
+
+        report = ReportGenerator.gen_collection_report(df, self.dataset_name, self.task_cfg.model_id)
+        # save report to JSON file
+        report_file_path = os.path.join(self.outputs.reports_dir, self.task_cfg.model_id, f'{self.dataset_name}.json')
+        os.makedirs(os.path.dirname(report_file_path), exist_ok=True)
+        with open(report_file_path, 'w', encoding='utf-8') as f:
+            json.dump(report.to_dict(), f, ensure_ascii=False, indent=4)
+
+    def get_answers(self):
+        pred_file_path = os.path.join(self.outputs.predictions_dir, self.task_cfg.model_id,
+                                      f'{self.dataset_name}.jsonl')
+        os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)
+        answers = defaultdict(dict)
+        for sample in tqdm(self.dataset, desc='Getting answers'):
+            evaluator = self.evaluators[sample.dataset_name]
+            answer_d = evaluator.get_answer(sample.prompt, sample.subset_name, self.task_cfg.generation_config)
+            answers[sample.index] = answer_d
+            dump_jsonl_data(answer_d, pred_file_path, dump_mode=DumpMode.APPEND)
+        return answers
+
+    def get_reviews(self, answers):
+        review_file_path = os.path.join(self.outputs.reviews_dir, self.task_cfg.model_id)
+        os.makedirs(review_file_path, exist_ok=True)
+        reviews = defaultdict(dict)
+        for sample in tqdm(self.dataset, desc='Getting reviews'):
+            evaluator = self.evaluators[sample.dataset_name]
+            review_d = evaluator.get_review(answers[sample.index])
+            reviews[sample.index] = review_d
+            dump_jsonl_data(
+                review_d,
+                os.path.join(review_file_path, f'{self.dataset_name}_{sample.dataset_name}_{sample.subset_name}.jsonl'),
+                dump_mode=DumpMode.APPEND)
+        return reviews
+
+    def get_scores(self, reviews) -> float:
+        scores = defaultdict(dict)
+        for sample in tqdm(self.dataset, desc='Getting scores'):
+            evaluator = self.evaluators[sample.dataset_name]
+            review_d = reviews[sample.index]
+            score = evaluator.get_score(review_d)
+            scores[sample.index] = score
+
+        return scores
+
+    def eval(self, **kwargs):
+        answers = self.get_answers()
+        reviews = self.get_reviews(answers)
+        scores = self.get_scores(reviews)
+        self.get_report(scores)
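For orientation: EvaluatorCollection is the driver of the new mixed-dataset ("collection") evaluation, and eval() runs the get_answers -> get_reviews -> get_scores -> get_report pipeline shown above. A minimal usage sketch, not a verbatim recipe from this release: it assumes the relocated OutputsStructure keeps its old outputs_dir constructor, that a sampled collection has already been written to the illustrative path outputs/mixed_data.jsonl, and the model id is a placeholder.

from evalscope.collections.evaluator import EvaluatorCollection
from evalscope.config import TaskConfig
from evalscope.utils.io_utils import OutputsStructure

# 'data_collection' mirrors DataCollection.NAME; path and model are placeholders.
task_cfg = TaskConfig(
    model='qwen/Qwen2-0.5B-Instruct',
    dataset_args={'data_collection': {'local_path': 'outputs/mixed_data.jsonl'}},
)
outputs = OutputsStructure(outputs_dir='./outputs/collection_demo')
EvaluatorCollection(task_cfg, outputs).eval()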
evalscope/collections/sampler.py
ADDED
@@ -0,0 +1,138 @@
+import random
+from abc import ABC, abstractmethod
+from dataclasses import asdict, dataclass, field
+from tqdm import tqdm
+from typing import List, Optional
+
+from evalscope.collections.schema import CollectionSchema, DatasetInfo
+
+
+@dataclass
+class DatasetEntry:
+    index: int = 0
+    prompt: dict = field(default_factory=dict)
+    tags: List[str] = field(default_factory=list)
+    categories: List[str] = field(default_factory=list)
+    task_type: str = ''
+    weight: float = 0.0
+    dataset_name: str = ''
+    subset_name: str = ''
+
+
+# Define an abstract base class for Samplers
+class Sampler(ABC):
+
+    def __init__(self, schema: CollectionSchema):
+        self.schema = schema
+
+    @abstractmethod
+    def sample(self) -> List[dict]:
+        raise NotImplementedError
+
+    def _sample_dataset(self, dataset: DatasetInfo, count: int) -> List[DatasetEntry]:
+        all_data = []
+        data_dict = dataset.get_data()
+        for subset_name, subset_data in data_dict.items():
+            for prompt in subset_data:
+                all_data.append(
+                    DatasetEntry(
+                        prompt=prompt,
+                        tags=dataset.tags,
+                        categories=dataset.hierarchy,
+                        task_type=dataset.task_type,
+                        weight=dataset.weight,
+                        dataset_name=dataset.name,
+                        subset_name=subset_name,
+                    ))
+        sampled_data = random.choices(all_data, k=count)
+        return sampled_data
+
+    def _update_index(self, all_data: List[DatasetEntry]) -> List[dict]:
+        result = []
+        for i, entry in enumerate(all_data):
+            entry.index = i
+            result.append(asdict(entry))
+        return result
+
+
+class WeightedSampler(Sampler):
+    """
+    Weighted sampler, according to the weight of each dataset, sample data from each dataset.
+    """
+
+    def sample(self, count: int) -> List[dict]:
+        dataset_info_list = self.schema.flatten()
+        sampled_data = []
+        remaining_count = count
+
+        for i, dataset in enumerate(tqdm(dataset_info_list, desc='Sampling data')):
+            if i == len(dataset_info_list) - 1:
+                dataset_sample_count = remaining_count
+            else:
+                dataset_sample_count = int(dataset.weight * count)
+                remaining_count -= dataset_sample_count
+
+            sampled_data.extend(self._sample_dataset(dataset, dataset_sample_count))
+
+        return self._update_index(sampled_data)
+
+
+class UniformSampler(Sampler):
+    """
+    Uniform sampler, sample data from each dataset with the same number of samples.
+    """
+
+    def sample(self, count: int) -> List[dict]:
+        dataset_info_list = self.schema.flatten()
+        num_datasets = len(dataset_info_list)
+        remaining_count = count
+        sampled_data = []
+
+        for i, dataset in enumerate(tqdm(dataset_info_list, desc='Sampling data')):
+            if i == len(dataset_info_list) - 1:
+                dataset_sample_count = remaining_count
+            else:
+                dataset_sample_count = count // num_datasets
+                remaining_count -= dataset_sample_count
+
+            sampled_data.extend(self._sample_dataset(dataset, dataset_sample_count))
+
+        return self._update_index(sampled_data)
+
+
+class StratifiedSampler(Sampler):
+    """
+    Stratified sampler, sample data from each dataset according to the number of samples of each dataset.
+    """
+
+    def sample(self, count: int) -> List[dict]:
+        dataset_info_list = self.schema.flatten()
+
+        total_samples = sum(len(dataset.get_data()) for dataset in dataset_info_list)
+        remaining_count = count
+        sampled_data = []
+
+        for i, dataset in enumerate(tqdm(dataset_info_list, desc='Sampling data')):
+            if i == len(dataset_info_list) - 1:
+                dataset_sample_count = remaining_count
+            else:
+                dataset_sample_count = int((len(dataset.get_data()) / total_samples) * count)
+                remaining_count -= dataset_sample_count
+
+            sampled_data.extend(self._sample_dataset(dataset, dataset_sample_count))
+        return self._update_index(sampled_data)
+
+
+if __name__ == '__main__':
+    from evalscope.utils.io_utils import dump_jsonl_data
+
+    schema = CollectionSchema.from_json('outputs/schema.json')
+    print(schema.to_dict())
+    mixed_data = WeightedSampler(schema).sample(10)
+    dump_jsonl_data(mixed_data, 'outputs/weighted_mixed_data.jsonl')
+
+    # mixed_data = UniformSampler(schema, 100).sample()
+    # dump_jsonl_data(mixed_data, 'outputs/uniform_mixed_data.jsonl')
+
+    # mixed_data = StratifiedSampler(schema, 100).sample()
+    # dump_jsonl_data(mixed_data, 'outputs/stratified_mixed_data.jsonl')
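Note how all three samplers allocate counts: every dataset except the last gets its computed share (int(weight * count) in the weighted case) and the last dataset absorbs the remainder, so the per-dataset sizes always sum to count. A small standalone illustration of that arithmetic (plain Python, no evalscope imports, weights chosen for the example):

count = 10
weights = [0.5, 0.3, 0.2]  # flattened, normalized weights
counts, remaining = [], count
for i, w in enumerate(weights):
    if i == len(weights) - 1:
        counts.append(remaining)  # last dataset takes whatever is left
    else:
        n = int(w * count)
        counts.append(n)
        remaining -= n
print(counts)  # [5, 3, 2], which sums to 10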
evalscope/collections/schema.py
ADDED
@@ -0,0 +1,126 @@
+import copy
+import json
+from dataclasses import asdict, dataclass, field
+from typing import List, Union
+
+
+@dataclass
+class DatasetInfo:
+    name: str
+    weight: float = 1.0  # sample weight in each collection
+    task_type: str = ''
+    tags: List[str] = field(default_factory=list)
+    args: dict = field(default_factory=dict)
+    hierarchy: List[str] = field(default_factory=list)
+
+    def get_data(self) -> dict:
+        from evalscope.benchmarks import Benchmark
+
+        benchmark_meta = Benchmark.get(self.name)
+
+        data_adapter = benchmark_meta.get_data_adapter(config=self.args)
+        data_dict = data_adapter.load(
+            dataset_name_or_path=benchmark_meta.dataset_id, subset_list=benchmark_meta.subset_list)
+        prompts = data_adapter.gen_prompts(data_dict)
+        return prompts
+
+
+def flatten_weight(collection: 'CollectionSchema', base_weight=1):
+    total_weight = sum(dataset.weight for dataset in collection.datasets)
+    for dataset in collection.datasets:
+        current_weight = dataset.weight / total_weight * base_weight
+        if isinstance(dataset, CollectionSchema):
+            flatten_weight(dataset, current_weight)
+        else:
+            dataset.weight = current_weight
+
+
+def flatten_name(collection: 'CollectionSchema', parent_names=None):
+    if parent_names is None:
+        parent_names = []
+    current_names = parent_names + [collection.name]
+    for dataset in collection.datasets:
+        if isinstance(dataset, CollectionSchema):
+            flatten_name(dataset, current_names)
+        else:
+            dataset.hierarchy = current_names.copy()
+
+
+def flatten_datasets(collection: 'CollectionSchema') -> List[DatasetInfo]:
+    flat_datasets = []
+    for dataset in collection.datasets:
+        if isinstance(dataset, CollectionSchema):
+            flat_datasets.extend(flatten_datasets(dataset))
+        else:
+            flat_datasets.append(dataset)
+    return flat_datasets
+
+
+@dataclass
+class CollectionSchema:
+    name: str
+    weight: float = 1.0
+    datasets: List[Union[DatasetInfo, 'CollectionSchema']] = field(default_factory=list)
+
+    def __str__(self):
+        return json.dumps(self.to_dict(), ensure_ascii=False, indent=4)
+
+    def to_dict(self):
+        return {
+            'name':
+            self.name,
+            'weight':
+            self.weight,
+            'datasets':
+            [asdict(dataset) if isinstance(dataset, DatasetInfo) else dataset.to_dict() for dataset in self.datasets],
+        }
+
+    @classmethod
+    def from_dict(cls, data):
+        instance = cls(name=data.get('name', ''), weight=data.get('weight', 1))
+        for dataset in data.get('datasets', []):
+            if 'datasets' in dataset:
+                instance.datasets.append(CollectionSchema.from_dict(dataset))
+            else:
+                instance.datasets.append(DatasetInfo(**dataset))
+        return instance
+
+    def dump_json(self, file_path):
+        d = self.to_dict()
+        with open(file_path, 'w') as f:
+            json.dump(d, f, ensure_ascii=False, indent=4)
+
+    @classmethod
+    def from_json(cls, file_path):
+        with open(file_path, 'r') as f:
+            data = json.load(f)
+        return cls.from_dict(data)
+
+    def flatten(self) -> List[DatasetInfo]:
+        collection = copy.deepcopy(self)
+        flatten_name(collection)
+        flatten_weight(collection)
+        return flatten_datasets(collection)
+
+
+if __name__ == '__main__':
+    schema = CollectionSchema(
+        name='reasoning',
+        datasets=[
+            CollectionSchema(name='english', datasets=[
+                DatasetInfo(name='arc', weight=1, tags=['en']),
+            ]),
+            CollectionSchema(
+                name='chinese',
+                datasets=[DatasetInfo(name='ceval', weight=1, tags=['zh'], args={'subset_list': ['logic']})])
+        ])
+    print(schema)
+    print(schema.flatten())
+    schema.dump_json('outputs/schema.json')
+
+    schema = CollectionSchema.from_json('outputs/schema.json')
+    print(schema)
+    # print the flattened result
+    for dataset in schema.flatten():
+        print(f'Dataset: {dataset.name}')
+        print(f"Hierarchy: {' -> '.join(dataset.hierarchy)}")
evalscope/config.py
CHANGED
@@ -9,7 +9,8 @@ from typing import Dict, List, Optional, Union
 
 from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, DEFAULT_WORK_DIR, EvalBackend, EvalStage, EvalType, HubType
 from evalscope.models.custom import CustomModel
-from evalscope.utils import
+from evalscope.utils import gen_hash
+from evalscope.utils.io_utils import dict_to_yaml, json_to_dict, yaml_to_dict
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -30,7 +31,8 @@ DEFAULT_GENERATION_CONFIG = {
 @dataclass
 class TaskConfig:
     # Model-related arguments
-    model: Union[str, CustomModel, None] = None
+    model: Union[str, 'CustomModel', None] = None
+    model_id: Optional[str] = None
     model_args: Optional[Dict] = field(default_factory=lambda: DEFAULT_MODEL_ARGS | {})
 
     # Template-related arguments
@@ -38,8 +40,8 @@ class TaskConfig:
     chat_template: Optional[str] = None
 
     # Dataset-related arguments
-    datasets:
-    dataset_args:
+    datasets: List[str] = field(default_factory=list)
+    dataset_args: Dict = field(default_factory=dict)
     dataset_dir: str = DEFAULT_DATASET_CACHE_DIR
     dataset_hub: str = HubType.MODELSCOPE
 
@@ -62,10 +64,18 @@ class TaskConfig:
     # Debug and runtime mode arguments
     debug: bool = False
     dry_run: bool = False
-    seed: int = 42
+    seed: Optional[int] = 42
+    api_url: Optional[str] = None  # Only used for server model
+    api_key: Optional[str] = 'EMPTY'  # Only used for server model
+
+    def __post_init__(self):
+        if (not self.model_id) and self.model:
+            if isinstance(self.model, CustomModel):
+                self.model_id = type(self.model).__name__
+            else:
+                self.model_id = os.path.basename(self.model).rstrip(os.sep)
 
     def to_dict(self):
-        # Note: to avoid serialization error for some model instance
         return self.__dict__
 
     def __str__(self):
@@ -105,7 +115,9 @@ class TaskConfig:
     def from_args(args: Namespace):
         # Convert Namespace to a dictionary and filter out None values
        args_dict = {k: v for k, v in vars(args).items() if v is not None}
-
+
+        if 'func' in args_dict:
+            del args_dict['func']  # Note: compat CLI arguments
 
         return TaskConfig.from_dict(args_dict)
 
@@ -119,6 +131,8 @@ class TaskConfig:
                 continue
 
             task.model = custom_model
+            task.model_args = custom_model.config
+            task.model_id = type(custom_model).__name__
             res_list.append(task)
 
         return res_list
@@ -168,6 +182,30 @@ tasks = ['arc', 'gsm8k', 'mmlu', 'cmmlu', 'ceval', 'bbh', 'general_qa']
 registry_tasks = {task: TaskConfig.from_yaml(os.path.join(cur_path, f'registry/tasks/{task}.yaml')) for task in tasks}
 
 
+def parse_task_config(task_cfg) -> TaskConfig:
+    """Parse task configuration from various formats into a TaskConfig object."""
+    if isinstance(task_cfg, TaskConfig):
+        logger.info('Args: Task config is provided with TaskConfig type.')
+    elif isinstance(task_cfg, dict):
+        logger.info('Args: Task config is provided with dictionary type.')
+        task_cfg = TaskConfig.from_dict(task_cfg)
+    elif isinstance(task_cfg, Namespace):
+        logger.info('Args: Task config is provided with CommandLine type.')
+        task_cfg = TaskConfig.from_args(task_cfg)
+    elif isinstance(task_cfg, str):
+        extension = task_cfg.split('.')[-1]
+        logger.info(f'Args: Task config is provided with {extension} file type.')
+        if extension in ['yaml', 'yml']:
+            task_cfg = TaskConfig.from_yaml(task_cfg)
+        elif extension == 'json':
+            task_cfg = TaskConfig.from_json(task_cfg)
+        else:
+            raise ValueError('Args: Unsupported file extension.')
+    else:
+        raise ValueError('Args: Please provide a valid task config.')
+    return task_cfg
+
+
 class TempModel(CustomModel):
 
     def __init__(self, config: dict):
evalscope/constants.py
CHANGED
@@ -1,5 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os
 from modelscope.utils.constant import DEFAULT_REPOSITORY_REVISION
 from modelscope.utils.file_utils import get_dataset_cache_root, get_model_cache_root
 
@@ -7,6 +6,7 @@ DEFAULT_WORK_DIR = './outputs'
 DEFAULT_MODEL_REVISION = DEFAULT_REPOSITORY_REVISION  # master
 DEFAULT_MODEL_CACHE_DIR = get_model_cache_root()  # ~/.cache/modelscope/hub
 DEFAULT_DATASET_CACHE_DIR = get_dataset_cache_root()  # ~/.cache/modelscope/datasets
+DEFAULT_ROOT_CACHE_DIR = DEFAULT_DATASET_CACHE_DIR  # compatible with old version
 
 
 class HubType:
@@ -76,33 +76,6 @@ class ArenaMode:
     PAIRWISE_BASELINE = 'pairwise_baseline'
 
 
-class OutputsStructure:
-    LOGS_DIR = 'logs'
-    PREDICTIONS_DIR = 'predictions'
-    REVIEWS_DIR = 'reviews'
-    REPORTS_DIR = 'reports'
-    CONFIGS_DIR = 'configs'
-
-    def __init__(self, outputs_dir: str, is_make: bool = True):
-        self.outputs_dir = outputs_dir
-        self.logs_dir = os.path.join(outputs_dir, OutputsStructure.LOGS_DIR)
-        self.predictions_dir = os.path.join(outputs_dir, OutputsStructure.PREDICTIONS_DIR)
-        self.reviews_dir = os.path.join(outputs_dir, OutputsStructure.REVIEWS_DIR)
-        self.reports_dir = os.path.join(outputs_dir, OutputsStructure.REPORTS_DIR)
-        self.configs_dir = os.path.join(outputs_dir, OutputsStructure.CONFIGS_DIR)
-
-        if is_make:
-            self.create_directories()
-
-    def create_directories(self):
-        os.makedirs(self.outputs_dir, exist_ok=True)
-        os.makedirs(self.logs_dir, exist_ok=True)
-        os.makedirs(self.predictions_dir, exist_ok=True)
-        os.makedirs(self.reviews_dir, exist_ok=True)
-        os.makedirs(self.reports_dir, exist_ok=True)
-        os.makedirs(self.configs_dir, exist_ok=True)
-
-
 class AnswerKeys:
     ANSWER_ID = 'answer_id'
     RAW_INPUT = 'raw_input'
@@ -162,21 +135,17 @@ class EvalStage:
 class EvalType:
 
     CUSTOM = 'custom'
-    CHECKPOINT = 'checkpoint'
+    CHECKPOINT = 'checkpoint'  # native model checkpoint
+    SERVICE = 'service'  # model service
 
 
 class EvalBackend:
-    # Use native evaluation pipeline of EvalScope
     NATIVE = 'Native'
-
-    # Use OpenCompass framework as the evaluation backend
     OPEN_COMPASS = 'OpenCompass'
-
-    # Use VLM Eval Kit as the multi-modal model evaluation backend
     VLM_EVAL_KIT = 'VLMEvalKit'
-
-    # Use RAGEval as the RAG evaluation backend
     RAG_EVAL = 'RAGEval'
-
-    # Use third-party evaluation backend/modules
     THIRD_PARTY = 'ThirdParty'
+
+
+class DataCollection:
+    NAME = 'data_collection'
evalscope/evaluator/__init__.py
CHANGED