evalscope 0.8.1__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/__init__.py +2 -0
- evalscope/arguments.py +10 -3
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/__init__.py +20 -1
- evalscope/benchmarks/arc/__init__.py +0 -5
- evalscope/benchmarks/arc/arc_adapter.py +23 -99
- evalscope/benchmarks/bbh/__init__.py +0 -4
- evalscope/benchmarks/bbh/bbh_adapter.py +19 -89
- evalscope/benchmarks/benchmark.py +70 -59
- evalscope/benchmarks/ceval/__init__.py +0 -5
- evalscope/benchmarks/ceval/ceval_adapter.py +22 -46
- evalscope/benchmarks/cmmlu/__init__.py +0 -5
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +20 -41
- evalscope/benchmarks/competition_math/__init__.py +0 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
- evalscope/benchmarks/data_adapter.py +114 -85
- evalscope/benchmarks/general_qa/__init__.py +0 -5
- evalscope/benchmarks/general_qa/general_qa_adapter.py +16 -19
- evalscope/benchmarks/gsm8k/__init__.py +0 -4
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +19 -98
- evalscope/benchmarks/hellaswag/__init__.py +0 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -96
- evalscope/benchmarks/humaneval/__init__.py +0 -4
- evalscope/benchmarks/humaneval/humaneval_adapter.py +16 -117
- evalscope/benchmarks/mmlu/__init__.py +0 -5
- evalscope/benchmarks/mmlu/mmlu_adapter.py +26 -48
- evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
- evalscope/benchmarks/race/__init__.py +0 -5
- evalscope/benchmarks/race/race_adapter.py +25 -53
- evalscope/benchmarks/trivia_qa/__init__.py +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +24 -97
- evalscope/benchmarks/truthful_qa/__init__.py +0 -5
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +23 -33
- evalscope/collections/__init__.py +3 -0
- evalscope/collections/evaluator.py +178 -0
- evalscope/collections/sampler.py +132 -0
- evalscope/collections/schema.py +122 -0
- evalscope/config.py +10 -6
- evalscope/constants.py +7 -28
- evalscope/evaluator/evaluator.py +66 -108
- evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
- evalscope/metrics/__init__.py +6 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/math_accuracy.py +193 -50
- evalscope/metrics/metrics.py +7 -4
- evalscope/metrics/rouge_metric.py +13 -8
- evalscope/models/__init__.py +14 -1
- evalscope/models/base_adapter.py +52 -0
- evalscope/models/chat_adapter.py +138 -0
- evalscope/models/choice_adapter.py +211 -0
- evalscope/models/custom_adapter.py +67 -0
- evalscope/models/local_model.py +74 -0
- evalscope/models/model.py +141 -0
- evalscope/models/server_adapter.py +104 -0
- evalscope/perf/arguments.py +1 -0
- evalscope/perf/benchmark.py +1 -1
- evalscope/perf/main.py +3 -1
- evalscope/perf/plugin/api/openai_api.py +51 -47
- evalscope/perf/utils/local_server.py +1 -0
- evalscope/run.py +37 -66
- evalscope/run_arena.py +1 -1
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/chat_service.py +4 -3
- evalscope/utils/io_utils.py +8 -0
- evalscope/utils/logger.py +4 -0
- evalscope/utils/model_utils.py +10 -0
- evalscope/utils/utils.py +3 -25
- evalscope/version.py +2 -2
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/METADATA +46 -17
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/RECORD +81 -92
- tests/cli/test_collection.py +53 -0
- tests/cli/test_run.py +43 -1
- tests/perf/test_perf.py +3 -3
- tests/rag/test_mteb.py +3 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
- evalscope/models/api/__init__.py +0 -3
- evalscope/models/dummy_chat_model.py +0 -49
- evalscope/models/model_adapter.py +0 -525
- evalscope/models/openai_model.py +0 -103
- /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/LICENSE +0 -0
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/WHEEL +0 -0
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py
@@ -5,45 +5,34 @@ import numpy as np
 import os
 from typing import List

+from evalscope.benchmarks import Benchmark
 from evalscope.benchmarks.data_adapter import DataAdapter
-from evalscope.
-from evalscope.
+from evalscope.constants import EvalType
+from evalscope.metrics import WeightedAverageAccuracy
+from evalscope.metrics.metrics import exact_match
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils import get_logger

 # flake8: noqa

 logger = get_logger()

-DATASET_ID = 'modelscope/trivia_qa'
-SUBSET_LIST = ['default']
-

+@Benchmark.register(
+    name='trivia_qa',
+    dataset_id='modelscope/trivia_qa',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['default'],
+    metric_list=[WeightedAverageAccuracy],
+    few_shot_num=5,
+    train_split='dev',
+    eval_split='test',
+)
 class TriviaQaAdapter(DataAdapter):

-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = 'dev',
-                 eval_split: str = 'test',
-                 **kwargs):
-
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
+    def __init__(self, **kwargs):

-
-            logger.info(f'few_shot_num is not specified for TriviaQA, use default value: 5')
-            few_shot_num = 5
-
-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            **kwargs)
+        super().__init__(**kwargs)

     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}

@@ -113,16 +102,16 @@ class TriviaQaAdapter(DataAdapter):
         few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
         context: str = '\n'.join(few_shot_prompts) + '\n'
         context += self._generate_prompt(input_d=input_d, include_answer=False)
-        full_prompt =
+        full_prompt = context

-        return {'data': [full_prompt]}
+        return {'data': [full_prompt], 'system_prompt': prompt}

     def get_gold_answer(self, input_d: dict) -> list:
         # Get the gold choice
         ans: list = input_d.get('ideal', [])
         return ans

-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str =
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
         """
         Parse the model output to get the answer.

@@ -134,73 +123,11 @@ class TriviaQaAdapter(DataAdapter):
         Returns:
             The predicted answer.
         """
-
-            return result
-        elif eval_type == 'service':  # TODO: to be implemented
-            return result
-        elif eval_type == 'custom':  # TODO: to be implemented
-            return result
-        else:
-            raise ValueError(f'Unknown eval_type: {eval_type}')
+        return result

     def match(self, gold: list, pred: str) -> float:
-
-
-    def compute_metric(self, review_res_list: list) -> float:
-        """
-        Compute evaluation result by specific metric.
-
-        Args:
-            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-        Returns:
-            The metric score.
-        """
-        items = [(score, 1.0) for score in review_res_list]
-        return weighted_mean(items)
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate the report for the model output.
-
-        Args:
-            subset_score_map: {subset_name: (score, num), ...}
-            report_name: The user-defined report name.
-
-        Returns:
-        {
-            "name":"TriviaQA",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.3389,
-                    "subset":[
-                        {
-                            "name":"default",
-                            "score":0.3389
-                        }
-                    ]
-                }
-            ],
-            "total_num":100
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        cate_avg_list = [{'name': subset_name, 'score': score} for subset_name, (score, _) in subset_score_map.items()]
-
-        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'trivia_qa',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
+        is_correct = any([cand in pred for cand in gold])
+        return 1 if is_correct else 0

     @classmethod
     def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
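For illustration only (this sketch is not part of the released diff): the hunks above drop the old per-adapter constants (DATASET_ID, SUBSET_LIST) and constructor boilerplate in favour of the declarative @Benchmark.register decorator. A minimal sketch of registering a custom adapter under the same pattern, using the hypothetical names my_qa, my-org/my_qa and MyQaAdapter; the decorator keywords and imports are copied from the new trivia_qa adapter, and a real adapter would still implement the data-loading and parsing methods shown above.

from evalscope.benchmarks import Benchmark
from evalscope.benchmarks.data_adapter import DataAdapter
from evalscope.metrics import WeightedAverageAccuracy
from evalscope.models import ChatGenerationModelAdapter


@Benchmark.register(
    name='my_qa',                              # hypothetical benchmark name
    dataset_id='my-org/my_qa',                 # hypothetical dataset id
    model_adapter=ChatGenerationModelAdapter,  # generation adapter from this release
    subset_list=['default'],
    metric_list=[WeightedAverageAccuracy],
    few_shot_num=0,
    train_split=None,
    eval_split='test',
)
class MyQaAdapter(DataAdapter):

    def __init__(self, **kwargs):
        # Benchmark metadata now comes from the register() call above;
        # the constructor only forwards per-run overrides.
        super().__init__(**kwargs)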
evalscope/benchmarks/truthful_qa/__init__.py
@@ -1,6 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import DATASET_ID, SUBSET_LIST
-from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import TruthfulQaAdapter
-from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import TruthfulQaAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ContinuationLogitsModelAdapter as ModelAdapterClass  # noqa
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py
@@ -6,20 +6,31 @@ import numpy as np
 import os
 from typing import List

+from evalscope.benchmarks import Benchmark
 from evalscope.benchmarks.data_adapter import DataAdapter
+from evalscope.constants import EvalType
+from evalscope.metrics import WeightedAverageAccuracy
 from evalscope.metrics.metrics import weighted_mean
-from evalscope.
-from evalscope.utils
+from evalscope.models import ContinuationLogitsModelAdapter
+from evalscope.utils import get_logger, normalize_score

 # flake8: noqa

 logger = get_logger()

-DATASET_ID = 'modelscope/truthful_qa'
 # task_list = ['generation', 'multiple_choice']
-SUBSET_LIST = ['multiple_choice']


+@Benchmark.register(
+    name='truthful_qa',
+    dataset_id='modelscope/truthful_qa',
+    model_adapter=ContinuationLogitsModelAdapter,
+    subset_list=['multiple_choice'],
+    metric_list=[WeightedAverageAccuracy],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='validation',
+)
 class TruthfulQaAdapter(DataAdapter):
     """
     Adapter for TruthfulQA benchmark.

@@ -40,35 +51,14 @@ class TruthfulQaAdapter(DataAdapter):
         'Q: Where were the 1992 Olympics held?\n'
         'A: The 1992 Olympics were held in Barcelona, Spain.')

-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = None,
-                 eval_split: str = 'validation',
-                 **kwargs):
-
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-        if few_shot_num is None:
-            logger.info(f'Set 0-shot examples by system for TruthfulQA.')
-            few_shot_num = 0
+    def __init__(self, **kwargs):

+        few_shot_num = kwargs.get('few_shot_num', 0)
         if few_shot_num != 0:
             logger.warning(f'few_shot_num should be 0 for TruthfulQA, but got {few_shot_num}. Use 0-shot by default.')
-            few_shot_num = 0
+            kwargs['few_shot_num'] = 0

-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            **kwargs)
+        super().__init__(**kwargs)

     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}

@@ -215,7 +205,7 @@ class TruthfulQaAdapter(DataAdapter):
         # TODO: generation sub-task to be added
         return {'mc1_labels': input_d['mc1_targets']['labels'], 'mc2_labels': input_d['mc2_targets']['labels']}

-    def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str =
+    def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> list:
         """
         Parse the model output to get the answer.

@@ -227,11 +217,11 @@ class TruthfulQaAdapter(DataAdapter):
         Returns:
             The predicted answer.
         """
-        if eval_type ==
+        if eval_type == EvalType.CHECKPOINT:
             return result
-        elif eval_type ==
+        elif eval_type == EvalType.SERVICE:  # TODO: to be supported !
             return result
-        elif eval_type ==
+        elif eval_type == EvalType.CUSTOM:  # TODO: to be supported !
             return result
         else:
             raise ValueError(f'Invalid eval_type: {eval_type}')
evalscope/collections/evaluator.py (new file)
@@ -0,0 +1,178 @@
+import json
+import os
+import pandas as pd
+from collections import defaultdict
+from tabulate import tabulate
+from tqdm import tqdm
+
+from evalscope.benchmarks import Benchmark
+from evalscope.collections.sampler import DatasetEntry
+from evalscope.config import TaskConfig
+from evalscope.constants import AnswerKeys, DumpMode, EvalType, ReviewKeys
+from evalscope.evaluator import Evaluator
+from evalscope.models import get_local_model, initialize_model_adapter
+from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class SimpleEvaluator(Evaluator):
+
+    def __init__(self, dataset_name, data_adapter, model_adapter, task_cfg, outputs):
+        super().__init__(
+            dataset_name_or_path=dataset_name,
+            data_adapter=data_adapter,
+            model_adapter=model_adapter,
+            task_cfg=task_cfg,
+            outputs=outputs)
+
+    def get_answer(self, input_prompt, subset_name, infer_cfg) -> dict:
+        answer_d: dict = self.model_adapter.predict(inputs=input_prompt, infer_cfg=infer_cfg)
+        answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+        processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
+        return processed_answer
+
+    def get_review(self, answer_d) -> dict:
+        review_id, reviewer_spec = self._generate_review_id(answer_d)
+        review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
+        return review_d
+
+
+class EvaluatorCollection:
+
+    def __init__(self, task_cfg: TaskConfig, outputs: OutputsStructure):
+        self.task_cfg = task_cfg
+        self.outputs = outputs
+        self.model = get_local_model(task_cfg)
+        self.dataset = self.load()
+        self.dataset_name_map, self.dataset_id_map = self._parse_dataset()
+        self.evaluators = self._initialize_evaluators()
+
+    def load(self) -> list[DatasetEntry]:
+        raw_dataset = jsonl_to_list(self.task_cfg.dataset_args['data_collection']['local_path'])
+        datasets = []
+        for sample in raw_dataset:
+            datasets.append(DatasetEntry(**sample))
+        return datasets
+
+    def _parse_dataset(self):
+        dataset_name_map = defaultdict(lambda: defaultdict(list))
+        dataset_id_map = {}
+        for sample in self.dataset:
+            dataset_name, subset_name = sample.dataset_name, sample.subset_name
+            dataset_name_map[dataset_name][subset_name].append(sample.index)
+            dataset_id_map[sample.index] = sample
+        return dataset_name_map, dataset_id_map
+
+    def _initialize_evaluators(self):
+        evaluators = {}
+        for dataset_name in self.dataset_name_map.keys():
+            benchmark = Benchmark.get(dataset_name)
+            data_adapter = benchmark.get_data_adapter()
+            model_adapter = initialize_model_adapter(self.task_cfg, benchmark.model_adapter, self.model)
+            evaluators[dataset_name] = SimpleEvaluator(dataset_name, data_adapter, model_adapter, self.task_cfg,
+                                                       self.outputs)
+        return evaluators
+
+    def get_report(self, reviews):
+        data = []
+        for dataset_name, data_map in self.dataset_name_map.items():
+            for subset_name, ids in data_map.items():
+                for _id in ids:
+                    review_d = reviews[_id]
+                    row_data: DatasetEntry = self.dataset_id_map[_id]
+                    score = self.get_pred_score(review_d)
+                    data.append({
+                        'task_type': row_data.task,
+                        'dataset_name': dataset_name,
+                        'subset_name': subset_name,
+                        'tags': row_data.tags,
+                        'score': score
+                    })
+
+        df = pd.DataFrame(data)
+        # Explode tags to multiple rows
+        df_exploded = df.explode('tags')
+
+        # Helper function for aggregation and sorting
+        def aggregate_and_sort(df, group_by_cols):
+            report_df = df.groupby(group_by_cols) \
+                .agg(average_score=('score', 'mean'), count=('score', 'size')) \
+                .reset_index()
+
+            # Round average_score to 4 decimal places
+            report_df['average_score'] = report_df['average_score'].round(4)
+
+            report_df = report_df.sort_values(by='count', ascending=False) \
+                .to_dict(orient='records')
+            return report_df
+
+        # Multi-level aggregation
+        subset_report_df = aggregate_and_sort(df, ['task_type', 'dataset_name', 'subset_name'])
+        dataset_report_df = aggregate_and_sort(df, ['task_type', 'dataset_name'])
+        task_report_df = aggregate_and_sort(df, ['task_type'])
+        tag_report_df = aggregate_and_sort(df_exploded, ['tags'])
+
+        # Convert sorted DataFrames to Dict
+        report = {
+            'subset_level': subset_report_df,
+            'dataset_level': dataset_report_df,
+            'task_level': task_report_df,
+            'tag_level': tag_report_df
+        }
+
+        # Log the report
+        for level, data in report.items():
+            table = tabulate(data, headers='keys', tablefmt='pretty', showindex=False)
+            logger.info(f'{level} Report:\n{table}')
+
+        # Save the report to a JSON file
+        report_file_path = os.path.join(self.outputs.reports_dir, 'data_collection.json')
+        with open(report_file_path, 'w', encoding='utf-8') as f:
+            json.dump(report, f, ensure_ascii=False, indent=4)
+
+    def get_answers(self):
+        pred_file_path = os.path.join(self.outputs.predictions_dir, 'data_collection.jsonl')
+        answers = defaultdict(dict)
+        for sample in tqdm(self.dataset, desc='Getting answers'):
+            evaluator = self.evaluators[sample.dataset_name]
+            answer_d = evaluator.get_answer(sample.prompt, sample.subset_name, self.task_cfg.generation_config)
+            answers[sample.index] = answer_d
+            dump_jsonl_data(answer_d, pred_file_path, dump_mode=DumpMode.APPEND)
+        return answers
+
+    def get_reviews(self, answers):
+        review_file_path = os.path.join(self.outputs.reviews_dir, 'data_collection.jsonl')
+        reviews = defaultdict(dict)
+        for sample in tqdm(self.dataset, desc='Getting reviews'):
+            evaluator = self.evaluators[sample.dataset_name]
+            review_d = evaluator.get_review(answers[sample.index])
+            reviews[sample.index] = review_d
+            dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND)
+        return reviews
+
+    @staticmethod
+    def get_pred_score(review_d) -> float:
+        return float(review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT])
+
+    def eval(self, **kwargs):
+        answers = self.get_answers()
+        reviews = self.get_reviews(answers)
+        self.get_report(reviews)
+
+
+if __name__ == '__main__':
+    task_cfg = TaskConfig(
+        model='qwen2.5',
+        api_url='http://127.0.0.1:8801/v1/chat/completions',
+        api_key='EMPTY',
+        eval_type=EvalType.SERVICE,
+        datasets=['data_collection'],
+        dataset_args={'data_collection': {
+            'local_path': 'outputs/mixed_data.jsonl'
+        }},
+    )
+
+    evaluator_collection = EvaluatorCollection(task_cfg)
+    evaluator_collection.eval()
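For illustration only (not part of the released diff): a minimal sketch of reading back the report that EvaluatorCollection.get_report writes to <reports_dir>/data_collection.json. The four top-level keys mirror the aggregation levels built above, and each record carries its grouping columns plus the 'average_score' and 'count' fields produced by aggregate_and_sort; the path below is a placeholder for an actual outputs directory.

import json

report_path = 'outputs/<run_id>/reports/data_collection.json'  # placeholder path
with open(report_path, encoding='utf-8') as f:
    report = json.load(f)

for level in ('subset_level', 'dataset_level', 'task_level', 'tag_level'):
    for record in report[level]:
        # e.g. a subset_level record also carries task_type, dataset_name, subset_name
        print(level, record['average_score'], record['count'])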
evalscope/collections/sampler.py (new file)
@@ -0,0 +1,132 @@
+import random
+from abc import ABC, abstractmethod
+from dataclasses import asdict, dataclass, field
+from tqdm import tqdm
+from typing import List, Optional
+
+from evalscope.collections.schema import CollectionSchema, DatasetInfo
+
+
+@dataclass
+class DatasetEntry:
+    index: int = 0
+    prompt: dict = field(default_factory=dict)
+    tags: List[str] = field(default_factory=list)
+    task: str = ''
+    weight: float = 0.0
+    dataset_name: str = ''
+    subset_name: str = ''
+
+
+# Define an abstract base class for Samplers
+class Sampler(ABC):
+
+    def __init__(self, schema: CollectionSchema, count: Optional[int] = None):
+        self.schema = schema
+        self.count = count
+
+    @abstractmethod
+    def sample(self) -> List[dict]:
+        raise NotImplementedError
+
+    def _collect_dataset_data(self, dataset_info_list: List[DatasetInfo]) -> List[DatasetEntry]:
+        all_data = []
+        for dataset in tqdm(dataset_info_list, desc='Collecting dataset data'):
+            data_dict = dataset.get_data()
+            for subset_name, subset_data in data_dict.items():
+                for prompt in subset_data:
+                    all_data.append(
+                        DatasetEntry(
+                            prompt=prompt,
+                            tags=dataset.tags,
+                            task=dataset.task_type,
+                            weight=dataset.weight,
+                            dataset_name=dataset.name,
+                            subset_name=subset_name,
+                        ))
+        return all_data
+
+    def _update_index(self, all_data: List[DatasetEntry]) -> List[dict]:
+        result = []
+        for i, entry in enumerate(all_data):
+            entry.index = i
+            result.append(asdict(entry))
+        return result
+
+
+class WeightedSampler(Sampler):
+    """
+    Weighted sampler, according to the weight of each dataset, sample data from each dataset.
+    """
+
+    def sample(self) -> List[dict]:
+        dataset_info_list = self.schema.flatten()
+        all_data = self._collect_dataset_data(dataset_info_list)
+
+        remaining_count = self.count
+        sampled_data = []
+
+        for i, dataset in enumerate(tqdm(dataset_info_list, desc='Sampling data')):
+            if i == len(dataset_info_list) - 1:
+                dataset_sample_count = remaining_count
+            else:
+                dataset_sample_count = int(dataset.weight * self.count)
+                remaining_count -= dataset_sample_count
+
+            sampled_data.extend(random.choices(all_data, k=dataset_sample_count))
+
+        return self._update_index(sampled_data)
+
+
+class UniformSampler(Sampler):
+    """
+    Uniform sampler, sample data from each dataset with the same number of samples.
+    """
+
+    def sample(self) -> List[dict]:
+        dataset_info_list = self.schema.flatten()
+        all_data = self._collect_dataset_data(dataset_info_list)
+
+        num_datasets = len(dataset_info_list)
+        samples_per_dataset = self.count // num_datasets
+        sampled_data = []
+
+        for _ in tqdm(dataset_info_list, desc='Sampling data'):
+            sampled_data.extend(random.choices(all_data, k=samples_per_dataset))
+
+        return self._update_index(sampled_data)
+
+
+class StratifiedSampler(Sampler):
+    """
+    Stratified sampler, sample data from each dataset according to the number of samples of each dataset.
+    """
+
+    def sample(self) -> List[dict]:
+        dataset_info_list = self.schema.flatten()
+        all_data = self._collect_dataset_data(dataset_info_list)
+
+        total_samples = sum(len(dataset.get_data()) for dataset in dataset_info_list)
+        sampled_data = []
+
+        for dataset in tqdm(dataset_info_list, desc='Sampling data'):
+            dataset_samples = len(dataset.get_data())
+            samples_for_dataset = int((dataset_samples / total_samples) * self.count)
+            sampled_data.extend(random.choices(all_data, k=samples_for_dataset))
+
+        return self._update_index(sampled_data)
+
+
+if __name__ == '__main__':
+    from evalscope.utils.io_utils import dump_jsonl_data
+
+    schema = CollectionSchema.from_json('outputs/schema.json')
+    print(schema.to_dict())
+    mixed_data = WeightedSampler(schema, 100).sample()
+    dump_jsonl_data(mixed_data, 'outputs/weighted_mixed_data.jsonl')
+
+    mixed_data = UniformSampler(schema, 100).sample()
+    dump_jsonl_data(mixed_data, 'outputs/uniform_mixed_data.jsonl')
+
+    mixed_data = StratifiedSampler(schema, 100).sample()
+    dump_jsonl_data(mixed_data, 'outputs/stratified_mixed_data.jsonl')
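For illustration only (not part of the released diff): a minimal sketch of the record that Sampler._update_index emits for each sampled item, which is what EvaluatorCollection.load later rebuilds via DatasetEntry(**sample). The field names come from the DatasetEntry dataclass above; the field values here are hypothetical.

from dataclasses import asdict

from evalscope.collections.sampler import DatasetEntry

entry = DatasetEntry(
    index=0,
    prompt={'data': ['Question: ...']},  # hypothetical prompt payload, shaped like the adapters' gen_prompt output
    tags=['en', 'qa'],                   # hypothetical tags
    task='qa',                           # hypothetical task type
    weight=0.5,
    dataset_name='trivia_qa',
    subset_name='default',
)
print(asdict(entry))  # one JSON-serializable dict per line of the sampled JSONL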