evalscope 0.8.0__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +2 -0
- evalscope/arguments.py +11 -3
- evalscope/backend/base.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
- evalscope/backend/rag_eval/utils/clip.py +2 -2
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/__init__.py +20 -1
- evalscope/benchmarks/arc/__init__.py +0 -5
- evalscope/benchmarks/arc/arc_adapter.py +24 -102
- evalscope/benchmarks/bbh/__init__.py +0 -4
- evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
- evalscope/benchmarks/benchmark.py +70 -59
- evalscope/benchmarks/ceval/__init__.py +0 -5
- evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
- evalscope/benchmarks/cmmlu/__init__.py +0 -5
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
- evalscope/benchmarks/competition_math/__init__.py +0 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
- evalscope/benchmarks/data_adapter.py +115 -87
- evalscope/benchmarks/general_qa/__init__.py +0 -5
- evalscope/benchmarks/general_qa/general_qa_adapter.py +24 -80
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +103 -0
- evalscope/benchmarks/gsm8k/__init__.py +0 -4
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +22 -101
- evalscope/benchmarks/hellaswag/__init__.py +0 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +33 -99
- evalscope/benchmarks/humaneval/__init__.py +0 -4
- evalscope/benchmarks/humaneval/humaneval_adapter.py +93 -9
- evalscope/benchmarks/ifeval/__init__.py +0 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +56 -0
- evalscope/benchmarks/ifeval/instructions.py +1477 -0
- evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
- evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
- evalscope/benchmarks/ifeval/utils.py +134 -0
- evalscope/benchmarks/iquiz/__init__.py +0 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
- evalscope/benchmarks/mmlu/__init__.py +0 -5
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
- evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
- evalscope/benchmarks/race/__init__.py +0 -5
- evalscope/benchmarks/race/race_adapter.py +27 -123
- evalscope/benchmarks/trivia_qa/__init__.py +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
- evalscope/benchmarks/truthful_qa/__init__.py +0 -5
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +30 -0
- evalscope/collections/__init__.py +3 -0
- evalscope/collections/evaluator.py +198 -0
- evalscope/collections/sampler.py +138 -0
- evalscope/collections/schema.py +126 -0
- evalscope/config.py +45 -7
- evalscope/constants.py +7 -38
- evalscope/evaluator/__init__.py +0 -1
- evalscope/evaluator/evaluator.py +89 -121
- evalscope/evaluator/rating_eval.py +1 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +14 -5
- evalscope/metrics/__init__.py +3 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/math_accuracy.py +193 -50
- evalscope/metrics/metrics.py +18 -6
- evalscope/metrics/named_metrics.py +17 -0
- evalscope/metrics/rouge_metric.py +13 -8
- evalscope/models/__init__.py +14 -1
- evalscope/models/base_adapter.py +52 -0
- evalscope/models/chat_adapter.py +140 -0
- evalscope/models/choice_adapter.py +211 -0
- evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +1 -1
- evalscope/models/custom_adapter.py +67 -0
- evalscope/models/local_model.py +74 -0
- evalscope/models/model.py +141 -0
- evalscope/models/server_adapter.py +111 -0
- evalscope/perf/__init__.py +1 -0
- evalscope/perf/arguments.py +3 -1
- evalscope/perf/benchmark.py +3 -3
- evalscope/perf/main.py +5 -7
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +54 -50
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/longalpaca.py +1 -1
- evalscope/perf/plugin/registry.py +3 -3
- evalscope/perf/utils/benchmark_util.py +4 -4
- evalscope/perf/utils/db_util.py +66 -22
- evalscope/perf/utils/local_server.py +4 -1
- evalscope/report/__init__.py +5 -0
- evalscope/report/app.py +693 -0
- evalscope/report/combinator.py +73 -0
- evalscope/report/generator.py +80 -0
- evalscope/report/utils.py +133 -0
- evalscope/run.py +64 -125
- evalscope/run_arena.py +3 -2
- evalscope/summarizer.py +15 -27
- evalscope/third_party/longbench_write/eval.py +2 -1
- evalscope/third_party/longbench_write/longbench_write.py +2 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/__init__.py +1 -0
- evalscope/utils/chat_service.py +6 -5
- evalscope/utils/io_utils.py +170 -0
- evalscope/utils/logger.py +13 -0
- evalscope/utils/model_utils.py +15 -2
- evalscope/utils/utils.py +3 -200
- evalscope/version.py +2 -2
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/METADATA +129 -23
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/RECORD +119 -115
- tests/cli/test_collection.py +57 -0
- tests/cli/test_run.py +57 -7
- tests/perf/test_perf.py +3 -2
- tests/rag/test_mteb.py +3 -2
- tests/vlm/test_vlmeval.py +3 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
- evalscope/evaluator/humaneval_evaluator.py +0 -158
- evalscope/models/api/__init__.py +0 -3
- evalscope/models/dummy_chat_model.py +0 -49
- evalscope/models/model_adapter.py +0 -525
- evalscope/models/openai_model.py +0 -103
- evalscope/tools/__init__.py +0 -1
- evalscope/tools/combine_reports.py +0 -135
- evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
- /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/LICENSE +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/WHEEL +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/race/race_adapter.py
CHANGED
@@ -1,57 +1,41 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-import json
 import os
 
-from evalscope.benchmarks
-from evalscope.
-from evalscope.
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType
+from evalscope.metrics import AverageAccuracy, exact_match
+from evalscope.models import MultiChoiceModelAdapter
+from evalscope.utils import ResponseParser
+from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
 
 logger = get_logger()
 
-DATASET_ID = 'modelscope/race'
-
-SUBSET_LIST = ['high', 'middle']
-
-SUBJECT_MAPPING = {'high': 'High', 'middle': 'Middle'}
-
 
+@Benchmark.register(
+    name='race',
+    dataset_id='modelscope/race',
+    model_adapter=MultiChoiceModelAdapter,
+    subset_list=['high', 'middle'],
+    metric_list=[AverageAccuracy],
+    few_shot_num=3,
+    train_split='train',
+    eval_split='test',
+)
 class RACEAdapter(DataAdapter):
 
     choices = ['A', 'B', 'C', 'D']
 
-    def __init__(self,
-
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = 'train',
-                 eval_split: str = 'test',
-                 **kwargs):
-
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-        if few_shot_num is None:
-            logger.info(f'Set 3-shot examples by system for RACE.')
-            few_shot_num = 3
-
+    def __init__(self, **kwargs):
+        few_shot_num = kwargs.get('few_shot_num', 3)
         if few_shot_num > 3:
            logger.warning(f'few_shot_num <= 3 for RACE, but got {few_shot_num}. Use 3-shot by default.')
-            few_shot_num = 3
+            kwargs['few_shot_num'] = 3
 
-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            **kwargs)
+        super().__init__(**kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -98,13 +82,13 @@ class RACEAdapter(DataAdapter):
 
         full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
 
-        return {'data': [full_prompt], 'multi_choices': self.choices}
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
         return input_d.get('answer', '')
 
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str =
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
         """
         Parse the model output to get the answer. Could be the best choice index.
 
@@ -116,98 +100,18 @@ class RACEAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if eval_type ==
-            return result
-        elif eval_type == 'service':  # TODO: to be implemented
-            return result
-        elif eval_type == 'custom':  # TODO: to be implemented
+        if eval_type == EvalType.CHECKPOINT:
             return result
+        elif eval_type == EvalType.SERVICE:
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
+        elif eval_type == EvalType.CUSTOM:
+            return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
         else:
            raise ValueError(f'Unknown eval_type: {eval_type}')
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
 
-    def compute_metric(self, review_res_list: list) -> float:
-        """
-        Compute evaluation result by specific metric.
-
-        Args:
-            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-        Returns:
-            The metric score.
-        """
-        items = [(score, 1.0) for score in review_res_list]
-        return weighted_mean(items)
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate report for the evaluation.
-
-        Args:
-            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-            report_name: The user-defined report name.
-
-        Returns:
-        {
-            "name":"RACE",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"High",
-                    "score":0.2528,
-                    "subset":[
-                        {
-                            "name":"high",
-                            "score":0.2528
-                        }
-                    ]
-                }
-            ],
-            "total_num":59
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-
-        # Get domain-subject mapping
-        subject_review_map = {}
-        for subset_name, (subset_score, num) in subset_score_map.items():
-            domain_name: str = SUBJECT_MAPPING.get(subset_name)
-            if domain_name in subject_review_map:
-                subject_review_map[domain_name].append((subset_name, subset_score, num))
-            else:
-                subject_review_map[domain_name] = [(subset_name, subset_score, num)]
-
-        # Get domain score
-        category_list = []
-        for domain_name, domain_res_list in subject_review_map.items():
-            domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
-                sum([num for _, _, num in domain_res_list])
-            domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
-            category_list.append({
-                'name':
-                domain_name,
-                'score':
-                normalize_score(score=domain_weighted_avg_acc),
-                'subset': [{
-                    'name': subset_name,
-                    'score': subset_score
-                } for subset_name, subset_score, _ in domain_res_list]
-            })
-
-        # Get final dict of report
-        res_map = dict(
-            name=report_name or 'race',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=category_list,
-            total_num=total_num)
-
-        return res_map
-
     @classmethod
     def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
 
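The same refactor repeats across the benchmark adapters in this release: module-level constants (DATASET_ID, SUBSET_LIST, SUBJECT_MAPPING) and the hand-rolled compute_metric/gen_report methods give way to a declarative @Benchmark.register(...) decorator, with reporting handled centrally by the new evalscope/report package. Below is a minimal sketch of a custom adapter under this scheme; the benchmark name, dataset id, and answer-parsing logic are invented for illustration, only the registration keywords and method names visible in this diff are assumed, and other hooks (prompt building, on-disk loading) are omitted.

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import EvalType
from evalscope.metrics import AverageAccuracy, exact_match
from evalscope.models import ChatGenerationModelAdapter


@Benchmark.register(
    name='my_qa',                            # hypothetical benchmark name
    dataset_id='my-org/my_qa',               # hypothetical dataset id
    model_adapter=ChatGenerationModelAdapter,
    subset_list=['default'],
    metric_list=[AverageAccuracy],
    few_shot_num=0,
    train_split=None,
    eval_split='test',
)
class MyQaAdapter(DataAdapter):

    def __init__(self, **kwargs):
        # Defaults now come from the register() call; __init__ only adjusts kwargs.
        super().__init__(**kwargs)

    def get_gold_answer(self, input_d: dict) -> str:
        return input_d.get('answer', '')

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        return result

    def match(self, gold: str, pred: str) -> float:
        return exact_match(gold=gold, pred=pred)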
evalscope/benchmarks/trivia_qa/__init__.py
CHANGED
@@ -1,6 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import DATASET_ID, SUBSET_LIST
-from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import TriviaQaAdapter
-from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import TriviaQaAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py
CHANGED
@@ -1,49 +1,35 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Copyright (c) EleutherAI Inc, and its affiliates.
 import csv
-import numpy as np
 import os
-from typing import List
 
+from evalscope.benchmarks import Benchmark
 from evalscope.benchmarks.data_adapter import DataAdapter
-from evalscope.
-from evalscope.
+from evalscope.constants import EvalType
+from evalscope.metrics import AverageAccuracy
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils import get_logger
 
 # flake8: noqa
 
 logger = get_logger()
 
-DATASET_ID = 'modelscope/trivia_qa'
-SUBSET_LIST = ['default']
-
 
+@Benchmark.register(
+    name='trivia_qa',
+    dataset_id='modelscope/trivia_qa',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['default'],
+    metric_list=[AverageAccuracy],
+    few_shot_num=5,
+    train_split='dev',
+    eval_split='test',
+)
 class TriviaQaAdapter(DataAdapter):
 
-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = 'dev',
-                 eval_split: str = 'test',
-                 **kwargs):
-
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
+    def __init__(self, **kwargs):
 
-
-        logger.info(f'few_shot_num is not specified for TriviaQA, use default value: 5')
-        few_shot_num = 5
-
-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            **kwargs)
+        super().__init__(**kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -113,16 +99,16 @@ class TriviaQaAdapter(DataAdapter):
         few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
         context: str = '\n'.join(few_shot_prompts) + '\n'
         context += self._generate_prompt(input_d=input_d, include_answer=False)
-        full_prompt =
+        full_prompt = context
 
-        return {'data': [full_prompt]}
+        return {'data': [full_prompt], 'system_prompt': prompt or self.prompt_template}
 
     def get_gold_answer(self, input_d: dict) -> list:
         # Get the gold choice
         ans: list = input_d.get('ideal', [])
         return ans
 
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str =
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
         """
         Parse the model output to get the answer.
 
@@ -134,73 +120,11 @@ class TriviaQaAdapter(DataAdapter):
         Returns:
             The predicted answer.
         """
-
-            return result
-        elif eval_type == 'service':  # TODO: to be implemented
-            return result
-        elif eval_type == 'custom':  # TODO: to be implemented
-            return result
-        else:
-            raise ValueError(f'Unknown eval_type: {eval_type}')
+        return result
 
     def match(self, gold: list, pred: str) -> float:
-
-
-    def compute_metric(self, review_res_list: list) -> float:
-        """
-        Compute evaluation result by specific metric.
-
-        Args:
-            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-        Returns:
-            The metric score.
-        """
-        items = [(score, 1.0) for score in review_res_list]
-        return weighted_mean(items)
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate the report for the model output.
-
-        Args:
-            subset_score_map: {subset_name: (score, num), ...}
-            report_name: The user-defined report name.
-
-        Returns:
-        {
-            "name":"TriviaQA",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.3389,
-                    "subset":[
-                        {
-                            "name":"default",
-                            "score":0.3389
-                        }
-                    ]
-                }
-            ],
-            "total_num":100
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        cate_avg_list = [{'name': subset_name, 'score': score} for subset_name, (score, _) in subset_score_map.items()]
-
-        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'trivia_qa',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
+        is_correct = any([cand in pred for cand in gold])
+        return 1 if is_correct else 0
 
     @classmethod
     def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
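The behavioural change buried in the TriviaQA diff is the new match(): instead of routing scores through weighted_mean and a bespoke gen_report, the adapter now gives credit whenever any gold alias occurs as a substring of the prediction, and aggregation is left to the shared AverageAccuracy metric. A self-contained sketch of that containment rule (no evalscope imports needed):

def match(gold: list, pred: str) -> int:
    # Same rule as the new TriviaQaAdapter.match above: any gold alias contained in the prediction counts.
    is_correct = any([cand in pred for cand in gold])
    return 1 if is_correct else 0


print(match(['Barcelona', 'Barcelona, Spain'], 'A: They were held in Barcelona.'))  # -> 1
print(match(['Paris'], 'A: London hosted the games.'))                              # -> 0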
evalscope/benchmarks/truthful_qa/__init__.py
CHANGED
@@ -1,6 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import DATASET_ID, SUBSET_LIST
-from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import TruthfulQaAdapter
-from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import TruthfulQaAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ContinuationLogitsModelAdapter as ModelAdapterClass  # noqa
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py
CHANGED
@@ -6,20 +6,30 @@ import numpy as np
 import os
 from typing import List
 
+from evalscope.benchmarks import Benchmark
 from evalscope.benchmarks.data_adapter import DataAdapter
-from evalscope.
-from evalscope.
-from evalscope.
+from evalscope.constants import EvalType
+from evalscope.metrics import AverageAccuracy
+from evalscope.models import ContinuationLogitsModelAdapter
+from evalscope.utils import get_logger, normalize_score
 
 # flake8: noqa
 
 logger = get_logger()
 
-DATASET_ID = 'modelscope/truthful_qa'
 # task_list = ['generation', 'multiple_choice']
-SUBSET_LIST = ['multiple_choice']
 
 
+@Benchmark.register(
+    name='truthful_qa',
+    dataset_id='modelscope/truthful_qa',
+    model_adapter=ContinuationLogitsModelAdapter,
+    subset_list=['multiple_choice'],
+    metric_list=[AverageAccuracy],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='validation',
+)
 class TruthfulQaAdapter(DataAdapter):
     """
     Adapter for TruthfulQA benchmark.
@@ -40,35 +50,14 @@ class TruthfulQaAdapter(DataAdapter):
                  'Q: Where were the 1992 Olympics held?\n'
                  'A: The 1992 Olympics were held in Barcelona, Spain.')
 
-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = None,
-                 eval_split: str = 'validation',
-                 **kwargs):
-
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-        if few_shot_num is None:
-            logger.info(f'Set 0-shot examples by system for TruthfulQA.')
-            few_shot_num = 0
+    def __init__(self, **kwargs):
 
+        few_shot_num = kwargs.get('few_shot_num', 0)
         if few_shot_num != 0:
             logger.warning(f'few_shot_num should be 0 for TruthfulQA, but got {few_shot_num}. Use 0-shot by default.')
-            few_shot_num = 0
+            kwargs['few_shot_num'] = 0
 
-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            **kwargs)
+        super().__init__(**kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -215,7 +204,7 @@ class TruthfulQaAdapter(DataAdapter):
         # TODO: generation sub-task to be added
         return {'mc1_labels': input_d['mc1_targets']['labels'], 'mc2_labels': input_d['mc2_targets']['labels']}
 
-    def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str =
+    def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> list:
         """
         Parse the model output to get the answer.
 
@@ -227,11 +216,11 @@ class TruthfulQaAdapter(DataAdapter):
         Returns:
             The predicted answer.
         """
-        if eval_type ==
+        if eval_type == EvalType.CHECKPOINT:
             return result
-        elif eval_type ==
+        elif eval_type == EvalType.SERVICE:  # TODO: to be supported !
             return result
-        elif eval_type ==
+        elif eval_type == EvalType.CUSTOM:  # TODO: to be supported !
             return result
         else:
             raise ValueError(f'Invalid eval_type: {eval_type}')
@@ -270,7 +259,7 @@ class TruthfulQaAdapter(DataAdapter):
 
         return {'multiple_choice': {'mc1': mc1(mc1_lls), 'mc2': mc2(mc2_lls)}}  # or {'generation': xxx}
 
-    def compute_metric(self, review_res_list: List[dict]) ->
+    def compute_metric(self, review_res_list: List[dict]) -> List[dict]:
         """
         Compute evaluation result by specific metric for each subset.
 
@@ -295,56 +284,8 @@ class TruthfulQaAdapter(DataAdapter):
                 logger.error(f'** Unknown review_res: {review_res_d}')
 
         # To get mc2 score
-
-
-
-
-
-        Generate the report for the model output.
-
-        Args:
-            subset_score_map: {subset_name: (score, num), ...}
-            report_name: The user-defined report name.
-
-        Returns:
-        {
-            "name":"TruthfulQA",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.2527,
-                    "subset":[
-                        {
-                            "name":"multiple_choice",
-                            "score":0.3157
-                        },
-                        # {
-                        #     "name":"generation",
-                        #     "score":0.2631
-                        # }
-                    ]
-                }
-            ],
-            "total_num":100
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-        cate_avg_list = [{
-            'name': subset_name,
-            'score': normalize_score(score=score)
-        } for subset_name, (score, _) in subset_score_map.items()]
-
-        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'truthful_qa',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
+        return [{
+            'metric_name': self.metric_list[0].name,
+            'score': self.metric_list[0].object(mc2_list),
+            'num': len(mc2_list)
+        }]
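Note the new return shape of compute_metric: instead of a single float that fed a bespoke gen_report, it now returns a list of per-metric dicts, and metric_list entries are named metric objects with .name and .object attributes rather than {'name': ..., 'object': ...} dicts. The sketch below uses a namedtuple as a stand-in for AverageAccuracy, whose real definition (imported from evalscope.metrics) is not part of this diff:

from collections import namedtuple

# Hypothetical stand-in for the named metric objects referenced above.
Metric = namedtuple('Metric', ['name', 'object'])
average_accuracy = Metric(name='AverageAccuracy', object=lambda scores: sum(scores) / len(scores))

mc2_list = [0.42, 0.58, 0.73]  # made-up per-sample mc2 scores
result = [{
    'metric_name': average_accuracy.name,       # was self.metric_list[0]['name'] in 0.8.0
    'score': average_accuracy.object(mc2_list),
    'num': len(mc2_list),
}]
print(result)  # [{'metric_name': 'AverageAccuracy', 'score': 0.576..., 'num': 3}]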
evalscope/cli/cli.py
CHANGED
@@ -2,6 +2,7 @@
 
 import argparse
 
+from evalscope.cli.start_app import StartAppCMD
 from evalscope.cli.start_eval import EvalCMD
 from evalscope.cli.start_perf import PerfBenchCMD
 
@@ -12,6 +13,7 @@ def run_cmd():
 
     PerfBenchCMD.define_args(subparsers)
     EvalCMD.define_args(subparsers)
+    StartAppCMD.define_args(subparsers)
 
     args = parser.parse_args()
 
evalscope/cli/start_app.py
ADDED
@@ -0,0 +1,30 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+from argparse import ArgumentParser
+
+from evalscope.cli.base import CLICommand
+from evalscope.report.app import add_argument, create_app
+
+
+def subparser_func(args):
+    """ Function which will be called for a specific sub parser.
+    """
+    return StartAppCMD(args)
+
+
+class StartAppCMD(CLICommand):
+    name = 'app'
+
+    def __init__(self, args):
+        self.args = args
+
+    @staticmethod
+    def define_args(parsers: ArgumentParser):
+        """ define args for create pipeline template command.
+        """
+        parser = parsers.add_parser(StartAppCMD.name)
+        add_argument(parser)
+        parser.set_defaults(func=subparser_func)
+
+    def execute(self):
+        create_app(self.args)