evalscope 0.13.1__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of evalscope might be problematic.
- evalscope/arguments.py +1 -1
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +21 -5
- evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
- evalscope/backend/rag_eval/utils/embedding.py +49 -3
- evalscope/backend/rag_eval/utils/llm.py +8 -9
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +109 -0
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +120 -0
- evalscope/benchmarks/arena_hard/utils.py +162 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +2 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -1
- evalscope/benchmarks/data_adapter.py +30 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -12
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -5
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +1 -3
- evalscope/benchmarks/live_code_bench/testing_util.py +365 -549
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +5 -7
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +182 -0
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +2 -5
- evalscope/collections/evaluator.py +4 -2
- evalscope/config.py +2 -2
- evalscope/metrics/llm_judge.py +1 -1
- evalscope/models/chat_adapter.py +32 -11
- evalscope/perf/arguments.py +30 -9
- evalscope/perf/benchmark.py +57 -103
- evalscope/perf/http_client.py +2 -3
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +4 -2
- evalscope/perf/plugin/datasets/custom.py +4 -1
- evalscope/perf/plugin/datasets/line_by_line.py +4 -1
- evalscope/perf/plugin/datasets/longalpaca.py +4 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -1
- evalscope/perf/plugin/datasets/random_dataset.py +13 -6
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/utils/benchmark_util.py +12 -6
- evalscope/perf/utils/db_util.py +3 -3
- evalscope/perf/utils/log_utils.py +41 -0
- evalscope/report/app.py +11 -11
- evalscope/run.py +7 -0
- evalscope/summarizer.py +2 -1
- evalscope/utils/utils.py +36 -25
- evalscope/version.py +2 -2
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/METADATA +21 -55
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/RECORD +70 -62
- tests/cli/test_all.py +36 -27
- tests/cli/test_collection.py +2 -1
- tests/cli/test_run.py +38 -20
- tests/perf/test_perf.py +1 -2
- tests/rag/test_clip_benchmark.py +0 -1
- tests/rag/test_mteb.py +37 -8
- tests/rag/test_ragas.py +33 -27
- tests/vlm/test_vlmeval.py +37 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/LICENSE +0 -0
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/WHEEL +0 -0
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/maritime_bench/__init__.py
File without changes

evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py
@@ -0,0 +1,79 @@
+from typing import Any
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType, OutputType
+from evalscope.metrics import exact_match
+from evalscope.utils.utils import ResponseParser
+
+SUBSET_LIST = ['default']
+
+
+@Benchmark.register(
+    name='maritime_bench',
+    pretty_name='MaritimeBench',
+    dataset_id='HiDolphin/MaritimeBench',
+    model_adapter=OutputType.GENERATION,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
+    subset_list=SUBSET_LIST,
+    metric_list=['AverageAccuracy'],
+    eval_split='test',
+    prompt_template=
+    '题目来自于{subset_name}请回答单选题。要求只输出选项,不输出解释,将选项放在<>里,直接输出答案。示例:\n\n题目:在船舶主推进动力装置中,传动轴系在运转中承受以下复杂的应力和负荷,但不包括______。\n选项:\nA. 电磁力\nB. 压拉应力\nC. 弯曲应力\nD. 扭应力\n答:<A> 当前题目\n {query}', # noqa: E501
+)
+class MaritimeBenchAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.choices = ['A', 'B', 'C', 'D']
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+
+        prefix = ''
+        query = prefix + input_d['question'] + '\n'
+        available_choices = []
+        for option in self.choices:
+            if option in input_d and input_d[option]:
+                query += option + ':' + input_d[option] + '\n'
+                available_choices.append(option)
+
+        full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
+        return self.gen_prompt_data(full_prompt, choices=available_choices)
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+
+        Args:
+            input_d: input raw data. Depending on the dataset.
+
+        Returns:
+            The parsed input. e.g. gold answer ... Depending on the dataset.
+        """
+        return input_d['answer']
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the raw model prediction (pred).
+
+        Args:
+            pred: model prediction. Depending on the model.
+
+        Returns:
+            The parsed prediction. e.g. model answer... Depending on the model.
+        """
+
+        return ResponseParser.parse_bracketed_answer(result, options=self.choices)
+
+    def match(self, gold: Any, pred: Any) -> Any:
+        """
+        Match the gold answer with the predicted answer.
+
+        Args:
+            gold: The gold answer.
+            pred: The predicted answer.
+
+        Returns:
+            The result of the match.
+        """
+        return exact_match(gold=gold, pred=pred)
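For reference, a minimal usage sketch of the newly registered benchmark through evalscope's Python entry points. The run_task/TaskConfig names and the model id below are assumptions for illustration and are not taken from this diff.

# Minimal sketch, assuming evalscope's public TaskConfig/run_task API.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',   # placeholder model id
    datasets=['maritime_bench'],        # name registered by @Benchmark.register above
    limit=10,                           # small smoke-test run
)
run_task(task_cfg)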
evalscope/benchmarks/mmlu/mmlu_adapter.py
CHANGED
@@ -145,7 +145,7 @@ SUBJECT_MAPPING = {
     train_split='train',
     eval_split='test',
     prompt_template=
-
+    """Answer the following multiple choice question about {subset_name}. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.\n\n{query}""", # noqa: E501
 )
 class MMLUAdapter(DataAdapter):
 
@@ -224,9 +224,8 @@ class MMLUAdapter(DataAdapter):
 
         context: str = '\n'.join(few_shot_prompts) + '\n'
         context += self._generate_prompt(input_d=input_d, include_answer=False)
-        query = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
 
-        full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=
+        full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=context.strip())
 
         return self.gen_prompt_data(full_prompt)
 
@@ -249,7 +248,7 @@ class MMLUAdapter(DataAdapter):
         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
         else:
-            return ResponseParser.parse_first_option(result)
+            return ResponseParser.parse_first_option(result, options=self.choices)
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
@@ -260,11 +259,10 @@ class MMLUAdapter(DataAdapter):
 
         example: str = input_d['input']
         for j in range(len(self.choices)):
-            example += '\n{
+            example += f'\n{self.choices[j]}) {input_choices[j]}'
 
-        example += '\nAnswer:'
         if include_answer:
-            example +=
+            example += f"\nAnswer: {input_d['target']}\n\n"
 
         return example
 
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py
CHANGED
@@ -92,7 +92,7 @@ class MMLUProAdapter(DataAdapter):
         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
         else:
-            return ResponseParser.parse_first_option(result)
+            return ResponseParser.parse_first_option(result, options=self.choices)
 
     def match(self, gold: str, pred: str) -> float:
         """
evalscope/benchmarks/mmlu_redux/__init__.py
File without changes

evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py
@@ -0,0 +1,182 @@
+from collections import defaultdict
+from typing import Any, Dict
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType, OutputType
+from evalscope.metrics import exact_match
+from evalscope.utils.logger import get_logger
+from evalscope.utils.utils import ResponseParser
+
+logger = get_logger()
+
+SUBSET_LIST = [
+    'abstract_algebra', 'anatomy', 'astronomy', 'business_ethics', 'clinical_knowledge', 'college_biology',
+    'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_medicine', 'college_physics',
+    'computer_security', 'conceptual_physics', 'econometrics', 'electrical_engineering', 'elementary_mathematics',
+    'formal_logic', 'global_facts', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science',
+    'high_school_european_history', 'high_school_geography', 'high_school_government_and_politics',
+    'high_school_macroeconomics', 'high_school_mathematics', 'high_school_microeconomics', 'high_school_physics',
+    'high_school_psychology', 'high_school_statistics', 'high_school_us_history', 'high_school_world_history',
+    'human_aging', 'human_sexuality', 'international_law', 'jurisprudence', 'logical_fallacies', 'machine_learning',
+    'management', 'marketing', 'medical_genetics', 'miscellaneous', 'moral_disputes', 'moral_scenarios', 'nutrition',
+    'philosophy', 'prehistory', 'professional_accounting', 'professional_law', 'professional_medicine',
+    'professional_psychology', 'public_relations', 'security_studies', 'sociology', 'us_foreign_policy', 'virology',
+    'world_religions'
+]
+
+SUBJECT_MAPPING = {
+    'abstract_algebra': ['Abstract Algebra', 'math', 'STEM'],
+    'anatomy': ['Anatomy', 'health', 'Other'],
+    'astronomy': ['Astronomy', 'physics', 'STEM'],
+    'business_ethics': ['Business Ethics', 'business', 'Other'],
+    'clinical_knowledge': ['Clinical Knowledge', 'health', 'Other'],
+    'college_biology': ['College Biology', 'biology', 'STEM'],
+    'college_chemistry': ['College Chemistry', 'chemistry', 'STEM'],
+    'college_computer_science': ['College Computer Science', 'computer science', 'STEM'],
+    'college_mathematics': ['College Mathematics', 'math', 'STEM'],
+    'college_medicine': ['College Medicine', 'health', 'Other'],
+    'college_physics': ['College Physics', 'physics', 'STEM'],
+    'computer_security': ['Computer Security', 'computer science', 'STEM'],
+    'conceptual_physics': ['Conceptual Physics', 'physics', 'STEM'],
+    'econometrics': ['Econometrics', 'economics', 'Social Science'],
+    'electrical_engineering': ['Electrical Engineering', 'engineering', 'STEM'],
+    'elementary_mathematics': ['Elementary Mathematics', 'math', 'STEM'],
+    'formal_logic': ['Formal Logic', 'philosophy', 'Humanities'],
+    'global_facts': ['Global Facts', 'other', 'Other'],
+    'high_school_biology': ['High School Biology', 'biology', 'STEM'],
+    'high_school_chemistry': ['High School Chemistry', 'chemistry', 'STEM'],
+    'high_school_computer_science': ['High School Computer Science', 'computer science', 'STEM'],
+    'high_school_european_history': ['High School European History', 'history', 'Humanities'],
+    'high_school_geography': ['High School Geography', 'geography', 'Social Science'],
+    'high_school_government_and_politics': ['High School Government And Politics', 'politics', 'Social Science'],
+    'high_school_macroeconomics': ['High School Macroeconomics', 'economics', 'Social Science'],
+    'high_school_mathematics': ['High School Mathematics', 'math', 'STEM'],
+    'high_school_microeconomics': ['High School Microeconomics', 'economics', 'Social Science'],
+    'high_school_physics': ['High School Physics', 'physics', 'STEM'],
+    'high_school_psychology': ['High School Psychology', 'psychology', 'Social Science'],
+    'high_school_statistics': ['High School Statistics', 'math', 'STEM'],
+    'high_school_us_history': ['High School Us History', 'history', 'Humanities'],
+    'high_school_world_history': ['High School World History', 'history', 'Humanities'],
+    'human_aging': ['Human Aging', 'health', 'Other'],
+    'human_sexuality': ['Human Sexuality', 'culture', 'Social Science'],
+    'international_law': ['International Law', 'law', 'Humanities'],
+    'jurisprudence': ['Jurisprudence', 'law', 'Humanities'],
+    'logical_fallacies': ['Logical Fallacies', 'philosophy', 'Humanities'],
+    'machine_learning': ['Machine Learning', 'computer science', 'STEM'],
+    'management': ['Management', 'business', 'Other'],
+    'marketing': ['Marketing', 'business', 'Other'],
+    'medical_genetics': ['Medical Genetics', 'health', 'Other'],
+    'miscellaneous': ['Miscellaneous', 'other', 'Other'],
+    'moral_disputes': ['Moral Disputes', 'philosophy', 'Humanities'],
+    'moral_scenarios': ['Moral Scenarios', 'philosophy', 'Humanities'],
+    'nutrition': ['Nutrition', 'health', 'Other'],
+    'philosophy': ['Philosophy', 'philosophy', 'Humanities'],
+    'prehistory': ['Prehistory', 'history', 'Humanities'],
+    'professional_accounting': ['Professional Accounting', 'other', 'Other'],
+    'professional_law': ['Professional Law', 'law', 'Humanities'],
+    'professional_medicine': ['Professional Medicine', 'health', 'Other'],
+    'professional_psychology': ['Professional Psychology', 'psychology', 'Social Science'],
+    'public_relations': ['Public Relations', 'politics', 'Social Science'],
+    'security_studies': ['Security Studies', 'politics', 'Social Science'],
+    'sociology': ['Sociology', 'culture', 'Social Science'],
+    'us_foreign_policy': ['Us Foreign Policy', 'politics', 'Social Science'],
+    'virology': ['Virology', 'health', 'Other'],
+    'world_religions': ['World Religions', 'philosophy', 'Humanities'],
+}
+
+
+@Benchmark.register(
+    name='mmlu_redux',
+    pretty_name='MMLU-Redux',
+    dataset_id='AI-ModelScope/mmlu-redux-2.0',
+    model_adapter=OutputType.GENERATION,
+    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
+    subset_list=SUBSET_LIST,
+    metric_list=['AverageAccuracy'],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='test',
+    prompt_template=
+    'The following are multiple choice questions (with answers) about {subset_name}. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n{query}', # noqa: E501
+)
+class MMLUReduxAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        if self.few_shot_num > 0:
+            self.few_shot_num = 0
+            logger.warning('Few-shot examples are not supported for MMLU-Redux dataset. Setting few_shot_num to 0.')
+
+        self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
+        self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
+
+    def gen_prompt(self, input_d: Dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+        if self.few_shot_num > 0:
+            prefix = self.format_fewshot_examples(few_shot_list)
+        else:
+            prefix = ''
+        query = prefix + 'Q: ' + input_d['question'] + '\n' + \
+            self.__form_options(input_d['choices']) + '\n'
+
+        full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
+        return self.gen_prompt_data(full_prompt)
+
+    def format_fewshot_examples(self, few_shot_list):
+        # load few-shot prompts for each category
+        prompts = ''
+        for index, d in enumerate(few_shot_list):
+            prompts += 'Q: ' + d['question'] + '\n' + \
+                self.__form_options(d['choices']) + '\n'
+        return prompts
+
+    def __form_options(self, options: list):
+        option_str = 'Options are:\n'
+        for opt, choice in zip(options, self.choices):
+            option_str += f'({choice}): {opt}' + '\n'
+        return option_str
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Parse the raw input labels (gold).
+
+        Args:
+            input_d: input raw data. Depending on the dataset.
+
+        Returns:
+            The parsed input. e.g. gold answer ... Depending on the dataset.
+        """
+        answer_index = int(input_d['answer'])
+        return self.choices[answer_index]
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        """
+        Parse the predicted result and extract proper answer.
+
+        Args:
+            result: Predicted answer from the model. Usually a string for chat.
+            raw_input_d: The raw input. Depending on the dataset.
+            eval_type: 'checkpoint' or 'service' or `custom`, default: 'checkpoint'
+
+        Returns:
+            The parsed answer. Depending on the dataset. Usually a string for chat.
+        """
+        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
+            return result
+        else:
+            return ResponseParser.parse_first_option(result, options=self.choices)
+
+    def match(self, gold: str, pred: str) -> float:
+        """
+        Match the gold answer and the predicted answer.
+
+        Args:
+            gold (Any): The golden answer. Usually a string for chat/multiple-choice-questions.
+                        e.g. 'A', extracted from get_gold_answer method.
+            pred (Any): The predicted answer. Usually a string for chat/multiple-choice-questions.
+                        e.g. 'B', extracted from parse_pred_result method.
+
+        Returns:
+            The match result. Usually a score (float) for chat/multiple-choice-questions.
+        """
+        return exact_match(gold=gold, pred=pred)
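To make the prompt construction above concrete, here is an illustrative re-derivation of gen_prompt/__form_options on a made-up record; the formatting logic mirrors the code in this file, the record itself is not from the dataset.

# Toy record, following the dataset fields used above ('question', 'choices', 'answer').
record = {'question': 'What is 2 + 2?', 'choices': ['3', '4', '5', '22'], 'answer': 1}
choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']

# __form_options: "(A): 3\n(B): 4\n..."; gen_prompt prepends "Q: <question>\n".
options = 'Options are:\n' + ''.join(f'({c}): {o}\n' for o, c in zip(record['choices'], choices))
query = 'Q: ' + record['question'] + '\n' + options + '\n'

template = ('The following are multiple choice questions (with answers) about {subset_name}. '
            'Think step by step and then finish your answer with "the answer is (X)" '
            'where X is the correct letter choice.\n{query}')
full_prompt = template.format(subset_name='elementary_mathematics', query=query)
gold = choices[int(record['answer'])]   # get_gold_answer -> 'B'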
evalscope/benchmarks/musr/musr_adapter.py
CHANGED
@@ -62,7 +62,7 @@ class MuSRAdapter(DataAdapter):
         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
         else:
-            return ResponseParser.parse_first_option(result)
+            return ResponseParser.parse_first_option(result, options=self.choices)
 
     def match(self, gold: str, pred: str) -> float:
         """
evalscope/benchmarks/simple_qa/simple_qa_adapter.py
CHANGED
@@ -126,7 +126,7 @@ class SimpleQAAdapter(DataAdapter):
 
     def match(self, gold: str, pred: str) -> float:
         # simple match
-        logger.warning(f'Please use LLMJudge to match the result for
+        logger.warning(f'Please use LLMJudge to match the result for {self.name}')
         is_correct = 1 if gold.lower().strip() == pred.lower().strip() else 0
         is_incorrect = not is_correct
         is_not_attempted = 0
@@ -159,9 +159,6 @@ class SimpleQAAdapter(DataAdapter):
             review_res_list: [{'is_correct': 1, 'is_incorrect': 0, 'is_not_attempted': 0}, ...]
         """
         # zip dict answers
-        res_dict =
-        for res in review_res_list:
-            for key, value in res.items():
-                res_dict[key].append(value)
+        res_dict = super().compute_dict_metric(review_res_list, **kwargs)
 
         return super().compute_metric(res_dict, **kwargs)
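The removed loop (and, presumably, the compute_dict_metric helper that replaces it) zips the per-sample review dicts into a dict of lists. An illustrative restatement, assuming the truncated "res_dict =" line was a defaultdict(list):

from collections import defaultdict

review_res_list = [
    {'is_correct': 1, 'is_incorrect': 0, 'is_not_attempted': 0},
    {'is_correct': 0, 'is_incorrect': 1, 'is_not_attempted': 0},
]
res_dict = defaultdict(list)   # assumption about the truncated old line
for res in review_res_list:
    for key, value in res.items():
        res_dict[key].append(value)
# -> {'is_correct': [1, 0], 'is_incorrect': [0, 1], 'is_not_attempted': [0, 0]}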
evalscope/collections/evaluator.py
CHANGED
@@ -65,7 +65,7 @@ class EvaluatorCollection:
         self.evaluators = self._initialize_evaluators()
 
     def load(self) -> tuple[list[DatasetEntry], str]:
-        dataset_name = os.path.basename(self.data_adapter.dataset_id)
+        dataset_name = os.path.splitext(os.path.basename(self.data_adapter.dataset_id))[0]
         raw_dataset = self.data_adapter.load()
         # limit the dataset
         if self.task_cfg.limit:
@@ -174,6 +174,7 @@ class EvaluatorCollection:
         os.makedirs(os.path.dirname(report_file_path), exist_ok=True)
         with open(report_file_path, 'w', encoding='utf-8') as f:
             json.dump(report.to_dict(), f, ensure_ascii=False, indent=4)
+        return report
 
     def _filter_answer(self, pred_file_path):
         answer_dict = defaultdict(dict)
@@ -274,4 +275,5 @@ class EvaluatorCollection:
         answers = self.get_answers()
         reviews = self.get_reviews(answers)
         scores = self.get_scores(reviews)
-        self.get_report(scores)
+        report = self.get_report(scores)
+        return report
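The load() change above trims the file extension when deriving the collection's dataset name; a quick standard-library illustration (the path below is hypothetical):

import os

dataset_id = 'outputs/my_collection.jsonl'             # hypothetical collection path
os.path.basename(dataset_id)                           # old behavior -> 'my_collection.jsonl'
os.path.splitext(os.path.basename(dataset_id))[0]      # new behavior -> 'my_collection'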
evalscope/config.py
CHANGED
@@ -75,7 +75,7 @@ class TaskConfig:
 
     # LLMJudge arguments
     judge_strategy: str = JudgeStrategy.AUTO
-    judge_worker_num: int =
+    judge_worker_num: int = 1
     judge_model_args: Optional[Dict] = field(default_factory=lambda: {})
 
     def __post_init__(self):
@@ -212,7 +212,7 @@ def parse_task_config(task_cfg) -> TaskConfig:
         logger.info('Args: Task config is provided with CommandLine type.')
         task_cfg = TaskConfig.from_args(task_cfg)
     elif isinstance(task_cfg, str):
-        extension =
+        extension = os.path.splitext(task_cfg)[-1]
         logger.info(f'Args: Task config is provided with {extension} file type.')
         if extension in ['yaml', 'yml']:
             task_cfg = TaskConfig.from_yaml(task_cfg)
evalscope/metrics/llm_judge.py
CHANGED
@@ -49,7 +49,7 @@ class LLMJudge:
         """
         self.api_key = api_key or os.environ.get('OPENAI_API_KEY', 'EMPTY')
         self.api_url = api_url or os.environ.get('OPENAI_API_BASE', 'https://api.openai.com/v1')
-        self.model_id = model_id or os.environ.get('LOCAL_LLM', 'gpt-
+        self.model_id = model_id or os.environ.get('LOCAL_LLM', 'gpt-4')
         self.system_prompt = system_prompt or os.environ.get('JUDGE_SYSTEM_PROMPT', None)
         self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
         self.generation_config = generation_config
evalscope/models/chat_adapter.py
CHANGED
@@ -1,13 +1,13 @@
 import os
 import time
 import torch
-from typing import List, Union
+from typing import Any, Dict, List, Tuple, Union
 
 from evalscope.constants import OutputType
 from evalscope.models.base_adapter import BaseModelAdapter
 from evalscope.models.local_model import LocalModel
 from evalscope.models.register import register_model_adapter
-from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
+from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage, Usage
 from evalscope.utils.logger import get_logger
 from evalscope.utils.model_utils import fix_do_sample_warning
 
@@ -60,7 +60,10 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
 
         return generation_config
 
-    def _model_generate(self,
+    def _model_generate(self,
+                        queries: List[str],
+                        system_prompts: List[str] = None,
+                        infer_cfg: Dict[str, Any] = None) -> Tuple[List[List[str]], List[int]]:
         """
         Args:
             queries: The input queries.
@@ -69,6 +72,11 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
         Returns:
             The prediction results.
         """
+        if system_prompts is None:
+            system_prompts = []
+        if infer_cfg is None:
+            infer_cfg = {}
+
         # Process infer_cfg
         num_return_sequences = infer_cfg.get('num_return_sequences', 1)
         if num_return_sequences > 1:
@@ -111,7 +119,9 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
         # Run inference
         output_ids = self.model.generate(**inputs, generation_config=self.generation_config)
 
+        # Decode output
         responses = []
+        input_lengths = [len(self.tokenizer.encode(prompt)) for prompt in formatted_prompts]
         for i in range(0, len(output_ids), num_return_sequences):
             query_responses = []
             for j in range(num_return_sequences):
@@ -121,7 +131,7 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
                 query_responses.append(response)
             responses.append(query_responses)
 
-        return responses
+        return responses, input_lengths
 
     @torch.no_grad()
     def predict(self, inputs: List[dict], infer_cfg: dict = {}) -> List[dict]:
@@ -141,22 +151,33 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
             queries.append(input_item['data'][0])
             system_prompts.append(input_item.get('system_prompt', None))
 
-
+        # Run inference
+        responses, input_lengths = self._model_generate(queries, system_prompts, infer_cfg)
 
+        # Process outputs
         results = []
-        for response in responses:
-            choices_list = [
-
+        for response, input_length in zip(responses, input_lengths):
+            choices_list = []
+            completion_tokens = 0
+
+            for index, one_response in enumerate(response):
+                choice = ChatCompletionResponseChoice(
                     index=index, message=ChatMessage(content=one_response, role='assistant'), finish_reason='stop')
-
-
+                choices_list.append(choice)
+
+                completion_tokens += len(self.tokenizer.encode(one_response))
+
+            usage = Usage(
+                prompt_tokens=input_length,
+                completion_tokens=completion_tokens,
+                total_tokens=input_length + completion_tokens)
 
             res_d = ChatCompletionResponse(
                 model=self.model_id,
                 choices=choices_list,
                 object='chat.completion',
                 created=int(time.time()),
-                usage=
+                usage=usage).model_dump(exclude_unset=True)
 
             results.append(res_d)
 
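The predict() path now reports token usage by re-encoding prompts and completions with the model tokenizer. A standalone sketch of the same accounting; the checkpoint name is a placeholder, not something referenced by this diff:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')  # placeholder checkpoint

prompt = 'What is the capital of France?'
completion = 'The capital of France is Paris.'

prompt_tokens = len(tokenizer.encode(prompt))
completion_tokens = len(tokenizer.encode(completion))
usage = {
    'prompt_tokens': prompt_tokens,
    'completion_tokens': completion_tokens,
    'total_tokens': prompt_tokens + completion_tokens,
}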
evalscope/perf/arguments.py
CHANGED
@@ -27,7 +27,7 @@ class Arguments:
     no_test_connection: bool = False # Test the connection before starting the benchmark
 
     # Performance and parallelism
-    number:
+    number: int = 1000 # Number of requests to be made
     parallel: int = 1 # Number of parallel requests
     rate: int = -1 # Rate limit for requests (default: -1, no limit)
 
@@ -35,6 +35,7 @@ class Arguments:
     log_every_n_query: int = 10 # Log every N queries
     debug: bool = False # Debug mode
     wandb_api_key: Optional[str] = None # WandB API key for logging
+    swanlab_api_key: Optional[str] = None # SwanLab API key for logging
     name: Optional[str] = None # Name for the run
 
     # Output settings
@@ -46,6 +47,7 @@ class Arguments:
     prefix_length: int = 0 # Length of the prefix, only for random dataset
     prompt: Optional[str] = None # The prompt text
     query_template: Optional[str] = None # Template for the query
+    apply_chat_template: Optional[bool] = None # Whether to apply chat template
 
     # Dataset settings
     dataset: str = 'openqa' # Dataset type (default: 'line_by_line')
@@ -57,13 +59,14 @@ class Arguments:
     max_tokens: Optional[int] = 2048 # Maximum number of tokens in the response
     min_tokens: Optional[int] = None # Minimum number of tokens in the response
     n_choices: Optional[int] = None # Number of response choices
-    seed: Optional[int] =
+    seed: Optional[int] = 0 # Random seed for reproducibility
     stop: Optional[List[str]] = field(default_factory=list) # Stop sequences for the response
     stop_token_ids: Optional[List[str]] = field(default_factory=list) # Stop token IDs for the response
-    stream: Optional[bool] =
-    temperature:
+    stream: Optional[bool] = True # Whether to stream the response
+    temperature: float = 0.0 # Temperature setting for the response
     top_p: Optional[float] = None # Top-p (nucleus) sampling setting for the response
     top_k: Optional[int] = None # Top-k sampling setting for the response
+    extra_args: Optional[Dict[str, Any]] = None # Extra arguments
 
     @staticmethod
     def from_args(args):
@@ -75,12 +78,26 @@ class Arguments:
         return Arguments(**args_dict)
 
     def __post_init__(self):
+        # Set the default headers
         self.headers = self.headers or {} # Default to empty dictionary
         if self.api_key:
             # Assuming the API key is used as a Bearer token
             self.headers['Authorization'] = f'Bearer {self.api_key}'
+
+        # Set the model ID based on the model name
         self.model_id = os.path.basename(self.model)
 
+        # Set the URL based on the dataset type
+        if self.api.startswith('local'):
+            if self.dataset.startswith('speed_benchmark'):
+                self.url = f'http://127.0.0.1:{self.port}/v1/completions'
+            else:
+                self.url = f'http://127.0.0.1:{self.port}/v1/chat/completions'
+
+        # Set the apply_chat_template flag based on the URL
+        if self.apply_chat_template is None:
+            self.apply_chat_template = self.url.strip('/').endswith('chat/completions')
+
     def __str__(self):
         return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
 
@@ -126,7 +143,7 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--no-test-connection', action='store_false', default=False, help='Do not test the connection before starting the benchmark') # noqa: E501
 
     # Performance and parallelism
-    parser.add_argument('-n', '--number', type=int, default=
+    parser.add_argument('-n', '--number', type=int, default=1000, help='How many requests to be made')
     parser.add_argument('--parallel', type=int, default=1, help='Set number of concurrency requests, default 1')
     parser.add_argument('--rate', type=int, default=-1, help='Number of requests per second. default None')
 
@@ -134,7 +151,8 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--log-every-n-query', type=int, default=10, help='Logging every n query')
    parser.add_argument('--debug', action='store_true', default=False, help='Debug request send')
     parser.add_argument('--wandb-api-key', type=str, default=None, help='The wandb API key')
-    parser.add_argument('--
+    parser.add_argument('--swanlab-api-key', type=str, default=None, help='The swanlab API key')
+    parser.add_argument('--name', type=str, help='The wandb/swanlab db result name and result db name')
 
     # Prompt settings
     parser.add_argument('--max-prompt-length', type=int, default=sys.maxsize, help='Maximum input prompt length')
@@ -142,6 +160,8 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument('--prefix-length', type=int, default=0, help='The prefix length')
     parser.add_argument('--prompt', type=str, required=False, default=None, help='Specified the request prompt')
     parser.add_argument('--query-template', type=str, default=None, help='Specify the query template')
+    parser.add_argument(
+        '--apply-chat-template', type=argparse.BooleanOptionalAction, default=None, help='Apply chat template to the prompt') # noqa: E501
 
     # Output settings
     parser.add_argument('--outputs-dir', help='Outputs dir.', default='outputs')
@@ -158,13 +178,14 @@ def add_argument(parser: argparse.ArgumentParser):
     parser.add_argument(
         '--min-tokens', type=int, help='The minimum number of tokens that can be generated', default=None)
     parser.add_argument('--n-choices', type=int, help='How many completion choices to generate', default=None)
-    parser.add_argument('--seed', type=int, help='The random seed', default=
+    parser.add_argument('--seed', type=int, help='The random seed', default=0)
     parser.add_argument('--stop', nargs='*', help='The stop tokens', default=None)
     parser.add_argument('--stop-token-ids', nargs='*', help='Set the stop token IDs', default=None)
-    parser.add_argument('--stream', action=
-    parser.add_argument('--temperature', type=float, help='The sample temperature', default=
+    parser.add_argument('--stream', action=argparse.BooleanOptionalAction, help='Stream output with SSE', default=True)
+    parser.add_argument('--temperature', type=float, help='The sample temperature', default=0.0)
     parser.add_argument('--top-p', type=float, help='Sampling top p', default=None)
     parser.add_argument('--top-k', type=int, help='Sampling top k', default=None)
+    parser.add_argument('--extra-args', type=json.loads, default='{}', help='Extra arguments, should in JSON format',)
     # yapf: enable
 
 
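Two of the new knobs, restated outside the dataclass for clarity: --extra-args is parsed by argparse via json.loads, and apply_chat_template (when left unset) is derived from the request URL in __post_init__. The JSON payload and URL below are placeholders, not values taken from this diff.

import json

# --extra-args '{"ignore_eos": true}' arrives as a JSON string and becomes a dict:
extra_args = json.loads('{"ignore_eos": true}')
# -> {'ignore_eos': True}, stored on Arguments.extra_args

# apply_chat_template default, mirroring __post_init__ above:
url = 'http://127.0.0.1:8877/v1/chat/completions'   # placeholder local endpoint
apply_chat_template = url.strip('/').endswith('chat/completions')   # -> True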