evalscope 0.8.1__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +2 -0
- evalscope/arguments.py +10 -3
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/__init__.py +20 -1
- evalscope/benchmarks/arc/__init__.py +0 -5
- evalscope/benchmarks/arc/arc_adapter.py +23 -99
- evalscope/benchmarks/bbh/__init__.py +0 -4
- evalscope/benchmarks/bbh/bbh_adapter.py +19 -89
- evalscope/benchmarks/benchmark.py +70 -59
- evalscope/benchmarks/ceval/__init__.py +0 -5
- evalscope/benchmarks/ceval/ceval_adapter.py +22 -46
- evalscope/benchmarks/cmmlu/__init__.py +0 -5
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +20 -41
- evalscope/benchmarks/competition_math/__init__.py +0 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
- evalscope/benchmarks/data_adapter.py +114 -85
- evalscope/benchmarks/general_qa/__init__.py +0 -5
- evalscope/benchmarks/general_qa/general_qa_adapter.py +16 -19
- evalscope/benchmarks/gsm8k/__init__.py +0 -4
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +19 -98
- evalscope/benchmarks/hellaswag/__init__.py +0 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -96
- evalscope/benchmarks/humaneval/__init__.py +0 -4
- evalscope/benchmarks/humaneval/humaneval_adapter.py +16 -117
- evalscope/benchmarks/mmlu/__init__.py +0 -5
- evalscope/benchmarks/mmlu/mmlu_adapter.py +26 -48
- evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
- evalscope/benchmarks/race/__init__.py +0 -5
- evalscope/benchmarks/race/race_adapter.py +25 -53
- evalscope/benchmarks/trivia_qa/__init__.py +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +24 -97
- evalscope/benchmarks/truthful_qa/__init__.py +0 -5
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +23 -33
- evalscope/collections/__init__.py +3 -0
- evalscope/collections/evaluator.py +178 -0
- evalscope/collections/sampler.py +132 -0
- evalscope/collections/schema.py +122 -0
- evalscope/config.py +10 -6
- evalscope/constants.py +7 -28
- evalscope/evaluator/evaluator.py +66 -108
- evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
- evalscope/metrics/__init__.py +6 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/math_accuracy.py +193 -50
- evalscope/metrics/metrics.py +7 -4
- evalscope/metrics/rouge_metric.py +13 -8
- evalscope/models/__init__.py +14 -1
- evalscope/models/base_adapter.py +52 -0
- evalscope/models/chat_adapter.py +138 -0
- evalscope/models/choice_adapter.py +211 -0
- evalscope/models/custom_adapter.py +67 -0
- evalscope/models/local_model.py +74 -0
- evalscope/models/model.py +141 -0
- evalscope/models/server_adapter.py +104 -0
- evalscope/perf/arguments.py +1 -0
- evalscope/perf/benchmark.py +1 -1
- evalscope/perf/main.py +3 -1
- evalscope/perf/plugin/api/openai_api.py +51 -47
- evalscope/perf/utils/local_server.py +1 -0
- evalscope/run.py +37 -66
- evalscope/run_arena.py +1 -1
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/chat_service.py +4 -3
- evalscope/utils/io_utils.py +8 -0
- evalscope/utils/logger.py +4 -0
- evalscope/utils/model_utils.py +10 -0
- evalscope/utils/utils.py +3 -25
- evalscope/version.py +2 -2
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/METADATA +46 -17
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/RECORD +81 -92
- tests/cli/test_collection.py +53 -0
- tests/cli/test_run.py +43 -1
- tests/perf/test_perf.py +3 -3
- tests/rag/test_mteb.py +3 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
- evalscope/models/api/__init__.py +0 -3
- evalscope/models/dummy_chat_model.py +0 -49
- evalscope/models/model_adapter.py +0 -525
- evalscope/models/openai_model.py +0 -103
- /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/LICENSE +0 -0
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/WHEEL +0 -0
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/top_level.txt +0 -0
evalscope/models/choice_adapter.py
ADDED
@@ -0,0 +1,211 @@
+import numpy as np
+import time
+import torch
+from typing import List
+
+from evalscope.models.base_adapter import BaseModelAdapter
+from evalscope.models.local_model import LocalModel
+from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
+
+
+class MultiChoiceModelAdapter(BaseModelAdapter):
+    """ The multi-choice model adapter. """
+
+    _DEFAULT_MAX_LENGTH = 2048
+
+    def __init__(self, model: LocalModel, **kwargs):
+        super().__init__(model)
+
+        self._max_length = kwargs.get('max_length')
+
+    @property
+    def max_length(self):
+        if self._max_length:
+            return self._max_length
+        seqlen_config_attrs = ('n_positions', 'max_position_embeddings', 'n_ctx')
+        for attr in seqlen_config_attrs:
+            if hasattr(self.model.config, attr):
+                return getattr(self.model.config, attr)
+        if hasattr(self.tokenizer, 'model_max_length'):
+            if self.tokenizer.model_max_length == 1000000000000000019884624838656:
+                return self._DEFAULT_MAX_LENGTH
+            return self.tokenizer.model_max_length
+        return self._DEFAULT_MAX_LENGTH
+
+    @torch.no_grad()
+    def predict(self, inputs: dict, infer_cfg: dict = None) -> dict:
+        """
+        Multi-choice model prediction func.
+
+        Args:
+            inputs (dict): The inputs for a doc. Format:
+                {'data': [full_prompt], 'multi_choices': ['A', 'B', 'C', 'D']}
+
+            infer_cfg (dict): inference configuration.
+
+        Returns:
+            res (dict): The model prediction results. Format:
+            {
+              'choices': [
+                {
+                  'index': 0,
+                  'message': {
+                    'content': [-14.9609, -13.6015, ...],  # loglikelihood values for inputs context-continuation pairs.
+                    'role': 'assistant'
+                  }
+                }
+              ],
+              'created': 1677664795,
+              # For models on the ModelScope or HuggingFace, concat model_id and revision with "-".
+              'model': 'gpt-3.5-turbo-0613',
+              'object': 'chat.completion',
+              'usage': {
+                'completion_tokens': 17,
+                'prompt_tokens': 57,
+                'total_tokens': 74
+              }
+            }
+        """
+        infer_cfg = infer_cfg or {}
+        self.model.generation_config.update(**infer_cfg)
+
+        input_data = inputs['data']
+        multi_choices = inputs['multi_choices']
+
+        output, input_info = self._get_logits(self.tokenizer, self.model, input_data)
+        assert output.shape[0] == 1
+        logits = output.flatten()
+
+        choice_logits = [logits[self.tokenizer(ch)['input_ids'][-1:]] for ch in multi_choices]
+        softval = torch.nn.functional.softmax(torch.tensor(choice_logits).float(), dim=0)
+
+        if softval.dtype in {torch.bfloat16, torch.float16}:
+            softval = softval.to(dtype=torch.float32)
+        probs = softval.detach().cpu().numpy()
+        pred: str = multi_choices[int(np.argmax(probs))]  # Format: A or B or C or D
+
+        res_d = ChatCompletionResponse(
+            model=self.model_id,
+            choices=[
+                ChatCompletionResponseChoice(
+                    index=0, message=ChatMessage(content=pred, role='assistant'), finish_reason='stop')
+            ],
+            object='chat.completion',
+            created=int(time.time()),
+            usage=None).model_dump(exclude_unset=True)
+
+        return res_d
+
+    @staticmethod
+    def _get_logits(tokenizer, model, inputs: List[str]):
+        input_ids = tokenizer(inputs, padding=False)['input_ids']
+        input_ids = torch.tensor(input_ids, device=model.device)
+        tokens = {'input_ids': input_ids}
+
+        outputs = model(input_ids)['logits']
+        logits = outputs[:, -1, :]
+        log_probs = torch.nn.functional.softmax(logits, dim=-1)
+        return log_probs, {'tokens': tokens}
+
+
+class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
+    """
+    Continuation-logits model adapter.
+    """
+
+    def __init__(self, model: LocalModel, **kwargs):
+        super().__init__(model, **kwargs)
+
+    @torch.no_grad()
+    def predict(self, inputs: dict, infer_cfg: dict = None) -> dict:
+        """
+        Multi-choice model prediction func.
+        Args:
+            inputs (dict): The inputs for a doc. Format:
+                {'data': [(context, continuation), ...]}
+            infer_cfg (dict): inference configuration.
+        Returns:
+            res (dict): The model prediction results. Format:
+            {
+              'choices': [
+                {
+                  'index': 0,
+                  'message': {
+                    'content': [-14.9609, -13.6015, ...],  # loglikelihood values for inputs context-continuation pairs.
+                    'role': 'assistant'
+                  }
+                }
+              ],
+              'created': 1677664795,
+              # For models on the ModelScope or HuggingFace, concat model_id and revision with "-".
+              'model': 'gpt-3.5-turbo-0613',
+              'object': 'chat.completion',
+              'usage': {
+                'completion_tokens': 17,
+                'prompt_tokens': 57,
+                'total_tokens': 74
+              }
+            }
+        """
+        infer_cfg = infer_cfg or {}
+
+        pred_list: list = self.loglikelihood(inputs=inputs['data'], infer_cfg=infer_cfg)
+
+        res_d = ChatCompletionResponse(
+            model=self.model_id,
+            choices=[{
+                'index': 0,
+                'message': {
+                    'content': pred_list,
+                    'role': 'assistant'
+                }
+            }],
+            object='chat.completion',
+            created=int(time.time()),
+            usage=None).model_dump(exclude_unset=True)
+
+        return res_d
+
+    def loglikelihood(self, inputs: list, infer_cfg: dict = None) -> list:
+        self.model.generation_config.update(**infer_cfg)
+        # To predict one doc
+        doc_ele_pred = []
+        for ctx, continuation in inputs:
+
+            # ctx_enc shape: [context_tok_len]  cont_enc shape: [continuation_tok_len]
+            ctx_enc, cont_enc = self._encode_pair(ctx, continuation)
+
+            inputs_tokens = torch.tensor(
+                (ctx_enc.tolist() + cont_enc.tolist())[-(self.max_length + 1):][:-1],
+                dtype=torch.long,
+                device=self.model.device).unsqueeze(0)
+
+            logits = self.model(inputs_tokens)[0]
+            logits = torch.nn.functional.log_softmax(logits.float(), dim=-1)
+
+            logits = logits[:, -len(cont_enc):, :]
+            cont_enc = cont_enc.unsqueeze(0).unsqueeze(-1)
+            logits = torch.gather(logits.cpu(), 2, cont_enc.cpu()).squeeze(-1)
+
+            choice_score = float(logits.sum())
+            doc_ele_pred.append(choice_score)
+
+        # e.g. [-2.3, -9.2, -12.9, 1.1], length=len(choices)
+        return doc_ele_pred
+
+    def _encode_pair(self, context, continuation):
+        n_spaces = len(context) - len(context.rstrip())
+        if n_spaces > 0:
+            continuation = context[-n_spaces:] + continuation
+            context = context[:-n_spaces]
+
+        whole_enc = self.tokenizer(context + continuation, padding=False)['input_ids']
+        whole_enc = torch.tensor(whole_enc, device=self.device)
+
+        context_enc = self.tokenizer(context, padding=False)['input_ids']
+        context_enc = torch.tensor(context_enc, device=self.device)
+
+        context_enc_len = len(context_enc)
+        continuation_enc = whole_enc[context_enc_len:]
+
+        return context_enc, continuation_enc
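As a quick orientation to the new adapter API, the sketch below (not part of the diff) drives MultiChoiceModelAdapter with the inputs format documented in its docstring. It assumes the new BaseModelAdapter (base_adapter.py, not shown in this section) exposes tokenizer and model_id from the wrapped LocalModel; the checkpoint id is a placeholder.

# Usage sketch, assuming BaseModelAdapter wires tokenizer/model_id from LocalModel.
from evalscope.models.local_model import LocalModel
from evalscope.models.choice_adapter import MultiChoiceModelAdapter

base = LocalModel(model_id='Qwen/Qwen2-0.5B-Instruct')  # placeholder checkpoint id
adapter = MultiChoiceModelAdapter(model=base, max_length=2048)

inputs = {
    'data': ['Question: 1+1=?\nA. 1\nB. 2\nC. 3\nD. 4\nAnswer:'],
    'multi_choices': ['A', 'B', 'C', 'D'],
}
res = adapter.predict(inputs, infer_cfg={})
print(res['choices'][0]['message']['content'])  # e.g. 'B'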
evalscope/models/custom_adapter.py
ADDED
@@ -0,0 +1,67 @@
+from typing import Any, Dict, List, Union
+
+from evalscope.models.base_adapter import BaseModelAdapter
+from evalscope.models.custom import CustomModel
+
+
+class CustomModelAdapter(BaseModelAdapter):
+
+    def __init__(self, custom_model: CustomModel, **kwargs):
+        """
+        Custom model adapter.
+
+        Args:
+            custom_model: The custom model instance.
+            **kwargs: Other args.
+        """
+        self.custom_model = custom_model
+        super(CustomModelAdapter, self).__init__(model=custom_model)
+
+    def predict(self, inputs: Union[str, dict, list], **kwargs) -> List[Dict[str, Any]]:
+        """
+        Model prediction func.
+
+        Args:
+            inputs (Union[str, dict, list]): The input data. Depending on the specific model.
+                str: 'xxx'
+                dict: {'data': [full_prompt]}
+                list: ['xxx', 'yyy', 'zzz']
+            **kwargs: kwargs
+
+        Returns:
+            res (dict): The model prediction results. Format:
+            {
+              'choices': [
+                {
+                  'index': 0,
+                  'message': {
+                    'content': 'xxx',
+                    'role': 'assistant'
+                  }
+                }
+              ],
+              'created': 1677664795,
+              'model': 'gpt-3.5-turbo-0613',  # should be model_id
+              'object': 'chat.completion',
+              'usage': {
+                'completion_tokens': 17,
+                'prompt_tokens': 57,
+                'total_tokens': 74
+              }
+            }
+        """
+        in_prompts = []
+
+        # Note: here we assume the inputs are all prompts for the benchmark.
+        for input_prompt in inputs:
+            if isinstance(input_prompt, str):
+                in_prompts.append(input_prompt)
+            elif isinstance(input_prompt, dict):
+                # TODO: to be supported for continuation list like truthful_qa
+                in_prompts.append(input_prompt['data'][0])
+            elif isinstance(input_prompt, list):
+                in_prompts.append('\n'.join(input_prompt))
+            else:
+                raise TypeError(f'Unsupported inputs type: {type(input_prompt)}')
+
+        return self.custom_model.predict(prompts=in_prompts, **kwargs)
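A minimal sketch (not part of the diff) of plugging a user-defined model into CustomModelAdapter. Only predict(prompts=...) is confirmed by the adapter code above; the CustomModel constructor and its config keys are assumptions.

# Usage sketch; CustomModel's __init__(config=...) signature is an assumption.
from evalscope.models.custom import CustomModel
from evalscope.models.custom_adapter import CustomModelAdapter

class EchoModel(CustomModel):

    def __init__(self):
        super().__init__(config={'model_id': 'echo-model'})  # config keys assumed

    def predict(self, prompts, **kwargs):
        # Return one OpenAI-style completion dict per prompt, as the adapter expects.
        return [{
            'choices': [{'index': 0, 'message': {'content': p, 'role': 'assistant'}}],
            'model': 'echo-model',
            'object': 'chat.completion',
            'usage': {},
        } for p in prompts]

adapter = CustomModelAdapter(custom_model=EchoModel())
print(adapter.predict(['hello', {'data': ['full prompt']}]))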
evalscope/models/local_model.py
ADDED
@@ -0,0 +1,74 @@
+import torch
+from typing import TYPE_CHECKING, Optional
+
+from evalscope.constants import DEFAULT_MODEL_CACHE_DIR, DEFAULT_MODEL_REVISION, EvalType
+from evalscope.utils.logger import get_logger
+
+if TYPE_CHECKING:
+    from evalscope.config import TaskConfig
+
+logger = get_logger()
+
+
+class LocalModel:
+
+    def __init__(self,
+                 model_id: str,
+                 model_revision: str = DEFAULT_MODEL_REVISION,
+                 device_map: str = 'auto',
+                 torch_dtype: str = 'auto',
+                 cache_dir: str = None,
+                 **kwargs):
+        from modelscope import AutoModelForCausalLM, AutoTokenizer
+
+        model_cache_dir = cache_dir or DEFAULT_MODEL_CACHE_DIR
+
+        if isinstance(torch_dtype, str) and torch_dtype != 'auto':
+            torch_dtype = eval(torch_dtype)
+
+        self.model_id = model_id
+        self.model_revision = model_revision
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            self.model_id,
+            revision=model_revision,
+            trust_remote_code=True,
+            cache_dir=model_cache_dir,
+        )
+
+        self.model = AutoModelForCausalLM.from_pretrained(
+            self.model_id,
+            revision=model_revision,
+            device_map=device_map,
+            trust_remote_code=True,
+            torch_dtype=torch_dtype,
+            cache_dir=model_cache_dir,
+        )
+
+        self.model_cfg = {
+            'model_id': model_id,
+            'device_map': device_map,
+            'torch_dtype': str(torch_dtype),
+        }
+
+
+def get_local_model(task_cfg: 'TaskConfig') -> Optional[LocalModel]:
+    """Get the base local model for the task. If the task is not checkpoint-based, return None.
+    Avoids loading model multiple times for different datasets.
+    """
+    if task_cfg.eval_type != EvalType.CHECKPOINT:
+        return None
+    else:
+        device_map = task_cfg.model_args.get('device_map', 'auto')
+        cache_dir = task_cfg.model_args.get('cache_dir', None)
+        model_precision = task_cfg.model_args.get('precision', 'torch.float16')
+        model_revision = task_cfg.model_args.get('revision', DEFAULT_MODEL_REVISION)
+
+        base_model = LocalModel(
+            model_id=task_cfg.model,
+            model_revision=model_revision,
+            device_map=device_map,
+            torch_dtype=model_precision,
+            cache_dir=cache_dir)
+        return base_model
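A sketch (not part of the diff) of the stated purpose of get_local_model: load the checkpoint once and share it between adapters. Constructing TaskConfig with these keyword fields is an assumption; the model id is a placeholder.

# Usage sketch, assuming TaskConfig accepts model/model_args/eval_type as keyword fields.
from evalscope.config import TaskConfig
from evalscope.constants import EvalType
from evalscope.models.choice_adapter import MultiChoiceModelAdapter
from evalscope.models.local_model import get_local_model

task_cfg = TaskConfig(
    model='Qwen/Qwen2-0.5B-Instruct',            # placeholder checkpoint id
    model_args={'precision': 'torch.float16', 'device_map': 'auto'},
    eval_type=EvalType.CHECKPOINT,
)

base_model = get_local_model(task_cfg)            # returns None for non-checkpoint eval types
adapter = MultiChoiceModelAdapter(model=base_model)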
evalscope/models/model.py
CHANGED
@@ -1,7 +1,14 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import os
+import random
+import time
 from abc import ABC, abstractmethod
 from typing import Any
 
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
 
 class BaseModel(ABC):
 
@@ -86,3 +93,137 @@ class ChatBaseModel(BaseModel):
         }
         """
         raise NotImplementedError
+
+
+class OpenAIModel(ChatBaseModel):
+    """
+    APIs of OpenAI models.
+    Available models: gpt-3.5-turbo, gpt-4
+    """
+
+    MAX_RETRIES = 3
+
+    def __init__(self, model_cfg: dict, **kwargs):
+        super(OpenAIModel, self).__init__(model_cfg=model_cfg, **kwargs)
+
+        openai_api_key = os.environ.get('OPENAI_API_KEY', None)
+        self.api_key = self.model_cfg.get('api_key', openai_api_key)
+
+        if not self.api_key:
+            logger.error('OpenAI API key is not provided, please set it in environment variable OPENAI_API_KEY')
+            # raise ValueError(
+            #     'OpenAI API key is not provided, '
+            #     'please set it in environment variable OPENAI_API_KEY')
+
+    def predict(self, model_id: str, inputs: dict, **kwargs) -> dict:
+
+        sys_prompt: str = inputs.get('sys_prompt', '')
+        user_prompt: str = inputs.get('user_prompt', '')
+
+        # model_id: str = kwargs.get('model_id', '')
+        temperature: float = kwargs.pop('temperature', 0.2)
+        max_tokens: int = kwargs.pop('max_tokens', 1024)
+        mode: str = kwargs.pop('mode', 'chat.completion')
+
+        logger.info(f'Using OpenAI model_id: {model_id}')
+
+        res = self._predict(
+            model_id=model_id,
+            sys_prompt=sys_prompt,
+            user_prompt=user_prompt,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            mode=mode)
+
+        return res
+
+    def _predict(
+        self,
+        model_id,
+        sys_prompt,
+        user_prompt,
+        temperature,
+        max_tokens,
+        mode: str = 'chat.completion',
+    ) -> dict:
+        import openai
+
+        res = {}
+        openai.api_key = self.api_key
+
+        for i in range(self.MAX_RETRIES):
+            try:
+                if mode == 'chat.completion':
+                    resp = openai.ChatCompletion.create(
+                        model=model_id,
+                        messages=[{
+                            'role': 'system',
+                            'content': sys_prompt
+                        }, {
+                            'role': 'user',
+                            'content': user_prompt
+                        }],
+                        temperature=temperature,
+                        max_tokens=max_tokens)
+
+                    if resp:
+                        ans_text = resp['choices'][0]['message']['content']
+                        model_id = resp['model']
+                    else:
+                        logger.warning(f'OpenAI GPT API call failed: got empty response '
+                                       f'for input {sys_prompt} {user_prompt}')
+                        ans_text = ''
+                        model_id = ''
+
+                    res['ans_text'] = ans_text
+                    res['model_id'] = model_id
+                else:
+                    raise ValueError(f'Invalid mode: {mode}')
+
+                return res
+
+            except Exception as e:
+                logger.warning(f'OpenAI API call failed: {e}')
+                time.sleep(3)
+        logger.error(f'OpenAI API call failed after {self.MAX_RETRIES} retries')
+        return res
+
+
+class DummyChatModel(ChatBaseModel):
+
+    MODEL_ID = 'dummy_chat_model_0801'
+    REVISION = 'v1.0.0'
+
+    def __init__(self, model_cfg: dict, **kwargs):
+        model_cfg['model_id'] = self.MODEL_ID
+        model_cfg['revision'] = self.REVISION
+        super(DummyChatModel, self).__init__(model_cfg=model_cfg)
+
+    def predict(self, inputs: dict, **kwargs) -> dict:
+
+        debug: bool = False
+        if debug:
+            messages = inputs['messages']
+            history = inputs['history']
+
+            logger.info(f'** messages: {messages}')
+            logger.info(f'** history: {history}')
+
+        choice = random.choice(['A', 'B', 'C', 'D'])
+
+        # Build response
+        res = {
+            'choices': [{
+                'index': 0,
+                'message': {
+                    'content': choice,
+                    'role': 'assistant'
+                }
+            }],
+            'created': time.time(),
+            'model': self.MODEL_ID + '-' + self.REVISION,
+            'object': 'chat.completion',
+            'usage': {}
+        }
+
+        return res
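The OpenAIModel and DummyChatModel classes above replace the removed openai_model.py and dummy_chat_model.py. The following sketch (not part of the diff) calls OpenAIModel with the sys_prompt/user_prompt inputs dict it reads; it assumes ChatBaseModel stores model_cfg, that OPENAI_API_KEY is set, and that the legacy openai<1.0 SDK implied by openai.ChatCompletion.create is installed.

# Usage sketch, under the assumptions stated above.
from evalscope.models.model import OpenAIModel

model = OpenAIModel(model_cfg={})  # falls back to the OPENAI_API_KEY env var
res = model.predict(
    model_id='gpt-3.5-turbo',
    inputs={'sys_prompt': 'You are a grader.', 'user_prompt': 'Score this answer: ...'},
    temperature=0.2,
    max_tokens=256)
print(res.get('ans_text', ''))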
evalscope/models/server_adapter.py
ADDED
@@ -0,0 +1,104 @@
+import requests
+import time
+from typing import Optional, Union
+
+from evalscope.models.base_adapter import BaseModelAdapter
+from evalscope.utils.chat_service import ChatMessage
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class ServerModelAdapter(BaseModelAdapter):
+    """
+    Server model adapter to request remote API model and generate results.
+    """
+
+    def __init__(self, api_url: str, model_id: str, api_key: str = 'EMPTY', **kwargs):
+        """
+        Args:
+            api_url: The URL of the remote API model.
+            model_id: The ID of the remote API model.
+            api_key: The API key of the remote API model.
+        """
+        self.api_url = api_url
+        self.model_id = model_id
+        self.api_key = api_key
+        self.seed = kwargs.get('seed', None)
+        self.model_cfg = {'api_url': api_url, 'model_id': model_id, 'api_key': api_key}
+        super().__init__(model=None, model_cfg=self.model_cfg, **kwargs)
+
+    def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = None) -> dict:
+        """
+        Model prediction func.
+
+        Args:
+            inputs (Union[str, dict, list]): The input data.
+            infer_cfg (dict): Inference configuration.
+
+        Returns:
+            res (dict): The model prediction results.
+        """
+        infer_cfg = infer_cfg or {}
+
+        # Process inputs
+        if isinstance(inputs, str):
+            query = inputs
+            system_prompt = None
+        elif isinstance(inputs, dict):
+            # TODO: to be supported for continuation list like truthful_qa
+            query = inputs['data'][0]
+            system_prompt = inputs.get('system_prompt', None)
+        elif isinstance(inputs, list):
+            query = '\n'.join(inputs)
+            system_prompt = None
+        else:
+            raise TypeError(f'Unsupported inputs type: {type(inputs)}')
+
+        content = self.make_request_content(query, system_prompt)
+        request_json = self.make_request(content, infer_cfg)
+        response = self.send_request(request_json)
+        return response
+
+    def make_request_content(self, query: str, system_prompt: Optional[str] = None) -> dict:
+        """
+        Make request content for API.
+        """
+        if system_prompt is not None:
+            messages = [
+                ChatMessage(role='system', content=system_prompt).model_dump(exclude_unset=True),
+                ChatMessage(role='user', content=query).model_dump(exclude_unset=True)
+            ]
+        else:
+            messages = [ChatMessage(role='user', content=query).model_dump(exclude_unset=True)]
+        return {'messages': messages}
+
+    def make_request(self, content: dict, infer_cfg: dict = {}) -> dict:
+        """Make request to remote API."""
+        # Format request JSON according to OpenAI API format
+        request_json = {
+            **content, 'model': self.model_id,
+            'max_tokens': infer_cfg.get('max_tokens', 2048),
+            'temperature': infer_cfg.get('temperature', 0.0),
+            'top_p': infer_cfg.get('top_p', 1.0),
+            'n': infer_cfg.get('num_return_sequences', 1),
+            'stop': infer_cfg.get('stop', None)
+        }
+        if self.seed is not None:
+            request_json['seed'] = self.seed
+        logger.debug(f'Request to remote API: {request_json}')
+        return request_json
+
+    def send_request(self, request_json: dict, max_retries: int = 3) -> dict:
+        for attempt in range(max_retries):
+            response = requests.post(
+                self.api_url, json=request_json, headers={'Authorization': f'Bearer {self.api_key}'})
+            if response.status_code == 200:
+                response_data = response.json()
+                return response_data
+            logger.warning(f'Failed to request to remote API: {response.status_code} {response.text}')
+            if attempt < max_retries - 1:
+                time.sleep(5)  # Sleep for 5 seconds before retrying
+            else:
+                raise RuntimeError(f'Failed to request to remote API after {max_retries} attempts: '
+                                   f'{response.status_code} {response.text}')
evalscope/perf/arguments.py
CHANGED
evalscope/perf/benchmark.py
CHANGED
@@ -157,7 +157,7 @@ async def statistic_benchmark_metric_worker(benchmark_data_queue: asyncio.Queue,
     while not (data_process_completed_event.is_set() and benchmark_data_queue.empty()):
         try:
             # Attempt to get benchmark data from the queue with a timeout
-            benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=
+            benchmark_data = await asyncio.wait_for(benchmark_data_queue.get(), timeout=0.01)
             benchmark_data_queue.task_done()
         except asyncio.TimeoutError:
             # If timeout, continue to the next iteration
evalscope/perf/main.py
CHANGED
@@ -19,7 +19,9 @@ def run_perf_benchmark(args):
         args = Arguments(**args)
     elif isinstance(args, Namespace):
         args = Arguments.from_args(args)
-
+
+    if args.seed is not None:
+        seed_everything(args.seed)
 
     # Setup logger and output
     args.outputs_dir = get_output_path(args)