evalscope 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/arguments.py +1 -0
- evalscope/benchmarks/aime24/__init__.py +0 -0
- evalscope/benchmarks/aime24/aime24_adapter.py +49 -0
- evalscope/benchmarks/arc/arc_adapter.py +5 -7
- evalscope/benchmarks/bbh/bbh_adapter.py +17 -9
- evalscope/benchmarks/benchmark.py +2 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
- evalscope/benchmarks/competition_math/competition_math_adapter.py +34 -23
- evalscope/benchmarks/data_adapter.py +18 -12
- evalscope/benchmarks/data_collection/__init__.py +0 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
- evalscope/benchmarks/general_mcq/__init__.py +0 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +129 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -6
- evalscope/benchmarks/gpqa/__init__.py +0 -0
- evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +121 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +8 -13
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
- evalscope/benchmarks/ifeval/ifeval_adapter.py +14 -14
- evalscope/benchmarks/ifeval/instructions.py +3 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
- evalscope/benchmarks/math_500/__init__.py +0 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +49 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +27 -15
- evalscope/benchmarks/race/race_adapter.py +3 -3
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -8
- evalscope/cli/start_app.py +3 -2
- evalscope/collections/evaluator.py +103 -39
- evalscope/collections/sampler.py +2 -1
- evalscope/collections/schema.py +1 -2
- evalscope/config.py +1 -0
- evalscope/evaluator/evaluator.py +78 -64
- evalscope/metrics/math_parser.py +526 -0
- evalscope/metrics/metrics.py +16 -1
- evalscope/metrics/named_metrics.py +31 -7
- evalscope/models/chat_adapter.py +69 -47
- evalscope/models/choice_adapter.py +52 -45
- evalscope/models/custom_adapter.py +2 -2
- evalscope/models/local_model.py +4 -0
- evalscope/models/server_adapter.py +28 -34
- evalscope/report/app.py +298 -96
- evalscope/run.py +10 -7
- evalscope/utils/chat_service.py +2 -2
- evalscope/utils/io_utils.py +1 -1
- evalscope/version.py +2 -2
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/METADATA +20 -11
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/RECORD +57 -47
- tests/cli/test_run.py +93 -16
- evalscope/benchmarks/ceval/samples.jsonl +0 -1
- evalscope/metrics/math_accuracy.py +0 -200
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/LICENSE +0 -0
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/WHEEL +0 -0
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/top_level.txt +0 -0
evalscope/models/chat_adapter.py
CHANGED
@@ -1,7 +1,7 @@
 import os
 import time
 import torch
-from typing import Union
+from typing import List, Union
 
 from evalscope.models.base_adapter import BaseModelAdapter
 from evalscope.models.local_model import LocalModel
@@ -57,82 +57,104 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
 
         return generation_config
 
-    def _model_generate(self,
+    def _model_generate(self, queries: List[str], system_prompts: List[str] = None, infer_cfg: dict = {}) -> List[str]:
         """
         Args:
-
-
+            queries: The input queries.
+            system_prompts: The system prompts.
             infer_cfg: The inference configuration.
         Returns:
-            The prediction
+            The prediction results.
         """
-        # For chat model, use the chat template to format the input
-        if self.tokenizer.chat_template is not None:
-            messages = [ChatMessage(role='user', content=query)]
-            if system_prompt:
-                messages = [ChatMessage(role='system', content=system_prompt)] + messages
-            formatted_prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-        else:
-            # For base model, use the query as the input
-            formatted_prompt = query
-
-        inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=True).to(self.device)
-        input_ids = inputs['input_ids']
-
         # Process infer_cfg
-
+        num_return_sequences = infer_cfg.get('num_return_sequences', 1)
+        if num_return_sequences > 1:
             infer_cfg['do_sample'] = True
 
         # stop settings
-        stop = infer_cfg.get('stop',
-
-
+        stop = infer_cfg.get('stop', [])
+        if stop:
+            eos_token_id = self.tokenizer.encode(stop, add_special_tokens=False)[0]
+        else:
+            eos_token_id = self.tokenizer.eos_token_id
 
         if eos_token_id is not None:
             infer_cfg['eos_token_id'] = eos_token_id
-            infer_cfg['pad_token_id'] = eos_token_id  # setting eos_token_id as pad token
 
         self.generation_config.update(**infer_cfg)
         fix_do_sample_warning(self.generation_config)
 
+        # For chat model, use the chat template to format the input
+        if self.tokenizer.chat_template is not None:
+            formatted_prompts = []
+            for i, query in enumerate(queries):
+                messages = [ChatMessage(role='user', content=query)]
+                if i < len(system_prompts) and system_prompts[i]:
+                    messages = [ChatMessage(role='system', content=system_prompts[i])] + messages
+                formatted_prompts.append(
+                    self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
+        else:
+            # For base model, use the queries as the input
+            formatted_prompts = queries
+
+        logger.debug(f'formatted_prompts: {formatted_prompts}')
+
+        # Get input ids
+        inputs = self.tokenizer(
+            formatted_prompts, return_tensors='pt', padding=True, truncation=True,
+            padding_side='left').to(self.device)  # padding_side='left' is important for chat model
+        input_ids = inputs['input_ids']
+
         # Run inference
         output_ids = self.model.generate(**inputs, generation_config=self.generation_config)
 
-
-
+        responses = []
+        for i in range(0, len(output_ids), num_return_sequences):
+            query_responses = []
+            for j in range(num_return_sequences):
+                output = output_ids[i + j]
+                response = self.tokenizer.decode(
+                    output[len(input_ids[i // num_return_sequences]):], skip_special_tokens=True)
+                query_responses.append(response)
+            responses.append(query_responses)
+
+        return responses
 
     @torch.no_grad()
-    def predict(self, inputs:
+    def predict(self, inputs: List[dict], infer_cfg: dict = {}) -> List[dict]:
         """
         Args:
            inputs: The input data.
            infer_cfg: The inference configuration.
         Returns:
-            The prediction
+            The prediction results.
         """
 
         # Process inputs
-
-
-
-
-
-
-
-
-            system_prompt = None
-        else:
-            raise TypeError(f'Unsupported inputs type: {type(inputs)}')
+        queries = []
+        system_prompts = []
+
+        for input_item in inputs:
+            queries.append(input_item['data'][0])
+            system_prompts.append(input_item.get('system_prompt', None))
+
+        responses = self._model_generate(queries, system_prompts, infer_cfg)
 
-
+        results = []
+        for response in responses:
+            choices_list = [
+                ChatCompletionResponseChoice(
+                    index=index, message=ChatMessage(content=one_response, role='assistant'), finish_reason='stop')
+                for index, one_response in enumerate(response)
+            ]
 
-
-
-
-
+            res_d = ChatCompletionResponse(
+                model=self.model_id,
+                choices=choices_list,
+                object='chat.completion',
+                created=int(time.time()),
+                usage=None).model_dump(exclude_unset=True)
 
-
-            model=self.model_id, choices=choices_list, object='chat.completion', created=int(time.time()),
-            usage=None).model_dump(exclude_unset=True)
+            results.append(res_d)
 
-        return
+        return results
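To illustrate the batched contract introduced above, here is a minimal sketch of how the new `predict` of `ChatGenerationModelAdapter` is called. The adapter instance, prompts, and `infer_cfg` keys are illustrative assumptions; only the input/output shapes come from the diff.

# Minimal sketch (names illustrative): each input item carries one query plus an
# optional system prompt; the adapter returns one OpenAI-style chat.completion dict
# per item, with one choice per returned sequence.
inputs = [
    {'data': ['What is 2 + 2?'], 'system_prompt': 'You are a helpful assistant.'},
    {'data': ['Name a prime number larger than 10.']},
]
results = adapter.predict(inputs, infer_cfg={'max_new_tokens': 64, 'num_return_sequences': 1})
for res in results:
    print(res['choices'][0]['message']['content'])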
evalscope/models/choice_adapter.py
CHANGED
@@ -33,12 +33,12 @@ class MultiChoiceModelAdapter(BaseModelAdapter):
         return self._DEFAULT_MAX_LENGTH
 
     @torch.no_grad()
-    def predict(self, inputs: dict, infer_cfg: dict = None) -> dict:
+    def predict(self, inputs: List[dict], infer_cfg: dict = None) -> dict:
         """
         Multi-choice model prediction func.
 
         Args:
-            inputs (dict): The inputs for a doc. Format:
+            inputs (List[dict]): The inputs for a doc. Format:
                 {'data': [full_prompt], 'multi_choices': ['A', 'B', 'C', 'D']}
 
             infer_cfg (dict): inference configuration.
@@ -69,37 +69,39 @@ class MultiChoiceModelAdapter(BaseModelAdapter):
         infer_cfg = infer_cfg or {}
         self.model.generation_config.update(**infer_cfg)
 
-        input_data =
-        multi_choices =
+        input_data = [inp['data'][0] for inp in inputs]
+        multi_choices = [inp['multi_choices'] for inp in inputs]
 
-
-        assert output.shape[0] == 1
-        logits = output.flatten()
+        outputs, input_info = self._get_logits(self.tokenizer, self.model, input_data)
 
-
-
+        results = []
+        for i, (logits, choices) in enumerate(zip(outputs, multi_choices)):
+            choice_logits = [logits[self.tokenizer(ch)['input_ids'][-1:]] for ch in choices]
+            softval = torch.nn.functional.softmax(torch.tensor(choice_logits).float(), dim=0)
 
-
-
-
-
+            if softval.dtype in {torch.bfloat16, torch.float16}:
+                softval = softval.to(dtype=torch.float32)
+            probs = softval.detach().cpu().numpy()
+            pred: str = choices[int(np.argmax(probs))]  # Format: A or B or C or D
 
-
-
-
-
-
-
-
-
-
+            res_d = ChatCompletionResponse(
+                model=self.model_id,
+                choices=[
+                    ChatCompletionResponseChoice(
+                        index=0, message=ChatMessage(content=pred, role='assistant'), finish_reason='stop')
+                ],
+                object='chat.completion',
+                created=int(time.time()),
+                usage=None).model_dump(exclude_unset=True)
 
-
+            results.append(res_d)
+
+        return results
 
     @staticmethod
     def _get_logits(tokenizer, model, inputs: List[str]):
-        input_ids = tokenizer(
-
+        input_ids = tokenizer(
+            inputs, padding=True, return_tensors='pt', padding_side='left')['input_ids'].to(model.device)
         tokens = {'input_ids': input_ids}
 
         outputs = model(input_ids)['logits']
@@ -117,11 +119,11 @@ class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
         super().__init__(model, **kwargs)
 
     @torch.no_grad()
-    def predict(self, inputs: dict, infer_cfg: dict = None) -> dict:
+    def predict(self, inputs: List[dict], infer_cfg: dict = None) -> dict:
         """
         Multi-choice model prediction func.
         Args:
-            inputs (dict): The inputs for a doc. Format:
+            inputs (List[dict]): The inputs for a doc. Format:
                 {'data': [(context, continuation), ...]}
             infer_cfg (dict): inference configuration.
         Returns:
@@ -149,24 +151,29 @@ class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
         """
         infer_cfg = infer_cfg or {}
 
-        pred_list: list =
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        pred_list: list = []
+        for inp in inputs:
+            pred_list.append(self.loglikelihood(inputs=inp['data'], infer_cfg=infer_cfg))
+
+        results = []
+        for pred in pred_list:
+            res_d = ChatCompletionResponse(
+                model=self.model_id,
+                choices=[{
+                    'index': 0,
+                    'message': {
+                        'content': pred,
+                        'role': 'assistant'
+                    }
+                }],
+                object='chat.completion',
+                created=int(time.time()),
+                usage=None).model_dump(exclude_unset=True)
+            results.append(res_d)
+
+        return results
+
+    def loglikelihood(self, inputs: List[tuple], infer_cfg: dict = None) -> list:
         self.model.generation_config.update(**infer_cfg)
         # To predict one doc
         doc_ele_pred = []
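To make the multi-choice scoring step above concrete, here is a standalone sketch of the same idea: take the next-token logit for each choice letter, softmax over them, and pick the argmax. It assumes `logits` is the final-position logit vector produced by `_get_logits` for one prompt and `tokenizer` is the model's Hugging Face tokenizer; everything else is illustrative.

import numpy as np
import torch

# Score each candidate letter by the logit of its (last) token id, then
# normalize with softmax and take the most likely choice.
choices = ['A', 'B', 'C', 'D']
choice_logits = [logits[tokenizer(ch)['input_ids'][-1]] for ch in choices]
probs = torch.softmax(torch.tensor(choice_logits).float(), dim=0).numpy()
pred = choices[int(np.argmax(probs))]  # e.g. 'B'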
evalscope/models/custom_adapter.py
CHANGED
@@ -17,12 +17,12 @@ class CustomModelAdapter(BaseModelAdapter):
         self.custom_model = custom_model
         super(CustomModelAdapter, self).__init__(model=custom_model)
 
-    def predict(self, inputs: Union[str, dict, list], **kwargs) -> List[Dict[str, Any]]:
+    def predict(self, inputs: List[Union[str, dict, list]], **kwargs) -> List[Dict[str, Any]]:
         """
         Model prediction func.
 
         Args:
-            inputs (Union[str, dict, list]): The input data. Depending on the specific model.
+            inputs (List[Union[str, dict, list]]): The input data. Depending on the specific model.
                 str: 'xxx'
                 dict: {'data': [full_prompt]}
                 list: ['xxx', 'yyy', 'zzz']
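A quick sketch of what the widened signature means for callers: every call now passes a list, even for a single sample, and each item may take any of the shapes listed in the docstring. The adapter variable below is hypothetical.

# Illustrative only: item shapes taken from the docstring above.
single = my_custom_adapter.predict([{'data': ['full prompt text']}])
batch = my_custom_adapter.predict(['prompt one', 'prompt two', 'prompt three'])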
evalscope/models/local_model.py
CHANGED
@@ -37,6 +37,10 @@ class LocalModel:
             cache_dir=model_cache_dir,
         )
 
+        # Fix no padding
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+
         self.model = AutoModelForCausalLM.from_pretrained(
             self.model_id,
             revision=model_revision,
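Why the pad-token fix matters: many decoder-only checkpoints ship without a pad token, so batched tokenization with `padding=True` fails until one is assigned, and reusing the EOS token is the common workaround. A minimal sketch, with a placeholder model id:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('some/causal-lm')  # placeholder model id
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # same fallback as the diff above
batch = tokenizer(['short prompt', 'a somewhat longer prompt'],
                  padding=True, return_tensors='pt')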
evalscope/models/server_adapter.py
CHANGED
@@ -1,6 +1,6 @@
 import requests
 import time
-from typing import Optional, Union
+from typing import List, Optional, Union
 
 from evalscope.models.base_adapter import BaseModelAdapter
 from evalscope.utils.chat_service import ChatMessage
@@ -28,36 +28,35 @@ class ServerModelAdapter(BaseModelAdapter):
         self.model_cfg = {'api_url': api_url, 'model_id': model_id, 'api_key': api_key}
         super().__init__(model=None, model_cfg=self.model_cfg, **kwargs)
 
-    def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = None) -> dict:
+    def predict(self, inputs: List[Union[str, dict, list]], infer_cfg: dict = None) -> List[dict]:
         """
         Model prediction func.
 
         Args:
-            inputs (Union[str, dict, list]): The input data.
+            inputs (List[Union[str, dict, list]]): The input data.
             infer_cfg (dict): Inference configuration.
 
         Returns:
-            res (dict): The model prediction results.
+            res (List[dict]): The model prediction results.
         """
         infer_cfg = infer_cfg or {}
+        results = []
 
-
-
-
-
-
-
-
-
-
-
-
-
-        elif isinstance(inputs, list):
-            query = '\n'.join(inputs)
-            system_prompt = None
+        for input_item in inputs:
+            response = self.process_single_input(input_item, infer_cfg)
+            results.append(response)
+
+        return results
+
+    def process_single_input(self, input_item: dict, infer_cfg: dict) -> dict:
+        """Process a single input item."""
+        data: list = input_item['data']
+        if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
+            query = '\n'.join(''.join(item) for item in data)
+            system_prompt = input_item.get('system_prompt', None)
         else:
-
+            query = data[0]
+            system_prompt = input_item.get('system_prompt', None)
 
         content = self.make_request_content(query, system_prompt)
         request_json = self.make_request(content, infer_cfg)
@@ -68,7 +67,7 @@ class ServerModelAdapter(BaseModelAdapter):
         """
         Make request content for API.
         """
-        if system_prompt
+        if system_prompt:
             messages = [
                 ChatMessage(role='system', content=system_prompt).model_dump(exclude_unset=True),
                 ChatMessage(role='user', content=query).model_dump(exclude_unset=True)
@@ -80,19 +79,14 @@ class ServerModelAdapter(BaseModelAdapter):
     def make_request(self, content: dict, infer_cfg: dict = {}) -> dict:
         """Make request to remote API."""
         # Format request JSON according to OpenAI API format
-
-
-
-
-
-
-
-
-            'n': infer_cfg.get('num_return_sequences', 1),
-            'stop': infer_cfg.get('stop', None)
-        }
-        if self.seed is not None:
-            request_json['seed'] = self.seed
+        from evalscope.config import DEFAULT_GENERATION_CONFIG
+        if infer_cfg == DEFAULT_GENERATION_CONFIG:
+            infer_cfg = {
+                'max_tokens': 2048,
+                'temperature': 0.0,
+            }
+
+        request_json = {'model': self.model_id, **content, **infer_cfg}
         logger.debug(f'Request to remote API: {request_json}')
         return request_json
 
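For orientation, here is a rough sketch of the OpenAI-style request JSON that `make_request_content` plus `make_request` assemble after this change, assuming the content dict carries the messages list; the model id, prompts, and exact fallback values shown are illustrative, with the 2048/0.0 defaults taken from the diff above.

# Rough shape of the request JSON sent to the remote API (values illustrative).
request_json = {
    'model': 'qwen2.5-7b-instruct',  # self.model_id (placeholder)
    'messages': [
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {'role': 'user', 'content': 'What is the capital of France?'},
    ],
    'max_tokens': 2048,    # fallback applied when infer_cfg equals the default generation config
    'temperature': 0.0,
}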