evalscope 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/backend/opencompass/backend_manager.py +2 -0
- evalscope/backend/opencompass/tasks/eval_datasets.py +1 -0
- evalscope/backend/vlm_eval_kit/backend_manager.py +12 -7
- evalscope/backend/vlm_eval_kit/custom_dataset.py +47 -0
- evalscope/benchmarks/benchmark.py +1 -1
- evalscope/config.py +1 -0
- evalscope/evaluator/evaluator.py +3 -3
- evalscope/models/api/__init__.py +3 -0
- evalscope/models/api/openai_api.py +228 -0
- evalscope/models/model_adapter.py +6 -0
- evalscope/perf/http_client.py +5 -5
- evalscope/run_arena.py +5 -3
- evalscope/summarizer.py +10 -4
- evalscope/third_party/longbench_write/__init__.py +3 -0
- evalscope/third_party/longbench_write/eval.py +284 -0
- evalscope/third_party/longbench_write/infer.py +217 -0
- evalscope/third_party/longbench_write/longbench_write.py +88 -0
- evalscope/third_party/longbench_write/resources/__init__.py +1 -0
- evalscope/third_party/longbench_write/resources/judge.txt +31 -0
- evalscope/third_party/longbench_write/resources/longbench_write.jsonl +120 -0
- evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +60 -0
- evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +48 -0
- evalscope/third_party/longbench_write/tools/__init__.py +1 -0
- evalscope/third_party/longbench_write/tools/data_etl.py +155 -0
- evalscope/third_party/longbench_write/utils.py +37 -0
- evalscope/version.py +2 -2
- evalscope-0.5.4.dist-info/METADATA +399 -0
- {evalscope-0.5.2.dist-info → evalscope-0.5.4.dist-info}/RECORD +31 -16
- evalscope-0.5.2.dist-info/METADATA +0 -578
- {evalscope-0.5.2.dist-info → evalscope-0.5.4.dist-info}/WHEEL +0 -0
- {evalscope-0.5.2.dist-info → evalscope-0.5.4.dist-info}/entry_points.txt +0 -0
- {evalscope-0.5.2.dist-info → evalscope-0.5.4.dist-info}/top_level.txt +0 -0
evalscope/backend/opencompass/tasks/eval_datasets.py
CHANGED

@@ -49,6 +49,7 @@ with read_base():
     from opencompass.configs.datasets.obqa.obqa_gen_9069e4 import obqa_datasets
     from opencompass.configs.datasets.nq.nq_gen_c788f6 import nq_datasets
     from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
+    from opencompass.configs.datasets.cmb.cmb_gen_dfb5c4 import cmb_datasets
     from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import bbh_datasets

     # Note: to be supported
evalscope/backend/vlm_eval_kit/backend_manager.py
CHANGED

@@ -1,5 +1,5 @@
 from typing import Optional, Union
-from evalscope.utils import is_module_installed,
+from evalscope.utils import is_module_installed, get_valid_list
 from evalscope.backend.base import BackendManager
 from evalscope.utils.logger import get_logger
 from functools import partial
@@ -37,6 +37,7 @@ class VLMEvalKitBackendManager(BackendManager):

         self._check_valid()

+
     def _check_valid(self):
         # Ensure not both model and datasets are empty
         if not self.args.data or not self.args.model:
@@ -44,9 +45,9 @@ class VLMEvalKitBackendManager(BackendManager):

         # Check datasets
         valid_datasets, invalid_datasets = get_valid_list(self.args.data, self.valid_datasets)
-
-        f
-
+        if len(invalid_datasets) != 0:
+            logger.warning(f"Using custom dataset: {invalid_datasets}, ")
+
         # Check model
         if isinstance(self.args.model[0], dict):
             model_names = [model['name'] for model in self.args.model]
@@ -61,10 +62,14 @@ class VLMEvalKitBackendManager(BackendManager):
                 model_class = self.valid_models[model_name]
                 if model_name == 'CustomAPIModel':
                     model_type = model_cfg['type']
+                    remain_cfg = copy.deepcopy(model_cfg)
+                    del remain_cfg['name']  # remove not used args
+                    del remain_cfg['type']  # remove not used args
+
                     self.valid_models.update({
                         model_type: partial(model_class,
                                             model=model_type,
-                                            **
+                                            **remain_cfg)
                     })
                     new_model_names.append(model_type)
                 else:
@@ -78,8 +83,8 @@ class VLMEvalKitBackendManager(BackendManager):

         elif isinstance(self.args.model[0], str):
             valid_model_names, invalid_model_names = get_valid_list(self.args.model, self.valid_model_names)
-
-            f
+            if len(invalid_datasets) != 0:
+                logger.warning(f"Using custom dataset: {invalid_datasets}, ")

     @property
     def cmd(self):
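Note: the CustomAPIModel handling above strips the bookkeeping keys ('name', 'type') from the model config and binds the remaining entries as keyword arguments via functools.partial. A minimal standalone sketch of that pattern follows; the config keys besides 'name' and 'type' are hypothetical, not a confirmed schema.

import copy
from functools import partial

model_cfg = {'name': 'CustomAPIModel', 'type': 'qwen-vl-chat',
             'api_base': 'http://localhost:8000/v1', 'key': 'EMPTY'}  # hypothetical entries

remain_cfg = copy.deepcopy(model_cfg)
del remain_cfg['name']   # consumed by the dispatch logic
del remain_cfg['type']   # becomes the registered model name

def custom_api_model(model, **kwargs):  # stand-in for the real model class
    return {'model': model, **kwargs}

valid_models = {model_cfg['type']: partial(custom_api_model, model=model_cfg['type'], **remain_cfg)}
build = valid_models['qwen-vl-chat']
print(build())  # {'model': 'qwen-vl-chat', 'api_base': 'http://localhost:8000/v1', 'key': 'EMPTY'}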
evalscope/backend/vlm_eval_kit/custom_dataset.py
ADDED

@@ -0,0 +1,47 @@
+import os
+import numpy as np
+from vlmeval.dataset.image_base import ImageBaseDataset
+from vlmeval.dataset.image_vqa import CustomVQADataset
+from vlmeval.smp import load, dump, d2df
+
+class CustomDataset:
+
+    def load_data(self, dataset):
+        # customize the loading of the dataset
+        data_path = os.path.join("~/LMUData", f'{dataset}.tsv')
+        return load(data_path)
+
+
+    def build_prompt(self, line):
+        msgs = ImageBaseDataset.build_prompt(self, line)
+        # add a hint or custom instruction here
+        msgs[-1]['value'] += '\nAnswer the question using a single word or phrase.'
+        return msgs
+
+
+    def evaluate(self, eval_file, **judge_kwargs):
+        data = load(eval_file)
+        assert 'answer' in data and 'prediction' in data
+        data['prediction'] = [str(x) for x in data['prediction']]
+        data['answer'] = [str(x).lower() for x in data['answer']]
+
+        print(data)
+
+        # ========compute the evaluation metrics as you need =========
+        # exact match
+        result = np.mean(data['answer'] == data['prediction'])
+        ret = {'Overall': result}
+        ret = d2df(ret).round(2)
+
+        # save the result
+        suffix = eval_file.split('.')[-1]
+        result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+        dump(ret, result_file)
+        return ret
+        # ============================================================
+
+
+# override the default dataset class
+CustomVQADataset.load_data = CustomDataset.load_data
+CustomVQADataset.build_prompt = CustomDataset.build_prompt
+CustomVQADataset.evaluate = CustomDataset.evaluate
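Note: the patched evaluate() above scores exact matches over the 'answer' and 'prediction' columns. A rough standalone sketch of the same metric with plain pandas/numpy (the toy table below is illustrative, not package data):

import numpy as np
import pandas as pd

# a toy prediction table with the columns the patched evaluate() expects
data = pd.DataFrame({
    'prediction': ['cat', 'Dog', 'bird'],
    'answer': ['cat', 'dog', 'fish'],
})
data['prediction'] = [str(x) for x in data['prediction']]
data['answer'] = [str(x).lower() for x in data['answer']]

# exact match, as in CustomDataset.evaluate (note that predictions are not lower-cased there either)
acc = np.mean(data['answer'] == data['prediction'])
print({'Overall': round(float(acc), 2)})  # {'Overall': 0.33}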
evalscope/benchmarks/benchmark.py
CHANGED

@@ -46,7 +46,7 @@ class Benchmark(object):

             dataset.dataset_name = dataset_name.split('/')[-1]
             dataset.subset_name = subset
-            dataset.split = split
+            # dataset.split = split
             return dataset
         elif hub == 'HuggingFace':
             # TODO: implement this by xingjun.wxj@alibaba-inc.com
evalscope/config.py
CHANGED
@@ -33,6 +33,7 @@ registry_tasks = {
 @dataclass
 class TaskConfig:
     model_args: Optional[dict] = field(default_factory=dict)
+    template_type: Optional[str] = 'default-generation'
     generation_config: Optional[dict] = field(default_factory=dict)
     dataset_args: Optional[dict] = field(default_factory=dict)
     dry_run: bool = False
evalscope/evaluator/evaluator.py
CHANGED
@@ -244,8 +244,8 @@ class Evaluator(object):
             answer_d[AnswerKeys.ORIGIN_PROMPT] = input_prompt

             if debug:
-                logger.
-                logger.
+                logger.info(f'**input_prompt: {json.dumps(input_prompt, ensure_ascii=False)} \n')
+                logger.info(f'**predicted ans: {json.dumps(answer_d, ensure_ascii=False)} \n')

             answers_list.append(answer_d)


@@ -349,7 +349,7 @@ class Evaluator(object):
             review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)

             if debug:
-                logger.
+                logger.info(review_d)

             reviews_list.append(review_d)

evalscope/models/api/openai_api.py
ADDED

@@ -0,0 +1,228 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import json
+import threading
+import time
+from asyncio import Queue
+
+import requests
+from typing import Union, List, Optional, Dict
+from concurrent.futures import ThreadPoolExecutor
+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class OpenaiApi:
+
+    def __init__(self,
+                 model: str,
+                 openai_api_key,
+                 openai_api_base,
+                 logprobs: Optional[bool] = False,
+                 top_logprobs: Optional[int] = None,
+                 max_new_tokens: int = 4096,
+                 temperature: Optional[float] = 0.0,
+                 repetition_penalty: Optional[float] = 1.0,
+                 is_chat: bool = True,
+                 verbose: bool = True,
+                 retry: int = 3,
+                 query_per_second: int = 10,  # TODO
+                 **kwargs):
+
+        self.temperature = temperature
+        self.repetition_penalty = repetition_penalty
+        self.max_tokens = max_new_tokens
+        self.logprobs = logprobs
+        self.top_logprobs = top_logprobs
+
+        self.openai_api_key = openai_api_key
+        self.url = openai_api_base
+        self.model = model
+        self.is_chat = is_chat
+        self.retry = retry
+        self.verbose = verbose
+
+        self.token_bucket = TokenBucket(query_per_second, verbose)
+
+    def generate_simple(self, inputs: Union[List[str]]):
+
+        def process_one(in_data: str):
+
+            if self.is_chat:
+                data = dict(
+                    model=self.model,
+                    messages=[{'role': 'user', 'content': in_data}],
+                    max_tokens=self.max_tokens,
+                    n=1,
+                    logprobs=self.logprobs,
+                    top_logprobs=self.top_logprobs,
+                    stop=None,
+                    temperature=self.temperature,
+                    repetition_penalty=self.repetition_penalty,
+                )
+            else:
+                data = dict(
+                    model=self.model,
+                    prompt=in_data,
+                    max_tokens=self.max_tokens,
+                    temperature=self.temperature,
+                    repetition_penalty=self.repetition_penalty,
+                )
+
+            # todo
+            openai_api_key = self.openai_api_key or ''
+            header = {'Authorization': f'Bearer ', 'content-type': 'application/json', }
+            data = json.dumps(data, ensure_ascii=False)
+
+            if self.verbose:
+                print(f'>>data in generate_simple: {data}')
+
+            resp = requests.post(self.url, headers=header, data=data)
+            resp = resp.json()
+            if self.verbose:
+                print(f'>>resp in generate_simple: {resp}')
+
+            if self.logprobs:
+                return resp['choices']
+            else:
+                if self.is_chat:
+                    return resp['choices'][0]['message']['content'].strip()
+                else:
+                    return resp['choices'][0]['text'].strip()
+
+        with ThreadPoolExecutor() as executor:
+            results = list(executor.map(process_one, inputs))
+
+        return results
+
+    def generate(self,
+                 inputs: Union[List[str], List[List]],
+                 **kwargs) -> List[str]:
+        """
+        Generate responses from OpenAI API.
+
+        Args:
+            inputs: The input messages for the model. It can be a string or a list of messages.
+                e.g. ['who are you ?', 'what is your name ?']
+                e.g. [[{'role': 'user', 'content': 'who are you ?'}], ...]
+            kwargs: The optional arguments for the model.
+        """
+        results = []
+        # with ThreadPoolExecutor() as executor:
+        #     results = list(executor.map(self._generate, inputs))
+
+        for input in inputs:
+            results.append(self._generate(input))
+
+        return results
+
+    def _generate(self, messages: Union[str, List[Dict]]) -> str:
+
+        if isinstance(messages, str):
+            messages = [{'role': 'user', 'content': messages}]
+
+        max_num_retries = 0
+        while max_num_retries < self.retry:
+            # self.wait()
+
+            header = {
+                'Authorization': f'Bearer {self.openai_api_key}',
+                'content-type': 'application/json',
+            }
+
+            try:
+                if self.is_chat:
+                    data = dict(
+                        model=self.model,
+                        messages=messages,
+                        max_tokens=self.max_tokens,
+                        n=1,
+                        logprobs=self.logprobs,
+                        top_logprobs=self.top_logprobs,
+                        stop=None,
+                        temperature=self.temperature,
+                        repetition_penalty=self.repetition_penalty,
+                    )
+                else:
+                    # TODO: This is a temporary solution for non-chat models.
+                    input_prompts = []
+                    for msg in messages:
+                        input_prompts.append(msg['content'])
+
+                    data = dict(
+                        model=self.model,
+                        prompt='\n'.join(input_prompts),
+                        max_tokens=self.max_tokens,
+                        temperature=self.temperature,
+                        repetition_penalty=self.repetition_penalty,
+                    )
+
+                def remove_none_val(input_d: dict):
+                    return {k: v for k, v in input_d.items() if v is not None}
+                data = remove_none_val(data)
+
+                if self.verbose:
+                    logger.info(f'>> Post data: {json.dumps(data, ensure_ascii=False)}')
+                raw_response = requests.post(self.url,
+                                             headers=header,
+                                             data=json.dumps(data, ensure_ascii=False))
+
+                response = raw_response.json()
+                if self.verbose:
+                    logger.info(f'>> response: {response}')
+
+                if self.logprobs:
+                    return response['choices']
+                else:
+                    if self.is_chat:
+                        return response['choices'][0]['message']['content'].strip()
+                    else:
+                        return response['choices'][0]['text'].strip()
+
+            except Exception as e:
+                logger.error(f'Error occurs: {str(e)}')
+                max_num_retries += 1
+                continue
+
+    def wait(self):
+        return self.token_bucket.get_token()
+
+
+class TokenBucket:
+    """A token bucket for rate limiting.
+
+    Args:
+        query_per_second (float): The rate of the token bucket.
+    """
+
+    def __init__(self, rate, verbose=False):
+        self._rate = rate
+        self._tokens = threading.Semaphore(0)
+        self.started = False
+        self._request_queue = Queue()
+        self.logger = get_logger()
+        self.verbose = verbose
+
+    def _add_tokens(self):
+        """Add tokens to the bucket."""
+        while True:
+            if self._tokens._value < self._rate:
+                self._tokens.release()
+            time.sleep(1 / self._rate)
+
+    def get_token(self):
+        """Get a token from the bucket."""
+        if not self.started:
+            self.started = True
+            threading.Thread(target=self._add_tokens, daemon=True).start()
+        self._tokens.acquire()
+        if self.verbose:
+            cur_time = time.time()
+            while not self._request_queue.empty():
+                if cur_time - self._request_queue.queue[0] > 60:
+                    self._request_queue.get()
+                else:
+                    break
+            self._request_queue.put(cur_time)
+            self.logger.info(f'Current RPM {self._request_queue.qsize()}.')
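Note: a rough usage sketch of the new OpenaiApi wrapper against an OpenAI-compatible endpoint. The class posts directly to the URL it is given, so openai_api_base must be the full endpoint path; the model name, key, and URL below are placeholders, not values shipped with the package.

from evalscope.models.api.openai_api import OpenaiApi  # module added in this release

client = OpenaiApi(
    model='qwen2-7b-instruct',                                       # placeholder model name
    openai_api_key='EMPTY',                                          # placeholder key
    openai_api_base='http://127.0.0.1:8000/v1/chat/completions',     # placeholder full URL
    is_chat=True,
    verbose=False,
)

# generate() accepts plain strings or lists of chat messages
answers = client.generate(['Who are you?',
                           [{'role': 'user', 'content': 'What is 2 + 2?'}]])
print(answers)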
evalscope/models/model_adapter.py
CHANGED

@@ -362,6 +362,8 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
             torch_dtype: The torch dtype for model inference. Default: torch.float16.
             **kwargs: Other args.
         """
+
+        custom_generation_config = kwargs.pop('generation_config', None)
         model_cache_dir = get_model_cache_dir(root_cache_dir=cache_dir)

         self.model_id: str = model_id

@@ -414,6 +416,10 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
         self.origin_tokenizer = deepcopy(tokenizer)

         self.generation_config, self.generation_template = self._parse_generation_config(tokenizer, model)
+
+        if custom_generation_config:
+            logger.info('**Updating generation config ...')
+            self.generation_config.update(**custom_generation_config.to_dict())
         logger.info(f'**Generation config init: {self.generation_config.to_dict()}')

         super().__init__(model=model, tokenizer=self.generation_template.tokenizer, model_cfg=model_cfg)
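Note: the adapter now merges a caller-supplied generation config over the parsed defaults via update(**cfg.to_dict()). A small sketch of that merge with a transformers GenerationConfig, standalone and not using evalscope internals; the values are illustrative.

from transformers import GenerationConfig

default_cfg = GenerationConfig(max_new_tokens=512, do_sample=False)   # stand-in for the parsed defaults
custom_cfg = GenerationConfig(max_new_tokens=2048, temperature=0.7)   # caller-supplied override

# mirrors the adapter's merge: every field of custom_cfg is copied over the defaults
default_cfg.update(**custom_cfg.to_dict())
print(default_cfg.max_new_tokens, default_cfg.temperature)  # 2048 0.7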
evalscope/perf/http_client.py
CHANGED
@@ -51,15 +51,15 @@ UNLIMITED_RATE = -1


 async def on_request_start(session, context, params):
-    logger.
+    logger.info(f'Starting request: <{params}>')


 async def on_request_chunk_sent(session, context, params):
-    logger.
+    logger.info(f'Request body: {params}')


 async def on_response_chunk_received(session, context, params):
-    logger.
+    logger.info(f'Response info: <{params}>')


 class AioHttpClient:

@@ -116,7 +116,7 @@ class AioHttpClient:
             line = line.decode("utf8")
             line = line.rstrip("\n").rstrip("\r")
             if self.debug:
-                logger.
+                logger.info(line)
             sse_msg = ServerSentEvent.decode(line)
             if not sse_msg:
                 continue

@@ -567,7 +567,7 @@ async def send_requests_worker(task_id, request_queue: asyncio.Queue, benchmark_
             else:
                 if response_data:
                     collected_messages.append(response_data)  # save the message
-                    logger.
+                    logger.info(response_data)
                 benchmark_data["chunk_times"].append(time.perf_counter())

             benchmark_data["response_messages"] = collected_messages
evalscope/run_arena.py
CHANGED
@@ -100,17 +100,18 @@ class ArenaWorkflow:
             model_revision = cfg_d.get(EvalConfigKeys.MODEL_REVISION, None)
             precision = cfg_d.get(EvalConfigKeys.PRECISION, torch.float16)
             precision = eval(precision) if isinstance(precision, str) else precision
-
-
+            custom_generation_config = cfg_d.get(EvalConfigKeys.GENERATION_CONFIG, {})
+            custom_generation_config = GenerationConfig(**custom_generation_config)
             ans_output_file = os.path.join(WORK_DIR, cfg_d.get(EvalConfigKeys.OUTPUT_FILE))
             template_type = cfg_d.get(EvalConfigKeys.TEMPLATE_TYPE)

             answers_list = self._predict_answers(model_id_or_path=model_id_or_path,
                                                  model_revision=model_revision,
                                                  precision=precision,
-                                                 generation_config=
+                                                 generation_config=custom_generation_config,
                                                  template_type=template_type)

+            os.makedirs(os.path.dirname(ans_output_file), exist_ok=True)
             dump_jsonl_data(answers_list, ans_output_file)
             logger.info(f'Answers generated by model {model_name} and saved to {ans_output_file}')


@@ -168,6 +169,7 @@ class ArenaWorkflow:
             res_list = ae.run(self.review_file)
             rating_df = res_list[0]
             logger.info(f'Rating results:\n{rating_df.to_csv()}')
+            os.makedirs(os.path.dirname(report_file), exist_ok=True)
             rating_df.to_csv(report_file, index=True)
             logger.info(f'Rating results are saved to {report_file}')
         else:
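Note: the arena workflow now builds a transformers GenerationConfig from the per-model config dict, so an empty dict simply yields the library defaults. A brief sketch of that conversion; the dict key and contents below are illustrative stand-ins for the EvalConfigKeys entry.

from transformers import GenerationConfig

cfg_d = {'generation_config': {'max_new_tokens': 1024, 'temperature': 0.3}}  # hypothetical per-model entry

custom_generation_config = cfg_d.get('generation_config', {})
custom_generation_config = GenerationConfig(**custom_generation_config)
print(custom_generation_config.max_new_tokens)  # 1024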
evalscope/summarizer.py
CHANGED
@@ -99,19 +99,25 @@ class Summarizer:
         elif eval_backend == EvalBackend.VLM_EVAL_KIT.value:
             eval_config = Summarizer.parse_eval_config(candidate_task)

-            work_dir = eval_config.get('work_dir') or 'outputs
+            work_dir = eval_config.get('work_dir') or 'outputs'
             if not os.path.exists(work_dir):
                 raise ValueError(f'work_dir {work_dir} does not exist.')

-            # TODO: parse summary files: acc.csv, score.csv, score.json for different models
             for model in eval_config['model']:
                 if model['name'] == 'CustomAPIModel':
                     model_name = model['type']
                 else:
                     model_name = model['name']
-
+
+                csv_files = glob.glob(os.path.join(work_dir, model_name, '*.csv'))
+                json_files = glob.glob(os.path.join(work_dir, model_name, '*.json'))
+
+                summary_files = csv_files + json_files
                 for summary_file_path in summary_files:
-
+                    if summary_file_path.endswith('csv'):
+                        summary_res: dict = csv_to_list(summary_file_path)[0]
+                    elif summary_file_path.endswith('json'):
+                        summary_res: dict = json_to_dict(summary_file_path)
                     file_name = os.path.basename(summary_file_path).split('.')[0]
                     final_res_list.append({file_name: summary_res})

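Note: the summarizer now collects per-model VLMEvalKit outputs by globbing CSV and JSON files under work_dir/<model_name>. A hedged sketch of the same collection step, using stdlib stand-ins for evalscope's csv_to_list/json_to_dict helpers; the directory layout and model name are placeholders.

import csv
import glob
import json
import os

work_dir, model_name = 'outputs', 'qwen-vl-chat'  # placeholder layout: outputs/<model_name>/*.csv|*.json
summary_files = (glob.glob(os.path.join(work_dir, model_name, '*.csv'))
                 + glob.glob(os.path.join(work_dir, model_name, '*.json')))

final_res_list = []
for summary_file_path in summary_files:
    if summary_file_path.endswith('csv'):
        with open(summary_file_path, newline='') as f:
            summary_res = list(csv.DictReader(f))[0]   # first row, like csv_to_list(...)[0]
    elif summary_file_path.endswith('json'):
        with open(summary_file_path) as f:
            summary_res = json.load(f)
    file_name = os.path.basename(summary_file_path).split('.')[0]
    final_res_list.append({file_name: summary_res})
print(final_res_list)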