evalscope 0.5.2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of evalscope has been flagged as a potentially problematic release.
Files changed (32)
  1. evalscope/backend/opencompass/backend_manager.py +2 -0
  2. evalscope/backend/opencompass/tasks/eval_datasets.py +1 -0
  3. evalscope/backend/vlm_eval_kit/backend_manager.py +12 -7
  4. evalscope/backend/vlm_eval_kit/custom_dataset.py +47 -0
  5. evalscope/benchmarks/benchmark.py +1 -1
  6. evalscope/config.py +1 -0
  7. evalscope/evaluator/evaluator.py +3 -3
  8. evalscope/models/api/__init__.py +3 -0
  9. evalscope/models/api/openai_api.py +228 -0
  10. evalscope/models/model_adapter.py +6 -0
  11. evalscope/perf/http_client.py +5 -5
  12. evalscope/run_arena.py +5 -3
  13. evalscope/summarizer.py +10 -4
  14. evalscope/third_party/longbench_write/__init__.py +3 -0
  15. evalscope/third_party/longbench_write/eval.py +284 -0
  16. evalscope/third_party/longbench_write/infer.py +217 -0
  17. evalscope/third_party/longbench_write/longbench_write.py +88 -0
  18. evalscope/third_party/longbench_write/resources/__init__.py +1 -0
  19. evalscope/third_party/longbench_write/resources/judge.txt +31 -0
  20. evalscope/third_party/longbench_write/resources/longbench_write.jsonl +120 -0
  21. evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +60 -0
  22. evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +48 -0
  23. evalscope/third_party/longbench_write/tools/__init__.py +1 -0
  24. evalscope/third_party/longbench_write/tools/data_etl.py +155 -0
  25. evalscope/third_party/longbench_write/utils.py +37 -0
  26. evalscope/version.py +2 -2
  27. evalscope-0.5.4.dist-info/METADATA +399 -0
  28. {evalscope-0.5.2.dist-info → evalscope-0.5.4.dist-info}/RECORD +31 -16
  29. evalscope-0.5.2.dist-info/METADATA +0 -578
  30. {evalscope-0.5.2.dist-info → evalscope-0.5.4.dist-info}/WHEEL +0 -0
  31. {evalscope-0.5.2.dist-info → evalscope-0.5.4.dist-info}/entry_points.txt +0 -0
  32. {evalscope-0.5.2.dist-info → evalscope-0.5.4.dist-info}/top_level.txt +0 -0
evalscope/backend/opencompass/backend_manager.py CHANGED
@@ -242,4 +242,6 @@ if __name__ == '__main__':
  'limit': 5
  }
  )
+ all_datasets = OpenCompassBackendManager.list_datasets()
+ print(f'all_datasets: {all_datasets}')
  oc_backend_manager.run()
evalscope/backend/opencompass/tasks/eval_datasets.py CHANGED
@@ -49,6 +49,7 @@ with read_base():
  from opencompass.configs.datasets.obqa.obqa_gen_9069e4 import obqa_datasets
  from opencompass.configs.datasets.nq.nq_gen_c788f6 import nq_datasets
  from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
+ from opencompass.configs.datasets.cmb.cmb_gen_dfb5c4 import cmb_datasets
  from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import bbh_datasets

  # Note: to be supported
evalscope/backend/vlm_eval_kit/backend_manager.py CHANGED
@@ -1,5 +1,5 @@
  from typing import Optional, Union
- from evalscope.utils import is_module_installed, get_module_path, get_valid_list, yaml_to_dict, json_to_dict
+ from evalscope.utils import is_module_installed, get_valid_list
  from evalscope.backend.base import BackendManager
  from evalscope.utils.logger import get_logger
  from functools import partial
@@ -37,6 +37,7 @@ class VLMEvalKitBackendManager(BackendManager):

  self._check_valid()

+
  def _check_valid(self):
  # Ensure not both model and datasets are empty
  if not self.args.data or not self.args.model:
@@ -44,9 +45,9 @@ class VLMEvalKitBackendManager(BackendManager):

  # Check datasets
  valid_datasets, invalid_datasets = get_valid_list(self.args.data, self.valid_datasets)
- assert len(invalid_datasets) == 0, f'Invalid datasets: {invalid_datasets}, ' \
- f'refer to the following list to get proper dataset name: {self.valid_datasets}'
-
+ if len(invalid_datasets) != 0:
+ logger.warning(f"Using custom dataset: {invalid_datasets}, ")
+
  # Check model
  if isinstance(self.args.model[0], dict):
  model_names = [model['name'] for model in self.args.model]
@@ -61,10 +62,14 @@ class VLMEvalKitBackendManager(BackendManager):
  model_class = self.valid_models[model_name]
  if model_name == 'CustomAPIModel':
  model_type = model_cfg['type']
+ remain_cfg = copy.deepcopy(model_cfg)
+ del remain_cfg['name'] # remove not used args
+ del remain_cfg['type'] # remove not used args
+
  self.valid_models.update({
  model_type: partial(model_class,
  model=model_type,
- **model_cfg)
+ **remain_cfg)
  })
  new_model_names.append(model_type)
  else:
@@ -78,8 +83,8 @@ class VLMEvalKitBackendManager(BackendManager):

  elif isinstance(self.args.model[0], str):
  valid_model_names, invalid_model_names = get_valid_list(self.args.model, self.valid_model_names)
- assert len(invalid_model_names) == 0, f'Invalid models: {invalid_model_names}, ' \
- f'refer to the following list to get proper model name: {self.valid_model_names}'
+ if len(invalid_datasets) != 0:
+ logger.warning(f"Using custom dataset: {invalid_datasets}, ")

  @property
  def cmd(self):
evalscope/backend/vlm_eval_kit/custom_dataset.py ADDED
@@ -0,0 +1,47 @@
+ import os
+ import numpy as np
+ from vlmeval.dataset.image_base import ImageBaseDataset
+ from vlmeval.dataset.image_vqa import CustomVQADataset
+ from vlmeval.smp import load, dump, d2df
+
+ class CustomDataset:
+
+ def load_data(self, dataset):
+ # customize the loading of the dataset
+ data_path = os.path.join("~/LMUData", f'{dataset}.tsv')
+ return load(data_path)
+
+
+ def build_prompt(self, line):
+ msgs = ImageBaseDataset.build_prompt(self, line)
+ # add a hint or custom instruction here
+ msgs[-1]['value'] += '\nAnswer the question using a single word or phrase.'
+ return msgs
+
+
+ def evaluate(self, eval_file, **judge_kwargs):
+ data = load(eval_file)
+ assert 'answer' in data and 'prediction' in data
+ data['prediction'] = [str(x) for x in data['prediction']]
+ data['answer'] = [str(x).lower() for x in data['answer']]
+
+ print(data)
+
+ # ========compute the evaluation metrics as you need =========
+ # exact match
+ result = np.mean(data['answer'] == data['prediction'])
+ ret = {'Overall': result}
+ ret = d2df(ret).round(2)
+
+ # save the result
+ suffix = eval_file.split('.')[-1]
+ result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+ dump(ret, result_file)
+ return ret
+ # ============================================================
+
+
+ # override the default dataset class
+ CustomVQADataset.load_data = CustomDataset.load_data
+ CustomVQADataset.build_prompt = CustomDataset.build_prompt
+ CustomVQADataset.evaluate = CustomDataset.evaluate
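
As a point of reference, load_data() above reads a TSV named after the dataset from ~/LMUData, and evaluate() expects 'answer' and 'prediction' columns once inference has run. Below is a minimal sketch of preparing such a file; the dataset name 'my_custom_vqa' and the exact column set are assumptions, so check the VLMEvalKit documentation for the columns your dataset type actually requires.

# Minimal sketch: write a TSV that CustomDataset.load_data above could pick up.
# The dataset name and the columns used here are assumptions, not part of the package.
import os
import pandas as pd

records = [
    {'index': 0, 'image_path': '/path/to/img_0.jpg',
     'question': 'What color is the car?', 'answer': 'red'},
    {'index': 1, 'image_path': '/path/to/img_1.jpg',
     'question': 'How many dogs are there?', 'answer': '2'},
]

data_dir = os.path.expanduser('~/LMUData')
os.makedirs(data_dir, exist_ok=True)
pd.DataFrame(records).to_csv(os.path.join(data_dir, 'my_custom_vqa.tsv'),
                             sep='\t', index=False)
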
evalscope/benchmarks/benchmark.py CHANGED
@@ -46,7 +46,7 @@ class Benchmark(object):

  dataset.dataset_name = dataset_name.split('/')[-1]
  dataset.subset_name = subset
- dataset.split = split
+ # dataset.split = split
  return dataset
  elif hub == 'HuggingFace':
  # TODO: implement this by xingjun.wxj@alibaba-inc.com
evalscope/config.py CHANGED
@@ -33,6 +33,7 @@ registry_tasks = {
  @dataclass
  class TaskConfig:
  model_args: Optional[dict] = field(default_factory=dict)
+ template_type: Optional[str] = 'default-generation'
  generation_config: Optional[dict] = field(default_factory=dict)
  dataset_args: Optional[dict] = field(default_factory=dict)
  dry_run: bool = False
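
For context, the new template_type field joins the TaskConfig fields visible in this hunk; the sketch below only illustrates how such a config might be filled in. The values are illustrative assumptions, and whether TaskConfig can be built from these fields alone depends on fields not shown in the diff.

# Illustrative sketch only; values are assumptions, not package defaults.
from evalscope.config import TaskConfig

task_cfg = TaskConfig(
    model_args={'revision': 'master'},           # assumed key
    template_type='default-generation',          # new field in 0.5.4, default shown in the diff
    generation_config={'max_new_tokens': 512},   # assumed key
    dataset_args={},
    dry_run=True,
)
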
evalscope/evaluator/evaluator.py CHANGED
@@ -244,8 +244,8 @@ class Evaluator(object):
  answer_d[AnswerKeys.ORIGIN_PROMPT] = input_prompt

  if debug:
- logger.debug(f'**input_prompt: {json.dumps(input_prompt, ensure_ascii=False)} \n')
- logger.debug(f'**predicted ans: {json.dumps(answer_d, ensure_ascii=False)} \n')
+ logger.info(f'**input_prompt: {json.dumps(input_prompt, ensure_ascii=False)} \n')
+ logger.info(f'**predicted ans: {json.dumps(answer_d, ensure_ascii=False)} \n')

  answers_list.append(answer_d)

@@ -349,7 +349,7 @@ class Evaluator(object):
  review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)

  if debug:
- logger.debug(review_d)
+ logger.info(review_d)

  reviews_list.append(review_d)

evalscope/models/api/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ from evalscope.models.api.openai_api import OpenaiApi
evalscope/models/api/openai_api.py ADDED
@@ -0,0 +1,228 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ import json
+ import threading
+ import time
+ from asyncio import Queue
+
+ import requests
+ from typing import Union, List, Optional, Dict
+ from concurrent.futures import ThreadPoolExecutor
+ from modelscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ class OpenaiApi:
+
+ def __init__(self,
+ model: str,
+ openai_api_key,
+ openai_api_base,
+ logprobs: Optional[bool] = False,
+ top_logprobs: Optional[int] = None,
+ max_new_tokens: int = 4096,
+ temperature: Optional[float] = 0.0,
+ repetition_penalty: Optional[float] = 1.0,
+ is_chat: bool = True,
+ verbose: bool = True,
+ retry: int = 3,
+ query_per_second: int = 10, # TODO
+ **kwargs):
+
+ self.temperature = temperature
+ self.repetition_penalty = repetition_penalty
+ self.max_tokens = max_new_tokens
+ self.logprobs = logprobs
+ self.top_logprobs = top_logprobs
+
+ self.openai_api_key = openai_api_key
+ self.url = openai_api_base
+ self.model = model
+ self.is_chat = is_chat
+ self.retry = retry
+ self.verbose = verbose
+
+ self.token_bucket = TokenBucket(query_per_second, verbose)
+
+ def generate_simple(self, inputs: Union[List[str]]):
+
+ def process_one(in_data: str):
+
+ if self.is_chat:
+ data = dict(
+ model=self.model,
+ messages=[{'role': 'user', 'content': in_data}],
+ max_tokens=self.max_tokens,
+ n=1,
+ logprobs=self.logprobs,
+ top_logprobs=self.top_logprobs,
+ stop=None,
+ temperature=self.temperature,
+ repetition_penalty=self.repetition_penalty,
+ )
+ else:
+ data = dict(
+ model=self.model,
+ prompt=in_data,
+ max_tokens=self.max_tokens,
+ temperature=self.temperature,
+ repetition_penalty=self.repetition_penalty,
+ )
+
+ # todo
+ openai_api_key = self.openai_api_key or ''
+ header = {'Authorization': f'Bearer ', 'content-type': 'application/json', }
+ data = json.dumps(data, ensure_ascii=False)
+
+ if self.verbose:
+ print(f'>>data in generate_simple: {data}')
+
+ resp = requests.post(self.url, headers=header, data=data)
+ resp = resp.json()
+ if self.verbose:
+ print(f'>>resp in generate_simple: {resp}')
+
+ if self.logprobs:
+ return resp['choices']
+ else:
+ if self.is_chat:
+ return resp['choices'][0]['message']['content'].strip()
+ else:
+ return resp['choices'][0]['text'].strip()
+
+ with ThreadPoolExecutor() as executor:
+ results = list(executor.map(process_one, inputs))
+
+ return results
+
+ def generate(self,
+ inputs: Union[List[str], List[List]],
+ **kwargs) -> List[str]:
+ """
+ Generate responses from OpenAI API.
+
+ Args:
+ inputs: The input messages for the model. It can be a string or a list of messages.
+ e.g. ['who are you ?', 'what is your name ?']
+ e.g. [[{'role': 'user', 'content': 'who are you ?'}], ...]
+ kwargs: The optional arguments for the model.
+ """
+ results = []
+ # with ThreadPoolExecutor() as executor:
+ # results = list(executor.map(self._generate, inputs))
+
+ for input in inputs:
+ results.append(self._generate(input))
+
+ return results
+
+ def _generate(self, messages: Union[str, List[Dict]]) -> str:
+
+ if isinstance(messages, str):
+ messages = [{'role': 'user', 'content': messages}]
+
+ max_num_retries = 0
+ while max_num_retries < self.retry:
+ # self.wait()
+
+ header = {
+ 'Authorization': f'Bearer {self.openai_api_key}',
+ 'content-type': 'application/json',
+ }
+
+ try:
+ if self.is_chat:
+ data = dict(
+ model=self.model,
+ messages=messages,
+ max_tokens=self.max_tokens,
+ n=1,
+ logprobs=self.logprobs,
+ top_logprobs=self.top_logprobs,
+ stop=None,
+ temperature=self.temperature,
+ repetition_penalty=self.repetition_penalty,
+ )
+ else:
+ # TODO: This is a temporary solution for non-chat models.
+ input_prompts = []
+ for msg in messages:
+ input_prompts.append(msg['content'])
+
+ data = dict(
+ model=self.model,
+ prompt='\n'.join(input_prompts),
+ max_tokens=self.max_tokens,
+ temperature=self.temperature,
+ repetition_penalty=self.repetition_penalty,
+ )
+
+ def remove_none_val(input_d: dict):
+ return {k: v for k, v in input_d.items() if v is not None}
+ data = remove_none_val(data)
+
+ if self.verbose:
+ logger.info(f'>> Post data: {json.dumps(data, ensure_ascii=False)}')
+ raw_response = requests.post(self.url,
+ headers=header,
+ data=json.dumps(data, ensure_ascii=False))
+
+ response = raw_response.json()
+ if self.verbose:
+ logger.info(f'>> response: {response}')
+
+ if self.logprobs:
+ return response['choices']
+ else:
+ if self.is_chat:
+ return response['choices'][0]['message']['content'].strip()
+ else:
+ return response['choices'][0]['text'].strip()
+
+ except Exception as e:
+ logger.error(f'Error occurs: {str(e)}')
+ max_num_retries += 1
+ continue
+
+ def wait(self):
+ return self.token_bucket.get_token()
+
+
+ class TokenBucket:
+ """A token bucket for rate limiting.
+
+ Args:
+ query_per_second (float): The rate of the token bucket.
+ """
+
+ def __init__(self, rate, verbose=False):
+ self._rate = rate
+ self._tokens = threading.Semaphore(0)
+ self.started = False
+ self._request_queue = Queue()
+ self.logger = get_logger()
+ self.verbose = verbose
+
+ def _add_tokens(self):
+ """Add tokens to the bucket."""
+ while True:
+ if self._tokens._value < self._rate:
+ self._tokens.release()
+ time.sleep(1 / self._rate)
+
+ def get_token(self):
+ """Get a token from the bucket."""
+ if not self.started:
+ self.started = True
+ threading.Thread(target=self._add_tokens, daemon=True).start()
+ self._tokens.acquire()
+ if self.verbose:
+ cur_time = time.time()
+ while not self._request_queue.empty():
+ if cur_time - self._request_queue.queue[0] > 60:
+ self._request_queue.get()
+ else:
+ break
+ self._request_queue.put(cur_time)
+ self.logger.info(f'Current RPM {self._request_queue.qsize()}.')
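
For orientation, the new OpenaiApi client targets any OpenAI-compatible HTTP endpoint; the sketch below is inferred from the constructor and generate() signature shown in this diff. The endpoint URL, API key and model name are placeholders (assumptions), and openai_api_base must be the full URL of the chat/completions route, since _generate() posts to it directly.

# Minimal usage sketch (assumed values, not evalscope defaults)
from evalscope.models.api import OpenaiApi

client = OpenaiApi(
    model='my-model',                                               # placeholder model name
    openai_api_key='EMPTY',                                         # placeholder key
    openai_api_base='http://127.0.0.1:8000/v1/chat/completions',    # full route, posted as-is
    max_new_tokens=512,
    temperature=0.0,
    is_chat=True,
)

# generate() accepts plain strings or OpenAI-style message lists, per its docstring.
answers = client.generate(['Who are you?',
                           [{'role': 'user', 'content': 'What is your name?'}]])
print(answers)
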
evalscope/models/model_adapter.py CHANGED
@@ -362,6 +362,8 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
  torch_dtype: The torch dtype for model inference. Default: torch.float16.
  **kwargs: Other args.
  """
+
+ custom_generation_config = kwargs.pop('generation_config', None)
  model_cache_dir = get_model_cache_dir(root_cache_dir=cache_dir)

  self.model_id: str = model_id
@@ -414,6 +416,10 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
  self.origin_tokenizer = deepcopy(tokenizer)

  self.generation_config, self.generation_template = self._parse_generation_config(tokenizer, model)
+
+ if custom_generation_config:
+ logger.info('**Updating generation config ...')
+ self.generation_config.update(**custom_generation_config.to_dict())
  logger.info(f'**Generation config init: {self.generation_config.to_dict()}')

  super().__init__(model=model, tokenizer=self.generation_template.tokenizer, model_cfg=model_cfg)
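
For reference, the 'generation_config' kwarg popped above is merged via update(**custom_generation_config.to_dict()), so it needs to expose a to_dict() method; run_arena.py in this same release builds such an object with GenerationConfig(**dict). A minimal sketch using transformers.GenerationConfig (an assumption about the intended class) follows.

# Sketch of the object expected by the 'generation_config' kwarg handled above.
# Using transformers.GenerationConfig is an assumption; any object with a
# compatible to_dict() would be merged the same way.
from transformers import GenerationConfig

custom_gen_cfg = GenerationConfig(
    max_new_tokens=512,   # illustrative values, not evalscope defaults
    temperature=0.7,
    do_sample=True,
)

# Passed through adapter kwargs, e.g.:
#   ChatGenerationModelAdapter(..., generation_config=custom_gen_cfg)
# so that self.generation_config.update(**custom_gen_cfg.to_dict()) applies it.
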
evalscope/perf/http_client.py CHANGED
@@ -51,15 +51,15 @@ UNLIMITED_RATE = -1


  async def on_request_start(session, context, params):
- logger.debug(f'Starting request: <{params}>')
+ logger.info(f'Starting request: <{params}>')


  async def on_request_chunk_sent(session, context, params):
- logger.debug(f'Request body: {params}')
+ logger.info(f'Request body: {params}')


  async def on_response_chunk_received(session, context, params):
- logger.debug(f'Response info: <{params}>')
+ logger.info(f'Response info: <{params}>')


  class AioHttpClient:
@@ -116,7 +116,7 @@ class AioHttpClient:
  line = line.decode("utf8")
  line = line.rstrip("\n").rstrip("\r")
  if self.debug:
- logger.debug(line)
+ logger.info(line)
  sse_msg = ServerSentEvent.decode(line)
  if not sse_msg:
  continue
@@ -567,7 +567,7 @@ async def send_requests_worker(task_id, request_queue: asyncio.Queue, benchmark_
  else:
  if response_data:
  collected_messages.append(response_data) # save the message
- logger.debug(response_data)
+ logger.info(response_data)
  benchmark_data["chunk_times"].append(time.perf_counter())

  benchmark_data["response_messages"] = collected_messages
evalscope/run_arena.py CHANGED
@@ -100,17 +100,18 @@ class ArenaWorkflow:
  model_revision = cfg_d.get(EvalConfigKeys.MODEL_REVISION, None)
  precision = cfg_d.get(EvalConfigKeys.PRECISION, torch.float16)
  precision = eval(precision) if isinstance(precision, str) else precision
- generation_config = cfg_d.get(EvalConfigKeys.GENERATION_CONFIG, {})
- generation_config = GenerationConfig(**generation_config)
+ custom_generation_config = cfg_d.get(EvalConfigKeys.GENERATION_CONFIG, {})
+ custom_generation_config = GenerationConfig(**custom_generation_config)
  ans_output_file = os.path.join(WORK_DIR, cfg_d.get(EvalConfigKeys.OUTPUT_FILE))
  template_type = cfg_d.get(EvalConfigKeys.TEMPLATE_TYPE)

  answers_list = self._predict_answers(model_id_or_path=model_id_or_path,
  model_revision=model_revision,
  precision=precision,
- generation_config=generation_config,
+ generation_config=custom_generation_config,
  template_type=template_type)

+ os.makedirs(os.path.dirname(ans_output_file), exist_ok=True)
  dump_jsonl_data(answers_list, ans_output_file)
  logger.info(f'Answers generated by model {model_name} and saved to {ans_output_file}')

@@ -168,6 +169,7 @@ class ArenaWorkflow:
  res_list = ae.run(self.review_file)
  rating_df = res_list[0]
  logger.info(f'Rating results:\n{rating_df.to_csv()}')
+ os.makedirs(os.path.dirname(report_file), exist_ok=True)
  rating_df.to_csv(report_file, index=True)
  logger.info(f'Rating results are saved to {report_file}')
  else:
evalscope/summarizer.py CHANGED
@@ -99,19 +99,25 @@ class Summarizer:
  elif eval_backend == EvalBackend.VLM_EVAL_KIT.value:
  eval_config = Summarizer.parse_eval_config(candidate_task)

- work_dir = eval_config.get('work_dir') or 'outputs/default'
+ work_dir = eval_config.get('work_dir') or 'outputs'
  if not os.path.exists(work_dir):
  raise ValueError(f'work_dir {work_dir} does not exist.')

- # TODO: parse summary files: acc.csv, score.csv, score.json for different models
  for model in eval_config['model']:
  if model['name'] == 'CustomAPIModel':
  model_name = model['type']
  else:
  model_name = model['name']
- summary_files = glob.glob(os.path.join(work_dir, model_name, '*.csv'))
+
+ csv_files = glob.glob(os.path.join(work_dir, model_name, '*.csv'))
+ json_files = glob.glob(os.path.join(work_dir, model_name, '*.json'))
+
+ summary_files = csv_files + json_files
  for summary_file_path in summary_files:
- summary_res: dict = csv_to_list(file_path=summary_file_path)[0]
+ if summary_file_path.endswith('csv'):
+ summary_res: dict = csv_to_list(summary_file_path)[0]
+ elif summary_file_path.endswith('json'):
+ summary_res: dict = json_to_dict(summary_file_path)
  file_name = os.path.basename(summary_file_path).split('.')[0]
  final_res_list.append({file_name: summary_res})

evalscope/third_party/longbench_write/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ from evalscope.third_party.longbench_write.longbench_write import run_task