evalscope 0.5.0rc0__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of evalscope might be problematic.

@@ -76,9 +76,7 @@ class OpenCompassBackendManager(BackendManager):
     @staticmethod
     def _check_env():
         if is_module_installed('opencompass'):
-            logger.info('Please make sure you have installed the `ms-opencompass`: `pip install ms-opencompass`')
-        else:
-            raise ModuleNotFoundError('Please install the `ms-opencompass` first: `pip install ms-opencompass`')
+            logger.info('Check the OpenCompass environment: OK')
 
     @staticmethod
     def get_restore_arg(arg_name: str, arg_val: bool):
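Note that after this change `_check_env` only logs when `opencompass` is importable; the `ModuleNotFoundError` branch is gone. A minimal sketch of an equivalent guard on the caller side, assuming an importlib-based `is_module_installed` (only the helper's name appears in the diff; its body here is an assumption):

    import importlib.util

    def is_module_installed(module_name: str) -> bool:
        # True if the module can be located on the current Python path
        return importlib.util.find_spec(module_name) is not None

    if not is_module_installed('opencompass'):
        # the backend no longer raises here, so fail early in user code if desired
        raise ModuleNotFoundError('Please install it first: `pip install ms-opencompass`')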
@@ -6,6 +6,7 @@ from opencompass.tasks import OpenICLInferTask
 
 
 with read_base():
+    from opencompass.configs.summarizers.medium import summarizer
     from evalscope.backend.opencompass.tasks.eval_datasets import datasets
 
 # 1. Get datasets
@@ -49,6 +49,7 @@ with read_base():
     from opencompass.configs.datasets.obqa.obqa_gen_9069e4 import obqa_datasets
     from opencompass.configs.datasets.nq.nq_gen_c788f6 import nq_datasets
     from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
+    from opencompass.configs.datasets.cmb.cmb_gen_dfb5c4 import cmb_datasets
     from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import bbh_datasets
 
 # Note: to be supported
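For context, this task file imports many `*_datasets` lists inside `read_base()`; OpenCompass configs usually flatten them into the single `datasets` list that the runner consumes. A sketch of that common pattern (an assumption about how the file continues beyond this hunk):

    # gather every imported `*_datasets` list into one flat list for the runner
    datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])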
@@ -31,12 +31,13 @@ class VLMEvalKitBackendManager(BackendManager):
         from vlmeval.utils.arguments import Arguments as VLMEvalArguments
         self.args = VLMEvalArguments(**self.config_d)
 
-        self.valid_models = self.list_supported_VLMs()
+        self.valid_models = self.list_supported_models()
         self.valid_model_names = list(self.valid_models.keys())
         self.valid_datasets = self.list_supported_datasets()
 
         self._check_valid()
 
+
     def _check_valid(self):
         # Ensure not both model and datasets are empty
         if not self.args.data or not self.args.model:
@@ -44,9 +45,9 @@ class VLMEvalKitBackendManager(BackendManager):
 
         # Check datasets
         valid_datasets, invalid_datasets = get_valid_list(self.args.data, self.valid_datasets)
-        assert len(invalid_datasets) == 0, f'Invalid datasets: {invalid_datasets}, ' \
-            f'refer to the following list to get proper dataset name: {self.valid_datasets}'
-
+        if len(invalid_datasets) != 0:
+            logger.warning(f"Using custom dataset: {invalid_datasets}, ")
+
         # Check model
         if isinstance(self.args.model[0], dict):
             model_names = [model['name'] for model in self.args.model]
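`get_valid_list` splits the requested names into those that are known and those that are not; with this release unknown datasets only trigger a warning instead of an assertion failure, which is what lets custom datasets through. A minimal sketch of the helper's behavior (only the call signature appears in the diff; the implementation shown is an assumption):

    def get_valid_list(input_list, candidate_list):
        # split requested items into (present, missing) relative to the candidates
        valid = [item for item in input_list if item in candidate_list]
        invalid = [item for item in input_list if item not in candidate_list]
        return valid, invalid

    print(get_valid_list(['MMBench', 'my_custom_vqa'], ['MMBench']))
    # (['MMBench'], ['my_custom_vqa'])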
@@ -61,10 +62,14 @@ class VLMEvalKitBackendManager(BackendManager):
                 model_class = self.valid_models[model_name]
                 if model_name == 'CustomAPIModel':
                     model_type = model_cfg['type']
+                    remain_cfg = copy.deepcopy(model_cfg)
+                    del remain_cfg['name']  # remove not used args
+                    del remain_cfg['type']  # remove not used args
+
                     self.valid_models.update({
                         model_type: partial(model_class,
                                             model=model_type,
-                                            **model_cfg)
+                                            **remain_cfg)
                     })
                     new_model_names.append(model_type)
                 else:
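The new `remain_cfg` copy exists so that `name` and `type` are not forwarded twice: `type` becomes the `model=` argument of the partial, and `name` is only used by the manager itself. A small self-contained sketch of the same pattern (the `make_api_model` factory is illustrative, not a VLMEvalKit class):

    import copy
    from functools import partial

    def make_api_model(model, api_base=None, key=None):
        # stand-in for a VLMEvalKit model class; purely illustrative
        return {'model': model, 'api_base': api_base, 'key': key}

    model_cfg = {'name': 'CustomAPIModel', 'type': 'qwen-vl-chat', 'api_base': 'http://localhost:8000/v1'}

    remain_cfg = copy.deepcopy(model_cfg)
    del remain_cfg['name']  # consumed by the manager, not the model class
    del remain_cfg['type']  # becomes the `model` argument below

    factory = partial(make_api_model, model=model_cfg['type'], **remain_cfg)
    print(factory())  # {'model': 'qwen-vl-chat', 'api_base': 'http://localhost:8000/v1', 'key': None}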
@@ -78,15 +83,15 @@ class VLMEvalKitBackendManager(BackendManager):
 
         elif isinstance(self.args.model[0], str):
             valid_model_names, invalid_model_names = get_valid_list(self.args.model, self.valid_model_names)
-            assert len(invalid_model_names) == 0, f'Invalid models: {invalid_model_names}, ' \
-                f'refer to the following list to get proper model name: {self.valid_model_names}'
+            if len(invalid_datasets) != 0:
+                logger.warning(f"Using custom dataset: {invalid_datasets}, ")
 
     @property
     def cmd(self):
         return self.get_cmd()
 
     @staticmethod
-    def list_supported_VLMs():
+    def list_supported_models():
         from vlmeval.config import supported_VLM
         return supported_VLM
 
@@ -98,9 +103,7 @@ class VLMEvalKitBackendManager(BackendManager):
     @staticmethod
     def _check_env():
         if is_module_installed('vlmeval'):
-            logger.info('Please make sure you have installed the `ms-vlmeval`: `pip install ms-vlmeval`')
-        else:
-            raise ModuleNotFoundError('Please install the `ms-vlmeval` first: `pip install ms-vlmeval`')
+            logger.info('Check VLM Evaluation Kit: Installed')
 
     @staticmethod
     def get_restore_arg(arg_name: str, arg_val: bool):
@@ -0,0 +1,47 @@
+import os
+import numpy as np
+from vlmeval.dataset.image_base import ImageBaseDataset
+from vlmeval.dataset.image_vqa import CustomVQADataset
+from vlmeval.smp import load, dump, d2df
+
+class CustomDataset:
+
+    def load_data(self, dataset):
+        # customize the loading of the dataset
+        data_path = os.path.join("~/LMUData", f'{dataset}.tsv')
+        return load(data_path)
+
+
+    def build_prompt(self, line):
+        msgs = ImageBaseDataset.build_prompt(self, line)
+        # add a hint or custom instruction here
+        msgs[-1]['value'] += '\nAnswer the question using a single word or phrase.'
+        return msgs
+
+
+    def evaluate(self, eval_file, **judge_kwargs):
+        data = load(eval_file)
+        assert 'answer' in data and 'prediction' in data
+        data['prediction'] = [str(x) for x in data['prediction']]
+        data['answer'] = [str(x).lower() for x in data['answer']]
+
+        print(data)
+
+        # ========compute the evaluation metrics as you need =========
+        # exact match
+        result = np.mean(data['answer'] == data['prediction'])
+        ret = {'Overall': result}
+        ret = d2df(ret).round(2)
+
+        # save the result
+        suffix = eval_file.split('.')[-1]
+        result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+        dump(ret, result_file)
+        return ret
+        # ============================================================
+
+
+# override the default dataset class
+CustomVQADataset.load_data = CustomDataset.load_data
+CustomVQADataset.build_prompt = CustomDataset.build_prompt
+CustomVQADataset.evaluate = CustomDataset.evaluate
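The new file above monkey-patches VLMEvalKit's `CustomVQADataset` so that a user-provided TSV under `~/LMUData` is loaded, prompted, and scored by exact match. A minimal sketch of preparing such a TSV (the `index`/`image`/`question`/`answer` columns follow VLMEvalKit's usual image-dataset layout and the image path is a placeholder; treat both as assumptions):

    import base64
    import os
    import pandas as pd

    rows = [{
        'index': 0,
        'image': base64.b64encode(open('example.jpg', 'rb').read()).decode(),  # images are stored base64-encoded
        'question': 'What animal is in the picture?',
        'answer': 'cat',
    }]
    out_path = os.path.expanduser('~/LMUData/my_custom_vqa.tsv')
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    pd.DataFrame(rows).to_csv(out_path, sep='\t', index=False)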
evalscope/config.py CHANGED
@@ -33,6 +33,7 @@ registry_tasks = {
 @dataclass
 class TaskConfig:
     model_args: Optional[dict] = field(default_factory=dict)
+    template_type: Optional[str] = 'default-generation'
     generation_config: Optional[dict] = field(default_factory=dict)
     dataset_args: Optional[dict] = field(default_factory=dict)
     dry_run: bool = False
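The new `template_type` field gives task configs an explicit prompt template, defaulting to 'default-generation'. A hypothetical dict-style task config using only the fields visible in this hunk (other TaskConfig fields are omitted):

    task_cfg = {
        'model_args': {'revision': 'master'},
        'template_type': 'default-generation',  # new in 0.5.3
        'generation_config': {'max_new_tokens': 256},
        'dataset_args': {},
        'dry_run': True,
    }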
@@ -362,6 +362,8 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
             torch_dtype: The torch dtype for model inference. Default: torch.float16.
             **kwargs: Other args.
         """
+
+        custom_generation_config = kwargs.pop('generation_config', None)
         model_cache_dir = get_model_cache_dir(root_cache_dir=cache_dir)
 
         self.model_id: str = model_id
@@ -414,6 +416,10 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
         self.origin_tokenizer = deepcopy(tokenizer)
 
         self.generation_config, self.generation_template = self._parse_generation_config(tokenizer, model)
+
+        if custom_generation_config:
+            logger.info('**Updating generation config ...')
+            self.generation_config.update(**custom_generation_config.to_dict())
         logger.info(f'**Generation config init: {self.generation_config.to_dict()}')
 
         super().__init__(model=model, tokenizer=self.generation_template.tokenizer, model_cfg=model_cfg)
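The adapter now pops an optional `generation_config` out of `**kwargs` and merges it over the parsed defaults via `update(**custom_generation_config.to_dict())`. A small sketch of that merge using transformers' GenerationConfig directly (values are illustrative):

    from transformers import GenerationConfig

    # defaults, standing in for what _parse_generation_config returns
    parsed = GenerationConfig(max_new_tokens=512, do_sample=False)

    # a user-supplied override, as now accepted through the adapter's kwargs
    custom = GenerationConfig(max_new_tokens=128, do_sample=True, temperature=0.7)

    parsed.update(**custom.to_dict())
    print(parsed.max_new_tokens, parsed.do_sample, parsed.temperature)  # 128 True 0.7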
evalscope/run_arena.py CHANGED
@@ -100,17 +100,18 @@ class ArenaWorkflow:
         model_revision = cfg_d.get(EvalConfigKeys.MODEL_REVISION, None)
         precision = cfg_d.get(EvalConfigKeys.PRECISION, torch.float16)
         precision = eval(precision) if isinstance(precision, str) else precision
-        generation_config = cfg_d.get(EvalConfigKeys.GENERATION_CONFIG, {})
-        generation_config = GenerationConfig(**generation_config)
+        custom_generation_config = cfg_d.get(EvalConfigKeys.GENERATION_CONFIG, {})
+        custom_generation_config = GenerationConfig(**custom_generation_config)
         ans_output_file = os.path.join(WORK_DIR, cfg_d.get(EvalConfigKeys.OUTPUT_FILE))
         template_type = cfg_d.get(EvalConfigKeys.TEMPLATE_TYPE)
 
         answers_list = self._predict_answers(model_id_or_path=model_id_or_path,
                                              model_revision=model_revision,
                                              precision=precision,
-                                             generation_config=generation_config,
+                                             generation_config=custom_generation_config,
                                              template_type=template_type)
 
+        os.makedirs(os.path.dirname(ans_output_file), exist_ok=True)
         dump_jsonl_data(answers_list, ans_output_file)
         logger.info(f'Answers generated by model {model_name} and saved to {ans_output_file}')
 
@@ -168,6 +169,7 @@ class ArenaWorkflow:
             res_list = ae.run(self.review_file)
             rating_df = res_list[0]
             logger.info(f'Rating results:\n{rating_df.to_csv()}')
+            os.makedirs(os.path.dirname(report_file), exist_ok=True)
             rating_df.to_csv(report_file, index=True)
             logger.info(f'Rating results are saved to {report_file}')
         else:
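Both `os.makedirs` additions follow the same pattern: create the parent directory of an output file before writing to it. A generic sketch of that pattern (the helper name is hypothetical); note that `os.path.dirname` returns '' for a bare filename, which is worth guarding against:

    import os

    def ensure_parent_dir(path: str) -> None:
        # create the parent directory if needed; skip when the path has no directory part
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    ensure_parent_dir('outputs/arena/answers.jsonl')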
evalscope/summarizer.py CHANGED
@@ -99,19 +99,25 @@ class Summarizer:
             elif eval_backend == EvalBackend.VLM_EVAL_KIT.value:
                 eval_config = Summarizer.parse_eval_config(candidate_task)
 
-                work_dir = eval_config.get('work_dir') or 'outputs/default'
+                work_dir = eval_config.get('work_dir') or 'outputs'
                 if not os.path.exists(work_dir):
                     raise ValueError(f'work_dir {work_dir} does not exist.')
 
-                # TODO: parse summary files: acc.csv, score.csv, score.json for different models
                 for model in eval_config['model']:
                     if model['name'] == 'CustomAPIModel':
                         model_name = model['type']
                     else:
                         model_name = model['name']
-                    summary_files = glob.glob(os.path.join(work_dir, model_name, '*.csv'))
+
+                    csv_files = glob.glob(os.path.join(work_dir, model_name, '*.csv'))
+                    json_files = glob.glob(os.path.join(work_dir, model_name, '*.json'))
+
+                    summary_files = csv_files + json_files
                     for summary_file_path in summary_files:
-                        summary_res: dict = csv_to_list(file_path=summary_file_path)[0]
+                        if summary_file_path.endswith('csv'):
+                            summary_res: dict = csv_to_list(summary_file_path)[0]
+                        elif summary_file_path.endswith('json'):
+                            summary_res: dict = json_to_dict(summary_file_path)
                         file_name = os.path.basename(summary_file_path).split('.')[0]
                         final_res_list.append({file_name: summary_res})
 
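The summarizer now collects both CSV and JSON summary files per model and dispatches on the file extension. A self-contained sketch of the same logic using stdlib readers in place of evalscope's `csv_to_list`/`json_to_dict` helpers:

    import csv
    import glob
    import json
    import os

    def collect_summaries(work_dir: str, model_name: str) -> list:
        results = []
        summary_files = (glob.glob(os.path.join(work_dir, model_name, '*.csv'))
                         + glob.glob(os.path.join(work_dir, model_name, '*.json')))
        for path in summary_files:
            if path.endswith('csv'):
                with open(path, newline='') as f:
                    summary_res = next(iter(csv.DictReader(f)), {})  # first row, like csv_to_list(...)[0]
            elif path.endswith('json'):
                with open(path) as f:
                    summary_res = json.load(f)
            results.append({os.path.basename(path).split('.')[0]: summary_res})
        return results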
@@ -3,7 +3,7 @@ from enum import Enum
 
 
 class EvalBackend(Enum):
-    # Use native evaluation pipeline of Eval-Scope
+    # Use native evaluation pipeline of EvalScope
     NATIVE = 'Native'
 
     # Use OpenCompass framework as the evaluation backend
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-__version__ = '0.5.0rc0'
-__release_datetime__ = '2024-08-01 08:00:00'
+__version__ = '0.5.3'
+__release_datetime__ = '2024-08-29 08:00:00'