evalscope 0.8.2__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (79)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +10 -3
  3. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  4. evalscope/backend/rag_eval/utils/llm.py +1 -1
  5. evalscope/benchmarks/__init__.py +20 -1
  6. evalscope/benchmarks/arc/__init__.py +0 -5
  7. evalscope/benchmarks/arc/arc_adapter.py +23 -99
  8. evalscope/benchmarks/bbh/__init__.py +0 -4
  9. evalscope/benchmarks/bbh/bbh_adapter.py +19 -89
  10. evalscope/benchmarks/benchmark.py +70 -59
  11. evalscope/benchmarks/ceval/__init__.py +0 -5
  12. evalscope/benchmarks/ceval/ceval_adapter.py +22 -46
  13. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  14. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +20 -41
  15. evalscope/benchmarks/competition_math/__init__.py +0 -5
  16. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  17. evalscope/benchmarks/data_adapter.py +114 -85
  18. evalscope/benchmarks/general_qa/__init__.py +0 -5
  19. evalscope/benchmarks/general_qa/general_qa_adapter.py +16 -19
  20. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  21. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +19 -98
  22. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  23. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -96
  24. evalscope/benchmarks/humaneval/__init__.py +0 -4
  25. evalscope/benchmarks/humaneval/humaneval_adapter.py +16 -117
  26. evalscope/benchmarks/mmlu/__init__.py +0 -5
  27. evalscope/benchmarks/mmlu/mmlu_adapter.py +26 -48
  28. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  29. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  30. evalscope/benchmarks/race/__init__.py +0 -5
  31. evalscope/benchmarks/race/race_adapter.py +25 -53
  32. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  33. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +24 -97
  34. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  35. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +23 -33
  36. evalscope/collections/__init__.py +3 -0
  37. evalscope/collections/evaluator.py +178 -0
  38. evalscope/collections/sampler.py +132 -0
  39. evalscope/collections/schema.py +122 -0
  40. evalscope/config.py +7 -5
  41. evalscope/constants.py +7 -28
  42. evalscope/evaluator/evaluator.py +66 -109
  43. evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
  44. evalscope/metrics/__init__.py +6 -0
  45. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  46. evalscope/metrics/math_accuracy.py +193 -50
  47. evalscope/metrics/metrics.py +7 -4
  48. evalscope/metrics/rouge_metric.py +13 -8
  49. evalscope/models/__init__.py +14 -1
  50. evalscope/models/base_adapter.py +52 -0
  51. evalscope/models/chat_adapter.py +138 -0
  52. evalscope/models/choice_adapter.py +211 -0
  53. evalscope/models/custom_adapter.py +67 -0
  54. evalscope/models/local_model.py +74 -0
  55. evalscope/models/model.py +141 -0
  56. evalscope/models/server_adapter.py +104 -0
  57. evalscope/run.py +37 -66
  58. evalscope/run_arena.py +1 -1
  59. evalscope/utils/__init__.py +1 -1
  60. evalscope/utils/chat_service.py +4 -3
  61. evalscope/utils/io_utils.py +8 -0
  62. evalscope/utils/logger.py +4 -0
  63. evalscope/utils/model_utils.py +10 -0
  64. evalscope/utils/utils.py +3 -25
  65. evalscope/version.py +2 -2
  66. {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/METADATA +32 -15
  67. {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/RECORD +75 -66
  68. tests/cli/test_collection.py +53 -0
  69. tests/cli/test_run.py +43 -1
  70. tests/rag/test_mteb.py +3 -2
  71. evalscope/models/api/__init__.py +0 -3
  72. evalscope/models/dummy_chat_model.py +0 -49
  73. evalscope/models/model_adapter.py +0 -525
  74. evalscope/models/openai_model.py +0 -103
  75. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  76. {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/LICENSE +0 -0
  77. {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/WHEEL +0 -0
  78. {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/entry_points.txt +0 -0
  79. {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/top_level.txt +0 -0
evalscope/collections/schema.py ADDED
@@ -0,0 +1,122 @@
+ import copy
+ import json
+ from dataclasses import asdict, dataclass, field
+ from typing import List, Union
+
+
+ @dataclass
+ class DatasetInfo:
+     name: str
+     weight: float = 1.0  # sample weight in each collection
+     task_type: str = ''
+     tags: List[str] = field(default_factory=list)
+     args: dict = field(default_factory=dict)
+
+     def get_data(self) -> dict:
+         from evalscope.benchmarks import Benchmark
+
+         benchmark_meta = Benchmark.get(self.name)
+
+         data_adapter = benchmark_meta.get_data_adapter(config=self.args)
+         data_dict = data_adapter.load(
+             dataset_name_or_path=benchmark_meta.dataset_id, subset_list=benchmark_meta.subset_list)
+         prompts = data_adapter.gen_prompts(data_dict)
+         return prompts
+
+
+ def flatten_weight(collection: 'CollectionSchema', base_weight=1):
+     total_weight = sum(dataset.weight for dataset in collection.datasets)
+     for dataset in collection.datasets:
+         current_weight = dataset.weight / total_weight * base_weight
+         if isinstance(dataset, CollectionSchema):
+             flatten_weight(dataset, current_weight)
+         else:
+             dataset.weight = current_weight
+
+
+ def flatten_tags(collection: 'CollectionSchema', parent_names=None):
+     if parent_names is None:
+         parent_names = []
+     current_names = parent_names + [collection.name]
+     for dataset in collection.datasets:
+         if isinstance(dataset, CollectionSchema):
+             flatten_tags(dataset, current_names)
+         else:
+             # Add all parent CollectionSchema names to the tags of each DatasetInfo
+             for name in current_names:
+                 if name not in dataset.tags:
+                     dataset.tags.append(name)
+
+
+ def flatten_datasets(collection: 'CollectionSchema') -> List[DatasetInfo]:
+     flat_datasets = []
+     for dataset in collection.datasets:
+         if isinstance(dataset, CollectionSchema):
+             flat_datasets.extend(flatten_datasets(dataset))
+         else:
+             flat_datasets.append(dataset)
+     return flat_datasets
+
+
+ @dataclass
+ class CollectionSchema:
+     name: str
+     weight: float = 1.0
+     datasets: List[Union[DatasetInfo, 'CollectionSchema']] = field(default_factory=list)
+
+     def __str__(self):
+         return json.dumps(self.to_dict(), ensure_ascii=False, indent=4)
+
+     def to_dict(self):
+         return {
+             'name':
+             self.name,
+             'weight':
+             self.weight,
+             'datasets':
+             [asdict(dataset) if isinstance(dataset, DatasetInfo) else dataset.to_dict() for dataset in self.datasets],
+         }
+
+     @classmethod
+     def from_dict(cls, data):
+         instance = cls(name=data.get('name', ''), weight=data.get('weight', 1))
+         for dataset in data.get('datasets', []):
+             if 'datasets' in dataset:
+                 instance.datasets.append(CollectionSchema.from_dict(dataset))
+             else:
+                 instance.datasets.append(DatasetInfo(**dataset))
+         return instance
+
+     def dump_json(self, file_path):
+         d = self.to_dict()
+         with open(file_path, 'w') as f:
+             json.dump(d, f, ensure_ascii=False, indent=4)
+
+     @classmethod
+     def from_json(cls, file_path):
+         with open(file_path, 'r') as f:
+             data = json.load(f)
+         return cls.from_dict(data)
+
+     def flatten(self) -> List[DatasetInfo]:
+         collection = copy.deepcopy(self)
+         flatten_tags(collection)
+         flatten_weight(collection)
+         return flatten_datasets(collection)
+
+
+ if __name__ == '__main__':
+     schema = CollectionSchema(
+         name='reasoning',
+         datasets=[
+             DatasetInfo(name='arc', weight=1, task_type='reasoning', tags=['en']),
+             DatasetInfo(name='ceval', weight=1, task_type='reasoning', tags=['zh'], args={'subset_list': ['logic']})
+         ])
+     print(schema)
+     print(schema.flatten())
+     schema.dump_json('outputs/schema.json')
+
+     schema = CollectionSchema.from_json('outputs/schema.json')
+     print(schema)
+     for dataset in schema.flatten():
+         print(dataset)
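For orientation, the net effect of flatten() in the new schema module: flatten_weight normalizes sibling weights so each level's weights sum to the parent's share (nested datasets end up with products of normalized weights), and flatten_tags stamps every enclosing collection name into each DatasetInfo.tags. A minimal sketch, not part of the diff, using only the classes added above:

from evalscope.collections.schema import CollectionSchema, DatasetInfo

schema = CollectionSchema(name='all', datasets=[
    CollectionSchema(name='english', weight=3, datasets=[
        DatasetInfo(name='arc'),
        DatasetInfo(name='gsm8k'),
    ]),
    DatasetInfo(name='ceval', weight=1),
])

for info in schema.flatten():
    # 'arc' and 'gsm8k' each end up with 3/4 * 1/2 = 0.375, 'ceval' with 0.25,
    # and tags now carry the enclosing collection names ('all', 'english').
    print(info.name, round(info.weight, 3), info.tags)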
evalscope/config.py CHANGED
@@ -31,7 +31,7 @@ DEFAULT_GENERATION_CONFIG = {
  @dataclass
  class TaskConfig:
      # Model-related arguments
-     model: Union[str, CustomModel, None] = None
+     model: Union[str, 'CustomModel', None] = None
      model_id: Optional[str] = None
      model_args: Optional[Dict] = field(default_factory=lambda: DEFAULT_MODEL_ARGS | {})
 
@@ -40,8 +40,8 @@ class TaskConfig:
      chat_template: Optional[str] = None
 
      # Dataset-related arguments
-     datasets: Optional[List[str]] = None
-     dataset_args: Optional[Dict] = field(default_factory=dict)
+     datasets: List[str] = field(default_factory=list)
+     dataset_args: Dict = field(default_factory=dict)
      dataset_dir: str = DEFAULT_DATASET_CACHE_DIR
      dataset_hub: str = HubType.MODELSCOPE
 
@@ -64,7 +64,9 @@ class TaskConfig:
      # Debug and runtime mode arguments
      debug: bool = False
      dry_run: bool = False
-     seed: int = 42
+     seed: Optional[int] = 42
+     api_url: Optional[str] = None  # Only used for server model
+     api_key: Optional[str] = 'EMPTY'  # Only used for server model
 
      def __post_init__(self):
          if (not self.model_id) and self.model:
@@ -74,7 +76,6 @@ class TaskConfig:
              self.model_id = os.path.basename(self.model).rstrip(os.sep)
 
      def to_dict(self):
-         # Note: to avoid serialization error for some model instance
          return self.__dict__
 
      def __str__(self):
@@ -130,6 +131,7 @@ class TaskConfig:
                  continue
 
              task.model = custom_model
+             task.model_args = custom_model.config
              task.model_id = type(custom_model).__name__
              res_list.append(task)
 
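For context on the new api_url/api_key fields: per the inline comments above, they are only consulted when evaluating a served model. A rough sketch of a config pointing at an OpenAI-compatible endpoint (the model name, URL, and dataset args below are placeholders, not values taken from the diff):

from evalscope.config import TaskConfig

task = TaskConfig(
    model='qwen2.5-7b-instruct',         # placeholder name of the served model
    datasets=['gsm8k'],                  # now defaults to [] instead of None
    api_url='http://127.0.0.1:8000/v1',  # placeholder endpoint; only used for server model
    api_key='EMPTY',
)
print(task.model_id)  # __post_init__ derives model_id from the model name when unset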
evalscope/constants.py CHANGED
@@ -135,34 +135,13 @@ class EvalStage:
  class EvalType:
 
      CUSTOM = 'custom'
-     CHECKPOINT = 'checkpoint'
+     CHECKPOINT = 'checkpoint'  # native model checkpoint
+     SERVICE = 'service'  # model service
 
 
  class EvalBackend:
-
-     class _Backend:
-         # compatible with old version, set 'value'
-
-         def __init__(self, value):
-             self._value = value
-
-         @property
-         def value(self):
-             return self._value
-
-         def __str__(self):
-             return self._value
-
-         def __repr__(self):
-             return f"'{self._value}'"
-
-         def __eq__(self, other):
-             if isinstance(other, str):
-                 return self._value == other
-             return NotImplemented
-
-     NATIVE = _Backend('Native')
-     OPEN_COMPASS = _Backend('OpenCompass')
-     VLM_EVAL_KIT = _Backend('VLMEvalKit')
-     RAG_EVAL = _Backend('RAGEval')
-     THIRD_PARTY = _Backend('ThirdParty')
+     NATIVE = 'Native'
+     OPEN_COMPASS = 'OpenCompass'
+     VLM_EVAL_KIT = 'VLMEvalKit'
+     RAG_EVAL = 'RAGEval'
+     THIRD_PARTY = 'ThirdParty'
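The removed _Backend wrapper existed mainly so the constants compared and printed like plain strings; with bare string values those behaviors hold by definition, so for example EvalBackend.NATIVE == 'Native' still passes. Any caller that still reads the old .value attribute would need a small shim along these lines (a hypothetical sketch, not part of the release):

from evalscope.constants import EvalBackend

assert EvalBackend.NATIVE == 'Native'   # previously routed through _Backend.__eq__

def backend_value(backend) -> str:
    # Hypothetical compatibility helper: accepts either the old wrapper or a plain string.
    return getattr(backend, 'value', backend)

print(backend_value(EvalBackend.OPEN_COMPASS))  # 'OpenCompass'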
evalscope/evaluator/evaluator.py CHANGED
@@ -10,9 +10,8 @@ from typing import Any, Dict, List, Optional, Union
 
  from evalscope.benchmarks import DataAdapter
  from evalscope.config import TaskConfig
- from evalscope.constants import (DEFAULT_DATASET_CACHE_DIR, AnswerKeys, DumpMode, EvalStage, EvalType, HubType,
-                                  ReviewKeys)
- from evalscope.models.model_adapter import BaseModelAdapter, CustomModelAdapter
+ from evalscope.constants import AnswerKeys, DumpMode, EvalStage, ReviewKeys
+ from evalscope.models import BaseModelAdapter, CustomModelAdapter
  from evalscope.tools.combine_reports import gen_table
  from evalscope.utils import dict_torch_dtype_to_str, gen_hash
  from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
@@ -30,73 +29,63 @@ class Evaluator(object):
          if the dataset is a local path, e.g. /path/to/your_dataset_name,
          then the task name will be the basename of the path, which is `your_dataset_name`.
          data_adapter: DataAdapter, the data adapter for the dataset.
-         subset_list: list, the subset list for the dataset.
          model_adapter: BaseModelAdapter, the model adapter for the model.
-         use_cache: str, path to local cache. Default: None
-         outputs_dir: OutputsStructure, the outputs dir. Default: None
-         datasets_dir: str, the datasets dir. Default: DEFAULT_ROOT_CACHE_DIR
-         datasets_hub: str, the datasets hub. `Local`, `ModelScope` or `HuggingFace`. Default: 'ModelScope'
-         stage: str, the stage of evaluation. `all` or `infer` or `review`. Default: 'all'
-         eval_type: str, the evaluation type. `checkpoint` or `service` or `custom`. Default: 'checkpoint'
-         overall_task_cfg: dict, the overall task config. Default: None
+         outputs: OutputsStructure, the outputs dir. Default: None
+         task_cfg: TaskConfig, the overall task config. Default: None
          **kwargs: kwargs.
      """
 
      def __init__(self,
                   dataset_name_or_path: str,
                   data_adapter: DataAdapter,
-                  subset_list: Optional[list] = None,
-                  model_adapter: Optional[BaseModelAdapter] = None,
-                  use_cache: Optional[str] = None,
-                  outputs: Optional[OutputsStructure] = None,
-                  datasets_dir: Optional[str] = DEFAULT_DATASET_CACHE_DIR,
-                  datasets_hub: Optional[str] = HubType.MODELSCOPE,
-                  stage: Optional[str] = EvalStage.ALL,
-                  eval_type: Optional[str] = EvalType.CHECKPOINT,
-                  overall_task_cfg: Optional[TaskConfig] = None,
+                  model_adapter: BaseModelAdapter,
+                  outputs: OutputsStructure = None,
+                  task_cfg: TaskConfig = None,
                   **kwargs):
 
          self.dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
          self.dataset_name = os.path.basename(self.dataset_name_or_path.rstrip(os.sep)).split('.')[0]
-         self.model_name = overall_task_cfg.model_id
+         self.model_name = task_cfg.model_id
          self.custom_task_name = f'{self.model_name}_{self.dataset_name}'
 
-         self.datasets_dir = os.path.expanduser(datasets_dir)
-         self.kwargs = kwargs
          self.data_adapter = data_adapter
          self.model_adapter = model_adapter
-         self.eval_type = eval_type
-         self.stage = stage
-         self.use_cache = use_cache
-         self.overall_task_cfg = overall_task_cfg
-         if isinstance(self.model_adapter, CustomModelAdapter):
-             self.overall_task_cfg.model_args = self.model_adapter.custom_model.config
-
-         self.model_cfg = self.model_adapter.model_cfg
-
+         self.model_cfg = model_adapter.model_cfg
+         self.eval_type = task_cfg.eval_type
+         self.dataset_hub = task_cfg.dataset_hub
+         self.stage = task_cfg.stage
+         self.use_cache = task_cfg.use_cache
+         self.task_cfg = task_cfg
          # Deal with the output paths
          self.outputs_structure = outputs
 
-         # Load dataset
-         self.dataset = self.data_adapter.load(
-             dataset_name_or_path=dataset_name_or_path,
-             subset_list=subset_list,
-             work_dir=self.datasets_dir,
-             datasets_hub=datasets_hub,
-             **kwargs)
-
-         # Get prompts from dataset
-         # TODO: support sampler
-         self.prompts = self.data_adapter.gen_prompts(data_dict=self.dataset)
-         del self.dataset
-
-     def _pred_answer(self, input_d: dict, infer_cfg: dict, subset_name: str, answer_id: str = None) -> dict:
+         self.kwargs = kwargs
 
-         ans: dict = self.model_adapter.predict(inputs=input_d, infer_cfg=infer_cfg)
-         ans[AnswerKeys.ANSWER_ID] = answer_id
-         ans[AnswerKeys.SUBSET_NAME] = subset_name
+     def load_dataset(self):
+         dataset = self.data_adapter.load(
+             dataset_name_or_path=self.dataset_name_or_path,
+             subset_list=self.data_adapter.subset_list,
+             work_dir=os.path.expanduser(self.task_cfg.dataset_dir),
+             datasets_hub=self.dataset_hub,
+             **self.kwargs)
 
-         return ans
+         # Get prompts from dataset
+         prompts = self.data_adapter.gen_prompts(data_dict=dataset)
+         return prompts
+
+     def _generate_answer_id(self, model_cfg, input_d, infer_cfg):
+         model_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(model_cfg).items())), ensure_ascii=False)
+         input_prompt_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(input_d).items())), ensure_ascii=False)
+         infer_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
+         return 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
+
+     def _process_answer(self, answer_d, input_d, subset_name, answer_id):
+         answer_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
+         answer_d[AnswerKeys.ANSWER_ID] = answer_id
+         answer_d[AnswerKeys.SUBSET_NAME] = subset_name
+         answer_d[AnswerKeys.RAW_INPUT] = input_d[AnswerKeys.RAW_INPUT]
+         answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
+         return answer_d
 
      def get_answers(self,
                      subset_name: str,
@@ -147,57 +136,24 @@
              resp_answers_list: List[Dict[str, Any]] = self.model_adapter.predict(
                  inputs=prompts_list, infer_cfg=infer_cfg)
 
-             assert len(prompts_list) == len(resp_answers_list), \
-                 f'Length of prompts_list({len(prompts_list)}) != Length of resp_answers_list({len(resp_answers_list)})'
-
-             for in_d, resp_d in zip(prompts_list, resp_answers_list):
-
-                 # Gen answer_id (concat: model_cfg + input_prompt + infer_cfg)
-                 model_cfg_str = json.dumps(
-                     OrderedDict(sorted(dict_torch_dtype_to_str(self.model_adapter.model_cfg).items())),
-                     ensure_ascii=False)
-                 input_prompt_str = json.dumps(
-                     OrderedDict(sorted(dict_torch_dtype_to_str(in_d).items())), ensure_ascii=False)
-                 infer_cfg_str = json.dumps(
-                     OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
-                 answer_id = 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
-
-                 resp_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
-                 resp_d[AnswerKeys.ANSWER_ID] = answer_id
-                 resp_d[AnswerKeys.SUBSET_NAME] = subset_name
-                 resp_d[AnswerKeys.RAW_INPUT] = in_d[AnswerKeys.RAW_INPUT]
-                 resp_d[AnswerKeys.ORIGIN_PROMPT] = in_d
-
-                 answers_list.append(resp_d)
-                 dump_jsonl_data(resp_d, pred_file_path, dump_mode=DumpMode.APPEND)
+             for input_prompt, answer_d in zip(prompts_list, resp_answers_list):
+                 answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+                 processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
+                 answers_list.append(processed_answer)
+                 dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)
 
          else:
              for input_prompt in tqdm(prompts_list, total=len(prompts_list), desc=f'Predicting({subset_name}): '):
-
-                 # Gen answer_id (concat: model_cfg + input_prompt + infer_cfg)
-                 model_cfg_str = json.dumps(
-                     OrderedDict(sorted(dict_torch_dtype_to_str(self.model_adapter.model_cfg).items())),
-                     ensure_ascii=False)
-                 input_prompt_str = json.dumps(
-                     OrderedDict(sorted(dict_torch_dtype_to_str(input_prompt).items())), ensure_ascii=False)
-                 infer_cfg_str = json.dumps(
-                     OrderedDict(sorted(dict_torch_dtype_to_str(infer_cfg).items())), ensure_ascii=False)
-                 answer_id = 'answer-' + gen_hash(model_cfg_str + input_prompt_str + infer_cfg_str)
-
-                 # Get answers
-                 answer_d: dict = self._pred_answer(
-                     input_d=input_prompt, infer_cfg=infer_cfg, subset_name=subset_name, answer_id=answer_id)
-
-                 answer_d[AnswerKeys.MODEL_SPEC] = self.model_adapter.model_cfg
-                 answer_d[AnswerKeys.RAW_INPUT] = input_prompt[AnswerKeys.RAW_INPUT]
-                 answer_d[AnswerKeys.ORIGIN_PROMPT] = input_prompt
+                 answer_d: dict = self.model_adapter.predict(inputs=input_prompt, infer_cfg=infer_cfg)
+                 answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+                 processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
 
                  if debug:
                      logger.info(f'**input_prompt: {json.dumps(input_prompt, ensure_ascii=False)} \n')
-                     logger.info(f'**predicted ans: {json.dumps(answer_d, ensure_ascii=False)} \n')
+                     logger.info(f'**predicted ans: {json.dumps(processed_answer, ensure_ascii=False)} \n')
 
-                 answers_list.append(answer_d)
-                 dump_jsonl_data(answer_d, pred_file_path, dump_mode=DumpMode.APPEND)
+                 answers_list.append(processed_answer)
+                 dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)
 
          logger.info(f'Dump predictions to {pred_file_path}.')
          return answers_list
@@ -241,6 +197,19 @@
 
          return review_res
 
+     def _generate_review_id(self, answer_d):
+         # Gen review_id (concat: answer_id + reviewer_spec)
+         answer_id = answer_d[AnswerKeys.ANSWER_ID]
+         reviewer_spec = {
+             'metric': [metric_d['name'] for metric_d in self.data_adapter.metric_list],
+             'reviewer': ['Evaluator'],
+             'revision': ['default']
+         }
+         reviewer_spec_str = json.dumps(
+             OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
+         review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)
+         return review_id, reviewer_spec
+
      def get_reviews(self, subset_name: str, answers_list: List[dict], debug: bool = False, **kwargs) -> list:
          """
          Get reviews from answers.
@@ -264,19 +233,7 @@
              logger.warning(f'Ignore use_cache={self.use_cache}, updating the review file: {review_file_path} ...')
 
          for answer_d in tqdm(answers_list, total=len(answers_list), desc=f'Reviewing({subset_name}): '):
-
-             # Gen review_id (concat: answer_id + reviewer_spec)
-             answer_id = answer_d[AnswerKeys.ANSWER_ID]
-
-             reviewer_spec: dict = {
-                 'metric': [metric_d['name'] for metric_d in self.data_adapter.metric_list],
-                 'reviewer': ['Evaluator'],
-                 'revision': ['default']
-             }
-             reviewer_spec_str = json.dumps(
-                 OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
-             review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)
-
+             review_id, reviewer_spec = self._generate_review_id(answer_d)
              # Get review
              review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
 
@@ -284,7 +241,6 @@
                  logger.info(review_d)
 
              reviews_list.append(review_d)
-
             # Dump reviews
             dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND)
 
@@ -380,7 +336,8 @@
          stage_answers_dict = {}
          stage_reviews_dict = {}
 
-         for subset_name, prompts_list in self.prompts.items():
+         prompts = self.load_dataset()
+         for subset_name, prompts_list in prompts.items():
              limit = kwargs.get('limit', len(prompts_list))
              prompts_list = prompts_list[:limit]
 
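To make the refactor above easier to follow: _generate_answer_id just factors out the old inline logic, so the id is still a deterministic hash of the serialized model config, prompt, and inference config, which is what lets cached runs line up with fresh ones. A simplified standalone sketch (it skips the dict_torch_dtype_to_str normalization, and the gen_hash body here is an assumption; evalscope imports its own helper from evalscope.utils):

import hashlib
import json
from collections import OrderedDict

def gen_hash(text: str) -> str:
    # Stand-in for evalscope.utils.gen_hash; the real implementation may differ.
    return hashlib.md5(text.encode('utf-8')).hexdigest()

def generate_answer_id(model_cfg: dict, input_d: dict, infer_cfg: dict) -> str:
    # Same shape as Evaluator._generate_answer_id: sort keys, serialize, hash the concatenation.
    parts = [json.dumps(OrderedDict(sorted(d.items())), ensure_ascii=False)
             for d in (model_cfg, input_d, infer_cfg)]
    return 'answer-' + gen_hash(''.join(parts))

print(generate_answer_id({'model': 'demo'}, {'data': ['1+1=']}, {'max_new_tokens': 16}))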
evalscope/evaluator/reviewer/auto_reviewer.py CHANGED
@@ -8,10 +8,10 @@ import sys
  import time
  from abc import ABC, abstractmethod
  from functools import partial
- from typing import Any, List
+ from typing import Any, List, Tuple
 
  from evalscope.constants import ArenaMode, EvalConfigKeys, FnCompletionParser, PositionBiasMitigation
- from evalscope.models.openai_model import OpenAIModel
+ from evalscope.models.model import OpenAIModel
  from evalscope.utils import completion_parsers, random_seeded_choice
  from evalscope.utils.arena_utils import get_battle_pairs, merge_ques_ans, shuffle_pairwise_preferences
  from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list
@@ -240,7 +240,15 @@ class AutoReviewerGpt4(BaseReviewer):
                                      review_text=review_text)
          return review_result
 
-     def _get_review_pair(self, model_a, model_b, question, category, ans1, ans2, dry_run=False, **kwargs) -> (str, Any):
+     def _get_review_pair(self,
+                          model_a,
+                          model_b,
+                          question,
+                          category,
+                          ans1,
+                          ans2,
+                          dry_run=False,
+                          **kwargs) -> Tuple[str, Any]:
          input_msg = dict(ques=question, category=category, ans1=ans1, ans2=ans2)
 
          if self.reference_list:
@@ -263,7 +271,7 @@
              result = (result, None)
          return review_text, *result
 
-     def _get_review_single(self, model, question, category, answer, dry_run=False, **kwargs) -> (str, Any):
+     def _get_review_single(self, model, question, category, answer, dry_run=False, **kwargs) -> Tuple[str, Any]:
          input_msg = dict(ques=question, category=category, ans1=answer)
 
          if self.reference_list:
evalscope/metrics/__init__.py CHANGED
@@ -1 +1,7 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
+ from evalscope.metrics.metrics import bleu_ngram_one_sample, exact_match, weighted_mean
+ from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
+
+ WeightedAverageAccuracy = {'name': 'WeightedAverageAccuracy', 'object': weighted_mean}
+ WeightedAverageBLEU = {'name': 'WeightedAverageBLEU', 'object': weighted_mean}
+ Pass1 = {'name': 'Pass@1', 'object': weighted_mean}
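These module-level dicts act as a tiny metric registry: the evaluator above only reads metric_d['name'] from data_adapter.metric_list when it builds the reviewer spec. A hypothetical sketch of how an adapter could declare its metrics with them (MyAdapter is a stand-in for illustration, not an evalscope class):

from evalscope.metrics import Pass1, WeightedAverageAccuracy

class MyAdapter:
    # Only the attribute the reviewer-spec code consults is shown here.
    metric_list = [WeightedAverageAccuracy, Pass1]

print([m['name'] for m in MyAdapter.metric_list])  # ['WeightedAverageAccuracy', 'Pass@1']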
evalscope/metrics/rouge_metric.py CHANGED
@@ -55,7 +55,7 @@ try:
          os.system(f'wget --timeout=10 --tries=3 -P {nltk_dir} {punkt_tab_url}')
          os.system(f'unzip {punkt_path} -d {nltk_dir}')
      else:
-         logger.info(f'{punkt_path} already exists, skipping download')
+         logger.debug(f'{punkt_path} already exists, skipping download')
  except Exception as e:
      logger.error(f'Try to download punkt_tab.zip for nltk failed: {e}')