evalscope 0.10.1__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55)
  1. evalscope/arguments.py +1 -0
  2. evalscope/benchmarks/aime24/__init__.py +0 -0
  3. evalscope/benchmarks/aime24/aime24_adapter.py +49 -0
  4. evalscope/benchmarks/arc/arc_adapter.py +5 -7
  5. evalscope/benchmarks/bbh/bbh_adapter.py +17 -9
  6. evalscope/benchmarks/benchmark.py +2 -2
  7. evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
  8. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
  9. evalscope/benchmarks/competition_math/competition_math_adapter.py +34 -23
  10. evalscope/benchmarks/data_adapter.py +18 -12
  11. evalscope/benchmarks/data_collection/__init__.py +0 -0
  12. evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
  13. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  14. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +129 -0
  15. evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -6
  16. evalscope/benchmarks/gpqa/gpqa_adapter.py +26 -8
  17. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +8 -13
  18. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
  19. evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
  20. evalscope/benchmarks/ifeval/ifeval_adapter.py +14 -13
  21. evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
  22. evalscope/benchmarks/math_500/__init__.py +0 -0
  23. evalscope/benchmarks/math_500/math_500_adapter.py +49 -0
  24. evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
  25. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +27 -15
  26. evalscope/benchmarks/race/race_adapter.py +3 -3
  27. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
  28. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -8
  29. evalscope/collections/evaluator.py +103 -39
  30. evalscope/collections/sampler.py +2 -1
  31. evalscope/collections/schema.py +1 -2
  32. evalscope/config.py +1 -0
  33. evalscope/evaluator/evaluator.py +78 -64
  34. evalscope/metrics/math_parser.py +526 -0
  35. evalscope/metrics/metrics.py +16 -1
  36. evalscope/metrics/named_metrics.py +31 -7
  37. evalscope/models/chat_adapter.py +69 -49
  38. evalscope/models/choice_adapter.py +52 -45
  39. evalscope/models/custom_adapter.py +2 -2
  40. evalscope/models/local_model.py +4 -0
  41. evalscope/models/server_adapter.py +28 -34
  42. evalscope/report/app.py +30 -15
  43. evalscope/run.py +10 -7
  44. evalscope/utils/chat_service.py +2 -2
  45. evalscope/utils/io_utils.py +1 -1
  46. evalscope/version.py +2 -2
  47. {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/METADATA +14 -5
  48. {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/RECORD +53 -46
  49. tests/cli/test_run.py +93 -16
  50. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  51. evalscope/metrics/math_accuracy.py +0 -200
  52. {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/LICENSE +0 -0
  53. {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/WHEEL +0 -0
  54. {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/entry_points.txt +0 -0
  55. {evalscope-0.10.1.dist-info → evalscope-0.11.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/race/race_adapter.py CHANGED
@@ -4,7 +4,7 @@ import os
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import AverageAccuracy, exact_match
+from evalscope.metrics import exact_match
 from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser
 from evalscope.utils.io_utils import jsonl_to_list
@@ -20,7 +20,7 @@ logger = get_logger()
     dataset_id='modelscope/race',
     model_adapter=MultiChoiceModelAdapter,
     subset_list=['high', 'middle'],
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=3,
     train_split='train',
     eval_split='test',
@@ -82,7 +82,7 @@ class RACEAdapter(DataAdapter):
 
         full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
 
-        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py CHANGED
@@ -6,7 +6,6 @@ import os
 from evalscope.benchmarks import Benchmark
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import AverageAccuracy
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils import get_logger
 
@@ -20,7 +19,7 @@ logger = get_logger()
     dataset_id='modelscope/trivia_qa',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
    few_shot_num=5,
    train_split='dev',
    eval_split='test',
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py CHANGED
@@ -9,9 +9,8 @@ from typing import List
 from evalscope.benchmarks import Benchmark
 from evalscope.benchmarks.data_adapter import DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import AverageAccuracy
 from evalscope.models import ContinuationLogitsModelAdapter
-from evalscope.utils import get_logger, normalize_score
+from evalscope.utils import get_logger
 
 # flake8: noqa
 
@@ -25,7 +24,7 @@ logger = get_logger()
     dataset_id='modelscope/truthful_qa',
     model_adapter=ContinuationLogitsModelAdapter,
     subset_list=['multiple_choice'],
-    metric_list=[AverageAccuracy],
+    metric_list=['AverageAccuracy'],
     few_shot_num=0,
     train_split=None,
     eval_split='validation',
@@ -284,8 +283,9 @@ class TruthfulQaAdapter(DataAdapter):
                 logger.error(f'** Unknown review_res: {review_res_d}')
 
         # To get mc2 score
-        return [{
-            'metric_name': self.metric_list[0].name,
-            'score': self.metric_list[0].object(mc2_list),
-            'num': len(mc2_list)
-        }]
+        # return [{
+        #     'metric_name': self.metric_list[0].name,
+        #     'score': self.metric_list[0].object(mc2_list),
+        #     'num': len(mc2_list)
+        # }]
+        return super().compute_metric(mc2_list)
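The pattern across these adapter hunks is the same: benchmarks now declare metrics by name string ('AverageAccuracy') instead of importing metric objects, and per-adapter aggregation is delegated to the base `DataAdapter.compute_metric`. The sketch below only illustrates how a name-keyed registry could produce the same result shape; `METRIC_REGISTRY` and `compute_metric` here are hypothetical stand-ins, not the contents of evalscope/metrics/named_metrics.py.

```python
# Illustrative sketch only: METRIC_REGISTRY is a hypothetical name,
# not evalscope's actual metric registry.
from statistics import mean
from typing import Callable, Dict, List

METRIC_REGISTRY: Dict[str, Callable[[List[float]], float]] = {
    'AverageAccuracy': lambda scores: mean(scores) if scores else 0.0,
}


def compute_metric(metric_names: List[str], scores: List[float]) -> List[dict]:
    # One result dict per metric name, mirroring the adapters' return shape.
    return [{
        'metric_name': name,
        'score': METRIC_REGISTRY[name](scores),
        'num': len(scores),
    } for name in metric_names]


print(compute_metric(['AverageAccuracy'], [1.0, 0.0, 1.0]))
```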
evalscope/collections/evaluator.py CHANGED
@@ -2,14 +2,15 @@ import json
 import os
 import pandas as pd
 from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from tabulate import tabulate
 from tqdm import tqdm
 from typing import List
 
-from evalscope.benchmarks import Benchmark
+from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.collections.sampler import DatasetEntry
 from evalscope.config import TaskConfig
-from evalscope.constants import DataCollection, DumpMode
+from evalscope.constants import AnswerKeys, DumpMode, EvalType
 from evalscope.evaluator import Evaluator
 from evalscope.models import get_local_model, initialize_model_adapter
 from evalscope.report import ReportGenerator
@@ -29,11 +30,16 @@ class SimpleEvaluator(Evaluator):
             task_cfg=task_cfg,
             outputs=outputs)
 
-    def get_answer(self, input_prompt, subset_name, infer_cfg) -> dict:
-        answer_d: dict = self.model_adapter.predict(inputs=input_prompt, infer_cfg=infer_cfg)
-        answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
-        processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
-        return processed_answer
+    def get_answer(self, samples, infer_cfg) -> List[dict]:
+        input_prompts = [sample.prompt for sample in samples]
+        subset_name = samples[0].subset_name
+        answers_list = []
+        answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
+        for answer_d, input_prompt in zip(answer_ds, input_prompts):
+            answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+            processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
+            answers_list.append(processed_answer)
+        return answers_list, samples
 
     def get_review(self, answer_d) -> dict:
         review_id, reviewer_spec = self._generate_review_id(answer_d)
@@ -42,38 +48,50 @@ class SimpleEvaluator(Evaluator):
 
     def get_score(self, review_d) -> float:
         metric_score: List[dict] = self.compute_metrics(reviews_list=[review_d])
-        # use the first metric by default
-        score = metric_score[0]['score']
-        return score
+        return metric_score
 
 
 class EvaluatorCollection:
 
-    def __init__(self, task_cfg: TaskConfig, outputs: OutputsStructure):
+    def __init__(self, task_cfg: TaskConfig, data_adapter: DataAdapter, outputs: OutputsStructure):
         self.task_cfg = task_cfg
+        self.data_adapter = data_adapter
         self.outputs = outputs
         self.model = get_local_model(task_cfg)
+
         self.dataset, self.dataset_name = self.load()
-        self.dataset_name_map, self.dataset_id_map = self._parse_dataset()
+        self.dataset_name_map = EvaluatorCollection._init_name_map(self.dataset)
+        self.dataset_id_map = EvaluatorCollection._init_id_map(self.dataset)
         self.evaluators = self._initialize_evaluators()
 
     def load(self) -> tuple[list[DatasetEntry], str]:
-        dataset_path = self.task_cfg.dataset_args[DataCollection.NAME]['local_path']
-        dataset_name = os.path.basename(dataset_path).split('.')[0]
-        raw_dataset = jsonl_to_list(dataset_path)
+        dataset_name = os.path.basename(self.data_adapter.dataset_id).split('.')[0]
+        raw_dataset = self.data_adapter.load()
+        # limit the dataset
+        if self.task_cfg.limit:
+            raw_dataset = raw_dataset[:self.task_cfg.limit]
+        # index dataset
         datasets = []
         for sample in raw_dataset:
+            sample['prompt'].update({'index': sample['index']})
             datasets.append(DatasetEntry(**sample))
+
         return datasets, dataset_name
 
-    def _parse_dataset(self):
+    @staticmethod
+    def _init_name_map(dataset):
         dataset_name_map = defaultdict(lambda: defaultdict(list))
-        dataset_id_map = {}
-        for sample in self.dataset:
+        for sample in dataset:
             dataset_name, subset_name = sample.dataset_name, sample.subset_name
             dataset_name_map[dataset_name][subset_name].append(sample.index)
+        return dataset_name_map
+
+    @staticmethod
+    def _init_id_map(dataset):
+        dataset_id_map = {}
+        for sample in dataset:
             dataset_id_map[sample.index] = sample
-        return dataset_name_map, dataset_id_map
+        return dataset_id_map
 
     def _initialize_evaluators(self):
         evaluators = {}
@@ -93,15 +111,16 @@ class EvaluatorCollection:
             for subset_name, ids in data_map.items():
                 for _id in ids:
                     row_data: DatasetEntry = self.dataset_id_map[_id]
-                    score = scores[_id]
-                    data.append(
-                        dict(
-                            task_type=row_data.task_type,
-                            categories=tuple(row_data.categories),
-                            dataset_name=dataset_name,
-                            subset_name=subset_name,
-                            tags=row_data.tags,
-                            score=score))
+                    for metric in scores[_id]:
+                        data.append(
+                            dict(
+                                task_type=row_data.task_type,
+                                categories=tuple(row_data.categories),
+                                dataset_name=dataset_name,
+                                subset_name=subset_name,
+                                tags=row_data.tags,
+                                metric=metric['metric_name'],
+                                score=metric['score']))
             return pd.DataFrame(data)
 
         def aggregate_and_sort(df, group_by_cols):
@@ -117,13 +136,13 @@ class EvaluatorCollection:
         df = get_dataframe(scores)
 
         # multi-level aggregation
-        subset_report_df = aggregate_and_sort(df, ['task_type', 'dataset_name', 'subset_name'])
-        dataset_report_df = aggregate_and_sort(df, ['task_type', 'dataset_name'])
-        task_report_df = aggregate_and_sort(df, ['task_type'])
+        subset_report_df = aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name', 'subset_name'])
+        dataset_report_df = aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name'])
+        task_report_df = aggregate_and_sort(df, ['task_type', 'metric'])
 
         # explode tags to multiple rows
         df_exploded_tags = df.explode('tags')
-        tag_report_df = aggregate_and_sort(df_exploded_tags, ['tags'])
+        tag_report_df = aggregate_and_sort(df_exploded_tags, ['tags', 'metric'])
 
         # process multi-level categories
         df_categories = df.copy()
@@ -132,7 +151,8 @@ class EvaluatorCollection:
         for level in range(max_depth):
             df_categories[f'category{level}'] = df_categories['categories'].apply(lambda x: x[level]
                                                                                   if len(x) > level else '')
-        category_report_df = aggregate_and_sort(df_categories, [f'category{level}' for level in range(max_depth)])
+        category_report_df = aggregate_and_sort(df_categories,
+                                                [f'category{level}' for level in range(max_depth)] + ['metric'])
 
         # convert to dict format
         report_dict = {
@@ -155,16 +175,60 @@ class EvaluatorCollection:
         with open(report_file_path, 'w', encoding='utf-8') as f:
             json.dump(report.to_dict(), f, ensure_ascii=False, indent=4)
 
+    def _filter_answer(self, pred_file_path):
+        answer_dict = defaultdict(dict)
+        if self.task_cfg.use_cache and os.path.exists(pred_file_path):
+            answers_list = jsonl_to_list(pred_file_path)
+            indices = set()
+            for answer in answers_list:
+                index = answer[AnswerKeys.ORIGIN_PROMPT].get('index')
+                answer_dict[index] = answer
+                indices.add(index)
+            data = []
+            for sample in self.dataset:
+                if sample.index not in indices:
+                    data.append(sample)
+            data_map = self._init_name_map(data)
+
+            return answer_dict, data, data_map
+        return answer_dict, self.dataset, self.dataset_name_map
+
     def get_answers(self):
         pred_file_path = os.path.join(self.outputs.predictions_dir, self.task_cfg.model_id,
                                       f'{self.dataset_name}.jsonl')
         os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)
-        answers = defaultdict(dict)
-        for sample in tqdm(self.dataset, desc='Getting answers'):
-            evaluator = self.evaluators[sample.dataset_name]
-            answer_d = evaluator.get_answer(sample.prompt, sample.subset_name, self.task_cfg.generation_config)
-            answers[sample.index] = answer_d
-            dump_jsonl_data(answer_d, pred_file_path, dump_mode=DumpMode.APPEND)
+
+        answers, dataset, dataset_name_map = self._filter_answer(pred_file_path)
+
+        eval_batch_size = self.task_cfg.eval_batch_size
+        with tqdm(total=len(dataset), desc='Getting answers') as pbar:
+            if self.task_cfg.eval_type == EvalType.SERVICE:
+                with ThreadPoolExecutor(max_workers=eval_batch_size) as executor:
+                    futures = []
+                    for sample in dataset:
+                        evaluator = self.evaluators[sample.dataset_name]
+                        futures.append(executor.submit(evaluator.get_answer, [sample], self.task_cfg.generation_config))
+                    for future in as_completed(futures):
+                        answer_list, samples = future.result()
+                        answers[samples[0].index] = answer_list[0]
+                        dump_jsonl_data(answer_list, pred_file_path, dump_mode=DumpMode.APPEND)
+                        pbar.update(1)
+            else:
+                for dataset_name, data_map in dataset_name_map.items():
+                    # get evaluator for the dataset
+                    evaluator = self.evaluators[dataset_name]
+                    for subset_name, ids in data_map.items():
+                        for i in range(0, len(ids), eval_batch_size):
+                            # get batch samples
+                            batch_ids = ids[i:i + eval_batch_size]
+                            batch_samples = [self.dataset_id_map[_id] for _id in batch_ids]
+                            answer_list, _ = evaluator.get_answer(batch_samples, self.task_cfg.generation_config)
+                            # update answers
+                            for j, _id in enumerate(batch_ids):
+                                answers[_id] = answer_list[j]
+                            dump_jsonl_data(answer_list, pred_file_path, dump_mode=DumpMode.APPEND)
+
+                            pbar.update(len(batch_ids))
         return answers
 
     def get_reviews(self, answers):
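In EvaluatorCollection, `scores[_id]` is now a list of per-metric dicts, so the report DataFrame carries one row per (sample, metric) and every aggregation level groups by 'metric' as well. The body of `aggregate_and_sort` is not part of this diff; the snippet below is only a hedged stand-in that groups by the same columns and averages the score, to illustrate the shape of the data.

```python
# Hedged stand-in for aggregate_and_sort, whose body is not shown in this diff.
import pandas as pd

rows = [
    dict(task_type='qa', metric='AverageAccuracy', dataset_name='arc', subset_name='ARC-c', score=1.0),
    dict(task_type='qa', metric='AverageAccuracy', dataset_name='arc', subset_name='ARC-c', score=0.0),
    dict(task_type='math', metric='AveragePass@1', dataset_name='gsm8k', subset_name='main', score=1.0),
]
df = pd.DataFrame(rows)


def aggregate_and_sort(df: pd.DataFrame, group_by_cols: list) -> pd.DataFrame:
    # Mean score and sample count per group, highest average first.
    out = df.groupby(group_by_cols)['score'].agg(average_score='mean', count='size').reset_index()
    return out.sort_values('average_score', ascending=False)


print(aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name', 'subset_name']))
```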
evalscope/collections/sampler.py CHANGED
@@ -44,7 +44,8 @@ class Sampler(ABC):
                     dataset_name=dataset.name,
                     subset_name=subset_name,
                 ))
-        sampled_data = random.choices(all_data, k=count)
+        count = min(count, len(all_data))  # avoid sampling more than the dataset size
+        sampled_data = random.sample(all_data, k=count)
         return sampled_data
 
     def _update_index(self, all_data: List[DatasetEntry]) -> List[dict]:
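The one-line sampler change matters because `random.choices` draws with replacement (duplicates possible), while `random.sample` draws without replacement and raises ValueError when k exceeds the population, hence the added `min()` guard. A standalone illustration:

```python
# Standalone illustration of the two sampling APIs; not evalscope code.
import random

population = list(range(10))
random.seed(0)

with_replacement = random.choices(population, k=8)     # may contain duplicates
without_replacement = random.sample(population, k=8)   # 8 distinct items, no replacement

print(sorted(with_replacement))     # duplicates likely
print(sorted(without_replacement))  # always 8 unique values

# random.sample raises ValueError if k > len(population),
# hence the guard: count = min(count, len(all_data))
count = min(20, len(population))
print(len(random.sample(population, k=count)))  # 10
```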
evalscope/collections/schema.py CHANGED
@@ -19,8 +19,7 @@ class DatasetInfo:
         benchmark_meta = Benchmark.get(self.name)
 
         data_adapter = benchmark_meta.get_data_adapter(config=self.args)
-        data_dict = data_adapter.load(
-            dataset_name_or_path=benchmark_meta.dataset_id, subset_list=benchmark_meta.subset_list)
+        data_dict = data_adapter.load()
         prompts = data_adapter.gen_prompts(data_dict)
         return prompts
 
evalscope/config.py CHANGED
@@ -54,6 +54,7 @@ class TaskConfig:
     eval_config: Union[str, Dict, None] = None
     stage: str = EvalStage.ALL
     limit: Optional[int] = None
+    eval_batch_size: int = 1
 
     # Cache and working directory arguments
     mem_cache: bool = False  # Deprecated, will be removed in v1.0.0.
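The new `TaskConfig.eval_batch_size` field drives both the ThreadPoolExecutor fan-out for service (API) evaluation and the batched local inference shown in the evaluator hunks below. A minimal usage sketch; the `model` and `datasets` field names are assumed from the existing TaskConfig API and do not appear in this hunk.

```python
# Minimal sketch; only `limit` and `eval_batch_size` are confirmed by this diff.
from evalscope.config import TaskConfig

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # placeholder model id
    datasets=['gsm8k'],
    limit=100,            # evaluate at most 100 samples per subset
    eval_batch_size=8,    # concurrent requests (service) or local batch size
)
```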
evalscope/evaluator/evaluator.py CHANGED
@@ -3,15 +3,16 @@
 import json
 import os
 import time
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from copy import deepcopy
 from tqdm import tqdm
 from typing import Any, Dict, List, Optional, Union
 
 from evalscope.benchmarks import DataAdapter
 from evalscope.config import TaskConfig
-from evalscope.constants import AnswerKeys, DumpMode, EvalStage, ReviewKeys
-from evalscope.models import BaseModelAdapter, CustomModelAdapter
+from evalscope.constants import AnswerKeys, DumpMode, EvalStage, EvalType, ReviewKeys
+from evalscope.models import BaseModelAdapter
 from evalscope.report import Report, gen_table
 from evalscope.utils import dict_torch_dtype_to_str, gen_hash
 from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
@@ -36,7 +37,6 @@ class Evaluator(object):
     """
 
     def __init__(self,
-                 dataset_name_or_path: str,
                  data_adapter: DataAdapter,
                  model_adapter: BaseModelAdapter,
                  outputs: OutputsStructure = None,
@@ -44,7 +44,7 @@ class Evaluator(object):
                  **kwargs):
 
         self.dataset_name = data_adapter.name
-        self.dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
+        self.dataset_name_or_path = os.path.expanduser(data_adapter.dataset_id)
         self.model_name = task_cfg.model_id
         self.custom_task_name = f'{self.model_name}_{self.dataset_name}'
 
@@ -63,15 +63,20 @@ class Evaluator(object):
 
     def load_dataset(self):
         dataset = self.data_adapter.load(
-            dataset_name_or_path=self.dataset_name_or_path,
-            subset_list=self.data_adapter.subset_list,
-            work_dir=os.path.expanduser(self.task_cfg.dataset_dir),
-            datasets_hub=self.dataset_hub,
-            **self.kwargs)
+            work_dir=os.path.expanduser(self.task_cfg.dataset_dir), datasets_hub=self.dataset_hub, **self.kwargs)
 
         # Get prompts from dataset
         prompts = self.data_adapter.gen_prompts(data_dict=dataset)
-        return prompts
+
+        # Limit and index prompts
+        limited_prompts = defaultdict(list)
+        for subset_name, prompts_list in prompts.items():
+            limit = self.task_cfg.limit or len(prompts_list)
+            for index, prompt in enumerate(prompts_list[:limit]):
+                prompt['index'] = index
+                limited_prompts[subset_name].append(prompt)
+
+        return limited_prompts
 
     def _generate_answer_id(self, model_cfg, input_d, infer_cfg):
         model_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(model_cfg).items())), ensure_ascii=False)
@@ -87,12 +92,38 @@ class Evaluator(object):
         answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
         return answer_d
 
-    def get_answers(self,
-                    subset_name: str,
-                    prompts_list: List[dict],
-                    infer_cfg: dict = None,
-                    debug: bool = False,
-                    **kwargs) -> list:
+    def _get_answer(self, input_prompts, subset_name, infer_cfg) -> List[dict]:
+        answers_list = []
+        answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
+        for answer_d, input_prompt in zip(answer_ds, input_prompts):
+            answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+            processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
+            answers_list.append(processed_answer)
+        return answers_list
+
+    @staticmethod
+    def filter_answer(use_cache, prompts_list, pred_file_path) -> dict:
+        # Filter prompts that have been answered
+        answers_list = []
+        if not use_cache or not os.path.exists(pred_file_path):
+            return answers_list, prompts_list
+
+        def get_answered_indices(answers_list: List[Dict]) -> List[int]:
+            indices = [answer[AnswerKeys.ORIGIN_PROMPT].get('index') for answer in answers_list]
+
+            if all(index is None for index in indices):
+                return list(range(len(answers_list)))
+
+            return [index for index in indices if index is not None]
+
+        answers_list = jsonl_to_list(pred_file_path)
+        answered_indices = set(get_answered_indices(answers_list))
+        logger.info(f'Reusing predictions from {pred_file_path}, got {len(answered_indices)} answers.')
+
+        prompts = [prompt for i, prompt in enumerate(prompts_list) if i not in answered_indices]
+        return answers_list, prompts
+
+    def get_answers(self, subset_name: str, prompts_list: List[dict], infer_cfg: dict = None, **kwargs) -> list:
         """
         Get answers from model inference.
         It is required to rewrite this method to support your own evaluator.
@@ -110,7 +141,6 @@ class Evaluator(object):
             max_length: int, the max length of the sequence to be generated.
             max_new_tokens: int, the max number of new tokens to be generated.
             repetition_penalty: float, the parameter for repetition penalty. 1.0 means no penalty.
-            debug: whether to run in debug mode.
             **kwargs: kwargs.
 
         Returns: The list of answers.
@@ -119,41 +149,35 @@ class Evaluator(object):
         assert self.model_adapter is not None, 'model must be provided when calling func get_answers() !'
         assert len(prompts_list) > 0, 'prompts_list must not be empty when calling func get_answers() !'
 
-        answers_list = []
         pred_file_name = self.dataset_name + '_' + subset_name + '.jsonl'
         pred_file_path = os.path.join(self.outputs_structure.predictions_dir, self.model_name, pred_file_name)
         os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)
 
-        if self.use_cache and os.path.exists(pred_file_path):
-            answers_list = jsonl_to_list(pred_file_path)
-            logger.info(f'Reusing predictions from {pred_file_path}, got {len(answers_list)} answers.')
-            # Note: assume prediction in order of prompts_list
-            prompts_list = prompts_list[len(answers_list):]
-
-        if isinstance(self.model_adapter, CustomModelAdapter):
-            # Batch inference for custom model
-
-            resp_answers_list: List[Dict[str, Any]] = self.model_adapter.predict(
-                inputs=prompts_list, infer_cfg=infer_cfg)
-
-            for input_prompt, answer_d in zip(prompts_list, resp_answers_list):
-                answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
-                processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
-                answers_list.append(processed_answer)
-                dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)
-
+        answers_list, prompts_list = Evaluator.filter_answer(self.use_cache, prompts_list, pred_file_path)
+
+        eval_batch_size = self.task_cfg.eval_batch_size
+        if self.task_cfg.eval_type == EvalType.SERVICE:
+            with tqdm(total=len(prompts_list), desc=f'Predicting({subset_name}): ') as pbar:
+                with ThreadPoolExecutor(max_workers=eval_batch_size) as executor:
+                    futures = []
+                    for input_prompt in prompts_list:
+                        futures.append(executor.submit(self._get_answer, [input_prompt], subset_name, infer_cfg))
+                    for future in as_completed(futures):
+                        answer_ds: List[dict] = future.result()
+                        answers_list.extend(answer_ds)
+                        dump_jsonl_data(answer_ds, pred_file_path, dump_mode=DumpMode.APPEND)
+                        pbar.update(len(answer_ds))
         else:
-            for input_prompt in tqdm(prompts_list, total=len(prompts_list), desc=f'Predicting({subset_name}): '):
-                answer_d: dict = self.model_adapter.predict(inputs=input_prompt, infer_cfg=infer_cfg)
-                answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
-                processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
-
-                if debug:
-                    logger.info(f'**input_prompt: {json.dumps(input_prompt, ensure_ascii=False)} \n')
-                    logger.info(f'**predicted ans: {json.dumps(processed_answer, ensure_ascii=False)} \n')
-
-                answers_list.append(processed_answer)
-                dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)
+            batch_prompts_list = [
+                prompts_list[i:i + eval_batch_size] for i in range(0, len(prompts_list), eval_batch_size)
+            ]
+            with tqdm(total=len(prompts_list), desc=f'Predicting({subset_name}): ') as pbar:
+                for batch_prompts in batch_prompts_list:
+                    answer_ds: List[dict] = self._get_answer(
+                        input_prompts=batch_prompts, subset_name=subset_name, infer_cfg=infer_cfg)
+                    answers_list.extend(answer_ds)
+                    dump_jsonl_data(answer_ds, pred_file_path, dump_mode=DumpMode.APPEND)
                    pbar.update(len(batch_prompts))
 
         logger.info(f'Dump predictions to {pred_file_path}.')
         return answers_list
@@ -200,17 +224,13 @@ class Evaluator(object):
     def _generate_review_id(self, answer_d):
         # Gen review_id (concat: answer_id + reviewer_spec)
         answer_id = answer_d[AnswerKeys.ANSWER_ID]
-        reviewer_spec = {
-            'metric': [metric.name for metric in self.data_adapter.metric_list],
-            'reviewer': ['Evaluator'],
-            'revision': ['default']
-        }
+        reviewer_spec = {'metric': self.data_adapter.metric_list, 'reviewer': ['Evaluator'], 'revision': ['default']}
         reviewer_spec_str = json.dumps(
             OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
         review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)
         return review_id, reviewer_spec
 
-    def get_reviews(self, subset_name: str, answers_list: List[dict], debug: bool = False, **kwargs) -> list:
+    def get_reviews(self, subset_name: str, answers_list: List[dict], **kwargs) -> list:
         """
         Get reviews from answers.
         It is required to rewrite this method to support your own evaluator.
@@ -218,7 +238,6 @@ class Evaluator(object):
         Args:
             subset_name: subset name of benchmark
             answers_list: inference results list.
-            debug: whether to run in debug mode.
             **kwargs: kwargs.
 
         Returns: reviews list.
@@ -237,8 +256,7 @@ class Evaluator(object):
             # Get review
             review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
 
-            if debug:
-                logger.info(review_d)
+            logger.debug(review_d)
 
             reviews_list.append(review_d)
             # Dump reviews
@@ -315,7 +333,7 @@ class Evaluator(object):
             logger.error('Failed to generate report table.')
         return report_map
 
-    def eval(self, infer_cfg: dict = None, debug: bool = False, **kwargs) -> dict:
+    def eval(self, **kwargs) -> dict:
         """
         Evaluate the model on the specific benchmark. Streaming & parallel mode is supported.
         It is required to rewrite this method to support your own evaluator.
@@ -329,7 +347,6 @@ class Evaluator(object):
 
         Args:
             infer_cfg: The config for model inference.
-            debug: Whether to run in debug mode. Default: False.
 
         Returns:
             Dict of results. Depends on the stage of evaluation.
@@ -347,17 +364,14 @@ class Evaluator(object):
 
         prompts = self.load_dataset()
         for subset_name, prompts_list in prompts.items():
-            limit = kwargs.get('limit', len(prompts_list))
-            prompts_list = prompts_list[:limit]
 
             answers_list: list = self.get_answers(
-                subset_name=subset_name, prompts_list=prompts_list, infer_cfg=infer_cfg, debug=debug, **kwargs)
+                subset_name=subset_name, prompts_list=prompts_list, infer_cfg=self.task_cfg.generation_config, **kwargs)
             if self.stage == EvalStage.INFER:
                 stage_answers_dict[subset_name] = answers_list
                 continue
 
-            reviews_list: list = self.get_reviews(
-                subset_name=subset_name, answers_list=answers_list, debug=debug, **kwargs)
+            reviews_list: list = self.get_reviews(subset_name=subset_name, answers_list=answers_list, **kwargs)
 
             metric_res = self.compute_metrics(reviews_list=reviews_list)
             reviews_score_all[subset_name] = metric_res