evalscope 0.10.1__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (81)
  1. evalscope/arguments.py +3 -0
  2. evalscope/benchmarks/aime/__init__.py +0 -0
  3. evalscope/benchmarks/aime/aime24_adapter.py +49 -0
  4. evalscope/benchmarks/aime/aime25_adapter.py +49 -0
  5. evalscope/benchmarks/arc/arc_adapter.py +5 -7
  6. evalscope/benchmarks/bbh/bbh_adapter.py +17 -14
  7. evalscope/benchmarks/benchmark.py +5 -3
  8. evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
  9. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
  10. evalscope/benchmarks/competition_math/competition_math_adapter.py +21 -24
  11. evalscope/benchmarks/data_adapter.py +88 -29
  12. evalscope/benchmarks/data_collection/__init__.py +0 -0
  13. evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
  14. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  15. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +125 -0
  16. evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -11
  17. evalscope/benchmarks/gpqa/gpqa_adapter.py +27 -9
  18. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +9 -14
  19. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
  20. evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
  21. evalscope/benchmarks/ifeval/ifeval_adapter.py +15 -14
  22. evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
  23. evalscope/benchmarks/math_500/__init__.py +0 -0
  24. evalscope/benchmarks/math_500/math_500_adapter.py +58 -0
  25. evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
  26. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +32 -36
  27. evalscope/benchmarks/musr/__init__.py +0 -0
  28. evalscope/benchmarks/musr/musr_adapter.py +68 -0
  29. evalscope/benchmarks/process_bench/__init__.py +0 -0
  30. evalscope/benchmarks/process_bench/critique_template.txt +13 -0
  31. evalscope/benchmarks/process_bench/process_bench_adapter.py +96 -0
  32. evalscope/benchmarks/race/race_adapter.py +3 -3
  33. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
  34. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +9 -9
  35. evalscope/cli/start_app.py +4 -1
  36. evalscope/cli/start_eval.py +4 -3
  37. evalscope/cli/start_perf.py +4 -2
  38. evalscope/collections/evaluator.py +109 -39
  39. evalscope/collections/sampler.py +2 -1
  40. evalscope/collections/schema.py +1 -2
  41. evalscope/config.py +4 -1
  42. evalscope/evaluator/evaluator.py +81 -65
  43. evalscope/metrics/__init__.py +2 -1
  44. evalscope/metrics/math_parser.py +526 -0
  45. evalscope/metrics/metrics.py +39 -3
  46. evalscope/metrics/named_metrics.py +31 -7
  47. evalscope/models/base_adapter.py +7 -1
  48. evalscope/models/chat_adapter.py +69 -49
  49. evalscope/models/choice_adapter.py +52 -45
  50. evalscope/models/custom_adapter.py +2 -2
  51. evalscope/models/local_model.py +7 -2
  52. evalscope/models/server_adapter.py +106 -61
  53. evalscope/perf/__init__.py +0 -1
  54. evalscope/perf/arguments.py +5 -1
  55. evalscope/perf/http_client.py +2 -2
  56. evalscope/perf/plugin/api/openai_api.py +11 -1
  57. evalscope/perf/utils/benchmark_util.py +6 -2
  58. evalscope/report/app.py +42 -23
  59. evalscope/run.py +11 -8
  60. evalscope/third_party/thinkbench/__init__.py +3 -0
  61. evalscope/third_party/thinkbench/eval.py +264 -0
  62. evalscope/third_party/thinkbench/infer.py +100 -0
  63. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  64. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  65. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  66. evalscope/third_party/thinkbench/tools/llm.py +47 -0
  67. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  68. evalscope/utils/chat_service.py +2 -2
  69. evalscope/utils/io_utils.py +1 -1
  70. evalscope/utils/model_utils.py +17 -1
  71. evalscope/utils/utils.py +45 -45
  72. evalscope/version.py +2 -2
  73. {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/METADATA +22 -8
  74. {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/RECORD +79 -58
  75. tests/cli/test_run.py +108 -19
  76. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  77. evalscope/metrics/math_accuracy.py +0 -200
  78. {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/LICENSE +0 -0
  79. {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/WHEEL +0 -0
  80. {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/entry_points.txt +0 -0
  81. {evalscope-0.10.1.dist-info → evalscope-0.12.0.dist-info}/top_level.txt +0 -0
evalscope/cli/start_perf.py CHANGED
@@ -3,8 +3,6 @@ import os
 from argparse import ArgumentParser
 
 from evalscope.cli.base import CLICommand
-from evalscope.perf.arguments import add_argument
-from evalscope.perf.main import run_perf_benchmark
 
 
 def subparser_func(args):
@@ -23,9 +21,13 @@ class PerfBenchCMD(CLICommand):
     def define_args(parsers: ArgumentParser):
         """ define args for create pipeline template command.
         """
+        from evalscope.perf.arguments import add_argument
+
         parser = parsers.add_parser(PerfBenchCMD.name)
         add_argument(parser)
         parser.set_defaults(func=subparser_func)
 
     def execute(self):
+        from evalscope.perf.main import run_perf_benchmark
+
         run_perf_benchmark(self.args)
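The start_perf.py change above moves the perf imports from module scope into the methods that need them, so plain CLI startup and --help no longer pay for loading the perf stack. A minimal sketch of the same lazy-import pattern, with placeholder names rather than evalscope's own modules:

    # Sketch: defer a heavy dependency until the subcommand actually runs,
    # so building the parser stays cheap.
    from argparse import ArgumentParser


    def run_perf(args) -> None:
        # Deferred import: only executed when the 'perf' subcommand is chosen.
        # (Stands in for `from evalscope.perf.main import run_perf_benchmark`.)
        import json
        print(json.dumps({'number': args.number}))


    def main() -> None:
        parser = ArgumentParser(prog='demo')
        sub = parser.add_subparsers(dest='command', required=True)
        perf = sub.add_parser('perf')
        perf.add_argument('--number', type=int, default=1)
        perf.set_defaults(func=run_perf)
        args = parser.parse_args()
        args.func(args)


    if __name__ == '__main__':
        main()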
evalscope/collections/evaluator.py CHANGED
@@ -2,14 +2,15 @@ import json
 import os
 import pandas as pd
 from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from tabulate import tabulate
 from tqdm import tqdm
 from typing import List
 
-from evalscope.benchmarks import Benchmark
+from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.collections.sampler import DatasetEntry
 from evalscope.config import TaskConfig
-from evalscope.constants import DataCollection, DumpMode
+from evalscope.constants import AnswerKeys, DumpMode, EvalType
 from evalscope.evaluator import Evaluator
 from evalscope.models import get_local_model, initialize_model_adapter
 from evalscope.report import ReportGenerator
@@ -29,11 +30,16 @@ class SimpleEvaluator(Evaluator):
             task_cfg=task_cfg,
             outputs=outputs)
 
-    def get_answer(self, input_prompt, subset_name, infer_cfg) -> dict:
-        answer_d: dict = self.model_adapter.predict(inputs=input_prompt, infer_cfg=infer_cfg)
-        answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
-        processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
-        return processed_answer
+    def get_answer(self, samples, infer_cfg) -> List[dict]:
+        input_prompts = [sample.prompt for sample in samples]
+        subset_name = samples[0].subset_name
+        answers_list = []
+        answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
+        for answer_d, input_prompt in zip(answer_ds, input_prompts):
+            answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+            processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
+            answers_list.append(processed_answer)
+        return answers_list, samples
 
     def get_review(self, answer_d) -> dict:
         review_id, reviewer_spec = self._generate_review_id(answer_d)
@@ -42,38 +48,50 @@ class SimpleEvaluator(Evaluator):
 
     def get_score(self, review_d) -> float:
         metric_score: List[dict] = self.compute_metrics(reviews_list=[review_d])
-        # use the first metric by default
-        score = metric_score[0]['score']
-        return score
+        return metric_score
 
 
 class EvaluatorCollection:
 
-    def __init__(self, task_cfg: TaskConfig, outputs: OutputsStructure):
+    def __init__(self, task_cfg: TaskConfig, data_adapter: DataAdapter, outputs: OutputsStructure):
         self.task_cfg = task_cfg
+        self.data_adapter = data_adapter
         self.outputs = outputs
         self.model = get_local_model(task_cfg)
+
         self.dataset, self.dataset_name = self.load()
-        self.dataset_name_map, self.dataset_id_map = self._parse_dataset()
+        self.dataset_name_map = EvaluatorCollection._init_name_map(self.dataset)
+        self.dataset_id_map = EvaluatorCollection._init_id_map(self.dataset)
         self.evaluators = self._initialize_evaluators()
 
     def load(self) -> tuple[list[DatasetEntry], str]:
-        dataset_path = self.task_cfg.dataset_args[DataCollection.NAME]['local_path']
-        dataset_name = os.path.basename(dataset_path).split('.')[0]
-        raw_dataset = jsonl_to_list(dataset_path)
+        dataset_name = os.path.basename(self.data_adapter.dataset_id).split('.')[0]
+        raw_dataset = self.data_adapter.load()
+        # limit the dataset
+        if self.task_cfg.limit:
+            raw_dataset = raw_dataset[:self.task_cfg.limit]
+        # index dataset
         datasets = []
         for sample in raw_dataset:
+            sample['prompt'].update({'index': sample['index']})
             datasets.append(DatasetEntry(**sample))
+
         return datasets, dataset_name
 
-    def _parse_dataset(self):
+    @staticmethod
+    def _init_name_map(dataset):
        dataset_name_map = defaultdict(lambda: defaultdict(list))
-        dataset_id_map = {}
-        for sample in self.dataset:
+        for sample in dataset:
            dataset_name, subset_name = sample.dataset_name, sample.subset_name
            dataset_name_map[dataset_name][subset_name].append(sample.index)
+        return dataset_name_map
+
+    @staticmethod
+    def _init_id_map(dataset):
+        dataset_id_map = {}
+        for sample in dataset:
            dataset_id_map[sample.index] = sample
-        return dataset_name_map, dataset_id_map
+        return dataset_id_map
 
     def _initialize_evaluators(self):
        evaluators = {}
@@ -93,15 +111,16 @@ class EvaluatorCollection:
                 for subset_name, ids in data_map.items():
                     for _id in ids:
                         row_data: DatasetEntry = self.dataset_id_map[_id]
-                        score = scores[_id]
-                        data.append(
-                            dict(
-                                task_type=row_data.task_type,
-                                categories=tuple(row_data.categories),
-                                dataset_name=dataset_name,
-                                subset_name=subset_name,
-                                tags=row_data.tags,
-                                score=score))
+                        for metric in scores[_id]:
+                            data.append(
+                                dict(
+                                    task_type=row_data.task_type,
+                                    categories=tuple(row_data.categories),
+                                    dataset_name=dataset_name,
+                                    subset_name=subset_name,
+                                    tags=row_data.tags,
+                                    metric=metric['metric_name'],
+                                    score=metric['score']))
             return pd.DataFrame(data)
 
         def aggregate_and_sort(df, group_by_cols):
@@ -117,13 +136,13 @@ class EvaluatorCollection:
         df = get_dataframe(scores)
 
         # multi-level aggregation
-        subset_report_df = aggregate_and_sort(df, ['task_type', 'dataset_name', 'subset_name'])
-        dataset_report_df = aggregate_and_sort(df, ['task_type', 'dataset_name'])
-        task_report_df = aggregate_and_sort(df, ['task_type'])
+        subset_report_df = aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name', 'subset_name'])
+        dataset_report_df = aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name'])
+        task_report_df = aggregate_and_sort(df, ['task_type', 'metric'])
 
         # explode tags to multiple rows
         df_exploded_tags = df.explode('tags')
-        tag_report_df = aggregate_and_sort(df_exploded_tags, ['tags'])
+        tag_report_df = aggregate_and_sort(df_exploded_tags, ['tags', 'metric'])
 
         # process multi-level categories
         df_categories = df.copy()
@@ -132,7 +151,8 @@ class EvaluatorCollection:
         for level in range(max_depth):
             df_categories[f'category{level}'] = df_categories['categories'].apply(lambda x: x[level]
                                                                                   if len(x) > level else '')
-        category_report_df = aggregate_and_sort(df_categories, [f'category{level}' for level in range(max_depth)])
+        category_report_df = aggregate_and_sort(df_categories,
+                                                [f'category{level}' for level in range(max_depth)] + ['metric'])
 
         # convert to dict format
         report_dict = {
@@ -155,21 +175,71 @@ class EvaluatorCollection:
         with open(report_file_path, 'w', encoding='utf-8') as f:
             json.dump(report.to_dict(), f, ensure_ascii=False, indent=4)
 
+    def _filter_answer(self, pred_file_path):
+        answer_dict = defaultdict(dict)
+        if self.task_cfg.use_cache and os.path.exists(pred_file_path):
+            answers_list = jsonl_to_list(pred_file_path)
+            indices = set()
+            for answer in answers_list:
+                index = answer[AnswerKeys.ORIGIN_PROMPT].get('index')
+                answer_dict[index] = answer
+                indices.add(index)
+            data = []
+            for sample in self.dataset:
+                if sample.index not in indices:
+                    data.append(sample)
+            data_map = self._init_name_map(data)
+
+            return answer_dict, data, data_map
+        return answer_dict, self.dataset, self.dataset_name_map
+
     def get_answers(self):
         pred_file_path = os.path.join(self.outputs.predictions_dir, self.task_cfg.model_id,
                                       f'{self.dataset_name}.jsonl')
         os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)
-        answers = defaultdict(dict)
-        for sample in tqdm(self.dataset, desc='Getting answers'):
-            evaluator = self.evaluators[sample.dataset_name]
-            answer_d = evaluator.get_answer(sample.prompt, sample.subset_name, self.task_cfg.generation_config)
-            answers[sample.index] = answer_d
-            dump_jsonl_data(answer_d, pred_file_path, dump_mode=DumpMode.APPEND)
+
+        answers, dataset, dataset_name_map = self._filter_answer(pred_file_path)
+
+        eval_batch_size = self.task_cfg.eval_batch_size
+        with tqdm(total=len(dataset), desc='Getting answers') as pbar:
+            if self.task_cfg.eval_type == EvalType.SERVICE:
+                with ThreadPoolExecutor(max_workers=eval_batch_size) as executor:
+                    futures = []
+                    for sample in dataset:
+                        evaluator = self.evaluators[sample.dataset_name]
+                        futures.append(executor.submit(evaluator.get_answer, [sample], self.task_cfg.generation_config))
+                    for future in as_completed(futures):
+                        answer_list, samples = future.result()
+                        answers[samples[0].index] = answer_list[0]
+                        dump_jsonl_data(answer_list, pred_file_path, dump_mode=DumpMode.APPEND)
+                        pbar.update(1)
+            else:
+                for dataset_name, data_map in dataset_name_map.items():
+                    # get evaluator for the dataset
+                    evaluator = self.evaluators[dataset_name]
+                    for subset_name, ids in data_map.items():
+                        for i in range(0, len(ids), eval_batch_size):
+                            # get batch samples
+                            batch_ids = ids[i:i + eval_batch_size]
+                            batch_samples = [self.dataset_id_map[_id] for _id in batch_ids]
+                            answer_list, _ = evaluator.get_answer(batch_samples, self.task_cfg.generation_config)
+                            # update answers
+                            for j, _id in enumerate(batch_ids):
+                                answers[_id] = answer_list[j]
+                            dump_jsonl_data(answer_list, pred_file_path, dump_mode=DumpMode.APPEND)
+
+                            pbar.update(len(batch_ids))
         return answers
 
     def get_reviews(self, answers):
         review_file_path = os.path.join(self.outputs.reviews_dir, self.task_cfg.model_id)
         os.makedirs(review_file_path, exist_ok=True)
+
+        if self.task_cfg.use_cache and os.path.exists(review_file_path):
+            logger.warning(
+                f'Ignore use_cache={self.task_cfg.use_cache}, updating the review file: {review_file_path} ...')
+            os.remove(review_file_path)
+
         reviews = defaultdict(dict)
         for sample in tqdm(self.dataset, desc='Getting reviews'):
             evaluator = self.evaluators[sample.dataset_name]
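Both EvaluatorCollection._filter_answer above and Evaluator.filter_answer later in this diff rely on the same idea: each dumped prediction carries the index of its originating prompt, so a rerun with use_cache can skip exactly the prompts that already have answers. A rough, self-contained sketch of that resume logic; the file layout and key names are simplified and not evalscope's exact schema:

    import json
    import os
    from typing import List, Tuple


    def filter_answered(prompts: List[dict], pred_file: str) -> Tuple[List[dict], List[dict]]:
        """Split prompts into (cached_answers, prompts_still_to_run) using saved indices."""
        if not os.path.exists(pred_file):
            return [], prompts
        cached = []
        with open(pred_file, encoding='utf-8') as f:
            for line in f:
                cached.append(json.loads(line))
        done = {ans['origin_prompt'].get('index') for ans in cached}  # simplified key name
        remaining = [p for p in prompts if p.get('index') not in done]
        return cached, remaining


    # Usage: prompts each carry an 'index' assigned at load time.
    prompts = [{'index': i, 'data': f'question {i}'} for i in range(5)]
    answers, todo = filter_answered(prompts, 'predictions.jsonl')
    print(len(answers), 'cached,', len(todo), 'left to predict')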
evalscope/collections/sampler.py CHANGED
@@ -44,7 +44,8 @@ class Sampler(ABC):
                     dataset_name=dataset.name,
                     subset_name=subset_name,
                 ))
-        sampled_data = random.choices(all_data, k=count)
+        count = min(count, len(all_data))  # avoid sampling more than the dataset size
+        sampled_data = random.sample(all_data, k=count)
         return sampled_data
 
     def _update_index(self, all_data: List[DatasetEntry]) -> List[dict]:
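The one-line swap above matters: random.choices draws with replacement, so a mixed collection could contain duplicate entries, while random.sample draws without replacement and raises ValueError when k exceeds the population, hence the min() clamp. A quick illustration:

    import random

    population = list(range(10))
    k = 15

    # With replacement: duplicates are possible and k may exceed the population size.
    with_replacement = random.choices(population, k=k)
    print(len(with_replacement), len(set(with_replacement)))  # 15, usually fewer unique values

    # Without replacement: clamp k first, otherwise random.sample raises ValueError.
    k = min(k, len(population))
    without_replacement = random.sample(population, k=k)
    print(len(without_replacement), len(set(without_replacement)))  # 10, 10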
evalscope/collections/schema.py CHANGED
@@ -19,8 +19,7 @@ class DatasetInfo:
         benchmark_meta = Benchmark.get(self.name)
 
         data_adapter = benchmark_meta.get_data_adapter(config=self.args)
-        data_dict = data_adapter.load(
-            dataset_name_or_path=benchmark_meta.dataset_id, subset_list=benchmark_meta.subset_list)
+        data_dict = data_adapter.load()
         prompts = data_adapter.gen_prompts(data_dict)
         return prompts
 
evalscope/config.py CHANGED
@@ -17,7 +17,7 @@ logger = get_logger()
 
 cur_path = os.path.dirname(os.path.abspath(__file__))
 
-DEFAULT_MODEL_ARGS = {'revision': 'master', 'precision': 'torch.float16', 'device': 'auto'}
+DEFAULT_MODEL_ARGS = {'revision': 'master', 'precision': 'torch.float16'}
 DEFAULT_GENERATION_CONFIG = {
     'max_length': 2048,
     'max_new_tokens': 512,
@@ -54,6 +54,7 @@ class TaskConfig:
     eval_config: Union[str, Dict, None] = None
     stage: str = EvalStage.ALL
     limit: Optional[int] = None
+    eval_batch_size: int = 1
 
     # Cache and working directory arguments
     mem_cache: bool = False  # Deprecated, will be removed in v1.0.0.
@@ -67,6 +68,8 @@ class TaskConfig:
     seed: Optional[int] = 42
     api_url: Optional[str] = None  # Only used for server model
     api_key: Optional[str] = 'EMPTY'  # Only used for server model
+    timeout: Optional[float] = None  # Only used for server model
+    stream: bool = False  # Only used for server model
 
     def __post_init__(self):
         if (not self.model_id) and self.model:
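The new TaskConfig fields above (eval_batch_size for batched or concurrent inference, plus timeout and stream for server-backed models) are set directly on the config. A hedged sketch of how such a config might be constructed; the entry points and exact field values should be checked against the 0.12.0 docs, and the model id, URL, and numbers here are purely illustrative:

    from evalscope.config import TaskConfig   # assumed import path
    from evalscope.run import run_task        # assumed entry point

    task_cfg = TaskConfig(
        model='qwen2.5-7b-instruct',           # illustrative model id
        api_url='http://127.0.0.1:8000/v1',    # only used for server models
        api_key='EMPTY',
        eval_type='service',                   # route requests through the server adapter (assumed value)
        datasets=['math_500'],                 # one of the benchmarks added in 0.12.0
        eval_batch_size=8,                     # new: number of concurrent requests / batch size
        timeout=600,                           # new: per-request timeout (assumed to be seconds)
        stream=True,                           # new: stream tokens from the server
        limit=50,
    )
    run_task(task_cfg)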
evalscope/evaluator/evaluator.py CHANGED
@@ -3,15 +3,16 @@
 import json
 import os
 import time
-from collections import OrderedDict
+from collections import OrderedDict, defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from copy import deepcopy
 from tqdm import tqdm
 from typing import Any, Dict, List, Optional, Union
 
 from evalscope.benchmarks import DataAdapter
 from evalscope.config import TaskConfig
-from evalscope.constants import AnswerKeys, DumpMode, EvalStage, ReviewKeys
-from evalscope.models import BaseModelAdapter, CustomModelAdapter
+from evalscope.constants import AnswerKeys, DumpMode, EvalStage, EvalType, ReviewKeys
+from evalscope.models import BaseModelAdapter
 from evalscope.report import Report, gen_table
 from evalscope.utils import dict_torch_dtype_to_str, gen_hash
 from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
@@ -36,7 +37,6 @@ class Evaluator(object):
     """
 
     def __init__(self,
-                 dataset_name_or_path: str,
                  data_adapter: DataAdapter,
                  model_adapter: BaseModelAdapter,
                 outputs: OutputsStructure = None,
@@ -44,7 +44,7 @@ class Evaluator(object):
                 **kwargs):
 
         self.dataset_name = data_adapter.name
-        self.dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
+        self.dataset_name_or_path = os.path.expanduser(data_adapter.dataset_id)
         self.model_name = task_cfg.model_id
         self.custom_task_name = f'{self.model_name}_{self.dataset_name}'
 
@@ -63,15 +63,20 @@ class Evaluator(object):
 
     def load_dataset(self):
         dataset = self.data_adapter.load(
-            dataset_name_or_path=self.dataset_name_or_path,
-            subset_list=self.data_adapter.subset_list,
-            work_dir=os.path.expanduser(self.task_cfg.dataset_dir),
-            datasets_hub=self.dataset_hub,
-            **self.kwargs)
+            work_dir=os.path.expanduser(self.task_cfg.dataset_dir), datasets_hub=self.dataset_hub, **self.kwargs)
 
         # Get prompts from dataset
         prompts = self.data_adapter.gen_prompts(data_dict=dataset)
-        return prompts
+
+        # Limit and index prompts
+        limited_prompts = defaultdict(list)
+        for subset_name, prompts_list in prompts.items():
+            limit = self.task_cfg.limit or len(prompts_list)
+            for index, prompt in enumerate(prompts_list[:limit]):
+                prompt['index'] = index
+                limited_prompts[subset_name].append(prompt)
+
+        return limited_prompts
 
     def _generate_answer_id(self, model_cfg, input_d, infer_cfg):
         model_cfg_str = json.dumps(OrderedDict(sorted(dict_torch_dtype_to_str(model_cfg).items())), ensure_ascii=False)
@@ -87,12 +92,38 @@ class Evaluator(object):
         answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
         return answer_d
 
-    def get_answers(self,
-                    subset_name: str,
-                    prompts_list: List[dict],
-                    infer_cfg: dict = None,
-                    debug: bool = False,
-                    **kwargs) -> list:
+    def _get_answer(self, input_prompts, subset_name, infer_cfg) -> List[dict]:
+        answers_list = []
+        answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
+        for answer_d, input_prompt in zip(answer_ds, input_prompts):
+            answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+            processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
+            answers_list.append(processed_answer)
+        return answers_list
+
+    @staticmethod
+    def filter_answer(use_cache, prompts_list, pred_file_path) -> dict:
+        # Filter prompts that have been answered
+        answers_list = []
+        if not use_cache or not os.path.exists(pred_file_path):
+            return answers_list, prompts_list
+
+        def get_answered_indices(answers_list: List[Dict]) -> List[int]:
+            indices = [answer[AnswerKeys.ORIGIN_PROMPT].get('index') for answer in answers_list]
+
+            if all(index is None for index in indices):
+                return list(range(len(answers_list)))
+
+            return [index for index in indices if index is not None]
+
+        answers_list = jsonl_to_list(pred_file_path)
+        answered_indices = set(get_answered_indices(answers_list))
+        logger.info(f'Reusing predictions from {pred_file_path}, got {len(answered_indices)} answers.')
+
+        prompts = [prompt for i, prompt in enumerate(prompts_list) if i not in answered_indices]
+        return answers_list, prompts
+
+    def get_answers(self, subset_name: str, prompts_list: List[dict], infer_cfg: dict = None, **kwargs) -> list:
         """
         Get answers from model inference.
         It is required to rewrite this method to support your own evaluator.
@@ -110,7 +141,6 @@ class Evaluator(object):
             max_length: int, the max length of the sequence to be generated.
             max_new_tokens: int, the max number of new tokens to be generated.
             repetition_penalty: float, the parameter for repetition penalty. 1.0 means no penalty.
-            debug: whether to run in debug mode.
             **kwargs: kwargs.
 
         Returns: The list of answers.
@@ -119,41 +149,35 @@ class Evaluator(object):
         assert self.model_adapter is not None, 'model must be provided when calling func get_answers() !'
         assert len(prompts_list) > 0, 'prompts_list must not be empty when calling func get_answers() !'
 
-        answers_list = []
         pred_file_name = self.dataset_name + '_' + subset_name + '.jsonl'
         pred_file_path = os.path.join(self.outputs_structure.predictions_dir, self.model_name, pred_file_name)
         os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)
 
-        if self.use_cache and os.path.exists(pred_file_path):
-            answers_list = jsonl_to_list(pred_file_path)
-            logger.info(f'Reusing predictions from {pred_file_path}, got {len(answers_list)} answers.')
-            # Note: assume prediction in order of prompts_list
-            prompts_list = prompts_list[len(answers_list):]
-
-        if isinstance(self.model_adapter, CustomModelAdapter):
-            # Batch inference for custom model
-
-            resp_answers_list: List[Dict[str, Any]] = self.model_adapter.predict(
-                inputs=prompts_list, infer_cfg=infer_cfg)
-
-            for input_prompt, answer_d in zip(prompts_list, resp_answers_list):
-                answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
-                processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
-                answers_list.append(processed_answer)
-                dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)
-
+        answers_list, prompts_list = Evaluator.filter_answer(self.use_cache, prompts_list, pred_file_path)
+
+        eval_batch_size = self.task_cfg.eval_batch_size
+        if self.task_cfg.eval_type == EvalType.SERVICE:
+            with tqdm(total=len(prompts_list), desc=f'Predicting({subset_name}): ') as pbar:
+                with ThreadPoolExecutor(max_workers=eval_batch_size) as executor:
+                    futures = []
+                    for input_prompt in prompts_list:
+                        futures.append(executor.submit(self._get_answer, [input_prompt], subset_name, infer_cfg))
+                    for future in as_completed(futures):
+                        answer_ds: List[dict] = future.result()
+                        answers_list.extend(answer_ds)
+                        dump_jsonl_data(answer_ds, pred_file_path, dump_mode=DumpMode.APPEND)
+                        pbar.update(len(answer_ds))
         else:
-            for input_prompt in tqdm(prompts_list, total=len(prompts_list), desc=f'Predicting({subset_name}): '):
-                answer_d: dict = self.model_adapter.predict(inputs=input_prompt, infer_cfg=infer_cfg)
-                answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
-                processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
-
-                if debug:
-                    logger.info(f'**input_prompt: {json.dumps(input_prompt, ensure_ascii=False)} \n')
-                    logger.info(f'**predicted ans: {json.dumps(processed_answer, ensure_ascii=False)} \n')
-
-                answers_list.append(processed_answer)
-                dump_jsonl_data(processed_answer, pred_file_path, dump_mode=DumpMode.APPEND)
+            batch_prompts_list = [
+                prompts_list[i:i + eval_batch_size] for i in range(0, len(prompts_list), eval_batch_size)
+            ]
+            with tqdm(total=len(prompts_list), desc=f'Predicting({subset_name}): ') as pbar:
+                for batch_prompts in batch_prompts_list:
+                    answer_ds: List[dict] = self._get_answer(
+                        input_prompts=batch_prompts, subset_name=subset_name, infer_cfg=infer_cfg)
+                    answers_list.extend(answer_ds)
+                    dump_jsonl_data(answer_ds, pred_file_path, dump_mode=DumpMode.APPEND)
+                    pbar.update(len(batch_prompts))
 
         logger.info(f'Dump predictions to {pred_file_path}.')
         return answers_list
@@ -200,17 +224,13 @@ class Evaluator(object):
     def _generate_review_id(self, answer_d):
         # Gen review_id (concat: answer_id + reviewer_spec)
         answer_id = answer_d[AnswerKeys.ANSWER_ID]
-        reviewer_spec = {
-            'metric': [metric.name for metric in self.data_adapter.metric_list],
-            'reviewer': ['Evaluator'],
-            'revision': ['default']
-        }
+        reviewer_spec = {'metric': self.data_adapter.metric_list, 'reviewer': ['Evaluator'], 'revision': ['default']}
         reviewer_spec_str = json.dumps(
             OrderedDict(sorted(dict_torch_dtype_to_str(reviewer_spec).items())), ensure_ascii=False)
         review_id = 'review-' + gen_hash(answer_id + reviewer_spec_str)
         return review_id, reviewer_spec
 
-    def get_reviews(self, subset_name: str, answers_list: List[dict], debug: bool = False, **kwargs) -> list:
+    def get_reviews(self, subset_name: str, answers_list: List[dict], **kwargs) -> list:
         """
         Get reviews from answers.
         It is required to rewrite this method to support your own evaluator.
@@ -218,7 +238,6 @@ class Evaluator(object):
         Args:
             subset_name: subset name of benchmark
             answers_list: inference results list.
-            debug: whether to run in debug mode.
             **kwargs: kwargs.
 
         Returns: reviews list.
@@ -231,14 +250,14 @@ class Evaluator(object):
 
         if self.use_cache and os.path.exists(review_file_path):
             logger.warning(f'Ignore use_cache={self.use_cache}, updating the review file: {review_file_path} ...')
+            os.remove(review_file_path)
 
         for answer_d in tqdm(answers_list, total=len(answers_list), desc=f'Reviewing({subset_name}): '):
             review_id, reviewer_spec = self._generate_review_id(answer_d)
             # Get review
             review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
 
-            if debug:
-                logger.info(review_d)
+            logger.debug(review_d)
 
             reviews_list.append(review_d)
             # Dump reviews
@@ -274,7 +293,8 @@ class Evaluator(object):
 
             review_res_list.append(review_res)
 
-        metric_score: List[dict] = self.data_adapter.compute_metric(review_res_list=review_res_list)
+        metric_score: List[dict] = self.data_adapter.compute_metric(
+            review_res_list=review_res_list, reviews_list=reviews_list)
 
         return metric_score
 
@@ -315,7 +335,7 @@ class Evaluator(object):
             logger.error('Failed to generate report table.')
         return report_map
 
-    def eval(self, infer_cfg: dict = None, debug: bool = False, **kwargs) -> dict:
+    def eval(self, **kwargs) -> dict:
         """
         Evaluate the model on the specific benchmark. Streaming & parallel mode is supported.
         It is required to rewrite this method to support your own evaluator.
@@ -329,7 +349,6 @@ class Evaluator(object):
 
         Args:
             infer_cfg: The config for model inference.
-            debug: Whether to run in debug mode. Default: False.
 
         Returns:
             Dict of results. Depends on the stage of evaluation.
@@ -347,17 +366,14 @@ class Evaluator(object):
 
         prompts = self.load_dataset()
         for subset_name, prompts_list in prompts.items():
-            limit = kwargs.get('limit', len(prompts_list))
-            prompts_list = prompts_list[:limit]
 
             answers_list: list = self.get_answers(
-                subset_name=subset_name, prompts_list=prompts_list, infer_cfg=infer_cfg, debug=debug, **kwargs)
+                subset_name=subset_name, prompts_list=prompts_list, infer_cfg=self.task_cfg.generation_config, **kwargs)
             if self.stage == EvalStage.INFER:
                 stage_answers_dict[subset_name] = answers_list
                 continue
 
-            reviews_list: list = self.get_reviews(
-                subset_name=subset_name, answers_list=answers_list, debug=debug, **kwargs)
+            reviews_list: list = self.get_reviews(subset_name=subset_name, answers_list=answers_list, **kwargs)
 
             metric_res = self.compute_metrics(reviews_list=reviews_list)
             reviews_score_all[subset_name] = metric_res
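The rewritten get_answers above uses one pattern in both branches: when eval_type is SERVICE, up to eval_batch_size requests are in flight concurrently via ThreadPoolExecutor and collected with as_completed; otherwise prompts are simply sliced into batches. A compact, self-contained sketch of the concurrent branch, where predict is a stand-in for the model adapter call:

    import time
    from concurrent.futures import ThreadPoolExecutor, as_completed


    def predict(prompt: dict) -> dict:
        # Stand-in for an I/O-bound call to an inference server.
        time.sleep(0.1)
        return {'prompt': prompt, 'answer': f"echo: {prompt['text']}"}


    prompts = [{'index': i, 'text': f'question {i}'} for i in range(8)]
    eval_batch_size = 4  # mirrors TaskConfig.eval_batch_size

    results = []
    with ThreadPoolExecutor(max_workers=eval_batch_size) as executor:
        futures = [executor.submit(predict, p) for p in prompts]
        for future in as_completed(futures):   # completion order, not submission order
            results.append(future.result())    # append (or dump to JSONL) as each finishes

    print(len(results), 'answers collected')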
evalscope/metrics/__init__.py CHANGED
@@ -1,4 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-from evalscope.metrics.metrics import bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean, weighted_mean
+from evalscope.metrics.metrics import (bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean,
+                                       simple_f1_score, weighted_mean)
 from evalscope.metrics.named_metrics import *
 from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
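The newly exported simple_f1_score suggests a token-overlap F1 for QA-style answers. As a rough illustration of that kind of metric only (the actual implementation in evalscope.metrics.metrics may tokenize and normalize differently), the classic precision/recall-over-shared-tokens formulation looks like this:

    from collections import Counter


    def token_f1(prediction: str, reference: str) -> float:
        """Illustrative token-level F1; not evalscope's simple_f1_score."""
        pred_tokens = prediction.lower().split()
        ref_tokens = reference.lower().split()
        common = Counter(pred_tokens) & Counter(ref_tokens)
        num_same = sum(common.values())
        if num_same == 0:
            return 0.0
        precision = num_same / len(pred_tokens)
        recall = num_same / len(ref_tokens)
        return 2 * precision * recall / (precision + recall)


    print(token_f1('the cat sat on the mat', 'a cat sat on a mat'))  # ≈ 0.667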