evalscope 0.10.0__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.


This version of evalscope might be problematic.

Files changed (59)
  1. evalscope/arguments.py +1 -0
  2. evalscope/benchmarks/aime24/__init__.py +0 -0
  3. evalscope/benchmarks/aime24/aime24_adapter.py +49 -0
  4. evalscope/benchmarks/arc/arc_adapter.py +5 -7
  5. evalscope/benchmarks/bbh/bbh_adapter.py +17 -9
  6. evalscope/benchmarks/benchmark.py +2 -2
  7. evalscope/benchmarks/ceval/ceval_adapter.py +9 -9
  8. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -11
  9. evalscope/benchmarks/competition_math/competition_math_adapter.py +34 -23
  10. evalscope/benchmarks/data_adapter.py +18 -12
  11. evalscope/benchmarks/data_collection/__init__.py +0 -0
  12. evalscope/benchmarks/data_collection/data_collection_adapter.py +71 -0
  13. evalscope/benchmarks/general_mcq/__init__.py +0 -0
  14. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +129 -0
  15. evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -6
  16. evalscope/benchmarks/gpqa/__init__.py +0 -0
  17. evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
  18. evalscope/benchmarks/gpqa/gpqa_adapter.py +121 -0
  19. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +8 -13
  20. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -7
  21. evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -6
  22. evalscope/benchmarks/ifeval/ifeval_adapter.py +14 -14
  23. evalscope/benchmarks/ifeval/instructions.py +3 -4
  24. evalscope/benchmarks/iquiz/iquiz_adapter.py +5 -5
  25. evalscope/benchmarks/math_500/__init__.py +0 -0
  26. evalscope/benchmarks/math_500/math_500_adapter.py +49 -0
  27. evalscope/benchmarks/mmlu/mmlu_adapter.py +7 -11
  28. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +27 -15
  29. evalscope/benchmarks/race/race_adapter.py +3 -3
  30. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +1 -2
  31. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -8
  32. evalscope/cli/start_app.py +3 -2
  33. evalscope/collections/evaluator.py +103 -39
  34. evalscope/collections/sampler.py +2 -1
  35. evalscope/collections/schema.py +1 -2
  36. evalscope/config.py +1 -0
  37. evalscope/evaluator/evaluator.py +78 -64
  38. evalscope/metrics/math_parser.py +526 -0
  39. evalscope/metrics/metrics.py +16 -1
  40. evalscope/metrics/named_metrics.py +31 -7
  41. evalscope/models/chat_adapter.py +69 -47
  42. evalscope/models/choice_adapter.py +52 -45
  43. evalscope/models/custom_adapter.py +2 -2
  44. evalscope/models/local_model.py +4 -0
  45. evalscope/models/server_adapter.py +28 -34
  46. evalscope/report/app.py +298 -96
  47. evalscope/run.py +10 -7
  48. evalscope/utils/chat_service.py +2 -2
  49. evalscope/utils/io_utils.py +1 -1
  50. evalscope/version.py +2 -2
  51. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/METADATA +20 -11
  52. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/RECORD +57 -47
  53. tests/cli/test_run.py +93 -16
  54. evalscope/benchmarks/ceval/samples.jsonl +0 -1
  55. evalscope/metrics/math_accuracy.py +0 -200
  56. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/LICENSE +0 -0
  57. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/WHEEL +0 -0
  58. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/entry_points.txt +0 -0
  59. {evalscope-0.10.0.dist-info → evalscope-0.11.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/math_500/math_500_adapter.py ADDED
@@ -0,0 +1,49 @@
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
+ from evalscope.models import ChatGenerationModelAdapter
+ from evalscope.utils.logger import get_logger
+
+ # flake8: noqa
+
+ logger = get_logger()
+
+
+ @Benchmark.register(
+     name='math_500',
+     dataset_id='AI-ModelScope/MATH-500',
+     model_adapter=ChatGenerationModelAdapter,
+     subset_list=['default'],
+     metric_list=['AveragePass@1'],
+     few_shot_num=0,
+     train_split=None,
+     eval_split='test',
+     prompt_template='{query}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+ )
+ class Math500Adapter(DataAdapter):
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+     def gen_prompt(self, input_d: dict, few_shot_list: list, **kwargs) -> dict:
+         """
+         Generate the prompt for the model input.
+         """
+         problem = input_d['problem']
+         full_prompt = self.prompt_template.format(query=problem)
+
+         return {'data': [full_prompt], 'system_prompt': self.system_prompt}
+
+     def get_gold_answer(self, input_d: dict) -> str:
+         # Extract the gold answer from the input dict.
+         return strip_answer_string(input_d['answer'])
+
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+         """
+         Parse the model output to get the answer. Could be the best choice index.
+         """
+         # Note: Use same extraction method for both of checkpoint/service/custom
+         result = strip_answer_string(extract_answer(result))
+         return result
+
+     def match(self, gold: str, pred: str) -> float:
+         return math_equal(pred, gold)
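
The new adapter delegates answer checking to the new `evalscope.metrics.math_parser` module. A minimal sketch of that flow on a made-up completion (the completion string and the printed result are illustrative assumptions, not taken from this diff):

```python
from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string

# Hypothetical model output for a MATH-500-style problem.
completion = 'The two roots sum to 7, so the final answer is \\boxed{7}.'

# Same pipeline the adapter uses: extract the boxed answer, normalize, then compare.
pred = strip_answer_string(extract_answer(completion))
gold = strip_answer_string('7')  # the dataset's `answer` field, as returned by get_gold_answer
print(math_equal(pred, gold))    # expected to print True for this toy input
```
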
evalscope/benchmarks/mmlu/mmlu_adapter.py CHANGED
@@ -4,17 +4,15 @@ import os
  
  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import AverageAccuracy, exact_match
+ from evalscope.metrics import exact_match
  from evalscope.models import MultiChoiceModelAdapter
- from evalscope.utils import ResponseParser, normalize_score
+ from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger
  
  # flake8: noqa
  
  logger = get_logger()
  
- DATASET_ID = 'modelscope/mmlu'
-
  SUBSET_LIST = [
      'high_school_european_history',
      'business_ethics',
@@ -141,11 +139,11 @@ SUBJECT_MAPPING = {
      dataset_id='modelscope/mmlu',
      model_adapter=MultiChoiceModelAdapter,
      subset_list=SUBSET_LIST,
-     metric_list=[AverageAccuracy],
+     metric_list=['AverageAccuracy'],
      few_shot_num=5,
      train_split='train',
      eval_split='test',
-     prompt_template='',
+     prompt_template='The following are multiple choice questions (with answers) about {subset_name}. \n{query}',
  )
  class MMLUAdapter(DataAdapter):
  
@@ -221,17 +219,15 @@ class MMLUAdapter(DataAdapter):
              {'data': [full_prompt], 'multi_choices': self.choices}
  
          """
-         prompt = 'The following are multiple choice questions (with answers) about {}.\n\n'.format(
-             self._format_subject(subset_name))
          few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
  
          context: str = '\n'.join(few_shot_prompts) + '\n'
          context += self._generate_prompt(input_d=input_d, include_answer=False)
-         context = prompt + context
+         query = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
  
-         full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
+         full_prompt = self.prompt_template.format(subset_name=self._format_subject(subset_name), query=query)
  
-         return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
+         return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
  
      def get_gold_answer(self, input_d: dict) -> str:
          # Get the gold choice
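
The MMLU change moves the subject preamble out of ad-hoc string concatenation and into the registered `prompt_template`, which `gen_prompt` now fills via `str.format`. A toy illustration of that substitution (the subject name and question below are made up):

```python
# Template as registered above; {subset_name} and {query} are filled per sample.
prompt_template = 'The following are multiple choice questions (with answers) about {subset_name}. \n{query}'

query = ('What is the capital of France?\n'
         'A. Berlin\nB. Paris\nC. Rome\nD. Madrid\n'
         'Answer:')

print(prompt_template.format(subset_name='world geography', query=query))
```
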
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py CHANGED
@@ -3,22 +3,27 @@ from typing import Any, Dict
  
  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import AnswerKeys, EvalType
- from evalscope.metrics import AverageAccuracy, exact_match
+ from evalscope.metrics import exact_match
  from evalscope.models import ChatGenerationModelAdapter
  from evalscope.utils.utils import ResponseParser
  
+ SUBSET_LIST = [
+     'computer science', 'math', 'chemistry', 'engineering', 'law', 'biology', 'health', 'physics', 'business',
+     'philosophy', 'economics', 'other', 'psychology', 'history'
+ ]
+
  
  @Benchmark.register(
      name='mmlu_pro',
      dataset_id='modelscope/mmlu-pro',
      model_adapter=ChatGenerationModelAdapter,
-     subset_list=['default'],
-     metric_list=[AverageAccuracy],
+     subset_list=SUBSET_LIST,
+     metric_list=['AverageAccuracy'],
      few_shot_num=5,
      train_split='validation',
      eval_split='test',
      prompt_template=
-     'You are an knowledge expert, you are supposed to answer the multi-choice question to derive your final answer as `The answer is ...`.', # noqa: E501
+     'The following are multiple choice questions (with answers) about {subset_name}. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n{query}', # noqa: E501
  )
  class MMLUProAdapter(DataAdapter):
  
@@ -26,10 +31,11 @@ class MMLUProAdapter(DataAdapter):
          super().__init__(**kwargs)
  
          self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
-         self.categories = [
-             'computer science', 'math', 'chemistry', 'engineering', 'law', 'biology', 'health', 'physics', 'business',
-             'philosophy', 'economics', 'other', 'psychology', 'history'
-         ]
+
+     def load(self, **kwargs):
+         # default load all data
+         kwargs['subset_list'] = ['default']
+         return super().load(**kwargs)
  
      def gen_prompts(self, data_dict: dict, **kwargs) -> Dict[str, list]:
          """
@@ -37,26 +43,32 @@ class MMLUProAdapter(DataAdapter):
          Return a dict with category as key and list of prompts as value.
          """
  
-         data_dict = data_dict[self.subset_list[0]] # Only one subset for MMLU-Pro
+         data_dict = data_dict['default'] # Only one subset for MMLU-Pro
          fewshot_prompts = self.get_fewshot_examples(data_dict)
  
          # Use the category as key to group the prompts
         res_dict = defaultdict(list)
          # generate prompts for each test sample
          for entry in data_dict[self.eval_split]:
-             prefix = fewshot_prompts[entry['category']]
+             subset_name = entry['category']
+             if subset_name not in self.subset_list:
+                 continue
+             prefix = fewshot_prompts[subset_name]
              query = prefix + 'Q: ' + entry['question'] + '\n' + \
                  self.__form_options(entry['options']) + '\n'
  
-             prompt_d = {'data': [query], 'system_prompt': self.prompt_template, AnswerKeys.RAW_INPUT: entry}
+             full_prompt = self.prompt_template.format(subset_name=subset_name, query=query)
+             prompt_d = {'data': [full_prompt], 'system_prompt': self.system_prompt, AnswerKeys.RAW_INPUT: entry}
  
-             res_dict[entry['category']].append(prompt_d)
+             res_dict[subset_name].append(prompt_d)
          return res_dict
  
      def get_fewshot_examples(self, data_dict: dict):
-         # load 5-shot prompts for each category
-         prompts = {c: '' for c in self.categories}
-         for d in data_dict[self.train_split]:
+         # load few-shot prompts for each category
+         prompts = {c: '' for c in self.subset_list}
+         for index, d in enumerate(data_dict[self.train_split]):
+             if index >= self.few_shot_num:
+                 break
              prompts[d['category']] += 'Q:' + ' ' + d['question'] + '\n' + \
                  self.__form_options(d['options']) + '\n' + \
                  d['cot_content'] + '\n\n'
evalscope/benchmarks/race/race_adapter.py CHANGED
@@ -4,7 +4,7 @@ import os
  
  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import AverageAccuracy, exact_match
+ from evalscope.metrics import exact_match
  from evalscope.models import MultiChoiceModelAdapter
  from evalscope.utils import ResponseParser
  from evalscope.utils.io_utils import jsonl_to_list
@@ -20,7 +20,7 @@ logger = get_logger()
      dataset_id='modelscope/race',
      model_adapter=MultiChoiceModelAdapter,
      subset_list=['high', 'middle'],
-     metric_list=[AverageAccuracy],
+     metric_list=['AverageAccuracy'],
      few_shot_num=3,
      train_split='train',
      eval_split='test',
@@ -82,7 +82,7 @@ class RACEAdapter(DataAdapter):
  
          full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
  
-         return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
+         return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.system_prompt}
  
      def get_gold_answer(self, input_d: dict) -> str:
          # Get the gold choice
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py CHANGED
@@ -6,7 +6,6 @@ import os
  from evalscope.benchmarks import Benchmark
  from evalscope.benchmarks.data_adapter import DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import AverageAccuracy
  from evalscope.models import ChatGenerationModelAdapter
  from evalscope.utils import get_logger
  
@@ -20,7 +19,7 @@ logger = get_logger()
      dataset_id='modelscope/trivia_qa',
      model_adapter=ChatGenerationModelAdapter,
      subset_list=['default'],
-     metric_list=[AverageAccuracy],
+     metric_list=['AverageAccuracy'],
      few_shot_num=5,
      train_split='dev',
      eval_split='test',
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py CHANGED
@@ -9,9 +9,8 @@ from typing import List
  from evalscope.benchmarks import Benchmark
  from evalscope.benchmarks.data_adapter import DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import AverageAccuracy
  from evalscope.models import ContinuationLogitsModelAdapter
- from evalscope.utils import get_logger, normalize_score
+ from evalscope.utils import get_logger
  
  # flake8: noqa
  
@@ -25,7 +24,7 @@ logger = get_logger()
      dataset_id='modelscope/truthful_qa',
      model_adapter=ContinuationLogitsModelAdapter,
      subset_list=['multiple_choice'],
-     metric_list=[AverageAccuracy],
+     metric_list=['AverageAccuracy'],
      few_shot_num=0,
      train_split=None,
      eval_split='validation',
@@ -284,8 +283,9 @@ class TruthfulQaAdapter(DataAdapter):
                  logger.error(f'** Unknown review_res: {review_res_d}')
  
          # To get mc2 score
-         return [{
-             'metric_name': self.metric_list[0].name,
-             'score': self.metric_list[0].object(mc2_list),
-             'num': len(mc2_list)
-         }]
+         # return [{
+         #     'metric_name': self.metric_list[0].name,
+         #     'score': self.metric_list[0].object(mc2_list),
+         #     'num': len(mc2_list)
+         # }]
+         return super().compute_metric(mc2_list)
evalscope/cli/start_app.py CHANGED
@@ -3,7 +3,7 @@ import os
  from argparse import ArgumentParser
  
  from evalscope.cli.base import CLICommand
- from evalscope.report.app import create_app
+ from evalscope.report.app import add_argument, create_app
  
  
  def subparser_func(args):
@@ -23,7 +23,8 @@ class StartAppCMD(CLICommand):
          """ define args for create pipeline template command.
          """
          parser = parsers.add_parser(StartAppCMD.name)
+         add_argument(parser)
          parser.set_defaults(func=subparser_func)
  
      def execute(self):
-         create_app()
+         create_app(self.args)
evalscope/collections/evaluator.py CHANGED
@@ -2,14 +2,15 @@ import json
  import os
  import pandas as pd
  from collections import defaultdict
+ from concurrent.futures import ThreadPoolExecutor, as_completed
  from tabulate import tabulate
  from tqdm import tqdm
  from typing import List
  
- from evalscope.benchmarks import Benchmark
+ from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.collections.sampler import DatasetEntry
  from evalscope.config import TaskConfig
- from evalscope.constants import DataCollection, DumpMode
+ from evalscope.constants import AnswerKeys, DumpMode, EvalType
  from evalscope.evaluator import Evaluator
  from evalscope.models import get_local_model, initialize_model_adapter
  from evalscope.report import ReportGenerator
@@ -29,11 +30,16 @@ class SimpleEvaluator(Evaluator):
              task_cfg=task_cfg,
              outputs=outputs)
  
-     def get_answer(self, input_prompt, subset_name, infer_cfg) -> dict:
-         answer_d: dict = self.model_adapter.predict(inputs=input_prompt, infer_cfg=infer_cfg)
-         answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
-         processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
-         return processed_answer
+     def get_answer(self, samples, infer_cfg) -> List[dict]:
+         input_prompts = [sample.prompt for sample in samples]
+         subset_name = samples[0].subset_name
+         answers_list = []
+         answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
+         for answer_d, input_prompt in zip(answer_ds, input_prompts):
+             answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+             processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
+             answers_list.append(processed_answer)
+         return answers_list, samples
  
      def get_review(self, answer_d) -> dict:
          review_id, reviewer_spec = self._generate_review_id(answer_d)
@@ -42,38 +48,50 @@
  
      def get_score(self, review_d) -> float:
          metric_score: List[dict] = self.compute_metrics(reviews_list=[review_d])
-         # use the first metric by default
-         score = metric_score[0]['score']
-         return score
+         return metric_score
  
  
  class EvaluatorCollection:
  
-     def __init__(self, task_cfg: TaskConfig, outputs: OutputsStructure):
+     def __init__(self, task_cfg: TaskConfig, data_adapter: DataAdapter, outputs: OutputsStructure):
          self.task_cfg = task_cfg
+         self.data_adapter = data_adapter
          self.outputs = outputs
          self.model = get_local_model(task_cfg)
+
          self.dataset, self.dataset_name = self.load()
-         self.dataset_name_map, self.dataset_id_map = self._parse_dataset()
+         self.dataset_name_map = EvaluatorCollection._init_name_map(self.dataset)
+         self.dataset_id_map = EvaluatorCollection._init_id_map(self.dataset)
          self.evaluators = self._initialize_evaluators()
  
      def load(self) -> tuple[list[DatasetEntry], str]:
-         dataset_path = self.task_cfg.dataset_args[DataCollection.NAME]['local_path']
-         dataset_name = os.path.basename(dataset_path).split('.')[0]
-         raw_dataset = jsonl_to_list(dataset_path)
+         dataset_name = os.path.basename(self.data_adapter.dataset_id).split('.')[0]
+         raw_dataset = self.data_adapter.load()
+         # limit the dataset
+         if self.task_cfg.limit:
+             raw_dataset = raw_dataset[:self.task_cfg.limit]
+         # index dataset
          datasets = []
          for sample in raw_dataset:
+             sample['prompt'].update({'index': sample['index']})
              datasets.append(DatasetEntry(**sample))
+
          return datasets, dataset_name
  
-     def _parse_dataset(self):
+     @staticmethod
+     def _init_name_map(dataset):
          dataset_name_map = defaultdict(lambda: defaultdict(list))
-         dataset_id_map = {}
-         for sample in self.dataset:
+         for sample in dataset:
              dataset_name, subset_name = sample.dataset_name, sample.subset_name
              dataset_name_map[dataset_name][subset_name].append(sample.index)
+         return dataset_name_map
+
+     @staticmethod
+     def _init_id_map(dataset):
+         dataset_id_map = {}
+         for sample in dataset:
              dataset_id_map[sample.index] = sample
-         return dataset_name_map, dataset_id_map
+         return dataset_id_map
  
      def _initialize_evaluators(self):
          evaluators = {}
@@ -93,15 +111,16 @@ class EvaluatorCollection:
              for subset_name, ids in data_map.items():
                  for _id in ids:
                      row_data: DatasetEntry = self.dataset_id_map[_id]
-                     score = scores[_id]
-                     data.append(
-                         dict(
-                             task_type=row_data.task_type,
-                             categories=tuple(row_data.categories),
-                             dataset_name=dataset_name,
-                             subset_name=subset_name,
-                             tags=row_data.tags,
-                             score=score))
+                     for metric in scores[_id]:
+                         data.append(
+                             dict(
+                                 task_type=row_data.task_type,
+                                 categories=tuple(row_data.categories),
+                                 dataset_name=dataset_name,
+                                 subset_name=subset_name,
+                                 tags=row_data.tags,
+                                 metric=metric['metric_name'],
+                                 score=metric['score']))
          return pd.DataFrame(data)
  
      def aggregate_and_sort(df, group_by_cols):
@@ -117,13 +136,13 @@
          df = get_dataframe(scores)
  
          # multi-level aggregation
-         subset_report_df = aggregate_and_sort(df, ['task_type', 'dataset_name', 'subset_name'])
-         dataset_report_df = aggregate_and_sort(df, ['task_type', 'dataset_name'])
-         task_report_df = aggregate_and_sort(df, ['task_type'])
+         subset_report_df = aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name', 'subset_name'])
+         dataset_report_df = aggregate_and_sort(df, ['task_type', 'metric', 'dataset_name'])
+         task_report_df = aggregate_and_sort(df, ['task_type', 'metric'])
  
          # explode tags to multiple rows
          df_exploded_tags = df.explode('tags')
-         tag_report_df = aggregate_and_sort(df_exploded_tags, ['tags'])
+         tag_report_df = aggregate_and_sort(df_exploded_tags, ['tags', 'metric'])
  
          # process multi-level categories
          df_categories = df.copy()
@@ -132,7 +151,8 @@
          for level in range(max_depth):
              df_categories[f'category{level}'] = df_categories['categories'].apply(lambda x: x[level]
                                                                                    if len(x) > level else '')
-         category_report_df = aggregate_and_sort(df_categories, [f'category{level}' for level in range(max_depth)])
+         category_report_df = aggregate_and_sort(df_categories,
+                                                 [f'category{level}' for level in range(max_depth)] + ['metric'])
  
          # convert to dict format
          report_dict = {
@@ -155,16 +175,60 @@
          with open(report_file_path, 'w', encoding='utf-8') as f:
              json.dump(report.to_dict(), f, ensure_ascii=False, indent=4)
  
+     def _filter_answer(self, pred_file_path):
+         answer_dict = defaultdict(dict)
+         if self.task_cfg.use_cache and os.path.exists(pred_file_path):
+             answers_list = jsonl_to_list(pred_file_path)
+             indices = set()
+             for answer in answers_list:
+                 index = answer[AnswerKeys.ORIGIN_PROMPT].get('index')
+                 answer_dict[index] = answer
+                 indices.add(index)
+             data = []
+             for sample in self.dataset:
+                 if sample.index not in indices:
+                     data.append(sample)
+             data_map = self._init_name_map(data)
+
+             return answer_dict, data, data_map
+         return answer_dict, self.dataset, self.dataset_name_map
+
      def get_answers(self):
          pred_file_path = os.path.join(self.outputs.predictions_dir, self.task_cfg.model_id,
                                        f'{self.dataset_name}.jsonl')
          os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)
-         answers = defaultdict(dict)
-         for sample in tqdm(self.dataset, desc='Getting answers'):
-             evaluator = self.evaluators[sample.dataset_name]
-             answer_d = evaluator.get_answer(sample.prompt, sample.subset_name, self.task_cfg.generation_config)
-             answers[sample.index] = answer_d
-             dump_jsonl_data(answer_d, pred_file_path, dump_mode=DumpMode.APPEND)
+
+         answers, dataset, dataset_name_map = self._filter_answer(pred_file_path)
+
+         eval_batch_size = self.task_cfg.eval_batch_size
+         with tqdm(total=len(dataset), desc='Getting answers') as pbar:
+             if self.task_cfg.eval_type == EvalType.SERVICE:
+                 with ThreadPoolExecutor(max_workers=eval_batch_size) as executor:
+                     futures = []
+                     for sample in dataset:
+                         evaluator = self.evaluators[sample.dataset_name]
+                         futures.append(executor.submit(evaluator.get_answer, [sample], self.task_cfg.generation_config))
+                     for future in as_completed(futures):
+                         answer_list, samples = future.result()
+                         answers[samples[0].index] = answer_list[0]
+                         dump_jsonl_data(answer_list, pred_file_path, dump_mode=DumpMode.APPEND)
+                         pbar.update(1)
+             else:
+                 for dataset_name, data_map in dataset_name_map.items():
+                     # get evaluator for the dataset
+                     evaluator = self.evaluators[dataset_name]
+                     for subset_name, ids in data_map.items():
+                         for i in range(0, len(ids), eval_batch_size):
+                             # get batch samples
+                             batch_ids = ids[i:i + eval_batch_size]
+                             batch_samples = [self.dataset_id_map[_id] for _id in batch_ids]
+                             answer_list, _ = evaluator.get_answer(batch_samples, self.task_cfg.generation_config)
+                             # update answers
+                             for j, _id in enumerate(batch_ids):
+                                 answers[_id] = answer_list[j]
+                             dump_jsonl_data(answer_list, pred_file_path, dump_mode=DumpMode.APPEND)
+
+                             pbar.update(len(batch_ids))
         return answers
  
      def get_reviews(self, answers):
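
`EvaluatorCollection.get_answers` now batches work: for service endpoints it fans single-sample calls out over a thread pool sized by the new `eval_batch_size`, and otherwise it slices each subset into batches. The concurrency pattern in isolation, with a stand-in worker instead of the real `evaluator.get_answer` (the names below are illustrative, not evalscope APIs):

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def fetch_answer(sample: str) -> str:
    # Stand-in for a per-sample call to a model service.
    return f'answer for {sample}'

samples = [f'sample-{i}' for i in range(10)]
answers = {}

with ThreadPoolExecutor(max_workers=4) as executor:   # max_workers plays the role of eval_batch_size
    futures = {executor.submit(fetch_answer, s): s for s in samples}
    for future in as_completed(futures):              # collect results as they finish
        answers[futures[future]] = future.result()

print(len(answers))  # 10
```
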
evalscope/collections/sampler.py CHANGED
@@ -44,7 +44,8 @@ class Sampler(ABC):
                         dataset_name=dataset.name,
                         subset_name=subset_name,
                     ))
-         sampled_data = random.choices(all_data, k=count)
+         count = min(count, len(all_data)) # avoid sampling more than the dataset size
+         sampled_data = random.sample(all_data, k=count)
          return sampled_data
  
      def _update_index(self, all_data: List[DatasetEntry]) -> List[dict]:
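
The sampler now draws without replacement: `random.choices` samples with replacement (duplicates possible, and `k` may exceed the population), while `random.sample` raises `ValueError` when `k` is larger than the population, hence the clamp on `count`. A quick standard-library comparison:

```python
import random

all_data = list(range(5))
count = 8

# Old behaviour: with replacement, so duplicates appear and k can exceed len(all_data).
with_replacement = random.choices(all_data, k=count)

# New behaviour: clamp count first, then draw unique entries.
count = min(count, len(all_data))
without_replacement = random.sample(all_data, k=count)

print(with_replacement)     # e.g. [3, 3, 0, 4, 1, 0, 2, 2]
print(without_replacement)  # 5 distinct entries, e.g. [2, 0, 4, 1, 3]
```
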
evalscope/collections/schema.py CHANGED
@@ -19,8 +19,7 @@ class DatasetInfo:
          benchmark_meta = Benchmark.get(self.name)
  
          data_adapter = benchmark_meta.get_data_adapter(config=self.args)
-         data_dict = data_adapter.load(
-             dataset_name_or_path=benchmark_meta.dataset_id, subset_list=benchmark_meta.subset_list)
+         data_dict = data_adapter.load()
          prompts = data_adapter.gen_prompts(data_dict)
          return prompts
  
evalscope/config.py CHANGED
@@ -54,6 +54,7 @@ class TaskConfig:
      eval_config: Union[str, Dict, None] = None
      stage: str = EvalStage.ALL
      limit: Optional[int] = None
+     eval_batch_size: int = 1
  
      # Cache and working directory arguments
      mem_cache: bool = False # Deprecated, will be removed in v1.0.0.
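
The new `eval_batch_size` field defaults to 1, so existing configs keep the old one-sample-at-a-time behaviour. A sketch of how it might be set alongside the new MATH-500 benchmark; the `run_task` entry point and the `model`/`datasets` fields follow evalscope's documented usage rather than this diff, so treat them as assumptions:

```python
from evalscope.config import TaskConfig
from evalscope.run import run_task  # assumed entry point in evalscope/run.py

task_cfg = TaskConfig(
    model='qwen2.5-7b-instruct',   # placeholder model identifier
    datasets=['math_500'],         # benchmark registered in this release
    eval_batch_size=8,             # new in 0.11.0: batch size / number of parallel requests
    limit=10,                      # evaluate only the first 10 samples
)

run_task(task_cfg)
```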