evalscope 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. evalscope/arguments.py +1 -0
  2. evalscope/benchmarks/arc/arc_adapter.py +3 -5
  3. evalscope/benchmarks/bbh/bbh_adapter.py +3 -3
  4. evalscope/benchmarks/benchmark.py +1 -1
  5. evalscope/benchmarks/ceval/ceval_adapter.py +5 -82
  6. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +5 -79
  7. evalscope/benchmarks/competition_math/competition_math_adapter.py +4 -4
  8. evalscope/benchmarks/data_adapter.py +69 -70
  9. evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -63
  10. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +4 -5
  11. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +12 -6
  12. evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -4
  13. evalscope/benchmarks/ifeval/__init__.py +0 -0
  14. evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
  15. evalscope/benchmarks/ifeval/instructions.py +1478 -0
  16. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  17. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  18. evalscope/benchmarks/ifeval/utils.py +134 -0
  19. evalscope/benchmarks/iquiz/__init__.py +0 -0
  20. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  21. evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -84
  22. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +2 -2
  23. evalscope/benchmarks/race/race_adapter.py +4 -73
  24. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -6
  25. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -57
  26. evalscope/cli/cli.py +2 -0
  27. evalscope/cli/start_app.py +29 -0
  28. evalscope/collections/evaluator.py +82 -62
  29. evalscope/collections/sampler.py +47 -41
  30. evalscope/collections/schema.py +14 -10
  31. evalscope/constants.py +4 -0
  32. evalscope/evaluator/evaluator.py +22 -13
  33. evalscope/metrics/__init__.py +2 -5
  34. evalscope/metrics/metrics.py +11 -2
  35. evalscope/metrics/named_metrics.py +17 -0
  36. evalscope/models/server_adapter.py +11 -4
  37. evalscope/perf/__init__.py +1 -0
  38. evalscope/perf/main.py +0 -1
  39. evalscope/perf/plugin/api/custom_api.py +1 -1
  40. evalscope/perf/plugin/api/openai_api.py +1 -1
  41. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  42. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  43. evalscope/report/__init__.py +5 -0
  44. evalscope/report/app.py +506 -0
  45. evalscope/report/combinator.py +73 -0
  46. evalscope/report/generator.py +80 -0
  47. evalscope/report/utils.py +133 -0
  48. evalscope/run.py +16 -11
  49. evalscope/summarizer.py +1 -1
  50. evalscope/utils/chat_service.py +1 -1
  51. evalscope/utils/logger.py +1 -0
  52. evalscope/utils/model_utils.py +5 -2
  53. evalscope/version.py +2 -2
  54. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/METADATA +84 -7
  55. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/RECORD +62 -50
  56. tests/cli/test_collection.py +11 -7
  57. tests/cli/test_run.py +13 -4
  58. evalscope/tools/__init__.py +0 -1
  59. evalscope/tools/combine_reports.py +0 -133
  60. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  61. /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
  62. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
  63. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
  64. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
  65. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/general_qa/general_qa_adapter.py

@@ -1,13 +1,11 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
  import glob
- import json
  import os.path
  from collections import defaultdict
- from typing import Any, Optional
+ from typing import List

  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.metrics import (WeightedAverageBLEU, bleu_ngram_one_sample, compute_rouge_score_one_sample_zh,
- weighted_mean)
+ from evalscope.metrics import AverageBLEU, bleu_ngram_one_sample, compute_rouge_score_one_sample_zh, mean
  from evalscope.models import ChatGenerationModelAdapter
  from evalscope.utils.io_utils import jsonl_to_list
  from evalscope.utils.logger import get_logger
@@ -20,7 +18,7 @@ logger = get_logger()
  dataset_id='general_qa',
  model_adapter=ChatGenerationModelAdapter,
  subset_list=['default'],
- metric_list=[WeightedAverageBLEU],
+ metric_list=[AverageBLEU],
  few_shot_num=0,
  train_split=None,
  eval_split='test',
@@ -68,7 +66,7 @@ class GeneralQAAdapter(DataAdapter):

  # if len(history) > 0:
  # prompt = '\n'.join(history) + '\n' + prompt
- return {'data': [prompt]}
+ return {'data': [prompt], 'system_prompt': self.prompt_template}

  def get_gold_answer(self, input_d: dict) -> str:
  """
@@ -92,14 +90,14 @@ class GeneralQAAdapter(DataAdapter):
  """
  return result

- def match(self, gold: str, pred: str) -> float:
+ def match(self, gold: str, pred: str) -> dict:
  """
  Args:
  gold: str
  pred: str

  Returns:
- bleu_score: float
+ bleu_score: dict

  """
  res = dict()
@@ -107,10 +105,9 @@ class GeneralQAAdapter(DataAdapter):
  bleu_dict = bleu_ngram_one_sample(pred, gold)
  res.update(rouge_dict)
  res.update(bleu_dict)
- # return bleu(item)
  return res

- def compute_metric(self, review_res_list: list) -> float:
+ def compute_metric(self, review_res_list: List[dict]) -> List[dict]:
  """
  compute weighted mean of the bleu score of all samples

@@ -118,62 +115,12 @@ class GeneralQAAdapter(DataAdapter):
  review_res_list: [score1, score2, ...]

  Returns:
- avg_res: float
+ avg_res: List[dict]

  """
  items = defaultdict(list)
  for scores in review_res_list:
  for k, v in scores.items():
- items[k].append((v, 1.0))
+ items[k].append(v)
  # items = [(score, 1.0) for score in review_res_list]
- res = {k: weighted_mean(v) for k, v in items.items()}
- # return weighted_mean(items)
- return res
-
- def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
- """
- Args:
- subset_score_map: {subset_name: (score_dict, num), ...}
- report_name: str, the user-defined report name.
-
- Returns:
- {
- "name":"GeneralQA",
- "metric":"WeightedAverageBLEU",
- "score":0.399,
- "category":[
- {
- "name":"DEFAULT",
- "score":0.399,
- "subset":[
- {
- "name":"default",
- "score":0.399
- },
- ]
- }
- ],
- "total_num":10
- }
- """
- total_num: int = sum([num for _, num in subset_score_map.values()])
- # weighted_avg_bleu: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
- cate_avg_list = [{
- 'name': subset_name,
- 'score': score_dict
- } for subset_name, (score_dict, _) in subset_score_map.items()]
- total_avg_list = defaultdict(float)
- for score_dict, num in subset_score_map.values():
- for metric, score in score_dict.items():
- total_avg_list[metric] += score * num / total_num
-
- category_d = dict(name='DEFAULT', score=total_avg_list, subset=cate_avg_list)
-
- res_map = dict(
- name=report_name or 'general_qa',
- metric=self.metric_list[0]['name'],
- score=total_avg_list,
- category=[category_d],
- total_num=total_num)
-
- return res_map
+ return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
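
The general_qa change above drops the per-adapter gen_report and per-sample weighting in favour of a flat per-metric average. Below is a minimal, self-contained sketch of the new compute_metric contract; the metric keys and sample scores are illustrative only, and statistics.mean stands in for evalscope.metrics.mean.

from collections import defaultdict
from statistics import mean  # stand-in for evalscope.metrics.mean

# Two toy review results, each a dict of per-sample metric scores
# (the shape GeneralQAAdapter.match now returns).
review_res_list = [
    {'rouge-l-f': 0.25, 'bleu-4': 0.50},
    {'rouge-l-f': 0.75, 'bleu-4': 0.25},
]

items = defaultdict(list)
for scores in review_res_list:
    for k, v in scores.items():
        items[k].append(v)

# New return shape: one entry per metric with its unweighted mean and sample count.
print([{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()])
# [{'metric_name': 'rouge-l-f', 'score': 0.5, 'num': 2}, {'metric_name': 'bleu-4', 'score': 0.375, 'num': 2}]
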
evalscope/benchmarks/gsm8k/gsm8k_adapter.py

@@ -6,7 +6,7 @@ import os
  import re

  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.metrics import WeightedAverageAccuracy
+ from evalscope.metrics import AverageAccuracy
  from evalscope.models import ChatGenerationModelAdapter
  from evalscope.utils.io_utils import jsonl_to_list
  from evalscope.utils.logger import get_logger
@@ -19,7 +19,7 @@ logger = get_logger()
  dataset_id='modelscope/gsm8k',
  model_adapter=ChatGenerationModelAdapter,
  subset_list=['main'],
- metric_list=[WeightedAverageAccuracy],
+ metric_list=[AverageAccuracy],
  few_shot_num=4,
  train_split='train',
  eval_split='test',
@@ -33,7 +33,7 @@ class GSM8KAdapter(DataAdapter):

  Args:
  subset_list (list): Subset list for the dataset. Default: ['main']
- metric_list (list): Metric list for the dataset. Default: [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
+ metric_list (list): Metric list for the dataset. Default: [{'name': 'AverageAccuracy', 'object': mean}]
  few_shot_num (int): Number of few-shot examples. Default: 4
  train_split (str): Train split name. Default: 'train'
  eval_split (str): The target eval split name. Default: 'test'
@@ -75,9 +75,8 @@ class GSM8KAdapter(DataAdapter):
  use_fewshot = self.few_shot_num > 0

  full_prompt = self._generate_prompt(input_d, few_shot_list=few_shot_list, use_fewshot=use_fewshot)
- full_prompt = f'{self.prompt_template}\n{full_prompt}' if self.prompt_template else full_prompt

- return {'data': [full_prompt]}
+ return {'data': [full_prompt], 'system_prompt': self.prompt_template}

  def get_gold_answer(self, input_d: dict) -> str:
  # Extract the gold answer from the input dict.
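
As in general_qa, gsm8k no longer prepends prompt_template to the user prompt; the template now travels alongside the data under 'system_prompt'. A rough sketch of how a downstream adapter might consume that dict follows; build_messages and the chat-message format here are assumptions for illustration, not the evalscope API.

from typing import Dict, List

def build_messages(prompt_dict: dict) -> List[Dict[str, str]]:
    # Hypothetical helper: turn the adapter's {'data': [...], 'system_prompt': ...}
    # payload into chat messages, sending the template as a system turn
    # instead of gluing it onto the user prompt.
    messages = []
    system_prompt = prompt_dict.get('system_prompt') or ''
    if system_prompt:
        messages.append({'role': 'system', 'content': system_prompt})
    for user_prompt in prompt_dict['data']:
        messages.append({'role': 'user', 'content': user_prompt})
    return messages

# Example with the shape GSM8KAdapter.gen_prompt now returns:
print(build_messages({'data': ['Q: 2 + 2 = ?'], 'system_prompt': 'Answer step by step.'}))
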
evalscope/benchmarks/hellaswag/hellaswag_adapter.py

@@ -5,10 +5,11 @@ import re

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import WeightedAverageAccuracy, exact_match
+ from evalscope.metrics import AverageAccuracy, exact_match
  from evalscope.models import ContinuationLogitsModelAdapter
  from evalscope.utils.io_utils import jsonl_to_list
  from evalscope.utils.logger import get_logger
+ from evalscope.utils.utils import ResponseParser

  # flake8: noqa

@@ -20,11 +21,12 @@ logger = get_logger()
  dataset_id='modelscope/hellaswag',
  model_adapter=ContinuationLogitsModelAdapter,
  subset_list=['default'],
- metric_list=[WeightedAverageAccuracy],
+ metric_list=[AverageAccuracy],
  few_shot_num=0,
  train_split='train',
  eval_split='validation',
- prompt_template='',
+ prompt_template=
+ 'Respond with the index of sentence that makes the most sense, chose from 0, 1, 2, 3, derive your final answer as `The answer is ...`.', # noqa: E501
  )
  class HellaSwagAdapter(DataAdapter):

@@ -87,7 +89,11 @@ class HellaSwagAdapter(DataAdapter):

  ctx_continuation_pair_list = [(context.strip(), ' ' + cont.strip()) for cont in endings]

- return {'data': ctx_continuation_pair_list, 'multi_choices': self.choices}
+ return {
+ 'data': ctx_continuation_pair_list,
+ 'multi_choices': self.choices,
+ 'system_prompt': self.prompt_template
+ }

  def get_gold_answer(self, input_d: dict) -> str:
  # Get the gold choice
@@ -114,9 +120,9 @@ class HellaSwagAdapter(DataAdapter):

  return str(best_choice_idx)
  elif eval_type == EvalType.SERVICE:
- return result # TODO: to be supported !
+ return ResponseParser.parse_first_option(result)
  elif eval_type == EvalType.CUSTOM:
- return result # TODO: to be supported !
+ return ResponseParser.parse_first_option(result)
  else:
  raise ValueError(f'Invalid eval_type: {eval_type}')
 
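With this change, SERVICE and CUSTOM evaluation reduce the raw model reply to a choice index via ResponseParser.parse_first_option before matching. The stand-alone sketch below only illustrates that kind of first-option parsing under the new prompt template; it is not the actual ResponseParser implementation, which this diff does not show.

import re

def parse_first_option_sketch(text: str, options=('0', '1', '2', '3')) -> str:
    # Prefer an explicit "The answer is X" statement, as requested by the new
    # HellaSwag prompt template; otherwise fall back to the first option token.
    match = re.search(r'The answer is\s*\(?([0-3])\)?', text, flags=re.IGNORECASE)
    if match:
        return match.group(1)
    for token in re.findall(r'\b[0-3]\b', text):
        if token in options:
            return token
    return ''  # nothing recognizable; exact_match against the gold index will fail

print(parse_first_option_sketch('Let me think... The answer is 2.'))  # -> '2'
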
evalscope/benchmarks/humaneval/humaneval_adapter.py

@@ -1,6 +1,5 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
  import re
- from typing import List

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.metrics import Pass1
@@ -22,7 +21,7 @@ logger = get_logger()
  few_shot_num=0,
  train_split=None,
  eval_split='test',
- prompt_template='Complete the following python code:\n',
+ prompt_template='',
  )
  class HumanevalAdapter(DataAdapter):
  """
@@ -66,9 +65,9 @@ class HumanevalAdapter(DataAdapter):
  {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}
  """
  full_prompt = input_d['prompt']
- full_prompt = f'{self.prompt_template}\n{full_prompt}' if self.prompt_template else full_prompt
+ full_prompt = f'Complete the following python code:\n{full_prompt}' if self.prompt_template else full_prompt

- return {'data': [full_prompt]}
+ return {'data': [full_prompt], 'system_prompt': self.prompt_template}

  @classmethod
  def _postprocess(cls, text: str) -> str:
evalscope/benchmarks/ifeval/__init__.py (file without changes)
evalscope/benchmarks/ifeval/ifeval_adapter.py (new file)

@@ -0,0 +1,57 @@
+ from collections import defaultdict
+ from typing import Any, Dict, List
+
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.benchmarks.ifeval.utils import agg_inst_level_acc, process_results
+ from evalscope.constants import EvalType
+ from evalscope.metrics import Metric, mean
+ from evalscope.models import ChatGenerationModelAdapter
+ from evalscope.utils.utils import normalize_score
+
+
+ @Benchmark.register(
+ name='ifeval',
+ dataset_id='opencompass/ifeval',
+ model_adapter=ChatGenerationModelAdapter,
+ subset_list=['default'],
+ metric_list=[
+ Metric(name='prompt_level_strict_acc', object=mean),
+ Metric(name='inst_level_strict_acc', object=agg_inst_level_acc),
+ Metric(name='prompt_level_loose_acc', object=mean),
+ Metric(name='inst_level_loose_acc', object=agg_inst_level_acc),
+ ],
+ few_shot_num=0,
+ train_split=None,
+ eval_split='train',
+ prompt_template='',
+ )
+ class IFEvalAdapter(DataAdapter):
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+
+ def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+ return {'data': [input_d['prompt']], 'system_prompt': self.prompt_template}
+
+ def get_gold_answer(self, input_d: dict) -> str:
+ return input_d
+
+ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+ return result
+
+ def match(self, gold: Any, pred: Any) -> Dict:
+ return process_results(gold, [pred])
+
+ def compute_metric(self, review_res_list: List[dict]) -> Any:
+ # aggregate review results
+ res_dict = defaultdict(list)
+ for res in review_res_list:
+ for k, v in res.items():
+ res_dict[k].append(v)
+
+ metrics = []
+ for metric in self.metric_list:
+ metric_name = metric.name
+ pred_value = res_dict[metric_name]
+ metrics.append({'metric_name': metric_name, 'score': metric.object(pred_value), 'num': len(pred_value)})
+ return metrics
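
To see how the new IFEvalAdapter.compute_metric aggregation behaves, here is a runnable toy example. The simplified Metric, mean, and agg_inst_level_acc stand-ins are assumptions made so the snippet is self-contained, and the review-result shape only mimics what process_results yields per sample.

from collections import defaultdict
from dataclasses import dataclass
from typing import Callable, List

@dataclass
class Metric:  # simplified stand-in for evalscope.metrics.Metric
    name: str
    object: Callable

def mean(values: List[float]) -> float:
    return sum(values) / len(values)

def agg_inst_level_acc(items: List[List[bool]]) -> float:
    # Flatten per-prompt lists of per-instruction pass/fail flags, then average.
    flat = [flag for sub in items for flag in sub]
    return sum(flat) / len(flat)

metric_list = [
    Metric('prompt_level_strict_acc', mean),
    Metric('inst_level_strict_acc', agg_inst_level_acc),
]

# Each entry mimics the per-sample dict returned by IFEvalAdapter.match.
review_res_list = [
    {'prompt_level_strict_acc': True, 'inst_level_strict_acc': [True, False]},
    {'prompt_level_strict_acc': False, 'inst_level_strict_acc': [True, True]},
]

res_dict = defaultdict(list)
for res in review_res_list:
    for k, v in res.items():
        res_dict[k].append(v)

for metric in metric_list:
    values = res_dict[metric.name]
    print({'metric_name': metric.name, 'score': metric.object(values), 'num': len(values)})
# {'metric_name': 'prompt_level_strict_acc', 'score': 0.5, 'num': 2}
# {'metric_name': 'inst_level_strict_acc', 'score': 0.75, 'num': 2}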