evalscope 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. evalscope/arguments.py +1 -0
  2. evalscope/benchmarks/arc/arc_adapter.py +3 -5
  3. evalscope/benchmarks/bbh/bbh_adapter.py +3 -3
  4. evalscope/benchmarks/benchmark.py +1 -1
  5. evalscope/benchmarks/ceval/ceval_adapter.py +5 -82
  6. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +5 -79
  7. evalscope/benchmarks/competition_math/competition_math_adapter.py +4 -4
  8. evalscope/benchmarks/data_adapter.py +69 -70
  9. evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -63
  10. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +4 -5
  11. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +12 -6
  12. evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -4
  13. evalscope/benchmarks/ifeval/__init__.py +0 -0
  14. evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
  15. evalscope/benchmarks/ifeval/instructions.py +1478 -0
  16. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  17. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  18. evalscope/benchmarks/ifeval/utils.py +134 -0
  19. evalscope/benchmarks/iquiz/__init__.py +0 -0
  20. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  21. evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -84
  22. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +2 -2
  23. evalscope/benchmarks/race/race_adapter.py +4 -73
  24. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -6
  25. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -57
  26. evalscope/cli/cli.py +2 -0
  27. evalscope/cli/start_app.py +29 -0
  28. evalscope/collections/evaluator.py +82 -62
  29. evalscope/collections/sampler.py +47 -41
  30. evalscope/collections/schema.py +14 -10
  31. evalscope/constants.py +4 -0
  32. evalscope/evaluator/evaluator.py +22 -13
  33. evalscope/metrics/__init__.py +2 -5
  34. evalscope/metrics/metrics.py +11 -2
  35. evalscope/metrics/named_metrics.py +17 -0
  36. evalscope/models/server_adapter.py +11 -4
  37. evalscope/perf/__init__.py +1 -0
  38. evalscope/perf/main.py +0 -1
  39. evalscope/perf/plugin/api/custom_api.py +1 -1
  40. evalscope/perf/plugin/api/openai_api.py +1 -1
  41. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  42. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  43. evalscope/report/__init__.py +5 -0
  44. evalscope/report/app.py +506 -0
  45. evalscope/report/combinator.py +73 -0
  46. evalscope/report/generator.py +80 -0
  47. evalscope/report/utils.py +133 -0
  48. evalscope/run.py +16 -11
  49. evalscope/summarizer.py +1 -1
  50. evalscope/utils/chat_service.py +1 -1
  51. evalscope/utils/logger.py +1 -0
  52. evalscope/utils/model_utils.py +5 -2
  53. evalscope/version.py +2 -2
  54. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/METADATA +84 -7
  55. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/RECORD +62 -50
  56. tests/cli/test_collection.py +11 -7
  57. tests/cli/test_run.py +13 -4
  58. evalscope/tools/__init__.py +0 -1
  59. evalscope/tools/combine_reports.py +0 -133
  60. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  61. /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
  62. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
  63. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
  64. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
  65. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/ifeval/utils.py ADDED
@@ -0,0 +1,134 @@
+ import dataclasses
+ from typing import Dict, Optional, Union
+
+ from evalscope.benchmarks.ifeval import instructions_registry
+
+
+ @dataclasses.dataclass
+ class InputExample:
+     key: int
+     instruction_id_list: list[str]
+     prompt: str
+     kwargs: list[Dict[str, Optional[Union[str, int]]]]
+
+
+ @dataclasses.dataclass
+ class OutputExample:
+     instruction_id_list: list[str]
+     prompt: str
+     response: str
+     follow_all_instructions: bool
+     follow_instruction_list: list[bool]
+
+
+ def test_instruction_following_strict(
+     inp,
+     response,
+ ):
+     """Tests response to see if instructions are followed."""
+     instruction_list = inp.instruction_id_list
+     is_following_list = []
+
+     for index, instruction_id in enumerate(instruction_list):
+         instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
+         instruction = instruction_cls(instruction_id)
+
+         # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method.
+         kwargs = {k: v for k, v in inp.kwargs[index].items() if v}
+         instruction.build_description(**kwargs)
+         args = instruction.get_instruction_args()
+         if args and 'prompt' in args:
+             instruction.build_description(prompt=inp.prompt)
+
+         if response.strip() and instruction.check_following(response):
+             is_following_list.append(True)
+         else:
+             is_following_list.append(False)
+
+     return OutputExample(
+         instruction_id_list=inp.instruction_id_list,
+         prompt=inp.prompt,
+         response=response,
+         follow_all_instructions=all(is_following_list),
+         follow_instruction_list=is_following_list,
+     )
+
+
+ def test_instruction_following_loose(
+     inp,
+     response,
+ ):
+     """Tests response for an upper bound for following instructions."""
+     r = response.split('\n')
+     response_remove_first = '\n'.join(r[1:]).strip()
+     response_remove_last = '\n'.join(r[:-1]).strip()
+     response_remove_both = '\n'.join(r[1:-1]).strip()
+     revised_response = response.replace('*', '')
+     revised_response_remove_first = response_remove_first.replace('*', '')
+     revised_response_remove_last = response_remove_last.replace('*', '')
+     revised_response_remove_both = response_remove_both.replace('*', '')
+     all_responses = [
+         response,
+         revised_response,
+         response_remove_first,
+         response_remove_last,
+         response_remove_both,
+         revised_response_remove_first,
+         revised_response_remove_last,
+         revised_response_remove_both,
+     ]
+     instruction_list = inp.instruction_id_list
+     is_following_list = []
+
+     for index, instruction_id in enumerate(instruction_list):
+         instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
+         instruction = instruction_cls(instruction_id)
+
+         # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method.
+         kwargs = {k: v for k, v in inp.kwargs[index].items() if v}
+         instruction.build_description(**kwargs)
+         args = instruction.get_instruction_args()
+         if args and 'prompt' in args:
+             instruction.build_description(prompt=inp.prompt)
+
+         is_following = False
+         for r in all_responses:
+             if r.strip() and instruction.check_following(r):
+                 is_following = True
+                 break
+
+         is_following_list.append(is_following)
+
+     return OutputExample(
+         instruction_id_list=inp.instruction_id_list,
+         prompt=inp.prompt,
+         response=response,
+         follow_all_instructions=all(is_following_list),
+         follow_instruction_list=is_following_list,
+     )
+
+
+ def process_results(doc, results):
+     inp = InputExample(
+         key=doc['key'],
+         instruction_id_list=doc['instruction_id_list'],
+         prompt=doc['prompt'],
+         kwargs=doc['kwargs'],
+     )
+     response = results[0]
+
+     out_strict = test_instruction_following_strict(inp, response)
+     out_loose = test_instruction_following_loose(inp, response)
+
+     return {
+         'prompt_level_strict_acc': out_strict.follow_all_instructions,
+         'inst_level_strict_acc': out_strict.follow_instruction_list,
+         'prompt_level_loose_acc': out_loose.follow_all_instructions,
+         'inst_level_loose_acc': out_loose.follow_instruction_list,
+     }
+
+
+ def agg_inst_level_acc(items):
+     flat_items = [item for sublist in items for item in sublist]
+     inst_level_acc = sum(flat_items) / len(flat_items)
+     return inst_level_acc
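For orientation, here is a minimal usage sketch (not part of the diff) of how these new IFEval helpers fit together: process_results scores one sample with both the strict and loose checkers, and agg_inst_level_acc flattens the per-instruction booleans across samples into a single accuracy. The sample record, response, and the 'punctuation:no_comma' instruction id are assumptions for illustration; only the function names come from the file above.

# Hypothetical usage sketch for the new ifeval utilities; the doc and response are invented.
from evalscope.benchmarks.ifeval.utils import agg_inst_level_acc, process_results

doc = {
    'key': 1000,
    'prompt': 'Write a short note about tea. Do not use any commas.',
    'instruction_id_list': ['punctuation:no_comma'],  # assumed to exist in instructions_registry
    'kwargs': [{}],
}
response = 'Tea is a drink made from leaves. It can be enjoyed hot or cold.'

scores = process_results(doc, [response])
# scores holds the four IFEval metrics for this sample, e.g.
# {'prompt_level_strict_acc': True, 'inst_level_strict_acc': [True], ...}

# Instruction-level accuracy across samples: flatten all boolean lists, then average.
inst_acc = agg_inst_level_acc([scores['inst_level_strict_acc'], [True, False]])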
evalscope/benchmarks/iquiz/__init__.py ADDED
File without changes
evalscope/benchmarks/iquiz/iquiz_adapter.py ADDED
@@ -0,0 +1,63 @@
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.constants import AnswerKeys, EvalType
+ from evalscope.metrics import AverageAccuracy, exact_match
+ from evalscope.models import ChatGenerationModelAdapter
+ from evalscope.utils.utils import ResponseParser
+
+
+ @Benchmark.register(
+     name='iquiz',
+     dataset_id='AI-ModelScope/IQuiz',
+     model_adapter=ChatGenerationModelAdapter,
+     subset_list=['IQ', 'EQ'],
+     metric_list=[AverageAccuracy],
+     few_shot_num=0,
+     train_split=None,
+     eval_split='test',
+     prompt_template='你是一个高智商和高情商的专家,你被要求回答一个选择题,并选出一个正确的选项,解释原因,最终输出格式为:`答案是(选项)`。',  # noqa: E501
+ )
+ class IQuizAdapter(DataAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+         self.choices = ['A', 'B', 'C', 'D', 'E']
+
+     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+         """
+         Generate model prompt from input data.
+         example:
+         {
+             "question":"天气预报说本周星期三会下雨,昨天果然下雨了,今天星期几?",
+             "choices":["星期一","星期二","星期三","星期四"],
+             "answer":"D",
+             "level":1
+         }
+         """
+         prompt = f"问题: {input_d['question']}\n"
+         prompt += self.__form_options(input_d['choices'])
+         return {'data': [prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
+
+     def __form_options(self, options: list):
+         option_str = '选项:\n'
+         for opt, choice in zip(options, self.choices):
+             option_str += f'({choice}): {opt}' + '\n'
+         return option_str
+
+     def get_gold_answer(self, input_d: dict) -> str:
+         """
+         Parse the raw input labels (gold).
+         """
+         return input_d['answer']
+
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+         """
+         Parse the predicted result and extract proper answer.
+         """
+         return ResponseParser.parse_first_option_with_choices(result, self.choices)
+
+     def match(self, gold: str, pred: str) -> float:
+         """
+         Match the gold answer and the predicted answer.
+         """
+         return exact_match(gold=gold, pred=pred)
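As a rough standalone illustration (not from the diff): for the record shown in the gen_prompt docstring, the adapter builds a prompt like the one below. The formatting simply mirrors __form_options, and the system prompt is the registered prompt_template.

# Sketch of the prompt IQuizAdapter.gen_prompt assembles for the docstring example above
# (reproduced standalone; in practice the adapter is created via the Benchmark registry).
choices = ['A', 'B', 'C', 'D', 'E']
input_d = {
    'question': '天气预报说本周星期三会下雨,昨天果然下雨了,今天星期几?',
    'choices': ['星期一', '星期二', '星期三', '星期四'],
    'answer': 'D',
}

prompt = f"问题: {input_d['question']}\n" + '选项:\n'
for opt, choice in zip(input_d['choices'], choices):
    prompt += f'({choice}): {opt}\n'
# gen_prompt returns {'data': [prompt], 'multi_choices': choices, 'system_prompt': <prompt_template>}.
# A reply such as '答案是(D)' is expected to be parsed back to 'D' by
# ResponseParser.parse_first_option_with_choices and scored with exact_match against the gold answer.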
evalscope/benchmarks/mmlu/mmlu_adapter.py CHANGED
@@ -4,7 +4,7 @@ import os

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import WeightedAverageAccuracy, exact_match
+ from evalscope.metrics import AverageAccuracy, exact_match
  from evalscope.models import MultiChoiceModelAdapter
  from evalscope.utils import ResponseParser, normalize_score
  from evalscope.utils.logger import get_logger
@@ -141,7 +141,7 @@ SUBJECT_MAPPING = {
      dataset_id='modelscope/mmlu',
      model_adapter=MultiChoiceModelAdapter,
      subset_list=SUBSET_LIST,
-     metric_list=[WeightedAverageAccuracy],
+     metric_list=[AverageAccuracy],
      few_shot_num=5,
      train_split='train',
      eval_split='test',
@@ -160,17 +160,19 @@ class MMLUAdapter(DataAdapter):

          super().__init__(**kwargs)

+         self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
+
      def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
          data_dict = {}
          for subset_name in subset_list:
              data_dict[subset_name] = {}

              for split_name in [self.train_split, self.eval_split]:
-                 if self.train_split == 'train':
+                 if split_name == 'train':
                      split_name_suffix = 'dev'
-                 elif self.eval_split == 'test':
+                 elif split_name == 'test':
                      split_name_suffix = 'test'
-                 elif self.eval_split == 'validation':
+                 elif split_name == 'validation':
                      split_name_suffix = 'val'
                  else:
                      raise ValueError(f'Invalid split name: {split_name}')
@@ -229,7 +231,7 @@ class MMLUAdapter(DataAdapter):

          full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)

-         return {'data': [full_prompt], 'multi_choices': self.choices}
+         return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}

      def get_gold_answer(self, input_d: dict) -> str:
          # Get the gold choice
@@ -259,84 +261,6 @@ class MMLUAdapter(DataAdapter):
      def match(self, gold: str, pred: str) -> float:
          return exact_match(gold=gold, pred=pred)

-     def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-         """
-         Generate report for the evaluation.
-
-         Args:
-             subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-             report_name: The user-defined report name.
-
-         Returns:
-         {
-             "name":"MMLU",
-             "metric":"WeightedAverageAccuracy",
-             "score":0.3389,
-             "category":[
-                 {
-                     "name":"STEM",
-                     "score":0.2528,
-                     "subset":[
-                         {
-                             "name":"computer_network",
-                             "score":0.2632
-                         },
-                         {
-                             "name":"operating_system",
-                             "score":0.3157
-                         },
-                         {
-                             "name":"computer_architecture",
-                             "score":0.4285
-                         }
-                     ]
-                 }
-             ],
-             "total_num":59
-         }
-         """
-         total_num: int = sum([num for _, num in subset_score_map.values()])
-         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-         weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-
-         # Get domain-subject mapping
-         subject_review_map = {}
-         for subset_name, (subset_score, num) in subset_score_map.items():
-             domain_name: str = SUBJECT_MAPPING.get(subset_name)[2] if SUBJECT_MAPPING.get(subset_name) else subset_name
-             if domain_name in subject_review_map:
-                 subject_review_map[domain_name].append((subset_name, subset_score, num))
-             else:
-                 subject_review_map[domain_name] = [(subset_name, subset_score, num)]
-
-         # Get domain score
-         category_list = []
-         for domain_name, domain_res_list in subject_review_map.items():
-             domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
-                 sum([num for _, _, num in domain_res_list])
-             domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
-             category_list.append({
-                 'name':
-                 domain_name,
-                 'score':
-                 domain_weighted_avg_acc,
-                 'subset': [{
-                     'name': subset_name,
-                     'score': normalize_score(score=subset_score)
-                 } for subset_name, subset_score, _ in domain_res_list]
-             })
-
-         category_list = sorted(category_list, key=lambda x: x['name'])
-
-         # Get final dict of report
-         res_map = dict(
-             name=report_name or 'mmlu',
-             metric=self.metric_list[0]['name'],
-             score=weighted_avg_acc,
-             category=category_list,
-             total_num=total_num)
-
-         return res_map
-
      @classmethod
      def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:

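Note the new self.category_map above: instead of each adapter building its own weighted report (the removed gen_report), the MMLU adapter now only exposes a subset-to-category mapping, presumably consumed by the new evalscope/report generator added in this release. A minimal sketch of what that comprehension yields, assuming a SUBJECT_MAPPING entry whose last element is the category name (the removed code's SUBJECT_MAPPING.get(subset_name)[2] points the same way):

# Hypothetical SUBJECT_MAPPING entry; the real mapping lives in mmlu_adapter.py.
SUBJECT_MAPPING = {
    'computer_network': ['Computer Network', 'computer network', 'STEM'],  # assumed shape
}

# Same expression as in the diff: keep only the last field as the category.
category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
print(category_map)  # {'computer_network': 'STEM'}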
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py CHANGED
@@ -3,7 +3,7 @@ from typing import Any, Dict

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import AnswerKeys, EvalType
- from evalscope.metrics import WeightedAverageAccuracy, exact_match
+ from evalscope.metrics import AverageAccuracy, exact_match
  from evalscope.models import ChatGenerationModelAdapter
  from evalscope.utils.utils import ResponseParser

@@ -13,7 +13,7 @@ from evalscope.utils.utils import ResponseParser
      dataset_id='modelscope/mmlu-pro',
      model_adapter=ChatGenerationModelAdapter,
      subset_list=['default'],
-     metric_list=[WeightedAverageAccuracy],
+     metric_list=[AverageAccuracy],
      few_shot_num=5,
      train_split='validation',
      eval_split='test',
evalscope/benchmarks/race/race_adapter.py CHANGED
@@ -4,9 +4,9 @@ import os

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import WeightedAverageAccuracy, exact_match
+ from evalscope.metrics import AverageAccuracy, exact_match
  from evalscope.models import MultiChoiceModelAdapter
- from evalscope.utils import ResponseParser, normalize_score
+ from evalscope.utils import ResponseParser
  from evalscope.utils.io_utils import jsonl_to_list
  from evalscope.utils.logger import get_logger

@@ -14,15 +14,13 @@ from evalscope.utils.logger import get_logger

  logger = get_logger()

- SUBJECT_MAPPING = {'high': 'High', 'middle': 'Middle'}
-

  @Benchmark.register(
      name='race',
      dataset_id='modelscope/race',
      model_adapter=MultiChoiceModelAdapter,
      subset_list=['high', 'middle'],
-     metric_list=[WeightedAverageAccuracy],
+     metric_list=[AverageAccuracy],
      few_shot_num=3,
      train_split='train',
      eval_split='test',
@@ -84,7 +82,7 @@ class RACEAdapter(DataAdapter):

          full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)

-         return {'data': [full_prompt], 'multi_choices': self.choices}
+         return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}

      def get_gold_answer(self, input_d: dict) -> str:
          # Get the gold choice
@@ -114,73 +112,6 @@ class RACEAdapter(DataAdapter):
      def match(self, gold: str, pred: str) -> float:
          return exact_match(gold=gold, pred=pred)

-     def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-         """
-         Generate report for the evaluation.
-
-         Args:
-             subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-             report_name: The user-defined report name.
-
-         Returns:
-         {
-             "name":"RACE",
-             "metric":"WeightedAverageAccuracy",
-             "score":0.3389,
-             "category":[
-                 {
-                     "name":"High",
-                     "score":0.2528,
-                     "subset":[
-                         {
-                             "name":"high",
-                             "score":0.2528
-                         }
-                     ]
-                 }
-             ],
-             "total_num":59
-         }
-         """
-         total_num: int = sum([num for _, num in subset_score_map.values()])
-         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-
-         # Get domain-subject mapping
-         subject_review_map = {}
-         for subset_name, (subset_score, num) in subset_score_map.items():
-             domain_name: str = SUBJECT_MAPPING.get(subset_name)
-             if domain_name in subject_review_map:
-                 subject_review_map[domain_name].append((subset_name, subset_score, num))
-             else:
-                 subject_review_map[domain_name] = [(subset_name, subset_score, num)]
-
-         # Get domain score
-         category_list = []
-         for domain_name, domain_res_list in subject_review_map.items():
-             domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
-                 sum([num for _, _, num in domain_res_list])
-             domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
-             category_list.append({
-                 'name':
-                 domain_name,
-                 'score':
-                 normalize_score(score=domain_weighted_avg_acc),
-                 'subset': [{
-                     'name': subset_name,
-                     'score': subset_score
-                 } for subset_name, subset_score, _ in domain_res_list]
-             })
-
-         # Get final dict of report
-         res_map = dict(
-             name=report_name or 'race',
-             metric=self.metric_list[0]['name'],
-             score=weighted_avg_acc,
-             category=category_list,
-             total_num=total_num)
-
-         return res_map
-
      @classmethod
      def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:

evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py CHANGED
@@ -1,15 +1,12 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
  # Copyright (c) EleutherAI Inc, and its affiliates.
  import csv
- import numpy as np
  import os
- from typing import List

  from evalscope.benchmarks import Benchmark
  from evalscope.benchmarks.data_adapter import DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import WeightedAverageAccuracy
- from evalscope.metrics.metrics import exact_match
+ from evalscope.metrics import AverageAccuracy
  from evalscope.models import ChatGenerationModelAdapter
  from evalscope.utils import get_logger

@@ -23,7 +20,7 @@ logger = get_logger()
      dataset_id='modelscope/trivia_qa',
      model_adapter=ChatGenerationModelAdapter,
      subset_list=['default'],
-     metric_list=[WeightedAverageAccuracy],
+     metric_list=[AverageAccuracy],
      few_shot_num=5,
      train_split='dev',
      eval_split='test',
@@ -104,7 +101,7 @@ class TriviaQaAdapter(DataAdapter):
          context += self._generate_prompt(input_d=input_d, include_answer=False)
          full_prompt = context

-         return {'data': [full_prompt], 'system_prompt': prompt}
+         return {'data': [full_prompt], 'system_prompt': prompt or self.prompt_template}

      def get_gold_answer(self, input_d: dict) -> list:
          # Get the gold choice
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py CHANGED
@@ -9,8 +9,7 @@ from typing import List
  from evalscope.benchmarks import Benchmark
  from evalscope.benchmarks.data_adapter import DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import WeightedAverageAccuracy
- from evalscope.metrics.metrics import weighted_mean
+ from evalscope.metrics import AverageAccuracy
  from evalscope.models import ContinuationLogitsModelAdapter
  from evalscope.utils import get_logger, normalize_score

@@ -26,7 +25,7 @@ logger = get_logger()
      dataset_id='modelscope/truthful_qa',
      model_adapter=ContinuationLogitsModelAdapter,
      subset_list=['multiple_choice'],
-     metric_list=[WeightedAverageAccuracy],
+     metric_list=[AverageAccuracy],
      few_shot_num=0,
      train_split=None,
      eval_split='validation',
@@ -260,7 +259,7 @@ class TruthfulQaAdapter(DataAdapter):

          return {'multiple_choice': {'mc1': mc1(mc1_lls), 'mc2': mc2(mc2_lls)}}  # or {'generation': xxx}

-     def compute_metric(self, review_res_list: List[dict]) -> float:
+     def compute_metric(self, review_res_list: List[dict]) -> List[dict]:
          """
          Compute evaluation result by specific metric for each subset.

@@ -285,56 +284,8 @@ class TruthfulQaAdapter(DataAdapter):
                  logger.error(f'** Unknown review_res: {review_res_d}')

          # To get mc2 score
-         items = [(score, 1.0) for score in mc2_list]
-         return weighted_mean(items)
-
-     def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-         """
-         Generate the report for the model output.
-
-         Args:
-             subset_score_map: {subset_name: (score, num), ...}
-             report_name: The user-defined report name.
-
-         Returns:
-         {
-             "name":"TruthfulQA",
-             "metric":"WeightedAverageAccuracy",
-             "score":0.3389,
-             "category":[
-                 {
-                     "name":"DEFAULT",
-                     "score":0.2527,
-                     "subset":[
-                         {
-                             "name":"multiple_choice",
-                             "score":0.3157
-                         },
-                         # {
-                         #     "name":"generation",
-                         #     "score":0.2631
-                         # }
-                     ]
-                 }
-             ],
-             "total_num":100
-         }
-         """
-         total_num: int = sum([num for _, num in subset_score_map.values()])
-         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-         weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-         cate_avg_list = [{
-             'name': subset_name,
-             'score': normalize_score(score=score)
-         } for subset_name, (score, _) in subset_score_map.items()]
-
-         category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-         res_map = dict(
-             name=report_name or 'truthful_qa',
-             metric=self.metric_list[0]['name'],
-             score=weighted_avg_acc,
-             category=[category_d],
-             total_num=total_num)
-
-         return res_map
+         return [{
+             'metric_name': self.metric_list[0].name,
+             'score': self.metric_list[0].object(mc2_list),
+             'num': len(mc2_list)
+         }]
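The reworked compute_metric no longer returns a bare float: judging only from the usage above, each entry of metric_list carries a name string and a callable object, and the method returns one dict per metric. A hedged sketch of the resulting shape, with fabricated mc2 scores and assuming the AverageAccuracy callable is a plain mean:

# Fabricated per-sample mc2 scores, for illustration only.
mc2_list = [0.42, 0.58, 0.50]

result = [{
    'metric_name': 'AverageAccuracy',         # corresponds to self.metric_list[0].name
    'score': sum(mc2_list) / len(mc2_list),   # assumed behaviour of self.metric_list[0].object
    'num': len(mc2_list),
}]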
evalscope/cli/cli.py CHANGED
@@ -2,6 +2,7 @@

  import argparse

+ from evalscope.cli.start_app import StartAppCMD
  from evalscope.cli.start_eval import EvalCMD
  from evalscope.cli.start_perf import PerfBenchCMD

@@ -12,6 +13,7 @@ def run_cmd():

      PerfBenchCMD.define_args(subparsers)
      EvalCMD.define_args(subparsers)
+     StartAppCMD.define_args(subparsers)

      args = parser.parse_args()

evalscope/cli/start_app.py ADDED
@@ -0,0 +1,29 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ import os
+ from argparse import ArgumentParser
+
+ from evalscope.cli.base import CLICommand
+ from evalscope.report.app import create_app
+
+
+ def subparser_func(args):
+     """ Function which will be called for a specific sub parser.
+     """
+     return StartAppCMD(args)
+
+
+ class StartAppCMD(CLICommand):
+     name = 'app'
+
+     def __init__(self, args):
+         self.args = args
+
+     @staticmethod
+     def define_args(parsers: ArgumentParser):
+         """ define args for create pipeline template command.
+         """
+         parser = parsers.add_parser(StartAppCMD.name)
+         parser.set_defaults(func=subparser_func)
+
+     def execute(self):
+         create_app()
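Together with the cli.py change above, the new command registers under the name 'app', so it would presumably be launched as `evalscope app`; the subcommand itself is a thin wrapper that calls create_app() from the new report module (evalscope/report/app.py, +506 lines in this release). A minimal sketch of the same call outside the CLI, assuming create_app() takes no arguments as in StartAppCMD.execute:

# Equivalent of `evalscope app`, per the registration above (a sketch, not official API docs).
from evalscope.report.app import create_app

if __name__ == '__main__':
    create_app()  # starts the report visualization app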