evalscope 0.8.2__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +11 -3
  3. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  4. evalscope/backend/rag_eval/utils/llm.py +1 -1
  5. evalscope/benchmarks/__init__.py +20 -1
  6. evalscope/benchmarks/arc/__init__.py +0 -5
  7. evalscope/benchmarks/arc/arc_adapter.py +24 -102
  8. evalscope/benchmarks/bbh/__init__.py +0 -4
  9. evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
  10. evalscope/benchmarks/benchmark.py +70 -59
  11. evalscope/benchmarks/ceval/__init__.py +0 -5
  12. evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
  13. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  14. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
  15. evalscope/benchmarks/competition_math/__init__.py +0 -5
  16. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  17. evalscope/benchmarks/data_adapter.py +115 -87
  18. evalscope/benchmarks/general_qa/__init__.py +0 -5
  19. evalscope/benchmarks/general_qa/general_qa_adapter.py +23 -79
  20. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  21. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +21 -101
  22. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  23. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +32 -99
  24. evalscope/benchmarks/humaneval/__init__.py +0 -4
  25. evalscope/benchmarks/humaneval/humaneval_adapter.py +18 -120
  26. evalscope/benchmarks/ifeval/__init__.py +0 -0
  27. evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
  28. evalscope/benchmarks/ifeval/instructions.py +1478 -0
  29. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  30. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  31. evalscope/benchmarks/ifeval/utils.py +134 -0
  32. evalscope/benchmarks/iquiz/__init__.py +0 -0
  33. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  34. evalscope/benchmarks/mmlu/__init__.py +0 -5
  35. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
  36. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  37. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  38. evalscope/benchmarks/race/__init__.py +0 -5
  39. evalscope/benchmarks/race/race_adapter.py +26 -123
  40. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  41. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
  42. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  43. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
  44. evalscope/cli/cli.py +2 -0
  45. evalscope/cli/start_app.py +29 -0
  46. evalscope/collections/__init__.py +3 -0
  47. evalscope/collections/evaluator.py +198 -0
  48. evalscope/collections/sampler.py +138 -0
  49. evalscope/collections/schema.py +126 -0
  50. evalscope/config.py +7 -5
  51. evalscope/constants.py +9 -26
  52. evalscope/evaluator/evaluator.py +87 -121
  53. evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
  54. evalscope/metrics/__init__.py +3 -0
  55. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  56. evalscope/metrics/math_accuracy.py +193 -50
  57. evalscope/metrics/metrics.py +18 -6
  58. evalscope/metrics/named_metrics.py +17 -0
  59. evalscope/metrics/rouge_metric.py +13 -8
  60. evalscope/models/__init__.py +14 -1
  61. evalscope/models/base_adapter.py +52 -0
  62. evalscope/models/chat_adapter.py +138 -0
  63. evalscope/models/choice_adapter.py +211 -0
  64. evalscope/models/custom_adapter.py +67 -0
  65. evalscope/models/local_model.py +74 -0
  66. evalscope/models/model.py +141 -0
  67. evalscope/models/server_adapter.py +111 -0
  68. evalscope/perf/__init__.py +1 -0
  69. evalscope/perf/main.py +0 -1
  70. evalscope/perf/plugin/api/custom_api.py +1 -1
  71. evalscope/perf/plugin/api/openai_api.py +1 -1
  72. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  73. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  74. evalscope/report/__init__.py +5 -0
  75. evalscope/report/app.py +506 -0
  76. evalscope/report/combinator.py +73 -0
  77. evalscope/report/generator.py +80 -0
  78. evalscope/report/utils.py +133 -0
  79. evalscope/run.py +48 -72
  80. evalscope/run_arena.py +1 -1
  81. evalscope/summarizer.py +1 -1
  82. evalscope/utils/__init__.py +1 -1
  83. evalscope/utils/chat_service.py +5 -4
  84. evalscope/utils/io_utils.py +8 -0
  85. evalscope/utils/logger.py +5 -0
  86. evalscope/utils/model_utils.py +15 -2
  87. evalscope/utils/utils.py +3 -25
  88. evalscope/version.py +2 -2
  89. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/METADATA +115 -21
  90. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/RECORD +99 -78
  91. tests/cli/test_collection.py +57 -0
  92. tests/cli/test_run.py +52 -1
  93. tests/rag/test_mteb.py +3 -2
  94. evalscope/models/api/__init__.py +0 -3
  95. evalscope/models/dummy_chat_model.py +0 -49
  96. evalscope/models/model_adapter.py +0 -525
  97. evalscope/models/openai_model.py +0 -103
  98. evalscope/tools/__init__.py +0 -1
  99. evalscope/tools/combine_reports.py +0 -133
  100. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  101. /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
  102. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  103. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
  104. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
  105. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
  106. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
@@ -1,11 +1,12 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- import json
  import os

- from evalscope.benchmarks.data_adapter import DataAdapter
- from evalscope.metrics.metrics import exact_match, weighted_mean
- from evalscope.utils import normalize_score
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.constants import EvalType
+ from evalscope.metrics import AverageAccuracy, exact_match
+ from evalscope.models import MultiChoiceModelAdapter
+ from evalscope.utils import ResponseParser
  from evalscope.utils.io_utils import jsonl_to_list
  from evalscope.utils.logger import get_logger

@@ -13,46 +14,28 @@ from evalscope.utils.logger import get_logger

  logger = get_logger()

- DATASET_ID = 'modelscope/race'
-
- SUBSET_LIST = ['high', 'middle']
-
- SUBJECT_MAPPING = {'high': 'High', 'middle': 'Middle'}
-

+ @Benchmark.register(
+     name='race',
+     dataset_id='modelscope/race',
+     model_adapter=MultiChoiceModelAdapter,
+     subset_list=['high', 'middle'],
+     metric_list=[AverageAccuracy],
+     few_shot_num=3,
+     train_split='train',
+     eval_split='test',
+ )
  class RACEAdapter(DataAdapter):

      choices = ['A', 'B', 'C', 'D']

-     def __init__(self,
-                  subset_list: list = None,
-                  metric_list: list = None,
-                  few_shot_num: int = None,
-                  train_split: str = 'train',
-                  eval_split: str = 'test',
-                  **kwargs):
-
-         if subset_list is None:
-             subset_list = SUBSET_LIST
-
-         if metric_list is None:
-             metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-         if few_shot_num is None:
-             logger.info(f'Set 3-shot examples by system for RACE.')
-             few_shot_num = 3
-
+     def __init__(self, **kwargs):
+         few_shot_num = kwargs.get('few_shot_num', 3)
          if few_shot_num > 3:
              logger.warning(f'few_shot_num <= 3 for RACE, but got {few_shot_num}. Use 3-shot by default.')
-             few_shot_num = 3
+             kwargs['few_shot_num'] = 3

-         super().__init__(
-             subset_list=subset_list,
-             metric_list=metric_list,
-             few_shot_num=few_shot_num,
-             train_split=train_split,
-             eval_split=eval_split,
-             **kwargs)
+         super().__init__(**kwargs)

      def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
          data_dict = {}
@@ -99,13 +82,13 @@ class RACEAdapter(DataAdapter):

          full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)

-         return {'data': [full_prompt], 'multi_choices': self.choices}
+         return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}

      def get_gold_answer(self, input_d: dict) -> str:
          # Get the gold choice
          return input_d.get('answer', '')

-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
          """
          Parse the model output to get the answer. Could be the best choice index.

@@ -117,98 +100,18 @@ class RACEAdapter(DataAdapter):
          Returns:
              The parsed answer. Depending on the dataset. Usually a string for chat.
          """
-         if eval_type == 'checkpoint':
-             return result
-         elif eval_type == 'service': # TODO: to be implemented
-             return result
-         elif eval_type == 'custom': # TODO: to be implemented
+         if eval_type == EvalType.CHECKPOINT:
              return result
+         elif eval_type == EvalType.SERVICE:
+             return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked !
+         elif eval_type == EvalType.CUSTOM:
+             return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked !
          else:
              raise ValueError(f'Unknown eval_type: {eval_type}')

      def match(self, gold: str, pred: str) -> float:
          return exact_match(gold=gold, pred=pred)

-     def compute_metric(self, review_res_list: list) -> float:
-         """
-         Compute evaluation result by specific metric.
-
-         Args:
-             review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-         Returns:
-             The metric score.
-         """
-         items = [(score, 1.0) for score in review_res_list]
-         return weighted_mean(items)
-
-     def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-         """
-         Generate report for the evaluation.
-
-         Args:
-             subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-             report_name: The user-defined report name.
-
-         Returns:
-             {
-                 "name":"RACE",
-                 "metric":"WeightedAverageAccuracy",
-                 "score":0.3389,
-                 "category":[
-                     {
-                         "name":"High",
-                         "score":0.2528,
-                         "subset":[
-                             {
-                                 "name":"high",
-                                 "score":0.2528
-                             }
-                         ]
-                     }
-                 ],
-                 "total_num":59
-             }
-         """
-         total_num: int = sum([num for _, num in subset_score_map.values()])
-         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-
-         # Get domain-subject mapping
-         subject_review_map = {}
-         for subset_name, (subset_score, num) in subset_score_map.items():
-             domain_name: str = SUBJECT_MAPPING.get(subset_name)
-             if domain_name in subject_review_map:
-                 subject_review_map[domain_name].append((subset_name, subset_score, num))
-             else:
-                 subject_review_map[domain_name] = [(subset_name, subset_score, num)]
-
-         # Get domain score
-         category_list = []
-         for domain_name, domain_res_list in subject_review_map.items():
-             domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
-                 sum([num for _, _, num in domain_res_list])
-             domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
-             category_list.append({
-                 'name': domain_name,
-                 'score': normalize_score(score=domain_weighted_avg_acc),
-                 'subset': [{
-                     'name': subset_name,
-                     'score': subset_score
-                 } for subset_name, subset_score, _ in domain_res_list]
-             })
-
-         # Get final dict of report
-         res_map = dict(
-             name=report_name or 'race',
-             metric=self.metric_list[0]['name'],
-             score=weighted_avg_acc,
-             category=category_list,
-             total_num=total_num)
-
-         return res_map
-
      @classmethod
      def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:

evalscope/benchmarks/trivia_qa/__init__.py CHANGED
@@ -1,6 +1 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
-
- from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import DATASET_ID, SUBSET_LIST
- from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import TriviaQaAdapter
- from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import TriviaQaAdapter as DataAdapterClass
- from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass # noqa
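The hunks above show the pattern that recurs across this release: per-benchmark module constants (DATASET_ID, SUBSET_LIST) and the __init__.py re-exports are dropped, and each adapter now registers itself through a Benchmark.register decorator. Below is a minimal sketch of that pattern, mirroring the RACE registration shown earlier; the benchmark name 'my_mcq_bench' and the MyMCQAdapter class are hypothetical placeholders, not part of evalscope.

```python
# Sketch of the new decorator-based registration (assumed usage, mirroring the
# RACE adapter diff above); 'my_mcq_bench' and MyMCQAdapter are hypothetical names.
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.metrics import AverageAccuracy
from evalscope.models import MultiChoiceModelAdapter


@Benchmark.register(
    name='my_mcq_bench',                    # hypothetical benchmark name
    dataset_id='modelscope/race',           # ModelScope dataset id
    model_adapter=MultiChoiceModelAdapter,  # multiple-choice inference adapter
    subset_list=['high', 'middle'],
    metric_list=[AverageAccuracy],
    few_shot_num=3,
    train_split='train',
    eval_split='test',
)
class MyMCQAdapter(DataAdapter):
    # A real adapter also implements gen_prompt, get_gold_answer,
    # parse_pred_result and match, as the adapters in this diff do.
    choices = ['A', 'B', 'C', 'D']

    def __init__(self, **kwargs):
        # Defaults now come from Benchmark.register(); __init__ only forwards kwargs.
        super().__init__(**kwargs)
```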
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py CHANGED
@@ -1,49 +1,35 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
  # Copyright (c) EleutherAI Inc, and its affiliates.
  import csv
- import numpy as np
  import os
- from typing import List

+ from evalscope.benchmarks import Benchmark
  from evalscope.benchmarks.data_adapter import DataAdapter
- from evalscope.metrics.metrics import exact_match, weighted_mean
- from evalscope.utils.logger import get_logger
+ from evalscope.constants import EvalType
+ from evalscope.metrics import AverageAccuracy
+ from evalscope.models import ChatGenerationModelAdapter
+ from evalscope.utils import get_logger

  # flake8: noqa

  logger = get_logger()

- DATASET_ID = 'modelscope/trivia_qa'
- SUBSET_LIST = ['default']
-

+ @Benchmark.register(
+     name='trivia_qa',
+     dataset_id='modelscope/trivia_qa',
+     model_adapter=ChatGenerationModelAdapter,
+     subset_list=['default'],
+     metric_list=[AverageAccuracy],
+     few_shot_num=5,
+     train_split='dev',
+     eval_split='test',
+ )
  class TriviaQaAdapter(DataAdapter):

-     def __init__(self,
-                  subset_list: list = None,
-                  metric_list: list = None,
-                  few_shot_num: int = None,
-                  train_split: str = 'dev',
-                  eval_split: str = 'test',
-                  **kwargs):
-
-         if subset_list is None:
-             subset_list = SUBSET_LIST
-
-         if metric_list is None:
-             metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
+     def __init__(self, **kwargs):

-         if few_shot_num is None:
-             logger.info(f'few_shot_num is not specified for TriviaQA, use default value: 5')
-             few_shot_num = 5
-
-         super().__init__(
-             subset_list=subset_list,
-             metric_list=metric_list,
-             few_shot_num=few_shot_num,
-             train_split=train_split,
-             eval_split=eval_split,
-             **kwargs)
+         super().__init__(**kwargs)

      def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
          data_dict = {}
@@ -113,16 +99,16 @@ class TriviaQaAdapter(DataAdapter):
          few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
          context: str = '\n'.join(few_shot_prompts) + '\n'
          context += self._generate_prompt(input_d=input_d, include_answer=False)
-         full_prompt = prompt + context
+         full_prompt = context

-         return {'data': [full_prompt]}
+         return {'data': [full_prompt], 'system_prompt': prompt or self.prompt_template}

      def get_gold_answer(self, input_d: dict) -> list:
          # Get the gold choice
          ans: list = input_d.get('ideal', [])
          return ans

-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
          """
          Parse the model output to get the answer.

@@ -134,73 +120,11 @@ class TriviaQaAdapter(DataAdapter):
          Returns:
              The predicted answer.
          """
-         if eval_type == 'checkpoint':
-             return result
-         elif eval_type == 'service': # TODO: to be implemented
-             return result
-         elif eval_type == 'custom': # TODO: to be implemented
-             return result
-         else:
-             raise ValueError(f'Unknown eval_type: {eval_type}')
+         return result

      def match(self, gold: list, pred: str) -> float:
-         return max([exact_match(gold=ref, pred=pred) for ref in gold])
-
-     def compute_metric(self, review_res_list: list) -> float:
-         """
-         Compute evaluation result by specific metric.
-
-         Args:
-             review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-         Returns:
-             The metric score.
-         """
-         items = [(score, 1.0) for score in review_res_list]
-         return weighted_mean(items)
-
-     def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-         """
-         Generate the report for the model output.
-
-         Args:
-             subset_score_map: {subset_name: (score, num), ...}
-             report_name: The user-defined report name.
-
-         Returns:
-             {
-                 "name":"TriviaQA",
-                 "metric":"WeightedAverageAccuracy",
-                 "score":0.3389,
-                 "category":[
-                     {
-                         "name":"DEFAULT",
-                         "score":0.3389,
-                         "subset":[
-                             {
-                                 "name":"default",
-                                 "score":0.3389
-                             }
-                         ]
-                     }
-                 ],
-                 "total_num":100
-             }
-         """
-         total_num: int = sum([num for _, num in subset_score_map.values()])
-         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-         cate_avg_list = [{'name': subset_name, 'score': score} for subset_name, (score, _) in subset_score_map.items()]
-
-         category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-         res_map = dict(
-             name=report_name or 'trivia_qa',
-             metric=self.metric_list[0]['name'],
-             score=weighted_avg_acc,
-             category=[category_d],
-             total_num=total_num)
-
-         return res_map
+         is_correct = any([cand in pred for cand in gold])
+         return 1 if is_correct else 0

      @classmethod
      def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
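One behavioral change worth noting above: TriviaQA's match no longer takes the max of exact_match over the gold aliases; a prediction now scores 1 if any gold alias appears verbatim in the model output. A standalone sketch of that rule (the function name is illustrative, not evalscope API):

```python
# Standalone sketch of the new TriviaQA match rule from the diff above:
# correct if any gold alias is a (case-sensitive) substring of the prediction.
from typing import List


def trivia_match(gold: List[str], pred: str) -> int:
    is_correct = any(cand in pred for cand in gold)
    return 1 if is_correct else 0


# Example: trivia_match(['Barcelona', 'Barcelona, Spain'], 'They were held in Barcelona.') returns 1.
```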
evalscope/benchmarks/truthful_qa/__init__.py CHANGED
@@ -1,6 +1 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
-
- from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import DATASET_ID, SUBSET_LIST
- from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import TruthfulQaAdapter
- from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import TruthfulQaAdapter as DataAdapterClass
- from evalscope.models.model_adapter import ContinuationLogitsModelAdapter as ModelAdapterClass # noqa
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py CHANGED
@@ -6,20 +6,30 @@ import numpy as np
  import os
  from typing import List

+ from evalscope.benchmarks import Benchmark
  from evalscope.benchmarks.data_adapter import DataAdapter
- from evalscope.metrics.metrics import weighted_mean
- from evalscope.utils import normalize_score
- from evalscope.utils.logger import get_logger
+ from evalscope.constants import EvalType
+ from evalscope.metrics import AverageAccuracy
+ from evalscope.models import ContinuationLogitsModelAdapter
+ from evalscope.utils import get_logger, normalize_score

  # flake8: noqa

  logger = get_logger()

- DATASET_ID = 'modelscope/truthful_qa'
  # task_list = ['generation', 'multiple_choice']
- SUBSET_LIST = ['multiple_choice']


+ @Benchmark.register(
+     name='truthful_qa',
+     dataset_id='modelscope/truthful_qa',
+     model_adapter=ContinuationLogitsModelAdapter,
+     subset_list=['multiple_choice'],
+     metric_list=[AverageAccuracy],
+     few_shot_num=0,
+     train_split=None,
+     eval_split='validation',
+ )
  class TruthfulQaAdapter(DataAdapter):
      """
      Adapter for TruthfulQA benchmark.
@@ -40,35 +50,14 @@ class TruthfulQaAdapter(DataAdapter):
                   'Q: Where were the 1992 Olympics held?\n'
                   'A: The 1992 Olympics were held in Barcelona, Spain.')

-     def __init__(self,
-                  subset_list: list = None,
-                  metric_list: list = None,
-                  few_shot_num: int = None,
-                  train_split: str = None,
-                  eval_split: str = 'validation',
-                  **kwargs):
-
-         if subset_list is None:
-             subset_list = SUBSET_LIST
-
-         if metric_list is None:
-             metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-         if few_shot_num is None:
-             logger.info(f'Set 0-shot examples by system for TruthfulQA.')
-             few_shot_num = 0
+     def __init__(self, **kwargs):

+         few_shot_num = kwargs.get('few_shot_num', 0)
          if few_shot_num != 0:
              logger.warning(f'few_shot_num should be 0 for TruthfulQA, but got {few_shot_num}. Use 0-shot by default.')
-             few_shot_num = 0
+             kwargs['few_shot_num'] = 0

-         super().__init__(
-             subset_list=subset_list,
-             metric_list=metric_list,
-             few_shot_num=few_shot_num,
-             train_split=train_split,
-             eval_split=eval_split,
-             **kwargs)
+         super().__init__(**kwargs)

      def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
          data_dict = {}
@@ -215,7 +204,7 @@ class TruthfulQaAdapter(DataAdapter):
          # TODO: generation sub-task to be added
          return {'mc1_labels': input_d['mc1_targets']['labels'], 'mc2_labels': input_d['mc2_targets']['labels']}

-     def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> list:
+     def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> list:
          """
          Parse the model output to get the answer.

@@ -227,11 +216,11 @@ class TruthfulQaAdapter(DataAdapter):
          Returns:
              The predicted answer.
          """
-         if eval_type == 'checkpoint':
+         if eval_type == EvalType.CHECKPOINT:
              return result
-         elif eval_type == 'service': # TODO: to be supported !
+         elif eval_type == EvalType.SERVICE: # TODO: to be supported !
              return result
-         elif eval_type == 'custom': # TODO: to be supported !
+         elif eval_type == EvalType.CUSTOM: # TODO: to be supported !
              return result
          else:
              raise ValueError(f'Invalid eval_type: {eval_type}')
@@ -270,7 +259,7 @@ class TruthfulQaAdapter(DataAdapter):

          return {'multiple_choice': {'mc1': mc1(mc1_lls), 'mc2': mc2(mc2_lls)}} # or {'generation': xxx}

-     def compute_metric(self, review_res_list: List[dict]) -> float:
+     def compute_metric(self, review_res_list: List[dict]) -> List[dict]:
          """
          Compute evaluation result by specific metric for each subset.

@@ -295,56 +284,8 @@ class TruthfulQaAdapter(DataAdapter):
                  logger.error(f'** Unknown review_res: {review_res_d}')

          # To get mc2 score
-         items = [(score, 1.0) for score in mc2_list]
-         return weighted_mean(items)
-
-     def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-         """
-         Generate the report for the model output.
-
-         Args:
-             subset_score_map: {subset_name: (score, num), ...}
-             report_name: The user-defined report name.
-
-         Returns:
-             {
-                 "name":"TruthfulQA",
-                 "metric":"WeightedAverageAccuracy",
-                 "score":0.3389,
-                 "category":[
-                     {
-                         "name":"DEFAULT",
-                         "score":0.2527,
-                         "subset":[
-                             {
-                                 "name":"multiple_choice",
-                                 "score":0.3157
-                             },
-                             # {
-                             #     "name":"generation",
-                             #     "score":0.2631
-                             # }
-                         ]
-                     }
-                 ],
-                 "total_num":100
-             }
-         """
-         total_num: int = sum([num for _, num in subset_score_map.values()])
-         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-         weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-         cate_avg_list = [{
-             'name': subset_name,
-             'score': normalize_score(score=score)
-         } for subset_name, (score, _) in subset_score_map.items()]
-
-         category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-         res_map = dict(
-             name=report_name or 'truthful_qa',
-             metric=self.metric_list[0]['name'],
-             score=weighted_avg_acc,
-             category=[category_d],
-             total_num=total_num)
-
-         return res_map
+         return [{
+             'metric_name': self.metric_list[0].name,
+             'score': self.metric_list[0].object(mc2_list),
+             'num': len(mc2_list)
+         }]
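As in the TriviaQA adapter, gen_report moves out of the adapter; compute_metric now returns a list of per-metric dicts instead of a bare float. A small sketch of that return shape, using statistics.mean as a stand-in for the registered AverageAccuracy callable (which is not shown in this diff):

```python
# Sketch of the per-metric result shape compute_metric now returns.
# statistics.mean stands in for the metric's callable; the display name
# 'AverageAccuracy' is an assumption based on the imported metric above.
from statistics import mean
from typing import List


def compute_metric_sketch(mc2_list: List[float]) -> List[dict]:
    return [{
        'metric_name': 'AverageAccuracy',
        'score': mean(mc2_list) if mc2_list else 0.0,
        'num': len(mc2_list),
    }]


# compute_metric_sketch([0.4, 0.6]) -> [{'metric_name': 'AverageAccuracy', 'score': 0.5, 'num': 2}]
```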
evalscope/cli/cli.py CHANGED
@@ -2,6 +2,7 @@

  import argparse

+ from evalscope.cli.start_app import StartAppCMD
  from evalscope.cli.start_eval import EvalCMD
  from evalscope.cli.start_perf import PerfBenchCMD

@@ -12,6 +13,7 @@ def run_cmd():

      PerfBenchCMD.define_args(subparsers)
      EvalCMD.define_args(subparsers)
+     StartAppCMD.define_args(subparsers)

      args = parser.parse_args()

evalscope/cli/start_app.py ADDED
@@ -0,0 +1,29 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ import os
+ from argparse import ArgumentParser
+
+ from evalscope.cli.base import CLICommand
+ from evalscope.report.app import create_app
+
+
+ def subparser_func(args):
+     """ Function which will be called for a specific sub parser.
+     """
+     return StartAppCMD(args)
+
+
+ class StartAppCMD(CLICommand):
+     name = 'app'
+
+     def __init__(self, args):
+         self.args = args
+
+     @staticmethod
+     def define_args(parsers: ArgumentParser):
+         """ define args for create pipeline template command.
+         """
+         parser = parsers.add_parser(StartAppCMD.name)
+         parser.set_defaults(func=subparser_func)
+
+     def execute(self):
+         create_app()
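StartAppCMD wires a new `app` subcommand into the CLI (see the cli.py hunks above); its execute() simply calls create_app() from the new evalscope.report.app module. A minimal sketch of launching the same report app programmatically, assuming create_app() takes no arguments as it does here:

```python
# Minimal sketch: start the report/visualization app directly, mirroring what
# StartAppCMD.execute() does when the `app` subcommand is run from the CLI.
from evalscope.report.app import create_app

if __name__ == '__main__':
    create_app()  # assumed to take no arguments, as in StartAppCMD.execute()
```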
evalscope/collections/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from evalscope.collections.evaluator import EvaluatorCollection
+ from evalscope.collections.sampler import StratifiedSampler, UniformSampler, WeightedSampler
+ from evalscope.collections.schema import CollectionSchema, DatasetInfo