evalscope 0.8.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of evalscope might be problematic; details are available on the package's registry page.
Files changed (147)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +11 -3
  3. evalscope/backend/base.py +1 -1
  4. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  5. evalscope/backend/rag_eval/utils/clip.py +2 -2
  6. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  7. evalscope/backend/rag_eval/utils/llm.py +1 -1
  8. evalscope/benchmarks/__init__.py +20 -1
  9. evalscope/benchmarks/arc/__init__.py +0 -5
  10. evalscope/benchmarks/arc/arc_adapter.py +24 -102
  11. evalscope/benchmarks/bbh/__init__.py +0 -4
  12. evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
  13. evalscope/benchmarks/benchmark.py +70 -59
  14. evalscope/benchmarks/ceval/__init__.py +0 -5
  15. evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
  16. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  17. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
  18. evalscope/benchmarks/competition_math/__init__.py +0 -5
  19. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  20. evalscope/benchmarks/data_adapter.py +115 -87
  21. evalscope/benchmarks/general_qa/__init__.py +0 -5
  22. evalscope/benchmarks/general_qa/general_qa_adapter.py +24 -80
  23. evalscope/benchmarks/gpqa/__init__.py +0 -0
  24. evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
  25. evalscope/benchmarks/gpqa/gpqa_adapter.py +103 -0
  26. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  27. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +22 -101
  28. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  29. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +33 -99
  30. evalscope/benchmarks/humaneval/__init__.py +0 -4
  31. evalscope/benchmarks/humaneval/humaneval_adapter.py +93 -9
  32. evalscope/benchmarks/ifeval/__init__.py +0 -0
  33. evalscope/benchmarks/ifeval/ifeval_adapter.py +56 -0
  34. evalscope/benchmarks/ifeval/instructions.py +1477 -0
  35. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  36. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  37. evalscope/benchmarks/ifeval/utils.py +134 -0
  38. evalscope/benchmarks/iquiz/__init__.py +0 -0
  39. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  40. evalscope/benchmarks/mmlu/__init__.py +0 -5
  41. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
  42. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  43. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  44. evalscope/benchmarks/race/__init__.py +0 -5
  45. evalscope/benchmarks/race/race_adapter.py +27 -123
  46. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  47. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
  48. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  49. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
  50. evalscope/cli/cli.py +2 -0
  51. evalscope/cli/start_app.py +30 -0
  52. evalscope/collections/__init__.py +3 -0
  53. evalscope/collections/evaluator.py +198 -0
  54. evalscope/collections/sampler.py +138 -0
  55. evalscope/collections/schema.py +126 -0
  56. evalscope/config.py +45 -7
  57. evalscope/constants.py +7 -38
  58. evalscope/evaluator/__init__.py +0 -1
  59. evalscope/evaluator/evaluator.py +89 -121
  60. evalscope/evaluator/rating_eval.py +1 -1
  61. evalscope/evaluator/reviewer/auto_reviewer.py +14 -5
  62. evalscope/metrics/__init__.py +3 -0
  63. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  64. evalscope/metrics/math_accuracy.py +193 -50
  65. evalscope/metrics/metrics.py +18 -6
  66. evalscope/metrics/named_metrics.py +17 -0
  67. evalscope/metrics/rouge_metric.py +13 -8
  68. evalscope/models/__init__.py +14 -1
  69. evalscope/models/base_adapter.py +52 -0
  70. evalscope/models/chat_adapter.py +140 -0
  71. evalscope/models/choice_adapter.py +211 -0
  72. evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +1 -1
  73. evalscope/models/custom_adapter.py +67 -0
  74. evalscope/models/local_model.py +74 -0
  75. evalscope/models/model.py +141 -0
  76. evalscope/models/server_adapter.py +111 -0
  77. evalscope/perf/__init__.py +1 -0
  78. evalscope/perf/arguments.py +3 -1
  79. evalscope/perf/benchmark.py +3 -3
  80. evalscope/perf/main.py +5 -7
  81. evalscope/perf/plugin/api/custom_api.py +1 -1
  82. evalscope/perf/plugin/api/openai_api.py +54 -50
  83. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  84. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  85. evalscope/perf/plugin/registry.py +3 -3
  86. evalscope/perf/utils/benchmark_util.py +4 -4
  87. evalscope/perf/utils/db_util.py +66 -22
  88. evalscope/perf/utils/local_server.py +4 -1
  89. evalscope/report/__init__.py +5 -0
  90. evalscope/report/app.py +693 -0
  91. evalscope/report/combinator.py +73 -0
  92. evalscope/report/generator.py +80 -0
  93. evalscope/report/utils.py +133 -0
  94. evalscope/run.py +64 -125
  95. evalscope/run_arena.py +3 -2
  96. evalscope/summarizer.py +15 -27
  97. evalscope/third_party/longbench_write/eval.py +2 -1
  98. evalscope/third_party/longbench_write/longbench_write.py +2 -1
  99. evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
  100. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  101. evalscope/utils/__init__.py +1 -0
  102. evalscope/utils/chat_service.py +6 -5
  103. evalscope/utils/io_utils.py +170 -0
  104. evalscope/utils/logger.py +13 -0
  105. evalscope/utils/model_utils.py +15 -2
  106. evalscope/utils/utils.py +3 -200
  107. evalscope/version.py +2 -2
  108. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/METADATA +129 -23
  109. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/RECORD +119 -115
  110. tests/cli/test_collection.py +57 -0
  111. tests/cli/test_run.py +57 -7
  112. tests/perf/test_perf.py +3 -2
  113. tests/rag/test_mteb.py +3 -2
  114. tests/vlm/test_vlmeval.py +3 -2
  115. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
  116. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
  117. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
  118. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
  119. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
  120. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
  121. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
  122. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
  123. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
  124. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  125. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  126. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  127. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  128. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
  129. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
  130. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
  131. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
  132. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  133. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
  134. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
  135. evalscope/evaluator/humaneval_evaluator.py +0 -158
  136. evalscope/models/api/__init__.py +0 -3
  137. evalscope/models/dummy_chat_model.py +0 -49
  138. evalscope/models/model_adapter.py +0 -525
  139. evalscope/models/openai_model.py +0 -103
  140. evalscope/tools/__init__.py +0 -1
  141. evalscope/tools/combine_reports.py +0 -135
  142. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  143. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  144. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/LICENSE +0 -0
  145. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/WHEEL +0 -0
  146. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/entry_points.txt +0 -0
  147. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/race/race_adapter.py CHANGED
@@ -1,57 +1,41 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- import json
  import os

- from evalscope.benchmarks.data_adapter import DataAdapter
- from evalscope.metrics.metrics import exact_match, weighted_mean
- from evalscope.utils import jsonl_to_list, normalize_score
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.constants import EvalType
+ from evalscope.metrics import AverageAccuracy, exact_match
+ from evalscope.models import MultiChoiceModelAdapter
+ from evalscope.utils import ResponseParser
+ from evalscope.utils.io_utils import jsonl_to_list
  from evalscope.utils.logger import get_logger

  # flake8: noqa

  logger = get_logger()

- DATASET_ID = 'modelscope/race'
-
- SUBSET_LIST = ['high', 'middle']
-
- SUBJECT_MAPPING = {'high': 'High', 'middle': 'Middle'}
-

+ @Benchmark.register(
+     name='race',
+     dataset_id='modelscope/race',
+     model_adapter=MultiChoiceModelAdapter,
+     subset_list=['high', 'middle'],
+     metric_list=[AverageAccuracy],
+     few_shot_num=3,
+     train_split='train',
+     eval_split='test',
+ )
  class RACEAdapter(DataAdapter):

      choices = ['A', 'B', 'C', 'D']

-     def __init__(self,
-                  subset_list: list = None,
-                  metric_list: list = None,
-                  few_shot_num: int = None,
-                  train_split: str = 'train',
-                  eval_split: str = 'test',
-                  **kwargs):
-
-         if subset_list is None:
-             subset_list = SUBSET_LIST
-
-         if metric_list is None:
-             metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-         if few_shot_num is None:
-             logger.info(f'Set 3-shot examples by system for RACE.')
-             few_shot_num = 3
-
+     def __init__(self, **kwargs):
+         few_shot_num = kwargs.get('few_shot_num', 3)
          if few_shot_num > 3:
              logger.warning(f'few_shot_num <= 3 for RACE, but got {few_shot_num}. Use 3-shot by default.')
-             few_shot_num = 3
+             kwargs['few_shot_num'] = 3

-         super().__init__(
-             subset_list=subset_list,
-             metric_list=metric_list,
-             few_shot_num=few_shot_num,
-             train_split=train_split,
-             eval_split=eval_split,
-             **kwargs)
+         super().__init__(**kwargs)

      def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
          data_dict = {}
@@ -98,13 +82,13 @@ class RACEAdapter(DataAdapter):

          full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)

-         return {'data': [full_prompt], 'multi_choices': self.choices}
+         return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}

      def get_gold_answer(self, input_d: dict) -> str:
          # Get the gold choice
          return input_d.get('answer', '')

-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
          """
          Parse the model output to get the answer. Could be the best choice index.

@@ -116,98 +100,18 @@ class RACEAdapter(DataAdapter):
          Returns:
              The parsed answer. Depending on the dataset. Usually a string for chat.
          """
-         if eval_type == 'checkpoint':
-             return result
-         elif eval_type == 'service':  # TODO: to be implemented
-             return result
-         elif eval_type == 'custom':  # TODO: to be implemented
+         if eval_type == EvalType.CHECKPOINT:
              return result
+         elif eval_type == EvalType.SERVICE:
+             return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
+         elif eval_type == EvalType.CUSTOM:
+             return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
          else:
              raise ValueError(f'Unknown eval_type: {eval_type}')

      def match(self, gold: str, pred: str) -> float:
          return exact_match(gold=gold, pred=pred)

-     def compute_metric(self, review_res_list: list) -> float:
-         """
-         Compute evaluation result by specific metric.
-
-         Args:
-             review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-         Returns:
-             The metric score.
-         """
-         items = [(score, 1.0) for score in review_res_list]
-         return weighted_mean(items)
-
-     def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-         """
-         Generate report for the evaluation.
-
-         Args:
-             subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-             report_name: The user-defined report name.
-
-         Returns:
-             {
-                 "name": "RACE",
-                 "metric": "WeightedAverageAccuracy",
-                 "score": 0.3389,
-                 "category": [
-                     {
-                         "name": "High",
-                         "score": 0.2528,
-                         "subset": [
-                             {
-                                 "name": "high",
-                                 "score": 0.2528
-                             }
-                         ]
-                     }
-                 ],
-                 "total_num": 59
-             }
-         """
-         total_num: int = sum([num for _, num in subset_score_map.values()])
-         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-
-         # Get domain-subject mapping
-         subject_review_map = {}
-         for subset_name, (subset_score, num) in subset_score_map.items():
-             domain_name: str = SUBJECT_MAPPING.get(subset_name)
-             if domain_name in subject_review_map:
-                 subject_review_map[domain_name].append((subset_name, subset_score, num))
-             else:
-                 subject_review_map[domain_name] = [(subset_name, subset_score, num)]
-
-         # Get domain score
-         category_list = []
-         for domain_name, domain_res_list in subject_review_map.items():
-             domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
-                 sum([num for _, _, num in domain_res_list])
-             domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
-             category_list.append({
-                 'name': domain_name,
-                 'score': normalize_score(score=domain_weighted_avg_acc),
-                 'subset': [{
-                     'name': subset_name,
-                     'score': subset_score
-                 } for subset_name, subset_score, _ in domain_res_list]
-             })
-
-         # Get final dict of report
-         res_map = dict(
-             name=report_name or 'race',
-             metric=self.metric_list[0]['name'],
-             score=weighted_avg_acc,
-             category=category_list,
-             total_num=total_num)
-
-         return res_map
-
      @classmethod
      def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:

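The pattern above replaces the old per-benchmark module constants (DATASET_ID, SUBSET_LIST, SUBJECT_MAPPING) and long __init__ signatures with a single @Benchmark.register decorator. A minimal sketch of how a custom adapter might be declared under the new registration API, based only on the fields visible in this diff; the benchmark name, dataset id, and answer fields below are illustrative placeholders, and the prompt-construction and data-loading hooks (gen_prompt, load_from_disk) are omitted:

    from evalscope.benchmarks import Benchmark, DataAdapter
    from evalscope.constants import EvalType
    from evalscope.metrics import AverageAccuracy
    from evalscope.models import ChatGenerationModelAdapter

    @Benchmark.register(
        name='my_benchmark',                      # illustrative, not part of this diff
        dataset_id='modelscope/my_benchmark',     # illustrative dataset id
        model_adapter=ChatGenerationModelAdapter,
        subset_list=['default'],
        metric_list=[AverageAccuracy],
        few_shot_num=0,
        train_split=None,
        eval_split='test',
    )
    class MyBenchmarkAdapter(DataAdapter):

        def __init__(self, **kwargs):
            # Defaults now come from the register() call; __init__ only forwards kwargs.
            super().__init__(**kwargs)

        def get_gold_answer(self, input_d: dict) -> str:
            return input_d.get('answer', '')      # 'answer' field is illustrative

        def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
            return result

        def match(self, gold: str, pred: str) -> float:
            return float(gold.strip() == pred.strip())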
evalscope/benchmarks/trivia_qa/__init__.py CHANGED
@@ -1,6 +1 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
-
- from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import DATASET_ID, SUBSET_LIST
- from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import TriviaQaAdapter
- from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import TriviaQaAdapter as DataAdapterClass
- from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py CHANGED
@@ -1,49 +1,35 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
  # Copyright (c) EleutherAI Inc, and its affiliates.
  import csv
- import numpy as np
  import os
- from typing import List

+ from evalscope.benchmarks import Benchmark
  from evalscope.benchmarks.data_adapter import DataAdapter
- from evalscope.metrics.metrics import exact_match, weighted_mean
- from evalscope.utils.logger import get_logger
+ from evalscope.constants import EvalType
+ from evalscope.metrics import AverageAccuracy
+ from evalscope.models import ChatGenerationModelAdapter
+ from evalscope.utils import get_logger

  # flake8: noqa

  logger = get_logger()

- DATASET_ID = 'modelscope/trivia_qa'
- SUBSET_LIST = ['default']
-

+ @Benchmark.register(
+     name='trivia_qa',
+     dataset_id='modelscope/trivia_qa',
+     model_adapter=ChatGenerationModelAdapter,
+     subset_list=['default'],
+     metric_list=[AverageAccuracy],
+     few_shot_num=5,
+     train_split='dev',
+     eval_split='test',
+ )
  class TriviaQaAdapter(DataAdapter):

-     def __init__(self,
-                  subset_list: list = None,
-                  metric_list: list = None,
-                  few_shot_num: int = None,
-                  train_split: str = 'dev',
-                  eval_split: str = 'test',
-                  **kwargs):
-
-         if subset_list is None:
-             subset_list = SUBSET_LIST
-
-         if metric_list is None:
-             metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
+     def __init__(self, **kwargs):

-         if few_shot_num is None:
-             logger.info(f'few_shot_num is not specified for TriviaQA, use default value: 5')
-             few_shot_num = 5
-
-         super().__init__(
-             subset_list=subset_list,
-             metric_list=metric_list,
-             few_shot_num=few_shot_num,
-             train_split=train_split,
-             eval_split=eval_split,
-             **kwargs)
+         super().__init__(**kwargs)

      def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
          data_dict = {}
@@ -113,16 +99,16 @@ class TriviaQaAdapter(DataAdapter):
          few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
          context: str = '\n'.join(few_shot_prompts) + '\n'
          context += self._generate_prompt(input_d=input_d, include_answer=False)
-         full_prompt = prompt + context
+         full_prompt = context

-         return {'data': [full_prompt]}
+         return {'data': [full_prompt], 'system_prompt': prompt or self.prompt_template}

      def get_gold_answer(self, input_d: dict) -> list:
          # Get the gold choice
          ans: list = input_d.get('ideal', [])
          return ans

-     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
          """
          Parse the model output to get the answer.

@@ -134,73 +120,11 @@ class TriviaQaAdapter(DataAdapter):
          Returns:
              The predicted answer.
          """
-         if eval_type == 'checkpoint':
-             return result
-         elif eval_type == 'service':  # TODO: to be implemented
-             return result
-         elif eval_type == 'custom':  # TODO: to be implemented
-             return result
-         else:
-             raise ValueError(f'Unknown eval_type: {eval_type}')
+         return result

      def match(self, gold: list, pred: str) -> float:
-         return max([exact_match(gold=ref, pred=pred) for ref in gold])
-
-     def compute_metric(self, review_res_list: list) -> float:
-         """
-         Compute evaluation result by specific metric.
-
-         Args:
-             review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-         Returns:
-             The metric score.
-         """
-         items = [(score, 1.0) for score in review_res_list]
-         return weighted_mean(items)
-
-     def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-         """
-         Generate the report for the model output.
-
-         Args:
-             subset_score_map: {subset_name: (score, num), ...}
-             report_name: The user-defined report name.
-
-         Returns:
-             {
-                 "name": "TriviaQA",
-                 "metric": "WeightedAverageAccuracy",
-                 "score": 0.3389,
-                 "category": [
-                     {
-                         "name": "DEFAULT",
-                         "score": 0.3389,
-                         "subset": [
-                             {
-                                 "name": "default",
-                                 "score": 0.3389
-                             }
-                         ]
-                     }
-                 ],
-                 "total_num": 100
-             }
-         """
-         total_num: int = sum([num for _, num in subset_score_map.values()])
-         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-         cate_avg_list = [{'name': subset_name, 'score': score} for subset_name, (score, _) in subset_score_map.items()]
-
-         category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-         res_map = dict(
-             name=report_name or 'trivia_qa',
-             metric=self.metric_list[0]['name'],
-             score=weighted_avg_acc,
-             category=[category_d],
-             total_num=total_num)
-
-         return res_map
+         is_correct = any([cand in pred for cand in gold])
+         return 1 if is_correct else 0

      @classmethod
      def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
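Note the change in TriviaQA scoring: the old match() required an exact string match against one of the reference answers (exact_match is effectively strict equality), while the new one counts a prediction as correct if any reference string appears as a substring of the model output. A tiny self-contained illustration with made-up values, to show why the two rules can disagree:

    gold = ['Paris', 'City of Paris']          # reference answers (illustrative)
    pred = 'The capital of France is Paris.'   # model output (illustrative)

    # Old rule: exact equality with any reference -> 0.0 here.
    old_score = max(float(ref == pred) for ref in gold)

    # New rule: substring containment of any reference -> 1 here.
    new_score = 1 if any(ref in pred for ref in gold) else 0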
evalscope/benchmarks/truthful_qa/__init__.py CHANGED
@@ -1,6 +1 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
-
- from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import DATASET_ID, SUBSET_LIST
- from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import TruthfulQaAdapter
- from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import TruthfulQaAdapter as DataAdapterClass
- from evalscope.models.model_adapter import ContinuationLogitsModelAdapter as ModelAdapterClass  # noqa
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py CHANGED
@@ -6,20 +6,30 @@ import numpy as np
  import os
  from typing import List

+ from evalscope.benchmarks import Benchmark
  from evalscope.benchmarks.data_adapter import DataAdapter
- from evalscope.metrics.metrics import weighted_mean
- from evalscope.utils import normalize_score
- from evalscope.utils.logger import get_logger
+ from evalscope.constants import EvalType
+ from evalscope.metrics import AverageAccuracy
+ from evalscope.models import ContinuationLogitsModelAdapter
+ from evalscope.utils import get_logger, normalize_score

  # flake8: noqa

  logger = get_logger()

- DATASET_ID = 'modelscope/truthful_qa'
  # task_list = ['generation', 'multiple_choice']
- SUBSET_LIST = ['multiple_choice']


+ @Benchmark.register(
+     name='truthful_qa',
+     dataset_id='modelscope/truthful_qa',
+     model_adapter=ContinuationLogitsModelAdapter,
+     subset_list=['multiple_choice'],
+     metric_list=[AverageAccuracy],
+     few_shot_num=0,
+     train_split=None,
+     eval_split='validation',
+ )
  class TruthfulQaAdapter(DataAdapter):
      """
      Adapter for TruthfulQA benchmark.
@@ -40,35 +50,14 @@ class TruthfulQaAdapter(DataAdapter):
                   'Q: Where were the 1992 Olympics held?\n'
                   'A: The 1992 Olympics were held in Barcelona, Spain.')

-     def __init__(self,
-                  subset_list: list = None,
-                  metric_list: list = None,
-                  few_shot_num: int = None,
-                  train_split: str = None,
-                  eval_split: str = 'validation',
-                  **kwargs):
-
-         if subset_list is None:
-             subset_list = SUBSET_LIST
-
-         if metric_list is None:
-             metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-         if few_shot_num is None:
-             logger.info(f'Set 0-shot examples by system for TruthfulQA.')
-             few_shot_num = 0
+     def __init__(self, **kwargs):

+         few_shot_num = kwargs.get('few_shot_num', 0)
          if few_shot_num != 0:
              logger.warning(f'few_shot_num should be 0 for TruthfulQA, but got {few_shot_num}. Use 0-shot by default.')
-             few_shot_num = 0
+             kwargs['few_shot_num'] = 0

-         super().__init__(
-             subset_list=subset_list,
-             metric_list=metric_list,
-             few_shot_num=few_shot_num,
-             train_split=train_split,
-             eval_split=eval_split,
-             **kwargs)
+         super().__init__(**kwargs)

      def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
          data_dict = {}
@@ -215,7 +204,7 @@ class TruthfulQaAdapter(DataAdapter):
          # TODO: generation sub-task to be added
          return {'mc1_labels': input_d['mc1_targets']['labels'], 'mc2_labels': input_d['mc2_targets']['labels']}

-     def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> list:
+     def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> list:
          """
          Parse the model output to get the answer.

@@ -227,11 +216,11 @@ class TruthfulQaAdapter(DataAdapter):
          Returns:
              The predicted answer.
          """
-         if eval_type == 'checkpoint':
+         if eval_type == EvalType.CHECKPOINT:
              return result
-         elif eval_type == 'service':  # TODO: to be supported !
+         elif eval_type == EvalType.SERVICE:  # TODO: to be supported !
              return result
-         elif eval_type == 'custom':  # TODO: to be supported !
+         elif eval_type == EvalType.CUSTOM:  # TODO: to be supported !
              return result
          else:
              raise ValueError(f'Invalid eval_type: {eval_type}')
@@ -270,7 +259,7 @@ class TruthfulQaAdapter(DataAdapter):

          return {'multiple_choice': {'mc1': mc1(mc1_lls), 'mc2': mc2(mc2_lls)}}  # or {'generation': xxx}

-     def compute_metric(self, review_res_list: List[dict]) -> float:
+     def compute_metric(self, review_res_list: List[dict]) -> List[dict]:
          """
          Compute evaluation result by specific metric for each subset.

@@ -295,56 +284,8 @@ class TruthfulQaAdapter(DataAdapter):
                  logger.error(f'** Unknown review_res: {review_res_d}')

          # To get mc2 score
-         items = [(score, 1.0) for score in mc2_list]
-         return weighted_mean(items)
-
-     def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-         """
-         Generate the report for the model output.
-
-         Args:
-             subset_score_map: {subset_name: (score, num), ...}
-             report_name: The user-defined report name.
-
-         Returns:
-             {
-                 "name": "TruthfulQA",
-                 "metric": "WeightedAverageAccuracy",
-                 "score": 0.3389,
-                 "category": [
-                     {
-                         "name": "DEFAULT",
-                         "score": 0.2527,
-                         "subset": [
-                             {
-                                 "name": "multiple_choice",
-                                 "score": 0.3157
-                             },
-                             # {
-                             #     "name": "generation",
-                             #     "score": 0.2631
-                             # }
-                         ]
-                     }
-                 ],
-                 "total_num": 100
-             }
-         """
-         total_num: int = sum([num for _, num in subset_score_map.values()])
-         weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-         weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-         cate_avg_list = [{
-             'name': subset_name,
-             'score': normalize_score(score=score)
-         } for subset_name, (score, _) in subset_score_map.items()]
-
-         category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-         res_map = dict(
-             name=report_name or 'truthful_qa',
-             metric=self.metric_list[0]['name'],
-             score=weighted_avg_acc,
-             category=[category_d],
-             total_num=total_num)
-
-         return res_map
+         return [{
+             'metric_name': self.metric_list[0].name,
+             'score': self.metric_list[0].object(mc2_list),
+             'num': len(mc2_list)
+         }]
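Across these adapters, the old dict-style metric entries ({'name': 'WeightedAverageAccuracy', 'object': weighted_mean}) are replaced by metric objects such as AverageAccuracy, and compute_metric now returns structured records instead of a bare float, while gen_report moves out of the adapters (into the new evalscope/report package listed above). A sketch of the shape implied by the new return value; the score list is made up, and the aggregation is presumably a mean, given the metric's name:

    from evalscope.metrics import AverageAccuracy

    review_scores = [0, 1, 1, 0]  # illustrative per-sample review scores

    # Mirrors the new compute_metric return value shown above:
    # one record per metric, rather than a single float.
    result = [{
        'metric_name': AverageAccuracy.name,            # metric identifier used in reports
        'score': AverageAccuracy.object(review_scores),  # callable that aggregates raw scores
        'num': len(review_scores),
    }]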
evalscope/cli/cli.py CHANGED
@@ -2,6 +2,7 @@

  import argparse

+ from evalscope.cli.start_app import StartAppCMD
  from evalscope.cli.start_eval import EvalCMD
  from evalscope.cli.start_perf import PerfBenchCMD

@@ -12,6 +13,7 @@ def run_cmd():

      PerfBenchCMD.define_args(subparsers)
      EvalCMD.define_args(subparsers)
+     StartAppCMD.define_args(subparsers)

      args = parser.parse_args()

evalscope/cli/start_app.py ADDED
@@ -0,0 +1,30 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ import os
+ from argparse import ArgumentParser
+
+ from evalscope.cli.base import CLICommand
+ from evalscope.report.app import add_argument, create_app
+
+
+ def subparser_func(args):
+     """ Function which will be called for a specific sub parser.
+     """
+     return StartAppCMD(args)
+
+
+ class StartAppCMD(CLICommand):
+     name = 'app'
+
+     def __init__(self, args):
+         self.args = args
+
+     @staticmethod
+     def define_args(parsers: ArgumentParser):
+         """ define args for create pipeline template command.
+         """
+         parser = parsers.add_parser(StartAppCMD.name)
+         add_argument(parser)
+         parser.set_defaults(func=subparser_func)
+
+     def execute(self):
+         create_app(self.args)
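StartAppCMD wires the new report visualization app into the CLI under the subcommand name 'app'; from the shell this is presumably invoked as `evalscope app`, given that cli.py above registers it alongside the existing eval and perf subcommands. The same entry points it uses (add_argument and create_app from evalscope.report.app) can also be called directly; a minimal sketch, assuming all of the app's options have defaults so an empty argument list is acceptable:

    from argparse import ArgumentParser

    from evalscope.report.app import add_argument, create_app

    parser = ArgumentParser('evalscope-app')  # illustrative parser name
    add_argument(parser)           # registers the app's CLI options (defined in report/app.py)
    args = parser.parse_args([])   # empty list -> use the defaults
    create_app(args)               # launches the report visualization app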
evalscope/collections/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from evalscope.collections.evaluator import EvaluatorCollection
+ from evalscope.collections.sampler import StratifiedSampler, UniformSampler, WeightedSampler
+ from evalscope.collections.schema import CollectionSchema, DatasetInfo
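The new collections package (files 52-55 in the list above) bundles a schema, three samplers, and an EvaluatorCollection; this __init__ only re-exports them, and the constructor signatures are not visible in this diff. The sketch below is therefore a hypothetical wiring, with every argument name and call assumed rather than taken from the source:

    from evalscope.collections import CollectionSchema, DatasetInfo, WeightedSampler

    # All arguments below are assumptions for illustration only; the real
    # signatures live in evalscope/collections/schema.py and sampler.py.
    schema = CollectionSchema(name='demo_mix', datasets=[
        DatasetInfo(name='gsm8k', weight=2.0),
        DatasetInfo(name='arc', weight=1.0),
    ])
    sampler = WeightedSampler(schema)   # assumed: draws items in proportion to dataset weights
    mixture = sampler.sample(100)       # assumed call: a 100-item mixed evaluation set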