evalscope-0.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. evalscope/__init__.py +3 -0
  2. evalscope/backend/__init__.py +3 -0
  3. evalscope/backend/base.py +27 -0
  4. evalscope/backend/opencompass/__init__.py +3 -0
  5. evalscope/backend/opencompass/api_meta_template.py +64 -0
  6. evalscope/backend/opencompass/backend_manager.py +247 -0
  7. evalscope/backend/opencompass/tasks/__init__.py +1 -0
  8. evalscope/backend/opencompass/tasks/eval_api.py +30 -0
  9. evalscope/backend/opencompass/tasks/eval_datasets.py +71 -0
  10. evalscope/backend/vlm_eval_kit/__init__.py +1 -0
  11. evalscope/backend/vlm_eval_kit/backend_manager.py +153 -0
  12. evalscope/benchmarks/__init__.py +4 -0
  13. evalscope/benchmarks/arc/__init__.py +5 -0
  14. evalscope/benchmarks/arc/ai2_arc.py +148 -0
  15. evalscope/benchmarks/arc/arc_adapter.py +231 -0
  16. evalscope/benchmarks/bbh/__init__.py +6 -0
  17. evalscope/benchmarks/bbh/bbh_adapter.py +308 -0
  18. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +23 -0
  19. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +25 -0
  20. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +33 -0
  21. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +37 -0
  22. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +72 -0
  23. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +44 -0
  24. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +78 -0
  25. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +28 -0
  26. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +37 -0
  27. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +37 -0
  28. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +37 -0
  29. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +42 -0
  30. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +25 -0
  31. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +43 -0
  32. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +37 -0
  33. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +41 -0
  34. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +63 -0
  35. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +44 -0
  36. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +40 -0
  37. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +30 -0
  38. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +10 -0
  39. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +77 -0
  40. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +40 -0
  41. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +40 -0
  42. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +40 -0
  43. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +28 -0
  44. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +17 -0
  45. evalscope/benchmarks/benchmark.py +65 -0
  46. evalscope/benchmarks/ceval/__init__.py +5 -0
  47. evalscope/benchmarks/ceval/ceval_adapter.py +340 -0
  48. evalscope/benchmarks/ceval/ceval_exam.py +159 -0
  49. evalscope/benchmarks/cmmlu/__init__.py +5 -0
  50. evalscope/benchmarks/cmmlu/cmmlu.py +166 -0
  51. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +369 -0
  52. evalscope/benchmarks/competition_math/__init__.py +5 -0
  53. evalscope/benchmarks/competition_math/competition_math.py +88 -0
  54. evalscope/benchmarks/competition_math/competition_math_adapter.py +470 -0
  55. evalscope/benchmarks/data_adapter.py +263 -0
  56. evalscope/benchmarks/general_qa/__init__.py +5 -0
  57. evalscope/benchmarks/general_qa/general_qa_adapter.py +186 -0
  58. evalscope/benchmarks/gsm8k/__init__.py +5 -0
  59. evalscope/benchmarks/gsm8k/gsm8k.py +127 -0
  60. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +236 -0
  61. evalscope/benchmarks/hellaswag/__init__.py +5 -0
  62. evalscope/benchmarks/hellaswag/hellaswag.py +116 -0
  63. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +222 -0
  64. evalscope/benchmarks/humaneval/__init__.py +5 -0
  65. evalscope/benchmarks/humaneval/humaneval.py +82 -0
  66. evalscope/benchmarks/humaneval/humaneval_adapter.py +21 -0
  67. evalscope/benchmarks/mmlu/__init__.py +5 -0
  68. evalscope/benchmarks/mmlu/mmlu.py +174 -0
  69. evalscope/benchmarks/mmlu/mmlu_adapter.py +375 -0
  70. evalscope/benchmarks/race/__init__.py +5 -0
  71. evalscope/benchmarks/race/race.py +118 -0
  72. evalscope/benchmarks/race/race_adapter.py +229 -0
  73. evalscope/benchmarks/trivia_qa/__init__.py +5 -0
  74. evalscope/benchmarks/trivia_qa/trivia_qa.py +104 -0
  75. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +207 -0
  76. evalscope/benchmarks/truthful_qa/__init__.py +5 -0
  77. evalscope/benchmarks/truthful_qa/truthful_qa.py +167 -0
  78. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +351 -0
  79. evalscope/cache.py +98 -0
  80. evalscope/cli/__init__.py +1 -0
  81. evalscope/cli/base.py +20 -0
  82. evalscope/cli/cli.py +26 -0
  83. evalscope/cli/start_perf.py +37 -0
  84. evalscope/cli/start_server.py +138 -0
  85. evalscope/config.py +165 -0
  86. evalscope/constants.py +150 -0
  87. evalscope/evaluator/__init__.py +3 -0
  88. evalscope/evaluator/evaluator.py +689 -0
  89. evalscope/evaluator/rating_eval.py +178 -0
  90. evalscope/evaluator/reviewer/__init__.py +1 -0
  91. evalscope/evaluator/reviewer/auto_reviewer.py +411 -0
  92. evalscope/metrics/__init__.py +1 -0
  93. evalscope/metrics/bundled_rouge_score/__init__.py +14 -0
  94. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +342 -0
  95. evalscope/metrics/code_metric.py +104 -0
  96. evalscope/metrics/math_accuracy.py +60 -0
  97. evalscope/metrics/metrics.py +405 -0
  98. evalscope/metrics/rouge_metric.py +129 -0
  99. evalscope/models/__init__.py +4 -0
  100. evalscope/models/custom/__init__.py +4 -0
  101. evalscope/models/custom/custom_model.py +53 -0
  102. evalscope/models/dummy_chat_model.py +50 -0
  103. evalscope/models/model.py +88 -0
  104. evalscope/models/model_adapter.py +586 -0
  105. evalscope/models/openai_model.py +103 -0
  106. evalscope/models/template.py +1446 -0
  107. evalscope/perf/__init__.py +0 -0
  108. evalscope/perf/_logging.py +32 -0
  109. evalscope/perf/api_plugin_base.py +60 -0
  110. evalscope/perf/custom_api.py +87 -0
  111. evalscope/perf/dashscope_api.py +84 -0
  112. evalscope/perf/dataset_plugin_base.py +64 -0
  113. evalscope/perf/datasets/__init__.py +0 -0
  114. evalscope/perf/datasets/line_by_line.py +18 -0
  115. evalscope/perf/datasets/longalpaca_12k.py +20 -0
  116. evalscope/perf/datasets/openqa.py +22 -0
  117. evalscope/perf/how_to_analysis_result.py +24 -0
  118. evalscope/perf/http_client.py +756 -0
  119. evalscope/perf/openai_api.py +130 -0
  120. evalscope/perf/plugin_registry.py +35 -0
  121. evalscope/perf/query_parameters.py +42 -0
  122. evalscope/perf/server_sent_event.py +43 -0
  123. evalscope/preprocess/__init__.py +1 -0
  124. evalscope/preprocess/tokenizers/__init__.py +0 -0
  125. evalscope/preprocess/tokenizers/gpt2_tokenizer.py +221 -0
  126. evalscope/registry/__init__.py +1 -0
  127. evalscope/registry/tasks/arc.yaml +29 -0
  128. evalscope/registry/tasks/bbh.yaml +27 -0
  129. evalscope/registry/tasks/bbh_mini.yaml +27 -0
  130. evalscope/registry/tasks/ceval.yaml +27 -0
  131. evalscope/registry/tasks/ceval_mini.yaml +27 -0
  132. evalscope/registry/tasks/cmmlu.yaml +27 -0
  133. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +28 -0
  134. evalscope/registry/tasks/general_qa.yaml +27 -0
  135. evalscope/registry/tasks/gsm8k.yaml +29 -0
  136. evalscope/registry/tasks/mmlu.yaml +29 -0
  137. evalscope/registry/tasks/mmlu_mini.yaml +27 -0
  138. evalscope/run.py +404 -0
  139. evalscope/run_arena.py +204 -0
  140. evalscope/run_ms.py +140 -0
  141. evalscope/summarizer.py +144 -0
  142. evalscope/third_party/__init__.py +1 -0
  143. evalscope/third_party/toolbench_static/__init__.py +3 -0
  144. evalscope/third_party/toolbench_static/eval.py +219 -0
  145. evalscope/third_party/toolbench_static/infer.py +278 -0
  146. evalscope/third_party/toolbench_static/llm/__init__.py +1 -0
  147. evalscope/third_party/toolbench_static/llm/swift_infer.py +45 -0
  148. evalscope/third_party/toolbench_static/toolbench_static.py +50 -0
  149. evalscope/tools/__init__.py +1 -0
  150. evalscope/tools/combine_reports.py +140 -0
  151. evalscope/tools/gen_mmlu_subject_mapping.py +90 -0
  152. evalscope/tools/rewrite_eval_results.py +95 -0
  153. evalscope/utils/__init__.py +4 -0
  154. evalscope/utils/arena_utils.py +247 -0
  155. evalscope/utils/completion_parsers.py +87 -0
  156. evalscope/utils/logger.py +64 -0
  157. evalscope/utils/task_cfg_parser.py +10 -0
  158. evalscope/utils/task_utils.py +19 -0
  159. evalscope/utils/utils.py +625 -0
  160. evalscope/version.py +4 -0
  161. evalscope-0.5.0.dist-info/METADATA +566 -0
  162. evalscope-0.5.0.dist-info/RECORD +165 -0
  163. evalscope-0.5.0.dist-info/WHEEL +5 -0
  164. evalscope-0.5.0.dist-info/entry_points.txt +3 -0
  165. evalscope-0.5.0.dist-info/top_level.txt +1 -0
evalscope/benchmarks/data_adapter.py
@@ -0,0 +1,263 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ import os.path
+ from abc import ABC, abstractmethod
+ from typing import Any, Optional
+ import random
+
+ from evalscope.benchmarks import Benchmark
+ from evalscope.constants import DEFAULT_ROOT_CACHE_DIR, AnswerKeys
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ class DataAdapter(ABC):
+
+     def __init__(self,
+                  subset_list: list,
+                  metric_list: list,
+                  few_shot_num: Optional[int] = 0,
+                  train_split: Optional[str] = None,
+                  eval_split: Optional[str] = None,
+                  prompt_template: str = '',
+                  **kwargs):
+         """
+         Args:
+             subset_list: list of subset names for the dataset.
+             metric_list: list, the metric list to evaluate the model on specific benchmark.
+             few_shot_num: int, number of few-shot examples. Default: 0
+             train_split: str, usually for few-shot examples. e.g. 'train'
+             eval_split: str, the target eval split name. e.g. 'test'
+             prompt_template: str, the prompt template for the benchmark,
+                 e.g. for ARC, it is `The following are multiple choice questions, please output correct answer in the form of A or B or C or D, do not output explanation:`
+         """
+         self.subset_list = subset_list
+         self.metric_list = metric_list
+         self.few_shot_num = few_shot_num
+         self.train_split = train_split
+         self.eval_split = eval_split
+         self.prompt_template = prompt_template
+         self.config_kwargs = kwargs
+
+     def load(self,
+              dataset_name_or_path: str,
+              subset_list: list = None,
+              work_dir: Optional[str] = DEFAULT_ROOT_CACHE_DIR,
+              datasets_hub: str = 'ModelScope',
+              **kwargs) -> dict:
+         """
+         Load the dataset. Remote and local datasets are supported.
+         You can rewrite this method to support your own local dataset, just follow the format of the output.
+
+         Returns: {'subset_name': {'train': train_dataset, 'test': test_dataset}}
+             train_dataset, test_dataset: Iterable dataset, object each item of which is a dict.
+
+         """
+         dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
+         if datasets_hub == 'Local':
+             # Try to load dataset from local disk
+             if not os.path.exists(dataset_name_or_path):
+                 raise FileNotFoundError(f'Dataset path not found: {dataset_name_or_path}')
+
+             logger.info(f'Loading dataset from local disk: >dataset_name: {dataset_name_or_path} >work_dir: {work_dir}')
+             data_dict = self.load_from_disk(dataset_name_or_path, subset_list, work_dir, **kwargs)
+             if len(data_dict) == 0 or len(next(iter(data_dict.values()))) == 0:
+                 raise ValueError(f'Local dataset is empty: {dataset_name_or_path}')
+         else:
+             # Load dataset from remote
+             logger.info(f'Loading dataset from {datasets_hub} hub: >dataset_name: {dataset_name_or_path}')
+             data_dict = {}
+             split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
+             if len(split_list) == 0:
+                 logger.error(f'Got empty split list: {split_list}')
+
+             subset_list = subset_list if subset_list is not None else self.subset_list
+             for sub_name in subset_list:
+                 data_dict[sub_name] = {}
+                 # e.g. train: few-shot, test: target dataset to evaluate
+                 for split in split_list:
+                     dataset = Benchmark.load(dataset_name=dataset_name_or_path,
+                                              subset=sub_name,
+                                              split=split,
+                                              hub=datasets_hub,
+                                              work_dir=work_dir,
+                                              **kwargs)
+
+                     data_dict[sub_name].update({split: dataset})
+
+         return data_dict
+
+     def load_from_disk(self, *args, **kwargs) -> dict:
+         """
+         Load the dataset from local disk.
+         If you want to support local dataset, please rewrite this method in xxx_data_adapter.
+         """
+         return {}
+
+     def gen_prompts(self, data_dict: dict) -> dict:
+         """
+         Generate dataset prompts from raw input, unify the prompt format for different datasets.
+
+         Args:
+             data_dict: Refer to the output of load method: evalscope.benchmarks.benchmark.Benchmark.load
+
+         Returns:
+             {'subset_name': [prompt_d_1, prompt_d_2, ...]}
+             prompt_d_i (dict): refer to the output of gen_prompt method.
+
+         e.g. train -- few-shot data, test -- target dataset to evaluate.
+         """
+         res_dict: dict = {}
+
+         if self.few_shot_num and self.few_shot_num < 0:
+             raise ValueError(f'Invalid shot_num: {self.few_shot_num} for few-shot evaluation.')
+
+         logger.info(f'\n** Use default settings: \n'
+                     f'>few_shot_num: {self.few_shot_num}, '
+                     f'>few_shot_split: {self.train_split}, '
+                     f'>target_eval_split: {self.eval_split}')
+
+         for sub_name, sub_data_dict in data_dict.items():
+             few_shot_data = []
+             if self.few_shot_num and self.few_shot_num > 0:
+                 few_shot_random: bool = self.config_kwargs.get('few_shot_random', True)
+                 few_shot_data = self.get_fewshot_examples(
+                     [item for item in sub_data_dict[self.train_split]],
+                     self.few_shot_num,
+                     few_shot_random=few_shot_random)
+
+             res_dict[sub_name] = []
+             for sample_d in sub_data_dict[self.eval_split]:
+                 prompt_d = self.gen_prompt(input_d=sample_d, subset_name=sub_name, few_shot_list=few_shot_data)
+                 prompt_d[AnswerKeys.RAW_INPUT] = sample_d
+                 res_dict[sub_name].append(prompt_d)
+
+         rnd = random.Random()
+         rnd.seed(42)
+         for k, v in res_dict.items():
+             rnd.shuffle(v)
+
+         return res_dict
+
+     @abstractmethod
+     def gen_prompt(self, *args, **kwargs) -> Any:
+         """
+         Generate model prompt from raw input, unify the prompt format for different datasets.
+         The input format is compatible with OpenAI Chat Completions APIs.
+         Refer to: https://platform.openai.com/docs/guides/gpt/chat-completions-api
+
+         Args:
+             input_d (Any): The raw input. Depending on the dataset.
+
+         Returns:
+             For class MultiChoiceModelAdapter, the output format is:
+                 {'data': [full_prompt]},  -- full_prompt: str, the constructed prompt for each sample from dataset.
+
+             For class ContinuationEvalModelAdapter, the output format is:
+                 {'data': ctx_continuation_pair_list, 'multi_choices': self.choices}
+         """
+         raise NotImplementedError
+
+     @abstractmethod
+     def get_gold_answer(self, input_d: Any) -> Any:
+         """
+         Parse the raw input labels (gold).
+
+         Args:
+             input_d: input raw data. Depending on the dataset.
+
+         Returns:
+             The parsed input. e.g. gold answer ... Depending on the dataset.
+         """
+         raise NotImplementedError
+
+     @abstractmethod
+     def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> Any:
+         """
+         Parse the predicted result and extract proper answer.
+
+         Args:
+             result: Predicted answer from the model. Usually a string for chat.
+             raw_input_d: The raw input. Depending on the dataset.
+             eval_type: 'checkpoint' or 'service' or `custom`, default: 'checkpoint'
+
+         Returns:
+             The parsed answer. Depending on the dataset. Usually a string for chat.
+         """
+         raise NotImplementedError
+
+     @abstractmethod
+     def match(self, gold: Any, pred: Any) -> Any:
+         """
+         Match the gold answer and the predicted answer.
+
+         Args:
+             gold (Any): The golden answer. Usually a string for chat/multiple-choice-questions.
+                         e.g. 'A'
+             pred (Any): The predicted answer. Usually a string for chat/multiple-choice-questions.
+                         e.g. 'B'
+
+         Returns:
+             The match result. Usually a score (float) for chat/multiple-choice-questions.
+         """
+         raise NotImplementedError
+
+     @abstractmethod
+     def compute_metric(self, review_res_list: list) -> Any:
+         """
+         Compute evaluation result by specific metrics.
+
+         Args:
+             review_res_list: list, the review result list, each item of which is match result for gold and pred.
+
+         Attributes:
+             DataAdapter.metric_func_map: metric_name -> metric_func mapping,
+                 e.g. {'WeightedAverageAccuracy': weighted_average_acc}
+
+         Returns:
+             Metric results.
+         """
+         raise NotImplementedError
+
+     def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
+         """
+         Generate report for the evaluation results for all subsets.
+
+         Args:
+             subset_score_map: The subset-score map.
+                 e.g. {subset_name: (score, num)}
+
+             report_name: str, the user-defined report name. Default: None
+
+         Returns: The evaluation report. Note: should normalize the score by normalize_score method in utils.
+
+         Here is a format example for ARC-Challenge:
+             {
+                 "name":"ARC-Challenge",
+                 "metric":"WeightedAverageAccuracy",
+                 "score": 0.3389,
+                 "category":[
+                     {
+                         "name":"DEFAULT",
+                         "score": 0.3389,
+                         "subset":[
+                             {
+                                 "name":"ARC-Challenge",
+                                 "score": 0.3389
+                             },
+                         ]
+                     }
+                 ],
+                 "total_num":100
+             }
+         """
+         raise NotImplementedError
+
+     def get_fewshot_examples(self, data_list: list, k: int, few_shot_random: bool = True):
+
+         if k > len(data_list):
+             k = len(data_list)
+         if few_shot_random:
+             return random.sample(data_list, k)
+         else:
+             return data_list[:k]
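
The adapter contract above comes down to five abstract methods (gen_prompt, get_gold_answer, parse_pred_result, match, compute_metric) plus an optional gen_report. The following is a minimal sketch, not part of the package, of what a custom subclass might look like; the class name, data keys and exact-match scoring are illustrative assumptions:

# Minimal sketch of a hypothetical DataAdapter subclass (names and data keys are illustrative).
from evalscope.benchmarks.data_adapter import DataAdapter


class MyQAAdapter(DataAdapter):  # hypothetical adapter, not shipped with evalscope

    def __init__(self, **kwargs):
        super().__init__(subset_list=['default'],
                         metric_list=[{'name': 'WeightedAverageAccuracy'}],
                         few_shot_num=0,
                         eval_split='test',
                         **kwargs)

    def gen_prompt(self, input_d, subset_name, few_shot_list, **kwargs) -> dict:
        # Assumes each sample carries a 'question' field.
        return {'data': [input_d['question']]}

    def get_gold_answer(self, input_d):
        return input_d['answer']

    def parse_pred_result(self, result, raw_input_d=None, eval_type='checkpoint'):
        return result.strip()

    def match(self, gold, pred) -> float:
        return float(gold == pred)

    def compute_metric(self, review_res_list: list) -> float:
        # Simple average of per-sample match scores.
        return sum(review_res_list) / max(len(review_res_list), 1)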
evalscope/benchmarks/general_qa/__init__.py
@@ -0,0 +1,5 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ from evalscope.benchmarks.general_qa.general_qa_adapter import DATASET_ID, SUBSET_LIST, GeneralQAAdapter
+ from evalscope.benchmarks.general_qa.general_qa_adapter import GeneralQAAdapter as DataAdapterClass
+ from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass
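
Each benchmark package in this release follows the same alias convention: it re-exports its data adapter as DataAdapterClass and a model adapter as ModelAdapterClass. A hedged sketch of how such aliases could be resolved generically (the runner's actual lookup code is not part of this hunk):

import importlib

# Resolve the adapter classes a benchmark package exposes through its aliases.
bench = importlib.import_module('evalscope.benchmarks.general_qa')
data_adapter = bench.DataAdapterClass()        # GeneralQAAdapter with default subsets/metrics
model_adapter_cls = bench.ModelAdapterClass    # ChatGenerationModelAdapter (class only, not instantiated here)
print(type(data_adapter).__name__, model_adapter_cls.__name__)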
evalscope/benchmarks/general_qa/general_qa_adapter.py
@@ -0,0 +1,186 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ import glob
+ import os.path
+
+ from evalscope.benchmarks.data_adapter import DataAdapter
+ from evalscope.metrics.metrics import bleu_ngram_one_sample, weighted_mean
+ from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
+ from evalscope.utils import jsonl_to_list
+ from evalscope.utils.logger import get_logger
+ from typing import Any, Optional
+ from collections import defaultdict
+ import json
+
+ logger = get_logger()
+
+ DATASET_ID = 'general_qa'
+ SUBSET_LIST = ['default']
+
+
+ class GeneralQAAdapter(DataAdapter):
+     # TODO: set few_shot_num
+
+     def __init__(self,
+                  subset_list: list = None,
+                  metric_list: list = None,
+                  train_split: str = None,
+                  eval_split: str = 'test',
+                  **kwargs):
+         if subset_list is None:
+             subset_list = SUBSET_LIST
+
+         if metric_list is None:
+             metric_list = [{'name': 'WeightedAverageBLEU', 'object': weighted_mean}]
+
+         super().__init__(subset_list=subset_list,
+                          metric_list=metric_list,
+                          train_split=train_split,
+                          eval_split=eval_split,
+                          **kwargs)
+
+     def load(self,
+              dataset_name_or_path: str,
+              subset_list: list = None,
+              **kwargs) -> dict:
+
+         data_file_list = glob.glob(os.path.join(dataset_name_or_path, '*.jsonl'))
+         data_list = []
+
+         try:
+             for file_path in data_file_list:
+                 data_list.extend(jsonl_to_list(file_path))
+         except Exception as e:
+             raise ValueError(f"Failed to load data from {dataset_name_or_path}, got error: {e}")
+
+         data_dict = {'default': {'test': data_list}}
+
+         return data_dict
+
+     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+         """
+         Args:
+             input_d:
+                 format1: {'history': [['q1', 'a1'], ['q2', 'a2']], 'question': '', 'answer': ''}
+                 format2: {'history': [['q1', 'a1'], ['q2', 'a2']], 'query': '', 'response': ''}
+
+         Returns:
+             {'data': [prompt]}
+
+         """
+         # prompt = f"'<|im_start|>user\n{input_d['input']}<|im_end|>\n<|im_start|>assistant\n'"
+         history = input_d.get('history', [])  # history: [['q1', 'a1'], ['q2', 'a2'], ...]
+         if len(history) > 0:
+             logger.warning(f"The history is not included in the prompt for GeneralQA. To be supported in the future.")
+
+         prompt = input_d.get('question', '') or input_d.get('query', '')
+
+         # if len(history) > 0:
+         #     prompt = '\n'.join(history) + '\n' + prompt
+         return {'data': [prompt]}
+
+     def get_gold_answer(self, input_d: dict) -> str:
+         """
+         Args:
+             input_d: {'history': [], 'question': '', 'answer': ''}
+
+         Returns:
+             gold_answer: str
+
+         """
+         return input_d.get('answer', '') or input_d.get('response', '')
+
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+         """
+         Args:
+             result: str
+
+         Returns:
+             pred_result: str
+
+         """
+         return result
+
+     def match(self, gold: str, pred: str) -> float:
+         """
+         Args:
+             gold: str
+             pred: str
+
+         Returns:
+             bleu_score: float
+
+         """
+         item = [(gold, pred)]
+         res = dict()
+         rouge_dict = compute_rouge_score_one_sample_zh([pred], [gold])
+         bleu_dict = bleu_ngram_one_sample(pred, gold)
+         res.update(rouge_dict)
+         res.update(bleu_dict)
+         # return bleu(item)
+         return res
+
+     def compute_metric(self, review_res_list: list) -> float:
+         """
+         compute weighted mean of the bleu score of all samples
+
+         Args:
+             review_res_list: [score1, score2, ...]
+
+         Returns:
+             avg_res: float
+
+         """
+         items = defaultdict(list)
+         for scores in review_res_list:
+             for k, v in scores.items():
+                 items[k].append((v, 1.0))
+         # items = [(score, 1.0) for score in review_res_list]
+         res = {k: weighted_mean(v) for k, v in items.items()}
+         # return weighted_mean(items)
+         return res
+
+     def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
+         """
+         Args:
+             subset_score_map: {subset_name: (score_dict, num), ...}
+             report_name: str, the user-defined report name.
+
+         Returns:
+             {
+                 "name":"GeneralQA",
+                 "metric":"WeightedAverageBLEU",
+                 "score":0.399,
+                 "category":[
+                     {
+                         "name":"DEFAULT",
+                         "score":0.399,
+                         "subset":[
+                             {
+                                 "name":"default",
+                                 "score":0.399
+                             },
+                         ]
+                     }
+                 ],
+                 "total_num":10
+             }
+         """
+         total_num: int = sum([num for _, num in subset_score_map.values()])
+         # weighted_avg_bleu: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
+         cate_avg_list = [{'name': subset_name, 'score': score_dict} for subset_name, (score_dict, _) in subset_score_map.items()]
+         total_avg_list = defaultdict(float)
+         for score_dict, num in subset_score_map.values():
+             for metric, score in score_dict.items():
+                 total_avg_list[metric] += score * num / total_num
+
+         category_d = dict(name="DEFAULT",
+                           score=total_avg_list,
+                           subset=cate_avg_list)
+
+         res_map = dict(name=report_name or "general_qa",
+                        metric=self.metric_list[0]['name'],
+                        score=total_avg_list,
+                        category=[category_d],
+                        total_num=total_num)
+
+         return res_map
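
Taken together, a rough end-to-end usage sketch for this adapter (the directory path and sample data below are hypothetical, and scoring relies on the ROUGE/BLEU dependencies used by evalscope.metrics):

from evalscope.benchmarks.general_qa.general_qa_adapter import GeneralQAAdapter

adapter = GeneralQAAdapter()
# load() globs *.jsonl files; each line is expected to hold
# {'question': ..., 'answer': ...} or {'query': ..., 'response': ...}.
data_dict = adapter.load('/path/to/local/qa_dir')           # hypothetical directory
prompts = adapter.gen_prompts(data_dict)                     # {'default': [prompt_d, ...]}

sample = {'question': 'What is 2+2?', 'answer': '4'}         # illustrative sample
gold = adapter.get_gold_answer(sample)
pred = adapter.parse_pred_result('4')
scores = adapter.match(gold=gold, pred=pred)                 # dict of per-sample ROUGE/BLEU scores
metrics = adapter.compute_metric([scores])                   # weighted mean per metric name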
evalscope/benchmarks/gsm8k/__init__.py
@@ -0,0 +1,5 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ from evalscope.benchmarks.gsm8k.gsm8k_adapter import DATASET_ID, SUBSET_LIST
+ from evalscope.benchmarks.gsm8k.gsm8k_adapter import GSM8KAdapter as DataAdapterClass
+ from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa
evalscope/benchmarks/gsm8k/gsm8k.py
@@ -0,0 +1,127 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # flake8: noqa
+
+ """Grade School Math 8k dataset."""
+
+ import json
+ import textwrap
+
+ import datasets
+
+
+ _CITATION = """\
+ @misc{cobbe2021training,
+     title={Training Verifiers to Solve Math Word Problems},
+     author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},
+     year={2021},
+     eprint={2110.14168},
+     archivePrefix={arXiv},
+     primaryClass={cs.LG}
+ }
+ """
+
+ _DESCRIPTION = """\
+ GSM8K (Grade School Math 8K) is a dataset of 8.5K high quality
+ linguistically diverse grade school math word problems. The
+ dataset was created to support the task of question answering
+ on basic mathematical problems that require multi-step reasoning.
+ """
+
+ _HOMEPAGE = 'https://openai.com/blog/grade-school-math'
+ _MODELSCOPE_PAGE = 'https://modelscope.cn/datasets/modelscope/gsm8k/summary'
+
+ _LICENSE = 'MIT'
+
+ # _BASE_URL = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/"
+ TRAIN_URL = 'https://sail-moe.oss-cn-hangzhou.aliyuncs.com/open_data/gsm8k/train.jsonl'
+ TEST_URL = 'https://sail-moe.oss-cn-hangzhou.aliyuncs.com/open_data/gsm8k/test.jsonl'
+
+
+ class Gsm8kConfig(datasets.BuilderConfig):
+     """BuilderConfig for GSM8K."""
+
+     def __init__(self, urls, **kwargs):
+         """BuilderConfig for GSM8K.
+         Args:
+             urls: *dict[string]*, the urls for each split of the GSM8k set.
+         """
+         super().__init__(version=datasets.Version('1.1.0'), **kwargs)
+         self.urls = urls
+
+
+ class Gsm8k(datasets.GeneratorBasedBuilder):
+     """Grade School Math 8k (GSM8K)"""
+
+     BUILDER_CONFIGS = [
+         Gsm8kConfig(
+             name='main',
+             description=textwrap.dedent(
+                 """
+                 It is segmented into 7.5K training problems and 1K test problems.
+                 These problems take between 2 and 8 steps to solve, and solutions
+                 primarily involve performing a sequence of elementary calculations
+                 using basic arithmetic operations (+ - / *) to reach the final
+                 answer. A bright middle school student should be able to solve
+                 every problem.
+                 """,
+             ),
+             urls={
+                 'train': TRAIN_URL,
+                 'test': TEST_URL,
+             },
+         ),
+     ]
+
+     def _info(self):
+         features = datasets.Features(
+             {
+                 'question': datasets.Value('string'),
+                 'answer': datasets.Value('string'),
+             }
+         )
+         return datasets.DatasetInfo(
+             description=_DESCRIPTION,
+             features=features,
+             homepage=_HOMEPAGE,
+             license=_LICENSE,
+             citation=_CITATION,
+         )
+
+     def _split_generators(self, dl_manager):
+         data_dir = dl_manager.download_and_extract(self.config.urls)
+         return [
+             datasets.SplitGenerator(
+                 name=datasets.Split.TRAIN,
+                 gen_kwargs={
+                     'filepath': data_dir['train'],
+                 },
+             ),
+             datasets.SplitGenerator(
+                 name=datasets.Split.TEST,
+                 gen_kwargs={
+                     'filepath': data_dir['test'],
+                 },
+             ),
+         ]
+
+     def _generate_examples(self, filepath):
+         with open(filepath, encoding='utf-8') as f:
+             for key, row in enumerate(f):
+                 data = json.loads(row)
+                 yield key, {
+                     'question': data['question'],
+                     'answer': data['answer'],
+                 }
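
As a hedged sketch of exercising this builder script directly with the Hugging Face datasets library (the local script path is illustrative; newer datasets releases may additionally require trust_remote_code=True):

import datasets

ds = datasets.load_dataset(
    'evalscope/benchmarks/gsm8k/gsm8k.py',  # path to the builder script shown above
    name='main',
)
sample = ds['test'][0]
print(sample['question'])
print(sample['answer'])  # GSM8K answers end with a '#### <final answer>' line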