evalscope 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. evalscope/__init__.py +3 -0
  2. evalscope/backend/__init__.py +3 -0
  3. evalscope/backend/base.py +27 -0
  4. evalscope/backend/opencompass/__init__.py +3 -0
  5. evalscope/backend/opencompass/api_meta_template.py +64 -0
  6. evalscope/backend/opencompass/backend_manager.py +247 -0
  7. evalscope/backend/opencompass/tasks/__init__.py +1 -0
  8. evalscope/backend/opencompass/tasks/eval_api.py +30 -0
  9. evalscope/backend/opencompass/tasks/eval_datasets.py +71 -0
  10. evalscope/backend/vlm_eval_kit/__init__.py +1 -0
  11. evalscope/backend/vlm_eval_kit/backend_manager.py +153 -0
  12. evalscope/benchmarks/__init__.py +4 -0
  13. evalscope/benchmarks/arc/__init__.py +5 -0
  14. evalscope/benchmarks/arc/ai2_arc.py +148 -0
  15. evalscope/benchmarks/arc/arc_adapter.py +231 -0
  16. evalscope/benchmarks/bbh/__init__.py +6 -0
  17. evalscope/benchmarks/bbh/bbh_adapter.py +308 -0
  18. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +23 -0
  19. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +25 -0
  20. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +33 -0
  21. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +37 -0
  22. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +72 -0
  23. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +44 -0
  24. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +78 -0
  25. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +28 -0
  26. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +37 -0
  27. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +37 -0
  28. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +37 -0
  29. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +42 -0
  30. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +25 -0
  31. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +43 -0
  32. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +37 -0
  33. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +41 -0
  34. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +63 -0
  35. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +44 -0
  36. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +40 -0
  37. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +30 -0
  38. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +10 -0
  39. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +77 -0
  40. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +40 -0
  41. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +40 -0
  42. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +40 -0
  43. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +28 -0
  44. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +17 -0
  45. evalscope/benchmarks/benchmark.py +65 -0
  46. evalscope/benchmarks/ceval/__init__.py +5 -0
  47. evalscope/benchmarks/ceval/ceval_adapter.py +340 -0
  48. evalscope/benchmarks/ceval/ceval_exam.py +159 -0
  49. evalscope/benchmarks/cmmlu/__init__.py +5 -0
  50. evalscope/benchmarks/cmmlu/cmmlu.py +166 -0
  51. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +369 -0
  52. evalscope/benchmarks/competition_math/__init__.py +5 -0
  53. evalscope/benchmarks/competition_math/competition_math.py +88 -0
  54. evalscope/benchmarks/competition_math/competition_math_adapter.py +470 -0
  55. evalscope/benchmarks/data_adapter.py +263 -0
  56. evalscope/benchmarks/general_qa/__init__.py +5 -0
  57. evalscope/benchmarks/general_qa/general_qa_adapter.py +186 -0
  58. evalscope/benchmarks/gsm8k/__init__.py +5 -0
  59. evalscope/benchmarks/gsm8k/gsm8k.py +127 -0
  60. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +236 -0
  61. evalscope/benchmarks/hellaswag/__init__.py +5 -0
  62. evalscope/benchmarks/hellaswag/hellaswag.py +116 -0
  63. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +222 -0
  64. evalscope/benchmarks/humaneval/__init__.py +5 -0
  65. evalscope/benchmarks/humaneval/humaneval.py +82 -0
  66. evalscope/benchmarks/humaneval/humaneval_adapter.py +21 -0
  67. evalscope/benchmarks/mmlu/__init__.py +5 -0
  68. evalscope/benchmarks/mmlu/mmlu.py +174 -0
  69. evalscope/benchmarks/mmlu/mmlu_adapter.py +375 -0
  70. evalscope/benchmarks/race/__init__.py +5 -0
  71. evalscope/benchmarks/race/race.py +118 -0
  72. evalscope/benchmarks/race/race_adapter.py +229 -0
  73. evalscope/benchmarks/trivia_qa/__init__.py +5 -0
  74. evalscope/benchmarks/trivia_qa/trivia_qa.py +104 -0
  75. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +207 -0
  76. evalscope/benchmarks/truthful_qa/__init__.py +5 -0
  77. evalscope/benchmarks/truthful_qa/truthful_qa.py +167 -0
  78. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +351 -0
  79. evalscope/cache.py +98 -0
  80. evalscope/cli/__init__.py +1 -0
  81. evalscope/cli/base.py +20 -0
  82. evalscope/cli/cli.py +26 -0
  83. evalscope/cli/start_perf.py +37 -0
  84. evalscope/cli/start_server.py +138 -0
  85. evalscope/config.py +165 -0
  86. evalscope/constants.py +150 -0
  87. evalscope/evaluator/__init__.py +3 -0
  88. evalscope/evaluator/evaluator.py +689 -0
  89. evalscope/evaluator/rating_eval.py +178 -0
  90. evalscope/evaluator/reviewer/__init__.py +1 -0
  91. evalscope/evaluator/reviewer/auto_reviewer.py +411 -0
  92. evalscope/metrics/__init__.py +1 -0
  93. evalscope/metrics/bundled_rouge_score/__init__.py +14 -0
  94. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +342 -0
  95. evalscope/metrics/code_metric.py +104 -0
  96. evalscope/metrics/math_accuracy.py +60 -0
  97. evalscope/metrics/metrics.py +405 -0
  98. evalscope/metrics/rouge_metric.py +129 -0
  99. evalscope/models/__init__.py +4 -0
  100. evalscope/models/custom/__init__.py +4 -0
  101. evalscope/models/custom/custom_model.py +53 -0
  102. evalscope/models/dummy_chat_model.py +50 -0
  103. evalscope/models/model.py +88 -0
  104. evalscope/models/model_adapter.py +586 -0
  105. evalscope/models/openai_model.py +103 -0
  106. evalscope/models/template.py +1446 -0
  107. evalscope/perf/__init__.py +0 -0
  108. evalscope/perf/_logging.py +32 -0
  109. evalscope/perf/api_plugin_base.py +60 -0
  110. evalscope/perf/custom_api.py +87 -0
  111. evalscope/perf/dashscope_api.py +84 -0
  112. evalscope/perf/dataset_plugin_base.py +64 -0
  113. evalscope/perf/datasets/__init__.py +0 -0
  114. evalscope/perf/datasets/line_by_line.py +18 -0
  115. evalscope/perf/datasets/longalpaca_12k.py +20 -0
  116. evalscope/perf/datasets/openqa.py +22 -0
  117. evalscope/perf/how_to_analysis_result.py +24 -0
  118. evalscope/perf/http_client.py +756 -0
  119. evalscope/perf/openai_api.py +130 -0
  120. evalscope/perf/plugin_registry.py +35 -0
  121. evalscope/perf/query_parameters.py +42 -0
  122. evalscope/perf/server_sent_event.py +43 -0
  123. evalscope/preprocess/__init__.py +1 -0
  124. evalscope/preprocess/tokenizers/__init__.py +0 -0
  125. evalscope/preprocess/tokenizers/gpt2_tokenizer.py +221 -0
  126. evalscope/registry/__init__.py +1 -0
  127. evalscope/registry/tasks/arc.yaml +29 -0
  128. evalscope/registry/tasks/bbh.yaml +27 -0
  129. evalscope/registry/tasks/bbh_mini.yaml +27 -0
  130. evalscope/registry/tasks/ceval.yaml +27 -0
  131. evalscope/registry/tasks/ceval_mini.yaml +27 -0
  132. evalscope/registry/tasks/cmmlu.yaml +27 -0
  133. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +28 -0
  134. evalscope/registry/tasks/general_qa.yaml +27 -0
  135. evalscope/registry/tasks/gsm8k.yaml +29 -0
  136. evalscope/registry/tasks/mmlu.yaml +29 -0
  137. evalscope/registry/tasks/mmlu_mini.yaml +27 -0
  138. evalscope/run.py +404 -0
  139. evalscope/run_arena.py +204 -0
  140. evalscope/run_ms.py +140 -0
  141. evalscope/summarizer.py +144 -0
  142. evalscope/third_party/__init__.py +1 -0
  143. evalscope/third_party/toolbench_static/__init__.py +3 -0
  144. evalscope/third_party/toolbench_static/eval.py +219 -0
  145. evalscope/third_party/toolbench_static/infer.py +278 -0
  146. evalscope/third_party/toolbench_static/llm/__init__.py +1 -0
  147. evalscope/third_party/toolbench_static/llm/swift_infer.py +45 -0
  148. evalscope/third_party/toolbench_static/toolbench_static.py +50 -0
  149. evalscope/tools/__init__.py +1 -0
  150. evalscope/tools/combine_reports.py +140 -0
  151. evalscope/tools/gen_mmlu_subject_mapping.py +90 -0
  152. evalscope/tools/rewrite_eval_results.py +95 -0
  153. evalscope/utils/__init__.py +4 -0
  154. evalscope/utils/arena_utils.py +247 -0
  155. evalscope/utils/completion_parsers.py +87 -0
  156. evalscope/utils/logger.py +64 -0
  157. evalscope/utils/task_cfg_parser.py +10 -0
  158. evalscope/utils/task_utils.py +19 -0
  159. evalscope/utils/utils.py +625 -0
  160. evalscope/version.py +4 -0
  161. evalscope-0.5.0.dist-info/METADATA +566 -0
  162. evalscope-0.5.0.dist-info/RECORD +165 -0
  163. evalscope-0.5.0.dist-info/WHEEL +5 -0
  164. evalscope-0.5.0.dist-info/entry_points.txt +3 -0
  165. evalscope-0.5.0.dist-info/top_level.txt +1 -0
--- /dev/null
+++ evalscope/benchmarks/race/race.py
@@ -0,0 +1,118 @@
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+import datasets
+import pandas as pd
+
+
+_CITATION = """\
+@inproceedings{lai-etal-2017-race,
+    title = "{RACE}: Large-scale {R}e{A}ding Comprehension Dataset From Examinations",
+    author = "Lai, Guokun and
+      Xie, Qizhe and
+      Liu, Hanxiao and
+      Yang, Yiming and
+      Hovy, Eduard",
+    booktitle = "Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing",
+    month = sep,
+    year = "2017",
+    address = "Copenhagen, Denmark",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/D17-1082",
+    doi = "10.18653/v1/D17-1082",
+    pages = "785--794",
+}
+"""
+
+_DESCRIPTION = """\
+RACE is a large-scale reading comprehension dataset with more than 28,000 passages and nearly 100,000 questions.
+"""
+
+_HOMEPAGE = "https://modelscope.cn/datasets/modelscope/race/summary"
+
+_URL = "https://modelscope.cn/api/v1/datasets/modelscope/race/repo?Revision=master&FilePath=race.zip"
+
+task_list = [
+    "high",
+    "middle",
+]
+
+
+class RACEConfig(datasets.BuilderConfig):
+    def __init__(self, **kwargs):
+        super().__init__(version=datasets.Version("1.0.0"), **kwargs)
+
+
+class RACE(datasets.GeneratorBasedBuilder):
+    BUILDER_CONFIGS = [
+        RACEConfig(
+            name=task_name,
+        )
+        for task_name in task_list
+    ]
+
+    def _info(self):
+        features = datasets.Features(
+            {
+                "example_id": datasets.Value("string"),
+                "article": datasets.Value("string"),
+                "answer": datasets.Value("string"),
+                "question": datasets.Value("string"),
+                "options": [datasets.Value("string")],
+            }
+        )
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        data_dir = dl_manager.download_and_extract(_URL)
+        task_name = self.config.name
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    "filepath": os.path.join(
+                        data_dir, f"race/test/{task_name}-00000-of-00001.parquet"
+                    ),
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={
+                    "filepath": os.path.join(
+                        data_dir, f"race/val/{task_name}-00000-of-00001.parquet"
+                    ),
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "filepath": os.path.join(
+                        data_dir, f"race/train/{task_name}-00000-of-00001.parquet"
+                    ),
+                },
+            ),
+        ]
+
+    def _generate_examples(self, filepath):
+        df = pd.read_parquet(filepath)
+        df.columns = ["example_id", "article", "answer", "question", "options"]
+
+        for i, instance in enumerate(df.to_dict(orient="records")):
+            yield i, instance
--- /dev/null
+++ evalscope/benchmarks/race/race_adapter.py
@@ -0,0 +1,229 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os
+import json
+from evalscope.benchmarks.data_adapter import DataAdapter
+from evalscope.metrics.metrics import exact_match, weighted_mean
+from evalscope.utils import normalize_score, jsonl_to_list
+from evalscope.utils.logger import get_logger
+# flake8: noqa
+
+logger = get_logger()
+
+DATASET_ID = 'modelscope/race'
+
+SUBSET_LIST = [
+    "high",
+    "middle"
+]
+
+
+SUBJECT_MAPPING = {"high": "High",
+                   "middle": "Middle"
+                   }
+
+
+class RACEAdapter(DataAdapter):
+
+    choices = ['A', 'B', 'C', 'D']
+
+    def __init__(self,
+                 subset_list: list = None,
+                 metric_list: list = None,
+                 few_shot_num: int = None,
+                 train_split: str = 'train',
+                 eval_split: str = 'test',
+                 **kwargs):
+
+        if subset_list is None:
+            subset_list = SUBSET_LIST
+
+        if metric_list is None:
+            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
+
+        if few_shot_num is None:
+            logger.info(f'Set 3-shot examples by system for RACE.')
+            few_shot_num = 3
+
+        if few_shot_num > 3:
+            logger.warning(f'few_shot_num <= 3 for RACE, but got {few_shot_num}. Use 3-shot by default.')
+            few_shot_num = 3
+
+        super().__init__(subset_list=subset_list,
+                         metric_list=metric_list,
+                         few_shot_num=few_shot_num,
+                         train_split=train_split,
+                         eval_split=eval_split,
+                         **kwargs)
+
+    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
+        data_dict = {}
+        for subset_name in subset_list:
+            data_dict[subset_name] = {}
+            for split in [self.train_split, self.eval_split]:
+                if os.path.exists(dataset_name_or_path):
+                    file_path = os.path.join(dataset_name_or_path, subset_name, f'{split}.jsonl')
+                else:
+                    file_path = os.path.join(work_dir, dataset_name_or_path, subset_name, f'{split}.jsonl')
+                if os.path.exists(file_path):
+                    data_dict[subset_name][split] = jsonl_to_list(file_path)
+
+        return data_dict
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate model prompt from raw input, unify the prompt format for RACE benchmark.
+
+        Args:
+            input_d (dict): The raw input. A single data format of the RACE:
+
+            {'example_id': 'high3680.txt',
+             'article': 'Astronauts on shorter shuttle missions often work very long days. Tasks are scheduled so tightly that break times are often used to finish the day's work. This type of schedule is far too demanding for long missions on the International Space Station(ISS). ISS crewmembers usually live in space for at least a quarter of a year. They work five days on and two days off to _ the normal way they do things on Earth as much as possible. Weekends give the crew valuable time to rest and do a few hours of housework. They can communicate with family and friends by email , internet phone and through private video conferences. While astronauts cannot go to a baseball game or a movie in orbit, there are many familiar activities that they can still enjoy . Before a mission, the family and friends of each ISS crewmember put together a collection of family photos, messages, videos and reading material for the astronauts to look at when they will be floating 370 kilometers above the Earth. During their mission, the crew also receives care packages with CDs, books, magazines, photos and letters . And as from early 2010, the internet became available on the ISS , giving astronauts the chance to do some "web surfing "in their personal time. Besides relaxing with these more common entertainments, astronauts can simply enjoy the experience of living in space. Many astronauts say that one of the most relaxing things to do in space is to look out the window and stare at the universe and the Earth's vast land mass and oceans.',
+             'answer': 'C',
+             'question': 'The passage mainly discusses how astronauts _ .',
+             'options': [
+                 "work for longer missions in space",
+                 "connect with people on the Earth",
+                 "spend their free time in space",
+                 "observe the Earth from space"]}
+
+        Returns:
+            {'data': [(context, continuation), ...]}
+
+        """
+        prompt = 'The following are multiple choice reading comprehension questions (with answers).\n\n'.format(
+            self._format_subject(subset_name)
+        )
+        few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
+
+        context: str = '\n'.join(few_shot_prompts) + '\n'
+        context += self._generate_prompt(input_d=input_d, include_answer=False)
+        context = prompt + context
+
+        full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
+
+        return {'data': [full_prompt], 'multi_choices': self.choices}
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        # Get the gold choice
+        return input_d.get('answer', '')
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+        """
+        Parse the model output to get the answer. Could be the best choice index.
+
+        Args:
+            result: Predicted answer from the model. Usually a string for chat.
+            raw_input_d: The raw input. Depending on the dataset.
+            eval_type: The evaluation type. e.g. 'checkpoint' or 'service' or 'custom'.
+
+        Returns:
+            The parsed answer. Depending on the dataset. Usually a string for chat.
+        """
+        if eval_type == 'checkpoint':
+            return result
+        elif eval_type == 'service':  # TODO: to be implemented
+            return result
+        elif eval_type == 'custom':  # TODO: to be implemented
+            return result
+        else:
+            raise ValueError(f'Unknown eval_type: {eval_type}')
+
+    def match(self, gold: str, pred: str) -> float:
+        return exact_match(gold=gold, pred=pred)
+
+    def compute_metric(self, review_res_list: list) -> float:
+        """
+        Compute evaluation result by specific metric.
+
+        Args:
+            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
+
+        Returns:
+            The metric score.
+        """
+        items = [(score, 1.0) for score in review_res_list]
+        return weighted_mean(items)
+
+    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
+        """
+        Generate report for the evaluation.
+
+        Args:
+            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
+            report_name: The user-defined report name.
+
+        Returns:
+        {
+            "name":"RACE",
+            "metric":"WeightedAverageAccuracy",
+            "score":0.3389,
+            "category":[
+                {
+                    "name":"High",
+                    "score":0.2528,
+                    "subset":[
+                        {
+                            "name":"high",
+                            "score":0.2528
+                        }
+                    ]
+                }
+            ],
+            "total_num":59
+        }
+        """
+        total_num: int = sum([num for _, num in subset_score_map.values()])
+        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
+
+        # Get domain-subject mapping
+        subject_review_map = {}
+        for subset_name, (subset_score, num) in subset_score_map.items():
+            domain_name: str = SUBJECT_MAPPING.get(subset_name)
+            if domain_name in subject_review_map:
+                subject_review_map[domain_name].append((subset_name, subset_score, num))
+            else:
+                subject_review_map[domain_name] = [(subset_name, subset_score, num)]
+
+        # Get domain score
+        category_list = []
+        for domain_name, domain_res_list in subject_review_map.items():
+            domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
+                sum([num for _, _, num in domain_res_list])
+            domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
+            category_list.append({'name': domain_name,
+                                  'score': normalize_score(score=domain_weighted_avg_acc),
+                                  'subset': [{'name': subset_name, 'score': subset_score}
+                                             for subset_name, subset_score, _ in domain_res_list]})
+
+        # Get final dict of report
+        res_map = dict(name=report_name or 'race',
+                       metric=self.metric_list[0]['name'],
+                       score=weighted_avg_acc,
+                       category=category_list,
+                       total_num=total_num)
+
+        return res_map
+
+    @classmethod
+    def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
+
+        input_choices: list = input_d['options']
+
+        example: str = 'Article:\n{}\nQuestion:\n{}'.format(input_d['article'], input_d['question'])
+        for j in range(len(cls.choices)):
+            example += '\n{}. {}'.format(cls.choices[j], input_choices[j])
+
+        example += '\nAnswer:'
+        if include_answer:
+            example += ' {}\n\n'.format(input_d['answer'])
+
+        return example
+
+    @classmethod
+    def _format_subject(cls, subject):
+        l = subject.split('_')
+        s = ''
+        for entry in l:
+            s += ' ' + entry
+        return s
--- /dev/null
+++ evalscope/benchmarks/trivia_qa/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import TriviaQaAdapter, DATASET_ID, SUBSET_LIST
+from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import TriviaQaAdapter as DataAdapterClass
+from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa
--- /dev/null
+++ evalscope/benchmarks/trivia_qa/trivia_qa.py
@@ -0,0 +1,104 @@
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import json
+
+import datasets
+import pandas as pd
+
+
+_CITATION = """\
+@article{2017arXivtriviaqa,
+    author = {{Joshi}, Mandar and {Choi}, Eunsol and {Weld},
+              Daniel and {Zettlemoyer}, Luke},
+    title = "{triviaqa: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension}",
+    journal = {arXiv e-prints},
+    year = 2017,
+    eid = {arXiv:1705.03551},
+    pages = {arXiv:1705.03551},
+    archivePrefix = {arXiv},
+    eprint = {1705.03551},
+}
+"""
+
+_DESCRIPTION = """\
+TriviaqQA is a reading comprehension dataset containing over 650K question-answer-evidence triples.
+"""
+
+_HOMEPAGE = "https://modelscope.cn/datasets/modelscope/trivia_qa/summary"
+
+_URL = "https://modelscope.cn/api/v1/datasets/modelscope/trivia_qa/repo?Revision=master&FilePath=trivia_qa.zip"
+
+task_list = [
+    "default"
+]
+
+
+class TriviaQAConfig(datasets.BuilderConfig):
+    def __init__(self, **kwargs):
+        super().__init__(version=datasets.Version("1.0.0"), **kwargs)
+
+
+class TriviaQA(datasets.GeneratorBasedBuilder):
+    BUILDER_CONFIGS = [
+        TriviaQAConfig(
+            name=task_name,
+        )
+        for task_name in task_list
+    ]
+
+    def _info(self):
+        features = datasets.Features(
+            {
+                "input": [{
+                    "role": datasets.features.Value("string"),
+                    "content": datasets.features.Value("string"),
+                }],
+                "ideal": [datasets.Value("string")],
+            }
+        )
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        data_dir = dl_manager.download_and_extract(_URL)
+        task_name = self.config.name
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    "filepath": os.path.join(
+                        data_dir, f"trivia_qa/test.jsonl"
+                    ),
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split("dev"),
+                gen_kwargs={
+                    "filepath": os.path.join(
+                        data_dir, f"trivia_qa/dev.jsonl"
+                    ),
+                },
+            ),
+        ]
+
+    def _generate_examples(self, filepath):
+        with open(filepath, encoding='utf-8') as f:
+            contents = [json.loads(line) for line in f.readlines()]
+        for i, instance in enumerate(contents):
+            yield i, instance
--- /dev/null
+++ evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py
@@ -0,0 +1,207 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright (c) EleutherAI Inc, and its affiliates.
+import csv
+import os
+from typing import List
+import numpy as np
+
+from evalscope.benchmarks.data_adapter import DataAdapter
+from evalscope.metrics.metrics import exact_match, weighted_mean
+from evalscope.utils.logger import get_logger
+# flake8: noqa
+
+logger = get_logger()
+
+
+DATASET_ID = 'modelscope/trivia_qa'
+SUBSET_LIST = ['default']
+
+
+class TriviaQaAdapter(DataAdapter):
+
+    def __init__(self,
+                 subset_list: list = None,
+                 metric_list: list = None,
+                 few_shot_num: int = None,
+                 train_split: str = 'dev',
+                 eval_split: str = 'test',
+                 **kwargs):
+
+        if subset_list is None:
+            subset_list = SUBSET_LIST
+
+        if metric_list is None:
+            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
+
+        if few_shot_num is None:
+            logger.info(f'few_shot_num is not specified for TriviaQA, use default value: 5')
+            few_shot_num = 5
+
+        super().__init__(subset_list=subset_list,
+                         metric_list=metric_list,
+                         few_shot_num=few_shot_num,
+                         train_split=train_split,
+                         eval_split=eval_split,
+                         **kwargs)
+
+    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
+        data_dict = {}
+        for subset_name in subset_list:
+            data_dict[subset_name] = {}
+            for split in [self.train_split, self.eval_split]:
+                if os.path.exists(dataset_name_or_path):
+                    file_path = os.path.join(dataset_name_or_path, f'trivia-{split}.qa.csv')
+                else:
+                    file_path = os.path.join(work_dir, dataset_name_or_path, f'trivia-{split}.qa.csv')
+                if os.path.exists(file_path):
+                    with open(file_path, 'r', encoding='utf-8') as f:
+                        reader = csv.reader(f, delimiter='\t')
+                        split_data = []
+                        for row in reader:
+                            assert len(row) == 2
+                            question = row[0]
+                            answers = eval(row[1])
+                            split_data.append({
+                                'input': [
+                                    {"role": "system", "content": "Follow the given examples and answer the question."},
+                                    {"role": "user", "content": question}
+                                ],
+                                'ideal': answers
+                            })
+                        data_dict[subset_name][split] = split_data
+
+        return data_dict
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate model prompt from raw input, unify the prompt format for TriviaQA benchmark.
+
+        Args:
+            input_d (dict): The raw input. A single data format of the TriviaQA:
+
+            {
+                "input": [
+                    {"role": "system", "content": "Follow the given examples and answer the question."},
+                    {"role": "user", "content": "Which Lloyd Webber musical premiered in the US on 10th December 1993?"}
+                ],
+                "ideal": [
+                    "Sunset Blvd",
+                    "West Sunset Boulevard",
+                    "Sunset Boulevard",
+                    "Sunset Bulevard",
+                    "Sunset Blvd.",
+                    "sunset boulevard",
+                    "sunset bulevard",
+                    "west sunset boulevard",
+                    "sunset blvd"
+                ]
+            }
+
+        Returns:
+            {'data': [(context, continuation), ...]}
+        """
+        def get_sys_prompt(inp: dict) -> str:
+            return inp['input'][0]['content']
+
+        prompt = get_sys_prompt(input_d)
+        few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
+        context: str = '\n'.join(few_shot_prompts) + '\n'
+        context += self._generate_prompt(input_d=input_d, include_answer=False)
+        full_prompt = prompt + context
+
+        return {'data': [full_prompt]}
+
+    def get_gold_answer(self, input_d: dict) -> list:
+        # Get the gold choice
+        ans: list = input_d.get("ideal", [])
+        return ans
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+        """
+        Parse the model output to get the answer.
+
+        Args:
+            result: Predicted answer from the model. A list of loglikelihood values for inputs pairs.
+            raw_input_d: The raw input. A single data format of the TriviaQA:
+            eval_type: The type of evaluation, e.g. 'checkpoint' or 'service' or 'custom'.
+
+        Returns:
+            The predicted answer.
+        """
+        if eval_type == 'checkpoint':
+            return result
+        elif eval_type == 'service':  # TODO: to be implemented
+            return result
+        elif eval_type == 'custom':  # TODO: to be implemented
+            return result
+        else:
+            raise ValueError(f'Unknown eval_type: {eval_type}')
+
+    def match(self, gold: list, pred: str) -> float:
+        return max([exact_match(gold=ref, pred=pred) for ref in gold])
+
+    def compute_metric(self, review_res_list: list) -> float:
+        """
+        Compute evaluation result by specific metric.
+
+        Args:
+            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
+
+        Returns:
+            The metric score.
+        """
+        items = [(score, 1.0) for score in review_res_list]
+        return weighted_mean(items)
+
+    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
+        """
+        Generate the report for the model output.
+
+        Args:
+            subset_score_map: {subset_name: (score, num), ...}
+            report_name: The user-defined report name.
+
+        Returns:
+        {
+            "name":"TriviaQA",
+            "metric":"WeightedAverageAccuracy",
+            "score":0.3389,
+            "category":[
+                {
+                    "name":"DEFAULT",
+                    "score":0.3389,
+                    "subset":[
+                        {
+                            "name":"default",
+                            "score":0.3389
+                        }
+                    ]
+                }
+            ],
+            "total_num":100
+        }
+        """
+        total_num: int = sum([num for _, num in subset_score_map.values()])
+        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
+        cate_avg_list = [{'name': subset_name, 'score': score} for subset_name, (score, _) in subset_score_map.items()]
+
+        category_d = dict(name='DEFAULT',
+                          score=weighted_avg_acc,
+                          subset=cate_avg_list)
+
+        res_map = dict(name=report_name or 'trivia_qa',
+                       metric=self.metric_list[0]['name'],
+                       score=weighted_avg_acc,
+                       category=[category_d],
+                       total_num=total_num)
+
+        return res_map
+
+    @classmethod
+    def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
+
+        example: str = f"Question: {input_d['input'][1]['content']}\nAnswer:"
+        if include_answer:
+            example += f" {input_d['ideal'][0]}\n\n"
+
+        return example
--- /dev/null
+++ evalscope/benchmarks/truthful_qa/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import TruthfulQaAdapter, DATASET_ID, SUBSET_LIST
+from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import TruthfulQaAdapter as DataAdapterClass
+from evalscope.models.model_adapter import ContinuationLogitsModelAdapter as ModelAdapterClass  # noqa