evalscope-0.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. evalscope/__init__.py +3 -0
  2. evalscope/backend/__init__.py +3 -0
  3. evalscope/backend/base.py +27 -0
  4. evalscope/backend/opencompass/__init__.py +3 -0
  5. evalscope/backend/opencompass/api_meta_template.py +64 -0
  6. evalscope/backend/opencompass/backend_manager.py +247 -0
  7. evalscope/backend/opencompass/tasks/__init__.py +1 -0
  8. evalscope/backend/opencompass/tasks/eval_api.py +30 -0
  9. evalscope/backend/opencompass/tasks/eval_datasets.py +71 -0
  10. evalscope/backend/vlm_eval_kit/__init__.py +1 -0
  11. evalscope/backend/vlm_eval_kit/backend_manager.py +153 -0
  12. evalscope/benchmarks/__init__.py +4 -0
  13. evalscope/benchmarks/arc/__init__.py +5 -0
  14. evalscope/benchmarks/arc/ai2_arc.py +148 -0
  15. evalscope/benchmarks/arc/arc_adapter.py +231 -0
  16. evalscope/benchmarks/bbh/__init__.py +6 -0
  17. evalscope/benchmarks/bbh/bbh_adapter.py +308 -0
  18. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +23 -0
  19. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +25 -0
  20. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +33 -0
  21. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +37 -0
  22. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +72 -0
  23. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +44 -0
  24. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +78 -0
  25. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +28 -0
  26. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +37 -0
  27. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +37 -0
  28. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +37 -0
  29. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +42 -0
  30. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +25 -0
  31. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +43 -0
  32. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +37 -0
  33. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +41 -0
  34. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +63 -0
  35. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +44 -0
  36. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +40 -0
  37. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +30 -0
  38. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +10 -0
  39. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +77 -0
  40. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +40 -0
  41. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +40 -0
  42. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +40 -0
  43. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +28 -0
  44. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +17 -0
  45. evalscope/benchmarks/benchmark.py +65 -0
  46. evalscope/benchmarks/ceval/__init__.py +5 -0
  47. evalscope/benchmarks/ceval/ceval_adapter.py +340 -0
  48. evalscope/benchmarks/ceval/ceval_exam.py +159 -0
  49. evalscope/benchmarks/cmmlu/__init__.py +5 -0
  50. evalscope/benchmarks/cmmlu/cmmlu.py +166 -0
  51. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +369 -0
  52. evalscope/benchmarks/competition_math/__init__.py +5 -0
  53. evalscope/benchmarks/competition_math/competition_math.py +88 -0
  54. evalscope/benchmarks/competition_math/competition_math_adapter.py +470 -0
  55. evalscope/benchmarks/data_adapter.py +263 -0
  56. evalscope/benchmarks/general_qa/__init__.py +5 -0
  57. evalscope/benchmarks/general_qa/general_qa_adapter.py +186 -0
  58. evalscope/benchmarks/gsm8k/__init__.py +5 -0
  59. evalscope/benchmarks/gsm8k/gsm8k.py +127 -0
  60. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +236 -0
  61. evalscope/benchmarks/hellaswag/__init__.py +5 -0
  62. evalscope/benchmarks/hellaswag/hellaswag.py +116 -0
  63. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +222 -0
  64. evalscope/benchmarks/humaneval/__init__.py +5 -0
  65. evalscope/benchmarks/humaneval/humaneval.py +82 -0
  66. evalscope/benchmarks/humaneval/humaneval_adapter.py +21 -0
  67. evalscope/benchmarks/mmlu/__init__.py +5 -0
  68. evalscope/benchmarks/mmlu/mmlu.py +174 -0
  69. evalscope/benchmarks/mmlu/mmlu_adapter.py +375 -0
  70. evalscope/benchmarks/race/__init__.py +5 -0
  71. evalscope/benchmarks/race/race.py +118 -0
  72. evalscope/benchmarks/race/race_adapter.py +229 -0
  73. evalscope/benchmarks/trivia_qa/__init__.py +5 -0
  74. evalscope/benchmarks/trivia_qa/trivia_qa.py +104 -0
  75. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +207 -0
  76. evalscope/benchmarks/truthful_qa/__init__.py +5 -0
  77. evalscope/benchmarks/truthful_qa/truthful_qa.py +167 -0
  78. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +351 -0
  79. evalscope/cache.py +98 -0
  80. evalscope/cli/__init__.py +1 -0
  81. evalscope/cli/base.py +20 -0
  82. evalscope/cli/cli.py +26 -0
  83. evalscope/cli/start_perf.py +37 -0
  84. evalscope/cli/start_server.py +138 -0
  85. evalscope/config.py +165 -0
  86. evalscope/constants.py +150 -0
  87. evalscope/evaluator/__init__.py +3 -0
  88. evalscope/evaluator/evaluator.py +689 -0
  89. evalscope/evaluator/rating_eval.py +178 -0
  90. evalscope/evaluator/reviewer/__init__.py +1 -0
  91. evalscope/evaluator/reviewer/auto_reviewer.py +411 -0
  92. evalscope/metrics/__init__.py +1 -0
  93. evalscope/metrics/bundled_rouge_score/__init__.py +14 -0
  94. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +342 -0
  95. evalscope/metrics/code_metric.py +104 -0
  96. evalscope/metrics/math_accuracy.py +60 -0
  97. evalscope/metrics/metrics.py +405 -0
  98. evalscope/metrics/rouge_metric.py +129 -0
  99. evalscope/models/__init__.py +4 -0
  100. evalscope/models/custom/__init__.py +4 -0
  101. evalscope/models/custom/custom_model.py +53 -0
  102. evalscope/models/dummy_chat_model.py +50 -0
  103. evalscope/models/model.py +88 -0
  104. evalscope/models/model_adapter.py +586 -0
  105. evalscope/models/openai_model.py +103 -0
  106. evalscope/models/template.py +1446 -0
  107. evalscope/perf/__init__.py +0 -0
  108. evalscope/perf/_logging.py +32 -0
  109. evalscope/perf/api_plugin_base.py +60 -0
  110. evalscope/perf/custom_api.py +87 -0
  111. evalscope/perf/dashscope_api.py +84 -0
  112. evalscope/perf/dataset_plugin_base.py +64 -0
  113. evalscope/perf/datasets/__init__.py +0 -0
  114. evalscope/perf/datasets/line_by_line.py +18 -0
  115. evalscope/perf/datasets/longalpaca_12k.py +20 -0
  116. evalscope/perf/datasets/openqa.py +22 -0
  117. evalscope/perf/how_to_analysis_result.py +24 -0
  118. evalscope/perf/http_client.py +756 -0
  119. evalscope/perf/openai_api.py +130 -0
  120. evalscope/perf/plugin_registry.py +35 -0
  121. evalscope/perf/query_parameters.py +42 -0
  122. evalscope/perf/server_sent_event.py +43 -0
  123. evalscope/preprocess/__init__.py +1 -0
  124. evalscope/preprocess/tokenizers/__init__.py +0 -0
  125. evalscope/preprocess/tokenizers/gpt2_tokenizer.py +221 -0
  126. evalscope/registry/__init__.py +1 -0
  127. evalscope/registry/tasks/arc.yaml +29 -0
  128. evalscope/registry/tasks/bbh.yaml +27 -0
  129. evalscope/registry/tasks/bbh_mini.yaml +27 -0
  130. evalscope/registry/tasks/ceval.yaml +27 -0
  131. evalscope/registry/tasks/ceval_mini.yaml +27 -0
  132. evalscope/registry/tasks/cmmlu.yaml +27 -0
  133. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +28 -0
  134. evalscope/registry/tasks/general_qa.yaml +27 -0
  135. evalscope/registry/tasks/gsm8k.yaml +29 -0
  136. evalscope/registry/tasks/mmlu.yaml +29 -0
  137. evalscope/registry/tasks/mmlu_mini.yaml +27 -0
  138. evalscope/run.py +404 -0
  139. evalscope/run_arena.py +204 -0
  140. evalscope/run_ms.py +140 -0
  141. evalscope/summarizer.py +144 -0
  142. evalscope/third_party/__init__.py +1 -0
  143. evalscope/third_party/toolbench_static/__init__.py +3 -0
  144. evalscope/third_party/toolbench_static/eval.py +219 -0
  145. evalscope/third_party/toolbench_static/infer.py +278 -0
  146. evalscope/third_party/toolbench_static/llm/__init__.py +1 -0
  147. evalscope/third_party/toolbench_static/llm/swift_infer.py +45 -0
  148. evalscope/third_party/toolbench_static/toolbench_static.py +50 -0
  149. evalscope/tools/__init__.py +1 -0
  150. evalscope/tools/combine_reports.py +140 -0
  151. evalscope/tools/gen_mmlu_subject_mapping.py +90 -0
  152. evalscope/tools/rewrite_eval_results.py +95 -0
  153. evalscope/utils/__init__.py +4 -0
  154. evalscope/utils/arena_utils.py +247 -0
  155. evalscope/utils/completion_parsers.py +87 -0
  156. evalscope/utils/logger.py +64 -0
  157. evalscope/utils/task_cfg_parser.py +10 -0
  158. evalscope/utils/task_utils.py +19 -0
  159. evalscope/utils/utils.py +625 -0
  160. evalscope/version.py +4 -0
  161. evalscope-0.5.0.dist-info/METADATA +566 -0
  162. evalscope-0.5.0.dist-info/RECORD +165 -0
  163. evalscope-0.5.0.dist-info/WHEEL +5 -0
  164. evalscope-0.5.0.dist-info/entry_points.txt +3 -0
  165. evalscope-0.5.0.dist-info/top_level.txt +1 -0
evalscope/benchmarks/humaneval/humaneval_adapter.py
@@ -0,0 +1,21 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+# flake8: noqa
+
+
+DATASET_ID = 'modelscope/humaneval'
+SUBSET_LIST = ['openai_humaneval']
+
+# Note: ONLY FOR CLASS IMPORT, No implementation here.
+
+# Example:
+# {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"}
+
+
+class HumanevalAdapter:
+    """
+    A placeholder for humaneval adapter, see HumanevalEvaluator for implementation.
+    """
+
+    def __init__(self):
+        ...
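The humaneval_adapter.py stub above only pins the dataset ID and records the HumanEval line format; scoring lives in HumanevalEvaluator. As a hedged illustration of how a record shaped like the example comment is typically verified, the sketch below (not code from this package, using a made-up toy record) concatenates prompt, canonical_solution and test into one program and calls check(entry_point):

# Minimal sketch only; the toy record mimics the HumanEval field layout shown above.
record = {
    'task_id': 'HumanEval/demo',
    'prompt': 'def add(a: int, b: int) -> int:\n    """Return a + b."""\n',
    'canonical_solution': '    return a + b\n',
    'entry_point': 'add',
    'test': 'def check(candidate):\n    assert candidate(1, 2) == 3\n',
}

namespace = {}
# prompt + canonical_solution + test form one executable module, mirroring the HumanEval layout.
exec(record['prompt'] + record['canonical_solution'] + record['test'], namespace)
namespace['check'](namespace[record['entry_point']])  # raises AssertionError on a wrong solution
print('passed')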
evalscope/benchmarks/mmlu/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from evalscope.benchmarks.mmlu.mmlu_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST, MMLUAdapter
+from evalscope.benchmarks.mmlu.mmlu_adapter import MMLUAdapter as DataAdapterClass
+from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa
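This __init__ follows the convention used across the benchmarks packages: each module re-exports its data adapter and the matching model adapter under the fixed names DataAdapterClass and ModelAdapterClass, so a caller can resolve them generically by module path. A hedged sketch, assuming evalscope 0.5.0 is installed (illustrative only):

import importlib

# Resolve a benchmark's adapters without hard-coding class names.
benchmark = importlib.import_module('evalscope.benchmarks.mmlu')
data_adapter = benchmark.DataAdapterClass()      # MMLUAdapter with its default 5-shot settings
model_adapter_cls = benchmark.ModelAdapterClass  # MultiChoiceModelAdapter
print(type(data_adapter).__name__, model_adapter_cls.__name__)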
evalscope/benchmarks/mmlu/mmlu.py
@@ -0,0 +1,174 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# flake8: noqa
+
+import os
+
+import datasets
+import pandas as pd
+
+"""The MMLU dataset on ModelScope hub. READ ONLY, DO NOT MODIFY."""
+
+
+_CITATION = """\
+@article{hendryckstest2021,
+title={Measuring Massive Multitask Language Understanding},
+author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
+journal={Proceedings of the International Conference on Learning Representations (ICLR)},
+year={2021}
+}
+"""
+
+_DESCRIPTION = """\
+Measuring Massive Multitask Language Understanding by Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas
+Mazeika, Dawn Song, and Jacob Steinhardt (ICLR 2021).
+"""
+
+_HOMEPAGE = 'https://modelscope.cn/datasets/modelscope/mmlu/summary'
+
+_LICENSE = 'MIT'
+
+# _URL = "https://people.eecs.berkeley.edu/~hendrycks/data.tar"
+_URL = 'https://modelscope.cn/api/v1/datasets/modelscope/mmlu/repo?Revision=master&FilePath=data.tar'
+
+task_list = [
+    'high_school_european_history',
+    'business_ethics',
+    'clinical_knowledge',
+    'medical_genetics',
+    'high_school_us_history',
+    'high_school_physics',
+    'high_school_world_history',
+    'virology',
+    'high_school_microeconomics',
+    'econometrics',
+    'college_computer_science',
+    'high_school_biology',
+    'abstract_algebra',
+    'professional_accounting',
+    'philosophy',
+    'professional_medicine',
+    'nutrition',
+    'global_facts',
+    'machine_learning',
+    'security_studies',
+    'public_relations',
+    'professional_psychology',
+    'prehistory',
+    'anatomy',
+    'human_sexuality',
+    'college_medicine',
+    'high_school_government_and_politics',
+    'college_chemistry',
+    'logical_fallacies',
+    'high_school_geography',
+    'elementary_mathematics',
+    'human_aging',
+    'college_mathematics',
+    'high_school_psychology',
+    'formal_logic',
+    'high_school_statistics',
+    'international_law',
+    'high_school_mathematics',
+    'high_school_computer_science',
+    'conceptual_physics',
+    'miscellaneous',
+    'high_school_chemistry',
+    'marketing',
+    'professional_law',
+    'management',
+    'college_physics',
+    'jurisprudence',
+    'world_religions',
+    'sociology',
+    'us_foreign_policy',
+    'high_school_macroeconomics',
+    'computer_security',
+    'moral_scenarios',
+    'moral_disputes',
+    'electrical_engineering',
+    'astronomy',
+    'college_biology',
+]
+
+
+class MMLUConfig(datasets.BuilderConfig):
+    def __init__(self, **kwargs):
+        super().__init__(version=datasets.Version('1.0.0'), **kwargs)
+
+
+class MMLU(datasets.GeneratorBasedBuilder):
+    BUILDER_CONFIGS = [
+        MMLUConfig(
+            name=task_name,
+        )
+        for task_name in task_list
+    ]
+
+    def _info(self):
+        features = datasets.Features(
+            {
+                'input': datasets.Value('string'),
+                'A': datasets.Value('string'),
+                'B': datasets.Value('string'),
+                'C': datasets.Value('string'),
+                'D': datasets.Value('string'),
+                'target': datasets.Value('string'),
+            }
+        )
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        data_dir = dl_manager.download_and_extract(_URL)
+        task_name = self.config.name
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    'filepath': os.path.join(
+                        data_dir, 'data', 'test', f'{task_name}_test.csv'
+                    ),
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={
+                    'filepath': os.path.join(
+                        data_dir, 'data', 'val', f'{task_name}_val.csv'
+                    ),
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    'filepath': os.path.join(
+                        data_dir, 'data', 'dev', f'{task_name}_dev.csv'
+                    ),
+                },
+            ),
+        ]
+
+    def _generate_examples(self, filepath):
+        df = pd.read_csv(filepath)
+        df.columns = ['input', 'A', 'B', 'C', 'D', 'target']
+
+        for i, instance in enumerate(df.to_dict(orient='records')):
+            yield i, instance
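mmlu.py above is a standard Hugging Face datasets loading script: one builder config per subject, with test/validation/train splits mapped to the test/val/dev CSV files inside the ModelScope-hosted data.tar. A hedged sketch of loading one subject through such a script (the local path './mmlu.py' is an assumption, and recent datasets releases may additionally require trust_remote_code=True):

import datasets

# Load one subject's test split via the builder script shown above.
ds = datasets.load_dataset('./mmlu.py', name='astronomy', split='test')
print(len(ds))
print(ds[0]['input'], '->', ds[0]['target'])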
evalscope/benchmarks/mmlu/mmlu_adapter.py
@@ -0,0 +1,375 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import csv
+import os
+
+from evalscope.benchmarks.data_adapter import DataAdapter
+from evalscope.metrics.metrics import exact_match, weighted_mean
+from evalscope.utils import normalize_score, ResponseParser
+from evalscope.utils.logger import get_logger
+# flake8: noqa
+
+logger = get_logger()
+
+DATASET_ID = 'modelscope/mmlu'
+
+SUBSET_LIST = [
+    'high_school_european_history',
+    'business_ethics',
+    'clinical_knowledge',
+    'medical_genetics',
+    'high_school_us_history',
+    'high_school_physics',
+    'high_school_world_history',
+    'virology',
+    'high_school_microeconomics',
+    'econometrics',
+    'college_computer_science',
+    'high_school_biology',
+    'abstract_algebra',
+    'professional_accounting',
+    'philosophy',
+    'professional_medicine',
+    'nutrition',
+    'global_facts',
+    'machine_learning',
+    'security_studies',
+    'public_relations',
+    'professional_psychology',
+    'prehistory',
+    'anatomy',
+    'human_sexuality',
+    'college_medicine',
+    'high_school_government_and_politics',
+    'college_chemistry',
+    'logical_fallacies',
+    'high_school_geography',
+    'elementary_mathematics',
+    'human_aging',
+    'college_mathematics',
+    'high_school_psychology',
+    'formal_logic',
+    'high_school_statistics',
+    'international_law',
+    'high_school_mathematics',
+    'high_school_computer_science',
+    'conceptual_physics',
+    'miscellaneous',
+    'high_school_chemistry',
+    'marketing',
+    'professional_law',
+    'management',
+    'college_physics',
+    'jurisprudence',
+    'world_religions',
+    'sociology',
+    'us_foreign_policy',
+    'high_school_macroeconomics',
+    'computer_security',
+    'moral_scenarios',
+    'moral_disputes',
+    'electrical_engineering',
+    'astronomy',
+    'college_biology',
+]
+
+
+SUBJECT_MAPPING = {'abstract_algebra': ['Abstract Algebra', 'math', 'STEM'],
+                   'anatomy': ['Anatomy', 'health', 'Other'],
+                   'astronomy': ['Astronomy', 'physics', 'STEM'],
+                   'business_ethics': ['Business Ethics', 'business', 'Other'],
+                   'clinical_knowledge': ['Clinical Knowledge', 'health', 'Other'],
+                   'college_biology': ['College Biology', 'biology', 'STEM'],
+                   'college_chemistry': ['College Chemistry', 'chemistry', 'STEM'],
+                   'college_computer_science': ['College Computer Science', 'computer science', 'STEM'],
+                   'college_mathematics': ['College Mathematics', 'math', 'STEM'],
+                   'college_medicine': ['College Medicine', 'health', 'Other'],
+                   'college_physics': ['College Physics', 'physics', 'STEM'],
+                   'computer_security': ['Computer Security', 'computer science', 'STEM'],
+                   'conceptual_physics': ['Conceptual Physics', 'physics', 'STEM'],
+                   'econometrics': ['Econometrics', 'economics', 'Social Science'],
+                   'electrical_engineering': ['Electrical Engineering', 'engineering', 'STEM'],
+                   'elementary_mathematics': ['Elementary Mathematics', 'math', 'STEM'],
+                   'formal_logic': ['Formal Logic', 'philosophy', 'Humanities'],
+                   'global_facts': ['Global Facts', 'other', 'Other'],
+                   'high_school_biology': ['High School Biology', 'biology', 'STEM'],
+                   'high_school_chemistry': ['High School Chemistry', 'chemistry', 'STEM'],
+                   'high_school_computer_science': ['High School Computer Science', 'computer science', 'STEM'],
+                   'high_school_european_history': ['High School European History', 'history', 'Humanities'],
+                   'high_school_geography': ['High School Geography', 'geography', 'Social Science'],
+                   'high_school_government_and_politics': ['High School Government And Politics', 'politics', 'Social Science'],
+                   'high_school_macroeconomics': ['High School Macroeconomics', 'economics', 'Social Science'],
+                   'high_school_mathematics': ['High School Mathematics', 'math', 'STEM'],
+                   'high_school_microeconomics': ['High School Microeconomics', 'economics', 'Social Science'],
+                   'high_school_physics': ['High School Physics', 'physics', 'STEM'],
+                   'high_school_psychology': ['High School Psychology', 'psychology', 'Social Science'],
+                   'high_school_statistics': ['High School Statistics', 'math', 'STEM'],
+                   'high_school_us_history': ['High School Us History', 'history', 'Humanities'],
+                   'high_school_world_history': ['High School World History', 'history', 'Humanities'],
+                   'human_aging': ['Human Aging', 'health', 'Other'],
+                   'human_sexuality': ['Human Sexuality', 'culture', 'Social Science'],
+                   'international_law': ['International Law', 'law', 'Humanities'],
+                   'jurisprudence': ['Jurisprudence', 'law', 'Humanities'],
+                   'logical_fallacies': ['Logical Fallacies', 'philosophy', 'Humanities'],
+                   'machine_learning': ['Machine Learning', 'computer science', 'STEM'],
+                   'management': ['Management', 'business', 'Other'],
+                   'marketing': ['Marketing', 'business', 'Other'],
+                   'medical_genetics': ['Medical Genetics', 'health', 'Other'],
+                   'miscellaneous': ['Miscellaneous', 'other', 'Other'],
+                   'moral_disputes': ['Moral Disputes', 'philosophy', 'Humanities'],
+                   'moral_scenarios': ['Moral Scenarios', 'philosophy', 'Humanities'],
+                   'nutrition': ['Nutrition', 'health', 'Other'],
+                   'philosophy': ['Philosophy', 'philosophy', 'Humanities'],
+                   'prehistory': ['Prehistory', 'history', 'Humanities'],
+                   'professional_accounting': ['Professional Accounting', 'other', 'Other'],
+                   'professional_law': ['Professional Law', 'law', 'Humanities'],
+                   'professional_medicine': ['Professional Medicine', 'health', 'Other'],
+                   'professional_psychology': ['Professional Psychology', 'psychology', 'Social Science'],
+                   'public_relations': ['Public Relations', 'politics', 'Social Science'],
+                   'security_studies': ['Security Studies', 'politics', 'Social Science'],
+                   'sociology': ['Sociology', 'culture', 'Social Science'],
+                   'us_foreign_policy': ['Us Foreign Policy', 'politics', 'Social Science'],
+                   'virology': ['Virology', 'health', 'Other'],
+                   'world_religions': ['World Religions', 'philosophy', 'Humanities'],
+                   }
+
+
+class MMLUAdapter(DataAdapter):
+
+    choices = ['A', 'B', 'C', 'D']
+
+    def __init__(self,
+                 subset_list: list = None,
+                 metric_list: list = None,
+                 few_shot_num: int = None,
+                 train_split: str = 'train',
+                 eval_split: str = 'test',
+                 **kwargs):
+
+        if subset_list is None:
+            subset_list = SUBSET_LIST
+
+        if metric_list is None:
+            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
+
+        if few_shot_num is None:
+            # Use 5-shot by default
+            logger.info(f'Set 5-shot examples by system for MMLU.')
+            few_shot_num = 5
+
+        if few_shot_num > 5:
+            logger.warning(f'few_shot_num <= 5 for MMLU, but got {few_shot_num}. Use 5-shot by default.')
+            few_shot_num = 5
+
+        super().__init__(subset_list=subset_list,
+                         metric_list=metric_list,
+                         few_shot_num=few_shot_num,
+                         train_split=train_split,
+                         eval_split=eval_split,
+                         **kwargs)
+
+    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
+        data_dict = {}
+        for subset_name in subset_list:
+            data_dict[subset_name] = {}
+
+            for split_name in [self.train_split, self.eval_split]:
+                if self.train_split == 'train':
+                    split_name_suffix = 'dev'
+                elif self.eval_split == 'test':
+                    split_name_suffix = 'test'
+                elif self.eval_split == 'validation':
+                    split_name_suffix = 'val'
+                else:
+                    raise ValueError(f'Invalid split name: {split_name}')
+
+                if os.path.exists(dataset_name_or_path):
+                    file_path = os.path.join(dataset_name_or_path, f'{subset_name}_{split_name_suffix}.csv')
+                else:
+                    file_path = os.path.join(work_dir, dataset_name_or_path, f'{subset_name}_{split_name_suffix}.csv')
+
+                if os.path.exists(file_path):
+                    with open(file_path, encoding='utf-8') as f:
+                        rows = []
+                        reader = csv.reader(f)
+                        for row in reader:
+                            if len(row) != 6:
+                                logger.error(f'Mismatch len of row: {row}, len of row should be 6. Skip this row.')
+                                continue
+                            rows.append({
+                                'input': row[0],
+                                'A': row[1],
+                                'B': row[2],
+                                'C': row[3],
+                                'D': row[4],
+                                'target': row[5],
+                            })
+
+                    data_dict[subset_name].update({split_name: rows})
+
+        return data_dict
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate model prompt from raw input, unify the prompt format for MMLU benchmark.
+
+        Args:
+            input_d (dict): The raw input. A single data format of the MMLU:
+
+            {'input': '___________ is based on the idea that customer expectations of the service they will receive shape their perception of the actual service encounter.',
+            'A': 'Service quality.',
+            'B': 'Service action.',
+            'C': 'Service recovery.',
+            'D': 'Service satisfaction.',
+            'target': 'A'}
+
+        Returns:
+            {'data': [(context, continuation), ...]}
+
+        """
+        prompt = 'The following are multiple choice questions (with answers) about {}.\n\n'.format(
+            self._format_subject(subset_name)
+        )
+        few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
+
+        context: str = '\n'.join(few_shot_prompts) + '\n'
+        context += self._generate_prompt(input_d=input_d, include_answer=False)
+        context = prompt + context
+
+        full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
+
+        return {'data': [full_prompt], 'multi_choices': self.choices}
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        # Get the gold choice
+        return input_d.get('target', '')
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+        """
+        Parse the model output to get the answer. Could be the best choice index.
+
+        Args:
+            result: Predicted answer from the model. Usually a string for chat.
+            raw_input_d: The raw input. Depending on the dataset.
+            eval_type: 'checkpoint' or 'service' or 'custom'
+
+        Returns:
+            The parsed answer. Depending on the dataset. Usually a string for chat.
+        """
+        if eval_type == 'checkpoint':
+            return result
+        elif eval_type == 'service':
+            return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked !
+        elif eval_type == 'custom':
+            return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked !
+        else:
+            raise ValueError(f'Invalid eval_type: {eval_type}')
+
+    def match(self, gold: str, pred: str) -> float:
+        return exact_match(gold=gold, pred=pred)
+
+    def compute_metric(self, review_res_list: list) -> float:
+        """
+        Compute evaluation result by specific metric.
+
+        Args:
+            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
+
+        Returns:
+            The metric score.
+        """
+        items = [(score, 1.0) for score in review_res_list]
+        return weighted_mean(items)
+
+    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
+        """
+        Generate report for the evaluation.
+
+        Args:
+            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
+            report_name: The user-defined report name.
+
+        Returns:
+        {
+            "name":"MMLU",
+            "metric":"WeightedAverageAccuracy",
+            "score":0.3389,
+            "category":[
+                {
+                    "name":"STEM",
+                    "score":0.2528,
+                    "subset":[
+                        {
+                            "name":"computer_network",
+                            "score":0.2632
+                        },
+                        {
+                            "name":"operating_system",
+                            "score":0.3157
+                        },
+                        {
+                            "name":"computer_architecture",
+                            "score":0.4285
+                        }
+                    ]
+                }
+            ],
+            "total_num":59
+        }
+        """
+        total_num: int = sum([num for _, num in subset_score_map.values()])
+        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
+        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
+
+        # Get domain-subject mapping
+        subject_review_map = {}
+        for subset_name, (subset_score, num) in subset_score_map.items():
+            domain_name: str = SUBJECT_MAPPING.get(subset_name)[2] if SUBJECT_MAPPING.get(subset_name) else subset_name
+            if domain_name in subject_review_map:
+                subject_review_map[domain_name].append((subset_name, subset_score, num))
+            else:
+                subject_review_map[domain_name] = [(subset_name, subset_score, num)]
+
+        # Get domain score
+        category_list = []
+        for domain_name, domain_res_list in subject_review_map.items():
+            domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
+                                      sum([num for _, _, num in domain_res_list])
+            domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
+            category_list.append({'name': domain_name,
+                                  'score': domain_weighted_avg_acc,
+                                  'subset': [{'name': subset_name, 'score': normalize_score(score=subset_score)}
+                                             for subset_name, subset_score, _ in domain_res_list]})
+
+        category_list = sorted(category_list, key=lambda x: x['name'])
+
+        # Get final dict of report
+        res_map = dict(name=report_name or 'mmlu',
+                       metric=self.metric_list[0]['name'],
+                       score=weighted_avg_acc,
+                       category=category_list,
+                       total_num=total_num)
+
+        return res_map
+
+    @classmethod
+    def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
+
+        input_choices: list = [input_d['A'], input_d['B'], input_d['C'], input_d['D']]
+
+        example: str = input_d['input']
+        for j in range(len(cls.choices)):
+            example += '\n{}. {}'.format(cls.choices[j], input_choices[j])
+
+        example += '\nAnswer:'
+        if include_answer:
+            example += ' {}\n\n'.format(input_d['target'])
+
+        return example
+
+    @classmethod
+    def _format_subject(cls, subject):
+        l = subject.split('_')
+        s = ''
+        for entry in l:
+            s += ' ' + entry
+        return s
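The gen_report docstring above documents the report layout; as a hedged illustration (assuming evalscope 0.5.0 is installed, with toy numbers that are not real results), the sketch below feeds a small subset-score map through MMLUAdapter and prints the resulting breakdown, grouped into domains via SUBJECT_MAPPING:

from evalscope.benchmarks.mmlu.mmlu_adapter import MMLUAdapter

adapter = MMLUAdapter()  # defaults: 5-shot, train/test splits
# {subset_name: (weighted accuracy, number of samples)} -- toy values for illustration only
subset_score_map = {
    'astronomy': (0.40, 25),              # grouped under 'STEM'
    'high_school_geography': (0.55, 20),  # grouped under 'Social Science'
}
report = adapter.gen_report(subset_score_map, report_name='mmlu-demo')
print(report['score'], report['total_num'])
print([category['name'] for category in report['category']])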
evalscope/benchmarks/race/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from evalscope.benchmarks.race.race_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST, RACEAdapter
+from evalscope.benchmarks.race.race_adapter import RACEAdapter as DataAdapterClass
+from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa