evalscope 0.11.0__py3-none-any.whl → 0.12.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (89)
  1. evalscope/arguments.py +3 -1
  2. evalscope/benchmarks/{aime24 → aime}/aime24_adapter.py +3 -3
  3. evalscope/benchmarks/aime/aime25_adapter.py +49 -0
  4. evalscope/benchmarks/arc/arc_adapter.py +14 -17
  5. evalscope/benchmarks/bbh/bbh_adapter.py +6 -11
  6. evalscope/benchmarks/benchmark.py +12 -10
  7. evalscope/benchmarks/ceval/ceval_adapter.py +10 -15
  8. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +11 -16
  9. evalscope/benchmarks/competition_math/competition_math_adapter.py +6 -20
  10. evalscope/benchmarks/data_adapter.py +82 -19
  11. evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
  12. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +15 -22
  13. evalscope/benchmarks/general_qa/general_qa_adapter.py +29 -16
  14. evalscope/benchmarks/gpqa/gpqa_adapter.py +13 -8
  15. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -4
  16. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +8 -12
  17. evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -2
  18. evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -4
  19. evalscope/benchmarks/iquiz/iquiz_adapter.py +9 -5
  20. evalscope/benchmarks/math_500/math_500_adapter.py +9 -4
  21. evalscope/benchmarks/mmlu/mmlu_adapter.py +11 -16
  22. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +24 -36
  23. evalscope/benchmarks/musr/__init__.py +0 -0
  24. evalscope/benchmarks/musr/musr_adapter.py +71 -0
  25. evalscope/benchmarks/process_bench/__init__.py +0 -0
  26. evalscope/benchmarks/process_bench/critique_template.txt +13 -0
  27. evalscope/benchmarks/process_bench/process_bench_adapter.py +99 -0
  28. evalscope/benchmarks/race/race_adapter.py +12 -16
  29. evalscope/benchmarks/simple_qa/__init__.py +0 -0
  30. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +20 -0
  31. evalscope/benchmarks/super_gpqa/__init__.py +0 -0
  32. evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +89 -0
  33. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +191 -0
  34. evalscope/benchmarks/super_gpqa/utils.py +90 -0
  35. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +3 -0
  36. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -4
  37. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +7 -14
  38. evalscope/benchmarks/utils.py +43 -0
  39. evalscope/cli/start_app.py +4 -1
  40. evalscope/cli/start_eval.py +4 -3
  41. evalscope/cli/start_perf.py +4 -2
  42. evalscope/collections/evaluator.py +16 -1
  43. evalscope/config.py +13 -3
  44. evalscope/constants.py +7 -0
  45. evalscope/evaluator/evaluator.py +3 -1
  46. evalscope/metrics/__init__.py +2 -1
  47. evalscope/metrics/metrics.py +23 -2
  48. evalscope/metrics/named_metrics.py +1 -0
  49. evalscope/models/__init__.py +2 -1
  50. evalscope/models/base_adapter.py +32 -6
  51. evalscope/models/chat_adapter.py +4 -1
  52. evalscope/models/choice_adapter.py +4 -0
  53. evalscope/models/custom_adapter.py +2 -0
  54. evalscope/models/local_model.py +3 -2
  55. evalscope/models/register.py +28 -0
  56. evalscope/models/server_adapter.py +107 -29
  57. evalscope/perf/__init__.py +0 -1
  58. evalscope/perf/arguments.py +18 -8
  59. evalscope/perf/http_client.py +8 -6
  60. evalscope/perf/plugin/api/openai_api.py +11 -1
  61. evalscope/perf/utils/analysis_result.py +1 -1
  62. evalscope/perf/utils/benchmark_util.py +6 -2
  63. evalscope/report/app.py +15 -8
  64. evalscope/report/combinator.py +2 -2
  65. evalscope/run.py +6 -5
  66. evalscope/third_party/thinkbench/__init__.py +3 -0
  67. evalscope/third_party/thinkbench/eval.py +429 -0
  68. evalscope/third_party/thinkbench/infer.py +130 -0
  69. evalscope/third_party/thinkbench/resources/critique_template.txt +17 -0
  70. evalscope/third_party/thinkbench/resources/reformat_template.txt +31 -0
  71. evalscope/third_party/thinkbench/tools/__init__.py +0 -0
  72. evalscope/third_party/thinkbench/tools/llm.py +48 -0
  73. evalscope/third_party/thinkbench/tools/utils.py +13 -0
  74. evalscope/third_party/toolbench_static/llm/swift_infer.py +50 -20
  75. evalscope/utils/chat_service.py +1 -0
  76. evalscope/utils/filters.py +59 -0
  77. evalscope/utils/logger.py +3 -3
  78. evalscope/utils/model_utils.py +17 -1
  79. evalscope/utils/utils.py +45 -45
  80. evalscope/version.py +2 -2
  81. {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/METADATA +14 -5
  82. {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/RECORD +89 -65
  83. tests/cli/test_collection.py +1 -1
  84. tests/cli/test_run.py +151 -32
  85. /evalscope/benchmarks/{aime24 → aime}/__init__.py +0 -0
  86. {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/LICENSE +0 -0
  87. {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/WHEEL +0 -0
  88. {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/entry_points.txt +0 -0
  89. {evalscope-0.11.0.dist-info → evalscope-0.12.1.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py ADDED
@@ -0,0 +1,191 @@
+ import os
+ import random
+ import re
+
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.constants import EvalType, OutputType
+ from evalscope.metrics import exact_match
+ from evalscope.utils import logger
+
+ current_dir = os.path.dirname(os.path.abspath(__file__))
+
+ SUBSET_LIST = [
+     'Electronic Science and Technology', 'Philosophy', 'Traditional Chinese Medicine', 'Applied Economics',
+     'Mathematics', 'Physics', 'Clinical Medicine', 'Computer Science and Technology',
+     'Information and Communication Engineering', 'Control Science and Engineering', 'Theoretical Economics', 'Law',
+     'History', 'Basic Medicine', 'Education', 'Materials Science and Engineering', 'Electrical Engineering',
+     'Systems Science', 'Power Engineering and Engineering Thermophysics', 'Military Science', 'Biology',
+     'Business Administration', 'Language and Literature', 'Public Health and Preventive Medicine', 'Political Science',
+     'Chemistry', 'Hydraulic Engineering', 'Chemical Engineering and Technology', 'Pharmacy', 'Geography', 'Art Studies',
+     'Architecture', 'Forestry Engineering', 'Public Administration', 'Oceanography', 'Journalism and Communication',
+     'Nuclear Science and Technology', 'Weapon Science and Technology', 'Naval Architecture and Ocean Engineering',
+     'Environmental Science and Engineering', 'Transportation Engineering', 'Geology', 'Physical Oceanography',
+     'Musicology', 'Stomatology', 'Aquaculture', 'Mechanical Engineering',
+     'Aeronautical and Astronautical Science and Technology', 'Civil Engineering', 'Mechanics',
+     'Petroleum and Natural Gas Engineering', 'Sociology', 'Food Science and Engineering', 'Agricultural Engineering',
+     'Surveying and Mapping Science and Technology', 'Metallurgical Engineering',
+     'Library, Information and Archival Management', 'Mining Engineering', 'Astronomy',
+     'Geological Resources and Geological Engineering', 'Atmospheric Science', 'Optical Engineering', 'Animal Husbandry',
+     'Geophysics', 'Crop Science', 'Management Science and Engineering', 'Psychology', 'Forestry',
+     'Textile Science and Engineering', 'Veterinary Medicine', 'Instrument Science and Technology', 'Physical Education'
+ ]
+
+ SUBSET_MAPPING = {
+     'Electronic Science and Technology': ['Engineering'],
+     'Philosophy': ['Philosophy'],
+     'Traditional Chinese Medicine': ['Medicine'],
+     'Applied Economics': ['Economics'],
+     'Mathematics': ['Science'],
+     'Physics': ['Science'],
+     'Clinical Medicine': ['Medicine'],
+     'Computer Science and Technology': ['Engineering'],
+     'Information and Communication Engineering': ['Engineering'],
+     'Control Science and Engineering': ['Engineering'],
+     'Theoretical Economics': ['Economics'],
+     'Law': ['Law'],
+     'History': ['History'],
+     'Basic Medicine': ['Medicine'],
+     'Education': ['Education'],
+     'Materials Science and Engineering': ['Engineering'],
+     'Electrical Engineering': ['Engineering'],
+     'Systems Science': ['Science'],
+     'Power Engineering and Engineering Thermophysics': ['Engineering'],
+     'Military Science': ['Military Science'],
+     'Biology': ['Science'],
+     'Business Administration': ['Management'],
+     'Language and Literature': ['Literature and Arts'],
+     'Public Health and Preventive Medicine': ['Medicine'],
+     'Political Science': ['Law'],
+     'Chemistry': ['Science'],
+     'Hydraulic Engineering': ['Engineering'],
+     'Chemical Engineering and Technology': ['Engineering'],
+     'Pharmacy': ['Medicine'],
+     'Geography': ['Science'],
+     'Art Studies': ['Literature and Arts'],
+     'Architecture': ['Engineering'],
+     'Forestry Engineering': ['Engineering'],
+     'Public Administration': ['Management'],
+     'Oceanography': ['Science'],
+     'Journalism and Communication': ['Literature and Arts'],
+     'Nuclear Science and Technology': ['Engineering'],
+     'Weapon Science and Technology': ['Engineering'],
+     'Naval Architecture and Ocean Engineering': ['Engineering'],
+     'Environmental Science and Engineering': ['Engineering'],
+     'Transportation Engineering': ['Engineering'],
+     'Geology': ['Science'],
+     'Physical Oceanography': ['Science'],
+     'Musicology': ['Literature and Arts'],
+     'Stomatology': ['Medicine'],
+     'Aquaculture': ['Agronomy'],
+     'Mechanical Engineering': ['Engineering'],
+     'Aeronautical and Astronautical Science and Technology': ['Engineering'],
+     'Civil Engineering': ['Engineering'],
+     'Mechanics': ['Engineering'],
+     'Petroleum and Natural Gas Engineering': ['Engineering'],
+     'Sociology': ['Sociology'],
+     'Food Science and Engineering': ['Engineering'],
+     'Agricultural Engineering': ['Engineering'],
+     'Surveying and Mapping Science and Technology': ['Engineering'],
+     'Metallurgical Engineering': ['Engineering'],
+     'Library, Information and Archival Management': ['Management'],
+     'Mining Engineering': ['Engineering'],
+     'Astronomy': ['Science'],
+     'Geological Resources and Geological Engineering': ['Engineering'],
+     'Atmospheric Science': ['Science'],
+     'Optical Engineering': ['Engineering'],
+     'Animal Husbandry': ['Agronomy'],
+     'Geophysics': ['Science'],
+     'Crop Science': ['Agronomy'],
+     'Management Science and Engineering': ['Management'],
+     'Psychology': ['Education'],
+     'Forestry': ['Agronomy'],
+     'Textile Science and Engineering': ['Engineering'],
+     'Veterinary Medicine': ['Agronomy'],
+     'Instrument Science and Technology': ['Engineering'],
+     'Physical Education': ['Education']
+ }
+
+
+ @Benchmark.register(
+     name='super_gpqa',
+     pretty_name='SuperGPQA',
+     dataset_id='m-a-p/SuperGPQA',
+     model_adapter=OutputType.GENERATION,
+     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
+     subset_list=SUBSET_LIST,
+     metric_list=['AverageAccuracy'],
+     few_shot_num=0,
+     train_split=None,
+     eval_split='train',  # only have train split
+ )
+ class SuperGPQAAdapter(DataAdapter):
+
+     def __init__(self, **kwargs):
+         few_shot_num = kwargs.get('few_shot_num', 0)
+         if few_shot_num > 0 and few_shot_num != 5:
+             logger.warning(
+                 f'Only support few_shot_num 0 or 5 for SuperGPQA, but got {few_shot_num}. Use 5-shot by default.')
+             kwargs['few_shot_num'] = 5
+         super().__init__(**kwargs)
+
+         self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
+         self.category_map = SUBSET_MAPPING
+         self.few_shot_prompt = open(os.path.join(current_dir, 'five_shot_prompt.txt'), encoding='utf-8').read()
+         self.zero_shot_prompt = open(os.path.join(current_dir, 'zero_shot_prompt.txt'), encoding='utf-8').read()
+
+     def load(self, **kwargs):
+         kwargs['subset_list'] = ['default']
+         data_dict = super().load(**kwargs)
+         return self.reformat_subset(data_dict, subset_key='field', format='{}')
+
+     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+         if not self.prompt_template:
+             if few_shot_list:
+                 prompt = self.few_shot_prompt.format(query=input_d['question'])
+             else:
+                 prompt = self.zero_shot_prompt.format(query=input_d['question'])
+         else:
+             prompt = self.prompt_template.format(query=input_d['question'])
+         return self.gen_prompt_data(prompt)
+
+     def get_gold_answer(self, input_d: dict) -> str:
+         # Get the gold choice
+         return input_d.get('answer_letter')
+
+     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+         """
+         Parse the model output to get the answer. Could be the best choice index.
+
+         Args:
+             result: Predicted answer from the model. Usually a string for chat.
+             raw_input_d: The raw input. Depending on the dataset.
+             eval_type: 'checkpoint' or 'service' or 'custom'
+
+         Returns:
+             The parsed answer. Depending on the dataset. Usually a string for chat.
+         """
+         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
+             return result
+         else:
+             from evalscope.benchmarks.super_gpqa.utils import extract_option_content, extract_option_labels
+             sample = raw_input_d
+             if self.few_shot_num == 0:
+                 predict = extract_option_labels(result, 'ABCDEFGHIJ')
+                 if predict is None:
+                     predict = extract_option_content(result, sample['options'])
+                     predict = chr(sample['options'].index(predict) + 65) if predict else None
+             else:
+                 response = result.split('Question:')[0]
+                 predict = extract_option_labels(response, 'ABCDEFGHIJ')
+                 if predict is None:
+                     predict = extract_option_content(response, sample['options'])
+                     predict = chr(sample['options'].index(predict) + 65) if predict else None
+                 if predict is None:
+                     predict = extract_option_labels(result, 'ABCDEFGHIJ')
+                     if predict is None:
+                         predict = extract_option_content(result, sample['options'])
+                         predict = chr(sample['options'].index(predict) + 65) if predict else None
+             return predict
+
+     def match(self, gold: str, pred: str) -> float:
+         return exact_match(gold=gold, pred=pred)
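
Note: the adapter above registers the benchmark as `super_gpqa` and re-groups the single `train` split into per-field subsets via `reformat_subset`. A minimal usage sketch with evalscope's Python API follows; the model name and endpoint are placeholders, and the field values are assumptions based on the `TaskConfig` changes elsewhere in this diff.

    from evalscope.config import TaskConfig
    from evalscope.run import run_task

    task_cfg = TaskConfig(
        model='my-served-model',          # placeholder model name
        api_url='http://127.0.0.1:8000/v1/chat/completions',  # placeholder endpoint
        eval_type='service',              # EvalType.SERVICE
        datasets=['super_gpqa'],          # name registered by the adapter above
        limit=5,                          # small smoke test
    )
    run_task(task_cfg=task_cfg)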
evalscope/benchmarks/super_gpqa/utils.py ADDED
@@ -0,0 +1,90 @@
+ # flake8: noqa
+ import re
+ import timeout_decorator
+
+
+ @timeout_decorator.timeout(5)  # 5 seconds timeout
+ def safe_regex_search(pattern, text, flags=0):
+     try:
+         return re.search(pattern, text, flags)
+     except timeout_decorator.TimeoutError:
+         print(f'Regex match timeout: pattern={pattern}, text={text[:100]}...')
+         return None
+     except Exception as e:
+         print(f'Regex match error: {str(e)}')
+         return None
+
+
+ def extract_option_labels(text, options='ABCDEFGHIJ'):
+     if not isinstance(text, str) or not isinstance(options, str):
+         return 'error'
+
+     text = text.rstrip()
+     last_line = text.split('\n')[-1]
+
+     option_str = ''.join([chr(65 + i) for i in range(len(options))]) if options else 'ABCDEFGHIJ'
+
+     patterns = [
+         # e.g. "The final answer to this question is: A."
+         # "The best option is $\boxed{B}:"
+         # "The correct answer is (C)."
+         f'[Tt]he\s+(?:\w+\s+)?(?:answer|option)(?:\w+\s+)?\s+is?:?\s*(?:[\*\$\\{{(\[\\\\(]*?(?:(?:\\\\boxed|\\\\mathbf|\\\\mathrm|\\\\text){{)?)*\s*([{option_str}])(?:\\\\?\}}?\$?\)?\]?\}}?)*(?:[\s:\.\*)]|$)',
+
+         # e.g. "ANSWER: A"
+         # "Answer: $\boxed{B}."
+         # "ANSWER: (C):"
+         f'(?i:Answer)[\*\s]*:\s*(?:[\*\$\\{{(\[\\\\(]*?(?:(?:\\\\boxed|\\\\mathbf|\\\\mathrm|\\\\text){{)?)*\s*([{option_str}])(?:\\\\?\}}?\$?\)?\]?\}}?)*(?:[\s:\.\*)]|$)',
+
+         # e.g. "A"
+         # "$\boxed{B}$"
+         # "(C)."
+         # "[D]:"
+         f'^[^\w\r\n]*(?:[\*\$\\{{(\[\\\\(]*?(?:(?:\\\\boxed|\\\\mathbf|\\\\mathrm|\\\\text){{)?)*\s*([{option_str}])(?:\\\\?\}}?\$?\)?\]?\}}?)*(?:[\s:\.\*)]|$)',
+     ]
+
+     for pattern in patterns:
+         match = safe_regex_search(pattern, last_line, re.IGNORECASE)
+         if match:
+             return match.group(1)
+
+     for pattern in patterns:
+         match = safe_regex_search(pattern, text, re.IGNORECASE)
+         if match:
+             return match.group(1)
+
+     return None
+
+
+ def extract_option_content(text, options_content=None):
+     if not isinstance(text, str) or not isinstance(options_content, list):
+         return 'error'
+
+     escaped_options_content = [re.escape(option_content) for option_content in options_content]
+     escaped_options_content_str = '|'.join(escaped_options_content)
+
+     text = text.rstrip()
+     last_line = text.split('\n')[-1]
+
+     patterns = [
+         f'[Tt]he\s+(?:\w+\s+)?(?:answer|option)(?:\w+\s+)?\s+is:?\s*(?:[\*\$\\{{\(\[\\\\(]*?(?:(?:\\\\boxed|\\\\mathbf|\\\\mathrm|\\\\text){{)?)*\s*({escaped_options_content_str})(?:\\\\?\}}?\$?\)?\]?\}}?)*(?:[\s:\.\*)]|$)',
+         f'(?i:Answer)\s*(?:[\*\$\\{{\(\[\\\\(]*?(?:(?:\\\\boxed|\\\\mathbf|\\\\mathrm|\\\\text){{)?)*\s*({escaped_options_content_str})(?:\\\\?\}}?\$?\)?\]?\}}?)*(?:[\s:\.\*)]|$)',
+         f'^[^\w\r\n]*(?:[\*\$\\{{\(\[\\\\(]*?(?:(?:\\\\boxed|\\\\mathbf|\\\\mathrm|\\\\text){{)?)*\s*({escaped_options_content_str})(?:\\\\?\}}?\$?\)?\]?\}}?)*(?:[\s:\.\*)]|$)',
+     ]
+
+     for pattern in patterns:
+         match = safe_regex_search(pattern, last_line)
+         if match:
+             if match.group(1) in escaped_options_content:
+                 return options_content[escaped_options_content.index(match.group(1))]
+             else:
+                 return match.group(1)
+
+     for pattern in patterns:
+         match = safe_regex_search(pattern, text)
+         if match:
+             if match.group(1) in escaped_options_content:
+                 return options_content[escaped_options_content.index(match.group(1))]
+             else:
+                 return match.group(1)
+
+     return None
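
Note: both helpers first try the patterns against the last line of the completion and then against the full text, returning `None` (or `'error'` for non-string input) when nothing matches. A rough usage sketch, using the example strings from the code comments above (expected results, not verified output):

    from evalscope.benchmarks.super_gpqa.utils import extract_option_content, extract_option_labels

    extract_option_labels('The correct answer is (C).')   # expected: 'C'
    extract_option_labels('ANSWER: A')                    # expected: 'A'

    # Fallback used by parse_pred_result: match the full option text, then map it back to a letter.
    options = ['red', 'green', 'blue']                                 # illustrative options
    content = extract_option_content('The answer is green.', options)  # expected: 'green'
    letter = chr(options.index(content) + 65) if content else None     # 'B'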
evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt ADDED
@@ -0,0 +1,3 @@
+ Answer the following multiple choice question. There is only one correct answer. The last line of your response should be in the format 'Answer: $LETTER' (without quotes), where LETTER is one of A, B, C, D, E, F, G, H, I, or J.
+
+ {query}
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py CHANGED
@@ -5,8 +5,7 @@ import os
 
  from evalscope.benchmarks import Benchmark
  from evalscope.benchmarks.data_adapter import DataAdapter
- from evalscope.constants import EvalType
- from evalscope.models import ChatGenerationModelAdapter
+ from evalscope.constants import EvalType, OutputType
  from evalscope.utils import get_logger
 
  # flake8: noqa
@@ -16,8 +15,8 @@ logger = get_logger()
 
  @Benchmark.register(
      name='trivia_qa',
+     pretty_name='TriviaQA',
      dataset_id='modelscope/trivia_qa',
-     model_adapter=ChatGenerationModelAdapter,
      subset_list=['default'],
      metric_list=['AverageAccuracy'],
      few_shot_num=5,
@@ -100,7 +99,7 @@ class TriviaQaAdapter(DataAdapter):
          context += self._generate_prompt(input_d=input_d, include_answer=False)
          full_prompt = context
 
-         return {'data': [full_prompt], 'system_prompt': prompt or self.prompt_template}
+         return self.gen_prompt_data(full_prompt)
 
      def get_gold_answer(self, input_d: dict) -> list:
          # Get the gold choice
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py CHANGED
@@ -8,8 +8,7 @@ from typing import List
 
  from evalscope.benchmarks import Benchmark
  from evalscope.benchmarks.data_adapter import DataAdapter
- from evalscope.constants import EvalType
- from evalscope.models import ContinuationLogitsModelAdapter
+ from evalscope.constants import EvalType, OutputType
  from evalscope.utils import get_logger
 
  # flake8: noqa
@@ -21,8 +20,10 @@ logger = get_logger()
 
  @Benchmark.register(
      name='truthful_qa',
+     pretty_name='TruthfulQA',
      dataset_id='modelscope/truthful_qa',
-     model_adapter=ContinuationLogitsModelAdapter,
+     model_adapter=OutputType.CONTINUOUS,
+     output_types=[OutputType.CONTINUOUS, OutputType.GENERATION],
      subset_list=['multiple_choice'],
      metric_list=['AverageAccuracy'],
      few_shot_num=0,
@@ -195,8 +196,7 @@ class TruthfulQaAdapter(DataAdapter):
          else:
              raise ValueError(f'** Unknown subset_name: {subset_name}')
 
-         prompt_d = {'data': ctx_continuation_pair_list}
-         return prompt_d
+         return self.gen_prompt_data(ctx_continuation_pair_list)
 
      def get_gold_answer(self, input_d: dict) -> dict:
          # Get the gold choice
@@ -215,14 +215,7 @@ class TruthfulQaAdapter(DataAdapter):
          Returns:
              The predicted answer.
          """
-         if eval_type == EvalType.CHECKPOINT:
-             return result
-         elif eval_type == EvalType.SERVICE:  # TODO: to be supported !
-             return result
-         elif eval_type == EvalType.CUSTOM:  # TODO: to be supported !
-             return result
-         else:
-             raise ValueError(f'Invalid eval_type: {eval_type}')
+         return result
 
      def match(self, gold: dict, pred: list) -> dict:
          """
@@ -258,7 +251,7 @@ class TruthfulQaAdapter(DataAdapter):
 
          return {'multiple_choice': {'mc1': mc1(mc1_lls), 'mc2': mc2(mc2_lls)}}  # or {'generation': xxx}
 
-     def compute_metric(self, review_res_list: List[dict]) -> List[dict]:
+     def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
          """
          Compute evaluation result by specific metric for each subset.
 
evalscope/benchmarks/utils.py ADDED
@@ -0,0 +1,43 @@
+ from dataclasses import dataclass
+ from functools import wraps
+ from typing import Dict, List, Optional
+
+ from evalscope.constants import EvalType
+ from evalscope.utils.filters import Filter
+
+
+ @dataclass
+ class PromptData:
+     data: List[str]
+     index: Optional[int] = 0
+     system_prompt: Optional[str] = None
+     multi_choices: Optional[List[str]] = None
+
+     def to_dict(self) -> Dict:
+         if self.multi_choices is None:
+             return {
+                 'data': self.data,
+                 'index': self.index,
+                 'system_prompt': self.system_prompt,
+             }
+         else:
+             return {
+                 'data': self.data,
+                 'index': self.index,
+                 'system_prompt': self.system_prompt,
+                 'multi_choices': self.multi_choices,
+             }
+
+
+ def preprocess_decorator(func):
+
+     @wraps(func)
+     def wrapper(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT):
+         filters = self.config_kwargs.get('filters', None)
+         if filters:
+             # Apply filters to the result
+             for filter_name, filter_value in filters.items():
+                 result = Filter.apply(filter_name, result, filter_value)
+         return func(self, result, raw_input_d, eval_type)
+
+     return wrapper
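
Note: `PromptData` is what the adapters' new `gen_prompt_data(...)` calls serialize into, and `preprocess_decorator` lets a benchmark declare response `filters` that run before answer parsing. A rough sketch of the intended wiring; the adapter class and the `remove_until` filter name below are illustrative assumptions (the actual filter names live in the new `evalscope/utils/filters.py`):

    from evalscope.benchmarks.utils import PromptData, preprocess_decorator

    PromptData(data=['<full prompt>'], system_prompt='You are a helpful assistant.').to_dict()
    # -> {'data': ['<full prompt>'], 'index': 0, 'system_prompt': 'You are a helpful assistant.'}

    class MyAdapter:  # illustrative stand-in for a DataAdapter subclass
        def __init__(self):
            # hypothetical filter config; keys must match names registered in evalscope/utils/filters.py
            self.config_kwargs = {'filters': {'remove_until': '</think>'}}

        @preprocess_decorator
        def parse_pred_result(self, result, raw_input_d=None, eval_type='checkpoint'):
            return result.strip()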
evalscope/cli/start_app.py CHANGED
@@ -3,7 +3,6 @@ import os
  from argparse import ArgumentParser
 
  from evalscope.cli.base import CLICommand
- from evalscope.report.app import add_argument, create_app
 
 
  def subparser_func(args):
@@ -22,9 +21,13 @@ class StartAppCMD(CLICommand):
      def define_args(parsers: ArgumentParser):
          """ define args for create pipeline template command.
          """
+         from evalscope.report.app import add_argument
+
          parser = parsers.add_parser(StartAppCMD.name)
          add_argument(parser)
          parser.set_defaults(func=subparser_func)
 
      def execute(self):
+         from evalscope.report.app import create_app
+
          create_app(self.args)
evalscope/cli/start_eval.py CHANGED
@@ -1,10 +1,7 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
- import os
  from argparse import ArgumentParser
 
- from evalscope.arguments import add_argument
  from evalscope.cli.base import CLICommand
- from evalscope.run import run_task
 
 
  def subparser_func(args):
@@ -23,9 +20,13 @@ class EvalCMD(CLICommand):
      def define_args(parsers: ArgumentParser):
          """ define args for create pipeline template command.
          """
+         from evalscope.arguments import add_argument
+
          parser = parsers.add_parser(EvalCMD.name)
          add_argument(parser)
          parser.set_defaults(func=subparser_func)
 
      def execute(self):
+         from evalscope.run import run_task
+
          run_task(self.args)
evalscope/cli/start_perf.py CHANGED
@@ -3,8 +3,6 @@ import os
  from argparse import ArgumentParser
 
  from evalscope.cli.base import CLICommand
- from evalscope.perf.arguments import add_argument
- from evalscope.perf.main import run_perf_benchmark
 
 
  def subparser_func(args):
@@ -23,9 +21,13 @@ class PerfBenchCMD(CLICommand):
      def define_args(parsers: ArgumentParser):
          """ define args for create pipeline template command.
          """
+         from evalscope.perf.arguments import add_argument
+
          parser = parsers.add_parser(PerfBenchCMD.name)
          add_argument(parser)
          parser.set_defaults(func=subparser_func)
 
      def execute(self):
+         from evalscope.perf.main import run_perf_benchmark
+
          run_perf_benchmark(self.args)
evalscope/collections/evaluator.py CHANGED
@@ -97,8 +97,8 @@ class EvaluatorCollection:
          evaluators = {}
          for dataset_name in self.dataset_name_map.keys():
              benchmark = Benchmark.get(dataset_name)
+             model_adapter = initialize_model_adapter(self.task_cfg, benchmark, self.model)
              data_adapter = benchmark.get_data_adapter()
-             model_adapter = initialize_model_adapter(self.task_cfg, benchmark.model_adapter, self.model)
              evaluators[dataset_name] = SimpleEvaluator(dataset_name, data_adapter, model_adapter, self.task_cfg,
                                                         self.outputs)
          return evaluators
@@ -234,6 +234,21 @@ class EvaluatorCollection:
      def get_reviews(self, answers):
          review_file_path = os.path.join(self.outputs.reviews_dir, self.task_cfg.model_id)
          os.makedirs(review_file_path, exist_ok=True)
+
+         if self.task_cfg.use_cache and os.path.exists(review_file_path):
+             logger.warning(
+                 f'Ignore use_cache={self.task_cfg.use_cache}, updating the review file: {review_file_path} ...')
+             if os.path.isdir(review_file_path):
+                 for filename in os.listdir(review_file_path):
+                     file_path = os.path.join(review_file_path, filename)
+                     try:
+                         if os.path.isfile(file_path):
+                             os.remove(file_path)
+                     except Exception as e:
+                         logger.error(f'Error deleting file {file_path}: {e}')
+             else:
+                 os.remove(review_file_path)
+
          reviews = defaultdict(dict)
          for sample in tqdm(self.dataset, desc='Getting reviews'):
              evaluator = self.evaluators[sample.dataset_name]
evalscope/config.py CHANGED
@@ -4,10 +4,12 @@ import copy
  import json
  import os
  from argparse import Namespace
+ from collections import OrderedDict
  from dataclasses import dataclass, field
  from typing import Dict, List, Optional, Union
 
- from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, DEFAULT_WORK_DIR, EvalBackend, EvalStage, EvalType, HubType
+ from evalscope.constants import (DEFAULT_DATASET_CACHE_DIR, DEFAULT_WORK_DIR, EvalBackend, EvalStage, EvalType, HubType,
+                                  OutputType)
  from evalscope.models.custom import CustomModel
  from evalscope.utils import gen_hash
  from evalscope.utils.io_utils import dict_to_yaml, json_to_dict, yaml_to_dict
@@ -17,7 +19,7 @@ logger = get_logger()
 
  cur_path = os.path.dirname(os.path.abspath(__file__))
 
- DEFAULT_MODEL_ARGS = {'revision': 'master', 'precision': 'torch.float16', 'device': 'auto'}
+ DEFAULT_MODEL_ARGS = {'revision': 'master', 'precision': 'torch.float16'}
  DEFAULT_GENERATION_CONFIG = {
      'max_length': 2048,
      'max_new_tokens': 512,
@@ -54,7 +56,7 @@ class TaskConfig:
      eval_config: Union[str, Dict, None] = None
      stage: str = EvalStage.ALL
      limit: Optional[int] = None
-     eval_batch_size: int = 1
+     eval_batch_size: Optional[int] = None
 
      # Cache and working directory arguments
      mem_cache: bool = False  # Deprecated, will be removed in v1.0.0.
@@ -68,6 +70,8 @@ class TaskConfig:
      seed: Optional[int] = 42
      api_url: Optional[str] = None  # Only used for server model
      api_key: Optional[str] = 'EMPTY'  # Only used for server model
+     timeout: Optional[float] = None  # Only used for server model
+     stream: bool = False  # Only used for server model
 
      def __post_init__(self):
          if (not self.model_id) and self.model:
@@ -75,6 +79,12 @@ class TaskConfig:
              self.model_id = type(self.model).__name__
          else:
              self.model_id = os.path.basename(self.model).rstrip(os.sep)
+             # fix path error, see http://github.com/modelscope/evalscope/issues/377
+             self.model_id = self.model_id.replace(':', '-')
+
+         # Set default eval_batch_size based on eval_type
+         if self.eval_batch_size is None:
+             self.eval_batch_size = 8 if self.eval_type == EvalType.SERVICE else 1
 
      def to_dict(self):
          return self.__dict__
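
Note: two behavioural changes land in `__post_init__` above: colons in a served model name are now stripped from `model_id` (they produced invalid output paths, issue 377), and `eval_batch_size` defaults to 8 for service evaluation and 1 otherwise. Roughly, with a placeholder model name and assuming no other required fields:

    from evalscope.config import TaskConfig

    cfg = TaskConfig(model='qwen2.5:7b', eval_type='service')  # placeholder model name
    cfg.model_id         # 'qwen2.5-7b'  -> ':' replaced by '-'
    cfg.eval_batch_size  # 8             -> service default; checkpoint evaluation keeps 1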
evalscope/constants.py CHANGED
@@ -139,6 +139,13 @@ class EvalType:
      SERVICE = 'service'  # model service
 
 
+ class OutputType:
+     LOGITS = 'logits'  # for multiple choice tasks
+     GENERATION = 'generation'  # for text generation tasks and general tasks
+     MULTIPLE_CHOICE = 'multiple_choice_logits'  # for multiple choice tasks
+     CONTINUOUS = 'continuous_logits'  # for continuous tasks
+
+
  class EvalBackend:
      NATIVE = 'Native'
      OPEN_COMPASS = 'OpenCompass'
evalscope/evaluator/evaluator.py CHANGED
@@ -250,6 +250,7 @@ class Evaluator(object):
 
          if self.use_cache and os.path.exists(review_file_path):
              logger.warning(f'Ignore use_cache={self.use_cache}, updating the review file: {review_file_path} ...')
+             os.remove(review_file_path)
 
          for answer_d in tqdm(answers_list, total=len(answers_list), desc=f'Reviewing({subset_name}): '):
              review_id, reviewer_spec = self._generate_review_id(answer_d)
@@ -292,7 +293,8 @@
 
              review_res_list.append(review_res)
 
-         metric_score: List[dict] = self.data_adapter.compute_metric(review_res_list=review_res_list)
+         metric_score: List[dict] = self.data_adapter.compute_metric(
+             review_res_list=review_res_list, reviews_list=reviews_list)
 
          return metric_score
 
evalscope/metrics/__init__.py CHANGED
@@ -1,4 +1,5 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
- from evalscope.metrics.metrics import bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean, weighted_mean
+ from evalscope.metrics.metrics import (bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean,
+                                        simple_f1_score, weighted_mean)
  from evalscope.metrics.named_metrics import *
  from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
evalscope/metrics/metrics.py CHANGED
@@ -13,6 +13,9 @@ from typing import TYPE_CHECKING, Dict, List, Union
 
 
  def mean(arr: list):
+     if not arr:
+         return 0.0
+
      if isinstance(arr[0], list):
          arr = [item for sublist in arr for item in sublist]
      return sum(arr) / len(arr)
@@ -58,6 +61,18 @@ def matthews_corrcoef(items):
      return sklearn.metrics.matthews_corrcoef(golds, preds)
 
 
+ def simple_f1_score(scores: tuple) -> float:
+     score1 = scores[0]
+     score2 = scores[1]
+     score1 = np.mean(score1) if len(score1) > 0 else 0.0
+     score2 = np.mean(score2) if len(score2) > 0 else 0.0
+
+     if score1 == 0 and score2 == 0:
+         return 0.0
+     else:
+         return 2 * score1 * score2 / (score1 + score2)
+
+
  def f1_score(items):
      import sklearn.metrics
 
@@ -126,11 +141,17 @@ def weighted_mean(items: List) -> float:
 
 
  def micro_mean(items):
-     return sum([item.score * item.num for item in items]) / sum([item.num for item in items])
+     try:
+         return sum([item.score * item.num for item in items]) / sum([item.num for item in items])
+     except ZeroDivisionError:
+         return 0.0
 
 
  def macro_mean(items):
-     return sum([item.score for item in items]) / len(items)
+     try:
+         return sum([item.score for item in items]) / len(items)
+     except ZeroDivisionError:
+         return 0.0
 
 
  def weighted_perplexity(items):
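
Note: `simple_f1_score` averages each of the two score lists and returns their harmonic mean, while the new guards make `mean`, `micro_mean`, and `macro_mean` return 0.0 on empty input instead of raising. A worked example:

    from evalscope.metrics import simple_f1_score

    # score1 = mean([1, 0, 1]) = 2/3, score2 = mean([1, 1]) = 1.0
    # F1 = 2 * (2/3) * 1.0 / (2/3 + 1.0) = 0.8
    simple_f1_score(([1, 0, 1], [1, 1]))  # 0.8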