evalscope 0.8.2__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +11 -3
  3. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  4. evalscope/backend/rag_eval/utils/llm.py +1 -1
  5. evalscope/benchmarks/__init__.py +20 -1
  6. evalscope/benchmarks/arc/__init__.py +0 -5
  7. evalscope/benchmarks/arc/arc_adapter.py +24 -102
  8. evalscope/benchmarks/bbh/__init__.py +0 -4
  9. evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
  10. evalscope/benchmarks/benchmark.py +70 -59
  11. evalscope/benchmarks/ceval/__init__.py +0 -5
  12. evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
  13. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  14. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
  15. evalscope/benchmarks/competition_math/__init__.py +0 -5
  16. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  17. evalscope/benchmarks/data_adapter.py +115 -87
  18. evalscope/benchmarks/general_qa/__init__.py +0 -5
  19. evalscope/benchmarks/general_qa/general_qa_adapter.py +23 -79
  20. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  21. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +21 -101
  22. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  23. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +32 -99
  24. evalscope/benchmarks/humaneval/__init__.py +0 -4
  25. evalscope/benchmarks/humaneval/humaneval_adapter.py +18 -120
  26. evalscope/benchmarks/ifeval/__init__.py +0 -0
  27. evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
  28. evalscope/benchmarks/ifeval/instructions.py +1478 -0
  29. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  30. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  31. evalscope/benchmarks/ifeval/utils.py +134 -0
  32. evalscope/benchmarks/iquiz/__init__.py +0 -0
  33. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  34. evalscope/benchmarks/mmlu/__init__.py +0 -5
  35. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
  36. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  37. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  38. evalscope/benchmarks/race/__init__.py +0 -5
  39. evalscope/benchmarks/race/race_adapter.py +26 -123
  40. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  41. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
  42. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  43. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
  44. evalscope/cli/cli.py +2 -0
  45. evalscope/cli/start_app.py +29 -0
  46. evalscope/collections/__init__.py +3 -0
  47. evalscope/collections/evaluator.py +198 -0
  48. evalscope/collections/sampler.py +138 -0
  49. evalscope/collections/schema.py +126 -0
  50. evalscope/config.py +7 -5
  51. evalscope/constants.py +9 -26
  52. evalscope/evaluator/evaluator.py +87 -121
  53. evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
  54. evalscope/metrics/__init__.py +3 -0
  55. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  56. evalscope/metrics/math_accuracy.py +193 -50
  57. evalscope/metrics/metrics.py +18 -6
  58. evalscope/metrics/named_metrics.py +17 -0
  59. evalscope/metrics/rouge_metric.py +13 -8
  60. evalscope/models/__init__.py +14 -1
  61. evalscope/models/base_adapter.py +52 -0
  62. evalscope/models/chat_adapter.py +138 -0
  63. evalscope/models/choice_adapter.py +211 -0
  64. evalscope/models/custom_adapter.py +67 -0
  65. evalscope/models/local_model.py +74 -0
  66. evalscope/models/model.py +141 -0
  67. evalscope/models/server_adapter.py +111 -0
  68. evalscope/perf/__init__.py +1 -0
  69. evalscope/perf/main.py +0 -1
  70. evalscope/perf/plugin/api/custom_api.py +1 -1
  71. evalscope/perf/plugin/api/openai_api.py +1 -1
  72. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  73. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  74. evalscope/report/__init__.py +5 -0
  75. evalscope/report/app.py +506 -0
  76. evalscope/report/combinator.py +73 -0
  77. evalscope/report/generator.py +80 -0
  78. evalscope/report/utils.py +133 -0
  79. evalscope/run.py +48 -72
  80. evalscope/run_arena.py +1 -1
  81. evalscope/summarizer.py +1 -1
  82. evalscope/utils/__init__.py +1 -1
  83. evalscope/utils/chat_service.py +5 -4
  84. evalscope/utils/io_utils.py +8 -0
  85. evalscope/utils/logger.py +5 -0
  86. evalscope/utils/model_utils.py +15 -2
  87. evalscope/utils/utils.py +3 -25
  88. evalscope/version.py +2 -2
  89. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/METADATA +115 -21
  90. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/RECORD +99 -78
  91. tests/cli/test_collection.py +57 -0
  92. tests/cli/test_run.py +52 -1
  93. tests/rag/test_mteb.py +3 -2
  94. evalscope/models/api/__init__.py +0 -3
  95. evalscope/models/dummy_chat_model.py +0 -49
  96. evalscope/models/model_adapter.py +0 -525
  97. evalscope/models/openai_model.py +0 -103
  98. evalscope/tools/__init__.py +0 -1
  99. evalscope/tools/combine_reports.py +0 -133
  100. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  101. /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
  102. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  103. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
  104. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
  105. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
  106. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/ceval/ceval_adapter.py
@@ -2,8 +2,11 @@
 import csv
 import os
 
-from evalscope.benchmarks.data_adapter import DataAdapter
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType
+from evalscope.metrics import AverageAccuracy
 from evalscope.metrics.metrics import exact_match, weighted_mean
+from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser, normalize_score
 from evalscope.utils.logger import get_logger
 
@@ -11,8 +14,6 @@ from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
-DATASET_ID = 'modelscope/ceval-exam'
-
 SUBSET_LIST = [
     'computer_network',
     'operating_system',
@@ -124,40 +125,29 @@ SUBJECT_MAPPING = {
 }
 
 
+@Benchmark.register(
+    name='ceval',
+    dataset_id='modelscope/ceval-exam',
+    model_adapter=MultiChoiceModelAdapter,
+    subset_list=SUBSET_LIST,
+    metric_list=[AverageAccuracy],
+    few_shot_num=0,
+    train_split='dev',
+    eval_split='val',
+)
 class CEVALAdapter(DataAdapter):
 
     choices = ['A', 'B', 'C', 'D']
 
-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = 'dev',
-                 eval_split: str = 'val',
-                 **kwargs):
-
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-        if few_shot_num is None:
-            # Use 5-shot by default
-            logger.info(f'Set 0-shot examples by default for C-Eval.')
-            few_shot_num = 0
+    def __init__(self, **kwargs):
 
+        few_shot_num = kwargs.get('few_shot_num', 0)
         if few_shot_num > 5:
             logger.warning(f'few_shot_num <= 5 for C-Eval, but got {few_shot_num}. Use 5-shot by default.')
-            few_shot_num = 5
+            kwargs['few_shot_num'] = 5
+        super().__init__(**kwargs)
 
-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            **kwargs)
+        self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
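
For context on the refactor above: instead of each adapter hard-coding its defaults in __init__, benchmark metadata is now declared once through the Benchmark.register decorator. Below is a minimal sketch of what such a decorator-based registry typically does; the BENCHMARK_REGISTRY dict and BenchmarkMeta dataclass are illustrative assumptions, not code taken from evalscope (its actual implementation lives in evalscope/benchmarks/benchmark.py, which this release changes).

    from dataclasses import dataclass, field
    from typing import Any, Dict, List, Type

    # Hypothetical registry keyed by benchmark name, e.g. BENCHMARK_REGISTRY['ceval'].
    BENCHMARK_REGISTRY: Dict[str, 'BenchmarkMeta'] = {}

    @dataclass
    class BenchmarkMeta:
        name: str
        dataset_id: str
        data_adapter: Type
        model_adapter: Type
        subset_list: List[str] = field(default_factory=list)
        metric_list: List[Any] = field(default_factory=list)
        few_shot_num: int = 0
        train_split: str = 'train'
        eval_split: str = 'test'

    class Benchmark:

        @classmethod
        def register(cls, name: str, dataset_id: str, **defaults):
            # Record per-benchmark defaults at import time so an evaluator can
            # look them up by name instead of each adapter hard-coding them.
            def decorator(adapter_cls):
                BENCHMARK_REGISTRY[name] = BenchmarkMeta(
                    name=name, dataset_id=dataset_id, data_adapter=adapter_cls, **defaults)
                return adapter_cls

            return decorator
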
@@ -217,13 +207,13 @@ class CEVALAdapter(DataAdapter):
         subject_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
         full_prompt = f'以下是中国关于{subject_name}考试的单项选择题,请选出其中的正确答案。\n' + full_prompt
 
-        return {'data': [full_prompt], 'multi_choices': self.choices}
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
         return input_d.get('answer', '')
 
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
         """
         Parse the model output to get the answer. Could be the best choice index.
 
@@ -235,11 +225,11 @@ class CEVALAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if eval_type == 'checkpoint':
+        if eval_type == EvalType.CHECKPOINT:
             return result
-        elif eval_type == 'service':
+        elif eval_type == EvalType.SERVICE:
             return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
-        elif eval_type == 'custom':
+        elif eval_type == EvalType.CUSTOM:
             return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
         else:
             raise ValueError(f'Invalid eval_type: {eval_type}')
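
The string literals replaced above ('checkpoint', 'service', 'custom') suggest that EvalType in evalscope/constants.py is a plain constants holder. A minimal sketch under that assumption; the class body below is inferred, not copied from the source:

    class EvalType:
        # Values mirror the old string literals; the class layout is an assumption.
        CHECKPOINT = 'checkpoint'  # locally loaded model checkpoint
        SERVICE = 'service'        # OpenAI-compatible API service
        CUSTOM = 'custom'          # user-supplied custom model
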
@@ -247,97 +237,6 @@ class CEVALAdapter(DataAdapter):
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
 
-    def compute_metric(self, review_res_list: list) -> float:
-        """
-        Compute evaluation result by specific metric.
-
-        Args:
-            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-        Returns:
-            The metric score.
-        """
-        items = [(score, 1.0) for score in review_res_list]
-        return weighted_mean(items)
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate report for the evaluation.
-
-        Args:
-            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-            report_name: The user-defined report name.
-
-        Returns:
-        {
-            "name":"C-Eval",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"STEM",
-                    "score":0.2528,
-                    "subset":[
-                        {
-                            "name":"computer_network",
-                            "score":0.2632
-                        },
-                        {
-                            "name":"operating_system",
-                            "score":0.3157
-                        },
-                        {
-                            "name":"computer_architecture",
-                            "score":0.4285
-                        }
-                    ]
-                }
-            ],
-            "total_num":59
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-
-        # Get domain-subject mapping
-        subject_review_map = {}
-        for subset_name, (subset_score, num) in subset_score_map.items():
-            domain_name: str = SUBJECT_MAPPING.get(subset_name)[2] if SUBJECT_MAPPING.get(subset_name) else 'DEFAULT'
-            if domain_name in subject_review_map:
-                subject_review_map[domain_name].append((subset_name, subset_score, num))
-            else:
-                subject_review_map[domain_name] = [(subset_name, subset_score, num)]
-
-        # Get domain score
-        category_list = []
-        for domain_name, domain_res_list in subject_review_map.items():
-            domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
-                sum([num for _, _, num in domain_res_list])
-            domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
-            category_list.append({
-                'name':
-                domain_name,
-                'score':
-                domain_weighted_avg_acc,
-                'subset': [{
-                    'name': subset_name,
-                    'score': normalize_score(score=subset_score)
-                } for subset_name, subset_score, _ in domain_res_list]
-            })
-
-        category_list = sorted(category_list, key=lambda x: x['name'])
-
-        # Get final dict of report
-        res_map = dict(
-            name=report_name or 'ceval',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=category_list,
-            total_num=total_num)
-
-        return res_map
-
     @classmethod
     def _format_example(cls, input_d: dict, include_answer=True):
         example = '问题:' + input_d['question']
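
The removed compute_metric and gen_report above re-implemented the same weighted-average aggregation in every adapter; with the category_map now set in __init__ and the shared evalscope/report package added in this release (see report/generator.py in the file list), that duplication goes away. For reference, a minimal sketch of the aggregation the removed code performed; the helper name and the per-subset sample counts are made up for illustration:

    from typing import Dict, Tuple

    def weighted_average_accuracy(subset_score_map: Dict[str, Tuple[float, int]]) -> float:
        # subset_score_map: {subset_name: (mean_score, num_samples)}, as in the removed gen_report.
        total_num = sum(num for _, num in subset_score_map.values())
        return sum(score * num for score, num in subset_score_map.values()) / total_num

    # Subset scores taken from the removed docstring example; the counts (19/19/21) are invented.
    scores = {
        'computer_network': (0.2632, 19),
        'operating_system': (0.3157, 19),
        'computer_architecture': (0.4285, 21),
    }
    print(round(weighted_average_accuracy(scores), 4))
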
evalscope/benchmarks/cmmlu/__init__.py
@@ -1,6 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.cmmlu.cmmlu_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST
-from evalscope.benchmarks.cmmlu.cmmlu_adapter import CMMLUAdapter
-from evalscope.benchmarks.cmmlu.cmmlu_adapter import CMMLUAdapter as DataAdapterClass
-from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass  # noqa
evalscope/benchmarks/cmmlu/cmmlu_adapter.py
@@ -3,8 +3,10 @@
 import csv
 import os
 
-from evalscope.benchmarks.data_adapter import DataAdapter
-from evalscope.metrics.metrics import exact_match, weighted_mean
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import EvalType
+from evalscope.metrics import AverageAccuracy, exact_match
+from evalscope.models import MultiChoiceModelAdapter
 from evalscope.utils import ResponseParser, normalize_score
 from evalscope.utils.logger import get_logger
 
@@ -12,8 +14,6 @@ from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
-DATASET_ID = 'modelscope/cmmlu'
-
 SUBSET_LIST = [
     'agronomy', 'anatomy', 'ancient_chinese', 'arts', 'astronomy', 'business_ethics', 'chinese_civil_service_exam',
     'chinese_driving_rule', 'chinese_food_culture', 'chinese_foreign_policy', 'chinese_history', 'chinese_literature',
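
The import change above also swaps the old dict-style metric entries ({'name': 'WeightedAverageAccuracy', 'object': weighted_mean}) for named metric objects such as AverageAccuracy from evalscope.metrics (added via metrics/named_metrics.py in the file list). A rough sketch of what such an object could look like; the Metric class and its field names are assumptions, not code visible in this diff:

    from dataclasses import dataclass
    from typing import Callable, List

    @dataclass
    class Metric:
        # Assumed shape of a named metric: a label plus an aggregation callable.
        name: str
        object: Callable[[List[float]], float]

    def mean(scores: List[float]) -> float:
        return sum(scores) / len(scores) if scores else 0.0

    # Illustrative stand-in for the AverageAccuracy object imported by the adapters.
    AverageAccuracy = Metric(name='AverageAccuracy', object=mean)
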
@@ -101,31 +101,24 @@ SUBJECT_MAPPING = {
 }
 
 
+@Benchmark.register(
+    name='cmmlu',
+    dataset_id='modelscope/cmmlu',
+    model_adapter=MultiChoiceModelAdapter,
+    subset_list=SUBSET_LIST,
+    metric_list=[AverageAccuracy],
+    few_shot_num=5,
+    train_split='dev',
+    eval_split='test',
+)
 class CMMLUAdapter(DataAdapter):
 
     choices = ['A', 'B', 'C', 'D']
 
-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 few_shot_num: int = 5,
-                 train_split: str = 'dev',
-                 eval_split: str = 'test',
-                 **kwargs):
-
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
 
-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            **kwargs)
+        self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -181,13 +174,13 @@ class CMMLUAdapter(DataAdapter):
 
         full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
 
-        return {'data': [full_prompt], 'multi_choices': self.choices}
+        return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': prompt}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
         return input_d.get('Answer', '')
 
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
         """
         Parse the model output to get the answer. Could be the best choice index.
 
@@ -199,11 +192,11 @@ class CMMLUAdapter(DataAdapter):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-        if eval_type == 'checkpoint':
+        if eval_type == EvalType.CHECKPOINT:
             return result
-        elif eval_type == 'service':
+        elif eval_type == EvalType.SERVICE:
             return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
-        elif eval_type == 'custom':
+        elif eval_type == EvalType.CUSTOM:
             return ResponseParser.parse_first_option_with_choices(result, self.choices)  # TODO: to be checked !
         else:
             raise ValueError(f'Invalid eval_type: {eval_type}')
@@ -211,94 +204,6 @@ class CMMLUAdapter(DataAdapter):
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
 
-    def compute_metric(self, review_res_list: list) -> float:
-        """
-        Compute evaluation result by specific metric.
-
-        Args:
-            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-        Returns:
-            The metric score.
-        """
-        items = [(score, 1.0) for score in review_res_list]
-        return weighted_mean(items)
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate report for the evaluation.
-
-        Args:
-            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
-            report_name: the user-defined report name. Default: None
-
-        Returns:
-        {
-            "name":"CMMLU",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"STEM",
-                    "score":0.2528,
-                    "subset":[
-                        {
-                            "name":"computer_network",
-                            "score":0.2632
-                        },
-                        {
-                            "name":"operating_system",
-                            "score":0.3157
-                        },
-                        {
-                            "name":"computer_architecture",
-                            "score":0.4285
-                        }
-                    ]
-                }
-            ],
-            "total_num":59
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-
-        # Get domain-subject mapping
-        subject_review_map = {}
-        for subset_name, (subset_score, num) in subset_score_map.items():
-            domain_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
-            if domain_name in subject_review_map:
-                subject_review_map[domain_name].append((subset_name, subset_score, num))
-            else:
-                subject_review_map[domain_name] = [(subset_name, subset_score, num)]
-
-        # Get domain score
-        category_list = []
-        for domain_name, domain_res_list in subject_review_map.items():
-            domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
-                sum([num for _, _, num in domain_res_list])
-            domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
-            category_list.append({
-                'name':
-                domain_name,
-                'score':
-                domain_weighted_avg_acc,
-                'subset': [{
-                    'name': subset_name,
-                    'score': normalize_score(subset_score)
-                } for subset_name, subset_score, _ in domain_res_list]
-            })
-
-        # Get final dict of report
-        res_map = dict(
-            name=report_name or 'cmmlu',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=category_list,
-            total_num=total_num)
-
-        return res_map
-
     @classmethod
     def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
 
evalscope/benchmarks/competition_math/__init__.py
@@ -1,6 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.competition_math.competition_math_adapter import DATASET_ID, SUBSET_LIST
-from evalscope.benchmarks.competition_math.competition_math_adapter import CompetitionMathAdapter
-from evalscope.benchmarks.competition_math.competition_math_adapter import CompetitionMathAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa
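
Taken together, the adapters above now register themselves by name ('ceval', 'cmmlu', and likewise the new 'ifeval', 'iquiz' and 'mmlu_pro' benchmarks in the file list), and an evaluation run selects them by that name. A hedged usage sketch based on evalscope's TaskConfig/run_task entry points (config.py and run.py are touched in this release); the exact argument names are assumptions and should be checked against the 0.10.0 API:

    from evalscope.run import run_task
    from evalscope.config import TaskConfig

    # Argument names are assumptions based on evalscope's documented usage,
    # not on code visible in this diff.
    task_cfg = TaskConfig(
        model='Qwen/Qwen2.5-0.5B-Instruct',  # local path or ModelScope model id
        datasets=['ceval', 'cmmlu'],         # names registered via Benchmark.register
        limit=5,                             # small sample for a smoke test
    )
    run_task(task_cfg=task_cfg)
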