evalscope 0.8.2__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (79)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +10 -3
  3. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  4. evalscope/backend/rag_eval/utils/llm.py +1 -1
  5. evalscope/benchmarks/__init__.py +20 -1
  6. evalscope/benchmarks/arc/__init__.py +0 -5
  7. evalscope/benchmarks/arc/arc_adapter.py +23 -99
  8. evalscope/benchmarks/bbh/__init__.py +0 -4
  9. evalscope/benchmarks/bbh/bbh_adapter.py +19 -89
  10. evalscope/benchmarks/benchmark.py +70 -59
  11. evalscope/benchmarks/ceval/__init__.py +0 -5
  12. evalscope/benchmarks/ceval/ceval_adapter.py +22 -46
  13. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  14. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +20 -41
  15. evalscope/benchmarks/competition_math/__init__.py +0 -5
  16. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  17. evalscope/benchmarks/data_adapter.py +114 -85
  18. evalscope/benchmarks/general_qa/__init__.py +0 -5
  19. evalscope/benchmarks/general_qa/general_qa_adapter.py +16 -19
  20. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  21. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +19 -98
  22. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  23. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -96
  24. evalscope/benchmarks/humaneval/__init__.py +0 -4
  25. evalscope/benchmarks/humaneval/humaneval_adapter.py +16 -117
  26. evalscope/benchmarks/mmlu/__init__.py +0 -5
  27. evalscope/benchmarks/mmlu/mmlu_adapter.py +26 -48
  28. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  29. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  30. evalscope/benchmarks/race/__init__.py +0 -5
  31. evalscope/benchmarks/race/race_adapter.py +25 -53
  32. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  33. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +24 -97
  34. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  35. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +23 -33
  36. evalscope/collections/__init__.py +3 -0
  37. evalscope/collections/evaluator.py +178 -0
  38. evalscope/collections/sampler.py +132 -0
  39. evalscope/collections/schema.py +122 -0
  40. evalscope/config.py +7 -5
  41. evalscope/constants.py +7 -28
  42. evalscope/evaluator/evaluator.py +66 -109
  43. evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
  44. evalscope/metrics/__init__.py +6 -0
  45. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  46. evalscope/metrics/math_accuracy.py +193 -50
  47. evalscope/metrics/metrics.py +7 -4
  48. evalscope/metrics/rouge_metric.py +13 -8
  49. evalscope/models/__init__.py +14 -1
  50. evalscope/models/base_adapter.py +52 -0
  51. evalscope/models/chat_adapter.py +138 -0
  52. evalscope/models/choice_adapter.py +211 -0
  53. evalscope/models/custom_adapter.py +67 -0
  54. evalscope/models/local_model.py +74 -0
  55. evalscope/models/model.py +141 -0
  56. evalscope/models/server_adapter.py +104 -0
  57. evalscope/run.py +37 -66
  58. evalscope/run_arena.py +1 -1
  59. evalscope/utils/__init__.py +1 -1
  60. evalscope/utils/chat_service.py +4 -3
  61. evalscope/utils/io_utils.py +8 -0
  62. evalscope/utils/logger.py +4 -0
  63. evalscope/utils/model_utils.py +10 -0
  64. evalscope/utils/utils.py +3 -25
  65. evalscope/version.py +2 -2
  66. {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/METADATA +32 -15
  67. {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/RECORD +75 -66
  68. tests/cli/test_collection.py +53 -0
  69. tests/cli/test_run.py +43 -1
  70. tests/rag/test_mteb.py +3 -2
  71. evalscope/models/api/__init__.py +0 -3
  72. evalscope/models/dummy_chat_model.py +0 -49
  73. evalscope/models/model_adapter.py +0 -525
  74. evalscope/models/openai_model.py +0 -103
  75. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  76. {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/LICENSE +0 -0
  77. {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/WHEEL +0 -0
  78. {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/entry_points.txt +0 -0
  79. {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/top_level.txt +0 -0

evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py

@@ -5,45 +5,34 @@ import numpy as np
 import os
 from typing import List
 
+from evalscope.benchmarks import Benchmark
 from evalscope.benchmarks.data_adapter import DataAdapter
-from evalscope.metrics.metrics import exact_match, weighted_mean
-from evalscope.utils.logger import get_logger
+from evalscope.constants import EvalType
+from evalscope.metrics import WeightedAverageAccuracy
+from evalscope.metrics.metrics import exact_match
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils import get_logger
 
 # flake8: noqa
 
 logger = get_logger()
 
-DATASET_ID = 'modelscope/trivia_qa'
-SUBSET_LIST = ['default']
-
 
+@Benchmark.register(
+    name='trivia_qa',
+    dataset_id='modelscope/trivia_qa',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['default'],
+    metric_list=[WeightedAverageAccuracy],
+    few_shot_num=5,
+    train_split='dev',
+    eval_split='test',
+)
 class TriviaQaAdapter(DataAdapter):
 
-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = 'dev',
-                 eval_split: str = 'test',
-                 **kwargs):
-
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
+    def __init__(self, **kwargs):
 
-        if few_shot_num is None:
-            logger.info(f'few_shot_num is not specified for TriviaQA, use default value: 5')
-            few_shot_num = 5
-
-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            **kwargs)
+        super().__init__(**kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -113,16 +102,16 @@ class TriviaQaAdapter(DataAdapter):
         few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
         context: str = '\n'.join(few_shot_prompts) + '\n'
         context += self._generate_prompt(input_d=input_d, include_answer=False)
-        full_prompt = prompt + context
+        full_prompt = context
 
-        return {'data': [full_prompt]}
+        return {'data': [full_prompt], 'system_prompt': prompt}
 
     def get_gold_answer(self, input_d: dict) -> list:
         # Get the gold choice
         ans: list = input_d.get('ideal', [])
         return ans
 
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
         """
         Parse the model output to get the answer.
 
@@ -134,73 +123,11 @@ class TriviaQaAdapter(DataAdapter):
         Returns:
             The predicted answer.
         """
-        if eval_type == 'checkpoint':
-            return result
-        elif eval_type == 'service':  # TODO: to be implemented
-            return result
-        elif eval_type == 'custom':  # TODO: to be implemented
-            return result
-        else:
-            raise ValueError(f'Unknown eval_type: {eval_type}')
+        return result
 
     def match(self, gold: list, pred: str) -> float:
-        return max([exact_match(gold=ref, pred=pred) for ref in gold])
-
-    def compute_metric(self, review_res_list: list) -> float:
-        """
-        Compute evaluation result by specific metric.
-
-        Args:
-            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
-
-        Returns:
-            The metric score.
-        """
-        items = [(score, 1.0) for score in review_res_list]
-        return weighted_mean(items)
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Generate the report for the model output.
-
-        Args:
-            subset_score_map: {subset_name: (score, num), ...}
-            report_name: The user-defined report name.
-
-        Returns:
-        {
-            "name":"TriviaQA",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.3389,
-                    "subset":[
-                        {
-                            "name":"default",
-                            "score":0.3389
-                        }
-                    ]
-                }
-            ],
-            "total_num":100
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        cate_avg_list = [{'name': subset_name, 'score': score} for subset_name, (score, _) in subset_score_map.items()]
-
-        category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'trivia_qa',
-            metric=self.metric_list[0]['name'],
-            score=weighted_avg_acc,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
+        is_correct = any([cand in pred for cand in gold])
+        return 1 if is_correct else 0
 
     @classmethod
     def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
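
For orientation, the new `@Benchmark.register` decorator replaces the per-module `DATASET_ID`/`SUBSET_LIST` constants with a central registry. A minimal sketch of how that registry is consumed, based only on the calls visible in `collections/evaluator.py` later in this diff; anything beyond those calls is an assumption:

```python
# Only Benchmark.get, get_data_adapter and .model_adapter appear in this diff;
# everything else in this sketch is assumed.
from evalscope.benchmarks import Benchmark

benchmark = Benchmark.get('trivia_qa')        # entry created by the decorator above
data_adapter = benchmark.get_data_adapter()   # builds TriviaQaAdapter with the registered defaults
model_adapter_cls = benchmark.model_adapter   # ChatGenerationModelAdapter
```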

evalscope/benchmarks/truthful_qa/__init__.py

@@ -1,6 +1 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import DATASET_ID, SUBSET_LIST
-from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import TruthfulQaAdapter
-from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import TruthfulQaAdapter as DataAdapterClass
-from evalscope.models.model_adapter import ContinuationLogitsModelAdapter as ModelAdapterClass  # noqa

evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py

@@ -6,20 +6,31 @@ import numpy as np
 import os
 from typing import List
 
+from evalscope.benchmarks import Benchmark
 from evalscope.benchmarks.data_adapter import DataAdapter
+from evalscope.constants import EvalType
+from evalscope.metrics import WeightedAverageAccuracy
 from evalscope.metrics.metrics import weighted_mean
-from evalscope.utils import normalize_score
-from evalscope.utils.logger import get_logger
+from evalscope.models import ContinuationLogitsModelAdapter
+from evalscope.utils import get_logger, normalize_score
 
 # flake8: noqa
 
 logger = get_logger()
 
-DATASET_ID = 'modelscope/truthful_qa'
 # task_list = ['generation', 'multiple_choice']
-SUBSET_LIST = ['multiple_choice']
 
 
+@Benchmark.register(
+    name='truthful_qa',
+    dataset_id='modelscope/truthful_qa',
+    model_adapter=ContinuationLogitsModelAdapter,
+    subset_list=['multiple_choice'],
+    metric_list=[WeightedAverageAccuracy],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='validation',
+)
 class TruthfulQaAdapter(DataAdapter):
     """
     Adapter for TruthfulQA benchmark.
@@ -40,35 +51,14 @@ class TruthfulQaAdapter(DataAdapter):
        'Q: Where were the 1992 Olympics held?\n'
        'A: The 1992 Olympics were held in Barcelona, Spain.')
 
-    def __init__(self,
-                 subset_list: list = None,
-                 metric_list: list = None,
-                 few_shot_num: int = None,
-                 train_split: str = None,
-                 eval_split: str = 'validation',
-                 **kwargs):
-
-        if subset_list is None:
-            subset_list = SUBSET_LIST
-
-        if metric_list is None:
-            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
-
-        if few_shot_num is None:
-            logger.info(f'Set 0-shot examples by system for TruthfulQA.')
-            few_shot_num = 0
+    def __init__(self, **kwargs):
 
+        few_shot_num = kwargs.get('few_shot_num', 0)
         if few_shot_num != 0:
             logger.warning(f'few_shot_num should be 0 for TruthfulQA, but got {few_shot_num}. Use 0-shot by default.')
-            few_shot_num = 0
+            kwargs['few_shot_num'] = 0
 
-        super().__init__(
-            subset_list=subset_list,
-            metric_list=metric_list,
-            few_shot_num=few_shot_num,
-            train_split=train_split,
-            eval_split=eval_split,
-            **kwargs)
+        super().__init__(**kwargs)
 
     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
         data_dict = {}
@@ -215,7 +205,7 @@ class TruthfulQaAdapter(DataAdapter):
         # TODO: generation sub-task to be added
         return {'mc1_labels': input_d['mc1_targets']['labels'], 'mc2_labels': input_d['mc2_targets']['labels']}
 
-    def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> list:
+    def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> list:
         """
         Parse the model output to get the answer.
 
@@ -227,11 +217,11 @@ class TruthfulQaAdapter(DataAdapter):
         Returns:
             The predicted answer.
         """
-        if eval_type == 'checkpoint':
+        if eval_type == EvalType.CHECKPOINT:
             return result
-        elif eval_type == 'service':  # TODO: to be supported !
+        elif eval_type == EvalType.SERVICE:  # TODO: to be supported !
             return result
-        elif eval_type == 'custom':  # TODO: to be supported !
+        elif eval_type == EvalType.CUSTOM:  # TODO: to be supported !
             return result
         else:
             raise ValueError(f'Invalid eval_type: {eval_type}')
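
Both adapters now compare `eval_type` against `EvalType` constants instead of bare strings. The constants live in `evalscope/constants.py`, which is not shown in this diff; the sketch below is an assumption about their values, inferred from the string literals they replace:

```python
# Assumed shape of the EvalType constants (definition not shown in this diff);
# the values mirror the removed string literals.
class EvalType:
    CHECKPOINT = 'checkpoint'
    SERVICE = 'service'
    CUSTOM = 'custom'
```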

evalscope/collections/__init__.py (new file)

@@ -0,0 +1,3 @@
+from evalscope.collections.evaluator import EvaluatorCollection
+from evalscope.collections.sampler import StratifiedSampler, UniformSampler, WeightedSampler
+from evalscope.collections.schema import CollectionSchema, DatasetInfo

evalscope/collections/evaluator.py (new file)

@@ -0,0 +1,178 @@
+import json
+import os
+import pandas as pd
+from collections import defaultdict
+from tabulate import tabulate
+from tqdm import tqdm
+
+from evalscope.benchmarks import Benchmark
+from evalscope.collections.sampler import DatasetEntry
+from evalscope.config import TaskConfig
+from evalscope.constants import AnswerKeys, DumpMode, EvalType, ReviewKeys
+from evalscope.evaluator import Evaluator
+from evalscope.models import get_local_model, initialize_model_adapter
+from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class SimpleEvaluator(Evaluator):
+
+    def __init__(self, dataset_name, data_adapter, model_adapter, task_cfg, outputs):
+        super().__init__(
+            dataset_name_or_path=dataset_name,
+            data_adapter=data_adapter,
+            model_adapter=model_adapter,
+            task_cfg=task_cfg,
+            outputs=outputs)
+
+    def get_answer(self, input_prompt, subset_name, infer_cfg) -> dict:
+        answer_d: dict = self.model_adapter.predict(inputs=input_prompt, infer_cfg=infer_cfg)
+        answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+        processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
+        return processed_answer
+
+    def get_review(self, answer_d) -> dict:
+        review_id, reviewer_spec = self._generate_review_id(answer_d)
+        review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
+        return review_d
+
+
+class EvaluatorCollection:
+
+    def __init__(self, task_cfg: TaskConfig, outputs: OutputsStructure):
+        self.task_cfg = task_cfg
+        self.outputs = outputs
+        self.model = get_local_model(task_cfg)
+        self.dataset = self.load()
+        self.dataset_name_map, self.dataset_id_map = self._parse_dataset()
+        self.evaluators = self._initialize_evaluators()
+
+    def load(self) -> list[DatasetEntry]:
+        raw_dataset = jsonl_to_list(self.task_cfg.dataset_args['data_collection']['local_path'])
+        datasets = []
+        for sample in raw_dataset:
+            datasets.append(DatasetEntry(**sample))
+        return datasets
+
+    def _parse_dataset(self):
+        dataset_name_map = defaultdict(lambda: defaultdict(list))
+        dataset_id_map = {}
+        for sample in self.dataset:
+            dataset_name, subset_name = sample.dataset_name, sample.subset_name
+            dataset_name_map[dataset_name][subset_name].append(sample.index)
+            dataset_id_map[sample.index] = sample
+        return dataset_name_map, dataset_id_map
+
+    def _initialize_evaluators(self):
+        evaluators = {}
+        for dataset_name in self.dataset_name_map.keys():
+            benchmark = Benchmark.get(dataset_name)
+            data_adapter = benchmark.get_data_adapter()
+            model_adapter = initialize_model_adapter(self.task_cfg, benchmark.model_adapter, self.model)
+            evaluators[dataset_name] = SimpleEvaluator(dataset_name, data_adapter, model_adapter, self.task_cfg,
+                                                       self.outputs)
+        return evaluators
+
+    def get_report(self, reviews):
+        data = []
+        for dataset_name, data_map in self.dataset_name_map.items():
+            for subset_name, ids in data_map.items():
+                for _id in ids:
+                    review_d = reviews[_id]
+                    row_data: DatasetEntry = self.dataset_id_map[_id]
+                    score = self.get_pred_score(review_d)
+                    data.append({
+                        'task_type': row_data.task,
+                        'dataset_name': dataset_name,
+                        'subset_name': subset_name,
+                        'tags': row_data.tags,
+                        'score': score
+                    })
+
+        df = pd.DataFrame(data)
+        # Explode tags to multiple rows
+        df_exploded = df.explode('tags')
+
+        # Helper function for aggregation and sorting
+        def aggregate_and_sort(df, group_by_cols):
+            report_df = df.groupby(group_by_cols) \
+                .agg(average_score=('score', 'mean'), count=('score', 'size')) \
+                .reset_index()
+
+            # Round average_score to 4 decimal places
+            report_df['average_score'] = report_df['average_score'].round(4)
+
+            report_df = report_df.sort_values(by='count', ascending=False) \
+                .to_dict(orient='records')
+            return report_df
+
+        # Multi-level aggregation
+        subset_report_df = aggregate_and_sort(df, ['task_type', 'dataset_name', 'subset_name'])
+        dataset_report_df = aggregate_and_sort(df, ['task_type', 'dataset_name'])
+        task_report_df = aggregate_and_sort(df, ['task_type'])
+        tag_report_df = aggregate_and_sort(df_exploded, ['tags'])
+
+        # Convert sorted DataFrames to Dict
+        report = {
+            'subset_level': subset_report_df,
+            'dataset_level': dataset_report_df,
+            'task_level': task_report_df,
+            'tag_level': tag_report_df
+        }
+
+        # Log the report
+        for level, data in report.items():
+            table = tabulate(data, headers='keys', tablefmt='pretty', showindex=False)
+            logger.info(f'{level} Report:\n{table}')
+
+        # Save the report to a JSON file
+        report_file_path = os.path.join(self.outputs.reports_dir, 'data_collection.json')
+        with open(report_file_path, 'w', encoding='utf-8') as f:
+            json.dump(report, f, ensure_ascii=False, indent=4)
+
+    def get_answers(self):
+        pred_file_path = os.path.join(self.outputs.predictions_dir, 'data_collection.jsonl')
+        answers = defaultdict(dict)
+        for sample in tqdm(self.dataset, desc='Getting answers'):
+            evaluator = self.evaluators[sample.dataset_name]
+            answer_d = evaluator.get_answer(sample.prompt, sample.subset_name, self.task_cfg.generation_config)
+            answers[sample.index] = answer_d
+            dump_jsonl_data(answer_d, pred_file_path, dump_mode=DumpMode.APPEND)
+        return answers
+
+    def get_reviews(self, answers):
+        review_file_path = os.path.join(self.outputs.reviews_dir, 'data_collection.jsonl')
+        reviews = defaultdict(dict)
+        for sample in tqdm(self.dataset, desc='Getting reviews'):
+            evaluator = self.evaluators[sample.dataset_name]
+            review_d = evaluator.get_review(answers[sample.index])
+            reviews[sample.index] = review_d
+            dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND)
+        return reviews
+
+    @staticmethod
+    def get_pred_score(review_d) -> float:
+        return float(review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT])
+
+    def eval(self, **kwargs):
+        answers = self.get_answers()
+        reviews = self.get_reviews(answers)
+        self.get_report(reviews)
+
+
+if __name__ == '__main__':
+    task_cfg = TaskConfig(
+        model='qwen2.5',
+        api_url='http://127.0.0.1:8801/v1/chat/completions',
+        api_key='EMPTY',
+        eval_type=EvalType.SERVICE,
+        datasets=['data_collection'],
+        dataset_args={'data_collection': {
+            'local_path': 'outputs/mixed_data.jsonl'
+        }},
+    )
+
+    evaluator_collection = EvaluatorCollection(task_cfg)
+    evaluator_collection.eval()
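
For reference, `get_report` above aggregates the per-sample scores at four levels and writes them to `reports_dir/data_collection.json`. A hypothetical illustration of that structure, with field names taken from the aggregation code and invented values:

```python
# Hypothetical report contents; keys follow get_report(), numbers are invented.
report = {
    'subset_level': [{'task_type': 'qa', 'dataset_name': 'trivia_qa',
                      'subset_name': 'default', 'average_score': 0.3389, 'count': 50}],
    'dataset_level': [{'task_type': 'qa', 'dataset_name': 'trivia_qa',
                       'average_score': 0.3389, 'count': 50}],
    'task_level': [{'task_type': 'qa', 'average_score': 0.3389, 'count': 50}],
    'tag_level': [{'tags': 'en', 'average_score': 0.3389, 'count': 50}],
}
```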

evalscope/collections/sampler.py (new file)

@@ -0,0 +1,132 @@
+import random
+from abc import ABC, abstractmethod
+from dataclasses import asdict, dataclass, field
+from tqdm import tqdm
+from typing import List, Optional
+
+from evalscope.collections.schema import CollectionSchema, DatasetInfo
+
+
+@dataclass
+class DatasetEntry:
+    index: int = 0
+    prompt: dict = field(default_factory=dict)
+    tags: List[str] = field(default_factory=list)
+    task: str = ''
+    weight: float = 0.0
+    dataset_name: str = ''
+    subset_name: str = ''
+
+
+# Define an abstract base class for Samplers
+class Sampler(ABC):
+
+    def __init__(self, schema: CollectionSchema, count: Optional[int] = None):
+        self.schema = schema
+        self.count = count
+
+    @abstractmethod
+    def sample(self) -> List[dict]:
+        raise NotImplementedError
+
+    def _collect_dataset_data(self, dataset_info_list: List[DatasetInfo]) -> List[DatasetEntry]:
+        all_data = []
+        for dataset in tqdm(dataset_info_list, desc='Collecting dataset data'):
+            data_dict = dataset.get_data()
+            for subset_name, subset_data in data_dict.items():
+                for prompt in subset_data:
+                    all_data.append(
+                        DatasetEntry(
+                            prompt=prompt,
+                            tags=dataset.tags,
+                            task=dataset.task_type,
+                            weight=dataset.weight,
+                            dataset_name=dataset.name,
+                            subset_name=subset_name,
+                        ))
+        return all_data
+
+    def _update_index(self, all_data: List[DatasetEntry]) -> List[dict]:
+        result = []
+        for i, entry in enumerate(all_data):
+            entry.index = i
+            result.append(asdict(entry))
+        return result
+
+
+class WeightedSampler(Sampler):
+    """
+    Weighted sampler, according to the weight of each dataset, sample data from each dataset.
+    """
+
+    def sample(self) -> List[dict]:
+        dataset_info_list = self.schema.flatten()
+        all_data = self._collect_dataset_data(dataset_info_list)
+
+        remaining_count = self.count
+        sampled_data = []
+
+        for i, dataset in enumerate(tqdm(dataset_info_list, desc='Sampling data')):
+            if i == len(dataset_info_list) - 1:
+                dataset_sample_count = remaining_count
+            else:
+                dataset_sample_count = int(dataset.weight * self.count)
+                remaining_count -= dataset_sample_count
+
+            sampled_data.extend(random.choices(all_data, k=dataset_sample_count))
+
+        return self._update_index(sampled_data)
+
+
+class UniformSampler(Sampler):
+    """
+    Uniform sampler, sample data from each dataset with the same number of samples.
+    """
+
+    def sample(self) -> List[dict]:
+        dataset_info_list = self.schema.flatten()
+        all_data = self._collect_dataset_data(dataset_info_list)
+
+        num_datasets = len(dataset_info_list)
+        samples_per_dataset = self.count // num_datasets
+        sampled_data = []
+
+        for _ in tqdm(dataset_info_list, desc='Sampling data'):
+            sampled_data.extend(random.choices(all_data, k=samples_per_dataset))
+
+        return self._update_index(sampled_data)
+
+
+class StratifiedSampler(Sampler):
+    """
+    Stratified sampler, sample data from each dataset according to the number of samples of each dataset.
+    """
+
+    def sample(self) -> List[dict]:
+        dataset_info_list = self.schema.flatten()
+        all_data = self._collect_dataset_data(dataset_info_list)
+
+        total_samples = sum(len(dataset.get_data()) for dataset in dataset_info_list)
+        sampled_data = []
+
+        for dataset in tqdm(dataset_info_list, desc='Sampling data'):
+            dataset_samples = len(dataset.get_data())
+            samples_for_dataset = int((dataset_samples / total_samples) * self.count)
+            sampled_data.extend(random.choices(all_data, k=samples_for_dataset))
+
+        return self._update_index(sampled_data)
+
+
+if __name__ == '__main__':
+    from evalscope.utils.io_utils import dump_jsonl_data
+
+    schema = CollectionSchema.from_json('outputs/schema.json')
+    print(schema.to_dict())
+    mixed_data = WeightedSampler(schema, 100).sample()
+    dump_jsonl_data(mixed_data, 'outputs/weighted_mixed_data.jsonl')
+
+    mixed_data = UniformSampler(schema, 100).sample()
+    dump_jsonl_data(mixed_data, 'outputs/uniform_mixed_data.jsonl')
+
+    mixed_data = StratifiedSampler(schema, 100).sample()
+    dump_jsonl_data(mixed_data, 'outputs/stratified_mixed_data.jsonl')
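
Each sampler returns a list of `DatasetEntry` dicts that `dump_jsonl_data` writes one per line, and `EvaluatorCollection.load` reads them back with `DatasetEntry(**sample)`. A hypothetical single record, with the prompt shape borrowed from the trivia_qa adapter earlier in this diff and the tag and weight values invented:

```python
# Hypothetical jsonl record produced by a sampler; field names come from the
# DatasetEntry dataclass above, the concrete values are illustrative only.
record = {
    'index': 0,
    'prompt': {'data': ['Question: Who wrote ...?\nAnswer:'], 'system_prompt': '...'},
    'tags': ['en', 'qa'],          # tags and weight come from the CollectionSchema (not shown here)
    'task': 'qa',
    'weight': 0.5,
    'dataset_name': 'trivia_qa',
    'subset_name': 'default',
}
```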