evalscope 0.9.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic.

Files changed (69)
  1. evalscope/arguments.py +1 -0
  2. evalscope/benchmarks/arc/arc_adapter.py +3 -5
  3. evalscope/benchmarks/bbh/bbh_adapter.py +3 -3
  4. evalscope/benchmarks/benchmark.py +1 -1
  5. evalscope/benchmarks/ceval/ceval_adapter.py +5 -82
  6. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +5 -79
  7. evalscope/benchmarks/competition_math/competition_math_adapter.py +4 -4
  8. evalscope/benchmarks/data_adapter.py +69 -70
  9. evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -63
  10. evalscope/benchmarks/gpqa/__init__.py +0 -0
  11. evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
  12. evalscope/benchmarks/gpqa/gpqa_adapter.py +103 -0
  13. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +4 -5
  14. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +12 -6
  15. evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -4
  16. evalscope/benchmarks/ifeval/__init__.py +0 -0
  17. evalscope/benchmarks/ifeval/ifeval_adapter.py +56 -0
  18. evalscope/benchmarks/ifeval/instructions.py +1477 -0
  19. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  20. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  21. evalscope/benchmarks/ifeval/utils.py +134 -0
  22. evalscope/benchmarks/iquiz/__init__.py +0 -0
  23. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  24. evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -84
  25. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +2 -2
  26. evalscope/benchmarks/race/race_adapter.py +4 -73
  27. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -6
  28. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -57
  29. evalscope/cli/cli.py +2 -0
  30. evalscope/cli/start_app.py +30 -0
  31. evalscope/collections/evaluator.py +82 -62
  32. evalscope/collections/sampler.py +47 -41
  33. evalscope/collections/schema.py +14 -10
  34. evalscope/constants.py +4 -0
  35. evalscope/evaluator/evaluator.py +22 -13
  36. evalscope/metrics/__init__.py +2 -5
  37. evalscope/metrics/metrics.py +11 -2
  38. evalscope/metrics/named_metrics.py +17 -0
  39. evalscope/models/chat_adapter.py +2 -0
  40. evalscope/models/server_adapter.py +11 -4
  41. evalscope/perf/__init__.py +1 -0
  42. evalscope/perf/main.py +0 -1
  43. evalscope/perf/plugin/api/custom_api.py +1 -1
  44. evalscope/perf/plugin/api/openai_api.py +1 -1
  45. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  46. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  47. evalscope/report/__init__.py +5 -0
  48. evalscope/report/app.py +693 -0
  49. evalscope/report/combinator.py +73 -0
  50. evalscope/report/generator.py +80 -0
  51. evalscope/report/utils.py +133 -0
  52. evalscope/run.py +16 -11
  53. evalscope/summarizer.py +1 -1
  54. evalscope/utils/chat_service.py +1 -1
  55. evalscope/utils/logger.py +1 -0
  56. evalscope/utils/model_utils.py +5 -2
  57. evalscope/version.py +2 -2
  58. {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/METADATA +84 -7
  59. {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/RECORD +66 -51
  60. tests/cli/test_collection.py +11 -7
  61. tests/cli/test_run.py +13 -4
  62. evalscope/tools/__init__.py +0 -1
  63. evalscope/tools/combine_reports.py +0 -133
  64. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  65. /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
  66. {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/LICENSE +0 -0
  67. {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/WHEEL +0 -0
  68. {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/entry_points.txt +0 -0
  69. {evalscope-0.9.0.dist-info → evalscope-0.10.1.dist-info}/top_level.txt +0 -0
evalscope/collections/evaluator.py CHANGED
@@ -4,13 +4,15 @@ import pandas as pd
 from collections import defaultdict
 from tabulate import tabulate
 from tqdm import tqdm
+from typing import List
 
 from evalscope.benchmarks import Benchmark
 from evalscope.collections.sampler import DatasetEntry
 from evalscope.config import TaskConfig
-from evalscope.constants import AnswerKeys, DumpMode, EvalType, ReviewKeys
+from evalscope.constants import DataCollection, DumpMode
 from evalscope.evaluator import Evaluator
 from evalscope.models import get_local_model, initialize_model_adapter
+from evalscope.report import ReportGenerator
 from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
 from evalscope.utils.logger import get_logger
 
@@ -38,6 +40,12 @@ class SimpleEvaluator(Evaluator):
         review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
         return review_d
 
+    def get_score(self, review_d) -> float:
+        metric_score: List[dict] = self.compute_metrics(reviews_list=[review_d])
+        # use the first metric by default
+        score = metric_score[0]['score']
+        return score
+
 
 class EvaluatorCollection:
 
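Note on the new SimpleEvaluator.get_score: compute_metrics now returns a list of metric dicts, and only the first entry's 'score' is read. A minimal illustration of that contract (the 'name' values here are illustrative, not taken from evalscope):

metric_score = [{'name': 'AverageAccuracy', 'score': 0.75}, {'name': 'Pass@1', 'score': 0.5}]  # illustrative keys/values
score = metric_score[0]['score']  # -> 0.75; any further metrics are ignored by get_score
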
@@ -45,16 +53,18 @@ class EvaluatorCollection:
         self.task_cfg = task_cfg
         self.outputs = outputs
         self.model = get_local_model(task_cfg)
-        self.dataset = self.load()
+        self.dataset, self.dataset_name = self.load()
         self.dataset_name_map, self.dataset_id_map = self._parse_dataset()
         self.evaluators = self._initialize_evaluators()
 
-    def load(self) -> list[DatasetEntry]:
-        raw_dataset = jsonl_to_list(self.task_cfg.dataset_args['data_collection']['local_path'])
+    def load(self) -> tuple[list[DatasetEntry], str]:
+        dataset_path = self.task_cfg.dataset_args[DataCollection.NAME]['local_path']
+        dataset_name = os.path.basename(dataset_path).split('.')[0]
+        raw_dataset = jsonl_to_list(dataset_path)
         datasets = []
         for sample in raw_dataset:
             datasets.append(DatasetEntry(**sample))
-        return datasets
+        return datasets, dataset_name
 
     def _parse_dataset(self):
         dataset_name_map = defaultdict(lambda: defaultdict(list))
@@ -75,65 +85,80 @@ class EvaluatorCollection:
                                                        self.outputs)
         return evaluators
 
-    def get_report(self, reviews):
-        data = []
-        for dataset_name, data_map in self.dataset_name_map.items():
-            for subset_name, ids in data_map.items():
-                for _id in ids:
-                    review_d = reviews[_id]
-                    row_data: DatasetEntry = self.dataset_id_map[_id]
-                    score = self.get_pred_score(review_d)
-                    data.append({
-                        'task_type': row_data.task,
-                        'dataset_name': dataset_name,
-                        'subset_name': subset_name,
-                        'tags': row_data.tags,
-                        'score': score
-                    })
-
-        df = pd.DataFrame(data)
-        # Explode tags to multiple rows
-        df_exploded = df.explode('tags')
-
-        # Helper function for aggregation and sorting
+    def get_report(self, scores):
+
+        def get_dataframe(scores):
+            data = []
+            for dataset_name, data_map in self.dataset_name_map.items():
+                for subset_name, ids in data_map.items():
+                    for _id in ids:
+                        row_data: DatasetEntry = self.dataset_id_map[_id]
+                        score = scores[_id]
+                        data.append(
+                            dict(
+                                task_type=row_data.task_type,
+                                categories=tuple(row_data.categories),
+                                dataset_name=dataset_name,
+                                subset_name=subset_name,
+                                tags=row_data.tags,
+                                score=score))
+            return pd.DataFrame(data)
+
         def aggregate_and_sort(df, group_by_cols):
+            # aggregate by group_by_cols, and calculate average_score and count
             report_df = df.groupby(group_by_cols) \
                 .agg(average_score=('score', 'mean'), count=('score', 'size')) \
                 .reset_index()
-
-            # Round average_score to 4 decimal places
             report_df['average_score'] = report_df['average_score'].round(4)
-
             report_df = report_df.sort_values(by='count', ascending=False) \
                 .to_dict(orient='records')
             return report_df
 
-        # Multi-level aggregation
+        df = get_dataframe(scores)
+
+        # multi-level aggregation
         subset_report_df = aggregate_and_sort(df, ['task_type', 'dataset_name', 'subset_name'])
         dataset_report_df = aggregate_and_sort(df, ['task_type', 'dataset_name'])
         task_report_df = aggregate_and_sort(df, ['task_type'])
-        tag_report_df = aggregate_and_sort(df_exploded, ['tags'])
 
-        # Convert sorted DataFrames to Dict
-        report = {
+        # explode tags to multiple rows
+        df_exploded_tags = df.explode('tags')
+        tag_report_df = aggregate_and_sort(df_exploded_tags, ['tags'])
+
+        # process multi-level categories
+        df_categories = df.copy()
+        # multi-level aggregation for categories
+        max_depth = df_categories['categories'].apply(len).max()
+        for level in range(max_depth):
+            df_categories[f'category{level}'] = df_categories['categories'].apply(lambda x: x[level]
+                                                                                  if len(x) > level else '')
+        category_report_df = aggregate_and_sort(df_categories, [f'category{level}' for level in range(max_depth)])
+
+        # convert to dict format
+        report_dict = {
             'subset_level': subset_report_df,
             'dataset_level': dataset_report_df,
             'task_level': task_report_df,
-            'tag_level': tag_report_df
+            'tag_level': tag_report_df,
+            'category_level': category_report_df,
         }
 
-        # Log the report
-        for level, data in report.items():
+        # record report
+        for level, data in report_dict.items():
             table = tabulate(data, headers='keys', tablefmt='pretty', showindex=False)
             logger.info(f'{level} Report:\n{table}')
 
-        # Save the report to a JSON file
-        report_file_path = os.path.join(self.outputs.reports_dir, 'data_collection.json')
+        report = ReportGenerator.gen_collection_report(df, self.dataset_name, self.task_cfg.model_id)
+        # save report to JSON file
+        report_file_path = os.path.join(self.outputs.reports_dir, self.task_cfg.model_id, f'{self.dataset_name}.json')
+        os.makedirs(os.path.dirname(report_file_path), exist_ok=True)
         with open(report_file_path, 'w', encoding='utf-8') as f:
-            json.dump(report, f, ensure_ascii=False, indent=4)
+            json.dump(report.to_dict(), f, ensure_ascii=False, indent=4)
 
     def get_answers(self):
-        pred_file_path = os.path.join(self.outputs.predictions_dir, 'data_collection.jsonl')
+        pred_file_path = os.path.join(self.outputs.predictions_dir, self.task_cfg.model_id,
+                                      f'{self.dataset_name}.jsonl')
+        os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)
         answers = defaultdict(dict)
         for sample in tqdm(self.dataset, desc='Getting answers'):
            evaluator = self.evaluators[sample.dataset_name]
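The reworked get_report is plain pandas. The standalone sketch below (made-up rows, no evalscope imports, a local aggregate_and_sort mirroring the diff) shows how tags are exploded into one row per tag and how the categories tuple is expanded into category0/category1/... columns before grouping.

import pandas as pd

# made-up sample rows; column names mirror the diff
df = pd.DataFrame([
    dict(task_type='reasoning', categories=('reasoning', 'english'), dataset_name='arc',
         subset_name='ARC-c', tags=['en'], score=1.0),
    dict(task_type='reasoning', categories=('reasoning', 'chinese'), dataset_name='ceval',
         subset_name='logic', tags=['zh'], score=0.0),
])

def aggregate_and_sort(df, group_by_cols):
    # average_score and count per group, sorted by count (same shape as in the diff)
    report = (df.groupby(group_by_cols)
                .agg(average_score=('score', 'mean'), count=('score', 'size'))
                .reset_index())
    report['average_score'] = report['average_score'].round(4)
    return report.sort_values(by='count', ascending=False).to_dict(orient='records')

# tag level: one row per tag after explode
tag_level = aggregate_and_sort(df.explode('tags'), ['tags'])

# category level: expand the categories tuple into category0, category1, ... columns
df_categories = df.copy()
max_depth = df_categories['categories'].apply(len).max()
for level in range(max_depth):
    df_categories[f'category{level}'] = df_categories['categories'].apply(
        lambda x, level=level: x[level] if len(x) > level else '')
category_level = aggregate_and_sort(df_categories, [f'category{level}' for level in range(max_depth)])

print(tag_level)       # e.g. [{'tags': 'en', ...}, {'tags': 'zh', ...}]
print(category_level)  # e.g. [{'category0': 'reasoning', 'category1': 'english', ...}, ...]
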
@@ -143,36 +168,31 @@ class EvaluatorCollection:
         return answers
 
     def get_reviews(self, answers):
-        review_file_path = os.path.join(self.outputs.reviews_dir, 'data_collection.jsonl')
+        review_file_path = os.path.join(self.outputs.reviews_dir, self.task_cfg.model_id)
+        os.makedirs(review_file_path, exist_ok=True)
         reviews = defaultdict(dict)
         for sample in tqdm(self.dataset, desc='Getting reviews'):
             evaluator = self.evaluators[sample.dataset_name]
             review_d = evaluator.get_review(answers[sample.index])
             reviews[sample.index] = review_d
-            dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND)
+            dump_jsonl_data(
+                review_d,
+                os.path.join(review_file_path, f'{self.dataset_name}_{sample.dataset_name}_{sample.subset_name}.jsonl'),
+                dump_mode=DumpMode.APPEND)
         return reviews
 
-    @staticmethod
-    def get_pred_score(review_d) -> float:
-        return float(review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT])
+    def get_scores(self, reviews) -> float:
+        scores = defaultdict(dict)
+        for sample in tqdm(self.dataset, desc='Getting scores'):
+            evaluator = self.evaluators[sample.dataset_name]
+            review_d = reviews[sample.index]
+            score = evaluator.get_score(review_d)
+            scores[sample.index] = score
+
+        return scores
 
     def eval(self, **kwargs):
         answers = self.get_answers()
         reviews = self.get_reviews(answers)
-        self.get_report(reviews)
-
-
-if __name__ == '__main__':
-    task_cfg = TaskConfig(
-        model='qwen2.5',
-        api_url='http://127.0.0.1:8801/v1/chat/completions',
-        api_key='EMPTY',
-        eval_type=EvalType.SERVICE,
-        datasets=['data_collection'],
-        dataset_args={'data_collection': {
-            'local_path': 'outputs/mixed_data.jsonl'
-        }},
-    )
-
-    evaluator_collection = EvaluatorCollection(task_cfg)
-    evaluator_collection.eval()
+        scores = self.get_scores(reviews)
+        self.get_report(scores)
evalscope/collections/sampler.py CHANGED
@@ -12,7 +12,8 @@ class DatasetEntry:
     index: int = 0
     prompt: dict = field(default_factory=dict)
     tags: List[str] = field(default_factory=list)
-    task: str = ''
+    categories: List[str] = field(default_factory=list)
+    task_type: str = ''
     weight: float = 0.0
     dataset_name: str = ''
     subset_name: str = ''
@@ -21,30 +22,30 @@ class DatasetEntry:
 # Define an abstract base class for Samplers
 class Sampler(ABC):
 
-    def __init__(self, schema: CollectionSchema, count: Optional[int] = None):
+    def __init__(self, schema: CollectionSchema):
         self.schema = schema
-        self.count = count
 
     @abstractmethod
     def sample(self) -> List[dict]:
         raise NotImplementedError
 
-    def _collect_dataset_data(self, dataset_info_list: List[DatasetInfo]) -> List[DatasetEntry]:
+    def _sample_dataset(self, dataset: DatasetInfo, count: int) -> List[DatasetEntry]:
         all_data = []
-        for dataset in tqdm(dataset_info_list, desc='Collecting dataset data'):
-            data_dict = dataset.get_data()
-            for subset_name, subset_data in data_dict.items():
-                for prompt in subset_data:
-                    all_data.append(
-                        DatasetEntry(
-                            prompt=prompt,
-                            tags=dataset.tags,
-                            task=dataset.task_type,
-                            weight=dataset.weight,
-                            dataset_name=dataset.name,
-                            subset_name=subset_name,
-                        ))
-        return all_data
+        data_dict = dataset.get_data()
+        for subset_name, subset_data in data_dict.items():
+            for prompt in subset_data:
+                all_data.append(
+                    DatasetEntry(
+                        prompt=prompt,
+                        tags=dataset.tags,
+                        categories=dataset.hierarchy,
+                        task_type=dataset.task_type,
+                        weight=dataset.weight,
+                        dataset_name=dataset.name,
+                        subset_name=subset_name,
+                    ))
+        sampled_data = random.choices(all_data, k=count)
+        return sampled_data
 
     def _update_index(self, all_data: List[DatasetEntry]) -> List[dict]:
         result = []
@@ -59,21 +60,19 @@ class WeightedSampler(Sampler):
     Weighted sampler, according to the weight of each dataset, sample data from each dataset.
     """
 
-    def sample(self) -> List[dict]:
+    def sample(self, count: int) -> List[dict]:
         dataset_info_list = self.schema.flatten()
-        all_data = self._collect_dataset_data(dataset_info_list)
-
-        remaining_count = self.count
         sampled_data = []
+        remaining_count = count
 
         for i, dataset in enumerate(tqdm(dataset_info_list, desc='Sampling data')):
             if i == len(dataset_info_list) - 1:
                 dataset_sample_count = remaining_count
             else:
-                dataset_sample_count = int(dataset.weight * self.count)
+                dataset_sample_count = int(dataset.weight * count)
                 remaining_count -= dataset_sample_count
 
-            sampled_data.extend(random.choices(all_data, k=dataset_sample_count))
+            sampled_data.extend(self._sample_dataset(dataset, dataset_sample_count))
 
         return self._update_index(sampled_data)
 
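The refactored samplers now draw from each dataset separately and hand the remainder to the last dataset, so per-dataset counts always sum to the requested total. A standalone sketch of that allocation rule (illustrative weights, not evalscope code):

def allocate_counts(weights, count):
    # each dataset gets int(weight * count); the last one absorbs whatever is left
    counts = []
    remaining = count
    for i, w in enumerate(weights):
        n = remaining if i == len(weights) - 1 else int(w * count)
        remaining -= n
        counts.append(n)
    return counts

print(allocate_counts([0.5, 0.3, 0.2], 101))  # [50, 30, 21], which sums to 101
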
@@ -83,16 +82,20 @@ class UniformSampler(Sampler):
     Uniform sampler, sample data from each dataset with the same number of samples.
     """
 
-    def sample(self) -> List[dict]:
+    def sample(self, count: int) -> List[dict]:
         dataset_info_list = self.schema.flatten()
-        all_data = self._collect_dataset_data(dataset_info_list)
-
         num_datasets = len(dataset_info_list)
-        samples_per_dataset = self.count // num_datasets
+        remaining_count = count
         sampled_data = []
 
-        for _ in tqdm(dataset_info_list, desc='Sampling data'):
-            sampled_data.extend(random.choices(all_data, k=samples_per_dataset))
+        for i, dataset in enumerate(tqdm(dataset_info_list, desc='Sampling data')):
+            if i == len(dataset_info_list) - 1:
+                dataset_sample_count = remaining_count
+            else:
+                dataset_sample_count = count // num_datasets
+                remaining_count -= dataset_sample_count
+
+            sampled_data.extend(self._sample_dataset(dataset, dataset_sample_count))
 
         return self._update_index(sampled_data)
 
@@ -102,18 +105,21 @@ class StratifiedSampler(Sampler):
     Stratified sampler, sample data from each dataset according to the number of samples of each dataset.
     """
 
-    def sample(self) -> List[dict]:
+    def sample(self, count: int) -> List[dict]:
         dataset_info_list = self.schema.flatten()
-        all_data = self._collect_dataset_data(dataset_info_list)
 
         total_samples = sum(len(dataset.get_data()) for dataset in dataset_info_list)
+        remaining_count = count
         sampled_data = []
 
-        for dataset in tqdm(dataset_info_list, desc='Sampling data'):
-            dataset_samples = len(dataset.get_data())
-            samples_for_dataset = int((dataset_samples / total_samples) * self.count)
-            sampled_data.extend(random.choices(all_data, k=samples_for_dataset))
+        for i, dataset in enumerate(tqdm(dataset_info_list, desc='Sampling data')):
+            if i == len(dataset_info_list) - 1:
+                dataset_sample_count = remaining_count
+            else:
+                dataset_sample_count = int((len(dataset.get_data()) / total_samples) * count)
+                remaining_count -= dataset_sample_count
 
+            sampled_data.extend(self._sample_dataset(dataset, dataset_sample_count))
         return self._update_index(sampled_data)
 
 
@@ -122,11 +128,11 @@ if __name__ == '__main__':
 
     schema = CollectionSchema.from_json('outputs/schema.json')
     print(schema.to_dict())
-    mixed_data = WeightedSampler(schema, 100).sample()
+    mixed_data = WeightedSampler(schema).sample(10)
     dump_jsonl_data(mixed_data, 'outputs/weighted_mixed_data.jsonl')
 
-    mixed_data = UniformSampler(schema, 100).sample()
-    dump_jsonl_data(mixed_data, 'outputs/uniform_mixed_data.jsonl')
+    # mixed_data = UniformSampler(schema, 100).sample()
+    # dump_jsonl_data(mixed_data, 'outputs/uniform_mixed_data.jsonl')
 
-    mixed_data = StratifiedSampler(schema, 100).sample()
-    dump_jsonl_data(mixed_data, 'outputs/stratified_mixed_data.jsonl')
+    # mixed_data = StratifiedSampler(schema, 100).sample()
+    # dump_jsonl_data(mixed_data, 'outputs/stratified_mixed_data.jsonl')
evalscope/collections/schema.py CHANGED
@@ -11,6 +11,7 @@ class DatasetInfo:
     task_type: str = ''
     tags: List[str] = field(default_factory=list)
     args: dict = field(default_factory=dict)
+    hierarchy: List[str] = field(default_factory=list)
 
     def get_data(self) -> dict:
         from evalscope.benchmarks import Benchmark
@@ -34,18 +35,15 @@ def flatten_weight(collection: 'CollectionSchema', base_weight=1):
             dataset.weight = current_weight
 
 
-def flatten_tags(collection: 'CollectionSchema', parent_names=None):
+def flatten_name(collection: 'CollectionSchema', parent_names=None):
     if parent_names is None:
         parent_names = []
     current_names = parent_names + [collection.name]
     for dataset in collection.datasets:
         if isinstance(dataset, CollectionSchema):
-            flatten_tags(dataset, current_names)
+            flatten_name(dataset, current_names)
         else:
-            # Add all parent CollectionSchema names to the tags of each DatasetInfo
-            for name in current_names:
-                if name not in dataset.tags:
-                    dataset.tags.append(name)
+            dataset.hierarchy = current_names.copy()
 
 
 def flatten_datasets(collection: 'CollectionSchema') -> List[DatasetInfo]:
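The renamed flatten_name pass records parent schema names in a separate hierarchy list instead of appending them to tags. A minimal standalone sketch of the same recursion, using stand-in Node/Leaf classes rather than evalscope's CollectionSchema/DatasetInfo:

from dataclasses import dataclass, field
from typing import List, Union

@dataclass
class Leaf:  # stand-in for DatasetInfo
    name: str
    tags: List[str] = field(default_factory=list)
    hierarchy: List[str] = field(default_factory=list)

@dataclass
class Node:  # stand-in for CollectionSchema
    name: str
    children: List[Union['Node', Leaf]] = field(default_factory=list)

def flatten_name(node: Node, parent_names=None):
    parent_names = parent_names or []
    current_names = parent_names + [node.name]
    for child in node.children:
        if isinstance(child, Node):
            flatten_name(child, current_names)
        else:
            child.hierarchy = current_names.copy()

root = Node('reasoning', [Node('english', [Leaf('arc', tags=['en'])])])
flatten_name(root)
print(root.children[0].children[0].hierarchy)  # ['reasoning', 'english']
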
@@ -100,7 +98,7 @@ class CollectionSchema:
 
     def flatten(self) -> List[DatasetInfo]:
         collection = copy.deepcopy(self)
-        flatten_tags(collection)
+        flatten_name(collection)
         flatten_weight(collection)
         return flatten_datasets(collection)
 
@@ -109,8 +107,12 @@ if __name__ == '__main__':
     schema = CollectionSchema(
         name='reasoning',
         datasets=[
-            DatasetInfo(name='arc', weight=1, task_type='reasoning', tags=['en']),
-            DatasetInfo(name='ceval', weight=1, task_type='reasoning', tags=['zh'], args={'subset_list': ['logic']})
+            CollectionSchema(name='english', datasets=[
+                DatasetInfo(name='arc', weight=1, tags=['en']),
+            ]),
+            CollectionSchema(
+                name='chinese',
+                datasets=[DatasetInfo(name='ceval', weight=1, tags=['zh'], args={'subset_list': ['logic']})])
         ])
     print(schema)
     print(schema.flatten())
@@ -118,5 +120,7 @@ if __name__ == '__main__':
 
     schema = CollectionSchema.from_json('outputs/schema.json')
     print(schema)
+    # print the flattened result
     for dataset in schema.flatten():
-        print(dataset)
+        print(f'Dataset: {dataset.name}')
+        print(f"Hierarchy: {' -> '.join(dataset.hierarchy)}")
evalscope/constants.py CHANGED
@@ -145,3 +145,7 @@ class EvalBackend:
     VLM_EVAL_KIT = 'VLMEvalKit'
     RAG_EVAL = 'RAGEval'
     THIRD_PARTY = 'ThirdParty'
+
+
+class DataCollection:
+    NAME = 'data_collection'
evalscope/evaluator/evaluator.py CHANGED
@@ -12,7 +12,7 @@ from evalscope.benchmarks import DataAdapter
 from evalscope.config import TaskConfig
 from evalscope.constants import AnswerKeys, DumpMode, EvalStage, ReviewKeys
 from evalscope.models import BaseModelAdapter, CustomModelAdapter
-from evalscope.tools.combine_reports import gen_table
+from evalscope.report import Report, gen_table
 from evalscope.utils import dict_torch_dtype_to_str, gen_hash
 from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
 from evalscope.utils.logger import get_logger
@@ -43,8 +43,8 @@ class Evaluator(object):
                  task_cfg: TaskConfig = None,
                  **kwargs):
 
+        self.dataset_name = data_adapter.name
         self.dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
-        self.dataset_name = os.path.basename(self.dataset_name_or_path.rstrip(os.sep)).split('.')[0]
         self.model_name = task_cfg.model_id
         self.custom_task_name = f'{self.model_name}_{self.dataset_name}'
 
@@ -201,7 +201,7 @@ class Evaluator(object):
         # Gen review_id (concat: answer_id + reviewer_spec)
         answer_id = answer_d[AnswerKeys.ANSWER_ID]
         reviewer_spec = {
-            'metric': [metric_d['name'] for metric_d in self.data_adapter.metric_list],
+            'metric': [metric.name for metric in self.data_adapter.metric_list],
             'reviewer': ['Evaluator'],
             'revision': ['default']
         }
@@ -246,7 +246,7 @@ class Evaluator(object):
 
         return reviews_list
 
-    def compute_metrics(self, reviews_list: List[dict]) -> Any:
+    def compute_metrics(self, reviews_list: List[dict]) -> List[dict]:
         """
         To compute metrics from reviews_list for each subset.
         It is required to rewrite this method to support your own evaluator.
@@ -264,28 +264,37 @@ class Evaluator(object):
                 logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
                 continue
 
-            review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
+            if len(review_d[AnswerKeys.CHOICES]) == 0:
+                logger.warning(f'No choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
+                continue
+            elif len(review_d[AnswerKeys.CHOICES]) == 1:
+                review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
+            else:
+                review_res = [choice[ReviewKeys.REVIEW][ReviewKeys.RESULT] for choice in review_d[AnswerKeys.CHOICES]]
+
             review_res_list.append(review_res)
 
-        metric_score: Union[float, dict] = self.data_adapter.compute_metric(review_res_list=review_res_list)
+        metric_score: List[dict] = self.data_adapter.compute_metric(review_res_list=review_res_list)
 
         return metric_score
 
-    def dump_report(self, reviews_score_all: dict, use_table: bool = True):
+    def dump_report(self, reviews_score_all: List[dict], use_table: bool = True):
         """
         Get report for total reviews of specific dataset.
         It is required to rewrite this method to support your own evaluator.
 
         Args:
-            report_map: report dict. Generated by func self.data_adapter.gen_report().
+            reviews_score_all: reviews score list. Generated by func self.data_adapter.compute_metric().
            use_table: whether to generate table for reports. Default to True.
 
         Returns: None
         """
         # Get report map
-        report_map: dict = self.data_adapter.gen_report(
-            subset_score_map=reviews_score_all, report_name=self.custom_task_name)
-        report_map.update(dict(model_name=self.model_name, dataset_name=self.dataset_name))
+        report_map: Report = self.data_adapter.gen_report(
+            subset_score_map=reviews_score_all,
+            report_name=self.custom_task_name,
+            model_name=self.model_name,
+            dataset_name=self.dataset_name)
 
         # Dump report
         report_path: str = os.path.join(self.outputs_structure.reports_dir, self.model_name,
@@ -294,7 +303,7 @@ class Evaluator(object):
 
         # Write report
         with open(report_path, 'w') as f:
-            f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
+            f.write(json.dumps(report_map.to_dict(), ensure_ascii=False, indent=4))
         logger.info(f'Dump report: {report_path} \n')
 
         # Make table
@@ -351,7 +360,7 @@ class Evaluator(object):
                     subset_name=subset_name, answers_list=answers_list, debug=debug, **kwargs)
 
                 metric_res = self.compute_metrics(reviews_list=reviews_list)
-                reviews_score_all[subset_name] = (metric_res, len(reviews_list))
+                reviews_score_all[subset_name] = metric_res
                 stage_reviews_dict[subset_name] = reviews_list
 
             if self.stage == EvalStage.INFER:
evalscope/metrics/__init__.py CHANGED
@@ -1,7 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-from evalscope.metrics.metrics import bleu_ngram_one_sample, exact_match, weighted_mean
+from evalscope.metrics.metrics import bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean, weighted_mean
+from evalscope.metrics.named_metrics import *
 from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
-
-WeightedAverageAccuracy = {'name': 'WeightedAverageAccuracy', 'object': weighted_mean}
-WeightedAverageBLEU = {'name': 'WeightedAverageBLEU', 'object': weighted_mean}
-Pass1 = {'name': 'Pass@1', 'object': weighted_mean}
evalscope/metrics/metrics.py CHANGED
@@ -1,6 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Copyright (c) EleutherAI. and its affiliates.
 # Copyright (c) OpenAI. and its affiliates.
+
 import itertools
 import math
 import numpy as np
@@ -8,7 +9,7 @@ import random
 import sacrebleu
 from collections import defaultdict
 from collections.abc import Iterable
-from typing import Dict, List, Union
+from typing import TYPE_CHECKING, Dict, List, Union
 
 
 def mean(arr):
@@ -103,12 +104,20 @@ def perplexity(items):
     return math.exp(-mean(items))
 
 
-def weighted_mean(items) -> float:
+def weighted_mean(items: List) -> float:
     # e.g. [(0,1), (0.5,1), (1,1)]
     a, b = zip(*items)
     return sum(a) / sum(b)
 
 
+def micro_mean(items):
+    return sum([item.score * item.num for item in items]) / sum([item.num for item in items])
+
+
+def macro_mean(items):
+    return sum([item.score for item in items]) / len(items)
+
+
 def weighted_perplexity(items):
     return math.exp(-weighted_mean(items))
 
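The new micro_mean/macro_mean helpers expect items that carry a per-subset score and a sample count num. A quick illustration using a namedtuple as a stand-in for the per-subset report objects:

from collections import namedtuple

Subset = namedtuple('Subset', ['score', 'num'])  # stand-in, not an evalscope type
subsets = [Subset(score=0.9, num=100), Subset(score=0.5, num=10)]

def micro_mean(items):
    return sum([item.score * item.num for item in items]) / sum([item.num for item in items])

def macro_mean(items):
    return sum([item.score for item in items]) / len(items)

print(micro_mean(subsets))  # ~0.8636, weighted by sample count
print(macro_mean(subsets))  # 0.7, each subset counted equally
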
evalscope/metrics/named_metrics.py ADDED
@@ -0,0 +1,17 @@
+from dataclasses import dataclass, field
+from typing import Callable
+
+from evalscope.metrics.metrics import mean, weighted_mean
+
+
+@dataclass
+class Metric:
+    name: str = 'default_metric'
+    object: Callable = field(default_factory=lambda: mean)
+
+
+AverageAccuracy = Metric(name='AverageAccuracy', object=mean)
+WeightedAverageAccuracy = Metric(name='WeightedAverageAccuracy', object=weighted_mean)
+AverageBLEU = Metric(name='AverageBLEU', object=mean)
+WeightedAverageBLEU = Metric(name='WeightedAverageBLEU', object=weighted_mean)
+Pass1 = Metric(name='Pass@1', object=mean)
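Metrics are now Metric dataclass instances rather than plain dicts, so callers read .name and call .object directly (the reviewer_spec change above already uses metric.name). A self-contained sketch that re-declares the dataclass locally to show the idea:

from dataclasses import dataclass, field
from typing import Callable

def mean(arr):  # local stand-in for evalscope.metrics.metrics.mean
    return sum(arr) / len(arr)

@dataclass
class Metric:  # mirrors the definition added in named_metrics.py
    name: str = 'default_metric'
    object: Callable = field(default_factory=lambda: mean)

Pass1 = Metric(name='Pass@1', object=mean)

scores = [1, 0, 1, 1]  # illustrative per-sample results
print(Pass1.name, Pass1.object(scores))  # Pass@1 0.75
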
evalscope/models/chat_adapter.py CHANGED
@@ -76,6 +76,8 @@ class ChatGenerationModelAdapter(BaseModelAdapter):
             # For base model, use the query as the input
             formatted_prompt = query
 
+        logger.debug(f'formatted_prompt: {formatted_prompt}')
+
         inputs = self.tokenizer(formatted_prompt, return_tensors='pt', padding=True).to(self.device)
         input_ids = inputs['input_ids']
 
evalscope/models/server_adapter.py CHANGED
@@ -46,9 +46,13 @@ class ServerModelAdapter(BaseModelAdapter):
             query = inputs
             system_prompt = None
         elif isinstance(inputs, dict):
-            # TODO: to be supported for continuation list like truthful_qa
-            query = inputs['data'][0]
-            system_prompt = inputs.get('system_prompt', None)
+            data: list = inputs['data']
+            if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
+                query = '\n'.join(''.join(item) for item in data)
+                system_prompt = inputs.get('system_prompt', None)
+            else:
+                query = data[0]
+                system_prompt = inputs.get('system_prompt', None)
         elif isinstance(inputs, list):
             query = '\n'.join(inputs)
             system_prompt = None
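A standalone sketch of the new dict-input branch: when data holds (context, continuation) tuples, as truthful_qa and hellaswag produce, each tuple is concatenated and the pieces are joined with newlines; otherwise the first element is used as the query. The example inputs below are made up:

def build_query(inputs: dict) -> str:
    data: list = inputs['data']
    if isinstance(data[0], tuple):  # continuation-style samples
        return '\n'.join(''.join(item) for item in data)
    return data[0]

print(build_query({'data': [('Q: 2+2 =', ' 4'), ('Q: capital of France?', ' Paris')]}))
print(build_query({'data': ['Plain single prompt']}))
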
@@ -76,10 +80,13 @@
     def make_request(self, content: dict, infer_cfg: dict = {}) -> dict:
         """Make request to remote API."""
         # Format request JSON according to OpenAI API format
+        do_sample = infer_cfg.get('do_sample', False)
+        temperature = infer_cfg.get('temperature', 0.0) if do_sample else 0.0
+
         request_json = {
             **content, 'model': self.model_id,
             'max_tokens': infer_cfg.get('max_tokens', 2048),
-            'temperature': infer_cfg.get('temperature', 0.0),
+            'temperature': temperature,
             'top_p': infer_cfg.get('top_p', 1.0),
             'n': infer_cfg.get('num_return_sequences', 1),
             'stop': infer_cfg.get('stop', None)
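The sampling change in make_request boils down to one rule: temperature is only honoured when do_sample is truthy, otherwise the request is forced greedy. A tiny sketch (infer_cfg keys mirror the diff, values are examples):

def resolve_temperature(infer_cfg: dict) -> float:
    do_sample = infer_cfg.get('do_sample', False)
    return infer_cfg.get('temperature', 0.0) if do_sample else 0.0

print(resolve_temperature({'temperature': 0.7}))                     # 0.0 (greedy, do_sample not set)
print(resolve_temperature({'do_sample': True, 'temperature': 0.7}))  # 0.7
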
evalscope/perf/__init__.py CHANGED
@@ -0,0 +1 @@
+from evalscope.perf.main import run_perf_benchmark