evalscope 0.8.2__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +11 -3
  3. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  4. evalscope/backend/rag_eval/utils/llm.py +1 -1
  5. evalscope/benchmarks/__init__.py +20 -1
  6. evalscope/benchmarks/arc/__init__.py +0 -5
  7. evalscope/benchmarks/arc/arc_adapter.py +24 -102
  8. evalscope/benchmarks/bbh/__init__.py +0 -4
  9. evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
  10. evalscope/benchmarks/benchmark.py +70 -59
  11. evalscope/benchmarks/ceval/__init__.py +0 -5
  12. evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
  13. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  14. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
  15. evalscope/benchmarks/competition_math/__init__.py +0 -5
  16. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  17. evalscope/benchmarks/data_adapter.py +115 -87
  18. evalscope/benchmarks/general_qa/__init__.py +0 -5
  19. evalscope/benchmarks/general_qa/general_qa_adapter.py +23 -79
  20. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  21. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +21 -101
  22. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  23. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +32 -99
  24. evalscope/benchmarks/humaneval/__init__.py +0 -4
  25. evalscope/benchmarks/humaneval/humaneval_adapter.py +18 -120
  26. evalscope/benchmarks/ifeval/__init__.py +0 -0
  27. evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
  28. evalscope/benchmarks/ifeval/instructions.py +1478 -0
  29. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  30. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  31. evalscope/benchmarks/ifeval/utils.py +134 -0
  32. evalscope/benchmarks/iquiz/__init__.py +0 -0
  33. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  34. evalscope/benchmarks/mmlu/__init__.py +0 -5
  35. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
  36. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  37. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  38. evalscope/benchmarks/race/__init__.py +0 -5
  39. evalscope/benchmarks/race/race_adapter.py +26 -123
  40. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  41. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
  42. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  43. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
  44. evalscope/cli/cli.py +2 -0
  45. evalscope/cli/start_app.py +29 -0
  46. evalscope/collections/__init__.py +3 -0
  47. evalscope/collections/evaluator.py +198 -0
  48. evalscope/collections/sampler.py +138 -0
  49. evalscope/collections/schema.py +126 -0
  50. evalscope/config.py +7 -5
  51. evalscope/constants.py +9 -26
  52. evalscope/evaluator/evaluator.py +87 -121
  53. evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
  54. evalscope/metrics/__init__.py +3 -0
  55. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  56. evalscope/metrics/math_accuracy.py +193 -50
  57. evalscope/metrics/metrics.py +18 -6
  58. evalscope/metrics/named_metrics.py +17 -0
  59. evalscope/metrics/rouge_metric.py +13 -8
  60. evalscope/models/__init__.py +14 -1
  61. evalscope/models/base_adapter.py +52 -0
  62. evalscope/models/chat_adapter.py +138 -0
  63. evalscope/models/choice_adapter.py +211 -0
  64. evalscope/models/custom_adapter.py +67 -0
  65. evalscope/models/local_model.py +74 -0
  66. evalscope/models/model.py +141 -0
  67. evalscope/models/server_adapter.py +111 -0
  68. evalscope/perf/__init__.py +1 -0
  69. evalscope/perf/main.py +0 -1
  70. evalscope/perf/plugin/api/custom_api.py +1 -1
  71. evalscope/perf/plugin/api/openai_api.py +1 -1
  72. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  73. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  74. evalscope/report/__init__.py +5 -0
  75. evalscope/report/app.py +506 -0
  76. evalscope/report/combinator.py +73 -0
  77. evalscope/report/generator.py +80 -0
  78. evalscope/report/utils.py +133 -0
  79. evalscope/run.py +48 -72
  80. evalscope/run_arena.py +1 -1
  81. evalscope/summarizer.py +1 -1
  82. evalscope/utils/__init__.py +1 -1
  83. evalscope/utils/chat_service.py +5 -4
  84. evalscope/utils/io_utils.py +8 -0
  85. evalscope/utils/logger.py +5 -0
  86. evalscope/utils/model_utils.py +15 -2
  87. evalscope/utils/utils.py +3 -25
  88. evalscope/version.py +2 -2
  89. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/METADATA +115 -21
  90. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/RECORD +99 -78
  91. tests/cli/test_collection.py +57 -0
  92. tests/cli/test_run.py +52 -1
  93. tests/rag/test_mteb.py +3 -2
  94. evalscope/models/api/__init__.py +0 -3
  95. evalscope/models/dummy_chat_model.py +0 -49
  96. evalscope/models/model_adapter.py +0 -525
  97. evalscope/models/openai_model.py +0 -103
  98. evalscope/tools/__init__.py +0 -1
  99. evalscope/tools/combine_reports.py +0 -133
  100. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  101. /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
  102. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  103. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
  104. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
  105. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
  106. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
evalscope/collections/evaluator.py ADDED
@@ -0,0 +1,198 @@
+ import json
+ import os
+ import pandas as pd
+ from collections import defaultdict
+ from tabulate import tabulate
+ from tqdm import tqdm
+ from typing import List
+
+ from evalscope.benchmarks import Benchmark
+ from evalscope.collections.sampler import DatasetEntry
+ from evalscope.config import TaskConfig
+ from evalscope.constants import DataCollection, DumpMode
+ from evalscope.evaluator import Evaluator
+ from evalscope.models import get_local_model, initialize_model_adapter
+ from evalscope.report import ReportGenerator
+ from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ class SimpleEvaluator(Evaluator):
+
+     def __init__(self, dataset_name, data_adapter, model_adapter, task_cfg, outputs):
+         super().__init__(
+             dataset_name_or_path=dataset_name,
+             data_adapter=data_adapter,
+             model_adapter=model_adapter,
+             task_cfg=task_cfg,
+             outputs=outputs)
+
+     def get_answer(self, input_prompt, subset_name, infer_cfg) -> dict:
+         answer_d: dict = self.model_adapter.predict(inputs=input_prompt, infer_cfg=infer_cfg)
+         answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+         processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
+         return processed_answer
+
+     def get_review(self, answer_d) -> dict:
+         review_id, reviewer_spec = self._generate_review_id(answer_d)
+         review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
+         return review_d
+
+     def get_score(self, review_d) -> float:
+         metric_score: List[dict] = self.compute_metrics(reviews_list=[review_d])
+         # use the first metric by default
+         score = metric_score[0]['score']
+         return score
+
+
+ class EvaluatorCollection:
+
+     def __init__(self, task_cfg: TaskConfig, outputs: OutputsStructure):
+         self.task_cfg = task_cfg
+         self.outputs = outputs
+         self.model = get_local_model(task_cfg)
+         self.dataset, self.dataset_name = self.load()
+         self.dataset_name_map, self.dataset_id_map = self._parse_dataset()
+         self.evaluators = self._initialize_evaluators()
+
+     def load(self) -> tuple[list[DatasetEntry], str]:
+         dataset_path = self.task_cfg.dataset_args[DataCollection.NAME]['local_path']
+         dataset_name = os.path.basename(dataset_path).split('.')[0]
+         raw_dataset = jsonl_to_list(dataset_path)
+         datasets = []
+         for sample in raw_dataset:
+             datasets.append(DatasetEntry(**sample))
+         return datasets, dataset_name
+
+     def _parse_dataset(self):
+         dataset_name_map = defaultdict(lambda: defaultdict(list))
+         dataset_id_map = {}
+         for sample in self.dataset:
+             dataset_name, subset_name = sample.dataset_name, sample.subset_name
+             dataset_name_map[dataset_name][subset_name].append(sample.index)
+             dataset_id_map[sample.index] = sample
+         return dataset_name_map, dataset_id_map
+
+     def _initialize_evaluators(self):
+         evaluators = {}
+         for dataset_name in self.dataset_name_map.keys():
+             benchmark = Benchmark.get(dataset_name)
+             data_adapter = benchmark.get_data_adapter()
+             model_adapter = initialize_model_adapter(self.task_cfg, benchmark.model_adapter, self.model)
+             evaluators[dataset_name] = SimpleEvaluator(dataset_name, data_adapter, model_adapter, self.task_cfg,
+                                                        self.outputs)
+         return evaluators
+
+     def get_report(self, scores):
+
+         def get_dataframe(scores):
+             data = []
+             for dataset_name, data_map in self.dataset_name_map.items():
+                 for subset_name, ids in data_map.items():
+                     for _id in ids:
+                         row_data: DatasetEntry = self.dataset_id_map[_id]
+                         score = scores[_id]
+                         data.append(
+                             dict(
+                                 task_type=row_data.task_type,
+                                 categories=tuple(row_data.categories),
+                                 dataset_name=dataset_name,
+                                 subset_name=subset_name,
+                                 tags=row_data.tags,
+                                 score=score))
+             return pd.DataFrame(data)
+
+         def aggregate_and_sort(df, group_by_cols):
+             # aggregate by group_by_cols, and calculate average_score and count
+             report_df = df.groupby(group_by_cols) \
+                 .agg(average_score=('score', 'mean'), count=('score', 'size')) \
+                 .reset_index()
+             report_df['average_score'] = report_df['average_score'].round(4)
+             report_df = report_df.sort_values(by='count', ascending=False) \
+                 .to_dict(orient='records')
+             return report_df
+
+         df = get_dataframe(scores)
+
+         # multi-level aggregation
+         subset_report_df = aggregate_and_sort(df, ['task_type', 'dataset_name', 'subset_name'])
+         dataset_report_df = aggregate_and_sort(df, ['task_type', 'dataset_name'])
+         task_report_df = aggregate_and_sort(df, ['task_type'])
+
+         # explode tags to multiple rows
+         df_exploded_tags = df.explode('tags')
+         tag_report_df = aggregate_and_sort(df_exploded_tags, ['tags'])
+
+         # process multi-level categories
+         df_categories = df.copy()
+         # multi-level aggregation for categories
+         max_depth = df_categories['categories'].apply(len).max()
+         for level in range(max_depth):
+             df_categories[f'category{level}'] = df_categories['categories'].apply(lambda x: x[level]
+                                                                                   if len(x) > level else '')
+         category_report_df = aggregate_and_sort(df_categories, [f'category{level}' for level in range(max_depth)])
+
+         # convert to dict format
+         report_dict = {
+             'subset_level': subset_report_df,
+             'dataset_level': dataset_report_df,
+             'task_level': task_report_df,
+             'tag_level': tag_report_df,
+             'category_level': category_report_df,
+         }
+
+         # record report
+         for level, data in report_dict.items():
+             table = tabulate(data, headers='keys', tablefmt='pretty', showindex=False)
+             logger.info(f'{level} Report:\n{table}')
+
+         report = ReportGenerator.gen_collection_report(df, self.dataset_name, self.task_cfg.model_id)
+         # save report to JSON file
+         report_file_path = os.path.join(self.outputs.reports_dir, self.task_cfg.model_id, f'{self.dataset_name}.json')
+         os.makedirs(os.path.dirname(report_file_path), exist_ok=True)
+         with open(report_file_path, 'w', encoding='utf-8') as f:
+             json.dump(report.to_dict(), f, ensure_ascii=False, indent=4)
+
+     def get_answers(self):
+         pred_file_path = os.path.join(self.outputs.predictions_dir, self.task_cfg.model_id,
+                                       f'{self.dataset_name}.jsonl')
+         os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)
+         answers = defaultdict(dict)
+         for sample in tqdm(self.dataset, desc='Getting answers'):
+             evaluator = self.evaluators[sample.dataset_name]
+             answer_d = evaluator.get_answer(sample.prompt, sample.subset_name, self.task_cfg.generation_config)
+             answers[sample.index] = answer_d
+             dump_jsonl_data(answer_d, pred_file_path, dump_mode=DumpMode.APPEND)
+         return answers
+
+     def get_reviews(self, answers):
+         review_file_path = os.path.join(self.outputs.reviews_dir, self.task_cfg.model_id)
+         os.makedirs(review_file_path, exist_ok=True)
+         reviews = defaultdict(dict)
+         for sample in tqdm(self.dataset, desc='Getting reviews'):
+             evaluator = self.evaluators[sample.dataset_name]
+             review_d = evaluator.get_review(answers[sample.index])
+             reviews[sample.index] = review_d
+             dump_jsonl_data(
+                 review_d,
+                 os.path.join(review_file_path, f'{self.dataset_name}_{sample.dataset_name}_{sample.subset_name}.jsonl'),
+                 dump_mode=DumpMode.APPEND)
+         return reviews
+
+     def get_scores(self, reviews) -> float:
+         scores = defaultdict(dict)
+         for sample in tqdm(self.dataset, desc='Getting scores'):
+             evaluator = self.evaluators[sample.dataset_name]
+             review_d = reviews[sample.index]
+             score = evaluator.get_score(review_d)
+             scores[sample.index] = score
+
+         return scores
+
+     def eval(self, **kwargs):
+         answers = self.get_answers()
+         reviews = self.get_reviews(answers)
+         scores = self.get_scores(reviews)
+         self.get_report(scores)
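For orientation, here is a minimal, hypothetical sketch of how the new collection evaluator above might be driven, using only names visible in this diff (TaskConfig, the 'data_collection' key from DataCollection.NAME, and EvaluatorCollection.eval). The OutputsStructure constructor arguments, the model id, and the file paths are illustrative assumptions, not confirmed API.

```python
# Hypothetical usage sketch for EvaluatorCollection (not part of the package diff).
from evalscope.collections.evaluator import EvaluatorCollection
from evalscope.config import TaskConfig
from evalscope.utils.io_utils import OutputsStructure

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # illustrative local model id
    datasets=['data_collection'],        # DataCollection.NAME
    dataset_args={'data_collection': {'local_path': 'outputs/weighted_mixed_data.jsonl'}},
)
outputs = OutputsStructure(outputs_dir='outputs/collection_eval')  # assumed constructor signature

# Runs answer generation, review, scoring, and report generation end to end.
EvaluatorCollection(task_cfg, outputs).eval()
```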
evalscope/collections/sampler.py ADDED
@@ -0,0 +1,138 @@
+ import random
+ from abc import ABC, abstractmethod
+ from dataclasses import asdict, dataclass, field
+ from tqdm import tqdm
+ from typing import List, Optional
+
+ from evalscope.collections.schema import CollectionSchema, DatasetInfo
+
+
+ @dataclass
+ class DatasetEntry:
+     index: int = 0
+     prompt: dict = field(default_factory=dict)
+     tags: List[str] = field(default_factory=list)
+     categories: List[str] = field(default_factory=list)
+     task_type: str = ''
+     weight: float = 0.0
+     dataset_name: str = ''
+     subset_name: str = ''
+
+
+ # Define an abstract base class for Samplers
+ class Sampler(ABC):
+
+     def __init__(self, schema: CollectionSchema):
+         self.schema = schema
+
+     @abstractmethod
+     def sample(self) -> List[dict]:
+         raise NotImplementedError
+
+     def _sample_dataset(self, dataset: DatasetInfo, count: int) -> List[DatasetEntry]:
+         all_data = []
+         data_dict = dataset.get_data()
+         for subset_name, subset_data in data_dict.items():
+             for prompt in subset_data:
+                 all_data.append(
+                     DatasetEntry(
+                         prompt=prompt,
+                         tags=dataset.tags,
+                         categories=dataset.hierarchy,
+                         task_type=dataset.task_type,
+                         weight=dataset.weight,
+                         dataset_name=dataset.name,
+                         subset_name=subset_name,
+                     ))
+         sampled_data = random.choices(all_data, k=count)
+         return sampled_data
+
+     def _update_index(self, all_data: List[DatasetEntry]) -> List[dict]:
+         result = []
+         for i, entry in enumerate(all_data):
+             entry.index = i
+             result.append(asdict(entry))
+         return result
+
+
+ class WeightedSampler(Sampler):
+     """
+     Weighted sampler, according to the weight of each dataset, sample data from each dataset.
+     """
+
+     def sample(self, count: int) -> List[dict]:
+         dataset_info_list = self.schema.flatten()
+         sampled_data = []
+         remaining_count = count
+
+         for i, dataset in enumerate(tqdm(dataset_info_list, desc='Sampling data')):
+             if i == len(dataset_info_list) - 1:
+                 dataset_sample_count = remaining_count
+             else:
+                 dataset_sample_count = int(dataset.weight * count)
+                 remaining_count -= dataset_sample_count
+
+             sampled_data.extend(self._sample_dataset(dataset, dataset_sample_count))
+
+         return self._update_index(sampled_data)
+
+
+ class UniformSampler(Sampler):
+     """
+     Uniform sampler, sample data from each dataset with the same number of samples.
+     """
+
+     def sample(self, count: int) -> List[dict]:
+         dataset_info_list = self.schema.flatten()
+         num_datasets = len(dataset_info_list)
+         remaining_count = count
+         sampled_data = []
+
+         for i, dataset in enumerate(tqdm(dataset_info_list, desc='Sampling data')):
+             if i == len(dataset_info_list) - 1:
+                 dataset_sample_count = remaining_count
+             else:
+                 dataset_sample_count = count // num_datasets
+                 remaining_count -= dataset_sample_count
+
+             sampled_data.extend(self._sample_dataset(dataset, dataset_sample_count))
+
+         return self._update_index(sampled_data)
+
+
+ class StratifiedSampler(Sampler):
+     """
+     Stratified sampler, sample data from each dataset according to the number of samples of each dataset.
+     """
+
+     def sample(self, count: int) -> List[dict]:
+         dataset_info_list = self.schema.flatten()
+
+         total_samples = sum(len(dataset.get_data()) for dataset in dataset_info_list)
+         remaining_count = count
+         sampled_data = []
+
+         for i, dataset in enumerate(tqdm(dataset_info_list, desc='Sampling data')):
+             if i == len(dataset_info_list) - 1:
+                 dataset_sample_count = remaining_count
+             else:
+                 dataset_sample_count = int((len(dataset.get_data()) / total_samples) * count)
+                 remaining_count -= dataset_sample_count
+
+             sampled_data.extend(self._sample_dataset(dataset, dataset_sample_count))
+         return self._update_index(sampled_data)
+
+
+ if __name__ == '__main__':
+     from evalscope.utils.io_utils import dump_jsonl_data
+
+     schema = CollectionSchema.from_json('outputs/schema.json')
+     print(schema.to_dict())
+     mixed_data = WeightedSampler(schema).sample(10)
+     dump_jsonl_data(mixed_data, 'outputs/weighted_mixed_data.jsonl')
+
+     # mixed_data = UniformSampler(schema, 100).sample()
+     # dump_jsonl_data(mixed_data, 'outputs/uniform_mixed_data.jsonl')
+
+     # mixed_data = StratifiedSampler(schema, 100).sample()
+     # dump_jsonl_data(mixed_data, 'outputs/stratified_mixed_data.jsonl')
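All three samplers share the same remainder-handling pattern: every dataset except the last gets a truncated share (int(weight * count), count // num_datasets, or a size-proportional share), and the final dataset absorbs whatever is left, so the totals always sum to count. A small standalone sketch of that arithmetic, mirroring WeightedSampler.sample with example values:

```python
# Standalone illustration of WeightedSampler's allocation loop (values are examples).
count = 10
weights = [0.5, 0.3, 0.2]  # weights already normalized by CollectionSchema.flatten()

remaining = count
allocation = []
for i, w in enumerate(weights):
    # last dataset takes the remainder; others take the truncated share
    n = remaining if i == len(weights) - 1 else int(w * count)
    remaining -= n
    allocation.append(n)

print(allocation)  # [5, 3, 2] -> sums to `count` even when int() truncates
```

Note that _sample_dataset uses random.choices, i.e. sampling with replacement, so a dataset smaller than its allocation still yields the requested number of entries, possibly with duplicates.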
evalscope/collections/schema.py ADDED
@@ -0,0 +1,126 @@
+ import copy
+ import json
+ from dataclasses import asdict, dataclass, field
+ from typing import List, Union
+
+
+ @dataclass
+ class DatasetInfo:
+     name: str
+     weight: float = 1.0  # sample weight in each collection
+     task_type: str = ''
+     tags: List[str] = field(default_factory=list)
+     args: dict = field(default_factory=dict)
+     hierarchy: List[str] = field(default_factory=list)
+
+     def get_data(self) -> dict:
+         from evalscope.benchmarks import Benchmark
+
+         benchmark_meta = Benchmark.get(self.name)
+
+         data_adapter = benchmark_meta.get_data_adapter(config=self.args)
+         data_dict = data_adapter.load(
+             dataset_name_or_path=benchmark_meta.dataset_id, subset_list=benchmark_meta.subset_list)
+         prompts = data_adapter.gen_prompts(data_dict)
+         return prompts
+
+
+ def flatten_weight(collection: 'CollectionSchema', base_weight=1):
+     total_weight = sum(dataset.weight for dataset in collection.datasets)
+     for dataset in collection.datasets:
+         current_weight = dataset.weight / total_weight * base_weight
+         if isinstance(dataset, CollectionSchema):
+             flatten_weight(dataset, current_weight)
+         else:
+             dataset.weight = current_weight
+
+
+ def flatten_name(collection: 'CollectionSchema', parent_names=None):
+     if parent_names is None:
+         parent_names = []
+     current_names = parent_names + [collection.name]
+     for dataset in collection.datasets:
+         if isinstance(dataset, CollectionSchema):
+             flatten_name(dataset, current_names)
+         else:
+             dataset.hierarchy = current_names.copy()
+
+
+ def flatten_datasets(collection: 'CollectionSchema') -> List[DatasetInfo]:
+     flat_datasets = []
+     for dataset in collection.datasets:
+         if isinstance(dataset, CollectionSchema):
+             flat_datasets.extend(flatten_datasets(dataset))
+         else:
+             flat_datasets.append(dataset)
+     return flat_datasets
+
+
+ @dataclass
+ class CollectionSchema:
+     name: str
+     weight: float = 1.0
+     datasets: List[Union[DatasetInfo, 'CollectionSchema']] = field(default_factory=list)
+
+     def __str__(self):
+         return json.dumps(self.to_dict(), ensure_ascii=False, indent=4)
+
+     def to_dict(self):
+         return {
+             'name': self.name,
+             'weight': self.weight,
+             'datasets':
+             [asdict(dataset) if isinstance(dataset, DatasetInfo) else dataset.to_dict() for dataset in self.datasets],
+         }
+
+     @classmethod
+     def from_dict(cls, data):
+         instance = cls(name=data.get('name', ''), weight=data.get('weight', 1))
+         for dataset in data.get('datasets', []):
+             if 'datasets' in dataset:
+                 instance.datasets.append(CollectionSchema.from_dict(dataset))
+             else:
+                 instance.datasets.append(DatasetInfo(**dataset))
+         return instance
+
+     def dump_json(self, file_path):
+         d = self.to_dict()
+         with open(file_path, 'w') as f:
+             json.dump(d, f, ensure_ascii=False, indent=4)
+
+     @classmethod
+     def from_json(cls, file_path):
+         with open(file_path, 'r') as f:
+             data = json.load(f)
+         return cls.from_dict(data)
+
+     def flatten(self) -> List[DatasetInfo]:
+         collection = copy.deepcopy(self)
+         flatten_name(collection)
+         flatten_weight(collection)
+         return flatten_datasets(collection)
+
+
+ if __name__ == '__main__':
+     schema = CollectionSchema(
+         name='reasoning',
+         datasets=[
+             CollectionSchema(name='english', datasets=[
+                 DatasetInfo(name='arc', weight=1, tags=['en']),
+             ]),
+             CollectionSchema(
+                 name='chinese',
+                 datasets=[DatasetInfo(name='ceval', weight=1, tags=['zh'], args={'subset_list': ['logic']})])
+         ])
+     print(schema)
+     print(schema.flatten())
+     schema.dump_json('outputs/schema.json')
+
+     schema = CollectionSchema.from_json('outputs/schema.json')
+     print(schema)
+     # print the flattened result
+     for dataset in schema.flatten():
+         print(f'Dataset: {dataset.name}')
+         print(f"Hierarchy: {' -> '.join(dataset.hierarchy)}")
evalscope/config.py CHANGED
@@ -31,7 +31,7 @@ DEFAULT_GENERATION_CONFIG = {
  @dataclass
  class TaskConfig:
      # Model-related arguments
-     model: Union[str, CustomModel, None] = None
+     model: Union[str, 'CustomModel', None] = None
      model_id: Optional[str] = None
      model_args: Optional[Dict] = field(default_factory=lambda: DEFAULT_MODEL_ARGS | {})

@@ -40,8 +40,8 @@ class TaskConfig:
      chat_template: Optional[str] = None

      # Dataset-related arguments
-     datasets: Optional[List[str]] = None
-     dataset_args: Optional[Dict] = field(default_factory=dict)
+     datasets: List[str] = field(default_factory=list)
+     dataset_args: Dict = field(default_factory=dict)
      dataset_dir: str = DEFAULT_DATASET_CACHE_DIR
      dataset_hub: str = HubType.MODELSCOPE

@@ -64,7 +64,9 @@ class TaskConfig:
      # Debug and runtime mode arguments
      debug: bool = False
      dry_run: bool = False
-     seed: int = 42
+     seed: Optional[int] = 42
+     api_url: Optional[str] = None  # Only used for server model
+     api_key: Optional[str] = 'EMPTY'  # Only used for server model

      def __post_init__(self):
          if (not self.model_id) and self.model:
@@ -74,7 +76,6 @@ class TaskConfig:
              self.model_id = os.path.basename(self.model).rstrip(os.sep)

      def to_dict(self):
-         # Note: to avoid serialization error for some model instance
          return self.__dict__

      def __str__(self):
@@ -130,6 +131,7 @@ class TaskConfig:
                  continue

              task.model = custom_model
+             task.model_args = custom_model.config
              task.model_id = type(custom_model).__name__
              res_list.append(task)

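The new api_url/api_key fields pair with the server adapter added in this release (evalscope/models/server_adapter.py). A hedged sketch of a TaskConfig pointing at an OpenAI-compatible endpoint; the endpoint URL and model name are placeholders, and using these fields together with the service evaluation mode (EvalType.SERVICE, added in constants.py below) is an assumption about intended usage, not confirmed API.

```python
# Hypothetical server-model configuration using the new TaskConfig fields.
from evalscope.config import TaskConfig

task_cfg = TaskConfig(
    model='qwen2.5-7b-instruct',                          # model name served by the endpoint (placeholder)
    api_url='http://127.0.0.1:8000/v1/chat/completions',  # Only used for server model
    api_key='EMPTY',                                      # Only used for server model
    datasets=['gsm8k'],
)
```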
evalscope/constants.py CHANGED
@@ -135,34 +135,17 @@ class EvalStage:
  class EvalType:

      CUSTOM = 'custom'
-     CHECKPOINT = 'checkpoint'
+     CHECKPOINT = 'checkpoint'  # native model checkpoint
+     SERVICE = 'service'  # model service


  class EvalBackend:
+     NATIVE = 'Native'
+     OPEN_COMPASS = 'OpenCompass'
+     VLM_EVAL_KIT = 'VLMEvalKit'
+     RAG_EVAL = 'RAGEval'
+     THIRD_PARTY = 'ThirdParty'

-     class _Backend:
-         # compatible with old version, set 'value'

-         def __init__(self, value):
-             self._value = value
-
-         @property
-         def value(self):
-             return self._value
-
-         def __str__(self):
-             return self._value
-
-         def __repr__(self):
-             return f"'{self._value}'"
-
-         def __eq__(self, other):
-             if isinstance(other, str):
-                 return self._value == other
-             return NotImplemented
-
-     NATIVE = _Backend('Native')
-     OPEN_COMPASS = _Backend('OpenCompass')
-     VLM_EVAL_KIT = _Backend('VLMEvalKit')
-     RAG_EVAL = _Backend('RAGEval')
-     THIRD_PARTY = _Backend('ThirdParty')
+ class DataCollection:
+     NAME = 'data_collection'
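Since EvalBackend members are now plain strings rather than _Backend wrapper objects, string comparison keeps working, but code that relied on the old .value compatibility shim would now use the string directly. A small check based only on the values shown in this hunk:

```python
# Behavior implied by the constants.py change above (values taken from the diff).
from evalscope.constants import DataCollection, EvalBackend

assert EvalBackend.NATIVE == 'Native'            # plain string comparison still holds
assert isinstance(EvalBackend.NATIVE, str)       # no more _Backend wrapper / .value shim
assert DataCollection.NAME == 'data_collection'  # key used by the new collection evaluator
```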