evalscope 0.8.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (147)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +11 -3
  3. evalscope/backend/base.py +1 -1
  4. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  5. evalscope/backend/rag_eval/utils/clip.py +2 -2
  6. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  7. evalscope/backend/rag_eval/utils/llm.py +1 -1
  8. evalscope/benchmarks/__init__.py +20 -1
  9. evalscope/benchmarks/arc/__init__.py +0 -5
  10. evalscope/benchmarks/arc/arc_adapter.py +24 -102
  11. evalscope/benchmarks/bbh/__init__.py +0 -4
  12. evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
  13. evalscope/benchmarks/benchmark.py +70 -59
  14. evalscope/benchmarks/ceval/__init__.py +0 -5
  15. evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
  16. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  17. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
  18. evalscope/benchmarks/competition_math/__init__.py +0 -5
  19. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  20. evalscope/benchmarks/data_adapter.py +115 -87
  21. evalscope/benchmarks/general_qa/__init__.py +0 -5
  22. evalscope/benchmarks/general_qa/general_qa_adapter.py +24 -80
  23. evalscope/benchmarks/gpqa/__init__.py +0 -0
  24. evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
  25. evalscope/benchmarks/gpqa/gpqa_adapter.py +103 -0
  26. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  27. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +22 -101
  28. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  29. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +33 -99
  30. evalscope/benchmarks/humaneval/__init__.py +0 -4
  31. evalscope/benchmarks/humaneval/humaneval_adapter.py +93 -9
  32. evalscope/benchmarks/ifeval/__init__.py +0 -0
  33. evalscope/benchmarks/ifeval/ifeval_adapter.py +56 -0
  34. evalscope/benchmarks/ifeval/instructions.py +1477 -0
  35. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  36. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  37. evalscope/benchmarks/ifeval/utils.py +134 -0
  38. evalscope/benchmarks/iquiz/__init__.py +0 -0
  39. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  40. evalscope/benchmarks/mmlu/__init__.py +0 -5
  41. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
  42. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  43. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  44. evalscope/benchmarks/race/__init__.py +0 -5
  45. evalscope/benchmarks/race/race_adapter.py +27 -123
  46. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  47. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
  48. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  49. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
  50. evalscope/cli/cli.py +2 -0
  51. evalscope/cli/start_app.py +30 -0
  52. evalscope/collections/__init__.py +3 -0
  53. evalscope/collections/evaluator.py +198 -0
  54. evalscope/collections/sampler.py +138 -0
  55. evalscope/collections/schema.py +126 -0
  56. evalscope/config.py +45 -7
  57. evalscope/constants.py +7 -38
  58. evalscope/evaluator/__init__.py +0 -1
  59. evalscope/evaluator/evaluator.py +89 -121
  60. evalscope/evaluator/rating_eval.py +1 -1
  61. evalscope/evaluator/reviewer/auto_reviewer.py +14 -5
  62. evalscope/metrics/__init__.py +3 -0
  63. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  64. evalscope/metrics/math_accuracy.py +193 -50
  65. evalscope/metrics/metrics.py +18 -6
  66. evalscope/metrics/named_metrics.py +17 -0
  67. evalscope/metrics/rouge_metric.py +13 -8
  68. evalscope/models/__init__.py +14 -1
  69. evalscope/models/base_adapter.py +52 -0
  70. evalscope/models/chat_adapter.py +140 -0
  71. evalscope/models/choice_adapter.py +211 -0
  72. evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +1 -1
  73. evalscope/models/custom_adapter.py +67 -0
  74. evalscope/models/local_model.py +74 -0
  75. evalscope/models/model.py +141 -0
  76. evalscope/models/server_adapter.py +111 -0
  77. evalscope/perf/__init__.py +1 -0
  78. evalscope/perf/arguments.py +3 -1
  79. evalscope/perf/benchmark.py +3 -3
  80. evalscope/perf/main.py +5 -7
  81. evalscope/perf/plugin/api/custom_api.py +1 -1
  82. evalscope/perf/plugin/api/openai_api.py +54 -50
  83. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  84. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  85. evalscope/perf/plugin/registry.py +3 -3
  86. evalscope/perf/utils/benchmark_util.py +4 -4
  87. evalscope/perf/utils/db_util.py +66 -22
  88. evalscope/perf/utils/local_server.py +4 -1
  89. evalscope/report/__init__.py +5 -0
  90. evalscope/report/app.py +693 -0
  91. evalscope/report/combinator.py +73 -0
  92. evalscope/report/generator.py +80 -0
  93. evalscope/report/utils.py +133 -0
  94. evalscope/run.py +64 -125
  95. evalscope/run_arena.py +3 -2
  96. evalscope/summarizer.py +15 -27
  97. evalscope/third_party/longbench_write/eval.py +2 -1
  98. evalscope/third_party/longbench_write/longbench_write.py +2 -1
  99. evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
  100. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  101. evalscope/utils/__init__.py +1 -0
  102. evalscope/utils/chat_service.py +6 -5
  103. evalscope/utils/io_utils.py +170 -0
  104. evalscope/utils/logger.py +13 -0
  105. evalscope/utils/model_utils.py +15 -2
  106. evalscope/utils/utils.py +3 -200
  107. evalscope/version.py +2 -2
  108. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/METADATA +129 -23
  109. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/RECORD +119 -115
  110. tests/cli/test_collection.py +57 -0
  111. tests/cli/test_run.py +57 -7
  112. tests/perf/test_perf.py +3 -2
  113. tests/rag/test_mteb.py +3 -2
  114. tests/vlm/test_vlmeval.py +3 -2
  115. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
  116. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
  117. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
  118. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
  119. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
  120. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
  121. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
  122. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
  123. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
  124. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  125. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  126. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  127. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  128. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
  129. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
  130. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
  131. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
  132. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  133. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
  134. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
  135. evalscope/evaluator/humaneval_evaluator.py +0 -158
  136. evalscope/models/api/__init__.py +0 -3
  137. evalscope/models/dummy_chat_model.py +0 -49
  138. evalscope/models/model_adapter.py +0 -525
  139. evalscope/models/openai_model.py +0 -103
  140. evalscope/tools/__init__.py +0 -1
  141. evalscope/tools/combine_reports.py +0 -135
  142. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  143. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  144. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/LICENSE +0 -0
  145. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/WHEEL +0 -0
  146. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/entry_points.txt +0 -0
  147. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/top_level.txt +0 -0
evalscope/collections/evaluator.py ADDED
@@ -0,0 +1,198 @@
+ import json
+ import os
+ import pandas as pd
+ from collections import defaultdict
+ from tabulate import tabulate
+ from tqdm import tqdm
+ from typing import List
+
+ from evalscope.benchmarks import Benchmark
+ from evalscope.collections.sampler import DatasetEntry
+ from evalscope.config import TaskConfig
+ from evalscope.constants import DataCollection, DumpMode
+ from evalscope.evaluator import Evaluator
+ from evalscope.models import get_local_model, initialize_model_adapter
+ from evalscope.report import ReportGenerator
+ from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ class SimpleEvaluator(Evaluator):
+
+     def __init__(self, dataset_name, data_adapter, model_adapter, task_cfg, outputs):
+         super().__init__(
+             dataset_name_or_path=dataset_name,
+             data_adapter=data_adapter,
+             model_adapter=model_adapter,
+             task_cfg=task_cfg,
+             outputs=outputs)
+
+     def get_answer(self, input_prompt, subset_name, infer_cfg) -> dict:
+         answer_d: dict = self.model_adapter.predict(inputs=input_prompt, infer_cfg=infer_cfg)
+         answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
+         processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
+         return processed_answer
+
+     def get_review(self, answer_d) -> dict:
+         review_id, reviewer_spec = self._generate_review_id(answer_d)
+         review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
+         return review_d
+
+     def get_score(self, review_d) -> float:
+         metric_score: List[dict] = self.compute_metrics(reviews_list=[review_d])
+         # use the first metric by default
+         score = metric_score[0]['score']
+         return score
+
+
+ class EvaluatorCollection:
+
+     def __init__(self, task_cfg: TaskConfig, outputs: OutputsStructure):
+         self.task_cfg = task_cfg
+         self.outputs = outputs
+         self.model = get_local_model(task_cfg)
+         self.dataset, self.dataset_name = self.load()
+         self.dataset_name_map, self.dataset_id_map = self._parse_dataset()
+         self.evaluators = self._initialize_evaluators()
+
+     def load(self) -> tuple[list[DatasetEntry], str]:
+         dataset_path = self.task_cfg.dataset_args[DataCollection.NAME]['local_path']
+         dataset_name = os.path.basename(dataset_path).split('.')[0]
+         raw_dataset = jsonl_to_list(dataset_path)
+         datasets = []
+         for sample in raw_dataset:
+             datasets.append(DatasetEntry(**sample))
+         return datasets, dataset_name
+
+     def _parse_dataset(self):
+         dataset_name_map = defaultdict(lambda: defaultdict(list))
+         dataset_id_map = {}
+         for sample in self.dataset:
+             dataset_name, subset_name = sample.dataset_name, sample.subset_name
+             dataset_name_map[dataset_name][subset_name].append(sample.index)
+             dataset_id_map[sample.index] = sample
+         return dataset_name_map, dataset_id_map
+
+     def _initialize_evaluators(self):
+         evaluators = {}
+         for dataset_name in self.dataset_name_map.keys():
+             benchmark = Benchmark.get(dataset_name)
+             data_adapter = benchmark.get_data_adapter()
+             model_adapter = initialize_model_adapter(self.task_cfg, benchmark.model_adapter, self.model)
+             evaluators[dataset_name] = SimpleEvaluator(dataset_name, data_adapter, model_adapter, self.task_cfg,
+                                                        self.outputs)
+         return evaluators
+
+     def get_report(self, scores):
+
+         def get_dataframe(scores):
+             data = []
+             for dataset_name, data_map in self.dataset_name_map.items():
+                 for subset_name, ids in data_map.items():
+                     for _id in ids:
+                         row_data: DatasetEntry = self.dataset_id_map[_id]
+                         score = scores[_id]
+                         data.append(
+                             dict(
+                                 task_type=row_data.task_type,
+                                 categories=tuple(row_data.categories),
+                                 dataset_name=dataset_name,
+                                 subset_name=subset_name,
+                                 tags=row_data.tags,
+                                 score=score))
+             return pd.DataFrame(data)
+
+         def aggregate_and_sort(df, group_by_cols):
+             # aggregate by group_by_cols, and calculate average_score and count
+             report_df = df.groupby(group_by_cols) \
+                 .agg(average_score=('score', 'mean'), count=('score', 'size')) \
+                 .reset_index()
+             report_df['average_score'] = report_df['average_score'].round(4)
+             report_df = report_df.sort_values(by='count', ascending=False) \
+                 .to_dict(orient='records')
+             return report_df
+
+         df = get_dataframe(scores)
+
+         # multi-level aggregation
+         subset_report_df = aggregate_and_sort(df, ['task_type', 'dataset_name', 'subset_name'])
+         dataset_report_df = aggregate_and_sort(df, ['task_type', 'dataset_name'])
+         task_report_df = aggregate_and_sort(df, ['task_type'])
+
+         # explode tags to multiple rows
+         df_exploded_tags = df.explode('tags')
+         tag_report_df = aggregate_and_sort(df_exploded_tags, ['tags'])
+
+         # process multi-level categories
+         df_categories = df.copy()
+         # multi-level aggregation for categories
+         max_depth = df_categories['categories'].apply(len).max()
+         for level in range(max_depth):
+             df_categories[f'category{level}'] = df_categories['categories'].apply(lambda x: x[level]
+                                                                                   if len(x) > level else '')
+         category_report_df = aggregate_and_sort(df_categories, [f'category{level}' for level in range(max_depth)])
+
+         # convert to dict format
+         report_dict = {
+             'subset_level': subset_report_df,
+             'dataset_level': dataset_report_df,
+             'task_level': task_report_df,
+             'tag_level': tag_report_df,
+             'category_level': category_report_df,
+         }
+
+         # record report
+         for level, data in report_dict.items():
+             table = tabulate(data, headers='keys', tablefmt='pretty', showindex=False)
+             logger.info(f'{level} Report:\n{table}')
+
+         report = ReportGenerator.gen_collection_report(df, self.dataset_name, self.task_cfg.model_id)
+         # save report to JSON file
+         report_file_path = os.path.join(self.outputs.reports_dir, self.task_cfg.model_id, f'{self.dataset_name}.json')
+         os.makedirs(os.path.dirname(report_file_path), exist_ok=True)
+         with open(report_file_path, 'w', encoding='utf-8') as f:
+             json.dump(report.to_dict(), f, ensure_ascii=False, indent=4)
+
+     def get_answers(self):
+         pred_file_path = os.path.join(self.outputs.predictions_dir, self.task_cfg.model_id,
+                                       f'{self.dataset_name}.jsonl')
+         os.makedirs(os.path.dirname(pred_file_path), exist_ok=True)
+         answers = defaultdict(dict)
+         for sample in tqdm(self.dataset, desc='Getting answers'):
+             evaluator = self.evaluators[sample.dataset_name]
+             answer_d = evaluator.get_answer(sample.prompt, sample.subset_name, self.task_cfg.generation_config)
+             answers[sample.index] = answer_d
+             dump_jsonl_data(answer_d, pred_file_path, dump_mode=DumpMode.APPEND)
+         return answers
+
+     def get_reviews(self, answers):
+         review_file_path = os.path.join(self.outputs.reviews_dir, self.task_cfg.model_id)
+         os.makedirs(review_file_path, exist_ok=True)
+         reviews = defaultdict(dict)
+         for sample in tqdm(self.dataset, desc='Getting reviews'):
+             evaluator = self.evaluators[sample.dataset_name]
+             review_d = evaluator.get_review(answers[sample.index])
+             reviews[sample.index] = review_d
+             dump_jsonl_data(
+                 review_d,
+                 os.path.join(review_file_path, f'{self.dataset_name}_{sample.dataset_name}_{sample.subset_name}.jsonl'),
+                 dump_mode=DumpMode.APPEND)
+         return reviews
+
+     def get_scores(self, reviews) -> float:
+         scores = defaultdict(dict)
+         for sample in tqdm(self.dataset, desc='Getting scores'):
+             evaluator = self.evaluators[sample.dataset_name]
+             review_d = reviews[sample.index]
+             score = evaluator.get_score(review_d)
+             scores[sample.index] = score
+
+         return scores
+
+     def eval(self, **kwargs):
+         answers = self.get_answers()
+         reviews = self.get_reviews(answers)
+         scores = self.get_scores(reviews)
+         self.get_report(scores)
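For orientation, a minimal usage sketch of the new collection evaluator, assuming a mixed-data JSONL produced by one of the samplers added below; the model id and output paths are placeholders, not values taken from this release:

from evalscope.collections.evaluator import EvaluatorCollection
from evalscope.config import TaskConfig
from evalscope.constants import DataCollection
from evalscope.utils.io_utils import OutputsStructure

# Placeholder model and paths; dataset_args maps DataCollection.NAME to the
# sampled JSONL file, as EvaluatorCollection.load() above expects.
task_cfg = TaskConfig(
    model='Qwen/Qwen2-0.5B-Instruct',
    datasets=[DataCollection.NAME],
    dataset_args={DataCollection.NAME: {'local_path': 'outputs/weighted_mixed_data.jsonl'}},
)
outputs = OutputsStructure(outputs_dir='outputs/collection_demo')  # assumes the relocated class keeps the old signature

# Runs answers -> reviews -> scores -> report, as defined in eval() above.
EvaluatorCollection(task_cfg, outputs).eval()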
evalscope/collections/sampler.py ADDED
@@ -0,0 +1,138 @@
+ import random
+ from abc import ABC, abstractmethod
+ from dataclasses import asdict, dataclass, field
+ from tqdm import tqdm
+ from typing import List, Optional
+
+ from evalscope.collections.schema import CollectionSchema, DatasetInfo
+
+
+ @dataclass
+ class DatasetEntry:
+     index: int = 0
+     prompt: dict = field(default_factory=dict)
+     tags: List[str] = field(default_factory=list)
+     categories: List[str] = field(default_factory=list)
+     task_type: str = ''
+     weight: float = 0.0
+     dataset_name: str = ''
+     subset_name: str = ''
+
+
+ # Define an abstract base class for Samplers
+ class Sampler(ABC):
+
+     def __init__(self, schema: CollectionSchema):
+         self.schema = schema
+
+     @abstractmethod
+     def sample(self) -> List[dict]:
+         raise NotImplementedError
+
+     def _sample_dataset(self, dataset: DatasetInfo, count: int) -> List[DatasetEntry]:
+         all_data = []
+         data_dict = dataset.get_data()
+         for subset_name, subset_data in data_dict.items():
+             for prompt in subset_data:
+                 all_data.append(
+                     DatasetEntry(
+                         prompt=prompt,
+                         tags=dataset.tags,
+                         categories=dataset.hierarchy,
+                         task_type=dataset.task_type,
+                         weight=dataset.weight,
+                         dataset_name=dataset.name,
+                         subset_name=subset_name,
+                     ))
+         sampled_data = random.choices(all_data, k=count)
+         return sampled_data
+
+     def _update_index(self, all_data: List[DatasetEntry]) -> List[dict]:
+         result = []
+         for i, entry in enumerate(all_data):
+             entry.index = i
+             result.append(asdict(entry))
+         return result
+
+
+ class WeightedSampler(Sampler):
+     """
+     Weighted sampler, according to the weight of each dataset, sample data from each dataset.
+     """
+
+     def sample(self, count: int) -> List[dict]:
+         dataset_info_list = self.schema.flatten()
+         sampled_data = []
+         remaining_count = count
+
+         for i, dataset in enumerate(tqdm(dataset_info_list, desc='Sampling data')):
+             if i == len(dataset_info_list) - 1:
+                 dataset_sample_count = remaining_count
+             else:
+                 dataset_sample_count = int(dataset.weight * count)
+                 remaining_count -= dataset_sample_count
+
+             sampled_data.extend(self._sample_dataset(dataset, dataset_sample_count))
+
+         return self._update_index(sampled_data)
+
+
+ class UniformSampler(Sampler):
+     """
+     Uniform sampler, sample data from each dataset with the same number of samples.
+     """
+
+     def sample(self, count: int) -> List[dict]:
+         dataset_info_list = self.schema.flatten()
+         num_datasets = len(dataset_info_list)
+         remaining_count = count
+         sampled_data = []
+
+         for i, dataset in enumerate(tqdm(dataset_info_list, desc='Sampling data')):
+             if i == len(dataset_info_list) - 1:
+                 dataset_sample_count = remaining_count
+             else:
+                 dataset_sample_count = count // num_datasets
+                 remaining_count -= dataset_sample_count
+
+             sampled_data.extend(self._sample_dataset(dataset, dataset_sample_count))
+
+         return self._update_index(sampled_data)
+
+
+ class StratifiedSampler(Sampler):
+     """
+     Stratified sampler, sample data from each dataset according to the number of samples of each dataset.
+     """
+
+     def sample(self, count: int) -> List[dict]:
+         dataset_info_list = self.schema.flatten()
+
+         total_samples = sum(len(dataset.get_data()) for dataset in dataset_info_list)
+         remaining_count = count
+         sampled_data = []
+
+         for i, dataset in enumerate(tqdm(dataset_info_list, desc='Sampling data')):
+             if i == len(dataset_info_list) - 1:
+                 dataset_sample_count = remaining_count
+             else:
+                 dataset_sample_count = int((len(dataset.get_data()) / total_samples) * count)
+                 remaining_count -= dataset_sample_count
+
+             sampled_data.extend(self._sample_dataset(dataset, dataset_sample_count))
+         return self._update_index(sampled_data)
+
+
+ if __name__ == '__main__':
+     from evalscope.utils.io_utils import dump_jsonl_data
+
+     schema = CollectionSchema.from_json('outputs/schema.json')
+     print(schema.to_dict())
+     mixed_data = WeightedSampler(schema).sample(10)
+     dump_jsonl_data(mixed_data, 'outputs/weighted_mixed_data.jsonl')
+
+     # mixed_data = UniformSampler(schema, 100).sample()
+     # dump_jsonl_data(mixed_data, 'outputs/uniform_mixed_data.jsonl')
+
+     # mixed_data = StratifiedSampler(schema, 100).sample()
+     # dump_jsonl_data(mixed_data, 'outputs/stratified_mixed_data.jsonl')
evalscope/collections/schema.py ADDED
@@ -0,0 +1,126 @@
+ import copy
+ import json
+ from dataclasses import asdict, dataclass, field
+ from typing import List, Union
+
+
+ @dataclass
+ class DatasetInfo:
+     name: str
+     weight: float = 1.0  # sample weight in each collection
+     task_type: str = ''
+     tags: List[str] = field(default_factory=list)
+     args: dict = field(default_factory=dict)
+     hierarchy: List[str] = field(default_factory=list)
+
+     def get_data(self) -> dict:
+         from evalscope.benchmarks import Benchmark
+
+         benchmark_meta = Benchmark.get(self.name)
+
+         data_adapter = benchmark_meta.get_data_adapter(config=self.args)
+         data_dict = data_adapter.load(
+             dataset_name_or_path=benchmark_meta.dataset_id, subset_list=benchmark_meta.subset_list)
+         prompts = data_adapter.gen_prompts(data_dict)
+         return prompts
+
+
+ def flatten_weight(collection: 'CollectionSchema', base_weight=1):
+     total_weight = sum(dataset.weight for dataset in collection.datasets)
+     for dataset in collection.datasets:
+         current_weight = dataset.weight / total_weight * base_weight
+         if isinstance(dataset, CollectionSchema):
+             flatten_weight(dataset, current_weight)
+         else:
+             dataset.weight = current_weight
+
+
+ def flatten_name(collection: 'CollectionSchema', parent_names=None):
+     if parent_names is None:
+         parent_names = []
+     current_names = parent_names + [collection.name]
+     for dataset in collection.datasets:
+         if isinstance(dataset, CollectionSchema):
+             flatten_name(dataset, current_names)
+         else:
+             dataset.hierarchy = current_names.copy()
+
+
+ def flatten_datasets(collection: 'CollectionSchema') -> List[DatasetInfo]:
+     flat_datasets = []
+     for dataset in collection.datasets:
+         if isinstance(dataset, CollectionSchema):
+             flat_datasets.extend(flatten_datasets(dataset))
+         else:
+             flat_datasets.append(dataset)
+     return flat_datasets
+
+
+ @dataclass
+ class CollectionSchema:
+     name: str
+     weight: float = 1.0
+     datasets: List[Union[DatasetInfo, 'CollectionSchema']] = field(default_factory=list)
+
+     def __str__(self):
+         return json.dumps(self.to_dict(), ensure_ascii=False, indent=4)
+
+     def to_dict(self):
+         return {
+             'name':
+             self.name,
+             'weight':
+             self.weight,
+             'datasets':
+             [asdict(dataset) if isinstance(dataset, DatasetInfo) else dataset.to_dict() for dataset in self.datasets],
+         }
+
+     @classmethod
+     def from_dict(cls, data):
+         instance = cls(name=data.get('name', ''), weight=data.get('weight', 1))
+         for dataset in data.get('datasets', []):
+             if 'datasets' in dataset:
+                 instance.datasets.append(CollectionSchema.from_dict(dataset))
+             else:
+                 instance.datasets.append(DatasetInfo(**dataset))
+         return instance
+
+     def dump_json(self, file_path):
+         d = self.to_dict()
+         with open(file_path, 'w') as f:
+             json.dump(d, f, ensure_ascii=False, indent=4)
+
+     @classmethod
+     def from_json(cls, file_path):
+         with open(file_path, 'r') as f:
+             data = json.load(f)
+         return cls.from_dict(data)
+
+     def flatten(self) -> List[DatasetInfo]:
+         collection = copy.deepcopy(self)
+         flatten_name(collection)
+         flatten_weight(collection)
+         return flatten_datasets(collection)
+
+
+ if __name__ == '__main__':
+     schema = CollectionSchema(
+         name='reasoning',
+         datasets=[
+             CollectionSchema(name='english', datasets=[
+                 DatasetInfo(name='arc', weight=1, tags=['en']),
+             ]),
+             CollectionSchema(
+                 name='chinese',
+                 datasets=[DatasetInfo(name='ceval', weight=1, tags=['zh'], args={'subset_list': ['logic']})])
+         ])
+     print(schema)
+     print(schema.flatten())
+     schema.dump_json('outputs/schema.json')
+
+     schema = CollectionSchema.from_json('outputs/schema.json')
+     print(schema)
+     # print the flattened result
+     for dataset in schema.flatten():
+         print(f'Dataset: {dataset.name}')
+         print(f"Hierarchy: {' -> '.join(dataset.hierarchy)}")
evalscope/config.py CHANGED
@@ -9,7 +9,8 @@ from typing import Dict, List, Optional, Union
  
  from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, DEFAULT_WORK_DIR, EvalBackend, EvalStage, EvalType, HubType
  from evalscope.models.custom import CustomModel
- from evalscope.utils import dict_to_yaml, gen_hash, json_to_dict, yaml_to_dict
+ from evalscope.utils import gen_hash
+ from evalscope.utils.io_utils import dict_to_yaml, json_to_dict, yaml_to_dict
  from evalscope.utils.logger import get_logger
  
  logger = get_logger()
@@ -30,7 +31,8 @@ DEFAULT_GENERATION_CONFIG = {
  @dataclass
  class TaskConfig:
      # Model-related arguments
-     model: Union[str, CustomModel, None] = None
+     model: Union[str, 'CustomModel', None] = None
+     model_id: Optional[str] = None
      model_args: Optional[Dict] = field(default_factory=lambda: DEFAULT_MODEL_ARGS | {})
  
      # Template-related arguments
@@ -38,8 +40,8 @@ class TaskConfig:
      chat_template: Optional[str] = None
  
      # Dataset-related arguments
-     datasets: Optional[List[str]] = None
-     dataset_args: Optional[Dict] = field(default_factory=dict)
+     datasets: List[str] = field(default_factory=list)
+     dataset_args: Dict = field(default_factory=dict)
      dataset_dir: str = DEFAULT_DATASET_CACHE_DIR
      dataset_hub: str = HubType.MODELSCOPE
  
@@ -62,10 +64,18 @@ class TaskConfig:
      # Debug and runtime mode arguments
      debug: bool = False
      dry_run: bool = False
-     seed: int = 42
+     seed: Optional[int] = 42
+     api_url: Optional[str] = None  # Only used for server model
+     api_key: Optional[str] = 'EMPTY'  # Only used for server model
+
+     def __post_init__(self):
+         if (not self.model_id) and self.model:
+             if isinstance(self.model, CustomModel):
+                 self.model_id = type(self.model).__name__
+             else:
+                 self.model_id = os.path.basename(self.model).rstrip(os.sep)
  
      def to_dict(self):
-         # Note: to avoid serialization error for some model instance
          return self.__dict__
  
      def __str__(self):
@@ -105,7 +115,9 @@ class TaskConfig:
      def from_args(args: Namespace):
          # Convert Namespace to a dictionary and filter out None values
          args_dict = {k: v for k, v in vars(args).items() if v is not None}
-         del args_dict['func']  # Note: compat CLI arguments
+
+         if 'func' in args_dict:
+             del args_dict['func']  # Note: compat CLI arguments
  
          return TaskConfig.from_dict(args_dict)
  
@@ -119,6 +131,8 @@ class TaskConfig:
                  continue
  
              task.model = custom_model
+             task.model_args = custom_model.config
+             task.model_id = type(custom_model).__name__
              res_list.append(task)
  
          return res_list
@@ -168,6 +182,30 @@ tasks = ['arc', 'gsm8k', 'mmlu', 'cmmlu', 'ceval', 'bbh', 'general_qa']
  registry_tasks = {task: TaskConfig.from_yaml(os.path.join(cur_path, f'registry/tasks/{task}.yaml')) for task in tasks}
  
  
+ def parse_task_config(task_cfg) -> TaskConfig:
+     """Parse task configuration from various formats into a TaskConfig object."""
+     if isinstance(task_cfg, TaskConfig):
+         logger.info('Args: Task config is provided with TaskConfig type.')
+     elif isinstance(task_cfg, dict):
+         logger.info('Args: Task config is provided with dictionary type.')
+         task_cfg = TaskConfig.from_dict(task_cfg)
+     elif isinstance(task_cfg, Namespace):
+         logger.info('Args: Task config is provided with CommandLine type.')
+         task_cfg = TaskConfig.from_args(task_cfg)
+     elif isinstance(task_cfg, str):
+         extension = task_cfg.split('.')[-1]
+         logger.info(f'Args: Task config is provided with {extension} file type.')
+         if extension in ['yaml', 'yml']:
+             task_cfg = TaskConfig.from_yaml(task_cfg)
+         elif extension == 'json':
+             task_cfg = TaskConfig.from_json(task_cfg)
+         else:
+             raise ValueError('Args: Unsupported file extension.')
+     else:
+         raise ValueError('Args: Please provide a valid task config.')
+     return task_cfg
+
+
  class TempModel(CustomModel):
  
      def __init__(self, config: dict):
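A short sketch of how the new parse_task_config helper might be invoked; the model and dataset names are placeholders and the file path is hypothetical:

from argparse import Namespace
from evalscope.config import TaskConfig, parse_task_config

# Dict input: keys mirror TaskConfig fields.
cfg_from_dict = parse_task_config({'model': 'Qwen/Qwen2-0.5B-Instruct', 'datasets': ['gsm8k']})

# Namespace input: TaskConfig.from_args drops None values and any 'func' entry.
cfg_from_args = parse_task_config(Namespace(model='Qwen/Qwen2-0.5B-Instruct', datasets=['gsm8k']))

# String input: the file extension selects the YAML or JSON loader.
# cfg_from_file = parse_task_config('task.yaml')  # hypothetical path

assert isinstance(cfg_from_dict, TaskConfig) and isinstance(cfg_from_args, TaskConfig)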
evalscope/constants.py CHANGED
@@ -1,5 +1,4 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
- import os
  from modelscope.utils.constant import DEFAULT_REPOSITORY_REVISION
  from modelscope.utils.file_utils import get_dataset_cache_root, get_model_cache_root
  
@@ -7,6 +6,7 @@ DEFAULT_WORK_DIR = './outputs'
  DEFAULT_MODEL_REVISION = DEFAULT_REPOSITORY_REVISION  # master
  DEFAULT_MODEL_CACHE_DIR = get_model_cache_root()  # ~/.cache/modelscope/hub
  DEFAULT_DATASET_CACHE_DIR = get_dataset_cache_root()  # ~/.cache/modelscope/datasets
+ DEFAULT_ROOT_CACHE_DIR = DEFAULT_DATASET_CACHE_DIR  # compatible with old version
  
  
  class HubType:
@@ -76,33 +76,6 @@ class ArenaMode:
      PAIRWISE_BASELINE = 'pairwise_baseline'
  
  
- class OutputsStructure:
-     LOGS_DIR = 'logs'
-     PREDICTIONS_DIR = 'predictions'
-     REVIEWS_DIR = 'reviews'
-     REPORTS_DIR = 'reports'
-     CONFIGS_DIR = 'configs'
-
-     def __init__(self, outputs_dir: str, is_make: bool = True):
-         self.outputs_dir = outputs_dir
-         self.logs_dir = os.path.join(outputs_dir, OutputsStructure.LOGS_DIR)
-         self.predictions_dir = os.path.join(outputs_dir, OutputsStructure.PREDICTIONS_DIR)
-         self.reviews_dir = os.path.join(outputs_dir, OutputsStructure.REVIEWS_DIR)
-         self.reports_dir = os.path.join(outputs_dir, OutputsStructure.REPORTS_DIR)
-         self.configs_dir = os.path.join(outputs_dir, OutputsStructure.CONFIGS_DIR)
-
-         if is_make:
-             self.create_directories()
-
-     def create_directories(self):
-         os.makedirs(self.outputs_dir, exist_ok=True)
-         os.makedirs(self.logs_dir, exist_ok=True)
-         os.makedirs(self.predictions_dir, exist_ok=True)
-         os.makedirs(self.reviews_dir, exist_ok=True)
-         os.makedirs(self.reports_dir, exist_ok=True)
-         os.makedirs(self.configs_dir, exist_ok=True)
-
-
  class AnswerKeys:
      ANSWER_ID = 'answer_id'
      RAW_INPUT = 'raw_input'
@@ -162,21 +135,17 @@ class EvalStage:
  class EvalType:
  
      CUSTOM = 'custom'
-     CHECKPOINT = 'checkpoint'
+     CHECKPOINT = 'checkpoint'  # native model checkpoint
+     SERVICE = 'service'  # model service
  
  
  class EvalBackend:
-     # Use native evaluation pipeline of EvalScope
      NATIVE = 'Native'
-
-     # Use OpenCompass framework as the evaluation backend
      OPEN_COMPASS = 'OpenCompass'
-
-     # Use VLM Eval Kit as the multi-modal model evaluation backend
      VLM_EVAL_KIT = 'VLMEvalKit'
-
-     # Use RAGEval as the RAG evaluation backend
      RAG_EVAL = 'RAGEval'
-
-     # Use third-party evaluation backend/modules
      THIRD_PARTY = 'ThirdParty'
+
+
+ class DataCollection:
+     NAME = 'data_collection'
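The new EvalType.SERVICE constant pairs with the api_url/api_key fields added to TaskConfig above. A hedged sketch of a server-backed configuration; the endpoint, key, and model name are placeholders, and the eval_type field is assumed to accept the new constant:

from evalscope.config import TaskConfig
from evalscope.constants import EvalType

task_cfg = TaskConfig(
    model='my-served-model',                               # placeholder served-model name
    datasets=['gsm8k'],
    api_url='http://127.0.0.1:8000/v1/chat/completions',   # placeholder OpenAI-compatible endpoint
    api_key='EMPTY',
    eval_type=EvalType.SERVICE,                            # assumption: TaskConfig exposes an eval_type field
)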
evalscope/evaluator/__init__.py CHANGED
@@ -1,4 +1,3 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
  
  from evalscope.evaluator.evaluator import Evaluator
- from evalscope.evaluator.humaneval_evaluator import HumanevalEvaluator