evalscope 0.16.2__py3-none-any.whl → 0.17.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Files changed (117)
  1. evalscope/app/app.py +9 -762
  2. evalscope/app/constants.py +1 -0
  3. evalscope/app/ui/__init__.py +20 -0
  4. evalscope/app/ui/app_ui.py +52 -0
  5. evalscope/app/ui/multi_model.py +323 -0
  6. evalscope/app/ui/sidebar.py +42 -0
  7. evalscope/app/ui/single_model.py +202 -0
  8. evalscope/app/ui/visualization.py +36 -0
  9. evalscope/app/utils/data_utils.py +178 -0
  10. evalscope/app/utils/localization.py +221 -0
  11. evalscope/app/utils/text_utils.py +119 -0
  12. evalscope/app/utils/visualization.py +91 -0
  13. evalscope/backend/opencompass/backend_manager.py +2 -1
  14. evalscope/backend/rag_eval/backend_manager.py +2 -1
  15. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  16. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
  17. evalscope/benchmarks/__init__.py +15 -1
  18. evalscope/benchmarks/aime/aime24_adapter.py +2 -1
  19. evalscope/benchmarks/aime/aime25_adapter.py +2 -1
  20. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
  21. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  22. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
  23. evalscope/benchmarks/arena_hard/utils.py +0 -12
  24. evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
  25. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
  26. evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
  27. evalscope/benchmarks/data_adapter.py +20 -5
  28. evalscope/benchmarks/general_arena/__init__.py +0 -0
  29. evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
  30. evalscope/benchmarks/general_arena/utils.py +226 -0
  31. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  32. evalscope/benchmarks/general_qa/general_qa_adapter.py +42 -29
  33. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  34. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
  35. evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
  36. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
  37. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
  38. evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
  39. evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
  40. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  41. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  42. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  43. evalscope/benchmarks/race/race_adapter.py +1 -1
  44. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
  45. evalscope/benchmarks/utils.py +1 -2
  46. evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
  47. evalscope/config.py +8 -123
  48. evalscope/evaluator/evaluator.py +15 -12
  49. evalscope/metrics/__init__.py +6 -0
  50. evalscope/{utils/utils.py → metrics/completion_parsers.py} +68 -180
  51. evalscope/metrics/llm_judge.py +105 -20
  52. evalscope/metrics/metrics.py +1 -1
  53. evalscope/models/adapters/base_adapter.py +0 -2
  54. evalscope/models/adapters/server_adapter.py +2 -2
  55. evalscope/models/custom/dummy_model.py +3 -3
  56. evalscope/perf/arguments.py +2 -16
  57. evalscope/perf/main.py +1 -1
  58. evalscope/perf/utils/analysis_result.py +24 -23
  59. evalscope/perf/utils/benchmark_util.py +1 -1
  60. evalscope/report/__init__.py +1 -1
  61. evalscope/report/utils.py +34 -15
  62. evalscope/run.py +1 -1
  63. evalscope/summarizer.py +1 -2
  64. evalscope/utils/__init__.py +63 -2
  65. evalscope/utils/argument_utils.py +64 -0
  66. evalscope/utils/import_utils.py +16 -0
  67. evalscope/utils/io_utils.py +45 -4
  68. evalscope/utils/model_utils.py +37 -1
  69. evalscope/version.py +2 -2
  70. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/METADATA +55 -26
  71. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/RECORD +90 -101
  72. tests/aigc/test_t2i.py +1 -1
  73. tests/cli/test_all.py +50 -2
  74. tests/cli/test_collection.py +1 -1
  75. tests/cli/test_custom.py +261 -0
  76. tests/cli/test_run.py +13 -37
  77. tests/perf/test_perf.py +2 -2
  78. tests/rag/test_clip_benchmark.py +2 -1
  79. tests/rag/test_mteb.py +3 -1
  80. tests/rag/test_ragas.py +3 -1
  81. tests/swift/test_run_swift_eval.py +2 -1
  82. tests/swift/test_run_swift_vlm_eval.py +2 -1
  83. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
  84. tests/utils.py +13 -0
  85. tests/vlm/test_vlmeval.py +8 -2
  86. evalscope/evaluator/rating_eval.py +0 -157
  87. evalscope/evaluator/reviewer/__init__.py +0 -1
  88. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  89. evalscope/registry/__init__.py +0 -1
  90. evalscope/registry/config/cfg_arena.yaml +0 -77
  91. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  92. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  93. evalscope/registry/config/cfg_single.yaml +0 -78
  94. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  95. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  96. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  97. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  98. evalscope/registry/data/question.jsonl +0 -80
  99. evalscope/registry/tasks/arc.yaml +0 -28
  100. evalscope/registry/tasks/bbh.yaml +0 -26
  101. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  102. evalscope/registry/tasks/ceval.yaml +0 -27
  103. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  104. evalscope/registry/tasks/cmmlu.yaml +0 -27
  105. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  106. evalscope/registry/tasks/general_qa.yaml +0 -27
  107. evalscope/registry/tasks/gsm8k.yaml +0 -29
  108. evalscope/registry/tasks/mmlu.yaml +0 -29
  109. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  110. evalscope/run_arena.py +0 -202
  111. evalscope/utils/arena_utils.py +0 -217
  112. evalscope/utils/completion_parsers.py +0 -82
  113. /evalscope/{utils → benchmarks}/filters.py +0 -0
  114. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/LICENSE +0 -0
  115. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/WHEEL +0 -0
  116. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/entry_points.txt +0 -0
  117. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/general_arena/general_arena_adapter.py (new file)
@@ -0,0 +1,411 @@
+ import glob
+ import os
+ from collections import defaultdict
+ from typing import Any, List
+
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.constants import EvalType
+ from evalscope.metrics import Metric, mean, metric_registry
+ from evalscope.report import Report, ReportKey
+ from evalscope.utils.logger import get_logger
+
+ # flake8: noqa
+
+ logger = get_logger()
+
+ GRADER_SYSTEM_PROMPT = "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"." # noqa: E501
+
+ GRADER_TEMPLATE = "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's Answer|>\n{answer_1}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{answer_2}\n<|The End of Assistant B's Answer|>".strip(
+ ) # noqa: E501
+
+
+ @Benchmark.register(
+     name='general_arena',
+     pretty_name='GeneralArena',
+     tags=['Custom', 'Arena'],
+     description=
+     'GeneralArena is a custom benchmark designed to evaluate the performance of large language models in a competitive setting, '
+     'where models are pitted against each other in custom tasks to determine their relative strengths and weaknesses. You should '
+     'provide the model outputs in the format of a list of dictionaries, where each dictionary contains the model name and its report path. '
+     'For detailed instructions on how to use this benchmark, please refer to the [Arena User Guide](https://evalscope.readthedocs.io/zh-cn/latest/user_guides/arena.html).',
+     dataset_id='general_arena',
+     metric_list=['winrate'],
+     few_shot_num=0,
+     train_split=None,
+     eval_split='test',
+     system_prompt=GRADER_SYSTEM_PROMPT,
+     prompt_template=GRADER_TEMPLATE,
+     extra_params={
+         'models': [{
+             'name': 'qwen-plus',
+             'report_path': 'outputs/20250627_172550/reports/qwen-plus'
+         }, {
+             'name': 'qwen2.5-7b',
+             'report_path': 'outputs/20250627_172817/reports/qwen2.5-7b-instruct'
+         }],
+         'baseline':
+         'qwen2.5-7b'
+     })
+ class GeneralArenaAdapter(DataAdapter):
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+         # register metrics
+         metric_registry.register(Metric(name='winrate', object=mean))
+
+         # whether to use LLM as a judge
+         self.llm_as_a_judge = True
+
+         extra_params = kwargs.get('extra_params', {})
+         self.models = extra_params.get('models', [])
+         self.baseline = extra_params.get('baseline', None)
+
+     def load(self, **kwargs):
+         self._check_names()
+         self._check_reports()
+         self._check_datasets()
+         logger.info(f'Overall datasets: {self.overall_datasets}')
+         dataset_model_dict = self._load_common_datasets()
+         data_dict = self._build_pair_wise_data(dataset_model_dict)
+         return data_dict
+
+     def gen_prompt(self, input_d, subset_name, few_shot_list, **kwargs):
+         return self.gen_prompt_data(input_d['question'])
+
+     def _check_names(self):
+         """Check the names of the models and baseline."""
+         # check duplicate models
+         model_names = [model['name'] for model in self.models]
+         if len(model_names) != len(set(model_names)):
+             raise ValueError(f'Duplicate model names found in the models list {model_names}.')
+         # check if models list is empty
+         if len(self.models) < 2:
+             raise ValueError('Models list must contain at least two models.')
+         # check baseline model
+         if self.baseline and self.baseline not in model_names:
+             raise ValueError(f'Baseline model {self.baseline} not found in the models list.')
+         # check if the baseline model is not set
+         if not self.baseline:
+             logger.warning('Baseline model is not set. Using the first model as the baseline.')
+             self.baseline = self.models[0]['name']
+
+     def _check_reports(self):
+         """Check if the report paths are valid."""
+         for model in self.models:
+             report_path = model.get('report_path', None)
+             if not report_path or not os.path.exists(report_path):
+                 raise ValueError(f'Report path {report_path} for model {model["name"]} does not exist.')
+             reports = []
+             for report_item in glob.glob(os.path.join(report_path, '*.json')):
+                 report = Report.from_json(report_item)
+                 reports.append(report)
+             model['reports'] = reports
+
+     def _check_datasets(self):
+         """Check common datasets in the reports."""
+         overall_datasets = set()
+         for model in self.models:
+             datasets = set()
+             for report in model['reports']:
+                 report_df = report.to_dataframe()
+                 # get unique (dataset, subset) tuples
+                 unique_datasets = set(zip(report_df[ReportKey.dataset_name], report_df[ReportKey.subset_name]))
+                 datasets.update(unique_datasets)
+             model['datasets'] = datasets
+         # get overall datasets by intersecting all models' datasets
+         overall_datasets = set.intersection(*[model['datasets'] for model in self.models if 'datasets' in model])
+         self.overall_datasets = overall_datasets
+
+     def _load_common_datasets(self):
+         """Load common datasets from the local path."""
+         from evalscope.utils import OutputsStructure, jsonl_to_list
+
+         dataset_dict = defaultdict(dict)
+         for dataset_name, subset_name in self.overall_datasets:
+             for model in self.models:
+                 dataset_path = model['report_path'].replace(OutputsStructure.REPORTS_DIR, OutputsStructure.REVIEWS_DIR)
+                 dataset_file_path = os.path.join(dataset_path, f'{dataset_name}_{subset_name}.jsonl')
+                 if not os.path.exists(dataset_file_path):
+                     raise ValueError(
+                         f'Dataset {dataset_name} with subset {subset_name} not found in model {model["name"]}.')
+                 dataset = jsonl_to_list(dataset_file_path)
+                 # sort by index
+                 dataset.sort(key=lambda x: x.get('index'))
+                 dataset_dict[(dataset_name, subset_name)][model['name']] = dataset
+
+         return dataset_dict
+
+     def _build_pair_wise_data(self, dataset_dict):
+         """Build pairwise data for the models."""
+         from .utils import process_review_item
+
+         pairwise_data = defaultdict(dict)
+         for (dataset_name, subset_name), model_data in dataset_dict.items():
+             if len(model_data) < 2:
+                 logger.warning(f'Not enough models for dataset {dataset_name} with subset {subset_name}. Skipping.')
+                 continue
+             # create pairwise data for each model against the baseline
+             model_names = list(model_data.keys())
+             for name in model_names:
+                 if name == self.baseline:
+                     continue
+                 pairs = []
+                 for model_item, baseline_item in zip(model_data[name], model_data[self.baseline]):
+                     for model_choice, baseline_choice in zip(
+                             process_review_item(model_item), process_review_item(baseline_item)):
+                         pairs.append({
+                             'question': model_choice['Question'],
+                             'answer_1': model_choice['Generated'],
+                             'answer_2': baseline_choice['Generated'],
+                             'model_1': name,
+                             'model_2': self.baseline
+                         })
+                 pairwise_data[f'{dataset_name}&{subset_name}@{name}&{self.baseline}'][self.eval_split] = pairs
+
+         return pairwise_data
+
+     def llm_match(self, gold, pred, judge=None, **kwargs):
+         from .utils import get_judge_score, post_process_result
+
+         try:
+             raw_input = kwargs.get('raw_input', None)
+             question = raw_input['question']
+             answer_1 = raw_input['answer_1']
+             answer_2 = raw_input['answer_2']
+             model_1 = raw_input['model_1']
+             model_2 = raw_input['model_2']
+         except KeyError as e:
+             logger.error(f'Missing key in raw input: {e}. Raw input: {raw_input}')
+             raise
+
+         system_template = self.system_prompt
+         prompt_template = self.prompt_template
+
+         prompt1 = prompt_template.format(question=question, answer_1=answer_1, answer_2=answer_2)
+         # reverse the order
+         prompt2 = prompt_template.format(question=question, answer_1=answer_2, answer_2=answer_1)
+         # get grading response
+         game1_response = judge(prompt1, system_prompt=system_template)
+         game2_response = judge(prompt2, system_prompt=system_template)
+         # parse grading response
+         # game1
+         res1 = post_process_result(game1_response)
+         score1 = get_judge_score(res1, reverse=False)
+         # game2
+         res2 = post_process_result(game2_response)
+         score2 = get_judge_score(res2, reverse=True)
+         return {
+             'score':
+             mean([score1, score2]),
+             'games': [
+                 {
+                     'model_a': model_1,
+                     'model_b': model_2,
+                     'response': game1_response,
+                     'judgment': res1
+                 },
+                 {
+                     'model_a': model_2,
+                     'model_b': model_1,
+                     'response': game2_response,
+                     'judgment': res2
+                 },
+             ]
+         }
+
+     def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
+         """
+         compute score of the model
+         """
+         import numpy as np
+         import pandas as pd
+
+         from .utils import compute_mle_elo, get_battles_from_row, get_bootstrap_result, get_win_rate_column
+
+         if isinstance(review_res_list[0], list):
+             review_res_list = [item for sublist in review_res_list for item in sublist]
+
+         battles = pd.concat([get_battles_from_row(res) for res in review_res_list])
+
+         bt_model_coef = compute_mle_elo(battles, baseline_model=self.baseline)
+
+         bootstrap_model_coef = get_bootstrap_result(
+             battles, func_compute_elo=compute_mle_elo, num_round=100, baseline_model=self.baseline)
+
+         stats = pd.DataFrame()
+         stats['results'] = None
+         stats['results'] = stats['results'].astype('object')
+
+         for i, model in enumerate(bt_model_coef.index):
+             # assert model in bootstrap_elo_lu.columns
+             stats.at[i, 'model'] = model
+             stats.at[i, 'score'] = bt_model_coef[model]
+             stats.at[i, 'lower'] = np.percentile(bootstrap_model_coef[model], 2.5)
+             stats.at[i, 'upper'] = np.percentile(bootstrap_model_coef[model], 97.5)
+
+         metrics_dict = {}
+         metrics_dict['winrate'] = get_win_rate_column(stats, 'score', self.baseline).to_dict()
+         metrics_dict['winrate_lower'] = get_win_rate_column(stats, 'lower', self.baseline).to_dict()
+         metrics_dict['winrate_upper'] = get_win_rate_column(stats, 'upper', self.baseline).to_dict()
+
+         metrics = []
+         for metric_name, models in metrics_dict.items():
+             for model_name, score in models.items():
+                 if model_name == self.baseline:
+                     continue
+                 metrics.append({'metric_name': metric_name, 'score': score, 'num': len(review_res_list)})
+         return metrics
+
+     def post_process_report(self, report: 'Report', **kwargs):
+         """Post-process the report to convert it to a DataFrame with winrate leaderboards."""
+         import pandas as pd
+         import tabulate
+
+         report_path = kwargs.get('report_path')
+         leaderboard_file = os.path.join(report_path, 'leaderboard.txt')
+
+         # Ensure report directory exists
+         os.makedirs(report_path, exist_ok=True)
+
+         # Convert report to dataframe
+         df = report.to_dataframe()
+
+         # Filter for winrate-related metrics
+         winrate_df = df[df[ReportKey.metric_name].str.contains('winrate')].copy()
+
+         if winrate_df.empty:
+             logger.warning('No winrate data found in the report.')
+             return
+
+         # Get all model names from self.models
+         all_model_names = [model['name'] for model in self.models]
+
+         # Collect all leaderboard outputs
+         leaderboard_outputs = []
+
+         def format_leaderboard(data_df, title):
+             """Format DataFrame as leaderboard with CI."""
+             # Pivot to get winrate, winrate_lower, winrate_upper as columns
+             pivot_df = data_df.pivot_table(
+                 index=[ReportKey.model_name], columns=ReportKey.metric_name, values=ReportKey.score, aggfunc='first')
+
+             # Add baseline model with 50% winrate
+             baseline_data = {'winrate': 0.5, 'winrate_lower': 0.5, 'winrate_upper': 0.5}
+
+             # Create a complete index with all models
+             complete_index = pd.Index(all_model_names, name=pivot_df.index.name)
+             pivot_df = pivot_df.reindex(complete_index)
+
+             # Fill baseline model data
+             if self.baseline in pivot_df.index:
+                 for col, val in baseline_data.items():
+                     if col in pivot_df.columns:
+                         pivot_df.loc[self.baseline, col] = val
+
+             # Fill missing values with winrate score for other models
+             if 'winrate' in pivot_df.columns:
+                 pivot_df['winrate_lower'] = pivot_df.get('winrate_lower', pivot_df['winrate'])
+                 pivot_df['winrate_upper'] = pivot_df.get('winrate_upper', pivot_df['winrate'])
+
+             # Format for display
+             leaderboard_data = []
+             for model in pivot_df.index:
+                 if pd.isna(pivot_df.loc[model, 'winrate']):
+                     continue
+
+                 score_pct = pivot_df.loc[model, 'winrate'] * 100
+                 lower_diff = (pivot_df.loc[model, 'winrate_lower'] - pivot_df.loc[model, 'winrate']) * 100
+                 upper_diff = (pivot_df.loc[model, 'winrate_upper'] - pivot_df.loc[model, 'winrate']) * 100
+
+                 leaderboard_data.append({
+                     'Model': model,
+                     'WinRate (%)': f'{score_pct:.1f}',
+                     'CI (%)': f'({lower_diff:+.1f} / {upper_diff:+.1f})'
+                 })
+
+             # Sort by score descending
+             leaderboard_data.sort(key=lambda x: float(x['WinRate (%)'].replace('%', '')), reverse=True)
+
+             # Create DataFrame
+             leaderboard_df = pd.DataFrame(leaderboard_data)
+             leaderboard_df.index = range(len(leaderboard_df))
+
+             # Format as string
+             table_str = tabulate.tabulate(leaderboard_df, headers='keys', showindex=False)
+             output = f'{title}\n{table_str}\n'
+
+             logger.info(f'\n{title}\n{table_str}')
+             return output
+
+         # Parse dataset and subset information from dataset_name column
+         # Format: '{dataset_name}&{subset_name}@{name}&{self.baseline}'
+         def parse_dataset_key(dataset_key):
+             """Parse dataset key to extract dataset_name, subset_name, and model pair."""
+             parts = dataset_key.split('@')
+
+             dataset_subset = parts[0]
+             model_pair = parts[1]
+
+             dataset_name, subset_name = dataset_subset.split('&', 1)
+             model_1, model_2 = model_pair.split('&', 1)
+
+             return dataset_name, subset_name, model_1, model_2
+
+         # Add parsed columns
+         parsed_data = []
+         for _, row in winrate_df.iterrows():
+             dataset_name, subset_name, model_1, model_2 = parse_dataset_key(row[ReportKey.subset_name])
+             if dataset_name is not None:
+                 parsed_data.append({
+                     'dataset_name': dataset_name,
+                     'subset_name': subset_name,
+                     ReportKey.model_name: model_1,
+                     ReportKey.metric_name: row[ReportKey.metric_name],
+                     ReportKey.score: row[ReportKey.score]
+                 })
+
+         if not parsed_data:
+             logger.warning('No valid dataset keys found for parsing.')
+             return
+
+         parsed_df = pd.DataFrame(parsed_data)
+
+         # 1. Overall ranking (aggregate across all datasets and subsets)
+         overall_df = parsed_df.groupby([ReportKey.model_name,
+                                         ReportKey.metric_name])[ReportKey.score].mean().reset_index()
+         leaderboard_outputs.append(format_leaderboard(overall_df, '=== OVERALL LEADERBOARD ==='))
+
+         # 2. Dataset-level rankings
+         datasets = parsed_df['dataset_name'].unique()
+         for dataset in sorted(datasets):
+             dataset_df = parsed_df[parsed_df['dataset_name'] == dataset]
+             dataset_agg = dataset_df.groupby([ReportKey.model_name,
+                                               ReportKey.metric_name])[ReportKey.score].mean().reset_index()
+             leaderboard_outputs.append(format_leaderboard(dataset_agg, f'=== DATASET LEADERBOARD: {dataset} ==='))
+
+         # 3. Subset-level rankings
+         subsets = parsed_df[['dataset_name', 'subset_name']].drop_duplicates()
+         for _, subset_row in subsets.iterrows():
+             dataset_name = subset_row['dataset_name']
+             subset_name = subset_row['subset_name']
+             subset_df = parsed_df[(parsed_df['dataset_name'] == dataset_name)
+                                   & (parsed_df['subset_name'] == subset_name)]
+             leaderboard_outputs.append(
+                 format_leaderboard(subset_df, f'=== SUBSET LEADERBOARD: {dataset_name} - {subset_name} ==='))
+
+         # Write all leaderboard outputs to file
+         with open(leaderboard_file, 'w', encoding='utf-8') as f:
+             f.write('\n'.join(leaderboard_outputs))
+
+         logger.info(f'Leaderboard results saved to: {leaderboard_file}')
+
+     def get_gold_answer(self, input_d):
+         return f"model_1: {input_d['model_1']}\n---\n" + input_d['answer_1']
+
+     def llm_parse_pred_result(self, result, raw_input_d=None, eval_type=EvalType.CHECKPOINT):
+         return f"model_2: {raw_input_d['model_2']}\n---\n" + raw_input_d['answer_2']
+
+     def match(self, gold, pred):
+         logger.warning(f'Please use LLMJudge to match the result for {self.name}')
+         return
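For orientation, a minimal usage sketch for the new general_arena benchmark, assuming the usual evalscope TaskConfig/run_task entry points. It simply wires the extra_params defaults shown in the registration above into a task config; the judge-model settings and any other required TaskConfig fields are omitted, and the exact dataset_args nesting is an assumption rather than something stated in this diff (see the Arena User Guide for the authoritative form).

# Hypothetical sketch only; field names mirror the extra_params defaults registered above.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    datasets=['general_arena'],
    dataset_args={
        'general_arena': {
            'extra_params': {
                'models': [
                    {'name': 'qwen-plus', 'report_path': 'outputs/20250627_172550/reports/qwen-plus'},
                    {'name': 'qwen2.5-7b', 'report_path': 'outputs/20250627_172817/reports/qwen2.5-7b-instruct'},
                ],
                'baseline': 'qwen2.5-7b',
            }
        }
    },
)
run_task(task_cfg)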
evalscope/benchmarks/general_arena/utils.py (new file)
@@ -0,0 +1,226 @@
+ import inspect
+ import math
+ import numpy as np
+ import pandas as pd
+ import re
+ from collections import defaultdict
+ from sklearn.linear_model import LogisticRegression
+ from tqdm import tqdm
+
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ def process_review_item(review_item: dict) -> dict:
+     """
+     Process a single review item to extract relevant information.
+
+     Args:
+         review_item (dict): The review item to process.
+
+     Returns:
+         dict: Processed review item with necessary information.
+     """
+     res = []
+     raw_input = review_item['raw_input']
+     sample_index = review_item['index']
+     question_keys = ['question', 'Question', 'prompt', 'Prompt', 'query', 'Query', 'problem', 'Problem']
+     # Find the first non-empty question key in raw_input
+     question = next((raw_input.get(key) for key in question_keys if raw_input.get(key)), None)
+     for choice_index, choice in enumerate(review_item['choices']):
+         raw_pred_answer = choice['message']['content']
+         parsed_gold_answer = choice['review']['gold']
+         parsed_pred_answer = choice['review']['pred']
+         score = choice['review']['result']
+         raw_d = {
+             'Index': f'{sample_index}_{choice_index}',
+             'Input': raw_input,
+             'Question': question if question else '*No Question*',
+             'Generated': raw_pred_answer,
+             'Gold': parsed_gold_answer if parsed_gold_answer != raw_input else '*Same as Input*',
+             'Pred': parsed_pred_answer,
+             'Score': score,
+         }
+         res.append(raw_d)
+
+     return res
+
+
+ def post_process_result(completion):
+     result = re.findall(r'\[\[([AB<>=]+)\]\]', completion)
+     if result:
+         return result[0]
+     else:
+         return None
+
+
+ def get_judge_score(result, reverse=False):
+     """
+     Calculate the judge score, considering confidence weight.
+
+     Args:
+         result: Judgment result ('A=B', 'A>B', 'A>>B', 'B>A', 'B>>A')
+         reverse: Whether to reverse the score
+
+     Returns:
+         float: Weighted score
+     """
+
+     # Base score mapping - using finer-grained scores
+     if not reverse:
+         score_mapping = {
+             'A=B': 0.5, # Tie
+             'A>B': 0.75, # A slightly wins
+             'A>>B': 1.0, # A significantly wins
+             'B>A': 0.25, # B slightly wins
+             'B>>A': 0.0, # B significantly wins
+         }
+     else:
+         score_mapping = {
+             'A=B': 0.5, # Tie
+             'A>B': 0.25, # A slightly wins
+             'A>>B': 0.0, # A significantly wins
+             'B>A': 0.75, # B slightly wins
+             'B>>A': 1.0, # B significantly wins
+         }
+
+     base_score = score_mapping.get(result, 0.5)
+
+     return base_score
+
+
+ def get_battles_from_row(row, first_game_only=False, multiplier=3):
+     results = []
+
+     game = row['games'][0]
+     output = {'model_a': game['model_a'], 'model_b': game['model_b']}
+
+     weight = 1
+     if game['judgment'] == 'A=B':
+         output['winner'] = 'tie'
+     elif game['judgment'] == 'A>B':
+         output['winner'] = 'model_a'
+     elif game['judgment'] == 'A>>B':
+         output['winner'] = 'model_a'
+         weight = multiplier
+     elif game['judgment'] == 'B>A':
+         output['winner'] = 'model_b'
+     elif game['judgment'] == 'B>>A':
+         output['winner'] = 'model_b'
+         weight = multiplier
+     else:
+         weight = 0
+
+     if weight:
+         results += [output] * weight
+
+     if first_game_only:
+         return pd.DataFrame(results)
+
+     # Dont change the order of model_a and model_b
+     output = {'model_a': game['model_a'], 'model_b': game['model_b']}
+
+     # game 2
+     game = row['games'][1]
+
+     weight = 1
+     if game['judgment'] == 'A=B':
+         output['winner'] = 'tie'
+     elif game['judgment'] == 'A>B':
+         output['winner'] = 'model_b'
+     elif game['judgment'] == 'A>>B':
+         output['winner'] = 'model_b'
+         weight = multiplier
+     elif game['judgment'] == 'B>A':
+         output['winner'] = 'model_a'
+     elif game['judgment'] == 'B>>A':
+         output['winner'] = 'model_a'
+         weight = multiplier
+     else:
+         weight = 0
+
+     if weight:
+         results += [output] * weight
+
+     return pd.DataFrame(results)
+
+
+ def compute_mle_elo(df, scale=400, base=10, init_rating=1000, baseline_model='gpt4-0314'):
+     models = pd.concat([df['model_a'], df['model_b']]).unique()
+     models = pd.Series(np.arange(len(models)), index=models)
+
+     # duplicate battles
+     df = pd.concat([df, df], ignore_index=True)
+     p = len(models.index)
+     n = df.shape[0]
+
+     X = np.zeros([n, p])
+     X[np.arange(n), models[df['model_a']]] = +math.log(base)
+     X[np.arange(n), models[df['model_b']]] = -math.log(base)
+
+     # one A win => two A win
+     Y = np.zeros(n)
+     Y[df['winner'] == 'model_a'] = 1.0
+
+     # one tie => one A win + one B win
+     # find tie + tie (both bad) index
+     tie_idx = (df['winner'] == 'tie') | (df['winner'] == 'tie (bothbad)')
+     tie_idx[len(tie_idx) // 2:] = False
+     Y[tie_idx] = 1.0
+
+     if len(np.unique(Y)) < 2:
+         logger.info('Warning: Only one class in the data')
+         elo_scores = pd.Series(init_rating, index=models.index)
+         if np.all(Y == 1.0):
+             elo_scores[df['model_a'].iloc[0]] += scale # Boost the winning model
+         elif np.all(Y == 0.0):
+             elo_scores[df['model_b'].iloc[0]] += scale # Boost the winning model
+         return elo_scores.sort_values(ascending=False)
+
+     lr = LogisticRegression(
+         fit_intercept=False, penalty=None, tol=1e-8) # May need to set a small value when not use GPT4 as judge model
+     lr.fit(X, Y)
+
+     elo_scores = scale * lr.coef_[0] + init_rating
+
+     # set anchor 1000
+     if baseline_model in models.index:
+         elo_scores += 1000 - elo_scores[models[baseline_model]]
+     return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)
+
+
+ def get_bootstrap_result(battles, func_compute_elo, num_round, baseline_model='gpt-4-0314'):
+     rows = []
+     kwargs = {}
+     if 'baseline_model' in inspect.signature(func_compute_elo).parameters:
+         kwargs['baseline_model'] = baseline_model
+     for _ in tqdm(range(num_round), desc='bootstrap'):
+         res = func_compute_elo(battles.sample(frac=1.0, replace=True), **kwargs)
+         if res is not None:
+             rows.append(res)
+     df = pd.DataFrame(rows)
+     return df[df.median().sort_values(ascending=False).index]
+
+
+ def predict_win_rate(elo_ratings, scale=400, base=10, init_rating=1000):
+     names = sorted(list(elo_ratings.keys()))
+     wins = defaultdict(lambda: defaultdict(lambda: 0))
+     for a in names:
+         for b in names:
+             ea = 1 / (1 + base**((elo_ratings[b] - elo_ratings[a]) / scale))
+             wins[a][b] = ea
+             wins[b][a] = 1 - ea
+
+     data = {a: [wins[a][b] if a != b else np.NAN for b in names] for a in names}
+
+     df = pd.DataFrame(data, index=names)
+     df.index.name = 'model_a'
+     df.columns.name = 'model_b'
+     return df.T
+
+
+ def get_win_rate_column(df, column, baseline='gpt4-0314'):
+     to_dict = df[['model', column]].set_index('model').to_dict()[column]
+     win_rate_table = predict_win_rate(to_dict)
+     return win_rate_table[baseline].fillna(0.5).apply(lambda x: round(x, 4))
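As a quick sanity check on the math above, here is a tiny standalone sketch of the Elo-to-expected-win-rate conversion that predict_win_rate applies. The ratings are made up for illustration; only the formula comes from the code in this diff.

# Illustration of the Bradley-Terry / Elo expected-score formula used in predict_win_rate.
def expected_win_rate(rating_a, rating_b, base=10, scale=400):
    # Probability that model A is preferred over model B given their Elo ratings.
    return 1 / (1 + base ** ((rating_b - rating_a) / scale))

print(round(expected_win_rate(1100, 1000), 4))  # ~0.6401: a +100 Elo gap is roughly a 64% win rate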
@@ -5,7 +5,7 @@ from collections import defaultdict
  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType, OutputType
  from evalscope.metrics import exact_match
- from evalscope.utils import ResponseParser
+ from evalscope.metrics.completion_parsers import ResponseParser
  from evalscope.utils.io_utils import csv_to_list, jsonl_to_list
  from evalscope.utils.logger import get_logger