evalscope 0.16.2__py3-none-any.whl → 0.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/app/app.py +9 -762
- evalscope/app/constants.py +1 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +52 -0
- evalscope/app/ui/multi_model.py +323 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +202 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +178 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +91 -0
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/backend_manager.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
- evalscope/benchmarks/__init__.py +15 -1
- evalscope/benchmarks/aime/aime24_adapter.py +2 -1
- evalscope/benchmarks/aime/aime25_adapter.py +2 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/utils.py +0 -12
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
- evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
- evalscope/benchmarks/data_adapter.py +20 -5
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
- evalscope/benchmarks/general_arena/utils.py +226 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +42 -29
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/race/race_adapter.py +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
- evalscope/benchmarks/utils.py +1 -2
- evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
- evalscope/config.py +8 -123
- evalscope/evaluator/evaluator.py +15 -12
- evalscope/metrics/__init__.py +6 -0
- evalscope/{utils/utils.py → metrics/completion_parsers.py} +68 -180
- evalscope/metrics/llm_judge.py +105 -20
- evalscope/metrics/metrics.py +1 -1
- evalscope/models/adapters/base_adapter.py +0 -2
- evalscope/models/adapters/server_adapter.py +2 -2
- evalscope/models/custom/dummy_model.py +3 -3
- evalscope/perf/arguments.py +2 -16
- evalscope/perf/main.py +1 -1
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +1 -1
- evalscope/report/__init__.py +1 -1
- evalscope/report/utils.py +34 -15
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -2
- evalscope/utils/__init__.py +63 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/import_utils.py +16 -0
- evalscope/utils/io_utils.py +45 -4
- evalscope/utils/model_utils.py +37 -1
- evalscope/version.py +2 -2
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/METADATA +55 -26
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/RECORD +90 -101
- tests/aigc/test_t2i.py +1 -1
- tests/cli/test_all.py +50 -2
- tests/cli/test_collection.py +1 -1
- tests/cli/test_custom.py +261 -0
- tests/cli/test_run.py +13 -37
- tests/perf/test_perf.py +2 -2
- tests/rag/test_clip_benchmark.py +2 -1
- tests/rag/test_mteb.py +3 -1
- tests/rag/test_ragas.py +3 -1
- tests/swift/test_run_swift_eval.py +2 -1
- tests/swift/test_run_swift_vlm_eval.py +2 -1
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
- tests/utils.py +13 -0
- tests/vlm/test_vlmeval.py +8 -2
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- /evalscope/{utils → benchmarks}/filters.py +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/LICENSE +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/WHEEL +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/top_level.txt +0 -0
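
The renames in the listing above (e.g. evalscope/{utils/utils.py → metrics/completion_parsers.py}, /evalscope/{utils → benchmarks}/filters.py, and the removal of evalscope/utils/completion_parsers.py) indicate that several helpers left evalscope.utils in 0.17.0. The snippet below is a minimal, hypothetical compatibility sketch for downstream code; it assumes the parsers are importable from evalscope.metrics in 0.17.0, which this diff does not confirm.

    # Hypothetical import shim across the 0.16.2 -> 0.17.0 module reorganization.
    # The evalscope.metrics location is an assumption inferred from the rename above;
    # verify it against the installed package before relying on it.
    try:
        from evalscope.metrics import completion_parsers  # evalscope >= 0.17.0 (assumed)
    except ImportError:
        from evalscope.utils import completion_parsers  # evalscope <= 0.16.2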
evalscope/evaluator/reviewer/auto_reviewer.py
DELETED
@@ -1,391 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-# flake8: noqa
-
-import os
-import pandas as pd
-import random
-import sys
-import time
-from abc import ABC, abstractmethod
-from functools import partial
-from typing import Any, List, Tuple
-
-from evalscope.constants import ArenaMode, EvalConfigKeys, FnCompletionParser, PositionBiasMitigation
-from evalscope.models import OpenAIModel
-from evalscope.utils import completion_parsers, random_seeded_choice
-from evalscope.utils.arena_utils import get_battle_pairs, merge_ques_ans, shuffle_pairwise_preferences
-from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list
-from evalscope.utils.logger import get_logger
-
-logger = get_logger()
-
-
-class BaseReviewer(ABC):
-
-    def __init__(self, **kwargs):
-        ...
-
-    @abstractmethod
-    def run(self, *args, **kwargs):
-        """
-        Run pairwise battles with given models.
-        """
-        raise NotImplementedError('run() method must be implemented in your subclass.')
-
-
-class AutoReviewerGpt4(BaseReviewer):
-    """
-    Auto-review target answers(models) pairwise with GPT-4.
-
-    Args:
-        prompt_file: path to prompt templates file.
-        answer_file_list: list of paths to answer files.
-        review_result_file: path to review result file.
-        reviewer_args: config for reviewer(GPT-4).
-
-    Examples:
-        >>> from evalscope.evaluator.reviewer.auto_reviewer import AutoReviewerGpt4
-        >>> input_kwargs = dict(prompt_file='/path/to/prompt_file.jsonl', answer_file_list=['/path/to/ans1_file.jsonl',
-            '/path/to/ans2_file.jsonl', ...], review_file='/path/to/review_file.jsonl',
-            reviewer_args={'model': 'gpt-4', 'mode': 'single'})
-        >>> auto_reviewer = AutoReviewerGpt4(**input_kwargs)
-        >>> auto_reviewer.run(dry_run=False)
-    """
-
-    MODEL_NAME = 'gpt-4'
-
-    def __init__(self,
-                 prompt_file: str,
-                 answer_file_list: list,
-                 review_result_file: str,
-                 baseline_file: str = None,
-                 reference_file: str = None,
-                 reviewer_args: dict = None,
-                 cache_file: str = None,
-                 **kwargs):
-        super().__init__(**kwargs)
-
-        self.review_result_file = review_result_file
-        self.prompt_list = jsonl_to_list(prompt_file)
-        self.answer_list = [jsonl_to_list(answer_file) for answer_file in answer_file_list]
-        self.reference_list = jsonl_to_list(reference_file) if reference_file else []
-        self.cache_list = jsonl_to_list(cache_file) if cache_file and os.path.isfile(cache_file) else []
-
-        self.reviewer_args = reviewer_args if reviewer_args \
-            else self._get_default_args()
-
-        self.review_mode = self.reviewer_args.pop('mode', ArenaMode.PAIRWISE)
-        if self.review_mode == ArenaMode.PAIRWISE_BASELINE:
-            assert baseline_file is not None, f'baseline_file is required for {ArenaMode.PAIRWISE_BASELINE} mode'
-            self.answer_list.append(jsonl_to_list(baseline_file))
-            self.baseline_idx = len(self.answer_list) - 1
-
-        self.position_bias_mitigation = self.reviewer_args.pop(EvalConfigKeys.POSITION_BIAS_MITIGATION,
-                                                               PositionBiasMitigation.NONE)
-        if self.position_bias_mitigation == PositionBiasMitigation.RANDOMIZE_ORDER:
-            self.random_seed = self.reviewer_args.pop(EvalConfigKeys.RANDOM_SEED, 123)
-
-        fn_completion_parser = self.reviewer_args.pop(EvalConfigKeys.FN_COMPLETION_PARSER,
-                                                      FnCompletionParser.LMSYS_PARSER)
-        completion_parser_kwargs = self.reviewer_args.pop(EvalConfigKeys.COMPLETION_PARSER_KWARGS, {})
-        if isinstance(fn_completion_parser, str):
-            fn_completion_parser = getattr(completion_parsers, fn_completion_parser)
-
-        self.fn_completion_parser = partial(fn_completion_parser, **completion_parser_kwargs)
-        self.gpt_predictor = OpenAIModel(model_cfg=self.reviewer_args)
-
-    @staticmethod
-    def _get_default_args():
-        return dict(
-            model=AutoReviewerGpt4.MODEL_NAME,
-            mode=ArenaMode.PAIRWISE,
-            position_bias_mitigation=PositionBiasMitigation.NONE,
-            fn_completion_parser=FnCompletionParser.LMSYS_PARSER,
-            random_seed=123,
-        )
-
-    @staticmethod
-    def gen_prompt(prompts_list: list,
-                   type: str,
-                   category: str,
-                   ques: str,
-                   ans1: str,
-                   ans2: str = None,
-                   ans_ref: str = None):
-        """
-        Generate prompt for Auto-reviewer with GPT-4.
-        """
-
-        # Default to general category (idx 0)
-        target_prompt_dict = prompts_list[0]
-        for item in prompts_list:
-            is_category_match = category in item['category'] if isinstance(item['category'],
-                                                                           list) else item['category'] == category
-            is_type_match = item.get('type', ArenaMode.PAIRWISE) == type
-            if is_category_match and is_type_match:
-                target_prompt_dict = item
-                break
-            elif is_type_match and target_prompt_dict.get('type', ArenaMode.PAIRWISE) != type:
-                target_prompt_dict = item  # fallback to type match
-
-        sys_prompt = target_prompt_dict['system_prompt']
-        prompt_template = target_prompt_dict['prompt_template']
-        defaults = target_prompt_dict.get('defaults', dict({}))
-        output_format = target_prompt_dict.get('output_format', '[[rating_a,rating_b]]')
-
-        if type == ArenaMode.SINGLE:
-            user_prompt = prompt_template.format(question=ques, answer=ans1, ref_answer_1=ans_ref, **defaults)
-        else:
-            user_prompt = prompt_template.format(
-                question=ques, answer_a=ans1, answer_b=ans2, ref_answer_1=ans_ref, **defaults)
-
-        return sys_prompt, user_prompt, output_format
-
-    def get_review_cache(self, model_a, model_b, question) -> list:
-        if model_b:
-            cache_hit = next((r for r in self.cache_list
-                              if r['model_a'] == model_a and r['model_b'] == model_b and r['question'] == question),
-                             None)
-        else:
-            cache_hit = next((r for r in self.cache_list if r['model'] == model_a and r['question'] == question), None)
-        return cache_hit
-
-    def get_review_pair(self, item: List[dict], dry_run=False, **kwargs) -> dict:
-
-        question = item[0]['text']
-        question_id = item[0]['question_id']
-        category = item[0]['category']
-
-        model_a = item[0]['model_id']
-        model_b = item[1]['model_id']
-
-        ans1 = item[0]['answer']
-        ans2 = item[1]['answer']
-
-        review_cache = self.get_review_cache(model_a, model_b, question)
-        if review_cache:
-            logger.info(f'Use cache review for {model_a} vs {model_b} ...')
-            return review_cache
-
-        if self.position_bias_mitigation == PositionBiasMitigation.SWAP_POSITION:
-            review_text_1, winner_1, score_1 = self._get_review_pair(
-                model_a, model_b, question, category, ans1, ans2, dry_run=dry_run, **kwargs)
-            review_text_2, winner_2, score_2 = self._get_review_pair(
-                model_b, model_a, question, category, ans2, ans1, dry_run=dry_run, **kwargs)
-
-            # Swap winner for the second round.
-            if winner_2 == 'model_a':
-                winner_2 = 'model_b'
-            elif winner_2 == 'model_b':
-                winner_2 = 'model_a'
-            review_result = dict(
-                model_a=model_a,
-                model_b=model_b,
-                win_1=winner_1,
-                win_2=winner_2,
-                anony=True,
-                tstamp=time.time(),
-                language=item[0].get('language', 'NA'),
-                question_id=question_id,
-                category=category,
-                question=question,
-                review_text_1=review_text_1,
-                review_text_2=review_text_2)
-        else:
-            review_text, winner, scores = self._get_review_pair(
-                model_a, model_b, question, category, ans1, ans2, dry_run=dry_run, **kwargs)
-
-            if dry_run:
-                scores = [round(random.random(), 1), round(random.random(), 1)]
-                winner = 'model_a' if scores[0] > scores[1] else 'model_b'
-
-            review_result = dict(
-                model_a=model_a,
-                model_b=model_b,
-                scores=scores,
-                win=winner,
-                anony=True,
-                tstamp=time.time(),
-                language=item[0].get('language', 'NA'),
-                question_id=question_id,
-                category=category,
-                question=question,
-                review_text=review_text)
-        return review_result
-
-    def get_review_single(self, row: List[dict], dry_run: bool = False, **kwargs):
-        item = row[0]
-        model = item['model_id']
-        question = item['text']
-        question_id = item['question_id']
-        category = item['category']
-        answer = item['answer']
-
-        review_cache = self.get_review_cache(model, None, question)
-        if review_cache:
-            logger.info(f'Use cache review for {model} ...')
-            return review_cache
-
-        review_text, score = self._get_review_single(model, question, category, answer, dry_run=dry_run, **kwargs)
-
-        review_result = dict(
-            model=model,
-            score=score,
-            anony=True,
-            tstamp=time.time(),
-            language=item.get('language', 'NA'),
-            question_id=question_id,
-            category=category,
-            question=question,
-            review_text=review_text)
-        return review_result
-
-    def _get_review_pair(self,
-                         model_a,
-                         model_b,
-                         question,
-                         category,
-                         ans1,
-                         ans2,
-                         dry_run=False,
-                         **kwargs) -> Tuple[str, Any]:
-        input_msg = dict(ques=question, category=category, ans1=ans1, ans2=ans2)
-
-        if self.reference_list:
-            ans_ref = next((ref for ref in self.reference_list if ref.get('text') == question), None)
-            assert ans_ref['answer']
-            input_msg['ans_ref'] = ans_ref['answer']
-
-        sys_prompt, user_prompt, output_format = AutoReviewerGpt4.gen_prompt(
-            prompts_list=self.prompt_list,
-            type=ArenaMode.SINGLE if self.review_mode == ArenaMode.SINGLE else ArenaMode.PAIRWISE,
-            **input_msg)
-
-        if dry_run:
-            review_text = self._get_reviewer_prediction_dummy(sys_prompt, user_prompt, output_format)
-        else:
-            review_text = self._get_reviewer_prediction(sys_prompt, user_prompt, **kwargs)
-
-        result = self.fn_completion_parser(review_text, output_format=output_format)
-        if not isinstance(result, tuple):
-            result = (result, None)
-        return review_text, *result
-
-    def _get_review_single(self, model, question, category, answer, dry_run=False, **kwargs) -> Tuple[str, Any]:
-        input_msg = dict(ques=question, category=category, ans1=answer)
-
-        if self.reference_list:
-            ans_ref = next((ref for ref in self.reference_list if ref.get('text') == question), None)
-            assert ans_ref['answer']
-            input_msg['ans_ref'] = ans_ref['answer']
-
-        sys_prompt, user_prompt, output_format = AutoReviewerGpt4.gen_prompt(
-            prompts_list=self.prompt_list,
-            type=ArenaMode.SINGLE if self.review_mode == ArenaMode.SINGLE else ArenaMode.PAIRWISE,
-            **input_msg)
-
-        if dry_run:
-            review_text = self._get_reviewer_prediction_dummy(sys_prompt, user_prompt, output_format)
-        else:
-            review_text = self._get_reviewer_prediction(sys_prompt, user_prompt, **kwargs)
-
-        score = self.fn_completion_parser(review_text, output_format)
-        return review_text, score
-
-    def _get_reviewer_prediction_dummy(self, sys_prompt: str, user_prompt: str, output_format) -> str:
-        logger.info('Get dummy scores for input prompt ...')
-        if output_format == '[[rating]]':
-            return f'[[{round(random.random(), 2)}]]'
-        if output_format == '[[rating_a,rating_b]]':
-            ans_list = [round(random.random(), 2), round(random.random(), 2)]
-            return ' '.join(str(element) for element in ans_list)
-        elif output_format == '[[A]]':
-            return random.choice(['[[A]]', '[[B]]', '[[C]]'])
-        elif output_format == "[{'model': <model-name>, 'rank': <model-rank>}, " \
-                              "{'model': <model-name>, 'rank': <model-rank>}]":
-            rank_1 = random.choice([1, 2])
-            rank_2 = 1 if rank_1 == 2 else 2
-            return f"[{{'model': 'model_a', 'rank': {rank_1}}}, {{'model': 'model_b', 'rank': {rank_2}}}]"
-
-    def _get_reviewer_prediction(self, sys_prompt: str, user_prompt: str, **kwargs) -> str:
-
-        input_msg = dict(sys_prompt=sys_prompt, user_prompt=user_prompt)
-
-        # Call GPT-4 predictor
-        # TODO: Add more reviewer implementation
-        resp = self.gpt_predictor.predict(model_id=self.MODEL_NAME, inputs=input_msg, **kwargs)
-
-        if resp is None or len(resp) == 0:
-            logger.error(f'Failed to get response from {self.MODEL_NAME} for input: {input_msg}')
-
-        ans_text = resp['ans_text']
-        # model_id = resp['model_id']
-
-        return ans_text
-
-    def run(self, dry_run: bool = False, **kwargs):
-        print(f'Run battles for models with dry_run={dry_run} ...')
-
-        os.makedirs(os.path.dirname(self.review_result_file), exist_ok=True)
-
-        if len(self.answer_list) == 0:
-            raise Exception('The answer list cannot be empty.')
-
-        merge_key = 'question_id'
-        merged_ans_df = merge_ques_ans(self.answer_list, merge_key=merge_key)
-        merged_ans_df = merged_ans_df.drop(columns=['question_id'])
-
-        if self.review_mode == ArenaMode.PAIRWISE:
-            battle_pairs = get_battle_pairs(merged_ans_df.columns)
-        elif self.review_mode == ArenaMode.PAIRWISE_BASELINE:
-            battle_pairs = get_battle_pairs(merged_ans_df.columns, self.baseline_idx)
-        elif self.review_mode == ArenaMode.SINGLE:
-            battle_pairs = [(col, ) for col in merged_ans_df.columns]
-        else:
-            raise Exception(f'NotSupported mode: {self.review_mode}')
-
-        res_list = []
-        for t in battle_pairs:
-            pair_df = merged_ans_df[list(t)]
-            if self.position_bias_mitigation == PositionBiasMitigation.RANDOMIZE_ORDER:
-                pair_df.columns = ['output_1', 'output_2']
-                pair_df['is_switched_outputs'] = pair_df.apply(
-                    lambda x: random_seeded_choice(
-                        seed='is_switched_outputs' + x[0]['text'] + str(self.random_seed),
-                        choices=[False, True],
-                    ),
-                    axis=1,
-                )
-                pair_df = shuffle_pairwise_preferences(pair_df, pair_df['is_switched_outputs'])
-
-            for index, row in pair_df.iterrows():
-                row_result = self.get_review_pair(row.to_list(), dry_run=dry_run, **kwargs) \
-                    if self.review_mode != ArenaMode.SINGLE \
-                    else self.get_review_single(row.to_list(), dry_run=dry_run, **kwargs)
-                res_list.append(row_result)
-        dump_jsonl_data(res_list, self.review_result_file)
-
-
-if __name__ == '__main__':
-    from pathlib import Path
-
-    work_path = os.path.join(Path(__file__).absolute().parent, '../../../')
-    prompt_template_path = os.path.join(work_path, 'evalscope/registry/data/prompt_template/prompt_templates.jsonl')
-    answer_file_list = [
-        os.path.join(work_path, 'outputs/arena/default/answers/answer_chatglm2-6b.jsonl'),
-        os.path.join(work_path, 'outputs/arena/default/answers/answer_llama2-7b.jsonl')
-    ]
-    review_result_file_path = os.path.join(work_path, 'outputs/arena/default/reviews/review_gpt4.jsonl')
-
-    input_kwargs = dict(
-        prompt_file=prompt_template_path,
-        answer_file_list=answer_file_list,
-        review_result_file=review_result_file_path,
-        reviewer_args={},
-        baseline_file='',
-        reference_file='',
-        cache_file='',
-    )
-
-    auto_reviewer = AutoReviewerGpt4(**input_kwargs)
-    auto_reviewer.run(dry_run=True)
evalscope/registry/__init__.py
DELETED
@@ -1 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
evalscope/registry/config/cfg_arena.yaml
DELETED
@@ -1,77 +0,0 @@
-# input raw data
-question_file: registry/data/question.jsonl
-
-# candidate models to be battled
-answers_gen:
-  chatglm3-6b:
-    # model_id_or_path could be local absolute path, e.g. /to/path/.cache/modelscope/ZhipuAI/chatglm3-6b
-    model_id_or_path: ZhipuAI/chatglm3-6b  # model_id on modelscope
-    revision: v1.0.2  # revision of model, default is NULL
-    precision: torch.float16
-    enable: true  # enable or disable this model
-    template_type: chatglm3  # see: https://github.com/modelscope/swift/blob/main/docs/source/LLM/%E6%94%AF%E6%8C%81%E7%9A%84%E6%A8%A1%E5%9E%8B%E5%92%8C%E6%95%B0%E6%8D%AE%E9%9B%86.md
-    generation_config:
-      do_sample: true
-      max_new_tokens: 256
-      top_k: 20
-      top_p: 0.75
-      temperature: 0.333
-    # output predicted answer file name
-    output_file: registry/data/arena/answers/answer_chatglm3-6b.jsonl
-  Baichuan2-7B-Base:
-    model_id_or_path: baichuan-inc/Baichuan2-7B-Base
-    revision: v1.0.2  # revision of model, default is NULL
-    precision: torch.float16
-    enable: false  # enable or disable this model
-    template_type: default-generation
-    generation_config:
-      do_sample: true
-      max_new_tokens: 256
-      top_k: 20
-      top_p: 0.75
-      temperature: 0.3
-    output_file: registry/data/arena/answers/answer_Baichuan2-7B-Base.jsonl
-  Qwen-7B:
-    model_id_or_path: qwen/Qwen-7B
-    revision: v1.1.8  # revision of model, default is NULL
-    precision: torch.float16
-    enable: true  # enable or disable this model  # TODO: tokenizer issue
-    template_type: default-generation
-    generation_config:
-      do_sample: true
-      max_new_tokens: 256
-      top_k: 20
-      top_p: 0.75
-      temperature: 0.3
-    output_file: registry/data/arena/answers/answer_Qwen-7B.jsonl
-
-# Auto-reviewer(GPT-4) config
-reviews_gen:
-  enable: true
-  reviewer:
-    # class reference of auto reviewer(GPT-4)
-    ref: evalscope.evaluator.reviewer.auto_reviewer:AutoReviewerGpt4
-    args:
-      max_tokens: 1024
-      temperature: 0.2
-      # options: pairwise, pairwise_baseline, single (default is pairwise)
-      mode: pairwise
-      # position bias mitigation strategy, options: swap_position, randomize_order, NULL. default is NULL
-      position_bias_mitigation: NULL
-      # completion parser config, default is lmsys_parser
-      fn_completion_parser: lmsys_parser
-  # prompt templates for auto reviewer(GPT-4)
-  prompt_file: registry/data/prompt_template/prompt_templates.jsonl
-  # target answer files list to be reviewed,
-  # could be replaced by your own path: ['/path/to/answers_model_1.jsonl', '/path/to/answers_model_2.jsonl', ...]
-  # Default is NULL, which means all answers in answers_gen will be reviewed
-  target_answers: NULL
-  # output file name of auto reviewer
-  review_file: registry/data/arena/reviews/review_gpt4.jsonl
-
-# rating results
-rating_gen:
-  enable: true
-  metrics: ['elo']
-  # elo rating report file name
-  report_file: registry/data/arena/reports/elo_rating_origin.csv
evalscope/registry/config/cfg_arena_zhihu.yaml
DELETED
@@ -1,63 +0,0 @@
-# input raw data
-question_file: registry/data/question.jsonl
-
-# candidate models to be battled
-answers_gen:
-  Qwen2-7B-Instruct:
-    model_id_or_path: /mnt/data/data/user/maoyunlin.myl/models/Qwen2-7B-Instruct  # model_id on modelscope
-    revision: NULL  # revision of model, default is NULL
-    precision: torch.float16
-    enable: true  # enable or disable this model
-    template_type: default-generation  # see: https://github.com/modelscope/swift/blob/main/docs/source/LLM/%E6%94%AF%E6%8C%81%E7%9A%84%E6%A8%A1%E5%9E%8B%E5%92%8C%E6%95%B0%E6%8D%AE%E9%9B%86.md
-    generation_config:
-      do_sample: true
-      max_new_tokens: 512
-      top_k: 20
-      top_p: 0.9
-      temperature: 0.7
-    # output predicted answer file name
-    output_file: registry/data/arena/answers/answer_qwen2.jsonl
-  Qwen-7B:
-    model_id_or_path: /mnt/data/data/user/maoyunlin.myl/output/qwen2-7b-instruct/v25-20240809-113533/checkpoint-309-merged
-    revision: NULL  # revision of model, default is NULL
-    precision: torch.float16
-    enable: true  # enable or disable this model
-    template_type: default-generation
-    generation_config:
-      do_sample: true
-      max_new_tokens: 512
-      top_k: 20
-      top_p: 0.9
-      temperature: 0.7
-    output_file: registry/data/arena/answers/answer_Qwen-7B.jsonl
-
-# Auto-reviewer(GPT-4) config
-reviews_gen:
-  enable: true
-  reviewer:
-    # class reference of auto reviewer(GPT-4)
-    ref: evalscope.evaluator.reviewer.auto_reviewer:AutoReviewerGpt4
-    args:
-      max_tokens: 1024
-      temperature: 0.2
-      # options: pairwise, pairwise_baseline, single (default is pairwise)
-      mode: pairwise
-      # position bias mitigation strategy, options: swap_position, randomize_order, NULL. default is NULL
-      position_bias_mitigation: NULL
-      # completion parser config, default is lmsys_parser
-      fn_completion_parser: lmsys_parser
-  # prompt templates for auto reviewer(GPT-4)
-  prompt_file: registry/data/prompt_template/prompt_templates.jsonl
-  # target answer files list to be reviewed,
-  # could be replaced by your own path: ['/path/to/answers_model_1.jsonl', '/path/to/answers_model_2.jsonl', ...]
-  # Default is NULL, which means all answers in answers_gen will be reviewed
-  target_answers: NULL
-  # output file name of auto reviewer
-  review_file: registry/data/arena/reviews/review_gpt4.jsonl
-
-# rating results
-rating_gen:
-  enable: true
-  metrics: ['elo']
-  # elo rating report file name
-  report_file: registry/data/arena/reports/elo_rating_origin.csv
evalscope/registry/config/cfg_pairwise_baseline.yaml
DELETED
@@ -1,83 +0,0 @@
-# input raw data
-question_file: registry/data/question.jsonl
-
-# candidate models to be battled
-answers_gen:
-  chatglm3-6b:
-    # model_id_or_path could be local absolute path, e.g. /to/path/.cache/modelscope/ZhipuAI/chatglm3-6b
-    model_id_or_path: ZhipuAI/chatglm3-6b  # model_id on modelscope
-    revision: v1.0.2  # revision of model, default is NULL
-    precision: torch.float16
-    enable: true  # enable or disable this model
-    template_type: chatglm3  # see: https://github.com/modelscope/swift/blob/main/docs/source/LLM/%E6%94%AF%E6%8C%81%E7%9A%84%E6%A8%A1%E5%9E%8B%E5%92%8C%E6%95%B0%E6%8D%AE%E9%9B%86.md
-    generation_config:
-      do_sample: true
-      max_new_tokens: 256
-      top_k: 20
-      top_p: 0.75
-      temperature: 0.3
-    # output predicted answer file name
-    output_file: registry/data/arena/answers/answer_chatglm3-6b.jsonl
-  Baichuan2-7B-Base:
-    model_id_or_path: baichuan-inc/Baichuan2-7B-Base
-    revision: v1.0.2  # revision of model, default is NULL
-    precision: torch.float16
-    enable: false  # enable or disable this model
-    template_type: default-generation
-    generation_config:
-      do_sample: true
-      max_new_tokens: 256
-      top_k: 20
-      top_p: 0.75
-      temperature: 0.3
-    output_file: registry/data/arena/answers/answer_Baichuan2-7B-Base.jsonl
-  Qwen-7B:
-    model_id_or_path: qwen/Qwen-7B
-    revision: v1.1.8  # revision of model, default is NULL
-    precision: torch.float16
-    enable: true  # enable or disable this model  # TODO: tokenizer issue
-    template_type: default-generation
-    generation_config:
-      do_sample: true
-      max_new_tokens: 256
-      top_k: 20
-      top_p: 0.75
-      temperature: 0.3
-    output_file: registry/data/arena/answers/answer_Qwen-7B.jsonl
-
-# model of auto-reviewer
-reviews_gen:
-  enable: true
-  reviewer:
-    ref: evalscope.evaluator.reviewer.auto_reviewer:AutoReviewerGpt4
-    args:
-      model: gpt-4
-      max_tokens: 1024
-      temperature: 0
-      # pairwise comparison against baseline
-      mode: pairwise_baseline
-      # position bias mitigation strategy, options: swap_position, randomize_order, None. default is None
-      position_bias_mitigation: swap_position
-      # completion parser config, default is lmsys_parser
-      fn_completion_parser: lmsys_parser
-  # target answers list to be reviewed, could be replaced by your own path: /path/to/answers.jsonl
-  target_answers: [registry/data/arena/answers/answer_chatglm3-6b.jsonl,
-                   registry/data/arena/answers/answer_Baichuan2-7B-Base.jsonl]
-  # the path to the outputs of the baseline model
-  baseline_file: registry/data/arena/answers/answer_text_davinci_003.jsonl
-  # the path to the reference answers
-  reference_file:
-  # prompt templates for auto reviewer(GPT-4)
-  prompt_file: registry/data/prompt_template/lmsys_v2.jsonl
-  # output file of auto reviewer
-  review_file: registry/data/arena/reviews/review_gpt4_pair_baseline.jsonl
-  # cache file of auto reviewer
-  cache_file: registry/data/arena/reviews/review_gpt4_pair_baseline.jsonl
-
-# rating results
-rating_gen:
-  enable: true
-  metrics: ['pairwise']
-  baseline_model: text_davinci_003
-  # elo rating report file
-  report_file: registry/data/arena/reports/rating_pairwise_baseline.csv