evalscope 0.16.2__py3-none-any.whl → 0.17.0__py3-none-any.whl

This diff compares the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (117)
  1. evalscope/app/app.py +9 -762
  2. evalscope/app/constants.py +1 -0
  3. evalscope/app/ui/__init__.py +20 -0
  4. evalscope/app/ui/app_ui.py +52 -0
  5. evalscope/app/ui/multi_model.py +323 -0
  6. evalscope/app/ui/sidebar.py +42 -0
  7. evalscope/app/ui/single_model.py +202 -0
  8. evalscope/app/ui/visualization.py +36 -0
  9. evalscope/app/utils/data_utils.py +178 -0
  10. evalscope/app/utils/localization.py +221 -0
  11. evalscope/app/utils/text_utils.py +119 -0
  12. evalscope/app/utils/visualization.py +91 -0
  13. evalscope/backend/opencompass/backend_manager.py +2 -1
  14. evalscope/backend/rag_eval/backend_manager.py +2 -1
  15. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  16. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
  17. evalscope/benchmarks/__init__.py +15 -1
  18. evalscope/benchmarks/aime/aime24_adapter.py +2 -1
  19. evalscope/benchmarks/aime/aime25_adapter.py +2 -1
  20. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
  21. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  22. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
  23. evalscope/benchmarks/arena_hard/utils.py +0 -12
  24. evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
  25. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
  26. evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
  27. evalscope/benchmarks/data_adapter.py +20 -5
  28. evalscope/benchmarks/general_arena/__init__.py +0 -0
  29. evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
  30. evalscope/benchmarks/general_arena/utils.py +226 -0
  31. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  32. evalscope/benchmarks/general_qa/general_qa_adapter.py +42 -29
  33. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  34. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
  35. evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
  36. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
  37. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
  38. evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
  39. evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
  40. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  41. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  42. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  43. evalscope/benchmarks/race/race_adapter.py +1 -1
  44. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
  45. evalscope/benchmarks/utils.py +1 -2
  46. evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
  47. evalscope/config.py +8 -123
  48. evalscope/evaluator/evaluator.py +15 -12
  49. evalscope/metrics/__init__.py +6 -0
  50. evalscope/{utils/utils.py → metrics/completion_parsers.py} +68 -180
  51. evalscope/metrics/llm_judge.py +105 -20
  52. evalscope/metrics/metrics.py +1 -1
  53. evalscope/models/adapters/base_adapter.py +0 -2
  54. evalscope/models/adapters/server_adapter.py +2 -2
  55. evalscope/models/custom/dummy_model.py +3 -3
  56. evalscope/perf/arguments.py +2 -16
  57. evalscope/perf/main.py +1 -1
  58. evalscope/perf/utils/analysis_result.py +24 -23
  59. evalscope/perf/utils/benchmark_util.py +1 -1
  60. evalscope/report/__init__.py +1 -1
  61. evalscope/report/utils.py +34 -15
  62. evalscope/run.py +1 -1
  63. evalscope/summarizer.py +1 -2
  64. evalscope/utils/__init__.py +63 -2
  65. evalscope/utils/argument_utils.py +64 -0
  66. evalscope/utils/import_utils.py +16 -0
  67. evalscope/utils/io_utils.py +45 -4
  68. evalscope/utils/model_utils.py +37 -1
  69. evalscope/version.py +2 -2
  70. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/METADATA +55 -26
  71. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/RECORD +90 -101
  72. tests/aigc/test_t2i.py +1 -1
  73. tests/cli/test_all.py +50 -2
  74. tests/cli/test_collection.py +1 -1
  75. tests/cli/test_custom.py +261 -0
  76. tests/cli/test_run.py +13 -37
  77. tests/perf/test_perf.py +2 -2
  78. tests/rag/test_clip_benchmark.py +2 -1
  79. tests/rag/test_mteb.py +3 -1
  80. tests/rag/test_ragas.py +3 -1
  81. tests/swift/test_run_swift_eval.py +2 -1
  82. tests/swift/test_run_swift_vlm_eval.py +2 -1
  83. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
  84. tests/utils.py +13 -0
  85. tests/vlm/test_vlmeval.py +8 -2
  86. evalscope/evaluator/rating_eval.py +0 -157
  87. evalscope/evaluator/reviewer/__init__.py +0 -1
  88. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  89. evalscope/registry/__init__.py +0 -1
  90. evalscope/registry/config/cfg_arena.yaml +0 -77
  91. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  92. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  93. evalscope/registry/config/cfg_single.yaml +0 -78
  94. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  95. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  96. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  97. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  98. evalscope/registry/data/question.jsonl +0 -80
  99. evalscope/registry/tasks/arc.yaml +0 -28
  100. evalscope/registry/tasks/bbh.yaml +0 -26
  101. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  102. evalscope/registry/tasks/ceval.yaml +0 -27
  103. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  104. evalscope/registry/tasks/cmmlu.yaml +0 -27
  105. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  106. evalscope/registry/tasks/general_qa.yaml +0 -27
  107. evalscope/registry/tasks/gsm8k.yaml +0 -29
  108. evalscope/registry/tasks/mmlu.yaml +0 -29
  109. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  110. evalscope/run_arena.py +0 -202
  111. evalscope/utils/arena_utils.py +0 -217
  112. evalscope/utils/completion_parsers.py +0 -82
  113. /evalscope/{utils → benchmarks}/filters.py +0 -0
  114. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/LICENSE +0 -0
  115. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/WHEEL +0 -0
  116. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/entry_points.txt +0 -0
  117. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/top_level.txt +0 -0
@@ -1,391 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- # flake8: noqa
-
- import os
- import pandas as pd
- import random
- import sys
- import time
- from abc import ABC, abstractmethod
- from functools import partial
- from typing import Any, List, Tuple
-
- from evalscope.constants import ArenaMode, EvalConfigKeys, FnCompletionParser, PositionBiasMitigation
- from evalscope.models import OpenAIModel
- from evalscope.utils import completion_parsers, random_seeded_choice
- from evalscope.utils.arena_utils import get_battle_pairs, merge_ques_ans, shuffle_pairwise_preferences
- from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list
- from evalscope.utils.logger import get_logger
-
- logger = get_logger()
-
-
- class BaseReviewer(ABC):
-
-     def __init__(self, **kwargs):
-         ...
-
-     @abstractmethod
-     def run(self, *args, **kwargs):
-         """
-         Run pairwise battles with given models.
-         """
-         raise NotImplementedError('run() method must be implemented in your subclass.')
-
-
- class AutoReviewerGpt4(BaseReviewer):
-     """
-     Auto-review target answers(models) pairwise with GPT-4.
-
-     Args:
-         prompt_file: path to prompt templates file.
-         answer_file_list: list of paths to answer files.
-         review_result_file: path to review result file.
-         reviewer_args: config for reviewer(GPT-4).
-
-     Examples:
-         >>> from evalscope.evaluator.reviewer.auto_reviewer import AutoReviewerGpt4
-         >>> input_kwargs = dict(prompt_file='/path/to/prompt_file.jsonl', answer_file_list=['/path/to/ans1_file.jsonl',
-             '/path/to/ans2_file.jsonl', ...], review_file='/path/to/review_file.jsonl',
-             reviewer_args={'model': 'gpt-4', 'mode': 'single'})
-         >>> auto_reviewer = AutoReviewerGpt4(**input_kwargs)
-         >>> auto_reviewer.run(dry_run=False)
-     """
-
-     MODEL_NAME = 'gpt-4'
-
-     def __init__(self,
-                  prompt_file: str,
-                  answer_file_list: list,
-                  review_result_file: str,
-                  baseline_file: str = None,
-                  reference_file: str = None,
-                  reviewer_args: dict = None,
-                  cache_file: str = None,
-                  **kwargs):
-         super().__init__(**kwargs)
-
-         self.review_result_file = review_result_file
-         self.prompt_list = jsonl_to_list(prompt_file)
-         self.answer_list = [jsonl_to_list(answer_file) for answer_file in answer_file_list]
-         self.reference_list = jsonl_to_list(reference_file) if reference_file else []
-         self.cache_list = jsonl_to_list(cache_file) if cache_file and os.path.isfile(cache_file) else []
-
-         self.reviewer_args = reviewer_args if reviewer_args \
-             else self._get_default_args()
-
-         self.review_mode = self.reviewer_args.pop('mode', ArenaMode.PAIRWISE)
-         if self.review_mode == ArenaMode.PAIRWISE_BASELINE:
-             assert baseline_file is not None, f'baseline_file is required for {ArenaMode.PAIRWISE_BASELINE} mode'
-             self.answer_list.append(jsonl_to_list(baseline_file))
-             self.baseline_idx = len(self.answer_list) - 1
-
-         self.position_bias_mitigation = self.reviewer_args.pop(EvalConfigKeys.POSITION_BIAS_MITIGATION,
-                                                                PositionBiasMitigation.NONE)
-         if self.position_bias_mitigation == PositionBiasMitigation.RANDOMIZE_ORDER:
-             self.random_seed = self.reviewer_args.pop(EvalConfigKeys.RANDOM_SEED, 123)
-
-         fn_completion_parser = self.reviewer_args.pop(EvalConfigKeys.FN_COMPLETION_PARSER,
-                                                       FnCompletionParser.LMSYS_PARSER)
-         completion_parser_kwargs = self.reviewer_args.pop(EvalConfigKeys.COMPLETION_PARSER_KWARGS, {})
-         if isinstance(fn_completion_parser, str):
-             fn_completion_parser = getattr(completion_parsers, fn_completion_parser)
-
-         self.fn_completion_parser = partial(fn_completion_parser, **completion_parser_kwargs)
-         self.gpt_predictor = OpenAIModel(model_cfg=self.reviewer_args)
-
-     @staticmethod
-     def _get_default_args():
-         return dict(
-             model=AutoReviewerGpt4.MODEL_NAME,
-             mode=ArenaMode.PAIRWISE,
-             position_bias_mitigation=PositionBiasMitigation.NONE,
-             fn_completion_parser=FnCompletionParser.LMSYS_PARSER,
-             random_seed=123,
-         )
-
-     @staticmethod
-     def gen_prompt(prompts_list: list,
-                    type: str,
-                    category: str,
-                    ques: str,
-                    ans1: str,
-                    ans2: str = None,
-                    ans_ref: str = None):
-         """
-         Generate prompt for Auto-reviewer with GPT-4.
-         """
-
-         # Default to general category (idx 0)
-         target_prompt_dict = prompts_list[0]
-         for item in prompts_list:
-             is_category_match = category in item['category'] if isinstance(item['category'],
-                                                                            list) else item['category'] == category
-             is_type_match = item.get('type', ArenaMode.PAIRWISE) == type
-             if is_category_match and is_type_match:
-                 target_prompt_dict = item
-                 break
-             elif is_type_match and target_prompt_dict.get('type', ArenaMode.PAIRWISE) != type:
-                 target_prompt_dict = item  # fallback to type match
-
-         sys_prompt = target_prompt_dict['system_prompt']
-         prompt_template = target_prompt_dict['prompt_template']
-         defaults = target_prompt_dict.get('defaults', dict({}))
-         output_format = target_prompt_dict.get('output_format', '[[rating_a,rating_b]]')
-
-         if type == ArenaMode.SINGLE:
-             user_prompt = prompt_template.format(question=ques, answer=ans1, ref_answer_1=ans_ref, **defaults)
-         else:
-             user_prompt = prompt_template.format(
-                 question=ques, answer_a=ans1, answer_b=ans2, ref_answer_1=ans_ref, **defaults)
-
-         return sys_prompt, user_prompt, output_format
-
-     def get_review_cache(self, model_a, model_b, question) -> list:
-         if model_b:
-             cache_hit = next((r for r in self.cache_list
-                               if r['model_a'] == model_a and r['model_b'] == model_b and r['question'] == question),
-                              None)
-         else:
-             cache_hit = next((r for r in self.cache_list if r['model'] == model_a and r['question'] == question), None)
-         return cache_hit
-
-     def get_review_pair(self, item: List[dict], dry_run=False, **kwargs) -> dict:
-
-         question = item[0]['text']
-         question_id = item[0]['question_id']
-         category = item[0]['category']
-
-         model_a = item[0]['model_id']
-         model_b = item[1]['model_id']
-
-         ans1 = item[0]['answer']
-         ans2 = item[1]['answer']
-
-         review_cache = self.get_review_cache(model_a, model_b, question)
-         if review_cache:
-             logger.info(f'Use cache review for {model_a} vs {model_b} ...')
-             return review_cache
-
-         if self.position_bias_mitigation == PositionBiasMitigation.SWAP_POSITION:
-             review_text_1, winner_1, score_1 = self._get_review_pair(
-                 model_a, model_b, question, category, ans1, ans2, dry_run=dry_run, **kwargs)
-             review_text_2, winner_2, score_2 = self._get_review_pair(
-                 model_b, model_a, question, category, ans2, ans1, dry_run=dry_run, **kwargs)
-
-             # Swap winner for the second round.
-             if winner_2 == 'model_a':
-                 winner_2 = 'model_b'
-             elif winner_2 == 'model_b':
-                 winner_2 = 'model_a'
-             review_result = dict(
-                 model_a=model_a,
-                 model_b=model_b,
-                 win_1=winner_1,
-                 win_2=winner_2,
-                 anony=True,
-                 tstamp=time.time(),
-                 language=item[0].get('language', 'NA'),
-                 question_id=question_id,
-                 category=category,
-                 question=question,
-                 review_text_1=review_text_1,
-                 review_text_2=review_text_2)
-         else:
-             review_text, winner, scores = self._get_review_pair(
-                 model_a, model_b, question, category, ans1, ans2, dry_run=dry_run, **kwargs)
-
-             if dry_run:
-                 scores = [round(random.random(), 1), round(random.random(), 1)]
-                 winner = 'model_a' if scores[0] > scores[1] else 'model_b'
-
-             review_result = dict(
-                 model_a=model_a,
-                 model_b=model_b,
-                 scores=scores,
-                 win=winner,
-                 anony=True,
-                 tstamp=time.time(),
-                 language=item[0].get('language', 'NA'),
-                 question_id=question_id,
-                 category=category,
-                 question=question,
-                 review_text=review_text)
-         return review_result
-
-     def get_review_single(self, row: List[dict], dry_run: bool = False, **kwargs):
-         item = row[0]
-         model = item['model_id']
-         question = item['text']
-         question_id = item['question_id']
-         category = item['category']
-         answer = item['answer']
-
-         review_cache = self.get_review_cache(model, None, question)
-         if review_cache:
-             logger.info(f'Use cache review for {model} ...')
-             return review_cache
-
-         review_text, score = self._get_review_single(model, question, category, answer, dry_run=dry_run, **kwargs)
-
-         review_result = dict(
-             model=model,
-             score=score,
-             anony=True,
-             tstamp=time.time(),
-             language=item.get('language', 'NA'),
-             question_id=question_id,
-             category=category,
-             question=question,
-             review_text=review_text)
-         return review_result
-
-     def _get_review_pair(self,
-                          model_a,
-                          model_b,
-                          question,
-                          category,
-                          ans1,
-                          ans2,
-                          dry_run=False,
-                          **kwargs) -> Tuple[str, Any]:
-         input_msg = dict(ques=question, category=category, ans1=ans1, ans2=ans2)
-
-         if self.reference_list:
-             ans_ref = next((ref for ref in self.reference_list if ref.get('text') == question), None)
-             assert ans_ref['answer']
-             input_msg['ans_ref'] = ans_ref['answer']
-
-         sys_prompt, user_prompt, output_format = AutoReviewerGpt4.gen_prompt(
-             prompts_list=self.prompt_list,
-             type=ArenaMode.SINGLE if self.review_mode == ArenaMode.SINGLE else ArenaMode.PAIRWISE,
-             **input_msg)
-
-         if dry_run:
-             review_text = self._get_reviewer_prediction_dummy(sys_prompt, user_prompt, output_format)
-         else:
-             review_text = self._get_reviewer_prediction(sys_prompt, user_prompt, **kwargs)
-
-         result = self.fn_completion_parser(review_text, output_format=output_format)
-         if not isinstance(result, tuple):
-             result = (result, None)
-         return review_text, *result
-
-     def _get_review_single(self, model, question, category, answer, dry_run=False, **kwargs) -> Tuple[str, Any]:
-         input_msg = dict(ques=question, category=category, ans1=answer)
-
-         if self.reference_list:
-             ans_ref = next((ref for ref in self.reference_list if ref.get('text') == question), None)
-             assert ans_ref['answer']
-             input_msg['ans_ref'] = ans_ref['answer']
-
-         sys_prompt, user_prompt, output_format = AutoReviewerGpt4.gen_prompt(
-             prompts_list=self.prompt_list,
-             type=ArenaMode.SINGLE if self.review_mode == ArenaMode.SINGLE else ArenaMode.PAIRWISE,
-             **input_msg)
-
-         if dry_run:
-             review_text = self._get_reviewer_prediction_dummy(sys_prompt, user_prompt, output_format)
-         else:
-             review_text = self._get_reviewer_prediction(sys_prompt, user_prompt, **kwargs)
-
-         score = self.fn_completion_parser(review_text, output_format)
-         return review_text, score
-
-     def _get_reviewer_prediction_dummy(self, sys_prompt: str, user_prompt: str, output_format) -> str:
-         logger.info('Get dummy scores for input prompt ...')
-         if output_format == '[[rating]]':
-             return f'[[{round(random.random(), 2)}]]'
-         if output_format == '[[rating_a,rating_b]]':
-             ans_list = [round(random.random(), 2), round(random.random(), 2)]
-             return ' '.join(str(element) for element in ans_list)
-         elif output_format == '[[A]]':
-             return random.choice(['[[A]]', '[[B]]', '[[C]]'])
-         elif output_format == "[{'model': <model-name>, 'rank': <model-rank>}, " \
-                               "{'model': <model-name>, 'rank': <model-rank>}]":
-             rank_1 = random.choice([1, 2])
-             rank_2 = 1 if rank_1 == 2 else 2
-             return f"[{{'model': 'model_a', 'rank': {rank_1}}}, {{'model': 'model_b', 'rank': {rank_2}}}]"
-
-     def _get_reviewer_prediction(self, sys_prompt: str, user_prompt: str, **kwargs) -> str:
-
-         input_msg = dict(sys_prompt=sys_prompt, user_prompt=user_prompt)
-
-         # Call GPT-4 predictor
-         # TODO: Add more reviewer implementation
-         resp = self.gpt_predictor.predict(model_id=self.MODEL_NAME, inputs=input_msg, **kwargs)
-
-         if resp is None or len(resp) == 0:
-             logger.error(f'Failed to get response from {self.MODEL_NAME} for input: {input_msg}')
-
-         ans_text = resp['ans_text']
-         # model_id = resp['model_id']
-
-         return ans_text
-
-     def run(self, dry_run: bool = False, **kwargs):
-         print(f'Run battles for models with dry_run={dry_run} ...')
-
-         os.makedirs(os.path.dirname(self.review_result_file), exist_ok=True)
-
-         if len(self.answer_list) == 0:
-             raise Exception('The answer list cannot be empty.')
-
-         merge_key = 'question_id'
-         merged_ans_df = merge_ques_ans(self.answer_list, merge_key=merge_key)
-         merged_ans_df = merged_ans_df.drop(columns=['question_id'])
-
-         if self.review_mode == ArenaMode.PAIRWISE:
-             battle_pairs = get_battle_pairs(merged_ans_df.columns)
-         elif self.review_mode == ArenaMode.PAIRWISE_BASELINE:
-             battle_pairs = get_battle_pairs(merged_ans_df.columns, self.baseline_idx)
-         elif self.review_mode == ArenaMode.SINGLE:
-             battle_pairs = [(col, ) for col in merged_ans_df.columns]
-         else:
-             raise Exception(f'NotSupported mode: {self.review_mode}')
-
-         res_list = []
-         for t in battle_pairs:
-             pair_df = merged_ans_df[list(t)]
-             if self.position_bias_mitigation == PositionBiasMitigation.RANDOMIZE_ORDER:
-                 pair_df.columns = ['output_1', 'output_2']
-                 pair_df['is_switched_outputs'] = pair_df.apply(
-                     lambda x: random_seeded_choice(
-                         seed='is_switched_outputs' + x[0]['text'] + str(self.random_seed),
-                         choices=[False, True],
-                     ),
-                     axis=1,
-                 )
-                 pair_df = shuffle_pairwise_preferences(pair_df, pair_df['is_switched_outputs'])
-
-             for index, row in pair_df.iterrows():
-                 row_result = self.get_review_pair(row.to_list(), dry_run=dry_run, **kwargs) \
-                     if self.review_mode != ArenaMode.SINGLE \
-                     else self.get_review_single(row.to_list(), dry_run=dry_run, **kwargs)
-                 res_list.append(row_result)
-             dump_jsonl_data(res_list, self.review_result_file)
-
-
- if __name__ == '__main__':
-     from pathlib import Path
-
-     work_path = os.path.join(Path(__file__).absolute().parent, '../../../')
-     prompt_template_path = os.path.join(work_path, 'evalscope/registry/data/prompt_template/prompt_templates.jsonl')
-     answer_file_list = [
-         os.path.join(work_path, 'outputs/arena/default/answers/answer_chatglm2-6b.jsonl'),
-         os.path.join(work_path, 'outputs/arena/default/answers/answer_llama2-7b.jsonl')
-     ]
-     review_result_file_path = os.path.join(work_path, 'outputs/arena/default/reviews/review_gpt4.jsonl')
-
-     input_kwargs = dict(
-         prompt_file=prompt_template_path,
-         answer_file_list=answer_file_list,
-         review_result_file=review_result_file_path,
-         reviewer_args={},
-         baseline_file='',
-         reference_file='',
-         cache_file='',
-     )
-
-     auto_reviewer = AutoReviewerGpt4(**input_kwargs)
-     auto_reviewer.run(dry_run=True)
@@ -1 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
@@ -1,77 +0,0 @@
- # input raw data
- question_file: registry/data/question.jsonl
-
- # candidate models to be battled
- answers_gen:
-   chatglm3-6b:
-     # model_id_or_path could be local absolute path, e.g. /to/path/.cache/modelscope/ZhipuAI/chatglm3-6b
-     model_id_or_path: ZhipuAI/chatglm3-6b # model_id on modelscope
-     revision: v1.0.2 # revision of model, default is NULL
-     precision: torch.float16
-     enable: true # enable or disable this model
-     template_type: chatglm3 # see: https://github.com/modelscope/swift/blob/main/docs/source/LLM/%E6%94%AF%E6%8C%81%E7%9A%84%E6%A8%A1%E5%9E%8B%E5%92%8C%E6%95%B0%E6%8D%AE%E9%9B%86.md
-     generation_config:
-       do_sample: true
-       max_new_tokens: 256
-       top_k: 20
-       top_p: 0.75
-       temperature: 0.333
-     # output predicted answer file name
-     output_file: registry/data/arena/answers/answer_chatglm3-6b.jsonl
-   Baichuan2-7B-Base:
-     model_id_or_path: baichuan-inc/Baichuan2-7B-Base
-     revision: v1.0.2 # revision of model, default is NULL
-     precision: torch.float16
-     enable: false # enable or disable this model
-     template_type: default-generation
-     generation_config:
-       do_sample: true
-       max_new_tokens: 256
-       top_k: 20
-       top_p: 0.75
-       temperature: 0.3
-     output_file: registry/data/arena/answers/answer_Baichuan2-7B-Base.jsonl
-   Qwen-7B:
-     model_id_or_path: qwen/Qwen-7B
-     revision: v1.1.8 # revision of model, default is NULL
-     precision: torch.float16
-     enable: true # enable or disable this model # TODO: tokenizer issue
-     template_type: default-generation
-     generation_config:
-       do_sample: true
-       max_new_tokens: 256
-       top_k: 20
-       top_p: 0.75
-       temperature: 0.3
-     output_file: registry/data/arena/answers/answer_Qwen-7B.jsonl
-
- # Auto-reviewer(GPT-4) config
- reviews_gen:
-   enable: true
-   reviewer:
-     # class reference of auto reviewer(GPT-4)
-     ref: evalscope.evaluator.reviewer.auto_reviewer:AutoReviewerGpt4
-     args:
-       max_tokens: 1024
-       temperature: 0.2
-       # options: pairwise, pairwise_baseline, single (default is pairwise)
-       mode: pairwise
-       # position bias mitigation strategy, options: swap_position, randomize_order, NULL. default is NULL
-       position_bias_mitigation: NULL
-       # completion parser config, default is lmsys_parser
-       fn_completion_parser: lmsys_parser
-   # prompt templates for auto reviewer(GPT-4)
-   prompt_file: registry/data/prompt_template/prompt_templates.jsonl
-   # target answer files list to be reviewed,
-   # could be replaced by your own path: ['/path/to/answers_model_1.jsonl', '/path/to/answers_model_2.jsonl', ...]
-   # Default is NULL, which means all answers in answers_gen will be reviewed
-   target_answers: NULL
-   # output file name of auto reviewer
-   review_file: registry/data/arena/reviews/review_gpt4.jsonl
-
- # rating results
- rating_gen:
-   enable: true
-   metrics: ['elo']
-   # elo rating report file name
-   report_file: registry/data/arena/reports/elo_rating_origin.csv
@@ -1,63 +0,0 @@
- # input raw data
- question_file: registry/data/question.jsonl
-
- # candidate models to be battled
- answers_gen:
-   Qwen2-7B-Instruct:
-     model_id_or_path: /mnt/data/data/user/maoyunlin.myl/models/Qwen2-7B-Instruct # model_id on modelscope
-     revision: NULL # revision of model, default is NULL
-     precision: torch.float16
-     enable: true # enable or disable this model
-     template_type: default-generation # see: https://github.com/modelscope/swift/blob/main/docs/source/LLM/%E6%94%AF%E6%8C%81%E7%9A%84%E6%A8%A1%E5%9E%8B%E5%92%8C%E6%95%B0%E6%8D%AE%E9%9B%86.md
-     generation_config:
-       do_sample: true
-       max_new_tokens: 512
-       top_k: 20
-       top_p: 0.9
-       temperature: 0.7
-     # output predicted answer file name
-     output_file: registry/data/arena/answers/answer_qwen2.jsonl
-   Qwen-7B:
-     model_id_or_path: /mnt/data/data/user/maoyunlin.myl/output/qwen2-7b-instruct/v25-20240809-113533/checkpoint-309-merged
-     revision: NULL # revision of model, default is NULL
-     precision: torch.float16
-     enable: true # enable or disable this model
-     template_type: default-generation
-     generation_config:
-       do_sample: true
-       max_new_tokens: 512
-       top_k: 20
-       top_p: 0.9
-       temperature: 0.7
-     output_file: registry/data/arena/answers/answer_Qwen-7B.jsonl
-
- # Auto-reviewer(GPT-4) config
- reviews_gen:
-   enable: true
-   reviewer:
-     # class reference of auto reviewer(GPT-4)
-     ref: evalscope.evaluator.reviewer.auto_reviewer:AutoReviewerGpt4
-     args:
-       max_tokens: 1024
-       temperature: 0.2
-       # options: pairwise, pairwise_baseline, single (default is pairwise)
-       mode: pairwise
-       # position bias mitigation strategy, options: swap_position, randomize_order, NULL. default is NULL
-       position_bias_mitigation: NULL
-       # completion parser config, default is lmsys_parser
-       fn_completion_parser: lmsys_parser
-   # prompt templates for auto reviewer(GPT-4)
-   prompt_file: registry/data/prompt_template/prompt_templates.jsonl
-   # target answer files list to be reviewed,
-   # could be replaced by your own path: ['/path/to/answers_model_1.jsonl', '/path/to/answers_model_2.jsonl', ...]
-   # Default is NULL, which means all answers in answers_gen will be reviewed
-   target_answers: NULL
-   # output file name of auto reviewer
-   review_file: registry/data/arena/reviews/review_gpt4.jsonl
-
- # rating results
- rating_gen:
-   enable: true
-   metrics: ['elo']
-   # elo rating report file name
-   report_file: registry/data/arena/reports/elo_rating_origin.csv
@@ -1,83 +0,0 @@
- # input raw data
- question_file: registry/data/question.jsonl
-
- # candidate models to be battled
- answers_gen:
-   chatglm3-6b:
-     # model_id_or_path could be local absolute path, e.g. /to/path/.cache/modelscope/ZhipuAI/chatglm3-6b
-     model_id_or_path: ZhipuAI/chatglm3-6b # model_id on modelscope
-     revision: v1.0.2 # revision of model, default is NULL
-     precision: torch.float16
-     enable: true # enable or disable this model
-     template_type: chatglm3 # see: https://github.com/modelscope/swift/blob/main/docs/source/LLM/%E6%94%AF%E6%8C%81%E7%9A%84%E6%A8%A1%E5%9E%8B%E5%92%8C%E6%95%B0%E6%8D%AE%E9%9B%86.md
-     generation_config:
-       do_sample: true
-       max_new_tokens: 256
-       top_k: 20
-       top_p: 0.75
-       temperature: 0.3
-     # output predicted answer file name
-     output_file: registry/data/arena/answers/answer_chatglm3-6b.jsonl
-   Baichuan2-7B-Base:
-     model_id_or_path: baichuan-inc/Baichuan2-7B-Base
-     revision: v1.0.2 # revision of model, default is NULL
-     precision: torch.float16
-     enable: false # enable or disable this model
-     template_type: default-generation
-     generation_config:
-       do_sample: true
-       max_new_tokens: 256
-       top_k: 20
-       top_p: 0.75
-       temperature: 0.3
-     output_file: registry/data/arena/answers/answer_Baichuan2-7B-Base.jsonl
-   Qwen-7B:
-     model_id_or_path: qwen/Qwen-7B
-     revision: v1.1.8 # revision of model, default is NULL
-     precision: torch.float16
-     enable: true # enable or disable this model # TODO: tokenizer issue
-     template_type: default-generation
-     generation_config:
-       do_sample: true
-       max_new_tokens: 256
-       top_k: 20
-       top_p: 0.75
-       temperature: 0.3
-     output_file: registry/data/arena/answers/answer_Qwen-7B.jsonl
-
- # model of auto-reviewer
- reviews_gen:
-   enable: true
-   reviewer:
-     ref: evalscope.evaluator.reviewer.auto_reviewer:AutoReviewerGpt4
-     args:
-       model: gpt-4
-       max_tokens: 1024
-       temperature: 0
-       # pairwise comparison against baseline
-       mode: pairwise_baseline
-       # position bias mitigation strategy, options: swap_position, randomize_order, None. default is None
-       position_bias_mitigation: swap_position
-       # completion parser config, default is lmsys_parser
-       fn_completion_parser: lmsys_parser
-   # target answers list to be reviewed, could be replaced by your own path: /path/to/answers.jsonl
-   target_answers: [registry/data/arena/answers/answer_chatglm3-6b.jsonl,
-                    registry/data/arena/answers/answer_Baichuan2-7B-Base.jsonl]
-   # the path to the outputs of the baseline model
-   baseline_file: registry/data/arena/answers/answer_text_davinci_003.jsonl
-   # the path to the reference answers
-   reference_file:
-   # prompt templates for auto reviewer(GPT-4)
-   prompt_file: registry/data/prompt_template/lmsys_v2.jsonl
-   # output file of auto reviewer
-   review_file: registry/data/arena/reviews/review_gpt4_pair_baseline.jsonl
-   # cache file of auto reviewer
-   cache_file: registry/data/arena/reviews/review_gpt4_pair_baseline.jsonl
-
- # rating results
- rating_gen:
-   enable: true
-   metrics: ['pairwise']
-   baseline_model: text_davinci_003
-   # elo rating report file
-   report_file: registry/data/arena/reports/rating_pairwise_baseline.csv