evalscope 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. evalscope/__init__.py +3 -0
  2. evalscope/backend/__init__.py +3 -0
  3. evalscope/backend/base.py +27 -0
  4. evalscope/backend/opencompass/__init__.py +3 -0
  5. evalscope/backend/opencompass/api_meta_template.py +64 -0
  6. evalscope/backend/opencompass/backend_manager.py +247 -0
  7. evalscope/backend/opencompass/tasks/__init__.py +1 -0
  8. evalscope/backend/opencompass/tasks/eval_api.py +30 -0
  9. evalscope/backend/opencompass/tasks/eval_datasets.py +71 -0
  10. evalscope/backend/vlm_eval_kit/__init__.py +1 -0
  11. evalscope/backend/vlm_eval_kit/backend_manager.py +153 -0
  12. evalscope/benchmarks/__init__.py +4 -0
  13. evalscope/benchmarks/arc/__init__.py +5 -0
  14. evalscope/benchmarks/arc/ai2_arc.py +148 -0
  15. evalscope/benchmarks/arc/arc_adapter.py +231 -0
  16. evalscope/benchmarks/bbh/__init__.py +6 -0
  17. evalscope/benchmarks/bbh/bbh_adapter.py +308 -0
  18. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +23 -0
  19. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +25 -0
  20. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +33 -0
  21. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +37 -0
  22. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +72 -0
  23. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +44 -0
  24. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +78 -0
  25. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +28 -0
  26. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +37 -0
  27. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +37 -0
  28. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +37 -0
  29. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +42 -0
  30. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +25 -0
  31. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +43 -0
  32. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +37 -0
  33. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +41 -0
  34. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +63 -0
  35. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +44 -0
  36. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +40 -0
  37. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +30 -0
  38. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +10 -0
  39. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +77 -0
  40. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +40 -0
  41. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +40 -0
  42. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +40 -0
  43. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +28 -0
  44. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +17 -0
  45. evalscope/benchmarks/benchmark.py +65 -0
  46. evalscope/benchmarks/ceval/__init__.py +5 -0
  47. evalscope/benchmarks/ceval/ceval_adapter.py +340 -0
  48. evalscope/benchmarks/ceval/ceval_exam.py +159 -0
  49. evalscope/benchmarks/cmmlu/__init__.py +5 -0
  50. evalscope/benchmarks/cmmlu/cmmlu.py +166 -0
  51. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +369 -0
  52. evalscope/benchmarks/competition_math/__init__.py +5 -0
  53. evalscope/benchmarks/competition_math/competition_math.py +88 -0
  54. evalscope/benchmarks/competition_math/competition_math_adapter.py +470 -0
  55. evalscope/benchmarks/data_adapter.py +263 -0
  56. evalscope/benchmarks/general_qa/__init__.py +5 -0
  57. evalscope/benchmarks/general_qa/general_qa_adapter.py +186 -0
  58. evalscope/benchmarks/gsm8k/__init__.py +5 -0
  59. evalscope/benchmarks/gsm8k/gsm8k.py +127 -0
  60. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +236 -0
  61. evalscope/benchmarks/hellaswag/__init__.py +5 -0
  62. evalscope/benchmarks/hellaswag/hellaswag.py +116 -0
  63. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +222 -0
  64. evalscope/benchmarks/humaneval/__init__.py +5 -0
  65. evalscope/benchmarks/humaneval/humaneval.py +82 -0
  66. evalscope/benchmarks/humaneval/humaneval_adapter.py +21 -0
  67. evalscope/benchmarks/mmlu/__init__.py +5 -0
  68. evalscope/benchmarks/mmlu/mmlu.py +174 -0
  69. evalscope/benchmarks/mmlu/mmlu_adapter.py +375 -0
  70. evalscope/benchmarks/race/__init__.py +5 -0
  71. evalscope/benchmarks/race/race.py +118 -0
  72. evalscope/benchmarks/race/race_adapter.py +229 -0
  73. evalscope/benchmarks/trivia_qa/__init__.py +5 -0
  74. evalscope/benchmarks/trivia_qa/trivia_qa.py +104 -0
  75. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +207 -0
  76. evalscope/benchmarks/truthful_qa/__init__.py +5 -0
  77. evalscope/benchmarks/truthful_qa/truthful_qa.py +167 -0
  78. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +351 -0
  79. evalscope/cache.py +98 -0
  80. evalscope/cli/__init__.py +1 -0
  81. evalscope/cli/base.py +20 -0
  82. evalscope/cli/cli.py +26 -0
  83. evalscope/cli/start_perf.py +37 -0
  84. evalscope/cli/start_server.py +138 -0
  85. evalscope/config.py +165 -0
  86. evalscope/constants.py +150 -0
  87. evalscope/evaluator/__init__.py +3 -0
  88. evalscope/evaluator/evaluator.py +689 -0
  89. evalscope/evaluator/rating_eval.py +178 -0
  90. evalscope/evaluator/reviewer/__init__.py +1 -0
  91. evalscope/evaluator/reviewer/auto_reviewer.py +411 -0
  92. evalscope/metrics/__init__.py +1 -0
  93. evalscope/metrics/bundled_rouge_score/__init__.py +14 -0
  94. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +342 -0
  95. evalscope/metrics/code_metric.py +104 -0
  96. evalscope/metrics/math_accuracy.py +60 -0
  97. evalscope/metrics/metrics.py +405 -0
  98. evalscope/metrics/rouge_metric.py +129 -0
  99. evalscope/models/__init__.py +4 -0
  100. evalscope/models/custom/__init__.py +4 -0
  101. evalscope/models/custom/custom_model.py +53 -0
  102. evalscope/models/dummy_chat_model.py +50 -0
  103. evalscope/models/model.py +88 -0
  104. evalscope/models/model_adapter.py +586 -0
  105. evalscope/models/openai_model.py +103 -0
  106. evalscope/models/template.py +1446 -0
  107. evalscope/perf/__init__.py +0 -0
  108. evalscope/perf/_logging.py +32 -0
  109. evalscope/perf/api_plugin_base.py +60 -0
  110. evalscope/perf/custom_api.py +87 -0
  111. evalscope/perf/dashscope_api.py +84 -0
  112. evalscope/perf/dataset_plugin_base.py +64 -0
  113. evalscope/perf/datasets/__init__.py +0 -0
  114. evalscope/perf/datasets/line_by_line.py +18 -0
  115. evalscope/perf/datasets/longalpaca_12k.py +20 -0
  116. evalscope/perf/datasets/openqa.py +22 -0
  117. evalscope/perf/how_to_analysis_result.py +24 -0
  118. evalscope/perf/http_client.py +756 -0
  119. evalscope/perf/openai_api.py +130 -0
  120. evalscope/perf/plugin_registry.py +35 -0
  121. evalscope/perf/query_parameters.py +42 -0
  122. evalscope/perf/server_sent_event.py +43 -0
  123. evalscope/preprocess/__init__.py +1 -0
  124. evalscope/preprocess/tokenizers/__init__.py +0 -0
  125. evalscope/preprocess/tokenizers/gpt2_tokenizer.py +221 -0
  126. evalscope/registry/__init__.py +1 -0
  127. evalscope/registry/tasks/arc.yaml +29 -0
  128. evalscope/registry/tasks/bbh.yaml +27 -0
  129. evalscope/registry/tasks/bbh_mini.yaml +27 -0
  130. evalscope/registry/tasks/ceval.yaml +27 -0
  131. evalscope/registry/tasks/ceval_mini.yaml +27 -0
  132. evalscope/registry/tasks/cmmlu.yaml +27 -0
  133. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +28 -0
  134. evalscope/registry/tasks/general_qa.yaml +27 -0
  135. evalscope/registry/tasks/gsm8k.yaml +29 -0
  136. evalscope/registry/tasks/mmlu.yaml +29 -0
  137. evalscope/registry/tasks/mmlu_mini.yaml +27 -0
  138. evalscope/run.py +404 -0
  139. evalscope/run_arena.py +204 -0
  140. evalscope/run_ms.py +140 -0
  141. evalscope/summarizer.py +144 -0
  142. evalscope/third_party/__init__.py +1 -0
  143. evalscope/third_party/toolbench_static/__init__.py +3 -0
  144. evalscope/third_party/toolbench_static/eval.py +219 -0
  145. evalscope/third_party/toolbench_static/infer.py +278 -0
  146. evalscope/third_party/toolbench_static/llm/__init__.py +1 -0
  147. evalscope/third_party/toolbench_static/llm/swift_infer.py +45 -0
  148. evalscope/third_party/toolbench_static/toolbench_static.py +50 -0
  149. evalscope/tools/__init__.py +1 -0
  150. evalscope/tools/combine_reports.py +140 -0
  151. evalscope/tools/gen_mmlu_subject_mapping.py +90 -0
  152. evalscope/tools/rewrite_eval_results.py +95 -0
  153. evalscope/utils/__init__.py +4 -0
  154. evalscope/utils/arena_utils.py +247 -0
  155. evalscope/utils/completion_parsers.py +87 -0
  156. evalscope/utils/logger.py +64 -0
  157. evalscope/utils/task_cfg_parser.py +10 -0
  158. evalscope/utils/task_utils.py +19 -0
  159. evalscope/utils/utils.py +625 -0
  160. evalscope/version.py +4 -0
  161. evalscope-0.5.0.dist-info/METADATA +566 -0
  162. evalscope-0.5.0.dist-info/RECORD +165 -0
  163. evalscope-0.5.0.dist-info/WHEEL +5 -0
  164. evalscope-0.5.0.dist-info/entry_points.txt +3 -0
  165. evalscope-0.5.0.dist-info/top_level.txt +1 -0
evalscope/evaluator/rating_eval.py
@@ -0,0 +1,178 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ from typing import List, Union
+
+ import pandas as pd
+ import pyarrow as pa
+
+ from evalscope.constants import MetricMembers
+ from evalscope.utils.arena_utils import compute_elo
+ from evalscope.utils.logger import get_logger
+ from evalscope.utils import jsonl_to_list
+
+ logger = get_logger()
+
+ DEFAULT_COLUMNS_MAPPING = {
+     'model_a': 'model_a',
+     'model_b': 'model_b',
+     'win': 'win',
+     'tstamp': 'ts',
+     'language': 'lang'
+ }
+
+
+ class RatingEvaluate(object):
+
+     def __init__(self, metrics: list, baseline_model: str = None, **kwargs):
+         self.metrics = metrics
+         self.baseline_model = baseline_model
+         self.kwargs = kwargs
+
+     def preprocess(self, raw_data_df: pd.DataFrame, **kwargs):
+
+         # Get battles data
+         raw_data_df = raw_data_df.sort_values(ascending=True, by=['tstamp'])
+         battles = raw_data_df[raw_data_df['anony']].reset_index(drop=True)
+
+         return battles
+
+     def compute_elo_rating(self, raw_data):
+         battles = self.preprocess(raw_data_df=raw_data)
+         elo_ratings = compute_elo(battles)
+         col_model = 'Model'
+         col_elo_rating = 'Elo_Rating'
+         elo_ratings_res = pd.DataFrame(
+             [[n, elo_ratings[n]] for n in elo_ratings.keys()],
+             columns=[col_model, col_elo_rating]).sort_values(
+                 col_elo_rating, ascending=False).reset_index(drop=True)
+         elo_ratings_res = elo_ratings_res.round({col_elo_rating: 1})
+         return elo_ratings_res
+
+     def get_single_pairwise_rating(self, row: pd.Series):
+         tie = False
+         if 'win' in row:
+             win = row['win']
+             if win == 'tie':
+                 tie = True
+             else:
+                 if win == 'model_a':
+                     winner = row['model_a']
+                     loser = row['model_b']
+                 else:
+                     winner = row['model_b']
+                     loser = row['model_a']
+         elif 'win_1' in row:
+             win_1 = row['win_1']
+             win_2 = row['win_2']
+             if win_1 == 'tie' or win_1 != win_2:
+                 tie = True
+             else:
+                 if win_1 == 'model_a':
+                     winner = row['model_a']
+                     loser = row['model_b']
+                 else:
+                     winner = row['model_b']
+                     loser = row['model_a']
+         else:
+             raise ValueError('Unsupported data format')
+
+         if tie:
+             return [{
+                 'model': row['model_a'],
+                 'win': 0,
+                 'loss': 0,
+                 'tie': 1
+             }, {
+                 'model': row['model_b'],
+                 'win': 0,
+                 'loss': 0,
+                 'tie': 1
+             }]
+         else:
+             return [{
+                 'model': winner,
+                 'win': 1,
+                 'loss': 0,
+                 'tie': 0
+             }, {
+                 'model': loser,
+                 'win': 0,
+                 'loss': 1,
+                 'tie': 0
+             }]
+
+     def compute_pairwise_rating(self, raw_data):
+         df_all = self.preprocess(raw_data_df=raw_data)
+         model_list = (
+             df_all['model_a'].unique().tolist()
+             + df_all['model_b'].unique().tolist())
+         model_list = list(set(model_list))
+
+         list_res = []
+         # traverse df row by row
+         for index, row in df_all.iterrows():
+             if self.baseline_model is not None:
+                 if self.baseline_model not in [row['model_a'], row['model_b']]:
+                     logger.warning(
+                         f'One of the models in the battle should be the baseline model: {self.baseline_model}'
+                     )
+                     continue
+             rating = self.get_single_pairwise_rating(row)
+             list_res = list_res + rating
+
+         df = pd.DataFrame(list_res)
+         df = df.groupby(['model']).sum()
+
+         # remove baseline model
+         if self.baseline_model is not None:
+             df = df[df.index != self.baseline_model]
+         # add win rate
+         df['win_rate'] = df['win'] / (df['win'] + df['loss'] + df['tie'])
+         df['loss_rate'] = df['loss'] / (df['win'] + df['loss'] + df['tie'])
+         df['tie_rate'] = df['tie'] / (df['win'] + df['loss'] + df['tie'])
+         return df.sort_values(by='win_rate', ascending=False)
+
+     def compute_score_rating(self, raw_data):
+         df_all = self.preprocess(raw_data_df=raw_data)
+         df = df_all[['model', 'score']]
+
+         df_score = df.groupby(['model']).mean()
+         return df_score.sort_values(by='score', ascending=False)
+
+     def eval_samples(self, data_list: list):
+         res_all = []
+
+         raw_data: pd.DataFrame = None
+
+         if len(data_list) > 0:
+             raw_data = data_list[0]
+
+         for metric in self.metrics:
+
+             if metric == MetricMembers.ELO.value:
+                 res = self.compute_elo_rating(raw_data)
+                 res_all.append(res)
+
+             elif metric == MetricMembers.PAIRWISE.value:
+                 res = self.compute_pairwise_rating(raw_data)
+                 res_all.append(res)
+
+             elif metric == MetricMembers.SCORE.value:
+                 res = self.compute_score_rating(raw_data)
+                 res_all.append(res)
+
+             else:
+                 raise ValueError(f'Unsupported metric: {metric}')
+
+         return res_all
+
+     def run(self, prompts: Union[str, list], **kwargs) -> List[pd.DataFrame]:
+         """
+         Load the predicted samples and evaluate them in arena mode.
+         """
+         # raw_data = pd.read_json(prompts)
+         data_list = jsonl_to_list(prompts)
+         data_df = pa.Table.from_pylist(data_list).to_pandas()
+         res_list = self.eval_samples([data_df])
+
+         return res_list
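
Note: `RatingEvaluate` consumes battle records carrying `model_a`, `model_b`, `win`, `anony`, and `tstamp` fields. A minimal, illustrative sketch of the pairwise metric follows; the model names and records are invented for illustration and are not part of the package:

    import time

    import pandas as pd

    from evalscope.constants import MetricMembers
    from evalscope.evaluator.rating_eval import RatingEvaluate

    # Two toy battle records in the schema preprocess()/get_single_pairwise_rating() expect.
    battles = pd.DataFrame([
        {'model_a': 'model_x', 'model_b': 'model_y', 'win': 'model_a',
         'anony': True, 'tstamp': time.time()},
        {'model_a': 'model_x', 'model_b': 'model_y', 'win': 'tie',
         'anony': True, 'tstamp': time.time()},
    ])

    evaluator = RatingEvaluate(metrics=[MetricMembers.PAIRWISE.value])
    # eval_samples() takes a list whose first element is the battles DataFrame and
    # returns one result DataFrame per requested metric.
    win_table = evaluator.eval_samples([battles])[0]
    print(win_table)

The resulting table holds per-model win/loss/tie counts plus `win_rate`, `loss_rate`, and `tie_rate` columns, sorted by win rate.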
evalscope/evaluator/reviewer/__init__.py
@@ -0,0 +1 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
evalscope/evaluator/reviewer/auto_reviewer.py
@@ -0,0 +1,411 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ # flake8: noqa
+
+ import os
+ import random
+ import sys
+ import time
+ from abc import ABC, abstractmethod
+ from functools import partial
+ from typing import Any, List
+
+ import pandas as pd
+
+ from evalscope.constants import ArenaMode, EvalConfigKeys, FnCompletionParser, PositionBiasMitigation
+ from evalscope.models.openai_model import OpenAIModel
+ from evalscope.utils import completion_parsers
+ from evalscope.utils.arena_utils import (get_battle_pairs,
+                                          merge_ques_ans,
+                                          shuffle_pairwise_preferences)
+ from evalscope.utils import dump_jsonl_data, jsonl_to_list, random_seeded_choice
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ class BaseReviewer(ABC):
+
+     def __init__(self, **kwargs):
+         ...
+
+     @abstractmethod
+     def run(self, *args, **kwargs):
+         """
+         Run pairwise battles with given models.
+         """
+         raise NotImplementedError(
+             'run() method must be implemented in your subclass.')
+
+
+ class AutoReviewerGpt4(BaseReviewer):
+     """
+     Auto-review target answers(models) pairwise with GPT-4.
+
+     Args:
+         prompt_file: path to prompt templates file.
+         answer_file_list: list of paths to answer files.
+         review_result_file: path to review result file.
+         reviewer_args: config for reviewer(GPT-4).
+
+     Examples:
+         >>> from evalscope.evaluator.reviewer.auto_reviewer import AutoReviewerGpt4
+         >>> input_kwargs = dict(prompt_file='/path/to/prompt_file.jsonl', answer_file_list=['/path/to/ans1_file.jsonl',
+                 '/path/to/ans2_file.jsonl', ...], review_file='/path/to/review_file.jsonl',
+                 reviewer_args={'model': 'gpt-4', 'mode': 'single'})
+         >>> auto_reviewer = AutoReviewerGpt4(**input_kwargs)
+         >>> auto_reviewer.run(dry_run=False)
+     """
+
+     MODEL_NAME = 'gpt-4'
+
+     def __init__(self,
+                  prompt_file: str,
+                  answer_file_list: list,
+                  review_result_file: str,
+                  baseline_file: str = None,
+                  reference_file: str = None,
+                  reviewer_args: dict = None,
+                  cache_file: str = None,
+                  **kwargs):
+         super().__init__(**kwargs)
+
+         self.review_result_file = review_result_file
+         self.prompt_list = jsonl_to_list(prompt_file)
+         self.answer_list = [
+             jsonl_to_list(answer_file) for answer_file in answer_file_list
+         ]
+         self.reference_list = jsonl_to_list(
+             reference_file) if reference_file else []
+         self.cache_list = jsonl_to_list(
+             cache_file) if cache_file and os.path.isfile(cache_file) else []
+
+         self.reviewer_args = reviewer_args if reviewer_args \
+             else self._get_default_args()
+
+         self.review_mode = self.reviewer_args.pop('mode', ArenaMode.PAIRWISE)
+         if self.review_mode == ArenaMode.PAIRWISE_BASELINE:
+             assert baseline_file is not None, f'baseline_file is required for {ArenaMode.PAIRWISE_BASELINE} mode'
+             self.answer_list.append(jsonl_to_list(baseline_file))
+             self.baseline_idx = len(self.answer_list) - 1
+
+         self.position_bias_mitigation = self.reviewer_args.pop(
+             EvalConfigKeys.POSITION_BIAS_MITIGATION,
+             PositionBiasMitigation.NONE)
+         if self.position_bias_mitigation == PositionBiasMitigation.RANDOMIZE_ORDER:
+             self.random_seed = self.reviewer_args.pop(
+                 EvalConfigKeys.RANDOM_SEED, 123)
+
+         fn_completion_parser = self.reviewer_args.pop(
+             EvalConfigKeys.FN_COMPLETION_PARSER,
+             FnCompletionParser.LMSYS_PARSER)
+         completion_parser_kwargs = self.reviewer_args.pop(
+             EvalConfigKeys.COMPLETION_PARSER_KWARGS, {})
+         if isinstance(fn_completion_parser, str):
+             fn_completion_parser = getattr(completion_parsers,
+                                            fn_completion_parser)
+
+         self.fn_completion_parser = partial(fn_completion_parser,
+                                             **completion_parser_kwargs)
+         self.gpt_predictor = OpenAIModel(model_cfg=self.reviewer_args)
+
+     @staticmethod
+     def _get_default_args():
+         return dict(
+             model=AutoReviewerGpt4.MODEL_NAME,
+             mode=ArenaMode.PAIRWISE,
+             position_bias_mitigation=PositionBiasMitigation.NONE,
+             fn_completion_parser=FnCompletionParser.LMSYS_PARSER,
+             random_seed=123,
+         )
+
+     @staticmethod
+     def gen_prompt(prompts_list: list,
+                    type: str,
+                    category: str,
+                    ques: str,
+                    ans1: str,
+                    ans2: str = None,
+                    ans_ref: str = None):
+         """
+         Generate prompt for Auto-reviewer with GPT-4.
+         """
+
+         # Default to general category (idx 0)
+         target_prompt_dict = prompts_list[0]
+         for item in prompts_list:
+             is_category_match = category in item['category'] if isinstance(
+                 item['category'], list) else item['category'] == category
+             is_type_match = item.get('type', ArenaMode.PAIRWISE) == type
+             if is_category_match and is_type_match:
+                 target_prompt_dict = item
+                 break
+             elif is_type_match and target_prompt_dict.get('type',
+                                                           ArenaMode.PAIRWISE) != type:
+                 target_prompt_dict = item  # fallback to type match
+
+         sys_prompt = target_prompt_dict['system_prompt']
+         prompt_template = target_prompt_dict['prompt_template']
+         defaults = target_prompt_dict.get('defaults', dict({}))
+         output_format = target_prompt_dict.get('output_format',
+                                                '[[rating_a,rating_b]]')
+
+         if type == ArenaMode.SINGLE:
+             user_prompt = prompt_template.format(
+                 question=ques, answer=ans1, ref_answer_1=ans_ref, **defaults)
+         else:
+             user_prompt = prompt_template.format(
+                 question=ques,
+                 answer_a=ans1,
+                 answer_b=ans2,
+                 ref_answer_1=ans_ref,
+                 **defaults)
+
+         return sys_prompt, user_prompt, output_format
+
+     def get_review_cache(self, model_a, model_b, question) -> list:
+         if model_b:
+             cache_hit = next(
+                 (r for r in self.cache_list if r['model_a'] == model_a
+                  and r['model_b'] == model_b and r['question'] == question),
+                 None)
+         else:
+             cache_hit = next(
+                 (r for r in self.cache_list
+                  if r['model'] == model_a and r['question'] == question), None)
+         return cache_hit
+
+     def get_review_pair(self, item: List[dict], dry_run=False, **kwargs) -> dict:
+
+         question = item[0]['text']
+         question_id = item[0]['question_id']
+         category = item[0]['category']
+
+         model_a = item[0]['model_id']
+         model_b = item[1]['model_id']
+
+         ans1 = item[0]['answer']
+         ans2 = item[1]['answer']
+
+         review_cache = self.get_review_cache(model_a, model_b, question)
+         if review_cache:
+             logger.info(f'Use cache review for {model_a} vs {model_b} ...')
+             return review_cache
+
+         if self.position_bias_mitigation == PositionBiasMitigation.SWAP_POSITION:
+             review_text_1, winner_1, score_1 = self._get_review_pair(
+                 model_a, model_b, question, category, ans1, ans2, dry_run=dry_run, **kwargs)
+             review_text_2, winner_2, score_2 = self._get_review_pair(
+                 model_b, model_a, question, category, ans2, ans1, dry_run=dry_run, **kwargs)
+
+             # Swap winner for the second round.
+             if winner_2 == 'model_a':
+                 winner_2 = 'model_b'
+             elif winner_2 == 'model_b':
+                 winner_2 = 'model_a'
+             review_result = dict(
+                 model_a=model_a,
+                 model_b=model_b,
+                 win_1=winner_1,
+                 win_2=winner_2,
+                 anony=True,
+                 tstamp=time.time(),
+                 language=item[0].get('language', 'NA'),
+                 question_id=question_id,
+                 category=category,
+                 question=question,
+                 review_text_1=review_text_1,
+                 review_text_2=review_text_2)
+         else:
+             review_text, winner, scores = self._get_review_pair(
+                 model_a, model_b, question, category, ans1, ans2, dry_run=dry_run, **kwargs)
+
+             if dry_run:
+                 scores = [round(random.random(), 1), round(random.random(), 1)]
+                 winner = 'model_a' if scores[0] > scores[1] else 'model_b'
+
+             review_result = dict(
+                 model_a=model_a,
+                 model_b=model_b,
+                 scores=scores,
+                 win=winner,
+                 anony=True,
+                 tstamp=time.time(),
+                 language=item[0].get('language', 'NA'),
+                 question_id=question_id,
+                 category=category,
+                 question=question,
+                 review_text=review_text)
+         return review_result
+
+     def get_review_single(self, row: List[dict], dry_run: bool = False, **kwargs):
+         item = row[0]
+         model = item['model_id']
+         question = item['text']
+         question_id = item['question_id']
+         category = item['category']
+         answer = item['answer']
+
+         review_cache = self.get_review_cache(model, None, question)
+         if review_cache:
+             logger.info(f'Use cache review for {model} ...')
+             return review_cache
+
+         review_text, score = self._get_review_single(model, question, category, answer, dry_run=dry_run, **kwargs)
+
+         review_result = dict(
+             model=model,
+             score=score,
+             anony=True,
+             tstamp=time.time(),
+             language=item.get('language', 'NA'),
+             question_id=question_id,
+             category=category,
+             question=question,
+             review_text=review_text)
+         return review_result
+
+     def _get_review_pair(self, model_a, model_b, question, category, ans1, ans2, dry_run=False, **kwargs) -> (str, Any):
+         input_msg = dict(
+             ques=question, category=category, ans1=ans1, ans2=ans2)
+
+         if self.reference_list:
+             ans_ref = next((ref for ref in self.reference_list
+                             if ref.get('text') == question), None)
+             assert ans_ref['answer']
+             input_msg['ans_ref'] = ans_ref['answer']
+
+         sys_prompt, user_prompt, output_format = AutoReviewerGpt4.gen_prompt(
+             prompts_list=self.prompt_list,
+             type=ArenaMode.SINGLE if self.review_mode == ArenaMode.SINGLE else ArenaMode.PAIRWISE,
+             **input_msg)
+
+         if dry_run:
+             review_text = self._get_reviewer_prediction_dummy(sys_prompt, user_prompt, output_format)
+         else:
+             review_text = self._get_reviewer_prediction(sys_prompt, user_prompt, **kwargs)
+
+         result = self.fn_completion_parser(
+             review_text, output_format=output_format)
+         if not isinstance(result, tuple):
+             result = (result, None)
+         return review_text, *result
+
+     def _get_review_single(self, model, question, category, answer, dry_run=False, **kwargs) -> (str, Any):
+         input_msg = dict(ques=question, category=category, ans1=answer)
+
+         if self.reference_list:
+             ans_ref = next((ref for ref in self.reference_list
+                             if ref.get('text') == question), None)
+             assert ans_ref['answer']
+             input_msg['ans_ref'] = ans_ref['answer']
+
+         sys_prompt, user_prompt, output_format = AutoReviewerGpt4.gen_prompt(
+             prompts_list=self.prompt_list,
+             type=ArenaMode.SINGLE if self.review_mode == ArenaMode.SINGLE else ArenaMode.PAIRWISE,
+             **input_msg)
+
+         if dry_run:
+             review_text = self._get_reviewer_prediction_dummy(sys_prompt, user_prompt, output_format)
+         else:
+             review_text = self._get_reviewer_prediction(sys_prompt, user_prompt, **kwargs)
+
+         score = self.fn_completion_parser(review_text, output_format)
+         return review_text, score
+
+     def _get_reviewer_prediction_dummy(self, sys_prompt: str, user_prompt: str,
+                                        output_format) -> str:
+         logger.info('Get dummy scores for input prompt ...')
+         if output_format == '[[rating]]':
+             return f'[[{round(random.random(), 2)}]]'
+         if output_format == '[[rating_a,rating_b]]':
+             ans_list = [round(random.random(), 2), round(random.random(), 2)]
+             return ' '.join(str(element) for element in ans_list)
+         elif output_format == '[[A]]':
+             return random.choice(['[[A]]', '[[B]]', '[[C]]'])
+         elif output_format == "[{'model': <model-name>, 'rank': <model-rank>}, " \
+                               "{'model': <model-name>, 'rank': <model-rank>}]":
+             rank_1 = random.choice([1, 2])
+             rank_2 = 1 if rank_1 == 2 else 2
+             return f"[{{'model': 'model_a', 'rank': {rank_1}}}, {{'model': 'model_b', 'rank': {rank_2}}}]"
+
+     def _get_reviewer_prediction(self, sys_prompt: str, user_prompt: str, **kwargs) -> str:
+
+         input_msg = dict(sys_prompt=sys_prompt, user_prompt=user_prompt)
+
+         # Call GPT-4 predictor
+         # TODO: Add more reviewer implementation
+         resp = self.gpt_predictor.predict(model_id=self.MODEL_NAME, inputs=input_msg, **kwargs)
+
+         if resp is None or len(resp) == 0:
+             logger.error(f'Failed to get response from {self.MODEL_NAME} for input: {input_msg}')
+
+         ans_text = resp['ans_text']
+         # model_id = resp['model_id']
+
+         return ans_text
+
+     def run(self, dry_run: bool = False, **kwargs):
+         print(f'Run battles for models with dry_run={dry_run} ...')
+
+         os.makedirs(os.path.dirname(self.review_result_file), exist_ok=True)
+
+         if len(self.answer_list) == 0:
+             raise Exception('The answer list cannot be empty.')
+
+         merge_key = 'question_id'
+         merged_ans_df = merge_ques_ans(self.answer_list, merge_key=merge_key)
+         merged_ans_df = merged_ans_df.drop(columns=['question_id'])
+
+         if self.review_mode == ArenaMode.PAIRWISE:
+             battle_pairs = get_battle_pairs(merged_ans_df.columns)
+         elif self.review_mode == ArenaMode.PAIRWISE_BASELINE:
+             battle_pairs = get_battle_pairs(merged_ans_df.columns,
+                                             self.baseline_idx)
+         elif self.review_mode == ArenaMode.SINGLE:
+             battle_pairs = [(col, ) for col in merged_ans_df.columns]
+         else:
+             raise Exception(f'NotSupported mode: {self.review_mode}')
+
+         res_list = []
+         for t in battle_pairs:
+             pair_df = merged_ans_df[list(t)]
+             if self.position_bias_mitigation == PositionBiasMitigation.RANDOMIZE_ORDER:
+                 pair_df.columns = ['output_1', 'output_2']
+                 pair_df['is_switched_outputs'] = pair_df.apply(
+                     lambda x: random_seeded_choice(
+                         seed='is_switched_outputs' + x[0]['text'] + str(
+                             self.random_seed),
+                         choices=[False, True],
+                     ),
+                     axis=1,
+                 )
+                 pair_df = shuffle_pairwise_preferences(
+                     pair_df, pair_df['is_switched_outputs'])
+
+             for index, row in pair_df.iterrows():
+                 row_result = self.get_review_pair(row.to_list(), dry_run=dry_run, **kwargs) \
+                     if self.review_mode != ArenaMode.SINGLE \
+                     else self.get_review_single(row.to_list(), dry_run=dry_run, **kwargs)
+                 res_list.append(row_result)
+             dump_jsonl_data(res_list, self.review_result_file)
+
+
+ if __name__ == '__main__':
+     from pathlib import Path
+
+     work_path = os.path.join(Path(__file__).absolute().parent, '../../../')
+     prompt_template_path = os.path.join(work_path, 'evalscope/registry/data/prompt_template/prompt_templates.jsonl')
+     answer_file_list = [os.path.join(work_path, 'outputs/arena/default/answers/answer_chatglm2-6b.jsonl'),
+                         os.path.join(work_path, 'outputs/arena/default/answers/answer_llama2-7b.jsonl')]
+     review_result_file_path = os.path.join(work_path, 'outputs/arena/default/reviews/review_gpt4.jsonl')
+
+     input_kwargs = dict(prompt_file=prompt_template_path,
+                         answer_file_list=answer_file_list,
+                         review_result_file=review_result_file_path,
+                         reviewer_args={},
+                         baseline_file='',
+                         reference_file='',
+                         cache_file='', )
+
+     auto_reviewer = AutoReviewerGpt4(**input_kwargs)
+     auto_reviewer.run(dry_run=True)
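
The review records that `AutoReviewerGpt4.run()` dumps to `review_result_file` carry the `model_a`/`model_b`, `win` (or `win_1`/`win_2`), `anony`, and `tstamp` fields that `RatingEvaluate` reads, so the two stages can be chained. A rough sketch of that hand-off, reusing the illustrative paths from the `__main__` block above (with `dry_run=True` the reviewer produces random dummy verdicts and makes no GPT-4 calls):

    from evalscope.constants import MetricMembers
    from evalscope.evaluator.rating_eval import RatingEvaluate
    from evalscope.evaluator.reviewer.auto_reviewer import AutoReviewerGpt4

    review_file = 'outputs/arena/default/reviews/review_gpt4.jsonl'

    # Stage 1: pairwise reviews for each pair of answer files (dummy verdicts under dry_run).
    reviewer = AutoReviewerGpt4(
        prompt_file='evalscope/registry/data/prompt_template/prompt_templates.jsonl',
        answer_file_list=['outputs/arena/default/answers/answer_chatglm2-6b.jsonl',
                          'outputs/arena/default/answers/answer_llama2-7b.jsonl'],
        review_result_file=review_file,
        reviewer_args={})
    reviewer.run(dry_run=True)

    # Stage 2: aggregate the dumped reviews into a per-model win-rate table.
    ratings = RatingEvaluate(metrics=[MetricMembers.PAIRWISE.value]).run(review_file)
    print(ratings[0])

A real run (`dry_run=False`) additionally assumes OpenAI API credentials are configured for the GPT-4 reviewer.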
evalscope/metrics/__init__.py
@@ -0,0 +1 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
evalscope/metrics/bundled_rouge_score/__init__.py
@@ -0,0 +1,14 @@
+ # Copyright 2022 The rouge_score Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Bundled rouge_score.rouge_scorer to avoid typing error."""