evalscope-0.13.1-py3-none-any.whl → evalscope-0.13.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic. (The original page linked to further details here; the link target was not preserved in this extract.)

Files changed (35)
  1. evalscope/arguments.py +1 -1
  2. evalscope/backend/rag_eval/utils/llm.py +4 -5
  3. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  4. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +109 -0
  5. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  6. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +120 -0
  7. evalscope/benchmarks/arena_hard/utils.py +162 -0
  8. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +2 -5
  9. evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -1
  10. evalscope/benchmarks/data_adapter.py +26 -2
  11. evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
  12. evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -11
  13. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -5
  14. evalscope/benchmarks/live_code_bench/testing_util.py +3 -3
  15. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  16. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +182 -0
  17. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +2 -5
  18. evalscope/config.py +1 -1
  19. evalscope/metrics/llm_judge.py +1 -1
  20. evalscope/models/chat_adapter.py +32 -11
  21. evalscope/perf/arguments.py +8 -6
  22. evalscope/perf/benchmark.py +31 -63
  23. evalscope/perf/plugin/api/openai_api.py +4 -2
  24. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  25. evalscope/perf/utils/db_util.py +2 -2
  26. evalscope/version.py +2 -2
  27. {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/METADATA +10 -49
  28. {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/RECORD +35 -28
  29. tests/cli/test_all.py +33 -24
  30. tests/cli/test_run.py +35 -18
  31. tests/rag/test_ragas.py +4 -1
  32. {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/LICENSE +0 -0
  33. {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/WHEEL +0 -0
  34. {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/entry_points.txt +0 -0
  35. {evalscope-0.13.1.dist-info → evalscope-0.13.2.dist-info}/top_level.txt +0 -0
evalscope/arguments.py CHANGED
@@ -77,7 +77,7 @@ def add_argument(parser: argparse.ArgumentParser):
77
77
  # LLMJudge arguments
78
78
  parser.add_argument('--judge-strategy', type=str, default=JudgeStrategy.AUTO, help='The judge strategy.')
79
79
  parser.add_argument('--judge-model-args', type=json.loads, default='{}', help='The judge model args, should be a json string.') # noqa: E501
80
- parser.add_argument('--judge-worker-num', type=int, default=8, help='The number of workers for the judge model.')
80
+ parser.add_argument('--judge-worker-num', type=int, default=1, help='The number of workers for the judge model.')
81
81
  # yapf: enable
82
82
 
83
83
 
@@ -6,7 +6,7 @@ from modelscope.utils.hf_util import GenerationConfig
6
6
  from typing import Any, Dict, Iterator, List, Mapping, Optional
7
7
 
8
8
  from evalscope.constants import DEFAULT_MODEL_REVISION
9
- from evalscope.models import ChatGenerationModelAdapter
9
+ from evalscope.models import ChatGenerationModelAdapter, LocalModel
10
10
 
11
11
 
12
12
  class LLM:
@@ -38,8 +38,7 @@ class LocalLLM(BaseLLM):
38
38
  super().__init__(**kw)
39
39
  self.model_name = os.path.basename(self.model_name_or_path)
40
40
  self.model = ChatGenerationModelAdapter(
41
- model_id=self.model_name_or_path,
42
- model_revision=self.model_revision,
41
+ model=LocalModel(model_id=self.model_name_or_path, model_revision=self.model_revision),
43
42
  generation_config=GenerationConfig(**self.generation_config) if self.generation_config else None,
44
43
  )
45
44
 
@@ -53,8 +52,8 @@ class LocalLLM(BaseLLM):
53
52
  """Run the LLM on the given input."""
54
53
  infer_cfg = {'stop': stop}
55
54
 
56
- response = self.model._model_generate(prompt, infer_cfg)
57
- return response
55
+ response, _ = self.model._model_generate([prompt], infer_cfg=infer_cfg)
56
+ return response[0][0]
58
57
 
59
58
  @property
60
59
  def _identifying_params(self) -> Dict[str, Any]:
evalscope/benchmarks/alpaca_eval/__init__.py: File without changes
@@ -0,0 +1,109 @@
1
+ import re
2
+ from collections import defaultdict
3
+ from typing import Any, List
4
+
5
+ from evalscope.benchmarks import Benchmark, DataAdapter
6
+ from evalscope.metrics import Metric, mean, metric_registry
7
+ from evalscope.metrics.llm_judge import LLMJudge
8
+ from evalscope.utils.logger import get_logger
9
+
10
+ # flake8: noqa
11
+
12
+ logger = get_logger()
13
+
14
+ GRADER_SYSTEM_PROMPT = """You are a highly efficient assistant, who evaluates and selects the best large language model (LLMs) based on the quality of their responses to a given instruction. This process will be used to create a leaderboard reflecting the most accurate and human-preferred answers."""
15
+
16
+ GRADER_TEMPLATE = """
17
+ I require a leaderboard for various large language models. I'll provide you with prompts given to these models and their corresponding outputs. Your task is to assess these responses, and select the model that produces the best output from a human perspective.
18
+
19
+ ## Instruction
20
+
21
+ {{
22
+ "instruction": "{instruction}"
23
+ }}
24
+
25
+ ## Model Outputs
26
+
27
+ Here are the unordered outputs from the models. Each output is associated with a specific model, identified by a unique model identifier.
28
+
29
+ {{
30
+ {{
31
+ "model_identifier": "m",
32
+ "output": "{output_1}"
33
+ }},
34
+ {{
35
+ "model_identifier": "M",
36
+ "output": "{output_2}"
37
+ }}
38
+ }}
39
+
40
+ ## Task
41
+
42
+ Evaluate the models based on the quality and relevance of their outputs, and select the model that generated the best output. Answer by providing the model identifier of the best model. We will use your output as the name of the best model, so make sure your output only contains one of the following model identifiers and nothing else (no quotes, no spaces, no new lines, ...): m or M.
43
+
44
+ ## Best Model Identifier
45
+ """.strip() # noqa: E501
46
+
47
+
48
+ @Benchmark.register(
49
+ name='alpaca_eval',
50
+ pretty_name='AlpacaEval2.0',
51
+ dataset_id='AI-ModelScope/alpaca_eval',
52
+ subset_list=['alpaca_eval_gpt4_baseline'],
53
+ metric_list=['winrate'],
54
+ few_shot_num=0,
55
+ train_split=None,
56
+ eval_split='eval')
57
+ class AlpacaEvalAdapter(DataAdapter):
58
+
59
+ def __init__(self, *args, **kwargs):
60
+ super().__init__(*args, **kwargs)
61
+
62
+ # register metrics
63
+ metric_registry.register(Metric(name='winrate', object=mean))
64
+
65
+ # whether to use LLM as a judge
66
+ self.llm_as_a_judge = True
67
+
68
+ def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
69
+ question = input_d['instruction']
70
+ return self.gen_prompt_data(question)
71
+
72
+ def get_gold_answer(self, input_d: dict) -> str:
73
+ return input_d['output']
74
+
75
+ def parse_pred_result(self, result: str, raw_input_d: dict = None, **kwargs) -> str:
76
+ return result.strip()
77
+
78
+ def match(self, gold: str, pred: str):
79
+ # simple match
80
+ logger.warning(f'Please use LLMJudge to match the result for {self.name}')
81
+ return None
82
+
83
+ def llm_match(self, gold: Any, pred: Any, judge: LLMJudge, **kwargs) -> bool:
84
+ raw_input = kwargs.get('raw_input', None)
85
+ instruction = raw_input['instruction']
86
+ # gold is baseline answer 'm', pred is model answer 'M'
87
+ prompt = GRADER_TEMPLATE.format(instruction=instruction, output_1=gold, output_2=pred)
88
+ # get grading response
89
+ grading_response = judge(prompt, system_prompt=GRADER_SYSTEM_PROMPT)
90
+ # parse grading response
91
+ match = re.search(r'(m|M)', grading_response)
92
+ res = match.group(0) if match else None
93
+ if res:
94
+ return res == 'M'
95
+ else:
96
+ logger.info(f'Failed to parse grading response: {prompt=}\n {grading_response=}')
97
+ return None
98
+
99
+ def compute_metric(self, review_res_list: List[bool], **kwargs) -> List[dict]:
100
+ """
101
+ compute weighted mean of the bleu score of all samples
102
+
103
+ Args:
104
+ review_res_list: [{'is_correct': 1, 'is_incorrect': 0, 'is_not_attempted': 0}, ...]
105
+ """
106
+ # zip dict answers
107
+ res_list = [res for res in review_res_list if res is not None]
108
+
109
+ return super().compute_metric(res_list, **kwargs)
evalscope/benchmarks/arena_hard/__init__.py: File without changes
@@ -0,0 +1,120 @@
1
+ import re
2
+ from collections import defaultdict
3
+ from typing import Any, List
4
+
5
+ from evalscope.benchmarks import Benchmark, DataAdapter
6
+ from evalscope.constants import AnswerKeys
7
+ from evalscope.metrics import Metric, mean, metric_registry
8
+ from evalscope.metrics.llm_judge import LLMJudge
9
+ from evalscope.utils.logger import get_logger
10
+
11
+ # flake8: noqa
12
+
13
+ logger = get_logger()
14
+
15
+ GRADER_SYSTEM_PROMPT = "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"." # noqa: E501
16
+
17
+ GRADER_TEMPLATE = "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's Answer|>\n{answer_1}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{answer_2}\n<|The End of Assistant B's Answer|>".strip(
18
+ ) # noqa: E501
19
+
20
+
21
+ @Benchmark.register(
22
+ name='arena_hard',
23
+ pretty_name='ArenaHard',
24
+ dataset_id='AI-ModelScope/arena-hard-auto-v0.1',
25
+ metric_list=['winrate'],
26
+ few_shot_num=0,
27
+ train_split=None,
28
+ eval_split='test')
29
+ class AlpacaEvalAdapter(DataAdapter):
30
+
31
+ def __init__(self, *args, **kwargs):
32
+ super().__init__(*args, **kwargs)
33
+
34
+ # register metrics
35
+ metric_registry.register(Metric(name='winrate', object=mean))
36
+
37
+ # whether to use LLM as a judge
38
+ self.llm_as_a_judge = True
39
+
40
+ def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
41
+ question = input_d['question']
42
+ return self.gen_prompt_data(question)
43
+
44
+ def get_gold_answer(self, input_d: dict) -> str:
45
+ return input_d['prediction']
46
+
47
+ def parse_pred_result(self, result: str, raw_input_d: dict = None, **kwargs) -> str:
48
+ return result.strip()
49
+
50
+ def match(self, gold: str, pred: str):
51
+ # simple match
52
+ logger.warning(f'Please use LLMJudge to match the result for {self.name}')
53
+ return None
54
+
55
+ def llm_match(self, gold: Any, pred: Any, judge: LLMJudge, **kwargs) -> dict:
56
+ from .utils import post_process_arenahard
57
+
58
+ raw_input = kwargs.get('raw_input', None)
59
+ question = raw_input['question']
60
+ # gold is baseline answer 'A', pred is model answer 'B'
61
+ prompt1 = GRADER_TEMPLATE.format(question=question, answer_1=gold, answer_2=pred)
62
+ # reverse the order
63
+ prompt2 = GRADER_TEMPLATE.format(question=question, answer_1=pred, answer_2=gold)
64
+ # get grading response
65
+ game1_response = judge(prompt1, system_prompt=GRADER_SYSTEM_PROMPT)
66
+ game2_response = judge(prompt2, system_prompt=GRADER_SYSTEM_PROMPT)
67
+ # parse grading response
68
+ res1 = post_process_arenahard(game1_response)
69
+ res2 = post_process_arenahard(game2_response)
70
+ return {
71
+ 'model_a':
72
+ 'gpt4-0314',
73
+ 'model_b':
74
+ 'test_model',
75
+ 'games': [
76
+ {
77
+ 'user_prompt': prompt1,
78
+ 'judgment': game1_response,
79
+ 'score': res1
80
+ },
81
+ {
82
+ 'user_prompt': prompt2,
83
+ 'judgment': game2_response,
84
+ 'score': res2
85
+ },
86
+ ]
87
+ }
88
+
89
+ def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
90
+ """
91
+ compute score of the model
92
+ """
93
+ import pandas as pd
94
+
95
+ from .utils import compute_mle_elo, get_battles_from_row, get_bootstrap_result, get_win_rate_column
96
+
97
+ if isinstance(review_res_list[0], list):
98
+ review_res_list = [item for sublist in review_res_list for item in sublist]
99
+
100
+ battles = pd.concat([get_battles_from_row(res) for res in review_res_list])
101
+
102
+ bootstrap_online_elo = compute_mle_elo(battles)
103
+
104
+ # bootstrap_elo_lu = get_bootstrap_result(battles, compute_mle_elo, 100)
105
+ stats = pd.DataFrame()
106
+ stats['results'] = None
107
+ stats['results'] = stats['results'].astype('object')
108
+
109
+ for i, model in enumerate(bootstrap_online_elo.index):
110
+ # assert model in bootstrap_elo_lu.columns
111
+ stats.at[i, 'model'] = model
112
+ stats.at[i, 'score'] = bootstrap_online_elo[model]
113
+ # stats.at[i, "lower"] = np.percentile(bootstrap_elo_lu[model], 2.5)
114
+ # stats.at[i, "upper"] = np.percentile(bootstrap_elo_lu[model], 97.5)
115
+
116
+ # stats['score'] = get_win_rate_column(stats, 'score', 'gpt4-0314').tolist()
117
+
118
+ score = get_win_rate_column(stats, 'score', 'gpt4-0314').at['test_model']
119
+
120
+ return [{'metric_name': 'winrate', 'score': score, 'num': len(review_res_list)}]
@@ -0,0 +1,162 @@
1
+ import math
2
+ import numpy as np
3
+ import pandas as pd
4
+ import re
5
+ from collections import defaultdict
6
+ from sklearn.linear_model import LogisticRegression
7
+ from tqdm import tqdm
8
+
9
+ from evalscope.utils.logger import get_logger
10
+
11
+ logger = get_logger()
12
+
13
+
14
+ def post_process_arenahard(completion):
15
+ result = re.findall(r'\[\[([AB<>=]+)\]\]', completion)
16
+ if result:
17
+ return result[0]
18
+ else:
19
+ return None
20
+
21
+
22
+ def get_battles_from_row(row, first_game_only=False, multiplier=3):
23
+ results = []
24
+ output = {'model_a': row['model_a'], 'model_b': row['model_b']}
25
+
26
+ game = row['games'][0]
27
+ weight = 1
28
+ if game['score'] == 'A=B':
29
+ output['winner'] = 'tie'
30
+ elif game['score'] == 'A>B':
31
+ output['winner'] = 'model_a'
32
+ elif game['score'] == 'A>>B':
33
+ output['winner'] = 'model_a'
34
+ weight = multiplier
35
+ elif game['score'] == 'B>A':
36
+ output['winner'] = 'model_b'
37
+ elif game['score'] == 'B>>A':
38
+ output['winner'] = 'model_b'
39
+ weight = multiplier
40
+ else:
41
+ weight = 0
42
+
43
+ if weight:
44
+ results += [output] * weight
45
+
46
+ if first_game_only:
47
+ return pd.DataFrame(results)
48
+
49
+ # game 2
50
+ output = {'model_a': row['model_a'], 'model_b': row['model_b']}
51
+
52
+ game = row['games'][1]
53
+
54
+ weight = 1
55
+ if game['score'] == 'A=B':
56
+ output['winner'] = 'tie'
57
+ elif game['score'] == 'A>B':
58
+ output['winner'] = 'model_b'
59
+ elif game['score'] == 'A>>B':
60
+ output['winner'] = 'model_b'
61
+ weight = multiplier
62
+ elif game['score'] == 'B>A':
63
+ output['winner'] = 'model_a'
64
+ elif game['score'] == 'B>>A':
65
+ output['winner'] = 'model_a'
66
+ weight = multiplier
67
+ else:
68
+ weight = 0
69
+
70
+ if weight:
71
+ results += [output] * weight
72
+
73
+ return pd.DataFrame(results)
74
+
75
+
76
+ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
77
+ models = pd.concat([df['model_a'], df['model_b']]).unique()
78
+ models = pd.Series(np.arange(len(models)), index=models)
79
+
80
+ # duplicate battles
81
+ df = pd.concat([df, df], ignore_index=True)
82
+ p = len(models.index)
83
+ n = df.shape[0]
84
+
85
+ X = np.zeros([n, p])
86
+ X[np.arange(n), models[df['model_a']]] = +math.log(BASE)
87
+ X[np.arange(n), models[df['model_b']]] = -math.log(BASE)
88
+
89
+ # one A win => two A win
90
+ Y = np.zeros(n)
91
+ Y[df['winner'] == 'model_a'] = 1.0
92
+
93
+ # one tie => one A win + one B win
94
+ # find tie + tie (both bad) index
95
+ tie_idx = (df['winner'] == 'tie') | (df['winner'] == 'tie (bothbad)')
96
+ tie_idx[len(tie_idx) // 2:] = False
97
+ Y[tie_idx] = 1.0
98
+
99
+ if len(np.unique(Y)) < 2:
100
+ logger.info('Warning: Only one class in the data')
101
+ elo_scores = pd.Series(INIT_RATING, index=models.index)
102
+ if np.all(Y == 1.0):
103
+ elo_scores[df['model_a'].iloc[0]] += SCALE # Boost the winning model
104
+ elif np.all(Y == 0.0):
105
+ elo_scores[df['model_b'].iloc[0]] += SCALE # Boost the winning model
106
+ return elo_scores.sort_values(ascending=False)
107
+
108
+ lr = LogisticRegression(
109
+ fit_intercept=False, penalty=None, tol=1e-8) # May need to set a small value when not use GPT4 as judge model
110
+ lr.fit(X, Y)
111
+
112
+ elo_scores = SCALE * lr.coef_[0] + INIT_RATING
113
+
114
+ # set anchor as gpt4-0314 = 1000
115
+ if 'gpt4-0314' in models.index:
116
+ elo_scores += 1000 - elo_scores[models['gpt4-0314']]
117
+ return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)
118
+
119
+
120
+ def get_bootstrap_result(battles, func_compute_elo, num_round):
121
+ rows = []
122
+ for _ in tqdm(range(num_round), desc='bootstrap'):
123
+ res = func_compute_elo(battles.sample(frac=1.0, replace=True))
124
+ if res is not None:
125
+ rows.append(res)
126
+ df = pd.DataFrame(rows)
127
+ return df[df.median().sort_values(ascending=False).index]
128
+
129
+
130
+ def preety_print_two_ratings(ratings_1, ratings_2, column_names):
131
+ df = (
132
+ pd.DataFrame(
133
+ [[n, ratings_1[n], ratings_2[n]] for n in ratings_1.keys()],
134
+ columns=['Model', column_names[0], column_names[1]],
135
+ ).sort_values(column_names[0], ascending=False).reset_index(drop=True))
136
+ df[column_names[0]] = (df[column_names[0]] + 0.5).astype(int)
137
+ df[column_names[1]] = (df[column_names[1]] + 0.5).astype(int)
138
+ df.index = df.index + 1
139
+ return df
140
+
141
+
142
+ def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
143
+ names = sorted(list(elo_ratings.keys()))
144
+ wins = defaultdict(lambda: defaultdict(lambda: 0))
145
+ for a in names:
146
+ for b in names:
147
+ ea = 1 / (1 + BASE**((elo_ratings[b] - elo_ratings[a]) / SCALE))
148
+ wins[a][b] = ea
149
+ wins[b][a] = 1 - ea
150
+
151
+ data = {a: [wins[a][b] if a != b else np.NAN for b in names] for a in names}
152
+
153
+ df = pd.DataFrame(data, index=names)
154
+ df.index.name = 'model_a'
155
+ df.columns.name = 'model_b'
156
+ return df.T
157
+
158
+
159
+ def get_win_rate_column(df, column, baseline='gpt4-0314'):
160
+ to_dict = df[['model', column]].set_index('model').to_dict()[column]
161
+ win_rate_table = predict_win_rate(to_dict)
162
+ return win_rate_table[baseline].fillna(0.5).apply(lambda x: round(x, 4))
@@ -126,7 +126,7 @@ class ChineseSimpleQAAdapter(DataAdapter):
126
126
 
127
127
  def match(self, gold: str, pred: str) -> float:
128
128
  # simple match
129
- logger.warning(f'Please use LLMJudge to match the result for ChineseSimpleQA')
129
+ logger.warning(f'Please use LLMJudge to match the result for {self.name}')
130
130
  is_correct = 1 if gold.lower().strip() == pred.lower().strip() else 0
131
131
  is_incorrect = not is_correct
132
132
  is_not_attempted = 0
@@ -160,9 +160,6 @@ class ChineseSimpleQAAdapter(DataAdapter):
160
160
  review_res_list: [{'is_correct': 1, 'is_incorrect': 0, 'is_not_attempted': 0}, ...]
161
161
  """
162
162
  # zip dict answers
163
- res_dict = defaultdict(list)
164
- for res in review_res_list:
165
- for key, value in res.items():
166
- res_dict[key].append(value)
163
+ res_dict = super().compute_dict_metric(review_res_list, **kwargs)
167
164
 
168
165
  return super().compute_metric(res_dict, **kwargs)
@@ -8,7 +8,6 @@ from collections import defaultdict
8
8
  from evalscope.benchmarks import Benchmark, DataAdapter
9
9
  from evalscope.constants import AnswerKeys
10
10
  from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
11
- from evalscope.models import ChatGenerationModelAdapter
12
11
  from evalscope.utils.logger import get_logger
13
12
 
14
13
  # flake8: noqa
@@ -245,6 +245,29 @@ class DataAdapter(ABC):
245
245
  res_list.append({'metric_name': metric_name, 'score': metric_func(review_res), 'num': len(review_res)})
246
246
  return res_list
247
247
 
248
+ def compute_dict_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
249
+ """
250
+ compute weighted mean of the bleu score of all samples
251
+
252
+ Args:
253
+ review_res_list: [score1, score2, ...]
254
+
255
+ Returns:
256
+ avg_res: List[dict]
257
+
258
+ """
259
+ if isinstance(review_res_list[0], list):
260
+ review_res_list = [item for sublist in review_res_list for item in sublist]
261
+
262
+ items = defaultdict(list)
263
+ for scores in review_res_list:
264
+ if isinstance(scores, dict):
265
+ for k, v in scores.items():
266
+ items[k].append(v)
267
+ else:
268
+ items['AverageAccuracy'].append(scores)
269
+ return items
270
+
248
271
  def gen_report(self, subset_score_map: dict, report_name: str = None, **kwargs) -> Report:
249
272
  """
250
273
  Generate report for the evaluation results for all subsets.
@@ -291,10 +314,11 @@ class DataAdapter(ABC):
291
314
  kwargs['metric_list'] = self.metric_list
292
315
  return ReportGenerator.gen_report(subset_score_map, report_name, **kwargs)
293
316
 
294
- def gen_prompt_data(self, prompt: str, **kwargs) -> dict:
317
+ def gen_prompt_data(self, prompt: str, system_prompt: Optional[str] = None, **kwargs) -> dict:
295
318
  if not isinstance(prompt, list):
296
319
  prompt = [prompt]
297
- prompt_data = PromptData(data=prompt, multi_choices=self.choices, system_prompt=self.system_prompt)
320
+ prompt_data = PromptData(
321
+ data=prompt, multi_choices=self.choices, system_prompt=system_prompt or self.system_prompt)
298
322
  return prompt_data.to_dict()
299
323
 
300
324
  def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
@@ -5,7 +5,6 @@ from typing import Any, Optional
5
5
 
6
6
  from evalscope.benchmarks import Benchmark, DataAdapter
7
7
  from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, EvalType, HubType
8
- from evalscope.models import ChatGenerationModelAdapter
9
8
  from evalscope.utils.io_utils import jsonl_to_list
10
9
  from evalscope.utils.logger import get_logger
11
10
 
@@ -1,7 +1,7 @@
1
1
  # Copyright (c) Alibaba, Inc. and its affiliates.
2
2
  import os.path
3
3
  from collections import defaultdict
4
- from typing import List
4
+ from typing import List, Optional, Union
5
5
 
6
6
  from evalscope.benchmarks import Benchmark, DataAdapter
7
7
  from evalscope.metrics import bleu_ngram_one_sample, compute_rouge_score_one_sample_zh, mean
@@ -74,8 +74,9 @@ class GeneralQAAdapter(DataAdapter):
74
74
  To be supported in the future.')
75
75
 
76
76
  query = input_d.get('question', '') or input_d.get('query', '')
77
+ system_prompt = input_d.get('system')
77
78
  prompt = self.prompt_template.format(query=query)
78
- return self.gen_prompt_data(prompt)
79
+ return self.gen_prompt_data(prompt, system_prompt=system_prompt)
79
80
 
80
81
  def get_gold_answer(self, input_d: dict) -> str:
81
82
  """
@@ -118,7 +119,7 @@ class GeneralQAAdapter(DataAdapter):
118
119
  res.update(bleu_dict)
119
120
  return res
120
121
 
121
- def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
122
+ def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
122
123
  """
123
124
  compute weighted mean of the bleu score of all samples
124
125
 
@@ -129,12 +130,5 @@ class GeneralQAAdapter(DataAdapter):
129
130
  avg_res: List[dict]
130
131
 
131
132
  """
132
- items = defaultdict(list)
133
- for scores in review_res_list:
134
- if isinstance(scores, dict):
135
- for k, v in scores.items():
136
- items[k].append(v)
137
- else:
138
- items['AverageAccuracy'].append(scores)
139
- # items = [(score, 1.0) for score in review_res_list]
133
+ items = super().compute_dict_metric(review_res_list, **kwargs)
140
134
  return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
@@ -48,9 +48,6 @@ class IFEvalAdapter(DataAdapter):
48
48
 
49
49
  def compute_metric(self, review_res_list: List[dict], **kwargs) -> Any:
50
50
  # aggregate review results
51
- res_dict = defaultdict(list)
52
- for res in review_res_list:
53
- for k, v in res.items():
54
- res_dict[k].append(v)
51
+ res_dict = super().compute_dict_metric(review_res_list, **kwargs)
55
52
 
56
- return super().compute_metric(res_dict)
53
+ return super().compute_metric(res_dict, **kwargs)
@@ -361,7 +361,7 @@ def run_test(sample, test=None, debug=False, timeout=6):
361
361
  nl = '\n'
362
362
  if not isinstance(inputs, list):
363
363
  print(
364
- f"not passed output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}" # noqa: E501
364
+ f"not passed output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl, ' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}" # noqa: E501
365
365
  )
366
366
  else:
367
367
  print(
@@ -431,7 +431,7 @@ def run_test(sample, test=None, debug=False, timeout=6):
431
431
  nl = '\n'
432
432
  if not isinstance(inputs, list):
433
433
  print(
434
- f"@1 output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]} {tmp_result=}" # noqa: E501
434
+ f"@1 output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl, ' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]} {tmp_result=}" # noqa: E501
435
435
  )
436
436
  else:
437
437
  print(
@@ -581,7 +581,7 @@ def run_test(sample, test=None, debug=False, timeout=6):
581
581
  nl = '\n'
582
582
  if not isinstance(inputs, list):
583
583
  print(
584
- f"@2 output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}" # noqa: E501
584
+ f"@2 output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl, ' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}" # noqa: E501
585
585
  )
586
586
  else:
587
587
  print(
evalscope/benchmarks/mmlu_redux/__init__.py: File without changes