evalscope 0.13.1__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (72)
  1. evalscope/arguments.py +1 -1
  2. evalscope/backend/rag_eval/__init__.py +1 -1
  3. evalscope/backend/rag_eval/backend_manager.py +21 -5
  4. evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
  5. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  6. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
  7. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
  8. evalscope/backend/rag_eval/utils/embedding.py +49 -3
  9. evalscope/backend/rag_eval/utils/llm.py +8 -9
  10. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
  11. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  12. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +109 -0
  13. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  14. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  15. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +120 -0
  16. evalscope/benchmarks/arena_hard/utils.py +162 -0
  17. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +2 -5
  18. evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -1
  19. evalscope/benchmarks/data_adapter.py +30 -2
  20. evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
  21. evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -12
  22. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  23. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -5
  24. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +1 -3
  25. evalscope/benchmarks/live_code_bench/testing_util.py +365 -549
  26. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  27. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
  28. evalscope/benchmarks/mmlu/mmlu_adapter.py +5 -7
  29. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  30. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  31. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +182 -0
  32. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  33. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +2 -5
  34. evalscope/collections/evaluator.py +4 -2
  35. evalscope/config.py +2 -2
  36. evalscope/metrics/llm_judge.py +1 -1
  37. evalscope/models/chat_adapter.py +32 -11
  38. evalscope/perf/arguments.py +30 -9
  39. evalscope/perf/benchmark.py +57 -103
  40. evalscope/perf/http_client.py +2 -3
  41. evalscope/perf/plugin/api/custom_api.py +1 -1
  42. evalscope/perf/plugin/api/openai_api.py +4 -2
  43. evalscope/perf/plugin/datasets/custom.py +4 -1
  44. evalscope/perf/plugin/datasets/line_by_line.py +4 -1
  45. evalscope/perf/plugin/datasets/longalpaca.py +4 -1
  46. evalscope/perf/plugin/datasets/openqa.py +4 -1
  47. evalscope/perf/plugin/datasets/random_dataset.py +13 -6
  48. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  49. evalscope/perf/utils/benchmark_util.py +12 -6
  50. evalscope/perf/utils/db_util.py +3 -3
  51. evalscope/perf/utils/log_utils.py +41 -0
  52. evalscope/report/app.py +11 -11
  53. evalscope/run.py +7 -0
  54. evalscope/summarizer.py +2 -1
  55. evalscope/utils/utils.py +36 -25
  56. evalscope/version.py +2 -2
  57. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/METADATA +21 -55
  58. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/RECORD +70 -62
  59. tests/cli/test_all.py +36 -27
  60. tests/cli/test_collection.py +2 -1
  61. tests/cli/test_run.py +38 -20
  62. tests/perf/test_perf.py +1 -2
  63. tests/rag/test_clip_benchmark.py +0 -1
  64. tests/rag/test_mteb.py +37 -8
  65. tests/rag/test_ragas.py +33 -27
  66. tests/vlm/test_vlmeval.py +37 -1
  67. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  68. evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
  69. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/LICENSE +0 -0
  70. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/WHEEL +0 -0
  71. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/entry_points.txt +0 -0
  72. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/top_level.txt +0 -0

evalscope/benchmarks/arena_hard/utils.py
@@ -0,0 +1,162 @@
+import math
+import numpy as np
+import pandas as pd
+import re
+from collections import defaultdict
+from sklearn.linear_model import LogisticRegression
+from tqdm import tqdm
+
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def post_process_arenahard(completion):
+    result = re.findall(r'\[\[([AB<>=]+)\]\]', completion)
+    if result:
+        return result[0]
+    else:
+        return None
+
+
+def get_battles_from_row(row, first_game_only=False, multiplier=3):
+    results = []
+    output = {'model_a': row['model_a'], 'model_b': row['model_b']}
+
+    game = row['games'][0]
+    weight = 1
+    if game['score'] == 'A=B':
+        output['winner'] = 'tie'
+    elif game['score'] == 'A>B':
+        output['winner'] = 'model_a'
+    elif game['score'] == 'A>>B':
+        output['winner'] = 'model_a'
+        weight = multiplier
+    elif game['score'] == 'B>A':
+        output['winner'] = 'model_b'
+    elif game['score'] == 'B>>A':
+        output['winner'] = 'model_b'
+        weight = multiplier
+    else:
+        weight = 0
+
+    if weight:
+        results += [output] * weight
+
+    if first_game_only:
+        return pd.DataFrame(results)
+
+    # game 2
+    output = {'model_a': row['model_a'], 'model_b': row['model_b']}
+
+    game = row['games'][1]
+
+    weight = 1
+    if game['score'] == 'A=B':
+        output['winner'] = 'tie'
+    elif game['score'] == 'A>B':
+        output['winner'] = 'model_b'
+    elif game['score'] == 'A>>B':
+        output['winner'] = 'model_b'
+        weight = multiplier
+    elif game['score'] == 'B>A':
+        output['winner'] = 'model_a'
+    elif game['score'] == 'B>>A':
+        output['winner'] = 'model_a'
+        weight = multiplier
+    else:
+        weight = 0
+
+    if weight:
+        results += [output] * weight
+
+    return pd.DataFrame(results)
+
+
+def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
+    models = pd.concat([df['model_a'], df['model_b']]).unique()
+    models = pd.Series(np.arange(len(models)), index=models)
+
+    # duplicate battles
+    df = pd.concat([df, df], ignore_index=True)
+    p = len(models.index)
+    n = df.shape[0]
+
+    X = np.zeros([n, p])
+    X[np.arange(n), models[df['model_a']]] = +math.log(BASE)
+    X[np.arange(n), models[df['model_b']]] = -math.log(BASE)
+
+    # one A win => two A win
+    Y = np.zeros(n)
+    Y[df['winner'] == 'model_a'] = 1.0
+
+    # one tie => one A win + one B win
+    # find tie + tie (both bad) index
+    tie_idx = (df['winner'] == 'tie') | (df['winner'] == 'tie (bothbad)')
+    tie_idx[len(tie_idx) // 2:] = False
+    Y[tie_idx] = 1.0
+
+    if len(np.unique(Y)) < 2:
+        logger.info('Warning: Only one class in the data')
+        elo_scores = pd.Series(INIT_RATING, index=models.index)
+        if np.all(Y == 1.0):
+            elo_scores[df['model_a'].iloc[0]] += SCALE  # Boost the winning model
+        elif np.all(Y == 0.0):
+            elo_scores[df['model_b'].iloc[0]] += SCALE  # Boost the winning model
+        return elo_scores.sort_values(ascending=False)
+
+    lr = LogisticRegression(
+        fit_intercept=False, penalty=None, tol=1e-8)  # May need to set a small value when not use GPT4 as judge model
+    lr.fit(X, Y)
+
+    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
+
+    # set anchor as gpt4-0314 = 1000
+    if 'gpt4-0314' in models.index:
+        elo_scores += 1000 - elo_scores[models['gpt4-0314']]
+    return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)
+
+
+def get_bootstrap_result(battles, func_compute_elo, num_round):
+    rows = []
+    for _ in tqdm(range(num_round), desc='bootstrap'):
+        res = func_compute_elo(battles.sample(frac=1.0, replace=True))
+        if res is not None:
+            rows.append(res)
+    df = pd.DataFrame(rows)
+    return df[df.median().sort_values(ascending=False).index]
+
+
+def preety_print_two_ratings(ratings_1, ratings_2, column_names):
+    df = (
+        pd.DataFrame(
+            [[n, ratings_1[n], ratings_2[n]] for n in ratings_1.keys()],
+            columns=['Model', column_names[0], column_names[1]],
+        ).sort_values(column_names[0], ascending=False).reset_index(drop=True))
+    df[column_names[0]] = (df[column_names[0]] + 0.5).astype(int)
+    df[column_names[1]] = (df[column_names[1]] + 0.5).astype(int)
+    df.index = df.index + 1
+    return df
+
+
+def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
+    names = sorted(list(elo_ratings.keys()))
+    wins = defaultdict(lambda: defaultdict(lambda: 0))
+    for a in names:
+        for b in names:
+            ea = 1 / (1 + BASE**((elo_ratings[b] - elo_ratings[a]) / SCALE))
+            wins[a][b] = ea
+            wins[b][a] = 1 - ea
+
+    data = {a: [wins[a][b] if a != b else np.NAN for b in names] for a in names}
+
+    df = pd.DataFrame(data, index=names)
+    df.index.name = 'model_a'
+    df.columns.name = 'model_b'
+    return df.T
+
+
+def get_win_rate_column(df, column, baseline='gpt4-0314'):
+    to_dict = df[['model', column]].set_index('model').to_dict()[column]
+    win_rate_table = predict_win_rate(to_dict)
+    return win_rate_table[baseline].fillna(0.5).apply(lambda x: round(x, 4))
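
The file above ports the Arena-Hard rating pipeline: judge verdicts such as [[A>>B]] are parsed by post_process_arenahard, expanded into (optionally multiplier-weighted) battles, fitted with a logistic-regression Bradley-Terry model, and bootstrapped for confidence intervals. A minimal sketch of how the helpers compose, assuming evalscope 0.14.0 is installed; the two battle rows are invented for illustration, and only the expected column layout (model_a, model_b, games[i]['score']) is taken from the code above:

import pandas as pd

from evalscope.benchmarks.arena_hard.utils import compute_mle_elo, get_battles_from_row, get_bootstrap_result

# Hypothetical judged rows: two games per row, the second scored with the
# model positions swapped (hence the inverted mapping in get_battles_from_row).
rows = pd.DataFrame([
    {'model_a': 'candidate', 'model_b': 'gpt4-0314',
     'games': [{'score': 'A>B'}, {'score': 'B>A'}]},
    {'model_a': 'candidate', 'model_b': 'gpt4-0314',
     'games': [{'score': 'A>>B'}, {'score': 'A=B'}]},
])

# Expand verdicts into weighted battles, then fit and bootstrap the Elo scores.
battles = pd.concat([get_battles_from_row(r) for _, r in rows.iterrows()], ignore_index=True)
elo = compute_mle_elo(battles)  # point estimate, anchored to gpt4-0314 when present
bootstrap = get_bootstrap_result(battles, compute_mle_elo, num_round=100)
print(elo)
print(bootstrap.quantile([0.025, 0.975]))  # rough 95% interval per model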

evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py
@@ -126,7 +126,7 @@ class ChineseSimpleQAAdapter(DataAdapter):
 
     def match(self, gold: str, pred: str) -> float:
         # simple match
-        logger.warning(f'Please use LLMJudge to match the result for ChineseSimpleQA')
+        logger.warning(f'Please use LLMJudge to match the result for {self.name}')
         is_correct = 1 if gold.lower().strip() == pred.lower().strip() else 0
         is_incorrect = not is_correct
         is_not_attempted = 0
@@ -160,9 +160,6 @@ class ChineseSimpleQAAdapter(DataAdapter):
             review_res_list: [{'is_correct': 1, 'is_incorrect': 0, 'is_not_attempted': 0}, ...]
         """
         # zip dict answers
-        res_dict = defaultdict(list)
-        for res in review_res_list:
-            for key, value in res.items():
-                res_dict[key].append(value)
+        res_dict = super().compute_dict_metric(review_res_list, **kwargs)
 
         return super().compute_metric(res_dict, **kwargs)

evalscope/benchmarks/competition_math/competition_math_adapter.py
@@ -8,7 +8,6 @@ from collections import defaultdict
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import AnswerKeys
 from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa

evalscope/benchmarks/data_adapter.py
@@ -245,6 +245,29 @@ class DataAdapter(ABC):
             res_list.append({'metric_name': metric_name, 'score': metric_func(review_res), 'num': len(review_res)})
         return res_list
 
+    def compute_dict_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
+        """
+        compute weighted mean of the bleu score of all samples
+
+        Args:
+            review_res_list: [score1, score2, ...]
+
+        Returns:
+            avg_res: List[dict]
+
+        """
+        if isinstance(review_res_list[0], list):
+            review_res_list = [item for sublist in review_res_list for item in sublist]
+
+        items = defaultdict(list)
+        for scores in review_res_list:
+            if isinstance(scores, dict):
+                for k, v in scores.items():
+                    items[k].append(v)
+            else:
+                items['AverageAccuracy'].append(scores)
+        return items
+
     def gen_report(self, subset_score_map: dict, report_name: str = None, **kwargs) -> Report:
         """
         Generate report for the evaluation results for all subsets.
@@ -291,10 +314,15 @@
             kwargs['metric_list'] = self.metric_list
         return ReportGenerator.gen_report(subset_score_map, report_name, **kwargs)
 
-    def gen_prompt_data(self, prompt: str, **kwargs) -> dict:
+    def gen_prompt_data(self,
+                        prompt: str,
+                        system_prompt: Optional[str] = None,
+                        choices: Optional[List[str]] = None,
+                        **kwargs) -> dict:
         if not isinstance(prompt, list):
             prompt = [prompt]
-        prompt_data = PromptData(data=prompt, multi_choices=self.choices, system_prompt=self.system_prompt)
+        prompt_data = PromptData(
+            data=prompt, multi_choices=choices or self.choices, system_prompt=system_prompt or self.system_prompt)
         return prompt_data.to_dict()
 
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
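
Two DataAdapter-level changes drive the per-benchmark simplifications below: compute_dict_metric centralises the dict-of-scores aggregation that the ChineseSimpleQA, GeneralQA and IFEval adapters previously duplicated, and gen_prompt_data now accepts per-call system_prompt and choices overrides that fall back to the adapter defaults. A standalone restatement of the aggregation logic, with made-up metric names, to show the shape of its output:

from collections import defaultdict
from typing import List, Union

def compute_dict_metric(review_res_list: Union[List[dict], List[List[dict]]]) -> dict:
    # Same logic as the method added above, extracted so it can be run in isolation.
    if isinstance(review_res_list[0], list):
        review_res_list = [item for sublist in review_res_list for item in sublist]
    items = defaultdict(list)
    for scores in review_res_list:
        if isinstance(scores, dict):
            for k, v in scores.items():
                items[k].append(v)
        else:
            items['AverageAccuracy'].append(scores)
    return items

print(compute_dict_metric([{'Rouge-L': 0.42, 'bleu-4': 0.18}, {'Rouge-L': 0.55, 'bleu-4': 0.30}]))
# defaultdict(<class 'list'>, {'Rouge-L': [0.42, 0.55], 'bleu-4': [0.18, 0.3]})
print(compute_dict_metric([1, 0, 1]))
# defaultdict(<class 'list'>, {'AverageAccuracy': [1, 0, 1]})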

evalscope/benchmarks/data_collection/data_collection_adapter.py
@@ -5,7 +5,6 @@ from typing import Any, Optional
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, EvalType, HubType
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 

evalscope/benchmarks/general_qa/general_qa_adapter.py
@@ -1,7 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path
 from collections import defaultdict
-from typing import List
+from typing import List, Optional, Union
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.metrics import bleu_ngram_one_sample, compute_rouge_score_one_sample_zh, mean
@@ -40,7 +40,7 @@ class GeneralQAAdapter(DataAdapter):
             for subset_name in subset_list:
                 data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
         elif os.path.isfile(dataset_name_or_path):
-            cur_subset_name = os.path.basename(dataset_name_or_path).split('.')[0]
+            cur_subset_name = os.path.splitext(os.path.basename(dataset_name_or_path))[0]
            data_file_dict[cur_subset_name] = dataset_name_or_path
        else:
            raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
@@ -74,8 +74,9 @@ class GeneralQAAdapter(DataAdapter):
                To be supported in the future.')
 
        query = input_d.get('question', '') or input_d.get('query', '')
+       system_prompt = input_d.get('system')
        prompt = self.prompt_template.format(query=query)
-       return self.gen_prompt_data(prompt)
+       return self.gen_prompt_data(prompt, system_prompt=system_prompt)
 
    def get_gold_answer(self, input_d: dict) -> str:
        """
@@ -118,7 +119,7 @@
        res.update(bleu_dict)
        return res
 
-   def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
+   def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
        """
        compute weighted mean of the bleu score of all samples
 
@@ -129,12 +130,5 @@
            avg_res: List[dict]
 
        """
-       items = defaultdict(list)
-       for scores in review_res_list:
-           if isinstance(scores, dict):
-               for k, v in scores.items():
-                   items[k].append(v)
-           else:
-               items['AverageAccuracy'].append(scores)
-       # items = [(score, 1.0) for score in review_res_list]
+       items = super().compute_dict_metric(review_res_list, **kwargs)
        return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
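
Net effect for custom general_qa datasets: subset names now survive dotted filenames (os.path.splitext yields 'my.data' for 'my.data.jsonl', where the old split('.')[0] gave 'my'), an optional per-sample 'system' field is forwarded as the system prompt, and nested review results are aggregated through the shared compute_dict_metric. A hedged example of a record the updated gen_prompt path would accept; the gold-answer key is not shown in this diff and is an assumption:

import json

# One line of a hypothetical general_qa JSONL file.
sample = {
    'system': 'You are a concise assistant.',         # new: forwarded via gen_prompt_data(system_prompt=...)
    'query': 'What does the acronym RAG stand for?',  # 'question' is also accepted and takes precedence
    'response': 'Retrieval-augmented generation.',    # assumed gold-answer key, not part of this diff
}
print(json.dumps(sample, ensure_ascii=False))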

evalscope/benchmarks/hellaswag/hellaswag_adapter.py
@@ -108,7 +108,7 @@ class HellaSwagAdapter(DataAdapter):
         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
         else:
-            return ResponseParser.parse_first_option(result)
+            return ResponseParser.parse_first_option(result, options=self.choices)
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=str(gold), pred=str(pred))

evalscope/benchmarks/ifeval/ifeval_adapter.py
@@ -48,9 +48,6 @@ class IFEvalAdapter(DataAdapter):
 
     def compute_metric(self, review_res_list: List[dict], **kwargs) -> Any:
         # aggregate review results
-        res_dict = defaultdict(list)
-        for res in review_res_list:
-            for k, v in res.items():
-                res_dict[k].append(v)
+        res_dict = super().compute_dict_metric(review_res_list, **kwargs)
 
-        return super().compute_metric(res_dict)
+        return super().compute_metric(res_dict, **kwargs)

evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py
@@ -18,7 +18,6 @@ logger = get_logger()
     extra_params={
        'start_date': None,
        'end_date': None,
-       'num_process_evaluate': 1,
        'timeout': 6
    },
    system_prompt=
@@ -33,7 +32,6 @@ class LiveCodeBenchAdapter(DataAdapter):
 
        extra_params = kwargs.get('extra_params', {})
 
-       self.num_process_evaluate = extra_params.get('num_process_evaluate', 1)
       self.timeout = extra_params.get('timeout', 6)
       self.start_date = extra_params.get('start_date')
       self.end_date = extra_params.get('end_date')
@@ -84,7 +82,7 @@
            references,
            predictions,
            k_list=[1],
-           num_process_evaluate=self.num_process_evaluate,
+           num_process_evaluate=1,
            timeout=self.timeout,
        )
        return metrics['pass@1'] / 100  # convert to point scale
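
For Live Code Bench, num_process_evaluate is removed from both the benchmark registration and the adapter, and codegen_metrics is now always invoked with a single evaluation process. The surviving extra_params are just the contest-date window and the per-test timeout, sketched here as a plain dict; how the dict is attached to a full task configuration is outside this diff:

# extra_params still honoured by LiveCodeBenchAdapter after this change.
extra_params = {
    'start_date': '2024-08-01',  # illustrative ISO dates; the registered defaults are None
    'end_date': '2025-02-01',
    'timeout': 6,                # seconds per test case, matching the registered default
}
# A leftover 'num_process_evaluate' key is simply ignored by the adapter.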