evalscope 0.16.2__py3-none-any.whl → 0.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (117)
  1. evalscope/app/app.py +9 -762
  2. evalscope/app/constants.py +1 -0
  3. evalscope/app/ui/__init__.py +20 -0
  4. evalscope/app/ui/app_ui.py +52 -0
  5. evalscope/app/ui/multi_model.py +323 -0
  6. evalscope/app/ui/sidebar.py +42 -0
  7. evalscope/app/ui/single_model.py +202 -0
  8. evalscope/app/ui/visualization.py +36 -0
  9. evalscope/app/utils/data_utils.py +178 -0
  10. evalscope/app/utils/localization.py +221 -0
  11. evalscope/app/utils/text_utils.py +119 -0
  12. evalscope/app/utils/visualization.py +91 -0
  13. evalscope/backend/opencompass/backend_manager.py +2 -1
  14. evalscope/backend/rag_eval/backend_manager.py +2 -1
  15. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  16. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
  17. evalscope/benchmarks/__init__.py +15 -1
  18. evalscope/benchmarks/aime/aime24_adapter.py +2 -1
  19. evalscope/benchmarks/aime/aime25_adapter.py +2 -1
  20. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
  21. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  22. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
  23. evalscope/benchmarks/arena_hard/utils.py +0 -12
  24. evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
  25. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
  26. evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
  27. evalscope/benchmarks/data_adapter.py +20 -5
  28. evalscope/benchmarks/general_arena/__init__.py +0 -0
  29. evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
  30. evalscope/benchmarks/general_arena/utils.py +226 -0
  31. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  32. evalscope/benchmarks/general_qa/general_qa_adapter.py +42 -29
  33. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  34. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
  35. evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
  36. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
  37. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
  38. evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
  39. evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
  40. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  41. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  42. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  43. evalscope/benchmarks/race/race_adapter.py +1 -1
  44. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
  45. evalscope/benchmarks/utils.py +1 -2
  46. evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
  47. evalscope/config.py +8 -123
  48. evalscope/evaluator/evaluator.py +15 -12
  49. evalscope/metrics/__init__.py +6 -0
  50. evalscope/{utils/utils.py → metrics/completion_parsers.py} +68 -180
  51. evalscope/metrics/llm_judge.py +105 -20
  52. evalscope/metrics/metrics.py +1 -1
  53. evalscope/models/adapters/base_adapter.py +0 -2
  54. evalscope/models/adapters/server_adapter.py +2 -2
  55. evalscope/models/custom/dummy_model.py +3 -3
  56. evalscope/perf/arguments.py +2 -16
  57. evalscope/perf/main.py +1 -1
  58. evalscope/perf/utils/analysis_result.py +24 -23
  59. evalscope/perf/utils/benchmark_util.py +1 -1
  60. evalscope/report/__init__.py +1 -1
  61. evalscope/report/utils.py +34 -15
  62. evalscope/run.py +1 -1
  63. evalscope/summarizer.py +1 -2
  64. evalscope/utils/__init__.py +63 -2
  65. evalscope/utils/argument_utils.py +64 -0
  66. evalscope/utils/import_utils.py +16 -0
  67. evalscope/utils/io_utils.py +45 -4
  68. evalscope/utils/model_utils.py +37 -1
  69. evalscope/version.py +2 -2
  70. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/METADATA +55 -26
  71. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/RECORD +90 -101
  72. tests/aigc/test_t2i.py +1 -1
  73. tests/cli/test_all.py +50 -2
  74. tests/cli/test_collection.py +1 -1
  75. tests/cli/test_custom.py +261 -0
  76. tests/cli/test_run.py +13 -37
  77. tests/perf/test_perf.py +2 -2
  78. tests/rag/test_clip_benchmark.py +2 -1
  79. tests/rag/test_mteb.py +3 -1
  80. tests/rag/test_ragas.py +3 -1
  81. tests/swift/test_run_swift_eval.py +2 -1
  82. tests/swift/test_run_swift_vlm_eval.py +2 -1
  83. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
  84. tests/utils.py +13 -0
  85. tests/vlm/test_vlmeval.py +8 -2
  86. evalscope/evaluator/rating_eval.py +0 -157
  87. evalscope/evaluator/reviewer/__init__.py +0 -1
  88. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  89. evalscope/registry/__init__.py +0 -1
  90. evalscope/registry/config/cfg_arena.yaml +0 -77
  91. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  92. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  93. evalscope/registry/config/cfg_single.yaml +0 -78
  94. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  95. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  96. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  97. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  98. evalscope/registry/data/question.jsonl +0 -80
  99. evalscope/registry/tasks/arc.yaml +0 -28
  100. evalscope/registry/tasks/bbh.yaml +0 -26
  101. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  102. evalscope/registry/tasks/ceval.yaml +0 -27
  103. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  104. evalscope/registry/tasks/cmmlu.yaml +0 -27
  105. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  106. evalscope/registry/tasks/general_qa.yaml +0 -27
  107. evalscope/registry/tasks/gsm8k.yaml +0 -29
  108. evalscope/registry/tasks/mmlu.yaml +0 -29
  109. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  110. evalscope/run_arena.py +0 -202
  111. evalscope/utils/arena_utils.py +0 -217
  112. evalscope/utils/completion_parsers.py +0 -82
  113. /evalscope/{utils → benchmarks}/filters.py +0 -0
  114. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/LICENSE +0 -0
  115. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/WHEEL +0 -0
  116. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/entry_points.txt +0 -0
  117. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/top_level.txt +0 -0
--- a/evalscope/utils/arena_utils.py
+++ /dev/null
@@ -1,217 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- # Copyright (c) lmsys.org.
-
- import numpy as np
- import pandas as pd
- import pyarrow as pa
- import random
- from collections import OrderedDict, defaultdict
- from typing import List, Sequence, Union
-
- from evalscope.utils.logger import get_logger
-
- logger = get_logger()
-
-
- def compute_elo(battles,
-                 col_model_a='model_a',
-                 col_model_b='model_b',
-                 col_win='win',
-                 tie_values=['tie', 'tie (bothbad)'],
-                 k=32,
-                 scale=400,
-                 base=10,
-                 init_rating=1000):
-     rating = defaultdict(lambda: init_rating)
-
-     for rd, model_a, model_b, win in battles[[col_model_a, col_model_b, col_win]].itertuples():
-         ra = rating[model_a]
-         rb = rating[model_b]
-         ea = 1 / (1 + base**((rb - ra) / scale))
-         eb = 1 / (1 + base**((ra - rb) / scale))
-         if win == col_model_a:
-             sa = 1
-         elif win == col_model_b:
-             sa = 0
-         elif win in tie_values:
-             sa = 0.5
-         else:
-             raise Exception(f'unexpected vote {win}')
-         rating[model_a] += k * (sa - ea)
-         rating[model_b] += k * (1 - sa - eb)
-
-     return rating
-
-
- def merge_ques_ans(answer_list_all, merge_key: str = 'question_id', merge_mode: str = 'inner') -> pd.DataFrame:
-     """
-     Merge question and answer list to unifiled data.
-
-     Args:
-         answer_list_all: list of answer list,
-             e.g. [ans1_list, ans2_list, ...], an ans_list is predicted answers
-             of a specific model, must contain following columns: 'question_id',
-             'text', 'category', 'model_id', 'answer'
-         merge_key: key for dataframe merging
-         merge_mode: mode for dataframe merging,
-             e.g. 'inner', 'left', 'right', 'outer'
-
-     Returns:
-         pandas DataFrame: merged dataframe, e.g. columns are
-             ['question_id', 'gpt-3.5-turbo', 'llama2-7b']
-     """
-     ans_df = pd.DataFrame()
-     for ans_list in answer_list_all:
-         ans_list = [{'question_id': item['question_id'], item['model_id']: item} for item in ans_list]
-         if ans_df.empty:
-             ans_df = pa.Table.from_pylist(ans_list).to_pandas()
-         else:
-             ans_df = pd.merge(ans_df, pa.Table.from_pylist(ans_list).to_pandas(), on=merge_key, how=merge_mode)
-
-     return ans_df
-
-
- def get_battle_pairs(columns: List[str], baseline_idx: int = -1) -> List[tuple]:
-     """
-     Get battle pair names from columns.
-
-     Args:
-         columns: list of column names.
-
-     Returns:
-         list of battle pairs.
-
-     Example:
-         >>> columns = ['A', 'B', 'C']
-         >>> res = get_battle_pairs(columns)
-         >>> print(res)
-         >>> [('B', 'A'), ('C', 'A'), ('C', 'B')]
-
-         >>> columns = ['A', 'B', 'C']
-         >>> res = get_battle_pairs(columns, 2)
-         >>> print(res)
-         >>> [('A', 'C'), ('B', 'C')]
-     """
-     res_list = []
-
-     cols_num = len(columns)
-     if cols_num <= 0:
-         return res_list
-
-     if baseline_idx != -1:
-         n_column = columns[baseline_idx]
-         res_list = [(column, n_column) for column in columns if column != n_column]
-     else:
-         mat = np.ones((cols_num, cols_num))
-         mat_lower_tril = np.tril(mat, k=-1)
-         x_ids, y_ids = np.where(mat_lower_tril == 1)
-         res_list = [(columns[x_id], columns[y_id]) for x_id, y_id in zip(x_ids, y_ids)]
-
-     return res_list
-
-
- def get_battle_pairs_origin(columns: List[str], compare_base: bool = False, swap: bool = False): # TODO: to refactor
-     """
-     Get battle pair names from columns.
-
-     Args:
-         columns: list of column names.
-
-     Returns:
-         list of battle pairs.
-
-     Example:
-         >>> columns = ['A', 'B', 'C']
-         >>> res = get_battle_pairs(columns)
-         >>> print(res)
-         >>> [('B', 'A'), ('C', 'A'), ('C', 'B')]
-     """
-     res_list = []
-
-     cols_num = len(columns)
-     if cols_num <= 0:
-         return res_list
-
-     if not compare_base:
-         mat = np.ones((cols_num, cols_num))
-         mat_lower_tril = np.tril(mat, k=-1)
-         x_ids, y_ids = np.where(mat_lower_tril == 1)
-         res_list = [(columns[x_id], columns[y_id]) for x_id, y_id in zip(x_ids, y_ids)]
-     else:
-         for column in columns[1:]:
-             res_list.append((columns[0], column))
-
-     if swap:
-         res_list.extend([(j, i) for i, j in res_list])
-     return res_list
-
-
- def shuffle_pairwise_preferences(df: pd.DataFrame, arr_is_shuffle: Sequence[int]) -> pd.DataFrame:
-     """Shuffle the outputs of a pairwise preference dataframe.
-
-     Examples
-     --------
-     >>> df = pd.DataFrame([dict(instruction='2+2', output_1='3', output_2='4', preference=2),
-                            dict(instruction='2+3', output_1='5', output_2='4', preference=1)])
-     >>> print(shuffle_pairwise_preferences(df, [True, False]))
-       instruction output_1 output_2  preference
-     0         2+2        4        3           1
-     1         2+3        5        4           1
-     """
-     col_1 = df['output_1'].copy()
-     col_2 = df['output_2'].copy()
-     df['output_1'] = np.where(arr_is_shuffle, col_2, col_1)
-     df['output_2'] = np.where(arr_is_shuffle, col_1, col_2)
-
-     if 'preference' in df.columns:
-         df['preference'] = np.where(arr_is_shuffle, 3 - df['preference'], df['preference'])
-
-     return df
-
-
- class BattlePairSelection:
-     """
-     Select battle pairs by specific strategy.
-
-     Attributes:
-         model_elo_map(dict): map of model_id--base_elo_score
-     """
-
-     DEFAULT_K = 5
-
-     def __init__(self, model_elo_map: Union[dict, OrderedDict]):
-         # Make sure model_elo_map to be ordered when compare_base is true.
-         self.model_elo_map = model_elo_map
-
-     def top_k(self, k: int = DEFAULT_K, compare_base: bool = False, swap: bool = False) -> list:
-         if k <= 0:
-             k = self.DEFAULT_K
-         sorted_res = sorted(self.model_elo_map.items(), key=lambda x: x[1])[:k]
-         sorted_res = list(dict(sorted_res).keys())
-         return get_battle_pairs_origin(sorted_res, compare_base, swap)
-
-     def random_k(self, k: int = DEFAULT_K, compare_base: bool = False, swap: bool = False) -> list:
-         if k <= 0:
-             k = self.DEFAULT_K
-         if k > len(self.model_elo_map):
-             k = len(self.model_elo_map)
-         candidate_list = list(self.model_elo_map.items())
-         k = len(candidate_list) if k > len(candidate_list) else k
-         res = dict(random.sample(candidate_list, k=k))
-         res = list(res.keys())
-         return get_battle_pairs_origin(res, compare_base, swap)
-
-     def volatility_index(self, frac: float = 0.2, compare_base: bool = False, swap: bool = False) -> list:
-         res_list = []
-         candidate_list = get_battle_pairs_origin(list(self.model_elo_map.keys()), compare_base, swap)
-         for t in candidate_list:
-             model_a = t[0]
-             model_b = t[1]
-             base_elo_a = self.model_elo_map.get(model_a)
-             base_elo_b = self.model_elo_map.get(model_b)
-
-             vol_frac = abs(base_elo_b - base_elo_a) / max(base_elo_a, base_elo_b)
-             if vol_frac <= frac:
-                 res_list.append(t)
-
-         return res_list
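For reference, the removed compute_elo walks each battle record and applies the standard Elo expected-score and update rule. Below is a minimal standalone sketch of that per-battle update, using the function's own defaults (k=32, scale=400, base=10) rather than any evalscope API:

    # Per-battle Elo update as performed by the removed compute_elo.
    # sa is model A's score for the battle: 1 = win, 0 = loss, 0.5 = tie.
    def elo_update(ra, rb, sa, k=32, scale=400, base=10):
        ea = 1 / (1 + base**((rb - ra) / scale))  # expected score of A
        eb = 1 / (1 + base**((ra - rb) / scale))  # expected score of B
        return ra + k * (sa - ea), rb + k * ((1 - sa) - eb)

    print(elo_update(1000, 1000, 1))  # A beats B at equal ratings -> (1016.0, 984.0)

With equal starting ratings of 1000, a single win moves the pair to 1016 / 984, reflecting the k=32 step size.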
--- a/evalscope/utils/completion_parsers.py
+++ /dev/null
@@ -1,82 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- # flake8: noqa
-
- import ast
- import re
-
- # from . import utils as ann_utils
- from evalscope.constants import ArenaWinner
- from evalscope.utils.logger import get_logger
-
- logger = get_logger()
-
- one_score_pattern = re.compile('\[\[(\d+\.?\d*)\]\]')
- one_score_pattern_backup = re.compile('\[(\d+\.?\d*)\]')
-
-
- # modified from: https://github.com/lm-sys/FastChat/blob/main/fastchat/eval/eval_gpt_review.py#L47
- # does not work with batched completions
- def lmsys_parser(completion, output_format):
-     if output_format == '[[rating]]':
-         match = re.search(one_score_pattern, completion)
-         if not match:
-             match = re.search(one_score_pattern_backup, completion)
-
-         if match:
-             rating = ast.literal_eval(match.groups()[0])
-         else:
-             logger.error(f'Content: {completion}\n'
-                          'You must manually fix the score.')
-             rating = -1
-
-         return rating
-     if output_format == '[[rating_a,rating_b]]':
-         try:
-             score_pair = completion.split('\n')[0]
-             score_pair = score_pair.replace(',', ' ')
-             sp = score_pair.split(' ')
-             if len(sp) == 2:
-                 score_1 = float(sp[0])
-                 score_2 = float(sp[1])
-                 if score_1 > score_2:
-                     winner = ArenaWinner.MODEL_A
-                 elif score_1 < score_2:
-                     winner = ArenaWinner.MODEL_B
-                 else:
-                     if score_1 == score_1 == -1:
-                         winner = ArenaWinner.UNKNOWN
-                     winner = ArenaWinner.TIE
-                 return winner, [score_1, score_2]
-             else:
-                 raise Exception('Invalid score pair.')
-         except Exception as e:
-             logger.error(f'{e}\nContent: {completion}\nYou must manually fix the score pair.')
-             return ArenaWinner.UNKNOWN, [-1, -1]
-     elif output_format == '[[A]]':
-         if '[[A]]' in completion:
-             winner = ArenaWinner.MODEL_A
-         elif '[[B]]' in completion:
-             winner = ArenaWinner.MODEL_B
-         elif '[[C]]' in completion:
-             winner = ArenaWinner.TIE
-         else:
-             logger.error(f'\nContent: {completion}\nYou must manually fix the score.')
-             winner = ArenaWinner.UNKNOWN
-         return winner
-
-
- def ranking_parser(completion, **kwargs):
-     try:
-         if isinstance(completion, str):
-             ordered_completions = ast.literal_eval(completion)
-         else:
-             ordered_completions = completion
-
-         rank = [c for c in ordered_completions if c['model'] == 'model_a'][0]['rank']
-         assert rank in [1, 2]
-
-         return ArenaWinner.MODEL_A if rank == 1 else ArenaWinner.MODEL_B
-     except Exception as e:
-         logger.error(f'{e}\nContent: {completion}\n'
-                      'You must manually fix the score pair.')
-         return ArenaWinner.UNKNOWN
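For context, the removed lmsys_parser handles three judge-output formats ('[[rating]]', '[[rating_a,rating_b]]' and '[[A]]'). Below is a minimal sketch of just the single-score path, reusing the same regex strategy (prefer [[x]], fall back to [x]); it is illustrative only and not the API of the new evalscope/metrics/completion_parsers.py module:

    import re

    # Same two-pattern strategy as the removed parser: double brackets first, then single.
    ONE_SCORE = re.compile(r'\[\[(\d+\.?\d*)\]\]')
    ONE_SCORE_BACKUP = re.compile(r'\[(\d+\.?\d*)\]')

    def parse_single_rating(completion: str) -> float:
        # Return -1 when no score can be found, as the removed parser did.
        match = ONE_SCORE.search(completion) or ONE_SCORE_BACKUP.search(completion)
        return float(match.group(1)) if match else -1.0

    print(parse_single_rating('Helpful and correct. Rating: [[8.5]]'))  # 8.5
    print(parse_single_rating('no score here'))                         # -1.0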