evalscope 0.16.2__py3-none-any.whl → 0.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/app/app.py +9 -762
- evalscope/app/constants.py +1 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +52 -0
- evalscope/app/ui/multi_model.py +323 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +202 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +178 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +91 -0
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/backend_manager.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
- evalscope/benchmarks/__init__.py +15 -1
- evalscope/benchmarks/aime/aime24_adapter.py +2 -1
- evalscope/benchmarks/aime/aime25_adapter.py +2 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/utils.py +0 -12
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
- evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
- evalscope/benchmarks/data_adapter.py +20 -5
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
- evalscope/benchmarks/general_arena/utils.py +226 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +42 -29
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/race/race_adapter.py +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
- evalscope/benchmarks/utils.py +1 -2
- evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
- evalscope/config.py +8 -123
- evalscope/evaluator/evaluator.py +15 -12
- evalscope/metrics/__init__.py +6 -0
- evalscope/{utils/utils.py → metrics/completion_parsers.py} +68 -180
- evalscope/metrics/llm_judge.py +105 -20
- evalscope/metrics/metrics.py +1 -1
- evalscope/models/adapters/base_adapter.py +0 -2
- evalscope/models/adapters/server_adapter.py +2 -2
- evalscope/models/custom/dummy_model.py +3 -3
- evalscope/perf/arguments.py +2 -16
- evalscope/perf/main.py +1 -1
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +1 -1
- evalscope/report/__init__.py +1 -1
- evalscope/report/utils.py +34 -15
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -2
- evalscope/utils/__init__.py +63 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/import_utils.py +16 -0
- evalscope/utils/io_utils.py +45 -4
- evalscope/utils/model_utils.py +37 -1
- evalscope/version.py +2 -2
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/METADATA +55 -26
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/RECORD +90 -101
- tests/aigc/test_t2i.py +1 -1
- tests/cli/test_all.py +50 -2
- tests/cli/test_collection.py +1 -1
- tests/cli/test_custom.py +261 -0
- tests/cli/test_run.py +13 -37
- tests/perf/test_perf.py +2 -2
- tests/rag/test_clip_benchmark.py +2 -1
- tests/rag/test_mteb.py +3 -1
- tests/rag/test_ragas.py +3 -1
- tests/swift/test_run_swift_eval.py +2 -1
- tests/swift/test_run_swift_vlm_eval.py +2 -1
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
- tests/utils.py +13 -0
- tests/vlm/test_vlmeval.py +8 -2
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- /evalscope/{utils → benchmarks}/filters.py +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/LICENSE +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/WHEEL +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/top_level.txt +0 -0
evalscope/backend/vlm_eval_kit/backend_manager.py
CHANGED

@@ -5,7 +5,8 @@ from functools import partial
 from typing import Optional, Union

 from evalscope.backend.base import BackendManager
-from evalscope.utils import
+from evalscope.utils.import_utils import is_module_installed
+from evalscope.utils.io_utils import get_valid_list
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -68,6 +69,8 @@ class VLMEvalKitBackendManager(BackendManager):
             del remain_cfg['type']  # remove not used args

             norm_model_type = os.path.basename(model_type).replace(':', '-').replace('.', '_')
+            model_cfg['type'] = norm_model_type
+
             self.valid_models.update({norm_model_type: partial(model_class, model=model_type, **remain_cfg)})
             new_model_names.append(norm_model_type)
         else:
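The second hunk writes the normalized name back into `model_cfg['type']` and registers a `functools.partial` constructor under that key. A minimal standalone sketch of this registry pattern, using a hypothetical `DummyVLM` class and model id in place of the real VLMEvalKit model classes (not evalscope's actual code):

```python
from functools import partial


class DummyVLM:
    """Hypothetical stand-in for a VLMEvalKit model class."""

    def __init__(self, model: str, api_base: str = ''):
        self.model = model
        self.api_base = api_base


valid_models = {}
model_type = 'qwen-vl-chat:v1.0'  # hypothetical model id

# Normalize the key the same way the hunk above does: ':' -> '-', '.' -> '_'
norm_model_type = model_type.replace(':', '-').replace('.', '_')

# Pre-bind the original model id and extra kwargs, register under the normalized key
valid_models[norm_model_type] = partial(DummyVLM, model=model_type, api_base='http://localhost:8000/v1')

# Instantiating by the normalized key recovers the fully configured model
instance = valid_models['qwen-vl-chat-v1_0']()
print(instance.model)  # qwen-vl-chat:v1.0
```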
evalscope/benchmarks/__init__.py
CHANGED
@@ -2,6 +2,7 @@
 import glob
 import importlib
 import os
+import time

 from evalscope.benchmarks.benchmark import Benchmark, BenchmarkMeta
 from evalscope.benchmarks.data_adapter import DataAdapter
@@ -13,11 +14,24 @@ logger = get_logger()
 pattern = os.path.join(os.path.dirname(__file__), '*', '**', '*_adapter.py')
 files = glob.glob(pattern, recursive=True)

+import_times = []
+
 for file_path in files:
     if file_path.endswith('.py') and not os.path.basename(file_path).startswith('_'):
         # Convert file path to a module path
         relative_path = os.path.relpath(file_path, os.path.dirname(__file__))
         module_path = relative_path[:-3].replace(os.path.sep, '.')  # strip '.py' and convert to module path
         full_path = f'evalscope.benchmarks.{module_path}'
+
+        start_time = time.perf_counter()
         importlib.import_module(full_path)
-
+        end_time = time.perf_counter()
+
+        import_times.append((full_path, end_time - start_time))
+
+# Sort by import time in descending order
+import_times.sort(key=lambda x: x[1], reverse=True)
+
+# Log the sorted import times
+for module, duration in import_times:
+    logger.debug(f'Module {module} imported in {duration:.6f} seconds')
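The new code wraps each adapter import in `time.perf_counter()` calls and logs the slowest modules first. The same pattern in isolation, applied to a few hypothetical stdlib module names rather than the discovered `*_adapter.py` files:

```python
import importlib
import time

modules = ['json', 'csv', 'decimal']  # hypothetical stand-ins for the discovered adapter modules

import_times = []
for name in modules:
    start = time.perf_counter()
    importlib.import_module(name)
    import_times.append((name, time.perf_counter() - start))

# Slowest imports first, mirroring the debug logging added above
import_times.sort(key=lambda x: x[1], reverse=True)
for module, duration in import_times:
    print(f'Module {module} imported in {duration:.6f} seconds')
```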
evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py
CHANGED

@@ -47,7 +47,7 @@ Evaluate the models based on the quality and relevance of their outputs, and sel
 @Benchmark.register(
     name='alpaca_eval',
     pretty_name='AlpacaEval2.0',
-    tags=['Instruction-Following', '
+    tags=['Instruction-Following', 'Arena'],
     description='Alpaca Eval 2.0 is an enhanced framework for evaluating instruction-following language models, '
     'featuring an improved auto-annotator, updated baselines, and continuous preference calculation to '
     'provide more accurate and cost-effective model assessments. '
evalscope/benchmarks/arc/arc_adapter.py
CHANGED

@@ -6,7 +6,7 @@ import os
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser
 from evalscope.utils.logger import get_logger

 # flake8: noqa
evalscope/benchmarks/arena_hard/arena_hard_adapter.py
CHANGED

@@ -17,7 +17,7 @@ GRADER_TEMPLATE = "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's A
 @Benchmark.register(
     name='arena_hard',
     pretty_name='ArenaHard',
-    tags=['Instruction-Following', '
+    tags=['Instruction-Following', 'Arena'],
     description=
     'ArenaHard is a benchmark designed to evaluate the performance of large language models in a competitive setting, '
     'where models are pitted against each other in a series of tasks to determine their relative strengths and weaknesses. '
evalscope/benchmarks/arena_hard/utils.py
CHANGED

@@ -127,18 +127,6 @@ def get_bootstrap_result(battles, func_compute_elo, num_round):
     return df[df.median().sort_values(ascending=False).index]


-def preety_print_two_ratings(ratings_1, ratings_2, column_names):
-    df = (
-        pd.DataFrame(
-            [[n, ratings_1[n], ratings_2[n]] for n in ratings_1.keys()],
-            columns=['Model', column_names[0], column_names[1]],
-        ).sort_values(column_names[0], ascending=False).reset_index(drop=True))
-    df[column_names[0]] = (df[column_names[0]] + 0.5).astype(int)
-    df[column_names[1]] = (df[column_names[1]] + 0.5).astype(int)
-    df.index = df.index + 1
-    return df
-
-
 def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
     names = sorted(list(elo_ratings.keys()))
     wins = defaultdict(lambda: defaultdict(lambda: 0))
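The retained `predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000)` presumably builds on the standard Elo expected-score formula. A minimal sketch of that pairwise win probability, with hypothetical ratings and not evalscope's actual implementation:

```python
from itertools import permutations


def elo_win_probability(rating_a: float, rating_b: float, scale: float = 400, base: float = 10) -> float:
    """Standard Elo expectation: probability that A beats B."""
    return 1.0 / (1.0 + base ** ((rating_b - rating_a) / scale))


elo_ratings = {'model-a': 1120.0, 'model-b': 1000.0, 'model-c': 950.0}  # hypothetical ratings

for a, b in permutations(elo_ratings, 2):
    print(f'P({a} beats {b}) = {elo_win_probability(elo_ratings[a], elo_ratings[b]):.3f}')
```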
evalscope/benchmarks/ceval/ceval_adapter.py
CHANGED

@@ -1,11 +1,13 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import csv
 import os
+from collections import defaultdict

 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser
+from evalscope.utils.io_utils import csv_to_list
 from evalscope.utils.logger import get_logger

 # flake8: noqa
@@ -154,7 +156,7 @@ class CEVALAdapter(DataAdapter):
         self.choices = ['A', 'B', 'C', 'D']

     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-        data_dict =
+        data_dict = defaultdict(dict)
         for subset_name in subset_list:
             for split_name in [self.train_split, self.eval_split]:
                 if os.path.exists(dataset_name_or_path):
@@ -162,20 +164,7 @@ class CEVALAdapter(DataAdapter):
                 else:
                     file_path = os.path.join(work_dir, dataset_name_or_path, f'{subset_name}_{split_name}.csv')
                 if os.path.exists(file_path):
-
-                    rows = []
-                    reader = csv.reader(f)
-                    header = next(reader)
-                    for row in reader:
-                        item = dict(zip(header, row))
-                        item.setdefault('explanation', '')
-                        item.setdefault('answer', '')
-                        rows.append(item)
-
-                    if subset_name in data_dict:
-                        data_dict[subset_name].update({split_name: rows})
-                    else:
-                        data_dict[subset_name] = {split_name: rows}
+                    data_dict[subset_name][split_name] = csv_to_list(file_path)

         return data_dict

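The rewritten loader leans on two things: `defaultdict(dict)`, which creates the per-subset mapping on first access, and the new `csv_to_list` helper from `evalscope.utils.io_utils`, whose implementation is not shown in this diff. A minimal sketch assuming the helper behaves like `csv.DictReader` (one dict per row, keyed by the header); it reads from an in-memory buffer so the example runs without files:

```python
import csv
import io
from collections import defaultdict


def csv_to_list_sketch(f) -> list:
    """Assumed behaviour of io_utils.csv_to_list: map each row to a dict keyed by the CSV header."""
    return list(csv.DictReader(f))


sample = io.StringIO('id,question,A,B,C,D,answer\n0,1+1=?,1,2,3,4,B\n')  # hypothetical C-Eval-style row

# defaultdict(dict) makes the old `if subset_name in data_dict: ... else: ...` branching unnecessary.
data_dict = defaultdict(dict)
data_dict['computer_network']['val'] = csv_to_list_sketch(sample)  # hypothetical subset/split
print(data_dict['computer_network']['val'][0]['answer'])  # B
```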
evalscope/benchmarks/cmmlu/cmmlu_adapter.py
CHANGED

@@ -2,11 +2,13 @@

 import csv
 import os
+from collections import defaultdict

 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser
+from evalscope.utils.io_utils import csv_to_list
 from evalscope.utils.logger import get_logger

 # flake8: noqa
@@ -126,29 +128,15 @@ class CMMLUAdapter(DataAdapter):
         self.choices = ['A', 'B', 'C', 'D']

     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-        data_dict =
+        data_dict = defaultdict(dict)
         for subset_name in subset_list:
-            data_dict[subset_name] = {}
             for split_name in [self.train_split, self.eval_split]:
-
+                if os.path.exists(dataset_name_or_path):
+                    file_path = os.path.join(dataset_name_or_path, split_name, f'{subset_name}.csv')
+                else:
+                    file_path = os.path.join(work_dir, dataset_name_or_path, split_name, f'{subset_name}.csv')
                 if os.path.exists(file_path):
-
-                    rows = []
-                    reader = csv.reader(f)
-                    for row in reader:
-                        if len(row) != 7:
-                            logger.error(f'Mismatch len of row: {row}, len of row should be 6. Skip this row.')
-                            continue
-                        rows.append({
-                            'Question': row[1],
-                            'A': row[2],
-                            'B': row[3],
-                            'C': row[4],
-                            'D': row[5],
-                            'Answer': row[6],
-                        })
-
-                    data_dict[subset_name].update({split_name: rows})
+                    data_dict[subset_name][split_name] = csv_to_list(file_path)

         return data_dict

evalscope/benchmarks/competition_math/competition_math_adapter.py
CHANGED

@@ -105,7 +105,8 @@ class CompetitionMathAdapter(DataAdapter):
         return result

     def match(self, gold: str, pred: str) -> float:
-
+        res = math_equal(pred, gold)
+        return 1.0 if res else 0.0

     @classmethod
     def _generate_prompt(cls, input_d: dict, use_fewshot: bool = True) -> str:
evalscope/benchmarks/data_adapter.py
CHANGED

@@ -168,7 +168,7 @@ class DataAdapter(ABC):
         If you want to support local dataset, please rewrite this method in xxx_data_adapter.
         Use modelscope.msdatasets.MsDataset.load to load the dataset from local by default.
         """
-        return self.load_from_hub(dataset_name_or_path, subset_list,
+        return self.load_from_hub(dataset_name_or_path, subset_list, None, **kwargs)

     def load_with_snapshot(self,
                            file_structure: Dict[str, List[str]],
@@ -449,7 +449,6 @@ class DataAdapter(ABC):
         """
         raise NotImplementedError

-    @abstractmethod
     def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> Any:
         """
         Parse the predicted result and extract proper answer.
@@ -462,7 +461,21 @@
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-
+        return result
+
+    def llm_parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> Any:
+        """
+        Parse the predicted result using LLM.
+
+        Args:
+            result (Any): The predicted answer from the model.
+            raw_input_d (dict): The raw input data.
+            eval_type (str): The evaluation type, default is 'checkpoint'.
+
+        Returns:
+            The parsed answer. Usually a string for chat.
+        """
+        return result

     @abstractmethod
     def match(self, gold: Any, pred: Any) -> Any:
@@ -504,5 +517,7 @@

         # Request judge and obtain score
         prompt = judge.build_prompt(pred, gold, question)
-
-
+        judge_response = judge(prompt)
+        score = judge.get_score(judge_response)
+
+        return score
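With these hunks, `parse_pred_result` is no longer abstract and returns the raw result by default, `llm_parse_pred_result` is a new hook, and the judge path runs `build_prompt` → call the judge → `get_score`. A rough sketch of that three-step flow with a hypothetical keyword-matching judge standing in for the real judge implementation in `evalscope/metrics/llm_judge.py`, whose interface is not shown in this diff:

```python
class KeywordJudge:
    """Hypothetical judge exposing the three calls used in the hunk above."""

    def build_prompt(self, pred: str, gold: str, question: str) -> str:
        return f'Question: {question}\nReference: {gold}\nAnswer: {pred}\nReply CORRECT or WRONG.'

    def __call__(self, prompt: str) -> str:
        # A real judge would send the prompt to an LLM; this stub compares answer and reference text.
        answer = prompt.split('Answer: ')[-1].split('\n')[0]
        reference = prompt.split('Reference: ')[-1].split('\n')[0]
        return 'CORRECT' if reference.strip() == answer.strip() else 'WRONG'

    def get_score(self, judge_response: str) -> float:
        return 1.0 if 'CORRECT' in judge_response else 0.0


judge = KeywordJudge()
prompt = judge.build_prompt(pred='Paris', gold='Paris', question='What is the capital of France?')
judge_response = judge(prompt)
score = judge.get_score(judge_response)
print(score)  # 1.0
```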