evalscope 0.8.2__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff reflects the changes between these publicly released package versions as they appear in their public registry and is provided for informational purposes only.
- evalscope/__init__.py +2 -0
- evalscope/arguments.py +10 -3
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/__init__.py +20 -1
- evalscope/benchmarks/arc/__init__.py +0 -5
- evalscope/benchmarks/arc/arc_adapter.py +23 -99
- evalscope/benchmarks/bbh/__init__.py +0 -4
- evalscope/benchmarks/bbh/bbh_adapter.py +19 -89
- evalscope/benchmarks/benchmark.py +70 -59
- evalscope/benchmarks/ceval/__init__.py +0 -5
- evalscope/benchmarks/ceval/ceval_adapter.py +22 -46
- evalscope/benchmarks/cmmlu/__init__.py +0 -5
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +20 -41
- evalscope/benchmarks/competition_math/__init__.py +0 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
- evalscope/benchmarks/data_adapter.py +114 -85
- evalscope/benchmarks/general_qa/__init__.py +0 -5
- evalscope/benchmarks/general_qa/general_qa_adapter.py +16 -19
- evalscope/benchmarks/gsm8k/__init__.py +0 -4
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +19 -98
- evalscope/benchmarks/hellaswag/__init__.py +0 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -96
- evalscope/benchmarks/humaneval/__init__.py +0 -4
- evalscope/benchmarks/humaneval/humaneval_adapter.py +16 -117
- evalscope/benchmarks/mmlu/__init__.py +0 -5
- evalscope/benchmarks/mmlu/mmlu_adapter.py +26 -48
- evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
- evalscope/benchmarks/race/__init__.py +0 -5
- evalscope/benchmarks/race/race_adapter.py +25 -53
- evalscope/benchmarks/trivia_qa/__init__.py +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +24 -97
- evalscope/benchmarks/truthful_qa/__init__.py +0 -5
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +23 -33
- evalscope/collections/__init__.py +3 -0
- evalscope/collections/evaluator.py +178 -0
- evalscope/collections/sampler.py +132 -0
- evalscope/collections/schema.py +122 -0
- evalscope/config.py +7 -5
- evalscope/constants.py +7 -28
- evalscope/evaluator/evaluator.py +66 -109
- evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
- evalscope/metrics/__init__.py +6 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/math_accuracy.py +193 -50
- evalscope/metrics/metrics.py +7 -4
- evalscope/metrics/rouge_metric.py +13 -8
- evalscope/models/__init__.py +14 -1
- evalscope/models/base_adapter.py +52 -0
- evalscope/models/chat_adapter.py +138 -0
- evalscope/models/choice_adapter.py +211 -0
- evalscope/models/custom_adapter.py +67 -0
- evalscope/models/local_model.py +74 -0
- evalscope/models/model.py +141 -0
- evalscope/models/server_adapter.py +104 -0
- evalscope/run.py +37 -66
- evalscope/run_arena.py +1 -1
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/chat_service.py +4 -3
- evalscope/utils/io_utils.py +8 -0
- evalscope/utils/logger.py +4 -0
- evalscope/utils/model_utils.py +10 -0
- evalscope/utils/utils.py +3 -25
- evalscope/version.py +2 -2
- {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/METADATA +32 -15
- {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/RECORD +75 -66
- tests/cli/test_collection.py +53 -0
- tests/cli/test_run.py +43 -1
- tests/rag/test_mteb.py +3 -2
- evalscope/models/api/__init__.py +0 -3
- evalscope/models/dummy_chat_model.py +0 -49
- evalscope/models/model_adapter.py +0 -525
- evalscope/models/openai_model.py +0 -103
- /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/LICENSE +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/WHEEL +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/top_level.txt +0 -0
evalscope/run.py
CHANGED
@@ -2,27 +2,23 @@
 """
 Run evaluation for LLMs.
 """
-import logging
 import os.path
-import torch
 from argparse import Namespace
 from datetime import datetime
 from typing import List, Optional, Union
 
 from evalscope.arguments import parse_args
+from evalscope.benchmarks import Benchmark, BenchmarkMeta
 from evalscope.config import TaskConfig, parse_task_config
-from evalscope.constants import
+from evalscope.constants import DEFAULT_WORK_DIR, EvalBackend
 from evalscope.evaluator import Evaluator
-from evalscope.models
-from evalscope.utils import
+from evalscope.models import LocalModel, get_local_model, initialize_model_adapter
+from evalscope.utils import seed_everything
 from evalscope.utils.io_utils import OutputsStructure, are_paths_same
 from evalscope.utils.logger import configure_logging, get_logger
 
 logger = get_logger()
 
-BENCHMARK_PATH_PREFIX = 'evalscope.benchmarks.'
-MEMBERS_TO_IMPORT = ['DATASET_ID', 'SUBSET_LIST', 'DataAdapterClass', 'ModelAdapterClass']
-
 
 def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig], Namespace]) -> Union[dict, List[dict]]:
     """Run evaluation task(s) based on the provided configuration."""
@@ -38,15 +34,13 @@ def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig], Namespace]
 
 def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
     """Run a single evaluation task."""
-
+    if task_cfg.seed is not None:
+        seed_everything(task_cfg.seed)
     outputs = setup_work_directory(task_cfg, run_time)
     configure_logging(task_cfg.debug, os.path.join(outputs.logs_dir, 'eval_log.log'))
 
-    task_cfg.dump_yaml(outputs.configs_dir)
-    logger.info(task_cfg)
-
     if task_cfg.eval_backend != EvalBackend.NATIVE:
-        return run_non_native_backend(task_cfg)
+        return run_non_native_backend(task_cfg, outputs)
     else:
         return evaluate_model(task_cfg, outputs)
 
@@ -68,7 +62,7 @@ def setup_work_directory(task_cfg: TaskConfig, run_time: str):
     return outputs
 
 
-def run_non_native_backend(task_cfg: TaskConfig) -> dict:
+def run_non_native_backend(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
    """Run evaluation using a non-native backend."""
     eval_backend = task_cfg.eval_backend
     eval_config = task_cfg.eval_config
@@ -78,6 +72,10 @@ def run_non_native_backend(task_cfg: TaskConfig) -> dict:
 
     backend_manager_class = get_backend_manager_class(eval_backend)
     backend_manager = backend_manager_class(config=eval_config)
+
+    task_cfg.dump_yaml(outputs.configs_dir)
+    logger.info(task_cfg)
+
     backend_manager.run()
 
     return dict()
@@ -102,75 +100,48 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
     """Evaluate the model based on the provided task configuration."""
     # Initialize evaluator
     eval_results = {}
-
+    base_model = get_local_model(task_cfg)
+    evaluators = []
     for dataset_name in task_cfg.datasets:
-        evaluator = create_evaluator(task_cfg, dataset_name, outputs)
+        evaluator = create_evaluator(task_cfg, dataset_name, outputs, base_model)
+        evaluators.append(evaluator)
+
+    # dump task_cfg to outputs.configs_dir after creating evaluators
+    task_cfg.dump_yaml(outputs.configs_dir)
+    logger.info(task_cfg)
+
+    for evaluator in evaluators:
         res_dict = evaluator.eval(infer_cfg=task_cfg.generation_config, debug=task_cfg.debug, limit=task_cfg.limit)
         eval_results[dataset_name] = res_dict
 
     return eval_results
 
 
-def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure):
+def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure, base_model: LocalModel):
     """Create an evaluator object for the specified dataset."""
-    imported_modules = import_module_util(BENCHMARK_PATH_PREFIX, dataset_name, MEMBERS_TO_IMPORT)
-    model_adapter = initialize_model_adapter(task_cfg, dataset_name, imported_modules)
-
-    dataset_config = task_cfg.dataset_args.get(dataset_name, {})
-    dataset_name_or_path = dataset_config.get('local_path') or imported_modules['DATASET_ID']
-    in_prompt_template = dataset_config.get('prompt_template', '')
-    few_shot_num = dataset_config.get('few_shot_num', None)
-    few_shot_random = dataset_config.get('few_shot_random', True)
-
-    data_adapter = imported_modules['DataAdapterClass'](
-        few_shot_num=few_shot_num,
-        few_shot_random=few_shot_random,
-        prompt_template=in_prompt_template,
-        outputs=outputs,
-    )
-    in_subset_list = dataset_config.get('subset_list', imported_modules['SUBSET_LIST'])
 
-
+    if dataset_name == 'data_collection':
+        # EvaluatorCollection is a collection of evaluators
+        from evalscope.collections import EvaluatorCollection
+        return EvaluatorCollection(task_cfg, outputs)
+
+    benchmark: BenchmarkMeta = Benchmark.get(dataset_name)
+
+    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
+    model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter, base_model)
+
+    # update task_cfg.dataset_args
+    task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict()
 
     return Evaluator(
-        dataset_name_or_path=
-        subset_list=in_subset_list,
+        dataset_name_or_path=benchmark.dataset_id,
         data_adapter=data_adapter,
         model_adapter=model_adapter,
-        use_cache=task_cfg.use_cache,
         outputs=outputs,
-
-        datasets_hub=task_cfg.dataset_hub,
-        stage=task_cfg.stage,
-        eval_type=task_cfg.eval_type,
-        overall_task_cfg=task_cfg,
+        task_cfg=task_cfg,
    )
 
 
-def initialize_model_adapter(task_cfg: TaskConfig, dataset_name: str, imported_modules):
-    """Initialize the model adapter based on the task configuration."""
-    if task_cfg.dry_run:
-        from evalscope.models.dummy_chat_model import DummyChatModel
-        return DummyChatModel(model_cfg=dict())
-    elif task_cfg.eval_type == EvalType.CUSTOM:
-        if not isinstance(task_cfg.model, CustomModel):
-            raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(task_cfg.model)}.')
-        from evalscope.models.model_adapter import CustomModelAdapter
-        return CustomModelAdapter(custom_model=task_cfg.model)
-    else:
-        device_map = task_cfg.model_args.get('device_map', 'auto') if torch.cuda.is_available() else None
-        model_precision = task_cfg.model_args.get('precision', torch.float16)
-        if isinstance(model_precision, str) and model_precision != 'auto':
-            model_precision = eval(model_precision)
-        return imported_modules['ModelAdapterClass'](
-            model_id=task_cfg.model,
-            model_revision=task_cfg.model_args.get('revision', DEFAULT_MODEL_REVISION),
-            device_map=device_map,
-            torch_dtype=model_precision,
-            generation_config=task_cfg.generation_config,
-            chat_template=task_cfg.chat_template)
-
-
 def main():
     args = parse_args()
     run_task(args)
evalscope/run_arena.py
CHANGED
@@ -10,7 +10,7 @@ from tqdm import tqdm
 
 from evalscope.constants import EvalConfigKeys
 from evalscope.evaluator.rating_eval import RatingEvaluate
-from evalscope.models
+from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils import get_obj_from_cfg
 from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list, yaml_to_dict
 from evalscope.utils.logger import get_logger
evalscope/utils/__init__.py
CHANGED
evalscope/utils/chat_service.py
CHANGED
@@ -3,11 +3,10 @@ import time
 import torch
 from contextlib import contextmanager
 from functools import partial
-from modelscope import AutoModelForCausalLM, AutoTokenizer
 from pydantic import BaseModel, Field
 from threading import Thread
 from transformers import TextIteratorStreamer
-from typing import List, Literal, Optional, Union
+from typing import Any, List, Literal, Optional, Union
 
 
 class Usage(BaseModel):
@@ -66,7 +65,7 @@ class ChatCompletionResponseStreamChoice(BaseModel):
 class ChatCompletionResponse(BaseModel):
     model: str
     object: Literal['chat.completion', 'chat.completion.chunk']
-    choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
+    choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, Any]]
     created: Optional[int] = Field(default_factory=lambda: int(time.time()))
     usage: Optional[Usage]
 
@@ -96,6 +95,8 @@ class TextCompletionResponse(BaseModel):
 class ChatService:
 
     def __init__(self, model_path, attn_implementation):
+        from modelscope import AutoModelForCausalLM, AutoTokenizer
+
         self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         self.model = AutoModelForCausalLM.from_pretrained(
             model_path,
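The `modelscope` import now happens inside `ChatService.__init__` rather than at module import time, so importing the module no longer requires modelscope to be installed. A small, self-contained sketch of the same deferred-import pattern (the class below is illustrative, not part of evalscope):

```python
class LazyChatService:
    """Illustrative only: heavy dependencies are resolved on instantiation."""

    def __init__(self, model_path: str):
        # Importing here means `import this_module` works even when modelscope
        # is absent; the cost is paid only when the service is constructed.
        from modelscope import AutoModelForCausalLM, AutoTokenizer

        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(model_path)
```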
evalscope/utils/io_utils.py
CHANGED
@@ -160,3 +160,11 @@ def are_paths_same(path1, path2):
     real_path2 = os.path.realpath(os.path.abspath(os.path.expanduser(path2)))
 
     return real_path1 == real_path2
+
+
+def dict_to_json(d: dict, json_file: str):
+    """
+    Dump dict to json file.
+    """
+    with open(json_file, 'w') as f:
+        json.dump(d, f, indent=4, ensure_ascii=False)
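A minimal usage sketch for the new helper; the output filename is illustrative, and it assumes the existing `json` import at the top of io_utils.py:

```python
from evalscope.utils.io_utils import dict_to_json

# Writes a pretty-printed report; ensure_ascii=False keeps non-ASCII
# benchmark names (e.g. Chinese subset names) readable in the file.
dict_to_json({'dataset': 'ceval', 'score': 0.8123}, 'report.json')
```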
evalscope/utils/logger.py
CHANGED
@@ -14,6 +14,10 @@ DEFAULT_LEVEL = logging.DEBUG if os.getenv('LOG_LEVEL', 'INFO') == 'DEBUG' else
 
 logging.basicConfig(format=simple_format, level=DEFAULT_LEVEL)
 
+# disable datasets logging
+logging.getLogger('datasets').setLevel(logging.WARNING)
+logging.getLogger('modelscope').setLevel(logging.WARNING)
+
 
 def get_logger(log_file: Optional[str] = None, log_level: int = DEFAULT_LEVEL, file_mode: str = 'w', force=False):
     """Get logging logger
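Because the `datasets` and `modelscope` loggers are now capped at WARNING when this module is imported, they stay quiet by default; they can still be re-enabled for a debugging run with standard-library calls (a sketch, not an evalscope API):

```python
import logging

# Re-enable verbose output from the silenced third-party libraries.
logging.getLogger('datasets').setLevel(logging.DEBUG)
logging.getLogger('modelscope').setLevel(logging.INFO)

# evalscope's own default level is still governed by the LOG_LEVEL
# environment variable read at import time (see DEFAULT_LEVEL above).
```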
evalscope/utils/model_utils.py
CHANGED
@@ -1,6 +1,16 @@
+from enum import Enum
 from transformers import GenerationConfig
 
 
+class EvalBackend(Enum):
+    # NOTE: compatible with ms-swfit v2.x
+    NATIVE = 'Native'
+    OPEN_COMPASS = 'OpenCompass'
+    VLM_EVAL_KIT = 'VLMEvalKit'
+    RAG_EVAL = 'RAGEval'
+    THIRD_PARTY = 'ThirdParty'
+
+
 def fix_do_sample_warning(generation_config: GenerationConfig) -> None:
     # Use the default values of temperature/top_p/top_k in generation_config.
     if generation_config.temperature == 0:
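The enum keeps the string spellings used by ms-swift v2.x configs, so a backend name coming from YAML or JSON maps directly onto a member. A small illustrative check, assuming the enum is importable from `evalscope.utils.model_utils` as defined above:

```python
from evalscope.utils.model_utils import EvalBackend

# Members wrap the legacy string values...
assert EvalBackend.OPEN_COMPASS.value == 'OpenCompass'

# ...so a string from a config file can be looked up by value.
backend = EvalBackend('Native')
assert backend is EvalBackend.NATIVE
```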
evalscope/utils/utils.py
CHANGED
@@ -121,7 +121,6 @@ class ResponseParser:
             f'([{options_concat}])\s?是正确答案',
             f'选项\s?([{options_concat}])\s?正确',
             f'所以答\s?([{options_concat}])',
-            f'1.\s?([{options_concat}])[.。$]?$',
             f'所以\s?([{options_concat}][.。$]?$)',
             f'所有\s?([{options_concat}][.。$]?$)',
             f'[\s,::,]([{options_concat}])[。,,\.]?$',
@@ -137,16 +136,15 @@ class ResponseParser:
             f'答案为(.*?)[{options_concat}]',
             f'固选(.*?)[{options_concat}]',
             f'答案应该是(.*?)[{options_concat}]',
-            f'[Tt]he answer is [{options_concat}]',
+            f'[Tt]he answer is \(?[{options_concat}]\)?',
             f'[Tt]he correct answer is [{options_concat}]',
             f'[Tt]he correct answer is:\n[{options_concat}]',
             f'(\s|^)[{options_concat}][\s。,,\.$]',  # noqa
-            f'[{options_concat}]',
             f'^选项\s?([{options_concat}])',
             f'^([{options_concat}])\s?选?项',
             f'(\s|^)[{options_concat}][\s。,,::\.$]',
             f'(\s|^)[{options_concat}](\s|$)',
-            f'
+            f'[{options_concat}]',
         ]
 
         regexes = [re.compile(pattern) for pattern in patterns]
@@ -169,6 +167,7 @@ class ResponseParser:
         """
         patterns = [
             r'[Aa]nswer:\s*(\w+)',
+            r'answer is \(?(\w+)\)?',
             r'[Tt]he correct answer is:\s*(\w+)',
             r'[Tt]he correct answer is:\n\s*(\w+)',
             r'[Tt]he correct answer is:\n\n-\s*(\w+)',
@@ -199,27 +198,6 @@ class ResponseParser:
 
 
 
-def import_module_util(import_path_prefix: str, module_name: str, members_to_import: list) -> dict:
-    """
-    Import module utility function.
-
-    Args:
-        import_path_prefix: e.g. 'evalscope.benchmarks.'
-        module_name: The module name to import. e.g. 'mmlu'
-        members_to_import: The members to import.
-            e.g. ['DATASET_ID', 'SUBJECT_MAPPING', 'SUBSET_LIST', 'DataAdapterClass']
-
-    Returns:
-        dict: imported modules map. e.g. {'DATASET_ID': 'mmlu', 'SUBJECT_MAPPING': {...}, ...}
-    """
-    imported_modules = {}
-    module = importlib.import_module(import_path_prefix + module_name)
-    for member_name in members_to_import:
-        imported_modules[member_name] = getattr(module, member_name)
-
-    return imported_modules
-
-
 def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
     """
     Normalize score.
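The answer-extraction patterns now tolerate a parenthesized option, e.g. the "The answer is (B)" style produced by MMLU-Pro prompts. An illustrative check with the raw patterns, outside the `ResponseParser` class:

```python
import re

options_concat = 'ABCD'
old_pattern = rf'[Tt]he answer is [{options_concat}]'
new_pattern = rf'[Tt]he answer is \(?[{options_concat}]\)?'

text = 'The answer is (B).'
print(re.search(old_pattern, text))  # None: "(" breaks the old pattern
print(re.search(new_pattern, text))  # matches "The answer is (B)"
```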
evalscope/version.py
CHANGED
{evalscope-0.8.2.dist-info → evalscope-0.9.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.8.2
+Version: 0.9.0
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -160,14 +160,16 @@ Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
 > ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!
 
 ## 📋 Contents
-- [Introduction](
-- [News](
-- [Installation](
-- [Quick Start](
+- [Introduction](#-introduction)
+- [News](#-news)
+- [Installation](#️-installation)
+- [Quick Start](#-quick-start)
 - [Evaluation Backend](#evaluation-backend)
-- [Custom Dataset Evaluation](
-- [Model Serving Performance Evaluation](
-- [Arena Mode](
+- [Custom Dataset Evaluation](#️-custom-dataset-evaluation)
+- [Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
+- [Arena Mode](#-arena-mode)
+- [Contribution](#️-contribution)
+- [Roadmap](#-roadmap)
 
 
 ## 📝 Introduction
@@ -208,11 +210,15 @@ Please scan the QR code below to join our community groups:
 
 
 ## 🎉 News
+- 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
 - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
 - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
 - 🔥 **[2024.10.31]** The best practice for evaluating Multimodal-RAG has been updated, please check the [📖 Blog](https://evalscope.readthedocs.io/zh-cn/latest/blog/RAG/multimodal_RAG.html#multimodal-rag) for more details.
 - 🔥 **[2024.10.23]** Supports multimodal RAG evaluation, including the assessment of image-text retrieval using [CLIP_Benchmark](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/clip_benchmark.html), and extends [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html) to support end-to-end multimodal metrics evaluation.
 - 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
+
+<details><summary>More</summary>
+
 - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
 - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
 - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
@@ -224,7 +230,7 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2024.06.13]** EvalScope seamlessly integrates with the fine-tuning framework SWIFT, providing full-chain support from LLM training to evaluation.
 - 🔥 **[2024.06.13]** Integrated the Agent evaluation dataset ToolBench.
 
-
+</details>
 
 ## 🛠️ Installation
 ### Method 1: Install Using pip
@@ -414,7 +420,7 @@ EvalScope supports using third-party evaluation frameworks to initiate evaluatio
 - **ThirdParty**: Third-party evaluation tasks, such as [ToolBench](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) and [LongBench-Write](https://evalscope.readthedocs.io/en/latest/third_party/longwriter.html).
 
 
-## Model Serving Performance Evaluation
+## 📈 Model Serving Performance Evaluation
 A stress testing tool focused on large language models, which can be customized to support various dataset formats and different API protocol formats.
 
 Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html)
@@ -439,19 +445,32 @@ Speed Benchmark Results:
 +---------------+-----------------+----------------+
 ```
 
-## Custom Dataset Evaluation
+## 🖊️ Custom Dataset Evaluation
 EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/index.html)
 
 
-## Arena Mode
+## 🏟️ Arena Mode
 The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.
 
 Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)
 
+## 👷♂️ Contribution
 
+EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn), is continuously optimizing its benchmark evaluation features! We invite you to refer to the [Contribution Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html) to easily add your own evaluation benchmarks and share your contributions with the community. Let’s work together to support the growth of EvalScope and make our tools even better! Join us now!
 
+<a href="https://github.com/modelscope/evalscope/graphs/contributors" target="_blank">
+  <table>
+    <tr>
+      <th colspan="2">
+        <br><img src="https://contrib.rocks/image?repo=modelscope/evalscope"><br><br>
+      </th>
+    </tr>
+  </table>
+</a>
 
-##
+## 🔜 Roadmap
+- [ ] Support for better evaluation report visualization
+- [x] Support for mixed evaluations across multiple datasets
 - [x] RAG evaluation
 - [x] VLM evaluation
 - [x] Agents evaluation
@@ -462,8 +481,6 @@ Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/lates
 - [ ] GAIA
 - [ ] GPQA
 - [x] MBPP
-- [ ] Auto-reviewer
-- [ ] Qwen-max
 
 
 ## Star History