evalscope 0.8.1__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/__init__.py +2 -0
- evalscope/arguments.py +10 -3
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/__init__.py +20 -1
- evalscope/benchmarks/arc/__init__.py +0 -5
- evalscope/benchmarks/arc/arc_adapter.py +23 -99
- evalscope/benchmarks/bbh/__init__.py +0 -4
- evalscope/benchmarks/bbh/bbh_adapter.py +19 -89
- evalscope/benchmarks/benchmark.py +70 -59
- evalscope/benchmarks/ceval/__init__.py +0 -5
- evalscope/benchmarks/ceval/ceval_adapter.py +22 -46
- evalscope/benchmarks/cmmlu/__init__.py +0 -5
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +20 -41
- evalscope/benchmarks/competition_math/__init__.py +0 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
- evalscope/benchmarks/data_adapter.py +114 -85
- evalscope/benchmarks/general_qa/__init__.py +0 -5
- evalscope/benchmarks/general_qa/general_qa_adapter.py +16 -19
- evalscope/benchmarks/gsm8k/__init__.py +0 -4
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +19 -98
- evalscope/benchmarks/hellaswag/__init__.py +0 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +23 -96
- evalscope/benchmarks/humaneval/__init__.py +0 -4
- evalscope/benchmarks/humaneval/humaneval_adapter.py +16 -117
- evalscope/benchmarks/mmlu/__init__.py +0 -5
- evalscope/benchmarks/mmlu/mmlu_adapter.py +26 -48
- evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
- evalscope/benchmarks/race/__init__.py +0 -5
- evalscope/benchmarks/race/race_adapter.py +25 -53
- evalscope/benchmarks/trivia_qa/__init__.py +0 -5
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +24 -97
- evalscope/benchmarks/truthful_qa/__init__.py +0 -5
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +23 -33
- evalscope/collections/__init__.py +3 -0
- evalscope/collections/evaluator.py +178 -0
- evalscope/collections/sampler.py +132 -0
- evalscope/collections/schema.py +122 -0
- evalscope/config.py +10 -6
- evalscope/constants.py +7 -28
- evalscope/evaluator/evaluator.py +66 -108
- evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
- evalscope/metrics/__init__.py +6 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/math_accuracy.py +193 -50
- evalscope/metrics/metrics.py +7 -4
- evalscope/metrics/rouge_metric.py +13 -8
- evalscope/models/__init__.py +14 -1
- evalscope/models/base_adapter.py +52 -0
- evalscope/models/chat_adapter.py +138 -0
- evalscope/models/choice_adapter.py +211 -0
- evalscope/models/custom_adapter.py +67 -0
- evalscope/models/local_model.py +74 -0
- evalscope/models/model.py +141 -0
- evalscope/models/server_adapter.py +104 -0
- evalscope/perf/arguments.py +1 -0
- evalscope/perf/benchmark.py +1 -1
- evalscope/perf/main.py +3 -1
- evalscope/perf/plugin/api/openai_api.py +51 -47
- evalscope/perf/utils/local_server.py +1 -0
- evalscope/run.py +37 -66
- evalscope/run_arena.py +1 -1
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/chat_service.py +4 -3
- evalscope/utils/io_utils.py +8 -0
- evalscope/utils/logger.py +4 -0
- evalscope/utils/model_utils.py +10 -0
- evalscope/utils/utils.py +3 -25
- evalscope/version.py +2 -2
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/METADATA +46 -17
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/RECORD +81 -92
- tests/cli/test_collection.py +53 -0
- tests/cli/test_run.py +43 -1
- tests/perf/test_perf.py +3 -3
- tests/rag/test_mteb.py +3 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
- evalscope/models/api/__init__.py +0 -3
- evalscope/models/dummy_chat_model.py +0 -49
- evalscope/models/model_adapter.py +0 -525
- evalscope/models/openai_model.py +0 -103
- /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/LICENSE +0 -0
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/WHEEL +0 -0
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/top_level.txt +0 -0
evalscope/perf/plugin/api/openai_api.py
CHANGED

@@ -96,60 +96,64 @@ class OpenaiPlugin(ApiPluginBase):

     def parse_responses(self, responses, request: Any = None, **kwargs) -> Dict:
         """Parser responses and return number of request and response tokens.
-
-
+        Only one response for non-stream, multiple responses for stream.
+        """

+        # when stream, the last response is the full usage
+        # when non-stream, the last response is the first response
+        last_response_js = json.loads(responses[-1])
+        if 'usage' in last_response_js and last_response_js['usage']:
+            input_tokens = last_response_js['usage']['prompt_tokens']
+            output_tokens = last_response_js['usage']['completion_tokens']
+            return input_tokens, output_tokens

-
-            responses (List[bytes]): List of http response body, for stream output,
-                there are multiple responses, for general only one.
-            kwargs: (Any): The command line --parameter content.
-        Returns:
-            Tuple: Return number of prompt token and number of completion tokens.
-        """
-        full_response_content = ''
+        # no usage information in the response, parse the response to get the tokens
         delta_contents = {}
-        input_tokens = None
-        output_tokens = None
         for response in responses:
             js = json.loads(response)
-            if ...  [old inline parsing logic collapsed in the source view]
+            if 'object' in js:
+                self.__process_response_object(js, delta_contents)
+            else:
+                self.__process_no_object(js, delta_contents)
+
+        input_tokens, output_tokens = self.__calculate_tokens_from_content(request, delta_contents)
+        return input_tokens, output_tokens
+
+    def __process_response_object(self, js, delta_contents):
+        if js['object'] == 'chat.completion':
+            for choice in js['choices']:
+                delta_contents[choice['index']] = [choice['message']['content']]
+        elif js['object'] == 'text_completion':
+            for choice in js['choices']:
+                delta_contents[choice['index']] = [choice['text']]
+        elif js['object'] == 'chat.completion.chunk':
+            for choice in js.get('choices', []):
+                if 'delta' in choice and 'index' in choice:
+                    delta = choice['delta']
+                    idx = choice['index']
+                    if 'content' in delta:
+                        delta_content = delta['content']
+                        delta_contents.setdefault(idx, []).append(delta_content)
+
+    def __process_no_object(self, js, delta_contents):
+        # assume the response is a single choice
+        for choice in js['choices']:
+            if 'delta' in choice:
+                delta = choice['delta']
+                idx = choice['index']
+                if 'content' in delta:
+                    delta_content = delta['content']
+                    delta_contents.setdefault(idx, []).append(delta_content)
+            else:
+                delta_contents[choice['index']] = [choice['message']['content']]
+
+    def __calculate_tokens_from_content(self, request, delta_contents):
+        input_tokens = output_tokens = 0
+        if self.tokenizer is not None:
             for idx, choice_contents in delta_contents.items():
-                full_response_content = ''.join(
+                full_response_content = ''.join(choice_contents)
                 input_tokens += len(self.tokenizer.encode(request['messages'][0]['content']))
                 output_tokens += len(self.tokenizer.encode(full_response_content))
-
-            input_tokens = 0
-            output_tokens = 0
+        else:
             logger.warning('No usage information found. Please specify `--tokenizer-path` to generate usage details.')
-
         return input_tokens, output_tokens
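Net effect of the hunk above: `parse_responses` now trusts the `usage` block reported by the server (carried by the last chunk when streaming) and only falls back to counting tokens with a local tokenizer. A minimal standalone sketch of that ordering, using illustrative names (`count_request_tokens`, `prompt`) rather than evalscope's own API:

```python
import json
from typing import List, Tuple


def count_request_tokens(responses: List[str], tokenizer=None, prompt: str = '') -> Tuple[int, int]:
    """Sketch of the usage-first accounting: returns (input_tokens, output_tokens)."""
    # For streaming, the final chunk carries the full usage; for non-streaming,
    # the single response carries it.
    last = json.loads(responses[-1])
    usage = last.get('usage')
    if usage:
        return usage['prompt_tokens'], usage['completion_tokens']

    # Fallback: re-count locally (evalscope warns here unless --tokenizer-path is set).
    input_tokens = output_tokens = 0
    if tokenizer is not None:
        completion = ''.join(
            choice.get('delta', choice.get('message', {})).get('content', '')
            for chunk in map(json.loads, responses)
            for choice in chunk.get('choices', []))
        input_tokens = len(tokenizer.encode(prompt))
        output_tokens = len(tokenizer.encode(completion))
    return input_tokens, output_tokens
```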
evalscope/perf/utils/local_server.py
CHANGED

@@ -103,6 +103,7 @@ def start_app(args: Arguments):
     elif args.api == 'local_vllm':
         os.environ['VLLM_USE_MODELSCOPE'] = 'True'
         os.environ['VLLM_ALLOW_LONG_MAX_MODEL_LEN'] = '1'
+        os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
         # yapf: disable
         proc = subprocess.Popen([
             'python', '-m', 'vllm.entrypoints.openai.api_server',
evalscope/run.py
CHANGED
@@ -2,27 +2,23 @@
 """
 Run evaluation for LLMs.
 """
-import logging
 import os.path
-import torch
 from argparse import Namespace
 from datetime import datetime
 from typing import List, Optional, Union

 from evalscope.arguments import parse_args
+from evalscope.benchmarks import Benchmark, BenchmarkMeta
 from evalscope.config import TaskConfig, parse_task_config
-from evalscope.constants import
+from evalscope.constants import DEFAULT_WORK_DIR, EvalBackend
 from evalscope.evaluator import Evaluator
-from evalscope.models
-from evalscope.utils import
+from evalscope.models import LocalModel, get_local_model, initialize_model_adapter
+from evalscope.utils import seed_everything
 from evalscope.utils.io_utils import OutputsStructure, are_paths_same
 from evalscope.utils.logger import configure_logging, get_logger

 logger = get_logger()

-BENCHMARK_PATH_PREFIX = 'evalscope.benchmarks.'
-MEMBERS_TO_IMPORT = ['DATASET_ID', 'SUBSET_LIST', 'DataAdapterClass', 'ModelAdapterClass']
-

 def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig], Namespace]) -> Union[dict, List[dict]]:
     """Run evaluation task(s) based on the provided configuration."""

@@ -38,15 +34,13 @@ def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig], Namespace]

 def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
     """Run a single evaluation task."""
-
+    if task_cfg.seed is not None:
+        seed_everything(task_cfg.seed)
     outputs = setup_work_directory(task_cfg, run_time)
     configure_logging(task_cfg.debug, os.path.join(outputs.logs_dir, 'eval_log.log'))

-    task_cfg.dump_yaml(outputs.configs_dir)
-    logger.info(task_cfg)
-
     if task_cfg.eval_backend != EvalBackend.NATIVE:
-        return run_non_native_backend(task_cfg)
+        return run_non_native_backend(task_cfg, outputs)
     else:
         return evaluate_model(task_cfg, outputs)

@@ -68,7 +62,7 @@ def setup_work_directory(task_cfg: TaskConfig, run_time: str):
     return outputs


-def run_non_native_backend(task_cfg: TaskConfig) -> dict:
+def run_non_native_backend(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
     """Run evaluation using a non-native backend."""
     eval_backend = task_cfg.eval_backend
     eval_config = task_cfg.eval_config

@@ -78,6 +72,10 @@ def run_non_native_backend(task_cfg: TaskConfig) -> dict:

     backend_manager_class = get_backend_manager_class(eval_backend)
     backend_manager = backend_manager_class(config=eval_config)
+
+    task_cfg.dump_yaml(outputs.configs_dir)
+    logger.info(task_cfg)
+
     backend_manager.run()

     return dict()

@@ -102,75 +100,48 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
     """Evaluate the model based on the provided task configuration."""
     # Initialize evaluator
     eval_results = {}
-
+    base_model = get_local_model(task_cfg)
+    evaluators = []
     for dataset_name in task_cfg.datasets:
-        evaluator = create_evaluator(task_cfg, dataset_name, outputs)
+        evaluator = create_evaluator(task_cfg, dataset_name, outputs, base_model)
+        evaluators.append(evaluator)
+
+    # dump task_cfg to outputs.configs_dir after creating evaluators
+    task_cfg.dump_yaml(outputs.configs_dir)
+    logger.info(task_cfg)
+
+    for evaluator in evaluators:
         res_dict = evaluator.eval(infer_cfg=task_cfg.generation_config, debug=task_cfg.debug, limit=task_cfg.limit)
         eval_results[dataset_name] = res_dict

     return eval_results


-def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure):
+def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure, base_model: LocalModel):
     """Create an evaluator object for the specified dataset."""
-    imported_modules = import_module_util(BENCHMARK_PATH_PREFIX, dataset_name, MEMBERS_TO_IMPORT)
-    model_adapter = initialize_model_adapter(task_cfg, dataset_name, imported_modules)
-
-    dataset_config = task_cfg.dataset_args.get(dataset_name, {})
-    dataset_name_or_path = dataset_config.get('local_path') or imported_modules['DATASET_ID']
-    in_prompt_template = dataset_config.get('prompt_template', '')
-    few_shot_num = dataset_config.get('few_shot_num', None)
-    few_shot_random = dataset_config.get('few_shot_random', True)
-
-    data_adapter = imported_modules['DataAdapterClass'](
-        few_shot_num=few_shot_num,
-        few_shot_random=few_shot_random,
-        prompt_template=in_prompt_template,
-        outputs=outputs,
-    )
-    in_subset_list = dataset_config.get('subset_list', imported_modules['SUBSET_LIST'])

-
+    if dataset_name == 'data_collection':
+        # EvaluatorCollection is a collection of evaluators
+        from evalscope.collections import EvaluatorCollection
+        return EvaluatorCollection(task_cfg, outputs)
+
+    benchmark: BenchmarkMeta = Benchmark.get(dataset_name)
+
+    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
+    model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter, base_model)
+
+    # update task_cfg.dataset_args
+    task_cfg.dataset_args[dataset_name] = benchmark.to_string_dict()

     return Evaluator(
-        dataset_name_or_path=
-        subset_list=in_subset_list,
+        dataset_name_or_path=benchmark.dataset_id,
         data_adapter=data_adapter,
         model_adapter=model_adapter,
-        use_cache=task_cfg.use_cache,
         outputs=outputs,
-
-        datasets_hub=task_cfg.dataset_hub,
-        stage=task_cfg.stage,
-        eval_type=task_cfg.eval_type,
-        overall_task_cfg=task_cfg,
+        task_cfg=task_cfg,
     )


-def initialize_model_adapter(task_cfg: TaskConfig, dataset_name: str, imported_modules):
-    """Initialize the model adapter based on the task configuration."""
-    if task_cfg.dry_run:
-        from evalscope.models.dummy_chat_model import DummyChatModel
-        return DummyChatModel(model_cfg=dict())
-    elif task_cfg.eval_type == EvalType.CUSTOM:
-        if not isinstance(task_cfg.model, CustomModel):
-            raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(task_cfg.model)}.')
-        from evalscope.models.model_adapter import CustomModelAdapter
-        return CustomModelAdapter(custom_model=task_cfg.model)
-    else:
-        device_map = task_cfg.model_args.get('device_map', 'auto') if torch.cuda.is_available() else None
-        model_precision = task_cfg.model_args.get('precision', torch.float16)
-        if isinstance(model_precision, str) and model_precision != 'auto':
-            model_precision = eval(model_precision)
-        return imported_modules['ModelAdapterClass'](
-            model_id=task_cfg.model,
-            model_revision=task_cfg.model_args.get('revision', DEFAULT_MODEL_REVISION),
-            device_map=device_map,
-            torch_dtype=model_precision,
-            generation_config=task_cfg.generation_config,
-            chat_template=task_cfg.chat_template)
-
-
 def main():
     args = parse_args()
     run_task(args)
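Taken together, the run.py hunks replace the old `import_module_util` plumbing with a benchmark registry plus a shared local model. A rough sketch of how the new `create_evaluator` path fits together, limited to the calls visible in the diff above (the `'mmlu'` dataset name and the `build_evaluator` wrapper are illustrative, not part of evalscope):

```python
from evalscope.benchmarks import Benchmark, BenchmarkMeta
from evalscope.evaluator import Evaluator
from evalscope.models import get_local_model, initialize_model_adapter


def build_evaluator(task_cfg, outputs, dataset_name: str = 'mmlu') -> Evaluator:
    """Illustrative wrapper mirroring create_evaluator in the diff above."""
    base_model = get_local_model(task_cfg)                  # loaded once, shared across datasets
    benchmark: BenchmarkMeta = Benchmark.get(dataset_name)  # registry lookup replaces import_module_util
    data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args.get(dataset_name, {}))
    model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter, base_model)
    return Evaluator(
        dataset_name_or_path=benchmark.dataset_id,
        data_adapter=data_adapter,
        model_adapter=model_adapter,
        outputs=outputs,
        task_cfg=task_cfg,
    )
```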
evalscope/run_arena.py
CHANGED
@@ -10,7 +10,7 @@ from tqdm import tqdm

 from evalscope.constants import EvalConfigKeys
 from evalscope.evaluator.rating_eval import RatingEvaluate
-from evalscope.models
+from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils import get_obj_from_cfg
 from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list, yaml_to_dict
 from evalscope.utils.logger import get_logger
evalscope/utils/__init__.py
CHANGED
evalscope/utils/chat_service.py
CHANGED
@@ -3,11 +3,10 @@ import time
 import torch
 from contextlib import contextmanager
 from functools import partial
-from modelscope import AutoModelForCausalLM, AutoTokenizer
 from pydantic import BaseModel, Field
 from threading import Thread
 from transformers import TextIteratorStreamer
-from typing import List, Literal, Optional, Union
+from typing import Any, List, Literal, Optional, Union


 class Usage(BaseModel):

@@ -66,7 +65,7 @@ class ChatCompletionResponseStreamChoice(BaseModel):
 class ChatCompletionResponse(BaseModel):
     model: str
     object: Literal['chat.completion', 'chat.completion.chunk']
-    choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
+    choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, Any]]
     created: Optional[int] = Field(default_factory=lambda: int(time.time()))
     usage: Optional[Usage]

@@ -96,6 +95,8 @@ class TextCompletionResponse(BaseModel):
 class ChatService:

     def __init__(self, model_path, attn_implementation):
+        from modelscope import AutoModelForCausalLM, AutoTokenizer
+
         self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         self.model = AutoModelForCausalLM.from_pretrained(
             model_path,
evalscope/utils/io_utils.py
CHANGED
@@ -160,3 +160,11 @@ def are_paths_same(path1, path2):
     real_path2 = os.path.realpath(os.path.abspath(os.path.expanduser(path2)))

     return real_path1 == real_path2
+
+
+def dict_to_json(d: dict, json_file: str):
+    """
+    Dump dict to json file.
+    """
+    with open(json_file, 'w') as f:
+        json.dump(d, f, indent=4, ensure_ascii=False)
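The new `dict_to_json` helper writes pretty-printed JSON (indent of 4, `ensure_ascii=False` so non-ASCII text stays readable). A quick usage example with an illustrative payload and output path:

```python
from evalscope.utils.io_utils import dict_to_json

# illustrative report payload and output path
dict_to_json({'model': 'qwen2.5-0.5b-instruct', 'dataset': 'gsm8k', 'score': 0.8123},
             'outputs/report.json')
```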
evalscope/utils/logger.py
CHANGED
@@ -14,6 +14,10 @@ DEFAULT_LEVEL = logging.DEBUG if os.getenv('LOG_LEVEL', 'INFO') == 'DEBUG' else

 logging.basicConfig(format=simple_format, level=DEFAULT_LEVEL)

+# disable datasets logging
+logging.getLogger('datasets').setLevel(logging.WARNING)
+logging.getLogger('modelscope').setLevel(logging.WARNING)
+

 def get_logger(log_file: Optional[str] = None, log_level: int = DEFAULT_LEVEL, file_mode: str = 'w', force=False):
     """Get logging logger
evalscope/utils/model_utils.py
CHANGED
@@ -1,6 +1,16 @@
+from enum import Enum
 from transformers import GenerationConfig


+class EvalBackend(Enum):
+    # NOTE: compatible with ms-swfit v2.x
+    NATIVE = 'Native'
+    OPEN_COMPASS = 'OpenCompass'
+    VLM_EVAL_KIT = 'VLMEvalKit'
+    RAG_EVAL = 'RAGEval'
+    THIRD_PARTY = 'ThirdParty'
+
+
 def fix_do_sample_warning(generation_config: GenerationConfig) -> None:
     # Use the default values of temperature/top_p/top_k in generation_config.
     if generation_config.temperature == 0:
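For reference, the backend values added here mirror the ones run.py already checks against (`EvalBackend.NATIVE` above; the run.py imports pull the enum from `evalscope.constants`). A small illustrative lookup by value:

```python
from evalscope.utils.model_utils import EvalBackend

# look up a backend by its configured string value
backend = EvalBackend('OpenCompass')
assert backend is EvalBackend.OPEN_COMPASS
assert backend is not EvalBackend.NATIVE
```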
evalscope/utils/utils.py
CHANGED
@@ -121,7 +121,6 @@ class ResponseParser:
             f'([{options_concat}])\s?是正确答案',
             f'选项\s?([{options_concat}])\s?正确',
             f'所以答\s?([{options_concat}])',
-            f'1.\s?([{options_concat}])[.。$]?$',
             f'所以\s?([{options_concat}][.。$]?$)',
             f'所有\s?([{options_concat}][.。$]?$)',
             f'[\s,::,]([{options_concat}])[。,,\.]?$',

@@ -137,16 +136,15 @@ class ResponseParser:
             f'答案为(.*?)[{options_concat}]',
             f'固选(.*?)[{options_concat}]',
             f'答案应该是(.*?)[{options_concat}]',
-            f'[Tt]he answer is [{options_concat}]',
+            f'[Tt]he answer is \(?[{options_concat}]\)?',
             f'[Tt]he correct answer is [{options_concat}]',
             f'[Tt]he correct answer is:\n[{options_concat}]',
             f'(\s|^)[{options_concat}][\s。,,\.$]',  # noqa
-            f'[{options_concat}]',
             f'^选项\s?([{options_concat}])',
             f'^([{options_concat}])\s?选?项',
             f'(\s|^)[{options_concat}][\s。,,::\.$]',
             f'(\s|^)[{options_concat}](\s|$)',
-            f'
+            f'[{options_concat}]',
         ]

         regexes = [re.compile(pattern) for pattern in patterns]

@@ -169,6 +167,7 @@ class ResponseParser:
         """
         patterns = [
             r'[Aa]nswer:\s*(\w+)',
+            r'answer is \(?(\w+)\)?',
             r'[Tt]he correct answer is:\s*(\w+)',
             r'[Tt]he correct answer is:\n\s*(\w+)',
             r'[Tt]he correct answer is:\n\n-\s*(\w+)',

@@ -199,27 +198,6 @@ class ResponseParser:



-def import_module_util(import_path_prefix: str, module_name: str, members_to_import: list) -> dict:
-    """
-    Import module utility function.
-
-    Args:
-        import_path_prefix: e.g. 'evalscope.benchmarks.'
-        module_name: The module name to import. e.g. 'mmlu'
-        members_to_import: The members to import.
-            e.g. ['DATASET_ID', 'SUBJECT_MAPPING', 'SUBSET_LIST', 'DataAdapterClass']
-
-    Returns:
-        dict: imported modules map. e.g. {'DATASET_ID': 'mmlu', 'SUBJECT_MAPPING': {...}, ...}
-    """
-    imported_modules = {}
-    module = importlib.import_module(import_path_prefix + module_name)
-    for member_name in members_to_import:
-        imported_modules[member_name] = getattr(module, member_name)
-
-    return imported_modules
-
-
 def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
     """
     Normalize score.
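The two pattern changes above both target answers wrapped in parentheses, e.g. "The answer is (B)". A quick check of the updated expressions, assuming `options_concat` expands to the option letters 'ABCD':

```python
import re

options_concat = 'ABCD'  # assumption: concatenated option letters used by ResponseParser

choice_pattern = re.compile(f'[Tt]he answer is \\(?[{options_concat}]\\)?')
assert choice_pattern.search('The answer is (B).')      # parentheses are now optional

free_form_pattern = re.compile(r'answer is \(?(\w+)\)?')
assert free_form_pattern.search('So the answer is (42).').group(1) == '42'
```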
evalscope/version.py
CHANGED
{evalscope-0.8.1.dist-info → evalscope-0.9.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.8.1
+Version: 0.9.0
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team

@@ -84,7 +84,7 @@ Requires-Dist: transformers-stream-generator; extra == "all"
 Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
 Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
 Requires-Dist: mteb==1.19.4; extra == "all"
-Requires-Dist: ragas==0.2.
+Requires-Dist: ragas==0.2.9; extra == "all"
 Requires-Dist: webdataset>0.2.0; extra == "all"
 Requires-Dist: aiohttp; extra == "all"
 Requires-Dist: fastapi; extra == "all"

@@ -129,7 +129,7 @@ Requires-Dist: transformers; extra == "perf"
 Requires-Dist: unicorn; extra == "perf"
 Provides-Extra: rag
 Requires-Dist: mteb==1.19.4; extra == "rag"
-Requires-Dist: ragas==0.2.
+Requires-Dist: ragas==0.2.9; extra == "rag"
 Requires-Dist: webdataset>0.2.0; extra == "rag"
 Provides-Extra: vlmeval
 Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"

@@ -160,14 +160,16 @@ Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
 > ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!

 ## 📋 Contents
-- [Introduction](
-- [News](
-- [Installation](
-- [Quick Start](
+- [Introduction](#-introduction)
+- [News](#-news)
+- [Installation](#️-installation)
+- [Quick Start](#-quick-start)
 - [Evaluation Backend](#evaluation-backend)
-- [Custom Dataset Evaluation](
-- [Model Serving Performance Evaluation](
-- [Arena Mode](
+- [Custom Dataset Evaluation](#️-custom-dataset-evaluation)
+- [Model Serving Performance Evaluation](#-model-serving-performance-evaluation)
+- [Arena Mode](#-arena-mode)
+- [Contribution](#️-contribution)
+- [Roadmap](#-roadmap)


 ## 📝 Introduction

@@ -181,6 +183,8 @@ The framework accommodates multiple evaluation scenarios such as end-to-end RAG
   <br>EvalScope Framework.
 </p>

+<details><summary>Framework Description</summary>
+
 The architecture includes the following modules:
 1. **Model Adapter**: The model adapter is used to convert the outputs of specific models into the format required by the framework, supporting both API call models and locally run models.
 2. **Data Adapter**: The data adapter is responsible for converting and processing input data to meet various evaluation needs and formats.

@@ -194,13 +198,27 @@ The architecture includes the following modules:
 5. **Evaluation Report**: The final generated evaluation report summarizes the model's performance, which can be used for decision-making and further model optimization.
 6. **Visualization**: Visualization results help users intuitively understand evaluation results, facilitating analysis and comparison of different model performances.

+</details>
+
+## ☎ User Groups
+
+Please scan the QR code below to join our community groups:
+
+[Discord Group](https://discord.com/invite/D27yfEFVz5) | WeChat Group | DingTalk Group
+:-------------------------:|:-------------------------:|:-------------------------:
+<img src="docs/asset/discord_qr.jpg" width="160" height="160"> | <img src="docs/asset/wechat.png" width="160" height="160"> | <img src="docs/asset/dingding.png" width="160" height="160">
+

 ## 🎉 News
+- 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
 - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
 - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
 - 🔥 **[2024.10.31]** The best practice for evaluating Multimodal-RAG has been updated, please check the [📖 Blog](https://evalscope.readthedocs.io/zh-cn/latest/blog/RAG/multimodal_RAG.html#multimodal-rag) for more details.
 - 🔥 **[2024.10.23]** Supports multimodal RAG evaluation, including the assessment of image-text retrieval using [CLIP_Benchmark](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/clip_benchmark.html), and extends [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html) to support end-to-end multimodal metrics evaluation.
 - 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
+
+<details><summary>More</summary>
+
 - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
 - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
 - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.

@@ -212,7 +230,7 @@ The architecture includes the following modules:
 - 🔥 **[2024.06.13]** EvalScope seamlessly integrates with the fine-tuning framework SWIFT, providing full-chain support from LLM training to evaluation.
 - 🔥 **[2024.06.13]** Integrated the Agent evaluation dataset ToolBench.

-
+</details>

 ## 🛠️ Installation
 ### Method 1: Install Using pip

@@ -402,7 +420,7 @@ EvalScope supports using third-party evaluation frameworks to initiate evaluatio
 - **ThirdParty**: Third-party evaluation tasks, such as [ToolBench](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) and [LongBench-Write](https://evalscope.readthedocs.io/en/latest/third_party/longwriter.html).


-## Model Serving Performance Evaluation
+## 📈 Model Serving Performance Evaluation
 A stress testing tool focused on large language models, which can be customized to support various dataset formats and different API protocol formats.

 Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html)

@@ -427,19 +445,32 @@ Speed Benchmark Results:
 +---------------+-----------------+----------------+
 ```

-## Custom Dataset Evaluation
+## 🖊️ Custom Dataset Evaluation
 EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/index.html)


-## Arena Mode
+## 🏟️ Arena Mode
 The Arena mode allows multiple candidate models to be evaluated through pairwise battles, and can choose to use the AI Enhanced Auto-Reviewer (AAR) automatic evaluation process or manual evaluation to obtain the evaluation report.

 Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/arena.html)

+## 👷♂️ Contribution

+EvalScope, as the official evaluation tool of [ModelScope](https://modelscope.cn), is continuously optimizing its benchmark evaluation features! We invite you to refer to the [Contribution Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html) to easily add your own evaluation benchmarks and share your contributions with the community. Let’s work together to support the growth of EvalScope and make our tools even better! Join us now!

+<a href="https://github.com/modelscope/evalscope/graphs/contributors" target="_blank">
+  <table>
+    <tr>
+      <th colspan="2">
+        <br><img src="https://contrib.rocks/image?repo=modelscope/evalscope"><br><br>
+      </th>
+    </tr>
+  </table>
+</a>

-##
+## 🔜 Roadmap
+- [ ] Support for better evaluation report visualization
+- [x] Support for mixed evaluations across multiple datasets
 - [x] RAG evaluation
 - [x] VLM evaluation
 - [x] Agents evaluation

@@ -450,8 +481,6 @@ Refer to: Arena Mode [📖 User Guide](https://evalscope.readthedocs.io/en/lates
 - [ ] GAIA
 - [ ] GPQA
 - [x] MBPP
-- [ ] Auto-reviewer
-- [ ] Qwen-max


 ## Star History