evalscope 0.5.5__py3-none-any.whl → 0.5.5rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/backend/__init__.py +3 -0
- evalscope/backend/vlm_eval_kit/backend_manager.py +1 -0
- evalscope/backend/vlm_eval_kit/custom_dataset.py +1 -1
- evalscope/evaluator/evaluator.py +0 -1
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +0 -19
- evalscope/run.py +0 -4
- evalscope/utils/logger.py +14 -44
- evalscope/utils/task_utils.py +0 -3
- evalscope/version.py +2 -2
- {evalscope-0.5.5.dist-info → evalscope-0.5.5rc0.dist-info}/METADATA +30 -24
- {evalscope-0.5.5.dist-info → evalscope-0.5.5rc0.dist-info}/RECORD +14 -30
- evalscope/backend/rag_eval/__init__.py +0 -3
- evalscope/backend/rag_eval/backend_manager.py +0 -68
- evalscope/backend/rag_eval/cmteb/__init__.py +0 -4
- evalscope/backend/rag_eval/cmteb/arguments.py +0 -59
- evalscope/backend/rag_eval/cmteb/base.py +0 -89
- evalscope/backend/rag_eval/cmteb/task_template.py +0 -83
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +0 -302
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +0 -252
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +0 -113
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +0 -153
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +0 -345
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +0 -302
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +0 -64
- evalscope/backend/rag_eval/ragas/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/arguments.py +0 -37
- evalscope/backend/rag_eval/ragas/task_template.py +0 -117
- {evalscope-0.5.5.dist-info → evalscope-0.5.5rc0.dist-info}/WHEEL +0 -0
- {evalscope-0.5.5.dist-info → evalscope-0.5.5rc0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.5.5.dist-info → evalscope-0.5.5rc0.dist-info}/top_level.txt +0 -0
evalscope/backend/vlm_eval_kit/custom_dataset.py
CHANGED

@@ -8,7 +8,7 @@ class CustomDataset:
 
     def load_data(self, dataset):
         # customize the loading of the dataset
-        data_path = os.path.join(
+        data_path = os.path.join("~/LMUData", f'{dataset}.tsv')
         return load(data_path)
 
 
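For orientation, the patched load_data resolves every custom dataset as a TSV file under ~/LMUData. A minimal sketch of that behaviour, assuming a pandas-based stand-in for VLMEvalKit's load helper (the helper below is illustrative, not code from the package):

import os

import pandas as pd


def load(data_path: str):
    # hypothetical stand-in for the VLMEvalKit `load` helper used above
    return pd.read_csv(os.path.expanduser(data_path), sep='\t')


class CustomDataset:

    def load_data(self, dataset):
        # after this change the TSV is always looked up as ~/LMUData/<dataset>.tsv
        data_path = os.path.join("~/LMUData", f'{dataset}.tsv')
        return load(data_path)


# usage sketch: df = CustomDataset().load_data('my_custom_dataset')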
evalscope/evaluator/evaluator.py
CHANGED

@@ -174,7 +174,6 @@ class Evaluator(object):
         """
         assert self.data_adapter is not None, 'data_adapter must be provided when calling func get_answers() !'
         assert self.model_adapter is not None, 'model must be provided when calling func get_answers() !'
-        assert len(prompts_list) > 0, 'prompts_list must not be empty when calling func get_answers() !'
 
         answers_list = []
         pred_dir: str = self.outputs_structure.get(OutputsStructure.PREDICTIONS_DIR)
evalscope/metrics/bundled_rouge_score/rouge_scorer.py
CHANGED

@@ -31,7 +31,6 @@ In these examples settings.xml lists input files and formats.
 from __future__ import absolute_import, division, print_function
 import collections
 import re
-import os
 
 import nltk
 import numpy as np
@@ -39,24 +38,6 @@ import six
 from absl import logging
 from rouge_score import scoring, tokenizers
 from six.moves import map, range
-from evalscope.utils import get_logger
-
-logger = get_logger()
-
-# Deal with nltk punkt_tab.zip tokenizer file to avoid downloading issue
-try:
-    nltk_dir = os.path.join(os.path.expanduser('~'), 'nltk_data/tokenizers')
-    os.makedirs(nltk_dir, exist_ok=True)
-    punkt_path = os.path.join(nltk_dir, 'punkt_tab.zip')
-    punkt_tab_url = 'https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/open_data/nltk_data/punkt_tab.zip'
-
-    if not os.path.exists(punkt_path):
-        os.system(f'wget -P {nltk_dir} {punkt_tab_url}')
-        os.system(f'unzip {punkt_path} -d {nltk_dir}')
-    else:
-        logger.info(f'{punkt_path} already exists, skipping download')
-except Exception as e:
-    logger.error(f'Try to download punkt_tab.zip for nltk failed: {e}')
 
 
 class RougeScorer(scoring.BaseScorer):
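The removed block fetched nltk's punkt_tab archive with wget/unzip as an import-time side effect; the module now imports without it. The vendored RougeScorer keeps the upstream rouge_score interface, so a minimal scoring sketch against the upstream package (assuming the bundled copy behaves the same) looks like:

from rouge_score import rouge_scorer

# score one prediction against one reference with ROUGE-1 and ROUGE-L
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
scores = scorer.score('the cat sat on the mat',        # reference (target)
                      'a cat was sitting on the mat')  # prediction
print(scores['rougeL'].fmeasure)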
evalscope/run.py
CHANGED

@@ -207,10 +207,6 @@ def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig]]) -> Union[
         from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
         vlm_eval_kit_backend_manager = VLMEvalKitBackendManager(config=eval_config)
         vlm_eval_kit_backend_manager.run()
-    elif eval_backend == EvalBackend.RAG_EVAL.value:
-        from evalscope.backend.rag_eval import RAGEvalBackendManager
-        rag_eval_backend_manager = RAGEvalBackendManager(config=eval_config)
-        rag_eval_backend_manager.run()
     # TODO: Add other evaluation backends
     elif eval_backend == EvalBackend.THIRD_PARTY.value:
         raise NotImplementedError(f'Not implemented for evaluation backend {eval_backend}')
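With the RAG_EVAL branch removed, run_task dispatches only to the remaining backends (Native, OpenCompass, VLMEvalKit, ThirdParty). A rough sketch of selecting the VLMEvalKit backend follows; run_task and the 'VLMEvalKit' value come from this diff and the README, while the eval_config contents are illustrative placeholders:

from evalscope.run import run_task

# illustrative task config: 'eval_backend' picks the branch in run_task and
# 'eval_config' is handed to the chosen backend manager (fields are placeholders)
task_cfg = {
    'eval_backend': 'VLMEvalKit',   # i.e. EvalBackend.VLM_EVAL_KIT.value
    'eval_config': {},              # backend-specific settings: models, datasets, ...
}

run_task(task_cfg=task_cfg)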
evalscope/utils/logger.py
CHANGED

@@ -1,20 +1,18 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-import importlib.util as iutil
 import logging
 from typing import Optional
 
 init_loggers = {}
-format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
-formatter = logging.Formatter(format)
 
-
+formatter = logging.Formatter(
+    '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 
 
-def get_logger(
-
-):
-    """Get logging logger
+def get_logger(log_file: Optional[str] = None,
+               log_level: int = logging.INFO,
+               file_mode: str = 'w'):
+    """ Get logging logger
 
     Args:
         log_file: Log filename, if specified, file handler will be added to
@@ -24,39 +22,21 @@ def get_logger(
         specified (if filemode is unspecified, it defaults to 'w').
     """
 
-    logger_name = __name__.split(
+    logger_name = __name__.split('.')[0]
     logger = logging.getLogger(logger_name)
-
+
     if logger_name in init_loggers:
         add_file_handler_if_needed(logger, log_file, file_mode, log_level)
-        if logger.level != log_level:
-            logger.setLevel(log_level)
         return logger
 
-
-
-
-    # level handler causes logging messages from rank>0 processes to
-    # unexpectedly show up on the console, creating much unwanted clutter.
-    # To fix this issue, we set the root logger's StreamHandler, if any, to log
-    # at the ERROR level.
-    torch_dist = False
-    is_worker0 = True
-    if iutil.find_spec("torch") is not None:
-        from modelscope.utils.torch_utils import is_dist, is_master
-
-        torch_dist = is_dist()
-        is_worker0 = is_master()
-
-    if torch_dist:
-        for handler in logger.root.handlers:
-            if type(handler) is logging.StreamHandler:
-                handler.setLevel(logging.ERROR)
+    for handler in logger.root.handlers:
+        if type(handler) is logging.StreamHandler:
+            handler.setLevel(logging.ERROR)
 
     stream_handler = logging.StreamHandler()
     handlers = [stream_handler]
 
-    if
+    if log_file is not None:
         file_handler = logging.FileHandler(log_file, file_mode)
         handlers.append(file_handler)
 
@@ -65,10 +45,7 @@ def get_logger(
         handler.setLevel(log_level)
         logger.addHandler(handler)
 
-
-        logger.setLevel(log_level)
-    else:
-        logger.setLevel(logging.ERROR)
+    logger.setLevel(log_level)
 
     init_loggers[logger_name] = True
 
@@ -80,14 +57,7 @@ def add_file_handler_if_needed(logger, log_file, file_mode, log_level):
         if isinstance(handler, logging.FileHandler):
             return
 
-    if
-        from modelscope.utils.torch_utils import is_master
-
-        is_worker0 = is_master()
-    else:
-        is_worker0 = True
-
-    if is_worker0 and log_file is not None:
+    if log_file is not None:
         file_handler = logging.FileHandler(log_file, file_mode)
         file_handler.setFormatter(formatter)
         file_handler.setLevel(log_level)
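After this rewrite get_logger no longer consults torch/modelscope distributed state; the whole interface is the three parameters shown in the new signature. A minimal usage sketch:

import logging

from evalscope.utils.logger import get_logger

# console-only logger at the default INFO level
logger = get_logger()
logger.info('evaluation started')

# same package logger, now also writing to a file (file_mode='w' truncates it each run)
logger = get_logger(log_file='eval.log', log_level=logging.DEBUG, file_mode='w')
logger.debug('verbose details also go to eval.log')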
evalscope/utils/task_utils.py
CHANGED

@@ -11,9 +11,6 @@ class EvalBackend(Enum):
 
     # Use VLM Eval Kit as the multi-modal model evaluation backend
    VLM_EVAL_KIT = 'VLMEvalKit'
-
-    # Use RAGEval as the RAG evaluation backend
-    RAG_EVAL = 'RAGEval'
 
     # Use third-party evaluation backend/modules
     THIRD_PARTY = 'ThirdParty'
{evalscope-0.5.5.dist-info → evalscope-0.5.5rc0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.5.5
+Version: 0.5.5rc0
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -24,7 +24,7 @@ Requires-Dist: editdistance
 Requires-Dist: jsonlines
 Requires-Dist: matplotlib
 Requires-Dist: modelscope[framework]
-Requires-Dist: nltk
+Requires-Dist: nltk
 Requires-Dist: openai
 Requires-Dist: pandas
 Requires-Dist: plotly
@@ -34,7 +34,7 @@ Requires-Dist: pyyaml
 Requires-Dist: regex
 Requires-Dist: requests
 Requires-Dist: requests-toolbelt
-Requires-Dist: rouge-score
+Requires-Dist: rouge-score
 Requires-Dist: sacrebleu
 Requires-Dist: scikit-learn
 Requires-Dist: seaborn
@@ -57,7 +57,7 @@ Requires-Dist: editdistance ; extra == 'all'
 Requires-Dist: jsonlines ; extra == 'all'
 Requires-Dist: matplotlib ; extra == 'all'
 Requires-Dist: modelscope[framework] ; extra == 'all'
-Requires-Dist: nltk
+Requires-Dist: nltk ; extra == 'all'
 Requires-Dist: openai ; extra == 'all'
 Requires-Dist: pandas ; extra == 'all'
 Requires-Dist: plotly ; extra == 'all'
@@ -67,7 +67,7 @@ Requires-Dist: pyyaml ; extra == 'all'
 Requires-Dist: regex ; extra == 'all'
 Requires-Dist: requests ; extra == 'all'
 Requires-Dist: requests-toolbelt ; extra == 'all'
-Requires-Dist: rouge-score
+Requires-Dist: rouge-score ; extra == 'all'
 Requires-Dist: sacrebleu ; extra == 'all'
 Requires-Dist: scikit-learn ; extra == 'all'
 Requires-Dist: seaborn ; extra == 'all'
@@ -82,8 +82,6 @@ Requires-Dist: jieba ; extra == 'all'
 Requires-Dist: rouge-chinese ; extra == 'all'
 Requires-Dist: ms-opencompass (>=0.1.1) ; extra == 'all'
 Requires-Dist: ms-vlmeval (>=0.0.5) ; extra == 'all'
-Requires-Dist: ragas ; extra == 'all'
-Requires-Dist: mteb (>=0.14.16) ; extra == 'all'
 Provides-Extra: inner
 Requires-Dist: absl-py ; extra == 'inner'
 Requires-Dist: accelerate ; extra == 'inner'
@@ -112,9 +110,6 @@ Requires-Dist: transformers (<4.43,>=4.33) ; extra == 'inner'
 Requires-Dist: transformers-stream-generator ; extra == 'inner'
 Provides-Extra: opencompass
 Requires-Dist: ms-opencompass (>=0.1.1) ; extra == 'opencompass'
-Provides-Extra: rag
-Requires-Dist: ragas ; extra == 'rag'
-Requires-Dist: mteb (>=0.14.16) ; extra == 'rag'
 Provides-Extra: vlmeval
 Requires-Dist: ms-vlmeval (>=0.0.5) ; extra == 'vlmeval'
 
@@ -149,11 +144,30 @@ English | [简体中文](README_zh.md)
 
 ## 📝 Introduction
 
-
+Large Model (including Large Language Models, Multi-modal Large Language Models) evaluation has become a critical process for assessing and improving LLMs. To better support the evaluation of large models, we propose the EvalScope framework.
+
+### Framework Features
+- **Benchmark Datasets**: Preloaded with several commonly used test benchmarks, including MMLU, CMMLU, C-Eval, GSM8K, ARC, HellaSwag, TruthfulQA, MATH, HumanEval, etc.
+- **Evaluation Metrics**: Implements various commonly used evaluation metrics.
+- **Model Access**: A unified model access mechanism that is compatible with the Generate and Chat interfaces of multiple model families.
+- **Automated Evaluation**: Includes automatic evaluation of objective questions and complex task evaluation using expert models.
+- **Evaluation Reports**: Automatically generates evaluation reports.
+- **Arena Mode**: Used for comparisons between models and objective evaluation of models, supporting various evaluation modes, including:
+  - **Single mode**: Scoring a single model.
+  - **Pairwise-baseline mode**: Comparing against a baseline model.
+  - **Pairwise (all) mode**: Pairwise comparison among all models.
+- **Visualization Tools**: Provides intuitive displays of evaluation results.
+- **Model Performance Evaluation**: Offers a performance testing tool for model inference services and detailed statistics, see [Model Performance Evaluation Documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test.html).
+- **OpenCompass Integration**: Supports OpenCompass as the evaluation backend, providing advanced encapsulation and task simplification, allowing for easier task submission for evaluation.
+- **VLMEvalKit Integration**: Supports VLMEvalKit as the evaluation backend, facilitating the initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
+- **Full-Link Support**: Through seamless integration with the [ms-swift](https://github.com/modelscope/ms-swift) training framework, provides a one-stop development process for model training, model deployment, model evaluation, and report viewing, enhancing user development efficiency.
+
+
+<details><summary>Overall Architecture</summary>
 
 <p align="center">
     <img src="docs/en/_static/images/evalscope_framework.png" width="70%">
-    <br>EvalScope Framework.
+    <br>Fig 1. EvalScope Framework.
 </p>
 
 The architecture includes the following modules:
@@ -163,15 +177,14 @@ The architecture includes the following modules:
 - **Native**: EvalScope’s own **default evaluation framework**, supporting various evaluation modes, including single model evaluation, arena mode, baseline model comparison mode, etc.
 - **OpenCompass**: Supports [OpenCompass](https://github.com/open-compass/opencompass) as the evaluation backend, providing advanced encapsulation and task simplification, allowing you to submit tasks for evaluation more easily.
 - **VLMEvalKit**: Supports [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) as the evaluation backend, enabling easy initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
-- **RAGEval**: Supports RAG evaluation, supporting independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
 - **ThirdParty**: Other third-party evaluation tasks, such as ToolBench.
 4. **Performance Evaluator**: Model performance evaluation, responsible for measuring model inference service performance, including performance testing, stress testing, performance report generation, and visualization.
 5. **Evaluation Report**: The final generated evaluation report summarizes the model's performance, which can be used for decision-making and further model optimization.
 6. **Visualization**: Visualization results help users intuitively understand evaluation results, facilitating analysis and comparison of different model performances.
+</details>
 
 
 ## 🎉 News
-- 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
 - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
 - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
 - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
@@ -342,10 +355,9 @@ run_task(task_cfg=your_task_cfg)
 ## Evaluation Backend
 EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
 - **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
-- [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/
-- [VLMEvalKit](https://github.com/open-compass/VLMEvalKit): Initiate VLMEvalKit multimodal evaluation tasks through EvalScope. Supports various multimodal models and datasets, and offers seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/
-- **
-- **ThirdParty**: Third-party evaluation tasks, such as [ToolBench](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) and [LongBench-Write](https://evalscope.readthedocs.io/en/latest/third_party/longwriter.html).
+- [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/opencompass_backend.html)
+- [VLMEvalKit](https://github.com/open-compass/VLMEvalKit): Initiate VLMEvalKit multimodal evaluation tasks through EvalScope. Supports various multimodal models and datasets, and offers seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/vlmevalkit_backend.html)
+- **ThirdParty**: The third-party task, e.g. [ToolBench](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html), you can contribute your own evaluation task to EvalScope as third-party backend.
 
 ## Custom Dataset Evaluation
 EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset.html)
@@ -374,8 +386,6 @@ Refer to : [Leaderboard](https://modelscope.cn/leaderboard/58/ranking?type=free)
 
 
 ## TO-DO List
-- [x] RAG evaluation
-- [x] VLM evaluation
 - [x] Agents evaluation
 - [x] vLLM
 - [ ] Distributed evaluating
@@ -387,7 +397,3 @@ Refer to : [Leaderboard](https://modelscope.cn/leaderboard/58/ranking?type=free)
 - [ ] Auto-reviewer
 - [ ] Qwen-max
 
-
-## Star History
-
-[](https://star-history.com/#modelscope/evalscope&Date)
{evalscope-0.5.5.dist-info → evalscope-0.5.5rc0.dist-info}/RECORD
CHANGED

@@ -2,12 +2,12 @@ evalscope/__init__.py,sha256=3eLMMrjkAIAs3vGluXNZn5-xTSbO_vfba9yNPbkVtg8,105
 evalscope/cache.py,sha256=zpGjL9JMosqjk_dkODVwvIGiUC0WAMmMTHDNJOvBQU8,3288
 evalscope/config.py,sha256=G_rpSn5Kd1aPlFJO6asnZu5FUggZmwcYdAxxpuq0yDs,6972
 evalscope/constants.py,sha256=g8lGYlpA4Wk88HwtqId1-jJX_z8Lr2k02gWLsyofyj0,2670
-evalscope/run.py,sha256=
+evalscope/run.py,sha256=T-2zoJpBx6YxLnLJH-iFF3UxUGYTU36PMV_DQ9e8tSM,18484
 evalscope/run_arena.py,sha256=BCWCAiX0BQ9pLMIq08svEcd-IoFr75gFShpV88robIY,8963
 evalscope/run_ms.py,sha256=UtJoGnah64SXigTawJQWTi_TEGjr7Td0rjCTaO-htL8,6028
 evalscope/summarizer.py,sha256=rIyML8HpjQxIpXg8KvQ0CzOS6xMS-JHZh6kUZzkaRsk,6640
-evalscope/version.py,sha256=
-evalscope/backend/__init__.py,sha256=
+evalscope/version.py,sha256=bZN6I8gZRA-2x6Q_o4vAmsbfomUm2v86wl1-AhMHnR4,121
+evalscope/backend/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
 evalscope/backend/base.py,sha256=5BLrDNNwxsGp35zorD-kphmN15tlBbkuuqwkz8jWZq0,876
 evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
 evalscope/backend/opencompass/api_meta_template.py,sha256=sBW0XbVDOKeJ7mVUDLhmcG4e0yClw3eluazdp_8wtgQ,1753
@@ -15,25 +15,9 @@ evalscope/backend/opencompass/backend_manager.py,sha256=_eg82FLAVxQ6t5e1OqlyuxZc
 evalscope/backend/opencompass/tasks/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/backend/opencompass/tasks/eval_api.py,sha256=12lrgDpMzZ1XBRboq5TEOovDPCMDwwGCJoRT78Ox_yo,1108
 evalscope/backend/opencompass/tasks/eval_datasets.py,sha256=bYFHkjiwZqh2FVRo1I88xEDZ6nYmZjAgG5ZODbthKFI,5241
-evalscope/backend/rag_eval/__init__.py,sha256=yRCcfxhzC7wIYbgb-w76i4D9v8wXI7JmYNM6IZUn064,199
-evalscope/backend/rag_eval/backend_manager.py,sha256=tWkFzYO9LQjaI7paD5yz1c-HtNJUbnAr0a-4biYSZvg,2562
-evalscope/backend/rag_eval/cmteb/__init__.py,sha256=ajVz6XP5hqPq-jm66hp2poA2qKj1V19ZGoqjrGUlO7U,279
-evalscope/backend/rag_eval/cmteb/arguments.py,sha256=698UvPVZp5Ncq_p25P_67SQkYaW2tLSCHenUOZ0n5OI,2217
-evalscope/backend/rag_eval/cmteb/base.py,sha256=sJqTRCej7vk5ASirk21hOobX1_Hz7BO1LIHJFOGLuE4,2731
-evalscope/backend/rag_eval/cmteb/task_template.py,sha256=HreVxyRiF2QUe4Dy9_zKNp1WU74342RWHV5_B8ycXG0,2537
-evalscope/backend/rag_eval/cmteb/tasks/Classification.py,sha256=7adR40W6Uu58-QR9jCUP4k7TdAnG0oT225v4xHXah2g,10635
-evalscope/backend/rag_eval/cmteb/tasks/Clustering.py,sha256=7j1Hts_r4Nv8DlbIiPFMaU1JDxCYgu0wO0JI8T_Y6X8,8969
-evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py,sha256=2WkaTE-jF8jqsu1UcNDqN8A4567UzW5boD_0B83j-9A,4008
-evalscope/backend/rag_eval/cmteb/tasks/Reranking.py,sha256=50h-lXaRcb5s6ZpIgnfk5mU7iZur8ZDxwsaFbrqSZ_o,5462
-evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py,sha256=wUxiQH5aOmWNS4YswACyHqBn5xqP5eyvsq6U9WSp5R0,11457
-evalscope/backend/rag_eval/cmteb/tasks/STS.py,sha256=6GMaoCANM-IKYLk4srHOYr_eurav3DGihHMQeJPXR6k,12054
-evalscope/backend/rag_eval/cmteb/tasks/__init__.py,sha256=FrtoBosHq9iRp3yfZEAxWa5NkYhHtA20NmHDG6eiPNU,1421
-evalscope/backend/rag_eval/ragas/__init__.py,sha256=tHB7XGREmcrx8ulF-JZWWoHsEbn2s-PFyWFdGzOZQcw,190
-evalscope/backend/rag_eval/ragas/arguments.py,sha256=plVc2_3auVG5z91ExzBdkbNIhMVjyi_xQYbEzlV0iNw,1208
-evalscope/backend/rag_eval/ragas/task_template.py,sha256=795NHXzGdeqa15ONV1AgDZywpMHucaIlvk_EBF0CK98,3868
 evalscope/backend/vlm_eval_kit/__init__.py,sha256=xTgHM95lWzh4s0W7zxLwYkgUbPAZfAb0UoGGmyyBXrs,83
-evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=
-evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=
+evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=k52qTUqkp1kJivKn8bVrKoF8cng4xYTQLUmjnH_CWPM,6080
+evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=zC40Jw9bIqcGKuWS9oKPAlQdBARc-zY3sJlSiU-u-sI,1625
 evalscope/benchmarks/__init__.py,sha256=6TKP35wfKf7R_h870fsEtcIlIAgomKOcukNL9M-5I1Y,162
 evalscope/benchmarks/benchmark.py,sha256=EmwYyFdrAHBGMkSbsMZQOR_62Q0CSKl8zeLlr7xvJdQ,2159
 evalscope/benchmarks/data_adapter.py,sha256=eVQvOQYQOQbIl8UlvOEUqRThL3FP3aUD6DSlqF1bqO0,10395
@@ -107,7 +91,7 @@ evalscope/cli/cli.py,sha256=uZ-qC8WBsLd5-Hn94d43sSGg0UC_12RebSD4ToKjypg,844
 evalscope/cli/start_perf.py,sha256=TL6bMXYl3ln-tfs5uBmzb9x94uxz6f3PBFIt1l7g3VA,994
 evalscope/cli/start_server.py,sha256=ATGLP2TE0aImJNicpehdzBuFlNb50F7KhyL4A_ZSoGU,3885
 evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
-evalscope/evaluator/evaluator.py,sha256=
+evalscope/evaluator/evaluator.py,sha256=gB408byOpu269Psh6MjYC9-a_uv9GvThoT7t07Oqh6w,30712
 evalscope/evaluator/rating_eval.py,sha256=cJbkyXIuwFUZoe7ZJZM6eUskNd9zlORgndckuon2OQ8,5768
 evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/evaluator/reviewer/auto_reviewer.py,sha256=JycPYti9h1j_8DRcu_rc5U0wkEASHYg-XBqrUUoiO-Q,17054
@@ -117,7 +101,7 @@ evalscope/metrics/math_accuracy.py,sha256=1PCy1VUNYg48JcGy-6SUmUDZNwPeAkMW1QQ_lX
 evalscope/metrics/metrics.py,sha256=sDZljGiZwgHsFZ5eNi65-3z3BLCdIwWUzPcq2QpKf1k,12545
 evalscope/metrics/rouge_metric.py,sha256=sN0r-sXXc-nJUdFrthQPAv1VFdOCrF6zzIYDKaLSgrU,4522
 evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
-evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=
+evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=xSLis-zx1hnHuj_9JI7HuUKMS4ZQsX-D8wECZg4D2bg,11450
 evalscope/models/__init__.py,sha256=zG27J2HSeKPGiAIUE7QLPHEPLyXLsfaDwYI_TDXjpCg,145
 evalscope/models/dummy_chat_model.py,sha256=xE8wcFVSCkvizEJ-B8ojX0Ir01Q5KrN5mapjMQaQtbg,1325
 evalscope/models/model.py,sha256=ZzzVzZHVzuzdt5F1r-rEBT44ZfW9B7R1spsrV-T8nSw,3020
@@ -185,12 +169,12 @@ evalscope/tools/rewrite_eval_results.py,sha256=ZVi2hVjiTOmR_O5IaLv6qnQNpMz6FnDb9
 evalscope/utils/__init__.py,sha256=6RjACRYUSpGj6fkZ7NzYpl0lFppQCp9KVn5ktZe626s,128
 evalscope/utils/arena_utils.py,sha256=RMkymUv9Cxs37arUntzgDY5P0Dand2jGpsb7uy6wZmg,7670
 evalscope/utils/completion_parsers.py,sha256=61l8CTh1VxHgRoMDhtznpAhuJp47MssGgS-LdEe_h80,2997
-evalscope/utils/logger.py,sha256=
+evalscope/utils/logger.py,sha256=Ycd0W17Z_oiByPuPX3_umNrOCHjT9O_e_Kws7ZWUSvU,1855
 evalscope/utils/task_cfg_parser.py,sha256=LiNQ2X8lbZU0cODpaY_PbKyUhNoxZIC495UsLJigX64,138
-evalscope/utils/task_utils.py,sha256=
+evalscope/utils/task_utils.py,sha256=Mv_u_f4Z91zcUeko6acZCmnOAPRfk61kf_dliLzG5Yk,459
 evalscope/utils/utils.py,sha256=zHo9hfxGBUVKE2xNMR7lDoEvfRnk4V4946DEfXQhlq4,20509
-evalscope-0.5.
-evalscope-0.5.
-evalscope-0.5.
-evalscope-0.5.
-evalscope-0.5.
+evalscope-0.5.5rc0.dist-info/METADATA,sha256=DZDMgIPMwCBputs0_kU74reMXftr7Y9ERi2dCmC29Gs,20708
+evalscope-0.5.5rc0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
+evalscope-0.5.5rc0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
+evalscope-0.5.5rc0.dist-info/top_level.txt,sha256=jNR-HMn3TR8Atolq7_4rW8IWVX6GhvYV5_1Y_KbJKlY,10
+evalscope-0.5.5rc0.dist-info/RECORD,,
evalscope/backend/rag_eval/backend_manager.py
DELETED

@@ -1,68 +0,0 @@
-import os
-from typing import Optional, Union
-from evalscope.utils import is_module_installed, get_valid_list
-from evalscope.backend.base import BackendManager
-from evalscope.utils.logger import get_logger
-
-
-logger = get_logger()
-
-
-class RAGEvalBackendManager(BackendManager):
-    def __init__(self, config: Union[str, dict], **kwargs):
-        """BackendManager for VLM Evaluation Kit
-
-        Args:
-            config (Union[str, dict]): the configuration yaml-file or the configuration dictionary
-        """
-        super().__init__(config, **kwargs)
-
-    @staticmethod
-    def _check_env(module_name: str):
-        if is_module_installed(module_name):
-            logger.info(f"Check `{module_name}` Installed")
-        else:
-            logger.error(f"Please install `{module_name}` first")
-
-    def run_mteb(self):
-        from evalscope.backend.rag_eval.cmteb import ModelArguments, EvalArguments
-        from evalscope.backend.rag_eval.cmteb import one_stage_eval, two_stage_eval
-
-        if len(self.model_args) > 2:
-            raise ValueError("Not support multiple models yet")
-
-        # Convert arguments to dictionary
-        model_args_list = [ModelArguments(**args).to_dict() for args in self.model_args]
-        eval_args = EvalArguments(**self.eval_args).to_dict()
-
-        if len(model_args_list) == 1:
-            one_stage_eval(model_args_list[0], eval_args)
-        else:  # len(model_args_list) == 2
-            two_stage_eval(model_args_list[0], model_args_list[1], eval_args)
-
-    def run_ragas(self):
-        from evalscope.backend.rag_eval.ragas import rag_eval, testset_generation
-        from evalscope.backend.rag_eval.ragas import (
-            TestsetGenerationArguments,
-            EvaluationArguments,
-        )
-
-        if self.testset_args is not None:
-            testset_generation(TestsetGenerationArguments(**self.testset_args))
-        if self.eval_args is not None:
-            rag_eval(EvaluationArguments(**self.eval_args))
-
-    def run(self, *args, **kwargs):
-        tool = self.config_d.pop("tool")
-        if tool.lower() == "mteb":
-            self._check_env("mteb")
-            self.model_args = self.config_d["model"]
-            self.eval_args = self.config_d["eval"]
-            self.run_mteb()
-        elif tool.lower() == "ragas":
-            self._check_env("ragas")
-            self.testset_args = self.config_d.get("testset_generation", None)
-            self.eval_args = self.config_d.get("eval", None)
-            self.run_ragas()
-        else:
-            raise ValueError(f"Unknown tool: {tool}")
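For reference, the deleted manager's run() expected a config with a 'tool' key plus either 'model'/'eval' sections (MTEB/CMTEB) or 'testset_generation'/'eval' sections (RAGAS). A hedged sketch of that shape, reconstructed only from the removed code above; every value is a placeholder:

# shape of the config the removed RAGEvalBackendManager consumed (values are placeholders)
mteb_config = {
    'tool': 'MTEB',
    'model': [
        {'model_name_or_path': 'path/or/id-of-embedding-model'},  # ModelArguments fields
    ],
    'eval': {
        'tasks': ['SomeCMTEBTask'],   # EvalArguments fields
        'output_folder': 'outputs',
    },
}

ragas_config = {
    'tool': 'RAGAS',
    'testset_generation': {},  # TestsetGenerationArguments fields
    'eval': {},                # EvaluationArguments fields
}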
evalscope/backend/rag_eval/cmteb/__init__.py
DELETED

@@ -1,4 +0,0 @@
-from evalscope.backend.rag_eval.cmteb.tasks import *
-from evalscope.backend.rag_eval.cmteb.base import *
-from evalscope.backend.rag_eval.cmteb.arguments import ModelArguments, EvalArguments
-from evalscope.backend.rag_eval.cmteb.task_template import one_stage_eval, two_stage_eval
evalscope/backend/rag_eval/cmteb/arguments.py
DELETED

@@ -1,59 +0,0 @@
-from dataclasses import dataclass, field
-from typing import List, Optional, Union, Dict, Any
-
-
-@dataclass
-class ModelArguments:
-    # Arguments for embeding model: sentence transformer or cross encoder
-    model_name_or_path: str = ""  # model name or path
-    is_cross_encoder: bool = False  # whether the model is a cross encoder
-    # pooling mode: Either “cls”, “lasttoken”, “max”, “mean”, “mean_sqrt_len_tokens”, or “weightedmean”.
-    pooling_mode: Optional[str] = None
-    max_seq_length: int = 512  # max sequence length
-    # prompt for llm based model
-    prompt: str = ""
-    # model kwargs
-    model_kwargs: dict = field(default_factory=lambda: {"torch_dtype": "auto"})
-    # config kwargs
-    config_kwargs: Dict[str, Any] = field(default_factory=dict)
-    # encode kwargs
-    encode_kwargs: dict = field(
-        default_factory=lambda: {"show_progress_bar": True, "batch_size": 32}
-    )
-    hub: str = "modelscope"  # modelscope or huggingface
-
-    def to_dict(self) -> Dict[str, Any]:
-        return {
-            "model_name_or_path": self.model_name_or_path,
-            "is_cross_encoder": self.is_cross_encoder,
-            "pooling_mode": self.pooling_mode,
-            "max_seq_length": self.max_seq_length,
-            "prompt": self.prompt,
-            "model_kwargs": self.model_kwargs,
-            "config_kwargs": self.config_kwargs,
-            "encode_kwargs": self.encode_kwargs,
-            "hub": self.hub,
-        }
-
-
-@dataclass
-class EvalArguments:
-    # Evaluation
-    tasks: List[str] = field(default_factory=list)  # task names
-    verbosity: int = 2  # verbosity level 0-3
-    output_folder: str = "outputs"  # output folder
-    overwrite_results: bool = True  # overwrite results
-    limits: Optional[int] = None  # limit number of samples
-    hub: str = "modelscope"  # modelscope or huggingface
-    top_k: int = 5
-
-    def to_dict(self) -> Dict[str, Any]:
-        return {
-            "tasks": self.tasks,
-            "verbosity": self.verbosity,
-            "output_folder": self.output_folder,
-            "overwrite_results": self.overwrite_results,
-            "limits": self.limits,
-            "hub": self.hub,
-            "top_k": 5,
-        }