evalscope 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/backend/opencompass/backend_manager.py +1 -3
- evalscope/backend/opencompass/tasks/eval_api.py +1 -0
- evalscope/backend/vlm_eval_kit/backend_manager.py +3 -5
- evalscope/utils/task_utils.py +1 -1
- evalscope/version.py +2 -2
- {evalscope-0.5.0.dist-info → evalscope-0.5.2.dist-info}/METADATA +42 -30
- {evalscope-0.5.0.dist-info → evalscope-0.5.2.dist-info}/RECORD +10 -10
- {evalscope-0.5.0.dist-info → evalscope-0.5.2.dist-info}/entry_points.txt +0 -1
- {evalscope-0.5.0.dist-info → evalscope-0.5.2.dist-info}/WHEEL +0 -0
- {evalscope-0.5.0.dist-info → evalscope-0.5.2.dist-info}/top_level.txt +0 -0
evalscope/backend/opencompass/backend_manager.py
CHANGED

@@ -76,9 +76,7 @@ class OpenCompassBackendManager(BackendManager):
     @staticmethod
     def _check_env():
         if is_module_installed('opencompass'):
-            logger.info('
-        else:
-            raise ModuleNotFoundError('Please install the `ms-opencompass` first: `pip install ms-opencompass`')
+            logger.info('Check the OpenCompass environment: OK')
 
     @staticmethod
     def get_restore_arg(arg_name: str, arg_val: bool):
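The guard pattern above, `is_module_installed` plus an actionable `ModuleNotFoundError`, is easy to reproduce with the standard library. A minimal sketch, assuming the helper is a thin wrapper over `importlib.util.find_spec` (the helper body is an assumption, not the package's actual code):

```python
import importlib.util


def is_module_installed(module_name: str) -> bool:
    # Resolve the module against sys.path without importing it;
    # find_spec returns None when the module cannot be found.
    return importlib.util.find_spec(module_name) is not None


if not is_module_installed('opencompass'):
    # The fail-fast branch that 0.5.2 drops from _check_env.
    raise ModuleNotFoundError('Please install the `ms-opencompass` first: `pip install ms-opencompass`')
```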
evalscope/backend/vlm_eval_kit/backend_manager.py
CHANGED

@@ -31,7 +31,7 @@ class VLMEvalKitBackendManager(BackendManager):
         from vlmeval.utils.arguments import Arguments as VLMEvalArguments
         self.args = VLMEvalArguments(**self.config_d)
 
-        self.valid_models = self.
+        self.valid_models = self.list_supported_models()
         self.valid_model_names = list(self.valid_models.keys())
         self.valid_datasets = self.list_supported_datasets()
 
@@ -86,7 +86,7 @@ class VLMEvalKitBackendManager(BackendManager):
         return self.get_cmd()
 
     @staticmethod
-    def
+    def list_supported_models():
         from vlmeval.config import supported_VLM
         return supported_VLM
 
@@ -98,9 +98,7 @@ class VLMEvalKitBackendManager(BackendManager):
     @staticmethod
     def _check_env():
         if is_module_installed('vlmeval'):
-            logger.info('
-        else:
-            raise ModuleNotFoundError('Please install the `ms-vlmeval` first: `pip install ms-vlmeval`')
+            logger.info('Check VLM Evaluation Kit: Installed')
 
     @staticmethod
     def get_restore_arg(arg_name: str, arg_val: bool):
evalscope/utils/task_utils.py
CHANGED
evalscope/version.py
CHANGED
{evalscope-0.5.0.dist-info → evalscope-0.5.2.dist-info}/METADATA
CHANGED

@@ -1,13 +1,11 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.5.0
-Summary:
-Home-page: https://github.com/modelscope/
+Version: 0.5.2
+Summary: EvalScope: Lightweight LLMs Evaluation Framework
+Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
 Author-email: contact@modelscope.cn
-License: UNKNOWN
 Keywords: python,llm,evaluation
-Platform: UNKNOWN
 Classifier: Development Status :: 4 - Beta
 Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Operating System :: OS Independent
@@ -80,8 +78,8 @@ Requires-Dist: transformers (<4.43,>=4.33) ; extra == 'all'
 Requires-Dist: transformers-stream-generator ; extra == 'all'
 Requires-Dist: jieba ; extra == 'all'
 Requires-Dist: rouge-chinese ; extra == 'all'
-Requires-Dist: ms-opencompass ; extra == 'all'
-Requires-Dist: ms-vlmeval ; extra == 'all'
+Requires-Dist: ms-opencompass (>=0.0.5) ; extra == 'all'
+Requires-Dist: ms-vlmeval (>=0.0.5) ; extra == 'all'
 Provides-Extra: inner
 Requires-Dist: absl-py ; extra == 'inner'
 Requires-Dist: accelerate ; extra == 'inner'
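The only substantive change in this hunk is the new `>=0.0.5` floor on the two backend wrappers. A quick runtime check against that pin, using only the standard library (illustrative; a real resolver would use `packaging.version` instead of naive tuple comparison):

```python
from importlib.metadata import PackageNotFoundError, version

# 0.5.2 pins both backend wrappers to >=0.0.5 in its METADATA.
for pkg in ('ms-opencompass', 'ms-vlmeval'):
    try:
        installed = version(pkg)
    except PackageNotFoundError:
        print(f'{pkg}: not installed')
        continue
    # Naive comparison of the first three numeric components.
    ok = tuple(int(x) for x in installed.split('.')[:3]) >= (0, 0, 5)
    print(f'{pkg}=={installed} satisfies >=0.0.5: {ok}')
```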
@@ -109,16 +107,16 @@ Requires-Dist: tqdm ; extra == 'inner'
 Requires-Dist: transformers (<4.43,>=4.33) ; extra == 'inner'
 Requires-Dist: transformers-stream-generator ; extra == 'inner'
 Provides-Extra: opencompass
-Requires-Dist: ms-opencompass ; extra == 'opencompass'
+Requires-Dist: ms-opencompass (>=0.0.5) ; extra == 'opencompass'
 Provides-Extra: vlmeval
-Requires-Dist: ms-vlmeval ; extra == 'vlmeval'
+Requires-Dist: ms-vlmeval (>=0.0.5) ; extra == 'vlmeval'
 
 English | [简体中文](README_zh.md)
 
 <p align="center">
     <a href="https://pypi.org/project/evalscope"><img alt="PyPI - Downloads" src="https://img.shields.io/pypi/dm/evalscope">
     </a>
-    <a href="https://github.com/modelscope/
+    <a href="https://github.com/modelscope/evalscope/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>
 <p>
 
 ## 📖 Table of Content
@@ -133,7 +131,7 @@ English | [简体中文](README_zh.md)
 
 ## 📝 Introduction
 
-Large Language Model (LLMs) evaluation has become a critical process for assessing and improving LLMs. To better support the evaluation of large models, we propose the
+Large Language Model (LLMs) evaluation has become a critical process for assessing and improving LLMs. To better support the evaluation of large models, we propose the EvalScope framework, which includes the following components and features:
 
 - Pre-configured common benchmark datasets, including: MMLU, CMMLU, C-Eval, GSM8K, ARC, HellaSwag, TruthfulQA, MATH, HumanEval, etc.
 - Implementation of common evaluation metrics
@@ -146,7 +144,7 @@ Large Language Model (LLMs) evaluation has become a critical process for assessi
 - Visualization tools
 - Model Inference Performance Evaluation [Tutorial](evalscope/perf/README.md)
 - Support for OpenCompass as an Evaluation Backend, featuring advanced encapsulation and task simplification to easily submit tasks to OpenCompass for evaluation.
-- Supports VLMEvalKit as the evaluation backend. It initiates VLMEvalKit's multimodal evaluation tasks through
+- Supports VLMEvalKit as the evaluation backend. It initiates VLMEvalKit's multimodal evaluation tasks through EvalScope, supporting various multimodal models and datasets.
 - Full pipeline support: Seamlessly integrate with SWIFT to easily train and deploy model services, initiate evaluation tasks, view evaluation reports, and achieve an end-to-end large model development process.
 
 
@@ -167,33 +165,48 @@ Large Language Model (LLMs) evaluation has become a critical process for assessi
 - **[2024.07.31]** Breaking change: The sdk name has been changed from `llmuses` to `evalscope`, please update the sdk name in your code.
 - **[2024.07.26]** Supports **VLMEvalKit** as a third-party evaluation framework, initiating multimodal model evaluation tasks. [User Guide](#vlmevalkit-evaluation-backend) 🔥🔥🔥
 - **[2024.06.29]** Supports **OpenCompass** as a third-party evaluation framework. We have provided a high-level wrapper, supporting installation via pip and simplifying the evaluation task configuration. [User Guide](#opencompass-evaluation-backend) 🔥🔥🔥
-- **[2024.06.13]**
+- **[2024.06.13]** EvalScope has been updated to version 0.3.x, which supports the ModelScope SWIFT framework for LLMs evaluation. 🚀🚀🚀
 - **[2024.06.13]** We have supported the ToolBench as a third-party evaluation backend for Agents evaluation. 🚀🚀🚀
 
 
 
 ## 🛠️ Installation
 ### Install with pip
-1. create conda environment
+1. create conda environment [Optional]
 ```shell
-conda create -n
-conda activate
+conda create -n evalscope python=3.10
+conda activate evalscope
 ```
 
-2. Install
+2. Install EvalScope
 ```shell
-pip install evalscope
+pip install evalscope # Installation with Native backend (by default)
+
+pip install evalscope[opencompass] # Installation with OpenCompass backend
+pip install evalscope[vlmeval] # Installation with VLMEvalKit backend
+pip install evalscope[all] # Installation with all backends (Native, OpenCompass, VLMEvalKit)
 ```
 
+DEPRECATION WARNING: For 0.4.3 or older versions, please use the following command to install:
+```shell
+pip install llmuses<=0.4.3
+
+# Usage:
+from llmuses.run import run_task
+...
+
+```
+
+
 ### Install from source code
 1. Download source code
 ```shell
-git clone https://github.com/modelscope/
+git clone https://github.com/modelscope/evalscope.git
 ```
 
 2. Install dependencies
 ```shell
-cd
+cd evalscope/
 pip install -e .
 ```
 
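The deprecation note boils down to one changed import root: `llmuses` became `evalscope` in 0.5.0. A minimal before/after sketch (the task-config dict is hypothetical, shown only to illustrate the call; see the sections below for the real configuration options):

```python
# Before (<=0.4.3): from llmuses.run import run_task
from evalscope.run import run_task  # >=0.5.0, after the rename

# Hypothetical task configuration; the supported keys are defined by
# EvalScope's documentation, not by this sketch.
task_cfg = {
    'model': 'qwen/Qwen-7B-Chat',
    'datasets': ['gsm8k', 'arc'],
}
run_task(task_cfg=task_cfg)
```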
@@ -237,15 +250,15 @@ print(TemplateType.get_template_name_list())
 ```
 
 ### Evaluation Backend
-
-- **Native**:
-- [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through
-- [VLMEvalKit](https://github.com/open-compass/VLMEvalKit): Initiate VLMEvalKit multimodal evaluation tasks through
-- **ThirdParty**: The third-party task, e.g. [ToolBench](evalscope/thirdparty/toolbench/README.md), you can contribute your own evaluation task to
+EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
+- **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
+- [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework [ModelScope Swift](https://github.com/modelscope/swift).
+- [VLMEvalKit](https://github.com/open-compass/VLMEvalKit): Initiate VLMEvalKit multimodal evaluation tasks through EvalScope. Supports various multimodal models and datasets, and offers seamless integration with the LLM fine-tuning framework [ModelScope Swift](https://github.com/modelscope/swift).
+- **ThirdParty**: The third-party task, e.g. [ToolBench](evalscope/thirdparty/toolbench/README.md), you can contribute your own evaluation task to EvalScope as third-party backend.
 
 #### OpenCompass Eval-Backend
 
-To facilitate the use of the OpenCompass evaluation backend, we have customized the OpenCompass source code and named it `ms-opencompass`. This version includes optimizations for evaluation task configuration and execution based on the original version, and it supports installation via PyPI. This allows users to initiate lightweight OpenCompass evaluation tasks through
+To facilitate the use of the OpenCompass evaluation backend, we have customized the OpenCompass source code and named it `ms-opencompass`. This version includes optimizations for evaluation task configuration and execution based on the original version, and it supports installation via PyPI. This allows users to initiate lightweight OpenCompass evaluation tasks through EvalScope. Additionally, we have initially opened up API-based evaluation tasks in the OpenAI API format. You can deploy model services using [ModelScope Swift](https://github.com/modelscope/swift), where [swift deploy](https://swift.readthedocs.io/en/latest/LLM/VLLM-inference-acceleration-and-deployment.html) supports using vLLM to launch model inference services.
 
 
 ##### Installation
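Reading the backend description together with the `swift deploy` note, an OpenCompass-backend task driven from EvalScope would look roughly like this. The `eval_backend`/`eval_config` keys, the model entry, and the endpoint URL are assumptions inferred from the description above, not a verified schema; `examples/example_eval_swift_openai_api.py` in the repository is the authoritative reference:

```python
from evalscope.run import run_task

# Assumed configuration shape for the OpenCompass Evaluation Backend.
# The endpoint stands in for a model served via `swift deploy` (vLLM).
task_cfg = {
    'eval_backend': 'OpenCompass',
    'eval_config': {
        'datasets': ['gsm8k'],
        'models': [{
            'path': 'qwen-7b-chat',
            'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions',
        }],
    },
}
run_task(task_cfg=task_cfg)
```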
@@ -301,7 +314,7 @@ python examples/example_eval_swift_openai_api.py
 
 #### VLMEvalKit Evaluation Backend
 
-To facilitate the use of the VLMEvalKit evaluation backend, we have customized the VLMEvalKit source code and named it `ms-vlmeval`. This version encapsulates the configuration and execution of evaluation tasks based on the original version and supports installation via PyPI, allowing users to initiate lightweight VLMEvalKit evaluation tasks through
+To facilitate the use of the VLMEvalKit evaluation backend, we have customized the VLMEvalKit source code and named it `ms-vlmeval`. This version encapsulates the configuration and execution of evaluation tasks based on the original version and supports installation via PyPI, allowing users to initiate lightweight VLMEvalKit evaluation tasks through EvalScope. Additionally, we support API-based evaluation tasks in the OpenAI API format. You can deploy multimodal model services using ModelScope [swift](https://github.com/modelscope/swift).
 
 ##### Installation
 ```shell
@@ -319,7 +332,8 @@ For detailed information about the datasets, please refer to [VLMEvalKit Support
 You can use the following to view the list of dataset names:
 ```python
 from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
-print(f'** All models from VLMEvalKit backend: {VLMEvalKitBackendManager.
+print(f'** All models from VLMEvalKit backend: {VLMEvalKitBackendManager.list_supported_models().keys()}')
+
 ```
 If the dataset file does not exist locally when loading the dataset, it will be automatically downloaded to the `~/LMUData/` directory.
 
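Note the snippet prints models although the surrounding text promises dataset names. The class context earlier in this diff also shows a `list_supported_datasets` method; assuming it is exposed as a staticmethod like `list_supported_models`, the dataset listing would be the symmetric call:

```python
from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager

# Assumes list_supported_datasets is callable without an instance,
# mirroring the list_supported_models staticmethod renamed in this diff.
print(f'** All datasets from VLMEvalKit backend: {VLMEvalKitBackendManager.list_supported_datasets()}')
```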
@@ -562,5 +576,3 @@ The LLM Leaderboard aims to provide an objective and comprehensive evaluation st
 - [ ] Auto-reviewer
 - [ ] Qwen-max
 
-
-
{evalscope-0.5.0.dist-info → evalscope-0.5.2.dist-info}/RECORD
CHANGED

@@ -6,17 +6,17 @@ evalscope/run.py,sha256=T-2zoJpBx6YxLnLJH-iFF3UxUGYTU36PMV_DQ9e8tSM,18484
 evalscope/run_arena.py,sha256=_LL8fqeKUEMUg985TENYzcnH5_Q8sqPxM68eZk-jhLA,8793
 evalscope/run_ms.py,sha256=UtJoGnah64SXigTawJQWTi_TEGjr7Td0rjCTaO-htL8,6028
 evalscope/summarizer.py,sha256=Ie1kwPETpz3x2yROLMGqC0UwEj6OKJuKwEcUqxUx5fM,6358
-evalscope/version.py,sha256=
+evalscope/version.py,sha256=Bo14bi3CEm4GSQOqlmyUKrRQLg4TS8hCNrE-bnYDI28,118
 evalscope/backend/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
 evalscope/backend/base.py,sha256=5BLrDNNwxsGp35zorD-kphmN15tlBbkuuqwkz8jWZq0,876
 evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
 evalscope/backend/opencompass/api_meta_template.py,sha256=sBW0XbVDOKeJ7mVUDLhmcG4e0yClw3eluazdp_8wtgQ,1753
-evalscope/backend/opencompass/backend_manager.py,sha256=
+evalscope/backend/opencompass/backend_manager.py,sha256=Rr8eFFDUXTxI8AMcrbFW9LZuSQVZ7tsgHcZ1veNhfWM,10190
 evalscope/backend/opencompass/tasks/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-evalscope/backend/opencompass/tasks/eval_api.py,sha256=
+evalscope/backend/opencompass/tasks/eval_api.py,sha256=12lrgDpMzZ1XBRboq5TEOovDPCMDwwGCJoRT78Ox_yo,1108
 evalscope/backend/opencompass/tasks/eval_datasets.py,sha256=DWwKcQGGSkkh65H1d-oKN8Jow0Q0cHJJzDC75inycFM,5186
 evalscope/backend/vlm_eval_kit/__init__.py,sha256=xTgHM95lWzh4s0W7zxLwYkgUbPAZfAb0UoGGmyyBXrs,83
-evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=
+evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=PQ9n2jdfPj7s5Ma6_5nNuOMM4La9JBdxKbLf4Oa17NI,6055
 evalscope/benchmarks/__init__.py,sha256=6TKP35wfKf7R_h870fsEtcIlIAgomKOcukNL9M-5I1Y,162
 evalscope/benchmarks/benchmark.py,sha256=e7rA8Y_vo6q5BhlUbZGWfZ1-SfJnU2IFRg62pnjQtDk,2157
 evalscope/benchmarks/data_adapter.py,sha256=eVQvOQYQOQbIl8UlvOEUqRThL3FP3aUD6DSlqF1bqO0,10395
@@ -156,10 +156,10 @@ evalscope/utils/arena_utils.py,sha256=RMkymUv9Cxs37arUntzgDY5P0Dand2jGpsb7uy6wZm
 evalscope/utils/completion_parsers.py,sha256=61l8CTh1VxHgRoMDhtznpAhuJp47MssGgS-LdEe_h80,2997
 evalscope/utils/logger.py,sha256=Ycd0W17Z_oiByPuPX3_umNrOCHjT9O_e_Kws7ZWUSvU,1855
 evalscope/utils/task_cfg_parser.py,sha256=LiNQ2X8lbZU0cODpaY_PbKyUhNoxZIC495UsLJigX64,138
-evalscope/utils/task_utils.py,sha256=
+evalscope/utils/task_utils.py,sha256=Mv_u_f4Z91zcUeko6acZCmnOAPRfk61kf_dliLzG5Yk,459
 evalscope/utils/utils.py,sha256=zHo9hfxGBUVKE2xNMR7lDoEvfRnk4V4946DEfXQhlq4,20509
-evalscope-0.5.
-evalscope-0.5.
-evalscope-0.5.
-evalscope-0.5.
-evalscope-0.5.
+evalscope-0.5.2.dist-info/METADATA,sha256=F0YWg7gyenErvz-Kq1X5Z2Ngr1TYh3H-KpCX5zBLnog,27866
+evalscope-0.5.2.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
+evalscope-0.5.2.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
+evalscope-0.5.2.dist-info/top_level.txt,sha256=jNR-HMn3TR8Atolq7_4rW8IWVX6GhvYV5_1Y_KbJKlY,10
+evalscope-0.5.2.dist-info/RECORD,,
{evalscope-0.5.0.dist-info → evalscope-0.5.2.dist-info}/WHEEL
File without changes

{evalscope-0.5.0.dist-info → evalscope-0.5.2.dist-info}/top_level.txt
File without changes
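The `sha256=` fields in the RECORD hunks follow the wheel RECORD format: an unpadded urlsafe-base64 SHA-256 digest followed by the file size in bytes. A small sketch for recomputing an entry, e.g. to verify an installed file against the rows shown above:

```python
import base64
import hashlib


def record_entry(path: str) -> str:
    # RECORD rows are "<path>,sha256=<urlsafe b64 digest, unpadded>,<size>".
    with open(path, 'rb') as f:
        data = f.read()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b'=')
    return f'{path},sha256={digest.decode()},{len(data)}'


# Should reproduce e.g. the evalscope/version.py row:
# evalscope/version.py,sha256=Bo14bi3CEm4GSQOqlmyUKrRQLg4TS8hCNrE-bnYDI28,118
print(record_entry('evalscope/version.py'))
```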