evalscope 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of evalscope might be problematic.

@@ -76,9 +76,7 @@ class OpenCompassBackendManager(BackendManager):
     @staticmethod
     def _check_env():
         if is_module_installed('opencompass'):
-            logger.info('Please make sure you have installed the `ms-opencompass`: `pip install ms-opencompass`')
-        else:
-            raise ModuleNotFoundError('Please install the `ms-opencompass` first: `pip install ms-opencompass`')
+            logger.info('Check the OpenCompass environment: OK')
 
     @staticmethod
     def get_restore_arg(arg_name: str, arg_val: bool):
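Both `_check_env` implementations branch on `is_module_installed`, a helper whose body is not part of this diff. For orientation only, a check with this contract is typically a thin wrapper around importlib; the sketch below is an assumption, not the code shipped in the wheel:

```python
import importlib.util

def is_module_installed(module_name: str) -> bool:
    # True if the module can be resolved on the current Python path,
    # e.g. 'opencompass' after `pip install ms-opencompass`.
    return importlib.util.find_spec(module_name) is not None
```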
@@ -6,6 +6,7 @@ from opencompass.tasks import OpenICLInferTask
 
 
 with read_base():
+    from opencompass.configs.summarizers.medium import summarizer
     from evalscope.backend.opencompass.tasks.eval_datasets import datasets
 
 # 1. Get datasets
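For context on the hunk above: in OpenCompass-style config files, symbols imported inside the `read_base()` block are merged into the task configuration, so the new line makes a default `summarizer` (result-aggregation settings) available alongside `datasets`. A minimal illustrative fragment follows; the `read_base` import path is an assumption about the surrounding file, which this hunk does not show:

```python
from mmengine.config import read_base  # assumed import; OpenCompass configs commonly take read_base from mmengine

with read_base():
    # Anything imported here ('summarizer', 'datasets', ...) becomes part of this task's config.
    from opencompass.configs.summarizers.medium import summarizer
    from evalscope.backend.opencompass.tasks.eval_datasets import datasets
```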
@@ -31,7 +31,7 @@ class VLMEvalKitBackendManager(BackendManager):
         from vlmeval.utils.arguments import Arguments as VLMEvalArguments
         self.args = VLMEvalArguments(**self.config_d)
 
-        self.valid_models = self.list_supported_VLMs()
+        self.valid_models = self.list_supported_models()
         self.valid_model_names = list(self.valid_models.keys())
         self.valid_datasets = self.list_supported_datasets()
 
@@ -86,7 +86,7 @@ class VLMEvalKitBackendManager(BackendManager):
         return self.get_cmd()
 
     @staticmethod
-    def list_supported_VLMs():
+    def list_supported_models():
        from vlmeval.config import supported_VLM
        return supported_VLM
 
@@ -98,9 +98,7 @@ class VLMEvalKitBackendManager(BackendManager):
     @staticmethod
     def _check_env():
         if is_module_installed('vlmeval'):
-            logger.info('Please make sure you have installed the `ms-vlmeval`: `pip install ms-vlmeval`')
-        else:
-            raise ModuleNotFoundError('Please install the `ms-vlmeval` first: `pip install ms-vlmeval`')
+            logger.info('Check VLM Evaluation Kit: Installed')
 
     @staticmethod
     def get_restore_arg(arg_name: str, arg_val: bool):
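Of the two VLMEvalKit changes above, the method rename is the one downstream code will notice. A caller written against 0.5.0 would need a one-line update along these lines (a hedged sketch, assuming `ms-vlmeval` is installed):

```python
from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager

# 0.5.0: VLMEvalKitBackendManager.list_supported_VLMs()
# 0.5.2: the same `supported_VLM` mapping, under the new name
models = VLMEvalKitBackendManager.list_supported_models()
print(sorted(models.keys())[:5])  # preview a few supported VLM names
```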
@@ -3,7 +3,7 @@ from enum import Enum
 
 
 class EvalBackend(Enum):
-    # Use native evaluation pipeline of Eval-Scope
+    # Use native evaluation pipeline of EvalScope
     NATIVE = 'Native'
 
     # Use OpenCompass framework as the evaluation backend
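The enum's string values are what a task configuration selects on. A hedged illustration follows; the module path is inferred from the RECORD section below (`evalscope/utils/task_utils.py` shrinks by exactly the one byte this comment fix accounts for), and the `eval_backend` key is assumed rather than shown in this diff:

```python
from evalscope.utils.task_utils import EvalBackend  # module path inferred, not confirmed by this diff

task_cfg = {
    'eval_backend': EvalBackend.NATIVE.value,  # 'Native'; key name assumed for illustration
    # ... remaining task settings
}
```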
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
-__version__ = '0.5.0'
-__release_datetime__ = '2024-08-01 08:00:00'
+__version__ = '0.5.2'
+__release_datetime__ = '2024-08-06 08:00:00'
@@ -1,13 +1,11 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.5.0
-Summary: Eval-Scope: Lightweight LLMs Evaluation Framework
-Home-page: https://github.com/modelscope/eval-scope
+Version: 0.5.2
+Summary: EvalScope: Lightweight LLMs Evaluation Framework
+Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
 Author-email: contact@modelscope.cn
-License: UNKNOWN
 Keywords: python,llm,evaluation
-Platform: UNKNOWN
 Classifier: Development Status :: 4 - Beta
 Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Operating System :: OS Independent
@@ -80,8 +78,8 @@ Requires-Dist: transformers (<4.43,>=4.33) ; extra == 'all'
 Requires-Dist: transformers-stream-generator ; extra == 'all'
 Requires-Dist: jieba ; extra == 'all'
 Requires-Dist: rouge-chinese ; extra == 'all'
-Requires-Dist: ms-opencompass ; extra == 'all'
-Requires-Dist: ms-vlmeval ; extra == 'all'
+Requires-Dist: ms-opencompass (>=0.0.5) ; extra == 'all'
+Requires-Dist: ms-vlmeval (>=0.0.5) ; extra == 'all'
 Provides-Extra: inner
 Requires-Dist: absl-py ; extra == 'inner'
 Requires-Dist: accelerate ; extra == 'inner'
@@ -109,16 +107,16 @@ Requires-Dist: tqdm ; extra == 'inner'
 Requires-Dist: transformers (<4.43,>=4.33) ; extra == 'inner'
 Requires-Dist: transformers-stream-generator ; extra == 'inner'
 Provides-Extra: opencompass
-Requires-Dist: ms-opencompass ; extra == 'opencompass'
+Requires-Dist: ms-opencompass (>=0.0.5) ; extra == 'opencompass'
 Provides-Extra: vlmeval
-Requires-Dist: ms-vlmeval ; extra == 'vlmeval'
+Requires-Dist: ms-vlmeval (>=0.0.5) ; extra == 'vlmeval'
 
 English | [简体中文](README_zh.md)
 
 <p align="center">
 <a href="https://pypi.org/project/evalscope"><img alt="PyPI - Downloads" src="https://img.shields.io/pypi/dm/evalscope">
 </a>
-<a href="https://github.com/modelscope/eval-scope/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>
+<a href="https://github.com/modelscope/evalscope/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>
 <p>
 
 ## 📖 Table of Content
@@ -133,7 +131,7 @@ English | [简体中文](README_zh.md)
 
 ## 📝 Introduction
 
-Large Language Model (LLMs) evaluation has become a critical process for assessing and improving LLMs. To better support the evaluation of large models, we propose the Eval-Scope framework, which includes the following components and features:
+Large Language Model (LLMs) evaluation has become a critical process for assessing and improving LLMs. To better support the evaluation of large models, we propose the EvalScope framework, which includes the following components and features:
 
 - Pre-configured common benchmark datasets, including: MMLU, CMMLU, C-Eval, GSM8K, ARC, HellaSwag, TruthfulQA, MATH, HumanEval, etc.
 - Implementation of common evaluation metrics
@@ -146,7 +144,7 @@ Large Language Model (LLMs) evaluation has become a critical process for assessi
 - Visualization tools
 - Model Inference Performance Evaluation [Tutorial](evalscope/perf/README.md)
 - Support for OpenCompass as an Evaluation Backend, featuring advanced encapsulation and task simplification to easily submit tasks to OpenCompass for evaluation.
-- Supports VLMEvalKit as the evaluation backend. It initiates VLMEvalKit's multimodal evaluation tasks through Eval-Scope, supporting various multimodal models and datasets.
+- Supports VLMEvalKit as the evaluation backend. It initiates VLMEvalKit's multimodal evaluation tasks through EvalScope, supporting various multimodal models and datasets.
 - Full pipeline support: Seamlessly integrate with SWIFT to easily train and deploy model services, initiate evaluation tasks, view evaluation reports, and achieve an end-to-end large model development process.
 
 
@@ -167,33 +165,48 @@ Large Language Model (LLMs) evaluation has become a critical process for assessi
 - **[2024.07.31]** Breaking change: The sdk name has been changed from `llmuses` to `evalscope`, please update the sdk name in your code.
 - **[2024.07.26]** Supports **VLMEvalKit** as a third-party evaluation framework, initiating multimodal model evaluation tasks. [User Guide](#vlmevalkit-evaluation-backend) 🔥🔥🔥
 - **[2024.06.29]** Supports **OpenCompass** as a third-party evaluation framework. We have provided a high-level wrapper, supporting installation via pip and simplifying the evaluation task configuration. [User Guide](#opencompass-evaluation-backend) 🔥🔥🔥
-- **[2024.06.13]** Eval-Scope has been updated to version 0.3.x, which supports the ModelScope SWIFT framework for LLMs evaluation. 🚀🚀🚀
+- **[2024.06.13]** EvalScope has been updated to version 0.3.x, which supports the ModelScope SWIFT framework for LLMs evaluation. 🚀🚀🚀
 - **[2024.06.13]** We have supported the ToolBench as a third-party evaluation backend for Agents evaluation. 🚀🚀🚀
 
 
 
 ## 🛠️ Installation
 ### Install with pip
-1. create conda environment
+1. create conda environment [Optional]
 ```shell
-conda create -n eval-scope python=3.10
-conda activate eval-scope
+conda create -n evalscope python=3.10
+conda activate evalscope
 ```
 
-2. Install Eval-Scope
+2. Install EvalScope
 ```shell
-pip install evalscope
+pip install evalscope # Installation with Native backend (by default)
+
+pip install evalscope[opencompass] # Installation with OpenCompass backend
+pip install evalscope[vlmeval] # Installation with VLMEvalKit backend
+pip install evalscope[all] # Installation with all backends (Native, OpenCompass, VLMEvalKit)
 ```
 
+DEPRECATION WARNING: For 0.4.3 or older versions, please use the following command to install:
+```shell
+pip install llmuses<=0.4.3
+
+# Usage:
+from llmuses.run import run_task
+...
+
+```
+
+
 ### Install from source code
 1. Download source code
 ```shell
-git clone https://github.com/modelscope/eval-scope.git
+git clone https://github.com/modelscope/evalscope.git
 ```
 
 2. Install dependencies
 ```shell
-cd eval-scope/
+cd evalscope/
 pip install -e .
 ```
 
@@ -237,15 +250,15 @@ print(TemplateType.get_template_name_list())
 ```
 
 ### Evaluation Backend
-Eval-Scope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
-- **Native**: Eval-Scope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
-- [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through Eval-Scope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework [ModelScope Swift](https://github.com/modelscope/swift).
-- [VLMEvalKit](https://github.com/open-compass/VLMEvalKit): Initiate VLMEvalKit multimodal evaluation tasks through Eval-Scope. Supports various multimodal models and datasets, and offers seamless integration with the LLM fine-tuning framework [ModelScope Swift](https://github.com/modelscope/swift).
-- **ThirdParty**: The third-party task, e.g. [ToolBench](evalscope/thirdparty/toolbench/README.md), you can contribute your own evaluation task to Eval-Scope as third-party backend.
+EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
+- **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
+- [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework [ModelScope Swift](https://github.com/modelscope/swift).
+- [VLMEvalKit](https://github.com/open-compass/VLMEvalKit): Initiate VLMEvalKit multimodal evaluation tasks through EvalScope. Supports various multimodal models and datasets, and offers seamless integration with the LLM fine-tuning framework [ModelScope Swift](https://github.com/modelscope/swift).
+- **ThirdParty**: The third-party task, e.g. [ToolBench](evalscope/thirdparty/toolbench/README.md), you can contribute your own evaluation task to EvalScope as third-party backend.
 
 #### OpenCompass Eval-Backend
 
-To facilitate the use of the OpenCompass evaluation backend, we have customized the OpenCompass source code and named it `ms-opencompass`. This version includes optimizations for evaluation task configuration and execution based on the original version, and it supports installation via PyPI. This allows users to initiate lightweight OpenCompass evaluation tasks through Eval-Scope. Additionally, we have initially opened up API-based evaluation tasks in the OpenAI API format. You can deploy model services using [ModelScope Swift](https://github.com/modelscope/swift), where [swift deploy](https://swift.readthedocs.io/en/latest/LLM/VLLM-inference-acceleration-and-deployment.html) supports using vLLM to launch model inference services.
+To facilitate the use of the OpenCompass evaluation backend, we have customized the OpenCompass source code and named it `ms-opencompass`. This version includes optimizations for evaluation task configuration and execution based on the original version, and it supports installation via PyPI. This allows users to initiate lightweight OpenCompass evaluation tasks through EvalScope. Additionally, we have initially opened up API-based evaluation tasks in the OpenAI API format. You can deploy model services using [ModelScope Swift](https://github.com/modelscope/swift), where [swift deploy](https://swift.readthedocs.io/en/latest/LLM/VLLM-inference-acceleration-and-deployment.html) supports using vLLM to launch model inference services.
 
 
 ##### Installation
@@ -301,7 +314,7 @@ python examples/example_eval_swift_openai_api.py
 
 #### VLMEvalKit Evaluation Backend
 
-To facilitate the use of the VLMEvalKit evaluation backend, we have customized the VLMEvalKit source code and named it `ms-vlmeval`. This version encapsulates the configuration and execution of evaluation tasks based on the original version and supports installation via PyPI, allowing users to initiate lightweight VLMEvalKit evaluation tasks through Eval-Scope. Additionally, we support API-based evaluation tasks in the OpenAI API format. You can deploy multimodal model services using ModelScope [swift](https://github.com/modelscope/swift).
+To facilitate the use of the VLMEvalKit evaluation backend, we have customized the VLMEvalKit source code and named it `ms-vlmeval`. This version encapsulates the configuration and execution of evaluation tasks based on the original version and supports installation via PyPI, allowing users to initiate lightweight VLMEvalKit evaluation tasks through EvalScope. Additionally, we support API-based evaluation tasks in the OpenAI API format. You can deploy multimodal model services using ModelScope [swift](https://github.com/modelscope/swift).
 
 ##### Installation
 ```shell
@@ -319,7 +332,8 @@ For detailed information about the datasets, please refer to [VLMEvalKit Support
 You can use the following to view the list of dataset names:
 ```python
 from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
-print(f'** All models from VLMEvalKit backend: {VLMEvalKitBackendManager.list(list_supported_VLMs().keys())}')
+print(f'** All models from VLMEvalKit backend: {VLMEvalKitBackendManager.list_supported_models().keys()}')
+
 ```
 If the dataset file does not exist locally when loading the dataset, it will be automatically downloaded to the `~/LMUData/` directory.
 
@@ -562,5 +576,3 @@ The LLM Leaderboard aims to provide an objective and comprehensive evaluation st
 - [ ] Auto-reviewer
 - [ ] Qwen-max
 
-
-
@@ -6,17 +6,17 @@ evalscope/run.py,sha256=T-2zoJpBx6YxLnLJH-iFF3UxUGYTU36PMV_DQ9e8tSM,18484
 evalscope/run_arena.py,sha256=_LL8fqeKUEMUg985TENYzcnH5_Q8sqPxM68eZk-jhLA,8793
 evalscope/run_ms.py,sha256=UtJoGnah64SXigTawJQWTi_TEGjr7Td0rjCTaO-htL8,6028
 evalscope/summarizer.py,sha256=Ie1kwPETpz3x2yROLMGqC0UwEj6OKJuKwEcUqxUx5fM,6358
-evalscope/version.py,sha256=_1Lu_R_3DYpaloS52_vLqUEBtD1ixppLykqr1dl_TqM,118
+evalscope/version.py,sha256=Bo14bi3CEm4GSQOqlmyUKrRQLg4TS8hCNrE-bnYDI28,118
 evalscope/backend/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
 evalscope/backend/base.py,sha256=5BLrDNNwxsGp35zorD-kphmN15tlBbkuuqwkz8jWZq0,876
 evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
 evalscope/backend/opencompass/api_meta_template.py,sha256=sBW0XbVDOKeJ7mVUDLhmcG4e0yClw3eluazdp_8wtgQ,1753
-evalscope/backend/opencompass/backend_manager.py,sha256=swmJELcEDNorZzyXZxOhz2q5tWAE-IkotqJVZ2rBRQ4,10366
+evalscope/backend/opencompass/backend_manager.py,sha256=Rr8eFFDUXTxI8AMcrbFW9LZuSQVZ7tsgHcZ1veNhfWM,10190
 evalscope/backend/opencompass/tasks/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-evalscope/backend/opencompass/tasks/eval_api.py,sha256=9ylEm1Xk_xft56EEpVvlzK89_R1kQh7PI6uVZiexqy8,1042
+evalscope/backend/opencompass/tasks/eval_api.py,sha256=12lrgDpMzZ1XBRboq5TEOovDPCMDwwGCJoRT78Ox_yo,1108
 evalscope/backend/opencompass/tasks/eval_datasets.py,sha256=DWwKcQGGSkkh65H1d-oKN8Jow0Q0cHJJzDC75inycFM,5186
 evalscope/backend/vlm_eval_kit/__init__.py,sha256=xTgHM95lWzh4s0W7zxLwYkgUbPAZfAb0UoGGmyyBXrs,83
-evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=_5yZ7dUULvzLw9-LYg5Svmeia8M6-8CInmiwtGfkYF4,6213
+evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=PQ9n2jdfPj7s5Ma6_5nNuOMM4La9JBdxKbLf4Oa17NI,6055
 evalscope/benchmarks/__init__.py,sha256=6TKP35wfKf7R_h870fsEtcIlIAgomKOcukNL9M-5I1Y,162
 evalscope/benchmarks/benchmark.py,sha256=e7rA8Y_vo6q5BhlUbZGWfZ1-SfJnU2IFRg62pnjQtDk,2157
 evalscope/benchmarks/data_adapter.py,sha256=eVQvOQYQOQbIl8UlvOEUqRThL3FP3aUD6DSlqF1bqO0,10395
@@ -156,10 +156,10 @@ evalscope/utils/arena_utils.py,sha256=RMkymUv9Cxs37arUntzgDY5P0Dand2jGpsb7uy6wZm
 evalscope/utils/completion_parsers.py,sha256=61l8CTh1VxHgRoMDhtznpAhuJp47MssGgS-LdEe_h80,2997
 evalscope/utils/logger.py,sha256=Ycd0W17Z_oiByPuPX3_umNrOCHjT9O_e_Kws7ZWUSvU,1855
 evalscope/utils/task_cfg_parser.py,sha256=LiNQ2X8lbZU0cODpaY_PbKyUhNoxZIC495UsLJigX64,138
-evalscope/utils/task_utils.py,sha256=9izZ6H7nso1OJmdoduDpaFN2KA3DmZ91dkKXA8GTIUc,460
+evalscope/utils/task_utils.py,sha256=Mv_u_f4Z91zcUeko6acZCmnOAPRfk61kf_dliLzG5Yk,459
 evalscope/utils/utils.py,sha256=zHo9hfxGBUVKE2xNMR7lDoEvfRnk4V4946DEfXQhlq4,20509
-evalscope-0.5.0.dist-info/METADATA,sha256=xMJR-aNm2_br22AlhRWVeu24slq9oW5tXfxOvilPZoM,27367
-evalscope-0.5.0.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
-evalscope-0.5.0.dist-info/entry_points.txt,sha256=eAQqqT7PlGix33BDKmS7wsaIJ_6-vvGrq79Szb6uVxg,57
-evalscope-0.5.0.dist-info/top_level.txt,sha256=jNR-HMn3TR8Atolq7_4rW8IWVX6GhvYV5_1Y_KbJKlY,10
-evalscope-0.5.0.dist-info/RECORD,,
+evalscope-0.5.2.dist-info/METADATA,sha256=F0YWg7gyenErvz-Kq1X5Z2Ngr1TYh3H-KpCX5zBLnog,27866
+evalscope-0.5.2.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
+evalscope-0.5.2.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
+evalscope-0.5.2.dist-info/top_level.txt,sha256=jNR-HMn3TR8Atolq7_4rW8IWVX6GhvYV5_1Y_KbJKlY,10
+evalscope-0.5.2.dist-info/RECORD,,
@@ -1,3 +1,2 @@
 [console_scripts]
 evalscope = evalscope.cli.cli:run_cmd
-