evalscope 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/backend/__init__.py +0 -3
- evalscope/backend/opencompass/backend_manager.py +2 -0
- evalscope/backend/opencompass/tasks/eval_datasets.py +2 -2
- evalscope/backend/rag_eval/__init__.py +3 -0
- evalscope/backend/rag_eval/backend_manager.py +68 -0
- evalscope/backend/rag_eval/cmteb/__init__.py +4 -0
- evalscope/backend/rag_eval/cmteb/arguments.py +59 -0
- evalscope/backend/rag_eval/cmteb/base.py +89 -0
- evalscope/backend/rag_eval/cmteb/task_template.py +83 -0
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +302 -0
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +252 -0
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +113 -0
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +153 -0
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +345 -0
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +302 -0
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +64 -0
- evalscope/backend/rag_eval/ragas/__init__.py +2 -0
- evalscope/backend/rag_eval/ragas/arguments.py +37 -0
- evalscope/backend/rag_eval/ragas/task_template.py +117 -0
- evalscope/backend/vlm_eval_kit/backend_manager.py +1 -2
- evalscope/backend/vlm_eval_kit/custom_dataset.py +1 -1
- evalscope/benchmarks/benchmark.py +1 -1
- evalscope/evaluator/evaluator.py +4 -3
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +19 -0
- evalscope/models/api/__init__.py +3 -0
- evalscope/models/api/openai_api.py +228 -0
- evalscope/perf/http_client.py +5 -5
- evalscope/run.py +4 -0
- evalscope/third_party/longbench_write/__init__.py +3 -0
- evalscope/third_party/longbench_write/eval.py +284 -0
- evalscope/third_party/longbench_write/infer.py +217 -0
- evalscope/third_party/longbench_write/longbench_write.py +88 -0
- evalscope/third_party/longbench_write/resources/__init__.py +1 -0
- evalscope/third_party/longbench_write/resources/judge.txt +31 -0
- evalscope/third_party/longbench_write/resources/longbench_write.jsonl +120 -0
- evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl +60 -0
- evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl +48 -0
- evalscope/third_party/longbench_write/tools/__init__.py +1 -0
- evalscope/third_party/longbench_write/tools/data_etl.py +155 -0
- evalscope/third_party/longbench_write/utils.py +37 -0
- evalscope/utils/logger.py +44 -14
- evalscope/utils/task_utils.py +3 -0
- evalscope/version.py +2 -2
- {evalscope-0.5.3.dist-info → evalscope-0.5.5.dist-info}/METADATA +46 -60
- {evalscope-0.5.3.dist-info → evalscope-0.5.5.dist-info}/RECORD +48 -18
- {evalscope-0.5.3.dist-info → evalscope-0.5.5.dist-info}/WHEEL +0 -0
- {evalscope-0.5.3.dist-info → evalscope-0.5.5.dist-info}/entry_points.txt +0 -0
- {evalscope-0.5.3.dist-info → evalscope-0.5.5.dist-info}/top_level.txt +0 -0
evalscope/utils/logger.py
CHANGED
@@ -1,18 +1,20 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
+import importlib.util as iutil
 import logging
 from typing import Optional
 
 init_loggers = {}
+format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+formatter = logging.Formatter(format)
 
-formatter = logging.Formatter(
-    '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logging.basicConfig(format=format, level=logging.INFO)
 
 
-def get_logger(log_file: Optional[str] = None,
-               log_level: int = logging.INFO,
-               file_mode: str = 'w'):
-    """
+def get_logger(
+    log_file: Optional[str] = None, log_level: int = logging.INFO, file_mode: str = "w"
+):
+    """Get logging logger
 
     Args:
         log_file: Log filename, if specified, file handler will be added to
@@ -22,21 +24,39 @@ def get_logger(log_file: Optional[str] = None,
             specified (if filemode is unspecified, it defaults to 'w').
     """
 
-    logger_name = __name__.split('.')[0]
+    logger_name = __name__.split(".")[0]
     logger = logging.getLogger(logger_name)
-
+    logger.propagate = False
    if logger_name in init_loggers:
         add_file_handler_if_needed(logger, log_file, file_mode, log_level)
+        if logger.level != log_level:
+            logger.setLevel(log_level)
         return logger
 
-
-
-
+    # handle duplicate logs to the console
+    # Starting in 1.8.0, PyTorch DDP attaches a StreamHandler <stderr> (NOTSET)
+    # to the root logger. As logger.propagate is True by default, this root
+    # level handler causes logging messages from rank>0 processes to
+    # unexpectedly show up on the console, creating much unwanted clutter.
+    # To fix this issue, we set the root logger's StreamHandler, if any, to log
+    # at the ERROR level.
+    torch_dist = False
+    is_worker0 = True
+    if iutil.find_spec("torch") is not None:
+        from modelscope.utils.torch_utils import is_dist, is_master
+
+        torch_dist = is_dist()
+        is_worker0 = is_master()
+
+    if torch_dist:
+        for handler in logger.root.handlers:
+            if type(handler) is logging.StreamHandler:
+                handler.setLevel(logging.ERROR)
 
     stream_handler = logging.StreamHandler()
     handlers = [stream_handler]
 
-    if log_file is not None:
+    if is_worker0 and log_file is not None:
         file_handler = logging.FileHandler(log_file, file_mode)
         handlers.append(file_handler)
 
@@ -45,7 +65,10 @@ def get_logger(log_file: Optional[str] = None,
         handler.setLevel(log_level)
         logger.addHandler(handler)
 
-    logger.setLevel(log_level)
+    if is_worker0:
+        logger.setLevel(log_level)
+    else:
+        logger.setLevel(logging.ERROR)
 
     init_loggers[logger_name] = True
 
@@ -57,7 +80,14 @@ def add_file_handler_if_needed(logger, log_file, file_mode, log_level):
         if isinstance(handler, logging.FileHandler):
             return
 
-    if log_file is not None:
+    if iutil.find_spec("torch") is not None:
+        from modelscope.utils.torch_utils import is_master
+
+        is_worker0 = is_master()
+    else:
+        is_worker0 = True
+
+    if is_worker0 and log_file is not None:
         file_handler = logging.FileHandler(log_file, file_mode)
         file_handler.setFormatter(formatter)
         file_handler.setLevel(log_level)
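Taken together, this change does three things: `logging.basicConfig` establishes a consistent default format, `propagate = False` stops double-printing through the root logger, and file handlers plus non-ERROR console output are restricted to the rank-0 worker under PyTorch DDP. A minimal sketch of the resulting behavior (the log path and levels below are illustrative assumptions, not part of the diff):

```python
import logging

from evalscope.utils.logger import get_logger

# First call initializes the shared package-level logger: a StreamHandler is
# always attached, while the FileHandler is only attached on the rank-0 worker.
logger = get_logger(log_file='eval.log', log_level=logging.INFO)  # 'eval.log' is a hypothetical path
logger.info('written to the console, and to eval.log on worker 0 only')

# Later calls return the same cached logger object; with this change, passing
# a different log_level now re-applies it instead of being silently ignored.
same = get_logger(log_level=logging.DEBUG)
assert same is logger
```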
evalscope/utils/task_utils.py
CHANGED
@@ -11,6 +11,9 @@ class EvalBackend(Enum):
 
     # Use VLM Eval Kit as the multi-modal model evaluation backend
     VLM_EVAL_KIT = 'VLMEvalKit'
+
+    # Use RAGEval as the RAG evaluation backend
+    RAG_EVAL = 'RAGEval'
 
     # Use third-party evaluation backend/modules
     THIRD_PARTY = 'ThirdParty'
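The new enum member is what ties the `rag_eval` backend files above into task configs. A hedged sketch of selecting it, assuming the RAG backend is driven through the same `eval_backend`/`eval_config` task fields used for the other backends; the `eval_config` payload below, including the tool name, model id, and task list, is a hypothetical illustration rather than the definitive schema:

```python
from evalscope.run import run_task
from evalscope.utils.task_utils import EvalBackend

task_cfg = {
    'eval_backend': EvalBackend.RAG_EVAL.value,  # 'RAGEval'
    'eval_config': {
        # Hypothetical CMTEB-style payload: evaluate an embedding model on a
        # retrieval task; the field names here are assumptions.
        'tool': 'MTEB',
        'model': [{'model_name_or_path': 'AI-ModelScope/bge-large-zh'}],
        'eval': {'tasks': ['T2Retrieval']},
    },
}
run_task(task_cfg=task_cfg)
```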
{evalscope-0.5.3.dist-info → evalscope-0.5.5.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.5.3
+Version: 0.5.5
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -19,11 +19,12 @@ Requires-Dist: torch
 Requires-Dist: absl-py
 Requires-Dist: accelerate
 Requires-Dist: cachetools
+Requires-Dist: datasets (<3.0.0,>=2.18.0)
 Requires-Dist: editdistance
 Requires-Dist: jsonlines
 Requires-Dist: matplotlib
 Requires-Dist: modelscope[framework]
-Requires-Dist: nltk
+Requires-Dist: nltk (>=3.9)
 Requires-Dist: openai
 Requires-Dist: pandas
 Requires-Dist: plotly
@@ -33,7 +34,7 @@ Requires-Dist: pyyaml
 Requires-Dist: regex
 Requires-Dist: requests
 Requires-Dist: requests-toolbelt
-Requires-Dist: rouge-score
+Requires-Dist: rouge-score (>=0.1.0)
 Requires-Dist: sacrebleu
 Requires-Dist: scikit-learn
 Requires-Dist: seaborn
@@ -42,7 +43,7 @@ Requires-Dist: simple-ddl-parser
 Requires-Dist: tabulate
 Requires-Dist: tiktoken
 Requires-Dist: tqdm
-Requires-Dist: transformers (
+Requires-Dist: transformers (>=4.33)
 Requires-Dist: transformers-stream-generator
 Requires-Dist: jieba
 Requires-Dist: rouge-chinese
@@ -51,11 +52,12 @@ Requires-Dist: torch ; extra == 'all'
 Requires-Dist: absl-py ; extra == 'all'
 Requires-Dist: accelerate ; extra == 'all'
 Requires-Dist: cachetools ; extra == 'all'
+Requires-Dist: datasets (<3.0.0,>=2.18.0) ; extra == 'all'
 Requires-Dist: editdistance ; extra == 'all'
 Requires-Dist: jsonlines ; extra == 'all'
 Requires-Dist: matplotlib ; extra == 'all'
 Requires-Dist: modelscope[framework] ; extra == 'all'
-Requires-Dist: nltk ; extra == 'all'
+Requires-Dist: nltk (>=3.9) ; extra == 'all'
 Requires-Dist: openai ; extra == 'all'
 Requires-Dist: pandas ; extra == 'all'
 Requires-Dist: plotly ; extra == 'all'
@@ -65,7 +67,7 @@ Requires-Dist: pyyaml ; extra == 'all'
 Requires-Dist: regex ; extra == 'all'
 Requires-Dist: requests ; extra == 'all'
 Requires-Dist: requests-toolbelt ; extra == 'all'
-Requires-Dist: rouge-score ; extra == 'all'
+Requires-Dist: rouge-score (>=0.1.0) ; extra == 'all'
 Requires-Dist: sacrebleu ; extra == 'all'
 Requires-Dist: scikit-learn ; extra == 'all'
 Requires-Dist: seaborn ; extra == 'all'
@@ -74,12 +76,14 @@ Requires-Dist: simple-ddl-parser ; extra == 'all'
 Requires-Dist: tabulate ; extra == 'all'
 Requires-Dist: tiktoken ; extra == 'all'
 Requires-Dist: tqdm ; extra == 'all'
-Requires-Dist: transformers (
+Requires-Dist: transformers (>=4.33) ; extra == 'all'
 Requires-Dist: transformers-stream-generator ; extra == 'all'
 Requires-Dist: jieba ; extra == 'all'
 Requires-Dist: rouge-chinese ; extra == 'all'
-Requires-Dist: ms-opencompass (>=0.1.
+Requires-Dist: ms-opencompass (>=0.1.1) ; extra == 'all'
 Requires-Dist: ms-vlmeval (>=0.0.5) ; extra == 'all'
+Requires-Dist: ragas ; extra == 'all'
+Requires-Dist: mteb (>=0.14.16) ; extra == 'all'
 Provides-Extra: inner
 Requires-Dist: absl-py ; extra == 'inner'
 Requires-Dist: accelerate ; extra == 'inner'
@@ -107,7 +111,10 @@ Requires-Dist: tqdm ; extra == 'inner'
 Requires-Dist: transformers (<4.43,>=4.33) ; extra == 'inner'
 Requires-Dist: transformers-stream-generator ; extra == 'inner'
 Provides-Extra: opencompass
-Requires-Dist: ms-opencompass (>=0.1.
+Requires-Dist: ms-opencompass (>=0.1.1) ; extra == 'opencompass'
+Provides-Extra: rag
+Requires-Dist: ragas ; extra == 'rag'
+Requires-Dist: mteb (>=0.14.16) ; extra == 'rag'
 Provides-Extra: vlmeval
 Requires-Dist: ms-vlmeval (>=0.0.5) ; extra == 'vlmeval'
 
@@ -142,28 +149,11 @@ English | [简体中文](README_zh.md)
 
 ## 📝 Introduction
 
-
-
-### Framework Features
-- **Benchmark Datasets**: Preloaded with several commonly used test benchmarks, including MMLU, CMMLU, C-Eval, GSM8K, ARC, HellaSwag, TruthfulQA, MATH, HumanEval, etc.
-- **Evaluation Metrics**: Implements various commonly used evaluation metrics.
-- **Model Access**: A unified model access mechanism that is compatible with the Generate and Chat interfaces of multiple model families.
-- **Automated Evaluation**: Includes automatic evaluation of objective questions and complex task evaluation using expert models.
-- **Evaluation Reports**: Automatically generates evaluation reports.
-- **Arena Mode**: Used for comparisons between models and objective evaluation of models, supporting various evaluation modes, including:
-  - **Single mode**: Scoring a single model.
-  - **Pairwise-baseline mode**: Comparing against a baseline model.
-  - **Pairwise (all) mode**: Pairwise comparison among all models.
-- **Visualization Tools**: Provides intuitive displays of evaluation results.
-- **Model Performance Evaluation**: Offers a performance testing tool for model inference services and detailed statistics, see [Model Performance Evaluation Documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test.html).
-- **OpenCompass Integration**: Supports OpenCompass as the evaluation backend, providing advanced encapsulation and task simplification, allowing for easier task submission for evaluation.
-- **VLMEvalKit Integration**: Supports VLMEvalKit as the evaluation backend, facilitating the initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
-- **Full-Link Support**: Through seamless integration with the [ms-swift](https://github.com/modelscope/ms-swift) training framework, provides a one-stop development process for model training, model deployment, model evaluation, and report viewing, enhancing user development efficiency.
-
-### Overall Architecture
+EvalScope is the official model evaluation and performance benchmarking framework launched by the [ModelScope](https://modelscope.cn/) community. It comes with built-in common benchmarks and evaluation metrics, such as MMLU, CMMLU, C-Eval, GSM8K, ARC, HellaSwag, TruthfulQA, MATH, and HumanEval. EvalScope supports various types of model evaluations, including LLMs, multimodal LLMs, embedding models, and reranker models. It is also applicable to multiple evaluation scenarios, such as end-to-end RAG evaluation, arena mode, and model inference performance stress testing. Moreover, with the seamless integration of the ms-swift training framework, evaluations can be initiated with a single click, providing full end-to-end support from model training to evaluation 🚀
+
 <p align="center">
     <img src="docs/en/_static/images/evalscope_framework.png" width="70%">
-    <br>
+    <br>EvalScope Framework.
 </p>
 
 The architecture includes the following modules:
@@ -173,18 +163,25 @@ The architecture includes the following modules:
 - **Native**: EvalScope’s own **default evaluation framework**, supporting various evaluation modes, including single model evaluation, arena mode, baseline model comparison mode, etc.
 - **OpenCompass**: Supports [OpenCompass](https://github.com/open-compass/opencompass) as the evaluation backend, providing advanced encapsulation and task simplification, allowing you to submit tasks for evaluation more easily.
 - **VLMEvalKit**: Supports [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) as the evaluation backend, enabling easy initiation of multi-modal evaluation tasks, supporting various multi-modal models and datasets.
+- **RAGEval**: Supports RAG evaluation, supporting independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
 - **ThirdParty**: Other third-party evaluation tasks, such as ToolBench.
 4. **Performance Evaluator**: Model performance evaluation, responsible for measuring model inference service performance, including performance testing, stress testing, performance report generation, and visualization.
 5. **Evaluation Report**: The final generated evaluation report summarizes the model's performance, which can be used for decision-making and further model optimization.
 6. **Visualization**: Visualization results help users intuitively understand evaluation results, facilitating analysis and comparison of different model performances.
 
+
 ## 🎉 News
-- **[2024.
-- **[2024.
-- **[2024.
-- **[2024.
-- **[2024.
-- **[2024.
+- 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
+- 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
+- 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
+- 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
+- 🔥 **[2024.08.20]** Updated the official documentation, including getting started guides, best practices, and FAQs. Feel free to [📖 read it here](https://evalscope.readthedocs.io/en/latest/)!
+- 🔥 **[2024.08.09]** Simplified the installation process, allowing for pypi installation of vlmeval dependencies; optimized the multimodal model evaluation experience, achieving up to 10x acceleration based on the OpenAI API evaluation chain.
+- 🔥 **[2024.07.31]** Important change: The package name `llmuses` has been changed to `evalscope`. Please update your code accordingly.
+- 🔥 **[2024.07.26]** Support for **VLMEvalKit** as a third-party evaluation framework to initiate multimodal model evaluation tasks.
+- 🔥 **[2024.06.29]** Support for **OpenCompass** as a third-party evaluation framework, which we have encapsulated at a higher level, supporting pip installation and simplifying evaluation task configuration.
+- 🔥 **[2024.06.13]** EvalScope seamlessly integrates with the fine-tuning framework SWIFT, providing full-chain support from LLM training to evaluation.
+- 🔥 **[2024.06.13]** Integrated the Agent evaluation dataset ToolBench.
 
 
 
@@ -264,8 +261,8 @@ If prompted with `Do you wish to run the custom code? [y/N]`, please type `y`.
 
 #### Basic Parameter Descriptions
 - `--model`: Specifies the `model_id` of the model on [ModelScope](https://modelscope.cn/), allowing automatic download. For example, see the [Qwen2-0.5B-Instruct model link](https://modelscope.cn/models/qwen/Qwen2-0.5B-Instruct/summary); you can also use a local path, such as `/path/to/model`.
-- `--template-type`: Specifies the template type corresponding to the model. Refer to the `Default Template` field in the [template table](https://swift.readthedocs.io/en/latest/
-- `--datasets`: The dataset name, allowing multiple datasets to be specified, separated by spaces; these datasets will be automatically downloaded. Refer to the [supported datasets list](
+- `--template-type`: Specifies the template type corresponding to the model. Refer to the `Default Template` field in the [template table](https://swift.readthedocs.io/en/latest/Instruction/Supported-models-datasets.html#llm) for filling in this field.
+- `--datasets`: The dataset name, allowing multiple datasets to be specified, separated by spaces; these datasets will be automatically downloaded. Refer to the [supported datasets list](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html) for available options.
 
 ### 2. Parameterized Evaluation
 If you wish to conduct a more customized evaluation, such as modifying model parameters or dataset parameters, you can use the following commands:
@@ -275,8 +272,8 @@ If you wish to conduct a more customized evaluation, such as modifying model par
 python evalscope/run.py \
  --model qwen/Qwen2-0.5B-Instruct \
  --template-type qwen \
- --model-args revision=
- --datasets
+ --model-args revision=master,precision=torch.float16,device_map=auto \
+ --datasets gsm8k ceval \
  --use-cache true \
  --limit 10
 ```
@@ -341,31 +338,14 @@ from evalscope.run import run_task
 run_task(task_cfg=your_task_cfg)
 ```
 
-### Supported Datasets List
-> [!NOTE]
-> The framework currently supports the following datasets. If the dataset you need is not in the list, please submit an issue, or use the [OpenCompass backend](https://evalscope.readthedocs.io/en/latest/user_guides/opencompass_backend.html) for evaluation, or use the [VLMEvalKit backend](https://evalscope.readthedocs.io/en/latest/user_guides/vlmevalkit_backend.html) for multi-modal model evaluation.
-
-| Dataset Name | Link | Status | Note |
-|--------------------|----------------------------------------------------------------------------------------|--------|------|
-| `mmlu` | [mmlu](https://modelscope.cn/datasets/modelscope/mmlu/summary) | Active | |
-| `ceval` | [ceval](https://modelscope.cn/datasets/modelscope/ceval-exam/summary) | Active | |
-| `gsm8k` | [gsm8k](https://modelscope.cn/datasets/modelscope/gsm8k/summary) | Active | |
-| `arc` | [arc](https://modelscope.cn/datasets/modelscope/ai2_arc/summary) | Active | |
-| `hellaswag` | [hellaswag](https://modelscope.cn/datasets/modelscope/hellaswag/summary) | Active | |
-| `truthful_qa` | [truthful_qa](https://modelscope.cn/datasets/modelscope/truthful_qa/summary) | Active | |
-| `competition_math` | [competition_math](https://modelscope.cn/datasets/modelscope/competition_math/summary) | Active | |
-| `humaneval` | [humaneval](https://modelscope.cn/datasets/modelscope/humaneval/summary) | Active | |
-| `bbh` | [bbh](https://modelscope.cn/datasets/modelscope/bbh/summary) | Active | |
-| `race` | [race](https://modelscope.cn/datasets/modelscope/race/summary) | Active | |
-| `trivia_qa` | [trivia_qa](https://modelscope.cn/datasets/modelscope/trivia_qa/summary) | To be integrated | |
-
 
 ## Evaluation Backend
 EvalScope supports using third-party evaluation frameworks to initiate evaluation tasks, which we call Evaluation Backend. Currently supported Evaluation Backend includes:
 - **Native**: EvalScope's own **default evaluation framework**, supporting various evaluation modes including single model evaluation, arena mode, and baseline model comparison mode.
-- [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/opencompass_backend.html)
-- [VLMEvalKit](https://github.com/open-compass/VLMEvalKit): Initiate VLMEvalKit multimodal evaluation tasks through EvalScope. Supports various multimodal models and datasets, and offers seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/vlmevalkit_backend.html)
-- **
+- [OpenCompass](https://github.com/open-compass/opencompass): Initiate OpenCompass evaluation tasks through EvalScope. Lightweight, easy to customize, supports seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/opencompass_backend.html)
+- [VLMEvalKit](https://github.com/open-compass/VLMEvalKit): Initiate VLMEvalKit multimodal evaluation tasks through EvalScope. Supports various multimodal models and datasets, and offers seamless integration with the LLM fine-tuning framework ms-swift. [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/vlmevalkit_backend.html)
+- **RAGEval**: Initiate RAG evaluation tasks through EvalScope, supporting independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html): [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/index.html)
+- **ThirdParty**: Third-party evaluation tasks, such as [ToolBench](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) and [LongBench-Write](https://evalscope.readthedocs.io/en/latest/third_party/longwriter.html).
 
 ## Custom Dataset Evaluation
 EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset.html)
@@ -394,6 +374,8 @@ Refer to : [Leaderboard](https://modelscope.cn/leaderboard/58/ranking?type=free)
 
 
 ## TO-DO List
+- [x] RAG evaluation
+- [x] VLM evaluation
 - [x] Agents evaluation
 - [x] vLLM
 - [ ] Distributed evaluating
@@ -405,3 +387,7 @@ Refer to : [Leaderboard](https://modelscope.cn/leaderboard/58/ranking?type=free)
 - [ ] Auto-reviewer
 - [ ] Qwen-max
 
+
+## Star History
+
+[![Star History Chart](https://api.star-history.com/svg?repos=modelscope/evalscope&type=Date)](https://star-history.com/#modelscope/evalscope&Date)
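The net effect of the dependency changes is one new optional group alongside tightened pins. A quick install sketch, using the extras exactly as declared in this METADATA (the commands themselves are standard pip invocations):

```shell
# New in 0.5.5: the 'rag' extra pulls in the RAGEval backend dependencies.
pip install 'evalscope[rag]'          # ragas, mteb>=0.14.16

# Existing extras, with the pins shown above:
pip install 'evalscope[opencompass]'  # ms-opencompass>=0.1.1
pip install 'evalscope[vlmeval]'      # ms-vlmeval>=0.0.5
```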
{evalscope-0.5.3.dist-info → evalscope-0.5.5.dist-info}/RECORD
CHANGED
@@ -2,24 +2,40 @@ evalscope/__init__.py,sha256=3eLMMrjkAIAs3vGluXNZn5-xTSbO_vfba9yNPbkVtg8,105
 evalscope/cache.py,sha256=zpGjL9JMosqjk_dkODVwvIGiUC0WAMmMTHDNJOvBQU8,3288
 evalscope/config.py,sha256=G_rpSn5Kd1aPlFJO6asnZu5FUggZmwcYdAxxpuq0yDs,6972
 evalscope/constants.py,sha256=g8lGYlpA4Wk88HwtqId1-jJX_z8Lr2k02gWLsyofyj0,2670
-evalscope/run.py,sha256=
+evalscope/run.py,sha256=uAXtaxIBcR94jyfHGFAecuzn0y71oLgu-d9VOohCJAw,18738
 evalscope/run_arena.py,sha256=BCWCAiX0BQ9pLMIq08svEcd-IoFr75gFShpV88robIY,8963
 evalscope/run_ms.py,sha256=UtJoGnah64SXigTawJQWTi_TEGjr7Td0rjCTaO-htL8,6028
 evalscope/summarizer.py,sha256=rIyML8HpjQxIpXg8KvQ0CzOS6xMS-JHZh6kUZzkaRsk,6640
-evalscope/version.py,sha256=
-evalscope/backend/__init__.py,sha256=
+evalscope/version.py,sha256=UZ6qDTtcyaqkwS2_IkU2Kzop4lG9AL9cYpfEpYzfCrc,118
+evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/backend/base.py,sha256=5BLrDNNwxsGp35zorD-kphmN15tlBbkuuqwkz8jWZq0,876
 evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
 evalscope/backend/opencompass/api_meta_template.py,sha256=sBW0XbVDOKeJ7mVUDLhmcG4e0yClw3eluazdp_8wtgQ,1753
-evalscope/backend/opencompass/backend_manager.py,sha256=
+evalscope/backend/opencompass/backend_manager.py,sha256=_eg82FLAVxQ6t5e1OqlyuxZcngqD8rxvI5EijLUh_zI,10294
 evalscope/backend/opencompass/tasks/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/backend/opencompass/tasks/eval_api.py,sha256=12lrgDpMzZ1XBRboq5TEOovDPCMDwwGCJoRT78Ox_yo,1108
-evalscope/backend/opencompass/tasks/eval_datasets.py,sha256=
+evalscope/backend/opencompass/tasks/eval_datasets.py,sha256=bYFHkjiwZqh2FVRo1I88xEDZ6nYmZjAgG5ZODbthKFI,5241
+evalscope/backend/rag_eval/__init__.py,sha256=yRCcfxhzC7wIYbgb-w76i4D9v8wXI7JmYNM6IZUn064,199
+evalscope/backend/rag_eval/backend_manager.py,sha256=tWkFzYO9LQjaI7paD5yz1c-HtNJUbnAr0a-4biYSZvg,2562
+evalscope/backend/rag_eval/cmteb/__init__.py,sha256=ajVz6XP5hqPq-jm66hp2poA2qKj1V19ZGoqjrGUlO7U,279
+evalscope/backend/rag_eval/cmteb/arguments.py,sha256=698UvPVZp5Ncq_p25P_67SQkYaW2tLSCHenUOZ0n5OI,2217
+evalscope/backend/rag_eval/cmteb/base.py,sha256=sJqTRCej7vk5ASirk21hOobX1_Hz7BO1LIHJFOGLuE4,2731
+evalscope/backend/rag_eval/cmteb/task_template.py,sha256=HreVxyRiF2QUe4Dy9_zKNp1WU74342RWHV5_B8ycXG0,2537
+evalscope/backend/rag_eval/cmteb/tasks/Classification.py,sha256=7adR40W6Uu58-QR9jCUP4k7TdAnG0oT225v4xHXah2g,10635
+evalscope/backend/rag_eval/cmteb/tasks/Clustering.py,sha256=7j1Hts_r4Nv8DlbIiPFMaU1JDxCYgu0wO0JI8T_Y6X8,8969
+evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py,sha256=2WkaTE-jF8jqsu1UcNDqN8A4567UzW5boD_0B83j-9A,4008
+evalscope/backend/rag_eval/cmteb/tasks/Reranking.py,sha256=50h-lXaRcb5s6ZpIgnfk5mU7iZur8ZDxwsaFbrqSZ_o,5462
+evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py,sha256=wUxiQH5aOmWNS4YswACyHqBn5xqP5eyvsq6U9WSp5R0,11457
+evalscope/backend/rag_eval/cmteb/tasks/STS.py,sha256=6GMaoCANM-IKYLk4srHOYr_eurav3DGihHMQeJPXR6k,12054
+evalscope/backend/rag_eval/cmteb/tasks/__init__.py,sha256=FrtoBosHq9iRp3yfZEAxWa5NkYhHtA20NmHDG6eiPNU,1421
+evalscope/backend/rag_eval/ragas/__init__.py,sha256=tHB7XGREmcrx8ulF-JZWWoHsEbn2s-PFyWFdGzOZQcw,190
+evalscope/backend/rag_eval/ragas/arguments.py,sha256=plVc2_3auVG5z91ExzBdkbNIhMVjyi_xQYbEzlV0iNw,1208
+evalscope/backend/rag_eval/ragas/task_template.py,sha256=795NHXzGdeqa15ONV1AgDZywpMHucaIlvk_EBF0CK98,3868
 evalscope/backend/vlm_eval_kit/__init__.py,sha256=xTgHM95lWzh4s0W7zxLwYkgUbPAZfAb0UoGGmyyBXrs,83
-evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=
-evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=
+evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=ewhpE9yzsqf5ED6kqsqek2YEgg96GBQOupxtVNhaXxI,6046
+evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=Yz2A5kB1E8DYBnjuVCA6TTPtLjhg8vYKeJTh6FU_Ecw,1645
 evalscope/benchmarks/__init__.py,sha256=6TKP35wfKf7R_h870fsEtcIlIAgomKOcukNL9M-5I1Y,162
-evalscope/benchmarks/benchmark.py,sha256=
+evalscope/benchmarks/benchmark.py,sha256=EmwYyFdrAHBGMkSbsMZQOR_62Q0CSKl8zeLlr7xvJdQ,2159
 evalscope/benchmarks/data_adapter.py,sha256=eVQvOQYQOQbIl8UlvOEUqRThL3FP3aUD6DSlqF1bqO0,10395
 evalscope/benchmarks/arc/__init__.py,sha256=7k2jFDUCHpEKDdQZ3Bmq59YmImFg9RyIfZQIsGebhE8,314
 evalscope/benchmarks/arc/ai2_arc.py,sha256=Wim8JsH094og7d0eLCEI0kUwDP_0x7AT117oTRPdiAI,5608
@@ -91,7 +107,7 @@ evalscope/cli/cli.py,sha256=uZ-qC8WBsLd5-Hn94d43sSGg0UC_12RebSD4ToKjypg,844
 evalscope/cli/start_perf.py,sha256=TL6bMXYl3ln-tfs5uBmzb9x94uxz6f3PBFIt1l7g3VA,994
 evalscope/cli/start_server.py,sha256=ATGLP2TE0aImJNicpehdzBuFlNb50F7KhyL4A_ZSoGU,3885
 evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
-evalscope/evaluator/evaluator.py,sha256=
+evalscope/evaluator/evaluator.py,sha256=eSCgPPDGfIJfKu0cthhbDLFm1xMhj_869iT3ngcQkPc,30817
 evalscope/evaluator/rating_eval.py,sha256=cJbkyXIuwFUZoe7ZJZM6eUskNd9zlORgndckuon2OQ8,5768
 evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/evaluator/reviewer/auto_reviewer.py,sha256=JycPYti9h1j_8DRcu_rc5U0wkEASHYg-XBqrUUoiO-Q,17054
@@ -101,13 +117,15 @@ evalscope/metrics/math_accuracy.py,sha256=1PCy1VUNYg48JcGy-6SUmUDZNwPeAkMW1QQ_lX
 evalscope/metrics/metrics.py,sha256=sDZljGiZwgHsFZ5eNi65-3z3BLCdIwWUzPcq2QpKf1k,12545
 evalscope/metrics/rouge_metric.py,sha256=sN0r-sXXc-nJUdFrthQPAv1VFdOCrF6zzIYDKaLSgrU,4522
 evalscope/metrics/bundled_rouge_score/__init__.py,sha256=PwbTdk8168FwDJe_l8XIqDuBgZQooDsP31vj7di05Fs,650
-evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=
+evalscope/metrics/bundled_rouge_score/rouge_scorer.py,sha256=vhzIMSQezhZuJzGndymWjB_iRbDdECoEidOIdNL3NAM,12213
 evalscope/models/__init__.py,sha256=zG27J2HSeKPGiAIUE7QLPHEPLyXLsfaDwYI_TDXjpCg,145
 evalscope/models/dummy_chat_model.py,sha256=xE8wcFVSCkvizEJ-B8ojX0Ir01Q5KrN5mapjMQaQtbg,1325
 evalscope/models/model.py,sha256=ZzzVzZHVzuzdt5F1r-rEBT44ZfW9B7R1spsrV-T8nSw,3020
 evalscope/models/model_adapter.py,sha256=Cgs68ajRwTETEo1eU-OhFiFGuSx4eS1p7-JT3jOpcOk,22740
 evalscope/models/openai_model.py,sha256=PoQS1FIiWIxp1xBJPV7Bq81LFD9FIT3vAHUvNa22DCc,3452
 evalscope/models/template.py,sha256=Yk7-QnvjiLD0zchSZcaDSLmpW8onIeFpngSwtUOYVPk,56035
+evalscope/models/api/__init__.py,sha256=0c75K78O1KaV02BqqtEp-hhtSSClXLawb8E0c2iqN_A,105
+evalscope/models/api/openai_api.py,sha256=o-FVJFSvfk5mFJm4egXcKfR5ya1fduo5b-uqTkeRu9A,7871
 evalscope/models/custom/__init__.py,sha256=K4Ewo7Qrs73-jBuPq4ffxd8hMnttKhic-Zj0amH3wiU,103
 evalscope/models/custom/custom_model.py,sha256=2ivxfGQs5V5HDnQEhTBi5v8KNBxJDbzPVJdNOGo3iSg,1566
 evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -117,7 +135,7 @@ evalscope/perf/custom_api.py,sha256=H2IgM-LMjqXxVhbrtkXuiREb-p14zwMmllgl26a-jgw,
 evalscope/perf/dashscope_api.py,sha256=_XUF3czkYdPdVgtP7nqzRxROKxlqDjWs4DQnTyocNvM,3410
 evalscope/perf/dataset_plugin_base.py,sha256=6veUTyZ38W1Iig65vxNV9SfmqrsR8ID_UHgNiUO9Bv4,1814
 evalscope/perf/how_to_analysis_result.py,sha256=UVd_aYJ_7N5hl_wK9oIZig1vSwfgzodxW7XC6IWqbdg,1044
-evalscope/perf/http_client.py,sha256=
+evalscope/perf/http_client.py,sha256=WYHuGY_BCeeh8vHi1fm9zrAndOKpVQp4h21j1kKnM64,34535
 evalscope/perf/openai_api.py,sha256=XrH6jg8VlO9Wu0vGwZna_bHq65XMAlCfCEyqMjs8w1c,5970
 evalscope/perf/plugin_registry.py,sha256=D2MG2AXDBScjuKxB4g_Hg026pSRO752dBimonYtaAzM,782
 evalscope/perf/query_parameters.py,sha256=HfGRZJSzRMVfPezWTvbWhYeprCetGNPX_M_paoDtuOY,1346
@@ -142,6 +160,18 @@ evalscope/registry/tasks/gsm8k.yaml,sha256=KYLK-xtv_3qtgCZiwwP4-rP_ftc_qUmtsl1Tf
 evalscope/registry/tasks/mmlu.yaml,sha256=504yhHVfi9pvUBk_SGPs-Yx7R2hx_2_-nAFiGIiFGx4,726
 evalscope/registry/tasks/mmlu_mini.yaml,sha256=wVbosZ5Tm9pwLG5nCphalezXilIjcq5j33nz3MR7_BE,778
 evalscope/third_party/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
+evalscope/third_party/longbench_write/__init__.py,sha256=GNbBDc7HAh_V2Hfy5HhND_u7z6OI79czoBlP8lX4PVo,126
+evalscope/third_party/longbench_write/eval.py,sha256=_fwV3f-Yq0qrkuZ6LBXvBiXnM6lpz6sOqd7BfYxEU80,11163
+evalscope/third_party/longbench_write/infer.py,sha256=MB0MdSM1qDx15FyrPSU6BXPbSGnBjxuTWqrcHAgbj9o,8318
+evalscope/third_party/longbench_write/longbench_write.py,sha256=MQzlIzv3sGlNgxgX0FPHtDIuAmgwThfBkMeKNcsR3U8,3926
+evalscope/third_party/longbench_write/utils.py,sha256=l6q9cNZLFVRvG9qYbxFxobuQkcMyteU9Y6NxyMU4tmQ,816
+evalscope/third_party/longbench_write/resources/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
+evalscope/third_party/longbench_write/resources/judge.txt,sha256=LEI86IoOtqYUgvQnmXo8A8S8Ef6GEQKJXcrEWSauHVc,1884
+evalscope/third_party/longbench_write/resources/longbench_write.jsonl,sha256=H26ZSXzCTWWJTWXgFAYvOYupRuvdJUt_izOeSNOrV3k,54155
+evalscope/third_party/longbench_write/resources/longbench_write_en.jsonl,sha256=h4AJJ3YfNA5IiZ5N9dR_tyEa1JNqY0INv6l5ZgQUJZ8,24235
+evalscope/third_party/longbench_write/resources/longwrite_ruler.jsonl,sha256=odTr8N8PoWAFZ2kdEcmlLeMDfEo3KXDtLo9S8oieCmI,5718
+evalscope/third_party/longbench_write/tools/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
+evalscope/third_party/longbench_write/tools/data_etl.py,sha256=fSc4iT7_bdTvW20TbjlWme-k1pLqj_e2wXV8z831_Yw,5963
 evalscope/third_party/toolbench_static/__init__.py,sha256=BO936RxwodHr4OEpV6W3S_keC91OfOd41_msIJ2d0fs,128
 evalscope/third_party/toolbench_static/eval.py,sha256=TqjMuuYePnD3bGRhQe1_9bIOlAW41kiFSztaEuppRLM,8237
 evalscope/third_party/toolbench_static/infer.py,sha256=WogwVXqDabdcsJ4uftZxAwR2wncp6HYpkS-fACEvjT4,9331
@@ -155,12 +185,12 @@ evalscope/tools/rewrite_eval_results.py,sha256=ZVi2hVjiTOmR_O5IaLv6qnQNpMz6FnDb9
 evalscope/utils/__init__.py,sha256=6RjACRYUSpGj6fkZ7NzYpl0lFppQCp9KVn5ktZe626s,128
 evalscope/utils/arena_utils.py,sha256=RMkymUv9Cxs37arUntzgDY5P0Dand2jGpsb7uy6wZmg,7670
 evalscope/utils/completion_parsers.py,sha256=61l8CTh1VxHgRoMDhtznpAhuJp47MssGgS-LdEe_h80,2997
-evalscope/utils/logger.py,sha256=
+evalscope/utils/logger.py,sha256=cf3U400Mx1speMMNXorjwEE8noDz5Mbd-9PNgaulGeY,3013
 evalscope/utils/task_cfg_parser.py,sha256=LiNQ2X8lbZU0cODpaY_PbKyUhNoxZIC495UsLJigX64,138
-evalscope/utils/task_utils.py,sha256=
+evalscope/utils/task_utils.py,sha256=IMtBSBUp3H95Ko0vn8Q55Wmz2SFZXSfjVy49tyomL_g,537
 evalscope/utils/utils.py,sha256=zHo9hfxGBUVKE2xNMR7lDoEvfRnk4V4946DEfXQhlq4,20509
-evalscope-0.5.3.dist-info/METADATA,sha256=
-evalscope-0.5.3.dist-info/WHEEL,sha256=
-evalscope-0.5.3.dist-info/entry_points.txt,sha256=
-evalscope-0.5.3.dist-info/top_level.txt,sha256=
-evalscope-0.5.3.dist-info/RECORD,,
+evalscope-0.5.5.dist-info/METADATA,sha256=scs7UaBcWE2qpewo_oe6ZB8HX5CtbohPBvom6UjUY5w,20943
+evalscope-0.5.5.dist-info/WHEEL,sha256=2wepM1nk4DS4eFpYrW1TTqPcoGNfHhhO_i5m4cOimbo,92
+evalscope-0.5.5.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
+evalscope-0.5.5.dist-info/top_level.txt,sha256=jNR-HMn3TR8Atolq7_4rW8IWVX6GhvYV5_1Y_KbJKlY,10
+evalscope-0.5.5.dist-info/RECORD,,
{evalscope-0.5.3.dist-info → evalscope-0.5.5.dist-info}/WHEEL
File without changes
{evalscope-0.5.3.dist-info → evalscope-0.5.5.dist-info}/entry_points.txt
File without changes
{evalscope-0.5.3.dist-info → evalscope-0.5.5.dist-info}/top_level.txt
File without changes