evalscope 0.13.1__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/arguments.py +1 -1
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +21 -5
- evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
- evalscope/backend/rag_eval/utils/embedding.py +49 -3
- evalscope/backend/rag_eval/utils/llm.py +8 -9
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
- evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +109 -0
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/__init__.py +0 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +120 -0
- evalscope/benchmarks/arena_hard/utils.py +162 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +2 -5
- evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -1
- evalscope/benchmarks/data_adapter.py +30 -2
- evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -12
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -5
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +1 -3
- evalscope/benchmarks/live_code_bench/testing_util.py +365 -549
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +5 -7
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +182 -0
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +2 -5
- evalscope/collections/evaluator.py +4 -2
- evalscope/config.py +2 -2
- evalscope/metrics/llm_judge.py +1 -1
- evalscope/models/chat_adapter.py +32 -11
- evalscope/perf/arguments.py +30 -9
- evalscope/perf/benchmark.py +57 -103
- evalscope/perf/http_client.py +2 -3
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +4 -2
- evalscope/perf/plugin/datasets/custom.py +4 -1
- evalscope/perf/plugin/datasets/line_by_line.py +4 -1
- evalscope/perf/plugin/datasets/longalpaca.py +4 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -1
- evalscope/perf/plugin/datasets/random_dataset.py +13 -6
- evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
- evalscope/perf/utils/benchmark_util.py +12 -6
- evalscope/perf/utils/db_util.py +3 -3
- evalscope/perf/utils/log_utils.py +41 -0
- evalscope/report/app.py +11 -11
- evalscope/run.py +7 -0
- evalscope/summarizer.py +2 -1
- evalscope/utils/utils.py +36 -25
- evalscope/version.py +2 -2
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/METADATA +21 -55
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/RECORD +70 -62
- tests/cli/test_all.py +36 -27
- tests/cli/test_collection.py +2 -1
- tests/cli/test_run.py +38 -20
- tests/perf/test_perf.py +1 -2
- tests/rag/test_clip_benchmark.py +0 -1
- tests/rag/test_mteb.py +37 -8
- tests/rag/test_ragas.py +33 -27
- tests/vlm/test_vlmeval.py +37 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/LICENSE +0 -0
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/WHEEL +0 -0
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/top_level.txt +0 -0
evalscope/utils/utils.py
CHANGED
@@ -90,7 +90,7 @@ class ResponseParser:
         return ''

     @staticmethod
-    def parse_first_option_with_choices(text: str, options: list) -> str:
+    def parse_first_option_with_choices(text: str, options: list[str]) -> str:
         """
         Find first valid option for text.

@@ -98,7 +98,7 @@ class ResponseParser:
             text: The text to parse.
            options: The options to find. e.g. ['A', 'B', 'C', 'D']
         """
-        options_concat =
+        options_concat = ResponseParser.process_options(options)

         patterns = [
             rf'答案是?\s?([{options_concat}])',
@@ -155,48 +155,53 @@ class ResponseParser:
         for i in options:
             if i in outputs:
                 return i
-        return ''
+        return 'No valid option found'

     @staticmethod
-    def parse_first_option(text: str) -> str:
+    def parse_first_option(text: str, options: list[str]) -> str:
         """
         Find first valid option for text.

         Args:
             text: The text to parse.
         """
+        options_pattern = ResponseParser.process_options(options)
+
         patterns = [
-
-
-
-
-
-
-
+            rf'[Aa]nswer:\s*({options_pattern})',
+            rf'ANSWER:\s*({options_pattern})',
+            rf'answer is \(?({options_pattern})\)?',
+            rf'[Tt]he correct answer is:\s*({options_pattern})',
+            rf'[Tt]he correct answer is:\n\s*({options_pattern})',
+            rf'[Tt]he correct answer is:\n\n-\s*({options_pattern})',
+            rf'[Tt]he answer might be:\n\n-\s*({options_pattern})',
+            rf'[Tt]he answer is \s*({options_pattern})',
         ]

         regexes = [re.compile(pattern) for pattern in patterns]
         for regex in regexes:
-
-            if
-                return
-        return ''
+            matches = regex.search(text)
+            if matches:
+                return matches.group(1)
+        return 'No valid option found'
+

     @staticmethod
-    def
-
+    def parse_bracketed_answer(text: str, options: list[str]) -> str:
+        options = ResponseParser.process_options(options)
+        # Match the first occurrence of the options in angle brackets
+        match = re.search(rf'<({options})>', text)
         if match:
             return match.group(1)
-        return ''
+        return 'No valid option found'

     @staticmethod
-    def
-
-
-
-
-
-
+    def process_options(options: list[str]) -> str:
+        # Escape each option to ensure special characters in options are treated literally
+        escaped_options = [re.escape(option) for option in options]
+        # Join options into a regex pattern separated by '|', to match any of the options
+        options_pattern = '|'.join(escaped_options)
+        return options_pattern

 def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
     """
@@ -299,3 +304,9 @@ def seed_everything(seed: int):
     torch.cuda.manual_seed_all(seed)
     torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = False
+
+if __name__ == '__main__':
+    options = ['A', 'B', 'C', 'D']
+    answers = ['Context .... ANSWER: A', 'answer: A']
+    for answer in answers:
+        print(ResponseParser.parse_first_option(answer, options))
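Taken together, the changes above rework ResponseParser so that option parsing always receives an explicit option list, regex-escapes it through the new process_options helper, and returns the sentinel string 'No valid option found' instead of an empty string when nothing matches. Below is a minimal standalone sketch of that flow, not the evalscope module itself, and covering only a subset of the answer patterns shown in the diff:

```python
import re


def process_options(options: list[str]) -> str:
    # Escape each option so characters like '.', '(' or '$' match literally,
    # then join them into an alternation pattern such as 'A|B|C|D'.
    return '|'.join(re.escape(option) for option in options)


def parse_first_option(text: str, options: list[str]) -> str:
    # Try a few of the answer patterns from the diff above, in order,
    # and return the first captured option.
    options_pattern = process_options(options)
    patterns = [
        rf'[Aa]nswer:\s*({options_pattern})',
        rf'ANSWER:\s*({options_pattern})',
        rf'answer is \(?({options_pattern})\)?',
    ]
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(1)
    return 'No valid option found'


if __name__ == '__main__':
    options = ['A', 'B', 'C', 'D']
    print(parse_first_option('Context .... ANSWER: A', options))  # -> A
    print(parse_first_option('no option given here', options))    # -> No valid option found
```

The escaping matters when options are full answer strings rather than single letters, since characters such as '(' or '.' would otherwise be interpreted as regex syntax.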
{evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.13.1
+Version: 0.14.0
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -16,11 +16,8 @@ Classifier: Programming Language :: Python :: 3.10
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: absl-py
 Requires-Dist: accelerate
-Requires-Dist: cachetools
 Requires-Dist: datasets<=3.2.0,>=3.0.0
-Requires-Dist: editdistance
 Requires-Dist: immutabledict
 Requires-Dist: jieba
 Requires-Dist: jsonlines
@@ -31,34 +28,23 @@ Requires-Dist: modelscope[framework]
 Requires-Dist: nltk>=3.9
 Requires-Dist: openai
 Requires-Dist: pandas
-Requires-Dist: plotly
 Requires-Dist: pyarrow
-Requires-Dist: pympler
 Requires-Dist: pyyaml
-Requires-Dist: regex
 Requires-Dist: requests
-Requires-Dist: requests-toolbelt
 Requires-Dist: rouge-chinese
 Requires-Dist: rouge-score>=0.1.0
 Requires-Dist: sacrebleu
 Requires-Dist: scikit-learn
 Requires-Dist: seaborn
-Requires-Dist: sentencepiece
-Requires-Dist: simple-ddl-parser
 Requires-Dist: sympy
 Requires-Dist: tabulate
-Requires-Dist: tiktoken
 Requires-Dist: torch
 Requires-Dist: tqdm
 Requires-Dist: transformers>=4.33
-Requires-Dist: transformers-stream-generator
 Requires-Dist: word2number
 Provides-Extra: all
-Requires-Dist: absl-py; extra == "all"
 Requires-Dist: accelerate; extra == "all"
-Requires-Dist: cachetools; extra == "all"
 Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
-Requires-Dist: editdistance; extra == "all"
 Requires-Dist: immutabledict; extra == "all"
 Requires-Dist: jieba; extra == "all"
 Requires-Dist: jsonlines; extra == "all"
@@ -69,32 +55,28 @@ Requires-Dist: modelscope[framework]; extra == "all"
 Requires-Dist: nltk>=3.9; extra == "all"
 Requires-Dist: openai; extra == "all"
 Requires-Dist: pandas; extra == "all"
-Requires-Dist: plotly; extra == "all"
 Requires-Dist: pyarrow; extra == "all"
-Requires-Dist: pympler; extra == "all"
 Requires-Dist: pyyaml; extra == "all"
-Requires-Dist: regex; extra == "all"
 Requires-Dist: requests; extra == "all"
-Requires-Dist: requests-toolbelt; extra == "all"
 Requires-Dist: rouge-chinese; extra == "all"
 Requires-Dist: rouge-score>=0.1.0; extra == "all"
 Requires-Dist: sacrebleu; extra == "all"
 Requires-Dist: scikit-learn; extra == "all"
 Requires-Dist: seaborn; extra == "all"
-Requires-Dist: sentencepiece; extra == "all"
-Requires-Dist: simple-ddl-parser; extra == "all"
 Requires-Dist: sympy; extra == "all"
 Requires-Dist: tabulate; extra == "all"
-Requires-Dist: tiktoken; extra == "all"
 Requires-Dist: torch; extra == "all"
 Requires-Dist: tqdm; extra == "all"
 Requires-Dist: transformers>=4.33; extra == "all"
-Requires-Dist: transformers-stream-generator; extra == "all"
 Requires-Dist: word2number; extra == "all"
 Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
 Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
+Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "all"
+Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "all"
+Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "all"
+Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "all"
 Requires-Dist: mteb==1.19.4; extra == "all"
-Requires-Dist: ragas==0.2.
+Requires-Dist: ragas==0.2.14; extra == "all"
 Requires-Dist: webdataset>0.2.0; extra == "all"
 Requires-Dist: aiohttp; extra == "all"
 Requires-Dist: fastapi; extra == "all"
@@ -107,32 +89,6 @@ Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "all"
 Provides-Extra: app
 Requires-Dist: gradio==5.4.0; extra == "app"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
-Provides-Extra: inner
-Requires-Dist: absl-py; extra == "inner"
-Requires-Dist: accelerate; extra == "inner"
-Requires-Dist: alibaba-itag-sdk; extra == "inner"
-Requires-Dist: dashscope; extra == "inner"
-Requires-Dist: editdistance; extra == "inner"
-Requires-Dist: jsonlines; extra == "inner"
-Requires-Dist: nltk; extra == "inner"
-Requires-Dist: openai; extra == "inner"
-Requires-Dist: pandas==1.5.3; extra == "inner"
-Requires-Dist: plotly; extra == "inner"
-Requires-Dist: pyarrow; extra == "inner"
-Requires-Dist: pyodps; extra == "inner"
-Requires-Dist: pyyaml; extra == "inner"
-Requires-Dist: regex; extra == "inner"
-Requires-Dist: requests==2.28.1; extra == "inner"
-Requires-Dist: requests-toolbelt==0.10.1; extra == "inner"
-Requires-Dist: rouge-score; extra == "inner"
-Requires-Dist: sacrebleu; extra == "inner"
-Requires-Dist: scikit-learn; extra == "inner"
-Requires-Dist: seaborn; extra == "inner"
-Requires-Dist: simple-ddl-parser; extra == "inner"
-Requires-Dist: streamlit; extra == "inner"
-Requires-Dist: tqdm; extra == "inner"
-Requires-Dist: transformers<4.43,>=4.33; extra == "inner"
-Requires-Dist: transformers-stream-generator; extra == "inner"
 Provides-Extra: opencompass
 Requires-Dist: ms-opencompass>=0.1.4; extra == "opencompass"
 Provides-Extra: perf
@@ -143,8 +99,12 @@ Requires-Dist: sse-starlette; extra == "perf"
 Requires-Dist: transformers; extra == "perf"
 Requires-Dist: unicorn; extra == "perf"
 Provides-Extra: rag
+Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "rag"
+Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "rag"
+Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "rag"
+Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "rag"
 Requires-Dist: mteb==1.19.4; extra == "rag"
-Requires-Dist: ragas==0.2.
+Requires-Dist: ragas==0.2.14; extra == "rag"
 Requires-Dist: webdataset>0.2.0; extra == "rag"
 Provides-Extra: vlmeval
 Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
@@ -161,7 +121,7 @@ Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
 </p>

 <p align="center">
-    <img src="https://img.shields.io/badge/python-%E2%89%A53.
+    <img src="https://img.shields.io/badge/python-%E2%89%A53.9-5be.svg">
     <a href="https://badge.fury.io/py/evalscope"><img src="https://badge.fury.io/py/evalscope.svg" alt="PyPI version" height="18"></a>
     <a href="https://pypi.org/project/evalscope"><img alt="PyPI - Downloads" src="https://static.pepy.tech/badge/evalscope"></a>
     <a href="https://github.com/modelscope/evalscope/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>
@@ -239,6 +199,9 @@ Please scan the QR code below to join our community groups:

 ## 🎉 News

+- 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
+- 🔥 **[2025.04.08]** Support for evaluating embedding model services compatible with the OpenAI API has been added. For more details, check the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html#configure-evaluation-parameters).
+- 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
 - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
 - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
 - 🔥 **[2025.03.11]** Added support for the [SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/SimpleQA/summary) and [Chinese SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/Chinese-SimpleQA/summary) evaluation benchmarks. These are used to assess the factual accuracy of models, and you can specify `simple_qa` and `chinese_simpleqa` for use. Support for specifying a judge model is also available. For more details, refer to the [relevant parameter documentation](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html).
@@ -251,15 +214,14 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
 - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
 - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
+<details><summary>More</summary>
+
 - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
 - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
 - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
 - 🔥 **[2024.10.31]** The best practice for evaluating Multimodal-RAG has been updated, please check the [📖 Blog](https://evalscope.readthedocs.io/zh-cn/latest/blog/RAG/multimodal_RAG.html#multimodal-rag) for more details.
 - 🔥 **[2024.10.23]** Supports multimodal RAG evaluation, including the assessment of image-text retrieval using [CLIP_Benchmark](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/clip_benchmark.html), and extends [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html) to support end-to-end multimodal metrics evaluation.
 - 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
-
-<details><summary>More</summary>
-
 - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
 - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
 - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.
@@ -542,6 +504,10 @@ Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.i

 

+**Supports swanlab for recording results**
+
+
+
 **Supports Speed Benchmark**

 It supports speed testing and provides speed benchmarks similar to those found in the [official Qwen](https://qwen.readthedocs.io/en/latest/benchmark/speed_benchmark.html) reports:
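On the dependency side, the METADATA diff trims a number of unconditional requirements (absl-py, cachetools, editdistance, plotly, pympler, regex, requests-toolbelt, sentencepiece, simple-ddl-parser, tiktoken, transformers-stream-generator), drops the internal "inner" extra entirely, and adds the langchain 0.3.x stack plus a ragas==0.2.14 pin to the "rag" and "all" extras. To confirm what an installed copy pulls in for a given extra, one option (assuming evalscope 0.14.0 is already installed in the current environment) is to query the distribution metadata:

```python
from importlib.metadata import requires

# Requirement strings keep their environment markers, e.g.
# 'ragas==0.2.14; extra == "rag"', so filtering on the marker text
# lists everything the "rag" extra would install.
rag_requirements = [
    req for req in (requires('evalscope') or [])
    if 'extra == "rag"' in req
]
print('\n'.join(rag_requirements))
```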
{evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/RECORD
CHANGED

@@ -1,11 +1,11 @@
 evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
-evalscope/arguments.py,sha256=
-evalscope/config.py,sha256=
+evalscope/arguments.py,sha256=OPYmX_ar7rXFm0ETPuE2hs-knDQtwQ0pFwSazjn3S9Q,5241
+evalscope/config.py,sha256=sc8NoqhspbrNYMS201ZWreCKV-tBJrUEt96vKwpqfDY,9483
 evalscope/constants.py,sha256=Cgzkoz4R3MC3YLtbCM2fmSwF8Z2kuxYdOC8t9FWJj9w,3740
-evalscope/run.py,sha256=
+evalscope/run.py,sha256=XbUhllYPjaJJuR1hPoGZH0jlW8XlvUv9gONrMBc4Ni0,6450
 evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
-evalscope/summarizer.py,sha256=
-evalscope/version.py,sha256=
+evalscope/summarizer.py,sha256=61kU5ZoSh1dd8HMJPqP3ZvJwcY9szwWFCZdu2lfATJA,5920
+evalscope/version.py,sha256=4w52xL5au75pTD-PrvG-9l-U1euGk2032efyc-7IkQw,119
 evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
 evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135
@@ -14,8 +14,8 @@ evalscope/backend/opencompass/backend_manager.py,sha256=y5NnAIY1pI7E1ZSeKU3acrD-
 evalscope/backend/opencompass/tasks/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/backend/opencompass/tasks/eval_api.py,sha256=ZaGdUbEOtAW5VX3ZXmpHIttg_QrID34EnBTylD3uvos,1152
 evalscope/backend/opencompass/tasks/eval_datasets.py,sha256=JHSq4EnPJgv4sRJJplLH80EqE3ghtkn2k8HnV6DaDew,5406
-evalscope/backend/rag_eval/__init__.py,sha256=
-evalscope/backend/rag_eval/backend_manager.py,sha256=
+evalscope/backend/rag_eval/__init__.py,sha256=Tbj7HboP5zzJ77-9qVEwwhHKjHL5V8MwLFr6sw1oeoA,291
+evalscope/backend/rag_eval/backend_manager.py,sha256=OEFADT8kdsuVMU0QOfiafzFQopY7bKbWZ_jhdXyYElY,3472
 evalscope/backend/rag_eval/clip_benchmark/__init__.py,sha256=C8Vetf52nyHiRwY2Pm74Bjn3UpWboQeghCGNh67X1EM,151
 evalscope/backend/rag_eval/clip_benchmark/arguments.py,sha256=d5UkbC3RXb6iyzy_ILumToAVO1AdwvDeyOiX5KB2u0g,1530
 evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py,sha256=anuIhRk9OC8y0LNBjvttSXppc99gbz-f0TYQjnyLLyU,8347
@@ -27,7 +27,7 @@ evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py,sha256=t0U
 evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py,sha256=rZY-TulG-Cb8b6GTBxqTDYQ_4Ois3kbgKhuunZq8Ato,8407
 evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt,sha256=eiiAaxhS48b5rVLy5O9VvFfV2AfxY86ITu_iqT7ZLkQ,649
 evalscope/backend/rag_eval/cmteb/__init__.py,sha256=I502GHPFYo8BwlFvoljGKI24PY76eBXJQiquWk8nJNU,280
-evalscope/backend/rag_eval/cmteb/arguments.py,sha256=
+evalscope/backend/rag_eval/cmteb/arguments.py,sha256=y2iTbs3a7R747NgS00nK2j3zO7gmREh8n7mWMrzF1js,2653
 evalscope/backend/rag_eval/cmteb/base.py,sha256=UCobQ81dHkiTmIz_0BJ_VANj_uG6mkJbYLKJztvMXfo,2849
 evalscope/backend/rag_eval/cmteb/task_template.py,sha256=FyFs1reefcsFCrWyi7Ya5dnFYvBhtxph2wIaFtOtFls,2595
 evalscope/backend/rag_eval/cmteb/tasks/Classification.py,sha256=sqbH0XmSiIm4n5UX5sXMwJHby1r-d35mwW1tKIhb2Hg,10848
@@ -39,32 +39,36 @@ evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py,sha256=ofmmeoieXHmU6O14JKWO9
 evalscope/backend/rag_eval/cmteb/tasks/STS.py,sha256=uhGLsQTo5lM3-L2Na3WJGqOLQw3c1WxHDA22ePJPxtU,12285
 evalscope/backend/rag_eval/cmteb/tasks/__init__.py,sha256=PKBNyp45hIa3FYNA1psiwtwfwUcn7s9eNt6r5aUpyyY,1505
 evalscope/backend/rag_eval/ragas/__init__.py,sha256=D0yJkN9SuNGIAL3niZw4BI08Yh3HznsUUewdIAa_-LM,171
-evalscope/backend/rag_eval/ragas/arguments.py,sha256=
+evalscope/backend/rag_eval/ragas/arguments.py,sha256=S6M1nsqwMQ8lnZZDtlQTdzyOCfLn9WP0QJ_7wAEsVgc,1695
 evalscope/backend/rag_eval/ragas/task_template.py,sha256=a_3bWfLx0j2zJkWgEWNStO0XXAeUFdnFpeukpoGfxLg,1669
 evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py,sha256=fX9sCci787ViGiL3BhGsykx0bnWfOWWEFueaJKyR8g4,793
 evalscope/backend/rag_eval/ragas/tasks/__init__.py,sha256=hErdWKbvV9aRqOpQTzdFHw1tcYoDbnttmic7GpZzKx8,173
 evalscope/backend/rag_eval/ragas/tasks/build_distribution.py,sha256=vFfemiqtPx22u5pwwZxEQJKYf3B9efYmwbpWDI5hY30,1491
 evalscope/backend/rag_eval/ragas/tasks/build_transform.py,sha256=GtAYqdVOy7BxIGyC4rSZ_UfXagKYzE6eEtXbaOI_g-k,5425
-evalscope/backend/rag_eval/ragas/tasks/testset_generation.py,sha256=
-evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=
+evalscope/backend/rag_eval/ragas/tasks/testset_generation.py,sha256=YSqpaXMFVe8mkVfq3i_oJg1MSnPm98E7WdOBdyUwMpA,5784
+evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=6x-4O2pgsjZCVfJNvwZEKcgLe_QhSknPg-f2jGjZkU4,1890
 evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/backend/rag_eval/utils/clip.py,sha256=GLHhPCac2AH35AvRLvVqePA1gIMAewHTFmCJCDZzvqU,5015
-evalscope/backend/rag_eval/utils/embedding.py,sha256=
-evalscope/backend/rag_eval/utils/llm.py,sha256=
+evalscope/backend/rag_eval/utils/embedding.py,sha256=tFMepPAMO4Kkqeqh-XxXIDYRjGbCMlk7lwuUW7FNvCA,7977
+evalscope/backend/rag_eval/utils/llm.py,sha256=acaD5QHPJUstJGpW1sNJ-3ZPT5J_Z8beOWb61Rtz07U,2607
 evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KBNeXiwQw5SpJA,1529
 evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
-evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=
-evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
+evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=sUYvQxCtPl6CrcwhQpY8lJjW5skqWc-fvHUSnXd_MvQ,6054
 evalscope/benchmarks/__init__.py,sha256=b_SWdV1ZyOqFiwc_9lIjKrIvK1rwnF2cCIF7XN9CN8E,932
 evalscope/benchmarks/benchmark.py,sha256=a_7Ctz36McuTyBSTYi56jis9pvOdWhg7JVSPFrbxqR4,2535
-evalscope/benchmarks/data_adapter.py,sha256=
+evalscope/benchmarks/data_adapter.py,sha256=lwW23GjHHAptv4mc1u3xLlKqiRI1EfbSqaG3QGmxqEQ,17750
 evalscope/benchmarks/utils.py,sha256=6kxeBz4w8Fw68AYH05a4ncjgkaUV4bU3eaFVLqOdkMI,1321
 evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/aime/aime24_adapter.py,sha256=dBm9yukt4-CByEPUlAPAIN6mL3VkZcI-dw2kz4oQBMo,1715
 evalscope/benchmarks/aime/aime25_adapter.py,sha256=FB_NufY2V7uYdxVnrY_4y81gyyfYDnvedz1_zHdDWt4,1709
+evalscope/benchmarks/alpaca_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=2a6wHJSLe89Xh18u1LBkMQEZzfOURiek6o0-k2lCQgM,4065
 evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
-evalscope/benchmarks/arc/arc_adapter.py,sha256=
+evalscope/benchmarks/arc/arc_adapter.py,sha256=U-yPDAjYkPUUOXYjCM1ajdvlUVcdeuVoMK7yWJcX6LI,6369
+evalscope/benchmarks/arena_hard/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+evalscope/benchmarks/arena_hard/arena_hard_adapter.py,sha256=bdQfLTWB5pFo4hET0uFqu5zMX9PNQNwdoLoGrL5jCBE,6213
+evalscope/benchmarks/arena_hard/utils.py,sha256=NstI1VR5fTaT-bfXRj0cLqm0DtH8EY4EQHR-K9HJubI,5089
 evalscope/benchmarks/bbh/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/benchmarks/bbh/bbh_adapter.py,sha256=fROpzenrjpEBWtnvM_RL_m0uXPOhXTtYAglJEZbzUdY,8330
 evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt,sha256=xnzlaIRyeGlogG49v8nt4vpJO40J06ev4yc8cv0VSRY,1781
@@ -98,20 +102,20 @@ evalscope/benchmarks/ceval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTP
 evalscope/benchmarks/ceval/ceval_adapter.py,sha256=E4QobCjSSkMZtPJyaT_XBVxiqEqa1bta1I9aFnaHOqs,11308
 evalscope/benchmarks/ceval/ceval_exam.py,sha256=ngOvb6Fymt7iPWIb2fzrUVpqmUT2VBoqh7X_IH8Bcsc,4824
 evalscope/benchmarks/chinese_simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=
+evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py,sha256=fYvkJn1UcWM3aqhPMTTtBPVzjTL-Rm_g9UwUJx1FvJc,8106
 evalscope/benchmarks/cmmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/benchmarks/cmmlu/cmmlu.py,sha256=Y59NIGUFzJEztJbkehZsG4Cz0J_v9Cyju6xazHMYIcA,5022
 evalscope/benchmarks/cmmlu/cmmlu_adapter.py,sha256=TTq2jRz46Hqc_D_ZBaiw_OwKub1FZX6w8C7g7COIdGs,10372
 evalscope/benchmarks/cmmlu/samples.jsonl,sha256=FXbyPQSDorKBGSD0lnOzioZmFjG07lIL87FRDRaMPSY,1722
 evalscope/benchmarks/competition_math/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/benchmarks/competition_math/competition_math.py,sha256=Cehyokift7oDKjc8TdmfblZ6mMc39wQWtqqbUi34QLc,2629
-evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=
+evalscope/benchmarks/competition_math/competition_math_adapter.py,sha256=lD7sDro0dSWKgYaM_ZgWbBdetxVURpjo_2q1gvVt1XU,6815
 evalscope/benchmarks/data_collection/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=
+evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=ecNwAE3p2eKIeC4whSUdZpeJ8NgidbSFZbIYtSW26Xo,2394
 evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=ZVGzUuuQ0UTOqQtXE40ZyBeMOSl8saSiFEQ5_siJ-c8,5052
 evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=
+evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=8d5znAcQmFSmvyKV-JuMQzbY5k6xDNQQdrWZ7zgPTK4,4603
 evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
 evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=UB287DtnbkSQXZsbReFJqmQRwbo672DTCeXXilR_-Vc,4790
@@ -120,12 +124,12 @@ evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTv
 evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=ZZZ-9oja53IwiU33Kjm7NTk4MbFGWyvonhnHrn_3Na8,10557
 evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
-evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=
+evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=SRM_-AKlWtKXi4zrlBAH9YceFnrktZDNsjvQOiPizUM,5893
 evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
 evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=UOjakV31J0g7TYbrRls0ItcopWOJu54ucPfaqSJB7Os,5250
 evalscope/benchmarks/ifeval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=
+evalscope/benchmarks/ifeval/ifeval_adapter.py,sha256=xuQ1EK8Af_093qqeOXPIp_iqTWcG5KGOtE6r5hx3958,1858
 evalscope/benchmarks/ifeval/instructions.py,sha256=oaJ9D_4rvS67BraHBNPpDtFd4TblFAnR4A3YW9HWfuY,56304
 evalscope/benchmarks/ifeval/instructions_registry.py,sha256=tVUmhuSwnOidLtI8onOAw_gpJ6bi8FL07GiX19hSuo8,7288
 evalscope/benchmarks/ifeval/instructions_util.py,sha256=vkemXeylJMmgW8LgfQe4cSy2OF-oH_NcSZtzyZDURW4,25780
@@ -134,23 +138,26 @@ evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJ
 evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=16whmFkJt9fLbei9d-kmjnWB_5y5vsiX9tK5kSuxDw8,2449
 evalscope/benchmarks/live_code_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/live_code_bench/evaluate_utils.py,sha256=rOWaG8PV4AGIRhS_gqwxEhphEVe1Cqg57Eudwm5HTjI,6820
-evalscope/benchmarks/live_code_bench/execute_utils.py,sha256=MreaMLI0IicNZawpfqcyoRLt67EZ3CJvmxxRTYwhAbU,7397
 evalscope/benchmarks/live_code_bench/extract_utils.py,sha256=ZcQ8y741uawPo6I_1_XglR3eqJFDNrqc8fILKZupVRs,2375
-evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=
+evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=vLr43hvtR0WS9GclJ6xL9MIqwC941EiRSqgZ_hGHZnw,3382
 evalscope/benchmarks/live_code_bench/load_utils.py,sha256=5i9wtdPLYR8ckjx5MaYQVC2LFYvjKzR6Fa6UZmeOTRc,2445
 evalscope/benchmarks/live_code_bench/pass_k_utils.py,sha256=Ktrp_lXdfFzoHtQNQNdGfIl26ySjaPCHm4Zv-dFvRqM,2024
 evalscope/benchmarks/live_code_bench/prompts.py,sha256=P4KILIAIDT1MKDck0xHYV_6v9820wDZRhxVMazmlL-g,12600
-evalscope/benchmarks/live_code_bench/testing_util.py,sha256=
+evalscope/benchmarks/live_code_bench/testing_util.py,sha256=v4N7Y4MasNL6TjC4w-Duw_4Zn0oLdWAw3HU6ZrM76P8,17161
+evalscope/benchmarks/maritime_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py,sha256=RVbsiglxmEW37-tDYgr4Drywh26I94DRGhwv7uP2aYk,2829
 evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/math_500/math_500_adapter.py,sha256=SB2eb4Z7DTXdptqirEoctqTdDLEu28s7bLeCAMBmAFo,1923
 evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
-evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=
+evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=bQSRTgXk01pCfKdmTxr3si4FxET3j_yBVVmQlLchTns,11586
 evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
 evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=
+evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=hPqxDqDhqin3TxfimfhIxfEc_8UfzTDGAfX7iDrWy28,4248
+evalscope/benchmarks/mmlu_redux/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py,sha256=Kr30i_exxBJRz9PLB5g6F04e2HJ4WuF6LDyAwaRh2MY,9578
 evalscope/benchmarks/musr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalscope/benchmarks/musr/musr_adapter.py,sha256=
+evalscope/benchmarks/musr/musr_adapter.py,sha256=85P0sY7H9pthYdCjkE2AOxaiNhcIBW1iZmODkz3FN0M,2464
 evalscope/benchmarks/process_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/process_bench/critique_template.txt,sha256=tycx8n42QEC0uGcwbIvHfZvfTnchlRxGz8Tp1R2_e_Y,489
 evalscope/benchmarks/process_bench/process_bench_adapter.py,sha256=ydU-r1T0DaYhOxkhZgGL7PhDd4XoeqOBzVO9oiFPd8M,3422
@@ -159,7 +166,7 @@ evalscope/benchmarks/race/race.py,sha256=TtFC3opqEA6q8AQIAFQRGx07FjD9z7iW8wmtxeO
 evalscope/benchmarks/race/race_adapter.py,sha256=RD0B-i5dzeNKuhqnWbremgf4tk9jmOO4_eLAiITB1F0,6381
 evalscope/benchmarks/race/samples.jsonl,sha256=bhSktBgU6axYQCClRtQ7nN8D1x815AU8xMAIG1oflG0,1243
 evalscope/benchmarks/simple_qa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=
+evalscope/benchmarks/simple_qa/simple_qa_adapter.py,sha256=CsRUJ0v1sSUmtO6QWkdzisn9OHN-1JSXB-9ghOuNqgY,8988
 evalscope/benchmarks/super_gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/super_gpqa/five_shot_prompt.txt,sha256=vD3RMeQustxY_oWA8IobntjywT8ZUO7Jaub--rElDT4,4718
 evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py,sha256=BqNLL8BYnK6tRuIdV6ijL4Uym2SejH_h1BV06XNjSE4,9331
@@ -180,7 +187,7 @@ evalscope/cli/start_eval.py,sha256=MXhVDeaMFd6ny88-gnVtQflH660UaDj240YGYnHccx8,7
 evalscope/cli/start_perf.py,sha256=5hLi5jWgM9BJPXLd8d9D1zqrcj_5c0KvkfB1DgD4_RU,831
 evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
 evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
-evalscope/collections/evaluator.py,sha256=
+evalscope/collections/evaluator.py,sha256=4IkdbKySOW-MzH9Zjn0uddQviFLe2pOef746fgbjkJo,12784
 evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
 evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
 evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103
@@ -190,7 +197,7 @@ evalscope/evaluator/reviewer/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0Fw
 evalscope/evaluator/reviewer/auto_reviewer.py,sha256=PDz1Nt_qq0oGZBBwek2-M8VBUNLkOkmYObzR8gO7nEc,16624
 evalscope/metrics/__init__.py,sha256=SWvqzUzdryW5URz6u4fPkP9XSyA09nQ8zBeE8BbchSg,349
 evalscope/metrics/code_metric.py,sha256=EXE2BZAc9JJT_cpd6eCb0Jo9wwtnzXzHBZxmLzG5Jpw,3300
-evalscope/metrics/llm_judge.py,sha256=
+evalscope/metrics/llm_judge.py,sha256=Di0Q1c6VHLl0nQ_TVOZOOQlMApDIU83HuDPTOV8XrTA,4023
 evalscope/metrics/math_parser.py,sha256=uTDudn305G3b8-GboWTrDE6OfrEwAW-areHnoGXZ6Is,17302
 evalscope/metrics/metrics.py,sha256=_YI7RhxlFu_JOgeE3LF9UKu6mJruvyu4FgqVf78Bjb8,13813
 evalscope/metrics/named_metrics.py,sha256=pSHA2_qdi9B5bDHIh08GYhx63odilSwA_T-95K1Usl0,1380
@@ -201,7 +208,7 @@ evalscope/metrics/resources/gpt2-zhcn3-v4.bpe,sha256=J_K-oSt9usFEw87Av6j7ETn3J48
 evalscope/metrics/resources/gpt2-zhcn3-v4.json,sha256=WkM4J_FDPPNQwYi0kj5sM5SVjk2_6bci7tqf8dV9p_U,1289341
 evalscope/models/__init__.py,sha256=i9vcOBMEF_UM7C2gpmh2GsQk3njwqevoQ6A4CnP1fHs,1000
 evalscope/models/base_adapter.py,sha256=7PbRwfD5PIZCBYVds6ZHI8TBY9C5i2LdPOTu88FJWlY,3414
-evalscope/models/chat_adapter.py,sha256=
+evalscope/models/chat_adapter.py,sha256=2XZmdhxnvy4yezPLXNVRbgrs0QkUY2VznEBq5mCYjKs,7106
 evalscope/models/choice_adapter.py,sha256=fnJdo-FMJ-zvNLbEJGc73odgWXIxtVudL00JIf2vzsA,8239
 evalscope/models/custom_adapter.py,sha256=AGztmZ0aT0g2flh4B4NaiZ8LCDg8tT0gVNxmrP5W1mA,2401
 evalscope/models/local_model.py,sha256=yydggBCLcBAmUWbBhv7o2CA3RbG0DwDZharPdrkbNcg,2628
@@ -212,32 +219,33 @@ evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk
 evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
 evalscope/models/custom/dummy_model.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
 evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalscope/perf/arguments.py,sha256=
-evalscope/perf/benchmark.py,sha256=
-evalscope/perf/http_client.py,sha256
+evalscope/perf/arguments.py,sha256=UZKlkbDE2N408dY8Ji-WB8sl1rcmamywzxLvNXpnY0w,10194
+evalscope/perf/benchmark.py,sha256=nv7gtCkeKnLKQQiKM4G0MYO2ambcuwsbx67OgEQG0nM,7917
+evalscope/perf/http_client.py,sha256=-c3-N7bxKsj3d5DVsKSaYA3XAHJDzZgoqZBbhuDYIGk,7419
 evalscope/perf/main.py,sha256=w-yDbl0osaTAMgC-JNPpqIq2LQ7U4c-Ht7Amj8Nbjc8,1278
 evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
 evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
 evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
 evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqYbndU,2118
-evalscope/perf/plugin/api/custom_api.py,sha256=
+evalscope/perf/plugin/api/custom_api.py,sha256=ssE4J8AynA0n5SnXSQyk7K5Co3dwUN6Opph08clZna0,3785
 evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
-evalscope/perf/plugin/api/openai_api.py,sha256=
+evalscope/perf/plugin/api/openai_api.py,sha256=kTL_2OACuKhzd2W0Pf4DirpMumzk4V3rqKZ2mvBZVCs,7655
 evalscope/perf/plugin/datasets/__init__.py,sha256=Z6Jc0RxJS_z0nBBV1-b0-56Ija60AtQ7I_67gY6ZfdQ,568
 evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
-evalscope/perf/plugin/datasets/custom.py,sha256=
+evalscope/perf/plugin/datasets/custom.py,sha256=npreC7H1VsdTGYkqlMESvyOhtXOfZQA7_-ICmxe3FWk,936
 evalscope/perf/plugin/datasets/flickr8k.py,sha256=UzAIFIO0m5inWOkWM1mO6wfV2HOuXAqiTxCJ4b0SiZM,1589
-evalscope/perf/plugin/datasets/line_by_line.py,sha256=
-evalscope/perf/plugin/datasets/longalpaca.py,sha256=
-evalscope/perf/plugin/datasets/openqa.py,sha256=
-evalscope/perf/plugin/datasets/random_dataset.py,sha256=
-evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=
+evalscope/perf/plugin/datasets/line_by_line.py,sha256=AqZYG6tVL3BIGnzh_2Tev8lDYezJG_1gqJY8bSNQl3Q,957
+evalscope/perf/plugin/datasets/longalpaca.py,sha256=XelLris0-c3StLInQ-Oav4jqGcXPNfJxEDeYvaetEbI,1297
+evalscope/perf/plugin/datasets/openqa.py,sha256=4Pnx5duFJzoiTUfZCbcK7LO8f-skmcpYNUUrtNR_UUc,1463
+evalscope/perf/plugin/datasets/random_dataset.py,sha256=SIlsjAE_Stknfr6o1CBFvANBGCSgSExFbscLwSM_Gmk,2958
+evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=J6q7AF_Re5eHLVejXEw9c1jlk1T1PPmist0yO9UFTPE,2432
 evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/perf/utils/analysis_result.py,sha256=ESzaZHGTpr2LoJR3jpOzqMphxSrr79d364ZzD159PmY,1169
-evalscope/perf/utils/benchmark_util.py,sha256=
-evalscope/perf/utils/db_util.py,sha256=
+evalscope/perf/utils/benchmark_util.py,sha256=XrpB6ISjY2p1ngwPr5eOQS7O_I1kmlbEn2wCwsC_5AA,6278
+evalscope/perf/utils/db_util.py,sha256=VDqiM6xOK7fSneU3YOOU-78LWB8El3mxj_Ixtw2gX3o,9051
 evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
 evalscope/perf/utils/local_server.py,sha256=clF8i0UFmaxBBB6gX05KvVCyzSv0xzsAidz0_sLLlAk,4627
+evalscope/perf/utils/log_utils.py,sha256=1jmB31W3ol9ukPAPbQ8xG3yoZ9oi3tjEyMK5M3ERmbw,1471
 evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/registry/config/cfg_arena.yaml,sha256=rub6ceaQxxB1mbSjdoFf0IaVgGfbOonV2nYRebv2OKo,3292
 evalscope/registry/config/cfg_arena_zhihu.yaml,sha256=tvvihBwvoTjoezwTSaZwoGOB44ysofpnin4pNyY9TfQ,2755
@@ -260,7 +268,7 @@ evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHM
 evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
 evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
 evalscope/report/__init__.py,sha256=0Wes3ot2hy9s-WwZaBztst8qkNrXkOF-Hwa1WW1e8lY,260
-evalscope/report/app.py,sha256=
+evalscope/report/app.py,sha256=Lew--YreNeuyLVktnUNZKIfGvnGE_oAD054kZB-YTHo,26904
 evalscope/report/combinator.py,sha256=O3QirwtYhDhdaWVT4STJMCGZMwoX8BTeJ3HtS9iwnWQ,2567
 evalscope/report/generator.py,sha256=2DULY9W8QCUxdtyfNjo8XAP_YxI1LgR95jknK__kYPU,3600
 evalscope/report/utils.py,sha256=DRlbjbqHEmM8rGlA4pwtlHFhOZtyUzcqiS-mejfIDkU,4584
@@ -307,28 +315,28 @@ evalscope/utils/filters.py,sha256=x_NX40uWMmUsVrAGHCeeV2e63HZZFugWUgdUhk64ivM,15
 evalscope/utils/io_utils.py,sha256=Tjdgen1FsAA4ArqiUzu734L0Px5NuiS0GKRRiGIzxSA,4192
 evalscope/utils/logger.py,sha256=barHSdtbEu21ynGQj_wS-rd7B02wPPR5AgaWCQzvG4w,3638
 evalscope/utils/model_utils.py,sha256=hB9W334ecAb6553FhooT6_jM0g-tjj6AU48IV3K1CKw,1131
-evalscope/utils/utils.py,sha256=
+evalscope/utils/utils.py,sha256=VuGdJh3xZAZ-cRoGcKeJTx3z8sgSs2eMjH-1JX2ZYOU,10615
 tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
 tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-tests/cli/test_all.py,sha256=
-tests/cli/test_collection.py,sha256=
-tests/cli/test_run.py,sha256=
+tests/cli/test_all.py,sha256=pwup--iNxckUEsR_aFjIAbEQo3UogSu5aIWf9ryLP2o,4022
+tests/cli/test_collection.py,sha256=y8FjoPziPRf5BdJK8DHjcXn26ETKz1OyqjnCpwjt-F4,4096
+tests/cli/test_run.py,sha256=RW4AkJILqzzyd0wuIdy8Y9SB_4koSRJFezGjFdXdLJI,16549
 tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-tests/perf/test_perf.py,sha256=
+tests/perf/test_perf.py,sha256=BXd6SCMbBDKmh-P_KGTOpuwVQZ05xCKjvH01zGyvBJI,3787
 tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tests/rag/test_clip_benchmark.py,sha256=
-tests/rag/test_mteb.py,sha256=
-tests/rag/test_ragas.py,sha256=
+tests/rag/test_clip_benchmark.py,sha256=ZCBtgnF8Vuji6WQlb92-_RIvXlUX_Xt-cHZP4AN_DNI,2552
+tests/rag/test_mteb.py,sha256=YJw6X1jwX6SYNB-ryVb-OHJWu3vsE3Y4STATI75rdG0,5619
+tests/rag/test_ragas.py,sha256=E7rfKpKtBqglOL1GcW9adfY8nsOZMuoB8GC55UL1Q3c,4517
 tests/swift/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 tests/swift/test_run_swift_eval.py,sha256=JKG-0BwTxkbg-XeiXxujPqnVIM3f2EFaJ_9a7p_R4dk,5748
 tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZUQhtYO5e0o,4898
 tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
 tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-tests/vlm/test_vlmeval.py,sha256=
-evalscope-0.
-evalscope-0.
-evalscope-0.
-evalscope-0.
-evalscope-0.
-evalscope-0.
+tests/vlm/test_vlmeval.py,sha256=UqRiBPMU3vRtLIG1Qu4ZVhyUQx-zGYQuLCgobwf-7a4,3176
+evalscope-0.14.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
+evalscope-0.14.0.dist-info/METADATA,sha256=HQ1pt-YU950AcwwWiypjGcWg0wYU9n6PFZ7j6PG4uHg,33040
+evalscope-0.14.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+evalscope-0.14.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
+evalscope-0.14.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
+evalscope-0.14.0.dist-info/RECORD,,