evalscope 0.13.2__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/backend/rag_eval/__init__.py +1 -1
- evalscope/backend/rag_eval/backend_manager.py +21 -5
- evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
- evalscope/backend/rag_eval/ragas/arguments.py +0 -1
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
- evalscope/backend/rag_eval/utils/embedding.py +49 -3
- evalscope/backend/rag_eval/utils/llm.py +4 -4
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/data_adapter.py +6 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +1 -3
- evalscope/benchmarks/live_code_bench/testing_util.py +365 -549
- evalscope/benchmarks/maritime_bench/__init__.py +0 -0
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +5 -7
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/collections/evaluator.py +4 -2
- evalscope/config.py +1 -1
- evalscope/perf/arguments.py +24 -5
- evalscope/perf/benchmark.py +28 -42
- evalscope/perf/http_client.py +2 -3
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +2 -2
- evalscope/perf/plugin/datasets/custom.py +4 -1
- evalscope/perf/plugin/datasets/line_by_line.py +4 -1
- evalscope/perf/plugin/datasets/longalpaca.py +4 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -1
- evalscope/perf/plugin/datasets/random_dataset.py +13 -6
- evalscope/perf/utils/benchmark_util.py +12 -6
- evalscope/perf/utils/db_util.py +1 -1
- evalscope/perf/utils/log_utils.py +41 -0
- evalscope/report/app.py +11 -11
- evalscope/run.py +7 -0
- evalscope/summarizer.py +2 -1
- evalscope/utils/utils.py +36 -25
- evalscope/version.py +2 -2
- {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/METADATA +20 -15
- {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/RECORD +55 -54
- tests/cli/test_all.py +4 -4
- tests/cli/test_collection.py +2 -1
- tests/cli/test_run.py +9 -8
- tests/perf/test_perf.py +1 -2
- tests/rag/test_clip_benchmark.py +0 -1
- tests/rag/test_mteb.py +37 -8
- tests/rag/test_ragas.py +29 -26
- tests/vlm/test_vlmeval.py +37 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
- evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
- {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/LICENSE +0 -0
- {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/WHEEL +0 -0
- {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/top_level.txt +0 -0
evalscope/report/app.py
CHANGED
@@ -44,7 +44,7 @@ def scan_for_report_folders(root_path):
                 continue
             datasets = []
             for dataset_item in glob.glob(os.path.join(model_item, '*.json')):
-                datasets.append(os.path.basename(dataset_item))
+                datasets.append(os.path.splitext(os.path.basename(dataset_item))[0])
             datasets = DATASET_TOKEN.join(datasets)
             reports.append(
                 f'{os.path.basename(folder)}{REPORT_TOKEN}{os.path.basename(model_item)}{MODEL_TOKEN}{datasets}')

@@ -253,17 +253,17 @@ def process_model_prediction(item: Any):
 
 
 def normalize_score(score):
-
-
-
-
-
-
-        else:
-            try:
-                return float(score)
-            except (ValueError, TypeError):
+    try:
+        if isinstance(score, bool):
+            return 1.0 if score else 0.0
+        elif isinstance(score, dict):
+            for key in score:
+                return float(score[key])
             return 0.0
+        else:
+            return float(score)
+    except (ValueError, TypeError):
+        return 0.0
 
 
 def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subset_name: str):
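The new `normalize_score` folds every branch into one try/except, so booleans map to 1.0/0.0, dicts yield the float of their first value (0.0 for an empty dict), and anything unconvertible falls back to 0.0. A standalone copy of the function from the hunk above, with a few checks:

```python
def normalize_score(score):
    try:
        if isinstance(score, bool):
            return 1.0 if score else 0.0
        elif isinstance(score, dict):
            for key in score:
                return float(score[key])  # first value only
            return 0.0                    # empty dict
        else:
            return float(score)
    except (ValueError, TypeError):
        return 0.0

assert normalize_score(True) == 1.0
assert normalize_score({'acc': '0.75'}) == 0.75
assert normalize_score('n/a') == 0.0
```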
evalscope/run.py
CHANGED
@@ -58,10 +58,17 @@ def setup_work_directory(task_cfg: TaskConfig, run_time: str):
 
     outputs = OutputsStructure(outputs_dir=task_cfg.work_dir)
 
+    # Unify the output directory structure
     if task_cfg.eval_backend == EvalBackend.OPEN_COMPASS:
         task_cfg.eval_config['time_str'] = run_time
     elif task_cfg.eval_backend == EvalBackend.VLM_EVAL_KIT:
         task_cfg.eval_config['work_dir'] = task_cfg.work_dir
+    elif task_cfg.eval_backend == EvalBackend.RAG_EVAL:
+        from evalscope.backend.rag_eval import Tools
+        if task_cfg.eval_config['tool'].lower() == Tools.MTEB:
+            task_cfg.eval_config['eval']['output_folder'] = task_cfg.work_dir
+        elif task_cfg.eval_config['tool'].lower() == Tools.CLIP_BENCHMARK:
+            task_cfg.eval_config['eval']['output_dir'] = task_cfg.work_dir
     return outputs
 
 
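This extends the output-directory unification to the RAG backend: MTEB results are routed through `eval['output_folder']` and CLIP Benchmark results through `eval['output_dir']`, both pointed at the task's `work_dir`. A hedged sketch of a config this code path would rewrite; `EvalBackend.RAG_EVAL` and the `tool`/`eval` keys come from the diff, while the `EvalBackend` import path and the empty `eval` dict are assumptions:

```python
from evalscope.config import TaskConfig
from evalscope.constants import EvalBackend  # assumed import location

task_cfg = TaskConfig(
    eval_backend=EvalBackend.RAG_EVAL,
    eval_config={
        'tool': 'MTEB',  # compared case-insensitively against Tools.MTEB
        'eval': {},      # setup_work_directory() adds eval['output_folder'] = work_dir
    },
)
```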
evalscope/summarizer.py
CHANGED
@@ -105,7 +105,8 @@ class Summarizer:
                 summary_res: dict = csv_to_list(summary_file_path)[0]
             elif summary_file_path.endswith('json'):
                 summary_res: dict = json_to_dict(summary_file_path)
-
+            base_name = os.path.basename(summary_file_path)
+            file_name = os.path.splitext(base_name)[0]
             final_res_list.append({file_name: summary_res})
 
         elif eval_backend == EvalBackend.THIRD_PARTY:
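The two inserted lines simply derive the dict key from the summary file name itself, stripping directory and extension; for example:

```python
import os

summary_file_path = 'outputs/20250410_120000/summary.json'  # illustrative path
base_name = os.path.basename(summary_file_path)  # 'summary.json'
file_name = os.path.splitext(base_name)[0]       # 'summary'
```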
evalscope/utils/utils.py
CHANGED
@@ -90,7 +90,7 @@ class ResponseParser:
         return ''
 
     @staticmethod
-    def parse_first_option_with_choices(text: str, options: list) -> str:
+    def parse_first_option_with_choices(text: str, options: list[str]) -> str:
         """
         Find first valid option for text.
 

@@ -98,7 +98,7 @@ class ResponseParser:
             text: The text to parse.
             options: The options to find. e.g. ['A', 'B', 'C', 'D']
         """
-        options_concat =
+        options_concat = ResponseParser.process_options(options)
 
         patterns = [
             rf'答案是?\s?([{options_concat}])',

@@ -155,48 +155,53 @@ class ResponseParser:
         for i in options:
             if i in outputs:
                 return i
-        return ''
+        return 'No valid option found'
 
     @staticmethod
-    def parse_first_option(text: str) -> str:
+    def parse_first_option(text: str, options: list[str]) -> str:
         """
         Find first valid option for text.
 
         Args:
             text: The text to parse.
         """
+        options_pattern = ResponseParser.process_options(options)
+
         patterns = [
-
-
-
-
-
-
-
+            rf'[Aa]nswer:\s*({options_pattern})',
+            rf'ANSWER:\s*({options_pattern})',
+            rf'answer is \(?({options_pattern})\)?',
+            rf'[Tt]he correct answer is:\s*({options_pattern})',
+            rf'[Tt]he correct answer is:\n\s*({options_pattern})',
+            rf'[Tt]he correct answer is:\n\n-\s*({options_pattern})',
+            rf'[Tt]he answer might be:\n\n-\s*({options_pattern})',
+            rf'[Tt]he answer is \s*({options_pattern})',
         ]
 
         regexes = [re.compile(pattern) for pattern in patterns]
         for regex in regexes:
-
-            if
-                return
-        return ''
+            matches = regex.search(text)
+            if matches:
+                return matches.group(1)
+        return 'No valid option found'
+
 
     @staticmethod
-    def
-
+    def parse_bracketed_answer(text: str, options: list[str]) -> str:
+        options = ResponseParser.process_options(options)
+        # Match the first occurrence of the options in angle brackets
+        match = re.search(rf'<({options})>', text)
         if match:
             return match.group(1)
-        return ''
+        return 'No valid option found'
 
     @staticmethod
-    def
-
-
-
-
-
-
+    def process_options(options: list[str]) -> str:
+        # Escape each option to ensure special characters in options are treated literally
+        escaped_options = [re.escape(option) for option in options]
+        # Join options into a regex pattern separated by '|', to match any of the options
+        options_pattern = '|'.join(escaped_options)
+        return options_pattern
 
 def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
     """

@@ -299,3 +304,9 @@ def seed_everything(seed: int):
     torch.cuda.manual_seed_all(seed)
     torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = False
+
+if __name__ == '__main__':
+    options = ['A', 'B', 'C', 'D']
+    answers = ['Context .... ANSWER: A', 'answer: A']
+    for answer in answers:
+        print(ResponseParser.parse_first_option(answer, options))
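The common thread in these hunks is the new `process_options` helper: options are regex-escaped and joined into an alternation before being interpolated into the answer patterns, and the parsers now return the sentinel string `'No valid option found'` instead of `''`. A self-contained sketch of the bracketed-answer path, mirroring the diffed code:

```python
import re

def process_options(options: list[str]) -> str:
    # Escape each option so characters like '(' or '+' are matched literally.
    return '|'.join(re.escape(option) for option in options)

def parse_bracketed_answer(text: str, options: list[str]) -> str:
    # Match the first occurrence of an option wrapped in angle brackets.
    match = re.search(rf'<({process_options(options)})>', text)
    return match.group(1) if match else 'No valid option found'

print(parse_bracketed_answer('The best choice is <C>.', ['A', 'B', 'C', 'D']))  # -> C
```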
evalscope/version.py
CHANGED
{evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.13.2
+Version: 0.14.0
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team

@@ -71,12 +71,12 @@ Requires-Dist: transformers>=4.33; extra == "all"
 Requires-Dist: word2number; extra == "all"
 Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
 Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
-Requires-Dist: langchain<0.3.0; extra == "all"
-Requires-Dist: langchain-community<0.3.0; extra == "all"
-Requires-Dist: langchain-core<0.3.0; extra == "all"
-Requires-Dist: langchain-openai<0.3.0; extra == "all"
+Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "all"
+Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "all"
+Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "all"
+Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "all"
 Requires-Dist: mteb==1.19.4; extra == "all"
-Requires-Dist: ragas==0.2.
+Requires-Dist: ragas==0.2.14; extra == "all"
 Requires-Dist: webdataset>0.2.0; extra == "all"
 Requires-Dist: aiohttp; extra == "all"
 Requires-Dist: fastapi; extra == "all"

@@ -99,12 +99,12 @@ Requires-Dist: sse-starlette; extra == "perf"
 Requires-Dist: transformers; extra == "perf"
 Requires-Dist: unicorn; extra == "perf"
 Provides-Extra: rag
-Requires-Dist: langchain<0.3.0; extra == "rag"
-Requires-Dist: langchain-community<0.3.0; extra == "rag"
-Requires-Dist: langchain-core<0.3.0; extra == "rag"
-Requires-Dist: langchain-openai<0.3.0; extra == "rag"
+Requires-Dist: langchain<0.4.0,>=0.3.0; extra == "rag"
+Requires-Dist: langchain-community<0.4.0,>=0.3.0; extra == "rag"
+Requires-Dist: langchain-core<0.4.0,>=0.3.0; extra == "rag"
+Requires-Dist: langchain-openai<0.4.0,>=0.3.0; extra == "rag"
 Requires-Dist: mteb==1.19.4; extra == "rag"
-Requires-Dist: ragas==0.2.
+Requires-Dist: ragas==0.2.14; extra == "rag"
 Requires-Dist: webdataset>0.2.0; extra == "rag"
 Provides-Extra: vlmeval
 Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"

@@ -121,7 +121,7 @@ Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
 </p>
 
 <p align="center">
-    <img src="https://img.shields.io/badge/python-%E2%89%A53.
+    <img src="https://img.shields.io/badge/python-%E2%89%A53.9-5be.svg">
 <a href="https://badge.fury.io/py/evalscope"><img src="https://badge.fury.io/py/evalscope.svg" alt="PyPI version" height="18"></a>
 <a href="https://pypi.org/project/evalscope"><img alt="PyPI - Downloads" src="https://static.pepy.tech/badge/evalscope"></a>
 <a href="https://github.com/modelscope/evalscope/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>

@@ -199,6 +199,8 @@ Please scan the QR code below to join our community groups:
 
 ## 🎉 News
 
+- 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
+- 🔥 **[2025.04.08]** Support for evaluating embedding model services compatible with the OpenAI API has been added. For more details, check the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html#configure-evaluation-parameters).
 - 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
 - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
 - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).

@@ -212,15 +214,14 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2025.02.13]** Added support for evaluating DeepSeek distilled models, including AIME24, MATH-500, and GPQA-Diamond datasets,refer to [best practice](https://evalscope.readthedocs.io/en/latest/best_practice/deepseek_r1_distill.html); Added support for specifying the `eval_batch_size` parameter to accelerate model evaluation.
 - 🔥 **[2025.01.20]** Support for visualizing evaluation results, including single model evaluation results and multi-model comparison, refer to the [📖 Visualizing Evaluation Results](https://evalscope.readthedocs.io/en/latest/get_started/visualization.html) for more details; Added [`iquiz`](https://modelscope.cn/datasets/AI-ModelScope/IQuiz/summary) evaluation example, evaluating the IQ and EQ of the model.
 - 🔥 **[2025.01.07]** Native backend: Support for model API evaluation is now available. Refer to the [📖 Model API Evaluation Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#api) for more details. Additionally, support for the `ifeval` evaluation benchmark has been added.
+<details><summary>More</summary>
+
 - 🔥🔥 **[2024.12.31]** Support for adding benchmark evaluations, refer to the [📖 Benchmark Evaluation Addition Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/add_benchmark.html); support for custom mixed dataset evaluations, allowing for more comprehensive model evaluations with less data, refer to the [📖 Mixed Dataset Evaluation Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/collection/index.html).
 - 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
 - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
 - 🔥 **[2024.10.31]** The best practice for evaluating Multimodal-RAG has been updated, please check the [📖 Blog](https://evalscope.readthedocs.io/zh-cn/latest/blog/RAG/multimodal_RAG.html#multimodal-rag) for more details.
 - 🔥 **[2024.10.23]** Supports multimodal RAG evaluation, including the assessment of image-text retrieval using [CLIP_Benchmark](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/clip_benchmark.html), and extends [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html) to support end-to-end multimodal metrics evaluation.
 - 🔥 **[2024.10.8]** Support for RAG evaluation, including independent evaluation of embedding models and rerankers using [MTEB/CMTEB](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html), as well as end-to-end evaluation using [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html).
-
-<details><summary>More</summary>
-
 - 🔥 **[2024.09.18]** Our documentation has been updated to include a blog module, featuring some technical research and discussions related to evaluations. We invite you to [📖 read it](https://evalscope.readthedocs.io/en/refact_readme/blog/index.html).
 - 🔥 **[2024.09.12]** Support for LongWriter evaluation, which supports 10,000+ word generation. You can use the benchmark [LongBench-Write](evalscope/third_party/longbench_write/README.md) to measure the long output quality as well as the output length.
 - 🔥 **[2024.08.30]** Support for custom dataset evaluations, including text datasets and multimodal image-text datasets.

@@ -503,6 +504,10 @@ Reference: Performance Testing [📖 User Guide](https://evalscope.readthedocs.i
 
 ![multi_perf](docs/en/user_guides/stress_test/images/multi_perf.png)
 
+**Supports swanlab for recording results**
+
+![swanlab](https://sail-moe.oss-cn-hangzhou.aliyuncs.com/yunlin/images/evalscope/swanlab.png)
+
 **Supports Speed Benchmark**
 
 It supports speed testing and provides speed benchmarks similar to those found in the [official Qwen](https://qwen.readthedocs.io/en/latest/benchmark/speed_benchmark.html) reports:
{evalscope-0.13.2.dist-info → evalscope-0.14.0.dist-info}/RECORD
CHANGED

@@ -1,11 +1,11 @@
 evalscope/__init__.py,sha256=XZYDn3ShhM_48je5qQgwymtSdpTt8zYEnNfanYnpBdA,181
 evalscope/arguments.py,sha256=OPYmX_ar7rXFm0ETPuE2hs-knDQtwQ0pFwSazjn3S9Q,5241
-evalscope/config.py,sha256=
+evalscope/config.py,sha256=sc8NoqhspbrNYMS201ZWreCKV-tBJrUEt96vKwpqfDY,9483
 evalscope/constants.py,sha256=Cgzkoz4R3MC3YLtbCM2fmSwF8Z2kuxYdOC8t9FWJj9w,3740
-evalscope/run.py,sha256=
+evalscope/run.py,sha256=XbUhllYPjaJJuR1hPoGZH0jlW8XlvUv9gONrMBc4Ni0,6450
 evalscope/run_arena.py,sha256=WXPCT0L-b_KvLBQ9KnrVW6y8icdDcqVhaXjTZMpS8k8,8572
-evalscope/summarizer.py,sha256=
-evalscope/version.py,sha256=
+evalscope/summarizer.py,sha256=61kU5ZoSh1dd8HMJPqP3ZvJwcY9szwWFCZdu2lfATJA,5920
+evalscope/version.py,sha256=4w52xL5au75pTD-PrvG-9l-U1euGk2032efyc-7IkQw,119
 evalscope/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/backend/base.py,sha256=qYu8Shokrtrx-N6T_BAJk_6OCpovUBYuN0p3wngt-dw,1030
 evalscope/backend/opencompass/__init__.py,sha256=UP_TW5KBq6V_Nvqkeb7PGvGGX3rVYussT43npwCwDgE,135

@@ -14,8 +14,8 @@ evalscope/backend/opencompass/backend_manager.py,sha256=y5NnAIY1pI7E1ZSeKU3acrD-
 evalscope/backend/opencompass/tasks/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/backend/opencompass/tasks/eval_api.py,sha256=ZaGdUbEOtAW5VX3ZXmpHIttg_QrID34EnBTylD3uvos,1152
 evalscope/backend/opencompass/tasks/eval_datasets.py,sha256=JHSq4EnPJgv4sRJJplLH80EqE3ghtkn2k8HnV6DaDew,5406
-evalscope/backend/rag_eval/__init__.py,sha256=
-evalscope/backend/rag_eval/backend_manager.py,sha256=
+evalscope/backend/rag_eval/__init__.py,sha256=Tbj7HboP5zzJ77-9qVEwwhHKjHL5V8MwLFr6sw1oeoA,291
+evalscope/backend/rag_eval/backend_manager.py,sha256=OEFADT8kdsuVMU0QOfiafzFQopY7bKbWZ_jhdXyYElY,3472
 evalscope/backend/rag_eval/clip_benchmark/__init__.py,sha256=C8Vetf52nyHiRwY2Pm74Bjn3UpWboQeghCGNh67X1EM,151
 evalscope/backend/rag_eval/clip_benchmark/arguments.py,sha256=d5UkbC3RXb6iyzy_ILumToAVO1AdwvDeyOiX5KB2u0g,1530
 evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py,sha256=anuIhRk9OC8y0LNBjvttSXppc99gbz-f0TYQjnyLLyU,8347

@@ -27,7 +27,7 @@ evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py,sha256=t0U
 evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py,sha256=rZY-TulG-Cb8b6GTBxqTDYQ_4Ois3kbgKhuunZq8Ato,8407
 evalscope/backend/rag_eval/clip_benchmark/utils/webdatasets.txt,sha256=eiiAaxhS48b5rVLy5O9VvFfV2AfxY86ITu_iqT7ZLkQ,649
 evalscope/backend/rag_eval/cmteb/__init__.py,sha256=I502GHPFYo8BwlFvoljGKI24PY76eBXJQiquWk8nJNU,280
-evalscope/backend/rag_eval/cmteb/arguments.py,sha256=
+evalscope/backend/rag_eval/cmteb/arguments.py,sha256=y2iTbs3a7R747NgS00nK2j3zO7gmREh8n7mWMrzF1js,2653
 evalscope/backend/rag_eval/cmteb/base.py,sha256=UCobQ81dHkiTmIz_0BJ_VANj_uG6mkJbYLKJztvMXfo,2849
 evalscope/backend/rag_eval/cmteb/task_template.py,sha256=FyFs1reefcsFCrWyi7Ya5dnFYvBhtxph2wIaFtOtFls,2595
 evalscope/backend/rag_eval/cmteb/tasks/Classification.py,sha256=sqbH0XmSiIm4n5UX5sXMwJHby1r-d35mwW1tKIhb2Hg,10848

@@ -39,25 +39,24 @@ evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py,sha256=ofmmeoieXHmU6O14JKWO9
 evalscope/backend/rag_eval/cmteb/tasks/STS.py,sha256=uhGLsQTo5lM3-L2Na3WJGqOLQw3c1WxHDA22ePJPxtU,12285
 evalscope/backend/rag_eval/cmteb/tasks/__init__.py,sha256=PKBNyp45hIa3FYNA1psiwtwfwUcn7s9eNt6r5aUpyyY,1505
 evalscope/backend/rag_eval/ragas/__init__.py,sha256=D0yJkN9SuNGIAL3niZw4BI08Yh3HznsUUewdIAa_-LM,171
-evalscope/backend/rag_eval/ragas/arguments.py,sha256=
+evalscope/backend/rag_eval/ragas/arguments.py,sha256=S6M1nsqwMQ8lnZZDtlQTdzyOCfLn9WP0QJ_7wAEsVgc,1695
 evalscope/backend/rag_eval/ragas/task_template.py,sha256=a_3bWfLx0j2zJkWgEWNStO0XXAeUFdnFpeukpoGfxLg,1669
 evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py,sha256=fX9sCci787ViGiL3BhGsykx0bnWfOWWEFueaJKyR8g4,793
 evalscope/backend/rag_eval/ragas/tasks/__init__.py,sha256=hErdWKbvV9aRqOpQTzdFHw1tcYoDbnttmic7GpZzKx8,173
 evalscope/backend/rag_eval/ragas/tasks/build_distribution.py,sha256=vFfemiqtPx22u5pwwZxEQJKYf3B9efYmwbpWDI5hY30,1491
 evalscope/backend/rag_eval/ragas/tasks/build_transform.py,sha256=GtAYqdVOy7BxIGyC4rSZ_UfXagKYzE6eEtXbaOI_g-k,5425
-evalscope/backend/rag_eval/ragas/tasks/testset_generation.py,sha256=
-evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=
+evalscope/backend/rag_eval/ragas/tasks/testset_generation.py,sha256=YSqpaXMFVe8mkVfq3i_oJg1MSnPm98E7WdOBdyUwMpA,5784
+evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py,sha256=6x-4O2pgsjZCVfJNvwZEKcgLe_QhSknPg-f2jGjZkU4,1890
 evalscope/backend/rag_eval/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/backend/rag_eval/utils/clip.py,sha256=GLHhPCac2AH35AvRLvVqePA1gIMAewHTFmCJCDZzvqU,5015
-evalscope/backend/rag_eval/utils/embedding.py,sha256=
-evalscope/backend/rag_eval/utils/llm.py,sha256=
+evalscope/backend/rag_eval/utils/embedding.py,sha256=tFMepPAMO4Kkqeqh-XxXIDYRjGbCMlk7lwuUW7FNvCA,7977
+evalscope/backend/rag_eval/utils/llm.py,sha256=acaD5QHPJUstJGpW1sNJ-3ZPT5J_Z8beOWb61Rtz07U,2607
 evalscope/backend/rag_eval/utils/tools.py,sha256=FU7tNu-8y8V_o_kArFVTTLM_GzL12KBNeXiwQw5SpJA,1529
 evalscope/backend/vlm_eval_kit/__init__.py,sha256=R-GuBm8dAwvDF73XHaGpPSjlt7Y4tycyy-FJgzLdjeY,84
-evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=
-evalscope/backend/vlm_eval_kit/custom_dataset.py,sha256=D-8Sr0tZblf-3ZR5T0lOggbgFRuy5wq9Nm02GUpNyAs,1620
+evalscope/backend/vlm_eval_kit/backend_manager.py,sha256=sUYvQxCtPl6CrcwhQpY8lJjW5skqWc-fvHUSnXd_MvQ,6054
 evalscope/benchmarks/__init__.py,sha256=b_SWdV1ZyOqFiwc_9lIjKrIvK1rwnF2cCIF7XN9CN8E,932
 evalscope/benchmarks/benchmark.py,sha256=a_7Ctz36McuTyBSTYi56jis9pvOdWhg7JVSPFrbxqR4,2535
-evalscope/benchmarks/data_adapter.py,sha256=
+evalscope/benchmarks/data_adapter.py,sha256=lwW23GjHHAptv4mc1u3xLlKqiRI1EfbSqaG3QGmxqEQ,17750
 evalscope/benchmarks/utils.py,sha256=6kxeBz4w8Fw68AYH05a4ncjgkaUV4bU3eaFVLqOdkMI,1321
 evalscope/benchmarks/aime/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/aime/aime24_adapter.py,sha256=dBm9yukt4-CByEPUlAPAIN6mL3VkZcI-dw2kz4oQBMo,1715

@@ -66,7 +65,7 @@ evalscope/benchmarks/alpaca_eval/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRk
 evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py,sha256=2a6wHJSLe89Xh18u1LBkMQEZzfOURiek6o0-k2lCQgM,4065
 evalscope/benchmarks/arc/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/benchmarks/arc/ai2_arc.py,sha256=WtL4Z_ulcCU2KfptWTjTm75T2I2rVGd9aDBBB76P14w,5697
-evalscope/benchmarks/arc/arc_adapter.py,sha256=
+evalscope/benchmarks/arc/arc_adapter.py,sha256=U-yPDAjYkPUUOXYjCM1ajdvlUVcdeuVoMK7yWJcX6LI,6369
 evalscope/benchmarks/arena_hard/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/arena_hard/arena_hard_adapter.py,sha256=bdQfLTWB5pFo4hET0uFqu5zMX9PNQNwdoLoGrL5jCBE,6213
 evalscope/benchmarks/arena_hard/utils.py,sha256=NstI1VR5fTaT-bfXRj0cLqm0DtH8EY4EQHR-K9HJubI,5089

@@ -116,7 +115,7 @@ evalscope/benchmarks/data_collection/data_collection_adapter.py,sha256=ecNwAE3p2
 evalscope/benchmarks/general_mcq/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/general_mcq/general_mcq_adapter.py,sha256=ZVGzUuuQ0UTOqQtXE40ZyBeMOSl8saSiFEQ5_siJ-c8,5052
 evalscope/benchmarks/general_qa/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=
+evalscope/benchmarks/general_qa/general_qa_adapter.py,sha256=8d5znAcQmFSmvyKV-JuMQzbY5k6xDNQQdrWZ7zgPTK4,4603
 evalscope/benchmarks/gpqa/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/gpqa/chain_of_thought.txt,sha256=pgoT5a-DMPJaMhoH_M8zfU5s80ibWDTVW6vnonITd8k,5610
 evalscope/benchmarks/gpqa/gpqa_adapter.py,sha256=UB287DtnbkSQXZsbReFJqmQRwbo672DTCeXXilR_-Vc,4790

@@ -125,7 +124,7 @@ evalscope/benchmarks/gsm8k/gsm8k.py,sha256=ZDN5lfeZyc_pkTDVY0voC_zUExHE1ZoEgEaTv
 evalscope/benchmarks/gsm8k/gsm8k_adapter.py,sha256=ZZZ-9oja53IwiU33Kjm7NTk4MbFGWyvonhnHrn_3Na8,10557
 evalscope/benchmarks/hellaswag/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/benchmarks/hellaswag/hellaswag.py,sha256=5_c9WbaS1LIdvgXzqEcvjAEtKi2V2Yn0YtszPlFqhXI,4610
-evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=
+evalscope/benchmarks/hellaswag/hellaswag_adapter.py,sha256=SRM_-AKlWtKXi4zrlBAH9YceFnrktZDNsjvQOiPizUM,5893
 evalscope/benchmarks/humaneval/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/benchmarks/humaneval/humaneval.py,sha256=2Exsg6u8FEu0buADY2tETJluSM8tWacvX06nykKKLSE,3395
 evalscope/benchmarks/humaneval/humaneval_adapter.py,sha256=UOjakV31J0g7TYbrRls0ItcopWOJu54ucPfaqSJB7Os,5250

@@ -139,25 +138,26 @@ evalscope/benchmarks/iquiz/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJ
 evalscope/benchmarks/iquiz/iquiz_adapter.py,sha256=16whmFkJt9fLbei9d-kmjnWB_5y5vsiX9tK5kSuxDw8,2449
 evalscope/benchmarks/live_code_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/live_code_bench/evaluate_utils.py,sha256=rOWaG8PV4AGIRhS_gqwxEhphEVe1Cqg57Eudwm5HTjI,6820
-evalscope/benchmarks/live_code_bench/execute_utils.py,sha256=MreaMLI0IicNZawpfqcyoRLt67EZ3CJvmxxRTYwhAbU,7397
 evalscope/benchmarks/live_code_bench/extract_utils.py,sha256=ZcQ8y741uawPo6I_1_XglR3eqJFDNrqc8fILKZupVRs,2375
-evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=
+evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py,sha256=vLr43hvtR0WS9GclJ6xL9MIqwC941EiRSqgZ_hGHZnw,3382
 evalscope/benchmarks/live_code_bench/load_utils.py,sha256=5i9wtdPLYR8ckjx5MaYQVC2LFYvjKzR6Fa6UZmeOTRc,2445
 evalscope/benchmarks/live_code_bench/pass_k_utils.py,sha256=Ktrp_lXdfFzoHtQNQNdGfIl26ySjaPCHm4Zv-dFvRqM,2024
 evalscope/benchmarks/live_code_bench/prompts.py,sha256=P4KILIAIDT1MKDck0xHYV_6v9820wDZRhxVMazmlL-g,12600
-evalscope/benchmarks/live_code_bench/testing_util.py,sha256=
+evalscope/benchmarks/live_code_bench/testing_util.py,sha256=v4N7Y4MasNL6TjC4w-Duw_4Zn0oLdWAw3HU6ZrM76P8,17161
+evalscope/benchmarks/maritime_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py,sha256=RVbsiglxmEW37-tDYgr4Drywh26I94DRGhwv7uP2aYk,2829
 evalscope/benchmarks/math_500/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/math_500/math_500_adapter.py,sha256=SB2eb4Z7DTXdptqirEoctqTdDLEu28s7bLeCAMBmAFo,1923
 evalscope/benchmarks/mmlu/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/benchmarks/mmlu/mmlu.py,sha256=sA8AC0bN7iURrSazqkY31s_reNVbDZSUCB-NCTQsVeI,5042
-evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=
+evalscope/benchmarks/mmlu/mmlu_adapter.py,sha256=bQSRTgXk01pCfKdmTxr3si4FxET3j_yBVVmQlLchTns,11586
 evalscope/benchmarks/mmlu/samples.jsonl,sha256=f5Y2vwbEvNtpE7vrl9BHoJzsdceI4vUAo1frexYyX2o,1345
 evalscope/benchmarks/mmlu_pro/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=
+evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py,sha256=hPqxDqDhqin3TxfimfhIxfEc_8UfzTDGAfX7iDrWy28,4248
 evalscope/benchmarks/mmlu_redux/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py,sha256=
+evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py,sha256=Kr30i_exxBJRz9PLB5g6F04e2HJ4WuF6LDyAwaRh2MY,9578
 evalscope/benchmarks/musr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalscope/benchmarks/musr/musr_adapter.py,sha256=
+evalscope/benchmarks/musr/musr_adapter.py,sha256=85P0sY7H9pthYdCjkE2AOxaiNhcIBW1iZmODkz3FN0M,2464
 evalscope/benchmarks/process_bench/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/benchmarks/process_bench/critique_template.txt,sha256=tycx8n42QEC0uGcwbIvHfZvfTnchlRxGz8Tp1R2_e_Y,489
 evalscope/benchmarks/process_bench/process_bench_adapter.py,sha256=ydU-r1T0DaYhOxkhZgGL7PhDd4XoeqOBzVO9oiFPd8M,3422

@@ -187,7 +187,7 @@ evalscope/cli/start_eval.py,sha256=MXhVDeaMFd6ny88-gnVtQflH660UaDj240YGYnHccx8,7
 evalscope/cli/start_perf.py,sha256=5hLi5jWgM9BJPXLd8d9D1zqrcj_5c0KvkfB1DgD4_RU,831
 evalscope/cli/start_server.py,sha256=DQRIfbsHaOAsVcLGF6iRyJnxmd5Sf_tgytpJNfiWCeE,3662
 evalscope/collections/__init__.py,sha256=hd68Qf-ryeDsz5Pu-Dh83M5V5RE5mhLsG-vc55n5n0o,228
-evalscope/collections/evaluator.py,sha256=
+evalscope/collections/evaluator.py,sha256=4IkdbKySOW-MzH9Zjn0uddQviFLe2pOef746fgbjkJo,12784
 evalscope/collections/sampler.py,sha256=2NwvhJVdi-mrDeK7RWwEGOoE7DdxtpyASRUZU_D6hWw,4855
 evalscope/collections/schema.py,sha256=mjJfNmy_athJ1TmnuJRkrKRlefzefuQXZuTtjn8SHKo,4073
 evalscope/evaluator/__init__.py,sha256=S6MU1O_iiNAaKxNIhO9MEmdW-BSNf_YH2l6NQ9lxVNo,103

@@ -219,32 +219,33 @@ evalscope/models/custom/__init__.py,sha256=wb6f_Bi39s5sj-VO7EXRDXB2WhyFb49BUtEMk
 evalscope/models/custom/custom_model.py,sha256=rBQLAuPEw_OPUtRSCEmxEfpcA8jPj8bAdsmtKs4ygus,1566
 evalscope/models/custom/dummy_model.py,sha256=ODD6pt9FvZq_a54oYsehBDslRKHOsk9zsC9iAZvi5Yg,2020
 evalscope/perf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-evalscope/perf/arguments.py,sha256=
-evalscope/perf/benchmark.py,sha256=
-evalscope/perf/http_client.py,sha256
+evalscope/perf/arguments.py,sha256=UZKlkbDE2N408dY8Ji-WB8sl1rcmamywzxLvNXpnY0w,10194
+evalscope/perf/benchmark.py,sha256=nv7gtCkeKnLKQQiKM4G0MYO2ambcuwsbx67OgEQG0nM,7917
+evalscope/perf/http_client.py,sha256=-c3-N7bxKsj3d5DVsKSaYA3XAHJDzZgoqZBbhuDYIGk,7419
 evalscope/perf/main.py,sha256=w-yDbl0osaTAMgC-JNPpqIq2LQ7U4c-Ht7Amj8Nbjc8,1278
 evalscope/perf/plugin/__init__.py,sha256=1sl5s-csrwKb_LVTnpF3HqArz06TRD5LYJ0hpqvokUA,85
 evalscope/perf/plugin/registry.py,sha256=w1IAt6GDdluzSYK5i-yrntvx3_EvIIqJamEL0xZv3zA,1323
 evalscope/perf/plugin/api/__init__.py,sha256=Ckzbq4CkSMVQTedQcDHCYlRd6FTwQAElt2mHB-VXJac,195
 evalscope/perf/plugin/api/base.py,sha256=B_H04qKx7eRTn155rnDrbTYur7PK1mvxfQKYcqYbndU,2118
-evalscope/perf/plugin/api/custom_api.py,sha256=
+evalscope/perf/plugin/api/custom_api.py,sha256=ssE4J8AynA0n5SnXSQyk7K5Co3dwUN6Opph08clZna0,3785
 evalscope/perf/plugin/api/dashscope_api.py,sha256=V5fwn-p_fLH0dWKzhN9TvYSHRgla4INfXC4NDaIjoQ8,3825
-evalscope/perf/plugin/api/openai_api.py,sha256=
+evalscope/perf/plugin/api/openai_api.py,sha256=kTL_2OACuKhzd2W0Pf4DirpMumzk4V3rqKZ2mvBZVCs,7655
 evalscope/perf/plugin/datasets/__init__.py,sha256=Z6Jc0RxJS_z0nBBV1-b0-56Ija60AtQ7I_67gY6ZfdQ,568
 evalscope/perf/plugin/datasets/base.py,sha256=Z-INWueeYjfEZhP4lbTlBMVwIa6BcXZKWx-w7Pop3mA,1786
-evalscope/perf/plugin/datasets/custom.py,sha256=
+evalscope/perf/plugin/datasets/custom.py,sha256=npreC7H1VsdTGYkqlMESvyOhtXOfZQA7_-ICmxe3FWk,936
 evalscope/perf/plugin/datasets/flickr8k.py,sha256=UzAIFIO0m5inWOkWM1mO6wfV2HOuXAqiTxCJ4b0SiZM,1589
-evalscope/perf/plugin/datasets/line_by_line.py,sha256=
-evalscope/perf/plugin/datasets/longalpaca.py,sha256=
-evalscope/perf/plugin/datasets/openqa.py,sha256=
-evalscope/perf/plugin/datasets/random_dataset.py,sha256=
+evalscope/perf/plugin/datasets/line_by_line.py,sha256=AqZYG6tVL3BIGnzh_2Tev8lDYezJG_1gqJY8bSNQl3Q,957
+evalscope/perf/plugin/datasets/longalpaca.py,sha256=XelLris0-c3StLInQ-Oav4jqGcXPNfJxEDeYvaetEbI,1297
+evalscope/perf/plugin/datasets/openqa.py,sha256=4Pnx5duFJzoiTUfZCbcK7LO8f-skmcpYNUUrtNR_UUc,1463
+evalscope/perf/plugin/datasets/random_dataset.py,sha256=SIlsjAE_Stknfr6o1CBFvANBGCSgSExFbscLwSM_Gmk,2958
 evalscope/perf/plugin/datasets/speed_benchmark.py,sha256=J6q7AF_Re5eHLVejXEw9c1jlk1T1PPmist0yO9UFTPE,2432
 evalscope/perf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 evalscope/perf/utils/analysis_result.py,sha256=ESzaZHGTpr2LoJR3jpOzqMphxSrr79d364ZzD159PmY,1169
-evalscope/perf/utils/benchmark_util.py,sha256=
-evalscope/perf/utils/db_util.py,sha256=
+evalscope/perf/utils/benchmark_util.py,sha256=XrpB6ISjY2p1ngwPr5eOQS7O_I1kmlbEn2wCwsC_5AA,6278
+evalscope/perf/utils/db_util.py,sha256=VDqiM6xOK7fSneU3YOOU-78LWB8El3mxj_Ixtw2gX3o,9051
 evalscope/perf/utils/handler.py,sha256=HyKIxbzC0XCyQanlbb7UEY7yaeqjJTePNea8kMV3Sdc,1192
 evalscope/perf/utils/local_server.py,sha256=clF8i0UFmaxBBB6gX05KvVCyzSv0xzsAidz0_sLLlAk,4627
+evalscope/perf/utils/log_utils.py,sha256=1jmB31W3ol9ukPAPbQ8xG3yoZ9oi3tjEyMK5M3ERmbw,1471
 evalscope/registry/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 evalscope/registry/config/cfg_arena.yaml,sha256=rub6ceaQxxB1mbSjdoFf0IaVgGfbOonV2nYRebv2OKo,3292
 evalscope/registry/config/cfg_arena_zhihu.yaml,sha256=tvvihBwvoTjoezwTSaZwoGOB44ysofpnin4pNyY9TfQ,2755

@@ -267,7 +268,7 @@ evalscope/registry/tasks/gsm8k.yaml,sha256=M2I7otwOSy0usD8yG8d6QziASQlKdhKLflRHM
 evalscope/registry/tasks/mmlu.yaml,sha256=cJcMH1Cvgo9PlYoTmeGx2bcZayysltaa6ehK57dDkvo,726
 evalscope/registry/tasks/mmlu_mini.yaml,sha256=K8ouHh7ve5ZsbkqRtV3Jl-DF01YFPuObfwEdACJA4Pk,778
 evalscope/report/__init__.py,sha256=0Wes3ot2hy9s-WwZaBztst8qkNrXkOF-Hwa1WW1e8lY,260
-evalscope/report/app.py,sha256=
+evalscope/report/app.py,sha256=Lew--YreNeuyLVktnUNZKIfGvnGE_oAD054kZB-YTHo,26904
 evalscope/report/combinator.py,sha256=O3QirwtYhDhdaWVT4STJMCGZMwoX8BTeJ3HtS9iwnWQ,2567
 evalscope/report/generator.py,sha256=2DULY9W8QCUxdtyfNjo8XAP_YxI1LgR95jknK__kYPU,3600
 evalscope/report/utils.py,sha256=DRlbjbqHEmM8rGlA4pwtlHFhOZtyUzcqiS-mejfIDkU,4584

@@ -314,28 +315,28 @@ evalscope/utils/filters.py,sha256=x_NX40uWMmUsVrAGHCeeV2e63HZZFugWUgdUhk64ivM,15
 evalscope/utils/io_utils.py,sha256=Tjdgen1FsAA4ArqiUzu734L0Px5NuiS0GKRRiGIzxSA,4192
 evalscope/utils/logger.py,sha256=barHSdtbEu21ynGQj_wS-rd7B02wPPR5AgaWCQzvG4w,3638
 evalscope/utils/model_utils.py,sha256=hB9W334ecAb6553FhooT6_jM0g-tjj6AU48IV3K1CKw,1131
-evalscope/utils/utils.py,sha256=
+evalscope/utils/utils.py,sha256=VuGdJh3xZAZ-cRoGcKeJTx3z8sgSs2eMjH-1JX2ZYOU,10615
 tests/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 tests/test_run_all.py,sha256=YcMTlWoFpvWY8jevWyIf2G_tz8hgDD1cAwSvmyZt96M,429
 tests/cli/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-tests/cli/test_all.py,sha256=
-tests/cli/test_collection.py,sha256=
-tests/cli/test_run.py,sha256=
+tests/cli/test_all.py,sha256=pwup--iNxckUEsR_aFjIAbEQo3UogSu5aIWf9ryLP2o,4022
+tests/cli/test_collection.py,sha256=y8FjoPziPRf5BdJK8DHjcXn26ETKz1OyqjnCpwjt-F4,4096
+tests/cli/test_run.py,sha256=RW4AkJILqzzyd0wuIdy8Y9SB_4koSRJFezGjFdXdLJI,16549
 tests/perf/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-tests/perf/test_perf.py,sha256=
+tests/perf/test_perf.py,sha256=BXd6SCMbBDKmh-P_KGTOpuwVQZ05xCKjvH01zGyvBJI,3787
 tests/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-tests/rag/test_clip_benchmark.py,sha256=
-tests/rag/test_mteb.py,sha256=
-tests/rag/test_ragas.py,sha256=
+tests/rag/test_clip_benchmark.py,sha256=ZCBtgnF8Vuji6WQlb92-_RIvXlUX_Xt-cHZP4AN_DNI,2552
+tests/rag/test_mteb.py,sha256=YJw6X1jwX6SYNB-ryVb-OHJWu3vsE3Y4STATI75rdG0,5619
+tests/rag/test_ragas.py,sha256=E7rfKpKtBqglOL1GcW9adfY8nsOZMuoB8GC55UL1Q3c,4517
 tests/swift/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
 tests/swift/test_run_swift_eval.py,sha256=JKG-0BwTxkbg-XeiXxujPqnVIM3f2EFaJ_9a7p_R4dk,5748
 tests/swift/test_run_swift_vlm_eval.py,sha256=C8DftjewnZaerQWfERI70bU3sQLWQ-ejZUQhtYO5e0o,4898
 tests/swift/test_run_swift_vlm_jugde_eval.py,sha256=THZEXUOSqm9rWslwJHmZyh-Ytv5c_QKpgRW5J2s_69E,6017
 tests/vlm/__init__.py,sha256=I_ANdxdcIHpkIzIXc1yKOlWwzb4oY0FwTPq1kYtgzQw,50
-tests/vlm/test_vlmeval.py,sha256=
-evalscope-0.
-evalscope-0.
-evalscope-0.
-evalscope-0.
-evalscope-0.
-evalscope-0.
+tests/vlm/test_vlmeval.py,sha256=UqRiBPMU3vRtLIG1Qu4ZVhyUQx-zGYQuLCgobwf-7a4,3176
+evalscope-0.14.0.dist-info/LICENSE,sha256=K_2M03pN0PxVMyx9IQUKsHGhhDMkw5ryQ02rlMvzj3I,11416
+evalscope-0.14.0.dist-info/METADATA,sha256=HQ1pt-YU950AcwwWiypjGcWg0wYU9n6PFZ7j6PG4uHg,33040
+evalscope-0.14.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
+evalscope-0.14.0.dist-info/entry_points.txt,sha256=Qr4oTgGhg_K-iUtKwVH6lWUhFHDUiH9trIqydHGTEug,56
+evalscope-0.14.0.dist-info/top_level.txt,sha256=Yv0iprOqZQ4rfUO-AWJp7Ni6m0Twxny1yvZwO-8hUDM,16
+evalscope-0.14.0.dist-info/RECORD,,
tests/cli/test_all.py
CHANGED
@@ -4,13 +4,12 @@ from dotenv import dotenv_values
 env = dotenv_values('.env')
 
 import os
-import subprocess
 import unittest
 
 from evalscope.config import TaskConfig
 from evalscope.constants import EvalType, JudgeStrategy, OutputType
 from evalscope.run import run_task
-from evalscope.utils import
+from evalscope.utils import test_level_list
 from evalscope.utils.logger import get_logger
 
 os.environ['LOG_LEVEL'] = 'DEBUG'

@@ -46,6 +45,7 @@ datasets=[
     'chinese_simpleqa',
     'alpaca_eval',
     'arena_hard',
+    'maritime_bench',
 ]
 
 dataset_args={

@@ -134,8 +134,8 @@ class TestRun(unittest.TestCase):
             eval_type=EvalType.SERVICE,
             datasets=datasets,
             dataset_args=dataset_args,
-            eval_batch_size=
-            limit=
+            eval_batch_size=1,
+            limit=1,
             stream=True,
             generation_config={
                 'temperature': 0,
tests/cli/test_collection.py
CHANGED
tests/cli/test_run.py
CHANGED
@@ -137,7 +137,7 @@ class TestRun(unittest.TestCase):
                     'subset_list': ['gsm8k'],
                 },
                 'musr': {
-                    'subset_list': ['murder_mysteries']
+                    'subset_list': ['murder_mysteries'],
                 },
                 'general_mcq': {
                     'local_path': 'custom_eval/text/mcq',  # 自定义数据集路径

@@ -263,7 +263,7 @@ class TestRun(unittest.TestCase):
             datasets=[
                 # 'iquiz',
                 # 'ifeval',
-
+                'mmlu',
                 # 'mmlu_pro',
                 # 'musr',
                 # 'process_bench',

@@ -281,9 +281,10 @@ class TestRun(unittest.TestCase):
                 # 'ceval',
                 # 'hellaswag',
                 # 'general_mcq',
-                'general_qa'
+                # 'general_qa'
                 # 'super_gpqa',
-                # 'mmlu_redux'
+                # 'mmlu_redux',
+                'maritime_bench'
             ],
             dataset_args={
                 'mmlu': {

@@ -322,7 +323,8 @@ class TestRun(unittest.TestCase):
                     'subset_list': ['gsm8k'],
                 },
                 'musr': {
-                    'subset_list': ['murder_mysteries']
+                    'subset_list': ['murder_mysteries'],
+                    'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/MuSR'
                 },
                 'general_mcq': {
                     'local_path': 'custom_eval/text/mcq',  # 自定义数据集路径

@@ -353,10 +355,9 @@ class TestRun(unittest.TestCase):
             stream=False,
             generation_config={
                 'temperature': 0,
-                'n':
+                'n': 1,
                 'max_tokens': 4096,
-            }
-            use_cache='outputs/20250326_202848',
+            }
         )
 
         run_task(task_cfg=task_cfg)