evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +6 -2
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +47 -51
- evalscope/backend/rag_eval/utils/embedding.py +13 -12
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +33 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +154 -96
- evalscope/constants.py +50 -32
- evalscope/evaluator/evaluator.py +97 -377
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +16 -3
- evalscope/perf/benchmark.py +9 -11
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +8 -1
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +3 -4
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/plugin/registry.py +3 -3
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +5 -6
- evalscope/perf/utils/db_util.py +77 -30
- evalscope/perf/utils/local_server.py +21 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +153 -381
- evalscope/run_arena.py +21 -25
- evalscope/summarizer.py +27 -40
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -27
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -4
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
- evalscope/tools/combine_reports.py +27 -34
- evalscope/tools/rewrite_eval_results.py +15 -47
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +4 -5
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/io_utils.py +162 -0
- evalscope/utils/logger.py +17 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +5 -306
- evalscope/version.py +2 -2
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
- evalscope-0.8.1.dist-info/RECORD +285 -0
- tests/cli/test_run.py +53 -15
- tests/perf/test_perf.py +6 -1
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- tests/vlm/test_vlmeval.py +3 -2
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.2.dist-info/RECORD +0 -286
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.7.2
+Version: 0.8.1
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -84,7 +84,7 @@ Requires-Dist: transformers-stream-generator; extra == "all"
 Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
 Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
 Requires-Dist: mteb==1.19.4; extra == "all"
-Requires-Dist: ragas==0.2.
+Requires-Dist: ragas==0.2.7; extra == "all"
 Requires-Dist: webdataset>0.2.0; extra == "all"
 Requires-Dist: aiohttp; extra == "all"
 Requires-Dist: fastapi; extra == "all"
@@ -129,48 +129,52 @@ Requires-Dist: transformers; extra == "perf"
 Requires-Dist: unicorn; extra == "perf"
 Provides-Extra: rag
 Requires-Dist: mteb==1.19.4; extra == "rag"
-Requires-Dist: ragas==0.2.
+Requires-Dist: ragas==0.2.7; extra == "rag"
 Requires-Dist: webdataset>0.2.0; extra == "rag"
 Provides-Extra: vlmeval
 Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"

+<p align="center">
+<br>
+<img src="docs/en/_static/images/evalscope_logo.png"/>
+<br>
+<p>


-
-
 <p align="center">
-
+<a href="README_zh.md">中文</a> &nbsp ｜ &nbsp English &nbsp
 </p>

 <p align="center">
-
-
-
-
-
-
-
-
-<a href="https://evalscope.readthedocs.io/en/latest/"
+<img src="https://img.shields.io/badge/python-%E2%89%A53.8-5be.svg">
+<a href="https://badge.fury.io/py/evalscope"><img src="https://badge.fury.io/py/evalscope.svg" alt="PyPI version" height="18"></a>
+<a href="https://pypi.org/project/evalscope"><img alt="PyPI - Downloads" src="https://static.pepy.tech/badge/evalscope"></a>
+<a href="https://github.com/modelscope/evalscope/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>
+<a href='https://evalscope.readthedocs.io/en/latest/?badge=latest'><img src='https://readthedocs.org/projects/evalscope/badge/?version=latest' alt='Documentation Status' /></a>
+<p>
+
+<p align="center">
+<a href="https://evalscope.readthedocs.io/zh-cn/latest/"> 📖 中文文档</a> &nbsp ｜ &nbsp <a href="https://evalscope.readthedocs.io/en/latest/"> 📖 English Documents</a>
 <p>

 > ⭐ If you like this project, please click the "Star" button at the top right to support us. Your support is our motivation to keep going!

-## 📋
+## 📋 Contents
 - [Introduction](#introduction)
 - [News](#News)
 - [Installation](#installation)
 - [Quick Start](#quick-start)
 - [Evaluation Backend](#evaluation-backend)
 - [Custom Dataset Evaluation](#custom-dataset-evaluation)
-- [Offline Evaluation](#offline-evaluation)
-- [Arena Mode](#arena-mode)
 - [Model Serving Performance Evaluation](#Model-Serving-Performance-Evaluation)
+- [Arena Mode](#arena-mode)


 ## 📝 Introduction

-EvalScope is
+EvalScope is [ModelScope](https://modelscope.cn/)'s official framework for model evaluation and benchmarking, designed for diverse assessment needs. It supports various model types including large language models, multimodal, embedding, reranker, and CLIP models.
+
+The framework accommodates multiple evaluation scenarios such as end-to-end RAG evaluation, arena mode, and inference performance testing. It features built-in benchmarks and metrics like MMLU, CMMLU, C-Eval, and GSM8K. Seamlessly integrated with the [ms-swift](https://github.com/modelscope/ms-swift) training framework, EvalScope enables one-click evaluations, offering comprehensive support for model training and assessment 🚀

 <p align="center">
 <img src="docs/en/_static/images/evalscope_framework.png" width="70%">
@@ -192,6 +196,7 @@ The architecture includes the following modules:


 ## 🎉 News
+- 🔥 **[2024.12.13]** Model evaluation optimization: no need to pass the `--template-type` parameter anymore; supports starting evaluation with `evalscope eval --args`. Refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html) for more details.
 - 🔥 **[2024.11.26]** The model inference service performance evaluator has been completely refactored: it now supports local inference service startup and Speed Benchmark; asynchronous call error handling has been optimized. For more details, refer to the [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/index.html).
 - 🔥 **[2024.10.31]** The best practice for evaluating Multimodal-RAG has been updated, please check the [📖 Blog](https://evalscope.readthedocs.io/zh-cn/latest/blog/RAG/multimodal_RAG.html#multimodal-rag) for more details.
 - 🔥 **[2024.10.23]** Supports multimodal RAG evaluation, including the assessment of image-text retrieval using [CLIP_Benchmark](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/clip_benchmark.html), and extends [RAGAS](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/ragas.html) to support end-to-end multimodal metrics evaluation.
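The 2024.12.13 item above describes the simplified startup path; it corresponds to the `run_task` API documented in the Quick Start hunk that follows. A minimal sketch, using only the fields that appear verbatim in this diff (`model`, `datasets`, `limit`):

```python
from evalscope.run import run_task

# Since 0.8.x no --template-type is required; a model id, a list of
# datasets, and an optional per-dataset sample limit are enough.
task_cfg = {
    'model': 'Qwen/Qwen2.5-0.5B-Instruct',
    'datasets': ['gsm8k', 'arc'],
    'limit': 5,
}

run_task(task_cfg=task_cfg)
```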
@@ -263,124 +268,129 @@ We recommend using conda to manage your environment and installing dependencies

 ## 🚀 Quick Start

-
-To evaluate a model using default settings on specified datasets, follow the process below:
+To evaluate a model on specified datasets using default configurations, this framework supports two ways to initiate evaluation tasks: using the command line or using Python code.

-
+### Method 1. Using Command Line

-
+Execute the `eval` command in any directory:
 ```bash
-
+evalscope eval \
  --model Qwen/Qwen2.5-0.5B-Instruct \
- --
- --
- --limit 10
+ --datasets gsm8k arc \
+ --limit 5
 ```

-
+### Method 2. Using Python Code

-
-```bash
-python evalscope/run.py \
- --model Qwen/Qwen2.5-0.5B-Instruct \
- --template-type qwen \
- --datasets gsm8k ceval \
- --limit 10
-```
+When using Python code for evaluation, you need to submit the evaluation task using the `run_task` function, passing a `TaskConfig` as a parameter. It can also be a Python dictionary, yaml file path, or json file path, for example:

-
+**Using Python Dictionary**

-
-
-
-
-
-
-
-
+```python
+from evalscope.run import run_task
+
+task_cfg = {
+    'model': 'Qwen/Qwen2.5-0.5B-Instruct',
+    'datasets': ['gsm8k', 'arc'],
+    'limit': 5
+}
+
+run_task(task_cfg=task_cfg)
 ```

+<details><summary>More Startup Methods</summary>

-
-- `--model`: Specifies the `model_id` of the model on [ModelScope](https://modelscope.cn/), allowing automatic download. For example, see the [Qwen2-0.5B-Instruct model link](https://modelscope.cn/models/qwen/Qwen2-0.5B-Instruct/summary); you can also use a local path, such as `/path/to/model`.
-- `--template-type`: Specifies the template type corresponding to the model. Refer to the `Default Template` field in the [template table](https://swift.readthedocs.io/en/latest/Instruction/Supported-models-datasets.html#llm) for filling in this field.
-- `--datasets`: The dataset name, allowing multiple datasets to be specified, separated by spaces; these datasets will be automatically downloaded. Refer to the [supported datasets list](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html) for available options.
-- `--limit`: Maximum number of evaluation samples per dataset; if not specified, all will be evaluated, which is useful for quick validation.
+**Using `TaskConfig`**

+```python
+from evalscope.run import run_task
+from evalscope.config import TaskConfig

-
-
+task_cfg = TaskConfig(
+    model='Qwen/Qwen2.5-0.5B-Instruct',
+    datasets=['gsm8k', 'arc'],
+    limit=5
+)

-
-```shell
-python evalscope/run.py \
- --model qwen/Qwen2-0.5B-Instruct \
- --template-type qwen \
- --model-args revision=master,precision=torch.float16,device_map=auto \
- --datasets gsm8k ceval \
- --use-cache true \
- --limit 10
+run_task(task_cfg=task_cfg)
 ```

-**
-
-
-
-
-
-
-
-
+**Using `yaml` file**
+
+`config.yaml`:
+```yaml
+model: Qwen/Qwen2.5-0.5B-Instruct
+datasets:
+  - gsm8k
+  - arc
+limit: 5
 ```

-#### Parameter Descriptions
-In addition to the three [basic parameters](#basic-parameter-descriptions), the other parameters are as follows:
-- `--model-args`: Model loading parameters, separated by commas, in `key=value` format.
-- `--generation-config`: Generation parameters, separated by commas, in `key=value` format.
-  - `do_sample`: Whether to use sampling, default is `false`.
-  - `max_new_tokens`: Maximum generation length, default is 1024.
-  - `temperature`: Sampling temperature.
-  - `top_p`: Sampling threshold.
-  - `top_k`: Sampling threshold.
-- `--use-cache`: Whether to use local cache, default is `false`. If set to `true`, previously evaluated model and dataset combinations will not be evaluated again, and will be read directly from the local cache.
-- `--dataset-args`: Evaluation dataset configuration parameters, provided in JSON format, where the key is the dataset name and the value is the parameter; note that these must correspond one-to-one with the values in `--datasets`.
-  - `--few_shot_num`: Number of few-shot examples.
-  - `--few_shot_random`: Whether to randomly sample few-shot data; if not specified, defaults to `true`.
-
-
-### 3. Use the run_task Function to Submit an Evaluation Task
-Using the `run_task` function to submit an evaluation task requires the same parameters as the command line. You need to pass a dictionary as the parameter, which includes the following fields:
-
-#### 1. Configuration Task Dictionary Parameters
 ```python
-import
-
-
-
-
-
-
-
-
-
-
-
-
-
-    'mem_cache': False,
-    'dataset_hub': 'ModelScope',
-    'dataset_dir': DEFAULT_ROOT_CACHE_DIR,
-    'limit': 10,
-    'debug': False
-}
+from evalscope.run import run_task
+
+run_task(task_cfg="config.yaml")
+```
+
+**Using `json` file**
+
+`config.json`:
+```json
+{
+  "model": "Qwen/Qwen2.5-0.5B-Instruct",
+  "datasets": ["gsm8k", "arc"],
+  "limit": 5
+}
 ```
-Here, `DEFAULT_ROOT_CACHE_DIR` is set to `'~/.cache/evalscope'`.

-#### 2. Execute Task with run_task
 ```python
 from evalscope.run import run_task
-
+
+run_task(task_cfg="config.json")
+```
+</details>
+
+### Basic Parameter
+- `--model`: Specifies the `model_id` of the model in [ModelScope](https://modelscope.cn/), which can be automatically downloaded, e.g., [Qwen/Qwen2.5-0.5B-Instruct](https://modelscope.cn/models/Qwen/Qwen2.5-0.5B-Instruct/summary); or use the local path of the model, e.g., `/path/to/model`
+- `--datasets`: Dataset names, supports inputting multiple datasets separated by spaces. Datasets will be automatically downloaded from modelscope. For supported datasets, refer to the [Dataset List](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
+- `--limit`: Maximum amount of evaluation data for each dataset. If not specified, it defaults to evaluating all data. Can be used for quick validation
+
+### Output Results
 ```
++-----------------------+-------------------+-----------------+
+| Model                 | ai2_arc           | gsm8k           |
++=======================+===================+=================+
+| Qwen2.5-0.5B-Instruct | (ai2_arc/acc) 0.6 | (gsm8k/acc) 0.6 |
++-----------------------+-------------------+-----------------+
+```
+
+## ⚙️ Complex Evaluation
+For more customized evaluations, such as customizing model parameters or dataset parameters, you can use the following command. The evaluation startup method is the same as simple evaluation. Below shows how to start the evaluation using the `eval` command:
+
+```shell
+evalscope eval \
+ --model Qwen/Qwen2.5-0.5B-Instruct \
+ --model-args revision=master,precision=torch.float16,device_map=auto \
+ --generation-config do_sample=true,temperature=0.5 \
+ --dataset-args '{"gsm8k": {"few_shot_num": 0, "few_shot_random": false}}' \
+ --datasets gsm8k \
+ --limit 10
+```
+
+### Parameter
+- `--model-args`: Model loading parameters, separated by commas in `key=value` format. Default parameters:
+  - `revision`: Model version, default is `master`
+  - `precision`: Model precision, default is `auto`
+  - `device_map`: Model device allocation, default is `auto`
+- `--generation-config`: Generation parameters, separated by commas in `key=value` format. Default parameters:
+  - `do_sample`: Whether to use sampling, default is `false`
+  - `max_length`: Maximum length, default is 2048
+  - `max_new_tokens`: Maximum length of generation, default is 512
+- `--dataset-args`: Configuration parameters for evaluation datasets, passed in `json` format. The key is the dataset name, and the value is the parameters. Note that it needs to correspond one-to-one with the values in the `--datasets` parameter:
+  - `few_shot_num`: Number of few-shot examples
+  - `few_shot_random`: Whether to randomly sample few-shot data, if not set, defaults to `true`
+
+Reference: [Full Parameter Description](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html)


 ## Evaluation Backend
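The "Complex Evaluation" command added above presumably has a Python-side counterpart through the same `run_task` entry point shown earlier in this hunk. The sketch below is an illustration only: `model`, `datasets`, and `limit` are confirmed by this diff, while the `model_args`, `generation_config`, and `dataset_args` keys are assumed to mirror the corresponding CLI flags and are not spelled out here.

```python
from evalscope.run import run_task

# Hypothetical dict-based counterpart of the `evalscope eval` command above.
# Keys other than model/datasets/limit are assumed to mirror the CLI flags
# (--model-args, --generation-config, --dataset-args); not confirmed by this diff.
task_cfg = {
    'model': 'Qwen/Qwen2.5-0.5B-Instruct',
    'model_args': {'revision': 'master', 'precision': 'torch.float16', 'device_map': 'auto'},
    'generation_config': {'do_sample': True, 'temperature': 0.5},
    'dataset_args': {'gsm8k': {'few_shot_num': 0, 'few_shot_random': False}},
    'datasets': ['gsm8k'],
    'limit': 10,
}

run_task(task_cfg=task_cfg)
```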
@@ -418,12 +428,7 @@ Speed Benchmark Results:
 ```

 ## Custom Dataset Evaluation
-EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset.html)
-
-## Offline Evaluation
-You can use local dataset to evaluate the model without internet connection.
-
-Refer to: Offline Evaluation [📖 User Guide](https://evalscope.readthedocs.io/en/latest/user_guides/offline_evaluation.html)
+EvalScope supports custom dataset evaluation. For detailed information, please refer to the Custom Dataset Evaluation [📖User Guide](https://evalscope.readthedocs.io/en/latest/advanced_guides/custom_dataset/index.html)


 ## Arena Mode