evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/__init__.py +0 -0
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +3 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
- evalscope/api/benchmark/benchmark.py +321 -0
- evalscope/api/benchmark/meta.py +115 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +261 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +355 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +264 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +11 -0
- evalscope/api/messages/chat_message.py +198 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +105 -0
- evalscope/api/mixin/__init__.py +2 -0
- evalscope/api/mixin/dataset_mixin.py +105 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +157 -0
- evalscope/api/model/model.py +383 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +23 -11
- evalscope/app/utils/data_utils.py +42 -26
- evalscope/app/utils/text_utils.py +0 -2
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +6 -7
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -3
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +2 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +135 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +95 -54
- evalscope/constants.py +29 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +277 -423
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +32 -30
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +47 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +123 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +698 -0
- evalscope/perf/benchmark.py +2 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +7 -5
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +8 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -2
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +101 -6
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +26 -44
- evalscope/summarizer.py +1 -1
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +2 -1
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/io_utils.py +100 -5
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +10 -7
- evalscope/utils/multi_choices.py +271 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
- tests/aigc/test_t2i.py +22 -4
- tests/benchmark/__init__.py +1 -0
- tests/benchmark/test_eval.py +386 -0
- tests/cli/test_all.py +3 -5
- tests/cli/test_collection.py +13 -4
- tests/cli/test_custom.py +22 -15
- tests/rag/test_clip_benchmark.py +1 -0
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
evalscope/utils/model_utils.py
CHANGED
```diff
@@ -1,7 +1,5 @@
 import numpy as np
-import os
 import random
-import torch
 from enum import Enum
 from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union

@@ -69,8 +67,13 @@ def seed_everything(seed: int):
     """
     random.seed(seed)
     np.random.seed(seed)
-    …
-    …
-    …
-    torch.…
-    torch.…
+    try:
+        import torch
+
+        torch.manual_seed(seed)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(seed)
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False
+    except ImportError:
+        pass
```
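The practical effect of this hunk is that `torch` becomes an optional dependency for seeding: `seed_everything` still seeds `random` and NumPy, and only touches the PyTorch/CUDA RNGs when `torch` can be imported. Usage is unchanged; a minimal sketch:

```python
from evalscope.utils.model_utils import seed_everything

# Seeds random and numpy; also seeds torch/CUDA and pins cuDNN to
# deterministic mode when torch happens to be installed.
seed_everything(42)
```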
evalscope/utils/multi_choices.py
ADDED
```python
# flake8: noqa: E501
from __future__ import annotations

import re
from typing import TYPE_CHECKING, List, Optional

if TYPE_CHECKING:
    from evalscope.api.evaluator import Choices, Target, TaskState

FEW_SHOT_TEMPLATE = r"""Here are some examples of how to answer similar questions:

{fewshot}

""".lstrip()

CHINESE_FEW_SHOT_TEMPLATE = r"""以下是一些示例问题:

{fewshot}

""".lstrip()

CHINESE_SINGLE_ANSWER_TEMPLATE = r"""回答下面的单项选择题,请选出其中的正确答案。你的回答的最后一行应该是这样的格式:"答案:LETTER"(不带引号),其中 LETTER 是 {letters} 中的一个。

问题:{question}
选项:
{choices}
""".lstrip()

CHINESE_SINGLE_ANSWER_TEMPLATE_COT = r"""回答下面的单项选择题,请选出其中的正确答案。你的回答的最后一行应该是这样的格式:"答案:LETTER"(不带引号),其中 LETTER 是 {letters} 中的一个。请在回答前进行一步步思考。

问题:{question}
选项:
{choices}
""".lstrip()

SINGLE_ANSWER_TEMPLATE = r"""
Answer the following multiple choice question. The entire content of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}.

{question}

{choices}
""".strip()

SINGLE_ANSWER_TEMPLATE_COT = r"""
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}. Think step by step before answering.

{question}

{choices}
""".strip()

MULTIPLE_ANSWER_TEMPLATE = r"""
Answer the following multiple choice question where multiple answers may be correct. The entire content of your response should be of the following format: 'ANSWER: $LETTERS' (without quotes) where LETTERS is one or more of {letters}.

{question}

{choices}
""".strip()

MULTIPLE_ANSWER_TEMPLATE_COT = r"""
Answer the following multiple choice question where multiple answers may be correct. The last line of your response should be of the following format: 'ANSWER: $LETTERS' (without quotes) where LETTERS is one or more of {letters}. Think step by step before answering.

{question}

{choices}
""".strip()


def unshuffle_choices(choices: Choices) -> Choices:
    # `sorted` returns `list[Choice]`, but for consistency we wrap this back
    # into a `Choices` object
    return Choices(sorted(choices, key=lambda choice: choice.original_position))


def answer_options(choices: Choices) -> str:
    r"""
    Returns the `choices` formatted as a multiple choice question, e.g.:

    ["choice 1", "choice 2", "choice 3"] ->
    "A) choice 1\nB) choice 2\nC) choice 3"
    """
    indexes = list(range(len(choices)))

    return '\n'.join([f'{answer_character(i)}) {choices[j].value}' for i, j in enumerate(indexes)])


def prompt(question: str, choices: Choices, template: str, fewshot: Optional[str] = None) -> str:

    choices_text = answer_options(choices)
    letters = ','.join(answer_character(i) for i in range(len(choices)))
    if not fewshot:
        return template.format(
            choices=choices_text,
            letters=letters,
            question=question,
        )
    else:
        return template.format(
            choices=choices_text,
            letters=letters,
            question=question,
            fewshot=fewshot,
        )


def format_example(
    question: str,
    choices: Choices,
    answer: Target,
) -> str:
    """Format a single example for few-shot learning.

    Args:
        question (str): The question text.
        choices (list[str]): The list of choices.
        answer (list[str]): The correct answers.

    Returns:
        str: Formatted example string.
    """
    choices_text = answer_options(choices)
    return f'{question}\n{choices_text}\nANSWER: {answer.text}'


def parse_answers(state: TaskState, multiple_correct: bool = False) -> set[str]:
    """
    Convenience function for extracting answers from the state output.

    The generated response must be in the format 'ANSWER: <answers>',
    otherwise we can't extract what the model thinks is "true". We can be a
    bit flexible whether these are "AB" vs "A,B" vs "A B".

    However, if the answer isn't in the expected format the model has
    failed in the task so we'll ultimately just mark it as incorrect
    """
    # First check whether the string strictly ends with the expected answer
    # In this case, we're looking for a single line which contains the expected
    # ANSWER: <answer> string with only whitespace or a period/full stop at the end.
    match = re.search(
        r'(?i)^ANSWER\s*:\s*([A-Za-z\d ,]+)\s*(?:$|\n|\.)',
        state.output.completion,
        flags=re.MULTILINE,
    )

    # If we couldn't match the strict version, we can try the less strict
    # version for backward compatibility
    if match is None:
        match = re.search(
            r'(?i)ANSWER\s*:\s*([A-Za-z\d ,]+)(?:[^\w]|\n|$|\.)',
            state.output.completion,
        )

    if match is None:
        return set()

    matched = match.group(1)

    # Strip trailing period / full stop
    matched = matched.strip()
    matched = matched.rstrip('.')

    allowed_options = set(answer_character(i) for i in range(len(state.choices)))

    if multiple_correct:
        # Match must contain only the allowed choices
        # (may be separated by commas, spaces, the word 'and', or nothing at all)

        matched = matched.replace(' and ', '')

        matched = matched.replace(' ', '')

        split_comma = set(matched.split(','))
        if split_comma.issubset(allowed_options):
            answers = split_comma
            return answers

        split_nothing = set(matched)
        if split_nothing.issubset(allowed_options):
            answers = split_nothing
            return answers

    else:
        # Match must contain a single letter in the allowed choices
        if matched in allowed_options:
            answers = {matched}
            return answers

    return set()


def parse_answers_zh(state: TaskState, multiple_correct: bool = False) -> set[str]:
    """
    Convenience function for extracting answers from the state output in Chinese format.

    The generated response must be in the format '答案:选项',
    otherwise we can't extract what the model thinks is "true". We can be a
    bit flexible whether these are "AB" vs "A,B" vs "A B".
    """
    # Simple pattern to capture answers with optional bold markdown
    pattern = r'答案\s*[::]\s*([A-Za-z0-9,,]+)'
    match = re.search(pattern, state.output.completion, flags=re.MULTILINE)

    if match is None:
        return set()

    matched = match.group(1).strip().rstrip('。.')
    allowed_options = set(answer_character(i) for i in range(len(state.choices)))

    if multiple_correct:
        # Handle comma-separated or continuous letters
        matched = matched.replace(' 和 ', '').replace(' ', '').replace(',', ',')
        answers = set(matched.split(',')) if ',' in matched else set(matched)
        return answers if answers.issubset(allowed_options) else set()
    else:
        # Single answer
        return {matched} if matched in allowed_options else set()


def set_choices_based_on_generated_response(state: TaskState, answers: set[str]) -> None:
    true_answers = [answer_index(letter) for letter in answers]

    for i in range(len(state.choices)):
        if i in true_answers:
            state.choices.mark_choice(i, True)
        else:
            state.choices.mark_choice(i, False)


def valid_template(template: str) -> bool:
    """Check if a template has the required capture groups for a multiple choice question"""
    return bool(re.search(r'\{question\}', template) and re.search(r'\{choices\}', template))


class MultipleChoiceTemplate:
    """
    Templates for multiple choice questions.
    """

    SINGLE_ANSWER = SINGLE_ANSWER_TEMPLATE
    SINGLE_ANSWER_COT = SINGLE_ANSWER_TEMPLATE_COT
    MULTIPLE_ANSWER = MULTIPLE_ANSWER_TEMPLATE
    MULTIPLE_ANSWER_COT = MULTIPLE_ANSWER_TEMPLATE_COT
    CHINESE_FEW_SHOT_TEMPLATE = CHINESE_FEW_SHOT_TEMPLATE
    CHINESE_SINGLE_ANSWER_TEMPLATE = CHINESE_SINGLE_ANSWER_TEMPLATE
    CHINESE_SINGLE_ANSWER_TEMPLATE_COT = CHINESE_SINGLE_ANSWER_TEMPLATE_COT


def answer_character(index: int) -> str:
    r"""
    Helper to go from array index to char, for example:

    0 -> 'A', 1 -> 'B', etc
    """
    if index < 26:
        return chr(ord('A') + index)
    else:
        return str(index - 25)


def answer_index(char: str) -> int:
    r"""
    Helper to go from char to array index, for example:

    'A' -> 0, 'B' -> 1, etc
    """
    if char.isalpha() or char == ',' or char == ' ':
        return ord(char.upper()) - ord('A')
    elif char.isnumeric():
        return 25 + int(char)
    else:
        raise ValueError(f'Unepxected multiple choice answer: {char} (must be a letter or number)')
```
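A brief usage sketch of the letter/index helpers and templates above (assuming evalscope 1.0 is installed; the parsing functions themselves operate on `TaskState`/`Choices` objects from `evalscope.api.evaluator`):

```python
from evalscope.utils.multi_choices import (
    MultipleChoiceTemplate,
    answer_character,
    answer_index,
    valid_template,
)

# Choice positions map to letters and back: 0 -> 'A', 1 -> 'B', ...
assert answer_character(2) == 'C'
assert answer_index('C') == 2

# Templates expose {question}, {choices} and {letters} placeholders;
# valid_template() checks that {question} and {choices} are present.
assert valid_template(MultipleChoiceTemplate.SINGLE_ANSWER_COT)
```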
evalscope/utils/url_utils.py
ADDED
```python
import base64
import httpx
import mimetypes
import re


def is_http_url(url: str) -> bool:
    return url.startswith('http://') or url.startswith('https://')


def is_data_uri(url: str) -> bool:
    pattern = r'^data:([^;]+);base64,.*'
    return re.match(pattern, url) is not None


def data_uri_mime_type(data_url: str) -> str | None:
    pattern = r'^data:([^;]+);.*'
    match = re.match(pattern, data_url)
    if match:
        mime_type = match.group(1)
        return mime_type
    else:
        return None


def data_uri_to_base64(data_uri: str) -> str:
    pattern = r'^data:[^,]+,'
    stripped_uri = re.sub(pattern, '', data_uri)
    return stripped_uri


def file_as_data(file: str) -> tuple[bytes, str]:
    if is_data_uri(file):
        # resolve mime type and base64 content
        mime_type = data_uri_mime_type(file) or 'image/png'
        file_base64 = data_uri_to_base64(file)
        file_bytes = base64.b64decode(file_base64)
    else:
        # guess mime type; need strict=False for webp images
        type, _ = mimetypes.guess_type(file, strict=False)
        if type:
            mime_type = type
        else:
            mime_type = 'image/png'

        # handle url or file
        if is_http_url(file):
            client = httpx.Client()
            file_bytes = client.get(file).content
        else:
            with open(file, 'rb') as f:
                file_bytes = f.read()

    # return bytes and type
    return file_bytes, mime_type


def file_as_data_uri(file: str) -> str:
    if is_data_uri(file):
        return file
    else:
        bytes, mime_type = file_as_data(file)
        base64_file = base64.b64encode(bytes).decode('utf-8')
        file = f'data:{mime_type};base64,{base64_file}'
    return file
```
{evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA
CHANGED
````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.17.1
+Version: 1.0.0
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -18,7 +18,10 @@ Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: accelerate
-Requires-Dist: …
+Requires-Dist: colorlog
+Requires-Dist: datasets==3.6.0
+Requires-Dist: docstring-parser
+Requires-Dist: dotenv
 Requires-Dist: immutabledict
 Requires-Dist: jieba
 Requires-Dist: jsonlines
@@ -28,11 +31,14 @@ Requires-Dist: matplotlib
 Requires-Dist: modelscope[framework]>=1.27
 Requires-Dist: nltk>=3.9
 Requires-Dist: openai
+Requires-Dist: overrides
 Requires-Dist: pandas
 Requires-Dist: pillow
 Requires-Dist: pyarrow
+Requires-Dist: pydantic
 Requires-Dist: pyyaml>=5.1
 Requires-Dist: requests
+Requires-Dist: rich
 Requires-Dist: rouge-chinese
 Requires-Dist: rouge-score>=0.1.0
 Requires-Dist: sacrebleu
@@ -50,10 +56,14 @@ Requires-Dist: iopath; extra == "aigc"
 Requires-Dist: omegaconf; extra == "aigc"
 Requires-Dist: open-clip-torch; extra == "aigc"
 Requires-Dist: opencv-python; extra == "aigc"
+Requires-Dist: peft>=0.17; extra == "aigc"
 Requires-Dist: torchvision; extra == "aigc"
 Provides-Extra: all
 Requires-Dist: accelerate; extra == "all"
-Requires-Dist: …
+Requires-Dist: colorlog; extra == "all"
+Requires-Dist: datasets==3.6.0; extra == "all"
+Requires-Dist: docstring-parser; extra == "all"
+Requires-Dist: dotenv; extra == "all"
 Requires-Dist: immutabledict; extra == "all"
 Requires-Dist: jieba; extra == "all"
 Requires-Dist: jsonlines; extra == "all"
@@ -63,11 +73,14 @@ Requires-Dist: matplotlib; extra == "all"
 Requires-Dist: modelscope[framework]>=1.27; extra == "all"
 Requires-Dist: nltk>=3.9; extra == "all"
 Requires-Dist: openai; extra == "all"
+Requires-Dist: overrides; extra == "all"
 Requires-Dist: pandas; extra == "all"
 Requires-Dist: pillow; extra == "all"
 Requires-Dist: pyarrow; extra == "all"
+Requires-Dist: pydantic; extra == "all"
 Requires-Dist: pyyaml>=5.1; extra == "all"
 Requires-Dist: requests; extra == "all"
+Requires-Dist: rich; extra == "all"
 Requires-Dist: rouge-chinese; extra == "all"
 Requires-Dist: rouge-score>=0.1.0; extra == "all"
 Requires-Dist: sacrebleu; extra == "all"
@@ -91,7 +104,6 @@ Requires-Dist: webdataset>0.2.0; extra == "all"
 Requires-Dist: aiohttp; extra == "all"
 Requires-Dist: fastapi; extra == "all"
 Requires-Dist: numpy; extra == "all"
-Requires-Dist: rich; extra == "all"
 Requires-Dist: sse-starlette; extra == "all"
 Requires-Dist: transformers; extra == "all"
 Requires-Dist: uvicorn; extra == "all"
@@ -102,8 +114,9 @@ Requires-Dist: iopath; extra == "all"
 Requires-Dist: omegaconf; extra == "all"
 Requires-Dist: open-clip-torch; extra == "all"
 Requires-Dist: opencv-python; extra == "all"
+Requires-Dist: peft>=0.17; extra == "all"
 Requires-Dist: torchvision; extra == "all"
-Requires-Dist: bfcl-eval; extra == "all"
+Requires-Dist: bfcl-eval==2025.6.16; extra == "all"
 Requires-Dist: human-eval; extra == "all"
 Requires-Dist: pytest; extra == "all"
 Requires-Dist: pytest-cov; extra == "all"
@@ -112,7 +125,7 @@ Provides-Extra: app
 Requires-Dist: gradio==5.4.0; extra == "app"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
 Provides-Extra: dev
-Requires-Dist: bfcl-eval; extra == "dev"
+Requires-Dist: bfcl-eval==2025.6.16; extra == "dev"
 Requires-Dist: human-eval; extra == "dev"
 Requires-Dist: pytest; extra == "dev"
 Requires-Dist: pytest-cov; extra == "dev"
@@ -175,9 +188,9 @@ Requires-Dist: ms-vlmeval>=0.0.17; extra == "vlmeval"
 - [📝 Introduction](#-introduction)
 - [☎ User Groups](#-user-groups)
 - [🎉 News](#-news)
-- [🛠️ …
-- [Method 1…
-- [Method 2…
+- [🛠️ Environment Setup](#️-environment-setup)
+- [Method 1. Install via pip](#method-1-install-via-pip)
+- [Method 2. Install from source](#method-2-install-from-source)
 - [🚀 Quick Start](#-quick-start)
 - [Method 1. Using Command Line](#method-1-using-command-line)
 - [Method 2. Using Python Code](#method-2-using-python-code)
@@ -258,6 +271,13 @@ Please scan the QR code below to join our community groups:
 
 
 ## 🎉 News
+
+> [!IMPORTANT]
+> **Version 1.0 Refactoring**
+>
+> Version 1.0 introduces a major overhaul of the evaluation framework, establishing a new, more modular and extensible API layer under `evalscope/api`. Key improvements include standardized data models for benchmarks, samples, and results; a registry-based design for components such as benchmarks and metrics; and a rewritten core evaluator that orchestrates the new architecture. Existing benchmark adapters have been migrated to this API, resulting in cleaner, more consistent, and easier-to-maintain implementations.
+
+- 🔥 **[2025.08.22]** Version 1.0 Refactoring.
 - 🔥 **[2025.07.18]** The model stress testing now supports randomly generating image-text data for multimodal model evaluation. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#id4).
 - 🔥 **[2025.07.16]** Support for [τ-bench](https://github.com/sierra-research/tau-bench) has been added, enabling the evaluation of AI Agent performance and reliability in real-world scenarios involving dynamic user and tool interactions. For usage instructions, please refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset/llm.html#bench).
 - 🔥 **[2025.07.14]** Support for "Humanity's Last Exam" ([Humanity's-Last-Exam](https://modelscope.cn/datasets/cais/hle)), a highly challenging evaluation benchmark. For usage instructions, refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset/llm.html#humanity-s-last-exam).
@@ -268,12 +288,12 @@ Please scan the QR code below to join our community groups:
 - 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
 - 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).
 - 🔥 **[2025.05.13]** Added support for the [ToolBench-Static](https://modelscope.cn/datasets/AI-ModelScope/ToolBench-Static) dataset to evaluate model's tool-calling capabilities. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/toolbench.html) for usage instructions. Also added support for the [DROP](https://modelscope.cn/datasets/AI-ModelScope/DROP/dataPeview) and [Winogrande](https://modelscope.cn/datasets/AI-ModelScope/winogrande_val) benchmarks to assess the reasoning capabilities of models.
+<details><summary>More</summary>
+
 - 🔥 **[2025.04.29]** Added Qwen3 Evaluation Best Practices, [welcome to read 📖](https://evalscope.readthedocs.io/en/latest/best_practice/qwen3.html)
 - 🔥 **[2025.04.27]** Support for text-to-image evaluation: Supports 8 metrics including MPS, HPSv2.1Score, etc., and evaluation benchmarks such as EvalMuse, GenAI-Bench. Refer to the [user documentation](https://evalscope.readthedocs.io/en/latest/user_guides/aigc/t2i.html) for more details.
 - 🔥 **[2025.04.10]** Model service stress testing tool now supports the `/v1/completions` endpoint (the default endpoint for vLLM benchmarking)
 - 🔥 **[2025.04.08]** Support for evaluating embedding model services compatible with the OpenAI API has been added. For more details, check the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/backend/rageval_backend/mteb.html#configure-evaluation-parameters).
-<details><summary>More</summary>
-
 - 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
 - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
 - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
@@ -306,58 +326,87 @@ Please scan the QR code below to join our community groups:
 
 </details>
 
-## 🛠️ …
-…
-…
+## 🛠️ Environment Setup
+
+### Method 1. Install via pip
+
+We recommend using conda to manage your environment and pip to install dependencies. This allows you to use the latest evalscope PyPI package.
 
 1. Create a conda environment (optional)
+```shell
+# Python 3.10 is recommended
+conda create -n evalscope python=3.10
+
+# Activate the conda environment
+conda activate evalscope
+```
+2. Install dependencies via pip
+```shell
+pip install evalscope
+```
+3. Install additional dependencies (optional)
+- To use model service inference benchmarking features, install the perf dependency:
 ```shell
-…
-conda create -n evalscope python=3.10
-# Activate the conda environment
-conda activate evalscope
+pip install 'evalscope[perf]'
 ```
-…
-…
+- To use visualization features, install the app dependency:
+```shell
+pip install 'evalscope[app]'
+```
+- If you need to use other evaluation backends, you can install OpenCompass, VLMEvalKit, or RAGEval as needed:
+```shell
+pip install 'evalscope[opencompass]'
+pip install 'evalscope[vlmeval]'
+pip install 'evalscope[rag]'
+```
+- To install all dependencies:
 ```shell
-pip install evalscope
-# Additional options
-pip install 'evalscope[opencompass]' # Install OpenCompass backend
-pip install 'evalscope[vlmeval]' # Install VLMEvalKit backend
-pip install 'evalscope[rag]' # Install RAGEval backend
-pip install 'evalscope[perf]' # Install dependencies for the model performance testing module
-pip install 'evalscope[app]' # Install dependencies for visualization
-pip install 'evalscope[all]' # Install all backends (Native, OpenCompass, VLMEvalKit, RAGEval)
+pip install 'evalscope[all]'
 ```
 
-> [!…
-> …
+> [!NOTE]
+> The project has been renamed to `evalscope`. For version `v0.4.3` or earlier, you can install it with:
 > ```shell
-> …
+> pip install llmuses<=0.4.3
 > ```
-> …
-> ```
+> Then, import related dependencies using `llmuses`:
+> ```python
 > from llmuses import ...
 > ```
 
-### Method 2…
-…
-…
-git clone https://github.com/modelscope/evalscope.git
-```
+### Method 2. Install from source
+
+Installing from source allows you to use the latest code and makes it easier for further development and debugging.
 
+1. Clone the source code
+```shell
+git clone https://github.com/modelscope/evalscope.git
+```
 2. Install dependencies
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
+```shell
+cd evalscope/
+
+pip install -e .
+```
+3. Install additional dependencies
+- To use model service inference benchmarking features, install the perf dependency:
+```shell
+pip install '.[perf]'
+```
+- To use visualization features, install the app dependency:
+```shell
+pip install '.[app]'
+```
+- If you need to use other evaluation backends, you can install OpenCompass, VLMEvalKit, or RAGEval as needed:
+```shell
+pip install '.[opencompass]'
+pip install '.[vlmeval]'
+pip install '.[rag]'
+```
+- To install all dependencies:
+```shell
+pip install '.[all]'
+```
 
 
 ## 🚀 Quick Start
````
|