evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
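The per-file summary below (added, removed, and renamed files with line counts) can be reproduced from the two wheels themselves, since a wheel is a zip archive. A minimal sketch, assuming both .whl files have already been downloaded locally; the filenames follow the standard wheel naming convention and are not paths taken from this page:

# Minimal sketch: compare the file listings of two locally downloaded wheels.
# The wheel paths below are assumptions; point them at wherever the files live.
import zipfile

OLD_WHEEL = 'evalscope-0.17.1-py3-none-any.whl'
NEW_WHEEL = 'evalscope-1.0.1-py3-none-any.whl'


def wheel_files(path: str) -> set:
    """Return the set of file paths stored inside a wheel (a zip archive)."""
    with zipfile.ZipFile(path) as whl:
        return set(whl.namelist())


old_files = wheel_files(OLD_WHEEL)
new_files = wheel_files(NEW_WHEEL)

print('added  :', len(new_files - old_files))
print('removed:', len(old_files - new_files))
for name in sorted(new_files - old_files):
    print('+', name)
for name in sorted(old_files - new_files):
    print('-', name)

This only recovers file-level additions and removals; per-file line counts like those below require diffing the extracted file contents as well.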
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +5 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +356 -0
- evalscope/api/benchmark/meta.py +121 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +262 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +378 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +275 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +243 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +155 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/app.py +3 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +26 -14
- evalscope/app/utils/data_utils.py +43 -27
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -14
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +7 -10
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +10 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +136 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +157 -57
- evalscope/constants.py +37 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +275 -419
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +47 -33
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +67 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +126 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +701 -0
- evalscope/perf/benchmark.py +4 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +15 -10
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +11 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -3
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +51 -35
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +33 -47
- evalscope/summarizer.py +1 -1
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +3 -2
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +142 -6
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +11 -7
- evalscope/utils/multi_choices.py +288 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
- tests/benchmark/test_eval.py +385 -0
- tests/benchmark/test_image_edit.py +65 -0
- tests/{aigc → benchmark}/test_t2i.py +22 -4
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +85 -47
- tests/cli/test_collection.py +20 -8
- tests/cli/test_custom.py +22 -15
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -2
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/models/local_model.py
DELETED
@@ -1,128 +0,0 @@
-import importlib
-from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Optional
-
-from evalscope.constants import DEFAULT_MODEL_CACHE_DIR, DEFAULT_MODEL_REVISION, EvalType, ModelTask
-from evalscope.utils.logger import get_logger
-from evalscope.utils.model_utils import get_device
-
-if TYPE_CHECKING:
-    from evalscope.config import TaskConfig
-
-logger = get_logger()
-
-
-class LocalModel(ABC):
-
-    def __init__(self,
-                 model_id: str,
-                 model_revision: str = None,
-                 device_map: str = None,
-                 torch_dtype: str = 'auto',
-                 cache_dir: str = None,
-                 **kwargs):
-
-        self.model_id = model_id
-        self.model_revision = model_revision or DEFAULT_MODEL_REVISION
-        self.device = device_map or get_device()
-        self.cache_dir = cache_dir or DEFAULT_MODEL_CACHE_DIR
-        self.kwargs = kwargs
-        self.model = None
-        self.tokenizer = None
-
-        if isinstance(torch_dtype, str) and torch_dtype != 'auto':
-            import torch
-            torch_dtype = eval(torch_dtype)
-        self.torch_dtype = torch_dtype
-
-        self.model_cfg = {
-            'model_id': self.model_id,
-            'device_map': self.device,
-            'torch_dtype': str(self.torch_dtype),
-        }
-
-    @abstractmethod
-    def load_model(self):
-        pass
-
-
-class LocalChatModel(LocalModel):
-
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-
-    def load_model(self):
-        from modelscope import AutoModelForCausalLM, AutoTokenizer
-
-        logger.info(f'Loading model {self.model_id} ...')
-
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            self.model_id,
-            revision=self.model_revision,
-            trust_remote_code=True,
-            cache_dir=self.cache_dir,
-        )
-
-        # Fix no padding
-        if self.tokenizer.pad_token is None:
-            self.tokenizer.pad_token = self.tokenizer.eos_token
-
-        self.model = AutoModelForCausalLM.from_pretrained(
-            self.model_id,
-            revision=self.model_revision,
-            device_map=self.device,
-            trust_remote_code=True,
-            torch_dtype=self.torch_dtype,
-            cache_dir=self.cache_dir,
-        )
-
-
-class LocalImageModel(LocalModel):
-
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-
-        self.pipeline_cls = self.kwargs.pop('pipeline_cls', None)
-        # default to DiffusionPipeline if not specified
-        if self.pipeline_cls is None:
-            if 'flux' in self.model_id.lower():
-                self.pipeline_cls = 'FluxPipeline'
-            else:
-                self.pipeline_cls = 'DiffusionPipeline'
-
-    def load_model(self):
-        # from modelscope import pipeline_cls
-        module = getattr(importlib.import_module('modelscope'), self.pipeline_cls)
-
-        logger.info(f'Loading model {self.model_id} with {self.pipeline_cls} ...')
-
-        self.model = module.from_pretrained(
-            self.model_id,
-            revision=self.model_revision,
-            torch_dtype=self.torch_dtype,
-            cache_dir=self.cache_dir,
-            **self.kwargs,
-        )
-
-        self.model.to(self.device)
-
-    def __call__(self, *args, **kwargs):
-        return self.model(*args, **kwargs)
-
-
-def get_local_model(task_cfg: 'TaskConfig') -> Optional[LocalModel]:
-    """Get the base local model for the task. If the task is not checkpoint-based, return None.
-    Avoids loading model multiple times for different datasets.
-    """
-    if task_cfg.eval_type != EvalType.CHECKPOINT:
-        return None
-    elif task_cfg.model_task == ModelTask.TEXT_GENERATION:
-        base_model = LocalChatModel(model_id=task_cfg.model, **task_cfg.model_args)
-        base_model.load_model()
-        return base_model
-    elif task_cfg.model_task == ModelTask.IMAGE_GENERATION:
-        base_model = LocalImageModel(model_id=task_cfg.model, **task_cfg.model_args)
-        base_model.load_model()
-        return base_model
-    else:
-        raise ValueError(f'Unsupported model task: {task_cfg.model_task} for model checkpoint.')
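For reference, a minimal usage sketch of the helpers that disappear with this file. It only runs against evalscope 0.17.x (the module no longer exists in 1.0.1); the model id and config values are illustrative, and passing model_task as a TaskConfig constructor argument is an assumption inferred from the attributes referenced in the code above:

# Hypothetical usage of the removed helpers (evalscope 0.17.x only).
from evalscope.config import TaskConfig
from evalscope.constants import EvalType, ModelTask
from evalscope.models.local_model import LocalChatModel, get_local_model

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',       # illustrative model id
    eval_type=EvalType.CHECKPOINT,            # get_local_model() returns None for non-checkpoint tasks
    model_task=ModelTask.TEXT_GENERATION,     # assumed constructor field, referenced as task_cfg.model_task above
)
base_model = get_local_model(task_cfg)        # loads model + tokenizer once, reused across datasets

# Equivalent direct construction without going through TaskConfig:
chat_model = LocalChatModel(model_id='Qwen/Qwen2.5-0.5B-Instruct', torch_dtype='auto')
chat_model.load_model()                       # populates .model and .tokenizer via modelscope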
evalscope/models/register.py
DELETED
@@ -1,41 +0,0 @@
-MODEL_ADAPTERS = {}
-
-
-def register_model_adapter(name):
-    """
-    Decorator to register a model adapter with a given name.
-    :param name: The name of the model adapter.
-    """
-
-    def decorator(adapter):
-        if name in MODEL_ADAPTERS:
-            raise ValueError(f"Model adapter '{name}' is already registered.")
-        MODEL_ADAPTERS[name] = adapter
-        return adapter
-
-    return decorator
-
-
-def get_model_adapter(name):
-    """
-    Retrieve a registered model adapter by name.
-    :param name: The name of the model adapter.
-    :return: The model adapter class or function.
-    """
-    if name not in MODEL_ADAPTERS:
-        raise ValueError(
-            f"Model adapter '{name}' is not registered. Available model adapters: {list(MODEL_ADAPTERS.keys())}")
-    return MODEL_ADAPTERS[name]
-
-
-def register_model_adapter_class(cls, name=None):
-    """
-    Register a model adapter class.
-    :param cls: The model adapter class to register
-    :param name: Optional name for the model adapter. If not provided, the class name will be used.
-    """
-    if name is None:
-        name = cls.__name__
-    if name in MODEL_ADAPTERS:
-        raise ValueError(f"Model adapter class '{name}' is already registered.")
-    MODEL_ADAPTERS[name] = cls
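A minimal sketch of how this removed decorator registry was typically used in 0.17.x; the adapter name and class below are hypothetical:

# Hypothetical example of the removed registry API shown above (evalscope 0.17.x only).
from evalscope.models.register import get_model_adapter, register_model_adapter


@register_model_adapter('my_custom_adapter')   # hypothetical adapter name
class MyCustomAdapter:
    """Toy adapter registered under the key 'my_custom_adapter'."""

    def predict(self, prompt: str) -> str:
        return prompt.upper()


adapter_cls = get_model_adapter('my_custom_adapter')   # looks the class up by name
assert adapter_cls is MyCustomAdapter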
tests/cli/test_run.py
DELETED
@@ -1,489 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-from dotenv import dotenv_values
-
-from tests.utils import test_level_list
-
-env = dotenv_values('.env')
-
-import os
-import subprocess
-import unittest
-
-from evalscope.config import TaskConfig
-from evalscope.constants import EvalStage, EvalType, JudgeStrategy, OutputType
-from evalscope.run import run_task
-from evalscope.utils.import_utils import is_module_installed
-from evalscope.utils.logger import get_logger
-
-os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
-
-logger = get_logger()
-
-
-class TestRun(unittest.TestCase):
-
-    def setUp(self) -> None:
-        logger.info('Init env for evalscope native run UTs ...\n')
-        self._check_env('evalscope')
-
-    def tearDown(self) -> None:
-        pass
-
-    @staticmethod
-    def _check_env(module_name: str):
-        if is_module_installed(module_name):
-            logger.info(f'{module_name} is installed.')
-        else:
-            raise ModuleNotFoundError(f'run: pip install {module_name}')
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_simple_eval(self):
-        model = 'qwen/Qwen2-0.5B-Instruct'
-        datasets = 'arc'  # arc ceval
-        limit = 10
-
-        cmd_simple = f'evalscope eval ' \
-                     f'--model {model} ' \
-                     f'--datasets {datasets} ' \
-                     f'--limit {limit}'
-
-        logger.info(f'Start to run command: {cmd_simple}')
-        run_res = subprocess.run(cmd_simple, text=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-
-        assert run_res.returncode == 0, f'Failed to run command: {cmd_simple}'
-        logger.info(f'>>test_run_simple_eval stdout: {run_res.stdout}')
-        logger.error(f'>>test_run_simple_eval stderr: {run_res.stderr}')
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_eval_with_args(self):
-        model = 'qwen/Qwen2-0.5B-Instruct'
-        datasets = 'arc'  # arc ceval
-        limit = 5
-        dataset_args = '{"ceval": {"few_shot_num": 0, "few_shot_random": false}}'
-
-        cmd_with_args = f'evalscope eval ' \
-                        f'--model {model} ' \
-                        f'--datasets {datasets} ' \
-                        f'--limit {limit} ' \
-                        f'--generation-config do_sample=true,temperature=0.6,max_length=65535,max_new_tokens=65535,max_tokens=65535,n=1,top_p=0.95,top_k=20 ' \
-                        f"""--dataset-args \'{dataset_args}\' """
-
-        logger.info(f'Start to run command: {cmd_with_args}')
-        run_res = subprocess.run(cmd_with_args, text=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-
-        assert run_res.returncode == 0, f'Failed to run command: {cmd_with_args}'
-        logger.info(f'>>test_run_eval_with_args stdout: {run_res.stdout}')
-        logger.error(f'>>test_run_eval_with_args stderr: {run_res.stderr}')
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_yaml_config(self):
-        from evalscope import run_task
-
-        run_task(task_cfg='examples/tasks/eval_native.yaml')
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_task(self):
-        task_cfg = TaskConfig(
-            model='qwen/Qwen2.5-0.5B-Instruct',
-            datasets=[
-                'iquiz',
-                # 'ifeval',
-                # 'mmlu',
-                # 'mmlu_pro',
-                # 'musr',
-                # 'process_bench',
-                # 'race',
-                # 'trivia_qa',
-                # 'cmmlu',
-                # 'humaneval',
-                # 'super_gpqa',
-                # 'gsm8k',
-                # 'bbh',
-                # 'competition_math',
-                # 'math_500',
-                'aime24',
-                'gpqa',
-                # 'arc',
-                # 'ceval',
-                # 'hellaswag',
-                # 'general_mcq',
-                # 'general_qa'
-            ],
-            dataset_args={
-                'mmlu': {
-                    'subset_list': ['elementary_mathematics'],
-                    'few_shot_num': 0
-                },
-                'mmlu_pro': {
-                    'subset_list': ['math', 'health'],
-                    'few_shot_num': 4
-                },
-                'ceval': {
-                    'subset_list': [
-                        'computer_network', 'operating_system', 'computer_architecture'
-                    ],
-                    'few_shot_num': 0
-                },
-                'cmmlu': {
-                    'subset_list': ['elementary_chinese'],
-                    'few_shot_num': 0
-                },
-                'bbh': {
-                    'subset_list': ['word_sorting', 'movie_recommendation'],
-                },
-                'gpqa': {
-                    'subset_list': ['gpqa_diamond'],
-                    'few_shot_num': 0
-                },
-                'humaneval': {
-                    'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
-                },
-                'competition_math': {
-                    'subset_list': ['Level 1']
-                },
-                'process_bench': {
-                    'subset_list': ['gsm8k'],
-                },
-                'musr': {
-                    'subset_list': ['murder_mysteries'],
-                },
-                'general_mcq': {
-                    'local_path': 'custom_eval/text/mcq',  # custom dataset path
-                    'subset_list': [
-                        'example'  # evaluation dataset name, i.e. the * in the *_dev.csv files above
-                    ],
-                    'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
-                },
-                'general_qa': {
-                    'local_path': 'custom_eval/text/qa',  # custom dataset path
-                    'subset_list': [
-                        'example',  # evaluation dataset name, i.e. the * in the *_dev.csv files above
-                        # 'test'
-                    ],
-                    'metric_list': ['AverageBLEU']
-                },
-                'super_gpqa': {
-                    'subset_list': ['Philosophy', 'Education'],
-                    'few_shot_num': 0
-                },
-                'ifeval': {
-                    'filters': {
-                        'remove_until': '</think>'
-                    }
-                }
-            },
-            limit=2,
-            eval_batch_size=2,
-            generation_config={
-                'max_new_tokens': 2048,
-                'temperature': 0.7,
-                'num_return_sequences': 1,
-            },
-            # debug=True
-        )
-        run_task(task_cfg=task_cfg)
-
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_one_task(self):
-        from evalscope.config import TaskConfig
-
-        task_cfg = TaskConfig(
-            model='Qwen/Qwen3-1.7B',
-            datasets=[
-                # 'iquiz',
-                # 'math_500',
-                # 'aime24',
-                # 'competition_math',
-                # 'mmlu',
-                # 'simple_qa',
-                'truthful_qa',
-            ],
-            dataset_args={
-                'competition_math': {
-                    'subset_list': ['Level 4', 'Level 5']
-                },
-                'mmlu': {
-                    'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
-                    'few_shot_num': 0
-                },
-            },
-            limit=5,
-            eval_batch_size=5,
-            generation_config={
-                'max_new_tokens': 1000,  # maximum number of generated tokens; set a large value to avoid truncated output
-                'temperature': 0.7,  # sampling temperature (value recommended in the Qwen report)
-                'top_p': 0.8,  # top-p sampling (value recommended in the Qwen report)
-                'top_k': 20,  # top-k sampling (value recommended in the Qwen report)
-                'chat_template_kwargs': {'enable_thinking': False}  # disable thinking mode
-            },
-            judge_strategy=JudgeStrategy.AUTO,
-        )
-
-        run_task(task_cfg=task_cfg)
-
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_task_loop(self):
-        os.environ['CUDA_VISIBLE_DEVICES'] = '2'
-        from evalscope.config import TaskConfig
-
-        task_cfg1 = TaskConfig(
-            model='Qwen/Qwen2.5-0.5B-Instruct',
-            model_id='model1',
-            datasets=['iquiz'],
-            limit=10
-        )
-        task_cfg2 = TaskConfig(
-            model='Qwen/Qwen2.5-0.5B-Instruct',
-            model_id='model2',
-            datasets=['iquiz'],
-            limit=10
-        )
-        task_cfg3 = TaskConfig(
-            model='Qwen/Qwen2.5-0.5B-Instruct',
-            model_id='model3',
-            datasets=['iquiz'],
-            limit=10
-        )
-
-        run_task(task_cfg=[task_cfg1, task_cfg2, task_cfg3])
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_server_model(self):
-        from evalscope.config import TaskConfig
-
-        task_cfg = TaskConfig(
-            model='qwen-plus',
-            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-            api_key=env.get('DASHSCOPE_API_KEY'),
-            eval_type=EvalType.SERVICE,
-            datasets=[
-                # 'iquiz',
-                # 'ifeval',
-                # 'mmlu',
-                # 'mmlu_pro',
-                # 'musr',
-                # 'process_bench',
-                # 'race',
-                'trivia_qa',
-                # 'cmmlu',
-                # 'humaneval',
-                # 'gsm8k',
-                # 'bbh',
-                # 'competition_math',
-                # 'math_500',
-                # 'aime24',
-                # 'gpqa',
-                # 'arc',
-                # 'ceval',
-                # 'hellaswag',
-                # 'general_mcq',
-                # 'general_qa',
-                # 'super_gpqa',
-                # 'mmlu_redux',
-                # 'maritime_bench',
-                # 'drop',
-                # 'winogrande',
-                # 'tool_bench',
-                # 'frames',
-                # 'bfcl_v3',
-                # 'truthful_qa',
-                # 'tau_bench',
-                # 'hle'
-            ],
-            dataset_args={
-                'mmlu': {
-                    'subset_list': ['elementary_mathematics', 'high_school_european_history', 'nutrition'],
-                    'few_shot_num': 0
-                },
-                'mmlu_pro': {
-                    'subset_list': ['math', 'health'],
-                    'few_shot_num': 0
-                },
-                'ceval': {
-                    'subset_list': [
-                        'computer_network', 'operating_system', 'computer_architecture'
-                    ],
-                    'few_shot_num': 0
-                },
-                'cmmlu': {
-                    'subset_list': ['elementary_chinese'],
-                    'few_shot_num': 0
-                },
-                'bbh': {
-                    'subset_list': ['word_sorting', 'movie_recommendation'],
-                },
-                'gpqa': {
-                    # 'subset_list': ['gpqa_diamond'],
-                    'few_shot_num': 0,
-                    'local_path': './data/data/gpqa',
-                },
-                'humaneval': {
-                    'metric_list': ['Pass@1', 'Pass@2', 'Pass@5'],
-                },
-                'competition_math': {
-                    'subset_list': ['Level 1']
-                },
-                'process_bench': {
-                    'subset_list': ['gsm8k'],
-                },
-                'musr': {
-                    'subset_list': ['murder_mysteries'],
-                },
-                'general_mcq': {
-                    'local_path': 'custom_eval/text/mcq',  # custom dataset path
-                    'subset_list': [
-                        'example'  # evaluation dataset name, i.e. the * in the *_dev.csv files above
-                    ],
-                    'query_template': 'Question: {question}\n{choices}\nAnswer: {answer}'  # question template
-                },
-                'general_qa': {
-                    'local_path': 'custom_eval/text/qa',  # custom dataset path
-                    'subset_list': [
-                        'example',  # evaluation dataset name, i.e. the * in the *_dev.csv files above
-                        # 'test'
-                    ],
-                    'metric_list': ['AverageRouge']
-                },
-                'super_gpqa': {
-                    'subset_list': ['Philosophy', 'Education'],
-                    'few_shot_num': 0
-                },
-                'mmlu_redux': {
-                    'subset_list': ['abstract_algebra']
-                },
-                'frames': {
-                    'local_path': 'data/iic/frames',
-                },
-                'bfcl_v3': {
-                    'subset_list': ['parallel'],
-                    'extra_params': {
-                        # 'is_fc_model': False,
-                    }
-                },
-                'tau_bench': {
-                    'extra_params': {
-                        'user_model': 'qwen-plus',
-                        'api_key': env.get('DASHSCOPE_API_KEY'),
-                        'api_base': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                    }
-                },
-                'hle': {
-                    'subset_list': ['Math', 'Other'],
-                },
-            },
-            eval_batch_size=10,
-            limit=10,
-            # debug=True,
-            stream=True,
-            generation_config={
-                'temperature': 0.6,
-                'n': 1,
-                'max_tokens': 4096,
-                # 'extra_headers':{'key': 'value'},
-            },
-            ignore_errors=False,
-        )
-
-        run_task(task_cfg=task_cfg)
-
-
-
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
-    def test_run_judge_model(self):
-        from evalscope.config import TaskConfig
-
-        task_cfg = TaskConfig(
-            model='qwen-plus',
-            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-            api_key=env.get('DASHSCOPE_API_KEY'),
-            eval_type=EvalType.SERVICE,
-            datasets=[
-                # 'math_500',
-                # 'aime24',
-                # 'competition_math',
-                # 'arc',
-                # 'gsm8k',
-                # 'truthful_qa',
-                # 'simple_qa',
-                # 'chinese_simpleqa',
-                # 'live_code_bench',
-                # 'humaneval',
-                # 'general_qa',
-                # 'alpaca_eval',
-                # 'arena_hard',
-                # 'frames',
-                # 'docmath',
-                # 'needle_haystack',
-                # 'ifeval',
-                'hle'
-            ],
-            dataset_args={
-                'needle_haystack': {
-                    'subset_list': ['english'],
-                    'extra_params': {
-                        'show_score': True,
-                    }
-                },
-                'competition_math': {
-                    'subset_list': ['Level 4']
-                },
-                'live_code_bench': {
-                    'extra_params': {
-                        'start_date': '2024-08-01',
-                        'end_date': '2025-02-28'
-                    },
-                    'local_path': '/root/.cache/modelscope/hub/datasets/AI-ModelScope/code_generation_lite'
-                },
-                'general_qa': {
-                    'local_path': 'custom_eval/text/qa',  # custom dataset path
-                    'subset_list': [
-                        'example',  # evaluation dataset name, i.e. the * in the *_dev.csv files above
-                        # 'test'
-                    ]
-                },
-                'chinese_simpleqa': {
-                    'subset_list': [
-                        '中华文化'
-                    ]
-                },
-                'frames': {
-                    'local_path': '/root/.cache/modelscope/hub/datasets/iic/frames'
-                },
-                'hle': {
-                    'subset_list': ['Math', 'Other'],
-                },
-            },
-            eval_batch_size=10,
-            limit=3,
-            judge_strategy=JudgeStrategy.LLM,
-            judge_worker_num=5,
-            judge_model_args={
-                'model_id': 'qwen2.5-72b-instruct',
-                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
-                'api_key': env.get('DASHSCOPE_API_KEY'),
-                'generation_config': {
-                    'temperature': 0.0,
-                    'max_tokens': 4096
-                }
-            },
-            generation_config={
-                'max_new_tokens': 20000,
-                'temperature': 0.0,
-                'seed': 42,
-                'n': 1
-            },
-            timeout=60000,
-            stream=True,
-            use_cache='outputs/20250714_150626'
-            # analysis_report=True,
-            # debug=True,
-            # use_cache='outputs/20250616_161931'
-        )
-
-        run_task(task_cfg=task_cfg)
-
-
-if __name__ == '__main__':
-    unittest.main()