evalscope 0.7.2__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +6 -2
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +47 -51
- evalscope/backend/rag_eval/utils/embedding.py +13 -12
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +35 -39
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +27 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +28 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +192 -7
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +33 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +154 -96
- evalscope/constants.py +50 -32
- evalscope/evaluator/evaluator.py +97 -377
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +48 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +16 -3
- evalscope/perf/benchmark.py +9 -11
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +8 -1
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +3 -4
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/plugin/registry.py +3 -3
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +5 -6
- evalscope/perf/utils/db_util.py +77 -30
- evalscope/perf/utils/local_server.py +21 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +153 -381
- evalscope/run_arena.py +21 -25
- evalscope/summarizer.py +27 -40
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -27
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -4
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +5 -6
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +4 -3
- evalscope/tools/combine_reports.py +27 -34
- evalscope/tools/rewrite_eval_results.py +15 -47
- evalscope/utils/__init__.py +1 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +4 -5
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/io_utils.py +162 -0
- evalscope/utils/logger.py +17 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +5 -306
- evalscope/version.py +2 -2
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/METADATA +123 -118
- evalscope-0.8.1.dist-info/RECORD +285 -0
- tests/cli/test_run.py +53 -15
- tests/perf/test_perf.py +6 -1
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- tests/vlm/test_vlmeval.py +3 -2
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.2.dist-info/RECORD +0 -286
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.2.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision:
+  revision: master
   precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -20,8 +20,7 @@ model: null # Note: to be implemented as CustomModel
 eval_type: custom
 datasets:
   - ceval
-outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub:
-limit: null
+dataset_hub: modelscope # `Local` or `ModelScope`
+limit: null

@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision:
+  revision: master
   precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -23,5 +23,5 @@ datasets:
 outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub:
-limit: null
+dataset_hub: modelscope # `Local` or `ModelScope`
+limit: null

@@ -24,5 +24,5 @@ datasets:
 outputs: ./outputs/eval_qwen-7b-chat_v100 # Directory to save the outputs, structure: logs, predictions, reviews, reports
 use_cache: false
 stage: all
-dataset_hub:
+dataset_hub: modelscope # `Local` or `ModelScope`
 limit: 10

@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision:
+  revision: master
   precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -25,5 +25,5 @@ datasets:
 outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub:
+dataset_hub: modelscope # `Local` or `ModelScope`
 limit: null

@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision:
+  revision: master
   precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -25,5 +25,5 @@ datasets:
 outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: true
 stage: all
-dataset_hub:
-limit: null
+dataset_hub: modelscope # `Local` or `ModelScope`
+limit: null

@@ -1,5 +1,5 @@
 model_args: # model args should be followed by benchmark requirements
-  revision:
+  revision: master
   precision: torch.float16
   device_map: auto
   # model_name_or_path: qwen/qwen-7b-chat
@@ -23,5 +23,5 @@ datasets:
 outputs: null # structure: configs, logs, predictions, reviews, reports # TODO: need to parse
 use_cache: false
 stage: all
-dataset_hub:
-limit: null
+dataset_hub: modelscope # `Local` or `ModelScope`
+limit: null
evalscope/run.py
CHANGED
@@ -1,408 +1,180 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
-
-
-import
+"""
+Run evaluation for LLMs.
+"""
+import logging
 import os.path
-
-import
-
-from
-
+import torch
+from argparse import Namespace
+from datetime import datetime
+from typing import List, Optional, Union
+
+from evalscope.arguments import parse_args
+from evalscope.config import TaskConfig, parse_task_config
+from evalscope.constants import DEFAULT_MODEL_REVISION, DEFAULT_WORK_DIR, EvalBackend, EvalType
 from evalscope.evaluator import Evaluator
-from evalscope.evaluator.evaluator import HumanevalEvaluator
 from evalscope.models.custom import CustomModel
-from evalscope.utils import import_module_util,
-from evalscope.utils.
+from evalscope.utils import import_module_util, seed_everything
+from evalscope.utils.io_utils import OutputsStructure, are_paths_same
+from evalscope.utils.logger import configure_logging, get_logger

 logger = get_logger()

-"""
-Run evaluation for LLMs.
-"""
-
 BENCHMARK_PATH_PREFIX = 'evalscope.benchmarks.'
 MEMBERS_TO_IMPORT = ['DATASET_ID', 'SUBSET_LIST', 'DataAdapterClass', 'ModelAdapterClass']


-def
-
-
-    parser.add_argument('--model',
-                        help='The model id on modelscope, or local model dir.',
-                        type=str,
-                        # required=True,
-                        required=False,
-                        )
-    parser.add_argument('--model-type',
-                        help='Deprecated. See `--template-type`',
-                        type=str,
-                        required=False,
-                        default=None)
-    parser.add_argument('--template-type',
-                        type=str,
-                        help='The template type for generation, should be a string.'
-                        'Refer to `https://github.com/modelscope/swift/blob/main/docs/source/LLM/%E6%94%AF%E6%8C%81%E7%9A%84%E6%A8%A1%E5%9E%8B%E5%92%8C%E6%95%B0%E6%8D%AE%E9%9B%86.md` for more details.',
-                        required=False,
-                        )
-    parser.add_argument('--eval-type',
-                        type=str,
-                        help='The type for evaluating. '
-                        'service - for APIs, TO-DO'
-                        'checkpoint - for models on ModelScope or local model dir, '
-                        'custom - for custom models.'
-                        ' Need to set `--model` to evalscope.models.custom.CustomModel format.'
-                        'default to `checkpoint`.',
-                        required=False,
-                        default='checkpoint',
-                        )
-    parser.add_argument('--model-args',
-                        type=str,
-                        help='The model args, should be a string.',
-                        required=False,
-                        default='revision=None,precision=torch.float16,device_map=auto'
-                        )
-    parser.add_argument('--generation-config',
-                        type=str,
-                        help='The generation config, should be a string.',
-                        required=False,
-                        default='do_sample=False,repetition_penalty=1.0,max_new_tokens=512',
-                        )
-    parser.add_argument('--datasets',
-                        help='Dataset id list, align to the module name in evalscope.benchmarks',
-                        type=str,
-                        nargs='+',
-                        required=False,
-                        )
-    parser.add_argument('--dataset-args',
-                        type=json.loads,
-                        help='The dataset args, should be a json string. The key of dict should be aligned to datasets,'
-                        'e.g. {"humaneval": {"local_path": "/to/your/path"}}',
-                        required=False,
-                        default='{}')
-    parser.add_argument('--dataset-dir',
-                        help='The datasets dir. Use to specify the local datasets or datasets cache dir.'
-                        'See --dataset-hub for more details.',
-                        required=False,
-                        default=DEFAULT_ROOT_CACHE_DIR)
-    parser.add_argument('--dataset-hub',
-                        help='The datasets hub, can be `ModelScope` or `HuggingFace` or `Local`. '
-                        'Default to `ModelScope`.'
-                        'If `Local`, the --dataset-dir should be local input data dir.'
-                        'Otherwise, the --dataset-dir should be the cache dir for datasets.',
-                        required=False,
-                        default='ModelScope')
-    parser.add_argument('--outputs',
-                        help='Outputs dir. Default to `outputs`, which means dump to current path: ./outputs',
-                        required=False,
-                        default='outputs')
-    parser.add_argument('--work-dir',
-                        help='The root cache dir.',
-                        required=False,
-                        default=DEFAULT_ROOT_CACHE_DIR)
-    parser.add_argument('--limit',
-                        type=int,
-                        help='Max evaluation samples num for each subset. Default to None, which means no limit.',
-                        default=None)
-    parser.add_argument('--debug',
-                        help='Debug mode, will print information for debugging.',
-                        action='store_true',
-                        default=False)
-    parser.add_argument('--dry-run',
-                        help='Dry run in single processing mode.',
-                        action='store_true',
-                        default=False)
-    parser.add_argument('--mem-cache',
-                        help='To use memory cache or not.',
-                        action='store_true',
-                        default=False)
-    parser.add_argument('--use-cache',
-                        help='To reuse the cache or not. Default to `true`.',
-                        type=str,
-                        default='false')
-    parser.add_argument('--stage',
-                        help='The stage of evaluation pipeline, '
-                        'can be `all`, `infer`, `review`. Default to `all`.',
-                        type=str,
-                        default='all')
-
-    parser.add_argument('--eval-backend',
-                        help='The evaluation backend to use. Default to None.'
-                        'can be `Native`, `OpenCompass` and `ThirdParty`. '
-                        'Default to `Native`.',
-                        type=str,
-                        default=EvalBackend.NATIVE.value,
-                        required=False)
-
-    parser.add_argument('--eval-config',
-                        help='The eval task config file path for evaluation backend, should be a yaml or json file.',
-                        type=str,
-                        default=None,
-                        required=False)
-
-    args = parser.parse_args()
-
-    return args
-
-
-def parse_str_args(str_args: str) -> dict:
-    assert isinstance(str_args, str), 'args should be a string.'
-    arg_list: list = str_args.strip().split(',')
-    arg_list = [arg.strip() for arg in arg_list]
-    arg_dict: dict = dict([arg.split('=') for arg in arg_list])
-
-    final_args = dict()
-    for k, v in arg_dict.items():
-        try:
-            final_args[k] = eval(v)
-        except:
-            if v.lower() == 'true':
-                v = True
-            if v.lower() == 'false':
-                v = False
-            final_args[k] = v
-
-    return final_args
-
-
-def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig]]) -> Union[dict, List[dict]]:
+def run_task(task_cfg: Union[str, dict, TaskConfig, List[TaskConfig], Namespace]) -> Union[dict, List[dict]]:
+    """Run evaluation task(s) based on the provided configuration."""
+    run_time = datetime.now().strftime('%Y%m%d_%H%M%S')

+    # If task_cfg is a list, run each task individually
     if isinstance(task_cfg, list):
-
-
-
-
-
-    if isinstance(task_cfg, TaskConfig):
-        task_cfg = task_cfg.to_dict()
-    elif isinstance(task_cfg, str):
-        if task_cfg.endswith('.yaml'):
-            task_cfg = yaml_to_dict(task_cfg)
-        elif task_cfg.endswith('.json'):
-            task_cfg = json_to_dict(task_cfg)
-        else:
-            raise ValueError(f'Unsupported file format: {task_cfg}, should be a yaml or json file.')
-    elif isinstance(task_cfg, dict):
-        logger.info('** Args: Task config is provided with dictionary type. **')
-    else:
-        raise ValueError('** Args: Please provide a valid task config. **')
-
-    # Check and run evaluation backend
-    if task_cfg.get('eval_backend') is None:
-        task_cfg['eval_backend'] = EvalBackend.NATIVE.value
-
-    eval_backend = task_cfg.get('eval_backend')
-    eval_config: Union[str, dict] = task_cfg.get('eval_config')
-
-    if eval_backend != EvalBackend.NATIVE.value:
-
-        if eval_config is None:
-            logger.warning(f'Got eval_backend {eval_backend}, but eval_config is not provided.')
-
-        if eval_backend == EvalBackend.OPEN_COMPASS.value:
-            from evalscope.backend.opencompass import OpenCompassBackendManager
-            oc_backend_manager = OpenCompassBackendManager(config=eval_config)
-            oc_backend_manager.run()
-        elif eval_backend == EvalBackend.VLM_EVAL_KIT.value:
-            from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
-            vlm_eval_kit_backend_manager = VLMEvalKitBackendManager(config=eval_config)
-            vlm_eval_kit_backend_manager.run()
-        elif eval_backend == EvalBackend.RAG_EVAL.value:
-            from evalscope.backend.rag_eval import RAGEvalBackendManager
-            rag_eval_backend_manager = RAGEvalBackendManager(config=eval_config)
-            rag_eval_backend_manager.run()
-        # TODO: Add other evaluation backends
-        elif eval_backend == EvalBackend.THIRD_PARTY.value:
-            raise NotImplementedError(f'Not implemented for evaluation backend {eval_backend}')
-
-        return dict()
-
-    # Get the output task config
-    output_task_cfg = copy.copy(task_cfg)
-    logger.info(output_task_cfg)
-
-    model_args: dict = task_cfg.get('model_args',
-                                    {'revision': 'default', 'precision': torch.float16, 'device_map': 'auto'})
-    # Get the GLOBAL default config (infer_cfg) for prediction
-    generation_config: dict = task_cfg.get('generation_config',
-                                           {'do_sample': False,
-                                           'repetition_penalty': 1.0,
-                                           'max_length': 2048,
-                                           'max_new_tokens': 512,
-                                           'temperature': 0.3,
-                                           'top_k': 50,
-                                           'top_p': 0.8, }
-                                           )
-    dataset_args: dict = task_cfg.get('dataset_args', {})
-    dry_run: bool = task_cfg.get('dry_run', False)
-    model: Union[str, CustomModel] = task_cfg.get('model', None)
-    model_type: str = task_cfg.get('model_type', None)
-    template_type: str = task_cfg.get('template_type', None)
-    eval_type: str = task_cfg.get('eval_type', 'checkpoint')
-    datasets: list = task_cfg.get('datasets', None)
-    work_dir: str = task_cfg.get('work_dir', DEFAULT_ROOT_CACHE_DIR)
-    outputs: str = task_cfg.get('outputs', 'outputs')
-    mem_cache: bool = task_cfg.get('mem_cache', False)
-    use_cache: bool = task_cfg.get('use_cache', True)
-    dataset_hub: str = task_cfg.get('dataset_hub', 'ModelScope')
-    dataset_dir: str = task_cfg.get('dataset_dir', DEFAULT_ROOT_CACHE_DIR)
-    stage: str = task_cfg.get('stage', 'all')
-    limit: int = task_cfg.get('limit', None)
-    debug: str = task_cfg.get('debug', False)
-
-    if model is None or datasets is None:
-        if not task_cfg.get('eval_backend'):
-            raise ValueError('** Args: Please provide model and datasets. **')
-
-    if model_type:
-        logger.warning('** DeprecatedWarning: `--model-type` is deprecated, please use `--template-type` instead.')
-
-    model_precision = model_args.get('precision', torch.float16)
-    if isinstance(model_precision, str):
-        model_precision = eval(model_precision)
-
-    if mem_cache:
-        logger.warning('** DeprecatedWarning: `--mem-cache` is deprecated, please use `--use-cache` instead.')
-
-    logger.info(f'** Set use_cache to {use_cache}.')
-
-    # Get model args
-    if dry_run:
-        from evalscope.models.dummy_chat_model import DummyChatModel
-        model_id: str = 'dummy'
-        model_revision: str = 'v1.0.0'
-    elif eval_type == 'custom':
-        model_id: str = None
-        model_revision: str = None
-    else:
-        model_id: str = model
-        model_revision: str = model_args.get('revision', 'default')
+        return [run_single_task(cfg, run_time) for cfg in task_cfg]
+
+    task_cfg = parse_task_config(task_cfg)
+    return run_single_task(task_cfg, run_time)
+

-
-
-
-
+def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
+    """Run a single evaluation task."""
+    seed_everything(task_cfg.seed)
+    outputs = setup_work_directory(task_cfg, run_time)
+    configure_logging(task_cfg.debug, os.path.join(outputs.logs_dir, 'eval_log.log'))
+
+    task_cfg.dump_yaml(outputs.configs_dir)
+    logger.info(task_cfg)
+
+    if task_cfg.eval_backend != EvalBackend.NATIVE:
+        return run_non_native_backend(task_cfg)
     else:
-
-
-
-
-
-
-
-
-
-
-        imported_modules = import_module_util(BENCHMARK_PATH_PREFIX, dataset_name, MEMBERS_TO_IMPORT)
-
-        if dataset_name == 'humaneval' and dataset_args.get('humaneval', {}).get('local_path') is None:
-            raise ValueError('Please specify the local problem path of humaneval dataset in --dataset-args,'
-                             'e.g. {"humaneval": {"local_path": "/to/your/path"}}, '
-                             'And refer to https://github.com/openai/human-eval/tree/master#installation to install it,'
-                             'Note that you need to enable the execution code in the human_eval/execution.py first.')
-
-        if dry_run:
-            from evalscope.models.dummy_chat_model import DummyChatModel
-            model_adapter = DummyChatModel(model_cfg=dict())
-        elif eval_type == 'custom':
-            if not isinstance(model, CustomModel):
-                raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(model)}.')
-            from evalscope.models.model_adapter import CustomModelAdapter
-            model_adapter = CustomModelAdapter(custom_model=model)
-        else:
-            # Init model adapter
-            device_map = model_args.get('device_map', 'auto') if torch.cuda.is_available() else None
-            model_adapter = imported_modules['ModelAdapterClass'](model_id=model_id,
-                                                                  model_revision=model_revision,
-                                                                  device_map=device_map,
-                                                                  torch_dtype=model_precision,
-                                                                  cache_dir=work_dir,
-                                                                  template_type=template_type)
-
-        if dataset_name == 'humaneval':
-            problem_file: str = dataset_args.get('humaneval', {}).get('local_path')
-
-            evaluator = HumanevalEvaluator(problem_file=problem_file,
-                                           model_id=model_id,
-                                           model_revision=model_revision,
-                                           model_adapter=model_adapter,
-                                           outputs_dir=outputs,
-                                           is_custom_outputs_dir=False, )
-        else:
-            # TODO: CHECK dataset_args
-            dataset_name_or_path: str = dataset_args.get(dataset_name, {}).get('local_path') or imported_modules[
-                'DATASET_ID']
-
-            in_prompt_template: str = dataset_args.get(dataset_name, {}).get('prompt_template', '')
-
-            # Init data adapter
-            few_shot_num: int = dataset_args.get(dataset_name, {}).get('few_shot_num', None)
-            few_shot_random: bool = dataset_args.get(dataset_name, {}).get('few_shot_random', True)
-            data_adapter = imported_modules['DataAdapterClass'](few_shot_num=few_shot_num,
-                                                                few_shot_random=few_shot_random,
-                                                                prompt_template=in_prompt_template,)
-
-            in_subset_list: list = dataset_args.get(dataset_name, {})\
-                .get('subset_list', imported_modules['SUBSET_LIST'])
-            logger.info(f'\n** Evaluating on subsets for {dataset_name}: {in_subset_list}\n')
-
-            evaluator = Evaluator(
-                dataset_name_or_path=dataset_name_or_path,
-                subset_list=in_subset_list,
-                data_adapter=data_adapter,
-                model_adapter=model_adapter,
-                use_cache=use_cache,
-                root_cache_dir=work_dir,
-                outputs_dir=outputs,
-                is_custom_outputs_dir=outputs != 'outputs',
-                datasets_dir=dataset_dir,
-                datasets_hub=dataset_hub,
-                stage=stage,
-                eval_type=eval_type,
-                overall_task_cfg=output_task_cfg,
-            )
-
-        infer_cfg = generation_config or {}
-        infer_cfg.update(dict(limit=limit))
-        res_dict: dict = evaluator.eval(infer_cfg=infer_cfg, debug=debug)
+        return evaluate_model(task_cfg, outputs)
+
+
+def setup_work_directory(task_cfg: TaskConfig, run_time: str):
+    """Set the working directory for the task."""
+    if task_cfg.use_cache:
+        task_cfg.work_dir = task_cfg.use_cache
+        logger.info(f'Set resume from {task_cfg.work_dir}')
+    elif are_paths_same(task_cfg.work_dir, DEFAULT_WORK_DIR):
+        task_cfg.work_dir = os.path.join(task_cfg.work_dir, run_time)

+    outputs = OutputsStructure(outputs_dir=task_cfg.work_dir)
+
+    if task_cfg.eval_backend == EvalBackend.OPEN_COMPASS:
+        task_cfg.eval_config['time_str'] = run_time
+    elif task_cfg.eval_backend == EvalBackend.VLM_EVAL_KIT:
+        task_cfg.eval_config['work_dir'] = task_cfg.work_dir
+    return outputs
+
+
+def run_non_native_backend(task_cfg: TaskConfig) -> dict:
+    """Run evaluation using a non-native backend."""
+    eval_backend = task_cfg.eval_backend
+    eval_config = task_cfg.eval_config
+
+    if eval_config is None:
+        logger.warning(f'Got eval_backend {eval_backend}, but eval_config is not provided.')
+
+    backend_manager_class = get_backend_manager_class(eval_backend)
+    backend_manager = backend_manager_class(config=eval_config)
+    backend_manager.run()
+
+    return dict()
+
+
+def get_backend_manager_class(eval_backend: EvalBackend):
+    """Get the backend manager class based on the evaluation backend."""
+    if eval_backend == EvalBackend.OPEN_COMPASS:
+        from evalscope.backend.opencompass import OpenCompassBackendManager
+        return OpenCompassBackendManager
+    elif eval_backend == EvalBackend.VLM_EVAL_KIT:
+        from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager
+        return VLMEvalKitBackendManager
+    elif eval_backend == EvalBackend.RAG_EVAL:
+        from evalscope.backend.rag_eval import RAGEvalBackendManager
+        return RAGEvalBackendManager
+    elif eval_backend == EvalBackend.THIRD_PARTY:
+        raise NotImplementedError(f'Not implemented for evaluation backend {eval_backend}')
+
+
+def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
+    """Evaluate the model based on the provided task configuration."""
+    # Initialize evaluator
+    eval_results = {}
+
+    for dataset_name in task_cfg.datasets:
+        evaluator = create_evaluator(task_cfg, dataset_name, outputs)
+        res_dict = evaluator.eval(infer_cfg=task_cfg.generation_config, debug=task_cfg.debug, limit=task_cfg.limit)
         eval_results[dataset_name] = res_dict

     return eval_results


+def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure):
+    """Create an evaluator object for the specified dataset."""
+    imported_modules = import_module_util(BENCHMARK_PATH_PREFIX, dataset_name, MEMBERS_TO_IMPORT)
+    model_adapter = initialize_model_adapter(task_cfg, dataset_name, imported_modules)
+
+    dataset_config = task_cfg.dataset_args.get(dataset_name, {})
+    dataset_name_or_path = dataset_config.get('local_path') or imported_modules['DATASET_ID']
+    in_prompt_template = dataset_config.get('prompt_template', '')
+    few_shot_num = dataset_config.get('few_shot_num', None)
+    few_shot_random = dataset_config.get('few_shot_random', True)
+
+    data_adapter = imported_modules['DataAdapterClass'](
+        few_shot_num=few_shot_num,
+        few_shot_random=few_shot_random,
+        prompt_template=in_prompt_template,
+        outputs=outputs,
+    )
+    in_subset_list = dataset_config.get('subset_list', imported_modules['SUBSET_LIST'])
+
+    logger.info(f'Evaluating on subsets for {dataset_name}: {in_subset_list}\n')
+
+    return Evaluator(
+        dataset_name_or_path=dataset_name_or_path,
+        subset_list=in_subset_list,
+        data_adapter=data_adapter,
+        model_adapter=model_adapter,
+        use_cache=task_cfg.use_cache,
+        outputs=outputs,
+        datasets_dir=task_cfg.dataset_dir,
+        datasets_hub=task_cfg.dataset_hub,
+        stage=task_cfg.stage,
+        eval_type=task_cfg.eval_type,
+        overall_task_cfg=task_cfg,
+    )
+
+
+def initialize_model_adapter(task_cfg: TaskConfig, dataset_name: str, imported_modules):
+    """Initialize the model adapter based on the task configuration."""
+    if task_cfg.dry_run:
+        from evalscope.models.dummy_chat_model import DummyChatModel
+        return DummyChatModel(model_cfg=dict())
+    elif task_cfg.eval_type == EvalType.CUSTOM:
+        if not isinstance(task_cfg.model, CustomModel):
+            raise ValueError(f'Expected evalscope.models.custom.CustomModel, but got {type(task_cfg.model)}.')
+        from evalscope.models.model_adapter import CustomModelAdapter
+        return CustomModelAdapter(custom_model=task_cfg.model)
+    else:
+        device_map = task_cfg.model_args.get('device_map', 'auto') if torch.cuda.is_available() else None
+        model_precision = task_cfg.model_args.get('precision', torch.float16)
+        if isinstance(model_precision, str) and model_precision != 'auto':
+            model_precision = eval(model_precision)
+        return imported_modules['ModelAdapterClass'](
+            model_id=task_cfg.model,
+            model_revision=task_cfg.model_args.get('revision', DEFAULT_MODEL_REVISION),
+            device_map=device_map,
+            torch_dtype=model_precision,
+            generation_config=task_cfg.generation_config,
+            chat_template=task_cfg.chat_template)
+
+
 def main():
     args = parse_args()
-
-    # Get task_cfg
-    use_cache: bool = False if args.use_cache.lower() == 'false' else True
-    task_cfg = {
-        'model_args': parse_str_args(args.model_args),
-        'generation_config': parse_str_args(args.generation_config),
-        'dataset_args': args.dataset_args,
-        'dry_run': args.dry_run,
-        'model': args.model,
-        'template_type': args.template_type,
-        'eval_type': args.eval_type,
-        'datasets': args.datasets,
-        'work_dir': args.work_dir,
-        'outputs': args.outputs,
-        'mem_cache': args.mem_cache,
-        'use_cache': use_cache,
-        'dataset_hub': args.dataset_hub,
-        'dataset_dir': args.dataset_dir,
-        'stage': args.stage,
-        'limit': args.limit,
-        'debug': args.debug,
-
-        'eval_backend': args.eval_backend,
-        'eval_config': args.eval_config,
-    }
-
-    run_task(task_cfg)
+    run_task(args)


 if __name__ == '__main__':
-    # Usage: python3 evalscope/run.py --model ZhipuAI/chatglm2-6b --datasets mmlu hellaswag --limit 10
-    # Usage: python3 evalscope/run.py --model qwen/Qwen-1_8B --generation-config do_sample=false,temperature=0.0 --datasets ceval --dataset-args '{"ceval": {"few_shot_num": 0}}' --limit 10
     main()