evalscope 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/__init__.py +1 -1
- evalscope/arguments.py +73 -0
- evalscope/backend/base.py +5 -1
- evalscope/backend/opencompass/api_meta_template.py +8 -14
- evalscope/backend/opencompass/backend_manager.py +24 -15
- evalscope/backend/opencompass/tasks/eval_api.py +1 -6
- evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
- evalscope/backend/rag_eval/__init__.py +3 -3
- evalscope/backend/rag_eval/backend_manager.py +21 -25
- evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
- evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
- evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
- evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
- evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
- evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
- evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
- evalscope/backend/rag_eval/cmteb/base.py +22 -23
- evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
- evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
- evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
- evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
- evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
- evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
- evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
- evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
- evalscope/backend/rag_eval/ragas/__init__.py +2 -2
- evalscope/backend/rag_eval/ragas/arguments.py +3 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
- evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
- evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
- evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
- evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
- evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
- evalscope/backend/rag_eval/ragas/task_template.py +10 -15
- evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
- evalscope/backend/rag_eval/utils/clip.py +46 -50
- evalscope/backend/rag_eval/utils/embedding.py +12 -11
- evalscope/backend/rag_eval/utils/llm.py +8 -6
- evalscope/backend/rag_eval/utils/tools.py +12 -11
- evalscope/backend/vlm_eval_kit/__init__.py +1 -1
- evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
- evalscope/benchmarks/arc/__init__.py +3 -2
- evalscope/benchmarks/arc/ai2_arc.py +19 -16
- evalscope/benchmarks/arc/arc_adapter.py +32 -24
- evalscope/benchmarks/bbh/__init__.py +1 -2
- evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
- evalscope/benchmarks/benchmark.py +16 -16
- evalscope/benchmarks/ceval/__init__.py +3 -2
- evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
- evalscope/benchmarks/ceval/ceval_exam.py +18 -31
- evalscope/benchmarks/cmmlu/__init__.py +3 -2
- evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
- evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
- evalscope/benchmarks/competition_math/__init__.py +3 -2
- evalscope/benchmarks/competition_math/competition_math.py +7 -16
- evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
- evalscope/benchmarks/data_adapter.py +24 -24
- evalscope/benchmarks/general_qa/__init__.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
- evalscope/benchmarks/gsm8k/__init__.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
- evalscope/benchmarks/hellaswag/__init__.py +3 -2
- evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
- evalscope/benchmarks/humaneval/__init__.py +1 -1
- evalscope/benchmarks/humaneval/humaneval.py +15 -18
- evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
- evalscope/benchmarks/mmlu/__init__.py +3 -2
- evalscope/benchmarks/mmlu/mmlu.py +15 -29
- evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
- evalscope/benchmarks/race/__init__.py +3 -2
- evalscope/benchmarks/race/race.py +21 -35
- evalscope/benchmarks/race/race_adapter.py +32 -29
- evalscope/benchmarks/race/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/__init__.py +3 -2
- evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
- evalscope/benchmarks/truthful_qa/__init__.py +3 -2
- evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
- evalscope/cli/cli.py +6 -5
- evalscope/cli/start_eval.py +31 -0
- evalscope/cli/start_perf.py +0 -3
- evalscope/cli/start_server.py +27 -41
- evalscope/config.py +119 -95
- evalscope/constants.py +61 -29
- evalscope/evaluator/__init__.py +1 -0
- evalscope/evaluator/evaluator.py +96 -377
- evalscope/evaluator/humaneval_evaluator.py +158 -0
- evalscope/evaluator/rating_eval.py +12 -33
- evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
- evalscope/metrics/code_metric.py +3 -9
- evalscope/metrics/math_accuracy.py +3 -6
- evalscope/metrics/metrics.py +21 -21
- evalscope/metrics/rouge_metric.py +11 -25
- evalscope/models/__init__.py +1 -2
- evalscope/models/api/openai_api.py +40 -29
- evalscope/models/custom/__init__.py +0 -1
- evalscope/models/custom/custom_model.py +3 -3
- evalscope/models/dummy_chat_model.py +7 -8
- evalscope/models/model_adapter.py +89 -156
- evalscope/models/openai_model.py +20 -20
- evalscope/perf/arguments.py +15 -3
- evalscope/perf/benchmark.py +7 -9
- evalscope/perf/http_client.py +3 -8
- evalscope/perf/main.py +10 -0
- evalscope/perf/plugin/api/custom_api.py +1 -2
- evalscope/perf/plugin/api/dashscope_api.py +1 -2
- evalscope/perf/plugin/api/openai_api.py +3 -4
- evalscope/perf/plugin/datasets/base.py +1 -2
- evalscope/perf/plugin/datasets/flickr8k.py +1 -2
- evalscope/perf/plugin/datasets/longalpaca.py +1 -2
- evalscope/perf/plugin/datasets/openqa.py +1 -2
- evalscope/perf/utils/analysis_result.py +1 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/perf/utils/db_util.py +11 -8
- evalscope/perf/utils/local_server.py +19 -13
- evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
- evalscope/registry/tasks/arc.yaml +2 -3
- evalscope/registry/tasks/bbh.yaml +3 -4
- evalscope/registry/tasks/bbh_mini.yaml +3 -4
- evalscope/registry/tasks/ceval.yaml +3 -3
- evalscope/registry/tasks/ceval_mini.yaml +3 -4
- evalscope/registry/tasks/cmmlu.yaml +3 -3
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
- evalscope/registry/tasks/general_qa.yaml +1 -1
- evalscope/registry/tasks/gsm8k.yaml +2 -2
- evalscope/registry/tasks/mmlu.yaml +3 -3
- evalscope/registry/tasks/mmlu_mini.yaml +3 -3
- evalscope/run.py +184 -375
- evalscope/run_arena.py +20 -25
- evalscope/summarizer.py +16 -17
- evalscope/third_party/longbench_write/README.md +99 -42
- evalscope/third_party/longbench_write/default_task.json +1 -1
- evalscope/third_party/longbench_write/default_task.yaml +8 -7
- evalscope/third_party/longbench_write/eval.py +29 -28
- evalscope/third_party/longbench_write/infer.py +16 -104
- evalscope/third_party/longbench_write/longbench_write.py +5 -5
- evalscope/third_party/longbench_write/resources/judge.txt +1 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
- evalscope/third_party/longbench_write/utils.py +0 -1
- evalscope/third_party/toolbench_static/eval.py +14 -15
- evalscope/third_party/toolbench_static/infer.py +48 -69
- evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
- evalscope/third_party/toolbench_static/requirements.txt +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
- evalscope/tools/combine_reports.py +25 -30
- evalscope/tools/rewrite_eval_results.py +14 -46
- evalscope/utils/__init__.py +0 -1
- evalscope/utils/arena_utils.py +18 -48
- evalscope/{perf/utils → utils}/chat_service.py +3 -4
- evalscope/utils/completion_parsers.py +3 -8
- evalscope/utils/logger.py +9 -7
- evalscope/utils/model_utils.py +11 -0
- evalscope/utils/utils.py +12 -138
- evalscope/version.py +2 -2
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/METADATA +125 -120
- evalscope-0.8.0.dist-info/RECORD +285 -0
- tests/cli/test_run.py +54 -15
- tests/perf/test_perf.py +4 -0
- tests/rag/test_clip_benchmark.py +38 -38
- tests/rag/test_mteb.py +3 -2
- tests/rag/test_ragas.py +5 -5
- tests/swift/test_run_swift_eval.py +2 -3
- tests/swift/test_run_swift_vlm_eval.py +2 -3
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
- evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
- evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
- evalscope/cache.py +0 -98
- evalscope/models/template.py +0 -1446
- evalscope/run_ms.py +0 -140
- evalscope/utils/task_cfg_parser.py +0 -10
- evalscope/utils/task_utils.py +0 -22
- evalscope-0.7.1.dist-info/RECORD +0 -286
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
evalscope/config.py
CHANGED
@@ -1,69 +1,127 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.

-import os
 import copy
-
-
+import json
+import os
+from argparse import Namespace
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional, Union

-from evalscope.constants import
+from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, DEFAULT_WORK_DIR, EvalBackend, EvalStage, EvalType, HubType
 from evalscope.models.custom import CustomModel
-from evalscope.utils import yaml_to_dict
+from evalscope.utils import dict_to_yaml, gen_hash, json_to_dict, yaml_to_dict
 from evalscope.utils.logger import get_logger

 logger = get_logger()

 cur_path = os.path.dirname(os.path.abspath(__file__))

-
-
-    '
-    '
-    '
-    '
-    '
-    '
-
-    # 'bbh_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/bbh_mini.yaml')),
-    # 'mmlu_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/mmlu_mini.yaml')),
-    # 'ceval_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/ceval_mini.yaml')),
-
+DEFAULT_MODEL_ARGS = {'revision': 'master', 'precision': 'torch.float16', 'device': 'auto'}
+DEFAULT_GENERATION_CONFIG = {
+    'max_length': 2048,
+    'max_new_tokens': 512,
+    'do_sample': False,
+    'top_k': 50,
+    'top_p': 1.0,
+    'temperature': 1.0,
 }


 @dataclass
 class TaskConfig:
-
-
-
-
+    # Model-related arguments
+    model: Union[str, CustomModel, None] = None
+    model_args: Optional[Dict] = field(default_factory=lambda: DEFAULT_MODEL_ARGS | {})
+
+    # Template-related arguments
+    template_type: Optional[str] = None  # Deprecated, will be removed in v1.0.0.
+    chat_template: Optional[str] = None
+
+    # Dataset-related arguments
+    datasets: Optional[List[str]] = None
+    dataset_args: Optional[Dict] = field(default_factory=dict)
+    dataset_dir: str = DEFAULT_DATASET_CACHE_DIR
+    dataset_hub: str = HubType.MODELSCOPE
+
+    # Generation configuration arguments
+    generation_config: Optional[Dict] = field(default_factory=lambda: DEFAULT_GENERATION_CONFIG | {})
+
+    # Evaluation-related arguments
+    eval_type: str = EvalType.CHECKPOINT
+    eval_backend: str = EvalBackend.NATIVE
+    eval_config: Union[str, Dict, None] = None
+    stage: str = EvalStage.ALL
+    limit: Optional[int] = None
+
+    # Cache and working directory arguments
+    mem_cache: bool = False  # Deprecated, will be removed in v1.0.0.
+    use_cache: Optional[str] = None
+    work_dir: str = DEFAULT_WORK_DIR
+    outputs: Optional[str] = None  # Deprecated, will be removed in v1.0.0.
+
+    # Debug and runtime mode arguments
+    debug: bool = False
     dry_run: bool = False
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    seed: int = 42
+
+    def to_dict(self):
+        # Note: to avoid serialization error for some model instance
+        return self.__dict__
+
+    def __str__(self):
+        return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
+
+    def update(self, other: Union['TaskConfig', dict]):
+        if isinstance(other, TaskConfig):
+            other = other.to_dict()
+        self.__dict__.update(other)
+
+    def dump_yaml(self, output_dir: str):
+        """Dump the task configuration to a YAML file."""
+        task_cfg_file = os.path.join(output_dir, f'task_config_{gen_hash(str(self), bits=6)}.yaml')
+        try:
+            logger.info(f'Dump task config to {task_cfg_file}')
+            dict_to_yaml(self.to_dict(), task_cfg_file)
+        except Exception as e:
+            logger.warning(f'Failed to dump overall task config: {e}')
+
+    @staticmethod
+    def list():
+        return list(registry_tasks.keys())
+
+    @staticmethod
+    def from_yaml(yaml_file: str):
+        return TaskConfig.from_dict(yaml_to_dict(yaml_file))
+
+    @staticmethod
+    def from_dict(d: dict):
+        return TaskConfig(**d)
+
+    @staticmethod
+    def from_json(json_file: str):
+        return TaskConfig.from_dict(json_to_dict(json_file))
+
+    @staticmethod
+    def from_args(args: Namespace):
+        # Convert Namespace to a dictionary and filter out None values
+        args_dict = {k: v for k, v in vars(args).items() if v is not None}
+        del args_dict['func']  # Note: compat CLI arguments
+
+        return TaskConfig.from_dict(args_dict)
+
+    @staticmethod
+    def load(custom_model: CustomModel, tasks: List[str]) -> List['TaskConfig']:
+        res_list = []
+        for task_name in tasks:
+            task = registry_tasks.get(task_name, None)
+            if task is None:
+                logger.error(f'No task found in tasks: {list(registry_tasks.keys())}, got task_name: {task_name}')
+                continue
+
+            task.model = custom_model
+            res_list.append(task)
+
+        return res_list

     @staticmethod
     def registry(name: str, data_pattern: str, dataset_dir: str = None, subset_list: list = None) -> None:
@@ -75,7 +133,7 @@ class TaskConfig:
             data_pattern: str, the data pattern for the task.
                 e.g. `mmlu`, `ceval`, `gsm8k`, ...
                 refer to task_config.list() for all available datasets.
-            dataset_dir: str, the directory to store multiple datasets files. e.g. /path/to/data,
+            dataset_dir: str, the directory to store multiple datasets files. e.g. /path/to/data,
                 then your specific custom dataset directory will be /path/to/data/{name}
             subset_list: list, the subset list for the dataset.
                 e.g. ['middle_school_politics', 'operating_system']
@@ -83,63 +141,31 @@ class TaskConfig:
         """
         available_datasets = list(registry_tasks.keys())
         if data_pattern not in available_datasets:
-            logger.error(
+            logger.error(
+                f'No dataset found in available datasets: {available_datasets}, got data_pattern: {data_pattern}')
             return

         # Reuse the existing task config and update the datasets
-        pattern_config = registry_tasks
+        pattern_config = registry_tasks[data_pattern]

         custom_config = copy.deepcopy(pattern_config)
-        custom_config.
-        custom_config.
-
-        if data_pattern not in custom_config:
-            custom_config['dataset_args'].update({data_pattern: {}})
-        else:
-            custom_config.update({'dataset_args': {data_pattern: {}}})
+        custom_config.datasets = [data_pattern]
+        custom_config.dataset_args = {data_pattern: {}}
+        custom_config.eval_type = EvalType.CHECKPOINT

         if dataset_dir is not None:
-            custom_config
+            custom_config.dataset_args[data_pattern].update({'local_path': dataset_dir})

         if subset_list is not None:
-
-            custom_config['dataset_args'][data_pattern].update({'subset_list': subset_list})
+            custom_config.dataset_args[data_pattern].update({'subset_list': subset_list})

         registry_tasks.update({name: custom_config})
         logger.info(f'** Registered task: {name} with data pattern: {data_pattern}')

-    def to_dict(self):
-        # Note: to avoid serialization error for some model instance
-        _tmp_model = copy.copy(self.model)
-        self.model = None
-        res_dict = asdict(self)
-        res_dict.update({'model': _tmp_model})
-        self.model = _tmp_model

-
+tasks = ['arc', 'gsm8k', 'mmlu', 'cmmlu', 'ceval', 'bbh', 'general_qa']

-
-    def load(custom_model: CustomModel, tasks: List[str]) -> List['TaskConfig']:
-        res_list = []
-        for task_name in tasks:
-            task: dict = registry_tasks.get(task_name, None)
-            if task is None:
-                logger.error(f'No task found in tasks: {list(registry_tasks.keys())}, got task_name: {task_name}')
-                continue
-
-            res = TaskConfig(**task)
-            res.model = custom_model
-            if res.outputs is None:
-                res.outputs = os.path.join(res.work_dir,
-                                           'outputs',
-                                           f"eval_{'-'.join(tasks)}_{res.model.config['model_id']}_{res.model_args.get('revision', 'default')}")
-            res_list.append(res)
-
-        return res_list
-
-    @staticmethod
-    def list():
-        return list(registry_tasks.keys())
+registry_tasks = {task: TaskConfig.from_yaml(os.path.join(cur_path, f'registry/tasks/{task}.yaml')) for task in tasks}


 class TempModel(CustomModel):
@@ -158,9 +184,7 @@ if __name__ == '__main__':
     # Register a new task
     TaskConfig.registry(name='arc_swift', data_pattern='arc', dataset_dir='/path/to/swift_custom_work')

-    import json
     swift_eval_task: List[TaskConfig] = TaskConfig.load(custom_model=model, tasks=['gsm8k', 'arc', 'arc_swift'])
     for item in swift_eval_task:
-        print(item
+        print(item)
         print()
-
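For orientation, the following is a minimal sketch of how the reworked TaskConfig API above could be driven. It is inferred only from the constructors and fields visible in these hunks; the model id and paths are placeholders, not values shipped in 0.8.0.

```python
# Sketch of the 0.8.0 TaskConfig surface shown in the diff above (assumptions noted inline).
from evalscope.config import TaskConfig

# Construct directly; unset fields fall back to DEFAULT_MODEL_ARGS /
# DEFAULT_GENERATION_CONFIG via the new dataclass defaults.
task = TaskConfig(
    model='Qwen/Qwen2-7B-Instruct',  # placeholder model id
    datasets=['gsm8k', 'arc'],
    limit=10,
)

# Alternative constructors added in this version (paths are illustrative):
# task = TaskConfig.from_yaml('my_task.yaml')
# task = TaskConfig.from_json('my_task.json')

print(task)                  # __str__ now renders the config as indented JSON
task.dump_yaml('./outputs')  # writes task_config_<hash>.yaml into ./outputs
```

The new `from_args(Namespace)` constructor strips `None` values and the `func` entry, which suggests it backs the CLI entry points added in this release (`evalscope/arguments.py`, `evalscope/cli/start_eval.py`).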
evalscope/constants.py
CHANGED
@@ -1,7 +1,18 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-
+import os
+from modelscope.utils.constant import DEFAULT_REPOSITORY_REVISION
+from modelscope.utils.file_utils import get_dataset_cache_root, get_model_cache_root

-
+DEFAULT_WORK_DIR = './outputs'
+DEFAULT_MODEL_REVISION = DEFAULT_REPOSITORY_REVISION  # master
+DEFAULT_MODEL_CACHE_DIR = get_model_cache_root()  # ~/.cache/modelscope/hub
+DEFAULT_DATASET_CACHE_DIR = get_dataset_cache_root()  # ~/.cache/modelscope/datasets
+
+
+class HubType:
+    MODELSCOPE = 'modelscope'
+    HUGGINGFACE = 'huggingface'
+    LOCAL = 'local'


 class DumpMode:
@@ -25,7 +36,7 @@ class MetricsConstant:
     ]


-class MetricMembers
+class MetricMembers:

     # Math accuracy metric
     MATH_ACCURACY = 'math_accuracy'
@@ -66,53 +77,51 @@ class ArenaMode:


 class OutputsStructure:
-
-
-
-
-
-
-
-
-
-
+    LOGS_DIR = 'logs'
+    PREDICTIONS_DIR = 'predictions'
+    REVIEWS_DIR = 'reviews'
+    REPORTS_DIR = 'reports'
+    CONFIGS_DIR = 'configs'
+
+    def __init__(self, outputs_dir: str, is_make: bool = True):
+        self.outputs_dir = outputs_dir
+        self.logs_dir = os.path.join(outputs_dir, OutputsStructure.LOGS_DIR)
+        self.predictions_dir = os.path.join(outputs_dir, OutputsStructure.PREDICTIONS_DIR)
+        self.reviews_dir = os.path.join(outputs_dir, OutputsStructure.REVIEWS_DIR)
+        self.reports_dir = os.path.join(outputs_dir, OutputsStructure.REPORTS_DIR)
+        self.configs_dir = os.path.join(outputs_dir, OutputsStructure.CONFIGS_DIR)
+
+        if is_make:
+            self.create_directories()
+
+    def create_directories(self):
+        os.makedirs(self.outputs_dir, exist_ok=True)
+        os.makedirs(self.logs_dir, exist_ok=True)
+        os.makedirs(self.predictions_dir, exist_ok=True)
+        os.makedirs(self.reviews_dir, exist_ok=True)
+        os.makedirs(self.reports_dir, exist_ok=True)
+        os.makedirs(self.configs_dir, exist_ok=True)


 class AnswerKeys:
-
     ANSWER_ID = 'answer_id'
-
     RAW_INPUT = 'raw_input'
-
     ORIGIN_PROMPT = 'origin_prompt'
-
     MODEL_SPEC = 'model_spec'
-
     SUBSET_NAME = 'subset_name'
-
     CHOICES = 'choices'


 class ReviewKeys:
-
     REVIEW_ID = 'review_id'
-
     REVIEWED = 'reviewed'
-
     REVIEWER_SPEC = 'reviewer_spec'
-
     REVIEW_TIME = 'review_time'
-
     MESSAGE = 'message'
-
     CONTENT = 'content'
-
     GOLD = 'gold'
-
     PRED = 'pred'
-
     RESULT = 'result'
-
     REVIEW = 'review'


@@ -148,3 +157,26 @@ class EvalStage:
     ALL = 'all'
     INFER = 'infer'
     REVIEW = 'review'
+
+
+class EvalType:
+
+    CUSTOM = 'custom'
+    CHECKPOINT = 'checkpoint'
+
+
+class EvalBackend:
+    # Use native evaluation pipeline of EvalScope
+    NATIVE = 'Native'
+
+    # Use OpenCompass framework as the evaluation backend
+    OPEN_COMPASS = 'OpenCompass'
+
+    # Use VLM Eval Kit as the multi-modal model evaluation backend
+    VLM_EVAL_KIT = 'VLMEvalKit'
+
+    # Use RAGEval as the RAG evaluation backend
+    RAG_EVAL = 'RAGEval'
+
+    # Use third-party evaluation backend/modules
+    THIRD_PARTY = 'ThirdParty'
evalscope/evaluator/__init__.py
CHANGED