evalscope 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (233)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +5 -1
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +46 -50
  60. evalscope/backend/rag_eval/utils/embedding.py +12 -11
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +32 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +119 -95
  139. evalscope/constants.py +61 -29
  140. evalscope/evaluator/__init__.py +1 -0
  141. evalscope/evaluator/evaluator.py +96 -377
  142. evalscope/evaluator/humaneval_evaluator.py +158 -0
  143. evalscope/evaluator/rating_eval.py +12 -33
  144. evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
  145. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  146. evalscope/metrics/code_metric.py +3 -9
  147. evalscope/metrics/math_accuracy.py +3 -6
  148. evalscope/metrics/metrics.py +21 -21
  149. evalscope/metrics/rouge_metric.py +11 -25
  150. evalscope/models/__init__.py +1 -2
  151. evalscope/models/api/openai_api.py +40 -29
  152. evalscope/models/custom/__init__.py +0 -1
  153. evalscope/models/custom/custom_model.py +3 -3
  154. evalscope/models/dummy_chat_model.py +7 -8
  155. evalscope/models/model_adapter.py +89 -156
  156. evalscope/models/openai_model.py +20 -20
  157. evalscope/perf/arguments.py +15 -3
  158. evalscope/perf/benchmark.py +7 -9
  159. evalscope/perf/http_client.py +3 -8
  160. evalscope/perf/main.py +10 -0
  161. evalscope/perf/plugin/api/custom_api.py +1 -2
  162. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  163. evalscope/perf/plugin/api/openai_api.py +2 -3
  164. evalscope/perf/plugin/datasets/base.py +1 -2
  165. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  166. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  167. evalscope/perf/plugin/datasets/openqa.py +1 -2
  168. evalscope/perf/utils/analysis_result.py +1 -2
  169. evalscope/perf/utils/benchmark_util.py +1 -2
  170. evalscope/perf/utils/db_util.py +11 -8
  171. evalscope/perf/utils/local_server.py +19 -13
  172. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  173. evalscope/registry/tasks/arc.yaml +2 -3
  174. evalscope/registry/tasks/bbh.yaml +3 -4
  175. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  176. evalscope/registry/tasks/ceval.yaml +3 -3
  177. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  178. evalscope/registry/tasks/cmmlu.yaml +3 -3
  179. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  180. evalscope/registry/tasks/general_qa.yaml +1 -1
  181. evalscope/registry/tasks/gsm8k.yaml +2 -2
  182. evalscope/registry/tasks/mmlu.yaml +3 -3
  183. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  184. evalscope/run.py +184 -375
  185. evalscope/run_arena.py +20 -25
  186. evalscope/summarizer.py +16 -17
  187. evalscope/third_party/longbench_write/README.md +99 -42
  188. evalscope/third_party/longbench_write/default_task.json +1 -1
  189. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  190. evalscope/third_party/longbench_write/eval.py +29 -28
  191. evalscope/third_party/longbench_write/infer.py +16 -104
  192. evalscope/third_party/longbench_write/longbench_write.py +5 -5
  193. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  194. evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
  195. evalscope/third_party/longbench_write/utils.py +0 -1
  196. evalscope/third_party/toolbench_static/eval.py +14 -15
  197. evalscope/third_party/toolbench_static/infer.py +48 -69
  198. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  199. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  200. evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
  201. evalscope/tools/combine_reports.py +25 -30
  202. evalscope/tools/rewrite_eval_results.py +14 -46
  203. evalscope/utils/__init__.py +0 -1
  204. evalscope/utils/arena_utils.py +18 -48
  205. evalscope/{perf/utils → utils}/chat_service.py +3 -4
  206. evalscope/utils/completion_parsers.py +3 -8
  207. evalscope/utils/logger.py +9 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +12 -138
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/METADATA +123 -118
  212. evalscope-0.8.0.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +54 -15
  214. tests/perf/test_perf.py +4 -0
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  222. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  224. evalscope/cache.py +0 -98
  225. evalscope/models/template.py +0 -1446
  226. evalscope/run_ms.py +0 -140
  227. evalscope/utils/task_cfg_parser.py +0 -10
  228. evalscope/utils/task_utils.py +0 -22
  229. evalscope-0.7.2.dist-info/RECORD +0 -286
  230. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
  231. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
  232. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
  233. {evalscope-0.7.2.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0
evalscope/config.py CHANGED
@@ -1,69 +1,127 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- import os
  import copy
- from dataclasses import dataclass, asdict, field
- from typing import Optional, List
+ import json
+ import os
+ from argparse import Namespace
+ from dataclasses import dataclass, field
+ from typing import Dict, List, Optional, Union

- from evalscope.constants import DEFAULT_ROOT_CACHE_DIR
+ from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, DEFAULT_WORK_DIR, EvalBackend, EvalStage, EvalType, HubType
  from evalscope.models.custom import CustomModel
- from evalscope.utils import yaml_to_dict
+ from evalscope.utils import dict_to_yaml, gen_hash, json_to_dict, yaml_to_dict
  from evalscope.utils.logger import get_logger

  logger = get_logger()

  cur_path = os.path.dirname(os.path.abspath(__file__))

- registry_tasks = {
-     'arc': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/arc.yaml')),
-     'gsm8k': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/gsm8k.yaml')),
-     'mmlu': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/mmlu.yaml')),
-     'cmmlu': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/cmmlu.yaml')),
-     'ceval': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/ceval.yaml')),
-     'bbh': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/bbh.yaml')),
-     'general_qa': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/general_qa.yaml')),
-
-     # 'bbh_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/bbh_mini.yaml')),
-     # 'mmlu_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/mmlu_mini.yaml')),
-     # 'ceval_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/ceval_mini.yaml')),
-
+ DEFAULT_MODEL_ARGS = {'revision': 'master', 'precision': 'torch.float16', 'device': 'auto'}
+ DEFAULT_GENERATION_CONFIG = {
+     'max_length': 2048,
+     'max_new_tokens': 512,
+     'do_sample': False,
+     'top_k': 50,
+     'top_p': 1.0,
+     'temperature': 1.0,
  }


  @dataclass
  class TaskConfig:
-     model_args: Optional[dict] = field(default_factory=dict)
-     template_type: Optional[str] = 'default-generation'
-     generation_config: Optional[dict] = field(default_factory=dict)
-     dataset_args: Optional[dict] = field(default_factory=dict)
+     # Model-related arguments
+     model: Union[str, CustomModel, None] = None
+     model_args: Optional[Dict] = field(default_factory=lambda: DEFAULT_MODEL_ARGS | {})
+
+     # Template-related arguments
+     template_type: Optional[str] = None  # Deprecated, will be removed in v1.0.0.
+     chat_template: Optional[str] = None
+
+     # Dataset-related arguments
+     datasets: Optional[List[str]] = None
+     dataset_args: Optional[Dict] = field(default_factory=dict)
+     dataset_dir: str = DEFAULT_DATASET_CACHE_DIR
+     dataset_hub: str = HubType.MODELSCOPE
+
+     # Generation configuration arguments
+     generation_config: Optional[Dict] = field(default_factory=lambda: DEFAULT_GENERATION_CONFIG | {})
+
+     # Evaluation-related arguments
+     eval_type: str = EvalType.CHECKPOINT
+     eval_backend: str = EvalBackend.NATIVE
+     eval_config: Union[str, Dict, None] = None
+     stage: str = EvalStage.ALL
+     limit: Optional[int] = None
+
+     # Cache and working directory arguments
+     mem_cache: bool = False  # Deprecated, will be removed in v1.0.0.
+     use_cache: Optional[str] = None
+     work_dir: str = DEFAULT_WORK_DIR
+     outputs: Optional[str] = None  # Deprecated, will be removed in v1.0.0.
+
+     # Debug and runtime mode arguments
+     debug: bool = False
      dry_run: bool = False
-     model: CustomModel = None
-     eval_type: str = 'custom'
-     datasets: list = field(default_factory=list)
-     work_dir: str = DEFAULT_ROOT_CACHE_DIR
-     outputs: str = None
-     mem_cache: bool = False
-     use_cache: bool = True
-     stage: str = 'all'  # `all` or `infer` or `review`
-     dataset_hub: str = 'ModelScope'
-     dataset_dir: str = DEFAULT_ROOT_CACHE_DIR
-     limit: int = None
-     eval_backend: str = 'Native'
-     eval_config: dict = field(default_factory=dict)
-
-     # def __post_init__(self):
-     #     self.registry_tasks = {
-     #         'arc': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/arc.yaml')),
-     #         'gsm8k': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/gsm8k.yaml')),
-     #         'mmlu': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/mmlu.yaml')),
-     #         'ceval': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/ceval.yaml')),
-     #         'bbh': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/bbh.yaml')),
-     #
-     #         'bbh_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/bbh_mini.yaml')),
-     #         'mmlu_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/mmlu_mini.yaml')),
-     #         'ceval_mini': yaml_to_dict(os.path.join(cur_path, 'registry/tasks/ceval_mini.yaml')),
-     #
-     #     }
+     seed: int = 42
+
+     def to_dict(self):
+         # Note: to avoid serialization error for some model instance
+         return self.__dict__
+
+     def __str__(self):
+         return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
+
+     def update(self, other: Union['TaskConfig', dict]):
+         if isinstance(other, TaskConfig):
+             other = other.to_dict()
+         self.__dict__.update(other)
+
+     def dump_yaml(self, output_dir: str):
+         """Dump the task configuration to a YAML file."""
+         task_cfg_file = os.path.join(output_dir, f'task_config_{gen_hash(str(self), bits=6)}.yaml')
+         try:
+             logger.info(f'Dump task config to {task_cfg_file}')
+             dict_to_yaml(self.to_dict(), task_cfg_file)
+         except Exception as e:
+             logger.warning(f'Failed to dump overall task config: {e}')
+
+     @staticmethod
+     def list():
+         return list(registry_tasks.keys())
+
+     @staticmethod
+     def from_yaml(yaml_file: str):
+         return TaskConfig.from_dict(yaml_to_dict(yaml_file))
+
+     @staticmethod
+     def from_dict(d: dict):
+         return TaskConfig(**d)
+
+     @staticmethod
+     def from_json(json_file: str):
+         return TaskConfig.from_dict(json_to_dict(json_file))
+
+     @staticmethod
+     def from_args(args: Namespace):
+         # Convert Namespace to a dictionary and filter out None values
+         args_dict = {k: v for k, v in vars(args).items() if v is not None}
+         del args_dict['func']  # Note: compat CLI arguments
+
+         return TaskConfig.from_dict(args_dict)
+
+     @staticmethod
+     def load(custom_model: CustomModel, tasks: List[str]) -> List['TaskConfig']:
+         res_list = []
+         for task_name in tasks:
+             task = registry_tasks.get(task_name, None)
+             if task is None:
+                 logger.error(f'No task found in tasks: {list(registry_tasks.keys())}, got task_name: {task_name}')
+                 continue
+
+             task.model = custom_model
+             res_list.append(task)
+
+         return res_list

      @staticmethod
      def registry(name: str, data_pattern: str, dataset_dir: str = None, subset_list: list = None) -> None:
@@ -75,7 +133,7 @@ class TaskConfig:
              data_pattern: str, the data pattern for the task.
                  e.g. `mmlu`, `ceval`, `gsm8k`, ...
                  refer to task_config.list() for all available datasets.
-             dataset_dir: str, the directory to store multiple datasets files. e.g. /path/to/data,
+             dataset_dir: str, the directory to store multiple datasets files. e.g. /path/to/data,
                  then your specific custom dataset directory will be /path/to/data/{name}
              subset_list: list, the subset list for the dataset.
                  e.g. ['middle_school_politics', 'operating_system']
@@ -83,63 +141,31 @@ class TaskConfig:
          """
          available_datasets = list(registry_tasks.keys())
          if data_pattern not in available_datasets:
-             logger.error(f'No dataset found in available datasets: {available_datasets}, got data_pattern: {data_pattern}')
+             logger.error(
+                 f'No dataset found in available datasets: {available_datasets}, got data_pattern: {data_pattern}')
              return

          # Reuse the existing task config and update the datasets
-         pattern_config = registry_tasks.get(data_pattern)
+         pattern_config = registry_tasks[data_pattern]

          custom_config = copy.deepcopy(pattern_config)
-         custom_config.update({'datasets': [data_pattern]})
-         custom_config.update({'dataset_hub': 'Local'})  # TODO: to support `ModelScope`
-         if 'dataset_args' in custom_config:
-             if data_pattern not in custom_config:
-                 custom_config['dataset_args'].update({data_pattern: {}})
-         else:
-             custom_config.update({'dataset_args': {data_pattern: {}}})
+         custom_config.datasets = [data_pattern]
+         custom_config.dataset_args = {data_pattern: {}}
+         custom_config.eval_type = EvalType.CHECKPOINT

          if dataset_dir is not None:
-             custom_config['dataset_args'][data_pattern].update({'local_path': dataset_dir})
+             custom_config.dataset_args[data_pattern].update({'local_path': dataset_dir})

          if subset_list is not None:
-             # custom_config['dataset_args'].get(data_pattern, {}).update({'subset_list': subset_list})
-             custom_config['dataset_args'][data_pattern].update({'subset_list': subset_list})
+             custom_config.dataset_args[data_pattern].update({'subset_list': subset_list})

          registry_tasks.update({name: custom_config})
          logger.info(f'** Registered task: {name} with data pattern: {data_pattern}')

-     def to_dict(self):
-         # Note: to avoid serialization error for some model instance
-         _tmp_model = copy.copy(self.model)
-         self.model = None
-         res_dict = asdict(self)
-         res_dict.update({'model': _tmp_model})
-         self.model = _tmp_model

-         return res_dict
+ tasks = ['arc', 'gsm8k', 'mmlu', 'cmmlu', 'ceval', 'bbh', 'general_qa']

-     @staticmethod
-     def load(custom_model: CustomModel, tasks: List[str]) -> List['TaskConfig']:
-         res_list = []
-         for task_name in tasks:
-             task: dict = registry_tasks.get(task_name, None)
-             if task is None:
-                 logger.error(f'No task found in tasks: {list(registry_tasks.keys())}, got task_name: {task_name}')
-                 continue
-
-             res = TaskConfig(**task)
-             res.model = custom_model
-             if res.outputs is None:
-                 res.outputs = os.path.join(res.work_dir,
-                                            'outputs',
-                                            f"eval_{'-'.join(tasks)}_{res.model.config['model_id']}_{res.model_args.get('revision', 'default')}")
-             res_list.append(res)
-
-         return res_list
-
-     @staticmethod
-     def list():
-         return list(registry_tasks.keys())
+ registry_tasks = {task: TaskConfig.from_yaml(os.path.join(cur_path, f'registry/tasks/{task}.yaml')) for task in tasks}


  class TempModel(CustomModel):
@@ -158,9 +184,7 @@ if __name__ == '__main__':
      # Register a new task
      TaskConfig.registry(name='arc_swift', data_pattern='arc', dataset_dir='/path/to/swift_custom_work')

-     import json
      swift_eval_task: List[TaskConfig] = TaskConfig.load(custom_model=model, tasks=['gsm8k', 'arc', 'arc_swift'])
      for item in swift_eval_task:
-         print(item.to_dict())
+         print(item)
          print()
-
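
For orientation, a minimal usage sketch of the reworked TaskConfig follows, using only methods visible in the diff above; the model id, dataset choice, and output path are illustrative assumptions, not part of the release.

    # Sketch only: exercising the new TaskConfig API shown in the diff above.
    from evalscope.config import TaskConfig

    # Build a config from a plain dict; unspecified fields fall back to the new
    # defaults (DEFAULT_MODEL_ARGS, DEFAULT_GENERATION_CONFIG, EvalType.CHECKPOINT, ...).
    cfg = TaskConfig.from_dict({
        'model': 'qwen/Qwen-7B-Chat',  # illustrative model id
        'datasets': ['gsm8k'],
        'limit': 10,
    })

    print(TaskConfig.list())    # registered task names: arc, gsm8k, mmlu, cmmlu, ceval, bbh, general_qa
    print(cfg)                  # __str__ now renders the config as pretty-printed JSON
    cfg.dump_yaml('./outputs')  # writes task_config_<hash>.yaml via gen_hash and dict_to_yaml

Note that registry_tasks now maps each task name to a full TaskConfig instance built from the same registry YAML files, rather than a raw dict, which is why load() can assign task.model directly.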
evalscope/constants.py CHANGED
@@ -1,7 +1,18 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
- from enum import Enum
+ import os
+ from modelscope.utils.constant import DEFAULT_REPOSITORY_REVISION
+ from modelscope.utils.file_utils import get_dataset_cache_root, get_model_cache_root

- DEFAULT_ROOT_CACHE_DIR = '~/.cache/evalscope'
+ DEFAULT_WORK_DIR = './outputs'
+ DEFAULT_MODEL_REVISION = DEFAULT_REPOSITORY_REVISION  # master
+ DEFAULT_MODEL_CACHE_DIR = get_model_cache_root()  # ~/.cache/modelscope/hub
+ DEFAULT_DATASET_CACHE_DIR = get_dataset_cache_root()  # ~/.cache/modelscope/datasets
+
+
+ class HubType:
+     MODELSCOPE = 'modelscope'
+     HUGGINGFACE = 'huggingface'
+     LOCAL = 'local'


  class DumpMode:
@@ -25,7 +36,7 @@ class MetricsConstant:
      ]


- class MetricMembers(Enum):
+ class MetricMembers:

      # Math accuracy metric
      MATH_ACCURACY = 'math_accuracy'
@@ -66,53 +77,51 @@ class ArenaMode:


  class OutputsStructure:
-
-     LOGS_DIR = 'logs_dir'
-
-     PREDICTIONS_DIR = 'predictions_dir'
-
-     REVIEWS_DIR = 'reviews_dir'
-
-     REPORTS_DIR = 'reports_dir'
-
-     CONFIGS_DIR = 'configs_dir'
+     LOGS_DIR = 'logs'
+     PREDICTIONS_DIR = 'predictions'
+     REVIEWS_DIR = 'reviews'
+     REPORTS_DIR = 'reports'
+     CONFIGS_DIR = 'configs'
+
+     def __init__(self, outputs_dir: str, is_make: bool = True):
+         self.outputs_dir = outputs_dir
+         self.logs_dir = os.path.join(outputs_dir, OutputsStructure.LOGS_DIR)
+         self.predictions_dir = os.path.join(outputs_dir, OutputsStructure.PREDICTIONS_DIR)
+         self.reviews_dir = os.path.join(outputs_dir, OutputsStructure.REVIEWS_DIR)
+         self.reports_dir = os.path.join(outputs_dir, OutputsStructure.REPORTS_DIR)
+         self.configs_dir = os.path.join(outputs_dir, OutputsStructure.CONFIGS_DIR)
+
+         if is_make:
+             self.create_directories()
+
+     def create_directories(self):
+         os.makedirs(self.outputs_dir, exist_ok=True)
+         os.makedirs(self.logs_dir, exist_ok=True)
+         os.makedirs(self.predictions_dir, exist_ok=True)
+         os.makedirs(self.reviews_dir, exist_ok=True)
+         os.makedirs(self.reports_dir, exist_ok=True)
+         os.makedirs(self.configs_dir, exist_ok=True)


  class AnswerKeys:
-
      ANSWER_ID = 'answer_id'
-
      RAW_INPUT = 'raw_input'
-
      ORIGIN_PROMPT = 'origin_prompt'
-
      MODEL_SPEC = 'model_spec'
-
      SUBSET_NAME = 'subset_name'
-
      CHOICES = 'choices'


  class ReviewKeys:
-
      REVIEW_ID = 'review_id'
-
      REVIEWED = 'reviewed'
-
      REVIEWER_SPEC = 'reviewer_spec'
-
      REVIEW_TIME = 'review_time'
-
      MESSAGE = 'message'
-
      CONTENT = 'content'
-
      GOLD = 'gold'
-
      PRED = 'pred'
-
      RESULT = 'result'
-
      REVIEW = 'review'


@@ -148,3 +157,26 @@ class EvalStage:
      ALL = 'all'
      INFER = 'infer'
      REVIEW = 'review'
+
+
+ class EvalType:
+
+     CUSTOM = 'custom'
+     CHECKPOINT = 'checkpoint'
+
+
+ class EvalBackend:
+     # Use native evaluation pipeline of EvalScope
+     NATIVE = 'Native'
+
+     # Use OpenCompass framework as the evaluation backend
+     OPEN_COMPASS = 'OpenCompass'
+
+     # Use VLM Eval Kit as the multi-modal model evaluation backend
+     VLM_EVAL_KIT = 'VLMEvalKit'
+
+     # Use RAGEval as the RAG evaluation backend
+     RAG_EVAL = 'RAGEval'
+
+     # Use third-party evaluation backend/modules
+     THIRD_PARTY = 'ThirdParty'
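
As a quick illustration of the reworked OutputsStructure, a sketch follows using only attributes visible in the diff above; the output paths are illustrative assumptions.

    # Sketch only: the OutputsStructure helper from the diff above.
    from evalscope.constants import OutputsStructure

    # Instantiation computes logs/, predictions/, reviews/, reports/ and configs/
    # under the given root, and creates them eagerly since is_make defaults to True.
    outputs = OutputsStructure('./outputs/demo')  # illustrative path
    print(outputs.reports_dir)  # ./outputs/demo/reports

    # Pass is_make=False to compute the paths without touching the filesystem.
    paths_only = OutputsStructure('./outputs/dry-run', is_make=False)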
evalscope/evaluator/__init__.py CHANGED
@@ -1,3 +1,4 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

  from evalscope.evaluator.evaluator import Evaluator
+ from evalscope.evaluator.humaneval_evaluator import HumanevalEvaluator