evalscope 0.16.2__py3-none-any.whl → 0.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope has been flagged as possibly problematic by the registry scanner.
- evalscope/app/app.py +9 -762
- evalscope/app/constants.py +1 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +52 -0
- evalscope/app/ui/multi_model.py +323 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +202 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +178 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +91 -0
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/backend_manager.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
- evalscope/benchmarks/__init__.py +15 -1
- evalscope/benchmarks/aime/aime24_adapter.py +2 -1
- evalscope/benchmarks/aime/aime25_adapter.py +2 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/utils.py +0 -12
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
- evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
- evalscope/benchmarks/data_adapter.py +20 -5
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
- evalscope/benchmarks/general_arena/utils.py +226 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +42 -29
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/race/race_adapter.py +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
- evalscope/benchmarks/utils.py +1 -2
- evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
- evalscope/config.py +8 -123
- evalscope/evaluator/evaluator.py +15 -12
- evalscope/metrics/__init__.py +6 -0
- evalscope/{utils/utils.py → metrics/completion_parsers.py} +68 -180
- evalscope/metrics/llm_judge.py +105 -20
- evalscope/metrics/metrics.py +1 -1
- evalscope/models/adapters/base_adapter.py +0 -2
- evalscope/models/adapters/server_adapter.py +2 -2
- evalscope/models/custom/dummy_model.py +3 -3
- evalscope/perf/arguments.py +2 -16
- evalscope/perf/main.py +1 -1
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +1 -1
- evalscope/report/__init__.py +1 -1
- evalscope/report/utils.py +34 -15
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -2
- evalscope/utils/__init__.py +63 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/import_utils.py +16 -0
- evalscope/utils/io_utils.py +45 -4
- evalscope/utils/model_utils.py +37 -1
- evalscope/version.py +2 -2
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/METADATA +55 -26
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/RECORD +90 -101
- tests/aigc/test_t2i.py +1 -1
- tests/cli/test_all.py +50 -2
- tests/cli/test_collection.py +1 -1
- tests/cli/test_custom.py +261 -0
- tests/cli/test_run.py +13 -37
- tests/perf/test_perf.py +2 -2
- tests/rag/test_clip_benchmark.py +2 -1
- tests/rag/test_mteb.py +3 -1
- tests/rag/test_ragas.py +3 -1
- tests/swift/test_run_swift_eval.py +2 -1
- tests/swift/test_run_swift_vlm_eval.py +2 -1
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
- tests/utils.py +13 -0
- tests/vlm/test_vlmeval.py +8 -2
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- /evalscope/{utils → benchmarks}/filters.py +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/LICENSE +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/WHEEL +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/top_level.txt +0 -0
@@ -25,13 +25,21 @@ logger = get_logger()
         prompt_template='请回答问题\n{query}',
     )
 class GeneralQAAdapter(DataAdapter):
-    # TODO: set few_shot_num
 
     def __init__(self, **kwargs):
-
         super().__init__(**kwargs)
 
     def load(self, dataset_name_or_path: str = None, subset_list: list = None, **kwargs) -> dict:
+        """
+        Load dataset from the given path or dataset name.
+
+        Args:
+            dataset_name_or_path (str): Path to dataset directory or file.
+            subset_list (list): List of subset names to load.
+
+        Returns:
+            dict: Loaded dataset organized by subset.
+        """
         dataset_name_or_path = dataset_name_or_path or self.dataset_id
         subset_list = subset_list or self.subset_list
 
@@ -61,58 +69,64 @@ class GeneralQAAdapter(DataAdapter):
 
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
         """
+        Generate prompt for the model based on input data.
+
         Args:
-            input_d:
-
-
+            input_d (dict): Input data dictionary.
+            subset_name (str): Name of the subset.
+            few_shot_list (list): List of few-shot examples.
 
         Returns:
-
-
+            dict: Dictionary containing the generated prompt.
         """
-
-        history = input_d.get('history', [])  # history: [['q1', 'a1'], ['q2', 'a2'], ...]
-        if len(history) > 0:
-            logger.warning('The history is not included in the prompt for GeneralQA. \
-                To be supported in the future.')
-
+        messages = input_d.get('messages')
         query = input_d.get('question', '') or input_d.get('query', '')
         system_prompt = input_d.get('system')
         prompt = self.prompt_template.format(query=query)
-        return self.gen_prompt_data(prompt, system_prompt=system_prompt)
+        return self.gen_prompt_data(prompt, system_prompt=system_prompt, messages=messages)
 
     def get_gold_answer(self, input_d: dict) -> str:
         """
+        Extract the gold (reference) answer from the input data.
+
         Args:
-            input_d
+            input_d (dict): Input data dictionary.
 
         Returns:
-
-
+            str: Gold answer string.
         """
-        return input_d.get('answer'
+        return input_d.get('answer') or input_d.get('response')
 
     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
         """
+        Parse the prediction result.
+
         Args:
-            result:
+            result (str): Model prediction result.
+            raw_input_d (dict, optional): Original input data.
+            eval_type (str): Evaluation type.
 
         Returns:
-
-
+            str: Parsed prediction result.
         """
         return result
 
     def match(self, gold: str, pred: str) -> dict:
         """
+        Compute metric scores between gold and predicted answers.
+
         Args:
-            gold:
-            pred:
+            gold (str): Gold answer.
+            pred (str): Predicted answer.
 
         Returns:
-
-
+            dict: Dictionary of computed metric scores.
         """
+        # reference free metrics
+        if gold is None:
+            return {'AverageAccuracy': -1}
+
+        # calculate rouge and bleu scores
         res = dict()
         if 'AverageRouge' in self.metric_list:
             from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
@@ -128,14 +142,13 @@ class GeneralQAAdapter(DataAdapter):
 
     def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
         """
-
+        Compute weighted mean of the metric scores for all samples.
 
         Args:
-            review_res_list:
+            review_res_list (list): List of metric score dictionaries.
 
         Returns:
-
-
+            list: List of dictionaries with averaged metric results.
         """
         items = super().compute_dict_metric(review_res_list, **kwargs)
         return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
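Taken together, the GeneralQAAdapter hunks above change the dataset contract: chat-style messages are now forwarded to gen_prompt_data, the legacy history field is dropped, the gold answer may come from either 'answer' or 'response', and records with no gold at all short-circuit match with a reference-free placeholder score. Below is a minimal standalone sketch of how such a record is consumed; it only mirrors the input_d.get(...) calls visible in the diff, and the record itself is hypothetical, not part of evalscope.

# Standalone sketch, not the evalscope API: mirrors the field access shown in
# the hunks above on a hypothetical general_qa record.
record = {
    'query': 'What is the capital of France?',  # 'question' is also accepted
    'system': 'You are a concise assistant.',   # optional system prompt
    'messages': None,                           # optional chat messages, now passed through
    'response': 'Paris',                        # gold answer; 'answer' works too
}

prompt_template = '请回答问题\n{query}'
query = record.get('question', '') or record.get('query', '')
prompt = prompt_template.format(query=query)    # template with the query substituted
gold = record.get('answer') or record.get('response')

print(prompt)
print('gold:', gold)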
@@ -6,9 +6,9 @@ import re
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
+from evalscope.metrics.completion_parsers import ResponseParser
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
-from evalscope.utils.utils import ResponseParser
 
 # flake8: noqa
 
@@ -2,7 +2,6 @@ from collections import defaultdict
 from typing import Any, Dict, List
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.benchmarks.ifeval.utils import process_results
 from evalscope.constants import EvalType
 from evalscope.metrics import Metric, mean, metric_registry
 
@@ -43,10 +42,9 @@ class IFEvalAdapter(DataAdapter):
     def get_gold_answer(self, input_d: dict) -> str:
         return input_d
 
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        return result
-
     def match(self, gold: Any, pred: Any) -> Dict:
+        from evalscope.benchmarks.ifeval.utils import process_results
+
         return process_results(gold, [pred])
 
     def compute_metric(self, review_res_list: List[dict], **kwargs) -> Any:
@@ -1,7 +1,7 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser
 
 
 @Benchmark.register(
@@ -69,12 +69,6 @@ class LiveCodeBenchAdapter(DataAdapter):
         # Extract the gold answer from the input dict.
         return input_d
 
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
-        """
-        Parse the model output to get the answer. Could be the best choice index.
-        """
-        return result
-
     def match(self, gold: dict, pred: str) -> float:
         from .evaluate_utils import codegen_metrics
         from .extract_utils import extract_code_generation
@@ -3,7 +3,7 @@ from typing import Any
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser
 
 SUBSET_LIST = ['default']
 
@@ -5,7 +5,7 @@ import os
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
@@ -4,7 +4,7 @@ from typing import Any, Dict
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser
 
 SUBSET_LIST = [
     'computer science', 'math', 'chemistry', 'engineering', 'law', 'biology', 'health', 'physics', 'business',
@@ -4,8 +4,8 @@ from typing import Any, Dict
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
+from evalscope.metrics.completion_parsers import ResponseParser
 from evalscope.utils.logger import get_logger
-from evalscope.utils.utils import ResponseParser
 
 logger = get_logger()
 
@@ -4,7 +4,7 @@ from typing import Any
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser
 
 
 @Benchmark.register(
@@ -5,7 +5,7 @@ import os
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
@@ -96,13 +96,16 @@ class TriviaQaAdapter(DataAdapter):
         def get_sys_prompt(inp: dict) -> str:
             return inp['input'][0]['content']
 
-
+        if self.few_shot_num > 0:
+            sys_prompt = get_sys_prompt(input_d)
+        else:
+            sys_prompt = None
         few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
-        context
+        context = '\n'.join(few_shot_prompts) + '\n'
         context += self._generate_prompt(input_d=input_d, include_answer=False)
         full_prompt = context
 
-        return self.gen_prompt_data(full_prompt)
+        return self.gen_prompt_data(full_prompt, system_prompt=sys_prompt)
 
     def get_gold_answer(self, input_d: dict) -> list:
         # Get the gold choice
@@ -124,7 +127,9 @@ class TriviaQaAdapter(DataAdapter):
         return result
 
     def match(self, gold: list, pred: str) -> float:
-
+        lower_pred = pred.lower()
+        gold = [g.lower() for g in gold]
+        is_correct = any([cand in lower_pred for cand in gold])
         return 1 if is_correct else 0
 
     @classmethod
evalscope/benchmarks/utils.py CHANGED
@@ -1,7 +1,7 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser
 
 
 @Benchmark.register(
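This import swap recurs across the adapter hunks above: ResponseParser now lives in evalscope.metrics.completion_parsers (the file list shows evalscope/utils/utils.py being folded into evalscope/metrics/completion_parsers.py). Downstream code that imported it from the old location would be updated the same way; a minimal before/after, assuming evalscope 0.17.0 is installed:

# Old location (0.16.2), removed in this release:
#   from evalscope.utils.utils import ResponseParser
# New location (0.17.0), as used throughout the adapter hunks above:
from evalscope.metrics.completion_parsers import ResponseParser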
evalscope/config.py CHANGED
@@ -1,7 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
 import copy
-import json
 import os
 from argparse import Namespace
 from dataclasses import dataclass, field
@@ -10,18 +9,15 @@ from typing import Dict, List, Optional, Union
 from evalscope.constants import (DEFAULT_DATASET_CACHE_DIR, DEFAULT_WORK_DIR, EvalBackend, EvalStage, EvalType, HubType,
                                  JudgeStrategy, ModelTask, OutputType)
 from evalscope.models import CustomModel, DummyCustomModel
-from evalscope.utils import
-from evalscope.utils.io_utils import dict_to_yaml,
+from evalscope.utils.argument_utils import BaseArgument, parse_int_or_float
+from evalscope.utils.io_utils import dict_to_yaml, gen_hash
 from evalscope.utils.logger import get_logger
-from evalscope.utils.utils import parse_int_or_float
 
 logger = get_logger()
 
-cur_path = os.path.dirname(os.path.abspath(__file__))
-
 
 @dataclass
-class TaskConfig:
+class TaskConfig(BaseArgument):
     # Model-related arguments
     model: Union[str, 'CustomModel', None] = None
     model_id: Optional[str] = None
@@ -132,15 +128,6 @@ class TaskConfig:
         'precision': 'torch.float16',
     }
 
-    def to_dict(self):
-        result = self.__dict__.copy()
-        if isinstance(self.model, CustomModel):
-            result['model'] = self.model.__class__.__name__
-        return result
-
-    def __str__(self):
-        return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
-
     def update(self, other: Union['TaskConfig', dict]):
         if isinstance(other, TaskConfig):
             other = other.to_dict()
@@ -155,91 +142,11 @@ class TaskConfig:
         except Exception as e:
             logger.warning(f'Failed to dump overall task config: {e}')
 
-
-
-
-
-
-    def from_yaml(yaml_file: str):
-        return TaskConfig.from_dict(yaml_to_dict(yaml_file))
-
-    @staticmethod
-    def from_dict(d: dict):
-        return TaskConfig(**d)
-
-    @staticmethod
-    def from_json(json_file: str):
-        return TaskConfig.from_dict(json_to_dict(json_file))
-
-    @staticmethod
-    def from_args(args: Namespace):
-        # Convert Namespace to a dictionary and filter out None values
-        args_dict = {k: v for k, v in vars(args).items() if v is not None}
-
-        if 'func' in args_dict:
-            del args_dict['func']  # Note: compat CLI arguments
-
-        return TaskConfig.from_dict(args_dict)
-
-    @staticmethod
-    def load(custom_model: CustomModel, tasks: List[str]) -> List['TaskConfig']:
-        res_list = []
-        for task_name in tasks:
-            task = registry_tasks.get(task_name, None)
-            if task is None:
-                logger.error(f'No task found in tasks: {list(registry_tasks.keys())}, got task_name: {task_name}')
-                continue
-
-            task.model = custom_model
-            task.model_args = custom_model.config
-            task.model_id = type(custom_model).__name__
-            res_list.append(task)
-
-        return res_list
-
-    @staticmethod
-    def registry(name: str, data_pattern: str, dataset_dir: str = None, subset_list: list = None) -> None:
-        """
-        Register a new task (dataset) for evaluation.
-
-        Args:
-            name: str, the dataset name.
-            data_pattern: str, the data pattern for the task.
-                e.g. `mmlu`, `ceval`, `gsm8k`, ...
-                refer to task_config.list() for all available datasets.
-            dataset_dir: str, the directory to store multiple datasets files. e.g. /path/to/data,
-                then your specific custom dataset directory will be /path/to/data/{name}
-            subset_list: list, the subset list for the dataset.
-                e.g. ['middle_school_politics', 'operating_system']
-                refer to the mmlu for example. https://github.com/hendrycks/test/blob/master/categories.py
-        """
-        available_datasets = list(registry_tasks.keys())
-        if data_pattern not in available_datasets:
-            logger.error(
-                f'No dataset found in available datasets: {available_datasets}, got data_pattern: {data_pattern}')
-            return
-
-        # Reuse the existing task config and update the datasets
-        pattern_config = registry_tasks[data_pattern]
-
-        custom_config = copy.deepcopy(pattern_config)
-        custom_config.datasets = [data_pattern]
-        custom_config.dataset_args = {data_pattern: {}}
-        custom_config.eval_type = EvalType.CHECKPOINT
-
-        if dataset_dir is not None:
-            custom_config.dataset_args[data_pattern].update({'local_path': dataset_dir})
-
-        if subset_list is not None:
-            custom_config.dataset_args[data_pattern].update({'subset_list': subset_list})
-
-        registry_tasks.update({name: custom_config})
-        logger.info(f'** Registered task: {name} with data pattern: {data_pattern}')
-
-
-tasks = ['arc', 'gsm8k', 'mmlu', 'cmmlu', 'ceval', 'bbh', 'general_qa']
-
-registry_tasks = {task: TaskConfig.from_yaml(os.path.join(cur_path, f'registry/tasks/{task}.yaml')) for task in tasks}
+    def to_dict(self):
+        result = self.__dict__.copy()
+        if isinstance(self.model, CustomModel):
+            result['model'] = self.model.__class__.__name__
+        return result
 
 
 def parse_task_config(task_cfg) -> TaskConfig:
@@ -264,25 +171,3 @@ def parse_task_config(task_cfg) -> TaskConfig:
     else:
         raise ValueError('Args: Please provide a valid task config.')
     return task_cfg
-
-
-class TempModel(CustomModel):
-
-    def __init__(self, config: dict):
-        super().__init__(config=config)
-
-    def predict(self, prompts: str, **kwargs):
-        return [item + ': response' for item in prompts]
-
-
-if __name__ == '__main__':
-    model = TempModel(config={'model_id': 'test-swift-dummy-model'})
-    task_config = TaskConfig()
-
-    # Register a new task
-    TaskConfig.registry(name='arc_swift', data_pattern='arc', dataset_dir='/path/to/swift_custom_work')
-
-    swift_eval_task: List[TaskConfig] = TaskConfig.load(custom_model=model, tasks=['gsm8k', 'arc', 'arc_swift'])
-    for item in swift_eval_task:
-        print(item)
-    print()
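The config.py hunks delete TaskConfig's ad-hoc constructors and serialization helpers (from_yaml, from_dict, from_json, from_args, __str__) and make the class inherit from a new BaseArgument imported from evalscope.utils.argument_utils, a module added in this release (+64 lines). The real base class is not shown in this diff; the sketch below only mirrors the methods deleted here and is hypothetical, not evalscope's implementation.

# Hypothetical reconstruction of a BaseArgument-style mixin, based solely on
# the TaskConfig methods removed in the hunks above. The actual
# evalscope.utils.argument_utils.BaseArgument may differ.
import json
from argparse import Namespace


class BaseArgumentSketch:

    @classmethod
    def from_dict(cls, d: dict):
        return cls(**d)

    @classmethod
    def from_json(cls, json_file: str):
        with open(json_file, 'r', encoding='utf-8') as f:
            return cls.from_dict(json.load(f))

    @classmethod
    def from_args(cls, args: Namespace):
        # Drop None values and the CLI dispatch entry, as the deleted code did.
        args_dict = {k: v for k, v in vars(args).items() if v is not None}
        args_dict.pop('func', None)
        return cls.from_dict(args_dict)

    def to_dict(self) -> dict:
        return self.__dict__.copy()

    def __str__(self) -> str:
        return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)

Note that the removed TaskConfig.registry/TaskConfig.load machinery is dropped outright rather than relocated, matching the deletion of the evalscope/registry YAML task files in the file list above.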
evalscope/evaluator/evaluator.py CHANGED
@@ -14,9 +14,9 @@ from evalscope.config import TaskConfig
 from evalscope.constants import AnswerKeys, DumpMode, EvalStage, EvalType, JudgeStrategy, ReviewKeys
 from evalscope.models import BaseModelAdapter
 from evalscope.report import Report, gen_table
-from evalscope.utils import
-from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
+from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, gen_hash, jsonl_to_list
 from evalscope.utils.logger import get_logger
+from evalscope.utils.model_utils import dict_torch_dtype_to_str
 
 logger = get_logger()
 
@@ -237,9 +237,10 @@ class Evaluator(object):
         if use_llm:
             # Use LLM as judge
             assert self.judge is not None, f'Judge model is required for LLM judging {self.data_adapter.name}'
+            pred_content = self.data_adapter.llm_parse_pred_result(
+                result=answer_content, raw_input_d=raw_input_d, eval_type=self.eval_type)
             review_result = self.data_adapter.llm_match(
-                gold_content,
-                pred = answer_content
+                gold_content, pred_content, self.judge, raw_input=raw_input_d)
         else:
             # Use rule-based judging
             pred_content = self.data_adapter.parse_pred_result(
@@ -250,15 +251,14 @@ class Evaluator(object):
         if (self.task_cfg.judge_strategy == JudgeStrategy.LLM_RECALL
                 and isinstance(review_result, (bool, int, float)) and not bool(review_result)):
             assert self.judge is not None, f'Judge model is required for LLM_RECALL strategy {self.data_adapter.name}'  # noqa: E501
+            pred_content = self.data_adapter.llm_parse_pred_result(
+                result=answer_content, raw_input_d=raw_input_d, eval_type=self.eval_type)
             review_result = self.data_adapter.llm_match(
-                gold_content,
-                pred = answer_content
-        else:
-            pred = pred_content
+                gold_content, pred_content, self.judge, raw_input=raw_input_d)
 
         choice[ReviewKeys.REVIEW] = {
             ReviewKeys.GOLD: gold_content if gold_content != raw_input_d else '*Same as Input*',
-            ReviewKeys.PRED:
+            ReviewKeys.PRED: pred_content,
             ReviewKeys.RESULT: review_result
         }
         rev_choices.append(choice)
@@ -394,9 +394,6 @@ class Evaluator(object):
         report_map: Report = self.data_adapter.gen_report(
             subset_score_map=reviews_score_all, model_name=self.model_name)
 
-        # Post process report
-        self.data_adapter.post_process_report(report_map, report_path=report_path)
-
         # Make table
         try:
             report_table = gen_table(report_list=[report_map], add_overall_metric=True)
@@ -418,6 +415,12 @@ class Evaluator(object):
         report_map.to_json(report_file)
         logger.info(f'Dump report to: {report_file} \n')
 
+        # Post process report
+        try:
+            self.data_adapter.post_process_report(report_map, report_path=report_path)
+        except Exception as e:
+            logger.error(f'Failed to post process report: {e}')
+
         return report_map
 
     def eval(self, **kwargs) -> dict:
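The three review-related evaluator.py hunks are easier to read as one flow: the LLM-judge path now runs llm_parse_pred_result before llm_match, the rule-based path keeps parse_pred_result/match, the LLM_RECALL strategy re-judges with the LLM only when the rule-based result is falsy, and the parsed pred_content is what gets recorded under ReviewKeys.PRED. Below is a condensed, illustrative sketch of that control flow; method names and keyword arguments come from the diff, while the surrounding plumbing not shown in the diff is simplified.

# Condensed, illustrative sketch of the review flow after these hunks
# (not the actual Evaluator method).
def review_sample(adapter, judge, judge_strategy, use_llm,
                  answer_content, gold_content, raw_input_d, eval_type):
    if use_llm:
        # LLM-as-judge: parse first, then ask the judge model.
        pred_content = adapter.llm_parse_pred_result(
            result=answer_content, raw_input_d=raw_input_d, eval_type=eval_type)
        review_result = adapter.llm_match(
            gold_content, pred_content, judge, raw_input=raw_input_d)
    else:
        # Rule-based first.
        pred_content = adapter.parse_pred_result(
            result=answer_content, raw_input_d=raw_input_d, eval_type=eval_type)
        review_result = adapter.match(gold_content, pred_content)
        # LLM_RECALL: fall back to the judge only when the rule says "wrong".
        if (judge_strategy == 'llm_recall'
                and isinstance(review_result, (bool, int, float)) and not bool(review_result)):
            pred_content = adapter.llm_parse_pred_result(
                result=answer_content, raw_input_d=raw_input_d, eval_type=eval_type)
            review_result = adapter.llm_match(
                gold_content, pred_content, judge, raw_input=raw_input_d)
    return pred_content, review_result

The report hunks make a related ordering change: post_process_report now runs after the report JSON is dumped and inside a try/except, so a failing post-process step no longer aborts report generation.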
evalscope/metrics/__init__.py CHANGED
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING
 from evalscope.utils.import_utils import _LazyModule
 
 if TYPE_CHECKING:
+    from .completion_parsers import ResponseParser, lmsys_parser, ranking_parser
     from .llm_judge import LLMJudge
     from .math_parser import extract_answer, math_equal, strip_answer_string
     from .metrics import (bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean, simple_f1_score,
@@ -39,6 +40,11 @@ else:
             'math_equal',
             'strip_answer_string',
         ],
+        'completion_parsers': [
+            'ResponseParser',
+            'lmsys_parser',
+            'ranking_parser',
+        ],
     }
 
     import sys