evalscope 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. evalscope/arguments.py +1 -0
  2. evalscope/benchmarks/arc/arc_adapter.py +3 -5
  3. evalscope/benchmarks/bbh/bbh_adapter.py +3 -3
  4. evalscope/benchmarks/benchmark.py +1 -1
  5. evalscope/benchmarks/ceval/ceval_adapter.py +5 -82
  6. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +5 -79
  7. evalscope/benchmarks/competition_math/competition_math_adapter.py +4 -4
  8. evalscope/benchmarks/data_adapter.py +69 -70
  9. evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -63
  10. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +4 -5
  11. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +12 -6
  12. evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -4
  13. evalscope/benchmarks/ifeval/__init__.py +0 -0
  14. evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
  15. evalscope/benchmarks/ifeval/instructions.py +1478 -0
  16. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  17. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  18. evalscope/benchmarks/ifeval/utils.py +134 -0
  19. evalscope/benchmarks/iquiz/__init__.py +0 -0
  20. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  21. evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -84
  22. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +2 -2
  23. evalscope/benchmarks/race/race_adapter.py +4 -73
  24. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -6
  25. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -57
  26. evalscope/cli/cli.py +2 -0
  27. evalscope/cli/start_app.py +29 -0
  28. evalscope/collections/evaluator.py +82 -62
  29. evalscope/collections/sampler.py +47 -41
  30. evalscope/collections/schema.py +14 -10
  31. evalscope/constants.py +4 -0
  32. evalscope/evaluator/evaluator.py +22 -13
  33. evalscope/metrics/__init__.py +2 -5
  34. evalscope/metrics/metrics.py +11 -2
  35. evalscope/metrics/named_metrics.py +17 -0
  36. evalscope/models/server_adapter.py +11 -4
  37. evalscope/perf/__init__.py +1 -0
  38. evalscope/perf/main.py +0 -1
  39. evalscope/perf/plugin/api/custom_api.py +1 -1
  40. evalscope/perf/plugin/api/openai_api.py +1 -1
  41. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  42. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  43. evalscope/report/__init__.py +5 -0
  44. evalscope/report/app.py +506 -0
  45. evalscope/report/combinator.py +73 -0
  46. evalscope/report/generator.py +80 -0
  47. evalscope/report/utils.py +133 -0
  48. evalscope/run.py +16 -11
  49. evalscope/summarizer.py +1 -1
  50. evalscope/utils/chat_service.py +1 -1
  51. evalscope/utils/logger.py +1 -0
  52. evalscope/utils/model_utils.py +5 -2
  53. evalscope/version.py +2 -2
  54. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/METADATA +84 -7
  55. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/RECORD +62 -50
  56. tests/cli/test_collection.py +11 -7
  57. tests/cli/test_run.py +13 -4
  58. evalscope/tools/__init__.py +0 -1
  59. evalscope/tools/combine_reports.py +0 -133
  60. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  61. /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
  62. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
  63. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
  64. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
  65. {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
evalscope/arguments.py CHANGED
@@ -33,6 +33,7 @@ def add_argument(parser: argparse.ArgumentParser):
  # yapf: disable
  # Model-related arguments
  parser.add_argument('--model', type=str, required=False, help='The model id on modelscope, or local model dir.')
+ parser.add_argument('--model-id', type=str, required=False, help='The model id for model name in report.')
  parser.add_argument('--model-args', type=str, action=ParseStrArgsAction, help='The model args, should be a string.')

  # Template-related arguments
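For reference, a minimal sketch of how the two flags above behave together, using only the add_argument calls visible in this hunk (the model names are made up, and the custom ParseStrArgsAction for --model-args is omitted):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, required=False, help='The model id on modelscope, or local model dir.')
    parser.add_argument('--model-id', type=str, required=False, help='The model id for model name in report.')

    # '--model-id' is exposed as args.model_id; per the help text it supplies the model name used in the report.
    args = parser.parse_args(['--model', 'qwen/Qwen2-7B-Instruct', '--model-id', 'qwen2-7b'])
    print(args.model, args.model_id)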
evalscope/benchmarks/arc/arc_adapter.py CHANGED
@@ -5,7 +5,7 @@ import os

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import WeightedAverageAccuracy, exact_match
+ from evalscope.metrics import AverageAccuracy, exact_match
  from evalscope.models import MultiChoiceModelAdapter
  from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger
@@ -20,7 +20,7 @@ logger = get_logger()
  dataset_id='modelscope/ai2_arc',
  model_adapter=MultiChoiceModelAdapter,
  subset_list=['ARC-Easy', 'ARC-Challenge'],
- metric_list=[WeightedAverageAccuracy],
+ metric_list=[AverageAccuracy],
  few_shot_num=0,
  train_split='train',
  eval_split='test',
@@ -109,12 +109,10 @@ class ARCAdapter(DataAdapter):
  few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
  context: str = '\n'.join(few_shot_prompts)

- context = f'{self.prompt_template}\n{context}' if self.prompt_template else context
-
  # context = f'The following are multiple choice questions, please output correct answer in the form of A or B or C or D, do not output explanation:\n {context}'
  full_prompt: str = context + self._generate_prompt(input_d=input_d, include_answer=False)

- return {'data': [full_prompt], 'multi_choices': self.choices}
+ return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}

  def get_gold_answer(self, input_d: dict) -> str:
  # Get the gold choice
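Across these adapters the prompt_template is no longer prepended to the few-shot context; it is passed through as a system_prompt instead. A hedged illustration of the dict ARCAdapter.gen_prompt now returns (the question text is made up):

    prompt = {
        'data': ['Question: Which gas do plants absorb from the air?\nA. oxygen\nB. carbon dioxide\n...\nAnswer:'],
        'multi_choices': ['A', 'B', 'C', 'D'],   # self.choices
        'system_prompt': None,                   # self.prompt_template; None unless the benchmark defines one
    }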
evalscope/benchmarks/bbh/bbh_adapter.py CHANGED
@@ -7,7 +7,7 @@ import re

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import AnswerKeys
- from evalscope.metrics import WeightedAverageAccuracy, exact_match
+ from evalscope.metrics import AverageAccuracy, exact_match
  from evalscope.models.chat_adapter import ChatGenerationModelAdapter
  from evalscope.utils import ResponseParser
  from evalscope.utils.logger import get_logger
@@ -63,7 +63,7 @@ SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST
  dataset_id='modelscope/bbh',
  model_adapter=ChatGenerationModelAdapter,
  subset_list=SUBSET_LIST,
- metric_list=[WeightedAverageAccuracy],
+ metric_list=[AverageAccuracy],
  few_shot_num=3,
  train_split=None,
  eval_split='test',
@@ -122,7 +122,7 @@ class BBHAdapter(DataAdapter):
  cot_prompts: str = few_shot_list[0] if len(few_shot_list) > 0 else ''
  full_prompt: str = f"Follow the given examples and answer the question.\n{cot_prompts}\n\nQ: {input_d['input']}\nA: Let's think step by step."

- return {'data': [full_prompt]}
+ return {'data': [full_prompt], 'system_prompt': self.prompt_template}

  def gen_prompts(self, data_dict: dict) -> dict:
  """
evalscope/benchmarks/benchmark.py CHANGED
@@ -22,7 +22,7 @@ class BenchmarkMeta:
  few_shot_random: bool = False
  train_split: Optional[str] = None
  eval_split: Optional[str] = None
- prompt_template: str = ''
+ prompt_template: Optional[str] = None

  def _update(self, args: dict):
  if args.get('local_path'):
evalscope/benchmarks/ceval/ceval_adapter.py CHANGED
@@ -4,7 +4,7 @@ import os

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import WeightedAverageAccuracy
+ from evalscope.metrics import AverageAccuracy
  from evalscope.metrics.metrics import exact_match, weighted_mean
  from evalscope.models import MultiChoiceModelAdapter
  from evalscope.utils import ResponseParser, normalize_score
@@ -130,7 +130,7 @@ SUBJECT_MAPPING = {
  dataset_id='modelscope/ceval-exam',
  model_adapter=MultiChoiceModelAdapter,
  subset_list=SUBSET_LIST,
- metric_list=[WeightedAverageAccuracy],
+ metric_list=[AverageAccuracy],
  few_shot_num=0,
  train_split='dev',
  eval_split='val',
@@ -145,9 +145,10 @@ class CEVALAdapter(DataAdapter):
  if few_shot_num > 5:
  logger.warning(f'few_shot_num <= 5 for C-Eval, but got {few_shot_num}. Use 5-shot by default.')
  kwargs['few_shot_num'] = 5
-
  super().__init__(**kwargs)

+ self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
+
  def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
  data_dict = {}
  for subset_name in subset_list:
@@ -206,7 +207,7 @@ class CEVALAdapter(DataAdapter):
  subject_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
  full_prompt = f'以下是中国关于{subject_name}考试的单项选择题,请选出其中的正确答案。\n' + full_prompt

- return {'data': [full_prompt], 'multi_choices': self.choices}
+ return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': self.prompt_template}

  def get_gold_answer(self, input_d: dict) -> str:
  # Get the gold choice
@@ -236,84 +237,6 @@ class CEVALAdapter(DataAdapter):
  def match(self, gold: str, pred: str) -> float:
  return exact_match(gold=gold, pred=pred)

- def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
- """
- Generate report for the evaluation.
-
- Args:
- subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
- report_name: The user-defined report name.
-
- Returns:
- {
- "name":"C-Eval",
- "metric":"WeightedAverageAccuracy",
- "score":0.3389,
- "category":[
- {
- "name":"STEM",
- "score":0.2528,
- "subset":[
- {
- "name":"computer_network",
- "score":0.2632
- },
- {
- "name":"operating_system",
- "score":0.3157
- },
- {
- "name":"computer_architecture",
- "score":0.4285
- }
- ]
- }
- ],
- "total_num":59
- }
- """
- total_num: int = sum([num for _, num in subset_score_map.values()])
- weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
- weighted_avg_acc = normalize_score(score=weighted_avg_acc)
-
- # Get domain-subject mapping
- subject_review_map = {}
- for subset_name, (subset_score, num) in subset_score_map.items():
- domain_name: str = SUBJECT_MAPPING.get(subset_name)[2] if SUBJECT_MAPPING.get(subset_name) else 'DEFAULT'
- if domain_name in subject_review_map:
- subject_review_map[domain_name].append((subset_name, subset_score, num))
- else:
- subject_review_map[domain_name] = [(subset_name, subset_score, num)]
-
- # Get domain score
- category_list = []
- for domain_name, domain_res_list in subject_review_map.items():
- domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
- sum([num for _, _, num in domain_res_list])
- domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
- category_list.append({
- 'name':
- domain_name,
- 'score':
- domain_weighted_avg_acc,
- 'subset': [{
- 'name': subset_name,
- 'score': normalize_score(score=subset_score)
- } for subset_name, subset_score, _ in domain_res_list]
- })
-
- category_list = sorted(category_list, key=lambda x: x['name'])
-
- # Get final dict of report
- res_map = dict(
- name=report_name or 'ceval',
- metric=self.metric_list[0]['name'],
- score=weighted_avg_acc,
- category=category_list,
- total_num=total_num)
-
- return res_map
-
  @classmethod
  def _format_example(cls, input_d: dict, include_answer=True):
  example = '问题:' + input_d['question']
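The per-benchmark gen_report overrides above are dropped in favour of a category_map that the shared ReportGenerator consumes. A small, hedged illustration of what the category_map one-liner produces (the SUBJECT_MAPPING entries here are shortened stand-ins, not the real mapping in ceval_adapter.py; only the convention that the category name is the last element of each value comes from this diff):

    # Illustrative entries only: value format is (English name, Chinese name, category).
    SUBJECT_MAPPING = {
        'computer_network': ['Computer Network', '计算机网络', 'STEM'],
        'operating_system': ['Operating System', '操作系统', 'STEM'],
        'logic': ['Logic', '逻辑学', 'Humanities'],
    }

    category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
    # {'computer_network': 'STEM', 'operating_system': 'STEM', 'logic': 'Humanities'}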
evalscope/benchmarks/cmmlu/cmmlu_adapter.py CHANGED
@@ -5,7 +5,7 @@ import os

  from evalscope.benchmarks import Benchmark, DataAdapter
  from evalscope.constants import EvalType
- from evalscope.metrics import WeightedAverageAccuracy, exact_match
+ from evalscope.metrics import AverageAccuracy, exact_match
  from evalscope.models import MultiChoiceModelAdapter
  from evalscope.utils import ResponseParser, normalize_score
  from evalscope.utils.logger import get_logger
@@ -106,7 +106,7 @@ SUBJECT_MAPPING = {
  dataset_id='modelscope/cmmlu',
  model_adapter=MultiChoiceModelAdapter,
  subset_list=SUBSET_LIST,
- metric_list=[WeightedAverageAccuracy],
+ metric_list=[AverageAccuracy],
  few_shot_num=5,
  train_split='dev',
  eval_split='test',
@@ -116,9 +116,10 @@ class CMMLUAdapter(DataAdapter):
  choices = ['A', 'B', 'C', 'D']

  def __init__(self, **kwargs):
-
  super().__init__(**kwargs)

+ self.category_map = {k: v[-1] for k, v in SUBJECT_MAPPING.items()}
+
  def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
  data_dict = {}
  for subset_name in subset_list:
@@ -173,7 +174,7 @@ class CMMLUAdapter(DataAdapter):

  full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)

- return {'data': [full_prompt], 'multi_choices': self.choices}
+ return {'data': [full_prompt], 'multi_choices': self.choices, 'system_prompt': prompt}

  def get_gold_answer(self, input_d: dict) -> str:
  # Get the gold choice
@@ -203,81 +204,6 @@ class CMMLUAdapter(DataAdapter):
  def match(self, gold: str, pred: str) -> float:
  return exact_match(gold=gold, pred=pred)

- def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
- """
- Generate report for the evaluation.
-
- Args:
- subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
- report_name: the user-defined report name. Default: None
-
- Returns:
- {
- "name":"CMMLU",
- "metric":"WeightedAverageAccuracy",
- "score":0.3389,
- "category":[
- {
- "name":"STEM",
- "score":0.2528,
- "subset":[
- {
- "name":"computer_network",
- "score":0.2632
- },
- {
- "name":"operating_system",
- "score":0.3157
- },
- {
- "name":"computer_architecture",
- "score":0.4285
- }
- ]
- }
- ],
- "total_num":59
- }
- """
- total_num: int = sum([num for _, num in subset_score_map.values()])
- weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-
- # Get domain-subject mapping
- subject_review_map = {}
- for subset_name, (subset_score, num) in subset_score_map.items():
- domain_name: str = SUBJECT_MAPPING.get(subset_name)[1] if SUBJECT_MAPPING.get(subset_name) else subset_name
- if domain_name in subject_review_map:
- subject_review_map[domain_name].append((subset_name, subset_score, num))
- else:
- subject_review_map[domain_name] = [(subset_name, subset_score, num)]
-
- # Get domain score
- category_list = []
- for domain_name, domain_res_list in subject_review_map.items():
- domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
- sum([num for _, _, num in domain_res_list])
- domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
- category_list.append({
- 'name':
- domain_name,
- 'score':
- domain_weighted_avg_acc,
- 'subset': [{
- 'name': subset_name,
- 'score': normalize_score(subset_score)
- } for subset_name, subset_score, _ in domain_res_list]
- })
-
- # Get final dict of report
- res_map = dict(
- name=report_name or 'cmmlu',
- metric=self.metric_list[0]['name'],
- score=weighted_avg_acc,
- category=category_list,
- total_num=total_num)
-
- return res_map
-
  @classmethod
  def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:

evalscope/benchmarks/competition_math/competition_math_adapter.py CHANGED
@@ -5,7 +5,7 @@ import json
  import os

  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.metrics import WeightedAverageAccuracy
+ from evalscope.metrics import AverageAccuracy
  from evalscope.metrics.math_accuracy import is_equiv, last_boxed_only_string, remove_boxed
  from evalscope.models import ChatGenerationModelAdapter
  from evalscope.utils.logger import get_logger
@@ -20,11 +20,11 @@ logger = get_logger()
  dataset_id='modelscope/competition_math',
  model_adapter=ChatGenerationModelAdapter,
  subset_list=['default'],
- metric_list=[WeightedAverageAccuracy],
+ metric_list=[AverageAccuracy],
  few_shot_num=4,
  train_split='train',
  eval_split='test',
- prompt_template='',
+ prompt_template='Put the final answer in \\boxed{}.',
  )
  class CompetitionMathAdapter(DataAdapter):
  """ To be tested for all models. """
@@ -77,7 +77,7 @@ class CompetitionMathAdapter(DataAdapter):
  use_fewshot = self.few_shot_num > 0
  full_prompt = self._generate_prompt(input_d, use_fewshot=use_fewshot)

- return {'data': [full_prompt], 'system_prompt': 'Put the final answer in \\boxed{}.'}
+ return {'data': [full_prompt], 'system_prompt': self.prompt_template}

  def get_gold_answer(self, input_d: dict) -> str:
  # Extract the gold answer from the input dict.
evalscope/benchmarks/data_adapter.py CHANGED
@@ -2,10 +2,11 @@
  import os.path
  import random
  from abc import ABC, abstractmethod
- from typing import Any, Optional
+ from typing import Any, List, Optional

  from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, AnswerKeys, EvalType, HubType
- from evalscope.utils import normalize_score
+ from evalscope.metrics import Metric
+ from evalscope.report import Report, ReportGenerator
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -14,12 +15,13 @@ logger = get_logger()
  class DataAdapter(ABC):

  def __init__(self,
+ name: str,
  subset_list: list,
- metric_list: list,
+ metric_list: List[Metric],
  few_shot_num: Optional[int] = 0,
  train_split: Optional[str] = None,
  eval_split: Optional[str] = None,
- prompt_template: str = '',
+ prompt_template: Optional[str] = None,
  **kwargs):
  """
  Data Adapter for the benchmark. You need to implement the following methods:
@@ -28,6 +30,7 @@ class DataAdapter(ABC):
  - parse_pred_result
  - match
  Args:
+ name: str, the name of the benchmark.
  subset_list: list of subset names for the dataset.
  metric_list: list, the metric list to evaluate the model on specific benchmark.
  few_shot_num: int, number of few-shot examples. Default: 0
@@ -37,6 +40,7 @@ class DataAdapter(ABC):
  e.g. for ARC, it is `The following are multiple choice questions, please output correct answer in
  the form of A or B or C or D, do not output explanation:`
  """
+ self.name = name
  self.subset_list = subset_list
  self.metric_list = metric_list
  self.few_shot_num = few_shot_num
@@ -44,6 +48,7 @@ class DataAdapter(ABC):
  self.eval_split = eval_split
  self.prompt_template = prompt_template
  self.config_kwargs = kwargs
+ self.category_map = kwargs.get('category_map', {})

  def load(self,
  dataset_name_or_path: str,
@@ -142,59 +147,6 @@ class DataAdapter(ABC):

  return res_dict

- def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
- """
- Generate report for the evaluation results for all subsets.
-
- Args:
- subset_score_map: The subset-score map.
- e.g. {subset_name: (score, num)}
-
- report_name: str, the user-defined report name. Default: None
-
- Returns: The evaluation report. Note: should normalize the score by normalize_score method in utils.
-
- Here is a format example for ARC-Challenge:
- {
- "name":"ARC-Challenge",
- "metric":"WeightedAverageAccuracy",
- "score": 0.3389,
- "category":[
- {
- "name":"DEFAULT",
- "score": 0.3389,
- "subset":[
- {
- "name":"ARC-Challenge",
- "score": 0.3389,
- "num": 100
- },
- ]
- }
- ],
- "total_num":100
- }
- """ # noqa: E501
- total_num: int = sum([num for _, num in subset_score_map.values()])
- weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
- weighted_avg_acc = normalize_score(score=weighted_avg_acc)
- cate_avg_list = [{
- 'name': subset_name,
- 'score': normalize_score(score=score),
- 'num': num
- } for subset_name, (score, num) in subset_score_map.items()]
-
- category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)
-
- res_map = dict(
- name=report_name or 'DEFAULT',
- metric=self.metric_list[0]['name'],
- score=weighted_avg_acc,
- category=[category_d],
- total_num=total_num)
-
- return res_map
-
  def get_fewshot_examples(self, data_list: list, k: int, few_shot_random: bool = True):

  if k > len(data_list):
@@ -204,28 +156,75 @@
  else:
  return data_list[:k]

- def compute_metric(self, review_res_list: list) -> Any:
+ def compute_metric(self, review_res_list: list) -> List[dict]:
  """
  Compute evaluation result by specific metrics.

  Args:
  review_res_list: list, the review result list, each item of which is match result for gold and pred.

- Attributes:
- DataAdapter.metric_func_map: metric_name -> metric_func mapping,
- e.g. {'WeightedAverageAccuracy': weighted_average_acc}
-
  Returns:
- Metric results.
+ Metric results. e.g. [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}]
  """
  if len(self.metric_list) == 0:
  raise ValueError('No metric list found for the benchmark.')
- elif len(self.metric_list) == 1:
- # review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
- items = [(score, 1.0) for score in review_res_list]
- return self.metric_list[0]['object'](items)
- else:
- raise ValueError('Please implement the compute_metric method for multiple metrics.')
+
+ res_list = []
+ for metric in self.metric_list:
+ metric_name = metric.name
+ metric_func = metric.object
+ res_list.append({
+ 'metric_name': metric_name,
+ 'score': metric_func(review_res_list),
+ 'num': len(review_res_list)
+ })
+ return res_list
+
+ def gen_report(self, subset_score_map: dict, report_name: str = None, **kwargs) -> Report:
+ """
+ Generate report for the evaluation results for all subsets.
+
+ Args:
+ subset_score_map: The subset-score map.
+ e.g. {subset_name: [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}]}
+
+ report_name: str, the user-defined report name. Default: None
+
+ Returns: The evaluation report.
+
+ Here is a format example for gsm8k:
+ {
+ "name": "qwen2.5_gsm8k",
+ "metrics": [
+ {
+ "name": "AverageAccuracy",
+ "categories": [
+ {
+ "name": "default",
+ "subsets": [
+ {
+ "name": "main",
+ "score": 0.0,
+ "num": 2
+ }
+ ],
+ "num": 2,
+ "score": 0.0,
+ "macro_score": 0.0
+ }
+ ],
+ "num": 2,
+ "score": 0.0,
+ "macro_score": 0.0
+ }
+ ],
+ "dataset_name": "gsm8k",
+ "model_name": "qwen2.5"
+ }
+ """ # noqa: E501
+ kwargs['category_map'] = self.category_map
+ kwargs['metric_list'] = self.metric_list
+ return ReportGenerator.gen_report(subset_score_map, report_name, **kwargs)

  def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
  """
@@ -276,7 +275,7 @@ class DataAdapter(ABC):
  raise NotImplementedError

  @abstractmethod
- def match(self, gold: Any, pred: Any) -> float:
+ def match(self, gold: Any, pred: Any) -> Any:
  """
  Match the gold answer and the predicted answer.