evalscope 0.9.0__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/arguments.py +1 -0
- evalscope/benchmarks/arc/arc_adapter.py +3 -5
- evalscope/benchmarks/bbh/bbh_adapter.py +3 -3
- evalscope/benchmarks/benchmark.py +1 -1
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +5 -79
- evalscope/benchmarks/competition_math/competition_math_adapter.py +4 -4
- evalscope/benchmarks/data_adapter.py +69 -70
- evalscope/benchmarks/general_qa/general_qa_adapter.py +10 -63
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +4 -5
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +12 -6
- evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -4
- evalscope/benchmarks/ifeval/__init__.py +0 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
- evalscope/benchmarks/ifeval/instructions.py +1478 -0
- evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
- evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
- evalscope/benchmarks/ifeval/utils.py +134 -0
- evalscope/benchmarks/iquiz/__init__.py +0 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +8 -84
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +2 -2
- evalscope/benchmarks/race/race_adapter.py +4 -73
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -6
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +8 -57
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +29 -0
- evalscope/collections/evaluator.py +82 -62
- evalscope/collections/sampler.py +47 -41
- evalscope/collections/schema.py +14 -10
- evalscope/constants.py +4 -0
- evalscope/evaluator/evaluator.py +22 -13
- evalscope/metrics/__init__.py +2 -5
- evalscope/metrics/metrics.py +11 -2
- evalscope/metrics/named_metrics.py +17 -0
- evalscope/models/server_adapter.py +11 -4
- evalscope/perf/__init__.py +1 -0
- evalscope/perf/main.py +0 -1
- evalscope/perf/plugin/api/custom_api.py +1 -1
- evalscope/perf/plugin/api/openai_api.py +1 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/longalpaca.py +1 -1
- evalscope/report/__init__.py +5 -0
- evalscope/report/app.py +506 -0
- evalscope/report/combinator.py +73 -0
- evalscope/report/generator.py +80 -0
- evalscope/report/utils.py +133 -0
- evalscope/run.py +16 -11
- evalscope/summarizer.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/logger.py +1 -0
- evalscope/utils/model_utils.py +5 -2
- evalscope/version.py +2 -2
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/METADATA +84 -7
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/RECORD +62 -50
- tests/cli/test_collection.py +11 -7
- tests/cli/test_run.py +13 -4
- evalscope/tools/__init__.py +0 -1
- evalscope/tools/combine_reports.py +0 -133
- evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
- /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.9.0.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/general_qa/general_qa_adapter.py

```diff
@@ -1,13 +1,11 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import glob
-import json
 import os.path
 from collections import defaultdict
-from typing import
+from typing import List
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics import
-    weighted_mean)
+from evalscope.metrics import AverageBLEU, bleu_ngram_one_sample, compute_rouge_score_one_sample_zh, mean
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
@@ -20,7 +18,7 @@ logger = get_logger()
     dataset_id='general_qa',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=['default'],
-    metric_list=[
+    metric_list=[AverageBLEU],
     few_shot_num=0,
     train_split=None,
     eval_split='test',
@@ -68,7 +66,7 @@ class GeneralQAAdapter(DataAdapter):
 
         # if len(history) > 0:
         #     prompt = '\n'.join(history) + '\n' + prompt
-        return {'data': [prompt]}
+        return {'data': [prompt], 'system_prompt': self.prompt_template}
 
     def get_gold_answer(self, input_d: dict) -> str:
         """
@@ -92,14 +90,14 @@ class GeneralQAAdapter(DataAdapter):
         """
         return result
 
-    def match(self, gold: str, pred: str) ->
+    def match(self, gold: str, pred: str) -> dict:
         """
         Args:
             gold: str
             pred: str
 
         Returns:
-            bleu_score:
+            bleu_score: dict
 
         """
         res = dict()
@@ -107,10 +105,9 @@ class GeneralQAAdapter(DataAdapter):
         bleu_dict = bleu_ngram_one_sample(pred, gold)
         res.update(rouge_dict)
         res.update(bleu_dict)
-        # return bleu(item)
         return res
 
-    def compute_metric(self, review_res_list:
+    def compute_metric(self, review_res_list: List[dict]) -> List[dict]:
         """
         compute weighted mean of the bleu score of all samples
 
@@ -118,62 +115,12 @@ class GeneralQAAdapter(DataAdapter):
             review_res_list: [score1, score2, ...]
 
         Returns:
-            avg_res:
+            avg_res: List[dict]
 
         """
         items = defaultdict(list)
         for scores in review_res_list:
             for k, v in scores.items():
-                items[k].append(
+                items[k].append(v)
         # items = [(score, 1.0) for score in review_res_list]
-
-        # return weighted_mean(items)
-        return res
-
-    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
-        """
-        Args:
-            subset_score_map: {subset_name: (score_dict, num), ...}
-            report_name: str, the user-defined report name.
-
-        Returns:
-        {
-            "name":"GeneralQA",
-            "metric":"WeightedAverageBLEU",
-            "score":0.399,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.399,
-                    "subset":[
-                        {
-                            "name":"default",
-                            "score":0.399
-                        },
-                    ]
-                }
-            ],
-            "total_num":10
-        }
-        """
-        total_num: int = sum([num for _, num in subset_score_map.values()])
-        # weighted_avg_bleu: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
-        cate_avg_list = [{
-            'name': subset_name,
-            'score': score_dict
-        } for subset_name, (score_dict, _) in subset_score_map.items()]
-        total_avg_list = defaultdict(float)
-        for score_dict, num in subset_score_map.values():
-            for metric, score in score_dict.items():
-                total_avg_list[metric] += score * num / total_num
-
-        category_d = dict(name='DEFAULT', score=total_avg_list, subset=cate_avg_list)
-
-        res_map = dict(
-            name=report_name or 'general_qa',
-            metric=self.metric_list[0]['name'],
-            score=total_avg_list,
-            category=[category_d],
-            total_num=total_num)
-
-        return res_map
+        return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
```
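For context, the removal of `gen_report` means `GeneralQAAdapter.compute_metric` now returns one entry per metric name, averaged over samples, presumably so the new report code under `evalscope/report/` can handle multiple metrics per subset. A minimal, self-contained sketch of that aggregation pattern, using `statistics.mean` as a stand-in for evalscope's `mean` helper and made-up sample scores:

```python
# Minimal sketch of the aggregation now done in GeneralQAAdapter.compute_metric:
# group per-sample score dicts by metric name, then average each group.
# statistics.mean stands in for evalscope.metrics.mean (an assumption).
from collections import defaultdict
from statistics import mean
from typing import Dict, List


def aggregate_scores(review_res_list: List[Dict[str, float]]) -> List[dict]:
    items = defaultdict(list)
    for scores in review_res_list:
        for name, value in scores.items():
            items[name].append(value)
    return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]


# Two fake samples, each scored with a ROUGE and a BLEU value (illustrative numbers only).
samples = [
    {'rouge-l-f': 0.41, 'bleu-4': 0.18},
    {'rouge-l-f': 0.55, 'bleu-4': 0.22},
]
print(aggregate_scores(samples))
# -> rouge-l-f ≈ 0.48 and bleu-4 ≈ 0.20, each with num=2
```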
evalscope/benchmarks/gsm8k/gsm8k_adapter.py

```diff
@@ -6,7 +6,7 @@ import os
 import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.metrics import
+from evalscope.metrics import AverageAccuracy
 from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
@@ -19,7 +19,7 @@ logger = get_logger()
     dataset_id='modelscope/gsm8k',
     model_adapter=ChatGenerationModelAdapter,
     subset_list=['main'],
-    metric_list=[
+    metric_list=[AverageAccuracy],
     few_shot_num=4,
     train_split='train',
     eval_split='test',
@@ -33,7 +33,7 @@ class GSM8KAdapter(DataAdapter):
 
         Args:
             subset_list (list): Subset list for the dataset. Default: ['main']
-            metric_list (list): Metric list for the dataset. Default: [{'name': '
+            metric_list (list): Metric list for the dataset. Default: [{'name': 'AverageAccuracy', 'object': mean}]
            few_shot_num (int): Number of few-shot examples. Default: 4
             train_split (str): Train split name. Default: 'train'
             eval_split (str): The target eval split name. Default: 'test'
@@ -75,9 +75,8 @@ class GSM8KAdapter(DataAdapter):
         use_fewshot = self.few_shot_num > 0
 
         full_prompt = self._generate_prompt(input_d, few_shot_list=few_shot_list, use_fewshot=use_fewshot)
-        full_prompt = f'{self.prompt_template}\n{full_prompt}' if self.prompt_template else full_prompt
 
-        return {'data': [full_prompt]}
+        return {'data': [full_prompt], 'system_prompt': self.prompt_template}
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Extract the gold answer from the input dict.
```
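Note that `gen_prompt` no longer prepends `prompt_template` to the question; the template now travels separately under the `system_prompt` key. How the model adapters consume that key is not part of this hunk (the relevant changes are in `evalscope/models/server_adapter.py` in the file list above), so the following is only a hypothetical sketch of the intended split between system and user content, with a made-up `build_chat_messages` helper:

```python
# Hypothetical illustration only: how a chat-style adapter *might* consume the
# dict returned by gen_prompt() after this change. The real consumption logic
# is not shown in this diff.
from typing import Dict, List


def build_chat_messages(prompt_d: Dict) -> List[Dict[str, str]]:
    """Turn {'data': [...], 'system_prompt': ...} into OpenAI-style chat messages."""
    messages = []
    system_prompt = prompt_d.get('system_prompt')
    if system_prompt:  # only emit a system turn when a template is configured
        messages.append({'role': 'system', 'content': system_prompt})
    for user_prompt in prompt_d['data']:
        messages.append({'role': 'user', 'content': user_prompt})
    return messages


# Example: the few-shot prompt stays in 'data'; the template is no longer
# string-concatenated onto it.
print(build_chat_messages({'data': ['Question: 2 + 2 = ?'], 'system_prompt': ''}))
```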
evalscope/benchmarks/hellaswag/hellaswag_adapter.py

```diff
@@ -5,10 +5,11 @@ import re
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType
-from evalscope.metrics import
+from evalscope.metrics import AverageAccuracy, exact_match
 from evalscope.models import ContinuationLogitsModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
+from evalscope.utils.utils import ResponseParser
 
 # flake8: noqa
 
@@ -20,11 +21,12 @@ logger = get_logger()
     dataset_id='modelscope/hellaswag',
     model_adapter=ContinuationLogitsModelAdapter,
     subset_list=['default'],
-    metric_list=[
+    metric_list=[AverageAccuracy],
     few_shot_num=0,
     train_split='train',
     eval_split='validation',
-    prompt_template=
+    prompt_template=
+    'Respond with the index of sentence that makes the most sense, chose from 0, 1, 2, 3, derive your final answer as `The answer is ...`.',  # noqa: E501
 )
 class HellaSwagAdapter(DataAdapter):
 
@@ -87,7 +89,11 @@ class HellaSwagAdapter(DataAdapter):
 
         ctx_continuation_pair_list = [(context.strip(), ' ' + cont.strip()) for cont in endings]
 
-        return {
+        return {
+            'data': ctx_continuation_pair_list,
+            'multi_choices': self.choices,
+            'system_prompt': self.prompt_template
+        }
 
     def get_gold_answer(self, input_d: dict) -> str:
         # Get the gold choice
@@ -114,9 +120,9 @@ class HellaSwagAdapter(DataAdapter):
 
             return str(best_choice_idx)
         elif eval_type == EvalType.SERVICE:
-            return result
+            return ResponseParser.parse_first_option(result)
         elif eval_type == EvalType.CUSTOM:
-            return result
+            return ResponseParser.parse_first_option(result)
         else:
             raise ValueError(f'Invalid eval_type: {eval_type}')
 
```
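`ResponseParser.parse_first_option` replaces the pass-through of raw service/custom responses; its real implementation lives in `evalscope/utils/utils.py` and is not shown in this diff. The sketch below is a hypothetical approximation of what such a parser could do for the HellaSwag prompt above (prefer an explicit `The answer is ...` conclusion, otherwise take the first standalone option index):

```python
# Hypothetical stand-in for ResponseParser.parse_first_option; the actual
# evalscope implementation is not part of this diff.
import re


def parse_first_option(text: str, options: str = '0123') -> str:
    """Return the first option index mentioned in a free-form model reply."""
    # Prefer an explicit "The answer is X" style conclusion if present.
    concluded = re.search(r'answer is\s*\(?([0-3])\)?', text, flags=re.IGNORECASE)
    if concluded:
        return concluded.group(1)
    # Otherwise fall back to the first standalone option character.
    first = re.search(rf'\b([{options}])\b', text)
    return first.group(1) if first else text.strip()


print(parse_first_option('Sentence 2 fits best. The answer is 2.'))  # -> '2'
print(parse_first_option('I would go with option 1 here.'))          # -> '1'
```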
evalscope/benchmarks/humaneval/humaneval_adapter.py

```diff
@@ -1,6 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import re
-from typing import List
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.metrics import Pass1
@@ -22,7 +21,7 @@ logger = get_logger()
     few_shot_num=0,
     train_split=None,
     eval_split='test',
-    prompt_template='
+    prompt_template='',
 )
 class HumanevalAdapter(DataAdapter):
     """
@@ -66,9 +65,9 @@ class HumanevalAdapter(DataAdapter):
             {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}
         """
         full_prompt = input_d['prompt']
-        full_prompt = f'
+        full_prompt = f'Complete the following python code:\n{full_prompt}' if self.prompt_template else full_prompt
 
-        return {'data': [full_prompt]}
+        return {'data': [full_prompt], 'system_prompt': self.prompt_template}
 
     @classmethod
     def _postprocess(cls, text: str) -> str:
```
evalscope/benchmarks/ifeval/__init__.py

File without changes.
evalscope/benchmarks/ifeval/ifeval_adapter.py

```diff
@@ -0,0 +1,57 @@
+from collections import defaultdict
+from typing import Any, Dict, List
+
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.benchmarks.ifeval.utils import agg_inst_level_acc, process_results
+from evalscope.constants import EvalType
+from evalscope.metrics import Metric, mean
+from evalscope.models import ChatGenerationModelAdapter
+from evalscope.utils.utils import normalize_score
+
+
+@Benchmark.register(
+    name='ifeval',
+    dataset_id='opencompass/ifeval',
+    model_adapter=ChatGenerationModelAdapter,
+    subset_list=['default'],
+    metric_list=[
+        Metric(name='prompt_level_strict_acc', object=mean),
+        Metric(name='inst_level_strict_acc', object=agg_inst_level_acc),
+        Metric(name='prompt_level_loose_acc', object=mean),
+        Metric(name='inst_level_loose_acc', object=agg_inst_level_acc),
+    ],
+    few_shot_num=0,
+    train_split=None,
+    eval_split='train',
+    prompt_template='',
+)
+class IFEvalAdapter(DataAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
+        return {'data': [input_d['prompt']], 'system_prompt': self.prompt_template}
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        return input_d
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
+        return result
+
+    def match(self, gold: Any, pred: Any) -> Dict:
+        return process_results(gold, [pred])
+
+    def compute_metric(self, review_res_list: List[dict]) -> Any:
+        # aggregate review results
+        res_dict = defaultdict(list)
+        for res in review_res_list:
+            for k, v in res.items():
+                res_dict[k].append(v)
+
+        metrics = []
+        for metric in self.metric_list:
+            metric_name = metric.name
+            pred_value = res_dict[metric_name]
+            metrics.append({'metric_name': metric_name, 'score': metric.object(pred_value), 'num': len(pred_value)})
+        return metrics
```
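`compute_metric` pairs each registered `Metric` with its own aggregation callable: `mean` for the prompt-level accuracies and `agg_inst_level_acc` (from `evalscope/benchmarks/ifeval/utils.py`, not shown in this diff) for the instruction-level ones. The sketch below mirrors that loop with stand-in aggregators; treating `agg_inst_level_acc` as flatten-then-average is an assumption, not the library's code:

```python
# Sketch of the per-metric aggregation loop in IFEvalAdapter.compute_metric,
# with stand-in aggregators and fabricated review results.
from collections import defaultdict, namedtuple
from statistics import mean

Metric = namedtuple('Metric', ['name', 'object'])  # stand-in for evalscope.metrics.Metric


def agg_inst_level_acc(values):
    # Assumption: flatten the per-sample instruction-level booleans and average them.
    flat = [x for sample in values for x in sample]
    return sum(flat) / len(flat) if flat else 0.0


metric_list = [
    Metric('prompt_level_strict_acc', mean),
    Metric('inst_level_strict_acc', agg_inst_level_acc),
]

# Two fake per-sample review results shaped like the dicts produced by match().
review_res_list = [
    {'prompt_level_strict_acc': 1, 'inst_level_strict_acc': [True, True]},
    {'prompt_level_strict_acc': 0, 'inst_level_strict_acc': [True, False, False]},
]

res_dict = defaultdict(list)
for res in review_res_list:
    for k, v in res.items():
        res_dict[k].append(v)

for metric in metric_list:
    values = res_dict[metric.name]
    print({'metric_name': metric.name, 'score': metric.object(values), 'num': len(values)})
# -> prompt_level_strict_acc = 0.5 over 2 prompts, inst_level_strict_acc = 0.6 over 2 samples
```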