evalscope-0.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +3 -0
- evalscope/backend/__init__.py +3 -0
- evalscope/backend/base.py +27 -0
- evalscope/backend/opencompass/__init__.py +3 -0
- evalscope/backend/opencompass/api_meta_template.py +64 -0
- evalscope/backend/opencompass/backend_manager.py +247 -0
- evalscope/backend/opencompass/tasks/__init__.py +1 -0
- evalscope/backend/opencompass/tasks/eval_api.py +30 -0
- evalscope/backend/opencompass/tasks/eval_datasets.py +71 -0
- evalscope/backend/vlm_eval_kit/__init__.py +1 -0
- evalscope/backend/vlm_eval_kit/backend_manager.py +153 -0
- evalscope/benchmarks/__init__.py +4 -0
- evalscope/benchmarks/arc/__init__.py +5 -0
- evalscope/benchmarks/arc/ai2_arc.py +148 -0
- evalscope/benchmarks/arc/arc_adapter.py +231 -0
- evalscope/benchmarks/bbh/__init__.py +6 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +308 -0
- evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +23 -0
- evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +25 -0
- evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +33 -0
- evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +72 -0
- evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +44 -0
- evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +78 -0
- evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +28 -0
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +42 -0
- evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +25 -0
- evalscope/benchmarks/bbh/cot_prompts/navigate.txt +43 -0
- evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +37 -0
- evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +41 -0
- evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +63 -0
- evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +44 -0
- evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/snarks.txt +30 -0
- evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +10 -0
- evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +77 -0
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +40 -0
- evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +28 -0
- evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +17 -0
- evalscope/benchmarks/benchmark.py +65 -0
- evalscope/benchmarks/ceval/__init__.py +5 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +340 -0
- evalscope/benchmarks/ceval/ceval_exam.py +159 -0
- evalscope/benchmarks/cmmlu/__init__.py +5 -0
- evalscope/benchmarks/cmmlu/cmmlu.py +166 -0
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +369 -0
- evalscope/benchmarks/competition_math/__init__.py +5 -0
- evalscope/benchmarks/competition_math/competition_math.py +88 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +470 -0
- evalscope/benchmarks/data_adapter.py +263 -0
- evalscope/benchmarks/general_qa/__init__.py +5 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +186 -0
- evalscope/benchmarks/gsm8k/__init__.py +5 -0
- evalscope/benchmarks/gsm8k/gsm8k.py +127 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +236 -0
- evalscope/benchmarks/hellaswag/__init__.py +5 -0
- evalscope/benchmarks/hellaswag/hellaswag.py +116 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +222 -0
- evalscope/benchmarks/humaneval/__init__.py +5 -0
- evalscope/benchmarks/humaneval/humaneval.py +82 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +21 -0
- evalscope/benchmarks/mmlu/__init__.py +5 -0
- evalscope/benchmarks/mmlu/mmlu.py +174 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +375 -0
- evalscope/benchmarks/race/__init__.py +5 -0
- evalscope/benchmarks/race/race.py +118 -0
- evalscope/benchmarks/race/race_adapter.py +229 -0
- evalscope/benchmarks/trivia_qa/__init__.py +5 -0
- evalscope/benchmarks/trivia_qa/trivia_qa.py +104 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +207 -0
- evalscope/benchmarks/truthful_qa/__init__.py +5 -0
- evalscope/benchmarks/truthful_qa/truthful_qa.py +167 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +351 -0
- evalscope/cache.py +98 -0
- evalscope/cli/__init__.py +1 -0
- evalscope/cli/base.py +20 -0
- evalscope/cli/cli.py +26 -0
- evalscope/cli/start_perf.py +37 -0
- evalscope/cli/start_server.py +138 -0
- evalscope/config.py +165 -0
- evalscope/constants.py +150 -0
- evalscope/evaluator/__init__.py +3 -0
- evalscope/evaluator/evaluator.py +689 -0
- evalscope/evaluator/rating_eval.py +178 -0
- evalscope/evaluator/reviewer/__init__.py +1 -0
- evalscope/evaluator/reviewer/auto_reviewer.py +411 -0
- evalscope/metrics/__init__.py +1 -0
- evalscope/metrics/bundled_rouge_score/__init__.py +14 -0
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +342 -0
- evalscope/metrics/code_metric.py +104 -0
- evalscope/metrics/math_accuracy.py +60 -0
- evalscope/metrics/metrics.py +405 -0
- evalscope/metrics/rouge_metric.py +129 -0
- evalscope/models/__init__.py +4 -0
- evalscope/models/custom/__init__.py +4 -0
- evalscope/models/custom/custom_model.py +53 -0
- evalscope/models/dummy_chat_model.py +50 -0
- evalscope/models/model.py +88 -0
- evalscope/models/model_adapter.py +586 -0
- evalscope/models/openai_model.py +103 -0
- evalscope/models/template.py +1446 -0
- evalscope/perf/__init__.py +0 -0
- evalscope/perf/_logging.py +32 -0
- evalscope/perf/api_plugin_base.py +60 -0
- evalscope/perf/custom_api.py +87 -0
- evalscope/perf/dashscope_api.py +84 -0
- evalscope/perf/dataset_plugin_base.py +64 -0
- evalscope/perf/datasets/__init__.py +0 -0
- evalscope/perf/datasets/line_by_line.py +18 -0
- evalscope/perf/datasets/longalpaca_12k.py +20 -0
- evalscope/perf/datasets/openqa.py +22 -0
- evalscope/perf/how_to_analysis_result.py +24 -0
- evalscope/perf/http_client.py +756 -0
- evalscope/perf/openai_api.py +130 -0
- evalscope/perf/plugin_registry.py +35 -0
- evalscope/perf/query_parameters.py +42 -0
- evalscope/perf/server_sent_event.py +43 -0
- evalscope/preprocess/__init__.py +1 -0
- evalscope/preprocess/tokenizers/__init__.py +0 -0
- evalscope/preprocess/tokenizers/gpt2_tokenizer.py +221 -0
- evalscope/registry/__init__.py +1 -0
- evalscope/registry/tasks/arc.yaml +29 -0
- evalscope/registry/tasks/bbh.yaml +27 -0
- evalscope/registry/tasks/bbh_mini.yaml +27 -0
- evalscope/registry/tasks/ceval.yaml +27 -0
- evalscope/registry/tasks/ceval_mini.yaml +27 -0
- evalscope/registry/tasks/cmmlu.yaml +27 -0
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +28 -0
- evalscope/registry/tasks/general_qa.yaml +27 -0
- evalscope/registry/tasks/gsm8k.yaml +29 -0
- evalscope/registry/tasks/mmlu.yaml +29 -0
- evalscope/registry/tasks/mmlu_mini.yaml +27 -0
- evalscope/run.py +404 -0
- evalscope/run_arena.py +204 -0
- evalscope/run_ms.py +140 -0
- evalscope/summarizer.py +144 -0
- evalscope/third_party/__init__.py +1 -0
- evalscope/third_party/toolbench_static/__init__.py +3 -0
- evalscope/third_party/toolbench_static/eval.py +219 -0
- evalscope/third_party/toolbench_static/infer.py +278 -0
- evalscope/third_party/toolbench_static/llm/__init__.py +1 -0
- evalscope/third_party/toolbench_static/llm/swift_infer.py +45 -0
- evalscope/third_party/toolbench_static/toolbench_static.py +50 -0
- evalscope/tools/__init__.py +1 -0
- evalscope/tools/combine_reports.py +140 -0
- evalscope/tools/gen_mmlu_subject_mapping.py +90 -0
- evalscope/tools/rewrite_eval_results.py +95 -0
- evalscope/utils/__init__.py +4 -0
- evalscope/utils/arena_utils.py +247 -0
- evalscope/utils/completion_parsers.py +87 -0
- evalscope/utils/logger.py +64 -0
- evalscope/utils/task_cfg_parser.py +10 -0
- evalscope/utils/task_utils.py +19 -0
- evalscope/utils/utils.py +625 -0
- evalscope/version.py +4 -0
- evalscope-0.5.0.dist-info/METADATA +566 -0
- evalscope-0.5.0.dist-info/RECORD +165 -0
- evalscope-0.5.0.dist-info/WHEEL +5 -0
- evalscope-0.5.0.dist-info/entry_points.txt +3 -0
- evalscope-0.5.0.dist-info/top_level.txt +1 -0
evalscope/benchmarks/data_adapter.py
@@ -0,0 +1,263 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import os.path
+from abc import ABC, abstractmethod
+from typing import Any, Optional
+import random
+
+from evalscope.benchmarks import Benchmark
+from evalscope.constants import DEFAULT_ROOT_CACHE_DIR, AnswerKeys
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class DataAdapter(ABC):
+
+    def __init__(self,
+                 subset_list: list,
+                 metric_list: list,
+                 few_shot_num: Optional[int] = 0,
+                 train_split: Optional[str] = None,
+                 eval_split: Optional[str] = None,
+                 prompt_template: str = '',
+                 **kwargs):
+        """
+        Args:
+            subset_list: list of subset names for the dataset.
+            metric_list: list, the metric list to evaluate the model on specific benchmark.
+            few_shot_num: int, number of few-shot examples. Default: 0
+            train_split: str, usually for few-shot examples. e.g. 'train'
+            eval_split: str, the target eval split name. e.g. 'test'
+            prompt_template: str, the prompt template for the benchmark,
+                e.g. for ARC, it is `The following are multiple choice questions, please output correct answer in the form of A or B or C or D, do not output explanation:`
+        """
+        self.subset_list = subset_list
+        self.metric_list = metric_list
+        self.few_shot_num = few_shot_num
+        self.train_split = train_split
+        self.eval_split = eval_split
+        self.prompt_template = prompt_template
+        self.config_kwargs = kwargs
+
+    def load(self,
+             dataset_name_or_path: str,
+             subset_list: list = None,
+             work_dir: Optional[str] = DEFAULT_ROOT_CACHE_DIR,
+             datasets_hub: str = 'ModelScope',
+             **kwargs) -> dict:
+        """
+        Load the dataset. Remote and local datasets are supported.
+        You can rewrite this method to support your own local dataset, just follow the format of the output.
+
+        Returns: {'subset_name': {'train': train_dataset, 'test': test_dataset}}
+            train_dataset, test_dataset: Iterable dataset, each item of which is a dict.
+
+        """
+        dataset_name_or_path = os.path.expanduser(dataset_name_or_path)
+        if datasets_hub == 'Local':
+            # Try to load dataset from local disk
+            if not os.path.exists(dataset_name_or_path):
+                raise FileNotFoundError(f'Dataset path not found: {dataset_name_or_path}')
+
+            logger.info(f'Loading dataset from local disk: >dataset_name: {dataset_name_or_path} >work_dir: {work_dir}')
+            data_dict = self.load_from_disk(dataset_name_or_path, subset_list, work_dir, **kwargs)
+            if len(data_dict) == 0 or len(next(iter(data_dict.values()))) == 0:
+                raise ValueError(f'Local dataset is empty: {dataset_name_or_path}')
+        else:
+            # Load dataset from remote
+            logger.info(f'Loading dataset from {datasets_hub} hub: >dataset_name: {dataset_name_or_path}')
+            data_dict = {}
+            split_list = [split for split in [self.train_split, self.eval_split] if split is not None]
+            if len(split_list) == 0:
+                logger.error(f'Got empty split list: {split_list}')
+
+            subset_list = subset_list if subset_list is not None else self.subset_list
+            for sub_name in subset_list:
+                data_dict[sub_name] = {}
+                # e.g. train: few-shot, test: target dataset to evaluate
+                for split in split_list:
+                    dataset = Benchmark.load(dataset_name=dataset_name_or_path,
+                                             subset=sub_name,
+                                             split=split,
+                                             hub=datasets_hub,
+                                             work_dir=work_dir,
+                                             **kwargs)
+
+                    data_dict[sub_name].update({split: dataset})
+
+        return data_dict
+
+    def load_from_disk(self, *args, **kwargs) -> dict:
+        """
+        Load the dataset from local disk.
+        If you want to support a local dataset, please rewrite this method in xxx_data_adapter.
+        """
+        return {}
+
+    def gen_prompts(self, data_dict: dict) -> dict:
+        """
+        Generate dataset prompts from raw input, unify the prompt format for different datasets.
+
+        Args:
+            data_dict: Refer to the output of the load method: evalscope.benchmarks.benchmark.Benchmark.load
+
+        Returns:
+            {'subset_name': [prompt_d_1, prompt_d_2, ...]}
+            prompt_d_i (dict): refer to the output of the gen_prompt method.
+
+        e.g. train -- few-shot data, test -- target dataset to evaluate.
+        """
+        res_dict: dict = {}
+
+        if self.few_shot_num and self.few_shot_num < 0:
+            raise ValueError(f'Invalid shot_num: {self.few_shot_num} for few-shot evaluation.')
+
+        logger.info(f'\n** Use default settings: \n'
+                    f'>few_shot_num: {self.few_shot_num}, '
+                    f'>few_shot_split: {self.train_split}, '
+                    f'>target_eval_split: {self.eval_split}')
+
+        for sub_name, sub_data_dict in data_dict.items():
+            few_shot_data = []
+            if self.few_shot_num and self.few_shot_num > 0:
+                few_shot_random: bool = self.config_kwargs.get('few_shot_random', True)
+                few_shot_data = self.get_fewshot_examples(
+                    [item for item in sub_data_dict[self.train_split]],
+                    self.few_shot_num,
+                    few_shot_random=few_shot_random)
+
+            res_dict[sub_name] = []
+            for sample_d in sub_data_dict[self.eval_split]:
+                prompt_d = self.gen_prompt(input_d=sample_d, subset_name=sub_name, few_shot_list=few_shot_data)
+                prompt_d[AnswerKeys.RAW_INPUT] = sample_d
+                res_dict[sub_name].append(prompt_d)
+
+        rnd = random.Random()
+        rnd.seed(42)
+        for k, v in res_dict.items():
+            rnd.shuffle(v)
+
+        return res_dict
+
+    @abstractmethod
+    def gen_prompt(self, *args, **kwargs) -> Any:
+        """
+        Generate model prompt from raw input, unify the prompt format for different datasets.
+        The input format is compatible with OpenAI Chat Completions APIs.
+        Refer to: https://platform.openai.com/docs/guides/gpt/chat-completions-api
+
+        Args:
+            input_d (Any): The raw input. Depending on the dataset.
+
+        Returns:
+            For class MultiChoiceModelAdapter, the output format is:
+                {'data': [full_prompt]}  -- full_prompt: str, the constructed prompt for each sample from the dataset.
+            For class ContinuationEvalModelAdapter, the output format is:
+                {'data': ctx_continuation_pair_list, 'multi_choices': self.choices}
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_gold_answer(self, input_d: Any) -> Any:
+        """
+        Parse the raw input labels (gold).
+
+        Args:
+            input_d: input raw data. Depending on the dataset.
+
+        Returns:
+            The parsed input. e.g. gold answer ... Depending on the dataset.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> Any:
+        """
+        Parse the predicted result and extract the proper answer.
+
+        Args:
+            result: Predicted answer from the model. Usually a string for chat.
+            raw_input_d: The raw input. Depending on the dataset.
+            eval_type: 'checkpoint', 'service' or 'custom'. Default: 'checkpoint'
+
+        Returns:
+            The parsed answer. Depending on the dataset. Usually a string for chat.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def match(self, gold: Any, pred: Any) -> Any:
+        """
+        Match the gold answer and the predicted answer.
+
+        Args:
+            gold (Any): The golden answer. Usually a string for chat/multiple-choice questions. e.g. 'A'
+            pred (Any): The predicted answer. Usually a string for chat/multiple-choice questions. e.g. 'B'
+
+        Returns:
+            The match result. Usually a score (float) for chat/multiple-choice questions.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def compute_metric(self, review_res_list: list) -> Any:
+        """
+        Compute the evaluation result by specific metrics.
+
+        Args:
+            review_res_list: list, the review result list, each item of which is the match result for gold and pred.
+
+        Attributes:
+            DataAdapter.metric_func_map: metric_name -> metric_func mapping,
+                e.g. {'WeightedAverageAccuracy': weighted_average_acc}
+
+        Returns:
+            Metric results.
+        """
+        raise NotImplementedError
+
+    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
+        """
+        Generate the report for the evaluation results of all subsets.
+
+        Args:
+            subset_score_map: The subset-score map. e.g. {subset_name: (score, num)}
+            report_name: str, the user-defined report name. Default: None
+
+        Returns: The evaluation report. Note: should normalize the score by the normalize_score method in utils.
+
+        Here is a format example for ARC-Challenge:
+            {
+                "name": "ARC-Challenge",
+                "metric": "WeightedAverageAccuracy",
+                "score": 0.3389,
+                "category": [
+                    {
+                        "name": "DEFAULT",
+                        "score": 0.3389,
+                        "subset": [
+                            {
+                                "name": "ARC-Challenge",
+                                "score": 0.3389
+                            },
+                        ]
+                    }
+                ],
+                "total_num": 100
+            }
+        """
+        raise NotImplementedError
+
+    def get_fewshot_examples(self, data_list: list, k: int, few_shot_random: bool = True):
+
+        if k > len(data_list):
+            k = len(data_list)
+        if few_shot_random:
+            return random.sample(data_list, k)
+        else:
+            return data_list[:k]
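
The abstract methods above (gen_prompt, get_gold_answer, parse_pred_result, match, compute_metric) plus gen_report form the contract every benchmark adapter in this wheel fills in. Below is a minimal sketch of what a custom subclass could look like; the class name MyExactMatchAdapter, the 'question'/'answer' field names, and the exact-match scoring are invented for illustration, while the WeightedAverageAccuracy/weighted_mean wiring mirrors the shipped adapters.

# Hypothetical sketch of a DataAdapter subclass (names and fields are illustrative only).
from evalscope.benchmarks.data_adapter import DataAdapter
from evalscope.metrics.metrics import weighted_mean


class MyExactMatchAdapter(DataAdapter):

    def __init__(self, **kwargs):
        super().__init__(subset_list=['default'],
                         metric_list=[{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}],
                         few_shot_num=0,
                         train_split=None,
                         eval_split='test',
                         **kwargs)

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        # Follow the {'data': [full_prompt]} format described in the gen_prompt docstring above.
        return {'data': [self.prompt_template + input_d['question']]}

    def get_gold_answer(self, input_d: dict) -> str:
        return input_d['answer']

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
        return result.strip()

    def match(self, gold: str, pred: str) -> float:
        # Per-sample exact-match score.
        return float(gold.strip() == pred.strip())

    def compute_metric(self, review_res_list: list) -> float:
        # Weighted mean over per-sample scores, each with weight 1.0.
        return weighted_mean([(score, 1.0) for score in review_res_list])

A complete adapter would also override gen_report (and load_from_disk for local data), as the GeneralQAAdapter in the next hunk does.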

evalscope/benchmarks/general_qa/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from evalscope.benchmarks.general_qa.general_qa_adapter import DATASET_ID, SUBSET_LIST, GeneralQAAdapter
+from evalscope.benchmarks.general_qa.general_qa_adapter import GeneralQAAdapter as DataAdapterClass
+from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass

evalscope/benchmarks/general_qa/general_qa_adapter.py
@@ -0,0 +1,186 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import glob
+import os.path
+
+from evalscope.benchmarks.data_adapter import DataAdapter
+from evalscope.metrics.metrics import bleu_ngram_one_sample, weighted_mean
+from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
+from evalscope.utils import jsonl_to_list
+from evalscope.utils.logger import get_logger
+from typing import Any, Optional
+from collections import defaultdict
+import json
+
+logger = get_logger()
+
+DATASET_ID = 'general_qa'
+SUBSET_LIST = ['default']
+
+
+class GeneralQAAdapter(DataAdapter):
+    # TODO: set few_shot_num
+
+    def __init__(self,
+                 subset_list: list = None,
+                 metric_list: list = None,
+                 train_split: str = None,
+                 eval_split: str = 'test',
+                 **kwargs):
+        if subset_list is None:
+            subset_list = SUBSET_LIST
+
+        if metric_list is None:
+            metric_list = [{'name': 'WeightedAverageBLEU', 'object': weighted_mean}]
+
+        super().__init__(subset_list=subset_list,
+                         metric_list=metric_list,
+                         train_split=train_split,
+                         eval_split=eval_split,
+                         **kwargs)
+
+    def load(self,
+             dataset_name_or_path: str,
+             subset_list: list = None,
+             **kwargs) -> dict:
+
+        data_file_list = glob.glob(os.path.join(dataset_name_or_path, '*.jsonl'))
+        data_list = []
+
+        try:
+            for file_path in data_file_list:
+                data_list.extend(jsonl_to_list(file_path))
+        except Exception as e:
+            raise ValueError(f"Failed to load data from {dataset_name_or_path}, got error: {e}")
+
+        data_dict = {'default': {'test': data_list}}
+
+        return data_dict
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        """
+        Args:
+            input_d:
+                format1: {'history': [['q1', 'a1'], ['q2', 'a2']], 'question': '', 'answer': ''}
+                format2: {'history': [['q1', 'a1'], ['q2', 'a2']], 'query': '', 'response': ''}
+
+        Returns:
+            {'data': [prompt]}
+
+        """
+        # prompt = f"'<|im_start|>user\n{input_d['input']}<|im_end|>\n<|im_start|>assistant\n'"
+        history = input_d.get('history', [])  # history: [['q1', 'a1'], ['q2', 'a2'], ...]
+        if len(history) > 0:
+            logger.warning(f"The history is not included in the prompt for GeneralQA. To be supported in the future.")
+
+        prompt = input_d.get('question', '') or input_d.get('query', '')
+
+        # if len(history) > 0:
+        #     prompt = '\n'.join(history) + '\n' + prompt
+        return {'data': [prompt]}
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        """
+        Args:
+            input_d: {'history': [], 'question': '', 'answer': ''}
+
+        Returns:
+            gold_answer: str
+
+        """
+        return input_d.get('answer', '') or input_d.get('response', '')
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+        """
+        Args:
+            result: str
+
+        Returns:
+            pred_result: str
+
+        """
+        return result
+
+    def match(self, gold: str, pred: str) -> float:
+        """
+        Args:
+            gold: str
+            pred: str
+
+        Returns:
+            bleu_score: float
+
+        """
+        item = [(gold, pred)]
+        res = dict()
+        rouge_dict = compute_rouge_score_one_sample_zh([pred], [gold])
+        bleu_dict = bleu_ngram_one_sample(pred, gold)
+        res.update(rouge_dict)
+        res.update(bleu_dict)
+        # return bleu(item)
+        return res
+
+    def compute_metric(self, review_res_list: list) -> float:
+        """
+        Compute the weighted mean of the BLEU score of all samples.
+
+        Args:
+            review_res_list: [score1, score2, ...]
+
+        Returns:
+            avg_res: float
+
+        """
+        items = defaultdict(list)
+        for scores in review_res_list:
+            for k, v in scores.items():
+                items[k].append((v, 1.0))
+        # items = [(score, 1.0) for score in review_res_list]
+        res = {k: weighted_mean(v) for k, v in items.items()}
+        # return weighted_mean(items)
+        return res
+
+    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
+        """
+        Args:
+            subset_score_map: {subset_name: (score_dict, num), ...}
+            report_name: str, the user-defined report name.
+
+        Returns:
+            {
+                "name": "GeneralQA",
+                "metric": "WeightedAverageBLEU",
+                "score": 0.399,
+                "category": [
+                    {
+                        "name": "DEFAULT",
+                        "score": 0.399,
+                        "subset": [
+                            {
+                                "name": "default",
+                                "score": 0.399
+                            },
+                        ]
+                    }
+                ],
+                "total_num": 10
+            }
+        """
+        total_num: int = sum([num for _, num in subset_score_map.values()])
+        # weighted_avg_bleu: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
+        cate_avg_list = [{'name': subset_name, 'score': score_dict} for subset_name, (score_dict, _) in subset_score_map.items()]
+        total_avg_list = defaultdict(float)
+        for score_dict, num in subset_score_map.values():
+            for metric, score in score_dict.items():
+                total_avg_list[metric] += score * num / total_num
+
+        category_d = dict(name="DEFAULT",
+                          score=total_avg_list,
+                          subset=cate_avg_list)
+
+        res_map = dict(name=report_name or "general_qa",
+                       metric=self.metric_list[0]['name'],
+                       score=total_avg_list,
+                       category=[category_d],
+                       total_num=total_num)
+
+        return res_map
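
GeneralQAAdapter.load simply globs *.jsonl files under the dataset path and puts every record into the 'default'/'test' split, and gen_prompt accepts either the question/answer or the query/response record shape. A hedged usage sketch follows; the directory name, file name, and sample content are invented for illustration.

# Hypothetical end-to-end use of GeneralQAAdapter on a tiny local JSONL dataset.
import json
import os

from evalscope.benchmarks.general_qa import GeneralQAAdapter

os.makedirs('custom_qa', exist_ok=True)
samples = [
    # format1 uses 'question'/'answer'; format2 would use 'query'/'response' instead.
    {'history': [], 'question': 'What is the capital of France?', 'answer': 'Paris'},
]
with open('custom_qa/example.jsonl', 'w', encoding='utf-8') as f:
    for sample in samples:
        f.write(json.dumps(sample, ensure_ascii=False) + '\n')

adapter = GeneralQAAdapter()
data_dict = adapter.load('custom_qa')             # {'default': {'test': [...]}}
prompts = adapter.gen_prompts(data_dict)          # {'default': [{'data': [prompt], ...}]}
gold = adapter.get_gold_answer(samples[0])
scores = adapter.match(gold=gold, pred='Paris')   # per-sample dict of ROUGE and BLEU scores
print(adapter.compute_metric([scores]))           # weighted mean per metric name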

evalscope/benchmarks/gsm8k/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from evalscope.benchmarks.gsm8k.gsm8k_adapter import DATASET_ID, SUBSET_LIST
+from evalscope.benchmarks.gsm8k.gsm8k_adapter import GSM8KAdapter as DataAdapterClass
+from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa

evalscope/benchmarks/gsm8k/gsm8k.py
@@ -0,0 +1,127 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# flake8: noqa
+
+"""Grade School Math 8k dataset."""
+
+import json
+import textwrap
+
+import datasets
+
+
+_CITATION = """\
+@misc{cobbe2021training,
+      title={Training Verifiers to Solve Math Word Problems},
+      author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman},
+      year={2021},
+      eprint={2110.14168},
+      archivePrefix={arXiv},
+      primaryClass={cs.LG}
+}
+"""
+
+_DESCRIPTION = """\
+GSM8K (Grade School Math 8K) is a dataset of 8.5K high quality
+linguistically diverse grade school math word problems. The
+dataset was created to support the task of question answering
+on basic mathematical problems that require multi-step reasoning.
+"""
+
+_HOMEPAGE = 'https://openai.com/blog/grade-school-math'
+_MODELSCOPE_PAGE = 'https://modelscope.cn/datasets/modelscope/gsm8k/summary'
+
+_LICENSE = 'MIT'
+
+# _BASE_URL = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/"
+TRAIN_URL = 'https://sail-moe.oss-cn-hangzhou.aliyuncs.com/open_data/gsm8k/train.jsonl'
+TEST_URL = 'https://sail-moe.oss-cn-hangzhou.aliyuncs.com/open_data/gsm8k/test.jsonl'
+
+
+class Gsm8kConfig(datasets.BuilderConfig):
+    """BuilderConfig for GSM8K."""
+
+    def __init__(self, urls, **kwargs):
+        """BuilderConfig for GSM8K.
+        Args:
+            urls: *dict[string]*, the urls for each split of the GSM8k set.
+        """
+        super().__init__(version=datasets.Version('1.1.0'), **kwargs)
+        self.urls = urls
+
+
+class Gsm8k(datasets.GeneratorBasedBuilder):
+    """Grade School Math 8k (GSM8K)"""
+
+    BUILDER_CONFIGS = [
+        Gsm8kConfig(
+            name='main',
+            description=textwrap.dedent(
+                """
+                It is segmented into 7.5K training problems and 1K test problems.
+                These problems take between 2 and 8 steps to solve, and solutions
+                primarily involve performing a sequence of elementary calculations
+                using basic arithmetic operations (+ - / *) to reach the final
+                answer. A bright middle school student should be able to solve
+                every problem.
+                """,
+            ),
+            urls={
+                'train': TRAIN_URL,
+                'test': TEST_URL,
+            },
+        ),
+    ]
+
+    def _info(self):
+        features = datasets.Features(
+            {
+                'question': datasets.Value('string'),
+                'answer': datasets.Value('string'),
+            }
+        )
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        data_dir = dl_manager.download_and_extract(self.config.urls)
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    'filepath': data_dir['train'],
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    'filepath': data_dir['test'],
+                },
+            ),
+        ]
+
+    def _generate_examples(self, filepath):
+        with open(filepath, encoding='utf-8') as f:
+            for key, row in enumerate(f):
+                data = json.loads(row)
+                yield key, {
+                    'question': data['question'],
+                    'answer': data['answer'],
+                }
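
gsm8k.py is a standard Hugging Face `datasets` loading script (a GeneratorBasedBuilder with a single 'main' config) that downloads the train/test JSONL files from the mirrored OSS URLs. Inside evalscope it is driven through Benchmark.load and the GSM8K adapter, but as a sanity check it can also be exercised directly. A hedged sketch, assuming the script path inside the installed wheel and the approximate split sizes stated in the config description above:

# Hypothetical direct use of the loading script via the Hugging Face `datasets` library.
from datasets import load_dataset

# Point load_dataset at the script shipped in the wheel (path shown for illustration).
gsm8k = load_dataset('evalscope/benchmarks/gsm8k/gsm8k.py', name='main')

# Roughly 7.5K training problems and 1K test problems, per the description above.
print(gsm8k['train'].num_rows, gsm8k['test'].num_rows)

# Each record has 'question' and 'answer' string fields, as declared in _info().
sample = gsm8k['test'][0]
print(sample['question'])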