evalscope 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. evalscope/__init__.py +3 -0
  2. evalscope/backend/__init__.py +3 -0
  3. evalscope/backend/base.py +27 -0
  4. evalscope/backend/opencompass/__init__.py +3 -0
  5. evalscope/backend/opencompass/api_meta_template.py +64 -0
  6. evalscope/backend/opencompass/backend_manager.py +247 -0
  7. evalscope/backend/opencompass/tasks/__init__.py +1 -0
  8. evalscope/backend/opencompass/tasks/eval_api.py +30 -0
  9. evalscope/backend/opencompass/tasks/eval_datasets.py +71 -0
  10. evalscope/backend/vlm_eval_kit/__init__.py +1 -0
  11. evalscope/backend/vlm_eval_kit/backend_manager.py +153 -0
  12. evalscope/benchmarks/__init__.py +4 -0
  13. evalscope/benchmarks/arc/__init__.py +5 -0
  14. evalscope/benchmarks/arc/ai2_arc.py +148 -0
  15. evalscope/benchmarks/arc/arc_adapter.py +231 -0
  16. evalscope/benchmarks/bbh/__init__.py +6 -0
  17. evalscope/benchmarks/bbh/bbh_adapter.py +308 -0
  18. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +23 -0
  19. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +25 -0
  20. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +33 -0
  21. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +37 -0
  22. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +72 -0
  23. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +44 -0
  24. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +78 -0
  25. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +28 -0
  26. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +37 -0
  27. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +37 -0
  28. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +37 -0
  29. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +42 -0
  30. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +25 -0
  31. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +43 -0
  32. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +37 -0
  33. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +41 -0
  34. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +63 -0
  35. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +44 -0
  36. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +40 -0
  37. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +30 -0
  38. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +10 -0
  39. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +77 -0
  40. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +40 -0
  41. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +40 -0
  42. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +40 -0
  43. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +28 -0
  44. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +17 -0
  45. evalscope/benchmarks/benchmark.py +65 -0
  46. evalscope/benchmarks/ceval/__init__.py +5 -0
  47. evalscope/benchmarks/ceval/ceval_adapter.py +340 -0
  48. evalscope/benchmarks/ceval/ceval_exam.py +159 -0
  49. evalscope/benchmarks/cmmlu/__init__.py +5 -0
  50. evalscope/benchmarks/cmmlu/cmmlu.py +166 -0
  51. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +369 -0
  52. evalscope/benchmarks/competition_math/__init__.py +5 -0
  53. evalscope/benchmarks/competition_math/competition_math.py +88 -0
  54. evalscope/benchmarks/competition_math/competition_math_adapter.py +470 -0
  55. evalscope/benchmarks/data_adapter.py +263 -0
  56. evalscope/benchmarks/general_qa/__init__.py +5 -0
  57. evalscope/benchmarks/general_qa/general_qa_adapter.py +186 -0
  58. evalscope/benchmarks/gsm8k/__init__.py +5 -0
  59. evalscope/benchmarks/gsm8k/gsm8k.py +127 -0
  60. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +236 -0
  61. evalscope/benchmarks/hellaswag/__init__.py +5 -0
  62. evalscope/benchmarks/hellaswag/hellaswag.py +116 -0
  63. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +222 -0
  64. evalscope/benchmarks/humaneval/__init__.py +5 -0
  65. evalscope/benchmarks/humaneval/humaneval.py +82 -0
  66. evalscope/benchmarks/humaneval/humaneval_adapter.py +21 -0
  67. evalscope/benchmarks/mmlu/__init__.py +5 -0
  68. evalscope/benchmarks/mmlu/mmlu.py +174 -0
  69. evalscope/benchmarks/mmlu/mmlu_adapter.py +375 -0
  70. evalscope/benchmarks/race/__init__.py +5 -0
  71. evalscope/benchmarks/race/race.py +118 -0
  72. evalscope/benchmarks/race/race_adapter.py +229 -0
  73. evalscope/benchmarks/trivia_qa/__init__.py +5 -0
  74. evalscope/benchmarks/trivia_qa/trivia_qa.py +104 -0
  75. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +207 -0
  76. evalscope/benchmarks/truthful_qa/__init__.py +5 -0
  77. evalscope/benchmarks/truthful_qa/truthful_qa.py +167 -0
  78. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +351 -0
  79. evalscope/cache.py +98 -0
  80. evalscope/cli/__init__.py +1 -0
  81. evalscope/cli/base.py +20 -0
  82. evalscope/cli/cli.py +26 -0
  83. evalscope/cli/start_perf.py +37 -0
  84. evalscope/cli/start_server.py +138 -0
  85. evalscope/config.py +165 -0
  86. evalscope/constants.py +150 -0
  87. evalscope/evaluator/__init__.py +3 -0
  88. evalscope/evaluator/evaluator.py +689 -0
  89. evalscope/evaluator/rating_eval.py +178 -0
  90. evalscope/evaluator/reviewer/__init__.py +1 -0
  91. evalscope/evaluator/reviewer/auto_reviewer.py +411 -0
  92. evalscope/metrics/__init__.py +1 -0
  93. evalscope/metrics/bundled_rouge_score/__init__.py +14 -0
  94. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +342 -0
  95. evalscope/metrics/code_metric.py +104 -0
  96. evalscope/metrics/math_accuracy.py +60 -0
  97. evalscope/metrics/metrics.py +405 -0
  98. evalscope/metrics/rouge_metric.py +129 -0
  99. evalscope/models/__init__.py +4 -0
  100. evalscope/models/custom/__init__.py +4 -0
  101. evalscope/models/custom/custom_model.py +53 -0
  102. evalscope/models/dummy_chat_model.py +50 -0
  103. evalscope/models/model.py +88 -0
  104. evalscope/models/model_adapter.py +586 -0
  105. evalscope/models/openai_model.py +103 -0
  106. evalscope/models/template.py +1446 -0
  107. evalscope/perf/__init__.py +0 -0
  108. evalscope/perf/_logging.py +32 -0
  109. evalscope/perf/api_plugin_base.py +60 -0
  110. evalscope/perf/custom_api.py +87 -0
  111. evalscope/perf/dashscope_api.py +84 -0
  112. evalscope/perf/dataset_plugin_base.py +64 -0
  113. evalscope/perf/datasets/__init__.py +0 -0
  114. evalscope/perf/datasets/line_by_line.py +18 -0
  115. evalscope/perf/datasets/longalpaca_12k.py +20 -0
  116. evalscope/perf/datasets/openqa.py +22 -0
  117. evalscope/perf/how_to_analysis_result.py +24 -0
  118. evalscope/perf/http_client.py +756 -0
  119. evalscope/perf/openai_api.py +130 -0
  120. evalscope/perf/plugin_registry.py +35 -0
  121. evalscope/perf/query_parameters.py +42 -0
  122. evalscope/perf/server_sent_event.py +43 -0
  123. evalscope/preprocess/__init__.py +1 -0
  124. evalscope/preprocess/tokenizers/__init__.py +0 -0
  125. evalscope/preprocess/tokenizers/gpt2_tokenizer.py +221 -0
  126. evalscope/registry/__init__.py +1 -0
  127. evalscope/registry/tasks/arc.yaml +29 -0
  128. evalscope/registry/tasks/bbh.yaml +27 -0
  129. evalscope/registry/tasks/bbh_mini.yaml +27 -0
  130. evalscope/registry/tasks/ceval.yaml +27 -0
  131. evalscope/registry/tasks/ceval_mini.yaml +27 -0
  132. evalscope/registry/tasks/cmmlu.yaml +27 -0
  133. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +28 -0
  134. evalscope/registry/tasks/general_qa.yaml +27 -0
  135. evalscope/registry/tasks/gsm8k.yaml +29 -0
  136. evalscope/registry/tasks/mmlu.yaml +29 -0
  137. evalscope/registry/tasks/mmlu_mini.yaml +27 -0
  138. evalscope/run.py +404 -0
  139. evalscope/run_arena.py +204 -0
  140. evalscope/run_ms.py +140 -0
  141. evalscope/summarizer.py +144 -0
  142. evalscope/third_party/__init__.py +1 -0
  143. evalscope/third_party/toolbench_static/__init__.py +3 -0
  144. evalscope/third_party/toolbench_static/eval.py +219 -0
  145. evalscope/third_party/toolbench_static/infer.py +278 -0
  146. evalscope/third_party/toolbench_static/llm/__init__.py +1 -0
  147. evalscope/third_party/toolbench_static/llm/swift_infer.py +45 -0
  148. evalscope/third_party/toolbench_static/toolbench_static.py +50 -0
  149. evalscope/tools/__init__.py +1 -0
  150. evalscope/tools/combine_reports.py +140 -0
  151. evalscope/tools/gen_mmlu_subject_mapping.py +90 -0
  152. evalscope/tools/rewrite_eval_results.py +95 -0
  153. evalscope/utils/__init__.py +4 -0
  154. evalscope/utils/arena_utils.py +247 -0
  155. evalscope/utils/completion_parsers.py +87 -0
  156. evalscope/utils/logger.py +64 -0
  157. evalscope/utils/task_cfg_parser.py +10 -0
  158. evalscope/utils/task_utils.py +19 -0
  159. evalscope/utils/utils.py +625 -0
  160. evalscope/version.py +4 -0
  161. evalscope-0.5.0.dist-info/METADATA +566 -0
  162. evalscope-0.5.0.dist-info/RECORD +165 -0
  163. evalscope-0.5.0.dist-info/WHEEL +5 -0
  164. evalscope-0.5.0.dist-info/entry_points.txt +3 -0
  165. evalscope-0.5.0.dist-info/top_level.txt +1 -0
--- /dev/null
+++ evalscope/benchmarks/race/race.py
@@ -0,0 +1,118 @@
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+import datasets
+import pandas as pd
+
+
+_CITATION = """\
+@inproceedings{lai-etal-2017-race,
+    title = "{RACE}: Large-scale {R}e{A}ding Comprehension Dataset From Examinations",
+    author = "Lai, Guokun and
+      Xie, Qizhe and
+      Liu, Hanxiao and
+      Yang, Yiming and
+      Hovy, Eduard",
+    booktitle = "Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing",
+    month = sep,
+    year = "2017",
+    address = "Copenhagen, Denmark",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/D17-1082",
+    doi = "10.18653/v1/D17-1082",
+    pages = "785--794",
+}
+"""
+
+_DESCRIPTION = """\
+RACE is a large-scale reading comprehension dataset with more than 28,000 passages and nearly 100,000 questions.
+"""
+
+_HOMEPAGE = "https://modelscope.cn/datasets/modelscope/race/summary"
+
+_URL = "https://modelscope.cn/api/v1/datasets/modelscope/race/repo?Revision=master&FilePath=race.zip"
+
+task_list = [
+    "high",
+    "middle",
+]
+
+
+class RACEConfig(datasets.BuilderConfig):
+    def __init__(self, **kwargs):
+        super().__init__(version=datasets.Version("1.0.0"), **kwargs)
+
+
+class RACE(datasets.GeneratorBasedBuilder):
+    BUILDER_CONFIGS = [
+        RACEConfig(
+            name=task_name,
+        )
+        for task_name in task_list
+    ]
+
+    def _info(self):
+        features = datasets.Features(
+            {
+                "example_id": datasets.Value("string"),
+                "article": datasets.Value("string"),
+                "answer": datasets.Value("string"),
+                "question": datasets.Value("string"),
+                "options": [datasets.Value("string")],
+            }
+        )
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        data_dir = dl_manager.download_and_extract(_URL)
+        task_name = self.config.name
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    "filepath": os.path.join(
+                        data_dir, f"race/test/{task_name}-00000-of-00001.parquet"
+                    ),
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={
+                    "filepath": os.path.join(
+                        data_dir, f"race/val/{task_name}-00000-of-00001.parquet"
+                    ),
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "filepath": os.path.join(
+                        data_dir, f"race/train/{task_name}-00000-of-00001.parquet"
+                    ),
+                },
+            ),
+        ]
+
+    def _generate_examples(self, filepath):
+        df = pd.read_parquet(filepath)
+        df.columns = ["example_id", "article", "answer", "question", "options"]
+
+        for i, instance in enumerate(df.to_dict(orient="records")):
+            yield i, instance
--- /dev/null
+++ evalscope/benchmarks/race/race_adapter.py
@@ -0,0 +1,229 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import os
+import json
+from evalscope.benchmarks.data_adapter import DataAdapter
+from evalscope.metrics.metrics import exact_match, weighted_mean
+from evalscope.utils import normalize_score, jsonl_to_list
+from evalscope.utils.logger import get_logger
+# flake8: noqa
+
+logger = get_logger()
+
+DATASET_ID = 'modelscope/race'
+
+SUBSET_LIST = [
+    "high",
+    "middle"
+]
+
+
+SUBJECT_MAPPING = {"high": "High",
+                   "middle": "Middle"
+                   }
+
+
+class RACEAdapter(DataAdapter):
+
+    choices = ['A', 'B', 'C', 'D']
+
+    def __init__(self,
+                 subset_list: list = None,
+                 metric_list: list = None,
+                 few_shot_num: int = None,
+                 train_split: str = 'train',
+                 eval_split: str = 'test',
+                 **kwargs):
+
+        if subset_list is None:
+            subset_list = SUBSET_LIST
+
+        if metric_list is None:
+            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
+
+        if few_shot_num is None:
+            logger.info(f'Set 3-shot examples by system for RACE.')
+            few_shot_num = 3
+
+        if few_shot_num > 3:
+            logger.warning(f'few_shot_num <= 3 for RACE, but got {few_shot_num}. Use 3-shot by default.')
+            few_shot_num = 3
+
+        super().__init__(subset_list=subset_list,
+                         metric_list=metric_list,
+                         few_shot_num=few_shot_num,
+                         train_split=train_split,
+                         eval_split=eval_split,
+                         **kwargs)
+
+    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
+        data_dict = {}
+        for subset_name in subset_list:
+            data_dict[subset_name] = {}
+            for split in [self.train_split, self.eval_split]:
+                if os.path.exists(dataset_name_or_path):
+                    file_path = os.path.join(dataset_name_or_path, subset_name, f'{split}.jsonl')
+                else:
+                    file_path = os.path.join(work_dir, dataset_name_or_path, subset_name, f'{split}.jsonl')
+                if os.path.exists(file_path):
+                    data_dict[subset_name][split] = jsonl_to_list(file_path)
+
+        return data_dict
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate model prompt from raw input, unify the prompt format for RACE benchmark.
+
+        Args:
+            input_d (dict): The raw input. A single data format of the RACE:
+
+            {'example_id': 'high3680.txt',
+             'article': 'Astronauts on shorter shuttle missions often work very long days. Tasks are scheduled so tightly that break times are often used to finish the day's work. This type of schedule is far too demanding for long missions on the International Space Station(ISS). ISS crewmembers usually live in space for at least a quarter of a year. They work five days on and two days off to _ the normal way they do things on Earth as much as possible. Weekends give the crew valuable time to rest and do a few hours of housework. They can communicate with family and friends by email , internet phone and through private video conferences. While astronauts cannot go to a baseball game or a movie in orbit, there are many familiar activities that they can still enjoy . Before a mission, the family and friends of each ISS crewmember put together a collection of family photos, messages, videos and reading material for the astronauts to look at when they will be floating 370 kilometers above the Earth. During their mission, the crew also receives care packages with CDs, books, magazines, photos and letters . And as from early 2010, the internet became available on the ISS , giving astronauts the chance to do some "web surfing "in their personal time. Besides relaxing with these more common entertainments, astronauts can simply enjoy the experience of living in space. Many astronauts say that one of the most relaxing things to do in space is to look out the window and stare at the universe and the Earth's vast land mass and oceans.',
+             'answer': 'C',
+             'question': 'The passage mainly discusses how astronauts _ .',
+             'options': [
+                 "work for longer missions in space",
+                 "connect with people on the Earth",
+                 "spend their free time in space",
+                 "observe the Earth from space"]}
+
+        Returns:
+            {'data': [(context, continuation), ...]}
+
+        """
+        prompt = 'The following are multiple choice reading comprehension questions (with answers).\n\n'.format(
+            self._format_subject(subset_name)
+        )
+        few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
+
+        context: str = '\n'.join(few_shot_prompts) + '\n'
+        context += self._generate_prompt(input_d=input_d, include_answer=False)
+        context = prompt + context
+
+        full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
+
+        return {'data': [full_prompt], 'multi_choices': self.choices}
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        # Get the gold choice
+        return input_d.get('answer', '')
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+        """
+        Parse the model output to get the answer. Could be the best choice index.
+
+        Args:
+            result: Predicted answer from the model. Usually a string for chat.
+            raw_input_d: The raw input. Depending on the dataset.
+            eval_type: The evaluation type. e.g. 'checkpoint' or 'service' or 'custom'.
+
+        Returns:
+            The parsed answer. Depending on the dataset. Usually a string for chat.
+        """
+        if eval_type == 'checkpoint':
+            return result
+        elif eval_type == 'service':  # TODO: to be implemented
+            return result
+        elif eval_type == 'custom':  # TODO: to be implemented
+            return result
+        else:
+            raise ValueError(f'Unknown eval_type: {eval_type}')
+
+    def match(self, gold: str, pred: str) -> float:
+        return exact_match(gold=gold, pred=pred)
+
+    def compute_metric(self, review_res_list: list) -> float:
+        """
+        Compute evaluation result by specific metric.
+
+        Args:
+            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
+
+        Returns:
+            The metric score.
+        """
+        items = [(score, 1.0) for score in review_res_list]
+        return weighted_mean(items)
+
+    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
+        """
+        Generate report for the evaluation.
+
+        Args:
+            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
+            report_name: The user-defined report name.
+
+        Returns:
+        {
+            "name":"RACE",
+            "metric":"WeightedAverageAccuracy",
+            "score":0.3389,
+            "category":[
+                {
+                    "name":"High",
+                    "score":0.2528,
+                    "subset":[
+                        {
+                            "name":"high",
+                            "score":0.2528
+                        }
+                    ]
+                }
+            ],
+            "total_num":59
+        }
+        """
+        total_num: int = sum([num for _, num in subset_score_map.values()])
+        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
+
+        # Get domain-subject mapping
+        subject_review_map = {}
+        for subset_name, (subset_score, num) in subset_score_map.items():
+            domain_name: str = SUBJECT_MAPPING.get(subset_name)
+            if domain_name in subject_review_map:
+                subject_review_map[domain_name].append((subset_name, subset_score, num))
+            else:
+                subject_review_map[domain_name] = [(subset_name, subset_score, num)]
+
+        # Get domain score
+        category_list = []
+        for domain_name, domain_res_list in subject_review_map.items():
+            domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
+                sum([num for _, _, num in domain_res_list])
+            domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
+            category_list.append({'name': domain_name,
+                                  'score': normalize_score(score=domain_weighted_avg_acc),
+                                  'subset': [{'name': subset_name, 'score': subset_score}
+                                             for subset_name, subset_score, _ in domain_res_list]})
+
+        # Get final dict of report
+        res_map = dict(name=report_name or 'race',
+                       metric=self.metric_list[0]['name'],
+                       score=weighted_avg_acc,
+                       category=category_list,
+                       total_num=total_num)
+
+        return res_map
+
+    @classmethod
+    def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
+
+        input_choices: list = input_d['options']
+
+        example: str = 'Article:\n{}\nQuestion:\n{}'.format(input_d['article'], input_d['question'])
+        for j in range(len(cls.choices)):
+            example += '\n{}. {}'.format(cls.choices[j], input_choices[j])
+
+        example += '\nAnswer:'
+        if include_answer:
+            example += ' {}\n\n'.format(input_d['answer'])
+
+        return example
+
+    @classmethod
+    def _format_subject(cls, subject):
+        l = subject.split('_')
+        s = ''
+        for entry in l:
+            s += ' ' + entry
+        return s
--- /dev/null
+++ evalscope/benchmarks/trivia_qa/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import TriviaQaAdapter, DATASET_ID, SUBSET_LIST
+from evalscope.benchmarks.trivia_qa.trivia_qa_adapter import TriviaQaAdapter as DataAdapterClass
+from evalscope.models.model_adapter import ChatGenerationModelAdapter as ModelAdapterClass  # noqa
--- /dev/null
+++ evalscope/benchmarks/trivia_qa/trivia_qa.py
@@ -0,0 +1,104 @@
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import json
+
+import datasets
+import pandas as pd
+
+
+_CITATION = """\
+@article{2017arXivtriviaqa,
+    author = {{Joshi}, Mandar and {Choi}, Eunsol and {Weld},
+              Daniel and {Zettlemoyer}, Luke},
+    title = "{triviaqa: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension}",
+    journal = {arXiv e-prints},
+    year = 2017,
+    eid = {arXiv:1705.03551},
+    pages = {arXiv:1705.03551},
+    archivePrefix = {arXiv},
+    eprint = {1705.03551},
+}
+"""
+
+_DESCRIPTION = """\
+TriviaqQA is a reading comprehension dataset containing over 650K question-answer-evidence triples.
+"""
+
+_HOMEPAGE = "https://modelscope.cn/datasets/modelscope/trivia_qa/summary"
+
+_URL = "https://modelscope.cn/api/v1/datasets/modelscope/trivia_qa/repo?Revision=master&FilePath=trivia_qa.zip"
+
+task_list = [
+    "default"
+]
+
+
+class TriviaQAConfig(datasets.BuilderConfig):
+    def __init__(self, **kwargs):
+        super().__init__(version=datasets.Version("1.0.0"), **kwargs)
+
+
+class TriviaQA(datasets.GeneratorBasedBuilder):
+    BUILDER_CONFIGS = [
+        TriviaQAConfig(
+            name=task_name,
+        )
+        for task_name in task_list
+    ]
+
+    def _info(self):
+        features = datasets.Features(
+            {
+                "input": [{
+                    "role": datasets.features.Value("string"),
+                    "content": datasets.features.Value("string"),
+                }],
+                "ideal": [datasets.Value("string")],
+            }
+        )
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        data_dir = dl_manager.download_and_extract(_URL)
+        task_name = self.config.name
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    "filepath": os.path.join(
+                        data_dir, f"trivia_qa/test.jsonl"
+                    ),
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split("dev"),
+                gen_kwargs={
+                    "filepath": os.path.join(
+                        data_dir, f"trivia_qa/dev.jsonl"
+                    ),
+                },
+            ),
+        ]
+
+    def _generate_examples(self, filepath):
+        with open(filepath, encoding='utf-8') as f:
+            contents = [json.loads(line) for line in f.readlines()]
+        for i, instance in enumerate(contents):
+            yield i, instance
--- /dev/null
+++ evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py
@@ -0,0 +1,207 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright (c) EleutherAI Inc, and its affiliates.
+import csv
+import os
+from typing import List
+import numpy as np
+
+from evalscope.benchmarks.data_adapter import DataAdapter
+from evalscope.metrics.metrics import exact_match, weighted_mean
+from evalscope.utils.logger import get_logger
+# flake8: noqa
+
+logger = get_logger()
+
+
+DATASET_ID = 'modelscope/trivia_qa'
+SUBSET_LIST = ['default']
+
+
+class TriviaQaAdapter(DataAdapter):
+
+    def __init__(self,
+                 subset_list: list = None,
+                 metric_list: list = None,
+                 few_shot_num: int = None,
+                 train_split: str = 'dev',
+                 eval_split: str = 'test',
+                 **kwargs):
+
+        if subset_list is None:
+            subset_list = SUBSET_LIST
+
+        if metric_list is None:
+            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
+
+        if few_shot_num is None:
+            logger.info(f'few_shot_num is not specified for TriviaQA, use default value: 5')
+            few_shot_num = 5
+
+        super().__init__(subset_list=subset_list,
+                         metric_list=metric_list,
+                         few_shot_num=few_shot_num,
+                         train_split=train_split,
+                         eval_split=eval_split,
+                         **kwargs)
+
+    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
+        data_dict = {}
+        for subset_name in subset_list:
+            data_dict[subset_name] = {}
+            for split in [self.train_split, self.eval_split]:
+                if os.path.exists(dataset_name_or_path):
+                    file_path = os.path.join(dataset_name_or_path, f'trivia-{split}.qa.csv')
+                else:
+                    file_path = os.path.join(work_dir, dataset_name_or_path, f'trivia-{split}.qa.csv')
+                if os.path.exists(file_path):
+                    with open(file_path, 'r', encoding='utf-8') as f:
+                        reader = csv.reader(f, delimiter='\t')
+                        split_data = []
+                        for row in reader:
+                            assert len(row) == 2
+                            question = row[0]
+                            answers = eval(row[1])
+                            split_data.append({
+                                'input': [
+                                    {"role": "system", "content": "Follow the given examples and answer the question."},
+                                    {"role": "user", "content": question}
+                                ],
+                                'ideal': answers
+                            })
+                        data_dict[subset_name][split] = split_data
+
+        return data_dict
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate model prompt from raw input, unify the prompt format for TriviaQA benchmark.
+
+        Args:
+            input_d (dict): The raw input. A single data format of the TriviaQA:
+
+            {
+                "input": [
+                    {"role": "system", "content": "Follow the given examples and answer the question."},
+                    {"role": "user", "content": "Which Lloyd Webber musical premiered in the US on 10th December 1993?"}
+                ],
+                "ideal": [
+                    "Sunset Blvd",
+                    "West Sunset Boulevard",
+                    "Sunset Boulevard",
+                    "Sunset Bulevard",
+                    "Sunset Blvd.",
+                    "sunset boulevard",
+                    "sunset bulevard",
+                    "west sunset boulevard",
+                    "sunset blvd"
+                ]
+            }
+
+        Returns:
+            {'data': [(context, continuation), ...]}
+        """
+        def get_sys_prompt(inp: dict) -> str:
+            return inp['input'][0]['content']
+
+        prompt = get_sys_prompt(input_d)
+        few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
+        context: str = '\n'.join(few_shot_prompts) + '\n'
+        context += self._generate_prompt(input_d=input_d, include_answer=False)
+        full_prompt = prompt + context
+
+        return {'data': [full_prompt]}
+
+    def get_gold_answer(self, input_d: dict) -> list:
+        # Get the gold choice
+        ans: list = input_d.get("ideal", [])
+        return ans
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+        """
+        Parse the model output to get the answer.
+
+        Args:
+            result: Predicted answer from the model. A list of loglikelihood values for inputs pairs.
+            raw_input_d: The raw input. A single data format of the TriviaQA:
+            eval_type: The type of evaluation, e.g. 'checkpoint' or 'service' or 'custom'.
+
+        Returns:
+            The predicted answer.
+        """
+        if eval_type == 'checkpoint':
+            return result
+        elif eval_type == 'service':  # TODO: to be implemented
+            return result
+        elif eval_type == 'custom':  # TODO: to be implemented
+            return result
+        else:
+            raise ValueError(f'Unknown eval_type: {eval_type}')
+
+    def match(self, gold: list, pred: str) -> float:
+        return max([exact_match(gold=ref, pred=pred) for ref in gold])
+
+    def compute_metric(self, review_res_list: list) -> float:
+        """
+        Compute evaluation result by specific metric.
+
+        Args:
+            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
+
+        Returns:
+            The metric score.
+        """
+        items = [(score, 1.0) for score in review_res_list]
+        return weighted_mean(items)
+
+    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
+        """
+        Generate the report for the model output.
+
+        Args:
+            subset_score_map: {subset_name: (score, num), ...}
+            report_name: The user-defined report name.
+
+        Returns:
+        {
+            "name":"TriviaQA",
+            "metric":"WeightedAverageAccuracy",
+            "score":0.3389,
+            "category":[
+                {
+                    "name":"DEFAULT",
+                    "score":0.3389,
+                    "subset":[
+                        {
+                            "name":"default",
+                            "score":0.3389
+                        }
+                    ]
+                }
+            ],
+            "total_num":100
+        }
+        """
+        total_num: int = sum([num for _, num in subset_score_map.values()])
+        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
+        cate_avg_list = [{'name': subset_name, 'score': score} for subset_name, (score, _) in subset_score_map.items()]
+
+        category_d = dict(name='DEFAULT',
+                          score=weighted_avg_acc,
+                          subset=cate_avg_list)
+
+        res_map = dict(name=report_name or 'trivia_qa',
+                       metric=self.metric_list[0]['name'],
+                       score=weighted_avg_acc,
+                       category=[category_d],
+                       total_num=total_num)
+
+        return res_map
+
+    @classmethod
+    def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
+
+        example: str = f"Question: {input_d['input'][1]['content']}\nAnswer:"
+        if include_answer:
+            example += f" {input_d['ideal'][0]}\n\n"
+
+        return example
--- /dev/null
+++ evalscope/benchmarks/truthful_qa/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import TruthfulQaAdapter, DATASET_ID, SUBSET_LIST
+from evalscope.benchmarks.truthful_qa.truthful_qa_adapter import TruthfulQaAdapter as DataAdapterClass
+from evalscope.models.model_adapter import ContinuationLogitsModelAdapter as ModelAdapterClass  # noqa