evalscope-0.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. evalscope/__init__.py +3 -0
  2. evalscope/backend/__init__.py +3 -0
  3. evalscope/backend/base.py +27 -0
  4. evalscope/backend/opencompass/__init__.py +3 -0
  5. evalscope/backend/opencompass/api_meta_template.py +64 -0
  6. evalscope/backend/opencompass/backend_manager.py +247 -0
  7. evalscope/backend/opencompass/tasks/__init__.py +1 -0
  8. evalscope/backend/opencompass/tasks/eval_api.py +30 -0
  9. evalscope/backend/opencompass/tasks/eval_datasets.py +71 -0
  10. evalscope/backend/vlm_eval_kit/__init__.py +1 -0
  11. evalscope/backend/vlm_eval_kit/backend_manager.py +153 -0
  12. evalscope/benchmarks/__init__.py +4 -0
  13. evalscope/benchmarks/arc/__init__.py +5 -0
  14. evalscope/benchmarks/arc/ai2_arc.py +148 -0
  15. evalscope/benchmarks/arc/arc_adapter.py +231 -0
  16. evalscope/benchmarks/bbh/__init__.py +6 -0
  17. evalscope/benchmarks/bbh/bbh_adapter.py +308 -0
  18. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +23 -0
  19. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +25 -0
  20. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +33 -0
  21. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +37 -0
  22. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +72 -0
  23. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +44 -0
  24. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +78 -0
  25. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +28 -0
  26. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +37 -0
  27. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +37 -0
  28. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +37 -0
  29. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +42 -0
  30. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +25 -0
  31. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +43 -0
  32. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +37 -0
  33. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +41 -0
  34. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +63 -0
  35. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +44 -0
  36. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +40 -0
  37. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +30 -0
  38. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +10 -0
  39. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +77 -0
  40. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +40 -0
  41. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +40 -0
  42. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +40 -0
  43. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +28 -0
  44. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +17 -0
  45. evalscope/benchmarks/benchmark.py +65 -0
  46. evalscope/benchmarks/ceval/__init__.py +5 -0
  47. evalscope/benchmarks/ceval/ceval_adapter.py +340 -0
  48. evalscope/benchmarks/ceval/ceval_exam.py +159 -0
  49. evalscope/benchmarks/cmmlu/__init__.py +5 -0
  50. evalscope/benchmarks/cmmlu/cmmlu.py +166 -0
  51. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +369 -0
  52. evalscope/benchmarks/competition_math/__init__.py +5 -0
  53. evalscope/benchmarks/competition_math/competition_math.py +88 -0
  54. evalscope/benchmarks/competition_math/competition_math_adapter.py +470 -0
  55. evalscope/benchmarks/data_adapter.py +263 -0
  56. evalscope/benchmarks/general_qa/__init__.py +5 -0
  57. evalscope/benchmarks/general_qa/general_qa_adapter.py +186 -0
  58. evalscope/benchmarks/gsm8k/__init__.py +5 -0
  59. evalscope/benchmarks/gsm8k/gsm8k.py +127 -0
  60. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +236 -0
  61. evalscope/benchmarks/hellaswag/__init__.py +5 -0
  62. evalscope/benchmarks/hellaswag/hellaswag.py +116 -0
  63. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +222 -0
  64. evalscope/benchmarks/humaneval/__init__.py +5 -0
  65. evalscope/benchmarks/humaneval/humaneval.py +82 -0
  66. evalscope/benchmarks/humaneval/humaneval_adapter.py +21 -0
  67. evalscope/benchmarks/mmlu/__init__.py +5 -0
  68. evalscope/benchmarks/mmlu/mmlu.py +174 -0
  69. evalscope/benchmarks/mmlu/mmlu_adapter.py +375 -0
  70. evalscope/benchmarks/race/__init__.py +5 -0
  71. evalscope/benchmarks/race/race.py +118 -0
  72. evalscope/benchmarks/race/race_adapter.py +229 -0
  73. evalscope/benchmarks/trivia_qa/__init__.py +5 -0
  74. evalscope/benchmarks/trivia_qa/trivia_qa.py +104 -0
  75. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +207 -0
  76. evalscope/benchmarks/truthful_qa/__init__.py +5 -0
  77. evalscope/benchmarks/truthful_qa/truthful_qa.py +167 -0
  78. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +351 -0
  79. evalscope/cache.py +98 -0
  80. evalscope/cli/__init__.py +1 -0
  81. evalscope/cli/base.py +20 -0
  82. evalscope/cli/cli.py +26 -0
  83. evalscope/cli/start_perf.py +37 -0
  84. evalscope/cli/start_server.py +138 -0
  85. evalscope/config.py +165 -0
  86. evalscope/constants.py +150 -0
  87. evalscope/evaluator/__init__.py +3 -0
  88. evalscope/evaluator/evaluator.py +689 -0
  89. evalscope/evaluator/rating_eval.py +178 -0
  90. evalscope/evaluator/reviewer/__init__.py +1 -0
  91. evalscope/evaluator/reviewer/auto_reviewer.py +411 -0
  92. evalscope/metrics/__init__.py +1 -0
  93. evalscope/metrics/bundled_rouge_score/__init__.py +14 -0
  94. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +342 -0
  95. evalscope/metrics/code_metric.py +104 -0
  96. evalscope/metrics/math_accuracy.py +60 -0
  97. evalscope/metrics/metrics.py +405 -0
  98. evalscope/metrics/rouge_metric.py +129 -0
  99. evalscope/models/__init__.py +4 -0
  100. evalscope/models/custom/__init__.py +4 -0
  101. evalscope/models/custom/custom_model.py +53 -0
  102. evalscope/models/dummy_chat_model.py +50 -0
  103. evalscope/models/model.py +88 -0
  104. evalscope/models/model_adapter.py +586 -0
  105. evalscope/models/openai_model.py +103 -0
  106. evalscope/models/template.py +1446 -0
  107. evalscope/perf/__init__.py +0 -0
  108. evalscope/perf/_logging.py +32 -0
  109. evalscope/perf/api_plugin_base.py +60 -0
  110. evalscope/perf/custom_api.py +87 -0
  111. evalscope/perf/dashscope_api.py +84 -0
  112. evalscope/perf/dataset_plugin_base.py +64 -0
  113. evalscope/perf/datasets/__init__.py +0 -0
  114. evalscope/perf/datasets/line_by_line.py +18 -0
  115. evalscope/perf/datasets/longalpaca_12k.py +20 -0
  116. evalscope/perf/datasets/openqa.py +22 -0
  117. evalscope/perf/how_to_analysis_result.py +24 -0
  118. evalscope/perf/http_client.py +756 -0
  119. evalscope/perf/openai_api.py +130 -0
  120. evalscope/perf/plugin_registry.py +35 -0
  121. evalscope/perf/query_parameters.py +42 -0
  122. evalscope/perf/server_sent_event.py +43 -0
  123. evalscope/preprocess/__init__.py +1 -0
  124. evalscope/preprocess/tokenizers/__init__.py +0 -0
  125. evalscope/preprocess/tokenizers/gpt2_tokenizer.py +221 -0
  126. evalscope/registry/__init__.py +1 -0
  127. evalscope/registry/tasks/arc.yaml +29 -0
  128. evalscope/registry/tasks/bbh.yaml +27 -0
  129. evalscope/registry/tasks/bbh_mini.yaml +27 -0
  130. evalscope/registry/tasks/ceval.yaml +27 -0
  131. evalscope/registry/tasks/ceval_mini.yaml +27 -0
  132. evalscope/registry/tasks/cmmlu.yaml +27 -0
  133. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +28 -0
  134. evalscope/registry/tasks/general_qa.yaml +27 -0
  135. evalscope/registry/tasks/gsm8k.yaml +29 -0
  136. evalscope/registry/tasks/mmlu.yaml +29 -0
  137. evalscope/registry/tasks/mmlu_mini.yaml +27 -0
  138. evalscope/run.py +404 -0
  139. evalscope/run_arena.py +204 -0
  140. evalscope/run_ms.py +140 -0
  141. evalscope/summarizer.py +144 -0
  142. evalscope/third_party/__init__.py +1 -0
  143. evalscope/third_party/toolbench_static/__init__.py +3 -0
  144. evalscope/third_party/toolbench_static/eval.py +219 -0
  145. evalscope/third_party/toolbench_static/infer.py +278 -0
  146. evalscope/third_party/toolbench_static/llm/__init__.py +1 -0
  147. evalscope/third_party/toolbench_static/llm/swift_infer.py +45 -0
  148. evalscope/third_party/toolbench_static/toolbench_static.py +50 -0
  149. evalscope/tools/__init__.py +1 -0
  150. evalscope/tools/combine_reports.py +140 -0
  151. evalscope/tools/gen_mmlu_subject_mapping.py +90 -0
  152. evalscope/tools/rewrite_eval_results.py +95 -0
  153. evalscope/utils/__init__.py +4 -0
  154. evalscope/utils/arena_utils.py +247 -0
  155. evalscope/utils/completion_parsers.py +87 -0
  156. evalscope/utils/logger.py +64 -0
  157. evalscope/utils/task_cfg_parser.py +10 -0
  158. evalscope/utils/task_utils.py +19 -0
  159. evalscope/utils/utils.py +625 -0
  160. evalscope/version.py +4 -0
  161. evalscope-0.5.0.dist-info/METADATA +566 -0
  162. evalscope-0.5.0.dist-info/RECORD +165 -0
  163. evalscope-0.5.0.dist-info/WHEEL +5 -0
  164. evalscope-0.5.0.dist-info/entry_points.txt +3 -0
  165. evalscope-0.5.0.dist-info/top_level.txt +1 -0
evalscope/benchmarks/humaneval/humaneval_adapter.py
@@ -0,0 +1,21 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+# flake8: noqa
+
+
+DATASET_ID = 'modelscope/humaneval'
+SUBSET_LIST = ['openai_humaneval']
+
+# Note: ONLY FOR CLASS IMPORT, No implementation here.
+
+# Example:
+# {"task_id": "HumanEval/0", "prompt": "from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n \"\"\" Check if in given list of numbers, are any two numbers closer to each other than\n given threshold.\n >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n False\n >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n True\n \"\"\"\n", "entry_point": "has_close_elements", "canonical_solution": " for idx, elem in enumerate(numbers):\n for idx2, elem2 in enumerate(numbers):\n if idx != idx2:\n distance = abs(elem - elem2)\n if distance < threshold:\n return True\n\n return False\n", "test": "\n\nMETADATA = {\n 'author': 'jt',\n 'dataset': 'test'\n}\n\n\ndef check(candidate):\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.8) == False\n assert candidate([1.0, 2.0, 3.0, 4.0, 5.0, 2.0], 0.1) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 1.0) == True\n assert candidate([1.1, 2.2, 3.1, 4.1, 5.1], 0.5) == False\n\n"}
+
+
+class HumanevalAdapter:
+    """
+    A placeholder for humaneval adapter, see HumanevalEvaluator for implementation.
+    """
+
+    def __init__(self):
+        ...
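The humaneval_adapter.py stub above only pins the dataset ID and records the HumanEval line format; scoring lives in HumanevalEvaluator. As a hedged illustration of how a record shaped like the example comment is typically verified, the sketch below (not code from this package, using a made-up toy record) concatenates prompt, canonical_solution and test into one program and calls check(entry_point):

# Minimal sketch only; the toy record mimics the HumanEval field layout shown above.
record = {
    'task_id': 'HumanEval/demo',
    'prompt': 'def add(a: int, b: int) -> int:\n    """Return a + b."""\n',
    'canonical_solution': '    return a + b\n',
    'entry_point': 'add',
    'test': 'def check(candidate):\n    assert candidate(1, 2) == 3\n',
}

namespace = {}
# prompt + canonical_solution + test form one executable module, mirroring the HumanEval layout.
exec(record['prompt'] + record['canonical_solution'] + record['test'], namespace)
namespace['check'](namespace[record['entry_point']])  # raises AssertionError on a wrong solution
print('passed')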
evalscope/benchmarks/mmlu/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from evalscope.benchmarks.mmlu.mmlu_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST, MMLUAdapter
+from evalscope.benchmarks.mmlu.mmlu_adapter import MMLUAdapter as DataAdapterClass
+from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa
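This __init__ follows the convention used across the benchmarks packages: each module re-exports its data adapter and the matching model adapter under the fixed names DataAdapterClass and ModelAdapterClass, so a caller can resolve them generically by module path. A hedged sketch, assuming evalscope 0.5.0 is installed (illustrative only):

import importlib

# Resolve a benchmark's adapters without hard-coding class names.
benchmark = importlib.import_module('evalscope.benchmarks.mmlu')
data_adapter = benchmark.DataAdapterClass()      # MMLUAdapter with its default 5-shot settings
model_adapter_cls = benchmark.ModelAdapterClass  # MultiChoiceModelAdapter
print(type(data_adapter).__name__, model_adapter_cls.__name__)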
evalscope/benchmarks/mmlu/mmlu.py
@@ -0,0 +1,174 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# flake8: noqa
+
+import os
+
+import datasets
+import pandas as pd
+
+"""The MMLU dataset on ModelScope hub. READ ONLY, DO NOT MODIFY."""
+
+
+_CITATION = """\
+@article{hendryckstest2021,
+title={Measuring Massive Multitask Language Understanding},
+author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
+journal={Proceedings of the International Conference on Learning Representations (ICLR)},
+year={2021}
+}
+"""
+
+_DESCRIPTION = """\
+Measuring Massive Multitask Language Understanding by Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas
+Mazeika, Dawn Song, and Jacob Steinhardt (ICLR 2021).
+"""
+
+_HOMEPAGE = 'https://modelscope.cn/datasets/modelscope/mmlu/summary'
+
+_LICENSE = 'MIT'
+
+# _URL = "https://people.eecs.berkeley.edu/~hendrycks/data.tar"
+_URL = 'https://modelscope.cn/api/v1/datasets/modelscope/mmlu/repo?Revision=master&FilePath=data.tar'
+
+task_list = [
+    'high_school_european_history',
+    'business_ethics',
+    'clinical_knowledge',
+    'medical_genetics',
+    'high_school_us_history',
+    'high_school_physics',
+    'high_school_world_history',
+    'virology',
+    'high_school_microeconomics',
+    'econometrics',
+    'college_computer_science',
+    'high_school_biology',
+    'abstract_algebra',
+    'professional_accounting',
+    'philosophy',
+    'professional_medicine',
+    'nutrition',
+    'global_facts',
+    'machine_learning',
+    'security_studies',
+    'public_relations',
+    'professional_psychology',
+    'prehistory',
+    'anatomy',
+    'human_sexuality',
+    'college_medicine',
+    'high_school_government_and_politics',
+    'college_chemistry',
+    'logical_fallacies',
+    'high_school_geography',
+    'elementary_mathematics',
+    'human_aging',
+    'college_mathematics',
+    'high_school_psychology',
+    'formal_logic',
+    'high_school_statistics',
+    'international_law',
+    'high_school_mathematics',
+    'high_school_computer_science',
+    'conceptual_physics',
+    'miscellaneous',
+    'high_school_chemistry',
+    'marketing',
+    'professional_law',
+    'management',
+    'college_physics',
+    'jurisprudence',
+    'world_religions',
+    'sociology',
+    'us_foreign_policy',
+    'high_school_macroeconomics',
+    'computer_security',
+    'moral_scenarios',
+    'moral_disputes',
+    'electrical_engineering',
+    'astronomy',
+    'college_biology',
+]
+
+
+class MMLUConfig(datasets.BuilderConfig):
+    def __init__(self, **kwargs):
+        super().__init__(version=datasets.Version('1.0.0'), **kwargs)
+
+
+class MMLU(datasets.GeneratorBasedBuilder):
+    BUILDER_CONFIGS = [
+        MMLUConfig(
+            name=task_name,
+        )
+        for task_name in task_list
+    ]
+
+    def _info(self):
+        features = datasets.Features(
+            {
+                'input': datasets.Value('string'),
+                'A': datasets.Value('string'),
+                'B': datasets.Value('string'),
+                'C': datasets.Value('string'),
+                'D': datasets.Value('string'),
+                'target': datasets.Value('string'),
+            }
+        )
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        data_dir = dl_manager.download_and_extract(_URL)
+        task_name = self.config.name
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                gen_kwargs={
+                    'filepath': os.path.join(
+                        data_dir, 'data', 'test', f'{task_name}_test.csv'
+                    ),
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION,
+                gen_kwargs={
+                    'filepath': os.path.join(
+                        data_dir, 'data', 'val', f'{task_name}_val.csv'
+                    ),
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    'filepath': os.path.join(
+                        data_dir, 'data', 'dev', f'{task_name}_dev.csv'
+                    ),
+                },
+            ),
+        ]
+
+    def _generate_examples(self, filepath):
+        df = pd.read_csv(filepath)
+        df.columns = ['input', 'A', 'B', 'C', 'D', 'target']
+
+        for i, instance in enumerate(df.to_dict(orient='records')):
+            yield i, instance
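mmlu.py above is a standard Hugging Face datasets loading script: one builder config per subject, with test/validation/train splits mapped to the test/val/dev CSV files inside the ModelScope-hosted data.tar. A hedged sketch of loading one subject through such a script (the local path './mmlu.py' is an assumption, and recent datasets releases may additionally require trust_remote_code=True):

import datasets

# Load one subject's test split via the builder script shown above.
ds = datasets.load_dataset('./mmlu.py', name='astronomy', split='test')
print(len(ds))
print(ds[0]['input'], '->', ds[0]['target'])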
evalscope/benchmarks/mmlu/mmlu_adapter.py
@@ -0,0 +1,375 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import csv
+import os
+
+from evalscope.benchmarks.data_adapter import DataAdapter
+from evalscope.metrics.metrics import exact_match, weighted_mean
+from evalscope.utils import normalize_score, ResponseParser
+from evalscope.utils.logger import get_logger
+# flake8: noqa
+
+logger = get_logger()
+
+DATASET_ID = 'modelscope/mmlu'
+
+SUBSET_LIST = [
+    'high_school_european_history',
+    'business_ethics',
+    'clinical_knowledge',
+    'medical_genetics',
+    'high_school_us_history',
+    'high_school_physics',
+    'high_school_world_history',
+    'virology',
+    'high_school_microeconomics',
+    'econometrics',
+    'college_computer_science',
+    'high_school_biology',
+    'abstract_algebra',
+    'professional_accounting',
+    'philosophy',
+    'professional_medicine',
+    'nutrition',
+    'global_facts',
+    'machine_learning',
+    'security_studies',
+    'public_relations',
+    'professional_psychology',
+    'prehistory',
+    'anatomy',
+    'human_sexuality',
+    'college_medicine',
+    'high_school_government_and_politics',
+    'college_chemistry',
+    'logical_fallacies',
+    'high_school_geography',
+    'elementary_mathematics',
+    'human_aging',
+    'college_mathematics',
+    'high_school_psychology',
+    'formal_logic',
+    'high_school_statistics',
+    'international_law',
+    'high_school_mathematics',
+    'high_school_computer_science',
+    'conceptual_physics',
+    'miscellaneous',
+    'high_school_chemistry',
+    'marketing',
+    'professional_law',
+    'management',
+    'college_physics',
+    'jurisprudence',
+    'world_religions',
+    'sociology',
+    'us_foreign_policy',
+    'high_school_macroeconomics',
+    'computer_security',
+    'moral_scenarios',
+    'moral_disputes',
+    'electrical_engineering',
+    'astronomy',
+    'college_biology',
+]
+
+
+SUBJECT_MAPPING = {'abstract_algebra': ['Abstract Algebra', 'math', 'STEM'],
+                   'anatomy': ['Anatomy', 'health', 'Other'],
+                   'astronomy': ['Astronomy', 'physics', 'STEM'],
+                   'business_ethics': ['Business Ethics', 'business', 'Other'],
+                   'clinical_knowledge': ['Clinical Knowledge', 'health', 'Other'],
+                   'college_biology': ['College Biology', 'biology', 'STEM'],
+                   'college_chemistry': ['College Chemistry', 'chemistry', 'STEM'],
+                   'college_computer_science': ['College Computer Science', 'computer science', 'STEM'],
+                   'college_mathematics': ['College Mathematics', 'math', 'STEM'],
+                   'college_medicine': ['College Medicine', 'health', 'Other'],
+                   'college_physics': ['College Physics', 'physics', 'STEM'],
+                   'computer_security': ['Computer Security', 'computer science', 'STEM'],
+                   'conceptual_physics': ['Conceptual Physics', 'physics', 'STEM'],
+                   'econometrics': ['Econometrics', 'economics', 'Social Science'],
+                   'electrical_engineering': ['Electrical Engineering', 'engineering', 'STEM'],
+                   'elementary_mathematics': ['Elementary Mathematics', 'math', 'STEM'],
+                   'formal_logic': ['Formal Logic', 'philosophy', 'Humanities'],
+                   'global_facts': ['Global Facts', 'other', 'Other'],
+                   'high_school_biology': ['High School Biology', 'biology', 'STEM'],
+                   'high_school_chemistry': ['High School Chemistry', 'chemistry', 'STEM'],
+                   'high_school_computer_science': ['High School Computer Science', 'computer science', 'STEM'],
+                   'high_school_european_history': ['High School European History', 'history', 'Humanities'],
+                   'high_school_geography': ['High School Geography', 'geography', 'Social Science'],
+                   'high_school_government_and_politics': ['High School Government And Politics', 'politics', 'Social Science'],
+                   'high_school_macroeconomics': ['High School Macroeconomics', 'economics', 'Social Science'],
+                   'high_school_mathematics': ['High School Mathematics', 'math', 'STEM'],
+                   'high_school_microeconomics': ['High School Microeconomics', 'economics', 'Social Science'],
+                   'high_school_physics': ['High School Physics', 'physics', 'STEM'],
+                   'high_school_psychology': ['High School Psychology', 'psychology', 'Social Science'],
+                   'high_school_statistics': ['High School Statistics', 'math', 'STEM'],
+                   'high_school_us_history': ['High School Us History', 'history', 'Humanities'],
+                   'high_school_world_history': ['High School World History', 'history', 'Humanities'],
+                   'human_aging': ['Human Aging', 'health', 'Other'],
+                   'human_sexuality': ['Human Sexuality', 'culture', 'Social Science'],
+                   'international_law': ['International Law', 'law', 'Humanities'],
+                   'jurisprudence': ['Jurisprudence', 'law', 'Humanities'],
+                   'logical_fallacies': ['Logical Fallacies', 'philosophy', 'Humanities'],
+                   'machine_learning': ['Machine Learning', 'computer science', 'STEM'],
+                   'management': ['Management', 'business', 'Other'],
+                   'marketing': ['Marketing', 'business', 'Other'],
+                   'medical_genetics': ['Medical Genetics', 'health', 'Other'],
+                   'miscellaneous': ['Miscellaneous', 'other', 'Other'],
+                   'moral_disputes': ['Moral Disputes', 'philosophy', 'Humanities'],
+                   'moral_scenarios': ['Moral Scenarios', 'philosophy', 'Humanities'],
+                   'nutrition': ['Nutrition', 'health', 'Other'],
+                   'philosophy': ['Philosophy', 'philosophy', 'Humanities'],
+                   'prehistory': ['Prehistory', 'history', 'Humanities'],
+                   'professional_accounting': ['Professional Accounting', 'other', 'Other'],
+                   'professional_law': ['Professional Law', 'law', 'Humanities'],
+                   'professional_medicine': ['Professional Medicine', 'health', 'Other'],
+                   'professional_psychology': ['Professional Psychology', 'psychology', 'Social Science'],
+                   'public_relations': ['Public Relations', 'politics', 'Social Science'],
+                   'security_studies': ['Security Studies', 'politics', 'Social Science'],
+                   'sociology': ['Sociology', 'culture', 'Social Science'],
+                   'us_foreign_policy': ['Us Foreign Policy', 'politics', 'Social Science'],
+                   'virology': ['Virology', 'health', 'Other'],
+                   'world_religions': ['World Religions', 'philosophy', 'Humanities'],
+                   }
+
+
+class MMLUAdapter(DataAdapter):
+
+    choices = ['A', 'B', 'C', 'D']
+
+    def __init__(self,
+                 subset_list: list = None,
+                 metric_list: list = None,
+                 few_shot_num: int = None,
+                 train_split: str = 'train',
+                 eval_split: str = 'test',
+                 **kwargs):
+
+        if subset_list is None:
+            subset_list = SUBSET_LIST
+
+        if metric_list is None:
+            metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
+
+        if few_shot_num is None:
+            # Use 5-shot by default
+            logger.info(f'Set 5-shot examples by system for MMLU.')
+            few_shot_num = 5
+
+        if few_shot_num > 5:
+            logger.warning(f'few_shot_num <= 5 for MMLU, but got {few_shot_num}. Use 5-shot by default.')
+            few_shot_num = 5
+
+        super().__init__(subset_list=subset_list,
+                         metric_list=metric_list,
+                         few_shot_num=few_shot_num,
+                         train_split=train_split,
+                         eval_split=eval_split,
+                         **kwargs)
+
+    def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
+        data_dict = {}
+        for subset_name in subset_list:
+            data_dict[subset_name] = {}
+
+            for split_name in [self.train_split, self.eval_split]:
+                if self.train_split == 'train':
+                    split_name_suffix = 'dev'
+                elif self.eval_split == 'test':
+                    split_name_suffix = 'test'
+                elif self.eval_split == 'validation':
+                    split_name_suffix = 'val'
+                else:
+                    raise ValueError(f'Invalid split name: {split_name}')
+
+                if os.path.exists(dataset_name_or_path):
+                    file_path = os.path.join(dataset_name_or_path, f'{subset_name}_{split_name_suffix}.csv')
+                else:
+                    file_path = os.path.join(work_dir, dataset_name_or_path, f'{subset_name}_{split_name_suffix}.csv')
+
+                if os.path.exists(file_path):
+                    with open(file_path, encoding='utf-8') as f:
+                        rows = []
+                        reader = csv.reader(f)
+                        for row in reader:
+                            if len(row) != 6:
+                                logger.error(f'Mismatch len of row: {row}, len of row should be 6. Skip this row.')
+                                continue
+                            rows.append({
+                                'input': row[0],
+                                'A': row[1],
+                                'B': row[2],
+                                'C': row[3],
+                                'D': row[4],
+                                'target': row[5],
+                            })
+
+                    data_dict[subset_name].update({split_name: rows})
+
+        return data_dict
+
+    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        """
+        Generate model prompt from raw input, unify the prompt format for MMLU benchmark.
+
+        Args:
+            input_d (dict): The raw input. A single data format of the MMLU:
+
+            {'input': '___________ is based on the idea that customer expectations of the service they will receive shape their perception of the actual service encounter.',
+            'A': 'Service quality.',
+            'B': 'Service action.',
+            'C': 'Service recovery.',
+            'D': 'Service satisfaction.',
+            'target': 'A'}
+
+        Returns:
+            {'data': [(context, continuation), ...]}
+
+        """
+        prompt = 'The following are multiple choice questions (with answers) about {}.\n\n'.format(
+            self._format_subject(subset_name)
+        )
+        few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
+
+        context: str = '\n'.join(few_shot_prompts) + '\n'
+        context += self._generate_prompt(input_d=input_d, include_answer=False)
+        context = prompt + context
+
+        full_prompt: str = context.strip() + self._generate_prompt(input_d=input_d, include_answer=False)
+
+        return {'data': [full_prompt], 'multi_choices': self.choices}
+
+    def get_gold_answer(self, input_d: dict) -> str:
+        # Get the gold choice
+        return input_d.get('target', '')
+
+    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
+        """
+        Parse the model output to get the answer. Could be the best choice index.
+
+        Args:
+            result: Predicted answer from the model. Usually a string for chat.
+            raw_input_d: The raw input. Depending on the dataset.
+            eval_type: 'checkpoint' or 'service' or 'custom'
+
+        Returns:
+            The parsed answer. Depending on the dataset. Usually a string for chat.
+        """
+        if eval_type == 'checkpoint':
+            return result
+        elif eval_type == 'service':
+            return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked !
+        elif eval_type == 'custom':
+            return ResponseParser.parse_first_option_with_choices(result, self.choices) # TODO: to be checked !
+        else:
+            raise ValueError(f'Invalid eval_type: {eval_type}')
+
+    def match(self, gold: str, pred: str) -> float:
+        return exact_match(gold=gold, pred=pred)
+
+    def compute_metric(self, review_res_list: list) -> float:
+        """
+        Compute evaluation result by specific metric.
+
+        Args:
+            review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
+
+        Returns:
+            The metric score.
+        """
+        items = [(score, 1.0) for score in review_res_list]
+        return weighted_mean(items)
+
+    def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
+        """
+        Generate report for the evaluation.
+
+        Args:
+            subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
+            report_name: The user-defined report name.
+
+        Returns:
+        {
+            "name":"MMLU",
+            "metric":"WeightedAverageAccuracy",
+            "score":0.3389,
+            "category":[
+                {
+                    "name":"STEM",
+                    "score":0.2528,
+                    "subset":[
+                        {
+                            "name":"computer_network",
+                            "score":0.2632
+                        },
+                        {
+                            "name":"operating_system",
+                            "score":0.3157
+                        },
+                        {
+                            "name":"computer_architecture",
+                            "score":0.4285
+                        }
+                    ]
+                }
+            ],
+            "total_num":59
+        }
+        """
+        total_num: int = sum([num for _, num in subset_score_map.values()])
+        weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
+        weighted_avg_acc = normalize_score(score=weighted_avg_acc)
+
+        # Get domain-subject mapping
+        subject_review_map = {}
+        for subset_name, (subset_score, num) in subset_score_map.items():
+            domain_name: str = SUBJECT_MAPPING.get(subset_name)[2] if SUBJECT_MAPPING.get(subset_name) else subset_name
+            if domain_name in subject_review_map:
+                subject_review_map[domain_name].append((subset_name, subset_score, num))
+            else:
+                subject_review_map[domain_name] = [(subset_name, subset_score, num)]
+
+        # Get domain score
+        category_list = []
+        for domain_name, domain_res_list in subject_review_map.items():
+            domain_weighted_avg_acc = sum([score * num for _, score, num in domain_res_list]) / \
+                                      sum([num for _, _, num in domain_res_list])
+            domain_weighted_avg_acc = normalize_score(score=domain_weighted_avg_acc)
+            category_list.append({'name': domain_name,
+                                  'score': domain_weighted_avg_acc,
+                                  'subset': [{'name': subset_name, 'score': normalize_score(score=subset_score)}
+                                             for subset_name, subset_score, _ in domain_res_list]})
+
+        category_list = sorted(category_list, key=lambda x: x['name'])
+
+        # Get final dict of report
+        res_map = dict(name=report_name or 'mmlu',
+                       metric=self.metric_list[0]['name'],
+                       score=weighted_avg_acc,
+                       category=category_list,
+                       total_num=total_num)
+
+        return res_map
+
+    @classmethod
+    def _generate_prompt(cls, input_d: dict, include_answer=True) -> str:
+
+        input_choices: list = [input_d['A'], input_d['B'], input_d['C'], input_d['D']]
+
+        example: str = input_d['input']
+        for j in range(len(cls.choices)):
+            example += '\n{}. {}'.format(cls.choices[j], input_choices[j])
+
+        example += '\nAnswer:'
+        if include_answer:
+            example += ' {}\n\n'.format(input_d['target'])
+
+        return example
+
+    @classmethod
+    def _format_subject(cls, subject):
+        l = subject.split('_')
+        s = ''
+        for entry in l:
+            s += ' ' + entry
+        return s
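The gen_report docstring above documents the report layout; as a hedged illustration (assuming evalscope 0.5.0 is installed, with toy numbers that are not real results), the sketch below feeds a small subset-score map through MMLUAdapter and prints the resulting breakdown, grouped into domains via SUBJECT_MAPPING:

from evalscope.benchmarks.mmlu.mmlu_adapter import MMLUAdapter

adapter = MMLUAdapter()  # defaults: 5-shot, train/test splits
# {subset_name: (weighted accuracy, number of samples)} -- toy values for illustration only
subset_score_map = {
    'astronomy': (0.40, 25),              # grouped under 'STEM'
    'high_school_geography': (0.55, 20),  # grouped under 'Social Science'
}
report = adapter.gen_report(subset_score_map, report_name='mmlu-demo')
print(report['score'], report['total_num'])
print([category['name'] for category in report['category']])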
evalscope/benchmarks/race/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+from evalscope.benchmarks.race.race_adapter import DATASET_ID, SUBJECT_MAPPING, SUBSET_LIST, RACEAdapter
+from evalscope.benchmarks.race.race_adapter import RACEAdapter as DataAdapterClass
+from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass # noqa