evalscope-0.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. evalscope/__init__.py +3 -0
  2. evalscope/backend/__init__.py +3 -0
  3. evalscope/backend/base.py +27 -0
  4. evalscope/backend/opencompass/__init__.py +3 -0
  5. evalscope/backend/opencompass/api_meta_template.py +64 -0
  6. evalscope/backend/opencompass/backend_manager.py +247 -0
  7. evalscope/backend/opencompass/tasks/__init__.py +1 -0
  8. evalscope/backend/opencompass/tasks/eval_api.py +30 -0
  9. evalscope/backend/opencompass/tasks/eval_datasets.py +71 -0
  10. evalscope/backend/vlm_eval_kit/__init__.py +1 -0
  11. evalscope/backend/vlm_eval_kit/backend_manager.py +153 -0
  12. evalscope/benchmarks/__init__.py +4 -0
  13. evalscope/benchmarks/arc/__init__.py +5 -0
  14. evalscope/benchmarks/arc/ai2_arc.py +148 -0
  15. evalscope/benchmarks/arc/arc_adapter.py +231 -0
  16. evalscope/benchmarks/bbh/__init__.py +6 -0
  17. evalscope/benchmarks/bbh/bbh_adapter.py +308 -0
  18. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +23 -0
  19. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +25 -0
  20. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +33 -0
  21. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +37 -0
  22. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +72 -0
  23. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +44 -0
  24. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +78 -0
  25. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +28 -0
  26. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +37 -0
  27. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +37 -0
  28. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +37 -0
  29. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +42 -0
  30. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +25 -0
  31. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +43 -0
  32. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +37 -0
  33. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +41 -0
  34. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +63 -0
  35. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +44 -0
  36. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +40 -0
  37. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +30 -0
  38. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +10 -0
  39. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +77 -0
  40. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +40 -0
  41. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +40 -0
  42. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +40 -0
  43. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +28 -0
  44. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +17 -0
  45. evalscope/benchmarks/benchmark.py +65 -0
  46. evalscope/benchmarks/ceval/__init__.py +5 -0
  47. evalscope/benchmarks/ceval/ceval_adapter.py +340 -0
  48. evalscope/benchmarks/ceval/ceval_exam.py +159 -0
  49. evalscope/benchmarks/cmmlu/__init__.py +5 -0
  50. evalscope/benchmarks/cmmlu/cmmlu.py +166 -0
  51. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +369 -0
  52. evalscope/benchmarks/competition_math/__init__.py +5 -0
  53. evalscope/benchmarks/competition_math/competition_math.py +88 -0
  54. evalscope/benchmarks/competition_math/competition_math_adapter.py +470 -0
  55. evalscope/benchmarks/data_adapter.py +263 -0
  56. evalscope/benchmarks/general_qa/__init__.py +5 -0
  57. evalscope/benchmarks/general_qa/general_qa_adapter.py +186 -0
  58. evalscope/benchmarks/gsm8k/__init__.py +5 -0
  59. evalscope/benchmarks/gsm8k/gsm8k.py +127 -0
  60. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +236 -0
  61. evalscope/benchmarks/hellaswag/__init__.py +5 -0
  62. evalscope/benchmarks/hellaswag/hellaswag.py +116 -0
  63. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +222 -0
  64. evalscope/benchmarks/humaneval/__init__.py +5 -0
  65. evalscope/benchmarks/humaneval/humaneval.py +82 -0
  66. evalscope/benchmarks/humaneval/humaneval_adapter.py +21 -0
  67. evalscope/benchmarks/mmlu/__init__.py +5 -0
  68. evalscope/benchmarks/mmlu/mmlu.py +174 -0
  69. evalscope/benchmarks/mmlu/mmlu_adapter.py +375 -0
  70. evalscope/benchmarks/race/__init__.py +5 -0
  71. evalscope/benchmarks/race/race.py +118 -0
  72. evalscope/benchmarks/race/race_adapter.py +229 -0
  73. evalscope/benchmarks/trivia_qa/__init__.py +5 -0
  74. evalscope/benchmarks/trivia_qa/trivia_qa.py +104 -0
  75. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +207 -0
  76. evalscope/benchmarks/truthful_qa/__init__.py +5 -0
  77. evalscope/benchmarks/truthful_qa/truthful_qa.py +167 -0
  78. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +351 -0
  79. evalscope/cache.py +98 -0
  80. evalscope/cli/__init__.py +1 -0
  81. evalscope/cli/base.py +20 -0
  82. evalscope/cli/cli.py +26 -0
  83. evalscope/cli/start_perf.py +37 -0
  84. evalscope/cli/start_server.py +138 -0
  85. evalscope/config.py +165 -0
  86. evalscope/constants.py +150 -0
  87. evalscope/evaluator/__init__.py +3 -0
  88. evalscope/evaluator/evaluator.py +689 -0
  89. evalscope/evaluator/rating_eval.py +178 -0
  90. evalscope/evaluator/reviewer/__init__.py +1 -0
  91. evalscope/evaluator/reviewer/auto_reviewer.py +411 -0
  92. evalscope/metrics/__init__.py +1 -0
  93. evalscope/metrics/bundled_rouge_score/__init__.py +14 -0
  94. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +342 -0
  95. evalscope/metrics/code_metric.py +104 -0
  96. evalscope/metrics/math_accuracy.py +60 -0
  97. evalscope/metrics/metrics.py +405 -0
  98. evalscope/metrics/rouge_metric.py +129 -0
  99. evalscope/models/__init__.py +4 -0
  100. evalscope/models/custom/__init__.py +4 -0
  101. evalscope/models/custom/custom_model.py +53 -0
  102. evalscope/models/dummy_chat_model.py +50 -0
  103. evalscope/models/model.py +88 -0
  104. evalscope/models/model_adapter.py +586 -0
  105. evalscope/models/openai_model.py +103 -0
  106. evalscope/models/template.py +1446 -0
  107. evalscope/perf/__init__.py +0 -0
  108. evalscope/perf/_logging.py +32 -0
  109. evalscope/perf/api_plugin_base.py +60 -0
  110. evalscope/perf/custom_api.py +87 -0
  111. evalscope/perf/dashscope_api.py +84 -0
  112. evalscope/perf/dataset_plugin_base.py +64 -0
  113. evalscope/perf/datasets/__init__.py +0 -0
  114. evalscope/perf/datasets/line_by_line.py +18 -0
  115. evalscope/perf/datasets/longalpaca_12k.py +20 -0
  116. evalscope/perf/datasets/openqa.py +22 -0
  117. evalscope/perf/how_to_analysis_result.py +24 -0
  118. evalscope/perf/http_client.py +756 -0
  119. evalscope/perf/openai_api.py +130 -0
  120. evalscope/perf/plugin_registry.py +35 -0
  121. evalscope/perf/query_parameters.py +42 -0
  122. evalscope/perf/server_sent_event.py +43 -0
  123. evalscope/preprocess/__init__.py +1 -0
  124. evalscope/preprocess/tokenizers/__init__.py +0 -0
  125. evalscope/preprocess/tokenizers/gpt2_tokenizer.py +221 -0
  126. evalscope/registry/__init__.py +1 -0
  127. evalscope/registry/tasks/arc.yaml +29 -0
  128. evalscope/registry/tasks/bbh.yaml +27 -0
  129. evalscope/registry/tasks/bbh_mini.yaml +27 -0
  130. evalscope/registry/tasks/ceval.yaml +27 -0
  131. evalscope/registry/tasks/ceval_mini.yaml +27 -0
  132. evalscope/registry/tasks/cmmlu.yaml +27 -0
  133. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +28 -0
  134. evalscope/registry/tasks/general_qa.yaml +27 -0
  135. evalscope/registry/tasks/gsm8k.yaml +29 -0
  136. evalscope/registry/tasks/mmlu.yaml +29 -0
  137. evalscope/registry/tasks/mmlu_mini.yaml +27 -0
  138. evalscope/run.py +404 -0
  139. evalscope/run_arena.py +204 -0
  140. evalscope/run_ms.py +140 -0
  141. evalscope/summarizer.py +144 -0
  142. evalscope/third_party/__init__.py +1 -0
  143. evalscope/third_party/toolbench_static/__init__.py +3 -0
  144. evalscope/third_party/toolbench_static/eval.py +219 -0
  145. evalscope/third_party/toolbench_static/infer.py +278 -0
  146. evalscope/third_party/toolbench_static/llm/__init__.py +1 -0
  147. evalscope/third_party/toolbench_static/llm/swift_infer.py +45 -0
  148. evalscope/third_party/toolbench_static/toolbench_static.py +50 -0
  149. evalscope/tools/__init__.py +1 -0
  150. evalscope/tools/combine_reports.py +140 -0
  151. evalscope/tools/gen_mmlu_subject_mapping.py +90 -0
  152. evalscope/tools/rewrite_eval_results.py +95 -0
  153. evalscope/utils/__init__.py +4 -0
  154. evalscope/utils/arena_utils.py +247 -0
  155. evalscope/utils/completion_parsers.py +87 -0
  156. evalscope/utils/logger.py +64 -0
  157. evalscope/utils/task_cfg_parser.py +10 -0
  158. evalscope/utils/task_utils.py +19 -0
  159. evalscope/utils/utils.py +625 -0
  160. evalscope/version.py +4 -0
  161. evalscope-0.5.0.dist-info/METADATA +566 -0
  162. evalscope-0.5.0.dist-info/RECORD +165 -0
  163. evalscope-0.5.0.dist-info/WHEEL +5 -0
  164. evalscope-0.5.0.dist-info/entry_points.txt +3 -0
  165. evalscope-0.5.0.dist-info/top_level.txt +1 -0
evalscope/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ from .version import __release_datetime__, __version__
evalscope/backend/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ from evalscope.backend.opencompass.backend_manager import OpenCompassBackendManager
evalscope/backend/base.py ADDED
@@ -0,0 +1,27 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from typing import Union
+
+ from evalscope.utils import yaml_to_dict
+
+
+ class BackendManager:
+     def __init__(self, config: Union[str, dict], **kwargs):
+         """
+         BackendManager is the base class for the evaluation backend manager.
+         It provides basic configuration parsing, command generation, task submission, and result fetching.
+
+         config: str or dict, the configuration of the evaluation backend.
+             Either a path to a YAML configuration file, or a dictionary.
+         """
+         if isinstance(config, str):
+             self.config_d = yaml_to_dict(config)
+         else:
+             self.config_d = config
+
+         self.kwargs = kwargs
+
+     def run(self, *args, **kwargs):
+         """
+         Run the evaluation backend.
+         """
+         raise NotImplementedError
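
A concrete backend subclasses BackendManager and overrides run(); a minimal sketch follows (the DummyBackendManager name is hypothetical, not part of the package):

    from evalscope.backend.base import BackendManager

    class DummyBackendManager(BackendManager):
        # Hypothetical subclass: the base __init__ parses a dict or YAML path into self.config_d
        def run(self, *args, **kwargs):
            print(f'Running with config: {self.config_d}')

    DummyBackendManager(config={'datasets': ['mmlu']}).run()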
evalscope/backend/opencompass/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ from evalscope.backend.opencompass.backend_manager import OpenCompassBackendManager
evalscope/backend/opencompass/api_meta_template.py ADDED
@@ -0,0 +1,64 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from typing import Dict, Any, List
+
+ """
+ The API meta template for OpenCompass.
+
+ See more details in the OpenCompass documentation: https://opencompass.org.cn/doc
+ Search for `meta template` in the documentation.
+ """
+
+
+ class MetaTemplateType:
+
+     default_api_meta_template_oc = 'default-api-meta-template-oc'
+
+     @classmethod
+     def get_template_name_list(cls) -> List[str]:
+         res = []
+         for k in cls.__dict__.keys():
+             if k.startswith('__') or k == 'get_template_name_list':
+                 continue
+             res.append(cls.__dict__[k])
+         return res
+
+
+ TEMPLATE_MAPPING: Dict[str, Dict[str, Any]] = {}
+
+
+ def register_template(name: str,
+                       template: Dict[str, Any],
+                       exists_ok: bool = False):
+     if not exists_ok and name in TEMPLATE_MAPPING:
+         raise ValueError(f"The `{name}` has already been registered in the TEMPLATE_MAPPING.")
+
+     TEMPLATE_MAPPING[name] = template
+
+
+ def get_template(name: str) -> Dict[str, Any]:
+     if name not in TEMPLATE_MAPPING:
+         raise ValueError(f"The `{name}` has not been registered in the TEMPLATE_MAPPING.")
+
+     return TEMPLATE_MAPPING[name]
+
+
+ # Default API meta template for OpenCompass
+ register_template(
+     name=MetaTemplateType.default_api_meta_template_oc,
+     template=dict(
+         round=[
+             dict(role='HUMAN', api_role='HUMAN'),
+             dict(role='BOT', api_role='BOT', generate=True)
+         ],
+         reserved_roles=[
+             dict(role='SYSTEM', api_role='SYSTEM'),
+         ],
+     )
+ )
+
+
+ if __name__ == '__main__':
+     res = MetaTemplateType.get_template_name_list()
+     print(res)
+
+     print(get_template(MetaTemplateType.default_api_meta_template_oc))
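
The register_template/get_template pair also supports user-defined templates; a minimal sketch, where the `my-api-meta-template` name and its role layout are hypothetical but mirror the round/reserved_roles structure registered above:

    from evalscope.backend.opencompass.api_meta_template import register_template, get_template

    # Hypothetical custom template, following the default round/reserved_roles shape
    register_template(
        name='my-api-meta-template',
        template=dict(
            round=[
                dict(role='HUMAN', api_role='HUMAN'),
                dict(role='BOT', api_role='BOT', generate=True),
            ],
            reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
        ),
    )

    print(get_template('my-api-meta-template'))

A model config can then reference a template by name through its 'meta_template' key, which OpenCompassBackendManager.run() below resolves via get_template().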
evalscope/backend/opencompass/backend_manager.py ADDED
@@ -0,0 +1,247 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from enum import Enum
+ from typing import Optional, Union
+ import subprocess
+ from dataclasses import asdict
+ import tempfile
+
+ from evalscope.utils import is_module_installed, get_module_path, get_valid_list
+ from evalscope.backend.base import BackendManager
+ from evalscope.backend.opencompass.api_meta_template import get_template
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ class CmdMode(Enum):
+
+     # The basic mode is to run the command directly,
+     # e.g. `python -m run --models model1 model2 --datasets dataset1 dataset2`
+     BASIC = 'basic'
+
+     # The script mode is to run the command with a script,
+     # e.g. `python -m run your_config_script.py`
+     SCRIPT = 'script'
+
+
+ class RunMode(Enum):
+
+     # The command mode is to run the command directly with the command line.
+     CMD = 'cmd'
+
+     # The function mode is to run the command with a function call -- run_task().
+     FUNCTION = 'function'
+
+
+ class OpenCompassBackendManager(BackendManager):
+
+     def __init__(self, config: Union[str, dict], **kwargs):
+         """
+         The backend manager for OpenCompass.
+
+         Args:
+             config: Union[str, dict], the configuration YAML file or the configuration dictionary.
+                 attributes:
+                     datasets: list, the datasets.
+                     models: list, the models.
+                     work_dir (Optional): str, the working directory. Defaults to None, which means the current directory.
+                     dry_run (Optional): bool, the dry-run flag. Defaults to False.
+                     debug (Optional): bool, the debug flag. Defaults to False.
+                     reuse (Optional): str, reuse previous outputs & results. Defaults to None.
+                     generation_kwargs (Optional): dict, the generation config. Defaults to {}.
+                     limit (Optional): int or float or str, the limit on the number of examples. Defaults to None.
+                         If limit is a string, it should be in the format of '[start:end]'.
+
+             example:
+                 # TODO: add demo config
+                 config = dict(
+                     datasets=[mmlu, ceval],
+                     models=[...],
+                     ...
+                 )
+
+             **kwargs: the keyword arguments.
+         """
+
+         self._check_env()
+         super().__init__(config, **kwargs)
+
+         from opencompass.cli.arguments import Arguments as OpenCompassArguments
+         self.args = OpenCompassArguments(**self.config_d)
+
+     @property
+     def cmd(self):
+         return self.get_cmd()
+
+     @staticmethod
+     def _check_env():
+         if is_module_installed('opencompass'):
+             logger.info('Please make sure you have installed the `ms-opencompass`: `pip install ms-opencompass`')
+         else:
+             raise ModuleNotFoundError('Please install the `ms-opencompass` first: `pip install ms-opencompass`')
+
+     @staticmethod
+     def get_restore_arg(arg_name: str, arg_val: bool):
+         if arg_val:
+             return f'--{arg_name}'
+         else:
+             return ''
+
+     @staticmethod
+     def get_arg_with_default(arg_name: str, arg_val: Optional[str] = None):
+         if arg_val:
+             return f'--{arg_name} {arg_val}'
+         else:
+             return ''
+
+     def load_task_template(self):
+         """
+         Load the initial OpenCompass task template from the task config file.
+
+         Returns:
+             (mmengine.config.config.Config), the initial task template config.
+         """
+         from opencompass.utils.run import get_config_from_arg
+
+         template_config_path = get_module_path('evalscope.backend.opencompass.tasks.eval_api')
+         self.args.config = template_config_path
+         return get_config_from_arg(self.args)
+
+     @staticmethod
+     def list_datasets(return_details: bool = False):
+         from opencompass.utils.run import get_config_from_arg
+         from dataclasses import dataclass
+
+         @dataclass
+         class TempArgs:
+             config: str
+             accelerator: str = None
+
+         template_config_path = get_module_path('evalscope.backend.opencompass.tasks.eval_api')
+         template_cfg = get_config_from_arg(TempArgs(config=template_config_path))
+
+         # e.g. ['mmlu', 'ceval', 'openai_humaneval', ...]
+         dataset_show_names = list(set([_dataset['dataset_name'] for _dataset in template_cfg.datasets]))
+
+         if return_details:
+             return dataset_show_names, template_cfg.datasets
+         else:
+             return dataset_show_names
+
+     def get_task_args(self):
+         return self.args
+
+     def get_cmd(self, cmd_mode: str = CmdMode.BASIC):
+
+         if cmd_mode == CmdMode.BASIC:
+             assert self.args.datasets, 'The datasets are required.'
+             assert self.args.models, 'The models are required.'
+
+             cmd_str = f'python -m run_oc ' \
+                       f'--models {" ".join(self.args.models)} ' \
+                       f'--datasets {" ".join(self.args.datasets)} ' \
+                       f'{self.get_restore_arg("dry-run", self.args.dry_run)} ' \
+                       f'{self.get_arg_with_default("work-dir", self.args.work_dir)}'
+
+         elif cmd_mode == CmdMode.SCRIPT:
+             assert self.args.config, 'The script file is required.'
+             cmd_str = f'python -m run_oc {self.args.config}'
+         else:
+             raise ValueError(f'Unsupported command mode: {cmd_mode}')
+
+         return cmd_str
+
+     def run(self, run_mode: str = RunMode.FUNCTION):
+         """
+         The entry function to run the OpenCompass task.
+
+         Args:
+             run_mode: str, the running mode, e.g. 'function' or 'cmd'.
+
+         Returns:
+             None
+         """
+         if run_mode == RunMode.FUNCTION:
+             from opencompass.cli.main import run_task
+             from opencompass.cli.arguments import ApiModelConfig
+
+             assert isinstance(self.args.models, list) and len(self.args.models) > 0, 'The models are required.'
+
+             tmp_model_d: dict = self.args.models[0]
+             assert 'path' in tmp_model_d and 'openai_api_base' in tmp_model_d, \
+                 f"Got invalid model config: {tmp_model_d}. \nTo get valid format: " \
+                 "{'path': 'qwen-7b-chat', 'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions'}"
+
+             # Get valid datasets
+             dataset_names = self.args.datasets  # e.g. ['mmlu', 'ceval']
+             dataset_names_all, real_dataset_all = self.list_datasets(return_details=True)
+
+             if not dataset_names:
+                 logger.warning(f'No datasets are specified in the config. Use all the datasets: {dataset_names_all}')
+                 valid_dataset_names = dataset_names_all
+             else:
+                 valid_dataset_names, invalid_dataset_names = get_valid_list(dataset_names, dataset_names_all)
+                 if len(invalid_dataset_names) > 0:
+                     logger.error(f'Invalid datasets: {invalid_dataset_names}, '
+                                  f'refer to the following list to get proper dataset name: {dataset_names_all}')
+                 assert len(valid_dataset_names) > 0, f'No valid datasets. ' \
+                     f'To get the valid datasets, please refer to {dataset_names_all}'
+
+             valid_datasets = [_dataset for _dataset in real_dataset_all if _dataset['dataset_name'] in valid_dataset_names]
+             for _dataset in valid_datasets:
+                 _dataset.pop('dataset_name')
+                 _dataset['reader_cfg']['test_range'] = self.args.limit
+
+             # Get valid models
+             models = []
+             for model_d in self.args.models:
+                 # model_d: {'path': 'qwen-7b-chat',
+                 #           'meta_template': 'default-api-meta-template-oc',  # Optional
+                 #           'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions'}
+                 # Note: 'meta_template' can be a dict or a string, default is None
+
+                 if 'meta_template' in model_d and isinstance(model_d['meta_template'], str):
+                     model_d['meta_template'] = get_template(model_d['meta_template'])
+
+                 # Use the 'path' value as the model 'abbr'
+                 model_d['abbr'] = model_d['path']
+
+                 model_config = ApiModelConfig(**model_d)
+                 models.append(asdict(model_config))
+
+             # Load the initial task template and override configs
+             template_cfg = self.load_task_template()
+             template_cfg.datasets = valid_datasets
+             template_cfg.models = models
+
+             # Dump task config to a temporary file
+             tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.py', mode='w')
+             template_cfg.dump(tmp_file.name)
+             # logger.info(f'The task config is dumped to: {tmp_file.name}')
+             self.args.config = tmp_file.name
+
+             # Submit the task
+             logger.info(f'*** Run task with config: {self.args.config} \n')
+             run_task(self.args)
+
+         # TODO: add more arguments for the command line
+         elif run_mode == RunMode.CMD:
+             subprocess.run(self.cmd, text=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+         else:
+             raise ValueError(f'Unsupported run mode: {run_mode}')
+
+
+ if __name__ == '__main__':
+
+     # OpenCompassBackendManager.list_datasets()
+     # ['mmlu', 'WSC', 'DRCD', 'chid', 'gsm8k', 'AX_g', 'BoolQ', 'cmnli', 'ARC_e', 'ocnli_fc', 'summedits', 'MultiRC', 'GaokaoBench', 'obqa', 'math', 'agieval', 'hellaswag', 'RTE', 'race', 'flores', 'ocnli', 'strategyqa', 'triviaqa', 'WiC', 'COPA', 'commonsenseqa', 'piqa', 'nq', 'mbpp', 'csl', 'Xsum', 'CB', 'tnews', 'ARC_c', 'afqmc', 'eprstmt', 'ReCoRD', 'bbh', 'TheoremQA', 'CMRC', 'AX_b', 'siqa', 'storycloze', 'humaneval', 'cluewsc', 'winogrande', 'lambada', 'ceval', 'bustm', 'C3', 'lcsts']
+
+     # 'meta_template': 'default-api-meta-template-oc',
+     # models: llama3-8b-instruct, qwen-7b-chat
+     oc_backend_manager = OpenCompassBackendManager(
+         config={'datasets': ['mmlu', 'ceval', 'ARC_c', 'gsm8k'],
+                 'models': [{'path': 'llama3-8b-instruct', 'openai_api_base': 'http://127.0.0.1:8000/v1/chat/completions'}],
+                 'limit': 5
+                 }
+     )
+     oc_backend_manager.run()
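
Besides the function mode shown in the `__main__` block, the manager can emit and execute a shell command; a minimal sketch, assuming string-form model names are acceptable to the underlying Arguments dataclass in CMD mode (the model name below is a placeholder):

    from evalscope.backend.opencompass.backend_manager import OpenCompassBackendManager, RunMode

    # Hypothetical string-form model names; get_cmd() joins models and datasets into a CLI string
    manager = OpenCompassBackendManager(config={'datasets': ['mmlu', 'gsm8k'],
                                                'models': ['qwen-7b-chat']})
    print(manager.cmd)  # e.g. 'python -m run_oc --models qwen-7b-chat --datasets mmlu gsm8k ...'
    manager.run(run_mode=RunMode.CMD)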
evalscope/backend/opencompass/tasks/__init__.py ADDED
@@ -0,0 +1 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
evalscope/backend/opencompass/tasks/eval_api.py ADDED
@@ -0,0 +1,30 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from mmengine.config import read_base
+ from opencompass.partitioners import NaivePartitioner
+ from opencompass.runners import LocalRunner
+ from opencompass.tasks import OpenICLInferTask
+
+
+ with read_base():
+     from evalscope.backend.opencompass.tasks.eval_datasets import datasets
+
+ # 1. Get datasets
+ # Note: evaluation in the OpenAI API format needs a special humaneval postprocessor
+ for _dataset in datasets:
+     if _dataset['path'] == 'openai_humaneval':
+         from opencompass.datasets.humaneval import humaneval_gpt_postprocess
+         _dataset['eval_cfg']['pred_postprocessor']['type'] = humaneval_gpt_postprocess
+
+
+ # 2. Get models: placeholder only; the real model information is filled in from the command line
+ # See more templates in `opencompass.cli.arguments.ApiModelConfig`
+ models = []
+
+ # 3. Get infer config
+ infer = dict(
+     partitioner=dict(type=NaivePartitioner),
+     runner=dict(
+         type=LocalRunner,
+         max_num_workers=4,
+         task=dict(type=OpenICLInferTask)),
+ )
evalscope/backend/opencompass/tasks/eval_datasets.py ADDED
@@ -0,0 +1,71 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from mmengine.config import read_base
+
+ with read_base():
+     from opencompass.configs.datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets
+     from opencompass.configs.datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
+     from opencompass.configs.datasets.agieval.agieval_gen_64afd3 import agieval_datasets
+     from opencompass.configs.datasets.GaokaoBench.GaokaoBench_gen_5cfe9e import GaokaoBench_datasets
+     from opencompass.configs.datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
+     from opencompass.configs.datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
+     from opencompass.configs.datasets.CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets
+     from opencompass.configs.datasets.CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
+     from opencompass.configs.datasets.CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
+     from opencompass.configs.datasets.CLUE_afqmc.CLUE_afqmc_gen_901306 import afqmc_datasets
+     from opencompass.configs.datasets.CLUE_cmnli.CLUE_cmnli_gen_1abf97 import cmnli_datasets
+     from opencompass.configs.datasets.CLUE_ocnli.CLUE_ocnli_gen_c4cb6c import ocnli_datasets
+     from opencompass.configs.datasets.FewCLUE_bustm.FewCLUE_bustm_gen_634f41 import bustm_datasets
+     from opencompass.configs.datasets.FewCLUE_chid.FewCLUE_chid_gen_0a29a2 import chid_datasets
+     from opencompass.configs.datasets.FewCLUE_cluewsc.FewCLUE_cluewsc_gen_c68933 import cluewsc_datasets
+     from opencompass.configs.datasets.FewCLUE_csl.FewCLUE_csl_gen_28b223 import csl_datasets
+     from opencompass.configs.datasets.FewCLUE_eprstmt.FewCLUE_eprstmt_gen_740ea0 import eprstmt_datasets
+     from opencompass.configs.datasets.FewCLUE_ocnli_fc.FewCLUE_ocnli_fc_gen_f97a97 import ocnli_fc_datasets
+     from opencompass.configs.datasets.FewCLUE_tnews.FewCLUE_tnews_gen_b90e4a import tnews_datasets
+     from opencompass.configs.datasets.lcsts.lcsts_gen_8ee1fe import lcsts_datasets
+     from opencompass.configs.datasets.lambada.lambada_gen_217e11 import lambada_datasets
+     from opencompass.configs.datasets.storycloze.storycloze_gen_7f656a import storycloze_datasets
+     from opencompass.configs.datasets.SuperGLUE_AX_b.SuperGLUE_AX_b_gen_4dfefa import AX_b_datasets
+     from opencompass.configs.datasets.SuperGLUE_AX_g.SuperGLUE_AX_g_gen_68aac7 import AX_g_datasets
+     from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_gen_883d50 import BoolQ_datasets
+     from opencompass.configs.datasets.SuperGLUE_CB.SuperGLUE_CB_gen_854c6c import CB_datasets
+     from opencompass.configs.datasets.SuperGLUE_COPA.SuperGLUE_COPA_gen_91ca53 import COPA_datasets
+     from opencompass.configs.datasets.SuperGLUE_MultiRC.SuperGLUE_MultiRC_gen_27071f import MultiRC_datasets
+     from opencompass.configs.datasets.SuperGLUE_RTE.SuperGLUE_RTE_gen_68aac7 import RTE_datasets
+     from opencompass.configs.datasets.SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets
+     from opencompass.configs.datasets.SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
+     from opencompass.configs.datasets.SuperGLUE_WSC.SuperGLUE_WSC_gen_7902a7 import WSC_datasets
+     from opencompass.configs.datasets.race.race_gen_69ee4f import race_datasets
+     from opencompass.configs.datasets.Xsum.Xsum_gen_31397e import Xsum_datasets
+     from opencompass.configs.datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
+     from opencompass.configs.datasets.summedits.summedits_gen_315438 import summedits_datasets
+     from opencompass.configs.datasets.math.math_gen_265cce import math_datasets
+     from opencompass.configs.datasets.hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets
+     from opencompass.configs.datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
+     from opencompass.configs.datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
+     from opencompass.configs.datasets.piqa.piqa_gen_1194eb import piqa_datasets
+     from opencompass.configs.datasets.siqa.siqa_gen_e78df3 import siqa_datasets
+     from opencompass.configs.datasets.strategyqa.strategyqa_gen_1180a7 import strategyqa_datasets
+     from opencompass.configs.datasets.winogrande.deprecated_winogrande_gen_a9ede5 import winogrande_datasets
+     from opencompass.configs.datasets.obqa.obqa_gen_9069e4 import obqa_datasets
+     from opencompass.configs.datasets.nq.nq_gen_c788f6 import nq_datasets
+     from opencompass.configs.datasets.triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
+     from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import bbh_datasets
+
+     # Note: to be supported
+     # from opencompass.configs.datasets.flores.flores_gen_806ede import flores_datasets
+     # from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets
+     # from opencompass.configs.datasets.commonsenseqa.commonsenseqa_gen_c946f2 import commonsenseqa_datasets
+
+
+ datasets = []
+ _locals = {k: v for k, v in locals().items() if k.endswith('_datasets')}
+
+ for k, v in _locals.items():
+     for _dataset in v:
+         _dataset['dataset_name'] = k.replace('_datasets', '')
+         datasets.append(_dataset)
+
+
+ if __name__ == '__main__':
+     for _dataset in datasets:
+         print(_dataset)
evalscope/backend/vlm_eval_kit/__init__.py ADDED
@@ -0,0 +1 @@
+ from evalscope.backend.vlm_eval_kit.backend_manager import VLMEvalKitBackendManager
evalscope/backend/vlm_eval_kit/backend_manager.py ADDED
@@ -0,0 +1,153 @@
+ from typing import Optional, Union
+ from evalscope.utils import is_module_installed, get_module_path, get_valid_list, yaml_to_dict, json_to_dict
+ from evalscope.backend.base import BackendManager
+ from evalscope.utils.logger import get_logger
+ from functools import partial
+ import subprocess
+ from dataclasses import dataclass
+ import copy
+
+ logger = get_logger()
+
+
+ class ExecutionMode:
+
+     # The command mode is to run the command directly with the command line.
+     CMD = 'cmd'
+
+     # The function mode is to run the command with a function call -- run_task().
+     FUNCTION = 'function'
+
+
+ class VLMEvalKitBackendManager(BackendManager):
+     def __init__(self, config: Union[str, dict], **kwargs):
+         """BackendManager for the VLM Evaluation Kit
+
+         Args:
+             config (Union[str, dict]): the configuration YAML file or the configuration dictionary
+         """
+         self._check_env()
+         super().__init__(config, **kwargs)
+         from vlmeval.utils.arguments import Arguments as VLMEvalArguments
+         self.args = VLMEvalArguments(**self.config_d)
+
+         self.valid_models = self.list_supported_VLMs()
+         self.valid_model_names = list(self.valid_models.keys())
+         self.valid_datasets = self.list_supported_datasets()
+
+         self._check_valid()
+
+     def _check_valid(self):
+         # Ensure that both model and datasets are provided
+         if not self.args.data or not self.args.model:
+             raise ValueError('** Args: Please provide model and datasets. **')
+
+         # Check datasets
+         valid_datasets, invalid_datasets = get_valid_list(self.args.data, self.valid_datasets)
+         assert len(invalid_datasets) == 0, f'Invalid datasets: {invalid_datasets}, ' \
+             f'refer to the following list to get proper dataset name: {self.valid_datasets}'
+
+         # Check model
+         if isinstance(self.args.model[0], dict):
+             model_names = [model['name'] for model in self.args.model]
+             valid_model_names, invalid_model_names = get_valid_list(model_names, self.valid_model_names)
+             assert len(invalid_model_names) == 0, f'Invalid models: {invalid_model_names}, ' \
+                 f'refer to the following list to get proper model name: {self.valid_model_names}'
+
+             # set model_cfg
+             new_model_names = []
+             for model_cfg in self.args.model:
+                 model_name = model_cfg['name']
+                 model_class = self.valid_models[model_name]
+                 if model_name == 'CustomAPIModel':
+                     model_type = model_cfg['type']
+                     self.valid_models.update({
+                         model_type: partial(model_class,
+                                             model=model_type,
+                                             **model_cfg)
+                     })
+                     new_model_names.append(model_type)
+                 else:
+                     remain_cfg = copy.deepcopy(model_cfg)
+                     del remain_cfg['name']  # remove the unused arg
+
+                     self.valid_models[model_name] = partial(model_class, **remain_cfg)
+                     new_model_names.append(model_name)
+
+             self.args.model = new_model_names
+
+         elif isinstance(self.args.model[0], str):
+             valid_model_names, invalid_model_names = get_valid_list(self.args.model, self.valid_model_names)
+             assert len(invalid_model_names) == 0, f'Invalid models: {invalid_model_names}, ' \
+                 f'refer to the following list to get proper model name: {self.valid_model_names}'
+
+     @property
+     def cmd(self):
+         return self.get_cmd()
+
+     @staticmethod
+     def list_supported_VLMs():
+         from vlmeval.config import supported_VLM
+         return supported_VLM
+
+     @staticmethod
+     def list_supported_datasets():
+         from vlmeval.dataset import SUPPORTED_DATASETS
+         return SUPPORTED_DATASETS
+
+     @staticmethod
+     def _check_env():
+         if is_module_installed('vlmeval'):
+             logger.info('Please make sure you have installed the `ms-vlmeval`: `pip install ms-vlmeval`')
+         else:
+             raise ModuleNotFoundError('Please install the `ms-vlmeval` first: `pip install ms-vlmeval`')
+
+     @staticmethod
+     def get_restore_arg(arg_name: str, arg_val: bool):
+         if arg_val:
+             return f'--{arg_name}'
+         else:
+             return ''
+
+     @staticmethod
+     def get_arg_with_default(arg_name: str, arg_val: Optional[str] = None):
+         if arg_val:
+             return f'--{arg_name} {arg_val}'
+         else:
+             return ''
+
+     def get_cmd(self):
+         assert self.args.data, 'The datasets are required.'
+         assert self.args.model, 'The models are required.'
+
+         cmd_str = f'python -m vlmeval ' \
+                   f'--model {" ".join(self.args.model)} ' \
+                   f'--data {" ".join(self.args.data)} ' \
+                   f'{self.get_restore_arg("verbose", self.args.verbose)} ' \
+                   f'{self.get_restore_arg("ignore", self.args.ignore)} ' \
+                   f'{self.get_restore_arg("rerun", self.args.rerun)} ' \
+                   f'{self.get_arg_with_default("work-dir", self.args.work_dir)} ' \
+                   f'{self.get_arg_with_default("limit", self.args.limit)} ' \
+                   f'{self.get_arg_with_default("mode", self.args.mode)} ' \
+                   f'{self.get_arg_with_default("nproc", self.args.nproc)} ' \
+                   f'{self.get_arg_with_default("judge", self.args.judge)} ' \
+                   f'{self.get_arg_with_default("retry", self.args.retry)} '
+
+         return cmd_str
+
+     def run(self, run_mode: str = ExecutionMode.FUNCTION):
+         if run_mode == ExecutionMode.CMD:
+             logger.info(f'** Run command: {self.cmd}')
+             try:
+                 subprocess.run(self.cmd, check=True, text=True, shell=True)
+             except subprocess.CalledProcessError as e:
+                 logger.error(f'** Run command failed: {e.stderr}')
+                 raise
+
+         elif run_mode == ExecutionMode.FUNCTION:
+             from vlmeval.run import run_task
+             logger.info(f'*** Run task with config: {self.args} \n')
+             run_task(self.args)
+
+         else:
+             raise NotImplementedError
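
By analogy with the OpenCompass example, a VLMEvalKit run is configured with `data` and `model` lists; in the sketch below the 'CustomAPIModel' entry follows the dict shape `_check_valid()` handles, while the dataset name, model type, and extra API kwargs are placeholders:

    from evalscope.backend.vlm_eval_kit.backend_manager import VLMEvalKitBackendManager

    # 'CustomAPIModel' configs carry a 'type' (registered as the model name) plus extra kwargs
    manager = VLMEvalKitBackendManager(
        config={'data': ['MMBench_DEV_EN'],  # placeholder dataset name
                'model': [{'name': 'CustomAPIModel',
                           'type': 'qwen-vl-chat',
                           'api_base': 'http://127.0.0.1:8000/v1/chat/completions'}]})
    manager.run()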
evalscope/benchmarks/__init__.py ADDED
@@ -0,0 +1,4 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ from evalscope.benchmarks.benchmark import Benchmark
+ from evalscope.benchmarks.data_adapter import DataAdapter
evalscope/benchmarks/arc/__init__.py ADDED
@@ -0,0 +1,5 @@
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+
+ from evalscope.benchmarks.arc.arc_adapter import ARCAdapter, DATASET_ID, SUBSET_LIST
+ from evalscope.benchmarks.arc.arc_adapter import ARCAdapter as DataAdapterClass
+ from evalscope.models.model_adapter import MultiChoiceModelAdapter as ModelAdapterClass  # noqa
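
The DataAdapterClass/ModelAdapterClass aliases give each benchmark package a uniform interface, so adapters can be resolved from the module name alone; a minimal illustrative sketch (the dynamic lookup itself is not evalscope code):

    import importlib

    # Each benchmark __init__ exports the same two aliases bound to different concrete classes
    benchmark = importlib.import_module('evalscope.benchmarks.arc')
    data_adapter_cls = benchmark.DataAdapterClass    # ARCAdapter
    model_adapter_cls = benchmark.ModelAdapterClass  # MultiChoiceModelAdapter
    print(data_adapter_cls.__name__, model_adapter_cls.__name__)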