evalscope 0.8.2__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +11 -3
  3. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  4. evalscope/backend/rag_eval/utils/llm.py +1 -1
  5. evalscope/benchmarks/__init__.py +20 -1
  6. evalscope/benchmarks/arc/__init__.py +0 -5
  7. evalscope/benchmarks/arc/arc_adapter.py +24 -102
  8. evalscope/benchmarks/bbh/__init__.py +0 -4
  9. evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
  10. evalscope/benchmarks/benchmark.py +70 -59
  11. evalscope/benchmarks/ceval/__init__.py +0 -5
  12. evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
  13. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  14. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
  15. evalscope/benchmarks/competition_math/__init__.py +0 -5
  16. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  17. evalscope/benchmarks/data_adapter.py +115 -87
  18. evalscope/benchmarks/general_qa/__init__.py +0 -5
  19. evalscope/benchmarks/general_qa/general_qa_adapter.py +23 -79
  20. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  21. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +21 -101
  22. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  23. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +32 -99
  24. evalscope/benchmarks/humaneval/__init__.py +0 -4
  25. evalscope/benchmarks/humaneval/humaneval_adapter.py +18 -120
  26. evalscope/benchmarks/ifeval/__init__.py +0 -0
  27. evalscope/benchmarks/ifeval/ifeval_adapter.py +57 -0
  28. evalscope/benchmarks/ifeval/instructions.py +1478 -0
  29. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  30. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  31. evalscope/benchmarks/ifeval/utils.py +134 -0
  32. evalscope/benchmarks/iquiz/__init__.py +0 -0
  33. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  34. evalscope/benchmarks/mmlu/__init__.py +0 -5
  35. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
  36. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  37. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  38. evalscope/benchmarks/race/__init__.py +0 -5
  39. evalscope/benchmarks/race/race_adapter.py +26 -123
  40. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  41. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
  42. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  43. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
  44. evalscope/cli/cli.py +2 -0
  45. evalscope/cli/start_app.py +29 -0
  46. evalscope/collections/__init__.py +3 -0
  47. evalscope/collections/evaluator.py +198 -0
  48. evalscope/collections/sampler.py +138 -0
  49. evalscope/collections/schema.py +126 -0
  50. evalscope/config.py +7 -5
  51. evalscope/constants.py +9 -26
  52. evalscope/evaluator/evaluator.py +87 -121
  53. evalscope/evaluator/reviewer/auto_reviewer.py +12 -4
  54. evalscope/metrics/__init__.py +3 -0
  55. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  56. evalscope/metrics/math_accuracy.py +193 -50
  57. evalscope/metrics/metrics.py +18 -6
  58. evalscope/metrics/named_metrics.py +17 -0
  59. evalscope/metrics/rouge_metric.py +13 -8
  60. evalscope/models/__init__.py +14 -1
  61. evalscope/models/base_adapter.py +52 -0
  62. evalscope/models/chat_adapter.py +138 -0
  63. evalscope/models/choice_adapter.py +211 -0
  64. evalscope/models/custom_adapter.py +67 -0
  65. evalscope/models/local_model.py +74 -0
  66. evalscope/models/model.py +141 -0
  67. evalscope/models/server_adapter.py +111 -0
  68. evalscope/perf/__init__.py +1 -0
  69. evalscope/perf/main.py +0 -1
  70. evalscope/perf/plugin/api/custom_api.py +1 -1
  71. evalscope/perf/plugin/api/openai_api.py +1 -1
  72. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  73. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  74. evalscope/report/__init__.py +5 -0
  75. evalscope/report/app.py +506 -0
  76. evalscope/report/combinator.py +73 -0
  77. evalscope/report/generator.py +80 -0
  78. evalscope/report/utils.py +133 -0
  79. evalscope/run.py +48 -72
  80. evalscope/run_arena.py +1 -1
  81. evalscope/summarizer.py +1 -1
  82. evalscope/utils/__init__.py +1 -1
  83. evalscope/utils/chat_service.py +5 -4
  84. evalscope/utils/io_utils.py +8 -0
  85. evalscope/utils/logger.py +5 -0
  86. evalscope/utils/model_utils.py +15 -2
  87. evalscope/utils/utils.py +3 -25
  88. evalscope/version.py +2 -2
  89. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/METADATA +115 -21
  90. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/RECORD +99 -78
  91. tests/cli/test_collection.py +57 -0
  92. tests/cli/test_run.py +52 -1
  93. tests/rag/test_mteb.py +3 -2
  94. evalscope/models/api/__init__.py +0 -3
  95. evalscope/models/dummy_chat_model.py +0 -49
  96. evalscope/models/model_adapter.py +0 -525
  97. evalscope/models/openai_model.py +0 -103
  98. evalscope/tools/__init__.py +0 -1
  99. evalscope/tools/combine_reports.py +0 -133
  100. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  101. /evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +0 -0
  102. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  103. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/LICENSE +0 -0
  104. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/WHEEL +0 -0
  105. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/entry_points.txt +0 -0
  106. {evalscope-0.8.2.dist-info → evalscope-0.10.0.dist-info}/top_level.txt +0 -0
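Beyond the new benchmarks (ifeval, iquiz, mmlu_pro), the list shows the 0.8.2 single-file model layer (model_adapter.py, openai_model.py, dummy_chat_model.py) being split into per-role adapters under evalscope/models/, and tools/combine_reports.py giving way to the new evalscope/report/ package. For orientation, a minimal sketch of how an evaluation is typically launched through the entry points that appear in this list (evalscope/run.py, evalscope/config.py); the exact argument names are assumptions, not confirmed by this diff:

# Sketch only: entry points taken from the file list above; argument
# names (model, datasets, limit) are assumed, not confirmed by this diff.
from evalscope.config import TaskConfig
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # placeholder model id
    datasets=['gsm8k'],                  # any adapter from the list above
    limit=5,                             # small smoke-test run
)
run_task(task_cfg=task_cfg)

The deleted files below are shown as full-file removal hunks.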
evalscope/models/openai_model.py (deleted)
@@ -1,103 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
-import openai
-import os
-import time
-
-from evalscope.models import ChatBaseModel
-from evalscope.utils.logger import get_logger
-
-logger = get_logger()
-
-
-class OpenAIModel(ChatBaseModel):
-    """
-    APIs of OpenAI models.
-    Available models: gpt-3.5-turbo, gpt-4
-    """
-
-    MAX_RETRIES = 3
-
-    def __init__(self, model_cfg: dict, **kwargs):
-        super(OpenAIModel, self).__init__(model_cfg=model_cfg, **kwargs)
-
-        openai_api_key = os.environ.get('OPENAI_API_KEY', None)
-        self.api_key = self.model_cfg.get('api_key', openai_api_key)
-
-        if not self.api_key:
-            logger.error('OpenAI API key is not provided, please set it in environment variable OPENAI_API_KEY')
-            # raise ValueError(
-            #     'OpenAI API key is not provided, '
-            #     'please set it in environment variable OPENAI_API_KEY')
-
-    def predict(self, model_id: str, inputs: dict, **kwargs) -> dict:
-
-        sys_prompt: str = inputs.get('sys_prompt', '')
-        user_prompt: str = inputs.get('user_prompt', '')
-
-        # model_id: str = kwargs.get('model_id', '')
-        temperature: float = kwargs.pop('temperature', 0.2)
-        max_tokens: int = kwargs.pop('max_tokens', 1024)
-        mode: str = kwargs.pop('mode', 'chat.completion')
-
-        logger.info(f'Using OpenAI model_id: {model_id}')
-
-        res = self._predict(
-            model_id=model_id,
-            sys_prompt=sys_prompt,
-            user_prompt=user_prompt,
-            temperature=temperature,
-            max_tokens=max_tokens,
-            mode=mode)
-
-        return res
-
-    def _predict(
-        self,
-        model_id,
-        sys_prompt,
-        user_prompt,
-        temperature,
-        max_tokens,
-        mode: str = 'chat.completion',
-    ) -> dict:
-
-        res = {}
-        openai.api_key = self.api_key
-
-        for i in range(self.MAX_RETRIES):
-            try:
-                if mode == 'chat.completion':
-                    resp = openai.ChatCompletion.create(
-                        model=model_id,
-                        messages=[{
-                            'role': 'system',
-                            'content': sys_prompt
-                        }, {
-                            'role': 'user',
-                            'content': user_prompt
-                        }],
-                        temperature=temperature,
-                        max_tokens=max_tokens)
-
-                    if resp:
-                        ans_text = resp['choices'][0]['message']['content']
-                        model_id = resp['model']
-                    else:
-                        logger.warning(f'OpenAI GPT API call failed: got empty response '
-                                       f'for input {sys_prompt} {user_prompt}')
-                        ans_text = ''
-                        model_id = ''
-
-                    res['ans_text'] = ans_text
-                    res['model_id'] = model_id
-                else:
-                    raise ValueError(f'Invalid mode: {mode}')
-
-                return res
-
-            except Exception as e:
-                logger.warning(f'OpenAI API call failed: {e}')
-                time.sleep(3)
-        logger.error(f'OpenAI API call failed after {self.MAX_RETRIES} retries')
-        return res
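The deleted OpenAIModel above wraps the legacy pre-1.0 openai.ChatCompletion interface, which was removed in openai>=1.0; judging by the file list, its role passes to the new evalscope/models/server_adapter.py, whose contents are not shown in this diff. For reference, a hedged sketch of the same retrying call against the current v1 client (chat_once is a hypothetical helper, not evalscope API):

# Sketch only: a modern openai>=1.0 equivalent of the deleted call above.
# chat_once is hypothetical; how server_adapter.py does this is not shown.
import time

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment


def chat_once(model_id: str, sys_prompt: str, user_prompt: str,
              temperature: float = 0.2, max_tokens: int = 1024,
              max_retries: int = 3) -> dict:
    for _ in range(max_retries):
        try:
            resp = client.chat.completions.create(
                model=model_id,
                messages=[{'role': 'system', 'content': sys_prompt},
                          {'role': 'user', 'content': user_prompt}],
                temperature=temperature,
                max_tokens=max_tokens)
            return {'ans_text': resp.choices[0].message.content, 'model_id': resp.model}
        except Exception:
            time.sleep(3)  # fixed 3s backoff, mirroring the deleted code
    return {}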
evalscope/tools/__init__.py (deleted)
@@ -1 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
evalscope/tools/combine_reports.py (deleted)
@@ -1,133 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
-import glob
-import json
-import os
-from collections import defaultdict
-from tabulate import tabulate
-
-from evalscope.utils.logger import get_logger
-
-logger = get_logger()
-"""
-Combine and generate table for reports of LLMs.
-"""
-
-
-def get_report(report_file: str):
-    data_d: dict = json.load(open(report_file, 'r'))
-    dataset_name = data_d['dataset_name']
-    model_name = data_d['model_name']
-    score = data_d['score']  # float or dict
-    metric = data_d['metric']
-    score_d = {}
-    if isinstance(score, dict):
-        score_d = score
-    elif isinstance(score, float):
-        score_d[metric] = score
-    else:
-        raise ValueError(f'Unknown score type: {type(score)}')
-    score_str = '\n'.join(['(' + dataset_name + '/' + k + ') ' + str(v) for k, v in score_d.items()])
-
-    return model_name, {'dataset_name': dataset_name, 'score': score_str}
-
-
-def get_model_reports(model_report_dir: str):
-    model_report_dir = os.path.normpath(model_report_dir)
-    report_files = glob.glob(os.path.join(model_report_dir, '**/*.json'))
-
-    model_reports_d = defaultdict(list)
-    for file_path in report_files:
-        model_name, report_d = get_report(file_path)
-        model_reports_d[model_name].append(report_d)
-
-    return model_reports_d
-
-
-def gen_table(reports_path_list: list):
-    table_values = []
-    headers = ['Model']
-    is_headers_set = False
-
-    for report_path in reports_path_list:
-        model_reports_d = get_model_reports(report_path)
-        for model_name, report_list in model_reports_d.items():
-            report_list = sorted(report_list, key=lambda x: x['dataset_name'])
-            if not is_headers_set:
-                headers.extend([x['dataset_name'] for x in report_list])
-                is_headers_set = True
-            single_row = []
-            single_row.append(model_name)
-            for single_report in report_list:
-                # e.g. '28.51 (acc)'
-                single_row.append(single_report['score'])
-            table_values.append(single_row)
-
-    report_table = tabulate(table_values, headers=headers, tablefmt='grid')
-    return report_table
-
-
-class ReportsRecorder:
-    COMMON_DATASET_PATH = []
-    CUSTOM_DATASET_PATH = []
-
-    def __init__(self, oss_url: str = '', endpoint: str = ''):
-        if oss_url and endpoint:
-            import oss2
-            from oss2.credentials import EnvironmentVariableCredentialsProvider
-
-            auth = oss2.ProviderAuth(EnvironmentVariableCredentialsProvider())
-            oss_url = oss_url.replace('oss://', '').split('/')
-            bucket_name = oss_url[0]
-
-            self.object_path = '/'.join(oss_url[1:])
-            self.bucket = oss2.Bucket(auth, endpoint, bucket_name)
-        else:
-            self.object_path = ''
-            self.bucket = None
-
-    def append_path(self, report_path: str, dataset_name: str):
-        if dataset_name == 'general_qa':
-            self.CUSTOM_DATASET_PATH.append(report_path)
-        else:
-            self.COMMON_DATASET_PATH.append(report_path)
-
-    def dump_reports(self, output_dir: str):
-        result = {'CommonDataset': [], 'CustomDataset': []}
-        for line in self.COMMON_DATASET_PATH:
-            with open(line, 'r') as f:
-                report = json.load(f)
-                result['CommonDataset'].append(report)
-        for line in self.CUSTOM_DATASET_PATH:
-            with open(line, 'r') as f:
-                report = json.load(f)
-                report.update({'name': os.path.basename(line)})
-                result['CustomDataset'].append(report)
-
-        os.makedirs(output_dir, exist_ok=True)
-        output_file_name = 'metric.json'
-        output_path = os.path.join(output_dir, output_file_name)
-        with open(output_path, 'w+') as f:
-            f.write(json.dumps(result, ensure_ascii=False, indent=4))
-
-        if self.bucket:
-            remote_path = os.path.join(self.object_path, output_file_name)
-            logger.info(f'** Upload report to oss: {remote_path}')
-            self.bucket.put_object_from_file(remote_path, output_path)
-
-
-if __name__ == '__main__':
-    report_dir_1 = '/to/path/20231129_020533_default_ZhipuAI_chatglm2-6b-base_none/reports'
-    report_dir_2 = '/to/path/20231129_020533_default_ZhipuAI_chatglm2-6b_none/reports'
-
-    report_table = gen_table([report_dir_1, report_dir_2])
-    print(report_table)
-
-    # ALL VALUES ONLY FOR EXAMPLE
-    # +--------------------------+-------------------+-------------+
-    # | Model                    | CompetitionMath   | GSM8K       |
-    # +==========================+===================+=============+
-    # | ZhipuAI_chatglm2-6b-base | 25.0 (acc)        | 30.50 (acc) |
-    # +--------------------------+-------------------+-------------+
-    # | ZhipuAI_chatglm2-6b      | 30.5 (acc)        | 40.50 (acc) |
-    # +--------------------------+-------------------+-------------+
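From get_report above, each per-dataset report JSON had to carry at least four fields; a minimal illustrative example of that shape (values invented):

# Minimal shape of one report JSON consumed by get_report(); values invented.
report = {
    'model_name': 'ZhipuAI_chatglm2-6b',  # becomes the table row label
    'dataset_name': 'gsm8k',              # becomes a column header
    'metric': 'acc',                      # used when 'score' is a bare float
    'score': 30.5,                        # float, or a dict of {metric: value}
}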
evalscope/tools/gen_mmlu_subject_mapping.py (deleted)
@@ -1,90 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
-# Note: refer to https://github.com/hendrycks/test/blob/master/categories.py
-
-subcategories = {
-    'abstract_algebra': ['math'],
-    'anatomy': ['health'],
-    'astronomy': ['physics'],
-    'business_ethics': ['business'],
-    'clinical_knowledge': ['health'],
-    'college_biology': ['biology'],
-    'college_chemistry': ['chemistry'],
-    'college_computer_science': ['computer science'],
-    'college_mathematics': ['math'],
-    'college_medicine': ['health'],
-    'college_physics': ['physics'],
-    'computer_security': ['computer science'],
-    'conceptual_physics': ['physics'],
-    'econometrics': ['economics'],
-    'electrical_engineering': ['engineering'],
-    'elementary_mathematics': ['math'],
-    'formal_logic': ['philosophy'],
-    'global_facts': ['other'],
-    'high_school_biology': ['biology'],
-    'high_school_chemistry': ['chemistry'],
-    'high_school_computer_science': ['computer science'],
-    'high_school_european_history': ['history'],
-    'high_school_geography': ['geography'],
-    'high_school_government_and_politics': ['politics'],
-    'high_school_macroeconomics': ['economics'],
-    'high_school_mathematics': ['math'],
-    'high_school_microeconomics': ['economics'],
-    'high_school_physics': ['physics'],
-    'high_school_psychology': ['psychology'],
-    'high_school_statistics': ['math'],
-    'high_school_us_history': ['history'],
-    'high_school_world_history': ['history'],
-    'human_aging': ['health'],
-    'human_sexuality': ['culture'],
-    'international_law': ['law'],
-    'jurisprudence': ['law'],
-    'logical_fallacies': ['philosophy'],
-    'machine_learning': ['computer science'],
-    'management': ['business'],
-    'marketing': ['business'],
-    'medical_genetics': ['health'],
-    'miscellaneous': ['other'],
-    'moral_disputes': ['philosophy'],
-    'moral_scenarios': ['philosophy'],
-    'nutrition': ['health'],
-    'philosophy': ['philosophy'],
-    'prehistory': ['history'],
-    'professional_accounting': ['other'],
-    'professional_law': ['law'],
-    'professional_medicine': ['health'],
-    'professional_psychology': ['psychology'],
-    'public_relations': ['politics'],
-    'security_studies': ['politics'],
-    'sociology': ['culture'],
-    'us_foreign_policy': ['politics'],
-    'virology': ['health'],
-    'world_religions': ['philosophy'],
-}
-
-categories = {
-    'STEM': ['physics', 'chemistry', 'biology', 'computer science', 'math', 'engineering'],
-    'Humanities': ['history', 'philosophy', 'law'],
-    'Social Science': ['politics', 'culture', 'economics', 'geography', 'psychology'],
-    'Other': ['other', 'business', 'health'],
-}
-
-
-def main():
-
-    reversed_categories = {}
-    for category, subcategory_list in categories.items():
-        for subcategory in subcategory_list:
-            reversed_categories[subcategory] = category
-
-    subject_mapping = {}
-    for subject, subcategory_list in subcategories.items():
-        category_name: str = reversed_categories[subcategory_list[0]]
-        subject_show_name: str = ' '.join([item.capitalize() for item in subject.split('_')])
-        subject_mapping[subject] = [subject_show_name, subcategory_list[0], category_name]
-
-    print(subject_mapping)
-
-
-if __name__ == '__main__':
-    main()
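For reference, the start of the mapping that main() prints, derived mechanically from the two dicts above (each subject maps to [display name, subcategory, category]):

# First entries of the dict that main() prints, derived from the tables above:
subject_mapping_sample = {
    'abstract_algebra': ['Abstract Algebra', 'math', 'STEM'],
    'anatomy': ['Anatomy', 'health', 'Other'],
    'astronomy': ['Astronomy', 'physics', 'STEM'],
    # ... one entry per subject, 57 in total
}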