evalscope 0.8.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.

Potentially problematic release: this version of evalscope may be problematic.

Files changed (147)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +11 -3
  3. evalscope/backend/base.py +1 -1
  4. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  5. evalscope/backend/rag_eval/utils/clip.py +2 -2
  6. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  7. evalscope/backend/rag_eval/utils/llm.py +1 -1
  8. evalscope/benchmarks/__init__.py +20 -1
  9. evalscope/benchmarks/arc/__init__.py +0 -5
  10. evalscope/benchmarks/arc/arc_adapter.py +24 -102
  11. evalscope/benchmarks/bbh/__init__.py +0 -4
  12. evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
  13. evalscope/benchmarks/benchmark.py +70 -59
  14. evalscope/benchmarks/ceval/__init__.py +0 -5
  15. evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
  16. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  17. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
  18. evalscope/benchmarks/competition_math/__init__.py +0 -5
  19. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  20. evalscope/benchmarks/data_adapter.py +115 -87
  21. evalscope/benchmarks/general_qa/__init__.py +0 -5
  22. evalscope/benchmarks/general_qa/general_qa_adapter.py +24 -80
  23. evalscope/benchmarks/gpqa/__init__.py +0 -0
  24. evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
  25. evalscope/benchmarks/gpqa/gpqa_adapter.py +103 -0
  26. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  27. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +22 -101
  28. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  29. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +33 -99
  30. evalscope/benchmarks/humaneval/__init__.py +0 -4
  31. evalscope/benchmarks/humaneval/humaneval_adapter.py +93 -9
  32. evalscope/benchmarks/ifeval/__init__.py +0 -0
  33. evalscope/benchmarks/ifeval/ifeval_adapter.py +56 -0
  34. evalscope/benchmarks/ifeval/instructions.py +1477 -0
  35. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  36. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  37. evalscope/benchmarks/ifeval/utils.py +134 -0
  38. evalscope/benchmarks/iquiz/__init__.py +0 -0
  39. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  40. evalscope/benchmarks/mmlu/__init__.py +0 -5
  41. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
  42. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  43. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  44. evalscope/benchmarks/race/__init__.py +0 -5
  45. evalscope/benchmarks/race/race_adapter.py +27 -123
  46. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  47. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
  48. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  49. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
  50. evalscope/cli/cli.py +2 -0
  51. evalscope/cli/start_app.py +30 -0
  52. evalscope/collections/__init__.py +3 -0
  53. evalscope/collections/evaluator.py +198 -0
  54. evalscope/collections/sampler.py +138 -0
  55. evalscope/collections/schema.py +126 -0
  56. evalscope/config.py +45 -7
  57. evalscope/constants.py +7 -38
  58. evalscope/evaluator/__init__.py +0 -1
  59. evalscope/evaluator/evaluator.py +89 -121
  60. evalscope/evaluator/rating_eval.py +1 -1
  61. evalscope/evaluator/reviewer/auto_reviewer.py +14 -5
  62. evalscope/metrics/__init__.py +3 -0
  63. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  64. evalscope/metrics/math_accuracy.py +193 -50
  65. evalscope/metrics/metrics.py +18 -6
  66. evalscope/metrics/named_metrics.py +17 -0
  67. evalscope/metrics/rouge_metric.py +13 -8
  68. evalscope/models/__init__.py +14 -1
  69. evalscope/models/base_adapter.py +52 -0
  70. evalscope/models/chat_adapter.py +140 -0
  71. evalscope/models/choice_adapter.py +211 -0
  72. evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +1 -1
  73. evalscope/models/custom_adapter.py +67 -0
  74. evalscope/models/local_model.py +74 -0
  75. evalscope/models/model.py +141 -0
  76. evalscope/models/server_adapter.py +111 -0
  77. evalscope/perf/__init__.py +1 -0
  78. evalscope/perf/arguments.py +3 -1
  79. evalscope/perf/benchmark.py +3 -3
  80. evalscope/perf/main.py +5 -7
  81. evalscope/perf/plugin/api/custom_api.py +1 -1
  82. evalscope/perf/plugin/api/openai_api.py +54 -50
  83. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  84. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  85. evalscope/perf/plugin/registry.py +3 -3
  86. evalscope/perf/utils/benchmark_util.py +4 -4
  87. evalscope/perf/utils/db_util.py +66 -22
  88. evalscope/perf/utils/local_server.py +4 -1
  89. evalscope/report/__init__.py +5 -0
  90. evalscope/report/app.py +693 -0
  91. evalscope/report/combinator.py +73 -0
  92. evalscope/report/generator.py +80 -0
  93. evalscope/report/utils.py +133 -0
  94. evalscope/run.py +64 -125
  95. evalscope/run_arena.py +3 -2
  96. evalscope/summarizer.py +15 -27
  97. evalscope/third_party/longbench_write/eval.py +2 -1
  98. evalscope/third_party/longbench_write/longbench_write.py +2 -1
  99. evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
  100. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  101. evalscope/utils/__init__.py +1 -0
  102. evalscope/utils/chat_service.py +6 -5
  103. evalscope/utils/io_utils.py +170 -0
  104. evalscope/utils/logger.py +13 -0
  105. evalscope/utils/model_utils.py +15 -2
  106. evalscope/utils/utils.py +3 -200
  107. evalscope/version.py +2 -2
  108. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/METADATA +129 -23
  109. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/RECORD +119 -115
  110. tests/cli/test_collection.py +57 -0
  111. tests/cli/test_run.py +57 -7
  112. tests/perf/test_perf.py +3 -2
  113. tests/rag/test_mteb.py +3 -2
  114. tests/vlm/test_vlmeval.py +3 -2
  115. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
  116. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
  117. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
  118. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
  119. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
  120. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
  121. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
  122. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
  123. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
  124. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  125. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  126. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  127. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  128. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
  129. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
  130. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
  131. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
  132. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  133. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
  134. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
  135. evalscope/evaluator/humaneval_evaluator.py +0 -158
  136. evalscope/models/api/__init__.py +0 -3
  137. evalscope/models/dummy_chat_model.py +0 -49
  138. evalscope/models/model_adapter.py +0 -525
  139. evalscope/models/openai_model.py +0 -103
  140. evalscope/tools/__init__.py +0 -1
  141. evalscope/tools/combine_reports.py +0 -135
  142. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  143. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  144. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/LICENSE +0 -0
  145. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/WHEEL +0 -0
  146. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/entry_points.txt +0 -0
  147. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/top_level.txt +0 -0
@@ -1,39 +0,0 @@
- {
-   "ragas_version": "0.2.7",
-   "original_hash": 4608101540215877909,
-   "language": "chinese",
-   "instruction": "给定一个主题和角色列表,根据角色描述将每个角色与相关主题关联起来。",
-   "examples": [
-     {
-       "input": {
-         "themes": [
-           "同理心",
-           "包容性",
-           "远程工作"
-         ],
-         "personas": [
-           {
-             "name": "人力资源经理",
-             "role_description": "专注于包容性和员工支持。"
-           },
-           {
-             "name": "远程团队负责人",
-             "role_description": "管理远程团队沟通。"
-           }
-         ]
-       },
-       "output": {
-         "mapping": {
-           "HR Manager": [
-             "包容性",
-             "同理心"
-           ],
-           "Remote Team Lead": [
-             "远程工作",
-             "同理心"
-           ]
-         }
-       }
-     }
-   ]
- }
@@ -1,16 +0,0 @@
- {
-   "ragas_version": "0.2.7",
-   "original_hash": -2203889341293275650,
-   "language": "chinese",
-   "instruction": "将给定文本总结为不超过10个句子。",
-   "examples": [
-     {
-       "input": {
-         "text": "人工智能\n\n人工智能正在通过自动化以前需要人类智能的任务来改变各个行业。从医疗保健到金融,人工智能正在被用来快速准确地分析大量数据。这项技术还推动了自动驾驶汽车和个性化推荐等领域的创新。"
-       },
-       "output": {
-         "text": "人工智能通过自动化任务、分析数据和推动自动驾驶汽车和个性化推荐等创新,正在革新各个行业。"
-       }
-     }
-   ]
- }
@@ -1,24 +0,0 @@
- {
-   "ragas_version": "0.2.7",
-   "original_hash": -7344189172470926110,
-   "language": "chinese",
-   "instruction": "从给定的文本中提取主要主题和概念。",
-   "examples": [
-     {
-       "input": {
-         "text": "人工智能通过自动化需要人类智能的任务来改变行业。人工智能快速准确地分析大量数据,推动了自动驾驶汽车和个性化推荐等创新。",
-         "max_num": 10
-       },
-       "output": {
-         "output": [
-           "人工智能",
-           "自动化",
-           "数据分析",
-           "创新",
-           "自动驾驶汽车",
-           "个性化推荐"
-         ]
-       }
-     }
-   ]
- }
@@ -1,158 +0,0 @@
- import json
- import os
- import re
- from tqdm import tqdm
- from typing import List, Optional
-
- from evalscope.constants import OutputsStructure
- from evalscope.evaluator.evaluator import logger
- from evalscope.models.model_adapter import BaseModelAdapter
- from evalscope.tools.combine_reports import gen_table
- from evalscope.utils import normalize_score
-
-
- class HumanevalEvaluator(object):
-
-     def __init__(
-         self,
-         problem_file: str,
-         model_id: str,
-         model_revision: str,
-         model_adapter: BaseModelAdapter,
-         outputs: Optional[OutputsStructure] = None,
-         k: List[int] = [1, 10, 100],
-         n_workers: int = 4,
-         timeout: float = 3.0,
-     ):
-         try:
-             from human_eval.data import read_problems, write_jsonl
-             from human_eval.evaluation import evaluate_functional_correctness
-         except ImportError:
-             raise ImportError('Please install human_eval:'
-                               'https://github.com/openai/human-eval/tree/master#installation , '
-                               'Note that you need to enable the execution code in the human_eval/execution.py first.')
-
-         self.problem_file = problem_file
-         self.k = k
-         self.num_workers = n_workers
-         self.timeout = timeout
-         self.model_adapter = model_adapter
-
-         self.read_problems_func = read_problems
-         self.write_jsonl_func = write_jsonl
-         self.eval_func = evaluate_functional_correctness
-
-         # {'task_id': {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}, ...}
-         self.problems = self.read_problems_func(self.problem_file)
-
-         # Deal with the output paths
-         self.outputs_structure = OutputsStructure(outputs)
-
-     def get_answers(self, infer_cfg: dict) -> List[dict]:
-         ans_list: list = []
-         system_prompt: str = 'Complete the following python code:\n'
-         for task_id, data_d in tqdm(self.problems.items(), total=len(self.problems), desc='Predicting(problems)'):
-             prompt: str = system_prompt + data_d['prompt']
-             inputs: dict = {'data': [prompt]}
-             # pred_res: dict = self.model_adapter.predict(inputs)
-
-             pred_res: dict = self.model_adapter.predict(inputs=inputs, infer_cfg=infer_cfg)
-
-             pred_ans: str = pred_res['choices'][0]['message']['content']
-             pred_ans = self._postprocess(pred_ans)
-
-             ans_list.append({'task_id': task_id, 'completion': pred_ans})
-
-         return ans_list
-
-     def eval(self, infer_cfg: dict, **kwargs):
-
-         # predict
-         ans_list: list = self.get_answers(infer_cfg)
-         ans_out_file: str = os.path.join(self.outputs_structure.predictions_dir, 'human_eval_predictions.jsonl')
-
-         self.write_jsonl_func(filename=ans_out_file, data=ans_list)
-         # logger.info(f'** Dump predictions to {ans_out_file} successfully.')
-         logger.info('** Dump predictions successfully.')
-
-         # evaluate results: e.g. {'pass@1': 0.333, 'pass@10': 0.111}
-         results = self.eval_func(
-             sample_file=ans_out_file,
-             k=self.k,
-             n_workers=self.num_workers,
-             timeout=self.timeout,
-             problem_file=self.problem_file)
-
-         # output: report
-         report_map: dict = self.gen_report(results=results)
-         report_dir: str = self.outputs_structure.reports_dir
-         report_file: str = os.path.join(report_dir, 'human_eval_report.json')
-
-         with open(report_file, 'w') as f:
-             f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
-         # logger.info(f'** Dump report to {report_file} \n')
-         logger.info('** Dump report \n')
-
-         try:
-             # Make table
-             report_table: str = gen_table([report_dir])
-             logger.info(f'** Report table: \n {report_table} \n')
-         except Exception:
-             logger.error('Failed to generate report table.')
-
-     def gen_report(self, results: dict) -> dict:
-         """
-         Generate report from evaluation results.
-
-         Returns:
-             {
-                 "name":"ARC-Challenge",
-                 "metric":"WeightedAverageAccuracy",
-                 "score":0.3389,
-                 "category":[
-                     {
-                         "name":"DEFAULT",
-                         "score":0.3389,
-                         "subset":[
-                             {
-                                 "name":"ARC-Challenge",
-                                 "score":0.3389
-                             },
-                         ]
-                     }
-                 ],
-                 "total_num":100
-             }
-         """
-         results = {k: normalize_score(score=v) for k, v in results.items()}
-
-         category_d = dict(name='DEFAULT', score=results, subset=[])
-
-         res_map = dict(
-             name='HumanEval', metric='pass@k', score=results, category=[category_d], total_num=len(self.problems))
-
-         return res_map
-
-     @classmethod
-     def _postprocess(cls, text: str) -> str:
-         if '```' in text:
-             blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
-             if len(blocks) == 0:
-                 text = text.split('```')[1]  # fall back to default strategy
-             else:
-                 text = blocks[0]  # fetch the first code block
-                 if not text.startswith('\n'):  # in case starting with ```python
-                     text = text[max(text.find('\n') + 1, 0):]
-         if text.strip().startswith('from') or text.strip().startswith('import'):
-             def_idx = text.find('def')
-             if def_idx != -1:
-                 text = text[max(text.find('\n', def_idx) + 1, 0):]
-         text = text.split('\n\n')[0]
-         if text.strip().startswith('def'):
-             text = '\n'.join(text.split('\n')[1:])
-         if not text.startswith('    '):
-             if text.startswith(' '):
-                 text = '    ' + text.lstrip()
-             else:
-                 text = '\n'.join(['    ' + line for line in text.split('\n')])
-         return text
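
The removed HumanevalEvaluator._postprocess above strips markdown code fences and the function signature from a model completion so that only an indented body remains. A minimal usage sketch, assuming the evalscope 0.8.0 wheel and its dependencies are installed; the raw completion string is invented for illustration:

# Sketch only: exercises the classmethod from the deleted evalscope/evaluator/humaneval_evaluator.py.
from evalscope.evaluator.humaneval_evaluator import HumanevalEvaluator

raw = "```python\ndef add(a, b):\n    return a + b\n```"  # made-up model output, not from the diff
body = HumanevalEvaluator._postprocess(raw)
print(body)  # prints the indented body "    return a + b", ready to append to the HumanEval prompt
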
@@ -1,3 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
-
- from evalscope.models.api.openai_api import OpenaiApi
@@ -1,49 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
-
- import random
- import time
-
- from evalscope.models import ChatBaseModel
- from evalscope.utils.logger import get_logger
-
- logger = get_logger()
-
-
- class DummyChatModel(ChatBaseModel):
-
-     MODEL_ID = 'dummy_chat_model_0801'
-     REVISION = 'v1.0.0'
-
-     def __init__(self, model_cfg: dict, **kwargs):
-         model_cfg['model_id'] = self.MODEL_ID
-         model_cfg['revision'] = self.REVISION
-         super(DummyChatModel, self).__init__(model_cfg=model_cfg)
-
-     def predict(self, inputs: dict, **kwargs) -> dict:
-
-         debug: bool = False
-         if debug:
-             messages = inputs['messages']
-             history = inputs['history']
-
-             logger.info(f'** messages: {messages}')
-             logger.info(f'** history: {history}')
-
-         choice = random.choice(['A', 'B', 'C', 'D'])
-
-         # Build response
-         res = {
-             'choices': [{
-                 'index': 0,
-                 'message': {
-                     'content': choice,
-                     'role': 'assistant'
-                 }
-             }],
-             'created': time.time(),
-             'model': self.MODEL_ID + '-' + self.REVISION,
-             'object': 'chat.completion',
-             'usage': {}
-         }
-
-         return res
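
For reference, the dict returned by the removed DummyChatModel.predict follows the OpenAI chat-completion shape, and callers such as the humaneval evaluator above read the answer from choices[0]. A minimal consumption sketch; the response literal is hand-written to mirror the structure in the deleted file and is not real output:

# Sketch only: mirrors the 'Build response' block from the deleted evalscope/models/dummy_chat_model.py.
res = {
    'choices': [{'index': 0, 'message': {'content': 'A', 'role': 'assistant'}}],
    'created': 0.0,                            # time.time() in the original
    'model': 'dummy_chat_model_0801-v1.0.0',   # MODEL_ID + '-' + REVISION
    'object': 'chat.completion',
    'usage': {},
}
print(res['choices'][0]['message']['content'])  # -> 'A'
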