PyPI - evalscope - Versions diffs - 0.13.1__tar.gz → 0.13.2__tar.gz - Mend

evalscope 0.13.1__tar.gz → 0.13.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of evalscope might be problematic. Click here for more details.

Files changed (360) hide show

{evalscope-0.13.1/evalscope.egg-info → evalscope-0.13.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: evalscope
-Version: 0.13.1
+Version: 0.13.2
 Summary: EvalScope: Lightweight LLMs Evaluation Framework
 Home-page: https://github.com/modelscope/evalscope
 Author: ModelScope team
@@ -16,11 +16,8 @@ Classifier: Programming Language :: Python :: 3.10
 Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: absl-py
 Requires-Dist: accelerate
-Requires-Dist: cachetools
 Requires-Dist: datasets<=3.2.0,>=3.0.0
-Requires-Dist: editdistance
 Requires-Dist: immutabledict
 Requires-Dist: jieba
 Requires-Dist: jsonlines
@@ -31,33 +28,29 @@ Requires-Dist: modelscope[framework]
 Requires-Dist: nltk>=3.9
 Requires-Dist: openai
 Requires-Dist: pandas
-Requires-Dist: plotly
 Requires-Dist: pyarrow
-Requires-Dist: pympler
 Requires-Dist: pyyaml
-Requires-Dist: regex
 Requires-Dist: requests
-Requires-Dist: requests-toolbelt
 Requires-Dist: rouge-chinese
 Requires-Dist: rouge-score>=0.1.0
 Requires-Dist: sacrebleu
 Requires-Dist: scikit-learn
 Requires-Dist: seaborn
-Requires-Dist: sentencepiece
-Requires-Dist: simple-ddl-parser
 Requires-Dist: sympy
 Requires-Dist: tabulate
-Requires-Dist: tiktoken
 Requires-Dist: torch
 Requires-Dist: tqdm
 Requires-Dist: transformers>=4.33
-Requires-Dist: transformers_stream_generator
 Requires-Dist: word2number
 Provides-Extra: opencompass
 Requires-Dist: ms-opencompass>=0.1.4; extra == "opencompass"
 Provides-Extra: vlmeval
 Requires-Dist: ms-vlmeval>=0.0.9; extra == "vlmeval"
 Provides-Extra: rag
+Requires-Dist: langchain<0.3.0; extra == "rag"
+Requires-Dist: langchain-community<0.3.0; extra == "rag"
+Requires-Dist: langchain-core<0.3.0; extra == "rag"
+Requires-Dist: langchain-openai<0.3.0; extra == "rag"
 Requires-Dist: mteb==1.19.4; extra == "rag"
 Requires-Dist: ragas==0.2.9; extra == "rag"
 Requires-Dist: webdataset>0.2.0; extra == "rag"
@@ -71,38 +64,9 @@ Requires-Dist: unicorn; extra == "perf"
 Provides-Extra: app
 Requires-Dist: gradio==5.4.0; extra == "app"
 Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
-Provides-Extra: inner
-Requires-Dist: absl-py; extra == "inner"
-Requires-Dist: accelerate; extra == "inner"
-Requires-Dist: alibaba_itag_sdk; extra == "inner"
-Requires-Dist: dashscope; extra == "inner"
-Requires-Dist: editdistance; extra == "inner"
-Requires-Dist: jsonlines; extra == "inner"
-Requires-Dist: nltk; extra == "inner"
-Requires-Dist: openai; extra == "inner"
-Requires-Dist: pandas==1.5.3; extra == "inner"
-Requires-Dist: plotly; extra == "inner"
-Requires-Dist: pyarrow; extra == "inner"
-Requires-Dist: pyodps; extra == "inner"
-Requires-Dist: pyyaml; extra == "inner"
-Requires-Dist: regex; extra == "inner"
-Requires-Dist: requests==2.28.1; extra == "inner"
-Requires-Dist: requests-toolbelt==0.10.1; extra == "inner"
-Requires-Dist: rouge-score; extra == "inner"
-Requires-Dist: sacrebleu; extra == "inner"
-Requires-Dist: scikit-learn; extra == "inner"
-Requires-Dist: seaborn; extra == "inner"
-Requires-Dist: simple-ddl-parser; extra == "inner"
-Requires-Dist: streamlit; extra == "inner"
-Requires-Dist: tqdm; extra == "inner"
-Requires-Dist: transformers<4.43,>=4.33; extra == "inner"
-Requires-Dist: transformers_stream_generator; extra == "inner"
 Provides-Extra: all
-Requires-Dist: absl-py; extra == "all"
 Requires-Dist: accelerate; extra == "all"
-Requires-Dist: cachetools; extra == "all"
 Requires-Dist: datasets<=3.2.0,>=3.0.0; extra == "all"
-Requires-Dist: editdistance; extra == "all"
 Requires-Dist: immutabledict; extra == "all"
 Requires-Dist: jieba; extra == "all"
 Requires-Dist: jsonlines; extra == "all"
@@ -113,30 +77,26 @@ Requires-Dist: modelscope[framework]; extra == "all"
 Requires-Dist: nltk>=3.9; extra == "all"
 Requires-Dist: openai; extra == "all"
 Requires-Dist: pandas; extra == "all"
-Requires-Dist: plotly; extra == "all"
 Requires-Dist: pyarrow; extra == "all"
-Requires-Dist: pympler; extra == "all"
 Requires-Dist: pyyaml; extra == "all"
-Requires-Dist: regex; extra == "all"
 Requires-Dist: requests; extra == "all"
-Requires-Dist: requests-toolbelt; extra == "all"
 Requires-Dist: rouge-chinese; extra == "all"
 Requires-Dist: rouge-score>=0.1.0; extra == "all"
 Requires-Dist: sacrebleu; extra == "all"
 Requires-Dist: scikit-learn; extra == "all"
 Requires-Dist: seaborn; extra == "all"
-Requires-Dist: sentencepiece; extra == "all"
-Requires-Dist: simple-ddl-parser; extra == "all"
 Requires-Dist: sympy; extra == "all"
 Requires-Dist: tabulate; extra == "all"
-Requires-Dist: tiktoken; extra == "all"
 Requires-Dist: torch; extra == "all"
 Requires-Dist: tqdm; extra == "all"
 Requires-Dist: transformers>=4.33; extra == "all"
-Requires-Dist: transformers_stream_generator; extra == "all"
 Requires-Dist: word2number; extra == "all"
 Requires-Dist: ms-opencompass>=0.1.4; extra == "all"
 Requires-Dist: ms-vlmeval>=0.0.9; extra == "all"
+Requires-Dist: langchain<0.3.0; extra == "all"
+Requires-Dist: langchain-community<0.3.0; extra == "all"
+Requires-Dist: langchain-core<0.3.0; extra == "all"
+Requires-Dist: langchain-openai<0.3.0; extra == "all"
 Requires-Dist: mteb==1.19.4; extra == "all"
 Requires-Dist: ragas==0.2.9; extra == "all"
 Requires-Dist: webdataset>0.2.0; extra == "all"
@@ -239,6 +199,7 @@ Please scan the QR code below to join our community groups:
 ## 🎉 News
+- 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
 - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
 - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
 - 🔥 **[2025.03.11]** Added support for the [SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/SimpleQA/summary) and [Chinese SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/Chinese-SimpleQA/summary) evaluation benchmarks. These are used to assess the factual accuracy of models, and you can specify `simple_qa` and `chinese_simpleqa` for use. Support for specifying a judge model is also available. For more details, refer to the [relevant parameter documentation](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html).

{evalscope-0.13.1 → evalscope-0.13.2}/README.md RENAMED Viewed

@@ -88,6 +88,7 @@ Please scan the QR code below to join our community groups:
 ## 🎉 News
+- 🔥 **[2025.03.27]** Added support for [AlpacaEval](https://www.modelscope.cn/datasets/AI-ModelScope/alpaca_eval/dataPeview) and [ArenaHard](https://modelscope.cn/datasets/AI-ModelScope/arena-hard-auto-v0.1/summary) evaluation benchmarks. For usage notes, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html)
 - 🔥 **[2025.03.20]** The model inference service stress testing now supports generating prompts of specified length using random values. Refer to the [user guide](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/examples.html#using-the-random-dataset) for more details.
 - 🔥 **[2025.03.13]** Added support for the [LiveCodeBench](https://www.modelscope.cn/datasets/AI-ModelScope/code_generation_lite/summary) code evaluation benchmark, which can be used by specifying `live_code_bench`. Supports evaluating QwQ-32B on LiveCodeBench, refer to the [best practices](https://evalscope.readthedocs.io/en/latest/best_practice/eval_qwq.html).
 - 🔥 **[2025.03.11]** Added support for the [SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/SimpleQA/summary) and [Chinese SimpleQA](https://modelscope.cn/datasets/AI-ModelScope/Chinese-SimpleQA/summary) evaluation benchmarks. These are used to assess the factual accuracy of models, and you can specify `simple_qa` and `chinese_simpleqa` for use. Support for specifying a judge model is also available. For more details, refer to the [relevant parameter documentation](https://evalscope.readthedocs.io/en/latest/get_started/parameters.html).

{evalscope-0.13.1 → evalscope-0.13.2}/evalscope/arguments.py RENAMED Viewed

@@ -77,7 +77,7 @@ def add_argument(parser: argparse.ArgumentParser):
     # LLMJudge arguments
     parser.add_argument('--judge-strategy', type=str, default=JudgeStrategy.AUTO, help='The judge strategy.')
     parser.add_argument('--judge-model-args', type=json.loads, default='{}', help='The judge model args, should be a json string.')  # noqa: E501
-    parser.add_argument('--judge-worker-num', type=int, default=8, help='The number of workers for the judge model.')
+    parser.add_argument('--judge-worker-num', type=int, default=1, help='The number of workers for the judge model.')
     # yapf: enable

{evalscope-0.13.1 → evalscope-0.13.2}/evalscope/backend/rag_eval/utils/llm.py RENAMED Viewed

@@ -6,7 +6,7 @@ from modelscope.utils.hf_util import GenerationConfig
 from typing import Any, Dict, Iterator, List, Mapping, Optional
 from evalscope.constants import DEFAULT_MODEL_REVISION
-from evalscope.models import ChatGenerationModelAdapter
+from evalscope.models import ChatGenerationModelAdapter, LocalModel
 class LLM:
@@ -38,8 +38,7 @@ class LocalLLM(BaseLLM):
         super().__init__(**kw)
         self.model_name = os.path.basename(self.model_name_or_path)
         self.model = ChatGenerationModelAdapter(
-            model_id=self.model_name_or_path,
-            model_revision=self.model_revision,
+            model=LocalModel(model_id=self.model_name_or_path, model_revision=self.model_revision),
             generation_config=GenerationConfig(**self.generation_config) if self.generation_config else None,
         )
@@ -53,8 +52,8 @@ class LocalLLM(BaseLLM):
         """Run the LLM on the given input."""
         infer_cfg = {'stop': stop}
-        response = self.model._model_generate(prompt, infer_cfg)
-        return response
+        response, _ = self.model._model_generate([prompt], infer_cfg=infer_cfg)
+        return response[0][0]
     @property
     def _identifying_params(self) -> Dict[str, Any]:

evalscope-0.13.2/evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py ADDED Viewed

@@ -0,0 +1,109 @@
+import re
+from collections import defaultdict
+from typing import Any, List
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.metrics import Metric, mean, metric_registry
+from evalscope.metrics.llm_judge import LLMJudge
+from evalscope.utils.logger import get_logger
+# flake8: noqa
+logger = get_logger()
+GRADER_SYSTEM_PROMPT = """You are a highly efficient assistant, who evaluates and selects the best large language model (LLMs) based on the quality of their responses to a given instruction. This process will be used to create a leaderboard reflecting the most accurate and human-preferred answers."""
+GRADER_TEMPLATE = """
+I require a leaderboard for various large language models. I'll provide you with prompts given to these models and their corresponding outputs. Your task is to assess these responses, and select the model that produces the best output from a human perspective.
+## Instruction
+{{
+    "instruction": "{instruction}"
+}}
+## Model Outputs
+Here are the unordered outputs from the models. Each output is associated with a specific model, identified by a unique model identifier.
+{{
+    {{
+        "model_identifier": "m",
+        "output": "{output_1}"
+    }},
+    {{
+        "model_identifier": "M",
+        "output": "{output_2}"
+    }}
+}}
+## Task
+Evaluate the models based on the quality and relevance of their outputs, and select the model that generated the best output. Answer by providing the model identifier of the best model. We will use your output as the name of the best model, so make sure your output only contains one of the following model identifiers and nothing else (no quotes, no spaces, no new lines, ...): m or M.
+## Best Model Identifier
+""".strip()  # noqa: E501
@Benchmark.register(
    name='alpaca_eval',
    pretty_name='AlpacaEval2.0',
    dataset_id='AI-ModelScope/alpaca_eval',
    subset_list=['alpaca_eval_gpt4_baseline'],
    metric_list=['winrate'],
    few_shot_num=0,
    train_split=None,
    eval_split='eval')
class AlpacaEvalAdapter(DataAdapter):
    """
    Data adapter for the AlpacaEval benchmark.

    The evaluated model's answer is compared against the baseline answer stored
    in the dataset (`output` field) by an LLM judge; the reported `winrate` is
    the mean of the per-sample win flags.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # register metrics
        metric_registry.register(Metric(name='winrate', object=mean))

        # whether to use LLM as a judge
        self.llm_as_a_judge = True

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        """Build the prompt data from the sample's `instruction` field."""
        question = input_d['instruction']
        return self.gen_prompt_data(question)

    def get_gold_answer(self, input_d: dict) -> str:
        """Return the baseline answer (`output` field) the model competes against."""
        return input_d['output']

    def parse_pred_result(self, result: str, raw_input_d: dict = None, **kwargs) -> str:
        """Return the model answer with surrounding whitespace removed."""
        return result.strip()

    def match(self, gold: str, pred: str):
        """Rule-based matching is not supported; only warns and returns None."""
        # simple match
        logger.warning(f'Please use LLMJudge to match the result for {self.name}')
        return None

    def llm_match(self, gold: Any, pred: Any, judge: LLMJudge, **kwargs) -> bool:
        """
        Ask the judge whether the model answer beats the baseline answer.

        Args:
            gold: Baseline answer, shown to the judge as model `m`.
            pred: Evaluated model's answer, shown to the judge as model `M`.
            judge: Callable judge; invoked as `judge(prompt, system_prompt=...)`.
            **kwargs: Must contain `raw_input`, the raw sample with an `instruction` key.

        Returns:
            True if the judge picked `M`, False if it picked `m`, and None when no
            verdict could be parsed from the judge response.
        """
        raw_input = kwargs.get('raw_input', None)
        instruction = raw_input['instruction']
        # gold is baseline answer 'm', pred is model answer 'M'
        prompt = GRADER_TEMPLATE.format(instruction=instruction, output_1=gold, output_2=pred)
        # get grading response
        grading_response = judge(prompt, system_prompt=GRADER_SYSTEM_PROMPT)
        # parse grading response: only accept a standalone `m` / `M` token, so that
        # letters inside ordinary words (e.g. the 'm' of "model M") are not taken
        # as the verdict
        verdict = re.search(r'\b([mM])\b', grading_response)
        if verdict:
            return verdict.group(1) == 'M'
        logger.info(f'Failed to parse grading response: {prompt=}\n {grading_response=}')
        return None

    def compute_metric(self, review_res_list: List[bool], **kwargs) -> List[dict]:
        """
        Compute the win rate over all judged samples.

        Args:
            review_res_list: Per-sample results of `llm_match`: True (win),
                False (loss) or None (verdict could not be parsed).

        Returns:
            The result of the parent `compute_metric` applied to the samples
            whose verdict is not None.
        """
        # samples without a parsable verdict are excluded from the mean
        res_list = [res for res in review_res_list if res is not None]
        return super().compute_metric(res_list, **kwargs)

evalscope-0.13.2/evalscope/benchmarks/arena_hard/arena_hard_adapter.py ADDED Viewed

@@ -0,0 +1,120 @@
+import re
+from collections import defaultdict
+from typing import Any, List
+from evalscope.benchmarks import Benchmark, DataAdapter
+from evalscope.constants import AnswerKeys
+from evalscope.metrics import Metric, mean, metric_registry
+from evalscope.metrics.llm_judge import LLMJudge
+from evalscope.utils.logger import get_logger
+# flake8: noqa
+logger = get_logger()
+GRADER_SYSTEM_PROMPT = "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"."  # noqa: E501
+GRADER_TEMPLATE = "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's Answer|>\n{answer_1}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{answer_2}\n<|The End of Assistant B's Answer|>".strip(
+)  # noqa: E501
@Benchmark.register(
    name='arena_hard',
    pretty_name='ArenaHard',
    dataset_id='AI-ModelScope/arena-hard-auto-v0.1',
    metric_list=['winrate'],
    few_shot_num=0,
    train_split=None,
    eval_split='test')
class AlpacaEvalAdapter(DataAdapter):
    """
    Data adapter for the ArenaHard benchmark.

    Each model answer is judged twice against the baseline answer stored in the
    dataset (`prediction` field), once in each presentation order, and the
    verdicts are turned into a win rate against the baseline.

    NOTE(review): the class name looks like a copy-paste from the AlpacaEval
    adapter; it is kept unchanged here so existing imports keep working.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # register metrics
        metric_registry.register(Metric(name='winrate', object=mean))

        # whether to use LLM as a judge
        self.llm_as_a_judge = True

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        """Build the prompt data from the sample's `question` field."""
        question = input_d['question']
        return self.gen_prompt_data(question)

    def get_gold_answer(self, input_d: dict) -> str:
        """Return the baseline answer (`prediction` field) the model competes against."""
        return input_d['prediction']

    def parse_pred_result(self, result: str, raw_input_d: dict = None, **kwargs) -> str:
        """Return the model answer with surrounding whitespace removed."""
        return result.strip()

    def match(self, gold: str, pred: str):
        """Rule-based matching is not supported; only warns and returns None."""
        # simple match
        logger.warning(f'Please use LLMJudge to match the result for {self.name}')
        return None

    def llm_match(self, gold: Any, pred: Any, judge: LLMJudge, **kwargs) -> dict:
        """
        Judge the model answer against the baseline in both presentation orders.

        Args:
            gold: Baseline answer.
            pred: Evaluated model's answer.
            judge: Callable judge; invoked as `judge(prompt, system_prompt=...)`.
            **kwargs: Must contain `raw_input`, the raw sample with a `question` key.

        Returns:
            A dict with `model_a` ('gpt4-0314'), `model_b` ('test_model') and a
            two-element `games` list. Game 1 shows the baseline as assistant A,
            game 2 shows the model answer as assistant A. Each game holds the
            prompt, the raw judgment and the parsed verdict label (or None).
        """
        from .utils import post_process_arenahard

        raw_input = kwargs.get('raw_input', None)
        question = raw_input['question']
        # gold is baseline answer 'A', pred is model answer 'B'
        prompt1 = GRADER_TEMPLATE.format(question=question, answer_1=gold, answer_2=pred)
        # reverse the order
        prompt2 = GRADER_TEMPLATE.format(question=question, answer_1=pred, answer_2=gold)
        # get grading response
        game1_response = judge(prompt1, system_prompt=GRADER_SYSTEM_PROMPT)
        game2_response = judge(prompt2, system_prompt=GRADER_SYSTEM_PROMPT)
        # parse grading response
        res1 = post_process_arenahard(game1_response)
        res2 = post_process_arenahard(game2_response)
        return {
            'model_a': 'gpt4-0314',
            'model_b': 'test_model',
            'games': [
                {
                    'user_prompt': prompt1,
                    'judgment': game1_response,
                    'score': res1
                },
                {
                    'user_prompt': prompt2,
                    'judgment': game2_response,
                    'score': res2
                },
            ]
        }

    def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
        """
        Compute the model's win rate against the baseline.

        Args:
            review_res_list: Results of `llm_match`, either a flat list or a list
                of lists (flattened one level before use).

        Returns:
            A single-element list holding the `winrate` metric: the win rate of
            'test_model' against 'gpt4-0314', and the number of review results.

        Raises:
            ValueError: If `review_res_list` is empty.
        """
        import pandas as pd

        from .utils import compute_mle_elo, get_battles_from_row, get_win_rate_column

        if not review_res_list:
            raise ValueError('No review results to compute the ArenaHard win rate from.')

        if isinstance(review_res_list[0], list):
            review_res_list = [item for sublist in review_res_list for item in sublist]

        battles = pd.concat([get_battles_from_row(res) for res in review_res_list])
        elo_ratings = compute_mle_elo(battles)

        # get_win_rate_column only reads the 'model' and 'score' columns
        stats = pd.DataFrame({'model': list(elo_ratings.index), 'score': list(elo_ratings.values)})
        score = get_win_rate_column(stats, 'score', 'gpt4-0314').at['test_model']

        return [{'metric_name': 'winrate', 'score': score, 'num': len(review_res_list)}]

evalscope-0.13.2/evalscope/benchmarks/arena_hard/utils.py ADDED Viewed

@@ -0,0 +1,162 @@
+import math
+import numpy as np
+import pandas as pd
+import re
+from collections import defaultdict
+from sklearn.linear_model import LogisticRegression
+from tqdm import tqdm
+from evalscope.utils.logger import get_logger
+logger = get_logger()
def post_process_arenahard(completion):
    """Extract the first verdict label from a judge completion.

    A verdict label is the text between double square brackets, made up only
    of the characters ``A``, ``B``, ``<``, ``>`` and ``=`` (e.g. ``[[A>>B]]``).

    Args:
        completion: Raw judge output. ``None`` or any other non-string value
            is treated as "no verdict" instead of raising ``TypeError``.

    Returns:
        The first label found, without the brackets (e.g. ``'A>>B'``), or
        ``None`` when the completion contains no label.
    """
    if not isinstance(completion, str):
        return None
    labels = re.findall(r'\[\[([AB<>=]+)\]\]', completion)
    return labels[0] if labels else None
def get_battles_from_row(row, first_game_only=False, multiplier=3):
    """Expand one judged sample into a DataFrame of battle records.

    Args:
        row: Mapping with ``model_a``, ``model_b`` and ``games``; ``games`` is a
            sequence whose items carry a ``score`` verdict label. Game 0 is
            read as "model_a shown as assistant A", game 1 as the swapped
            order, so its verdict is mirrored.
        first_game_only: When true, only game 0 is used.
        multiplier: How many times a decisive verdict (``A>>B`` / ``B>>A``) is
            repeated in the output; other verdicts count once.

    Returns:
        A DataFrame with one row per battle record (``model_a``, ``model_b``,
        ``winner``); ``winner`` is ``'model_a'``, ``'model_b'`` or ``'tie'``.
        Games with an unrecognised verdict contribute no rows, so the frame
        may be empty.
    """
    games = row['games']
    results = _battles_for_game(row, games[0], swapped=False, multiplier=multiplier)
    if not first_game_only:
        results = results + _battles_for_game(row, games[1], swapped=True, multiplier=multiplier)
    return pd.DataFrame(results)


def _battles_for_game(row, game, swapped, multiplier):
    """Return the list of battle records contributed by a single judged game."""
    score = game['score']
    if score == 'A=B':
        winner, weight = 'tie', 1
    elif score in ('A>B', 'A>>B'):
        # assistant A won; in the swapped game assistant A is model_b
        winner = 'model_b' if swapped else 'model_a'
        weight = multiplier if score == 'A>>B' else 1
    elif score in ('B>A', 'B>>A'):
        winner = 'model_a' if swapped else 'model_b'
        weight = multiplier if score == 'B>>A' else 1
    else:
        # missing or unparsable verdict: the game is ignored
        return []
    record = {'model_a': row['model_a'], 'model_b': row['model_b'], 'winner': winner}
    return [record] * weight
def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
    """Fit Elo-style ratings to pairwise battles with a logistic regression.

    Args:
        df: Battles with ``model_a``, ``model_b`` and ``winner`` columns, where
            ``winner`` is ``'model_a'``, ``'model_b'``, ``'tie'`` or
            ``'tie (bothbad)'``.
        SCALE: Multiplier applied to the fitted coefficients.
        BASE: Base whose natural logarithm is used as the feature magnitude.
        INIT_RATING: Offset added to the scaled coefficients.

    Returns:
        A Series of ratings indexed by model name, sorted from highest to
        lowest. When the regression is fitted and ``'gpt4-0314'`` is among the
        models, ratings are shifted so that this model sits at exactly 1000.
        When the outcomes contain a single class, no regression is fitted:
        every model gets ``INIT_RATING`` and the winner is boosted by ``SCALE``.
    """
    models = pd.concat([df['model_a'], df['model_b']]).unique()
    models = pd.Series(np.arange(len(models)), index=models)

    # duplicate battles
    df = pd.concat([df, df], ignore_index=True)
    p = len(models.index)
    n = df.shape[0]

    X = np.zeros([n, p])
    X[np.arange(n), models[df['model_a']]] = +math.log(BASE)
    X[np.arange(n), models[df['model_b']]] = -math.log(BASE)

    # one A win => two A win
    Y = np.zeros(n)
    Y[df['winner'] == 'model_a'] = 1.0

    # one tie => one A win + one B win
    # find tie + tie (both bad) index
    tie_idx = (df['winner'] == 'tie') | (df['winner'] == 'tie (bothbad)')
    # only the first copy of each tie counts as an A win; the duplicate stays 0
    tie_idx.iloc[len(tie_idx) // 2:] = False
    Y[tie_idx] = 1.0

    if len(np.unique(Y)) < 2:
        logger.warning('Only one class in the data, skipping the logistic regression fit')
        elo_scores = pd.Series(INIT_RATING, index=models.index)
        if np.all(Y == 1.0):
            elo_scores[df['model_a'].iloc[0]] += SCALE  # Boost the winning model
        elif np.all(Y == 0.0):
            elo_scores[df['model_b'].iloc[0]] += SCALE  # Boost the winning model
        return elo_scores.sort_values(ascending=False)

    lr = LogisticRegression(
        fit_intercept=False, penalty=None, tol=1e-8)  # May need to set a small value when not use GPT4 as judge model
    lr.fit(X, Y)

    elo_scores = SCALE * lr.coef_[0] + INIT_RATING

    # set anchor as gpt4-0314 = 1000
    if 'gpt4-0314' in models.index:
        elo_scores += 1000 - elo_scores[models['gpt4-0314']]
    return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)
def get_bootstrap_result(battles, func_compute_elo, num_round):
    """Run ``func_compute_elo`` on resampled battles and collect the results.

    Args:
        battles: DataFrame of battles; each round resamples it with
            replacement at its full size.
        func_compute_elo: Callable applied to each resampled DataFrame; a
            ``None`` result is dropped.
        num_round: Number of resampling rounds.

    Returns:
        A DataFrame with one row per kept round, with columns ordered by
        descending column median.
    """
    per_round = (
        func_compute_elo(battles.sample(frac=1.0, replace=True))
        for _ in tqdm(range(num_round), desc='bootstrap'))
    ratings = pd.DataFrame([outcome for outcome in per_round if outcome is not None])
    by_median = ratings.median().sort_values(ascending=False).index
    return ratings[by_median]
def preety_print_two_ratings(ratings_1, ratings_2, column_names):
    """Build a side-by-side table of two rating sets, rounded to integers.

    Args:
        ratings_1: Mapping of model name to rating; its keys define the rows.
        ratings_2: Mapping of model name to rating; must contain every key of
            ``ratings_1``.
        column_names: Two column labels, for ``ratings_1`` and ``ratings_2``.

    Returns:
        A DataFrame with a ``Model`` column and the two rating columns, sorted
        by the first rating column in descending order and indexed from 1.
        Ratings are rounded half up to the nearest integer.
    """
    first_col, second_col = column_names[0], column_names[1]
    table = pd.DataFrame(
        [[name, ratings_1[name], ratings_2[name]] for name in ratings_1.keys()],
        columns=['Model', first_col, second_col],
    ).sort_values(first_col, ascending=False).reset_index(drop=True)
    # floor(x + 0.5) rounds half up for negative values too, whereas a plain
    # int cast of x + 0.5 truncates toward zero and mis-rounds negatives
    table[first_col] = np.floor(table[first_col] + 0.5).astype(int)
    table[second_col] = np.floor(table[second_col] + 0.5).astype(int)
    table.index = table.index + 1
    return table
def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
    """Predict pairwise win rates from ratings.

    Args:
        elo_ratings: Mapping of model name to rating.
        SCALE: Rating difference that corresponds to one power of ``BASE`` in
            the odds.
        BASE: Base of the exponent in the win-rate formula.
        INIT_RATING: Unused; kept for signature compatibility.

    Returns:
        A square DataFrame over the sorted model names whose cell
        ``[a, b]`` is the predicted rate at which ``a`` beats ``b``; the
        diagonal is NaN.
    """
    names = sorted(elo_ratings.keys())
    wins = defaultdict(lambda: defaultdict(lambda: 0))
    # Every ordered pair is visited, and each visit also writes the mirrored
    # cell, so the value written last for a cell is the one that is kept.
    for a in names:
        for b in names:
            ea = 1 / (1 + BASE**((elo_ratings[b] - elo_ratings[a]) / SCALE))
            wins[a][b] = ea
            wins[b][a] = 1 - ea

    # np.nan rather than the np.NAN alias, which NumPy 2.0 removed
    data = {a: [wins[a][b] if a != b else np.nan for b in names] for a in names}

    df = pd.DataFrame(data, index=names)
    df.index.name = 'model_a'
    df.columns.name = 'model_b'
    return df.T
def get_win_rate_column(df, column, baseline='gpt4-0314'):
    """Return the ``baseline`` column of the predicted win-rate table.

    Args:
        df: DataFrame with a ``model`` column and a rating column.
        column: Name of the rating column to use.
        baseline: Column of the win-rate table to select.

    Returns:
        A Series indexed by model name, with missing values replaced by 0.5
        and every value rounded to 4 decimal places.
    """
    ratings_by_model = df[['model', column]].set_index('model').to_dict()[column]
    versus_baseline = predict_win_rate(ratings_by_model)[baseline]
    filled = versus_baseline.fillna(0.5)
    return filled.apply(lambda rate: round(rate, 4))

{evalscope-0.13.1 → evalscope-0.13.2}/evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py RENAMED Viewed

@@ -126,7 +126,7 @@ class ChineseSimpleQAAdapter(DataAdapter):
     def match(self, gold: str, pred: str) -> float:
         # simple match
-        logger.warning(f'Please use LLMJudge to match the result for ChineseSimpleQA')
+        logger.warning(f'Please use LLMJudge to match the result for {self.name}')
         is_correct = 1 if gold.lower().strip() == pred.lower().strip() else 0
         is_incorrect = not is_correct
         is_not_attempted = 0
@@ -160,9 +160,6 @@ class ChineseSimpleQAAdapter(DataAdapter):
             review_res_list: [{'is_correct': 1, 'is_incorrect': 0, 'is_not_attempted': 0}, ...]
         """
         # zip dict answers
-        res_dict = defaultdict(list)
-        for res in review_res_list:
-            for key, value in res.items():
-                res_dict[key].append(value)
+        res_dict = super().compute_dict_metric(review_res_list, **kwargs)
         return super().compute_metric(res_dict, **kwargs)

{evalscope-0.13.1 → evalscope-0.13.2}/evalscope/benchmarks/competition_math/competition_math_adapter.py RENAMED Viewed

@@ -8,7 +8,6 @@ from collections import defaultdict
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import AnswerKeys
 from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.logger import get_logger
 # flake8: noqa

{evalscope-0.13.1 → evalscope-0.13.2}/evalscope/benchmarks/data_adapter.py RENAMED Viewed

@@ -245,6 +245,29 @@ class DataAdapter(ABC):
             res_list.append({'metric_name': metric_name, 'score': metric_func(review_res), 'num': len(review_res)})
         return res_list
+    def compute_dict_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
+        """
+        compute weighted mean of the bleu score of all samples
+        Args:
+            review_res_list: [score1, score2, ...]
+        Returns:
+            avg_res: List[dict]
+        """
+        if isinstance(review_res_list[0], list):
+            review_res_list = [item for sublist in review_res_list for item in sublist]
+        items = defaultdict(list)
+        for scores in review_res_list:
+            if isinstance(scores, dict):
+                for k, v in scores.items():
+                    items[k].append(v)
+            else:
+                items['AverageAccuracy'].append(scores)
+        return items
     def gen_report(self, subset_score_map: dict, report_name: str = None, **kwargs) -> Report:
         """
         Generate report for the evaluation results for all subsets.
@@ -291,10 +314,11 @@ class DataAdapter(ABC):
         kwargs['metric_list'] = self.metric_list
         return ReportGenerator.gen_report(subset_score_map, report_name, **kwargs)
-    def gen_prompt_data(self, prompt: str, **kwargs) -> dict:
+    def gen_prompt_data(self, prompt: str, system_prompt: Optional[str] = None, **kwargs) -> dict:
         if not isinstance(prompt, list):
             prompt = [prompt]
-        prompt_data = PromptData(data=prompt, multi_choices=self.choices, system_prompt=self.system_prompt)
+        prompt_data = PromptData(
+            data=prompt, multi_choices=self.choices, system_prompt=system_prompt or self.system_prompt)
         return prompt_data.to_dict()
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:

{evalscope-0.13.1 → evalscope-0.13.2}/evalscope/benchmarks/data_collection/data_collection_adapter.py RENAMED Viewed

@@ -5,7 +5,6 @@ from typing import Any, Optional
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, EvalType, HubType
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger

evalscope 0.13.1__tar.gz → 0.13.2__tar.gz

Potentially problematic release.

evalscope 0.13.1__tar.gz → 0.13.2__tar.gz