evalscope 0.13.1__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (72)
  1. evalscope/arguments.py +1 -1
  2. evalscope/backend/rag_eval/__init__.py +1 -1
  3. evalscope/backend/rag_eval/backend_manager.py +21 -5
  4. evalscope/backend/rag_eval/cmteb/arguments.py +10 -0
  5. evalscope/backend/rag_eval/ragas/arguments.py +0 -1
  6. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +7 -2
  7. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +0 -5
  8. evalscope/backend/rag_eval/utils/embedding.py +49 -3
  9. evalscope/backend/rag_eval/utils/llm.py +8 -9
  10. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -2
  11. evalscope/benchmarks/alpaca_eval/__init__.py +0 -0
  12. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +109 -0
  13. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  14. evalscope/benchmarks/arena_hard/__init__.py +0 -0
  15. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +120 -0
  16. evalscope/benchmarks/arena_hard/utils.py +162 -0
  17. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +2 -5
  18. evalscope/benchmarks/competition_math/competition_math_adapter.py +0 -1
  19. evalscope/benchmarks/data_adapter.py +30 -2
  20. evalscope/benchmarks/data_collection/data_collection_adapter.py +0 -1
  21. evalscope/benchmarks/general_qa/general_qa_adapter.py +6 -12
  22. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  23. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -5
  24. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +1 -3
  25. evalscope/benchmarks/live_code_bench/testing_util.py +365 -549
  26. evalscope/benchmarks/maritime_bench/__init__.py +0 -0
  27. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +79 -0
  28. evalscope/benchmarks/mmlu/mmlu_adapter.py +5 -7
  29. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  30. evalscope/benchmarks/mmlu_redux/__init__.py +0 -0
  31. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +182 -0
  32. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  33. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +2 -5
  34. evalscope/collections/evaluator.py +4 -2
  35. evalscope/config.py +2 -2
  36. evalscope/metrics/llm_judge.py +1 -1
  37. evalscope/models/chat_adapter.py +32 -11
  38. evalscope/perf/arguments.py +30 -9
  39. evalscope/perf/benchmark.py +57 -103
  40. evalscope/perf/http_client.py +2 -3
  41. evalscope/perf/plugin/api/custom_api.py +1 -1
  42. evalscope/perf/plugin/api/openai_api.py +4 -2
  43. evalscope/perf/plugin/datasets/custom.py +4 -1
  44. evalscope/perf/plugin/datasets/line_by_line.py +4 -1
  45. evalscope/perf/plugin/datasets/longalpaca.py +4 -1
  46. evalscope/perf/plugin/datasets/openqa.py +4 -1
  47. evalscope/perf/plugin/datasets/random_dataset.py +13 -6
  48. evalscope/perf/plugin/datasets/speed_benchmark.py +11 -0
  49. evalscope/perf/utils/benchmark_util.py +12 -6
  50. evalscope/perf/utils/db_util.py +3 -3
  51. evalscope/perf/utils/log_utils.py +41 -0
  52. evalscope/report/app.py +11 -11
  53. evalscope/run.py +7 -0
  54. evalscope/summarizer.py +2 -1
  55. evalscope/utils/utils.py +36 -25
  56. evalscope/version.py +2 -2
  57. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/METADATA +21 -55
  58. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/RECORD +70 -62
  59. tests/cli/test_all.py +36 -27
  60. tests/cli/test_collection.py +2 -1
  61. tests/cli/test_run.py +38 -20
  62. tests/perf/test_perf.py +1 -2
  63. tests/rag/test_clip_benchmark.py +0 -1
  64. tests/rag/test_mteb.py +37 -8
  65. tests/rag/test_ragas.py +33 -27
  66. tests/vlm/test_vlmeval.py +37 -1
  67. evalscope/backend/vlm_eval_kit/custom_dataset.py +0 -46
  68. evalscope/benchmarks/live_code_bench/execute_utils.py +0 -267
  69. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/LICENSE +0 -0
  70. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/WHEEL +0 -0
  71. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/entry_points.txt +0 -0
  72. {evalscope-0.13.1.dist-info → evalscope-0.14.0.dist-info}/top_level.txt +0 -0

evalscope/benchmarks/arena_hard/utils.py
@@ -0,0 +1,162 @@
+import math
+import numpy as np
+import pandas as pd
+import re
+from collections import defaultdict
+from sklearn.linear_model import LogisticRegression
+from tqdm import tqdm
+
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+def post_process_arenahard(completion):
+    result = re.findall(r'\[\[([AB<>=]+)\]\]', completion)
+    if result:
+        return result[0]
+    else:
+        return None
+
+
+def get_battles_from_row(row, first_game_only=False, multiplier=3):
+    results = []
+    output = {'model_a': row['model_a'], 'model_b': row['model_b']}
+
+    game = row['games'][0]
+    weight = 1
+    if game['score'] == 'A=B':
+        output['winner'] = 'tie'
+    elif game['score'] == 'A>B':
+        output['winner'] = 'model_a'
+    elif game['score'] == 'A>>B':
+        output['winner'] = 'model_a'
+        weight = multiplier
+    elif game['score'] == 'B>A':
+        output['winner'] = 'model_b'
+    elif game['score'] == 'B>>A':
+        output['winner'] = 'model_b'
+        weight = multiplier
+    else:
+        weight = 0
+
+    if weight:
+        results += [output] * weight
+
+    if first_game_only:
+        return pd.DataFrame(results)
+
+    # game 2
+    output = {'model_a': row['model_a'], 'model_b': row['model_b']}
+
+    game = row['games'][1]
+
+    weight = 1
+    if game['score'] == 'A=B':
+        output['winner'] = 'tie'
+    elif game['score'] == 'A>B':
+        output['winner'] = 'model_b'
+    elif game['score'] == 'A>>B':
+        output['winner'] = 'model_b'
+        weight = multiplier
+    elif game['score'] == 'B>A':
+        output['winner'] = 'model_a'
+    elif game['score'] == 'B>>A':
+        output['winner'] = 'model_a'
+        weight = multiplier
+    else:
+        weight = 0
+
+    if weight:
+        results += [output] * weight
+
+    return pd.DataFrame(results)
+
+
+def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
+    models = pd.concat([df['model_a'], df['model_b']]).unique()
+    models = pd.Series(np.arange(len(models)), index=models)
+
+    # duplicate battles
+    df = pd.concat([df, df], ignore_index=True)
+    p = len(models.index)
+    n = df.shape[0]
+
+    X = np.zeros([n, p])
+    X[np.arange(n), models[df['model_a']]] = +math.log(BASE)
+    X[np.arange(n), models[df['model_b']]] = -math.log(BASE)
+
+    # one A win => two A win
+    Y = np.zeros(n)
+    Y[df['winner'] == 'model_a'] = 1.0
+
+    # one tie => one A win + one B win
+    # find tie + tie (both bad) index
+    tie_idx = (df['winner'] == 'tie') | (df['winner'] == 'tie (bothbad)')
+    tie_idx[len(tie_idx) // 2:] = False
+    Y[tie_idx] = 1.0
+
+    if len(np.unique(Y)) < 2:
+        logger.info('Warning: Only one class in the data')
+        elo_scores = pd.Series(INIT_RATING, index=models.index)
+        if np.all(Y == 1.0):
+            elo_scores[df['model_a'].iloc[0]] += SCALE  # Boost the winning model
+        elif np.all(Y == 0.0):
+            elo_scores[df['model_b'].iloc[0]] += SCALE  # Boost the winning model
+        return elo_scores.sort_values(ascending=False)
+
+    lr = LogisticRegression(
+        fit_intercept=False, penalty=None, tol=1e-8)  # May need to set a small value when not use GPT4 as judge model
+    lr.fit(X, Y)
+
+    elo_scores = SCALE * lr.coef_[0] + INIT_RATING
+
+    # set anchor as gpt4-0314 = 1000
+    if 'gpt4-0314' in models.index:
+        elo_scores += 1000 - elo_scores[models['gpt4-0314']]
+    return pd.Series(elo_scores, index=models.index).sort_values(ascending=False)
+
+
+def get_bootstrap_result(battles, func_compute_elo, num_round):
+    rows = []
+    for _ in tqdm(range(num_round), desc='bootstrap'):
+        res = func_compute_elo(battles.sample(frac=1.0, replace=True))
+        if res is not None:
+            rows.append(res)
+    df = pd.DataFrame(rows)
+    return df[df.median().sort_values(ascending=False).index]
+
+
+def preety_print_two_ratings(ratings_1, ratings_2, column_names):
+    df = (
+        pd.DataFrame(
+            [[n, ratings_1[n], ratings_2[n]] for n in ratings_1.keys()],
+            columns=['Model', column_names[0], column_names[1]],
+        ).sort_values(column_names[0], ascending=False).reset_index(drop=True))
+    df[column_names[0]] = (df[column_names[0]] + 0.5).astype(int)
+    df[column_names[1]] = (df[column_names[1]] + 0.5).astype(int)
+    df.index = df.index + 1
+    return df
+
+
+def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
+    names = sorted(list(elo_ratings.keys()))
+    wins = defaultdict(lambda: defaultdict(lambda: 0))
+    for a in names:
+        for b in names:
+            ea = 1 / (1 + BASE**((elo_ratings[b] - elo_ratings[a]) / SCALE))
+            wins[a][b] = ea
+            wins[b][a] = 1 - ea
+
+    data = {a: [wins[a][b] if a != b else np.NAN for b in names] for a in names}
+
+    df = pd.DataFrame(data, index=names)
+    df.index.name = 'model_a'
+    df.columns.name = 'model_b'
+    return df.T
+
+
+def get_win_rate_column(df, column, baseline='gpt4-0314'):
+    to_dict = df[['model', column]].set_index('model').to_dict()[column]
+    win_rate_table = predict_win_rate(to_dict)
+    return win_rate_table[baseline].fillna(0.5).apply(lambda x: round(x, 4))
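
The file above ports the Arena-Hard rating pipeline: judge verdicts such as [[A>>B]] are parsed by post_process_arenahard, expanded into (optionally multiplier-weighted) battles, fitted with a logistic-regression Bradley-Terry model, and bootstrapped for confidence intervals. A minimal sketch of how the helpers compose, assuming evalscope 0.14.0 is installed; the two battle rows are invented for illustration, and only the expected column layout (model_a, model_b, games[i]['score']) is taken from the code above:

import pandas as pd

from evalscope.benchmarks.arena_hard.utils import compute_mle_elo, get_battles_from_row, get_bootstrap_result

# Hypothetical judged rows: two games per row, the second scored with the
# model positions swapped (hence the inverted mapping in get_battles_from_row).
rows = pd.DataFrame([
    {'model_a': 'candidate', 'model_b': 'gpt4-0314',
     'games': [{'score': 'A>B'}, {'score': 'B>A'}]},
    {'model_a': 'candidate', 'model_b': 'gpt4-0314',
     'games': [{'score': 'A>>B'}, {'score': 'A=B'}]},
])

# Expand verdicts into weighted battles, then fit and bootstrap the Elo scores.
battles = pd.concat([get_battles_from_row(r) for _, r in rows.iterrows()], ignore_index=True)
elo = compute_mle_elo(battles)  # point estimate, anchored to gpt4-0314 when present
bootstrap = get_bootstrap_result(battles, compute_mle_elo, num_round=100)
print(elo)
print(bootstrap.quantile([0.025, 0.975]))  # rough 95% interval per model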

evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py
@@ -126,7 +126,7 @@ class ChineseSimpleQAAdapter(DataAdapter):
 
     def match(self, gold: str, pred: str) -> float:
         # simple match
-        logger.warning(f'Please use LLMJudge to match the result for ChineseSimpleQA')
+        logger.warning(f'Please use LLMJudge to match the result for {self.name}')
         is_correct = 1 if gold.lower().strip() == pred.lower().strip() else 0
         is_incorrect = not is_correct
         is_not_attempted = 0
@@ -160,9 +160,6 @@ class ChineseSimpleQAAdapter(DataAdapter):
             review_res_list: [{'is_correct': 1, 'is_incorrect': 0, 'is_not_attempted': 0}, ...]
         """
         # zip dict answers
-        res_dict = defaultdict(list)
-        for res in review_res_list:
-            for key, value in res.items():
-                res_dict[key].append(value)
+        res_dict = super().compute_dict_metric(review_res_list, **kwargs)
 
         return super().compute_metric(res_dict, **kwargs)

evalscope/benchmarks/competition_math/competition_math_adapter.py
@@ -8,7 +8,6 @@ from collections import defaultdict
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import AnswerKeys
 from evalscope.metrics.math_parser import extract_answer, math_equal, strip_answer_string
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa

evalscope/benchmarks/data_adapter.py
@@ -245,6 +245,29 @@ class DataAdapter(ABC):
             res_list.append({'metric_name': metric_name, 'score': metric_func(review_res), 'num': len(review_res)})
         return res_list
 
+    def compute_dict_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
+        """
+        compute weighted mean of the bleu score of all samples
+
+        Args:
+            review_res_list: [score1, score2, ...]
+
+        Returns:
+            avg_res: List[dict]
+
+        """
+        if isinstance(review_res_list[0], list):
+            review_res_list = [item for sublist in review_res_list for item in sublist]
+
+        items = defaultdict(list)
+        for scores in review_res_list:
+            if isinstance(scores, dict):
+                for k, v in scores.items():
+                    items[k].append(v)
+            else:
+                items['AverageAccuracy'].append(scores)
+        return items
+
     def gen_report(self, subset_score_map: dict, report_name: str = None, **kwargs) -> Report:
         """
         Generate report for the evaluation results for all subsets.
@@ -291,10 +314,15 @@
             kwargs['metric_list'] = self.metric_list
         return ReportGenerator.gen_report(subset_score_map, report_name, **kwargs)
 
-    def gen_prompt_data(self, prompt: str, **kwargs) -> dict:
+    def gen_prompt_data(self,
+                        prompt: str,
+                        system_prompt: Optional[str] = None,
+                        choices: Optional[List[str]] = None,
+                        **kwargs) -> dict:
         if not isinstance(prompt, list):
             prompt = [prompt]
-        prompt_data = PromptData(data=prompt, multi_choices=self.choices, system_prompt=self.system_prompt)
+        prompt_data = PromptData(
+            data=prompt, multi_choices=choices or self.choices, system_prompt=system_prompt or self.system_prompt)
         return prompt_data.to_dict()
 
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
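
Two DataAdapter-level changes drive the per-benchmark simplifications below: compute_dict_metric centralises the dict-of-scores aggregation that the ChineseSimpleQA, GeneralQA and IFEval adapters previously duplicated, and gen_prompt_data now accepts per-call system_prompt and choices overrides that fall back to the adapter defaults. A standalone restatement of the aggregation logic, with made-up metric names, to show the shape of its output:

from collections import defaultdict
from typing import List, Union

def compute_dict_metric(review_res_list: Union[List[dict], List[List[dict]]]) -> dict:
    # Same logic as the method added above, extracted so it can be run in isolation.
    if isinstance(review_res_list[0], list):
        review_res_list = [item for sublist in review_res_list for item in sublist]
    items = defaultdict(list)
    for scores in review_res_list:
        if isinstance(scores, dict):
            for k, v in scores.items():
                items[k].append(v)
        else:
            items['AverageAccuracy'].append(scores)
    return items

print(compute_dict_metric([{'Rouge-L': 0.42, 'bleu-4': 0.18}, {'Rouge-L': 0.55, 'bleu-4': 0.30}]))
# defaultdict(<class 'list'>, {'Rouge-L': [0.42, 0.55], 'bleu-4': [0.18, 0.3]})
print(compute_dict_metric([1, 0, 1]))
# defaultdict(<class 'list'>, {'AverageAccuracy': [1, 0, 1]})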

evalscope/benchmarks/data_collection/data_collection_adapter.py
@@ -5,7 +5,6 @@ from typing import Any, Optional
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import DEFAULT_DATASET_CACHE_DIR, EvalType, HubType
-from evalscope.models import ChatGenerationModelAdapter
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 

evalscope/benchmarks/general_qa/general_qa_adapter.py
@@ -1,7 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import os.path
 from collections import defaultdict
-from typing import List
+from typing import List, Optional, Union
 
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.metrics import bleu_ngram_one_sample, compute_rouge_score_one_sample_zh, mean
@@ -40,7 +40,7 @@ class GeneralQAAdapter(DataAdapter):
             for subset_name in subset_list:
                 data_file_dict[subset_name] = os.path.join(dataset_name_or_path, f'{subset_name}.jsonl')
         elif os.path.isfile(dataset_name_or_path):
-            cur_subset_name = os.path.basename(dataset_name_or_path).split('.')[0]
+            cur_subset_name = os.path.splitext(os.path.basename(dataset_name_or_path))[0]
            data_file_dict[cur_subset_name] = dataset_name_or_path
        else:
            raise ValueError(f'Invalid dataset path: {dataset_name_or_path}')
@@ -74,8 +74,9 @@ class GeneralQAAdapter(DataAdapter):
                To be supported in the future.')
 
        query = input_d.get('question', '') or input_d.get('query', '')
+       system_prompt = input_d.get('system')
        prompt = self.prompt_template.format(query=query)
-       return self.gen_prompt_data(prompt)
+       return self.gen_prompt_data(prompt, system_prompt=system_prompt)
 
    def get_gold_answer(self, input_d: dict) -> str:
        """
@@ -118,7 +119,7 @@
        res.update(bleu_dict)
        return res
 
-   def compute_metric(self, review_res_list: List[dict], **kwargs) -> List[dict]:
+   def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
        """
        compute weighted mean of the bleu score of all samples
 
@@ -129,12 +130,5 @@
            avg_res: List[dict]
 
        """
-       items = defaultdict(list)
-       for scores in review_res_list:
-           if isinstance(scores, dict):
-               for k, v in scores.items():
-                   items[k].append(v)
-           else:
-               items['AverageAccuracy'].append(scores)
-       # items = [(score, 1.0) for score in review_res_list]
+       items = super().compute_dict_metric(review_res_list, **kwargs)
        return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
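
Net effect for custom general_qa datasets: subset names now survive dotted filenames (os.path.splitext yields 'my.data' for 'my.data.jsonl', where the old split('.')[0] gave 'my'), an optional per-sample 'system' field is forwarded as the system prompt, and nested review results are aggregated through the shared compute_dict_metric. A hedged example of a record the updated gen_prompt path would accept; the gold-answer key is not shown in this diff and is an assumption:

import json

# One line of a hypothetical general_qa JSONL file.
sample = {
    'system': 'You are a concise assistant.',         # new: forwarded via gen_prompt_data(system_prompt=...)
    'query': 'What does the acronym RAG stand for?',  # 'question' is also accepted and takes precedence
    'response': 'Retrieval-augmented generation.',    # assumed gold-answer key, not part of this diff
}
print(json.dumps(sample, ensure_ascii=False))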

evalscope/benchmarks/hellaswag/hellaswag_adapter.py
@@ -108,7 +108,7 @@ class HellaSwagAdapter(DataAdapter):
         if self.model_adapter == OutputType.MULTIPLE_CHOICE:
             return result
         else:
-            return ResponseParser.parse_first_option(result)
+            return ResponseParser.parse_first_option(result, options=self.choices)
 
     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=str(gold), pred=str(pred))

evalscope/benchmarks/ifeval/ifeval_adapter.py
@@ -48,9 +48,6 @@ class IFEvalAdapter(DataAdapter):
 
     def compute_metric(self, review_res_list: List[dict], **kwargs) -> Any:
         # aggregate review results
-        res_dict = defaultdict(list)
-        for res in review_res_list:
-            for k, v in res.items():
-                res_dict[k].append(v)
+        res_dict = super().compute_dict_metric(review_res_list, **kwargs)
 
-        return super().compute_metric(res_dict)
+        return super().compute_metric(res_dict, **kwargs)

evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py
@@ -18,7 +18,6 @@ logger = get_logger()
     extra_params={
        'start_date': None,
        'end_date': None,
-       'num_process_evaluate': 1,
        'timeout': 6
    },
    system_prompt=
@@ -33,7 +32,6 @@ class LiveCodeBenchAdapter(DataAdapter):
 
        extra_params = kwargs.get('extra_params', {})
 
-       self.num_process_evaluate = extra_params.get('num_process_evaluate', 1)
       self.timeout = extra_params.get('timeout', 6)
       self.start_date = extra_params.get('start_date')
       self.end_date = extra_params.get('end_date')
@@ -84,7 +82,7 @@
            references,
            predictions,
            k_list=[1],
-           num_process_evaluate=self.num_process_evaluate,
+           num_process_evaluate=1,
            timeout=self.timeout,
        )
        return metrics['pass@1'] / 100  # convert to point scale
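
For Live Code Bench, num_process_evaluate is removed from both the benchmark registration and the adapter, and codegen_metrics is now always invoked with a single evaluation process. The surviving extra_params are just the contest-date window and the per-test timeout, sketched here as a plain dict; how the dict is attached to a full task configuration is outside this diff:

# extra_params still honoured by LiveCodeBenchAdapter after this change.
extra_params = {
    'start_date': '2024-08-01',  # illustrative ISO dates; the registered defaults are None
    'end_date': '2025-02-01',
    'timeout': 6,                # seconds per test case, matching the registered default
}
# A leftover 'num_process_evaluate' key is simply ignored by the adapter.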