evalscope 0.16.2__py3-none-any.whl → 0.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (117)
  1. evalscope/app/app.py +9 -762
  2. evalscope/app/constants.py +1 -0
  3. evalscope/app/ui/__init__.py +20 -0
  4. evalscope/app/ui/app_ui.py +52 -0
  5. evalscope/app/ui/multi_model.py +323 -0
  6. evalscope/app/ui/sidebar.py +42 -0
  7. evalscope/app/ui/single_model.py +202 -0
  8. evalscope/app/ui/visualization.py +36 -0
  9. evalscope/app/utils/data_utils.py +178 -0
  10. evalscope/app/utils/localization.py +221 -0
  11. evalscope/app/utils/text_utils.py +119 -0
  12. evalscope/app/utils/visualization.py +91 -0
  13. evalscope/backend/opencompass/backend_manager.py +2 -1
  14. evalscope/backend/rag_eval/backend_manager.py +2 -1
  15. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  16. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
  17. evalscope/benchmarks/__init__.py +15 -1
  18. evalscope/benchmarks/aime/aime24_adapter.py +2 -1
  19. evalscope/benchmarks/aime/aime25_adapter.py +2 -1
  20. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
  21. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  22. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
  23. evalscope/benchmarks/arena_hard/utils.py +0 -12
  24. evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
  25. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
  26. evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
  27. evalscope/benchmarks/data_adapter.py +20 -5
  28. evalscope/benchmarks/general_arena/__init__.py +0 -0
  29. evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
  30. evalscope/benchmarks/general_arena/utils.py +226 -0
  31. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  32. evalscope/benchmarks/general_qa/general_qa_adapter.py +42 -29
  33. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  34. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
  35. evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
  36. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
  37. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
  38. evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
  39. evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
  40. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  41. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  42. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  43. evalscope/benchmarks/race/race_adapter.py +1 -1
  44. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
  45. evalscope/benchmarks/utils.py +1 -2
  46. evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
  47. evalscope/config.py +8 -123
  48. evalscope/evaluator/evaluator.py +15 -12
  49. evalscope/metrics/__init__.py +6 -0
  50. evalscope/{utils/utils.py → metrics/completion_parsers.py} +68 -180
  51. evalscope/metrics/llm_judge.py +105 -20
  52. evalscope/metrics/metrics.py +1 -1
  53. evalscope/models/adapters/base_adapter.py +0 -2
  54. evalscope/models/adapters/server_adapter.py +2 -2
  55. evalscope/models/custom/dummy_model.py +3 -3
  56. evalscope/perf/arguments.py +2 -16
  57. evalscope/perf/main.py +1 -1
  58. evalscope/perf/utils/analysis_result.py +24 -23
  59. evalscope/perf/utils/benchmark_util.py +1 -1
  60. evalscope/report/__init__.py +1 -1
  61. evalscope/report/utils.py +34 -15
  62. evalscope/run.py +1 -1
  63. evalscope/summarizer.py +1 -2
  64. evalscope/utils/__init__.py +63 -2
  65. evalscope/utils/argument_utils.py +64 -0
  66. evalscope/utils/import_utils.py +16 -0
  67. evalscope/utils/io_utils.py +45 -4
  68. evalscope/utils/model_utils.py +37 -1
  69. evalscope/version.py +2 -2
  70. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/METADATA +55 -26
  71. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/RECORD +90 -101
  72. tests/aigc/test_t2i.py +1 -1
  73. tests/cli/test_all.py +50 -2
  74. tests/cli/test_collection.py +1 -1
  75. tests/cli/test_custom.py +261 -0
  76. tests/cli/test_run.py +13 -37
  77. tests/perf/test_perf.py +2 -2
  78. tests/rag/test_clip_benchmark.py +2 -1
  79. tests/rag/test_mteb.py +3 -1
  80. tests/rag/test_ragas.py +3 -1
  81. tests/swift/test_run_swift_eval.py +2 -1
  82. tests/swift/test_run_swift_vlm_eval.py +2 -1
  83. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
  84. tests/utils.py +13 -0
  85. tests/vlm/test_vlmeval.py +8 -2
  86. evalscope/evaluator/rating_eval.py +0 -157
  87. evalscope/evaluator/reviewer/__init__.py +0 -1
  88. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  89. evalscope/registry/__init__.py +0 -1
  90. evalscope/registry/config/cfg_arena.yaml +0 -77
  91. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  92. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  93. evalscope/registry/config/cfg_single.yaml +0 -78
  94. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  95. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  96. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  97. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  98. evalscope/registry/data/question.jsonl +0 -80
  99. evalscope/registry/tasks/arc.yaml +0 -28
  100. evalscope/registry/tasks/bbh.yaml +0 -26
  101. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  102. evalscope/registry/tasks/ceval.yaml +0 -27
  103. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  104. evalscope/registry/tasks/cmmlu.yaml +0 -27
  105. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  106. evalscope/registry/tasks/general_qa.yaml +0 -27
  107. evalscope/registry/tasks/gsm8k.yaml +0 -29
  108. evalscope/registry/tasks/mmlu.yaml +0 -29
  109. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  110. evalscope/run_arena.py +0 -202
  111. evalscope/utils/arena_utils.py +0 -217
  112. evalscope/utils/completion_parsers.py +0 -82
  113. /evalscope/{utils → benchmarks}/filters.py +0 -0
  114. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/LICENSE +0 -0
  115. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/WHEEL +0 -0
  116. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/entry_points.txt +0 -0
  117. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/top_level.txt +0 -0
--- a/evalscope/utils/arena_utils.py
+++ /dev/null
@@ -1,217 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- # Copyright (c) lmsys.org.
-
- import numpy as np
- import pandas as pd
- import pyarrow as pa
- import random
- from collections import OrderedDict, defaultdict
- from typing import List, Sequence, Union
-
- from evalscope.utils.logger import get_logger
-
- logger = get_logger()
-
-
- def compute_elo(battles,
-                 col_model_a='model_a',
-                 col_model_b='model_b',
-                 col_win='win',
-                 tie_values=['tie', 'tie (bothbad)'],
-                 k=32,
-                 scale=400,
-                 base=10,
-                 init_rating=1000):
-     rating = defaultdict(lambda: init_rating)
-
-     for rd, model_a, model_b, win in battles[[col_model_a, col_model_b, col_win]].itertuples():
-         ra = rating[model_a]
-         rb = rating[model_b]
-         ea = 1 / (1 + base**((rb - ra) / scale))
-         eb = 1 / (1 + base**((ra - rb) / scale))
-         if win == col_model_a:
-             sa = 1
-         elif win == col_model_b:
-             sa = 0
-         elif win in tie_values:
-             sa = 0.5
-         else:
-             raise Exception(f'unexpected vote {win}')
-         rating[model_a] += k * (sa - ea)
-         rating[model_b] += k * (1 - sa - eb)
-
-     return rating
-
-
- def merge_ques_ans(answer_list_all, merge_key: str = 'question_id', merge_mode: str = 'inner') -> pd.DataFrame:
-     """
-     Merge question and answer list to unifiled data.
-
-     Args:
-         answer_list_all: list of answer list,
-             e.g. [ans1_list, ans2_list, ...], an ans_list is predicted answers
-             of a specific model, must contain following columns: 'question_id',
-             'text', 'category', 'model_id', 'answer'
-         merge_key: key for dataframe merging
-         merge_mode: mode for dataframe merging,
-             e.g. 'inner', 'left', 'right', 'outer'
-
-     Returns:
-         pandas DataFrame: merged dataframe, e.g. columns are
-             ['question_id', 'gpt-3.5-turbo', 'llama2-7b']
-     """
-     ans_df = pd.DataFrame()
-     for ans_list in answer_list_all:
-         ans_list = [{'question_id': item['question_id'], item['model_id']: item} for item in ans_list]
-         if ans_df.empty:
-             ans_df = pa.Table.from_pylist(ans_list).to_pandas()
-         else:
-             ans_df = pd.merge(ans_df, pa.Table.from_pylist(ans_list).to_pandas(), on=merge_key, how=merge_mode)
-
-     return ans_df
-
-
- def get_battle_pairs(columns: List[str], baseline_idx: int = -1) -> List[tuple]:
-     """
-     Get battle pair names from columns.
-
-     Args:
-         columns: list of column names.
-
-     Returns:
-         list of battle pairs.
-
-     Example:
-         >>> columns = ['A', 'B', 'C']
-         >>> res = get_battle_pairs(columns)
-         >>> print(res)
-         >>> [('B', 'A'), ('C', 'A'), ('C', 'B')]
-
-         >>> columns = ['A', 'B', 'C']
-         >>> res = get_battle_pairs(columns, 2)
-         >>> print(res)
-         >>> [('A', 'C'), ('B', 'C')]
-     """
-     res_list = []
-
-     cols_num = len(columns)
-     if cols_num <= 0:
-         return res_list
-
-     if baseline_idx != -1:
-         n_column = columns[baseline_idx]
-         res_list = [(column, n_column) for column in columns if column != n_column]
-     else:
-         mat = np.ones((cols_num, cols_num))
-         mat_lower_tril = np.tril(mat, k=-1)
-         x_ids, y_ids = np.where(mat_lower_tril == 1)
-         res_list = [(columns[x_id], columns[y_id]) for x_id, y_id in zip(x_ids, y_ids)]
-
-     return res_list
-
-
- def get_battle_pairs_origin(columns: List[str], compare_base: bool = False, swap: bool = False): # TODO: to refactor
-     """
-     Get battle pair names from columns.
-
-     Args:
-         columns: list of column names.
-
-     Returns:
-         list of battle pairs.
-
-     Example:
-         >>> columns = ['A', 'B', 'C']
-         >>> res = get_battle_pairs(columns)
-         >>> print(res)
-         >>> [('B', 'A'), ('C', 'A'), ('C', 'B')]
-     """
-     res_list = []
-
-     cols_num = len(columns)
-     if cols_num <= 0:
-         return res_list
-
-     if not compare_base:
-         mat = np.ones((cols_num, cols_num))
-         mat_lower_tril = np.tril(mat, k=-1)
-         x_ids, y_ids = np.where(mat_lower_tril == 1)
-         res_list = [(columns[x_id], columns[y_id]) for x_id, y_id in zip(x_ids, y_ids)]
-     else:
-         for column in columns[1:]:
-             res_list.append((columns[0], column))
-
-     if swap:
-         res_list.extend([(j, i) for i, j in res_list])
-     return res_list
-
-
- def shuffle_pairwise_preferences(df: pd.DataFrame, arr_is_shuffle: Sequence[int]) -> pd.DataFrame:
-     """Shuffle the outputs of a pairwise preference dataframe.
-
-     Examples
-     --------
-     >>> df = pd.DataFrame([dict(instruction='2+2', output_1='3', output_2='4', preference=2),
-                            dict(instruction='2+3', output_1='5', output_2='4', preference=1)])
-     >>> print(shuffle_pairwise_preferences(df, [True, False]))
-       instruction output_1 output_2  preference
-     0         2+2        4        3           1
-     1         2+3        5        4           1
-     """
-     col_1 = df['output_1'].copy()
-     col_2 = df['output_2'].copy()
-     df['output_1'] = np.where(arr_is_shuffle, col_2, col_1)
-     df['output_2'] = np.where(arr_is_shuffle, col_1, col_2)
-
-     if 'preference' in df.columns:
-         df['preference'] = np.where(arr_is_shuffle, 3 - df['preference'], df['preference'])
-
-     return df
-
-
- class BattlePairSelection:
-     """
-     Select battle pairs by specific strategy.
-
-     Attributes:
-         model_elo_map(dict): map of model_id--base_elo_score
-     """
-
-     DEFAULT_K = 5
-
-     def __init__(self, model_elo_map: Union[dict, OrderedDict]):
-         # Make sure model_elo_map to be ordered when compare_base is true.
-         self.model_elo_map = model_elo_map
-
-     def top_k(self, k: int = DEFAULT_K, compare_base: bool = False, swap: bool = False) -> list:
-         if k <= 0:
-             k = self.DEFAULT_K
-         sorted_res = sorted(self.model_elo_map.items(), key=lambda x: x[1])[:k]
-         sorted_res = list(dict(sorted_res).keys())
-         return get_battle_pairs_origin(sorted_res, compare_base, swap)
-
-     def random_k(self, k: int = DEFAULT_K, compare_base: bool = False, swap: bool = False) -> list:
-         if k <= 0:
-             k = self.DEFAULT_K
-         if k > len(self.model_elo_map):
-             k = len(self.model_elo_map)
-         candidate_list = list(self.model_elo_map.items())
-         k = len(candidate_list) if k > len(candidate_list) else k
-         res = dict(random.sample(candidate_list, k=k))
-         res = list(res.keys())
-         return get_battle_pairs_origin(res, compare_base, swap)
-
-     def volatility_index(self, frac: float = 0.2, compare_base: bool = False, swap: bool = False) -> list:
-         res_list = []
-         candidate_list = get_battle_pairs_origin(list(self.model_elo_map.keys()), compare_base, swap)
-         for t in candidate_list:
-             model_a = t[0]
-             model_b = t[1]
-             base_elo_a = self.model_elo_map.get(model_a)
-             base_elo_b = self.model_elo_map.get(model_b)
-
-             vol_frac = abs(base_elo_b - base_elo_a) / max(base_elo_a, base_elo_b)
-             if vol_frac <= frac:
-                 res_list.append(t)
-
-         return res_list
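For reference, the removed compute_elo walks each battle record and applies the standard Elo expected-score and update rule. Below is a minimal standalone sketch of that per-battle update, using the function's own defaults (k=32, scale=400, base=10) rather than any evalscope API:

    # Per-battle Elo update as performed by the removed compute_elo.
    # sa is model A's score for the battle: 1 = win, 0 = loss, 0.5 = tie.
    def elo_update(ra, rb, sa, k=32, scale=400, base=10):
        ea = 1 / (1 + base**((rb - ra) / scale))  # expected score of A
        eb = 1 / (1 + base**((ra - rb) / scale))  # expected score of B
        return ra + k * (sa - ea), rb + k * ((1 - sa) - eb)

    print(elo_update(1000, 1000, 1))  # A beats B at equal ratings -> (1016.0, 984.0)

With equal starting ratings of 1000, a single win moves the pair to 1016 / 984, reflecting the k=32 step size.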
--- a/evalscope/utils/completion_parsers.py
+++ /dev/null
@@ -1,82 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- # flake8: noqa
-
- import ast
- import re
-
- # from . import utils as ann_utils
- from evalscope.constants import ArenaWinner
- from evalscope.utils.logger import get_logger
-
- logger = get_logger()
-
- one_score_pattern = re.compile('\[\[(\d+\.?\d*)\]\]')
- one_score_pattern_backup = re.compile('\[(\d+\.?\d*)\]')
-
-
- # modified from: https://github.com/lm-sys/FastChat/blob/main/fastchat/eval/eval_gpt_review.py#L47
- # does not work with batched completions
- def lmsys_parser(completion, output_format):
-     if output_format == '[[rating]]':
-         match = re.search(one_score_pattern, completion)
-         if not match:
-             match = re.search(one_score_pattern_backup, completion)
-
-         if match:
-             rating = ast.literal_eval(match.groups()[0])
-         else:
-             logger.error(f'Content: {completion}\n'
-                          'You must manually fix the score.')
-             rating = -1
-
-         return rating
-     if output_format == '[[rating_a,rating_b]]':
-         try:
-             score_pair = completion.split('\n')[0]
-             score_pair = score_pair.replace(',', ' ')
-             sp = score_pair.split(' ')
-             if len(sp) == 2:
-                 score_1 = float(sp[0])
-                 score_2 = float(sp[1])
-                 if score_1 > score_2:
-                     winner = ArenaWinner.MODEL_A
-                 elif score_1 < score_2:
-                     winner = ArenaWinner.MODEL_B
-                 else:
-                     if score_1 == score_1 == -1:
-                         winner = ArenaWinner.UNKNOWN
-                     winner = ArenaWinner.TIE
-                 return winner, [score_1, score_2]
-             else:
-                 raise Exception('Invalid score pair.')
-         except Exception as e:
-             logger.error(f'{e}\nContent: {completion}\nYou must manually fix the score pair.')
-             return ArenaWinner.UNKNOWN, [-1, -1]
-     elif output_format == '[[A]]':
-         if '[[A]]' in completion:
-             winner = ArenaWinner.MODEL_A
-         elif '[[B]]' in completion:
-             winner = ArenaWinner.MODEL_B
-         elif '[[C]]' in completion:
-             winner = ArenaWinner.TIE
-         else:
-             logger.error(f'\nContent: {completion}\nYou must manually fix the score.')
-             winner = ArenaWinner.UNKNOWN
-         return winner
-
-
- def ranking_parser(completion, **kwargs):
-     try:
-         if isinstance(completion, str):
-             ordered_completions = ast.literal_eval(completion)
-         else:
-             ordered_completions = completion
-
-         rank = [c for c in ordered_completions if c['model'] == 'model_a'][0]['rank']
-         assert rank in [1, 2]
-
-         return ArenaWinner.MODEL_A if rank == 1 else ArenaWinner.MODEL_B
-     except Exception as e:
-         logger.error(f'{e}\nContent: {completion}\n'
-                      'You must manually fix the score pair.')
-         return ArenaWinner.UNKNOWN
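For context, the removed lmsys_parser handles three judge-output formats ('[[rating]]', '[[rating_a,rating_b]]' and '[[A]]'). Below is a minimal sketch of just the single-score path, reusing the same regex strategy (prefer [[x]], fall back to [x]); it is illustrative only and not the API of the new evalscope/metrics/completion_parsers.py module:

    import re

    # Same two-pattern strategy as the removed parser: double brackets first, then single.
    ONE_SCORE = re.compile(r'\[\[(\d+\.?\d*)\]\]')
    ONE_SCORE_BACKUP = re.compile(r'\[(\d+\.?\d*)\]')

    def parse_single_rating(completion: str) -> float:
        # Return -1 when no score can be found, as the removed parser did.
        match = ONE_SCORE.search(completion) or ONE_SCORE_BACKUP.search(completion)
        return float(match.group(1)) if match else -1.0

    print(parse_single_rating('Helpful and correct. Rating: [[8.5]]'))  # 8.5
    print(parse_single_rating('no score here'))                         # -1.0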