evalscope 0.16.3__py3-none-any.whl → 0.17.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/app/app.py +9 -762
- evalscope/app/constants.py +1 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +52 -0
- evalscope/app/ui/multi_model.py +323 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +202 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +178 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +91 -0
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/backend_manager.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
- evalscope/benchmarks/__init__.py +15 -1
- evalscope/benchmarks/aime/aime24_adapter.py +2 -1
- evalscope/benchmarks/aime/aime25_adapter.py +2 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/utils.py +0 -12
- evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
- evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
- evalscope/benchmarks/data_adapter.py +29 -9
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
- evalscope/benchmarks/general_arena/utils.py +226 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +44 -30
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +118 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +2 -2
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/race/race_adapter.py +1 -1
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
- evalscope/benchmarks/utils.py +2 -2
- evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
- evalscope/config.py +8 -123
- evalscope/constants.py +5 -21
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +20 -15
- evalscope/metrics/__init__.py +9 -1
- evalscope/{utils/utils.py → metrics/completion_parsers.py} +71 -176
- evalscope/metrics/llm_judge.py +106 -20
- evalscope/metrics/metrics.py +20 -8
- evalscope/models/__init__.py +4 -8
- evalscope/models/adapters/__init__.py +4 -9
- evalscope/models/adapters/base_adapter.py +4 -0
- evalscope/models/adapters/bfcl_adapter.py +2 -0
- evalscope/models/adapters/chat_adapter.py +3 -0
- evalscope/models/adapters/choice_adapter.py +4 -0
- evalscope/models/adapters/custom_adapter.py +7 -3
- evalscope/models/adapters/server_adapter.py +4 -2
- evalscope/models/adapters/t2i_adapter.py +3 -0
- evalscope/models/adapters/tau_bench_adapter.py +189 -0
- evalscope/models/custom/dummy_model.py +3 -3
- evalscope/models/register.py +0 -14
- evalscope/perf/arguments.py +15 -16
- evalscope/perf/benchmark.py +38 -39
- evalscope/perf/http_client.py +30 -86
- evalscope/perf/main.py +3 -3
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +22 -4
- evalscope/perf/plugin/api/custom_api.py +212 -55
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +105 -0
- evalscope/perf/plugin/api/openai_api.py +17 -19
- evalscope/perf/plugin/datasets/__init__.py +10 -7
- evalscope/perf/plugin/datasets/base.py +22 -1
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +4 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +2 -1
- evalscope/perf/plugin/datasets/random_dataset.py +15 -4
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +14 -20
- evalscope/perf/utils/db_util.py +79 -61
- evalscope/report/__init__.py +1 -1
- evalscope/report/utils.py +34 -15
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -2
- evalscope/utils/__init__.py +63 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/import_utils.py +16 -0
- evalscope/utils/io_utils.py +55 -4
- evalscope/utils/model_utils.py +37 -1
- evalscope/version.py +2 -2
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/METADATA +100 -51
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/RECORD +129 -133
- tests/aigc/test_t2i.py +1 -1
- tests/cli/test_all.py +68 -4
- tests/cli/test_collection.py +1 -1
- tests/cli/test_custom.py +261 -0
- tests/cli/test_run.py +34 -70
- tests/perf/test_perf.py +31 -4
- tests/rag/test_clip_benchmark.py +2 -1
- tests/rag/test_mteb.py +3 -1
- tests/rag/test_ragas.py +3 -1
- tests/swift/test_run_swift_eval.py +2 -1
- tests/swift/test_run_swift_vlm_eval.py +2 -1
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
- tests/utils.py +13 -0
- tests/vlm/test_vlmeval.py +8 -2
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/models/model.py +0 -189
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- /evalscope/{utils → benchmarks}/filters.py +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
evalscope/backend/vlm_eval_kit/backend_manager.py CHANGED

@@ -5,7 +5,8 @@ from functools import partial
 from typing import Optional, Union

 from evalscope.backend.base import BackendManager
-from evalscope.utils import
+from evalscope.utils.import_utils import is_module_installed
+from evalscope.utils.io_utils import get_valid_list
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -68,6 +69,8 @@ class VLMEvalKitBackendManager(BackendManager):
             del remain_cfg['type']  # remove not used args

             norm_model_type = os.path.basename(model_type).replace(':', '-').replace('.', '_')
+            model_cfg['type'] = norm_model_type
+
             self.valid_models.update({norm_model_type: partial(model_class, model=model_type, **remain_cfg)})
             new_model_names.append(norm_model_type)
         else:
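Note: the second hunk also records the normalized name back into model_cfg['type']. A minimal sketch of what that normalization expression produces (the sample model id below is illustrative, not taken from the diff):

```python
import os

def normalize_model_type(model_type: str) -> str:
    # Mirrors the expression above: drop any path prefix, then make the
    # name safe for use as a config key.
    return os.path.basename(model_type).replace(':', '-').replace('.', '_')

# Hypothetical model id, for illustration only.
print(normalize_model_type('qwen/Qwen2.5-VL-7B-Instruct:latest'))
# -> 'Qwen2_5-VL-7B-Instruct-latest'
```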
evalscope/benchmarks/__init__.py CHANGED

@@ -2,6 +2,7 @@
 import glob
 import importlib
 import os
+import time

 from evalscope.benchmarks.benchmark import Benchmark, BenchmarkMeta
 from evalscope.benchmarks.data_adapter import DataAdapter
@@ -13,11 +14,24 @@ logger = get_logger()
 pattern = os.path.join(os.path.dirname(__file__), '*', '**', '*_adapter.py')
 files = glob.glob(pattern, recursive=True)

+import_times = []
+
 for file_path in files:
     if file_path.endswith('.py') and not os.path.basename(file_path).startswith('_'):
         # Convert file path to a module path
         relative_path = os.path.relpath(file_path, os.path.dirname(__file__))
         module_path = relative_path[:-3].replace(os.path.sep, '.')  # strip '.py' and convert to module path
         full_path = f'evalscope.benchmarks.{module_path}'
+
+        start_time = time.perf_counter()
         importlib.import_module(full_path)
-
+        end_time = time.perf_counter()
+
+        import_times.append((full_path, end_time - start_time))
+
+# Sort by import time in descending order
+import_times.sort(key=lambda x: x[1], reverse=True)
+
+# Log the sorted import times
+for module, duration in import_times:
+    logger.debug(f'Module {module} imported in {duration:.6f} seconds')
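The benchmark loader now times each adapter import with time.perf_counter and logs the durations, slowest first, at debug level. A self-contained sketch of the same pattern, applied to an arbitrary module list rather than the benchmark adapters:

```python
import importlib
import logging
import time

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger('import-timing')

# Illustrative module list; the real loader walks evalscope/benchmarks/**/*_adapter.py.
modules = ['json', 'csv', 'collections']

import_times = []
for name in modules:
    start = time.perf_counter()
    importlib.import_module(name)
    import_times.append((name, time.perf_counter() - start))

# Slowest imports first, mirroring the new benchmark loader.
for name, duration in sorted(import_times, key=lambda x: x[1], reverse=True):
    logger.debug('Module %s imported in %.6f seconds', name, duration)
```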
evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py CHANGED

@@ -47,7 +47,7 @@ Evaluate the models based on the quality and relevance of their outputs, and sel
 @Benchmark.register(
     name='alpaca_eval',
     pretty_name='AlpacaEval2.0',
-    tags=['Instruction-Following', '
+    tags=['Instruction-Following', 'Arena'],
     description='Alpaca Eval 2.0 is an enhanced framework for evaluating instruction-following language models, '
     'featuring an improved auto-annotator, updated baselines, and continuous preference calculation to '
     'provide more accurate and cost-effective model assessments. '
evalscope/benchmarks/arc/arc_adapter.py CHANGED

@@ -6,7 +6,7 @@ import os
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser
 from evalscope.utils.logger import get_logger

 # flake8: noqa
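Several adapters in this release now import ResponseParser from evalscope.metrics.completion_parsers (note the renamed file evalscope/{utils/utils.py → metrics/completion_parsers.py} in the listing above). The parser's API is not shown in this diff; a simplified, hypothetical illustration of the kind of answer extraction such a parser performs for multiple-choice benchmarks:

```python
import re

def extract_choice(response: str, choices=('A', 'B', 'C', 'D')) -> str:
    # Hypothetical stand-in for a completion parser: pull the first
    # standalone option letter out of a free-form model response.
    match = re.search(r'\b([A-D])\b', response)
    return match.group(1) if match and match.group(1) in choices else ''

print(extract_choice('The correct answer is B, because ...'))  # -> 'B'
```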
evalscope/benchmarks/arena_hard/arena_hard_adapter.py CHANGED

@@ -17,7 +17,7 @@ GRADER_TEMPLATE = "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's A
 @Benchmark.register(
     name='arena_hard',
     pretty_name='ArenaHard',
-    tags=['Instruction-Following', '
+    tags=['Instruction-Following', 'Arena'],
     description=
     'ArenaHard is a benchmark designed to evaluate the performance of large language models in a competitive setting, '
     'where models are pitted against each other in a series of tasks to determine their relative strengths and weaknesses. '
evalscope/benchmarks/arena_hard/utils.py CHANGED

@@ -127,18 +127,6 @@ def get_bootstrap_result(battles, func_compute_elo, num_round):
     return df[df.median().sort_values(ascending=False).index]


-def preety_print_two_ratings(ratings_1, ratings_2, column_names):
-    df = (
-        pd.DataFrame(
-            [[n, ratings_1[n], ratings_2[n]] for n in ratings_1.keys()],
-            columns=['Model', column_names[0], column_names[1]],
-        ).sort_values(column_names[0], ascending=False).reset_index(drop=True))
-    df[column_names[0]] = (df[column_names[0]] + 0.5).astype(int)
-    df[column_names[1]] = (df[column_names[1]] + 0.5).astype(int)
-    df.index = df.index + 1
-    return df
-
-
 def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
     names = sorted(list(elo_ratings.keys()))
     wins = defaultdict(lambda: defaultdict(lambda: 0))
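The unused preety_print_two_ratings helper is removed; predict_win_rate (body not shown in this hunk) keeps the conventional Elo parameters SCALE=400 and BASE=10. For reference, a sketch of the standard Elo expectation those parameters imply; this mirrors the usual formula, not necessarily the exact body of predict_win_rate:

```python
def elo_win_probability(rating_a: float, rating_b: float,
                        scale: float = 400, base: float = 10) -> float:
    # Standard Elo expectation: probability that player A beats player B.
    return 1.0 / (1.0 + base ** ((rating_b - rating_a) / scale))

print(round(elo_win_probability(1100, 1000), 3))  # -> 0.64
```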
evalscope/benchmarks/bfcl/bfcl_adapter.py CHANGED

@@ -35,7 +35,7 @@ SUBJECT_MAPPING = {
 @Benchmark.register(
     name='bfcl_v3',
     pretty_name='BFCL-v3',
-    tags=['Agent'],
+    tags=['Agent', 'Function Calling'],
     description=
     'Berkeley Function Calling Leaderboard (BFCL), the **first comprehensive and executable function call evaluation** '
     'dedicated to assessing Large Language Models\' (LLMs) ability to invoke functions. Unlike previous evaluations, '
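The tag edits above all pass through the @Benchmark.register(...) decorator. Its real implementation lives elsewhere in evalscope and is not part of this diff; a minimal sketch of how such a registration decorator can work, to give the metadata changes some context (the names below are illustrative, not evalscope's):

```python
# Minimal registry-decorator sketch; evalscope's Benchmark.register has a
# richer signature (pretty_name, description, ...) that is not shown here.
BENCHMARK_REGISTRY = {}

def register(name: str, tags=None, **meta):
    def decorator(adapter_cls):
        BENCHMARK_REGISTRY[name] = {'cls': adapter_cls, 'tags': tags or [], **meta}
        return adapter_cls
    return decorator

@register(name='demo_mcq', tags=['Knowledge', 'MCQ'])
class DemoAdapter:
    pass

print(BENCHMARK_REGISTRY['demo_mcq']['tags'])  # -> ['Knowledge', 'MCQ']
```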
evalscope/benchmarks/ceval/ceval_adapter.py CHANGED

@@ -1,11 +1,13 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import csv
 import os
+from collections import defaultdict

 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser
+from evalscope.utils.io_utils import csv_to_list
 from evalscope.utils.logger import get_logger

 # flake8: noqa
@@ -154,7 +156,7 @@ class CEVALAdapter(DataAdapter):
         self.choices = ['A', 'B', 'C', 'D']

     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-        data_dict =
+        data_dict = defaultdict(dict)
         for subset_name in subset_list:
             for split_name in [self.train_split, self.eval_split]:
                 if os.path.exists(dataset_name_or_path):
@@ -162,20 +164,7 @@ class CEVALAdapter(DataAdapter):
                 else:
                     file_path = os.path.join(work_dir, dataset_name_or_path, f'{subset_name}_{split_name}.csv')
                 if os.path.exists(file_path):
-
-                    rows = []
-                    reader = csv.reader(f)
-                    header = next(reader)
-                    for row in reader:
-                        item = dict(zip(header, row))
-                        item.setdefault('explanation', '')
-                        item.setdefault('answer', '')
-                        rows.append(item)
-
-                    if subset_name in data_dict:
-                        data_dict[subset_name].update({split_name: rows})
-                    else:
-                        data_dict[subset_name] = {split_name: rows}
+                    data_dict[subset_name][split_name] = csv_to_list(file_path)

         return data_dict
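Both the C-Eval and CMMLU loaders now delegate CSV parsing to csv_to_list from evalscope.utils.io_utils (that module grows by 55 lines in this release; its implementation is not shown here). A minimal equivalent, assuming the helper simply maps the header row onto each data row:

```python
import csv
from typing import Dict, List

def csv_to_list(file_path: str) -> List[Dict[str, str]]:
    # Minimal sketch of a csv_to_list-style helper: read a CSV with a header
    # row and return one dict per data row. The real helper in
    # evalscope.utils.io_utils may handle encodings and edge cases differently.
    with open(file_path, newline='', encoding='utf-8') as f:
        return list(csv.DictReader(f))
```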
evalscope/benchmarks/cmmlu/cmmlu_adapter.py CHANGED

@@ -2,11 +2,13 @@

 import csv
 import os
+from collections import defaultdict

 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser
+from evalscope.utils.io_utils import csv_to_list
 from evalscope.utils.logger import get_logger

 # flake8: noqa
@@ -126,29 +128,15 @@ class CMMLUAdapter(DataAdapter):
         self.choices = ['A', 'B', 'C', 'D']

     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-        data_dict =
+        data_dict = defaultdict(dict)
         for subset_name in subset_list:
-            data_dict[subset_name] = {}
             for split_name in [self.train_split, self.eval_split]:
-
+                if os.path.exists(dataset_name_or_path):
+                    file_path = os.path.join(dataset_name_or_path, split_name, f'{subset_name}.csv')
+                else:
+                    file_path = os.path.join(work_dir, dataset_name_or_path, split_name, f'{subset_name}.csv')
                 if os.path.exists(file_path):
-
-                    rows = []
-                    reader = csv.reader(f)
-                    for row in reader:
-                        if len(row) != 7:
-                            logger.error(f'Mismatch len of row: {row}, len of row should be 6. Skip this row.')
-                            continue
-                        rows.append({
-                            'Question': row[1],
-                            'A': row[2],
-                            'B': row[3],
-                            'C': row[4],
-                            'D': row[5],
-                            'Answer': row[6],
-                        })
-
-                    data_dict[subset_name].update({split_name: rows})
+                    data_dict[subset_name][split_name] = csv_to_list(file_path)

         return data_dict
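As in the C-Eval adapter, the manual nested-dict bookkeeping (pre-creating data_dict[subset_name] and, in C-Eval, the 'if subset_name in data_dict' branch) is replaced by defaultdict(dict), so data_dict[subset][split] = rows works without any initialization. A quick sketch of that pattern:

```python
from collections import defaultdict

data_dict = defaultdict(dict)

# No need to create data_dict['agronomy'] first; the inner dict appears on demand.
data_dict['agronomy']['test'] = [{'Question': '...', 'Answer': 'A'}]
data_dict['agronomy']['dev'] = []

print(dict(data_dict))  # {'agronomy': {'test': [...], 'dev': []}}
```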
evalscope/benchmarks/competition_math/competition_math_adapter.py CHANGED

@@ -105,7 +105,8 @@ class CompetitionMathAdapter(DataAdapter):
         return result

     def match(self, gold: str, pred: str) -> float:
-
+        res = math_equal(pred, gold)
+        return 1.0 if res else 0.0

     @classmethod
     def _generate_prompt(cls, input_d: dict, use_fewshot: bool = True) -> str:
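match now turns the boolean from math_equal into a 1.0/0.0 score. math_equal comes from evalscope's metrics and its implementation is not part of this diff; a rough, hypothetical stand-in using sympy captures the idea of value-level (rather than string-level) equality:

```python
import sympy  # assumption: a sympy-style check; the real math_equal may differ

def math_equal_sketch(pred: str, gold: str) -> bool:
    # Hypothetical simplification: answers count as equal if their difference
    # simplifies to zero, with a fallback to normalized string comparison.
    try:
        return sympy.simplify(sympy.sympify(pred) - sympy.sympify(gold)) == 0
    except (sympy.SympifyError, TypeError):
        return pred.strip() == gold.strip()

def match(gold: str, pred: str) -> float:
    return 1.0 if math_equal_sketch(pred, gold) else 0.0

print(match('1/2', '0.5'))  # -> 1.0
```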
evalscope/benchmarks/data_adapter.py CHANGED

@@ -168,7 +168,12 @@ class DataAdapter(ABC):
         If you want to support local dataset, please rewrite this method in xxx_data_adapter.
         Use modelscope.msdatasets.MsDataset.load to load the dataset from local by default.
         """
-
+        # remove dataset_infos.json file if exists, since MsDataset will occur an error if it exists.
+        dataset_infos_path = os.path.join(dataset_name_or_path, 'dataset_infos.json')
+        if os.path.exists(dataset_infos_path):
+            logger.info(f'Removing dataset_infos.json file at {dataset_infos_path} to avoid MsDataset errors.')
+            os.remove(dataset_infos_path)
+        return self.load_from_hub(dataset_name_or_path, subset_list, None, **kwargs)

     def load_with_snapshot(self,
                            file_structure: Dict[str, List[str]],
@@ -382,7 +387,7 @@ class DataAdapter(ABC):
         pass

     def gen_prompt_data(self,
-                        prompt: str,
+                        prompt: str = '',
                         system_prompt: Optional[str] = None,
                         choices: Optional[List[str]] = None,
                         index: Optional[Union[int, str]] = None,
@@ -413,7 +418,8 @@ class DataAdapter(ABC):
             system_prompt=system_prompt or self.system_prompt,
             index=index or 0,
             id=id,
-            messages=messages
+            messages=messages,
+            extra_data=kwargs.get('extra_data', None))
         return prompt_data.to_dict()

     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> Any:
@@ -449,7 +455,6 @@ class DataAdapter(ABC):
         """
         raise NotImplementedError

-    @abstractmethod
     def parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> Any:
         """
         Parse the predicted result and extract proper answer.
@@ -462,9 +467,22 @@ class DataAdapter(ABC):
         Returns:
             The parsed answer. Depending on the dataset. Usually a string for chat.
         """
-
+        return result
+
+    def llm_parse_pred_result(self, result: Any, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> Any:
+        """
+        Parse the predicted result using LLM.
+
+        Args:
+            result (Any): The predicted answer from the model.
+            raw_input_d (dict): The raw input data.
+            eval_type (str): The evaluation type, default is 'checkpoint'.
+
+        Returns:
+            The parsed answer. Usually a string for chat.
+        """
+        return result

-    @abstractmethod
     def match(self, gold: Any, pred: Any) -> Any:
         """
         Match the gold answer and the predicted answer.
@@ -478,7 +496,7 @@ class DataAdapter(ABC):
         Returns:
             The match result. Usually a score (float) for chat/multiple-choice-questions.
         """
-
+        return 1.0 if gold == pred else 0.0

     def llm_match(self, gold: Any, pred: Any, judge: Optional[LLMJudge] = None, **kwargs) -> float:
         """
@@ -504,5 +522,7 @@ class DataAdapter(ABC):

         # Request judge and obtain score
         prompt = judge.build_prompt(pred, gold, question)
-
-
+        judge_response = judge(prompt)
+        score = judge.get_score(judge_response)
+
+        return score
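The DataAdapter changes turn parse_pred_result and match from abstract methods into overridable defaults (identity parse, exact-match scoring), add llm_parse_pred_result, and complete llm_match with the judge round-trip shown in the last hunk. A sketch of that judge protocol with a stub object, showing the call sequence the base class now expects; the stub's scoring logic is illustrative only:

```python
# Sketch of the judge protocol used by llm_match: build_prompt -> __call__ -> get_score.
# This stub fakes the reply; evalscope's LLMJudge calls a real judge model.
class StubJudge:
    def build_prompt(self, pred: str, gold: str, question: str) -> str:
        return (f'Question: {question}\nReference: {gold}\nAnswer: {pred}\n'
                'Reply with "1" if the answer matches the reference, else "0".')

    def __call__(self, prompt: str) -> str:
        return '1' if 'Reference: 4' in prompt and 'Answer: 4' in prompt else '0'

    def get_score(self, judge_response: str) -> float:
        return 1.0 if judge_response.strip() == '1' else 0.0

judge = StubJudge()
prompt = judge.build_prompt(pred='4', gold='4', question='What is 2 + 2?')
print(judge.get_score(judge(prompt)))  # -> 1.0
```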