evalscope 0.16.1__py3-none-any.whl → 0.16.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/app/app.py +20 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
- evalscope/backend/rag_eval/utils/embedding.py +2 -4
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
- evalscope/benchmarks/aime/aime24_adapter.py +3 -1
- evalscope/benchmarks/aime/aime25_adapter.py +3 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
- evalscope/benchmarks/arc/arc_adapter.py +3 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
- evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
- evalscope/benchmarks/benchmark.py +1 -0
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
- evalscope/benchmarks/data_adapter.py +2 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +1 -0
- evalscope/benchmarks/drop/drop_adapter.py +3 -0
- evalscope/benchmarks/frames/frames_adapter.py +1 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
- evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
- evalscope/benchmarks/musr/musr_adapter.py +3 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +15 -8
- evalscope/benchmarks/needle_haystack/utils.py +2 -2
- evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
- evalscope/benchmarks/race/race_adapter.py +3 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +5 -0
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
- evalscope/collections/evaluator.py +50 -28
- evalscope/constants.py +1 -1
- evalscope/evaluator/evaluator.py +6 -5
- evalscope/metrics/t2v_metrics/__init__.py +9 -23
- evalscope/models/adapters/__init__.py +2 -0
- evalscope/models/adapters/base_adapter.py +31 -27
- evalscope/models/adapters/bfcl_adapter.py +244 -0
- evalscope/models/adapters/server_adapter.py +78 -17
- evalscope/models/custom/custom_model.py +0 -3
- evalscope/models/custom/dummy_model.py +77 -39
- evalscope/models/local_model.py +1 -1
- evalscope/models/register.py +2 -1
- evalscope/perf/arguments.py +2 -0
- evalscope/perf/benchmark.py +16 -3
- evalscope/perf/plugin/api/openai_api.py +2 -0
- evalscope/report/combinator.py +38 -12
- evalscope/report/utils.py +24 -1
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -1
- evalscope/utils/io_utils.py +59 -2
- evalscope/version.py +2 -2
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/METADATA +4 -3
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/RECORD +82 -79
- tests/aigc/test_t2i.py +8 -8
- tests/cli/test_all.py +40 -33
- tests/cli/test_collection.py +4 -3
- tests/cli/test_run.py +36 -21
- tests/rag/test_clip_benchmark.py +5 -1
- tests/rag/test_mteb.py +46 -2
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/LICENSE +0 -0
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/WHEEL +0 -0
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/general_mcq/general_mcq_adapter.py
CHANGED
@@ -1,11 +1,12 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import csv
 import os
+from collections import defaultdict

 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
 from evalscope.utils import ResponseParser
+from evalscope.utils.io_utils import csv_to_list, jsonl_to_list
 from evalscope.utils.logger import get_logger

 # flake8: noqa

@@ -15,7 +16,9 @@ logger = get_logger()

 @Benchmark.register(
     name='general_mcq',
-    pretty_name='General
+    pretty_name='General-MCQ',
+    description='A general multiple-choice question answering dataset.',
+    tags=['MCQ', 'Custom'],
     dataset_id='general_mcq',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],

@@ -24,7 +27,7 @@ logger = get_logger()
     few_shot_num=0,
     train_split='dev',
     eval_split='val',
-    prompt_template='
+    prompt_template='请回答问题，并选出其中的正确答案。你的回答的最后一行应该是这样的格式：“答案是：LETTER”（不带引号），其中 LETTER 是 A、B、C、D 中的一个。\n{query}',
     query_template='问题：{question}\n{choices}\n答案: {answer}\n\n')
 class GeneralMCQAdapter(DataAdapter):

@@ -34,28 +37,21 @@ class GeneralMCQAdapter(DataAdapter):
         self.choices = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']

     def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
-        data_dict =
+        data_dict = defaultdict(dict)
         for subset_name in subset_list:
             for split_name in [self.train_split, self.eval_split]:
-                ...
-                if subset_name in data_dict:
-                    data_dict[subset_name].update({split_name: rows})
-                else:
-                    data_dict[subset_name] = {split_name: rows}
-
-        return data_dict
+                # Check for files with different extensions
+                for ext, loader in [('.jsonl', jsonl_to_list), ('.csv', csv_to_list)]:
+                    if os.path.exists(dataset_name_or_path):
+                        file_path = os.path.join(dataset_name_or_path, f'{subset_name}_{split_name}{ext}')
+                    else:
+                        file_path = os.path.join(work_dir, dataset_name_or_path, f'{subset_name}_{split_name}{ext}')
+
+                    if os.path.exists(file_path):
+                        data_dict[subset_name][split_name] = loader(file_path)
+                        break  # Stop checking other extensions once a file is found
+
+        return dict(data_dict)

     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
         """
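
Note: the rewritten `load_from_disk` above replaces the old CSV-only reader with an extension fallback, trying `{subset}_{split}.jsonl` first and then `{subset}_{split}.csv`. A minimal standalone sketch of that lookup, using stdlib stand-ins for the `evalscope.utils.io_utils` helpers (all function names, paths, and defaults below are illustrative, not evalscope's API):

import csv
import json
import os
from collections import defaultdict


def jsonl_to_list(path):  # stand-in for evalscope.utils.io_utils.jsonl_to_list
    with open(path, encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def csv_to_list(path):  # stand-in for evalscope.utils.io_utils.csv_to_list
    with open(path, encoding='utf-8', newline='') as f:
        return list(csv.DictReader(f))


def load_local_mcq(dataset_dir, subsets=('example',), splits=('dev', 'val')):
    """Try <subset>_<split>.jsonl first, then .csv, mirroring the new loader."""
    data = defaultdict(dict)
    for subset in subsets:
        for split in splits:
            for ext, loader in (('.jsonl', jsonl_to_list), ('.csv', csv_to_list)):
                path = os.path.join(dataset_dir, f'{subset}_{split}{ext}')
                if os.path.exists(path):
                    data[subset][split] = loader(path)
                    break  # prefer .jsonl; stop at the first format found
    return dict(data)
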
evalscope/benchmarks/general_qa/general_qa_adapter.py
CHANGED
@@ -13,6 +13,9 @@ logger = get_logger()

 @Benchmark.register(
     name='general_qa',
+    pretty_name='General-QA',
+    description='General Question Answering dataset',
+    tags=['QA', 'Custom'],
     dataset_id='general_qa',
     subset_list=['default'],
     metric_list=['AverageBLEU', 'AverageRouge'],
evalscope/benchmarks/gpqa/gpqa_adapter.py
CHANGED
@@ -10,6 +10,9 @@ from evalscope.metrics import exact_match
 @Benchmark.register(
     name='gpqa',
     pretty_name='GPQA',
+    tags=['MCQ', 'Knowledge'],
+    description=
+    'GPQA is a dataset for evaluating the reasoning ability of large language models (LLMs) on complex mathematical problems. It contains questions that require step-by-step reasoning to arrive at the correct answer.', # noqa: E501
     dataset_id='modelscope/gpqa',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/gsm8k/gsm8k_adapter.py
CHANGED
@@ -15,6 +15,9 @@ logger = get_logger()
 @Benchmark.register(
     name='gsm8k',
     pretty_name='GSM8K',
+    tags=['Mathematics'],
+    description=
+    'GSM8K (Grade School Math 8K) is a dataset of grade school math problems, designed to evaluate the mathematical reasoning abilities of AI models.',
     dataset_id='modelscope/gsm8k',
     subset_list=['main'],
     metric_list=['AverageAccuracy'],
evalscope/benchmarks/hellaswag/hellaswag_adapter.py
CHANGED
@@ -18,6 +18,9 @@ logger = get_logger()
 @Benchmark.register(
     name='hellaswag',
     pretty_name='HellaSwag',
+    tags=['Commonsense', 'MCQ', 'Knowledge'],
+    description=
+    'HellaSwag is a benchmark for commonsense reasoning in natural language understanding tasks. It consists of multiple-choice questions where the model must select the most plausible continuation of a given context.',
     dataset_id='modelscope/hellaswag',
     model_adapter=OutputType.MULTIPLE_CHOICE,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/humaneval/humaneval_adapter.py
CHANGED
@@ -13,6 +13,9 @@ logger = get_logger()
 @Benchmark.register(
     name='humaneval',
     pretty_name='HumanEval',
+    tags=['Coding'],
+    description=
+    'HumanEval is a benchmark for evaluating the ability of code generation models to write Python functions based on given specifications. It consists of programming tasks with a defined input-output behavior.', # noqa: E501
     dataset_id='modelscope/humaneval',
     subset_list=['openai_humaneval'],
     metric_list=['Pass@1'],
evalscope/benchmarks/ifeval/ifeval_adapter.py
CHANGED
@@ -10,6 +10,9 @@ from evalscope.metrics import Metric, mean, metric_registry
 @Benchmark.register(
     name='ifeval',
     pretty_name='IFEval',
+    tags=['Instruction-Following'],
+    description=
+    'IFEval is a benchmark for evaluating instruction-following language models, focusing on their ability to understand and respond to various prompts. It includes a diverse set of tasks and metrics to assess model performance comprehensively.', # noqa: E501
     dataset_id='opencompass/ifeval',
     subset_list=['default'],
     metric_list=[
evalscope/benchmarks/iquiz/iquiz_adapter.py
CHANGED
@@ -7,6 +7,9 @@ from evalscope.utils.utils import ResponseParser
 @Benchmark.register(
     name='iquiz',
     pretty_name='IQuiz',
+    tags=['Knowledge', 'MCQ', 'Chinese'],
+    description=
+    'IQuiz is a benchmark for evaluating AI models on IQ and EQ questions. It consists of multiple-choice questions where the model must select the correct answer and provide an explanation.', # noqa: E501
     dataset_id='AI-ModelScope/IQuiz',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py
CHANGED
@@ -8,7 +8,10 @@ logger = get_logger()

 @Benchmark.register(
     name='live_code_bench',
-    pretty_name='Live
+    pretty_name='Live-Code-Bench',
+    tags=['Coding'],
+    description=
+    'Live Code Bench is a benchmark for evaluating code generation models on real-world coding tasks. It includes a variety of programming problems with test cases to assess the model\'s ability to generate correct and efficient code solutions.', # noqa: E501
     dataset_id='AI-ModelScope/code_generation_lite',
     subset_list=['release_latest'],
     metric_list=['Pass@1'],
evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py
CHANGED
@@ -11,6 +11,9 @@ SUBSET_LIST = ['default']
 @Benchmark.register(
     name='maritime_bench',
     pretty_name='MaritimeBench',
+    tags=['Maritime', 'MCQ', 'Knowledge'],
+    description=
+    'MaritimeBench is a benchmark for evaluating AI models on maritime-related multiple-choice questions. It consists of questions related to maritime knowledge, where the model must select the correct answer from given options.', # noqa: E501
     dataset_id='HiDolphin/MaritimeBench',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/math_500/math_500_adapter.py
CHANGED
@@ -10,6 +10,9 @@ logger = get_logger()
 @Benchmark.register(
     name='math_500',
     pretty_name='MATH-500',
+    tags=['Mathematics'],
+    description=
+    "MATH-500 is a benchmark for evaluating mathematical reasoning capabilities of AI models. It consists of 500 diverse math problems across five levels of difficulty, designed to test a model's ability to solve complex mathematical problems by generating step-by-step solutions and providing the correct final answer.", # noqa: E501
     dataset_id='AI-ModelScope/MATH-500',
     subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
     metric_list=['AveragePass@1'],
evalscope/benchmarks/mmlu/mmlu_adapter.py
CHANGED
@@ -136,6 +136,9 @@ SUBJECT_MAPPING = {
 @Benchmark.register(
     name='mmlu',
     pretty_name='MMLU',
+    tags=['Knowledge', 'MCQ'],
+    description=
+    "The MMLU (Massive Multitask Language Understanding) benchmark is a comprehensive evaluation suite designed to assess the performance of language models across a wide range of subjects and tasks. It includes multiple-choice questions from various domains, such as history, science, mathematics, and more, providing a robust measure of a model's understanding and reasoning capabilities.", # noqa: E501
     dataset_id='modelscope/mmlu',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py
CHANGED
@@ -15,6 +15,9 @@ SUBSET_LIST = [
 @Benchmark.register(
     name='mmlu_pro',
     pretty_name='MMLU-Pro',
+    tags=['MCQ', 'Knowledge'],
+    description=
+    'MMLU-Pro is a benchmark for evaluating language models on multiple-choice questions across various subjects. It includes questions from different domains, where the model must select the correct answer from given options.', # noqa: E501
     dataset_id='modelscope/MMLU-Pro',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py
CHANGED
@@ -88,6 +88,9 @@ SUBJECT_MAPPING = {
 @Benchmark.register(
     name='mmlu_redux',
     pretty_name='MMLU-Redux',
+    tags=['MCQ', 'Knowledge'],
+    description=
+    'MMLU-Redux is a benchmark for evaluating language models on multiple-choice questions across various subjects. It includes questions from different domains, where the model must select the correct answer from given options.', # noqa: E501
     dataset_id='AI-ModelScope/mmlu-redux-2.0',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/musr/musr_adapter.py
CHANGED
@@ -10,6 +10,9 @@ from evalscope.utils.utils import ResponseParser
 @Benchmark.register(
     name='musr',
     pretty_name='MuSR',
+    tags=['Reasoning', 'MCQ'],
+    description=
+    'MuSR is a benchmark for evaluating AI models on multiple-choice questions related to murder mysteries, object placements, and team allocation.', # noqa: E501
     dataset_id='AI-ModelScope/MuSR',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py
CHANGED
@@ -28,9 +28,11 @@ Don't give information outside the document or repeat your findings."""

 @Benchmark.register(
     name='needle_haystack',
-    pretty_name='Needle
-    ...
+    pretty_name='Needle-in-a-Haystack',
+    tags=['Retrieval', 'Long Context'],
+    description='Needle in a Haystack is a benchmark focused on information retrieval tasks. '
+    'It requires the model to find specific information within a large corpus of text. '
+    '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/needle_haystack.html)', # noqa: E501
     dataset_id='AI-ModelScope/Needle-in-a-Haystack-Corpus',
     metric_list=['AverageAccuracy'],
     subset_list=['english', 'chinese'],

@@ -50,6 +52,7 @@ Don't give information outside the document or repeat your findings."""
         'document_depth_percent_max': 100,
         'document_depth_percent_intervals': 10,
         'tokenizer_path': 'Qwen/Qwen3-0.6B',
+        'show_score': False,
     })
 class NeedleHaystackAdapter(DataAdapter):

@@ -71,11 +74,12 @@ class NeedleHaystackAdapter(DataAdapter):
         self.document_depth_percent_max = extra_params.get('document_depth_percent_max', 100)
         self.document_depth_percent_intervals = extra_params.get('document_depth_percent_intervals', 10)
         self.tokenizer_path = extra_params.get('tokenizer_path', 'Qwen/Qwen3-0.6B')
+        self.show_score = extra_params.get('show_score', False)

-        self.
-        self.
+        self._init_tokenizer()
+        self._init_length()

-    def
+    def _init_length(self):
         """ Initialize context lengths and document depth percentages based on the provided parameters."""
         import numpy as np

@@ -93,7 +97,7 @@ class NeedleHaystackAdapter(DataAdapter):
             num=self.document_depth_percent_intervals,
             endpoint=True)).astype(int)

-    def
+    def _init_tokenizer(self):
         """ Initialize the tokenizer based on the provided tokenizer path."""
         from modelscope import AutoTokenizer
         self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_path)

@@ -335,7 +339,10 @@ class NeedleHaystackAdapter(DataAdapter):
                 pivot_table = sub_df.pivot_table(
                     values='Score', index=['Depth', 'Context'], aggfunc='mean').reset_index()
                 pivot_table = pivot_table.pivot(index='Depth', columns='Context', values='Score')
-                draw_score_chat(
+                draw_score_chat(
+                    pivot_table,
+                    outpath=os.path.join(report_path, f'needle_haystack_heatmap_{subset}.png'),
+                    show_score=self.show_score)

         except Exception as e:
             logger.error(f'Error generating charts: {e}')
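
The adapter now reads an extra `show_score` flag (default `False`) and forwards it to `draw_score_chat`, so heatmap cells can be annotated with their scores. A hedged configuration sketch follows: the `TaskConfig`/`dataset_args`/`extra_params` layout is assumed from published examples for earlier releases and should be checked against the docs for your version; the model name is a placeholder.

from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='qwen-plus',                      # placeholder model name
    datasets=['needle_haystack'],
    dataset_args={
        'needle_haystack': {
            'subset_list': ['english'],
            'extra_params': {
                'tokenizer_path': 'Qwen/Qwen3-0.6B',
                'show_score': True,         # new in 0.16.2: annotate heatmap cells
            },
        },
    },
)
run_task(task_cfg)
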
evalscope/benchmarks/needle_haystack/utils.py
CHANGED
@@ -37,13 +37,13 @@ def parse_score(score_str: str) -> int:
         return 0.0


-def draw_score_chat(pivot_table, outpath):
+def draw_score_chat(pivot_table, outpath, show_score=False):
     # Create a custom colormap. Go to https://coolors.co/ and pick cool colors
     cmap = LinearSegmentedColormap.from_list('custom_cmap', ['#F0496E', '#EBB839', '#0CD79F'])

     # Create the heatmap with better aesthetics
     plt.figure(figsize=(17.5, 8)) # Can adjust these dimensions as needed
-    sns.heatmap(pivot_table, vmin=0.0, vmax=1.0, annot=
+    sns.heatmap(pivot_table, vmin=0.0, vmax=1.0, annot=show_score, fmt='.1f', cmap=cmap, cbar_kws={'label': 'Score'})

     # More aesthetics
     plt.title('Fact Retrieval Across Context Lengths ("Needle In A HayStack")') # Adds a title
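
For reference, `annot=` in `seaborn.heatmap` controls whether each cell is labelled with its value, which is all the new `show_score` parameter toggles. A self-contained toy reproduction of the call (synthetic data and output file name, not evalscope code):

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

depths = [0, 25, 50, 75, 100]                 # document depth (%)
contexts = [1000, 4000, 16000, 32000]         # context length (tokens)
rng = np.random.default_rng(0)
pivot_table = pd.DataFrame(
    rng.uniform(0.0, 1.0, size=(len(depths), len(contexts))),
    index=depths, columns=contexts)

cmap = LinearSegmentedColormap.from_list('custom_cmap', ['#F0496E', '#EBB839', '#0CD79F'])
show_score = True                             # False reproduces the old, unannotated chart
plt.figure(figsize=(17.5, 8))
sns.heatmap(pivot_table, vmin=0.0, vmax=1.0, annot=show_score, fmt='.1f',
            cmap=cmap, cbar_kws={'label': 'Score'})
plt.title('Fact Retrieval Across Context Lengths ("Needle In A HayStack")')
plt.savefig('needle_haystack_heatmap_demo.png')
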
evalscope/benchmarks/process_bench/process_bench_adapter.py
CHANGED
@@ -12,6 +12,9 @@ cur_path = os.path.dirname(os.path.abspath(__file__))
 @Benchmark.register(
     name='process_bench',
     pretty_name='ProcessBench',
+    tags=['Mathematical', 'Reasoning'],
+    description=
+    'ProcessBench is a benchmark for evaluating AI models on mathematical reasoning tasks. It includes various subsets such as GSM8K, Math, OlympiadBench, and OmniMath, each with its own set of problems that require step-by-step reasoning to arrive at the correct answer.', # noqa: E501
     dataset_id='Qwen/ProcessBench',
     subset_list=['gsm8k', 'math', 'olympiadbench', 'omnimath'],
     metric_list=['error_acc', 'correct_acc', 'simple_f1_score'],
evalscope/benchmarks/race/race_adapter.py
CHANGED
@@ -17,6 +17,9 @@ logger = get_logger()
 @Benchmark.register(
     name='race',
     pretty_name='RACE',
+    tags=['Reasoning', 'MCQ'],
+    description=
+    'RACE is a benchmark for testing reading comprehension and reasoning abilities of neural models. It is constructed from Chinese middle and high school examinations.', # noqa: E501
     dataset_id='modelscope/race',
     model_adapter=OutputType.MULTIPLE_CHOICE,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/simple_qa/simple_qa_adapter.py
CHANGED
@@ -95,6 +95,9 @@ Just return the letters "A", "B", or "C", with no text around it.
 @Benchmark.register(
     name='simple_qa',
     pretty_name='SimpleQA',
+    tags=['Knowledge', 'QA'],
+    description=
+    'SimpleQA is a benchmark designed to evaluate the performance of language models on simple question-answering tasks. It includes a set of straightforward questions that require basic reasoning and understanding capabilities.', # noqa: E501
     dataset_id='AI-ModelScope/SimpleQA',
     metric_list=['is_correct', 'is_incorrect', 'is_not_attempted'],
     few_shot_num=0,
evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py
CHANGED
@@ -109,6 +109,9 @@ SUBSET_MAPPING = {
 @Benchmark.register(
     name='super_gpqa',
     pretty_name='SuperGPQA',
+    tags=['MCQ', 'Knowledge'],
+    description=
+    'SuperGPQA is a large-scale multiple-choice question answering dataset, designed to evaluate the generalization ability of models across different fields. It contains 100,000+ questions from 50+ fields, with each question having 10 options.', # noqa: E501
     dataset_id='m-a-p/SuperGPQA',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],

@@ -139,13 +142,15 @@ class SuperGPQAAdapter(DataAdapter):
         return self.reformat_subset(data_dict, subset_key='field', format='{}')

     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
+        question = input_d['question']
+        choices = self._format_choices(input_d['options'])
         if not self.prompt_template:
             if few_shot_list:
-                prompt = self.few_shot_prompt.format(query=
+                prompt = self.few_shot_prompt.format(query=question, choices=choices)
             else:
-                prompt = self.zero_shot_prompt.format(query=
+                prompt = self.zero_shot_prompt.format(query=question, choices=choices)
         else:
-            prompt = self.prompt_template.format(query=
+            prompt = self.prompt_template.format(query=question, choices=choices)
         return self.gen_prompt_data(prompt)

     def get_gold_answer(self, input_d: dict) -> str:

@@ -189,3 +194,16 @@ class SuperGPQAAdapter(DataAdapter):

     def match(self, gold: str, pred: str) -> float:
         return exact_match(gold=gold, pred=pred)
+
+    def _format_choices(self, choices: list) -> str:
+        """
+        Format the choices into a string for display.
+
+        Args:
+            choices (list): List of choices.
+
+        Returns:
+            str: Formatted string of choices.
+        """
+        choice_list = [f'{option}) {content}' for option, content in zip(self.choices, choices)]
+        return '\n'.join(choice_list)
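
The new `_format_choices` helper simply pairs option letters with option texts and joins them line by line; `gen_prompt` then fills both `{query}` and `{choices}` in the prompt templates. A toy standalone equivalent (sample option values invented):

letters = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
options = ['photosynthesis', 'fermentation', 'respiration', 'transpiration']

formatted = '\n'.join(f'{letter}) {content}' for letter, content in zip(letters, options))
print(formatted)
# A) photosynthesis
# B) fermentation
# C) respiration
# D) transpiration
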
evalscope/benchmarks/tool_bench/tool_bench_adapter.py
CHANGED
@@ -8,6 +8,11 @@ from evalscope.metrics import Metric, mean, metric_registry
 @Benchmark.register(
     name='tool_bench',
     pretty_name='ToolBench-Static',
+    tags=['Reasoning', 'Agent'],
+    description='ToolBench is a benchmark for evaluating AI models on tool use tasks. '
+    'It includes various subsets such as in-domain and out-of-domain, '
+    'each with its own set of problems that require step-by-step reasoning to arrive at the correct answer. '
+    '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/toolbench.html)', # noqa: E501
     dataset_id='AI-ModelScope/ToolBench-Static',
     subset_list=['in_domain', 'out_of_domain'],
     metric_list=['Act.EM', 'Plan.EM', 'F1', 'HalluRate', 'Rouge-L'],
evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py
CHANGED
@@ -16,6 +16,9 @@ logger = get_logger()
 @Benchmark.register(
     name='trivia_qa',
     pretty_name='TriviaQA',
+    tags=['QA', 'Reading Comprehension'],
+    description=
+    'TriviaQA is a large-scale reading comprehension dataset consisting of question-answer pairs collected from trivia websites. It includes questions with multiple possible answers, making it suitable for evaluating the ability of models to understand and generate answers based on context.', # noqa: E501
     dataset_id='modelscope/trivia_qa',
     subset_list=['default'],
     metric_list=['AverageAccuracy'],
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py
CHANGED
@@ -21,6 +21,9 @@ logger = get_logger()
 @Benchmark.register(
     name='truthful_qa',
     pretty_name='TruthfulQA',
+    tags=['Knowledge'],
+    description=
+    'TruthfulQA is a benchmark designed to evaluate the ability of AI models to answer questions truthfully and accurately. It includes multiple-choice and generation tasks, focusing on the model\'s understanding of factual information and its ability to generate coherent responses.', # noqa: E501
     dataset_id='modelscope/truthful_qa',
     model_adapter=OutputType.CONTINUOUS,
     output_types=[OutputType.CONTINUOUS, OutputType.GENERATION],
evalscope/benchmarks/winogrande/winogrande_adapter.py
CHANGED
@@ -7,6 +7,9 @@ from evalscope.utils.utils import ResponseParser
 @Benchmark.register(
     name='winogrande',
     pretty_name='Winogrande',
+    tags=['Reasoning', 'MCQ'],
+    description=
+    'Winogrande is a benchmark for evaluating AI models on commonsense reasoning tasks, specifically designed to test the ability to resolve ambiguous pronouns in sentences.', # noqa: E501
     dataset_id='AI-ModelScope/winogrande_val',
     model_adapter=OutputType.GENERATION,
     output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/collections/evaluator.py
CHANGED
@@ -32,11 +32,22 @@ class SimpleEvaluator(Evaluator):
             task_cfg=task_cfg,
             outputs=outputs)

-    def get_answer(self, samples, infer_cfg) -> List[dict]:
+    def get_answer(self, samples: List[DatasetEntry], infer_cfg: dict) -> List[dict]:
         input_prompts = [sample.prompt for sample in samples]
         subset_name = samples[0].subset_name
+        try:
+            # get answer from model
+            answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
+        except Exception as e:
+            logger.error(f'Failed to get answer for {input_prompts}, due to {e}')
+            # if ignore_errors is True, continue to next input
+            if self.task_cfg.ignore_errors:
+                logger.warning('`ignore_errors` is set to True. Dropping this prompt and continuing with evaluation.')
+                return [None] * len(samples), samples
+            else:
+                raise e
+        # process answers
         answers_list = []
-        answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
         for answer_d, input_prompt in zip(answer_ds, input_prompts):
             answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
             processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)

@@ -66,7 +77,7 @@ class EvaluatorCollection:
         self.dataset_id_map = EvaluatorCollection._init_id_map(self.dataset)
         self.evaluators = self._initialize_evaluators()

-    def load(self) -> tuple[
+    def load(self) -> tuple[List[DatasetEntry], str]:
         dataset_name = os.path.splitext(os.path.basename(self.data_adapter.dataset_id))[0]
         raw_dataset = self.data_adapter.load()
         # random limit the dataset

@@ -86,7 +97,7 @@ class EvaluatorCollection:
         return datasets, dataset_name

     @staticmethod
-    def _init_name_map(dataset):
+    def _init_name_map(dataset: List[DatasetEntry]) -> Dict[str, Dict[str, List[int]]]:
         dataset_name_map = defaultdict(lambda: defaultdict(list))
         for sample in dataset:
             dataset_name, subset_name = sample.dataset_name, sample.subset_name

@@ -94,13 +105,13 @@ class EvaluatorCollection:
         return dataset_name_map

     @staticmethod
-    def _init_id_map(dataset):
+    def _init_id_map(dataset: List[DatasetEntry]) -> Dict[int, DatasetEntry]:
         dataset_id_map = {}
         for sample in dataset:
             dataset_id_map[sample.index] = sample
         return dataset_id_map

-    def _initialize_evaluators(self):
+    def _initialize_evaluators(self) -> Dict[str, SimpleEvaluator]:
         evaluators = {}
         # load dataset args
         dataset_args = deepcopy(self.task_cfg.dataset_args)

@@ -118,6 +129,8 @@ class EvaluatorCollection:
         return evaluators

     def get_report(self, scores):
+        if not scores:
+            return

         def get_dataframe(scores):
             data = []

@@ -241,9 +254,12 @@ class EvaluatorCollection:
                 # Process completed tasks
                 for future in as_completed(futures):
                     answer_list, samples = future.result()
-                    ...
+                    for answer_d, sample in zip(answer_list, samples):
+                        if answer_d is None:
+                            continue
+                        answers[sample.index] = answer_d
+                        dump_jsonl_data([answer_d], pred_file_path, dump_mode=DumpMode.APPEND)
+                        pbar.update(1)
         else:
             for dataset_name, data_map in dataset_name_map.items():
                 # get evaluator for the dataset

@@ -253,13 +269,14 @@ class EvaluatorCollection:
                     # get batch samples
                     batch_ids = ids[i:i + eval_batch_size]
                     batch_samples = [self.dataset_id_map[_id] for _id in batch_ids]
-                    answer_list,
+                    answer_list, samples = evaluator.get_answer(batch_samples, self.task_cfg.generation_config)
                     # update answers
-                    for
-                        ...
+                    for answer_d, sample in zip(answer_list, samples):
+                        if answer_d is None:
+                            continue
+                        answers[sample.index] = answer_d
+                        dump_jsonl_data([answer_d], pred_file_path, dump_mode=DumpMode.APPEND)
+                        pbar.update(1)
         return answers

     def get_reviews(self, answers: Dict[int, Any]) -> Dict[int, Any]:

@@ -289,19 +306,22 @@ class EvaluatorCollection:

         reviews = {}
         for sample in tqdm(self.dataset, desc='Getting reviews'):
-            ...
+            try:
+                file_name = f'{self.dataset_name}_{sample.dataset_name}_{sample.subset_name}.jsonl'
+
+                if self.task_cfg.use_cache and sample.index in review_history_map.get(file_name, {}):
+                    # Use cached review if available
+                    review_d = review_history_map[file_name][sample.index]
+                else:
+                    # Generate new review
+                    evaluator = self.evaluators[sample.dataset_name]
+                    review_d = evaluator.get_review(answers[sample.index])
+                    # Only save the review if it's not in the cache
+                    self._save_review(review_file_path, file_name, review_d)
+
+                reviews[sample.index] = review_d
+            except Exception as e:
+                logger.error(f'Error getting review for sample index {sample.index}: {e}. Skipping this sample.')

         return reviews

@@ -339,6 +359,8 @@ class EvaluatorCollection:
         scores = defaultdict(dict)
         for sample in tqdm(self.dataset, desc='Getting scores'):
             evaluator = self.evaluators[sample.dataset_name]
+            if sample.index not in reviews:
+                continue
             review_d = reviews[sample.index]
             score = evaluator.get_score(review_d)
             scores[sample.index] = score
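
These changes thread `ignore_errors` through batched inference: when a batch fails, `get_answer` returns `None` placeholders instead of raising, and the answer, review, and score loops skip the missing entries. A condensed sketch of that control flow (simplified free functions, not the actual evalscope classes):

from typing import Callable, Dict, List, Optional


def get_answer(prompts: List[str], predict: Callable, ignore_errors: bool) -> List[Optional[dict]]:
    try:
        return predict(prompts)
    except Exception as e:
        if ignore_errors:
            print(f'Dropping failed batch and continuing: {e}')
            return [None] * len(prompts)
        raise


def collect_answers(batches: List[List[str]], predict: Callable, ignore_errors: bool = True) -> Dict[int, dict]:
    answers: Dict[int, dict] = {}
    index = 0
    for prompts in batches:
        for answer in get_answer(prompts, predict, ignore_errors):
            if answer is not None:  # mirrors the `if answer_d is None: continue` guard above
                answers[index] = answer
            index += 1
    return answers


# Example: the second batch fails; its prompts are skipped rather than aborting the run.
def flaky_predict(prompts):
    if 'boom' in prompts:
        raise RuntimeError('backend error')
    return [{'prompt': p, 'answer': 'ok'} for p in prompts]


print(collect_answers([['a', 'b'], ['boom', 'c']], flaky_predict))
# {0: {'prompt': 'a', 'answer': 'ok'}, 1: {'prompt': 'b', 'answer': 'ok'}}
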
evalscope/constants.py
CHANGED
@@ -146,7 +146,7 @@ class EvalType:


 class OutputType:
-    LOGITS = 'logits' # for
+    LOGITS = 'logits' # for logits output tasks
     GENERATION = 'generation' # for text generation tasks and general tasks
     MULTIPLE_CHOICE = 'multiple_choice_logits' # for multiple choice tasks
     CONTINUOUS = 'continuous_logits' # for continuous tasks