evalscope 1.0.2__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release of evalscope has been flagged as potentially problematic.
Files changed (176)
  1. evalscope/api/benchmark/__init__.py +8 -1
  2. evalscope/api/benchmark/adapters/__init__.py +1 -0
  3. evalscope/api/benchmark/adapters/default_data_adapter.py +12 -0
  4. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  5. evalscope/api/benchmark/benchmark.py +14 -0
  6. evalscope/api/dataset/dataset.py +21 -0
  7. evalscope/api/dataset/loader.py +6 -2
  8. evalscope/api/mixin/sandbox_mixin.py +32 -54
  9. evalscope/api/model/generate_config.py +6 -0
  10. evalscope/app/ui/multi_model.py +6 -1
  11. evalscope/app/ui/single_model.py +8 -2
  12. evalscope/app/utils/data_utils.py +3 -2
  13. evalscope/app/utils/visualization.py +2 -2
  14. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  15. evalscope/benchmarks/ai2d/ai2d_adapter.py +3 -2
  16. evalscope/benchmarks/bfcl/bfcl_adapter.py +11 -46
  17. evalscope/benchmarks/blink/__init__.py +0 -0
  18. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  19. evalscope/benchmarks/chartqa/__init__.py +0 -0
  20. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  21. evalscope/benchmarks/chartqa/utils.py +38 -0
  22. evalscope/benchmarks/data_collection/data_collection_adapter.py +2 -1
  23. evalscope/benchmarks/docvqa/__init__.py +0 -0
  24. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  25. evalscope/benchmarks/general_arena/general_arena_adapter.py +1 -1
  26. evalscope/benchmarks/general_arena/utils.py +2 -1
  27. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  28. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  29. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +23 -4
  30. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  31. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +158 -0
  32. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  33. evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -1
  34. evalscope/benchmarks/infovqa/__init__.py +0 -0
  35. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  36. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +3 -1
  37. evalscope/benchmarks/math_verse/__init__.py +0 -0
  38. evalscope/benchmarks/math_verse/math_verse_adapter.py +100 -0
  39. evalscope/benchmarks/math_vision/__init__.py +0 -0
  40. evalscope/benchmarks/math_vision/math_vision_adapter.py +111 -0
  41. evalscope/benchmarks/math_vista/math_vista_adapter.py +6 -26
  42. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +2 -2
  43. evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
  44. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -1
  45. evalscope/benchmarks/ner/__init__.py +0 -0
  46. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  47. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  48. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  49. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  50. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  51. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  52. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  53. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  54. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  55. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  56. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  57. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  58. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  59. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  60. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  61. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  62. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  63. evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
  64. evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
  65. evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
  66. evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
  67. evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  68. evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
  69. evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
  70. evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  71. evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  72. evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  73. evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
  74. evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
  75. evalscope/benchmarks/ocr_bench_v2/utils.py +433 -0
  76. evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
  77. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  78. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  79. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  80. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  81. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  82. evalscope/benchmarks/poly_math/__init__.py +0 -0
  83. evalscope/benchmarks/poly_math/poly_math_adapter.py +127 -0
  84. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  85. evalscope/benchmarks/pope/__init__.py +0 -0
  86. evalscope/benchmarks/pope/pope_adapter.py +111 -0
  87. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  88. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  89. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  90. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  91. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +1 -1
  92. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +1 -1
  93. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  94. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  95. evalscope/benchmarks/zerobench/__init__.py +0 -0
  96. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  97. evalscope/constants.py +4 -0
  98. evalscope/evaluator/evaluator.py +72 -79
  99. evalscope/metrics/math_parser.py +14 -0
  100. evalscope/metrics/metric.py +52 -1
  101. evalscope/metrics/metrics.py +16 -0
  102. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  103. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  104. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  105. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  106. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  107. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  108. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  109. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  110. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  111. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  112. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  113. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  114. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  115. evalscope/models/utils/openai.py +4 -0
  116. evalscope/perf/arguments.py +24 -4
  117. evalscope/perf/benchmark.py +74 -89
  118. evalscope/perf/http_client.py +31 -16
  119. evalscope/perf/main.py +15 -2
  120. evalscope/perf/plugin/api/base.py +9 -7
  121. evalscope/perf/plugin/api/custom_api.py +13 -58
  122. evalscope/perf/plugin/api/default_api.py +179 -79
  123. evalscope/perf/plugin/api/openai_api.py +4 -3
  124. evalscope/perf/plugin/datasets/base.py +21 -0
  125. evalscope/perf/plugin/datasets/custom.py +2 -3
  126. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  127. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  128. evalscope/perf/plugin/datasets/openqa.py +2 -4
  129. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  130. evalscope/perf/utils/benchmark_util.py +36 -22
  131. evalscope/perf/utils/db_util.py +14 -19
  132. evalscope/perf/utils/local_server.py +0 -44
  133. evalscope/perf/utils/log_utils.py +21 -6
  134. evalscope/report/__init__.py +11 -2
  135. evalscope/report/combinator.py +52 -2
  136. evalscope/run.py +4 -0
  137. evalscope/utils/function_utils.py +195 -12
  138. evalscope/utils/io_utils.py +74 -0
  139. evalscope/utils/json_schema.py +8 -6
  140. evalscope/utils/logger.py +49 -17
  141. evalscope/utils/multi_choices.py +16 -1
  142. evalscope/utils/ner.py +377 -0
  143. evalscope/version.py +2 -2
  144. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/METADATA +239 -393
  145. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/RECORD +140 -98
  146. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/WHEEL +1 -1
  147. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/top_level.txt +0 -1
  148. tests/__init__.py +0 -1
  149. tests/benchmark/__init__.py +0 -1
  150. tests/benchmark/test_eval.py +0 -429
  151. tests/benchmark/test_image_edit.py +0 -65
  152. tests/benchmark/test_sandbox.py +0 -81
  153. tests/benchmark/test_t2i.py +0 -142
  154. tests/benchmark/test_vlm.py +0 -137
  155. tests/cli/__init__.py +0 -1
  156. tests/cli/test_all.py +0 -269
  157. tests/cli/test_collection.py +0 -99
  158. tests/cli/test_custom.py +0 -268
  159. tests/cli/test_reasoning.py +0 -81
  160. tests/common.py +0 -73
  161. tests/perf/__init__.py +0 -1
  162. tests/perf/test_perf.py +0 -206
  163. tests/rag/test_clip_benchmark.py +0 -87
  164. tests/rag/test_mteb.py +0 -213
  165. tests/rag/test_ragas.py +0 -128
  166. tests/swift/__init__.py +0 -1
  167. tests/swift/test_run_swift_eval.py +0 -146
  168. tests/swift/test_run_swift_vlm_eval.py +0 -128
  169. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  170. tests/test_run_all.py +0 -12
  171. tests/utils.py +0 -13
  172. tests/vlm/__init__.py +0 -1
  173. tests/vlm/test_vlmeval.py +0 -102
  174. {tests/rag → evalscope/benchmarks/aa_lcr}/__init__.py +0 -0
  175. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/entry_points.txt +0 -0
  176. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/gsm8k/gsm8k_adapter.py
@@ -1,5 +1,6 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

+ import re
  from typing import Any, Dict

  from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
@@ -12,13 +13,26 @@ from evalscope.utils.logger import get_logger
  logger = get_logger()

  PROMPT_TEMPLATE = """
- Solve the following math problem step by step. The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem.
+ Solve the following math problem step by step. The last line of your response should display the answer enclosed within \\boxed{{\\text{{$ANSWER}}}}.

- {question}
+ Example:
+
+ Let's solve the problem step by step.
+
+ Problem: Eliza's rate per hour for the first 40 hours she works each week is $10. She also receives an overtime pay of 1.2 times her regular hourly rate. If Eliza worked for 45 hours this week, how much are her earnings for this week?
+
+ Step 1: Calculate Eliza's earnings for the first 40 hours. Eliza's hourly rate is $10, so her earnings for the first 40 hours are $10/hour x 40 hours = $400.
+ Step 2: Calculate Eliza's overtime pay rate. Eliza's overtime pay rate is 1.2 times her regular hourly rate, so her overtime pay rate is $10/hour x 1.2 = $12/hour.
+ Step 3: Calculate Eliza's earnings for the overtime hours. Eliza worked for 45 hours, so her overtime hours are 45 hours - 40 hours = 5 hours. Her earnings for the overtime hours are $12/hour x 5 hours = $60.
+ Step 4: Calculate Eliza's total earnings for the week. Eliza's total earnings for the week are her earnings for the first 40 hours plus her earnings for the overtime hours, which is $400 + $60 = $460.

- Remember to put your answer on its own line at the end in the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem, and you do not need to use a \\boxed command.
+ Answer:
+ \\boxed{{\\text{{460}}}}

- Reasoning:
+ question:
+ {question}
+
+ Remember to put your answer on its own line at the end in the form "\\boxed{{\\text{{$ANSWER}}}}" (without quotes), where $ANSWER is replaced by the actual answer to the problem.
  """.lstrip() # noqa: E501

  FEWSHOT_TEMPLATE = """
@@ -69,6 +83,11 @@ class GSM8KAdapter(DefaultDataAdapter):
  return ''

  def extract_answer(self, prediction: str, task_state: TaskState):
+ boxed_match = re.search(r'\\boxed\\{\\text\\{([^}]*)\\}\\}', prediction)
+ if boxed_match:
+ result = boxed_match.group(1).strip()
+ return result.strip()
+
  from evalscope.filters.extraction import RegexFilter

  regex = RegexFilter(regex_pattern=r'(-?[0-9.,]{2,})|(-?[0-9]+)', group_select=-1)
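As a quick, hedged illustration of the new answer format: the sketch below is not the packaged code; the sample completion and the simplified regex are invented here to show how an answer wrapped in \boxed{\text{...}} (the format the revised prompt requests) can be pulled out of a model response.

import re

# Illustrative sketch only: extract the final answer from a completion that follows
# the revised GSM8K prompt, which asks for \boxed{\text{$ANSWER}} on the last line.
completion = "Step 4: total earnings are $400 + $60 = $460.\nAnswer:\n\\boxed{\\text{460}}"

# Simplified pattern written for this sketch (not copied from the adapter).
match = re.search(r'\\boxed\{\\text\{([^}]*)\}\}', completion)
print(match.group(1).strip() if match else None)  # -> 460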
evalscope/benchmarks/hallusion_bench/__init__.py (file without changes)
evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py (new file)
@@ -0,0 +1,158 @@
+ from collections import defaultdict
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator.state import TaskState
+ from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+ from evalscope.api.metric.scorer import AggScore, SampleScore, Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.io_utils import bytes_to_base64
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+
+ @register_benchmark(
+ BenchmarkMeta(
+ name='hallusion_bench',
+ pretty_name='HallusionBench',
+ tags=[Tags.MULTI_MODAL, Tags.HALLUCINATION, Tags.YES_NO],
+ description=
+ 'HallusionBench is an advanced diagnostic benchmark designed to evaluate image-context reasoning, analyze models\' tendencies for language hallucination and visual illusion in large vision-language models (LVLMs).', # noqa: E501
+ dataset_id='lmms-lab/HallusionBench',
+ metric_list=['aAcc', 'qAcc', 'fAcc'],
+ eval_split='image',
+ prompt_template='{question}\nPlease answer YES or NO without an explanation.',
+ )
+ )
+ class HallusionBenchAdapter(VisionLanguageAdapter):
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+
+ def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+
+ input_text = self.prompt_template.format(question=record['question'])
+ content_list: List[Content] = [ContentText(text=input_text)]
+ image = record.get('image')
+ if image:
+ image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
+ content_list.append(ContentImage(image=image_base64))
+ answer = 'NO' if str(record.get('answer', '0')) == '1' else 'YES'
+ return Sample(
+ input=[ChatMessageUser(content=content_list)],
+ target=answer,
+ metadata={
+ 'category': record.get('category'),
+ 'subcategory': record.get('subcategory'),
+ 'visual_input': record.get('visual_input'),
+ 'set_id': record.get('set_id'),
+ 'figure_id': record.get('figure_id'),
+ 'question_id': record.get('question_id'),
+ }
+ )
+
+ def match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
+ score = Score(
+ extracted_prediction=filtered_prediction,
+ prediction=original_prediction,
+ )
+ # Check if the reference answer is in the filtered prediction
+ result = 1 if reference in filtered_prediction.strip().upper() else 0
+ score.value = {'acc': result}
+ return score
+
+ def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+
+ def compute_aAcc(scores: List[SampleScore]):
+ total = len(scores)
+ if total == 0:
+ return 0.0, 0
+ correct = sum(ss.score.main_value for ss in scores)
+ return (correct / total), total
+
+ def compute_group_accuracy(scores: List[SampleScore], group_type: str):
+ # group_type: 'figure' or 'question'
+ groups = defaultdict(list)
+ for ss in scores:
+ md = ss.sample_metadata
+ subcategory = md.get('subcategory')
+ set_id = md.get('set_id')
+ group_id = md.get('figure_id') if group_type == 'figure' else md.get('question_id')
+ if subcategory is None or set_id is None or group_id is None:
+ # Skip incomplete records for this grouping
+ continue
+ key = f'{subcategory}_{set_id}_{group_id}'
+ groups[key].append(ss.score.main_value)
+ if not groups:
+ return 0.0, 0
+ num_correct_groups = sum(1 for vals in groups.values() if all(vals))
+ num_groups = len(groups)
+ return (num_correct_groups / num_groups), num_groups
+
+ def compute_metrics(scores: List[SampleScore]) -> Dict[str, Dict[str, float]]:
+ a_acc, a_n = compute_aAcc(scores)
+ f_acc, f_n = compute_group_accuracy(scores, 'figure')
+ q_acc, q_n = compute_group_accuracy(scores, 'question')
+ return {
+ 'aAcc': {
+ 'score': a_acc,
+ 'num': a_n
+ },
+ 'fAcc': {
+ 'score': f_acc,
+ 'num': f_n
+ },
+ 'qAcc': {
+ 'score': q_acc,
+ 'num': q_n
+ },
+ }
+
+ outputs: List[AggScore] = []
+
+ # By subcategory
+ subcategories = sorted({ss.sample_metadata.get('subcategory') for ss in sample_scores})
+ for subcategory in subcategories:
+ subset = [ss for ss in sample_scores if ss.sample_metadata.get('subcategory') == subcategory]
+ stats = compute_metrics(subset)
+ for metric in ['aAcc', 'fAcc', 'qAcc']:
+ outputs.append(
+ AggScore(
+ score=stats[metric]['score'],
+ metric_name=metric,
+ aggregation_name=str(subcategory),
+ num=stats[metric]['num'],
+ )
+ )
+
+ # By category
+ categories = sorted({ss.sample_metadata.get('category') for ss in sample_scores})
+ for category in categories:
+ subset = [ss for ss in sample_scores if ss.sample_metadata.get('category') == category]
+ stats = compute_metrics(subset)
+ for metric in ['aAcc', 'fAcc', 'qAcc']:
+ outputs.append(
+ AggScore(
+ score=stats[metric]['score'],
+ metric_name=metric,
+ aggregation_name=str(category),
+ num=stats[metric]['num'],
+ )
+ )
+
+ # Overall
+ overall = compute_metrics(sample_scores)
+ for metric in ['aAcc', 'fAcc', 'qAcc']:
+ outputs.append(
+ AggScore(
+ score=overall[metric]['score'],
+ metric_name=metric,
+ aggregation_name='Overall',
+ num=overall[metric]['num'],
+ )
+ )
+
+ return outputs
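To make the grouped metrics above concrete, here is a minimal sketch of the qAcc/fAcc rule with invented sample data: a (subcategory, set_id, question_id) group only counts as correct when every variant inside it is answered correctly, which is what compute_group_accuracy enforces with its all(vals) check.

from collections import defaultdict

# Invented toy data: two question groups, one of which contains a wrong variant.
samples = [
    {'subcategory': 'illusion', 'set_id': 0, 'question_id': 0, 'correct': 1},
    {'subcategory': 'illusion', 'set_id': 0, 'question_id': 0, 'correct': 0},
    {'subcategory': 'chart', 'set_id': 1, 'question_id': 2, 'correct': 1},
]

groups = defaultdict(list)
for s in samples:
    groups[(s['subcategory'], s['set_id'], s['question_id'])].append(s['correct'])

# A group scores 1 only if all of its variants are correct.
q_acc = sum(all(vals) for vals in groups.values()) / len(groups)
print(q_acc)  # -> 0.5: the first group has a wrong variant, the second is fully correct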
evalscope/benchmarks/hle/hle_adapter.py
@@ -57,8 +57,9 @@ Your judgment must focus only on if there are meaningful differences between [co
  'humanities/social science (9%), computer science/artificial intelligence (10%), '
  'engineering (4%), chemistry (7%), and other (9%). Around 14% of the questions '
  'require the ability to understand both text and images, i.e., multi-modality. '
- '24% of the questions are multiple-choice; the rest are short-answer, exact-match questions. '
- 'To evaluate the performance of model without multi-modality capabilities, please set the extra_params["include_multi_modal"] to False.', # noqa: E501
+ '24% of the questions are multiple-choice; the rest are short-answer, exact-match questions. \n'
+ '**To evaluate the performance of model without multi-modality capabilities, '
+ 'please set the `extra_params["include_multi_modal"]` to `False`.**', # noqa: E501
  dataset_id='cais/hle',
  subset_list=SUBSET_LIST,
  metric_list=['acc'],
evalscope/benchmarks/humaneval/humaneval_adapter.py
@@ -21,7 +21,8 @@ logger = get_logger()
  pretty_name='HumanEval',
  tags=[Tags.CODING],
  description=
- 'HumanEval is a benchmark for evaluating the ability of code generation models to write Python functions based on given specifications. It consists of programming tasks with a defined input-output behavior.',
+ 'HumanEval is a benchmark for evaluating the ability of code generation models to write Python functions based on given specifications. It consists of programming tasks with a defined input-output behavior. '
+ '**By default the code is executed in local environment. We recommend using sandbox execution to safely run and evaluate the generated code, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/sandbox.html) for more details.**', # noqa: E501
  dataset_id='opencompass/humaneval',
  subset_list=['openai_humaneval'],
  metric_list=['Pass@1'],
evalscope/benchmarks/infovqa/__init__.py (file without changes)
evalscope/benchmarks/infovqa/infovqa_adapter.py (new file)
@@ -0,0 +1,66 @@
+ import json
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator.state import TaskState
+ from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.io_utils import bytes_to_base64
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ PROMPT = """Answer the question according to the image using a single word or phrase.
+ {question}
+ The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the question.""" # noqa: E501
+
+
+ @register_benchmark(
+ BenchmarkMeta(
+ name='infovqa',
+ pretty_name='InfoVQA',
+ tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+ description=
+ 'InfoVQA (Information Visual Question Answering) is a benchmark designed to evaluate how well AI models can answer questions based on information-dense images, such as charts, graphs, diagrams, maps, and infographics.', # noqa: E501
+ dataset_id='lmms-lab/DocVQA',
+ subset_list=['InfographicVQA'],
+ metric_list=['anls'],
+ eval_split='validation',
+ prompt_template=PROMPT,
+ )
+ )
+ class InfoVQAAdapter(VisionLanguageAdapter):
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ self.add_aggregation_name = False
+
+ def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+
+ input_text = PROMPT.format(question=record['question'])
+ content_list: List[Content] = [ContentText(text=input_text)]
+ image = record.get('image')
+ if image:
+ image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
+ content_list.append(ContentImage(image=image_base64))
+ return Sample(
+ input=[ChatMessageUser(content=content_list)],
+ target=json.dumps(record.get('answers')), # answers is a list
+ metadata={
+ 'questionId': record.get('questionId'),
+ 'answer_type': record.get('answer_type'),
+ 'image_url': record.get('image_url'),
+ 'ocr': record.get('ocr'),
+ }
+ )
+
+ def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+ import re
+
+ pattern = r'ANSWER:\s*(.*)'
+ match = re.search(pattern, prediction)
+ if match:
+ return match.group(1).strip()
+ return prediction.strip()
evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py
@@ -1,3 +1,4 @@
+ # flake8: noqa: E501
  from typing import Any, Dict

  from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
@@ -19,7 +20,8 @@ logger = get_logger()
  pretty_name='Live-Code-Bench',
  tags=[Tags.CODING],
  description=
- 'Live Code Bench is a benchmark for evaluating code generation models on real-world coding tasks. It includes a variety of programming problems with test cases to assess the model\'s ability to generate correct and efficient code solutions.', # noqa: E501
+ 'Live Code Bench is a benchmark for evaluating code generation models on real-world coding tasks. It includes a variety of programming problems with test cases to assess the model\'s ability to generate correct and efficient code solutions. '
+ '**By default the code is executed in local environment. We recommend using sandbox execution to safely run and evaluate the generated code, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/user_guides/sandbox.html) for more details.**',
  dataset_id='AI-ModelScope/code_generation_lite',
  subset_list=['release_latest'],
  metric_list=['Pass@1'],
evalscope/benchmarks/math_verse/__init__.py (file without changes)
evalscope/benchmarks/math_verse/math_verse_adapter.py (new file)
@@ -0,0 +1,100 @@
+ # flake8: noqa: E501
+ from typing import Any, Dict
+
+ from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.io_utils import bytes_to_base64
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ MULTI_CHOICE_TYPE = 'multi-choice'
+ OPEN_TYPE = 'free-form'
+
+ OPEN_PROMPT = '{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.'
+
+ MULT_CHOICE_PROMPT = """
+ Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of A, B, C, D. Think step by step before answering.
+
+ {question}
+ """
+
+ SUBSET_LIST = ['Text Dominant', 'Text Lite', 'Vision Intensive', 'Vision Dominant', 'Vision Only']
+
+
+ @register_benchmark(
+ BenchmarkMeta(
+ name='math_verse',
+ pretty_name='MathVerse',
+ dataset_id='evalscope/MathVerse',
+ tags=[Tags.MATH, Tags.REASONING, Tags.MULTIPLE_CHOICE, Tags.MULTI_MODAL],
+ description=
+ 'MathVerse, an all-around visual math benchmark designed for an equitable and in-depth evaluation of MLLMs. 2,612 high-quality, multi-subject math problems with diagrams from publicly available sources. Each problem is then transformed by human annotators into six distinct versions, each offering varying degrees of information content in multi-modality, contributing to 15K test samples in total. This approach allows MathVerse to comprehensively assess whether and how much MLLMs can truly understand the visual diagrams for mathematical reasoning.',
+ subset_list=SUBSET_LIST,
+ metric_list=[{
+ 'acc': {
+ 'numeric': True
+ }
+ }],
+ default_subset='testmini',
+ eval_split='testmini',
+ prompt_template=OPEN_PROMPT,
+ )
+ )
+ class MathVerseAdapter(VisionLanguageAdapter):
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ self.reformat_subset = True
+ self._use_llm_judge = True
+
+ def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+ """
+ Convert a dataset record to a Sample. Unifies handling for both multi-choice and free-form.
+ Builds the content list inline and appends image content if provided.
+
+ Args:
+ record: Raw dataset record.
+
+ Returns:
+ Sample: The standardized sample ready for evaluation.
+ """
+ question_type = record.get('question_type', OPEN_TYPE)
+ question: str = record.get('question', '')
+ content_list: list[Content] = []
+
+ # Choose prompt text based on type; keep a single unified flow for creating Sample
+ if question_type == MULTI_CHOICE_TYPE:
+ prompt_text = MULT_CHOICE_PROMPT.format(question=question).strip()
+ else:
+ prompt_text = OPEN_PROMPT.format(question=question).strip()
+
+ content_list.append(ContentText(text=prompt_text))
+
+ # Append image if exists
+ image = record.get('image')
+ if image and isinstance(image, dict):
+ image_bytes = image.get('bytes')
+ if image_bytes:
+ image_base64 = bytes_to_base64(image_bytes, format='png', add_header=True)
+ content_list.append(ContentImage(image=image_base64))
+
+ metadata: Dict[str, Any] = {
+ 'sample_index': record.get('sample_index'),
+ 'problem_index': record.get('problem_index'),
+ 'problem_version': record.get('problem_version'),
+ 'question_type': question_type,
+ 'query_wo': record.get('query_wo'),
+ 'query_cot': record.get('query_cot'),
+ 'question_for_eval': record.get('question_for_eval'),
+ }
+
+ return Sample(
+ input=[ChatMessageUser(content=content_list)],
+ target=record['answer'],
+ subset_key=record['problem_version'],
+ metadata=metadata,
+ )
evalscope/benchmarks/math_vision/__init__.py (file without changes)
evalscope/benchmarks/math_vision/math_vision_adapter.py (new file)
@@ -0,0 +1,111 @@
+ # flake8: noqa: E501
+ import re
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.io_utils import bytes_to_base64
+ from evalscope.utils.logger import get_logger
+ from evalscope.utils.multi_choices import MultipleChoiceTemplate, parse_answers, prompt
+
+ logger = get_logger()
+
+ OPEN_PROMPT = '{question}\nPlease reason step by step, and put your final answer within \\boxed{{}} without units.'
+
+ MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT
+
+ SUBSET_LIST = ['level 1', 'level 2', 'level 3', 'level 4', 'level 5']
+
+
+ @register_benchmark(
+ BenchmarkMeta(
+ name='math_vision',
+ pretty_name='MathVision',
+ dataset_id='evalscope/MathVision',
+ tags=[Tags.MATH, Tags.REASONING, Tags.MULTIPLE_CHOICE, Tags.MULTI_MODAL],
+ description=
+ 'The MATH-Vision (MATH-V) dataset, a meticulously curated collection of 3,040 high-quality mathematical problems with visual contexts sourced from real math competitions.',
+ subset_list=SUBSET_LIST,
+ metric_list=[{
+ 'acc': {
+ 'numeric': True
+ }
+ }],
+ eval_split='test',
+ prompt_template=OPEN_PROMPT,
+ )
+ )
+ class MathVisionAdapter(VisionLanguageAdapter):
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ self.reformat_subset = True
+
+ def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+ if len(record['options']) > 0:
+ question_type = 'multi_choice'
+ else:
+ question_type = 'free_form'
+ content_list, answers_list = MathVisionAdapter.create_content_and_answers_list(record, question_type)
+ metadata = {
+ 'id': record['id'],
+ 'image': record['image'],
+ 'solution': record['solution'],
+ 'level': record['level'],
+ 'question_type': question_type,
+ 'subject': record['subject']
+ }
+ if question_type == 'multi_choice':
+ label_answer = record['answer']
+ return Sample(
+ input=[ChatMessageUser(content=content_list)],
+ choices=answers_list,
+ target=label_answer,
+ subset_key=f'level {record["level"]}',
+ metadata=metadata
+ )
+ elif question_type == 'free_form':
+ return Sample(
+ input=[ChatMessageUser(content=content_list)],
+ target=record['answer'],
+ subset_key=f'level {record["level"]}',
+ metadata=metadata
+ )
+ else:
+ raise ValueError(f'Unexpected question_type: {question_type}')
+
+ @staticmethod
+ def create_content_and_answers_list(record: Dict[str, Any], question_type) -> tuple[List[Content], List[str]]:
+ """
+ Create a list of content elements and a list of answers from a record.
+
+ Args:
+ record (dict): The record containing question, images, and options.
+ question_type (str): The type of this question
+
+
+ Returns:
+ tuple: A tuple containing:
+ - content_list (list): A list of content elements (text and images).
+ - answers_list (list): A list of possible answers (for multiple-choice questions).
+ """
+
+ # Replace <image1>, <image2> ... to [image1], [image2], ... from question text
+ question = re.sub(r'<image(\d+)>', r'[image\1]', record['question']).strip()
+
+ if question_type == 'multi_choice':
+ answers_list = record['options']
+ input_text = prompt(question=question, choices=answers_list, template=MULT_CHOICE_PROMPT)
+ content_list: List[Content] = [ContentText(text=input_text)]
+ else:
+ answers_list: List[str] = []
+ content_list: List[Content] = [ContentText(text=OPEN_PROMPT.format(question=question))]
+ image = record['decoded_image']
+ if image:
+ image_base64 = bytes_to_base64(image['bytes'], format='jpg', add_header=True)
+ content_list.append(ContentImage(image=image_base64))
+ return content_list, answers_list
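A small aside on the <imageN> rewrite in create_content_and_answers_list above; the question string below is invented for illustration, while the substitution call mirrors the one used in the adapter.

import re

# Rewrites <image1>, <image2>, ... placeholders into bracketed [imageN] markers.
question = 'As shown in <image1> and <image2>, find the shaded area.'
print(re.sub(r'<image(\d+)>', r'[image\1]', question))
# -> As shown in [image1] and [image2], find the shaded area.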
evalscope/benchmarks/math_vista/math_vista_adapter.py
@@ -4,7 +4,6 @@ from typing import Any, Dict

  from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
  from evalscope.api.dataset import Sample
- from evalscope.api.evaluator import TaskState
  from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
  from evalscope.api.registry import register_benchmark
  from evalscope.constants import Tags
@@ -14,15 +13,7 @@ from evalscope.utils.multi_choices import MultipleChoiceTemplate, parse_answers,

  logger = get_logger()

- SUBSET_LIST = ['default']
-
- OPEN_PROMPT = """
- Solve the following problem step by step. The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem.
-
- {question}
-
- Remember to put your answer on its own line at the end in the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem, and you do not need to use a \\boxed command.
- """
+ OPEN_PROMPT = '{question}\nPlease reason step by step, and put your final answer within \\boxed{{}} without units.'

  MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT

@@ -38,8 +29,11 @@ OPEN_TYPE = 'free_form'
  tags=[Tags.MATH, Tags.REASONING, Tags.MULTIPLE_CHOICE, Tags.MULTI_MODAL],
  description=
  'MathVista is a consolidated Mathematical reasoning benchmark within Visual contexts. It consists of three newly created datasets, IQTest, FunctionQA, and PaperQA, which address the missing visual domains and are tailored to evaluate logical reasoning on puzzle test figures, algebraic reasoning over functional plots, and scientific reasoning with academic paper figures, respectively. It also incorporates 9 MathQA datasets and 19 VQA datasets from the literature, which significantly enrich the diversity and complexity of visual perception and mathematical reasoning challenges within our benchmark. In total, MathVista includes 6,141 examples collected from 31 different datasets.',
- subset_list=SUBSET_LIST,
- metric_list=['acc'],
+ metric_list=[{
+ 'acc': {
+ 'numeric': True
+ }
+ }],
  eval_split='testmini',
  prompt_template=OPEN_PROMPT,
  )
@@ -86,20 +80,6 @@ class MathVistaAdapter(VisionLanguageAdapter):
  logger.warning(f"Answer '{value}' not found in options: {options}. This may cause evaluation issues.")
  return value

- def extract_answer(self, prediction: str, task_state: TaskState) -> str:
- question_type = task_state.metadata['question_type']
- if question_type == MULTI_CHOICE_TYPE:
- answers = parse_answers(task_state)
- return ''.join(sorted(list(answers)))
- elif question_type == OPEN_TYPE:
- pattern = r'ANSWER:\s*(.*)'
- match = re.search(pattern, prediction)
- if match:
- return match.group(1).strip()
- return ''
- else:
- raise ValueError(f'Unsupported question type: {question_type}')
-
  @staticmethod
  def create_content_and_answers_list(record: dict[str, Any], ) -> tuple[list[Content], list[str]]:
  """
evalscope/benchmarks/mm_bench/mm_bench_adapter.py
@@ -35,7 +35,7 @@ class CCBenchAdapter(VisionLanguageAdapter, MultiChoiceAdapter):

  def record_to_sample(self, record: Dict[str, Any]) -> Sample:
  answers_list: List[str] = [record.get('A', ''), record.get('B', ''), record.get('C', ''), record.get('D', '')]
- input_text = prompt(question=record['question'], choices=answers_list, template=MULT_CHOICE_PROMPT)
+ input_text = prompt(question=record['question'], choices=answers_list, template=self.prompt_template)
  content_list: List[Content] = [ContentText(text=input_text)]
  image = record.get('image')
  if image:
@@ -77,7 +77,7 @@ class MMBenchAdapter(VisionLanguageAdapter, MultiChoiceAdapter):
  answers_list: List[str] = [record.get('A', ''), record.get('B', ''), record.get('C', ''), record.get('D', '')]
  answers_list = [ans for ans in answers_list if (ans.strip() and ans != 'nan')]
  question_hint = record['hint'] + record['question']
- input_text = prompt(question=question_hint, choices=answers_list, template=MULT_CHOICE_PROMPT)
+ input_text = prompt(question=question_hint, choices=answers_list, template=self.prompt_template)
  content_list: List[Content] = [ContentText(text=input_text)]
  image = record.get('image')
  if image:
evalscope/benchmarks/mmmu/mmmu_adapter.py
@@ -122,7 +122,7 @@ class MMMUAdapter(VisionLanguageAdapter):
  match = re.search(pattern, prediction)
  if match:
  return match.group(1).strip()
- return ''
+ return prediction.strip()
  else:
  raise ValueError(f'Unsupported question type: {question_type}')

evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py
@@ -36,7 +36,7 @@ Don't give information outside the document or repeat your findings."""
  tags=[Tags.RETRIEVAL, Tags.LONG_CONTEXT],
  description='Needle in a Haystack is a benchmark focused on information retrieval tasks. '
  'It requires the model to find specific information within a large corpus of text. '
- '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/needle_haystack.html)', # noqa: E501
+ '[Usage Example](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html)', # noqa: E501
  dataset_id='AI-ModelScope/Needle-in-a-Haystack-Corpus',
  metric_list=['acc'],
  subset_list=['english', 'chinese'],
evalscope/benchmarks/ner/__init__.py (file without changes)