PyPI - evalscope - Versions diffs - 1.0.2__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

evalscope 1.0.2__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of evalscope might be problematic. Click here for more details.

Files changed (87)

evalscope/api/benchmark/adapters/default_data_adapter.py +12 -0
evalscope/app/ui/multi_model.py +6 -1
evalscope/app/ui/single_model.py +8 -2
evalscope/app/utils/data_utils.py +3 -2
evalscope/app/utils/visualization.py +2 -2
evalscope/benchmarks/ai2d/ai2d_adapter.py +3 -2
evalscope/benchmarks/bfcl/bfcl_adapter.py +10 -45
evalscope/benchmarks/blink/blink_adapter.py +61 -0
evalscope/benchmarks/chartqa/__init__.py +0 -0
evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
evalscope/benchmarks/chartqa/utils.py +38 -0
evalscope/benchmarks/docvqa/__init__.py +0 -0
evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
evalscope/benchmarks/general_arena/utils.py +2 -1
evalscope/benchmarks/hle/hle_adapter.py +3 -2
evalscope/benchmarks/infovqa/__init__.py +0 -0
evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
evalscope/benchmarks/mm_bench/mm_bench_adapter.py +2 -2
evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
evalscope/benchmarks/ocr_bench/__init__.py +0 -0
evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
evalscope/benchmarks/ocr_bench_v2/utils.py +432 -0
evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
evalscope/metrics/metric.py +51 -0
evalscope/metrics/metrics.py +16 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
evalscope/report/__init__.py +9 -1
evalscope/report/combinator.py +52 -2
evalscope/utils/json_schema.py +8 -6
evalscope/utils/multi_choices.py +16 -1
evalscope/version.py +2 -2
{evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/METADATA +6 -32
{evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/RECORD +51 -54
{evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/top_level.txt +0 -1
tests/__init__.py +0 -1
tests/benchmark/__init__.py +0 -1
tests/benchmark/test_eval.py +0 -429
tests/benchmark/test_image_edit.py +0 -65
tests/benchmark/test_sandbox.py +0 -81
tests/benchmark/test_t2i.py +0 -142
tests/benchmark/test_vlm.py +0 -137
tests/cli/__init__.py +0 -1
tests/cli/test_all.py +0 -269
tests/cli/test_collection.py +0 -99
tests/cli/test_custom.py +0 -268
tests/cli/test_reasoning.py +0 -81
tests/common.py +0 -73
tests/perf/__init__.py +0 -1
tests/perf/test_perf.py +0 -206
tests/rag/test_clip_benchmark.py +0 -87
tests/rag/test_mteb.py +0 -213
tests/rag/test_ragas.py +0 -128
tests/swift/__init__.py +0 -1
tests/swift/test_run_swift_eval.py +0 -146
tests/swift/test_run_swift_vlm_eval.py +0 -128
tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
tests/test_run_all.py +0 -12
tests/utils.py +0 -13
tests/vlm/__init__.py +0 -1
tests/vlm/test_vlmeval.py +0 -102
{tests/rag → evalscope/benchmarks/blink}/__init__.py +0 -0
{evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/LICENSE +0 -0
{evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/WHEEL +0 -0
{evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/entry_points.txt +0 -0

evalscope/api/benchmark/adapters/default_data_adapter.py CHANGED Viewed

@@ -128,6 +128,9 @@ class DefaultDataAdapter(DataAdapter):
             for sample in self.test_dataset[subset]:
                 if isinstance(sample.input, str):
                     sample.input = self.process_sample_str_input(sample, subset)
+                elif isinstance(sample.input, list):
+                    # Handle list[ChatMessage] and add system prompt if needed
+                    sample.input = self.process_sample_messages_input(sample, subset)
     def process_sample_str_input(self, sample: Sample, subset: str) -> List[ChatMessage]:
         """
@@ -142,6 +145,15 @@ class DefaultDataAdapter(DataAdapter):
             input_messages.insert(0, ChatMessageSystem(content=self.system_prompt))
         return input_messages
+    def process_sample_messages_input(self, sample: Sample, subset: str) -> List[ChatMessage]:
+        """
+        Normalize a sample's existing List[ChatMessage] input and ensure system prompt is set once.
+        """
+        messages = list(sample.input)  # shallow copy to avoid in-place mutations
+        if self.system_prompt and not any(isinstance(m, ChatMessageSystem) for m in messages):
+            messages = [ChatMessageSystem(content=self.system_prompt)] + messages
+        return messages
     def process_sample_input(self, sample: Sample, subset: str) -> str:
         """
         Process a single sample's input by applying prompt templates and few-shot formatting.

evalscope/app/ui/multi_model.py CHANGED Viewed

@@ -204,7 +204,12 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
         data_score_df_b, _ = get_single_dataset_df(report_df_b, dataset_name)
         # Get subset choices - should be same for both models
-        subsets = data_score_df_a[ReportKey.subset_name].unique().tolist()
+        # Only select the subsets that Cat.0 is not '-'
+        df_for_subsets = data_score_df_a.copy()
+        subsets = sorted(
+            df_for_subsets.loc[df_for_subsets[f'{ReportKey.category_prefix}0'].ne('-'),
+                               ReportKey.subset_name].dropna().unique().tolist()
+        )
         return gr.update(choices=subsets, value=None), None

evalscope/app/ui/single_model.py CHANGED Viewed

@@ -134,11 +134,17 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
     )
     def update_single_report_dataset(dataset_name, report_list):
         logger.debug(f'Updating single report dataset: {dataset_name}')
-        report_df = get_data_frame(report_list=report_list)
+        report_df = get_data_frame(report_list=report_list, flatten_metrics=True, flatten_categories=True)
         analysis = get_report_analysis(report_list, dataset_name)
         data_score_df, styler = get_single_dataset_df(report_df, dataset_name)
         data_score_plot = plot_single_dataset_scores(data_score_df)
-        subsets = data_score_df[ReportKey.subset_name].unique().tolist()
+        # Only select the subsets that Cat.0 is not '-'
+        df_for_subsets = data_score_df.copy()
+        subsets = sorted(
+            df_for_subsets.loc[df_for_subsets[f'{ReportKey.category_prefix}0'].ne('-'),
+                               ReportKey.subset_name].dropna().unique().tolist()
+        )
         logger.debug(f'subsets: {subsets}')
         return data_score_plot, styler, gr.update(choices=subsets, value=None), None, analysis

evalscope/app/utils/data_utils.py CHANGED Viewed

@@ -168,9 +168,10 @@ def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subs
             'Index': str(review_result.index),
             'Input': review_result.input.replace('\n', '\n\n'),  # for markdown
             'Metadata': metadata,
-            'Generated': prediction,
+            'Generated': prediction or '',  # Ensure no None value
             'Gold': target,
-            'Pred': extracted_prediction if extracted_prediction != prediction else '*Same as Generated*',
+            'Pred': (extracted_prediction if extracted_prediction != prediction else '*Same as Generated*')
+            or '',  # Ensure no None value
             'Score': score.model_dump(exclude_none=True),
             'NScore': normalize_score(score.main_value)
         }

evalscope/app/utils/visualization.py CHANGED Viewed

@@ -18,7 +18,7 @@ logger = get_logger()
 def plot_single_report_scores(df: pd.DataFrame):
     if df is None:
         return None
-    logger.debug(f'df: {df}')
+    logger.debug(f'df: \n{df}')
     plot = px.bar(df, x=df[ReportKey.dataset_name], y=df[ReportKey.score], text=df[ReportKey.score])
     width = DEFAULT_BAR_WIDTH if len(df[ReportKey.dataset_name]) <= 5 else None
@@ -36,7 +36,7 @@ def plot_single_report_sunburst(report_list: List[Report]):
         df = get_data_frame(report_list=report_list, flatten_metrics=False)
         categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
         path = [ReportKey.dataset_name] + categories + [ReportKey.subset_name]
-    logger.debug(f'df: {df}')
+    logger.debug(f'df: \n{df}')
     df[categories] = df[categories].fillna('default')  # NOTE: fillna for empty categories
     plot = px.sunburst(

evalscope/benchmarks/ai2d/ai2d_adapter.py CHANGED Viewed

@@ -22,7 +22,8 @@ MULT_CHOICE_PROMPT = MultipleChoiceTemplate.SINGLE_ANSWER_COT
         name='ai2d',
         pretty_name='AI2D',
         tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
-        description='A Diagram Is Worth A Dozen Images',
+        description=
+        'AI2D is a benchmark dataset for researching the understanding of diagrams by AI. It contains over 5,000 diverse diagrams from science textbooks (e.g., the water cycle, food webs). Each diagram is accompanied by multiple-choice questions that test an AI\'s ability to interpret visual elements, text labels, and their relationships. The benchmark is challenging because it requires jointly understanding the layout, symbols, and text to answer questions correctly.',  # noqa: E501
         dataset_id='lmms-lab/ai2d',
         subset_list=SUBSET_LIST,
         metric_list=['acc'],
@@ -37,7 +38,7 @@ class Ai2dAdapter(VisionLanguageAdapter):
     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
         answers_list: list[str] = record['options']
-        input_text = prompt(question=record['question'], choices=answers_list, template=MULT_CHOICE_PROMPT)
+        input_text = prompt(question=record['question'], choices=answers_list, template=self.prompt_template)
         content_list: list[Content] = [ContentText(text=input_text)]
         image = record.get('image')
         if image:

evalscope/benchmarks/bfcl/bfcl_adapter.py CHANGED Viewed

@@ -8,11 +8,10 @@ from evalscope.api.dataset import Sample
 from evalscope.api.evaluator import TaskState
 from evalscope.api.messages.chat_message import ChatMessageUser
 from evalscope.api.metric import Score
-from evalscope.api.metric.scorer import AggScore
 from evalscope.api.model import Model, ModelOutput
 from evalscope.api.registry import register_benchmark
 from evalscope.constants import Tags
-from evalscope.report import Category, Report, Subset
+from evalscope.report import Category, Report, Subset, unweighted_average_from_subsets, weighted_average_from_subsets
 from evalscope.utils.import_utils import check_import
 from evalscope.utils.logger import get_logger
@@ -79,40 +78,6 @@ class BFCLAdapter(DefaultDataAdapter):
         self.underscore_to_dot = self.extra_params.get('underscore_to_dot', True)
         self.is_fc_model = self.extra_params.get('is_fc_model', True)
-    def _weighted_average_from_subsets(self, subset_names: List[str], subset_dict: Dict[str, Subset]) -> Subset:
-        """Calculate weighted average for given subsets.
-        Returns:
-            Subset: A new Subset object with weighted average score
-        """
-        total_score = 0
-        total_count = 0
-        for name in subset_names:
-            if name in subset_dict:
-                subset = subset_dict[name]
-                total_score += subset.score * subset.num
-                total_count += subset.num
-        weighted_avg = total_score / total_count if total_count > 0 else 0
-        return Subset(name='', score=weighted_avg, num=total_count)
-    def _unweighted_average_from_subsets(self, subset_names: List[str], subset_dict: Dict[str, Subset]) -> Subset:
-        """Calculate unweighted average for given subsets.
-        Returns:
-            Subset: A new Subset object with unweighted average score
-        """
-        scores = []
-        total_count = 0
-        for name in subset_names:
-            if name in subset_dict:
-                subset = subset_dict[name]
-                scores.append(subset.score)
-                total_count += subset.num
-        unweighted_avg = sum(scores) / len(scores) if scores else 0
-        return Subset(name='', score=unweighted_avg, num=total_count)
     def preprocess_row(self, row: dict):
         """
         Inplace preprocess the row to ensure it has the correct format for BFCL evaluation.
@@ -323,19 +288,19 @@ class BFCLAdapter(DefaultDataAdapter):
             # Step 1: Calculate simple_ast (simple, java, javascript unweighted average)
             simple_subsets = ['simple', 'java', 'javascript']
-            simple_ast = self._unweighted_average_from_subsets(simple_subsets, subset_dict)
+            simple_ast = unweighted_average_from_subsets(simple_subsets, subset_dict)
             subset_dict['simple_ast'] = simple_ast
             # Step 2.1: Calculate ast_non_live
             # (simple_ast, multiple, parallel, parallel_multiple unweighted average)
             ast_non_live_subsets = ['simple_ast', 'multiple', 'parallel', 'parallel_multiple']
-            ast_non_live = self._unweighted_average_from_subsets(ast_non_live_subsets, subset_dict)
+            ast_non_live = unweighted_average_from_subsets(ast_non_live_subsets, subset_dict)
             subset_dict['ast_non_live'] = ast_non_live
             # Step 2.2: Calculate ast_live
             # (live_simple, live_multiple, live_parallel, live_parallel_multiple weighted average)
             live_subsets = ['live_simple', 'live_multiple', 'live_parallel', 'live_parallel_multiple']
-            ast_live = self._weighted_average_from_subsets(live_subsets, subset_dict)
+            ast_live = weighted_average_from_subsets(live_subsets, subset_dict)
             subset_dict['ast_live'] = ast_live
             # Step 2.3: hallucination_non_live (irrelevance)
@@ -346,7 +311,7 @@ class BFCLAdapter(DefaultDataAdapter):
             # Step 2.4: Calculate hallucination_live (live_irrelevance, live_relevance weighted average)
             hallucination_live_subsets = ['live_irrelevance', 'live_relevance']
-            hallucination_live = self._weighted_average_from_subsets(hallucination_live_subsets, subset_dict)
+            hallucination_live = weighted_average_from_subsets(hallucination_live_subsets, subset_dict)
             subset_dict['hallucination_live'] = hallucination_live
             # Step 2.5: multi_turn_base
@@ -356,27 +321,27 @@ class BFCLAdapter(DefaultDataAdapter):
             # Step 2.6: Calculate multi_turn_augmented
             # (multi_turn_miss_func, multi_turn_miss_param, multi_turn_long_context weighted average)
             multi_turn_augmented_subsets = ['multi_turn_miss_func', 'multi_turn_miss_param', 'multi_turn_long_context']
-            multi_turn_augmented = self._weighted_average_from_subsets(multi_turn_augmented_subsets, subset_dict)
+            multi_turn_augmented = weighted_average_from_subsets(multi_turn_augmented_subsets, subset_dict)
             subset_dict['multi_turn_augmented'] = multi_turn_augmented
             # Step 3.1: Calculate non_live (ast_non_live, hallucination_non_live unweighted average)
             non_live_subsets = ['ast_non_live', 'hallucination_non_live']
-            non_live = self._unweighted_average_from_subsets(non_live_subsets, subset_dict)
+            non_live = unweighted_average_from_subsets(non_live_subsets, subset_dict)
             subset_dict['non_live'] = non_live
             # Step 3.2: Calculate live (ast_live, hallucination_live weighted average)
             live_agg_subsets = ['ast_live', 'hallucination_live']
-            live = self._weighted_average_from_subsets(live_agg_subsets, subset_dict)
+            live = weighted_average_from_subsets(live_agg_subsets, subset_dict)
             subset_dict['live'] = live
             # Step 3.3: Calculate multi_turn (multi_turn_base, multi_turn_augmented unweighted average)
             multi_turn_subsets = ['multi_turn_base', 'multi_turn_augmented']
-            multi_turn = self._unweighted_average_from_subsets(multi_turn_subsets, subset_dict)
+            multi_turn = unweighted_average_from_subsets(multi_turn_subsets, subset_dict)
             subset_dict['multi_turn'] = multi_turn
             # Step 4: Calculate overall (non_live, live, multi_turn unweighted average)
             overall_subsets = ['non_live', 'live', 'multi_turn']
-            overall = self._unweighted_average_from_subsets(overall_subsets, subset_dict)
+            overall = unweighted_average_from_subsets(overall_subsets, subset_dict)
             subset_dict['overall'] = overall
             # Add computed scores to the category

evalscope/benchmarks/blink/blink_adapter.py ADDED Viewed

@@ -0,0 +1,61 @@
+import re
+from typing import Any, Dict, List
+from evalscope.api.benchmark import BenchmarkMeta, MultiChoiceAdapter, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+from evalscope.utils.multi_choices import format_letter_choices
+logger = get_logger()
+MULT_CHOICE_PROMPT = r"""
+Answer the following multiple choice question. The last line of your response should be of the following format:
+'ANSWER: $LETTER' (without quotes) where LETTER is one of {letters}.
+{question}
+""".strip()
+SUBSET_LIST = [
+    'Art_Style', 'Counting', 'Forensic_Detection', 'Functional_Correspondence', 'IQ_Test', 'Jigsaw',
+    'Multi-view_Reasoning', 'Object_Localization', 'Relative_Depth', 'Relative_Reflectance', 'Semantic_Correspondence',
+    'Spatial_Relation', 'Visual_Correspondence', 'Visual_Similarity'
+]
+@register_benchmark(
+    BenchmarkMeta(
+        name='blink',
+        pretty_name='BLINK',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.MULTIPLE_CHOICE],
+        description=
+        'BLINK is a benchmark designed to evaluate the core visual perception abilities of multimodal large language models (MLLMs). It transforms 14 classic computer vision tasks into 3,807 multiple-choice questions, accompanied by single or multiple images and visual prompts.',  # noqa: E501
+        dataset_id='evalscope/BLINK',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='val',
+        prompt_template=MULT_CHOICE_PROMPT,
+    )
+)
+class BLINKAdapter(VisionLanguageAdapter, MultiChoiceAdapter):
+    MAX_IMAGES: int = 4
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        choices = record.get('choices')
+        input_text = MULT_CHOICE_PROMPT.format(question=record['prompt'], letters=format_letter_choices(choices))
+        content_list: List[Content] = [ContentText(text=input_text)]
+        for i in range(1, self.MAX_IMAGES + 1):
+            image = record.get(f'image_{i}')
+            if image:
+                image_base64 = bytes_to_base64(image['bytes'], format='jpeg', add_header=True)
+                content_list.append(ContentImage(image=image_base64))
+        label_answer = record['answer'].strip('(').strip(')')
+        return Sample(input=[ChatMessageUser(content=content_list)], choices=choices, target=label_answer)

evalscope/benchmarks/chartqa/__init__.py ADDED Viewed

File without changes

evalscope/benchmarks/chartqa/chartqa_adapter.py ADDED Viewed

@@ -0,0 +1,80 @@
+import re
+from typing import Any, Dict, List
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.metric.scorer import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+# flake8: noqa
+logger = get_logger()
+OPEN_PROMPT = """
+{question}
+The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the a single word answer to the problem.
+"""
+@register_benchmark(
+    BenchmarkMeta(
+        name='chartqa',
+        pretty_name='ChartQA',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'ChartQA is a benchmark designed to evaluate question-answering capabilities about charts (e.g., bar charts, line graphs, pie charts), focusing on both visual and logical reasoning.',  # noqa: E501
+        dataset_id='lmms-lab/ChartQA',
+        subset_list=['human_test', 'augmented_test'],
+        metric_list=['relaxed_acc'],
+        eval_split='test',
+        prompt_template=OPEN_PROMPT,
+    )
+)
+class ChartQAAdapter(VisionLanguageAdapter):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.add_aggregation_name = False
+        self.reformat_subset = True
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        question = record['question']
+        image_data = record['image']
+        image_base64 = bytes_to_base64(image_data['bytes'], format='png', add_header=True)
+        content_list: List[Content] = [
+            ContentText(text=OPEN_PROMPT.format(question=question)),
+            ContentImage(image=image_base64)
+        ]
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=record['answer'],
+            subset_key=record['type'],  # 'human_test' or 'augmented_split'
+        )
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        pattern = r'ANSWER:\s*(.*)'
+        match = re.search(pattern, prediction)
+        if match:
+            return match.group(1).strip()
+        return ''
+    def match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
+        from .utils import relaxed_correctness
+        score = relaxed_correctness(filtered_prediction, reference)
+        score = 1.0 if score else 0.0
+        return Score(
+            value={'relaxed_acc': score},
+            prediction=original_prediction,
+            extracted_prediction=filtered_prediction,
+        )

evalscope/benchmarks/chartqa/utils.py ADDED Viewed

@@ -0,0 +1,38 @@
+def relaxed_correctness(prediction: str, target: str, max_relative_change: float = 0.05) -> bool:
+    """Calculates relaxed correctness.
+    The correctness tolerates certain error ratio defined by max_relative_change.
+    See https://arxiv.org/pdf/2203.10244.pdf, end of section 5.1:
+    “Following Methani et al. (2020), we use a relaxed accuracy measure for the
+    numeric answers to allow a minor inaccuracy that may result from the automatic
+    data extraction process. We consider an answer to be correct if it is within
+    5% of the gold answer. For non-numeric answers, we still need an exact match
+    to consider an answer to be correct.”
+    This funcion is taken from https://github.com/QwenLM/Qwen-VL/blob/34b4c0ee7b07726371b960911f249fe61b362ca3/eval_mm/evaluate_vqa.py#L113
+    Args:
+      target: List of target string.
+      prediction: List of predicted string.
+      max_relative_change: Maximum relative change.
+    Returns:
+      Whether the prediction was correct given the specified tolerance.
+    """  # noqa: E501
+    def _to_float(text: str):
+        try:
+            if text.endswith('%'):
+                # Convert percentages to floats.
+                return float(text.rstrip('%')) / 100.0
+            else:
+                return float(text)
+        except ValueError:
+            return None
+    prediction_float = _to_float(prediction)
+    target_float = _to_float(target)
+    if prediction_float is not None and target_float:
+        relative_change = abs(prediction_float - target_float) / abs(target_float)
+        return relative_change <= max_relative_change
+    else:
+        return prediction.lower() == target.lower()

evalscope/benchmarks/docvqa/__init__.py ADDED Viewed

File without changes

evalscope/benchmarks/docvqa/docvqa_adapter.py ADDED Viewed

@@ -0,0 +1,67 @@
+import json
+from typing import Any, Dict, List
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator.state import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+logger = get_logger()
+PROMPT = """Answer the question according to the image using a single word or phrase.
+{question}
+The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the question."""  # noqa: E501
+@register_benchmark(
+    BenchmarkMeta(
+        name='docvqa',
+        pretty_name='DocVQA',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'DocVQA (Document Visual Question Answering) is a benchmark designed to evaluate AI systems on their ability to answer questions based on the content of document images, such as scanned pages, forms, or invoices. Unlike general visual question answering, it requires understanding not just the text extracted by OCR, but also the complex layout, structure, and visual elements of a document.',  # noqa: E501
+        dataset_id='lmms-lab/DocVQA',
+        subset_list=['DocVQA'],
+        metric_list=['anls'],
+        eval_split='validation',
+        prompt_template=PROMPT,
+    )
+)
+class DocVQAAdapter(VisionLanguageAdapter):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.add_aggregation_name = False
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        input_text = PROMPT.format(question=record['question'])
+        content_list: List[Content] = [ContentText(text=input_text)]
+        image = record.get('image')
+        if image:
+            image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
+            content_list.append(ContentImage(image=image_base64))
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=json.dumps(record.get('answers')),  # answers is a list
+            metadata={
+                'questionId': record.get('questionId'),
+                'question_types': record.get('question_types'),
+                'docId': record.get('docId'),
+                'ucsf_document_id': record.get('ucsf_document_id'),
+                'ucsf_document_page_no': record.get('ucsf_document_page_no'),
+            }
+        )
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        import re
+        pattern = r'ANSWER:\s*(.*)'
+        match = re.search(pattern, prediction)
+        if match:
+            return match.group(1).strip()
+        return prediction.strip()

evalscope/benchmarks/general_arena/utils.py CHANGED Viewed

@@ -34,7 +34,8 @@ def process_review_item(review_result: ReviewResult) -> list:
         'Index': str(review_result.index),
         'Input': review_result.input,
         'Question': review_result.input,  # Use input as question
-        'Generated': prediction if prediction != extracted_prediction else extracted_prediction,
+        'Generated':
+        prediction if prediction != extracted_prediction else extracted_prediction or '',  # Ensure no None value
         'Gold': target,
         'Pred': extracted_prediction,
         'Score': sample_score.score.model_dump(exclude_none=True),

evalscope/benchmarks/hle/hle_adapter.py CHANGED Viewed

@@ -57,8 +57,9 @@ Your judgment must focus only on if there are meaningful differences between [co
         'humanities/social science (9%), computer science/artificial intelligence (10%), '
         'engineering (4%), chemistry (7%), and other (9%). Around 14% of the questions '
         'require the ability to understand both text and images, i.e., multi-modality. '
-        '24% of the questions are multiple-choice; the rest are short-answer, exact-match questions. '
-        'To evaluate the performance of model without multi-modality capabilities, please set the extra_params["include_multi_modal"] to False.',  # noqa: E501
+        '24% of the questions are multiple-choice; the rest are short-answer, exact-match questions. \n'
+        '**To evaluate the performance of model without multi-modality capabilities, '
+        'please set the `extra_params["include_multi_modal"]` to `False`.**',  # noqa: E501
         dataset_id='cais/hle',
         subset_list=SUBSET_LIST,
         metric_list=['acc'],

evalscope/benchmarks/infovqa/__init__.py ADDED Viewed

File without changes

evalscope/benchmarks/infovqa/infovqa_adapter.py ADDED Viewed

@@ -0,0 +1,66 @@
+import json
+from typing import Any, Dict, List
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator.state import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+logger = get_logger()
+PROMPT = """Answer the question according to the image using a single word or phrase.
+{question}
+The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the question."""  # noqa: E501
+@register_benchmark(
+    BenchmarkMeta(
+        name='infovqa',
+        pretty_name='InfoVQA',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'InfoVQA (Information Visual Question Answering) is a benchmark designed to evaluate how well AI models can answer questions based on information-dense images, such as charts, graphs, diagrams, maps, and infographics.',  # noqa: E501
+        dataset_id='lmms-lab/DocVQA',
+        subset_list=['InfographicVQA'],
+        metric_list=['anls'],
+        eval_split='validation',
+        prompt_template=PROMPT,
+    )
+)
+class InfoVQAAdapter(VisionLanguageAdapter):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.add_aggregation_name = False
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        input_text = PROMPT.format(question=record['question'])
+        content_list: List[Content] = [ContentText(text=input_text)]
+        image = record.get('image')
+        if image:
+            image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
+            content_list.append(ContentImage(image=image_base64))
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=json.dumps(record.get('answers')),  # answers is a list
+            metadata={
+                'questionId': record.get('questionId'),
+                'answer_type': record.get('answer_type'),
+                'image_url': record.get('image_url'),
+                'ocr': record.get('ocr'),
+            }
+        )
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        import re
+        pattern = r'ANSWER:\s*(.*)'
+        match = re.search(pattern, prediction)
+        if match:
+            return match.group(1).strip()
+        return prediction.strip()

evalscope/benchmarks/mm_bench/mm_bench_adapter.py CHANGED Viewed

@@ -35,7 +35,7 @@ class CCBenchAdapter(VisionLanguageAdapter, MultiChoiceAdapter):
     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
         answers_list: List[str] = [record.get('A', ''), record.get('B', ''), record.get('C', ''), record.get('D', '')]
-        input_text = prompt(question=record['question'], choices=answers_list, template=MULT_CHOICE_PROMPT)
+        input_text = prompt(question=record['question'], choices=answers_list, template=self.prompt_template)
         content_list: List[Content] = [ContentText(text=input_text)]
         image = record.get('image')
         if image:
@@ -77,7 +77,7 @@ class MMBenchAdapter(VisionLanguageAdapter, MultiChoiceAdapter):
         answers_list: List[str] = [record.get('A', ''), record.get('B', ''), record.get('C', ''), record.get('D', '')]
         answers_list = [ans for ans in answers_list if (ans.strip() and ans != 'nan')]
         question_hint = record['hint'] + record['question']
-        input_text = prompt(question=question_hint, choices=answers_list, template=MULT_CHOICE_PROMPT)
+        input_text = prompt(question=question_hint, choices=answers_list, template=self.prompt_template)
         content_list: List[Content] = [ContentText(text=input_text)]
         image = record.get('image')
         if image:

evalscope/benchmarks/mmmu/mmmu_adapter.py CHANGED Viewed

@@ -122,7 +122,7 @@ class MMMUAdapter(VisionLanguageAdapter):
             match = re.search(pattern, prediction)
             if match:
                 return match.group(1).strip()
-            return ''
+            return prediction.strip()
         else:
             raise ValueError(f'Unsupported question type: {question_type}')

evalscope/benchmarks/ocr_bench/__init__.py ADDED Viewed

File without changes

evalscope 1.0.2__py3-none-any.whl → 1.1.0__py3-none-any.whl

Potentially problematic release.

evalscope 1.0.2__py3-none-any.whl → 1.1.0__py3-none-any.whl