evalscope 1.0.2__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (176)
  1. evalscope/api/benchmark/__init__.py +8 -1
  2. evalscope/api/benchmark/adapters/__init__.py +1 -0
  3. evalscope/api/benchmark/adapters/default_data_adapter.py +12 -0
  4. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  5. evalscope/api/benchmark/benchmark.py +14 -0
  6. evalscope/api/dataset/dataset.py +21 -0
  7. evalscope/api/dataset/loader.py +6 -2
  8. evalscope/api/mixin/sandbox_mixin.py +32 -54
  9. evalscope/api/model/generate_config.py +6 -0
  10. evalscope/app/ui/multi_model.py +6 -1
  11. evalscope/app/ui/single_model.py +8 -2
  12. evalscope/app/utils/data_utils.py +3 -2
  13. evalscope/app/utils/visualization.py +2 -2
  14. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  15. evalscope/benchmarks/ai2d/ai2d_adapter.py +3 -2
  16. evalscope/benchmarks/bfcl/bfcl_adapter.py +11 -46
  17. evalscope/benchmarks/blink/__init__.py +0 -0
  18. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  19. evalscope/benchmarks/chartqa/__init__.py +0 -0
  20. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  21. evalscope/benchmarks/chartqa/utils.py +38 -0
  22. evalscope/benchmarks/data_collection/data_collection_adapter.py +2 -1
  23. evalscope/benchmarks/docvqa/__init__.py +0 -0
  24. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  25. evalscope/benchmarks/general_arena/general_arena_adapter.py +1 -1
  26. evalscope/benchmarks/general_arena/utils.py +2 -1
  27. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  28. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  29. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +23 -4
  30. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  31. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +158 -0
  32. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  33. evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -1
  34. evalscope/benchmarks/infovqa/__init__.py +0 -0
  35. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  36. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +3 -1
  37. evalscope/benchmarks/math_verse/__init__.py +0 -0
  38. evalscope/benchmarks/math_verse/math_verse_adapter.py +100 -0
  39. evalscope/benchmarks/math_vision/__init__.py +0 -0
  40. evalscope/benchmarks/math_vision/math_vision_adapter.py +111 -0
  41. evalscope/benchmarks/math_vista/math_vista_adapter.py +6 -26
  42. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +2 -2
  43. evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
  44. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -1
  45. evalscope/benchmarks/ner/__init__.py +0 -0
  46. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  47. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  48. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  49. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  50. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  51. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  52. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  53. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  54. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  55. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  56. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  57. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  58. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  59. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  60. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  61. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  62. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  63. evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
  64. evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
  65. evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
  66. evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
  67. evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  68. evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
  69. evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
  70. evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  71. evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  72. evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  73. evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
  74. evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
  75. evalscope/benchmarks/ocr_bench_v2/utils.py +433 -0
  76. evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
  77. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  78. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  79. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  80. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  81. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  82. evalscope/benchmarks/poly_math/__init__.py +0 -0
  83. evalscope/benchmarks/poly_math/poly_math_adapter.py +127 -0
  84. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  85. evalscope/benchmarks/pope/__init__.py +0 -0
  86. evalscope/benchmarks/pope/pope_adapter.py +111 -0
  87. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  88. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  89. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  90. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  91. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +1 -1
  92. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +1 -1
  93. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  94. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  95. evalscope/benchmarks/zerobench/__init__.py +0 -0
  96. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  97. evalscope/constants.py +4 -0
  98. evalscope/evaluator/evaluator.py +72 -79
  99. evalscope/metrics/math_parser.py +14 -0
  100. evalscope/metrics/metric.py +52 -1
  101. evalscope/metrics/metrics.py +16 -0
  102. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  103. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  104. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  105. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  106. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  107. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  108. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  109. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  110. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  111. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  112. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  113. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  114. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  115. evalscope/models/utils/openai.py +4 -0
  116. evalscope/perf/arguments.py +24 -4
  117. evalscope/perf/benchmark.py +74 -89
  118. evalscope/perf/http_client.py +31 -16
  119. evalscope/perf/main.py +15 -2
  120. evalscope/perf/plugin/api/base.py +9 -7
  121. evalscope/perf/plugin/api/custom_api.py +13 -58
  122. evalscope/perf/plugin/api/default_api.py +179 -79
  123. evalscope/perf/plugin/api/openai_api.py +4 -3
  124. evalscope/perf/plugin/datasets/base.py +21 -0
  125. evalscope/perf/plugin/datasets/custom.py +2 -3
  126. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  127. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  128. evalscope/perf/plugin/datasets/openqa.py +2 -4
  129. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  130. evalscope/perf/utils/benchmark_util.py +36 -22
  131. evalscope/perf/utils/db_util.py +14 -19
  132. evalscope/perf/utils/local_server.py +0 -44
  133. evalscope/perf/utils/log_utils.py +21 -6
  134. evalscope/report/__init__.py +11 -2
  135. evalscope/report/combinator.py +52 -2
  136. evalscope/run.py +4 -0
  137. evalscope/utils/function_utils.py +195 -12
  138. evalscope/utils/io_utils.py +74 -0
  139. evalscope/utils/json_schema.py +8 -6
  140. evalscope/utils/logger.py +49 -17
  141. evalscope/utils/multi_choices.py +16 -1
  142. evalscope/utils/ner.py +377 -0
  143. evalscope/version.py +2 -2
  144. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/METADATA +239 -393
  145. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/RECORD +140 -98
  146. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/WHEEL +1 -1
  147. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/top_level.txt +0 -1
  148. tests/__init__.py +0 -1
  149. tests/benchmark/__init__.py +0 -1
  150. tests/benchmark/test_eval.py +0 -429
  151. tests/benchmark/test_image_edit.py +0 -65
  152. tests/benchmark/test_sandbox.py +0 -81
  153. tests/benchmark/test_t2i.py +0 -142
  154. tests/benchmark/test_vlm.py +0 -137
  155. tests/cli/__init__.py +0 -1
  156. tests/cli/test_all.py +0 -269
  157. tests/cli/test_collection.py +0 -99
  158. tests/cli/test_custom.py +0 -268
  159. tests/cli/test_reasoning.py +0 -81
  160. tests/common.py +0 -73
  161. tests/perf/__init__.py +0 -1
  162. tests/perf/test_perf.py +0 -206
  163. tests/rag/test_clip_benchmark.py +0 -87
  164. tests/rag/test_mteb.py +0 -213
  165. tests/rag/test_ragas.py +0 -128
  166. tests/swift/__init__.py +0 -1
  167. tests/swift/test_run_swift_eval.py +0 -146
  168. tests/swift/test_run_swift_vlm_eval.py +0 -128
  169. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  170. tests/test_run_all.py +0 -12
  171. tests/utils.py +0 -13
  172. tests/vlm/__init__.py +0 -1
  173. tests/vlm/test_vlmeval.py +0 -102
  174. {tests/rag → evalscope/benchmarks/aa_lcr}/__init__.py +0 -0
  175. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/entry_points.txt +0 -0
  176. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/visu_logic/visu_logic_adapter.py ADDED
@@ -0,0 +1,75 @@
+ # flake8: noqa: E501
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.io_utils import bytes_to_base64
+ from evalscope.utils.logger import get_logger
+ from evalscope.utils.multi_choices import parse_answers
+
+ logger = get_logger()
+
+ MULT_CHOICE_PROMPT = """
+ Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of A, B, C, D. Think step by step before answering.
+
+ {question}
+ """
+
+ SUBSET_LIST = [
+     'Quantitative Reasoning', 'Other', 'Positional Reasoning', 'Stylistic Reasoning', 'Spatial Reasoning',
+     'Attribute Reasoning'
+ ]
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='visulogic',
+         pretty_name='VisuLogic',
+         dataset_id='evalscope/VisuLogic',
+         tags=[Tags.MATH, Tags.REASONING, Tags.MULTIPLE_CHOICE, Tags.MULTI_MODAL],
+         description=
+         'VisuLogic is a benchmark aimed at evaluating the visual reasoning capabilities of Multi-modal Large Language Models (MLLMs), independent of textual reasoning processes. It features carefully constructed visual reasoning tasks spanning multiple categories, divided into six types based on required reasoning skills (e.g., Quantitative Reasoning, which involves understanding and deducing changes in the quantity of elements in images). Unlike existing benchmarks, VisuLogic is a challenging visual reasoning benchmark that is inherently difficult to articulate using language, providing a more rigorous evaluation of the visual reasoning capabilities of MLLMs.',
+         subset_list=SUBSET_LIST,
+         metric_list=['acc'],
+         eval_split='test',
+         prompt_template=MULT_CHOICE_PROMPT,
+     )
+ )
+ class VisuLogicAdapter(VisionLanguageAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+         self.reformat_subset = True
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         question = record.get('question', '')
+         content_list: List[Content] = []
+         prompt_text = self.prompt_template.format(question=question).strip()
+         content_list.append(ContentText(text=prompt_text))
+
+         image = record.get('image')
+         if image and isinstance(image, dict):
+             image_bytes = image.get('bytes')
+             if image_bytes:
+                 image_base64 = bytes_to_base64(image_bytes, format='png', add_header=True)
+                 content_list.append(ContentImage(image=image_base64))
+
+         metadata = {
+             'id': record['id'],
+         }
+
+         return Sample(
+             input=[ChatMessageUser(content=content_list)],
+             target=record['label'],
+             choices=['A', 'B', 'C', 'D'],
+             subset_key=record['tag'],
+             metadata=metadata,
+         )
+
+     def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+         answers = parse_answers(task_state)
+         return ''.join(sorted(list(answers)))
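The adapter above registers the benchmark under the name 'visulogic'. For orientation, a minimal run sketch assuming evalscope's standard TaskConfig/run_task entry points; the model id and sample limit below are placeholders, not part of this release:

# Minimal smoke-test sketch, assuming the standard evalscope entry points.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-VL-7B-Instruct',  # placeholder; any VLM evalscope can serve or call
    datasets=['visulogic'],               # name registered by @register_benchmark above
    limit=5,                              # evaluate only a few samples
)
run_task(task_cfg=task_cfg)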
evalscope/benchmarks/zerobench/zerobench_adapter.py ADDED
@@ -0,0 +1,64 @@
+ # flake8: noqa: E501
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.io_utils import bytes_to_base64, compress_image_to_limit
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ # Define the prompt template
+ PROMPT_TEMPLATE = """{question}
+ \n\n\nLet's think step by step and give the final answer in curly braces,
+ like this: {{final answer}}"
+ """
+
+ SUBSET_LIST = ['default']
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='zerobench',
+         pretty_name='ZeroBench',
+         dataset_id='evalscope/zerobench',
+         tags=[Tags.KNOWLEDGE, Tags.QA, Tags.MULTI_MODAL],
+         description=
+         'ZeroBench is a challenging visual reasoning benchmark for Large Multimodal Models (LMMs). It consists of a main set of 100 high-quality, manually curated questions covering numerous domains, reasoning types and image type. Questions in ZeroBench have been designed and calibrated to be beyond the capabilities of current frontier models. As such, none of the evaluated models achieves a non-zero pass@1 (with greedy decoding) or 5/5 reliability score.',
+         subset_list=SUBSET_LIST,
+         metric_list=['acc'],
+         eval_split='zerobench',
+         train_split='zerobench_subquestions',
+         prompt_template=PROMPT_TEMPLATE,
+     )
+ )
+ class ZeroBenchAdapter(VisionLanguageAdapter):
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+         self._use_llm_judge = True
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         question = record['question_text']
+         content_list: List[Content] = [ContentText(text=self.prompt_template.format(question=question))]
+         image = record['question_images_decoded']
+         if len(image) > 0:
+             for img in image:
+                 # Ensure image is under OpenAI's 10MB data-URI limit by compressing if needed
+                 processed_bytes, fmt = compress_image_to_limit(img['bytes'], 10_000_000)
+                 image_base64 = bytes_to_base64(processed_bytes, format=fmt, add_header=True)
+                 content_list.append(ContentImage(image=image_base64))
+
+         metadata = {
+             'question_id': record['question_id'],
+             'question_images': record['question_images'],
+             'image_attribution': record['image_attribution']
+         }
+
+         return Sample(
+             input=[ChatMessageUser(content=content_list)], target=record['question_answer'], metadata=metadata
+         )
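This adapter relies on the new `compress_image_to_limit` helper in `evalscope/utils/io_utils.py` (+74 lines in this release, not shown in this section). From the call site it takes raw image bytes plus a byte limit and returns `(processed_bytes, format)`. A rough Pillow-based sketch of a helper with that contract; the shipped implementation may differ:

# Hedged sketch only: compatible with compress_image_to_limit(img_bytes, 10_000_000)
# as used above; not the actual evalscope implementation.
import io

from PIL import Image


def compress_image_to_limit(image_bytes: bytes, limit: int = 10_000_000):
    if len(image_bytes) <= limit:
        return image_bytes, 'png'  # assumption: keep original bytes and report a default format
    img = Image.open(io.BytesIO(image_bytes)).convert('RGB')
    # Re-encode as JPEG with decreasing quality until the payload fits the limit.
    for quality in (95, 85, 75, 60, 45, 30):
        buf = io.BytesIO()
        img.save(buf, format='JPEG', quality=quality)
        if buf.tell() <= limit:
            break
    return buf.getvalue(), 'jpeg'  # best effort if still above the limit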
evalscope/constants.py CHANGED
@@ -16,6 +16,7 @@ DEFAULT_EVALSCOPE_CACHE_DIR = os.path.expanduser(
      os.getenv('EVALSCOPE_CACHE', '~/.cache/evalscope')
  ) # ~/.cache/evalscope
  IS_BUILD_DOC = os.getenv('BUILD_DOC', '0') == '1' # To avoid some heavy dependencies when building doc
+ HEARTBEAT_INTERVAL_SEC = 60 # 60 seconds
 
 
  class HubType:
@@ -121,6 +122,7 @@ class Tags:
      CHINESE = 'Chinese'
      COMMONSENSE = 'Commonsense'
      QA = 'QA'
+     NER = 'NER'
      READING_COMPREHENSION = 'ReadingComprehension'
      CUSTOM = 'Custom'
      INSTRUCTION_FOLLOWING = 'InstructionFollowing'
@@ -133,6 +135,8 @@ class Tags:
      MULTI_MODAL = 'MultiModal'
      MULTI_LINGUAL = 'MultiLingual'
      MULTI_TURN = 'MultiTurn'
+     YES_NO = 'Yes/No'
+     HALLUCINATION = 'Hallucination'
 
 
  class FileConstants:
evalscope/evaluator/evaluator.py CHANGED
@@ -10,14 +10,14 @@ and report generation.
  import os
  import traceback
  from collections import defaultdict
- from concurrent.futures import ThreadPoolExecutor, TimeoutError, as_completed
- from tqdm import tqdm
- from typing import TYPE_CHECKING, Dict, List, Tuple, Union
+ from typing import TYPE_CHECKING, Dict, List
 
  from evalscope.api.dataset import Dataset, DatasetDict, Sample
  from evalscope.api.evaluator import CacheManager, Evaluator, TaskState
  from evalscope.api.metric import AggScore, SampleScore
+ from evalscope.constants import HEARTBEAT_INTERVAL_SEC
  from evalscope.report import Report, gen_table
+ from evalscope.utils.function_utils import run_in_threads_with_progress
  from evalscope.utils.logger import get_logger
 
  if TYPE_CHECKING:
@@ -91,22 +91,27 @@ class DefaultEvaluator(Evaluator):
              Report: The complete evaluation report containing all metrics and results.
          """
          # Load the dataset and evaluate each subset
+         logger.info(f'Start evaluating benchmark: {self.benchmark_name}')
          dataset_dict = self.benchmark.load_dataset()
          agg_score_dict = defaultdict(list)
 
          # Process each subset (e.g., test, validation) independently
+         logger.info('Evaluating all subsets of the dataset...')
          for subset, dataset in dataset_dict.items():
              if len(dataset) == 0:
                  logger.info(f'No samples found in subset: {subset}, skipping.')
                  continue
+             logger.info(f'Evaluating subset: {subset}')
              subset_score = self.evaluate_subset(subset, dataset)
              agg_score_dict[subset] = subset_score
 
          # Generate the report based on aggregated scores
+         logger.info('Generating report...')
          report = self.get_report(agg_score_dict)
 
          # Finalize the evaluation process
          self.finalize()
+         logger.info(f'Benchmark {self.benchmark_name} evaluation finished.')
          return report
 
      def evaluate_subset(self, subset: str, dataset: Dataset) -> List[AggScore]:
@@ -126,12 +131,15 @@
              List[AggScore]: Aggregated scores for this subset.
          """
          # Get model predictions for all samples in the subset
+         logger.info(f'Getting predictions for subset: {subset}')
          task_states = self.get_answers(subset, dataset)
 
          # Calculate evaluation metrics for each prediction
+         logger.info(f'Getting reviews for subset: {subset}')
          sample_scores = self.get_reviews(subset, task_states)
 
          # Aggregate individual sample scores into subset-level metrics
+         logger.info(f'Aggregating scores for subset: {subset}')
          agg_scores = self.benchmark.aggregate_scores(sample_scores=sample_scores)
          return agg_scores
 
@@ -162,44 +170,38 @@
 
          # Convert dataset to list for parallel processing
          dataset_list = list(dataset)
-
          if not dataset_list:
              return task_state_list
 
-         # Process samples in parallel using ThreadPoolExecutor
-         with ThreadPoolExecutor(max_workers=min(len(dataset_list), self.task_config.eval_batch_size)) as executor:
-             # Submit all prediction tasks
-             future_to_sample = {
-                 executor.submit(self._predict_sample, sample, model_prediction_dir): sample
-                 for sample in dataset_list
-             }
-
-             # Process completed tasks with progress bar
-             with tqdm(total=len(dataset_list), desc=f'Predicting[{self.benchmark_name}@{subset}]: ') as pbar:
-                 for future in as_completed(future_to_sample):
-                     sample = future_to_sample[future]
-                     try:
-                         task_state = future.result()
-                         task_state_list.append(task_state)
-
-                         # Save the prediction result to cache for future use
-                         model_result = self.cache_manager.save_prediction_cache(
-                             subset, task_state, self.benchmark.save_metadata
-                         )
-                         logger.debug(f'Model result: \n{model_result.pretty_print()}')
-
-                     except Exception as exc:
-                         tb_str = traceback.format_exc()
-                         logger.error(
-                             f'{sample.model_dump_json(indent=2)} prediction failed: due to {exc}\nTraceback:\n{tb_str}'
-                         )
-                         if self.task_config.ignore_errors:
-                             logger.warning('Error ignored, continuing with next sample.')
-                         else:
-                             raise exc
-                     finally:
-                         pbar.update(1)
+         logger.info(f'Processing {len(dataset_list)} samples, if data is large, it may take a while.')
+
+         def worker(sample: Sample) -> TaskState:
+             return self._predict_sample(sample, model_prediction_dir)
+
+         def on_result(sample: Sample, task_state: TaskState) -> None:
+             model_result = self.cache_manager.save_prediction_cache(subset, task_state, self.benchmark.save_metadata)
+             logger.debug(f'Model result: \n{model_result.pretty_print()}')
+
+         def on_error(sample: Sample, exc: Exception) -> None:
+             tb_str = traceback.format_exc()
+             logger.error(f'{sample.model_dump_json(indent=2)} prediction failed: due to {exc}\nTraceback:\n{tb_str}')
+             if self.task_config.ignore_errors:
+                 logger.warning('Error ignored, continuing with next sample.')
+                 return
+             raise exc
+
+         new_task_states = run_in_threads_with_progress(
+             dataset_list,
+             worker,
+             desc=f'Predicting[{self.benchmark_name}@{subset}]: ',
+             max_workers=self.task_config.eval_batch_size,
+             heartbeat_sec=HEARTBEAT_INTERVAL_SEC,
+             on_result=on_result,
+             on_error=on_error,
+         )
+         task_state_list.extend(new_task_states)
 
+         logger.info(f'Finished getting predictions for subset: {subset}.')
          return task_state_list
 
      def _predict_sample(self, sample: Sample, model_prediction_dir: str) -> TaskState:
@@ -246,49 +248,40 @@
          if not task_states:
              return sample_score_list
 
-         # Process task states in parallel using ThreadPoolExecutor
-         with ThreadPoolExecutor(max_workers=min(len(task_states), self.task_config.judge_worker_num)) as executor:
-             # Submit all review tasks
-             future_to_task_state = {
-                 executor.submit(self._review_task_state, task_state): task_state
-                 for task_state in task_states
-             }
-
-             # Process completed tasks with progress bar
-             with tqdm(total=len(task_states), desc=f'Reviewing[{self.benchmark_name}@{subset}]: ') as pbar:
-                 for future in as_completed(future_to_task_state):
-                     task_state = future_to_task_state[future]
-                     try:
-                         try:
-                             sample_score = future.result()
-                         except TimeoutError:
-                             logger.warning(
-                                 f'Timeout when reviewing sample {task_state.sample_id}, setting score to zero.'
-                             )
-                             sample_score = SampleScore(sample_id=task_state.sample_id, scores={})
-                         sample_score_list.append(sample_score)
-
-                         # Save the review result to cache for future use
-                         review_result = self.cache_manager.save_review_cache(
-                             subset=subset,
-                             task_state=task_state,
-                             sample_score=sample_score,
-                             save_metadata=self.benchmark.save_metadata
-                         )
-                         logger.debug(f'Review result: \n{review_result.pretty_print()}')
-
-                     except Exception as exc:
-                         tb_str = traceback.format_exc()
-                         logger.error(
-                             f'Error when review sample {task_state.sample_id}: due to {exc}\nTraceback:\n{tb_str}'
-                         )
-                         if self.task_config.ignore_errors:
-                             logger.warning('Error ignored, continuing with next sample.')
-                         else:
-                             raise exc
-                     finally:
-                         pbar.update(1)
+         logger.info(f'Reviewing {len(task_states)} samples, if data is large, it may take a while.')
+
+         def worker(task_state: TaskState) -> SampleScore:
+             return self._review_task_state(task_state)
+
+         def on_result(task_state: TaskState, sample_score: SampleScore) -> None:
+             review_result = self.cache_manager.save_review_cache(
+                 subset=subset,
+                 task_state=task_state,
+                 sample_score=sample_score,
+                 save_metadata=self.benchmark.save_metadata
+             )
+             logger.debug(f'Review result: \n{review_result.pretty_print()}')
+
+         def on_error(task_state: TaskState, exc: Exception) -> None:
+             tb_str = traceback.format_exc()
+             logger.error(f'Error when review sample {task_state.sample_id}: due to {exc}\nTraceback:\n{tb_str}')
+             if self.task_config.ignore_errors:
+                 logger.warning('Error ignored, continuing with next sample.')
+                 return
+             raise exc
+
+         new_scores = run_in_threads_with_progress(
+             task_states,
+             worker,
+             desc=f'Reviewing[{self.benchmark_name}@{subset}]: ',
+             max_workers=self.task_config.judge_worker_num,
+             heartbeat_sec=HEARTBEAT_INTERVAL_SEC,
+             on_result=on_result,
+             on_error=on_error,
+         )
+         sample_score_list.extend(new_scores)
 
+         logger.info(f'Finished reviewing subset: {subset}. Total reviewed: {len(sample_score_list)}')
          return sample_score_list
 
      def _review_task_state(self, task_state: TaskState) -> SampleScore:
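The refactor above moves the ThreadPoolExecutor/tqdm plumbing into a shared `run_in_threads_with_progress` helper in `evalscope/utils/function_utils.py` (+195 lines, not shown in this section). From the two call sites it accepts a list of items, a worker callable, a progress description, a worker count, a heartbeat interval, and `on_result`/`on_error` callbacks, and returns the successful results. A minimal sketch with that contract; the shipped helper presumably also emits periodic heartbeat logs:

# Hedged sketch matching the call sites above; not the actual evalscope implementation.
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Callable, List, Optional, TypeVar

from tqdm import tqdm

T = TypeVar('T')
R = TypeVar('R')


def run_in_threads_with_progress(
    items: List[T],
    worker: Callable[[T], R],
    desc: str = '',
    max_workers: int = 8,
    heartbeat_sec: int = 60,  # the real helper likely logs a heartbeat at this interval; omitted here
    on_result: Optional[Callable[[T, R], None]] = None,
    on_error: Optional[Callable[[T, Exception], None]] = None,
) -> List[R]:
    results: List[R] = []
    if not items:
        return results
    with ThreadPoolExecutor(max_workers=min(len(items), max_workers)) as executor:
        future_to_item = {executor.submit(worker, item): item for item in items}
        with tqdm(total=len(items), desc=desc) as pbar:
            for future in as_completed(future_to_item):
                item = future_to_item[future]
                try:
                    result = future.result()
                    results.append(result)
                    if on_result is not None:
                        on_result(item, result)
                except Exception as exc:
                    # on_error decides whether to swallow or re-raise the failure
                    if on_error is not None:
                        on_error(item, exc)
                    else:
                        raise
                finally:
                    pbar.update(1)
    return results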
evalscope/metrics/math_parser.py CHANGED
@@ -211,6 +211,11 @@ def strip_answer_string(string):
      # Remove grade level (e.g., 12th grade) and just maintain the integer
      string = re.sub(r'thgrade$', '', string)
 
+     # Normalize thousands-formatted numbers (e.g., 70,000 or -1,234,567.89) by removing commas
+     # This must run before the "list of integers" sorting to avoid misclassifying numbers with thousand separators.
+     if re.fullmatch(r'\s*-?\d{1,3}(?:,\d{3})+(?:\.\d+)?\s*', string):
+         string = string.replace(',', '')
+
      # If the answer is a list of integers (without parenthesis), sort them
      if re.fullmatch(r'(\s*-?\d+\s*,)*\s*-?\d+\s*', string):
          # Split the string into a list of integers
@@ -262,6 +267,8 @@ def extract_answer(pred_str, use_last_number=True):
      elif '答案是' in pred_str:
          # Handle Chinese few-shot multiple choice problem answer extraction
          pred = pred_str.split('答案是')[1].strip().split('\n\n')[0].strip()
+     elif 'ANSWER:' in pred_str:
+         pred = pred_str.split('ANSWER:')[-1].strip()
      else: # use the last number
          if use_last_number:
              pattern = '-?\d*\.?\d+'
@@ -529,3 +536,10 @@ def symbolic_equal(a, b):
          pass
 
      return False
+
+
+ if __name__ == '__main__':
+     print(math_equal('\n\\boxed{70,\\!000}\n', '70000'))
+     print(extract_answer('The answer is \\boxed{70,\\!000}'))
+     print(strip_answer_string(extract_answer('The answer is \\boxed{70,\\!000}')))
+     print(math_equal(extract_answer('The answer is \\boxed{70,\\!000}'), '70000'))
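The comma-normalization guard only fires on well-formed thousands separators, so comma-separated integer lists still reach the sorting branch right after it. A quick illustration of the pattern added above:

# Illustration of the thousands-separator check added to strip_answer_string.
import re

pattern = r'\s*-?\d{1,3}(?:,\d{3})+(?:\.\d+)?\s*'
print(bool(re.fullmatch(pattern, '70,000')))         # True  -> commas are stripped, giving '70000'
print(bool(re.fullmatch(pattern, '-1,234,567.89')))  # True  -> '-1234567.89'
print(bool(re.fullmatch(pattern, '1, 2, 3')))        # False -> handled by the list-of-integers branch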
evalscope/metrics/metric.py CHANGED
@@ -1,3 +1,4 @@
+ import json
  from collections import defaultdict
  from typing import List
 
@@ -42,7 +43,7 @@ class Accuracy(ExactMatch):
 
          results = []
          for prediction, reference in zip(predictions, references):
-             pred_answer = strip_answer_string(extract_answer(prediction))
+             pred_answer = extract_answer(prediction)
              ref_answer = strip_answer_string(reference)
              results.append(float(math_equal(pred_answer, ref_answer)))
 
@@ -100,6 +101,56 @@ class MultiChoiceAcc(Metric):
          return res
 
 
+ @register_metric(name='anls')
+ class ANLS(Metric):
+
+     def __init__(self, thresh_hold=0.5):
+         self.thresh_hold = thresh_hold
+
+     def apply(self, predictions, references):
+         """
+         Calculate ANLS (Average Normalized Levenshtein Similarity) for a list of predictions and references.
+         This implementation is adapted from
+         https://github.com/QwenLM/Qwen-VL/blob/master/eval_mm/infographicsvqa_eval.py
+
+         Args:
+             references (List[str]): List of correct answers. Each answer can be a string of json.
+             predictions (List[str]): List of predicted answers.
+         """
+         from .metrics import levenshtein_distance
+
+         res = []
+         # Unwrap predictions if it's a nested list
+         for prediction, reference in zip(predictions, references):
+             # Parse the reference which is a json string
+             try:
+                 answer = json.loads(reference)
+             except json.JSONDecodeError:
+                 answer = reference
+             if isinstance(answer, str):
+                 answer = [answer]
+             assert isinstance(answer, list), 'The reference answer should be a list of answers.'
+
+             # Calculate ANLS for each reference answer
+             values = []
+             for ans in answer:
+                 # preprocess both the answers - gt and prediction
+                 gt_answer = ' '.join(ans.strip().lower().split())
+                 det_answer = ' '.join(prediction.strip().lower().split())
+
+                 dist = levenshtein_distance(gt_answer, det_answer)
+                 length = max(len(ans.upper()), len(prediction.upper()))
+                 values.append(0.0 if length == 0 else float(dist) / float(length))
+
+             question_result = 0.0
+             if values:
+                 question_result = 1 - min(values)
+                 if question_result < self.thresh_hold:
+                     question_result = 0.0
+             res.append(question_result)
+         return res
+
+
  # ##################
  # T2I Metrics ######
  ####################
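A short usage sketch of the new ANLS metric defined above. The import path is assumed from this file's location; references may be plain strings or JSON-encoded lists of acceptable answers:

# Hedged usage sketch of the ANLS metric added above.
from evalscope.metrics.metric import ANLS  # import path assumed from this diff

anls = ANLS(thresh_hold=0.5)
scores = anls.apply(
    predictions=['The Eiffel Tower', 'Paris France'],
    references=['["eiffel tower", "la tour eiffel"]', 'paris, france'],  # JSON list or plain string
)
print(scores)  # one score per sample in [0, 1]; values below the threshold collapse to 0.0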
evalscope/metrics/metrics.py CHANGED
@@ -467,3 +467,19 @@ def calculate_pass_at_k(
      num_samples_it = iter(num_samples)
 
      return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
+
+
+ def levenshtein_distance(s1, s2):
+     if len(s1) > len(s2):
+         s1, s2 = s2, s1
+
+     distances = range(len(s1) + 1)
+     for i2, c2 in enumerate(s2):
+         distances_ = [i2 + 1]
+         for i1, c1 in enumerate(s1):
+             if c1 == c2:
+                 distances_.append(distances[i1])
+             else:
+                 distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
+         distances = distances_
+     return distances[-1]
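A quick sanity check of the helper above (standard Wagner-Fischer edit distance, used by the ANLS metric in metric.py):

# Quick check; import path assumed from this diff.
from evalscope.metrics.metrics import levenshtein_distance

assert levenshtein_distance('kitten', 'sitting') == 3  # two substitutions + one insertion
assert levenshtein_distance('', 'abc') == 3            # insertions only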
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py CHANGED
@@ -30,13 +30,9 @@ from transformers.modeling_outputs import (
      SequenceClassifierOutput,
      TokenClassifierOutput,
  )
- from transformers.modeling_utils import (
-     PreTrainedModel,
-     apply_chunking_to_forward,
-     find_pruneable_heads_and_indices,
-     prune_linear_layer,
- )
+ from transformers.modeling_utils import PreTrainedModel
  from transformers.models.bert.configuration_bert import BertConfig
+ from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
  from transformers.utils import logging
  from typing import Any, Dict, Optional, Tuple
 
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py CHANGED
@@ -14,13 +14,9 @@ from transformers.modeling_outputs import (
      BaseModelOutputWithPastAndCrossAttentions,
      BaseModelOutputWithPoolingAndCrossAttentions,
  )
- from transformers.modeling_utils import (
-     PreTrainedModel,
-     apply_chunking_to_forward,
-     find_pruneable_heads_and_indices,
-     prune_linear_layer,
- )
+ from transformers.modeling_utils import PreTrainedModel
  from transformers.models.bert.configuration_bert import BertConfig
+ from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
  from transformers.utils import logging
  from typing import Tuple
 
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py CHANGED
@@ -31,13 +31,9 @@ from transformers.modeling_outputs import (
      SequenceClassifierOutput,
      TokenClassifierOutput,
  )
- from transformers.modeling_utils import (
-     PreTrainedModel,
-     apply_chunking_to_forward,
-     find_pruneable_heads_and_indices,
-     prune_linear_layer,
- )
+ from transformers.modeling_utils import PreTrainedModel
  from transformers.models.bert.configuration_bert import BertConfig
+ from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
  from transformers.utils import logging
  from typing import Optional, Tuple
 
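The three hunks above switch `apply_chunking_to_forward`, `find_pruneable_heads_and_indices`, and `prune_linear_layer` to `transformers.pytorch_utils`, where recent transformers releases expose them after their removal from `transformers.modeling_utils`. For code that must also run on older transformers versions, a hedged compatibility pattern (not part of this diff) would be:

# Optional compatibility shim: prefer the new location, fall back for old transformers.
try:
    from transformers.pytorch_utils import (
        apply_chunking_to_forward,
        find_pruneable_heads_and_indices,
        prune_linear_layer,
    )
except ImportError:
    from transformers.modeling_utils import (  # transformers releases predating pytorch_utils
        apply_chunking_to_forward,
        find_pruneable_heads_and_indices,
        prune_linear_layer,
    )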
evalscope/models/utils/openai.py CHANGED
@@ -204,6 +204,10 @@ def openai_completion_params(model: str, config: GenerateConfig, tools: bool) ->
      )
      if config.extra_body:
          params['extra_body'] = config.extra_body
+     if config.extra_query:
+         params['extra_query'] = config.extra_query
+     if config.extra_headers:
+         params['extra_headers'] = config.extra_headers
 
      return params
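The two new fields mirror `extra_body`: whatever is set on `GenerateConfig` is forwarded in the request parameters, and the openai-python client accepts `extra_query`/`extra_headers` per request. A hedged sketch of how the params might be built and consumed; import paths, the `GenerateConfig` construction, and the endpoint are assumptions:

# Hedged sketch: forwarding extra_query / extra_headers to an OpenAI-compatible endpoint.
from openai import OpenAI

from evalscope.api.model.generate_config import GenerateConfig  # path assumed from this diff
from evalscope.models.utils.openai import openai_completion_params

config = GenerateConfig(
    extra_headers={'X-Request-Id': 'demo'},     # forwarded as HTTP headers
    extra_query={'api-version': '2024-06-01'},  # forwarded as URL query parameters
)
params = openai_completion_params(model='my-model', config=config, tools=False)

client = OpenAI(base_url='http://localhost:8000/v1', api_key='EMPTY')  # placeholder endpoint
response = client.chat.completions.create(
    messages=[{'role': 'user', 'content': 'Hello'}],
    **params,  # carries the generation params plus any extra_body / extra_query / extra_headers set above
)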