evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +9 -1
- evalscope/api/benchmark/adapters/__init__.py +4 -0
- evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
- evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
- evalscope/api/benchmark/benchmark.py +85 -2
- evalscope/api/benchmark/meta.py +10 -1
- evalscope/api/dataset/dataset.py +27 -6
- evalscope/api/dataset/loader.py +8 -3
- evalscope/api/evaluator/cache.py +31 -4
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +17 -1
- evalscope/api/messages/__init__.py +1 -0
- evalscope/api/messages/chat_message.py +52 -2
- evalscope/api/metric/__init__.py +1 -1
- evalscope/api/metric/metric.py +6 -1
- evalscope/api/metric/scorer.py +15 -7
- evalscope/api/mixin/__init__.py +1 -1
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +182 -0
- evalscope/api/model/generate_config.py +10 -6
- evalscope/api/model/model.py +5 -2
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/app.py +3 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +11 -5
- evalscope/app/utils/data_utils.py +8 -7
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -12
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +8 -4
- evalscope/backend/opencompass/backend_manager.py +0 -2
- evalscope/backend/rag_eval/utils/embedding.py +9 -1
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/aime/aime24_adapter.py +5 -0
- evalscope/benchmarks/aime/aime25_adapter.py +136 -1
- evalscope/benchmarks/aime/grader.py +307 -0
- evalscope/benchmarks/aime/math_normalize.py +189 -0
- evalscope/benchmarks/amc/amc_adapter.py +51 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
- evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
- evalscope/benchmarks/bfcl/v3/utils.py +23 -0
- evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
- evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
- evalscope/benchmarks/bfcl/v4/utils.py +410 -0
- evalscope/benchmarks/biomix_qa/__init__.py +0 -0
- evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/coin_flip/__init__.py +0 -0
- evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
- evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
- evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drivelology/__init__.py +0 -0
- evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
- evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
- evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
- evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
- evalscope/benchmarks/drop/drop_adapter.py +15 -44
- evalscope/benchmarks/drop/utils.py +97 -0
- evalscope/benchmarks/frames/frames_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
- evalscope/benchmarks/halu_eval/__init__.py +0 -0
- evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
- evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/image_edit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/logi_qa/__int__.py +0 -0
- evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
- evalscope/benchmarks/math_qa/__init__.py +0 -0
- evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
- evalscope/benchmarks/med_mcqa/__init__.py +0 -0
- evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
- evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
- evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/music_trivia/__init__.py +0 -0
- evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/piqa/__init__.py +0 -0
- evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +112 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
- evalscope/benchmarks/pumed_qa/__init__.py +0 -0
- evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
- evalscope/benchmarks/qasc/__init__.py +0 -0
- evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/sciq/__init__.py +0 -0
- evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/siqa/__init__.py +0 -0
- evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
- evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
- evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
- evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
- evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
- evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
- evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/wmt/__init__.py +0 -0
- evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/config.py +103 -18
- evalscope/constants.py +18 -0
- evalscope/evaluator/evaluator.py +138 -82
- evalscope/metrics/bert_score/__init__.py +0 -0
- evalscope/metrics/bert_score/scorer.py +338 -0
- evalscope/metrics/bert_score/utils.py +697 -0
- evalscope/metrics/llm_judge.py +19 -7
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +317 -13
- evalscope/metrics/metrics.py +37 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/model_apis.py +22 -0
- evalscope/models/openai_compatible.py +21 -0
- evalscope/models/text2image_model.py +2 -2
- evalscope/models/utils/openai.py +16 -6
- evalscope/perf/arguments.py +26 -4
- evalscope/perf/benchmark.py +76 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +188 -79
- evalscope/perf/plugin/api/openai_api.py +85 -20
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +43 -27
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +3 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +13 -3
- evalscope/report/combinator.py +91 -20
- evalscope/report/generator.py +8 -87
- evalscope/report/report.py +8 -4
- evalscope/run.py +13 -5
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/argument_utils.py +1 -1
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/function_utils.py +249 -12
- evalscope/utils/import_utils.py +73 -1
- evalscope/utils/io_utils.py +132 -7
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +69 -18
- evalscope/utils/model_utils.py +4 -3
- evalscope/utils/multi_choices.py +39 -7
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
- evalscope/api/mixin/dataset_mixin.py +0 -105
- evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
- tests/__init__.py +0 -1
- tests/aigc/__init__.py +0 -1
- tests/aigc/test_t2i.py +0 -142
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -386
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -229
- tests/cli/test_collection.py +0 -96
- tests/cli/test_custom.py +0 -268
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -176
- tests/rag/test_clip_benchmark.py +0 -90
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- /evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
- {tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0
evalscope/metrics/llm_judge.py
CHANGED

@@ -2,6 +2,7 @@ import os
 import re
 from typing import Any, Dict, List, Optional

+from evalscope.api.messages import ChatMessage, ChatMessageSystem, ChatMessageUser
 from evalscope.constants import JudgeScoreType
 from evalscope.utils.logger import get_logger

@@ -109,20 +110,31 @@ class LLMJudge:
             config=GenerateConfig(**self.generation_config),
         )

-    def judge(
+    def judge(
+        self,
+        prompt: str = '',
+        system_prompt: Optional[str] = None,
+        messages: Optional[List[ChatMessage]] = None
+    ) -> str:
         """
+        Generate a response from the LLM based on the provided prompt and context.
+        If messages is provided, it will be used as the input context.
+
         Args:
             prompt (str): The prompt to evaluate
             system_prompt (str, optional): The system prompt to use for the evaluation
+            messages (List[ChatMessage], optional): A list of chat messages to include in the evaluation
         Returns:
             str: The response from the LLM
         """
-
-
-
-
-
-        input_messages
+        # parse messages
+        if messages is not None:
+            input_messages = messages
+        else:
+            system_content = system_prompt or self.system_prompt
+            input_messages = [ChatMessageUser(content=prompt)]
+            if system_content:
+                input_messages.insert(0, ChatMessageSystem(content=system_content))
         try:
             # Send request using ServerModelAdapter
             response = self.model.generate(input_messages)
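For orientation, a minimal usage sketch of the new messages code path in LLMJudge.judge. The judge(...) signature and the ChatMessageSystem/ChatMessageUser constructors come from the hunk above; the argument-free LLMJudge() construction is an assumption made only for illustration.

from evalscope.api.messages import ChatMessageSystem, ChatMessageUser
from evalscope.metrics.llm_judge import LLMJudge

judge = LLMJudge()  # assumed: default judge configuration

# 1.0.x style: plain prompt plus optional system prompt (still supported)
verdict = judge.judge(
    prompt='Is 2 + 2 = 4? Answer YES or NO.',
    system_prompt='You are a strict grader.',
)

# 1.2.0 style: pass a prepared message list; prompt/system_prompt are bypassed
messages = [
    ChatMessageSystem(content='You are a strict grader.'),
    ChatMessageUser(content='Is 2 + 2 = 4? Answer YES or NO.'),
]
verdict = judge.judge(messages=messages)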
evalscope/metrics/math_parser.py
CHANGED

@@ -211,6 +211,11 @@ def strip_answer_string(string):
     # Remove grade level (e.g., 12th grade) and just maintain the integer
     string = re.sub(r'thgrade$', '', string)

+    # Normalize thousands-formatted numbers (e.g., 70,000 or -1,234,567.89) by removing commas
+    # This must run before the "list of integers" sorting to avoid misclassifying numbers with thousand separators.
+    if re.fullmatch(r'\s*-?\d{1,3}(?:,\d{3})+(?:\.\d+)?\s*', string):
+        string = string.replace(',', '')
+
     # If the answer is a list of integers (without parenthesis), sort them
     if re.fullmatch(r'(\s*-?\d+\s*,)*\s*-?\d+\s*', string):
         # Split the string into a list of integers

@@ -262,6 +267,8 @@ def extract_answer(pred_str, use_last_number=True):
     elif '答案是' in pred_str:
         # Handle Chinese few-shot multiple choice problem answer extraction
         pred = pred_str.split('答案是')[1].strip().split('\n\n')[0].strip()
+    elif 'ANSWER:' in pred_str:
+        pred = pred_str.split('ANSWER:')[-1].strip()
     else:  # use the last number
         if use_last_number:
             pattern = '-?\d*\.?\d+'

@@ -529,3 +536,10 @@ def symbolic_equal(a, b):
         pass

     return False
+
+
+if __name__ == '__main__':
+    print(math_equal('\n\\boxed{70,\\!000}\n', '70000'))
+    print(extract_answer('The answer is \\boxed{70,\\!000}'))
+    print(strip_answer_string(extract_answer('The answer is \\boxed{70,\\!000}')))
+    print(math_equal(extract_answer('The answer is \\boxed{70,\\!000}'), '70000'))
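The two behavioural additions above are easy to try in isolation. A self-contained sketch follows; the helper names are illustrative, not evalscope APIs, and only the regex and the 'ANSWER:' split mirror the diff.

import re

def strip_thousands_separators(answer: str) -> str:
    # Mirrors the new check in strip_answer_string: strip commas only when the
    # whole string is a thousands-formatted number such as '70,000' or '-1,234.5'
    if re.fullmatch(r'\s*-?\d{1,3}(?:,\d{3})+(?:\.\d+)?\s*', answer):
        return answer.replace(',', '')
    return answer

def extract_after_answer_tag(pred: str) -> str:
    # Mirrors the new 'ANSWER:' branch in extract_answer: keep everything after the last tag
    return pred.split('ANSWER:')[-1].strip() if 'ANSWER:' in pred else pred

print(strip_thousands_separators('70,000'))                       # '70000'
print(strip_thousands_separators('1, 2, 3'))                      # unchanged: still treated as a list of integers
print(extract_after_answer_tag('Reasoning...\nANSWER: 70,000'))   # '70,000'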
evalscope/metrics/metric.py
CHANGED

@@ -1,16 +1,27 @@
+import json
+import numpy as np
+import os
 from collections import defaultdict
-from typing import List
+from typing import Dict, List

-from evalscope.api.metric import Aggregator, AggScore, Metric, SampleScore, T2IMetric
+from evalscope.api.metric import Aggregator, AggScore, Metric, SampleScore, SingletonMetric, T2IMetric
 from evalscope.api.registry import register_aggregation, register_metric
-from .
+from evalscope.utils.import_utils import check_import
+from .metrics import calculate_pass_at_k, calculate_pass_hat_k, mean, normalize_text
+
+# ##################
+# NLP Metrics ######
+# ##################


 @register_metric(name='exact_match')
 class ExactMatch(Metric):

     def apply(self, predictions, references):
-        return [
+        return [
+            float(normalize_text(prediction) == normalize_text(reference))
+            for prediction, reference in zip(predictions, references)
+        ]


 @register_metric(name='acc')

@@ -30,13 +41,12 @@ class Accuracy(ExactMatch):
                 results.append(0.0)
             return results
         elif self.numeric:
-            from .math_parser import
+            from .math_parser import math_equal, strip_answer_string

            results = []
            for prediction, reference in zip(predictions, references):
-                pred_answer = strip_answer_string(extract_answer(prediction))
                 ref_answer = strip_answer_string(reference)
-                results.append(float(math_equal(
+                results.append(float(math_equal(prediction, ref_answer)))

             return results
         else:

@@ -92,9 +102,114 @@ class MultiChoiceAcc(Metric):
         return res


+@register_metric(name='anls')
+class ANLS(Metric):
+
+    def __init__(self, thresh_hold=0.5):
+        self.thresh_hold = thresh_hold
+
+    def apply(self, predictions, references):
+        """
+        Calculate ANLS (Average Normalized Levenshtein Similarity) for a list of predictions and references.
+        This implementation is adapted from
+        https://github.com/QwenLM/Qwen-VL/blob/master/eval_mm/infographicsvqa_eval.py
+
+        Args:
+            references (List[str]): List of correct answers. Each answer can be a string of json.
+            predictions (List[str]): List of predicted answers.
+        """
+        from .metrics import levenshtein_distance
+
+        res = []
+        # Unwrap predictions if it's a nested list
+        for prediction, reference in zip(predictions, references):
+            # Parse the reference which is a json string
+            try:
+                answer = json.loads(reference)
+            except json.JSONDecodeError:
+                answer = reference
+            if isinstance(answer, str):
+                answer = [answer]
+            assert isinstance(answer, list), 'The reference answer should be a list of answers.'
+
+            # Calculate ANLS for each reference answer
+            values = []
+            for ans in answer:
+                # preprocess both the answers - gt and prediction
+                gt_answer = ' '.join(ans.strip().lower().split())
+                det_answer = ' '.join(prediction.strip().lower().split())
+
+                dist = levenshtein_distance(gt_answer, det_answer)
+                length = max(len(ans.upper()), len(prediction.upper()))
+                values.append(0.0 if length == 0 else float(dist) / float(length))
+
+            question_result = 0.0
+            if values:
+                question_result = 1 - min(values)
+            if question_result < self.thresh_hold:
+                question_result = 0.0
+            res.append(question_result)
+        return res
+
+
+@register_metric(name='bertscore')
+class BertScore(SingletonMetric):
+
+    def _init_once(self, model_id_or_path: str = 'google-bert/bert-base-chinese', **kwargs):
+        """BertScore metric.
+
+        Args:
+            model_id_or_path (str, optional): The model ID on modelscope or path to the pre-trained model.
+                Defaults to 'google-bert/bert-base-chinese'.
+        """
+        check_import('torch', 'torch', raise_error=True, feature_name='BertScore Metric')
+
+        from .bert_score.scorer import BERTScorer
+        self.scorer = BERTScorer(model_id_or_path=model_id_or_path, batch_size=1024, **kwargs)
+
+    def apply(self, predictions: List[str], references: List[str]) -> List[float]:
+        _, _, F1 = self.scorer.score(predictions, references)
+        return [round(f1.item(), 6) for f1 in F1]
+
+
+@register_metric(name='comet')
+class COMETScore(SingletonMetric):
+
+    def _init_once(self, model_id_or_path: str = 'evalscope/wmt22-comet-da'):
+        """COMETScore metric.
+
+        Args:
+            model_name (str, optional): The model name on huggingface.
+                Defaults to 'evalscope/wmt22-comet-da'.
+        """
+        check_import('comet', 'unbabel-comet', raise_error=True, feature_name='COMETScore Metric')
+
+        from comet import load_from_checkpoint
+        from modelscope import snapshot_download
+
+        self.model_name = model_id_or_path
+        model_path = snapshot_download(model_id_or_path)
+        checkpoint_path = os.path.join(model_path, 'checkpoints', 'model.ckpt')
+        self.comet_scorer = load_from_checkpoint(checkpoint_path)
+
+    def apply(self, samples: List[Dict[str, str]]) -> List[float]:
+        """Apply COMET scoring."""
+        import torch
+
+        model_output = self.comet_scorer.predict(
+            samples=samples,
+            batch_size=1024,
+            gpus=1 if torch.cuda.is_available() else 0,
+            progress_bar=False,
+        )
+        scores = model_output.scores if hasattr(model_output, 'scores') else [model_output.system_score] * len(samples)
+
+        return [round(score, 6) for score in scores]
+
+
 # ##################
 # T2I Metrics ######
-
+# ##################
 @register_metric(name='VQAScore')
 class VQAScore(T2IMetric):

@@ -202,6 +317,9 @@ class Mean(Aggregator):

     name = 'mean'

+    def agg_func(self, values: List[float]) -> float:
+        return mean(values)
+
     def __call__(self, scores: List[SampleScore]) -> List[AggScore]:
         """Aggregate scores by computing the mean for each metric.

@@ -230,7 +348,7 @@ class Mean(Aggregator):
             if values:  # Only process non-empty value lists
                 aggregated_scores.append(
                     AggScore(
-                        score=
+                        score=self.agg_func(values),
                         metric_name=metric_name,
                         aggregation_name=self.name,
                         num=len(values),

@@ -241,6 +359,20 @@ class Mean(Aggregator):
         return aggregated_scores


+@register_aggregation(name='clipped_mean')
+class ClippedMean(Mean):
+
+    name = 'clipped_mean'
+
+    def __init__(self, clip_min: float = 0.0, clip_max: float = 1.0):
+        self.clip_min = clip_min
+        self.clip_max = clip_max
+
+    def agg_func(self, values: List[float]) -> float:
+        clipped_values = min(max(mean(values), self.clip_min), self.clip_max)
+        return clipped_values
+
+
 @register_aggregation(name='pass_at_k')
 class PassAtK(Aggregator):

@@ -260,10 +392,6 @@ class PassAtK(Aggregator):
         if not scores:
             return []

-        import numpy as np
-
-        from .metrics import calculate_pass_at_k
-
         # Group scores by metric name and group_id
         metric_groups = defaultdict(lambda: defaultdict(list))

@@ -305,3 +433,179 @@ class PassAtK(Aggregator):
                 )

         return aggregated_scores
+
+
+@register_aggregation(name='mean_and_pass_at_k')
+class MeanPassAtK(Aggregator):
+
+    def __init__(self):
+        self.name = 'mean_and_pass_at_k'
+
+    def __call__(self, scores: List[SampleScore]) -> List[AggScore]:
+        """Add per-metric pass@k (computed via calculate_pass_at_k) to each sample, then mean-aggregate.
+
+        For each metric:
+        - Group scores by group_id
+        - Collect binary correctness values
+        - Infer k as (total samples / number of groups) assuming uniform repetitions
+        - Compute per-group pass@k via calculate_pass_at_k
+        - Annotate each sample with metric_pass@k for its group
+        Finally run Mean() over the augmented metric set.
+        """
+        if not scores:
+            return []
+
+        # Extract metric names present in score values
+        metrics = list(scores[0].score.value.keys())
+
+        for metric_name in metrics:
+            # group_id -> list[float] (0/1 correctness values)
+            group_values: Dict[str, List[float]] = defaultdict(list)
+            for s in scores:
+                group_id = getattr(s, 'group_id', s.sample_id)
+                value = float(s.score.value[metric_name])
+                group_values[group_id].append(value)
+
+            if not group_values:
+                continue
+
+            # Infer k (assumes roughly uniform repeats)
+            k = int(len(scores) / len(group_values)) if len(group_values) > 0 else 1
+            if k <= 0:
+                k = 1
+
+            # Prepare inputs for calculate_pass_at_k
+            num_samples: List[int] = []
+            num_correct: List[int] = []
+            group_order: List[str] = []
+            for gid, vals in group_values.items():
+                group_order.append(gid)
+                num_samples.append(len(vals))
+                num_correct.append(int(sum(vals)))
+
+            # Compute per-group pass@k
+            pass_at_k_list = calculate_pass_at_k(num_samples, num_correct, k)
+            # Map back: group_id -> pass@k value
+            pass_at_k_map = {gid: float(v) for gid, v in zip(group_order, pass_at_k_list)}
+
+            # Annotate each sample with its group's pass@k
+            for s in scores:
+                group_id = getattr(s, 'group_id', s.sample_id)
+                s.score.value[f'{metric_name}_pass@{k}'] = pass_at_k_map[group_id]
+
+        # Delegate mean aggregation over original + injected pass@k metrics
+        m = Mean()
+        return m(scores)
+
+
+@register_aggregation(name='mean_and_vote_at_k')
+class MeanVoteAtK(Aggregator):
+
+    def __init__(self):
+
+        self.name = 'mean_and_vote_at_k'
+
+    def __call__(self, scores: List[SampleScore]) -> List[AggScore]:
+        """Aggregate scores by computing the vote@k for each metric using group_id.
+
+        Args:
+            scores: List of sample scores to aggregate
+
+        Returns:
+            List of aggregated scores with vote@k values
+        """
+        if not scores:
+            return []
+
+        metrics = list(scores[0].score.value.keys())
+
+        # Calculate vote@k for all metrics
+        for metric_name in metrics:
+
+            # Count of occurrences for each answer in each group_id
+            answer_groups = defaultdict(lambda: defaultdict(int))
+            # Score for each answer in each group_id
+            scores_groups = defaultdict(lambda: defaultdict(float))
+            # Score of the most frequently occurring answer
+            final_scores_groups = defaultdict(float)
+            # Count different answers for this metric
+            for score in scores:
+                group_id = getattr(score, 'group_id', score.sample_id)  # fallback to sample_id if no group_id
+                answer_prediction = getattr(score.score, 'extracted_prediction', None)
+                answer_groups[group_id][answer_prediction] += 1
+                scores_groups[group_id][answer_prediction] = score.score.value[metric_name]
+            # Calculate the repetition count k for each problem
+            k = int(len(scores) / len(answer_groups))
+
+            # Use the score of the most frequently occurring answer as the group's score
+            for group_id in answer_groups:
+                final_scores_groups[group_id] = scores_groups[group_id][
+                    max(answer_groups[group_id], key=answer_groups[group_id].get)]
+
+            # Add the corresponding vote@k for the metric to each score's value
+            for score in scores:
+                group_id = getattr(score, 'group_id', score.sample_id)
+                score.score.value.update({f'{metric_name}_vote@{k}': final_scores_groups[group_id]})
+
+        # Calculate the mean value for all metrics and their corresponding vote@k
+        m = Mean()
+        return m(scores)
+
+
+@register_aggregation(name='mean_and_pass_hat_k')
+class MeanPassHatK(Aggregator):
+
+    def __init__(self):
+        self.name = 'mean_and_pass_hat_k'
+
+    def __call__(self, scores: List[SampleScore]) -> List[AggScore]:
+        """Add per-metric pass^k using calculate_pass_hat_k, then mean-aggregate.
+
+        For each metric:
+        - Group scores by group_id
+        - Collect binary correctness values
+        - Infer k as approximate repeats and clamp to min attempts across groups
+        - Compute per-group pass^k via calculate_pass_hat_k
+        - Annotate each sample with metric_pass^{k} for its group
+        Finally run Mean() over the augmented metric set.
+        """
+        if not scores:
+            return []
+
+        # Freeze metric names before augmenting values to avoid iterating injected keys
+        metrics = list(scores[0].score.value.keys())
+
+        for metric_name in metrics:
+            # group_id -> list[float] (0/1 correctness values)
+            group_values: Dict[str, List[float]] = defaultdict(list)
+            for s in scores:
+                group_id = getattr(s, 'group_id', s.sample_id)
+                value = float(s.score.value[metric_name])
+                group_values[group_id].append(value)
+
+            if not group_values:
+                continue
+
+            # Infer repeats and clamp to the smallest group size to satisfy k <= n
+            approx_k = int(len(scores) / len(group_values)) if len(group_values) > 0 else 1
+            min_n = min(len(vals) for vals in group_values.values())
+            k = max(1, min(approx_k, min_n))
+
+            # Compute per-group pass^k
+            pass_hat_k_map: Dict[str, float] = {}
+            for gid, vals in group_values.items():
+                n = len(vals)
+                c = int(sum(vals))
+                # calculate_pass_hat_k requires k <= n; ensured by clamping above
+                pass_hat_k_map[gid] = float(calculate_pass_hat_k(n, c, k))
+
+            # Annotate each sample with its group's pass^k
+            suffix = f'pass^{k}'
+            injected_key = f'{metric_name}_{suffix}'
+            for s in scores:
+                group_id = getattr(s, 'group_id', s.sample_id)
+                s.score.value[injected_key] = pass_hat_k_map[group_id]
+
+        # Mean aggregate over original + injected pass^k metrics
+        m = Mean()
+        return m(scores)
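To make the new pass@k/pass^k aggregators concrete, here is a standalone computation of the two estimators they rely on. These are plain re-implementations for illustration, not the evalscope functions (those live in metrics.py below), and the trial counts are made up.

import math

def pass_hat_k(num_trials: int, success_count: int, k: int) -> float:
    # pass^k (https://arxiv.org/pdf/2406.12045): probability that k attempts drawn
    # without replacement from the recorded trials are all correct
    if num_trials < k:
        raise ValueError(f'Number of trials {num_trials} is less than k {k}.')
    return math.comb(success_count, k) / math.comb(num_trials, k)

def pass_at_k(n: int, c: int, k: int) -> float:
    # Standard pass@k estimator: probability that at least one of k drawn attempts is correct
    if n - c < k:
        return 1.0
    prod = 1.0
    for i in range(n - c + 1, n + 1):
        prod *= 1.0 - k / i
    return 1.0 - prod

# 8 attempts per sample, 3 of them correct, k = 2
print(round(pass_at_k(8, 3, 2), 3))    # 0.643: at least one of two drawn attempts is correct
print(round(pass_hat_k(8, 3, 2), 3))   # 0.107: both drawn attempts are correct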
evalscope/metrics/metrics.py
CHANGED

@@ -12,6 +12,11 @@ from collections.abc import Iterable
 from typing import Dict, List, Union


+def normalize_text(text: str) -> str:
+    """Normalize text by lowering case and stripping whitespace."""
+    return text.strip().lower()
+
+
 def mean(arr: list):
     if not arr:
         return 0.0

@@ -467,3 +472,35 @@ def calculate_pass_at_k(
     num_samples_it = iter(num_samples)

     return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
+
+
+def calculate_pass_hat_k(num_trials: int, success_count: int, k: int) -> float:
+    """
+    Compute the pass^k metric for the given number of trials, success count, and k.
+    from https://arxiv.org/pdf/2406.12045
+    Args:
+        num_trials: The number of trials.
+        success_count: The number of successful trials.
+        k: The number of trials to consider.
+    Returns:
+        The pass^k metric.
+    """
+    if num_trials < k:
+        raise ValueError(f'Number of trials {num_trials} is less than k {k}.')
+    return math.comb(success_count, k) / math.comb(num_trials, k)
+
+
+def levenshtein_distance(s1, s2):
+    if len(s1) > len(s2):
+        s1, s2 = s2, s1
+
+    distances = range(len(s1) + 1)
+    for i2, c2 in enumerate(s2):
+        distances_ = [i2 + 1]
+        for i1, c1 in enumerate(s1):
+            if c1 == c2:
+                distances_.append(distances[i1])
+            else:
+                distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
+        distances = distances_
+    return distances[-1]
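As a quick sanity check on the new levenshtein_distance helper, here is a small ANLS-style scorer built on the same dynamic-programming routine. anls_single is an illustrative helper, not evalscope code; the registered ANLS metric in metric.py above is the real implementation.

def levenshtein_distance(s1, s2):
    # Same row-by-row dynamic programming as the helper added above
    if len(s1) > len(s2):
        s1, s2 = s2, s1
    distances = range(len(s1) + 1)
    for i2, c2 in enumerate(s2):
        distances_ = [i2 + 1]
        for i1, c1 in enumerate(s1):
            if c1 == c2:
                distances_.append(distances[i1])
            else:
                distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
        distances = distances_
    return distances[-1]

def anls_single(prediction: str, answers: list, threshold: float = 0.5) -> float:
    # 1 - normalized edit distance against the closest reference, zeroed below the threshold
    values = []
    for ans in answers:
        gt = ' '.join(ans.strip().lower().split())
        det = ' '.join(prediction.strip().lower().split())
        length = max(len(ans), len(prediction))
        values.append(0.0 if length == 0 else levenshtein_distance(gt, det) / length)
    score = 1 - min(values) if values else 0.0
    return score if score >= threshold else 0.0

print(anls_single('Eiffel Tower', ['eiffel tower']))   # 1.0
print(anls_single('Eifel Tower', ['eiffel tower']))    # ~0.92 (one missing character)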
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py
CHANGED

@@ -30,13 +30,9 @@ from transformers.modeling_outputs import (
     SequenceClassifierOutput,
     TokenClassifierOutput,
 )
-from transformers.modeling_utils import (
-    PreTrainedModel,
-    apply_chunking_to_forward,
-    find_pruneable_heads_and_indices,
-    prune_linear_layer,
-)
+from transformers.modeling_utils import PreTrainedModel
 from transformers.models.bert.configuration_bert import BertConfig
+from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
 from transformers.utils import logging
 from typing import Any, Dict, Optional, Tuple
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py
CHANGED

@@ -14,13 +14,9 @@ from transformers.modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     BaseModelOutputWithPoolingAndCrossAttentions,
 )
-from transformers.modeling_utils import (
-    PreTrainedModel,
-    apply_chunking_to_forward,
-    find_pruneable_heads_and_indices,
-    prune_linear_layer,
-)
+from transformers.modeling_utils import PreTrainedModel
 from transformers.models.bert.configuration_bert import BertConfig
+from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
 from transformers.utils import logging
 from typing import Tuple
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py
CHANGED

@@ -31,13 +31,9 @@ from transformers.modeling_outputs import (
     SequenceClassifierOutput,
     TokenClassifierOutput,
 )
-from transformers.modeling_utils import (
-    PreTrainedModel,
-    apply_chunking_to_forward,
-    find_pruneable_heads_and_indices,
-    prune_linear_layer,
-)
+from transformers.modeling_utils import PreTrainedModel
 from transformers.models.bert.configuration_bert import BertConfig
+from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
 from transformers.utils import logging
 from typing import Optional, Tuple
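The three lavis changes above are the same mechanical fix: apply_chunking_to_forward, find_pruneable_heads_and_indices and prune_linear_layer are no longer exported from transformers.modeling_utils in newer transformers releases, so the vendored BERT code now imports them from transformers.pytorch_utils. For code that must run against both old and new transformers versions, a defensive import is one option; this is only a sketch, not what evalscope does (it simply switched to the new location).

try:
    # newer transformers: the helpers live in pytorch_utils
    from transformers.pytorch_utils import (
        apply_chunking_to_forward,
        find_pruneable_heads_and_indices,
        prune_linear_layer,
    )
except ImportError:
    # older transformers still exported them from modeling_utils
    from transformers.modeling_utils import (
        apply_chunking_to_forward,
        find_pruneable_heads_and_indices,
        prune_linear_layer,
    )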