evalscope 0.17.1__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (273)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/__init__.py +0 -0
  3. evalscope/api/benchmark/__init__.py +3 -0
  4. evalscope/api/benchmark/adapters/__init__.py +3 -0
  5. evalscope/api/benchmark/adapters/default_data_adapter.py +683 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +155 -0
  8. evalscope/api/benchmark/benchmark.py +321 -0
  9. evalscope/api/benchmark/meta.py +115 -0
  10. evalscope/api/dataset/__init__.py +2 -0
  11. evalscope/api/dataset/dataset.py +349 -0
  12. evalscope/api/dataset/loader.py +261 -0
  13. evalscope/api/dataset/utils.py +143 -0
  14. evalscope/api/evaluator/__init__.py +3 -0
  15. evalscope/api/evaluator/cache.py +355 -0
  16. evalscope/api/evaluator/evaluator.py +56 -0
  17. evalscope/api/evaluator/state.py +264 -0
  18. evalscope/api/filter/__init__.py +1 -0
  19. evalscope/api/filter/filter.py +72 -0
  20. evalscope/api/messages/__init__.py +11 -0
  21. evalscope/api/messages/chat_message.py +198 -0
  22. evalscope/api/messages/content.py +102 -0
  23. evalscope/api/messages/utils.py +35 -0
  24. evalscope/api/metric/__init__.py +2 -0
  25. evalscope/api/metric/metric.py +55 -0
  26. evalscope/api/metric/scorer.py +105 -0
  27. evalscope/api/mixin/__init__.py +2 -0
  28. evalscope/api/mixin/dataset_mixin.py +105 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +157 -0
  32. evalscope/api/model/model.py +383 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/ui/app_ui.py +2 -1
  40. evalscope/app/ui/multi_model.py +50 -25
  41. evalscope/app/ui/single_model.py +23 -11
  42. evalscope/app/utils/data_utils.py +42 -26
  43. evalscope/app/utils/text_utils.py +0 -2
  44. evalscope/app/utils/visualization.py +9 -4
  45. evalscope/arguments.py +6 -7
  46. evalscope/backend/opencompass/api_meta_template.py +2 -1
  47. evalscope/backend/opencompass/backend_manager.py +6 -3
  48. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  49. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  50. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  51. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  52. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  53. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  55. evalscope/backend/rag_eval/utils/embedding.py +2 -1
  56. evalscope/backend/rag_eval/utils/llm.py +13 -12
  57. evalscope/benchmarks/__init__.py +0 -2
  58. evalscope/benchmarks/aigc/i2i/__init__.py +0 -0
  59. evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +44 -0
  60. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +53 -55
  61. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +41 -46
  62. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +29 -45
  63. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +34 -44
  64. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +16 -27
  65. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  66. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  67. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  68. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  69. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  70. evalscope/benchmarks/arena_hard/utils.py +37 -1
  71. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  72. evalscope/benchmarks/bfcl/bfcl_adapter.py +181 -160
  73. evalscope/benchmarks/bfcl/generation.py +222 -0
  74. evalscope/benchmarks/ceval/ceval_adapter.py +94 -162
  75. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  76. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  77. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  78. evalscope/benchmarks/data_collection/data_collection_adapter.py +183 -45
  79. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  80. evalscope/benchmarks/docmath/utils.py +4 -5
  81. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  82. evalscope/benchmarks/frames/frames_adapter.py +135 -52
  83. evalscope/benchmarks/general_arena/general_arena_adapter.py +136 -98
  84. evalscope/benchmarks/general_arena/utils.py +23 -27
  85. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  86. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  87. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  88. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  89. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  90. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  91. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  92. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  93. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  94. evalscope/benchmarks/ifeval/instructions.py +109 -64
  95. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  96. evalscope/benchmarks/ifeval/utils.py +6 -7
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  105. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  106. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  107. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  108. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +192 -152
  109. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  110. evalscope/benchmarks/race/race_adapter.py +33 -119
  111. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  112. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  113. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  114. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  115. evalscope/benchmarks/tau_bench/generation.py +147 -0
  116. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +112 -54
  117. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  118. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  119. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -265
  120. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  121. evalscope/cli/cli.py +2 -0
  122. evalscope/cli/start_server.py +6 -3
  123. evalscope/collections/__init__.py +2 -10
  124. evalscope/collections/sampler.py +10 -10
  125. evalscope/collections/schema.py +13 -11
  126. evalscope/config.py +95 -54
  127. evalscope/constants.py +29 -61
  128. evalscope/evaluator/__init__.py +1 -1
  129. evalscope/evaluator/evaluator.py +277 -423
  130. evalscope/filters/__init__.py +2 -0
  131. evalscope/filters/extraction.py +126 -0
  132. evalscope/filters/selection.py +57 -0
  133. evalscope/metrics/__init__.py +13 -13
  134. evalscope/metrics/llm_judge.py +32 -30
  135. evalscope/metrics/math_parser.py +27 -22
  136. evalscope/metrics/metric.py +307 -0
  137. evalscope/metrics/metrics.py +22 -18
  138. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  139. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  140. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  141. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  142. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  143. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  144. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  145. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  146. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  147. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  148. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  149. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  150. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  151. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  152. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  153. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  154. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  155. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  156. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  157. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  158. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  159. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  160. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  162. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  163. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  184. evalscope/models/__init__.py +6 -29
  185. evalscope/models/mockllm.py +65 -0
  186. evalscope/models/model_apis.py +47 -0
  187. evalscope/models/modelscope.py +455 -0
  188. evalscope/models/openai_compatible.py +123 -0
  189. evalscope/models/text2image_model.py +124 -0
  190. evalscope/models/utils/openai.py +698 -0
  191. evalscope/perf/benchmark.py +2 -1
  192. evalscope/perf/http_client.py +4 -2
  193. evalscope/perf/plugin/api/custom_api.py +5 -4
  194. evalscope/perf/plugin/api/openai_api.py +11 -9
  195. evalscope/perf/plugin/datasets/custom.py +2 -1
  196. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  197. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  198. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  199. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  200. evalscope/perf/plugin/datasets/openqa.py +4 -2
  201. evalscope/perf/utils/benchmark_util.py +7 -5
  202. evalscope/perf/utils/db_util.py +9 -6
  203. evalscope/perf/utils/local_server.py +8 -3
  204. evalscope/perf/utils/rich_display.py +16 -10
  205. evalscope/report/__init__.py +2 -2
  206. evalscope/report/combinator.py +18 -12
  207. evalscope/report/generator.py +101 -6
  208. evalscope/report/{utils.py → report.py} +8 -6
  209. evalscope/run.py +26 -44
  210. evalscope/summarizer.py +1 -1
  211. evalscope/utils/__init__.py +21 -2
  212. evalscope/utils/chat_service.py +2 -1
  213. evalscope/utils/deprecation_utils.py +12 -1
  214. evalscope/utils/function_utils.py +29 -0
  215. evalscope/utils/io_utils.py +100 -5
  216. evalscope/utils/json_schema.py +208 -0
  217. evalscope/utils/logger.py +51 -12
  218. evalscope/utils/model_utils.py +10 -7
  219. evalscope/utils/multi_choices.py +271 -0
  220. evalscope/utils/url_utils.py +65 -0
  221. evalscope/version.py +2 -2
  222. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/METADATA +98 -49
  223. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/RECORD +234 -216
  224. tests/aigc/test_t2i.py +22 -4
  225. tests/benchmark/__init__.py +1 -0
  226. tests/benchmark/test_eval.py +386 -0
  227. tests/cli/test_all.py +3 -5
  228. tests/cli/test_collection.py +13 -4
  229. tests/cli/test_custom.py +22 -15
  230. tests/rag/test_clip_benchmark.py +1 -0
  231. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  232. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  233. evalscope/benchmarks/benchmark.py +0 -81
  234. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  235. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  236. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  237. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  238. evalscope/benchmarks/data_adapter.py +0 -528
  239. evalscope/benchmarks/filters.py +0 -59
  240. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  241. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  242. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  243. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  244. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  245. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  246. evalscope/benchmarks/race/race.py +0 -104
  247. evalscope/benchmarks/race/samples.jsonl +0 -5
  248. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  249. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  250. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  251. evalscope/benchmarks/utils.py +0 -60
  252. evalscope/collections/evaluator.py +0 -375
  253. evalscope/metrics/completion_parsers.py +0 -227
  254. evalscope/metrics/named_metrics.py +0 -55
  255. evalscope/models/adapters/__init__.py +0 -14
  256. evalscope/models/adapters/base_adapter.py +0 -84
  257. evalscope/models/adapters/bfcl_adapter.py +0 -246
  258. evalscope/models/adapters/chat_adapter.py +0 -207
  259. evalscope/models/adapters/choice_adapter.py +0 -222
  260. evalscope/models/adapters/custom_adapter.py +0 -71
  261. evalscope/models/adapters/server_adapter.py +0 -236
  262. evalscope/models/adapters/t2i_adapter.py +0 -79
  263. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  264. evalscope/models/custom/__init__.py +0 -4
  265. evalscope/models/custom/custom_model.py +0 -50
  266. evalscope/models/custom/dummy_model.py +0 -99
  267. evalscope/models/local_model.py +0 -128
  268. evalscope/models/register.py +0 -41
  269. tests/cli/test_run.py +0 -489
  270. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/LICENSE +0 -0
  271. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/WHEEL +0 -0
  272. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/entry_points.txt +0 -0
  273. {evalscope-0.17.1.dist-info → evalscope-1.0.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,307 @@
+ from collections import defaultdict
+ from typing import List
+
+ from evalscope.api.metric import Aggregator, AggScore, Metric, SampleScore, T2IMetric
+ from evalscope.api.registry import register_aggregation, register_metric
+ from .metrics import mean
+
+
+ @register_metric(name='exact_match')
+ class ExactMatch(Metric):
+
+ def apply(self, predictions, references):
+ return [float(prediction == reference) for prediction, reference in zip(predictions, references)]
+
+
+ @register_metric(name='acc')
+ class Accuracy(ExactMatch):
+
+ def __init__(self, allow_inclusion: bool = False, numeric: bool = False):
+ self.allow_inclusion = allow_inclusion
+ self.numeric = numeric
+
+ def apply(self, predictions, references):
+ if self.allow_inclusion:
+ results = []
+ for prediction, reference in zip(predictions, references):
+ if prediction and prediction in reference:
+ results.append(1.0)
+ else:
+ results.append(0.0)
+ return results
+ elif self.numeric:
+ from .math_parser import extract_answer, math_equal, strip_answer_string
+
+ results = []
+ for prediction, reference in zip(predictions, references):
+ pred_answer = strip_answer_string(extract_answer(prediction))
+ ref_answer = strip_answer_string(reference)
+ results.append(float(math_equal(pred_answer, ref_answer)))
+
+ return results
+ else:
+ return super().apply(predictions, references)
+
+
+ @register_metric(name='numeric_match')
+ class NumericMatch(Metric):
+
+ def apply(self, predictions, references):
+ return [float(prediction == reference) for prediction, reference in zip(predictions, references)]
+
+
+ @register_metric(name='math_acc')
+ class MathAcc(Metric):
+
+ def apply(self, predictions, references):
+ from .math_parser import extract_answer, math_equal, strip_answer_string
+
+ results = []
+ for prediction, reference in zip(predictions, references):
+ pred_answer = strip_answer_string(extract_answer(prediction))
+ ref_answer = strip_answer_string(reference)
+ results.append(float(math_equal(pred_answer, ref_answer)))
+
+ return results
+
+
+ @register_metric(name='multi_choice_acc')
+ class MultiChoiceAcc(Metric):
+
+ def apply(self, predictions, references):
+ """
+ Calculate accuracy for multiple-choice questions.
+
+ Args:
+ predictions (List[str]): List of predicted answers.
+ references (List[str]): List of correct answers.
+
+ Returns:
+ List[float]: List of accuracy scores (1.0 for correct, 0.0 for incorrect).
+ """
+ res = []
+ for prediction, reference in zip(predictions, references):
+ prediction = set(prediction.strip().upper())
+ reference = set(reference.strip().upper())
+ # if the prediction has answer that not in reference, it is wrong
+ if not prediction.issubset(reference):
+ res.append(0.0)
+ continue
+ common = prediction.intersection(reference)
+ res.append(len(common) / len(reference) if reference else 0.0)
+ return res
+
+
+ # ##################
+ # T2I Metrics ######
+ ####################
+ @register_metric(name='VQAScore')
+ class VQAScore(T2IMetric):
+
+ def _init_once(self, model: str = 'clip-flant5-xxl'):
+ from .t2v_metrics.vqascore import VQAScore
+ self.model = VQAScore(model=model)
+
+ def apply(self, images: List[str], texts: List[str], **kwargs) -> List[float]:
+ return self.model(images, texts, **kwargs)
+
+
+ @register_metric(name='PickScore')
+ class PickScore(T2IMetric):
+
+ def _init_once(self, model: str = 'pickscore-v1'):
+ from .t2v_metrics.clipscore import CLIPScore
+ self.model = CLIPScore(model=model)
+
+ def apply(self, images: List[str], texts: List[str], **kwargs) -> List[float]:
+ return self.model(images, texts, **kwargs)
+
+
+ @register_metric(name='CLIPScore')
+ class CLIPScore(T2IMetric):
+
+ def _init_once(self, model: str = 'openai:ViT-L-14-336'):
+ from .t2v_metrics.clipscore import CLIPScore
+ self.model = CLIPScore(model=model)
+
+ def apply(self, images: List[str], texts: List[str], **kwargs) -> List[float]:
+ return self.model(images, texts, **kwargs)
+
+
+ @register_metric(name='BLIPv2Score')
+ class BLIPv2Score(T2IMetric):
+
+ def _init_once(self, model: str = 'blip2-itm'):
+ from .t2v_metrics.itmscore import ITMScore
+ self.model = ITMScore(model=model)
+
+ def apply(self, images: List[str], texts: List[str], **kwargs) -> List[float]:
+ return self.model(images, texts, **kwargs)
+
+
+ @register_metric(name='HPSv2Score')
+ class HPSv2Score(T2IMetric):
+
+ def _init_once(self, model: str = 'hpsv2'):
+ from .t2v_metrics.clipscore import CLIPScore
+ self.model = CLIPScore(model=model)
+
+ def apply(self, images: List[str], texts: List[str], **kwargs) -> List[float]:
+ return self.model(images, texts, **kwargs)
+
+
+ @register_metric(name='HPSv2.1Score')
+ class HPSv2_1Score(T2IMetric):
+
+ def _init_once(self, model: str = 'hpsv2.1'):
+ from .t2v_metrics.clipscore import CLIPScore
+ self.model = CLIPScore(model=model)
+
+ def apply(self, images: List[str], texts: List[str], **kwargs) -> List[float]:
+ return self.model(images, texts, **kwargs)
+
+
+ @register_metric(name='ImageRewardScore')
+ class ImageRewardScore(T2IMetric):
+
+ def _init_once(self, model: str = 'image-reward-v1'):
+ from .t2v_metrics.itmscore import ITMScore
+ self.model = ITMScore(model=model)
+
+ def apply(self, images: List[str], texts: List[str], **kwargs) -> List[float]:
+ return self.model(images, texts, **kwargs)
+
+
+ @register_metric(name='FGA_BLIP2Score')
+ class FGA_BLIP2Score(T2IMetric):
+
+ def _init_once(self, model: str = 'fga_blip2'):
+ from .t2v_metrics.itmscore import ITMScore
+ self.model = ITMScore(model=model)
+
+ def apply(self, images: List[str], texts: List[str], **kwargs) -> List[float]:
+ return self.model(images, texts, **kwargs)
+
+
+ @register_metric(name='MPS')
+ class MPS(T2IMetric):
+
+ def _init_once(self, model: str = 'mps'):
+ from .t2v_metrics.clipscore import CLIPScore
+ self.model = CLIPScore(model=model)
+
+ def apply(self, images: List[str], texts: List[str], **kwargs) -> List[float]:
+ return self.model(images, texts, **kwargs)
+
+
+ # ##################
+ # Aggregators ######
+ # ##################
+ @register_aggregation(name='mean')
+ class Mean(Aggregator):
+
+ name = 'mean'
+
+ def __call__(self, scores: List[SampleScore]) -> List[AggScore]:
+ """Aggregate scores by computing the mean for each metric.
+
+ Args:
+ scores: List of sample scores to aggregate
+
+ Returns:
+ List of aggregated scores with mean values
+ """
+ if not scores:
+ return []
+
+ # Group score values by metric name
+ metric_values = defaultdict(list)
+ metric_sample_ids = defaultdict(list)
+
+ for score in scores:
+
+ for metric_name, value in score.score.value.items():
+ metric_values[metric_name].append(value)
+ metric_sample_ids[metric_name].append(score.sample_id)
+
+ # Calculate mean for each metric
+ aggregated_scores = []
+ for metric_name, values in metric_values.items():
+ if values: # Only process non-empty value lists
+ aggregated_scores.append(
+ AggScore(
+ score=mean(values),
+ metric_name=metric_name,
+ aggregation_name=self.name,
+ num=len(values),
+ ids=metric_sample_ids[metric_name]
+ )
+ )
+
+ return aggregated_scores
+
+
+ @register_aggregation(name='pass_at_k')
+ class PassAtK(Aggregator):
+
+ def __init__(self, k: int = 1):
+ self.k = k
+ self.name = f'pass_at_{k}'
+
+ def __call__(self, scores: List[SampleScore]) -> List[AggScore]:
+ """Aggregate scores by computing the pass@k for each metric using group_id.
+
+ Args:
+ scores: List of sample scores to aggregate
+
+ Returns:
+ List of aggregated scores with pass@k values
+ """
+ if not scores:
+ return []
+
+ import numpy as np
+
+ from .metrics import calculate_pass_at_k
+
+ # Group scores by metric name and group_id
+ metric_groups = defaultdict(lambda: defaultdict(list))
+
+ for score in scores:
+ group_id = getattr(score, 'group_id', score.sample_id) # fallback to sample_id if no group_id
+
+ for metric_name, value in score.score.value.items():
+ metric_groups[metric_name][group_id].append(float(value))
+
+ # Calculate pass@k for each metric
+ aggregated_scores = []
+ for metric_name, groups in metric_groups.items():
+ if not groups:
+ continue
+
+ # Calculate pass@k for each group (problem)
+ num_samples = []
+ num_correct = []
+ all_sample_ids = []
+
+ for group_id, group_values in groups.items():
+ num_samples.append(len(group_values))
+ num_correct.append(sum(group_values)) # count how many passed in this group
+ all_sample_ids.extend([f'{group_id}_{i}' for i in range(len(group_values))])
+
+ if num_samples:
+ # Use the calculate_pass_at_k function from metrics
+ pass_at_k_values = calculate_pass_at_k(num_samples, num_correct, self.k)
+ overall_pass_at_k = float(np.mean(pass_at_k_values))
+
+ aggregated_scores.append(
+ AggScore(
+ score=overall_pass_at_k,
+ metric_name=f'pass@{self.k}',
+ aggregation_name='',
+ num=len(scores),
+ ids=all_sample_ids
+ )
+ )
+
+ return aggregated_scores
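The new metric.py above replaces the old per-benchmark scoring helpers with registered Metric classes. The least obvious of these is MultiChoiceAcc, which gives partial credit: any predicted letter outside the reference scores 0.0, otherwise the score is the fraction of reference letters covered. A standalone sketch of that behaviour, re-implemented here so it runs without evalscope (the real class is only reachable through the register_metric registry shown above):

    def multi_choice_acc(predictions, references):
        # Mirrors MultiChoiceAcc.apply above: subset check first, then partial credit.
        res = []
        for prediction, reference in zip(predictions, references):
            pred = set(prediction.strip().upper())
            ref = set(reference.strip().upper())
            if not pred.issubset(ref):  # any extra option selected -> wrong
                res.append(0.0)
                continue
            res.append(len(pred & ref) / len(ref) if ref else 0.0)
        return res

    print(multi_choice_acc(['AB', 'AD', 'ABC'], ['ABC', 'ABC', 'ABC']))  # ~[0.67, 0.0, 1.0]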
@@ -191,7 +191,7 @@ def bleu(items):
  return sacrebleu.corpus_bleu(preds, refs).score
 
 
- def bleu_ngram_one_sample(predict, reference):
+ def bleu_ngram_one_sample(predict: str, reference: str):
  """
  Calculate BLEU-1, BLEU-2, BLEU-3, and BLEU-4 scores
 
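The hunk above only adds type hints to bleu_ngram_one_sample; per its docstring it computes BLEU-1 through BLEU-4 for a single prediction/reference pair. For readers unfamiliar with the metric, one common way to compute those values with NLTK is sketched below (an illustration of the idea, not a copy of evalscope's implementation):

    from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

    def bleu_1_to_4(predict: str, reference: str) -> dict:
        # Cumulative BLEU-n: uniform weights over the first n n-gram orders.
        hyp, refs = predict.split(), [reference.split()]
        smooth = SmoothingFunction().method1
        weights = {1: (1, 0, 0, 0), 2: (0.5, 0.5, 0, 0),
                   3: (1 / 3, 1 / 3, 1 / 3, 0), 4: (0.25, 0.25, 0.25, 0.25)}
        return {f'bleu-{n}': sentence_bleu(refs, hyp, weights=w, smoothing_function=smooth)
                for n, w in weights.items()}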
@@ -322,11 +322,11 @@ def bootstrap_stderr(f, xs, iters):
 
  print('bootstrapping for stddev:', f.__name__)
  for bootstrap in tqdm(
- pool.imap(
- _bootstrap_internal(f, chunk_size),
- [(i, xs) for i in range(iters // chunk_size)],
- ),
- total=iters // chunk_size,
+ pool.imap(
+ _bootstrap_internal(f, chunk_size),
+ [(i, xs) for i in range(iters // chunk_size)],
+ ),
+ total=iters // chunk_size,
  ):
  # sample w replacement
  res.extend(bootstrap)
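The bootstrap_stderr change is pure re-indentation of the pool.imap call; the estimate itself (resample xs with replacement, apply f to each resample, take the standard deviation of those statistics) is unchanged. A single-process sketch of the same idea, for reference only:

    import random
    import statistics

    def bootstrap_stderr_simple(f, xs, iters=1000):
        # Std. dev. of f over `iters` resamples of xs drawn with replacement.
        stats = [f(random.choices(xs, k=len(xs))) for _ in range(iters)]
        return statistics.pstdev(stats)

    print(bootstrap_stderr_simple(statistics.mean, [0, 1, 1, 0, 1], iters=2000))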
@@ -361,15 +361,17 @@ def yesno(x):
  return 'no'
 
 
- def compute_elo(battles,
- col_model_a='model_a',
- col_model_b='model_b',
- col_win='win',
- tie_values=['tie', 'tie (bothbad)'],
- k=32,
- scale=400,
- base=10,
- init_rating=1000):
+ def compute_elo(
+ battles,
+ col_model_a='model_a',
+ col_model_b='model_b',
+ col_win='win',
+ tie_values=['tie', 'tie (bothbad)'],
+ k=32,
+ scale=400,
+ base=10,
+ init_rating=1000
+ ):
  rating = defaultdict(lambda: init_rating)
 
  for rd, model_a, model_b, win in battles[[col_model_a, col_model_b, col_win]].itertuples():
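compute_elo is likewise only re-wrapped here, but its defaults (k=32, scale=400, base=10, init_rating=1000) match the standard online Elo update, which is presumably what the loop over battles applies. A sketch of one such update step, stated as an assumption rather than a copy of the evalscope code:

    from collections import defaultdict

    def elo_step(rating, model_a, model_b, win, k=32, scale=400, base=10,
                 tie_values=('tie', 'tie (bothbad)')):
        # Standard Elo: expected score from the rating gap, then move both ratings by k * error.
        ra, rb = rating[model_a], rating[model_b]
        ea = 1 / (1 + base ** ((rb - ra) / scale))
        sa = 1.0 if win == 'model_a' else (0.5 if win in tie_values else 0.0)
        rating[model_a] = ra + k * (sa - ea)
        rating[model_b] = rb + k * ((1 - sa) - (1 - ea))

    ratings = defaultdict(lambda: 1000)
    elo_step(ratings, 'model_x', 'model_y', 'model_a')  # hypothetical battle record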
@@ -434,9 +436,11 @@ def calculate_arc_accuracy(question_answers: Dict[str, str], predictions: Dict[s
  return score / len(question_answers)
 
 
- def calculate_pass_at_k(num_samples: Union[int, List[int], np.ndarray],
- num_correct: Union[List[int], np.ndarray],
- k: int = 1) -> np.ndarray:
+ def calculate_pass_at_k(
+ num_samples: Union[int, List[int], np.ndarray],
+ num_correct: Union[List[int], np.ndarray],
+ k: int = 1
+ ) -> np.ndarray:
  """
  Estimates pass@k of each problem and returns them in an array.
  Examples:
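calculate_pass_at_k is only re-wrapped as well; its docstring ("Estimates pass@k of each problem and returns them in an array") matches the standard unbiased estimator pass@k = 1 - C(n-c, k) / C(n, k), where n is the number of samples drawn per problem and c the number that passed. A scalar sketch of that estimator with a quick numeric check, assuming this is indeed the formula the function implements:

    import numpy as np

    def pass_at_k(n: int, c: int, k: int) -> float:
        # Unbiased estimator 1 - C(n - c, k) / C(n, k), computed as a stable product.
        if n - c < k:
            return 1.0
        return float(1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))

    print(pass_at_k(n=10, c=3, k=1))  # 0.3, i.e. 3 of 10 samples passed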
@@ -1,52 +0,0 @@
- def clip_flant5_score():
- from .vqascore import VQAScore
- clip_flant5_score = VQAScore(model='clip-flant5-xxl')
- return clip_flant5_score
-
-
- def pick_score():
- from .clipscore import CLIPScore
- pick_score = CLIPScore(model='pickscore-v1')
- return pick_score
-
-
- def clip_score():
- from .clipscore import CLIPScore
- clip_score = CLIPScore(model='openai:ViT-L-14-336')
- return clip_score
-
-
- def blip2_score():
- from .itmscore import ITMScore
- blip_itm_score = ITMScore(model='blip2-itm')
- return blip_itm_score
-
-
- def hpsv2_score():
- from .clipscore import CLIPScore
- hpsv2_score = CLIPScore(model='hpsv2')
- return hpsv2_score
-
-
- def hpsv2_1_score():
- from .clipscore import CLIPScore
- hpsv2_1_score = CLIPScore(model='hpsv2.1')
- return hpsv2_1_score
-
-
- def image_reward_score():
- from .itmscore import ITMScore
- image_reward_score = ITMScore(model='image-reward-v1')
- return image_reward_score
-
-
- def fga_blip2_score():
- from .itmscore import ITMScore
- fga_blip2_score = ITMScore(model='fga_blip2')
- return fga_blip2_score
-
-
- def mps_score():
- from .clipscore import CLIPScore
- mps_score = CLIPScore(model='mps')
- return mps_score
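This removal is the counterpart of the new metric.py shown earlier: each deleted factory function maps one-to-one onto a registered T2IMetric class with the same default model string (pick_score() -> PickScore, hpsv2_score() -> HPSv2Score, and so on). Roughly, assuming the new classes are importable as shown and that apply() triggers the lazy _init_once() from the earlier hunk:

    # Before 1.0.0 (the functions deleted above):
    #   from evalscope.metrics.t2v_metrics import pick_score
    #   scorer = pick_score()                        # CLIPScore(model='pickscore-v1')
    #   scores = scorer(images, texts)
    #
    # From 1.0.0 (registered classes in evalscope/metrics/metric.py; import path assumed):
    #   from evalscope.metrics.metric import PickScore
    #   scores = PickScore().apply(images, texts)    # builds CLIPScore(model='pickscore-v1') lazily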
@@ -27,7 +27,8 @@ class XCLIPModel(HFCLIPModel):
  # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
  output_hidden_states = (
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states)
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
  text_outputs = self.text_model(
@@ -63,7 +64,8 @@ class XCLIPModel(HFCLIPModel):
  # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
  output_hidden_states = (
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states)
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
  vision_outputs = self.vision_model(
@@ -178,15 +178,9 @@ class ParallelTransformerBlock(nn.Module):
 
  class CrossAttention(nn.Module):
 
- def __init__(self,
- dim,
- *,
- context_dim=None,
- dim_head=64,
- heads=12,
- parallel_ff=False,
- ff_mult=4,
- norm_context=False):
+ def __init__(
+ self, dim, *, context_dim=None, dim_head=64, heads=12, parallel_ff=False, ff_mult=4, norm_context=False
+ ):
  super().__init__()
  self.heads = heads
  self.scale = dim_head**-0.5
@@ -205,8 +199,8 @@ class CrossAttention(nn.Module):
  ff_inner_dim = ff_mult * dim
 
  self.ff = nn.Sequential(
- nn.Linear(dim, ff_inner_dim
- * 2, bias=False), SwiGLU(), nn.Linear(ff_inner_dim, dim, bias=False)) if parallel_ff else None
+ nn.Linear(dim, ff_inner_dim * 2, bias=False), SwiGLU(), nn.Linear(ff_inner_dim, dim, bias=False)
+ ) if parallel_ff else None
  def forward(self, x, context, mask):
  """
 
@@ -273,9 +267,11 @@ class Cross_model(nn.Module):
  self.layers.append(
  nn.ModuleList([
  Residual(
- CrossAttention(dim=dim, dim_head=dim_head, heads=heads, parallel_ff=True, ff_mult=ff_mult)),
+ CrossAttention(dim=dim, dim_head=dim_head, heads=heads, parallel_ff=True, ff_mult=ff_mult)
+ ),
  Residual(ParallelTransformerBlock(dim=dim, dim_head=dim_head, heads=heads, ff_mult=ff_mult))
- ]))
+ ])
+ )
 
  def forward(self, query_tokens, context_tokens, mask):
 
@@ -86,7 +86,8 @@ class CLIPScoreModel(ScoreModel):
  model_file_path = download_open_clip_model(self.arch, self.pretrained, self.cache_dir)
 
  self.model, _, self.preprocess = open_clip.create_model_and_transforms(
- self.arch, pretrained=model_file_path, device=self.device)
+ self.arch, pretrained=model_file_path, device=self.device
+ )
  self.tokenizer = open_clip.get_tokenizer(self.arch)
  self.model.eval()
 
@@ -44,11 +44,12 @@ class HPSV2ScoreModel(ScoreModel):
  image_std=None,
  image_resize_mode='longest',
  aug_cfg={},
- output_dict=True)
+ output_dict=True
+ )
 
  # update weight
  model_weight_path = download_file('AI-ModelScope/HPSv2', HPS_VERSION_MAP[self.model_name], self.cache_dir)
- checkpoint = torch.load(model_weight_path, map_location=self.device)
+ checkpoint = torch.load(model_weight_path, map_location=self.device, weights_only=False)
  self.model.load_state_dict(checkpoint['state_dict'])
  self.tokenizer = open_clip.get_tokenizer(self.arch)
  self.model.eval()
@@ -29,7 +29,8 @@ class MPSModel(ScoreModel):
 
  config = download_file('AI-ModelScope/MPS', file_name='config.json', cache_dir=self.cache_dir)
  model_pretrained_path = download_file(
- 'AI-ModelScope/MPS', file_name='MPS_overall_state_dict.pt', cache_dir=self.cache_dir) # modelscope model
+ 'AI-ModelScope/MPS', file_name='MPS_overall_state_dict.pt', cache_dir=self.cache_dir
+ ) # modelscope model
  model_weight = torch.load(model_pretrained_path, weights_only=True, map_location='cpu')
 
  self.model = CLIPModel(config=CLIPConfig.from_json_file(config))
@@ -31,8 +31,8 @@ class PickScoreModel(ScoreModel):
  """Load the image(s), and return a tensor (no preprocessing!!) put on self.device
  """
  image = [self.image_loader(x) for x in image]
- image = self.processor(
- images=image, padding=True, truncation=True, max_length=77, return_tensors='pt').to(self.device)
+ image = self.processor(images=image, padding=True, truncation=True, max_length=77,
+ return_tensors='pt').to(self.device)
  # image = torch.stack(image, dim=0).to(self.device)
  return image
 
@@ -66,7 +66,8 @@ class BLIP2ITMScoreModel(ScoreModel):
  query_att = torch.ones(query_token.size()[:-1], dtype=torch.long).to(query_token.device)
 
  text_input = self.model.tokenizer(
- texts, padding='max_length', truncation=True, max_length=35, return_tensors='pt').to(self.device)
+ texts, padding='max_length', truncation=True, max_length=35, return_tensors='pt'
+ ).to(self.device)
 
  attention_mask_all = torch.cat([query_att, text_input.attention_mask], dim=1)
  output_itm = self.model.Qformer.bert(
@@ -42,10 +42,12 @@ class FGA_BLIP2ScoreModel(ScoreModel):
  # load model
  self.variant = FGA_BLIP2_MODELS[self.model_name]['variant']
  self.model, self.vis_processors, self.text_processors = load_model_and_preprocess(
- 'fga_blip2', self.variant, is_eval=True, device=self.device)
+ 'fga_blip2', self.variant, is_eval=True, device=self.device
+ )
  # load pretrained weights
  model_weight_path = download_file(
- 'AI-ModelScope/FGA-BLIP2', file_name='fga_blip2.pth', cache_dir=self.cache_dir)
+ 'AI-ModelScope/FGA-BLIP2', file_name='fga_blip2.pth', cache_dir=self.cache_dir
+ )
  self.model.load_checkpoint(model_weight_path)
  self.model.eval()
 
@@ -47,7 +47,8 @@ class MLP(nn.Module):
  nn.Dropout(0.1),
  nn.Linear(64, 16),
  #nn.ReLU(),
- nn.Linear(16, 1))
+ nn.Linear(16, 1)
+ )
 
  # initial MLP param
  for name, param in self.layers.named_parameters():
@@ -100,7 +101,8 @@ class ImageReward(nn.Module):
 
  # text encode
  text_input = self.blip.tokenizer(
- prompt, padding='max_length', truncation=True, max_length=35, return_tensors='pt').to(self.device)
+ prompt, padding='max_length', truncation=True, max_length=35, return_tensors='pt'
+ ).to(self.device)
 
  # image encode
  if isinstance(image, Image.Image):
@@ -109,7 +111,8 @@ class ImageReward(nn.Module):
  pil_image = Image.open(image)
  else:
  raise TypeError(
- r'This image parameter type has not been supportted yet. Please pass PIL.Image or file path str.')
+ r'This image parameter type has not been supportted yet. Please pass PIL.Image or file path str.'
+ )
 
  image = self.preprocess(pil_image).unsqueeze(0).to(self.device)
  image_embeds = self.blip.visual_encoder(image)
@@ -133,7 +136,8 @@ class ImageReward(nn.Module):
  def inference_rank(self, prompt, generations_list):
 
  text_input = self.blip.tokenizer(
- prompt, padding='max_length', truncation=True, max_length=35, return_tensors='pt').to(self.device)
+ prompt, padding='max_length', truncation=True, max_length=35, return_tensors='pt'
+ ).to(self.device)
 
  txt_set = []
  for generation in generations_list:
@@ -145,7 +149,8 @@ class ImageReward(nn.Module):
  pil_image = Image.open(generation)
  else:
  raise TypeError(
- r'This image parameter type has not been supportted yet. Please pass PIL.Image or file path str.')
+ r'This image parameter type has not been supportted yet. Please pass PIL.Image or file path str.'
+ )
  image = self.preprocess(pil_image).unsqueeze(0).to(self.device)
  image_embeds = self.blip.visual_encoder(image)
 
@@ -30,7 +30,8 @@ def create_vit(vit, image_size, use_grad_checkpointing=False, ckpt_layer=0, drop
  num_heads=12,
  use_grad_checkpointing=use_grad_checkpointing,
  ckpt_layer=ckpt_layer,
- drop_path_rate=0 or drop_path_rate)
+ drop_path_rate=0 or drop_path_rate
+ )
  elif vit == 'large':
  vision_width = 1024
  visual_encoder = VisionTransformer(
@@ -41,7 +42,8 @@ def create_vit(vit, image_size, use_grad_checkpointing=False, ckpt_layer=0, drop
  num_heads=16,
  use_grad_checkpointing=use_grad_checkpointing,
  ckpt_layer=ckpt_layer,
- drop_path_rate=0.1 or drop_path_rate)
+ drop_path_rate=0.1 or drop_path_rate
+ )
  return visual_encoder, vision_width
 
 
@@ -53,7 +53,8 @@ class ImageRewardScoreModel(ScoreModel):
  images = self.load_images(images)
  for index in range(len(texts)):
  text_input = self.model.blip.tokenizer(
- texts[index], padding='max_length', truncation=True, max_length=35, return_tensors='pt').to(self.device)
+ texts[index], padding='max_length', truncation=True, max_length=35, return_tensors='pt'
+ ).to(self.device)
  image_embeds = self.model.blip.visual_encoder(images[index].unsqueeze(0))
  image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(self.device)
  text_output = self.model.blip.text_encoder(