evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (302)
  1. evalscope/__init__.py +4 -1
  2. evalscope/api/benchmark/__init__.py +3 -0
  3. evalscope/api/benchmark/adapters/__init__.py +5 -0
  4. evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
  5. evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
  6. evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
  7. evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
  8. evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
  9. evalscope/api/benchmark/benchmark.py +356 -0
  10. evalscope/api/benchmark/meta.py +121 -0
  11. evalscope/api/dataset/__init__.py +2 -0
  12. evalscope/api/dataset/dataset.py +349 -0
  13. evalscope/api/dataset/loader.py +262 -0
  14. evalscope/api/dataset/utils.py +143 -0
  15. evalscope/api/evaluator/__init__.py +3 -0
  16. evalscope/api/evaluator/cache.py +378 -0
  17. evalscope/api/evaluator/evaluator.py +56 -0
  18. evalscope/api/evaluator/state.py +275 -0
  19. evalscope/api/filter/__init__.py +1 -0
  20. evalscope/api/filter/filter.py +72 -0
  21. evalscope/api/messages/__init__.py +12 -0
  22. evalscope/api/messages/chat_message.py +243 -0
  23. evalscope/api/messages/content.py +102 -0
  24. evalscope/api/messages/utils.py +35 -0
  25. evalscope/api/metric/__init__.py +2 -0
  26. evalscope/api/metric/metric.py +55 -0
  27. evalscope/api/metric/scorer.py +113 -0
  28. evalscope/api/mixin/__init__.py +1 -0
  29. evalscope/api/mixin/llm_judge_mixin.py +168 -0
  30. evalscope/api/model/__init__.py +12 -0
  31. evalscope/api/model/generate_config.py +155 -0
  32. evalscope/api/model/model.py +386 -0
  33. evalscope/api/model/model_output.py +285 -0
  34. evalscope/api/registry.py +182 -0
  35. evalscope/api/tool/__init__.py +3 -0
  36. evalscope/api/tool/tool_call.py +101 -0
  37. evalscope/api/tool/tool_info.py +173 -0
  38. evalscope/api/tool/utils.py +64 -0
  39. evalscope/app/app.py +3 -0
  40. evalscope/app/ui/app_ui.py +2 -1
  41. evalscope/app/ui/multi_model.py +50 -25
  42. evalscope/app/ui/single_model.py +26 -14
  43. evalscope/app/utils/data_utils.py +43 -27
  44. evalscope/app/utils/env_utils.py +12 -0
  45. evalscope/app/utils/text_utils.py +14 -14
  46. evalscope/app/utils/visualization.py +9 -4
  47. evalscope/arguments.py +7 -10
  48. evalscope/backend/opencompass/api_meta_template.py +2 -1
  49. evalscope/backend/opencompass/backend_manager.py +6 -5
  50. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
  51. evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
  52. evalscope/backend/rag_eval/ragas/task_template.py +2 -1
  53. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
  54. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
  55. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
  56. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
  57. evalscope/backend/rag_eval/utils/embedding.py +10 -1
  58. evalscope/backend/rag_eval/utils/llm.py +13 -12
  59. evalscope/benchmarks/__init__.py +0 -2
  60. evalscope/benchmarks/aime/aime24_adapter.py +38 -40
  61. evalscope/benchmarks/aime/aime25_adapter.py +34 -40
  62. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
  63. evalscope/benchmarks/arc/arc_adapter.py +34 -147
  64. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
  65. evalscope/benchmarks/arena_hard/utils.py +37 -1
  66. evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
  67. evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
  68. evalscope/benchmarks/bfcl/generation.py +222 -0
  69. evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
  70. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
  71. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
  72. evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
  73. evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
  74. evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
  75. evalscope/benchmarks/docmath/utils.py +4 -5
  76. evalscope/benchmarks/drop/drop_adapter.py +88 -40
  77. evalscope/benchmarks/frames/frames_adapter.py +136 -52
  78. evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
  79. evalscope/benchmarks/general_arena/utils.py +23 -27
  80. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
  81. evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
  82. evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
  83. evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
  84. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
  85. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
  86. evalscope/benchmarks/hle/hle_adapter.py +127 -93
  87. evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
  88. evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
  89. evalscope/benchmarks/ifeval/instructions.py +109 -64
  90. evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
  91. evalscope/benchmarks/ifeval/instructions_util.py +2 -3
  92. evalscope/benchmarks/ifeval/utils.py +6 -7
  93. evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
  94. evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
  95. evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
  96. evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
  97. evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
  98. evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
  99. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
  100. evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
  101. evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
  102. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
  103. evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
  104. evalscope/benchmarks/math_vista/__init__.py +0 -0
  105. evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
  106. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
  107. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
  108. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
  109. evalscope/benchmarks/mmmu/__init__.py +0 -0
  110. evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
  111. evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
  112. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
  113. evalscope/benchmarks/musr/musr_adapter.py +33 -64
  114. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
  115. evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
  116. evalscope/benchmarks/race/race_adapter.py +33 -119
  117. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
  118. evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
  119. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
  120. evalscope/benchmarks/super_gpqa/utils.py +2 -1
  121. evalscope/benchmarks/tau_bench/generation.py +147 -0
  122. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
  123. evalscope/benchmarks/text2image/__init__.py +0 -0
  124. evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
  125. evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
  126. evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
  127. evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
  128. evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
  129. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
  131. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
  132. evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
  133. evalscope/cli/cli.py +2 -0
  134. evalscope/cli/start_app.py +7 -1
  135. evalscope/cli/start_perf.py +7 -1
  136. evalscope/cli/start_server.py +6 -3
  137. evalscope/collections/__init__.py +2 -10
  138. evalscope/collections/sampler.py +10 -10
  139. evalscope/collections/schema.py +13 -11
  140. evalscope/config.py +157 -57
  141. evalscope/constants.py +37 -61
  142. evalscope/evaluator/__init__.py +1 -1
  143. evalscope/evaluator/evaluator.py +275 -419
  144. evalscope/filters/__init__.py +2 -0
  145. evalscope/filters/extraction.py +126 -0
  146. evalscope/filters/selection.py +57 -0
  147. evalscope/metrics/__init__.py +13 -13
  148. evalscope/metrics/llm_judge.py +47 -33
  149. evalscope/metrics/math_parser.py +27 -22
  150. evalscope/metrics/metric.py +307 -0
  151. evalscope/metrics/metrics.py +22 -18
  152. evalscope/metrics/t2v_metrics/__init__.py +0 -52
  153. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
  154. evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
  155. evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
  156. evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
  157. evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
  158. evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
  159. evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
  160. evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
  161. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
  162. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
  163. evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
  164. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
  165. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
  166. evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
  167. evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
  168. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
  169. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
  170. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
  171. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
  172. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
  173. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
  174. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
  175. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
  176. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
  177. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
  178. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
  179. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
  180. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
  181. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
  182. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
  183. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
  184. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
  185. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
  186. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
  187. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
  188. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
  189. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
  190. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
  191. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
  192. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
  193. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
  194. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
  195. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
  196. evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
  197. evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
  198. evalscope/models/__init__.py +6 -29
  199. evalscope/models/image_edit_model.py +125 -0
  200. evalscope/models/mockllm.py +65 -0
  201. evalscope/models/model_apis.py +67 -0
  202. evalscope/models/modelscope.py +455 -0
  203. evalscope/models/openai_compatible.py +126 -0
  204. evalscope/models/text2image_model.py +124 -0
  205. evalscope/models/utils/openai.py +701 -0
  206. evalscope/perf/benchmark.py +4 -1
  207. evalscope/perf/http_client.py +4 -2
  208. evalscope/perf/plugin/api/custom_api.py +5 -4
  209. evalscope/perf/plugin/api/openai_api.py +11 -9
  210. evalscope/perf/plugin/datasets/custom.py +2 -1
  211. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  212. evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
  213. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  214. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  215. evalscope/perf/plugin/datasets/openqa.py +4 -2
  216. evalscope/perf/utils/benchmark_util.py +15 -10
  217. evalscope/perf/utils/db_util.py +9 -6
  218. evalscope/perf/utils/local_server.py +11 -3
  219. evalscope/perf/utils/rich_display.py +16 -10
  220. evalscope/report/__init__.py +2 -3
  221. evalscope/report/combinator.py +18 -12
  222. evalscope/report/generator.py +51 -35
  223. evalscope/report/{utils.py → report.py} +8 -6
  224. evalscope/run.py +33 -47
  225. evalscope/summarizer.py +1 -1
  226. evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
  227. evalscope/utils/__init__.py +21 -2
  228. evalscope/utils/chat_service.py +3 -2
  229. evalscope/utils/deprecation_utils.py +12 -1
  230. evalscope/utils/function_utils.py +29 -0
  231. evalscope/utils/import_utils.py +23 -1
  232. evalscope/utils/io_utils.py +142 -6
  233. evalscope/utils/json_schema.py +208 -0
  234. evalscope/utils/logger.py +51 -12
  235. evalscope/utils/model_utils.py +11 -7
  236. evalscope/utils/multi_choices.py +288 -0
  237. evalscope/utils/url_utils.py +65 -0
  238. evalscope/version.py +2 -2
  239. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
  240. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
  241. tests/benchmark/test_eval.py +385 -0
  242. tests/benchmark/test_image_edit.py +65 -0
  243. tests/{aigc → benchmark}/test_t2i.py +22 -4
  244. tests/benchmark/test_vlm.py +80 -0
  245. tests/cli/test_all.py +85 -47
  246. tests/cli/test_collection.py +20 -8
  247. tests/cli/test_custom.py +22 -15
  248. tests/cli/test_reasoning.py +81 -0
  249. tests/common.py +73 -0
  250. tests/perf/test_perf.py +4 -2
  251. tests/rag/test_clip_benchmark.py +0 -2
  252. evalscope/benchmarks/aigc/t2i/base.py +0 -56
  253. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
  254. evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
  255. evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
  256. evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
  257. evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
  258. evalscope/benchmarks/arc/ai2_arc.py +0 -151
  259. evalscope/benchmarks/benchmark.py +0 -81
  260. evalscope/benchmarks/ceval/ceval_exam.py +0 -146
  261. evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
  262. evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
  263. evalscope/benchmarks/competition_math/competition_math.py +0 -79
  264. evalscope/benchmarks/data_adapter.py +0 -528
  265. evalscope/benchmarks/filters.py +0 -59
  266. evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
  267. evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
  268. evalscope/benchmarks/humaneval/humaneval.py +0 -79
  269. evalscope/benchmarks/mmlu/mmlu.py +0 -160
  270. evalscope/benchmarks/mmlu/samples.jsonl +0 -5
  271. evalscope/benchmarks/process_bench/critique_template.txt +0 -13
  272. evalscope/benchmarks/race/race.py +0 -104
  273. evalscope/benchmarks/race/samples.jsonl +0 -5
  274. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
  275. evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
  276. evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
  277. evalscope/benchmarks/utils.py +0 -60
  278. evalscope/collections/evaluator.py +0 -375
  279. evalscope/metrics/completion_parsers.py +0 -227
  280. evalscope/metrics/named_metrics.py +0 -55
  281. evalscope/models/adapters/__init__.py +0 -14
  282. evalscope/models/adapters/base_adapter.py +0 -84
  283. evalscope/models/adapters/bfcl_adapter.py +0 -246
  284. evalscope/models/adapters/chat_adapter.py +0 -207
  285. evalscope/models/adapters/choice_adapter.py +0 -222
  286. evalscope/models/adapters/custom_adapter.py +0 -71
  287. evalscope/models/adapters/server_adapter.py +0 -236
  288. evalscope/models/adapters/t2i_adapter.py +0 -79
  289. evalscope/models/adapters/tau_bench_adapter.py +0 -189
  290. evalscope/models/custom/__init__.py +0 -4
  291. evalscope/models/custom/custom_model.py +0 -50
  292. evalscope/models/custom/dummy_model.py +0 -99
  293. evalscope/models/local_model.py +0 -128
  294. evalscope/models/register.py +0 -41
  295. tests/cli/test_run.py +0 -489
  296. /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
  297. /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
  298. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
  299. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
  300. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
  301. {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
  302. /tests/{aigc → benchmark}/__init__.py +0 -0
evalscope/api/benchmark/adapters/image_edit_adapter.py
@@ -0,0 +1,82 @@
+import os
+from typing import Optional
+
+from evalscope.constants import EvalType, FileConstants
+from evalscope.utils import get_logger
+from evalscope.utils.function_utils import thread_safe
+from evalscope.utils.io_utils import jsonl_to_list
+from .text2image_adapter import Text2ImageAdapter
+
+logger = get_logger()
+
+
+class ImageEditAdapter(Text2ImageAdapter):
+    """
+    Support two methods:
+    1. Inference using modelscope pipeline
+    2. Load local inference jsonl file with key to corresponding prompt
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.local_file = self.extra_params.get('local_file', None)
+        self.id_key = self.extra_params.get('id_key', FileConstants.ID)
+        self.image_key = self.extra_params.get('image_key', FileConstants.IMAGE_PATH)
+        self.local_data = self.load_local_file()
+
+    def load_local_file(self) -> Optional[dict]:
+        if not self.local_file:
+            return None
+
+        # Load file and check
+        data_list = jsonl_to_list(self.local_file)
+        data_dict = {}
+        for record in data_list:
+            if self.image_key not in record:
+                raise ValueError(f"Image key '{self.image_key}' not found in record: {record}, file {self.local_file}")
+            if self.id_key not in record:
+                raise ValueError(f"ID key '{self.id_key}' not found in record: {record}, file {self.local_file}")
+
+            image_path = record[self.image_key]
+            if not os.path.isabs(image_path):
+                image_path = os.path.join(os.path.dirname(self.local_file), image_path)
+            if not os.path.exists(image_path):
+                raise FileNotFoundError(f"Image file '{image_path}' not found.")
+
+            data_dict[record[self.id_key]] = record
+        return data_dict
+
+    def get_image_path_from_id(self, image_id) -> Optional[str]:
+        if not self.local_file:
+            return None
+
+        record = self.local_data.get(image_id)
+        if not record:
+            return None
+
+        return record[self.image_key]
+
+    def _post_process_samples(self):
+        super()._post_process_samples()
+
+        # Add local image path if exists
+        for subset in self.test_dataset.keys():
+            for sample in self.test_dataset[subset]:
+                local_image_path = self.get_image_path_from_id(sample.metadata.get(FileConstants.ID))
+                if local_image_path:
+                    sample.metadata[FileConstants.IMAGE_PATH] = local_image_path
+
+    def sample_filter(self, sample) -> bool:
+        """
+        Filter samples based on metadata availability.
+        If local file is not available, all samples are considered valid.
+        Otherwise, only samples with valid metadata and image path are kept.
+        """
+        if not self.local_data:
+            return True
+        else:
+            sample_id = sample.metadata.get(FileConstants.ID)
+            if (not sample_id) or (not self.get_image_path_from_id(sample_id)):
+                return False
+            return True
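For reference, the local-file mode above expects a JSONL file whose records carry both an ID key and an image-path key (by default FileConstants.ID and FileConstants.IMAGE_PATH), with relative paths resolved against the JSONL file's own directory. The sketch below shows how such a file might be produced and wired in via extra_params; the literal key names 'id' and 'image_path', the output paths, and the extra_params wiring are assumptions for illustration, not values taken from the diff.

import json
import os

# Hypothetical pre-generated results for ImageEditAdapter's local-file mode.
# Assumes FileConstants.ID == 'id' and FileConstants.IMAGE_PATH == 'image_path';
# set 'id_key' / 'image_key' in extra_params if your record keys differ.
records = [
    {'id': 'sample_0001', 'image_path': 'images/sample_0001.png'},
    {'id': 'sample_0002', 'image_path': 'images/sample_0002.png'},
]

out_dir = 'outputs/image_edit_run'
os.makedirs(os.path.join(out_dir, 'images'), exist_ok=True)
local_file = os.path.join(out_dir, 'results.jsonl')

with open(local_file, 'w', encoding='utf-8') as f:
    for rec in records:
        f.write(json.dumps(rec, ensure_ascii=False) + '\n')

# The adapter would then be configured roughly like this (sketch only):
extra_params = {
    'local_file': local_file,   # skip pipeline inference, reuse these images
    'id_key': 'id',             # key holding the sample ID in each record
    'image_key': 'image_path',  # key holding the (relative or absolute) image path
}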
evalscope/api/benchmark/adapters/multi_choice_adapter.py
@@ -0,0 +1,83 @@
+from evalscope.api.dataset.dataset import Sample
+from evalscope.api.evaluator import Choices, Target, TaskState
+from evalscope.utils.multi_choices import (
+    FEW_SHOT_TEMPLATE,
+    MultipleChoiceTemplate,
+    format_example,
+    parse_answers,
+    parse_answers_zh,
+    prompt,
+    valid_template,
+)
+from .default_data_adapter import DefaultDataAdapter
+
+
+class MultiChoiceAdapter(DefaultDataAdapter):
+    """
+    Adapter for multi-choice benchmarks.
+    This adapter formats the input for multi-choice questions and handles few-shot examples.
+    """
+
+    multiple_correct: bool = False
+    """Whether the benchmark allows multiple correct answers."""
+
+    def format_prompt_template(self, sample: Sample) -> str:
+        """
+        Format the basic prompt template with the sample data.
+
+        Args:
+            sample (Sample): The sample object containing the prompt data
+
+        Returns:
+            str: The formatted prompt ready for model input
+        """
+        assert valid_template(self.prompt_template), 'Prompt template is not valid'
+
+        return prompt(
+            question=sample.input,
+            choices=Choices(sample.choices),
+            template=self.prompt_template,
+        )
+
+    def format_fewshot_template(self, fewshot: str, sample: Sample) -> str:
+        """
+        Format the few-shot template with demonstrations and the main prompt.
+
+        Args:
+            fewshot (str): The formatted few-shot demonstration examples
+            sample (Sample): The sample object containing the prompt data
+
+        Returns:
+            str: The complete formatted input with few-shot context
+        """
+
+        few_shot_prompt_template = self.few_shot_prompt_template or (FEW_SHOT_TEMPLATE + self.prompt_template)
+
+        assert valid_template(few_shot_prompt_template), 'Few-shot prompt template is not valid'
+
+        return prompt(
+            question=sample.input, choices=Choices(sample.choices), template=few_shot_prompt_template, fewshot=fewshot
+        )
+
+    def sample_to_fewshot(self, sample: Sample) -> str:
+        """
+        Convert a sample to a few-shot formatted string.
+
+        Args:
+            sample (Sample): The sample object to format
+
+        Returns:
+            str: The formatted few-shot example string
+        """
+        return format_example(question=sample.input, choices=Choices(sample.choices), answer=Target(sample.target))
+
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        if self.prompt_template in [
+            MultipleChoiceTemplate.CHINESE_SINGLE_ANSWER_TEMPLATE_COT,
+            MultipleChoiceTemplate.CHINESE_SINGLE_ANSWER_TEMPLATE
+        ]:
+            # For Chinese COT template, we use a different extraction method
+            answers = parse_answers_zh(task_state, multiple_correct=self.multiple_correct)
+        else:
+            answers = parse_answers(task_state, multiple_correct=self.multiple_correct)
+        return ''.join(sorted(list(answers)))
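As a rough illustration of the prompt flow above, the adapter fills a template with the question, the lettered choices, and optionally a block of few-shot demonstrations, then joins the sorted set of parsed answer letters. The sketch below is a standalone stand-in, not evalscope's actual prompt() / format_example() implementations; the {question}/{choices}/{fewshot} placeholder names and template wording are assumptions.

from string import ascii_uppercase

# Hypothetical stand-ins for evalscope.utils.multi_choices helpers (assumed placeholders).
SINGLE_ANSWER_TEMPLATE = (
    'Answer the following multiple choice question.\n\n'
    '{question}\n{choices}\n\nAnswer with the letter only.'
)
FEW_SHOT_TEMPLATE = '{fewshot}\n\n'


def render_choices(choices):
    # Label choices A), B), C), ... as a typical multiple-choice prompt would.
    return '\n'.join(f'{ascii_uppercase[i]}) {c}' for i, c in enumerate(choices))


def fill_prompt(question, choices, template, fewshot=''):
    return template.format(question=question, choices=render_choices(choices), fewshot=fewshot)


question = 'Which planet is known as the Red Planet?'
choices = ['Venus', 'Mars', 'Jupiter', 'Saturn']

print(fill_prompt(question, choices, FEW_SHOT_TEMPLATE + SINGLE_ANSWER_TEMPLATE,
                  fewshot='Q: 2 + 2?\nA) 3\nB) 4\nANSWER: B'))

# extract_answer() above concatenates the sorted parsed letters, e.g. {'C', 'A'} -> 'AC'
print(''.join(sorted({'C', 'A'})))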
evalscope/api/benchmark/adapters/text2image_adapter.py
@@ -0,0 +1,156 @@
+import base64
+import os
+
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.messages.chat_message import ChatMessageUser
+from evalscope.api.messages.content import ContentImage
+from evalscope.api.metric import Score
+from evalscope.api.model import ChatCompletionChoice, Model, ModelOutput
+from evalscope.api.registry import get_metric
+from evalscope.constants import EvalType, FileConstants
+from evalscope.utils import get_logger
+from evalscope.utils.function_utils import thread_safe
+from .default_data_adapter import DefaultDataAdapter
+
+logger = get_logger()
+
+
+class Text2ImageAdapter(DefaultDataAdapter):
+    """Text to Image Adapter for benchmarks."""
+
+    def load_from_disk(self, **kwargs):
+        return super().load_from_disk(use_local_loader=True)
+
+    def record_to_sample(self, record) -> Sample:
+        """Convert a record dictionary to a Sample object."""
+        return Sample(
+            input=[ChatMessageUser(content=record['prompt'])],
+            metadata={
+                'prompt': record['prompt'],
+                'category': record.get('category', ''),
+                'tags': record.get('tags', []),
+                FileConstants.ID: record[FileConstants.ID],
+                FileConstants.IMAGE_PATH: record.get(FileConstants.IMAGE_PATH,
+                                                     ''),  # Optional field for existing image path
+            }
+        )
+
+    def _on_inference(self, model: Model, sample: Sample) -> ModelOutput:
+        """
+        Hook method called during the actual inference process.
+
+        This method executes the model inference and can be overridden
+        to implement custom inference logic or model interaction patterns.
+
+        Args:
+            model (Model): The model to use for inference
+            sample (Sample): The sample to process
+
+        Returns:
+            ModelOutput: The raw output from the model
+        """
+        if self.eval_type == EvalType.MOCK_LLM:
+            return ModelOutput(
+                model=model.name,
+                choices=[ChatCompletionChoice.from_content('')],
+            )
+        else:
+            # Execute model inference with the processed input and any tools
+            model_output = model.generate(input=sample.input, tools=sample.tools)
+            return model_output
+
+    def _on_inference_end(
+        self, model: Model, sample: Sample, model_output: ModelOutput, output_dir: str, **kwargs
+    ) -> TaskState:
+        """
+        Hook method called after inference completes. Save generated images to output_dir.
+
+        Args:
+            model (Model): The model that performed inference
+            sample (Sample): The processed sample
+            model_output (ModelOutput): The raw model output
+            output_dir (str): The directory where the model output was saved
+
+        Returns:
+            TaskState: Complete state object for the inference task
+        """
+        if self.eval_type == EvalType.MOCK_LLM:
+            return TaskState(
+                model=model.name,
+                sample=sample,
+                messages=[model_output.message],
+                output=model_output,
+                completed=True,
+            )
+        else:
+            image_id = f'{sample.metadata.get(FileConstants.ID, sample.id)}_{sample.group_id}'
+            output_path = os.path.join(output_dir, 'images', f'{image_id}.png')
+            if not os.path.exists(os.path.dirname(output_path)):
+                os.makedirs(os.path.dirname(output_path))
+            # get base64 image from model_output
+            content = model_output.message.content[0]
+
+            assert isinstance(content, ContentImage), 'Expected ContentImage in model output'
+
+            image_base64 = content.image
+            with open(output_path, 'wb') as f:
+                f.write(base64.b64decode(image_base64))
+
+            sample.metadata[FileConstants.IMAGE_PATH] = output_path
+            return TaskState(
+                model=model.name,
+                sample=sample,
+                messages=[model_output.message],
+                output=model_output,
+                completed=True,
+            )
+
+    # NOTE: thread safe is needed, since we can't batch inference here.
+    @thread_safe
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+        # Get prediction and prompt from task state
+        image_path = task_state.metadata.get(FileConstants.IMAGE_PATH, original_prediction)
+        prompt = task_state.input[0].content
+        meta = task_state.metadata
+
+        # Initialize the score object with prediction details
+        score = Score(
+            extracted_prediction=image_path,
+            prediction=image_path,
+        )
+
+        # Calculate scores for each configured metric
+        for metric in self.metric_list:
+            try:
+                if isinstance(metric, str):
+                    metric_name = metric
+                    metric_scorer = get_metric(metric)  # Get metric implementation from registry
+                    metric_func = metric_scorer()  # Instantiate the metric scorer
+                elif isinstance(metric, dict):
+                    metric_name = list(metric.keys())[0]
+                    metric_cls = get_metric(metric_name)
+                    metric_func = metric_cls(**metric[metric_name])  # Initialize with parameters
+                metric_score = metric_func(image_path, prompt)[0]
+
+                # fine-granular metrics
+                category = meta.get('category')
+                if category:
+                    metric_name = f'{metric_name}_{category}'
+                if isinstance(metric_score, dict):
+                    for k, v in metric_score.items():
+                        score.value[f'{metric_name}_{k}'] = v.cpu().item()
+                else:
+                    score.value[metric_name] = metric_score.cpu().item()
+            except Exception as e:
+                logger.error(f'Error calculating metric {metric}: {e}')
+                score.value[metric_name] = 0
+                score.metadata[metric_name] = f'error: {str(e)}'
+
+        return score
+
+    def _on_generate_report(self, scores, model_name, add_aggregation_name=True):
+        # Don't add the aggregation name for text-to-image reports
+        return super()._on_generate_report(scores, model_name, False)
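The match_score loop above accepts each entry of metric_list either as a plain registry name or as a single-key dict whose value is a kwargs mapping for the metric class. The sketch below shows both forms and how they would be parsed; the metric names and parameters shown are assumptions for illustration, not names guaranteed to exist in evalscope's metric registry.

# Hypothetical metric_list for a text-to-image benchmark. A plain string is looked up
# in the registry and instantiated with defaults; a dict passes constructor kwargs.
metric_list = [
    'pickscore',                                # assumed registry name, default args
    {'clipscore': {'model_name': 'ViT-L-14'}},  # assumed name plus assumed kwargs
]

for metric in metric_list:
    if isinstance(metric, str):
        metric_name, metric_kwargs = metric, {}
    elif isinstance(metric, dict):
        metric_name = list(metric.keys())[0]
        metric_kwargs = metric[metric_name]
    print(metric_name, metric_kwargs)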
evalscope/api/benchmark/adapters/vision_language_adapter.py
@@ -0,0 +1,6 @@
+from .default_data_adapter import DefaultDataAdapter
+
+
+class VisionLanguageAdapter(DefaultDataAdapter):
+    """Adapter for vision-language benchmarks, e.g., image captioning, visual question answering, etc."""
+    pass
evalscope/api/benchmark/benchmark.py
@@ -0,0 +1,356 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import contextlib
+from abc import ABC, abstractmethod
+from collections import OrderedDict
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+
+from evalscope.api.dataset import DatasetDict, Sample
+from evalscope.api.evaluator import TaskState
+from evalscope.api.filter import FilterEnsemble, build_filter_ensemble
+from evalscope.api.metric import AggScore, SampleScore
+from evalscope.api.mixin import LLMJudgeMixin
+from evalscope.api.model import Model
+from evalscope.report import Report
+from evalscope.utils.logger import get_logger
+
+if TYPE_CHECKING:
+    from evalscope.api.benchmark import BenchmarkMeta
+    from evalscope.config import TaskConfig
+
+logger = get_logger()
+
+
+class DataAdapter(LLMJudgeMixin, ABC):
+    """
+    Data Adapter for the benchmark.
+    """
+
+    def __init__(self, benchmark_meta: 'BenchmarkMeta', task_config: Optional['TaskConfig'] = None):
+        self._benchmark_meta = benchmark_meta
+        self._task_config = task_config
+        super().__init__(task_config=task_config)
+
+        self.reformat_subset = False
+        """Whether to reformat the subset data with subset key"""
+
+        self.split_as_subset = False
+        """Whether to use the split name as the dataset subsets"""
+
+        self.shuffle_choices = False
+        """Whether to shuffle the choices in the dataset"""
+
+        self.save_metadata = True
+        """Whether to save metadata in the review result"""
+
+        self.category_map = {}
+        """Category map for the benchmark"""
+
+        self.current_subset_name = ''
+        """Subset name when loading datasets"""
+
+        # dataset
+        self.test_dataset: Optional[DatasetDict] = None
+        """Dataset to be evaluated"""
+
+        self.fewshot_dataset: Optional[DatasetDict] = None
+        """Dataset for few-shot evaluation"""
+
+        # filters
+        self._filter_ensemble: Optional[OrderedDict] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert the benchmark metadata to a dictionary."""
+        return self._benchmark_meta.to_string_dict()
+
+    @abstractmethod
+    def load_dataset(self) -> DatasetDict:
+        pass
+
+    @abstractmethod
+    def run_inference(self, model: Model, sample: Sample, output_dir: str, **kwargs) -> TaskState:
+        pass
+
+    @abstractmethod
+    def calculate_metrics(self, task_state: TaskState) -> SampleScore:
+        pass
+
+    @abstractmethod
+    def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+        pass
+
+    @abstractmethod
+    def generate_report(self, scores: Dict[str, List[AggScore]], model_name: str, output_dir: str, **kwargs) -> Report:
+        """
+        Generate a report based on the evaluation results.
+        """
+        pass
+
+    @property
+    def name(self) -> str:
+        """
+        Return the unique name of the benchmark.
+        """
+        return self._benchmark_meta.name
+
+    @property
+    def dataset_id(self) -> str:
+        """
+        Return the dataset ID or path to the benchmark.
+        """
+        return self._benchmark_meta.dataset_id
+
+    @property
+    def output_types(self) -> Optional[List[str]]:
+        """
+        Return the output types of the benchmark.
+        """
+        return self._benchmark_meta.output_types
+
+    @property
+    def limit(self) -> Optional[Union[int, float]]:
+        """
+        Return the limit for the benchmark.
+        """
+        return self._task_config.limit
+
+    @property
+    def repeats(self) -> int:
+        """
+        Return the number of repeats for each sample in the benchmark.
+        """
+        return self._task_config.repeats
+
+    @property
+    def dataset_hub(self) -> str:
+        """
+        Return the dataset hub type for the benchmark.
+        """
+        return self._task_config.dataset_hub
+
+    @dataset_hub.setter
+    def dataset_hub(self, value: str):
+        """
+        Set the dataset hub type for the benchmark.
+        """
+        self._task_config.dataset_hub = value
+
+    @property
+    def eval_type(self) -> str:
+        """
+        Return the evaluation type for the benchmark.
+        """
+        return self._task_config.eval_type
+
+    @property
+    def subset_list(self) -> List[str]:
+        """
+        Return the subset list of the benchmark.
+        """
+        return self._benchmark_meta.subset_list
+
+    @subset_list.setter
+    def subset_list(self, value: List[str]):
+        """
+        Set the subset list of the benchmark.
+        """
+        self._benchmark_meta.subset_list = value
+
+    @property
+    def metric_list(self) -> List[Union[str, Dict[str, Any]]]:
+        """
+        Return the metric list of the benchmark.
+        """
+        return self._benchmark_meta.metric_list
+
+    @property
+    def default_subset(self) -> str:
+        """
+        Return the default subset of the benchmark.
+        """
+        return self._benchmark_meta.default_subset
+
+    @default_subset.setter
+    def default_subset(self, value: str):
+        """
+        Set the default subset of the benchmark.
+        """
+        self._benchmark_meta.default_subset = value
+
+    @property
+    def few_shot_num(self) -> int:
+        """
+        Return the few shot number of the benchmark.
+        """
+        return self._benchmark_meta.few_shot_num
+
+    @few_shot_num.setter
+    def few_shot_num(self, value: int):
+        """
+        Set the few shot number of the benchmark.
+        """
+        self._benchmark_meta.few_shot_num = value
+
+    @property
+    def few_shot_random(self) -> bool:
+        """
+        Return whether few shot is random for the benchmark.
+        """
+        return self._benchmark_meta.few_shot_random
+
+    @property
+    def train_split(self) -> Optional[str]:
+        """
+        Return the train split of the benchmark.
+        """
+        return self._benchmark_meta.train_split
+
+    @property
+    def eval_split(self) -> Optional[str]:
+        """
+        Return the eval split of the benchmark.
+        """
+        return self._benchmark_meta.eval_split
+
+    @property
+    def prompt_template(self) -> Optional[str]:
+        """
+        Return the prompt template of the benchmark.
+        """
+        return self._benchmark_meta.prompt_template
+
+    @prompt_template.setter
+    def prompt_template(self, value: str):
+        """
+        Set the prompt template of the benchmark.
+        """
+        self._benchmark_meta.prompt_template = value
+
+    @property
+    def system_prompt(self) -> Optional[str]:
+        """
+        Return the system prompt of the benchmark.
+        """
+        return self._benchmark_meta.system_prompt
+
+    @property
+    def query_template(self) -> Optional[str]:
+        """
+        Return the query template of the benchmark.
+        """
+        return self._benchmark_meta.query_template
+
+    @property
+    def few_shot_prompt_template(self) -> Optional[str]:
+        """
+        Return the few-shot prompt template of the benchmark.
+        """
+        return self._benchmark_meta.few_shot_prompt_template
+
+    @property
+    def pretty_name(self) -> Optional[str]:
+        """
+        Return the pretty name of the benchmark.
+        """
+        return self._benchmark_meta.pretty_name
+
+    @property
+    def description(self) -> Optional[str]:
+        """
+        Return the description of the benchmark.
+        """
+        return self._benchmark_meta.description
+
+    @property
+    def tags(self) -> Optional[List[str]]:
+        """
+        Return the tags of the benchmark.
+        """
+        return self._benchmark_meta.tags
+
+    @property
+    def filters(self) -> Optional[OrderedDict]:
+        """
+        Return the filters of the benchmark.
+        """
+        return self._benchmark_meta.filters
+
+    @property
+    def filter_ensemble(self) -> Optional[FilterEnsemble]:
+        """
+        Return the filter ensemble of the benchmark.
+        """
+        if self._filter_ensemble is None:
+            if self.filters:
+                self._filter_ensemble = build_filter_ensemble(filters=self.filters)
+        return self._filter_ensemble
+
+    @property
+    def aggregation(self) -> str:
+        """
+        Return the aggregation function for the metrics.
+        """
+        return self._benchmark_meta.aggregation
+
+    @property
+    def extra_params(self) -> Optional[Dict]:
+        """
+        Return the extra parameters of the benchmark.
+        """
+        return self._benchmark_meta.extra_params
+
+    @property
+    def seed(self) -> Optional[int]:
+        """
+        Return the seed for the benchmark.
+        """
+        return self._task_config.seed
+
+    @property
+    def shuffle(self) -> bool:
+        """
+        Return whether to shuffle the dataset before evaluation.
+        """
+        return self._benchmark_meta.shuffle
+
+    @shuffle.setter
+    def shuffle(self, value: bool):
+        """
+        Set whether to shuffle the dataset before evaluation.
+        """
+        self._benchmark_meta.shuffle = value
+
+    @property
+    def shuffle_choices(self) -> bool:
+        """
+        Return whether to shuffle the choices in multiple-choice datasets.
+        """
+        return self._benchmark_meta.shuffle_choices
+
+    @shuffle_choices.setter
+    def shuffle_choices(self, value: bool):
+        """
+        Set whether to shuffle the choices in multiple-choice datasets.
+        """
+        self._benchmark_meta.shuffle_choices = value
+
+    @contextlib.contextmanager
+    def _temporary_attribute(self, attr_name: str, new_value):
+        """
+        Set a temporary value for an attribute and restore the original value after the context block.
+
+        Args:
+            attr_name: The name of the attribute to temporarily set.
+            new_value: The new value to set for the attribute.
+        """
+        had_attr = hasattr(self, attr_name)
+        original_value = getattr(self, attr_name, None) if had_attr else None
+
+        setattr(self, attr_name, new_value)
+        try:
+            yield
+        finally:
+            if had_attr:
+                setattr(self, attr_name, original_value)
+            else:
+                delattr(self, attr_name)
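The _temporary_attribute helper at the end of DataAdapter is a small context manager for overriding an attribute only for the duration of a with-block and restoring (or deleting) it afterwards. Below is a standalone sketch of the same pattern on a hypothetical class, kept independent of evalscope so it runs on its own; the class and attribute names are illustrative only.

import contextlib


class Example:
    def __init__(self):
        self.prompt_template = 'default template'

    @contextlib.contextmanager
    def _temporary_attribute(self, attr_name, new_value):
        # Same pattern as DataAdapter._temporary_attribute: stash, override, restore.
        had_attr = hasattr(self, attr_name)
        original_value = getattr(self, attr_name, None) if had_attr else None
        setattr(self, attr_name, new_value)
        try:
            yield
        finally:
            if had_attr:
                setattr(self, attr_name, original_value)
            else:
                delattr(self, attr_name)


adapter = Example()
with adapter._temporary_attribute('prompt_template', 'override for one subset'):
    print(adapter.prompt_template)  # -> 'override for one subset'
print(adapter.prompt_template)      # -> 'default template'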