evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +5 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +356 -0
- evalscope/api/benchmark/meta.py +121 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +262 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +378 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +275 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +243 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +155 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/app.py +3 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +26 -14
- evalscope/app/utils/data_utils.py +43 -27
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -14
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +7 -10
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +10 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +136 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +157 -57
- evalscope/constants.py +37 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +275 -419
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +47 -33
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +67 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +126 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +701 -0
- evalscope/perf/benchmark.py +4 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +15 -10
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +11 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -3
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +51 -35
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +33 -47
- evalscope/summarizer.py +1 -1
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +3 -2
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +142 -6
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +11 -7
- evalscope/utils/multi_choices.py +288 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
- tests/benchmark/test_eval.py +385 -0
- tests/benchmark/test_image_edit.py +65 -0
- tests/{aigc → benchmark}/test_t2i.py +22 -4
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +85 -47
- tests/cli/test_collection.py +20 -8
- tests/cli/test_custom.py +22 -15
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -2
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/__init__.py +0 -0
--- a/evalscope/benchmarks/ifeval/instructions.py
+++ b/evalscope/benchmarks/ifeval/instructions.py
@@ -21,7 +21,7 @@ import re
 import string
 from typing import Dict, Optional, Sequence, Union

-from
+from . import instructions_util

 _InstructionArgsDtype = Optional[Dict[str, Union[int, str, Sequence[str]]]]

@@ -140,8 +140,9 @@ class ResponseLanguageChecker(Instruction):
         if self._language is None:
             self._language = random.choice(list(_LANGUAGES.keys()))
         # TODO(tianjianlu): opens the description generation to more choices.
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'Your ENTIRE response should be in {language} language, no other ' + 'language is allowed.'
+        )
         return self._description_pattern.format(language=_LANGUAGES[self._language])

     def get_instruction_args(self):
@@ -197,8 +198,10 @@ class NumberOfSentences(Instruction):
         if relation is None:
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif relation not in _COMPARISON_RELATION:
-            raise ValueError(
-
+            raise ValueError(
+                'The supported relation for comparison must be in '
+                f'{_COMPARISON_RELATION}, but {relation} is given.'
+            )
         else:
             self._comparison_relation = relation

@@ -255,8 +258,10 @@ class PlaceholderChecker(Instruction):
         self._num_placeholders = num_placeholders
         if self._num_placeholders is None or self._num_placeholders < 0:
             self._num_placeholders = random.randint(1, _NUM_PLACEHOLDERS)
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'The response must contain at least {num_placeholders} placeholders '
+            + 'represented by square brackets, such as [address].'
+        )
         return self._description_pattern.format(num_placeholders=self._num_placeholders)

     def get_instruction_args(self):
@@ -298,9 +303,10 @@ class BulletListChecker(Instruction):
         self._num_bullets = num_bullets
         if self._num_bullets is None or self._num_bullets < 0:
             self._num_bullets = random.randint(1, _NUM_BULLETS)
-        self._description_pattern = (
-
-
+        self._description_pattern = (
+            'Your answer must contain exactly {num_bullets} bullet points. '
+            + 'Use the markdown bullet points such as:\n' + '* This is point 1. \n' + '* This is point 2'
+        )
         return self._description_pattern.format(num_bullets=self._num_bullets)

     def get_instruction_args(self):
@@ -379,8 +385,9 @@ class ConstrainedStartChecker(Instruction):
         self._starter = starter.strip() if isinstance(starter, str) else starter
         if self._starter is None:
             self._starter = random.choice(_STARTER_OPTIONS)
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'During the conversation, when it is your turn, ' + 'please always start with {starter}'
+        )
         return self._description_pattern.format(starter=self._starter)

     def get_instruction_args(self):
@@ -423,8 +430,10 @@ class HighlightSectionChecker(Instruction):
         if self._num_highlights is None or self._num_highlights < 0:
             self._num_highlights = random.randint(1, _NUM_HIGHLIGHTED_SECTIONS)

-        self._description_pattern = (
-
+        self._description_pattern = (
+            'Highlight at least {num_highlights} sections in your answer with '
+            + 'markdown, i.e. *highlighted section*.'
+        )

         return self._description_pattern.format(num_highlights=self._num_highlights)

@@ -482,9 +491,11 @@ class SectionChecker(Instruction):
         if self._num_sections is None or self._num_sections < 0:
             self._num_sections = random.randint(1, _NUM_SECTIONS)

-        self._description_pattern = (
-
-
+        self._description_pattern = (
+            'Your response must have {num_sections} sections. Mark the beginning '
+            + 'of each section with {section_spliter} X, such as:\n' + '{section_spliter} 1\n'
+            + '[content of section 1]\n' + '{section_spliter} 2\n' + '[content of section 2]'
+        )

         return self._description_pattern.format(num_sections=self._num_sections, section_spliter=self._section_spliter)

@@ -534,8 +545,9 @@ class ParagraphChecker(Instruction):
         if self._num_paragraphs is None or self._num_paragraphs < 0:
             self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS)

-        self._description_pattern = (
-
+        self._description_pattern = (
+            'There should be {num_paragraphs} paragraphs. ' + 'Paragraphs are separated with the markdown divider: ***'
+        )

         return self._description_pattern.format(num_paragraphs=self._num_paragraphs)

@@ -585,12 +597,14 @@ class PostscriptChecker(Instruction):
           A string representing the instruction description.
         """
         self._postscript_marker = (
-            postscript_marker.strip() if isinstance(postscript_marker, str) else postscript_marker
+            postscript_marker.strip() if isinstance(postscript_marker, str) else postscript_marker
+        )
         if self._postscript_marker is None:
             self._postscript_marker = random.choice(_POSTSCRIPT_MARKER)

-        self._description_pattern = (
-
+        self._description_pattern = (
+            'At the end of your response, please explicitly add a postscript ' + 'starting with {postscript}'
+        )

         return self._description_pattern.format(postscript=self._postscript_marker)

@@ -644,8 +658,10 @@ class RephraseChecker(Instruction):
                 'in the form of *change me*.')

         self._reference_without_change = original_message
-        self._description = (
-
+        self._description = (
+            'Rephrasing: Your rephrased response should only' + 'change the words/sentences in between two asterisks'
+            + 'such as *change me*.'
+        )
         return self._description

     def get_instruction_args(self):
@@ -757,13 +773,16 @@ class KeywordFrequencyChecker(Instruction):
         if relation is None:
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif relation not in _COMPARISON_RELATION:
-            raise ValueError(
-
+            raise ValueError(
+                'The supported relation for comparison must be in '
+                f'{_COMPARISON_RELATION}, but {relation} is given.'
+            )
         else:
             self._comparison_relation = relation

-        self._description_pattern = (
-
+        self._description_pattern = (
+            'In your response, the word {keyword} should appear {relation} ' + '{frequency} times.'
+        )

         return self._description_pattern.format(
             keyword=self._keyword,
@@ -819,8 +838,10 @@ class NumberOfWords(Instruction):
         if relation is None:
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif relation not in _COMPARISON_RELATION:
-            raise ValueError(
-
+            raise ValueError(
+                'The supported relation for comparison must be in '
+                f'{_COMPARISON_RELATION}, but {relation} is given.'
+            )
         else:
             self._comparison_relation = relation

@@ -850,8 +871,10 @@ class JsonFormat(Instruction):
     """Check the Json format."""

     def build_description(self):
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'Entire output should be wrapped in JSON format. You can use markdown'
+            ' ticks such as ```.'
+        )
         return self._description_pattern

     def get_instruction_args(self):
@@ -864,8 +887,9 @@ class JsonFormat(Instruction):

     def check_following(self, value):
         value = (
-            value.strip().removeprefix('```json').removeprefix('```Json').removeprefix('```JSON').removeprefix(
-
+            value.strip().removeprefix('```json').removeprefix('```Json').removeprefix('```JSON').removeprefix('```').
+            removesuffix('```').strip()
+        )
         try:
             json.loads(value)
         except ValueError:
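
The JsonFormat.check_following rewrite above strips a leading ```json / ```Json / ```JSON / ``` fence and a trailing ``` before parsing. A minimal standalone sketch of the same stripping chain (the helper name and sample strings are illustrative, not from the benchmark):

    import json

    def is_json(value: str) -> bool:
        # Strip an optional markdown code fence, then try to parse.
        value = (
            value.strip().removeprefix('```json').removeprefix('```Json').removeprefix('```JSON').removeprefix('```').
            removesuffix('```').strip()
        )
        try:
            json.loads(value)
        except ValueError:
            return False
        return True

    print(is_json('```json\n{"answer": 42}\n```'))  # True
    print(is_json('plain text'))                    # False
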
@@ -903,10 +927,12 @@ class ParagraphFirstWordCheck(Instruction):
             self._first_word = instructions_util.generate_keywords(num_keywords=1)[0]
         self._first_word = self._first_word.lower()

-        self._description_pattern = (
-
-
-
+        self._description_pattern = (
+            'There should be {num_paragraphs} paragraphs. '
+            + 'Paragraphs and only paragraphs are separated with each other by two '
+            + "new lines as if it was '\\n\\n' in python. "
+            + 'Paragraph {nth_paragraph} must start with word {first_word}.'
+        )

         return self._description_pattern.format(
             num_paragraphs=self._num_paragraphs,
@@ -1084,11 +1110,12 @@ class RephraseParagraph(Instruction):
         self._low = low
         self._high = high

-        self._description = (
-
-
-
-
+        self._description = (
+            'Rephrase the following paragraph: ' + '{original_paragraph}\nYour response should have '
+            + 'between {low} and {high} of the same words. ' + 'Words are the same if and only if all of the '
+            + 'letters, ignoring cases, are the same. For ' + "example, 'run' is the same as 'Run' but different "
+            + "to 'ran'."
+        )

         return self._description.format(original_paragraph=original_paragraph, low=self._low, high=self._high)

@@ -1123,8 +1150,10 @@ class TwoResponsesChecker(Instruction):

     def build_description(self):
         """Build the instruction description."""
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'Give two different responses. Responses and only responses should'
+            ' be separated by 6 asterisk symbols: ******.'
+        )
         return self._description_pattern

     def get_instruction_args(self):
@@ -1171,10 +1200,12 @@ class RepeatPromptThenAnswer(Instruction):
             raise ValueError('prompt_to_repeat must be set.')
         else:
             self._prompt_to_repeat = prompt_to_repeat
-        self._description_pattern = (
-
-
-
+        self._description_pattern = (
+            'First repeat the request word for word without change,'
+            ' then give your answer (1. do not say any words or characters'
+            ' before repeating the request; 2. the request you need to repeat'
+            ' does not include this sentence)'
+        )
         return self._description_pattern

     def get_instruction_args(self):
@@ -1205,8 +1236,10 @@ class EndChecker(Instruction):
         self._end_phrase = (end_phrase.strip() if isinstance(end_phrase, str) else end_phrase)
         if self._end_phrase is None:
             self._end_phrase = random.choice(_ENDING_OPTIONS)
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'Finish your response with this exact phrase {ender}. '
+            'No other words should follow this phrase.'
+        )
         return self._description_pattern.format(ender=self._end_phrase)

     def get_instruction_args(self):
@@ -1228,8 +1261,10 @@ class TitleChecker(Instruction):

     def build_description(self):
         """Build the instruction description."""
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'Your answer must contain a title, wrapped in double angular brackets,'
+            ' such as <<poem of joy>>.'
+        )
         return self._description_pattern

     def get_instruction_args(self):
@@ -1283,13 +1318,17 @@ class LetterFrequencyChecker(Instruction):
         if let_relation is None:
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif let_relation not in _COMPARISON_RELATION:
-            raise ValueError(
-
+            raise ValueError(
+                'The supported relation for comparison must be in '
+                f'{_COMPARISON_RELATION}, but {let_relation} is given.'
+            )
         else:
             self._comparison_relation = let_relation

-        self._description_pattern = (
-
+        self._description_pattern = (
+            'In your response, the letter {letter} should appear {let_relation}'
+            ' {let_frequency} times.'
+        )

         return self._description_pattern.format(
             letter=self._letter,
@@ -1352,8 +1391,10 @@ class LowercaseLettersEnglishChecker(Instruction):

     def build_description(self):
         """Build the instruction description."""
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'Your entire response should be in English, and in all lowercase'
+            ' letters. No capital letters are allowed.'
+        )
         return self._description_pattern

     def get_instruction_args(self):
@@ -1422,11 +1463,15 @@ class CapitalWordFrequencyChecker(Instruction):
         if capital_relation is None:
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif capital_relation not in _COMPARISON_RELATION:
-            raise ValueError(
-
-
-
-
+            raise ValueError(
+                'The supported relation for comparison must be in '
+                f'{_COMPARISON_RELATION}, but {capital_relation} is given.'
+            )
+
+        self._description_pattern = (
+            'In your response, words with all capital letters should appear'
+            ' {relation} {frequency} times.'
+        )

         return self._description_pattern.format(frequency=self._frequency, relation=self._comparison_relation)

--- a/evalscope/benchmarks/ifeval/instructions_util.py
+++ b/evalscope/benchmarks/ifeval/instructions_util.py
@@ -14,7 +14,6 @@
 """Utility library of instructions."""

 import functools
-import immutabledict
 import nltk
 import os
 import random
@@ -1551,7 +1550,7 @@ WORD_LIST = [
 ]  # pylint: disable=line-too-long

 # ISO 639-1 codes to language names.
-LANGUAGE_CODES = immutabledict.immutabledict({
+LANGUAGE_CODES = {
     'en': 'English',
     'es': 'Spanish',
     'pt': 'Portuguese',
@@ -1582,7 +1581,7 @@ LANGUAGE_CODES = immutabledict.immutabledict({
     'pa': 'Punjabi',
     'ml': 'Malayalam',
     'fi': 'Finnish',
-})
+}

 _ALPHABETS = '([A-Za-z])'
 _PREFIXES = '(Mr|St|Mrs|Ms|Dr)[.]'
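
The instructions_util.py change above swaps immutabledict.immutabledict for a plain dict, dropping the third-party dependency at the cost of enforced immutability. A minimal illustration of the trade-off (assuming the immutabledict package for the "before" case):

    # Before: immutabledict raised on mutation.
    # import immutabledict
    # codes = immutabledict.immutabledict({'en': 'English'})
    # codes['fr'] = 'French'  # TypeError

    # After: a plain dict accepts mutation; callers must treat it as read-only by convention.
    LANGUAGE_CODES = {'en': 'English'}
    LANGUAGE_CODES['fr'] = 'French'  # silently allowed
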
--- a/evalscope/benchmarks/ifeval/utils.py
+++ b/evalscope/benchmarks/ifeval/utils.py
@@ -1,7 +1,7 @@
 import dataclasses
 from typing import Dict, Optional, Union

-from
+from . import instructions_registry


 @dataclasses.dataclass
@@ -121,14 +121,13 @@ def process_results(doc, results):
     out_loose = test_instruction_following_loose(inp, response)

     return {
-        '
-        '
-        '
-        '
+        'prompt_level_strict': float(out_strict.follow_all_instructions),
+        'inst_level_strict': agg_inst_level_acc(out_strict.follow_instruction_list),
+        'prompt_level_loose': float(out_loose.follow_all_instructions),
+        'inst_level_loose': agg_inst_level_acc(out_loose.follow_instruction_list),
     }


 def agg_inst_level_acc(items):
-
-    inst_level_acc = sum(flat_items) / len(flat_items)
+    inst_level_acc = sum(items) / len(items) if items else 0
     return inst_level_acc
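
The agg_inst_level_acc change above fixes a NameError (the old body referenced an undefined flat_items) and guards against an empty list. A quick sketch of the corrected behavior, with illustrative values:

    def agg_inst_level_acc(items):
        # Mean of per-instruction pass/fail booleans; 0 for an empty list.
        return sum(items) / len(items) if items else 0

    print(agg_inst_level_acc([True, True, False]))  # 0.666...
    print(agg_inst_level_acc([]))                   # 0
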
--- /dev/null
+++ b/evalscope/benchmarks/image_edit/gedit/gedit_adapter.py
@@ -0,0 +1,138 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import copy
+import os
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, ImageEditAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator.state import TaskState
+from evalscope.api.messages import ChatMessage, ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.metric.scorer import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import FileConstants, Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+SUBSET_LIST = [
+    'background_change', 'color_alter', 'material_alter', 'motion_change', 'ps_human', 'style_change', 'subject-add',
+    'subject-remove', 'subject-replace', 'text_change', 'tone_transfer'
+]
+
+LANGUAGE_LIST = ['en', 'cn']
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='gedit',
+        pretty_name='GEdit-Bench',
+        dataset_id='stepfun-ai/GEdit-Bench',
+        description='GEdit-Bench Image Editing Benchmark, grounded in real-world '
+        'usages is developed to support more authentic and '
+        'comprehensive evaluation of image editing models.',
+        tags=[Tags.IMAGE_EDITING],
+        subset_list=SUBSET_LIST,
+        metric_list=['Semantic Consistency', 'Perceptual Similarity'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='train',
+        extra_params={'language': f'# language of the instruction, choose from {LANGUAGE_LIST}, default to `en`'}
+    )
+)
+class GEditAdapter(ImageEditAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.language = self.extra_params.get('language', 'en')
+        if self.language not in LANGUAGE_LIST:
+            logger.warning(f"Invalid language '{self.language}', fallback to 'en'")
+            self.language = 'en'
+        self.reformat_subset = True
+        self._use_llm_judge = True
+
+        self.load_prompt()
+
+    def load_prompt(self):
+        from . import vie_prompts
+
+        self.context = vie_prompts._context_no_delimit
+        self.SC_prompt = '\n'.join([
+            self.context, vie_prompts._prompts_0shot_two_image_edit_rule, vie_prompts._prompts_0shot_tie_rule_SC
+        ])
+        self.PQ_prompt = '\n'.join([self.context, vie_prompts._prompts_0shot_rule_PQ])
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        record = copy.deepcopy(record)
+
+        # Process instruction and image
+        instruction = record['instruction']
+        image_bytes = record['input_image']['bytes']
+        input_image = bytes_to_base64(image_bytes, format='png', add_header=True)
+        record['input_image'] = input_image
+        record[FileConstants.ID] = record['key']
+        del record['input_image_raw']
+
+        text_content = ContentText(text=instruction)
+        image_content = ContentImage(image=input_image)
+
+        messages: List[ChatMessage] = [
+            ChatMessageUser(content=[text_content, image_content]),
+        ]
+
+        return Sample(input=messages, subset_key=record['task_type'], metadata=record)
+
+    def sample_filter(self, sample: Sample) -> bool:
+        language = sample.metadata.get('instruction_language', 'en')
+        return super().sample_filter(sample) and language == self.language
+
+    def llm_match_score(self, original_prediction, filtered_prediction, reference, task_state: TaskState) -> Score:
+        import math
+
+        from .utils import mllm_output_to_dict
+
+        metadata = task_state.metadata
+        text_prompt = metadata['instruction']
+        input_image = metadata['input_image']  # base64 image
+        edited_image = metadata[FileConstants.IMAGE_PATH]  # local image path
+        _SC_prompt = self.SC_prompt.replace('<instruction>', text_prompt)
+
+        # Initialize the score object with prediction details
+        score = Score(
+            extracted_prediction=edited_image,
+            prediction=edited_image,
+        )
+
+        # Build prompts
+        SC_prompt_final = [
+            ChatMessageUser(
+                content=[
+                    ContentImage(image=input_image),
+                    ContentImage(image=edited_image),
+                    ContentText(text=_SC_prompt)
+                ]
+            )
+        ]
+        PQ_prompt_final = [
+            ChatMessageUser(content=[ContentImage(image=edited_image),
+                                     ContentText(text=self.PQ_prompt)])
+        ]
+
+        guess_if_cannot_parse = True
+        result_SC = self.llm_judge.judge(messages=SC_prompt_final)
+        result_PQ = self.llm_judge.judge(messages=PQ_prompt_final)
+        SC_dict = mllm_output_to_dict(result_SC, give_up_parsing=guess_if_cannot_parse)
+        PQ_dict = mllm_output_to_dict(result_PQ, give_up_parsing=guess_if_cannot_parse)
+
+        SC_score = min(SC_dict['score'])
+        PQ_score = min(PQ_dict['score'])
+        O_score = math.sqrt(SC_score * PQ_score)
+
+        score.value = {'Semantic Consistency': SC_score, 'Perceptual Quality': PQ_score, 'Overall': O_score}
+        score.main_score_name = 'Overall'
+        score.metadata = {
+            'SC_dict': SC_dict,
+            'PQ_dict': PQ_dict,
+        }
+        return score
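
The new adapter above registers GEdit-Bench under the name 'gedit' and scores each edit with a multimodal LLM judge, combining semantic consistency (SC) and perceptual quality (PQ) as a geometric mean: Overall = sqrt(SC * PQ). For orientation, a hedged sketch of how the benchmark could be invoked through evalscope's TaskConfig/run_task API; the model ID and judge settings are placeholders, and only the 'language' key comes from the extra_params declared above:

    from evalscope import TaskConfig, run_task

    task_cfg = TaskConfig(
        model='my-image-edit-model',  # placeholder image-editing model
        datasets=['gedit'],           # name registered by @register_benchmark above
        dataset_args={'gedit': {'extra_params': {'language': 'en'}}},
        judge_model_args={'model_id': 'qwen-vl-max'},  # placeholder multimodal judge
        limit=5,
    )
    run_task(task_cfg=task_cfg)
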