evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +5 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +356 -0
- evalscope/api/benchmark/meta.py +121 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +262 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +378 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +275 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +243 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +155 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/app.py +3 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +26 -14
- evalscope/app/utils/data_utils.py +43 -27
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -14
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +7 -10
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +10 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +136 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +157 -57
- evalscope/constants.py +37 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +275 -419
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +47 -33
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +67 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +126 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +701 -0
- evalscope/perf/benchmark.py +4 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +15 -10
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +11 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -3
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +51 -35
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +33 -47
- evalscope/summarizer.py +1 -1
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +3 -2
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +142 -6
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +11 -7
- evalscope/utils/multi_choices.py +288 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
- tests/benchmark/test_eval.py +385 -0
- tests/benchmark/test_image_edit.py +65 -0
- tests/{aigc → benchmark}/test_t2i.py +22 -4
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +85 -47
- tests/cli/test_collection.py +20 -8
- tests/cli/test_custom.py +22 -15
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -2
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/__init__.py +0 -0
--- a/evalscope/benchmarks/ifeval/instructions.py
+++ b/evalscope/benchmarks/ifeval/instructions.py
@@ -21,7 +21,7 @@ import re
 import string
 from typing import Dict, Optional, Sequence, Union

-from
+from . import instructions_util

 _InstructionArgsDtype = Optional[Dict[str, Union[int, str, Sequence[str]]]]

@@ -140,8 +140,9 @@ class ResponseLanguageChecker(Instruction):
         if self._language is None:
             self._language = random.choice(list(_LANGUAGES.keys()))
         # TODO(tianjianlu): opens the description generation to more choices.
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'Your ENTIRE response should be in {language} language, no other ' + 'language is allowed.'
+        )
         return self._description_pattern.format(language=_LANGUAGES[self._language])

     def get_instruction_args(self):
@@ -197,8 +198,10 @@ class NumberOfSentences(Instruction):
         if relation is None:
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif relation not in _COMPARISON_RELATION:
-            raise ValueError(
-
+            raise ValueError(
+                'The supported relation for comparison must be in '
+                f'{_COMPARISON_RELATION}, but {relation} is given.'
+            )
         else:
             self._comparison_relation = relation

@@ -255,8 +258,10 @@ class PlaceholderChecker(Instruction):
         self._num_placeholders = num_placeholders
         if self._num_placeholders is None or self._num_placeholders < 0:
             self._num_placeholders = random.randint(1, _NUM_PLACEHOLDERS)
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'The response must contain at least {num_placeholders} placeholders '
+            + 'represented by square brackets, such as [address].'
+        )
         return self._description_pattern.format(num_placeholders=self._num_placeholders)

     def get_instruction_args(self):
@@ -298,9 +303,10 @@ class BulletListChecker(Instruction):
         self._num_bullets = num_bullets
         if self._num_bullets is None or self._num_bullets < 0:
             self._num_bullets = random.randint(1, _NUM_BULLETS)
-        self._description_pattern = (
-
-
+        self._description_pattern = (
+            'Your answer must contain exactly {num_bullets} bullet points. '
+            + 'Use the markdown bullet points such as:\n' + '* This is point 1. \n' + '* This is point 2'
+        )
         return self._description_pattern.format(num_bullets=self._num_bullets)

     def get_instruction_args(self):
@@ -379,8 +385,9 @@ class ConstrainedStartChecker(Instruction):
         self._starter = starter.strip() if isinstance(starter, str) else starter
         if self._starter is None:
             self._starter = random.choice(_STARTER_OPTIONS)
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'During the conversation, when it is your turn, ' + 'please always start with {starter}'
+        )
         return self._description_pattern.format(starter=self._starter)

     def get_instruction_args(self):
@@ -423,8 +430,10 @@ class HighlightSectionChecker(Instruction):
         if self._num_highlights is None or self._num_highlights < 0:
             self._num_highlights = random.randint(1, _NUM_HIGHLIGHTED_SECTIONS)

-        self._description_pattern = (
-
+        self._description_pattern = (
+            'Highlight at least {num_highlights} sections in your answer with '
+            + 'markdown, i.e. *highlighted section*.'
+        )

         return self._description_pattern.format(num_highlights=self._num_highlights)

@@ -482,9 +491,11 @@ class SectionChecker(Instruction):
         if self._num_sections is None or self._num_sections < 0:
             self._num_sections = random.randint(1, _NUM_SECTIONS)

-        self._description_pattern = (
-
-
+        self._description_pattern = (
+            'Your response must have {num_sections} sections. Mark the beginning '
+            + 'of each section with {section_spliter} X, such as:\n' + '{section_spliter} 1\n'
+            + '[content of section 1]\n' + '{section_spliter} 2\n' + '[content of section 2]'
+        )

         return self._description_pattern.format(num_sections=self._num_sections, section_spliter=self._section_spliter)

@@ -534,8 +545,9 @@ class ParagraphChecker(Instruction):
         if self._num_paragraphs is None or self._num_paragraphs < 0:
             self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS)

-        self._description_pattern = (
-
+        self._description_pattern = (
+            'There should be {num_paragraphs} paragraphs. ' + 'Paragraphs are separated with the markdown divider: ***'
+        )

         return self._description_pattern.format(num_paragraphs=self._num_paragraphs)

@@ -585,12 +597,14 @@ class PostscriptChecker(Instruction):
           A string representing the instruction description.
         """
         self._postscript_marker = (
-            postscript_marker.strip() if isinstance(postscript_marker, str) else postscript_marker
+            postscript_marker.strip() if isinstance(postscript_marker, str) else postscript_marker
+        )
         if self._postscript_marker is None:
             self._postscript_marker = random.choice(_POSTSCRIPT_MARKER)

-        self._description_pattern = (
-
+        self._description_pattern = (
+            'At the end of your response, please explicitly add a postscript ' + 'starting with {postscript}'
+        )

         return self._description_pattern.format(postscript=self._postscript_marker)

@@ -644,8 +658,10 @@ class RephraseChecker(Instruction):
                 'in the form of *change me*.')

         self._reference_without_change = original_message
-        self._description = (
-
+        self._description = (
+            'Rephrasing: Your rephrased response should only' + 'change the words/sentences in between two asterisks'
+            + 'such as *change me*.'
+        )
         return self._description

     def get_instruction_args(self):
@@ -757,13 +773,16 @@ class KeywordFrequencyChecker(Instruction):
         if relation is None:
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif relation not in _COMPARISON_RELATION:
-            raise ValueError(
-
+            raise ValueError(
+                'The supported relation for comparison must be in '
+                f'{_COMPARISON_RELATION}, but {relation} is given.'
+            )
         else:
             self._comparison_relation = relation

-        self._description_pattern = (
-
+        self._description_pattern = (
+            'In your response, the word {keyword} should appear {relation} ' + '{frequency} times.'
+        )

         return self._description_pattern.format(
             keyword=self._keyword,
@@ -819,8 +838,10 @@ class NumberOfWords(Instruction):
         if relation is None:
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif relation not in _COMPARISON_RELATION:
-            raise ValueError(
-
+            raise ValueError(
+                'The supported relation for comparison must be in '
+                f'{_COMPARISON_RELATION}, but {relation} is given.'
+            )
         else:
             self._comparison_relation = relation

@@ -850,8 +871,10 @@ class JsonFormat(Instruction):
     """Check the Json format."""

     def build_description(self):
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'Entire output should be wrapped in JSON format. You can use markdown'
+            ' ticks such as ```.'
+        )
         return self._description_pattern

     def get_instruction_args(self):
@@ -864,8 +887,9 @@ class JsonFormat(Instruction):

     def check_following(self, value):
         value = (
-            value.strip().removeprefix('```json').removeprefix('```Json').removeprefix('```JSON').removeprefix(
-
+            value.strip().removeprefix('```json').removeprefix('```Json').removeprefix('```JSON').removeprefix('```').
+            removesuffix('```').strip()
+        )
         try:
             json.loads(value)
         except ValueError:
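
The JsonFormat.check_following rewrite above strips a leading ```json / ```Json / ```JSON / ``` fence and a trailing ``` before parsing. A minimal standalone sketch of the same stripping chain (the helper name and sample strings are illustrative, not from the benchmark):

    import json

    def is_json(value: str) -> bool:
        # Strip an optional markdown code fence, then try to parse.
        value = (
            value.strip().removeprefix('```json').removeprefix('```Json').removeprefix('```JSON').removeprefix('```').
            removesuffix('```').strip()
        )
        try:
            json.loads(value)
        except ValueError:
            return False
        return True

    print(is_json('```json\n{"answer": 42}\n```'))  # True
    print(is_json('plain text'))                    # False
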
@@ -903,10 +927,12 @@ class ParagraphFirstWordCheck(Instruction):
             self._first_word = instructions_util.generate_keywords(num_keywords=1)[0]
         self._first_word = self._first_word.lower()

-        self._description_pattern = (
-
-
-
+        self._description_pattern = (
+            'There should be {num_paragraphs} paragraphs. '
+            + 'Paragraphs and only paragraphs are separated with each other by two '
+            + "new lines as if it was '\\n\\n' in python. "
+            + 'Paragraph {nth_paragraph} must start with word {first_word}.'
+        )

         return self._description_pattern.format(
             num_paragraphs=self._num_paragraphs,
@@ -1084,11 +1110,12 @@ class RephraseParagraph(Instruction):
         self._low = low
         self._high = high

-        self._description = (
-
-
-
-
+        self._description = (
+            'Rephrase the following paragraph: ' + '{original_paragraph}\nYour response should have '
+            + 'between {low} and {high} of the same words. ' + 'Words are the same if and only if all of the '
+            + 'letters, ignoring cases, are the same. For ' + "example, 'run' is the same as 'Run' but different "
+            + "to 'ran'."
+        )

         return self._description.format(original_paragraph=original_paragraph, low=self._low, high=self._high)

@@ -1123,8 +1150,10 @@ class TwoResponsesChecker(Instruction):

     def build_description(self):
         """Build the instruction description."""
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'Give two different responses. Responses and only responses should'
+            ' be separated by 6 asterisk symbols: ******.'
+        )
         return self._description_pattern

     def get_instruction_args(self):
@@ -1171,10 +1200,12 @@ class RepeatPromptThenAnswer(Instruction):
             raise ValueError('prompt_to_repeat must be set.')
         else:
             self._prompt_to_repeat = prompt_to_repeat
-        self._description_pattern = (
-
-
-
+        self._description_pattern = (
+            'First repeat the request word for word without change,'
+            ' then give your answer (1. do not say any words or characters'
+            ' before repeating the request; 2. the request you need to repeat'
+            ' does not include this sentence)'
+        )
         return self._description_pattern

     def get_instruction_args(self):
@@ -1205,8 +1236,10 @@ class EndChecker(Instruction):
         self._end_phrase = (end_phrase.strip() if isinstance(end_phrase, str) else end_phrase)
         if self._end_phrase is None:
             self._end_phrase = random.choice(_ENDING_OPTIONS)
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'Finish your response with this exact phrase {ender}. '
+            'No other words should follow this phrase.'
+        )
         return self._description_pattern.format(ender=self._end_phrase)

     def get_instruction_args(self):
@@ -1228,8 +1261,10 @@ class TitleChecker(Instruction):

     def build_description(self):
         """Build the instruction description."""
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'Your answer must contain a title, wrapped in double angular brackets,'
+            ' such as <<poem of joy>>.'
+        )
         return self._description_pattern

     def get_instruction_args(self):
@@ -1283,13 +1318,17 @@ class LetterFrequencyChecker(Instruction):
         if let_relation is None:
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif let_relation not in _COMPARISON_RELATION:
-            raise ValueError(
-
+            raise ValueError(
+                'The supported relation for comparison must be in '
+                f'{_COMPARISON_RELATION}, but {let_relation} is given.'
+            )
         else:
             self._comparison_relation = let_relation

-        self._description_pattern = (
-
+        self._description_pattern = (
+            'In your response, the letter {letter} should appear {let_relation}'
+            ' {let_frequency} times.'
+        )

         return self._description_pattern.format(
             letter=self._letter,
@@ -1352,8 +1391,10 @@ class LowercaseLettersEnglishChecker(Instruction):

     def build_description(self):
         """Build the instruction description."""
-        self._description_pattern = (
-
+        self._description_pattern = (
+            'Your entire response should be in English, and in all lowercase'
+            ' letters. No capital letters are allowed.'
+        )
         return self._description_pattern

     def get_instruction_args(self):
@@ -1422,11 +1463,15 @@ class CapitalWordFrequencyChecker(Instruction):
         if capital_relation is None:
             self._comparison_relation = random.choice(_COMPARISON_RELATION)
         elif capital_relation not in _COMPARISON_RELATION:
-            raise ValueError(
-
-
-
-
+            raise ValueError(
+                'The supported relation for comparison must be in '
+                f'{_COMPARISON_RELATION}, but {capital_relation} is given.'
+            )
+
+        self._description_pattern = (
+            'In your response, words with all capital letters should appear'
+            ' {relation} {frequency} times.'
+        )

         return self._description_pattern.format(frequency=self._frequency, relation=self._comparison_relation)

--- a/evalscope/benchmarks/ifeval/instructions_util.py
+++ b/evalscope/benchmarks/ifeval/instructions_util.py
@@ -14,7 +14,6 @@
 """Utility library of instructions."""

 import functools
-import immutabledict
 import nltk
 import os
 import random
@@ -1551,7 +1550,7 @@ WORD_LIST = [
 ]  # pylint: disable=line-too-long

 # ISO 639-1 codes to language names.
-LANGUAGE_CODES = immutabledict.immutabledict({
+LANGUAGE_CODES = {
     'en': 'English',
     'es': 'Spanish',
     'pt': 'Portuguese',
@@ -1582,7 +1581,7 @@ LANGUAGE_CODES = immutabledict.immutabledict({
     'pa': 'Punjabi',
     'ml': 'Malayalam',
     'fi': 'Finnish',
-})
+}

 _ALPHABETS = '([A-Za-z])'
 _PREFIXES = '(Mr|St|Mrs|Ms|Dr)[.]'
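
The instructions_util.py change above swaps immutabledict.immutabledict for a plain dict, dropping the third-party dependency at the cost of enforced immutability. A minimal illustration of the trade-off (assuming the immutabledict package for the "before" case):

    # Before: immutabledict raised on mutation.
    # import immutabledict
    # codes = immutabledict.immutabledict({'en': 'English'})
    # codes['fr'] = 'French'  # TypeError

    # After: a plain dict accepts mutation; callers must treat it as read-only by convention.
    LANGUAGE_CODES = {'en': 'English'}
    LANGUAGE_CODES['fr'] = 'French'  # silently allowed
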
--- a/evalscope/benchmarks/ifeval/utils.py
+++ b/evalscope/benchmarks/ifeval/utils.py
@@ -1,7 +1,7 @@
 import dataclasses
 from typing import Dict, Optional, Union

-from
+from . import instructions_registry


 @dataclasses.dataclass
@@ -121,14 +121,13 @@ def process_results(doc, results):
     out_loose = test_instruction_following_loose(inp, response)

     return {
-        '
-        '
-        '
-        '
+        'prompt_level_strict': float(out_strict.follow_all_instructions),
+        'inst_level_strict': agg_inst_level_acc(out_strict.follow_instruction_list),
+        'prompt_level_loose': float(out_loose.follow_all_instructions),
+        'inst_level_loose': agg_inst_level_acc(out_loose.follow_instruction_list),
     }


 def agg_inst_level_acc(items):
-
-    inst_level_acc = sum(flat_items) / len(flat_items)
+    inst_level_acc = sum(items) / len(items) if items else 0
     return inst_level_acc
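
The agg_inst_level_acc change above fixes a NameError (the old body referenced an undefined flat_items) and guards against an empty list. A quick sketch of the corrected behavior, with illustrative values:

    def agg_inst_level_acc(items):
        # Mean of per-instruction pass/fail booleans; 0 for an empty list.
        return sum(items) / len(items) if items else 0

    print(agg_inst_level_acc([True, True, False]))  # 0.666...
    print(agg_inst_level_acc([]))                   # 0
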
--- /dev/null
+++ b/evalscope/benchmarks/image_edit/gedit/gedit_adapter.py
@@ -0,0 +1,138 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import copy
+import os
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, ImageEditAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator.state import TaskState
+from evalscope.api.messages import ChatMessage, ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.metric.scorer import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import FileConstants, Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+SUBSET_LIST = [
+    'background_change', 'color_alter', 'material_alter', 'motion_change', 'ps_human', 'style_change', 'subject-add',
+    'subject-remove', 'subject-replace', 'text_change', 'tone_transfer'
+]
+
+LANGUAGE_LIST = ['en', 'cn']
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='gedit',
+        pretty_name='GEdit-Bench',
+        dataset_id='stepfun-ai/GEdit-Bench',
+        description='GEdit-Bench Image Editing Benchmark, grounded in real-world '
+        'usages is developed to support more authentic and '
+        'comprehensive evaluation of image editing models.',
+        tags=[Tags.IMAGE_EDITING],
+        subset_list=SUBSET_LIST,
+        metric_list=['Semantic Consistency', 'Perceptual Similarity'],
+        few_shot_num=0,
+        train_split=None,
+        eval_split='train',
+        extra_params={'language': f'# language of the instruction, choose from {LANGUAGE_LIST}, default to `en`'}
+    )
+)
+class GEditAdapter(ImageEditAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+        self.language = self.extra_params.get('language', 'en')
+        if self.language not in LANGUAGE_LIST:
+            logger.warning(f"Invalid language '{self.language}', fallback to 'en'")
+            self.language = 'en'
+        self.reformat_subset = True
+        self._use_llm_judge = True
+
+        self.load_prompt()
+
+    def load_prompt(self):
+        from . import vie_prompts
+
+        self.context = vie_prompts._context_no_delimit
+        self.SC_prompt = '\n'.join([
+            self.context, vie_prompts._prompts_0shot_two_image_edit_rule, vie_prompts._prompts_0shot_tie_rule_SC
+        ])
+        self.PQ_prompt = '\n'.join([self.context, vie_prompts._prompts_0shot_rule_PQ])
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        record = copy.deepcopy(record)
+
+        # Process instruction and image
+        instruction = record['instruction']
+        image_bytes = record['input_image']['bytes']
+        input_image = bytes_to_base64(image_bytes, format='png', add_header=True)
+        record['input_image'] = input_image
+        record[FileConstants.ID] = record['key']
+        del record['input_image_raw']
+
+        text_content = ContentText(text=instruction)
+        image_content = ContentImage(image=input_image)
+
+        messages: List[ChatMessage] = [
+            ChatMessageUser(content=[text_content, image_content]),
+        ]
+
+        return Sample(input=messages, subset_key=record['task_type'], metadata=record)
+
+    def sample_filter(self, sample: Sample) -> bool:
+        language = sample.metadata.get('instruction_language', 'en')
+        return super().sample_filter(sample) and language == self.language
+
+    def llm_match_score(self, original_prediction, filtered_prediction, reference, task_state: TaskState) -> Score:
+        import math
+
+        from .utils import mllm_output_to_dict
+
+        metadata = task_state.metadata
+        text_prompt = metadata['instruction']
+        input_image = metadata['input_image']  # base64 image
+        edited_image = metadata[FileConstants.IMAGE_PATH]  # local image path
+        _SC_prompt = self.SC_prompt.replace('<instruction>', text_prompt)
+
+        # Initialize the score object with prediction details
+        score = Score(
+            extracted_prediction=edited_image,
+            prediction=edited_image,
+        )
+
+        # Build prompts
+        SC_prompt_final = [
+            ChatMessageUser(
+                content=[
+                    ContentImage(image=input_image),
+                    ContentImage(image=edited_image),
+                    ContentText(text=_SC_prompt)
+                ]
+            )
+        ]
+        PQ_prompt_final = [
+            ChatMessageUser(content=[ContentImage(image=edited_image),
+                                     ContentText(text=self.PQ_prompt)])
+        ]
+
+        guess_if_cannot_parse = True
+        result_SC = self.llm_judge.judge(messages=SC_prompt_final)
+        result_PQ = self.llm_judge.judge(messages=PQ_prompt_final)
+        SC_dict = mllm_output_to_dict(result_SC, give_up_parsing=guess_if_cannot_parse)
+        PQ_dict = mllm_output_to_dict(result_PQ, give_up_parsing=guess_if_cannot_parse)
+
+        SC_score = min(SC_dict['score'])
+        PQ_score = min(PQ_dict['score'])
+        O_score = math.sqrt(SC_score * PQ_score)
+
+        score.value = {'Semantic Consistency': SC_score, 'Perceptual Quality': PQ_score, 'Overall': O_score}
+        score.main_score_name = 'Overall'
+        score.metadata = {
+            'SC_dict': SC_dict,
+            'PQ_dict': PQ_dict,
+        }
+        return score
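
The new adapter above registers GEdit-Bench under the name 'gedit' and scores each edit with a multimodal LLM judge, combining semantic consistency (SC) and perceptual quality (PQ) as a geometric mean: Overall = sqrt(SC * PQ). For orientation, a hedged sketch of how the benchmark could be invoked through evalscope's TaskConfig/run_task API; the model ID and judge settings are placeholders, and only the 'language' key comes from the extra_params declared above:

    from evalscope import TaskConfig, run_task

    task_cfg = TaskConfig(
        model='my-image-edit-model',  # placeholder image-editing model
        datasets=['gedit'],           # name registered by @register_benchmark above
        dataset_args={'gedit': {'extra_params': {'language': 'en'}}},
        judge_model_args={'model_id': 'qwen-vl-max'},  # placeholder multimodal judge
        limit=5,
    )
    run_task(task_cfg=task_cfg)
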