PyPI - evalscope - Versions diffs - 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl - Mend

evalscope-1.0.0-py3-none-any.whl → evalscope-1.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (324)

evalscope/api/benchmark/__init__.py +9 -1
evalscope/api/benchmark/adapters/__init__.py +4 -0
evalscope/api/benchmark/adapters/agent_adapter.py +8 -0
evalscope/api/benchmark/adapters/default_data_adapter.py +75 -4
evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
evalscope/api/benchmark/adapters/text2image_adapter.py +12 -10
evalscope/api/benchmark/adapters/vision_language_adapter.py +8 -0
evalscope/api/benchmark/benchmark.py +85 -2
evalscope/api/benchmark/meta.py +10 -1
evalscope/api/dataset/dataset.py +27 -6
evalscope/api/dataset/loader.py +8 -3
evalscope/api/evaluator/cache.py +31 -4
evalscope/api/evaluator/evaluator.py +5 -0
evalscope/api/evaluator/state.py +17 -1
evalscope/api/messages/__init__.py +1 -0
evalscope/api/messages/chat_message.py +52 -2
evalscope/api/metric/__init__.py +1 -1
evalscope/api/metric/metric.py +6 -1
evalscope/api/metric/scorer.py +15 -7
evalscope/api/mixin/__init__.py +1 -1
evalscope/api/mixin/llm_judge_mixin.py +2 -0
evalscope/api/mixin/sandbox_mixin.py +182 -0
evalscope/api/model/generate_config.py +10 -6
evalscope/api/model/model.py +5 -2
evalscope/api/tool/tool_info.py +1 -1
evalscope/app/app.py +3 -0
evalscope/app/ui/multi_model.py +6 -1
evalscope/app/ui/single_model.py +11 -5
evalscope/app/utils/data_utils.py +8 -7
evalscope/app/utils/env_utils.py +12 -0
evalscope/app/utils/text_utils.py +14 -12
evalscope/app/utils/visualization.py +2 -2
evalscope/arguments.py +8 -4
evalscope/backend/opencompass/backend_manager.py +0 -2
evalscope/backend/rag_eval/utils/embedding.py +9 -1
evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
evalscope/benchmarks/aime/aime24_adapter.py +5 -0
evalscope/benchmarks/aime/aime25_adapter.py +136 -1
evalscope/benchmarks/aime/grader.py +307 -0
evalscope/benchmarks/aime/math_normalize.py +189 -0
evalscope/benchmarks/amc/amc_adapter.py +51 -0
evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -0
evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
evalscope/benchmarks/bfcl/{bfcl_adapter.py → v3/bfcl_v3_adapter.py} +131 -19
evalscope/benchmarks/bfcl/{generation.py → v3/generation.py} +9 -9
evalscope/benchmarks/bfcl/v3/utils.py +23 -0
evalscope/benchmarks/bfcl/v4/__init__.py +0 -0
evalscope/benchmarks/bfcl/v4/bfcl_v4_adapter.py +229 -0
evalscope/benchmarks/bfcl/v4/utils.py +410 -0
evalscope/benchmarks/biomix_qa/__init__.py +0 -0
evalscope/benchmarks/biomix_qa/biomix_qa_adapter.py +36 -0
evalscope/benchmarks/blink/__init__.py +0 -0
evalscope/benchmarks/blink/blink_adapter.py +61 -0
evalscope/benchmarks/ceval/ceval_adapter.py +1 -2
evalscope/benchmarks/chartqa/__init__.py +0 -0
evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
evalscope/benchmarks/chartqa/utils.py +38 -0
evalscope/benchmarks/coin_flip/__init__.py +0 -0
evalscope/benchmarks/coin_flip/coin_flip_adapter.py +128 -0
evalscope/benchmarks/commonsense_qa/__init__.py +0 -0
evalscope/benchmarks/commonsense_qa/commonsense_qa_adapter.py +32 -0
evalscope/benchmarks/competition_math/competition_math_adapter.py +5 -0
evalscope/benchmarks/data_collection/data_collection_adapter.py +24 -19
evalscope/benchmarks/docvqa/__init__.py +0 -0
evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
evalscope/benchmarks/drivelology/__init__.py +0 -0
evalscope/benchmarks/drivelology/drivelology_binary_adapter.py +170 -0
evalscope/benchmarks/drivelology/drivelology_multilabel_adapter.py +254 -0
evalscope/benchmarks/drivelology/drivelology_selection_adapter.py +49 -0
evalscope/benchmarks/drivelology/drivelology_writing_adapter.py +218 -0
evalscope/benchmarks/drop/drop_adapter.py +15 -44
evalscope/benchmarks/drop/utils.py +97 -0
evalscope/benchmarks/frames/frames_adapter.py +2 -1
evalscope/benchmarks/general_arena/general_arena_adapter.py +7 -2
evalscope/benchmarks/general_arena/utils.py +2 -1
evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
evalscope/benchmarks/gsm8k/gsm8k_adapter.py +25 -9
evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +159 -0
evalscope/benchmarks/halu_eval/__init__.py +0 -0
evalscope/benchmarks/halu_eval/halu_eval_adapter.py +128 -0
evalscope/benchmarks/halu_eval/halu_eval_instructions.py +84 -0
evalscope/benchmarks/healthbench/__init__.py +0 -0
evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
evalscope/benchmarks/healthbench/utils.py +102 -0
evalscope/benchmarks/hle/hle_adapter.py +3 -2
evalscope/benchmarks/humaneval/humaneval_adapter.py +24 -52
evalscope/benchmarks/humaneval/utils.py +235 -0
evalscope/benchmarks/ifeval/instructions_util.py +2 -3
evalscope/benchmarks/image_edit/__init__.py +0 -0
evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
evalscope/benchmarks/infovqa/__init__.py +0 -0
evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +66 -54
evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
evalscope/benchmarks/logi_qa/__int__.py +0 -0
evalscope/benchmarks/logi_qa/logi_qa_adapter.py +41 -0
evalscope/benchmarks/math_500/math_500_adapter.py +5 -1
evalscope/benchmarks/math_qa/__init__.py +0 -0
evalscope/benchmarks/math_qa/math_qa_adapter.py +35 -0
evalscope/benchmarks/math_verse/__init__.py +0 -0
evalscope/benchmarks/math_verse/math_verse_adapter.py +105 -0
evalscope/benchmarks/math_vision/__init__.py +0 -0
evalscope/benchmarks/math_vision/math_vision_adapter.py +116 -0
evalscope/benchmarks/math_vista/__init__.py +0 -0
evalscope/benchmarks/math_vista/math_vista_adapter.py +114 -0
evalscope/benchmarks/med_mcqa/__init__.py +0 -0
evalscope/benchmarks/med_mcqa/med_mcqa_adapter.py +32 -0
evalscope/benchmarks/minerva_math/__init__.py +0 -0
evalscope/benchmarks/minerva_math/minerva_math_adapter.py +53 -0
evalscope/benchmarks/mm_bench/__init__.py +0 -0
evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
evalscope/benchmarks/mm_star/__init__.py +0 -0
evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
evalscope/benchmarks/mmmu/__init__.py +0 -0
evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +124 -0
evalscope/benchmarks/mri_mcqa/__init__.py +0 -0
evalscope/benchmarks/mri_mcqa/mri_mcqa_adapter.py +34 -0
evalscope/benchmarks/multi_if/__init__.py +0 -0
evalscope/benchmarks/multi_if/ifeval.py +3354 -0
evalscope/benchmarks/multi_if/metrics.py +120 -0
evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
evalscope/benchmarks/music_trivia/__init__.py +0 -0
evalscope/benchmarks/music_trivia/music_trivia_adapter.py +36 -0
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +7 -6
evalscope/benchmarks/ner/__init__.py +0 -0
evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
evalscope/benchmarks/ner/copious_adapter.py +85 -0
evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
evalscope/benchmarks/ocr_bench/__init__.py +0 -0
evalscope/benchmarks/ocr_bench/ocr_bench/__init__.py +0 -0
evalscope/benchmarks/ocr_bench/ocr_bench/ocr_bench_adapter.py +101 -0
evalscope/benchmarks/ocr_bench/ocr_bench_v2/IoUscore_metric.py +87 -0
evalscope/benchmarks/ocr_bench/ocr_bench_v2/TEDS_metric.py +963 -0
evalscope/benchmarks/ocr_bench/ocr_bench_v2/__init__.py +0 -0
evalscope/benchmarks/ocr_bench/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
evalscope/benchmarks/ocr_bench/ocr_bench_v2/page_ocr_metric.py +50 -0
evalscope/benchmarks/ocr_bench/ocr_bench_v2/parallel.py +46 -0
evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/__init__.py +0 -0
evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/readme.txt +26 -0
evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_eval/script.py +481 -0
evalscope/benchmarks/ocr_bench/ocr_bench_v2/spotting_metric.py +179 -0
evalscope/benchmarks/ocr_bench/ocr_bench_v2/utils.py +433 -0
evalscope/benchmarks/ocr_bench/ocr_bench_v2/vqa_metric.py +254 -0
evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
evalscope/benchmarks/olympiad_bench/utils.py +565 -0
evalscope/benchmarks/omni_bench/__init__.py +0 -0
evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
evalscope/benchmarks/piqa/__init__.py +0 -0
evalscope/benchmarks/piqa/piqa_adapter.py +32 -0
evalscope/benchmarks/poly_math/__init__.py +0 -0
evalscope/benchmarks/poly_math/poly_math_adapter.py +132 -0
evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
evalscope/benchmarks/pope/__init__.py +0 -0
evalscope/benchmarks/pope/pope_adapter.py +112 -0
evalscope/benchmarks/process_bench/process_bench_adapter.py +1 -0
evalscope/benchmarks/pumed_qa/__init__.py +0 -0
evalscope/benchmarks/pumed_qa/pubmed_qa_adapter.py +175 -0
evalscope/benchmarks/qasc/__init__.py +0 -0
evalscope/benchmarks/qasc/qasc_adapter.py +35 -0
evalscope/benchmarks/real_world_qa/__init__.py +0 -0
evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
evalscope/benchmarks/sciq/__init__.py +0 -0
evalscope/benchmarks/sciq/sciq_adapter.py +36 -0
evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -1
evalscope/benchmarks/simple_vqa/__init__.py +0 -0
evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
evalscope/benchmarks/siqa/__init__.py +0 -0
evalscope/benchmarks/siqa/siqa_adapter.py +39 -0
evalscope/benchmarks/tau_bench/tau2_bench/__init__.py +0 -0
evalscope/benchmarks/tau_bench/tau2_bench/generation.py +158 -0
evalscope/benchmarks/tau_bench/tau2_bench/tau2_bench_adapter.py +146 -0
evalscope/benchmarks/tau_bench/tau_bench/__init__.py +0 -0
evalscope/benchmarks/tau_bench/{generation.py → tau_bench/generation.py} +1 -1
evalscope/benchmarks/tau_bench/{tau_bench_adapter.py → tau_bench/tau_bench_adapter.py} +29 -29
evalscope/benchmarks/text2image/__init__.py +0 -0
evalscope/benchmarks/{aigc/t2i → text2image}/evalmuse_adapter.py +3 -1
evalscope/benchmarks/{aigc/t2i → text2image}/genai_bench_adapter.py +2 -2
evalscope/benchmarks/{aigc/t2i → text2image}/general_t2i_adapter.py +1 -1
evalscope/benchmarks/{aigc/t2i → text2image}/hpdv2_adapter.py +7 -2
evalscope/benchmarks/{aigc/t2i → text2image}/tifa_adapter.py +1 -0
evalscope/benchmarks/tool_bench/tool_bench_adapter.py +3 -3
evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +1 -2
evalscope/benchmarks/visu_logic/__init__.py +0 -0
evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
evalscope/benchmarks/wmt/__init__.py +0 -0
evalscope/benchmarks/wmt/wmt24_adapter.py +294 -0
evalscope/benchmarks/zerobench/__init__.py +0 -0
evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
evalscope/cli/start_app.py +7 -1
evalscope/cli/start_perf.py +7 -1
evalscope/config.py +103 -18
evalscope/constants.py +18 -0
evalscope/evaluator/evaluator.py +138 -82
evalscope/metrics/bert_score/__init__.py +0 -0
evalscope/metrics/bert_score/scorer.py +338 -0
evalscope/metrics/bert_score/utils.py +697 -0
evalscope/metrics/llm_judge.py +19 -7
evalscope/metrics/math_parser.py +14 -0
evalscope/metrics/metric.py +317 -13
evalscope/metrics/metrics.py +37 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
evalscope/models/image_edit_model.py +125 -0
evalscope/models/model_apis.py +22 -0
evalscope/models/openai_compatible.py +21 -0
evalscope/models/text2image_model.py +2 -2
evalscope/models/utils/openai.py +16 -6
evalscope/perf/arguments.py +26 -4
evalscope/perf/benchmark.py +76 -89
evalscope/perf/http_client.py +31 -16
evalscope/perf/main.py +15 -2
evalscope/perf/plugin/api/base.py +9 -7
evalscope/perf/plugin/api/custom_api.py +13 -58
evalscope/perf/plugin/api/default_api.py +188 -79
evalscope/perf/plugin/api/openai_api.py +85 -20
evalscope/perf/plugin/datasets/base.py +21 -0
evalscope/perf/plugin/datasets/custom.py +2 -3
evalscope/perf/plugin/datasets/flickr8k.py +2 -2
evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
evalscope/perf/plugin/datasets/line_by_line.py +2 -3
evalscope/perf/plugin/datasets/longalpaca.py +2 -3
evalscope/perf/plugin/datasets/openqa.py +2 -4
evalscope/perf/plugin/datasets/random_dataset.py +1 -3
evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
evalscope/perf/utils/benchmark_util.py +43 -27
evalscope/perf/utils/db_util.py +14 -19
evalscope/perf/utils/local_server.py +3 -44
evalscope/perf/utils/log_utils.py +21 -6
evalscope/report/__init__.py +13 -3
evalscope/report/combinator.py +91 -20
evalscope/report/generator.py +8 -87
evalscope/report/report.py +8 -4
evalscope/run.py +13 -5
evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
evalscope/utils/argument_utils.py +1 -1
evalscope/utils/chat_service.py +1 -1
evalscope/utils/function_utils.py +249 -12
evalscope/utils/import_utils.py +73 -1
evalscope/utils/io_utils.py +132 -7
evalscope/utils/json_schema.py +25 -2
evalscope/utils/logger.py +69 -18
evalscope/utils/model_utils.py +4 -3
evalscope/utils/multi_choices.py +39 -7
evalscope/utils/ner.py +377 -0
evalscope/version.py +2 -2
{evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/METADATA +252 -408
{evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/RECORD +290 -154
{evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/WHEEL +1 -1
{evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/top_level.txt +0 -1
evalscope/api/mixin/dataset_mixin.py +0 -105
evalscope/benchmarks/aigc/i2i/general_i2i_adapter.py +0 -44
tests/__init__.py +0 -1
tests/aigc/__init__.py +0 -1
tests/aigc/test_t2i.py +0 -142
tests/benchmark/__init__.py +0 -1
tests/benchmark/test_eval.py +0 -386
tests/cli/__init__.py +0 -1
tests/cli/test_all.py +0 -229
tests/cli/test_collection.py +0 -96
tests/cli/test_custom.py +0 -268
tests/perf/__init__.py +0 -1
tests/perf/test_perf.py +0 -176
tests/rag/test_clip_benchmark.py +0 -90
tests/rag/test_mteb.py +0 -213
tests/rag/test_ragas.py +0 -128
tests/swift/__init__.py +0 -1
tests/swift/test_run_swift_eval.py +0 -146
tests/swift/test_run_swift_vlm_eval.py +0 -128
tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
tests/test_run_all.py +0 -12
tests/utils.py +0 -13
tests/vlm/__init__.py +0 -1
tests/vlm/test_vlmeval.py +0 -102
/evalscope/benchmarks/{aigc → aa_lcr}/__init__.py +0 -0
/evalscope/benchmarks/{aigc/i2i → ai2d}/__init__.py +0 -0
/evalscope/benchmarks/{aigc/t2i → amc}/__init__.py +0 -0
{tests/rag → evalscope/benchmarks/bfcl/v3}/__init__.py +0 -0
{evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info}/entry_points.txt +0 -0
{evalscope-1.0.0.dist-info → evalscope-1.2.0.dist-info/licenses}/LICENSE +0 -0

evalscope/benchmarks/image_edit/gedit/vie_prompts.py ADDED Viewed

@@ -0,0 +1,406 @@
+# flake8: noqa: E501
+# This file is generated automatically through parse_prompt.py
+_context_no_delimit = """You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules.
+All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials.
+You will have to give your output in this way (Keep your reasoning concise and short.):
+{
+"score" : [...],
+"reasoning" : "..."
+}"""
+_context = """You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules.
+All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials.
+You will have to give your output in this way (the delimiter is necessary. Keep your reasoning concise and short.):
+||V^=^V||
+{
+"score" :
+"reasoning" :
+}
+||V^=^V||"""
+_context_no_format = """You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules.
+All the input images are AI-generated. All human in the images are AI-generated too. so you need not worry about the privacy confidentials."""
+_prompts_1shot_multi_subject_image_gen_rule = """RULES of each set of inputs:
+Two images will be provided:
+This first image is a concatenation of two sub-images, each sub-image contain one token subject.
+The second image being an AI-generated image using the first image as guidance.
+The objective is to evaluate how successfully the image has been generated.
+"""
+_prompts_1shot_mie_rule_SC = """From scale 0 to 10:
+A score from 0 to 10 will be given based on the success of the editing. (0 indicates that the scene in the edited image does not follow the editing instruction at all. 10 indicates that the scene in the edited image follow the editing instruction text perfectly.)
+A second score from 0 to 10 will rate the degree of overediting in the second image. (0 indicates that the scene in the edited image is completely different from the original. 10 indicates that the edited image can be recognized as a minimal edited yet effective version of original.)
+Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the editing success and 'score2' evaluates the degree of overediting.
+First lets look at the first set of input (1st and 2nd images) as an example.
+Editing instruction: What if the man had a hat?
+Output:
+||V^=^V||
+{
+"score" : [5, 10],
+"reasoning" :  "The hat exists but does not suit well. The hat also looks distorted. But it is a good edit because only a hat is added and the background is persevered."
+}
+||V^=^V||
+Now evaluate the second set of input (3th, 4th images).
+Editing instruction: <instruction>
+"""
+_prompts_1shot_msdig_rule_SC = """From scale 0 to 10:
+A score from 0 to 10 will be given based on the success in following the prompt.
+(0 indicates that the second image does not follow the prompt at all. 10 indicates the second image follows the prompt perfectly.)
+A second score from 0 to 10 will rate how well the subject in the generated image resemble to the token subject in the first sub-image.
+(0 indicates that the subject in the second image does not look like the token subject in the first sub-image at all. 10 indicates the subject in the second image look exactly alike the token subject in the first sub-image.)
+A third score from 0 to 10 will rate how well the subject in the generated image resemble to the token subject in the second sub-image.
+(0 indicates that the subject in the second image does not look like the token subject in the second sub-image at all. 10 indicates the subject in the second image look exactly alike the token subject in the second sub-image.)
+Put the score in a list such that output score = [score1, score2, score3], where 'score1' evaluates the prompt and 'score2' evaluates the resemblance for the first sub-image, and 'score3' evaluates the resemblance for the second sub-image.
+First lets look at the first set of input (1st and 2nd images) as an example.
+Text Prompt: A digital illustration of a cat beside a wooden pot
+Output:
+||V^=^V||
+{
+"score" : [5, 5, 10],
+"reasoning" :  "The cat is not beside the wooden pot. The pot looks partially resemble to the subject pot. The cat looks highly resemble to the subject cat."
+}
+||V^=^V||
+Now evaluate the second set of input (3th, 4th images).
+Text Prompt: <prompt>"""
+_prompts_1shot_t2i_rule_SC = """From scale 0 to 10:
+A score from 0 to 10 will be given based on the success in following the prompt.
+(0 indicates that the AI generated image does not follow the prompt at all. 10 indicates the AI generated image follows the prompt perfectly.)
+Put the score in a list such that output score = [score].
+First lets look at the first set of input (1st image) as an example.
+Text Prompt: A pink and a white frisbee are on the ground.
+Output:
+||V^=^V||
+{
+"score" : [5],
+"reasoning" :  "White frisbee not present in the image."
+}
+||V^=^V||
+Now evaluate the second set of input (2nd image).
+Text Prompt: <prompt>
+"""
+_prompts_1shot_tie_rule_SC = """From scale 0 to 10:
+A score from 0 to 10 will be given based on the success of the editing. (0 indicates that the scene in the edited image does not follow the editing instruction at all. 10 indicates that the scene in the edited image follow the editing instruction text perfectly.)
+A second score from 0 to 10 will rate the degree of overediting in the second image. (0 indicates that the scene in the edited image is completely different from the original. 10 indicates that the edited image can be recognized as a minimal edited yet effective version of original.)
+Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the editing success and 'score2' evaluates the degree of overediting.
+First lets look at the first set of input (1st and 2nd images) as an example.
+Editing instruction: What if the man had a hat?
+Output:
+||V^=^V||
+{
+"score" : [5, 10],
+"reasoning" :  "The hat exists but does not suit well. The hat also looks distorted. But it is a good edit because only a hat is added and the background is persevered."
+}
+||V^=^V||
+Now evaluate the second set of input (3th, 4th images).
+Editing instruction: <instruction>
+"""
+_prompts_1shot_sdie_rule_SC = """From scale 0 to 10:
+A score from 0 to 10 will rate how well the subject in the generated image resemble to the token subject in the second image.
+(0 indicates that the subject in the third image does not look like the token subject at all. 10 indicates the subject in the third image look exactly alike the token subject.)
+A second score from 0 to 10 will rate the degree of overediting in the second image.
+(0 indicates that the scene in the edited image is completely different from the first image. 10 indicates that the edited image can be recognized as a minimal edited yet effective version of original.)
+Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the resemblance and 'score2' evaluates the degree of overediting.
+First lets look at the first set of input (1st, 2nd and 3rd images) as an example.
+Subject: <subject>
+Output:
+||V^=^V||
+{
+"score" : [5, 10],
+"reasoning" :  "The monster toy looks partially resemble to the token subject. The edit is minimal."
+}
+||V^=^V||
+Now evaluate the second set of input (4th, 5th, and 6th images).
+Subject: <subject>
+"""
+_prompts_1shot_one_image_gen_rule = """RULES of each set of inputs:
+One image will be provided; The image is an AI-generated image.
+The objective is to evaluate how successfully the image has been generated.
+"""
+_prompts_1shot_sdig_rule_SC = """From scale 0 to 10:
+A score from 0 to 10 will be given based on the success in following the prompt.
+(0 indicates that the second image does not follow the prompt at all. 10 indicates the second image follows the prompt perfectly.)
+A second score from 0 to 10 will rate how well the subject in the generated image resemble to the token subject in the first image.
+(0 indicates that the subject in the second image does not look like the token subject at all. 10 indicates the subject in the second image look exactly alike the token subject.)
+Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the prompt and 'score2' evaluates the resemblance.
+First lets look at the first set of input (1st and 2nd images) as an example.
+Text Prompt: a red cartoon figure eating a banana
+Output:
+||V^=^V||
+{
+"score" : [10, 5],
+"reasoning" :  "The red cartoon figure is eating a banana. The red cartoon figure looks partially resemble to the subject."
+}
+||V^=^V||
+Now evaluate the second set of input (3th, 4th images).
+Text Prompt: <prompt>
+"""
+_prompts_1shot_rule_PQ = """RULES of each set of inputs:
+One image will be provided; The image is an AI-generated image.
+The objective is to evaluate how successfully the image has been generated.
+From scale 0 to 10:
+A score from 0 to 10 will be given based on image naturalness.
+(
+    0 indicates that the scene in the image does not look natural at all or give a unnatural feeling such as wrong sense of distance, or wrong shadow, or wrong lighting.
+    10 indicates that the image looks natural.
+)
+A second score from 0 to 10 will rate the image artifacts.
+(
+    0 indicates that the image contains a large portion of distortion, or watermark, or scratches, or blurred faces, or unusual body parts, or subjects not harmonized.
+    10 indicates the image has no artifacts.
+)
+Put the score in a list such that output score = [naturalness, artifacts]
+First lets look at the first set of input (1st image) as an example.
+Output:
+||V^=^V||
+{
+"score" : [5, 5],
+"reasoning" :  "The image gives an unnatural feeling on hands of the girl. There is also minor distortion on the eyes of the girl."
+}
+||V^=^V||
+Now evaluate the second set of input (2nd image).
+"""
+_prompts_1shot_subject_image_gen_rule = """RULES of each set of inputs:
+Two images will be provided: The first being a token subject image and the second being an AI-generated image using the first image as guidance.
+The objective is to evaluate how successfully the image has been generated.
+"""
+_prompts_1shot_cig_rule_SC = """
+From scale 0 to 10:
+A score from 0 to 10 will be given based on the success in following the prompt.
+(0 indicates that the second image does not follow the prompt at all. 10 indicates the second image follows the prompt perfectly.)
+A second score from 0 to 10 will rate how well the generated image is following the guidance image.
+(0 indicates that the second image is not following the guidance at all. 10 indicates that second image is following the guidance image.)
+Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the prompt and 'score2' evaluates the guidance.
+First lets look at the first set of input (1st and 2nd images) as an example.
+Text Prompt: the bridge is red, Golden Gate Bridge in San Francisco, USA
+Output:
+||V^=^V||
+{
+"score" : [5, 5],
+"reasoning" :  "The bridge is red. But half of the bridge is gone."
+}
+||V^=^V||
+Now evaluate the second set of input (3th, 4th images).
+Text Prompt: <prompt>
+"""
+_prompts_1shot_two_image_edit_rule = """RULES of each set of inputs:
+Two images will be provided: The first being the original AI-generated image and the second being an edited version of the first.
+The objective is to evaluate how successfully the editing instruction has been executed in the second image.
+Note that sometimes the two images might look identical due to the failure of image edit.
+"""
+_prompts_1shot_subject_image_edit_rule = """RULES of each set of inputs:
+Three images will be provided:
+The first image is a input image to be edited.
+The second image is a token subject image.
+The third image is an AI-edited image from the first image. it should contain a subject that looks alike the subject in second image.
+The objective is to evaluate how successfully the image has been edited.
+"""
+_prompts_1shot_control_image_gen_rule = """RULES of each set of inputs:
+Two images will be provided: The first being a processed image (e.g. Canny edges, openpose, grayscale etc.) and the second being an AI-generated image using the first image as guidance.
+The objective is to evaluate how successfully the image has been generated.
+"""
+_prompts_0shot_two_image_edit_rule = """RULES:
+Two images will be provided: The first being the original AI-generated image and the second being an edited version of the first.
+The objective is to evaluate how successfully the editing instruction has been executed in the second image.
+Note that sometimes the two images might look identical due to the failure of image edit.
+"""
+_prompts_0shot_one_video_gen_rule = """RULES:
+The images are extracted from a AI-generated video according to the text prompt.
+The objective is to evaluate how successfully the video has been generated.
+"""
+_prompts_0shot_t2v_rule_PQ = """RULES:
+The image frames are AI-generated.
+The objective is to evaluate how successfully the image frames has been generated.
+From scale 0 to 10:
+A score from 0 to 10 will be given based on the image frames naturalness.
+(
+    0 indicates that the scene in the image frames does not look natural at all or give a unnatural feeling such as wrong sense of distance, or wrong shadow, or wrong lighting.
+    10 indicates that the image frames looks natural.
+)
+A second score from 0 to 10 will rate the image frames artifacts.
+(
+    0 indicates that the image frames contains a large portion of distortion, or watermark, or scratches, or blurred faces, or unusual body parts, or subjects not harmonized.
+    10 indicates the image frames has no artifacts.
+)
+Put the score in a list such that output score = [naturalness, artifacts]
+"""
+_prompts_0shot_msdig_rule_SC = """From scale 0 to 10:
+A score from 0 to 10 will be given based on the success in following the prompt.
+(0 indicates that the second image does not follow the prompt at all. 10 indicates the second image follows the prompt perfectly.)
+A second score from 0 to 10 will rate how well the subject in the generated image resemble to the token subject in the first sub-image.
+(0 indicates that the subject in the second image does not look like the token subject in the first sub-image at all. 10 indicates the subject in the second image look exactly alike the token subject in the first sub-image.)
+A third score from 0 to 10 will rate how well the subject in the generated image resemble to the token subject in the second sub-image.
+(0 indicates that the subject in the second image does not look like the token subject in the second sub-image at all. 10 indicates the subject in the second image look exactly alike the token subject in the second sub-image.)
+Put the score in a list such that output score = [score1, score2, score3], where 'score1' evaluates the prompt and 'score2' evaluates the resemblance for the first sub-image, and 'score3' evaluates the resemblance for the second sub-image.
+Text Prompt: <prompt>
+"""
+_prompts_0shot_sdie_rule_SC = """From scale 0 to 10:
+A score from 0 to 10 will rate how well the subject in the generated image resemble to the token subject in the second image.
+(0 indicates that the subject in the third image does not look like the token subject at all. 10 indicates the subject in the third image look exactly alike the token subject.)
+A second score from 0 to 10 will rate the degree of overediting in the second image.
+(0 indicates that the scene in the edited image is completely different from the first image. 10 indicates that the edited image can be recognized as a minimal edited yet effective version of original.)
+Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the resemblance and 'score2' evaluates the degree of overediting.
+Subject: <subject>"""
+_prompts_0shot_subject_image_edit_rule = """RULES:
+Three images will be provided:
+The first image is a input image to be edited.
+The second image is a token subject image.
+The third image is an AI-edited image from the first image. it should contain a subject that looks alike the subject in second image.
+The objective is to evaluate how successfully the image has been edited.
+"""
+_prompts_0shot_mie_rule_SC = """From scale 0 to 10:
+A score from 0 to 10 will be given based on the success of the editing. (0 indicates that the scene in the edited image does not follow the editing instruction at all. 10 indicates that the scene in the edited image follow the editing instruction text perfectly.)
+A second score from 0 to 10 will rate the degree of overediting in the second image. (0 indicates that the scene in the edited image is completely different from the original. 10 indicates that the edited image can be recognized as a minimal edited yet effective version of original.)
+Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the editing success and 'score2' evaluates the degree of overediting.
+Editing instruction: <instruction>
+"""
+_prompts_0shot_sdig_rule_SC = """From scale 0 to 10:
+A score from 0 to 10 will be given based on the success in following the prompt.
+(0 indicates that the second image does not follow the prompt at all. 10 indicates the second image follows the prompt perfectly.)
+A second score from 0 to 10 will rate how well the subject in the generated image resemble to the token subject in the first image.
+(0 indicates that the subject in the second image does not look like the token subject at all. 10 indicates the subject in the second image look exactly alike the token subject.)
+Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the prompt and 'score2' evaluates the resemblance.
+Text Prompt: <prompt>
+"""
+_prompts_0shot_tie_rule_SC = """
+From scale 0 to 10:
+A score from 0 to 10 will be given based on the success of the editing. (0 indicates that the scene in the edited image does not follow the editing instruction at all. 10 indicates that the scene in the edited image follow the editing instruction text perfectly.)
+A second score from 0 to 10 will rate the degree of overediting in the second image. (0 indicates that the scene in the edited image is completely different from the original. 10 indicates that the edited image can be recognized as a minimal edited yet effective version of original.)
+Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the editing success and 'score2' evaluates the degree of overediting.
+Editing instruction: <instruction>
+"""
+_prompts_0shot_t2i_rule_SC = """From scale 0 to 10:
+A score from 0 to 10 will be given based on the success in following the prompt.
+(0 indicates that the AI generated image does not follow the prompt at all. 10 indicates the AI generated image follows the prompt perfectly.)
+Put the score in a list such that output score = [score].
+Text Prompt: <prompt>
+"""
+_prompts_0shot_cig_rule_SC = """From scale 0 to 10:
+A score from 0 to 10 will be given based on the success in following the prompt.
+(0 indicates that the second image does not follow the prompt at all. 10 indicates the second image follows the prompt perfectly.)
+A second score from 0 to 10 will rate how well the generated image is following the guidance image.
+(0 indicates that the second image is not following the guidance at all. 10 indicates that second image is following the guidance image.)
+Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the prompt and 'score2' evaluates the guidance.
+Text Prompt: <prompt>"""
+_prompts_0shot_control_image_gen_rule = """RULES:
+Two images will be provided: The first being a processed image (e.g. Canny edges, openpose, grayscale etc.) and the second being an AI-generated image using the first image as guidance.
+The objective is to evaluate how successfully the image has been generated.
+"""
+_prompts_0shot_rule_PQ = """RULES:
+The image is an AI-generated image.
+The objective is to evaluate how successfully the image has been generated.
+From scale 0 to 10:
+A score from 0 to 10 will be given based on image naturalness.
+(
+    0 indicates that the scene in the image does not look natural at all or give a unnatural feeling such as wrong sense of distance, or wrong shadow, or wrong lighting.
+    10 indicates that the image looks natural.
+)
+A second score from 0 to 10 will rate the image artifacts.
+(
+    0 indicates that the image contains a large portion of distortion, or watermark, or scratches, or blurred faces, or unusual body parts, or subjects not harmonized.
+    10 indicates the image has no artifacts.
+)
+Put the score in a list such that output score = [naturalness, artifacts]
+"""
+_prompts_0shot_t2v_rule_SC = """From scale 0 to 10:
+A score from 0 to 10 will be given based on the success in following the prompt.
+(0 indicates that the image frames does not follow the prompt at all. 10 indicates the image frames follows the prompt perfectly.)
+Put the score in a list such that output score = [score].
+Text Prompt: <prompt>
+"""
+_prompts_0shot_multi_subject_image_gen_rule = """RULES:
+Two images will be provided:
+This first image is a concatenation of two sub-images, each sub-image contain one token subject.
+The second image being an AI-generated image using the first image as guidance.
+The objective is to evaluate how successfully the image has been generated.
+"""
+_prompts_0shot_subject_image_gen_rule = """RULES:
+Two images will be provided: The first being a token subject image and the second being an AI-generated image using the first image as guidance.
+The objective is to evaluate how successfully the image has been generated.
+"""
+_prompts_0shot_one_image_gen_rule = """RULES:
+The image is an AI-generated image according to the text prompt.
+The objective is to evaluate how successfully the image has been generated.
+"""

evalscope/benchmarks/infovqa/__init__.py ADDED Viewed

File without changes

evalscope/benchmarks/infovqa/infovqa_adapter.py ADDED Viewed

@@ -0,0 +1,66 @@
+import json
+from typing import Any, Dict, List
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator.state import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+logger = get_logger()
+PROMPT = """Answer the question according to the image using a single word or phrase.
+{question}
+The last line of your response should be of the form "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the question."""  # noqa: E501
+@register_benchmark(
+    BenchmarkMeta(
+        name='infovqa',
+        pretty_name='InfoVQA',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'InfoVQA (Information Visual Question Answering) is a benchmark designed to evaluate how well AI models can answer questions based on information-dense images, such as charts, graphs, diagrams, maps, and infographics.',  # noqa: E501
+        dataset_id='lmms-lab/DocVQA',
+        subset_list=['InfographicVQA'],
+        metric_list=['anls'],
+        eval_split='validation',
+        prompt_template=PROMPT,
+    )
+)
+class InfoVQAAdapter(VisionLanguageAdapter):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.add_aggregation_name = False
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        input_text = PROMPT.format(question=record['question'])
+        content_list: List[Content] = [ContentText(text=input_text)]
+        image = record.get('image')
+        if image:
+            image_base64 = bytes_to_base64(image['bytes'], format='png', add_header=True)
+            content_list.append(ContentImage(image=image_base64))
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=json.dumps(record.get('answers')),  # answers is a list
+            metadata={
+                'questionId': record.get('questionId'),
+                'answer_type': record.get('answer_type'),
+                'image_url': record.get('image_url'),
+                'ocr': record.get('ocr'),
+            }
+        )
+    def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+        import re
+        pattern = r'ANSWER:\s*(.*)'
+        match = re.search(pattern, prediction)
+        if match:
+            return match.group(1).strip()
+        return prediction.strip()

evalscope/benchmarks/live_code_bench/evaluate_utils.py CHANGED Viewed

@@ -9,6 +9,19 @@ from .pass_k_utils import compute_metrics_from_results
 logger = get_logger()
+def _temp_run(sample, generation, debug, result, metadata_list, timeout):
+    """Runs a test in a separate process to enforce a timeout.
+    This function is defined at the module's top level to ensure it can be
+    pickled by `multiprocessing.Process`. This is a requirement on platforms
+    like macOS (on Apple Silicon) which use the 'spawn' start method, as
+    nested functions are not picklable.
+    """
+    from .testing_util import run_test
+    res, metadata = run_test(sample, test=generation, debug=debug, timeout=timeout)
+    result.append(res)
+    metadata_list.append(metadata)
 def codegen_check_correctness(sample, generation, timeout, debug=True):
     """Check correctness of code generation with a global timeout.
@@ -16,12 +29,6 @@ def codegen_check_correctness(sample, generation, timeout, debug=True):
     timeouts inside `run_test`
     """
-    def _temp_run(sample, generation, debug, result, metadata_list, timeout):
-        from .testing_util import run_test
-        res, metadata = run_test(sample, test=generation, debug=debug, timeout=timeout)
-        result.append(res)
-        metadata_list.append(metadata)
     manager = multiprocessing.Manager()
     result = manager.list()
     metadata_list = manager.list()

evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl

evalscope 1.0.0__py3-none-any.whl → 1.2.0__py3-none-any.whl