evalscope 0.17.1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/__init__.py +4 -1
- evalscope/api/benchmark/__init__.py +3 -0
- evalscope/api/benchmark/adapters/__init__.py +5 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +684 -0
- evalscope/api/benchmark/adapters/image_edit_adapter.py +82 -0
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +83 -0
- evalscope/api/benchmark/adapters/text2image_adapter.py +156 -0
- evalscope/api/benchmark/adapters/vision_language_adapter.py +6 -0
- evalscope/api/benchmark/benchmark.py +356 -0
- evalscope/api/benchmark/meta.py +121 -0
- evalscope/api/dataset/__init__.py +2 -0
- evalscope/api/dataset/dataset.py +349 -0
- evalscope/api/dataset/loader.py +262 -0
- evalscope/api/dataset/utils.py +143 -0
- evalscope/api/evaluator/__init__.py +3 -0
- evalscope/api/evaluator/cache.py +378 -0
- evalscope/api/evaluator/evaluator.py +56 -0
- evalscope/api/evaluator/state.py +275 -0
- evalscope/api/filter/__init__.py +1 -0
- evalscope/api/filter/filter.py +72 -0
- evalscope/api/messages/__init__.py +12 -0
- evalscope/api/messages/chat_message.py +243 -0
- evalscope/api/messages/content.py +102 -0
- evalscope/api/messages/utils.py +35 -0
- evalscope/api/metric/__init__.py +2 -0
- evalscope/api/metric/metric.py +55 -0
- evalscope/api/metric/scorer.py +113 -0
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +168 -0
- evalscope/api/model/__init__.py +12 -0
- evalscope/api/model/generate_config.py +155 -0
- evalscope/api/model/model.py +386 -0
- evalscope/api/model/model_output.py +285 -0
- evalscope/api/registry.py +182 -0
- evalscope/api/tool/__init__.py +3 -0
- evalscope/api/tool/tool_call.py +101 -0
- evalscope/api/tool/tool_info.py +173 -0
- evalscope/api/tool/utils.py +64 -0
- evalscope/app/app.py +3 -0
- evalscope/app/ui/app_ui.py +2 -1
- evalscope/app/ui/multi_model.py +50 -25
- evalscope/app/ui/single_model.py +26 -14
- evalscope/app/utils/data_utils.py +43 -27
- evalscope/app/utils/env_utils.py +12 -0
- evalscope/app/utils/text_utils.py +14 -14
- evalscope/app/utils/visualization.py +9 -4
- evalscope/arguments.py +7 -10
- evalscope/backend/opencompass/api_meta_template.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +6 -5
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +10 -10
- evalscope/backend/rag_eval/clip_benchmark/task_template.py +8 -4
- evalscope/backend/rag_eval/ragas/task_template.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/build_transform.py +7 -4
- evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +2 -1
- evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +10 -1
- evalscope/backend/rag_eval/utils/llm.py +13 -12
- evalscope/benchmarks/__init__.py +0 -2
- evalscope/benchmarks/aime/aime24_adapter.py +38 -40
- evalscope/benchmarks/aime/aime25_adapter.py +34 -40
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +86 -60
- evalscope/benchmarks/arc/arc_adapter.py +34 -147
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +96 -70
- evalscope/benchmarks/arena_hard/utils.py +37 -1
- evalscope/benchmarks/bbh/bbh_adapter.py +72 -144
- evalscope/benchmarks/bfcl/bfcl_adapter.py +188 -171
- evalscope/benchmarks/bfcl/generation.py +222 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +93 -162
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +85 -82
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +34 -125
- evalscope/benchmarks/competition_math/competition_math_adapter.py +56 -108
- evalscope/benchmarks/data_collection/data_collection_adapter.py +187 -45
- evalscope/benchmarks/docmath/docmath_adapter.py +109 -51
- evalscope/benchmarks/docmath/utils.py +4 -5
- evalscope/benchmarks/drop/drop_adapter.py +88 -40
- evalscope/benchmarks/frames/frames_adapter.py +136 -52
- evalscope/benchmarks/general_arena/general_arena_adapter.py +140 -98
- evalscope/benchmarks/general_arena/utils.py +23 -27
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +40 -101
- evalscope/benchmarks/general_qa/general_qa_adapter.py +73 -134
- evalscope/benchmarks/gpqa/gpqa_adapter.py +61 -100
- evalscope/benchmarks/gpqa/{chain_of_thought.txt → prompt.py} +12 -5
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +62 -142
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +35 -124
- evalscope/benchmarks/hle/hle_adapter.py +127 -93
- evalscope/benchmarks/humaneval/humaneval_adapter.py +86 -55
- evalscope/benchmarks/ifeval/ifeval_adapter.py +69 -40
- evalscope/benchmarks/ifeval/instructions.py +109 -64
- evalscope/benchmarks/ifeval/instructions_registry.py +1 -1
- evalscope/benchmarks/ifeval/instructions_util.py +2 -3
- evalscope/benchmarks/ifeval/utils.py +6 -7
- evalscope/benchmarks/image_edit/gedit/__init__.py +0 -0
- evalscope/benchmarks/image_edit/gedit/gedit_adapter.py +138 -0
- evalscope/benchmarks/image_edit/gedit/utils.py +372 -0
- evalscope/benchmarks/image_edit/gedit/vie_prompts.py +406 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +30 -65
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +2 -2
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +121 -71
- evalscope/benchmarks/live_code_bench/load_utils.py +13 -21
- evalscope/benchmarks/live_code_bench/testing_util.py +6 -2
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +49 -75
- evalscope/benchmarks/math_500/math_500_adapter.py +41 -48
- evalscope/benchmarks/math_vista/__init__.py +0 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +129 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -205
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +80 -99
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +64 -110
- evalscope/benchmarks/mmmu/__init__.py +0 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +159 -0
- evalscope/benchmarks/mmmu_pro/__init__.py +0 -0
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +129 -0
- evalscope/benchmarks/musr/musr_adapter.py +33 -64
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +196 -152
- evalscope/benchmarks/process_bench/process_bench_adapter.py +144 -76
- evalscope/benchmarks/race/race_adapter.py +33 -119
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +72 -70
- evalscope/benchmarks/super_gpqa/{five_shot_prompt.txt → prompt.py} +14 -16
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +73 -117
- evalscope/benchmarks/super_gpqa/utils.py +2 -1
- evalscope/benchmarks/tau_bench/generation.py +147 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +114 -60
- evalscope/benchmarks/text2image/__init__.py +0 -0
- evalscope/benchmarks/text2image/evalmuse_adapter.py +78 -0
- evalscope/benchmarks/text2image/genai_bench_adapter.py +53 -0
- evalscope/benchmarks/text2image/general_t2i_adapter.py +42 -0
- evalscope/benchmarks/text2image/hpdv2_adapter.py +52 -0
- evalscope/benchmarks/text2image/tifa_adapter.py +27 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +91 -70
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +56 -124
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +70 -266
- evalscope/benchmarks/winogrande/winogrande_adapter.py +28 -54
- evalscope/cli/cli.py +2 -0
- evalscope/cli/start_app.py +7 -1
- evalscope/cli/start_perf.py +7 -1
- evalscope/cli/start_server.py +6 -3
- evalscope/collections/__init__.py +2 -10
- evalscope/collections/sampler.py +10 -10
- evalscope/collections/schema.py +13 -11
- evalscope/config.py +157 -57
- evalscope/constants.py +37 -61
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +275 -419
- evalscope/filters/__init__.py +2 -0
- evalscope/filters/extraction.py +126 -0
- evalscope/filters/selection.py +57 -0
- evalscope/metrics/__init__.py +13 -13
- evalscope/metrics/llm_judge.py +47 -33
- evalscope/metrics/math_parser.py +27 -22
- evalscope/metrics/metric.py +307 -0
- evalscope/metrics/metrics.py +22 -18
- evalscope/metrics/t2v_metrics/__init__.py +0 -52
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/clip_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/build_mps_model/cross_modeling.py +9 -13
- evalscope/metrics/t2v_metrics/models/clipscore_models/clip_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/hpsv2_model.py +3 -2
- evalscope/metrics/t2v_metrics/models/clipscore_models/mps_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/clipscore_models/pickscore_model.py +2 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/blip2_itm_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/itmscore_models/fga_blip2_model.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/ImageReward.py +10 -5
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward/blip_pretrain.py +4 -2
- evalscope/metrics/t2v_metrics/models/itmscore_models/image_reward_model.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/language_model/clip_t5.py +15 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5/model/multimodal_encoder/clip_encoder.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/clip_t5_model.py +15 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/gpt4v_model.py +9 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +2 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +3 -9
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +16 -10
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +4 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/__init__.py +8 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +47 -25
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_qformer.py +12 -7
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5.py +23 -17
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/blip2_t5_instruct.py +33 -23
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/fga_blip2.py +2 -1
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_llama.py +46 -30
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/modeling_t5.py +69 -37
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/__init__.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip.py +6 -4
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_image_text_matching.py +7 -5
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_nlvr.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_outputs.py +5 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/blip_vqa.py +17 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +35 -19
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/clip_vit.py +14 -12
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/eva_vit.py +63 -52
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +63 -38
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/vit.py +6 -3
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/__init__.py +6 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/processors/randaugment.py +3 -2
- evalscope/metrics/t2v_metrics/models/vqascore_models/mm_utils.py +15 -13
- evalscope/metrics/t2v_metrics/models/vqascore_models/vqa_model.py +3 -2
- evalscope/models/__init__.py +6 -29
- evalscope/models/image_edit_model.py +125 -0
- evalscope/models/mockllm.py +65 -0
- evalscope/models/model_apis.py +67 -0
- evalscope/models/modelscope.py +455 -0
- evalscope/models/openai_compatible.py +126 -0
- evalscope/models/text2image_model.py +124 -0
- evalscope/models/utils/openai.py +701 -0
- evalscope/perf/benchmark.py +4 -1
- evalscope/perf/http_client.py +4 -2
- evalscope/perf/plugin/api/custom_api.py +5 -4
- evalscope/perf/plugin/api/openai_api.py +11 -9
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +1 -1
- evalscope/perf/plugin/datasets/kontext_bench.py +1 -1
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +4 -2
- evalscope/perf/utils/benchmark_util.py +15 -10
- evalscope/perf/utils/db_util.py +9 -6
- evalscope/perf/utils/local_server.py +11 -3
- evalscope/perf/utils/rich_display.py +16 -10
- evalscope/report/__init__.py +2 -3
- evalscope/report/combinator.py +18 -12
- evalscope/report/generator.py +51 -35
- evalscope/report/{utils.py → report.py} +8 -6
- evalscope/run.py +33 -47
- evalscope/summarizer.py +1 -1
- evalscope/third_party/toolbench_static/llm/swift_infer.py +0 -4
- evalscope/utils/__init__.py +21 -2
- evalscope/utils/chat_service.py +3 -2
- evalscope/utils/deprecation_utils.py +12 -1
- evalscope/utils/function_utils.py +29 -0
- evalscope/utils/import_utils.py +23 -1
- evalscope/utils/io_utils.py +142 -6
- evalscope/utils/json_schema.py +208 -0
- evalscope/utils/logger.py +51 -12
- evalscope/utils/model_utils.py +11 -7
- evalscope/utils/multi_choices.py +288 -0
- evalscope/utils/url_utils.py +65 -0
- evalscope/version.py +2 -2
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/METADATA +108 -62
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/RECORD +258 -226
- tests/benchmark/test_eval.py +385 -0
- tests/benchmark/test_image_edit.py +65 -0
- tests/{aigc → benchmark}/test_t2i.py +22 -4
- tests/benchmark/test_vlm.py +80 -0
- tests/cli/test_all.py +85 -47
- tests/cli/test_collection.py +20 -8
- tests/cli/test_custom.py +22 -15
- tests/cli/test_reasoning.py +81 -0
- tests/common.py +73 -0
- tests/perf/test_perf.py +4 -2
- tests/rag/test_clip_benchmark.py +0 -2
- evalscope/benchmarks/aigc/t2i/base.py +0 -56
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +0 -78
- evalscope/benchmarks/aigc/t2i/genai_bench_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/general_t2i_adapter.py +0 -58
- evalscope/benchmarks/aigc/t2i/hpdv2_adapter.py +0 -57
- evalscope/benchmarks/aigc/t2i/tifa_adapter.py +0 -37
- evalscope/benchmarks/arc/ai2_arc.py +0 -151
- evalscope/benchmarks/benchmark.py +0 -81
- evalscope/benchmarks/ceval/ceval_exam.py +0 -146
- evalscope/benchmarks/cmmlu/cmmlu.py +0 -161
- evalscope/benchmarks/cmmlu/samples.jsonl +0 -5
- evalscope/benchmarks/competition_math/competition_math.py +0 -79
- evalscope/benchmarks/data_adapter.py +0 -528
- evalscope/benchmarks/filters.py +0 -59
- evalscope/benchmarks/gsm8k/gsm8k.py +0 -121
- evalscope/benchmarks/hellaswag/hellaswag.py +0 -112
- evalscope/benchmarks/humaneval/humaneval.py +0 -79
- evalscope/benchmarks/mmlu/mmlu.py +0 -160
- evalscope/benchmarks/mmlu/samples.jsonl +0 -5
- evalscope/benchmarks/process_bench/critique_template.txt +0 -13
- evalscope/benchmarks/race/race.py +0 -104
- evalscope/benchmarks/race/samples.jsonl +0 -5
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +0 -4
- evalscope/benchmarks/trivia_qa/trivia_qa.py +0 -89
- evalscope/benchmarks/truthful_qa/truthful_qa.py +0 -163
- evalscope/benchmarks/utils.py +0 -60
- evalscope/collections/evaluator.py +0 -375
- evalscope/metrics/completion_parsers.py +0 -227
- evalscope/metrics/named_metrics.py +0 -55
- evalscope/models/adapters/__init__.py +0 -14
- evalscope/models/adapters/base_adapter.py +0 -84
- evalscope/models/adapters/bfcl_adapter.py +0 -246
- evalscope/models/adapters/chat_adapter.py +0 -207
- evalscope/models/adapters/choice_adapter.py +0 -222
- evalscope/models/adapters/custom_adapter.py +0 -71
- evalscope/models/adapters/server_adapter.py +0 -236
- evalscope/models/adapters/t2i_adapter.py +0 -79
- evalscope/models/adapters/tau_bench_adapter.py +0 -189
- evalscope/models/custom/__init__.py +0 -4
- evalscope/models/custom/custom_model.py +0 -50
- evalscope/models/custom/dummy_model.py +0 -99
- evalscope/models/local_model.py +0 -128
- evalscope/models/register.py +0 -41
- tests/cli/test_run.py +0 -489
- /evalscope/{benchmarks/aigc → api}/__init__.py +0 -0
- /evalscope/benchmarks/{aigc/t2i → image_edit}/__init__.py +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/LICENSE +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/WHEEL +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.17.1.dist-info → evalscope-1.0.1.dist-info}/top_level.txt +0 -0
- /tests/{aigc → benchmark}/__init__.py +0 -0
tests/cli/test_all.py
CHANGED
@@ -17,44 +17,44 @@ os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'
 logger = get_logger()

 datasets=[
-    …    (38 removed entries; their content is not rendered in this diff view)
+    'iquiz',
+    'ifeval',
+    'mmlu',
+    'mmlu_pro',
+    'musr',
+    'process_bench',
+    'race',
+    'trivia_qa',
+    'cmmlu',
+    'humaneval',
+    'gsm8k',
+    'bbh',
+    'competition_math',
+    'math_500',
+    'aime24',
+    'gpqa_diamond',
+    'arc',
+    'ceval',
+    'hellaswag',
+    'general_mcq',
+    'general_qa',
+    'super_gpqa',
+    # 'live_code_bench',
+    'mmlu_redux',
+    'simple_qa',
+    'chinese_simpleqa',
+    'alpaca_eval',
+    'arena_hard',
+    'maritime_bench',
+    'drop',
+    'winogrande',
+    'tool_bench',
+    'frames',
+    'docmath',
+    'needle_haystack',
+    'bfcl_v3',
+    'hle',
+    'tau_bench',
 ]

 # Reverse the datasets list to ensure the order is from most recent to oldest
@@ -82,8 +82,7 @@ dataset_args={
     'bbh': {
         'subset_list': ['word_sorting', 'movie_recommendation'],
     },
-    '
-        'subset_list': ['gpqa_diamond'],
+    'gpqa_diamond': {
         'few_shot_num': 0,
     },
     'humaneval': {
@@ -112,8 +111,7 @@ dataset_args={
        'subset_list': [
            'example',  # evaluation dataset name, i.e. the * in the *_dev.csv files above
            # 'test'
-       ]
-       'metric_list': ['AverageBLEU']
+       ]
     },
     'super_gpqa': {
         'subset_list': ['Philosophy', 'Education'],
@@ -152,7 +150,6 @@ dataset_args={
 }

 class TestRun(unittest.TestCase):
-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_benchmarks(self):
         from evalscope.config import TaskConfig

@@ -182,19 +179,60 @@ class TestRun(unittest.TestCase):

         run_task(task_cfg=task_cfg)

+    def test_vlm_benchmark(self):
+        from evalscope.config import TaskConfig
+
+        task_cfg = TaskConfig(
+            model='qwen-vl-plus',
+            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+            api_key= env.get('DASHSCOPE_API_KEY'),
+            eval_type=EvalType.SERVICE,
+            datasets=[
+                'mmmu',
+                # 'math_vista',
+            ],
+            dataset_args={
+                'mmmu': {
+                    'subset_list': ['Accounting']
+                },
+                'math_vista': {
+                    'subset_list': ['default']
+                }
+            },
+            eval_batch_size=1,
+            limit=1,
+            stream=True,
+            generation_config={
+                'temperature': 0,
+                'n': 1,
+                'max_tokens': 4096,
+                'image_height': 512,
+                'image_width': 512,
+                'image_num': 2,
+            },
+            judge_worker_num=5,
+            judge_strategy=JudgeStrategy.AUTO,
+            judge_model_args={
+                'model_id': 'qwen2.5-72b-instruct',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+            }
+        )
+
+        run_task(task_cfg=task_cfg)

-    @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_ci_lite(self):
         from evalscope.config import TaskConfig

+        api_key = env.get('DASHSCOPE_API_KEY')
+
         task_cfg = TaskConfig(
             model='qwen-plus',
             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
-            api_key=
-            eval_type=EvalType.SERVICE,
+            api_key=api_key,
+            eval_type=EvalType.SERVICE if api_key else EvalType.MOCK_LLM,
             datasets=[
                 'general_mcq',
-                'general_qa',
                 'iquiz',
             ],
             dataset_args={

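The updated test_ci_lite reads DASHSCOPE_API_KEY from a local .env file and falls back to the mock backend when no key is present. A minimal standalone sketch of that pattern, assuming evalscope 1.0.x with the TaskConfig/run_task API shown above and a .env file in the working directory:

from dotenv import dotenv_values

from evalscope.config import TaskConfig
from evalscope.constants import EvalType
from evalscope.run import run_task

env = dotenv_values('.env')
api_key = env.get('DASHSCOPE_API_KEY')

task_cfg = TaskConfig(
    model='qwen-plus',
    api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
    api_key=api_key,
    # Evaluate against the real service only when a key is available;
    # otherwise run against the built-in mock LLM so the pipeline is still exercised.
    eval_type=EvalType.SERVICE if api_key else EvalType.MOCK_LLM,
    datasets=['general_mcq', 'iquiz'],
    limit=5,
)
run_task(task_cfg=task_cfg)
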
tests/cli/test_collection.py
CHANGED
@@ -1,3 +1,6 @@
+from dotenv import dotenv_values
+
+env = dotenv_values('.env')
 import json
 import os
 import unittest
@@ -15,7 +18,6 @@ class TestCollection(unittest.TestCase):
         CollectionSchema(name='math', datasets=[
             CollectionSchema(name='generation', datasets=[
                 DatasetInfo(name='gsm8k', weight=1, task_type='math', tags=['en', 'math']),
-                DatasetInfo(name='competition_math', weight=1, task_type='math', tags=['en', 'math']),
             ]),
             CollectionSchema(name='multiple_choice', datasets=[
                 DatasetInfo(name='cmmlu', weight=2, task_type='math', tags=['zh', 'math'], args={'subset_list': ['college_mathematics', 'high_school_mathematics']}),
@@ -45,15 +47,25 @@ class TestCollection(unittest.TestCase):
         from evalscope import TaskConfig, run_task

         task_cfg = TaskConfig(
-            model='
-            api_url='
-            api_key='
+            model='qwen-plus',
+            api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
+            api_key=env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
             datasets=['data_collection'],
-            dataset_args={
-                '
-
-
+            dataset_args={
+                'data_collection': {
+                    # 'local_path': 'outputs/test_mix.jsonl'
+                    'local_path': 'outputs/mixed_data_test.jsonl',
+                    'shuffle': True,
+                }
+            },
+            eval_batch_size=5,
+            generation_config = {
+                'max_tokens': 10000,
+                'temperature': 0.0,
+            },
+            limit=10,
+            # use_cache='outputs/20250822_161804'
         )
         run_task(task_cfg=task_cfg)

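The collection test above nests CollectionSchema objects with DatasetInfo leaves to describe a weighted mixture of benchmarks, which is then evaluated through the 'data_collection' dataset with a 'local_path' pointing at the sampled JSONL file. A short sketch of that schema on its own, assuming the classes are importable from evalscope.collections (the package that ships evalscope/collections/schema.py in the file list):

from evalscope.collections import CollectionSchema, DatasetInfo

# Weighted two-branch collection mirroring the schema built in the test.
schema = CollectionSchema(name='math', datasets=[
    CollectionSchema(name='generation', datasets=[
        DatasetInfo(name='gsm8k', weight=1, task_type='math', tags=['en', 'math']),
    ]),
    CollectionSchema(name='multiple_choice', datasets=[
        DatasetInfo(name='cmmlu', weight=2, task_type='math', tags=['zh', 'math'],
                    args={'subset_list': ['college_mathematics', 'high_school_mathematics']}),
    ]),
])
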
tests/cli/test_custom.py
CHANGED
@@ -10,7 +10,7 @@ import subprocess
 import unittest

 from evalscope.config import TaskConfig
-from evalscope.constants import
+from evalscope.constants import EvalType, JudgeStrategy, OutputType
 from evalscope.run import run_task
 from evalscope.utils.import_utils import is_module_installed
 from evalscope.utils.logger import get_logger
@@ -120,7 +120,7 @@ class TestRunCustom(unittest.TestCase):
         from evalscope.config import TaskConfig

         task_cfg = TaskConfig(
-            model='qwen2.5-
+            model='qwen2.5-7b-instruct',
             api_url='https://dashscope.aliyuncs.com/compatible-mode/v1',
             api_key= env.get('DASHSCOPE_API_KEY'),
             eval_type=EvalType.SERVICE,
@@ -132,7 +132,7 @@ class TestRunCustom(unittest.TestCase):
                     'dataset_id': 'custom_eval/text/qa',
                     'subset_list': [
                         'arena',
-                        'example'
+                        # 'example'
                     ],
                 }
             },
@@ -147,7 +147,7 @@ class TestRunCustom(unittest.TestCase):
             },
             ignore_errors=False,
             judge_model_args={
-                'model_id': 'qwen2.5-
+                'model_id': 'qwen2.5-7b-instruct',
                 'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
                 'api_key': env.get('DASHSCOPE_API_KEY'),
                 'generation_config': {
@@ -155,9 +155,19 @@ class TestRunCustom(unittest.TestCase):
                     'max_tokens': 4096
                 },
                 'score_type': 'numeric',
+                'prompt_template': """Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response.
+Begin your evaluation by providing a short explanation. Be as objective as possible.
+After providing your explanation, you must rate the response on a scale of 0 (worst) to 100 (best) by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\"
+
+[Question]
+{question}
+
+[Response]
+{pred}
+"""
             },
             judge_worker_num=5,
-            judge_strategy=JudgeStrategy.
+            judge_strategy=JudgeStrategy.LLM,
         )

         run_task(task_cfg=task_cfg)
@@ -203,8 +213,9 @@ class TestRunCustom(unittest.TestCase):
                 },
                 'score_type': 'pattern',
             },
-            judge_worker_num=
-            judge_strategy=JudgeStrategy.
+            judge_worker_num=1,
+            judge_strategy=JudgeStrategy.LLM_RECALL,
+            use_cache='outputs/20250818_170420'
         )

         run_task(task_cfg=task_cfg)
@@ -223,20 +234,16 @@ class TestRunCustom(unittest.TestCase):
                 'general_arena': {
                     'extra_params':{
                         'models':[
-                            {
-                                'name': 'qwen2.5-0.5b',
-                                'report_path': 'outputs/20250702_140354/reports/qwen2.5-0.5b-instruct'
-                            },
                             {
                                 'name': 'qwen2.5-7b',
-                                'report_path': 'outputs/
+                                'report_path': 'outputs/20250819_165034/reports/qwen2.5-7b-instruct'
                             },
                             {
                                 'name': 'qwen2.5-72b',
-                                'report_path': 'outputs/
+                                'report_path': 'outputs/20250819_164926/reports/qwen2.5-72b-instruct'
                             }
                         ],
-                        'baseline': 'qwen2.5-
+                        'baseline': 'qwen2.5-72b'
                     }
                 }
             },
@@ -255,7 +262,7 @@ class TestRunCustom(unittest.TestCase):
                 },
             },
             judge_worker_num=5,
-            use_cache='outputs/
+            # use_cache='outputs/20250819_173546'
         )

         run_task(task_cfg=task_cfg)

tests/cli/test_reasoning.py
ADDED

@@ -0,0 +1,81 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from dotenv import dotenv_values
+
+env = dotenv_values('.env')
+
+import unittest
+from unittest import TestCase
+
+from evalscope.config import TaskConfig
+from evalscope.constants import EvalType, JudgeStrategy, OutputType
+from evalscope.run import run_task
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class TestReasoning(TestCase):
+    """Benchmark evaluation test cases."""
+
+    def setUp(self):
+        """Setup common test configuration."""
+        self.base_config = {
+            'model': 'Qwen3-0.6B',
+            'api_url': 'http://0.0.0.0:8801/v1',
+            'api_key': env.get('DASHSCOPE_API_KEY'),
+            'eval_type': EvalType.SERVICE,
+            'eval_batch_size': 5,
+            'limit': 5,
+            'generation_config': {
+                'max_tokens': 4096,
+                'temperature': 0.0,
+                'seed': 42,
+                'parallel_tool_calls': True,
+                'extra_body':{'chat_template_kwargs': {'enable_thinking': False}}  # disable thinking mode
+            },
+            'judge_strategy': JudgeStrategy.AUTO,
+            'judge_worker_num': 5,
+            'judge_model_args': {
+                'model_id': 'qwen2.5-72b-instruct',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 4096,
+                }
+            },
+            'debug': True,
+        }
+
+    def _run_dataset_test(self, dataset_name, dataset_args=None, use_mock=False, **config_overrides):
+        """Helper method to run test for a specific dataset."""
+        config = self.base_config.copy()
+        config['datasets'] = [dataset_name]
+
+        if use_mock:
+            config['eval_type'] = EvalType.MOCK_LLM
+
+        # apply config overrides
+        config.update(config_overrides)
+
+        if dataset_args:
+            config['dataset_args'] = {dataset_name: dataset_args}
+
+        task_cfg = TaskConfig(**config)
+        run_task(task_cfg=task_cfg)
+
+    def _run_dataset_load_test(self, dataset_name, dataset_args=None):
+        """Helper method to test dataset loading."""
+
+        self._run_dataset_test(dataset_name, dataset_args, use_mock=True, limit=None)
+
+    # Math & Reasoning datasets
+    def test_gsm8k(self):
+        """Test GSM8K math reasoning dataset."""
+        self._run_dataset_test('gsm8k')
+
+
+if __name__ == '__main__':
+    # Run specific test: python -m unittest test_eval.TestBenchmark.test_gsm8k
+    # Run all tests: python -m unittest test_eval.TestBenchmark
+    unittest.main()

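TestReasoning targets a locally served Qwen3 model and forwards extra_body through generation_config so the chat template's thinking mode can be switched off. A condensed sketch of just that request configuration, assuming the same local OpenAI-compatible endpoint the test points at; the api_key value here is a placeholder:

from evalscope.config import TaskConfig
from evalscope.constants import EvalType
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='Qwen3-0.6B',
    api_url='http://0.0.0.0:8801/v1',  # local OpenAI-compatible server, as in the test
    api_key='EMPTY',                   # placeholder; a local server may not check it
    eval_type=EvalType.SERVICE,
    datasets=['gsm8k'],
    limit=5,
    generation_config={
        'max_tokens': 4096,
        'temperature': 0.0,
        # Forwarded to the server; disables Qwen3 thinking mode via the chat template.
        'extra_body': {'chat_template_kwargs': {'enable_thinking': False}},
    },
)
run_task(task_cfg=task_cfg)
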
tests/common.py
ADDED
@@ -0,0 +1,73 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from dotenv import dotenv_values
+
+env = dotenv_values('.env')
+
+import unittest
+from unittest import TestCase
+
+from evalscope.config import TaskConfig
+from evalscope.constants import EvalType, JudgeStrategy
+from evalscope.run import run_task
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+
+class TestBenchmark(TestCase):
+    """Benchmark evaluation test cases."""
+
+    def setUp(self):
+        """Setup common test configuration."""
+        self.base_config = {
+            'model': 'qwen-plus',
+            'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+            'api_key': env.get('DASHSCOPE_API_KEY'),
+            'eval_type': EvalType.SERVICE,
+            'eval_batch_size': 5,
+            'limit': 5,
+            'generation_config': {
+                'max_tokens': 4096,
+                'temperature': 0.0,
+                'seed': 42,
+                'parallel_tool_calls': True
+            },
+            'judge_strategy': JudgeStrategy.AUTO,
+            'judge_worker_num': 5,
+            'judge_model_args': {
+                'model_id': 'qwen2.5-72b-instruct',
+                'api_url': 'https://dashscope.aliyuncs.com/compatible-mode/v1',
+                'api_key': env.get('DASHSCOPE_API_KEY'),
+                'generation_config': {
+                    'temperature': 0.0,
+                    'max_tokens': 4096,
+                }
+            },
+            'debug': True,
+        }
+
+    def _run_dataset_test(self, dataset_name, dataset_args=None, use_mock=False, **config_overrides):
+        """Helper method to run test for a specific dataset."""
+        config = self.base_config.copy()
+        config['datasets'] = [dataset_name]
+
+        if not env.get('DASHSCOPE_API_KEY'):
+            use_mock = True
+            logger.warning('DASHSCOPE_API_KEY is not set. Using mock evaluation.')
+
+        if use_mock:
+            config['eval_type'] = EvalType.MOCK_LLM
+
+        # apply config overrides
+        config.update(config_overrides)
+
+        if dataset_args:
+            config['dataset_args'] = {dataset_name: dataset_args}
+
+        task_cfg = TaskConfig(**config)
+        run_task(task_cfg=task_cfg)
+
+    def _run_dataset_load_test(self, dataset_name, dataset_args=None):
+        """Helper method to test dataset loading."""
+
+        self._run_dataset_test(dataset_name, dataset_args, use_mock=True, limit=None)

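tests/common.py factors the per-dataset boilerplate into a shared TestBenchmark base class, so the suites under tests/benchmark/ (for example tests/benchmark/test_eval.py in the file list) can reduce each benchmark to a one-line helper call. A hypothetical subclass illustrating the intended usage; the import path assumes the tests are run as a package from the repository root:

import unittest

from tests.common import TestBenchmark


class TestMathBenchmarks(TestBenchmark):

    def test_gsm8k(self):
        # Uses the shared qwen-plus config from setUp; _run_dataset_test switches to
        # the mock LLM automatically when DASHSCOPE_API_KEY is not set.
        self._run_dataset_test('gsm8k')

    def test_gsm8k_load_only(self):
        # Loads the full dataset through the mock backend without calling a real model.
        self._run_dataset_load_test('gsm8k')


if __name__ == '__main__':
    unittest.main()
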
tests/perf/test_perf.py
CHANGED
@@ -1,9 +1,7 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
-import os
 from dotenv import dotenv_values

 env = dotenv_values('.env')
-os.environ['CUDA_VISIBLE_DEVICES'] = '0'
 import unittest

 from evalscope.perf.main import run_perf_benchmark
@@ -123,6 +121,10 @@ class TestPerf(unittest.TestCase):

     @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
     def test_run_perf_multi_parallel(self):
+        if not env.get('DASHSCOPE_API_KEY'):
+            self.skipTest('DASHSCOPE_API_KEY is not set.')
+            return
+
         from evalscope.perf.arguments import Arguments
         task_cfg = Arguments(
             parallel=[1, 2],

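The perf test now skips itself when DASHSCOPE_API_KEY is absent instead of pinning CUDA_VISIBLE_DEVICES. For reference, a hedged sketch of a multi-parallel stress run: only parallel and the two imports are taken from the diff above, and the remaining Arguments fields (number, model, url, api, dataset) are assumptions about the perf Arguments API rather than values confirmed by this diff:

from dotenv import dotenv_values

from evalscope.perf.arguments import Arguments
from evalscope.perf.main import run_perf_benchmark

env = dotenv_values('.env')

task_cfg = Arguments(
    parallel=[1, 2],                      # sweep two concurrency levels, as in the test
    number=[2, 4],                        # assumed: requests to send per concurrency level
    model='qwen-plus',                    # assumed target model name
    url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',  # assumed endpoint
    api='openai',                         # assumed OpenAI-compatible request plugin
    api_key=env.get('DASHSCOPE_API_KEY'),
    dataset='openqa',                     # assumed built-in prompt dataset
)
run_perf_benchmark(task_cfg)
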
tests/rag/test_clip_benchmark.py
CHANGED

evalscope/benchmarks/aigc/t2i/base.py
REMOVED

@@ -1,56 +0,0 @@
-from typing import List, Optional, Union
-
-from evalscope.benchmarks import DataAdapter
-from evalscope.metrics import mean, metric_registry
-from evalscope.utils.logger import get_logger
-
-logger = get_logger()
-
-
-class T2IBaseAdapter(DataAdapter):
-
-    def __init__(self, **kwargs):
-
-        super().__init__(**kwargs)
-
-        logger.info(f'Initializing metrics: {self.metric_list}')
-        self.metrics = {m: metric_registry.get(m).object() for m in self.metric_list}
-
-    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
-        # dummy prompt for general t2i
-        return self.gen_prompt_data(prompt=input_d.get('prompt', ''), id=input_d.get('id', 0))
-
-    def get_gold_answer(self, input_d: dict) -> str:
-        # dummy gold answer for general t2i
-        return input_d.get('prompt', '')
-
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
-        # dummy parse pred result for general t2i
-        return result or raw_input_d.get('image_path', '')
-
-    def match(self, gold: str, pred: str) -> dict:
-        # dummy match for general t2i
-        # pred is the image path, gold is the prompt
-        res = {}
-        for metric_name, metric_func in self.metrics.items():
-            score = metric_func(images=[pred], texts=[gold])[0][0]
-            if isinstance(score, dict):
-                for k, v in score.items():
-                    res[f'{metric_name}_{k}'] = v.cpu().item()
-            else:
-                res[metric_name] = score.cpu().item()  # Updated to use score.cpu().item()
-        return res
-
-    def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
-        """
-        compute weighted mean of the bleu score of all samples
-
-        Args:
-            review_res_list: [score1, score2, ...]
-
-        Returns:
-            avg_res: List[dict]
-
-        """
-        items = super().compute_dict_metric(review_res_list, **kwargs)
-        return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]

evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py
REMOVED

@@ -1,78 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-import os.path
-from collections import defaultdict
-from typing import List, Optional, Union
-
-from evalscope.benchmarks import Benchmark
-from evalscope.constants import OutputType
-from evalscope.metrics import mean
-from evalscope.utils.io_utils import jsonl_to_list
-from evalscope.utils.logger import get_logger
-from .base import T2IBaseAdapter
-
-logger = get_logger()
-
-
-@Benchmark.register(
-    name='evalmuse',
-    dataset_id='AI-ModelScope/T2V-Eval-Prompts',
-    model_adapter=OutputType.IMAGE_GENERATION,
-    output_types=[OutputType.IMAGE_GENERATION],
-    subset_list=['EvalMuse'],
-    metric_list=['FGA_BLIP2Score'],
-    few_shot_num=0,
-    train_split=None,
-    eval_split='test',
-)
-class EvalMuseAdapter(T2IBaseAdapter):
-
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-
-    def load(self, **kwargs) -> dict:
-        if os.path.isfile(self.dataset_id):
-            data_list = jsonl_to_list(self.dataset_id)
-            data_dict = {self.subset_list[0]: {'test': data_list}}
-            return data_dict
-        else:
-            return super().load(**kwargs)
-
-    def get_gold_answer(self, input_d: dict) -> dict:
-        # return prompt and elements dict
-        return {'prompt': input_d.get('prompt'), 'tags': input_d.get('tags', {})}
-
-    def match(self, gold: dict, pred: str) -> dict:
-        # dummy match for general t2i
-        # pred is the image path, gold is the prompt
-        res = {}
-        for metric_name, metric_func in self.metrics.items():
-            if metric_name == 'FGA_BLIP2Score':
-                # For FGA_BLIP2Score, we need to pass the dictionary
-                score = metric_func(images=[pred], texts=[gold])[0][0]
-            else:
-                score = metric_func(images=[pred], texts=[gold['prompt']])[0][0]
-            if isinstance(score, dict):
-                for k, v in score.items():
-                    res[f'{metric_name}:{k}'] = v.cpu().item()
-            else:
-                res[metric_name] = score.cpu().item()
-        return res
-
-    def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
-        """
-        compute weighted mean of the bleu score of all samples
-        """
-        items = super().compute_dict_metric(review_res_list, **kwargs)
-        # add statistics for each metric
-        new_items = defaultdict(list)
-        for metric_name, value_list in items.items():
-            if 'FGA_BLIP2Score' in metric_name and '(' in metric_name:  # FGA_BLIP2Score element score
-                metrics_prefix = metric_name.split(':')[0]
-                category = metric_name.rpartition('(')[-1].split(')')[0]
-                category = category.split('-')[0].lower()  # remove the suffix if exists
-                new_items[f'{metrics_prefix}:{category}'].extend(value_list)
-            else:
-                new_items[metric_name].extend(value_list)
-
-        # calculate mean for each metric
-        return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in new_items.items()]